From: Eric Biggers Date: Sun, 11 Jan 2015 16:24:07 +0000 (-0600) Subject: Merge LZX compression updates X-Git-Tag: v1.8.0~66 X-Git-Url: https://wimlib.net/git/?p=wimlib;a=commitdiff_plain;h=3e8aa757aaa63297f0d54007adf46411778fb6a8 Merge LZX compression updates --- diff --git a/Makefile.am b/Makefile.am index 78154fdc..157fd59e 100644 --- a/Makefile.am +++ b/Makefile.am @@ -59,12 +59,9 @@ libwim_la_SOURCES = \ src/iterate_dir.c \ src/join.c \ src/lookup_table.c \ - src/lz_binary_trees.c \ - src/lz_hash_chains.c \ src/lz_lcp_interval_tree.c \ src/lz_linked_suffix_array.c \ src/lz_mf.c \ - src/lz_null.c \ src/lz_repsearch.c \ src/lz_suffix_array_utils.c \ src/lzms_common.c \ @@ -127,7 +124,7 @@ libwim_la_SOURCES = \ include/wimlib/list.h \ include/wimlib/lookup_table.h \ include/wimlib/lz_extend.h \ - include/wimlib/lz_hash3.h \ + include/wimlib/lz_hash.h \ include/wimlib/lz_mf.h \ include/wimlib/lz_mf_ops.h \ include/wimlib/lz_repsearch.h \ @@ -138,8 +135,6 @@ libwim_la_SOURCES = \ include/wimlib/lzx_constants.h \ include/wimlib/matchfinder_avx2.h \ include/wimlib/matchfinder_common.h \ - include/wimlib/matchfinder_nonsliding.h \ - include/wimlib/matchfinder_sliding.h \ include/wimlib/matchfinder_sse2.h \ include/wimlib/metadata.h \ include/wimlib/pathlist.h \ diff --git a/README b/README index 6b4b174d..18b18817 100644 --- a/README +++ b/README @@ -78,14 +78,14 @@ create the file. When applicable, the results with the equivalent Microsoft implementation in WIMGAPI is included. ============================================================================= - | Compression || wimlib (v1.7.4) | WIMGAPI (Windows 8.1) | + | Compression || wimlib (v1.7.5-BETA) | WIMGAPI (Windows 8.1) | ============================================================================= | None [1] || 361,314,224 in 2.4s | 361,315,338 in 4.5s | | XPRESS [2] || 138,218,750 in 3.0s | 140,457,436 in 6.0s | | XPRESS (slow) [3] || 135,173,511 in 8.9s | N/A | - | LZX (quick) [4] || 130,332,007 in 4.1s | N/A | - | LZX (normal) [5] || 126,714,807 in 12.5s | 127,293,240 in 19.2s | - | LZX (slow) [6] || 126,150,743 in 20.5s | N/A | + | LZX (quick) [4] || 130,207,195 in 3.8s | N/A | + | LZX (normal) [5] || 126,522,539 in 10.4s | 127,293,240 in 19.2s | + | LZX (slow) [6] || 126,042,313 in 17.3s | N/A | | LZMS (non-solid) [7] || 121,909,792 in 11.9s | N/A | | LZMS (solid) [8] || 93,650,936 in 45.0s | 88,771,192 in 109.2 | | "WIMBoot" [9] || 167,023,719 in 3.5s | 169,109,211 in 10.4s | @@ -149,24 +149,24 @@ formats/programs: | WIM (WIMGAPI, None) | 2,814,254 | | WIM (wimlib, None) | 2,814,216 | | WIM (WIMGAPI, XPRESS) | 825,536 | - | WIM (wimlib, XPRESS) | 790,016 | + | WIM (wimlib, XPRESS) | 789,296 | | tar.gz (gzip, default) | 738,796 | | ZIP (Info-ZIP, default) | 735,334 | | tar.gz (gzip, -9) | 733,971 | | ZIP (Info-ZIP, -9) | 732,297 | - | WIM (wimlib, LZX quick) | 704,006 | + | WIM (wimlib, LZX quick) | 690,110 | | WIM (WIMGAPI, LZX) | 651,866 | - | WIM (wimlib, LZX normal) | 632,614 | - | WIM (wimlib, LZX slow) | 625,050 | + | WIM (wimlib, LZX normal) | 624,634 | + | WIM (wimlib, LZX slow) | 620,728 | | WIM (wimlib, LZMS non-solid) | 581,960 | | tar.bz2 (bzip, default) | 565,008 | | tar.bz2 (bzip, -9) | 565,008 | - | WIM (wimlib, LZX solid) | 532,700 | + | WIM (wimlib, LZX solid) | 527,688 | | WIM (wimlib, LZMS solid) | 525,990 | - | WIM (wimlib, LZX solid, slow) | 525,140 | | WIM (wimlib, LZMS solid, slow) | 523,728 | + | WIM (wimlib, LZX solid, slow) | 522,042 | | WIM (WIMGAPI, LZMS solid) | 521,366 | - | WIM (wimlib, LZX solid, 
very slow) | 520,832 | + | WIM (wimlib, LZX solid, very slow) | 519,546 | | tar.xz (xz, default) | 486,916 | | tar.xz (xz, -9) | 486,904 | | 7z (7-zip, default) | 484,700 | diff --git a/include/wimlib/bt_matchfinder.h b/include/wimlib/bt_matchfinder.h index 43654c5b..1d30e362 100644 --- a/include/wimlib/bt_matchfinder.h +++ b/include/wimlib/bt_matchfinder.h @@ -1,229 +1,336 @@ /* * bt_matchfinder.h * - * Copyright (c) 2014 Eric Biggers. All rights reserved. + * Author: Eric Biggers + * Year: 2014, 2015 * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: + * The author dedicates this file to the public domain. + * You can do whatever you want with this file. * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. + * ---------------------------------------------------------------------------- * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. + * This is a Binary Trees (bt) based matchfinder. * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR - * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, - * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR - * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF - * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * The data structure is a hash table where each hash bucket contains a binary + * tree of sequences whose first 3 bytes share the same hash code. Each + * sequence is identified by its starting position in the input buffer. Each + * binary tree is always sorted such that each left child represents a sequence + * lexicographically lesser than its parent and each right child represents a + * sequence lexicographically greater than its parent. + * + * The algorithm processes the input buffer sequentially. At each byte + * position, the hash code of the first 3 bytes of the sequence beginning at + * that position (the sequence being matched against) is computed. This + * identifies the hash bucket to use for that position. Then, a new binary tree + * node is created to represent the current sequence. Then, in a single tree + * traversal, the hash bucket's binary tree is searched for matches and is + * re-rooted at the new node. + * + * Compared to the simpler algorithm that uses linked lists instead of binary + * trees (see hc_matchfinder.h), the binary tree version gains more information + * at each node visitation. Ideally, the binary tree version will examine only + * 'log(n)' nodes to find the same matches that the linked list version will + * find by examining 'n' nodes. 
In addition, the binary tree version can + * examine fewer bytes at each node by taking advantage of the common prefixes + * that result from the sort order, whereas the linked list version may have to + * examine up to the full length of the match at each node. + * + * However, it is not always best to use the binary tree version. It requires + * nearly twice as much memory as the linked list version, and it takes time to + * keep the binary trees sorted, even at positions where the compressor does not + * need matches. Generally, when doing fast compression on small buffers, + * binary trees are the wrong approach. They are best suited for thorough + * compression and/or large buffers. + * + * ---------------------------------------------------------------------------- */ #ifndef _BT_MATCHFINDER_H #define _BT_MATCHFINDER_H #include "wimlib/lz_extend.h" -#include "wimlib/lz_hash3.h" +#include "wimlib/lz_hash.h" #include "wimlib/matchfinder_common.h" -#include "wimlib/unaligned.h" - -#ifndef BT_MATCHFINDER_HASH_ORDER -# if MATCHFINDER_WINDOW_ORDER < 14 -# define BT_MATCHFINDER_HASH_ORDER 14 -# else -# define BT_MATCHFINDER_HASH_ORDER 15 -# endif + +#if MATCHFINDER_MAX_WINDOW_ORDER < 13 +# define BT_MATCHFINDER_HASH_ORDER 14 +#elif MATCHFINDER_MAX_WINDOW_ORDER < 15 +# define BT_MATCHFINDER_HASH_ORDER 15 +#else +# define BT_MATCHFINDER_HASH_ORDER 16 #endif #define BT_MATCHFINDER_HASH_LENGTH (1UL << BT_MATCHFINDER_HASH_ORDER) -#define BT_MATCHFINDER_TOTAL_LENGTH \ - (BT_MATCHFINDER_HASH_LENGTH + (2UL * MATCHFINDER_WINDOW_SIZE)) - struct bt_matchfinder { - union { - pos_t mf_data[BT_MATCHFINDER_TOTAL_LENGTH]; - struct { - pos_t hash_tab[BT_MATCHFINDER_HASH_LENGTH]; - pos_t child_tab[2UL * MATCHFINDER_WINDOW_SIZE]; - }; - }; + pos_t hash_tab[BT_MATCHFINDER_HASH_LENGTH]; + pos_t child_tab[]; } _aligned_attribute(MATCHFINDER_ALIGNMENT); +/* Return the number of bytes that must be allocated for a 'bt_matchfinder' that + * can work with buffers up to the specified size. */ +static inline size_t +bt_matchfinder_size(size_t max_bufsize) +{ + return sizeof(pos_t) * (BT_MATCHFINDER_HASH_LENGTH + (2 * max_bufsize)); +} + +/* Prepare the matchfinder for a new input buffer. */ static inline void bt_matchfinder_init(struct bt_matchfinder *mf) { matchfinder_init(mf->hash_tab, BT_MATCHFINDER_HASH_LENGTH); } -#if MATCHFINDER_IS_SLIDING -static inline void -bt_matchfinder_slide_window(struct bt_matchfinder *mf) +static inline u32 +bt_matchfinder_hash_3_bytes(const u8 *in_next) { - matchfinder_rebase(mf->mf_data, BT_MATCHFINDER_TOTAL_LENGTH); + return lz_hash_3_bytes(in_next, BT_MATCHFINDER_HASH_ORDER); } -#endif -static inline unsigned +static inline pos_t * +bt_child(struct bt_matchfinder *mf, pos_t node, int offset) +{ + if (MATCHFINDER_MAX_WINDOW_ORDER < sizeof(pos_t) * 8) { + /* no cast needed */ + return &mf->child_tab[(node << 1) + offset]; + } else { + return &mf->child_tab[((size_t)node << 1) + offset]; + } +} + +static inline pos_t * +bt_left_child(struct bt_matchfinder *mf, pos_t node) +{ + return bt_child(mf, node, 0); +} + +static inline pos_t * +bt_right_child(struct bt_matchfinder *mf, pos_t node) +{ + return bt_child(mf, node, 1); +} + +/* + * Retrieve a list of matches with the current position. + * + * @mf + * The matchfinder structure. + * @in_begin + * Pointer to the beginning of the input buffer. + * @in_next + * Pointer to the next byte in the input buffer to process. This is the + * pointer to the sequence being matched against. 
+ * @min_len + * Only record matches that are at least this long. + * @max_len + * The maximum permissible match length at this position. + * @nice_len + * Stop searching if a match of at least this length is found. + * @max_search_depth + * Limit on the number of potential matches to consider. + * @next_hash + * Pointer to the hash code for the current sequence, which was computed + * one position in advance so that the binary tree root could be + * prefetched. This is an input/output parameter. + * @best_len_ret + * The length of the longest match found is written here. (This is + * actually redundant with the 'struct lz_match' array, but this is easier + * for the compiler to optimize when inlined and the caller immediately + * does a check against 'best_len'.) + * @lz_matchptr + * An array in which this function will record the matches. The recorded + * matches will be sorted by strictly increasing length and strictly + * increasing offset. The maximum number of matches that may be found is + * 'min(nice_len, max_len) - 3 + 1'. + * + * The return value is a pointer to the next available slot in the @lz_matchptr + * array. (If no matches were found, this will be the same as @lz_matchptr.) + */ +static inline struct lz_match * bt_matchfinder_get_matches(struct bt_matchfinder * const restrict mf, - const u8 * const in_base, + const u8 * const in_begin, const u8 * const in_next, const unsigned min_len, const unsigned max_len, const unsigned nice_len, const unsigned max_search_depth, - unsigned long *prev_hash, - struct lz_match * const restrict matches) + u32 * restrict next_hash, + unsigned * restrict best_len_ret, + struct lz_match * restrict lz_matchptr) { - struct lz_match *lz_matchptr = matches; unsigned depth_remaining = max_search_depth; - unsigned hash; - pos_t cur_match; + u32 hash; + pos_t cur_node; const u8 *matchptr; - unsigned best_len; pos_t *pending_lt_ptr, *pending_gt_ptr; unsigned best_lt_len, best_gt_len; unsigned len; - pos_t *children; + unsigned best_len = min_len - 1; - if (unlikely(max_len < LZ_HASH_REQUIRED_NBYTES + 1)) - return 0; + if (unlikely(max_len < LZ_HASH_REQUIRED_NBYTES + 1)) { + *best_len_ret = best_len; + return lz_matchptr; + } - hash = *prev_hash; - *prev_hash = lz_hash(in_next + 1, BT_MATCHFINDER_HASH_ORDER); - prefetch(&mf->hash_tab[*prev_hash]); - cur_match = mf->hash_tab[hash]; - mf->hash_tab[hash] = in_next - in_base; + hash = *next_hash; + *next_hash = bt_matchfinder_hash_3_bytes(in_next + 1); + cur_node = mf->hash_tab[hash]; + mf->hash_tab[hash] = in_next - in_begin; + prefetch(&mf->hash_tab[*next_hash]); - best_len = min_len - 1; - pending_lt_ptr = &mf->child_tab[(in_next - in_base) << 1]; - pending_gt_ptr = &mf->child_tab[((in_next - in_base) << 1) + 1]; + pending_lt_ptr = bt_left_child(mf, in_next - in_begin); + pending_gt_ptr = bt_right_child(mf, in_next - in_begin); best_lt_len = 0; best_gt_len = 0; - for (;;) { - if (!matchfinder_match_in_window(cur_match, - in_base, in_next) || - !depth_remaining--) - { - *pending_lt_ptr = MATCHFINDER_INITVAL; - *pending_gt_ptr = MATCHFINDER_INITVAL; - return lz_matchptr - matches; - } + len = 0; - matchptr = &in_base[cur_match]; - len = min(best_lt_len, best_gt_len); + if (!matchfinder_node_valid(cur_node)) { + *pending_lt_ptr = MATCHFINDER_NULL; + *pending_gt_ptr = MATCHFINDER_NULL; + *best_len_ret = best_len; + return lz_matchptr; + } - children = &mf->child_tab[(unsigned long) - matchfinder_slot_for_match(cur_match) << 1]; + for (;;) { + matchptr = &in_begin[cur_node]; if (matchptr[len] == 
in_next[len]) { - len = lz_extend(in_next, matchptr, len + 1, max_len); - if (len > best_len) { best_len = len; - lz_matchptr->length = len; lz_matchptr->offset = in_next - matchptr; lz_matchptr++; - if (len >= nice_len) { - *pending_lt_ptr = children[0]; - *pending_gt_ptr = children[1]; - return lz_matchptr - matches; + *pending_lt_ptr = *bt_left_child(mf, cur_node); + *pending_gt_ptr = *bt_right_child(mf, cur_node); + *best_len_ret = best_len; + return lz_matchptr; } } } if (matchptr[len] < in_next[len]) { - *pending_lt_ptr = cur_match; - pending_lt_ptr = &children[1]; - cur_match = *pending_lt_ptr; + *pending_lt_ptr = cur_node; + pending_lt_ptr = bt_right_child(mf, cur_node); + cur_node = *pending_lt_ptr; best_lt_len = len; + if (best_gt_len < len) + len = best_gt_len; } else { - *pending_gt_ptr = cur_match; - pending_gt_ptr = &children[0]; - cur_match = *pending_gt_ptr; + *pending_gt_ptr = cur_node; + pending_gt_ptr = bt_left_child(mf, cur_node); + cur_node = *pending_gt_ptr; best_gt_len = len; + if (best_lt_len < len) + len = best_lt_len; + } + + if (!matchfinder_node_valid(cur_node) || !--depth_remaining) { + *pending_lt_ptr = MATCHFINDER_NULL; + *pending_gt_ptr = MATCHFINDER_NULL; + *best_len_ret = best_len; + return lz_matchptr; } } } +/* + * Advance the matchfinder, but don't record any matches. + * + * @mf + * The matchfinder structure. + * @in_begin + * Pointer to the beginning of the input buffer. + * @in_next + * Pointer to the next byte in the input buffer to process. + * @in_end + * Pointer to the end of the input buffer. + * @nice_len + * Stop searching if a match of at least this length is found. + * @max_search_depth + * Limit on the number of potential matches to consider. + * @next_hash + * Pointer to the hash code for the current sequence, which was computed + * one position in advance so that the binary tree root could be + * prefetched. This is an input/output parameter. + * + * Note: this is very similar to bt_matchfinder_get_matches() because both + * functions must do hashing and tree re-rooting. This version just doesn't + * actually record any matches. 
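For readers new to this header, here is a minimal greedy driver showing how the pieces above fit together. It is a sketch only: the toy_bt_parse() name and the MIN_LEN/NICE_LEN/DEPTH values are invented for the example, and plain malloc() is assumed to satisfy MATCHFINDER_ALIGNMENT; the function names, the next_hash seeding, and 'struct lz_match' (declared in wimlib/lz_mf.h) come from the patch itself.

#include <stdlib.h>

/* Must be defined before including the matchfinder headers. */
#define MATCHFINDER_MAX_WINDOW_ORDER 21

#include "wimlib/lz_mf.h"           /* struct lz_match */
#include "wimlib/bt_matchfinder.h"

#define MIN_LEN   3                 /* hypothetical parser settings */
#define NICE_LEN  48
#define DEPTH     32

static void
toy_bt_parse(const u8 *in_begin, size_t in_size)
{
	struct lz_match matches[NICE_LEN];
	const u8 *in_next = in_begin;
	const u8 *in_end = in_begin + in_size;
	struct bt_matchfinder *mf;
	unsigned best_len;
	u32 next_hash;

	if (in_size < LZ_HASH_REQUIRED_NBYTES)
		return;

	/* The flexible child_tab[] member means the allocation size comes from
	 * bt_matchfinder_size(); malloc() is assumed adequate for
	 * MATCHFINDER_ALIGNMENT (8 bytes) here. */
	mf = malloc(bt_matchfinder_size(in_size));
	if (!mf)
		return;
	bt_matchfinder_init(mf);

	/* Seed the one-position-ahead hash used for prefetching. */
	next_hash = bt_matchfinder_hash_3_bytes(in_next);

	while (in_next < in_end) {
		unsigned max_len = in_end - in_next;
		struct lz_match *end;

		end = bt_matchfinder_get_matches(mf, in_begin, in_next,
						 MIN_LEN, max_len, NICE_LEN,
						 DEPTH, &next_hash,
						 &best_len, matches);
		if (end != matches) {
			/* end[-1] is the longest match (length == best_len).
			 * Take it greedily, then insert the positions it
			 * covers without searching for matches at them. */
			unsigned len = end[-1].length;

			while (--len) {
				in_next++;
				bt_matchfinder_skip_position(mf, in_begin,
							     in_next, in_end,
							     NICE_LEN, DEPTH,
							     &next_hash);
			}
			in_next++;      /* ...emit (length, offset) here */
		} else {
			in_next++;      /* ...emit *in_next as a literal */
		}
	}
	free(mf);
}

An optimal parser would instead record all of matches[] at every position and defer the choice to a cost model; the skip_position() calls are what keep the binary trees sorted at the positions the parser passes over.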
+ */ static inline void bt_matchfinder_skip_position(struct bt_matchfinder * const restrict mf, - const u8 * const in_base, + const u8 * const in_begin, const u8 * const in_next, const u8 * const in_end, const unsigned nice_len, const unsigned max_search_depth, - unsigned long *prev_hash) + u32 * restrict next_hash) { unsigned depth_remaining = max_search_depth; - unsigned hash; - pos_t cur_match; + u32 hash; + pos_t cur_node; const u8 *matchptr; pos_t *pending_lt_ptr, *pending_gt_ptr; unsigned best_lt_len, best_gt_len; unsigned len; - pos_t *children; if (unlikely(in_end - in_next < LZ_HASH_REQUIRED_NBYTES + 1)) return; - hash = *prev_hash; - *prev_hash = lz_hash(in_next + 1, BT_MATCHFINDER_HASH_ORDER); - prefetch(&mf->hash_tab[*prev_hash]); - cur_match = mf->hash_tab[hash]; - mf->hash_tab[hash] = in_next - in_base; + hash = *next_hash; + *next_hash = bt_matchfinder_hash_3_bytes(in_next + 1); + cur_node = mf->hash_tab[hash]; + mf->hash_tab[hash] = in_next - in_begin; + prefetch(&mf->hash_tab[*next_hash]); depth_remaining = max_search_depth; - pending_lt_ptr = &mf->child_tab[(in_next - in_base) << 1]; - pending_gt_ptr = &mf->child_tab[((in_next - in_base) << 1) + 1]; + pending_lt_ptr = bt_left_child(mf, in_next - in_begin); + pending_gt_ptr = bt_right_child(mf, in_next - in_begin); best_lt_len = 0; best_gt_len = 0; - for (;;) { - if (!matchfinder_match_in_window(cur_match, - in_base, in_next) || - !depth_remaining--) - { - *pending_lt_ptr = MATCHFINDER_INITVAL; - *pending_gt_ptr = MATCHFINDER_INITVAL; - return; - } + len = 0; - matchptr = &in_base[cur_match]; - len = min(best_lt_len, best_gt_len); + if (!matchfinder_node_valid(cur_node)) { + *pending_lt_ptr = MATCHFINDER_NULL; + *pending_gt_ptr = MATCHFINDER_NULL; + return; + } - children = &mf->child_tab[(unsigned long) - matchfinder_slot_for_match(cur_match) << 1]; + for (;;) { + matchptr = &in_begin[cur_node]; if (matchptr[len] == in_next[len]) { len = lz_extend(in_next, matchptr, len + 1, nice_len); if (len == nice_len) { - *pending_lt_ptr = children[0]; - *pending_gt_ptr = children[1]; + *pending_lt_ptr = *bt_left_child(mf, cur_node); + *pending_gt_ptr = *bt_right_child(mf, cur_node); return; } } if (matchptr[len] < in_next[len]) { - *pending_lt_ptr = cur_match; - pending_lt_ptr = &children[1]; - cur_match = *pending_lt_ptr; + *pending_lt_ptr = cur_node; + pending_lt_ptr = bt_right_child(mf, cur_node); + cur_node = *pending_lt_ptr; best_lt_len = len; + if (best_gt_len < len) + len = best_gt_len; } else { - *pending_gt_ptr = cur_match; - pending_gt_ptr = &children[0]; - cur_match = *pending_gt_ptr; + *pending_gt_ptr = cur_node; + pending_gt_ptr = bt_left_child(mf, cur_node); + cur_node = *pending_gt_ptr; best_gt_len = len; + if (best_lt_len < len) + len = best_lt_len; + } + + if (!matchfinder_node_valid(cur_node) || !--depth_remaining) { + *pending_lt_ptr = MATCHFINDER_NULL; + *pending_gt_ptr = MATCHFINDER_NULL; + return; } } } diff --git a/include/wimlib/endianness.h b/include/wimlib/endianness.h index d4de3350..f6bb01dd 100644 --- a/include/wimlib/endianness.h +++ b/include/wimlib/endianness.h @@ -3,6 +3,9 @@ * * Macros and inline functions for endianness conversion. * + * Author: Eric Biggers + * Year: 2014, 2015 + * * The author dedicates this file to the public domain. * You can do whatever you want with this file. 
*/ diff --git a/include/wimlib/hc_matchfinder.h b/include/wimlib/hc_matchfinder.h index 02271ff4..1403a157 100644 --- a/include/wimlib/hc_matchfinder.h +++ b/include/wimlib/hc_matchfinder.h @@ -1,108 +1,161 @@ /* * hc_matchfinder.h * - * Copyright (c) 2014 Eric Biggers. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR - * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, - * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR - * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF - * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * Author: Eric Biggers + * Year: 2014, 2015 + * + * The author dedicates this file to the public domain. + * You can do whatever you want with this file. + * + * --------------------------------------------------------------------------- + * + * Algorithm + * + * This is a Hash Chains (hc) based matchfinder. + * + * The data structure is a hash table where each hash bucket contains a linked + * list (or "chain") of sequences whose first 3 bytes share the same hash code. + * Each sequence is identified by its starting position in the input buffer. + * + * The algorithm processes the input buffer sequentially. At each byte + * position, the hash code of the first 3 bytes of the sequence beginning at + * that position (the sequence being matched against) is computed. This + * identifies the hash bucket to use for that position. Then, this hash + * bucket's linked list is searched for matches. Then, a new linked list node + * is created to represent the current sequence and is prepended to the list. + * + * This algorithm has several useful properties: + * + * - It only finds true Lempel-Ziv matches; i.e., those where the matching + * sequence occurs prior to the sequence being matched against. + * + * - The sequences in each linked list are always sorted by decreasing starting + * position. Therefore, the closest (smallest offset) matches are found + * first, which in many compression formats tend to be the cheapest to encode. + * + * - Although fast running time is not guaranteed due to the possibility of the + * lists getting very long, the worst degenerate behavior can be easily + * prevented by capping the number of nodes searched at each position. + * + * - If the compressor decides not to search for matches at a certain position, + * then that position can be quickly inserted without searching the list. 
+ * + * - The algorithm is adaptable to sliding windows: just store the positions + * relative to a "base" value that is updated from time to time, and stop + * searching each list when the sequences get too far away. + * + * --------------------------------------------------------------------------- + * + * Notes on usage + * + * You must define MATCHFINDER_MAX_WINDOW_ORDER before including this header + * because that determines which integer type to use for positions. Since + * 16-bit integers are faster than 32-bit integers due to reduced memory usage + * (and therefore reduced cache pressure), the code only uses 32-bit integers if + * they are needed to represent all possible positions. + * + * You must allocate the 'struct hc_matchfinder' on a + * MATCHFINDER_ALIGNMENT-aligned boundary, and its necessary allocation size + * must be gotten by calling hc_matchfinder_size(). + * + * ---------------------------------------------------------------------------- + * + * Optimizations + * + * The longest_match() and skip_positions() functions are inlined into the + * compressors that use them. This isn't just about saving the overhead of a + * function call. These functions are intended to be called from the inner + * loops of compressors, where giving the compiler more control over register + * allocation is very helpful. There is also significant benefit to be gained + * from allowing the CPU to predict branches independently at each call site. + * For example, "lazy"-style compressors can be written with two calls to + * longest_match(), each of which starts with a different 'best_len' and + * therefore has significantly different performance characteristics. + * + * Although any hash function can be used, a multiplicative hash is fast and + * works well. + * + * On some processors, it is significantly faster to extend matches by whole + * words (32 or 64 bits) instead of by individual bytes. For this to be the + * case, the processor must implement unaligned memory accesses efficiently and + * must have either a fast "find first set bit" instruction or a fast "find last + * set bit" instruction, depending on the processor's endianness. + * + * The code uses one loop for finding the first match and one loop for finding a + * longer match. Each of these loops is tuned for its respective task and in + * combination are faster than a single generalized loop that handles both + * tasks. + * + * The code also uses a tight inner loop that only compares the last and first + * bytes of a potential match. It is only when these bytes match that a full + * match extension is attempted. 
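To make the structure described above concrete, here is a stripped-down, self-contained model of a hash-chain matchfinder. It is not the wimlib code: the toy_* names, the table sizes, the 0x9E3779B1 multiplier, and the byte-at-a-time match extension are stand-ins chosen for brevity, whereas the real header uses pos_t, MATCHFINDER_NULL, lz_hash() and word-at-a-time extension.

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

#define TOY_HASH_ORDER  15
#define TOY_HASH_LEN    (1UL << TOY_HASH_ORDER)
#define TOY_NULL        UINT32_MAX       /* "no position" marker */

struct toy_hc {
	uint32_t hash_tab[TOY_HASH_LEN]; /* hash code => most recent position */
	uint32_t next_tab[];             /* position => next older position
					    with the same hash code */
};

static struct toy_hc *
toy_hc_new(size_t max_bufsize)
{
	struct toy_hc *mf = malloc(sizeof(*mf) + max_bufsize * sizeof(uint32_t));
	if (mf)
		memset(mf->hash_tab, 0xFF, sizeof(mf->hash_tab)); /* all TOY_NULL */
	return mf;
}

static uint32_t
toy_hash3(const uint8_t *p)
{
	/* Multiplicative hash of the first 3 bytes (arbitrary odd multiplier). */
	uint32_t v = p[0] | ((uint32_t)p[1] << 8) | ((uint32_t)p[2] << 16);
	return (v * 0x9E3779B1u) >> (32 - TOY_HASH_ORDER);
}

/* Search the chain for the longest match at 'pos', then prepend 'pos' to the
 * chain.  The caller must ensure at least 3 bytes remain at 'pos' and that
 * 'pos + max_len' stays within the buffer. */
static uint32_t
toy_hc_longest_match(struct toy_hc *mf, const uint8_t *buf, uint32_t pos,
		     uint32_t max_len, uint32_t depth, uint32_t *offset_ret)
{
	uint32_t hash = toy_hash3(&buf[pos]);
	uint32_t cur = mf->hash_tab[hash];
	uint32_t best_len = 0;

	mf->next_tab[pos] = cur;        /* link in the current sequence */
	mf->hash_tab[hash] = pos;

	while (cur != TOY_NULL && depth--) {
		uint32_t len = 0;
		while (len < max_len && buf[cur + len] == buf[pos + len])
			len++;
		if (len > best_len) {
			best_len = len;
			*offset_ret = pos - cur;
		}
		cur = mf->next_tab[cur]; /* older position, larger offset */
	}
	return best_len;
}

The properties listed above are visible directly in the sketch: new positions are prepended, so the chain is walked from smallest offset to largest, and a position can be inserted without searching at all by doing only the two assignments before the loop.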
+ * + * ---------------------------------------------------------------------------- */ #ifndef _HC_MATCHFINDER_H #define _HC_MATCHFINDER_H #include "wimlib/lz_extend.h" -#include "wimlib/lz_hash3.h" +#include "wimlib/lz_hash.h" #include "wimlib/matchfinder_common.h" #include "wimlib/unaligned.h" -#ifndef HC_MATCHFINDER_HASH_ORDER -# if MATCHFINDER_WINDOW_ORDER < 14 -# define HC_MATCHFINDER_HASH_ORDER 14 -# else -# define HC_MATCHFINDER_HASH_ORDER 15 -# endif +#if MATCHFINDER_MAX_WINDOW_ORDER < 14 +# define HC_MATCHFINDER_HASH_ORDER 14 +#else +# define HC_MATCHFINDER_HASH_ORDER 15 #endif #define HC_MATCHFINDER_HASH_LENGTH (1UL << HC_MATCHFINDER_HASH_ORDER) -#define HC_MATCHFINDER_TOTAL_LENGTH \ - (HC_MATCHFINDER_HASH_LENGTH + MATCHFINDER_WINDOW_SIZE) - struct hc_matchfinder { - union { - pos_t mf_data[HC_MATCHFINDER_TOTAL_LENGTH]; - struct { - pos_t hash_tab[HC_MATCHFINDER_HASH_LENGTH]; - pos_t next_tab[MATCHFINDER_WINDOW_SIZE]; - }; - }; + pos_t hash_tab[HC_MATCHFINDER_HASH_LENGTH]; + pos_t next_tab[]; } _aligned_attribute(MATCHFINDER_ALIGNMENT); -/* - * Call before running the first byte through the matchfinder. - */ -static inline void -hc_matchfinder_init(struct hc_matchfinder *mf) +/* Return the number of bytes that must be allocated for a 'hc_matchfinder' that + * can work with buffers up to the specified size. */ +static inline size_t +hc_matchfinder_size(size_t max_bufsize) { - matchfinder_init(mf->hash_tab, HC_MATCHFINDER_HASH_LENGTH); + return sizeof(pos_t) * (HC_MATCHFINDER_HASH_LENGTH + max_bufsize); } -#if MATCHFINDER_IS_SLIDING +/* Prepare the matchfinder for a new input buffer. */ static inline void -hc_matchfinder_slide_window(struct hc_matchfinder *mf) +hc_matchfinder_init(struct hc_matchfinder *mf) { - matchfinder_rebase(mf->mf_data, HC_MATCHFINDER_TOTAL_LENGTH); + matchfinder_init(mf->hash_tab, HC_MATCHFINDER_HASH_LENGTH); } -#endif /* - * Find the longest match longer than 'best_len'. + * Find the longest match longer than 'best_len' bytes. * * @mf * The matchfinder structure. - * @in_base - * Pointer to the next byte in the input buffer to process _at the last - * time hc_matchfinder_init() or hc_matchfinder_slide_window() was called_. + * @in_begin + * Pointer to the beginning of the input buffer. * @in_next * Pointer to the next byte in the input buffer to process. This is the - * pointer to the bytes being matched against. + * pointer to the sequence being matched against. * @best_len - * Require a match at least this long. + * Require a match longer than this length. * @max_len - * Maximum match length to return. + * The maximum permissible match length at this position. * @nice_len * Stop searching if a match of at least this length is found. * @max_search_depth * Limit on the number of potential matches to consider. * @offset_ret - * The match offset is returned here. + * If a match is found, its offset is returned in this location. * * Return the length of the match found, or 'best_len' if no match longer than * 'best_len' was found. 
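To show how hc_matchfinder_longest_match() and hc_matchfinder_skip_positions() are meant to be driven, here is a minimal greedy loop. Again a sketch only: the toy_hc_parse() name, NICE_LEN, DEPTH, and the 257 length cap are invented for the example, malloc() is assumed adequate for MATCHFINDER_ALIGNMENT, and the exact declared type of the offset out-parameter is not visible in the quoted hunk (u32 is assumed).

#include <stdlib.h>

#define MATCHFINDER_MAX_WINDOW_ORDER 15
#include "wimlib/hc_matchfinder.h"

#define NICE_LEN  32                /* hypothetical parser settings */
#define DEPTH     16
#define MAX_LEN   257               /* e.g. the LZX maximum match length */

static void
toy_hc_parse(const u8 *in_begin, size_t in_size)
{
	struct hc_matchfinder *mf = malloc(hc_matchfinder_size(in_size));
	const u8 *in_next = in_begin;
	const u8 *in_end = in_begin + in_size;

	if (!mf)
		return;
	hc_matchfinder_init(mf);

	while (in_next < in_end) {
		unsigned max_len = in_end - in_next;
		u32 offset;     /* assumed type; see note above */
		unsigned len;

		if (max_len > MAX_LEN)
			max_len = MAX_LEN;

		/* best_len = 2: only report matches of length 3 or more. */
		len = hc_matchfinder_longest_match(mf, in_begin, in_next,
						   2, max_len, NICE_LEN,
						   DEPTH, &offset);
		if (len > 2) {
			/* ...emit the match (len, offset), then insert the
			 * other positions it covers without searching them. */
			hc_matchfinder_skip_positions(mf, in_begin,
						      in_next + 1, in_end,
						      len - 1);
			in_next += len;
		} else {
			/* ...emit *in_next as a literal. */
			in_next++;
		}
	}
	free(mf);
}

A lazy parser follows the same shape but makes a second longest_match() call at in_next + 1, seeded with the length just found as its 'best_len'; that is the two-call pattern alluded to in the Optimizations notes above.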
*/ static inline unsigned hc_matchfinder_longest_match(struct hc_matchfinder * const restrict mf, - const u8 * const in_base, + const u8 * const in_begin, const u8 * const in_next, unsigned best_len, const unsigned max_len, @@ -114,61 +167,55 @@ hc_matchfinder_longest_match(struct hc_matchfinder * const restrict mf, const u8 *best_matchptr = best_matchptr; /* uninitialized */ const u8 *matchptr; unsigned len; - unsigned hash; - pos_t cur_match; u32 first_3_bytes; + u32 hash; + pos_t cur_node; - /* Insert the current sequence into the appropriate hash chain. */ + /* Insert the current sequence into the appropriate linked list. */ if (unlikely(max_len < LZ_HASH_REQUIRED_NBYTES)) goto out; first_3_bytes = load_u24_unaligned(in_next); - hash = lz_hash_u24(first_3_bytes, HC_MATCHFINDER_HASH_ORDER); - cur_match = mf->hash_tab[hash]; - mf->next_tab[in_next - in_base] = cur_match; - mf->hash_tab[hash] = in_next - in_base; + hash = lz_hash(first_3_bytes, HC_MATCHFINDER_HASH_ORDER); + cur_node = mf->hash_tab[hash]; + mf->next_tab[in_next - in_begin] = cur_node; + mf->hash_tab[hash] = in_next - in_begin; if (unlikely(best_len >= max_len)) goto out; - /* Search the appropriate hash chain for matches. */ + /* Search the appropriate linked list for matches. */ - if (!(matchfinder_match_in_window(cur_match, in_base, in_next))) + if (!(matchfinder_node_valid(cur_node))) goto out; if (best_len < 3) { for (;;) { /* No length 3 match found yet. * Check the first 3 bytes. */ - matchptr = &in_base[cur_match]; + matchptr = &in_begin[cur_node]; if (load_u24_unaligned(matchptr) == first_3_bytes) break; - /* Not a match; keep trying. */ - cur_match = mf->next_tab[ - matchfinder_slot_for_match(cur_match)]; - if (!matchfinder_match_in_window(cur_match, - in_base, in_next)) - goto out; - if (!--depth_remaining) + /* The first 3 bytes did not match. Keep trying. */ + cur_node = mf->next_tab[cur_node]; + if (!matchfinder_node_valid(cur_node) || !--depth_remaining) goto out; } - /* Found a length 3 match. */ + /* Found a match of length >= 3. Extend it to its full length. */ best_matchptr = matchptr; best_len = lz_extend(in_next, best_matchptr, 3, max_len); if (best_len >= nice_len) goto out; - cur_match = mf->next_tab[matchfinder_slot_for_match(cur_match)]; - if (!matchfinder_match_in_window(cur_match, in_base, in_next)) - goto out; - if (!--depth_remaining) + cur_node = mf->next_tab[cur_node]; + if (!matchfinder_node_valid(cur_node) || !--depth_remaining) goto out; } for (;;) { for (;;) { - matchptr = &in_base[cur_match]; + matchptr = &in_begin[cur_node]; /* Already found a length 3 match. Try for a longer match; * start by checking the last 2 bytes and the first 4 bytes. 
*/ @@ -182,17 +229,16 @@ hc_matchfinder_longest_match(struct hc_matchfinder * const restrict mf, #endif break; - cur_match = mf->next_tab[matchfinder_slot_for_match(cur_match)]; - if (!matchfinder_match_in_window(cur_match, in_base, in_next)) - goto out; - if (!--depth_remaining) + cur_node = mf->next_tab[cur_node]; + if (!matchfinder_node_valid(cur_node) || !--depth_remaining) goto out; } - if (UNALIGNED_ACCESS_IS_FAST) - len = 4; - else - len = 0; + #if UNALIGNED_ACCESS_IS_FAST + len = 4; + #else + len = 0; + #endif len = lz_extend(in_next, matchptr, len, max_len); if (len > best_len) { best_len = len; @@ -200,10 +246,8 @@ hc_matchfinder_longest_match(struct hc_matchfinder * const restrict mf, if (best_len >= nice_len) goto out; } - cur_match = mf->next_tab[matchfinder_slot_for_match(cur_match)]; - if (!matchfinder_match_in_window(cur_match, in_base, in_next)) - goto out; - if (!--depth_remaining) + cur_node = mf->next_tab[cur_node]; + if (!matchfinder_node_valid(cur_node) || !--depth_remaining) goto out; } out: @@ -212,36 +256,35 @@ out: } /* - * Advance the match-finder, but don't search for matches. + * Advance the matchfinder, but don't search for matches. * * @mf * The matchfinder structure. - * @in_base - * Pointer to the next byte in the input buffer to process _at the last - * time hc_matchfinder_init() or hc_matchfinder_slide_window() was called_. + * @in_begin + * Pointer to the beginning of the input buffer. * @in_next * Pointer to the next byte in the input buffer to process. * @in_end * Pointer to the end of the input buffer. * @count - * Number of bytes to skip; must be > 0. + * The number of bytes to advance. Must be > 0. */ static inline void hc_matchfinder_skip_positions(struct hc_matchfinder * restrict mf, - const u8 *in_base, + const u8 *in_begin, const u8 *in_next, const u8 *in_end, unsigned count) { - unsigned hash; + u32 hash; if (unlikely(in_next + count >= in_end - LZ_HASH_REQUIRED_NBYTES)) return; do { - hash = lz_hash(in_next, HC_MATCHFINDER_HASH_ORDER); - mf->next_tab[in_next - in_base] = mf->hash_tab[hash]; - mf->hash_tab[hash] = in_next - in_base; + hash = lz_hash_3_bytes(in_next, HC_MATCHFINDER_HASH_ORDER); + mf->next_tab[in_next - in_begin] = mf->hash_tab[hash]; + mf->hash_tab[hash] = in_next - in_begin; in_next++; } while (--count); } diff --git a/include/wimlib/lz_extend.h b/include/wimlib/lz_extend.h index a3780e1d..bd00183b 100644 --- a/include/wimlib/lz_extend.h +++ b/include/wimlib/lz_extend.h @@ -3,6 +3,9 @@ * * Fast match extension for Lempel-Ziv matchfinding. * + * Author: Eric Biggers + * Year: 2014, 2015 + * * The author dedicates this file to the public domain. * You can do whatever you want with this file. */ diff --git a/include/wimlib/lz_hash3.h b/include/wimlib/lz_hash.h similarity index 76% rename from include/wimlib/lz_hash3.h rename to include/wimlib/lz_hash.h index 13e56db7..dbab5894 100644 --- a/include/wimlib/lz_hash3.h +++ b/include/wimlib/lz_hash.h @@ -1,14 +1,17 @@ /* - * lz_hash3.h + * lz_hash.h * - * 3-byte hashing for Lempel-Ziv matchfinding. + * Hashing for Lempel-Ziv matchfinding. + * + * Author: Eric Biggers + * Year: 2014, 2015 * * The author dedicates this file to the public domain. * You can do whatever you want with this file. 
*/ -#ifndef _WIMLIB_LZ_HASH3_H -#define _WIMLIB_LZ_HASH3_H +#ifndef _WIMLIB_LZ_HASH_H +#define _WIMLIB_LZ_HASH_H #include "wimlib/unaligned.h" @@ -34,7 +37,7 @@ load_u24_unaligned(const u8 *p) } static inline u32 -lz_hash_u24(u32 str, unsigned num_bits) +lz_hash(u32 str, unsigned num_bits) { return (u32)(str * LZ_HASH_MULTIPLIER) >> (32 - num_bits); } @@ -46,16 +49,13 @@ lz_hash_u24(u32 str, unsigned num_bits) * some architectures. */ static inline u32 -lz_hash(const u8 *p, unsigned num_bits) +lz_hash_3_bytes(const u8 *p, unsigned num_bits) { - return lz_hash_u24(load_u24_unaligned(p), num_bits); + return lz_hash(load_u24_unaligned(p), num_bits); } -/* The number of bytes being hashed. */ -#define LZ_HASH_NBYTES 3 - /* Number of bytes the hash function actually requires be available, due to the * possibility of an unaligned load. */ #define LZ_HASH_REQUIRED_NBYTES (UNALIGNED_ACCESS_IS_FAST ? 4 : 3) -#endif /* _WIMLIB_LZ_HASH3_H */ +#endif /* _WIMLIB_LZ_HASH_H */ diff --git a/include/wimlib/lz_mf.h b/include/wimlib/lz_mf.h index 9424e19d..699c13c6 100644 --- a/include/wimlib/lz_mf.h +++ b/include/wimlib/lz_mf.h @@ -61,6 +61,11 @@ * works best for your parsing strategy, and your typical data and block sizes. */ +/* + * TODO: this API is going to go away eventually. It has too much indirection + * and is not flexible enough. + */ + #ifndef _WIMLIB_LZ_MF_H #define _WIMLIB_LZ_MF_H @@ -95,52 +100,6 @@ struct lz_match { * Specifies a match-finding algorithm. */ enum lz_mf_algo { - - /* - * Use the default algorithm for the specified maximum window size. - */ - LZ_MF_DEFAULT = 0, - - /* - * "Null" algorithm that never reports any matches. - * - * This algorithm exists for comparison, benchmarking, and testing - * purposes only. It is not intended to be used in real compressors. - */ - LZ_MF_NULL = 1, - - /* - * Hash Chain match-finding algorithm. - * - * This works well on small windows. - * - * The memory usage is 4 bytes per position, plus 131072 bytes for a - * hash table. - * - * lz_mf_skip_positions() with this algorithm is very fast, so it's good - * if you're doing "greedy" rather than "optimal" parsing. However, if - * using large windows you might be better off with binary trees or - * suffix arrays, even if doing greedy parsing. - */ - LZ_MF_HASH_CHAINS = 3, - - /* - * Binary Tree match-finding algorithm. - * - * This works well on small to medium-sized windows. - * - * The memory usage is 8 bytes per position, plus 262144 bytes for a - * hash table. - * - * lz_mf_skip_positions() with this algorithm takes a significant amount - * of time, almost as much as a call to lz_mf_get_matches(). This makes - * this algorithm better suited for optimal parsing than for greedy - * parsing. However, if the window size becomes sufficiently large, - * this algorithm can outperform hash chains, even when using greedy - * parsing. - */ - LZ_MF_BINARY_TREES = 4, - /* * Longest Common Prefix Interval Tree match-finding algorithm. * @@ -149,15 +108,8 @@ enum lz_mf_algo { * currently limited to a maximum window size of 33554432 bytes. * * The memory usage is 12 bytes per position. - * - * Unlike the hash chain and binary tree algorithms, the LCP interval - * tree algorithm performs most of its work in lz_mf_load_window(). The - * calls to lz_mf_get_matches() and lz_mf_skip_positions() take - * relatively little time, and lz_mf_skip_positions() is not much faster - * than lz_mf_get_matches(). Therefore, if you're using this algorithm - * you might as well be doing "optimal" rather than "greedy" parsing. 
*/ - LZ_MF_LCP_INTERVAL_TREE = 5, + LZ_MF_LCP_INTERVAL_TREE, /* * Linked Suffix Array match-finding algorithm. @@ -170,7 +122,7 @@ enum lz_mf_algo { * interval tree algorithm. However, it can be used on windows * exceeding the 33554432 byte limit of the LCP interval tree algorithm. */ - LZ_MF_LINKED_SUFFIX_ARRAY = 6, + LZ_MF_LINKED_SUFFIX_ARRAY, }; /* Parameters for Lempel-Ziv match-finding. */ @@ -179,9 +131,6 @@ struct lz_mf_params { /* * The match-finding algorithm to use. This must be one of the 'enum * lz_mf_algo' constants defined above. - * - * If this is LZ_MF_DEFAULT, the default algorithm for the specified - * @max_window_size is used. */ u32 algorithm; @@ -237,15 +186,10 @@ struct lz_mf_params { u32 max_match_len; /* - * When using the hash chains or binary trees match-finding algorithm, - * this parameter defines the maximum number of search steps at each - * position. A typical value to use is 32. Higher values result in - * better matches and slower performance. - * - * The suffix array-based match-finding algorithms treat this parameter - * slightly differently because they find the longest matches first. - * They still honor the intent of the parameter but may scale it down to - * an appropriate value. + * This value describes the maximum amount of work that the + * match-finding algorithm will do at each position. A typical value to + * use is 32. Higher values result in better matches and slower + * performance. * * If this parameter is 0, the match-finding algorithm sets it to a * default value. @@ -253,11 +197,10 @@ struct lz_mf_params { u32 max_search_depth; /* - * When using the hash chains, binary trees, or LCP interval tree - * match-finding algorithm, this parameter defines the maximum match - * length to which the full algorithm will be applied. This can also be - * thought of as the length above which the algorithm will not try to - * search for additional matches. + * This parameter defines the maximum match length to which the full + * algorithm will be applied. This can also be thought of as the length + * above which the algorithm will not try to search for additional + * matches. * * Usually, setting this parameter to a reasonable value (such as 24, * 32, or 48) will speed up match-finding but will not hurt the diff --git a/include/wimlib/lz_repsearch.h b/include/wimlib/lz_repsearch.h index 0f031c12..b3ab80aa 100644 --- a/include/wimlib/lz_repsearch.h +++ b/include/wimlib/lz_repsearch.h @@ -3,6 +3,9 @@ * * Fast searching for repeat offset matches. * + * Author: Eric Biggers + * Year: 2014, 2015 + * * The author dedicates this file to the public domain. * You can do whatever you want with this file. */ diff --git a/include/wimlib/lzx_common.h b/include/wimlib/lzx_common.h index b3ae85a7..78c92ec1 100644 --- a/include/wimlib/lzx_common.h +++ b/include/wimlib/lzx_common.h @@ -7,64 +7,67 @@ #ifndef _LZX_COMMON_H #define _LZX_COMMON_H -#include "wimlib/assert.h" #include "wimlib/bitops.h" -#include "wimlib/compiler.h" #include "wimlib/lzx_constants.h" -#include "wimlib/util.h" #include "wimlib/types.h" //#define ENABLE_LZX_DEBUG #ifdef ENABLE_LZX_DEBUG -# define LZX_ASSERT wimlib_assert +# include "wimlib/assert.h" +# define LZX_ASSERT wimlib_assert #else -# define LZX_ASSERT(...) +# define LZX_ASSERT(...) 
#endif -extern const u32 lzx_offset_slot_base[LZX_MAX_OFFSET_SLOTS]; +extern const u32 lzx_offset_slot_base[LZX_MAX_OFFSET_SLOTS + 1]; -extern const u8 lzx_extra_offset_bits[LZX_MAX_OFFSET_SLOTS]; +extern const u8 lzx_extra_offset_bits[LZX_MAX_OFFSET_SLOTS + 1]; -/* Returns the LZX offset slot that corresponds to a given adjusted offset. +/* + * Return the offset slot for the specified match offset. + * + * This returns the smallest i such that: * - * Logically, this returns the smallest i such that - * adjusted_offset >= lzx_offset_slot_base[i]. + * offset + LZX_OFFSET_ADJUSTMENT >= lzx_offset_slot_base[i] * - * The actual implementation below takes advantage of the regularity of the - * numbers in the lzx_offset_slot_base array to calculate the slot directly from - * the adjusted offset without actually looking at the array. + * However, the actual implementation below takes advantage of the regularity of + * the offset slot bases to calculate the slot directly from the adjusted offset + * without actually looking at the array. */ static inline unsigned -lzx_get_offset_slot_raw(u32 adjusted_offset) +lzx_get_offset_slot(u32 offset) { + u32 adjusted_offset = offset + LZX_OFFSET_ADJUSTMENT; if (adjusted_offset >= 196608) { return (adjusted_offset >> 17) + 34; } else { - LZX_ASSERT(2 <= adjusted_offset && adjusted_offset < 655360); unsigned mssb_idx = fls32(adjusted_offset); return (mssb_idx << 1) | ((adjusted_offset >> (mssb_idx - 1)) & 1); } } -extern unsigned lzx_get_window_order(size_t max_block_size); - -extern unsigned lzx_get_num_main_syms(unsigned window_order); - -/* Least-recently used queue for match offsets. */ -struct lzx_lru_queue { - u32 R[LZX_NUM_RECENT_OFFSETS]; -} _aligned_attribute(sizeof(unsigned long)); +static inline unsigned +lzx_main_symbol_for_literal(unsigned literal) +{ + return literal; +} -/* Initialize the LZX least-recently-used match offset queue at the beginning of - * a new window for either decompression or compression. */ -static inline void -lzx_lru_queue_init(struct lzx_lru_queue *queue) +static inline unsigned +lzx_main_symbol_for_match(unsigned offset_slot, unsigned len_header) { - for (unsigned i = 0; i < LZX_NUM_RECENT_OFFSETS; i++) - queue->R[i] = 1; + return LZX_NUM_CHARS + (offset_slot * LZX_NUM_LEN_HEADERS) + len_header; } +extern unsigned +lzx_get_window_order(size_t max_bufsize); + +extern unsigned +lzx_get_num_offset_slots(unsigned window_order); + +extern unsigned +lzx_get_num_main_syms(unsigned window_order); + extern void lzx_do_e8_preprocessing(u8 *data, u32 size); diff --git a/include/wimlib/lzx_constants.h b/include/wimlib/lzx_constants.h index 49cf8faf..beeff436 100644 --- a/include/wimlib/lzx_constants.h +++ b/include/wimlib/lzx_constants.h @@ -7,38 +7,43 @@ #ifndef _LZX_CONSTANTS_H #define _LZX_CONSTANTS_H +/* Number of literal byte values. */ +#define LZX_NUM_CHARS 256 + /* The smallest and largest allowed match lengths. */ #define LZX_MIN_MATCH_LEN 2 #define LZX_MAX_MATCH_LEN 257 -/* Number of values an uncompressed literal byte can represent. */ -#define LZX_NUM_CHARS 256 +/* Number of distinct match lengths that can be represented. */ +#define LZX_NUM_LENS (LZX_MAX_MATCH_LEN - LZX_MIN_MATCH_LEN + 1) + +/* Number of match lengths for which no length symbol is required. */ +#define LZX_NUM_PRIMARY_LENS 7 +#define LZX_NUM_LEN_HEADERS (LZX_NUM_PRIMARY_LENS + 1) /* Valid values of the 3-bit block type field. 
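The mapping computed by lzx_get_offset_slot() above can also be stated as: the slot s for an offset satisfies lzx_offset_slot_base[s] <= offset + LZX_OFFSET_ADJUSTMENT < lzx_offset_slot_base[s + 1]. The sketch below checks the closed form against that property. It is illustrative only: it assumes lzx_offset_slot_base[] is linked in from lzx_common.c and that the extra table entry (the array is declared with LZX_MAX_OFFSET_SLOTS + 1 entries) bounds the last slot; the worked numbers follow the standard LZX slot layout.

#include <assert.h>
#include "wimlib/lzx_common.h"

/* Reference version of the mapping: a linear scan over the slot bases.
 * Useful only for checking the closed form above. */
static unsigned
lzx_get_offset_slot_ref(u32 offset)
{
	u32 adjusted_offset = offset + LZX_OFFSET_ADJUSTMENT;
	unsigned slot = 0;

	while (slot < LZX_MAX_OFFSET_SLOTS &&
	       lzx_offset_slot_base[slot + 1] <= adjusted_offset)
		slot++;
	return slot;
}

static void
check_offset_slots(void)
{
	/* Worked example: offset 100 gives adjusted offset 102.
	 * fls32(102) == 6, so the slot is (6 << 1) | ((102 >> 5) & 1) == 13;
	 * adjusted offsets 96..127 all map to slot 13. */
	assert(lzx_get_offset_slot(100) == 13);

	/* Exhaustive check over a range comfortably inside the window. */
	for (u32 offset = 1; offset <= 1000000; offset++)
		assert(lzx_get_offset_slot(offset) ==
		       lzx_get_offset_slot_ref(offset));
}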
*/ #define LZX_BLOCKTYPE_VERBATIM 1 #define LZX_BLOCKTYPE_ALIGNED 2 #define LZX_BLOCKTYPE_UNCOMPRESSED 3 -/* Maximum value of the "length header" portion of a main symbol. If the length - * header has this value, then the match length is at least LZX_NUM_PRIMARY_LENS - * + LZX_MIN_MATCH_LEN, and a length symbol follows. */ -#define LZX_NUM_PRIMARY_LENS 7 - -/* Maximum number of offset slots. The actual number of offset slots will - * depend on the window size. */ -#define LZX_MAX_OFFSET_SLOTS 51 - +/* 'LZX_MIN_WINDOW_SIZE' and 'LZX_MAX_WINDOW_SIZE' are the minimum and maximum + * sizes of the sliding window. */ #define LZX_MIN_WINDOW_ORDER 15 #define LZX_MAX_WINDOW_ORDER 21 -#define LZX_MIN_WINDOW_SIZE (1U << LZX_MIN_WINDOW_ORDER) /* 32768 */ -#define LZX_MAX_WINDOW_SIZE (1U << LZX_MAX_WINDOW_ORDER) /* 2097152 */ +#define LZX_MIN_WINDOW_SIZE (1UL << LZX_MIN_WINDOW_ORDER) /* 32768 */ +#define LZX_MAX_WINDOW_SIZE (1UL << LZX_MAX_WINDOW_ORDER) /* 2097152 */ + +/* Maximum number of offset slots. (The actual number of offset slots depends + * on the window size.) */ +#define LZX_MAX_OFFSET_SLOTS 50 -/* Maximum number of symbols in the main code. The actual number of symbols in - * the main code will depend on the window size. */ -#define LZX_MAINCODE_MAX_NUM_SYMBOLS (LZX_NUM_CHARS + (LZX_MAX_OFFSET_SLOTS << 3)) +/* Maximum number of symbols in the main code. (The actual number of symbols in + * the main code depends on the window size.) */ +#define LZX_MAINCODE_MAX_NUM_SYMBOLS \ + (LZX_NUM_CHARS + (LZX_MAX_OFFSET_SLOTS * LZX_NUM_LEN_HEADERS)) /* Number of symbols in the length code. */ -#define LZX_LENCODE_NUM_SYMBOLS 249 +#define LZX_LENCODE_NUM_SYMBOLS (LZX_NUM_LENS - LZX_NUM_PRIMARY_LENS) /* Number of symbols in the pre-code. */ #define LZX_PRECODE_NUM_SYMBOLS 20 @@ -46,8 +51,16 @@ /* Number of bits in which each pre-code codeword length is represented. */ #define LZX_PRECODE_ELEMENT_SIZE 4 +/* Number of low-order bits of each match offset that are entropy-encoded in + * aligned offset blocks. */ +#define LZX_NUM_ALIGNED_OFFSET_BITS 3 + /* Number of symbols in the aligned offset code. */ -#define LZX_ALIGNEDCODE_NUM_SYMBOLS 8 +#define LZX_ALIGNEDCODE_NUM_SYMBOLS (1 << LZX_NUM_ALIGNED_OFFSET_BITS) + +/* Mask for the match offset bits that are entropy-encoded in aligned offset + * blocks. */ +#define LZX_ALIGNED_OFFSET_BITMASK ((1 << LZX_NUM_ALIGNED_OFFSET_BITS) - 1) /* Number of bits in which each aligned offset codeword length is represented. */ #define LZX_ALIGNEDCODE_ELEMENT_SIZE 3 @@ -55,8 +68,8 @@ /* Maximum lengths (in bits) for length-limited Huffman code construction. */ #define LZX_MAX_MAIN_CODEWORD_LEN 16 #define LZX_MAX_LEN_CODEWORD_LEN 16 -#define LZX_MAX_PRE_CODEWORD_LEN 16 -#define LZX_MAX_ALIGNED_CODEWORD_LEN 8 +#define LZX_MAX_PRE_CODEWORD_LEN ((1 << LZX_PRECODE_ELEMENT_SIZE) - 1) +#define LZX_MAX_ALIGNED_CODEWORD_LEN ((1 << LZX_ALIGNEDCODE_ELEMENT_SIZE) - 1) /* For LZX-compressed blocks in WIM resources, this value is always used as the * filesize parameter for the call instruction (0xe8 byte) preprocessing, even @@ -72,7 +85,7 @@ /* Number of offsets in the recent (or "repeat") offsets queue. */ #define LZX_NUM_RECENT_OFFSETS 3 -/* An offset of n bytes is actually encoded as (n + LZX_OFFSET_OFFSET). */ -#define LZX_OFFSET_OFFSET (LZX_NUM_RECENT_OFFSETS - 1) +/* An offset of n bytes is actually encoded as (n + LZX_OFFSET_ADJUSTMENT). 
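The constants above determine how an LZX main symbol is packed, mirroring lzx_main_symbol_for_literal() and lzx_main_symbol_for_match() in lzx_common.h. As a quick illustration (not part of the patch), the inverse mapping is a simple divide and modulo; the helper name below is hypothetical.

#include "wimlib/lzx_constants.h"

/* Hypothetical helper: invert lzx_main_symbol_for_match() by splitting a main
 * symbol >= LZX_NUM_CHARS back into its parts. */
static inline void
lzx_split_main_symbol(unsigned mainsym,
		      unsigned *offset_slot_ret, unsigned *len_header_ret)
{
	unsigned adjusted = mainsym - LZX_NUM_CHARS;

	*offset_slot_ret = adjusted / LZX_NUM_LEN_HEADERS;
	*len_header_ret  = adjusted % LZX_NUM_LEN_HEADERS;
}

/* Worked example: a match of length 5 using offset slot 3 has length header
 * 5 - LZX_MIN_MATCH_LEN = 3, which is less than LZX_NUM_PRIMARY_LENS, so no
 * length symbol follows; the main symbol is 256 + 3 * 8 + 3 = 283.  Splitting
 * 283 gives back offset slot 3 and length header 3.  A length header equal to
 * LZX_NUM_PRIMARY_LENS (7) instead means the length is 9 or more and a symbol
 * from the 249-symbol length code follows. */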
*/ +#define LZX_OFFSET_ADJUSTMENT (LZX_NUM_RECENT_OFFSETS - 1) #endif /* _LZX_CONSTANTS_H */ diff --git a/include/wimlib/matchfinder_avx2.h b/include/wimlib/matchfinder_avx2.h index fe98b636..bdf10d21 100644 --- a/include/wimlib/matchfinder_avx2.h +++ b/include/wimlib/matchfinder_avx2.h @@ -2,6 +2,12 @@ * matchfinder_avx2.h * * Matchfinding routines optimized for Intel AVX2 (Advanced Vector Extensions). + * + * Author: Eric Biggers + * Year: 2014, 2015 + * + * The author dedicates this file to the public domain. + * You can do whatever you want with this file. */ #include @@ -16,9 +22,9 @@ matchfinder_init_avx2(pos_t *data, size_t size) return false; if (sizeof(pos_t) == 2) - v = _mm256_set1_epi16(MATCHFINDER_INITVAL); + v = _mm256_set1_epi16((u16)MATCHFINDER_NULL); else if (sizeof(pos_t) == 4) - v = _mm256_set1_epi32(MATCHFINDER_INITVAL); + v = _mm256_set1_epi32((u32)MATCHFINDER_NULL); else return false; @@ -33,32 +39,3 @@ matchfinder_init_avx2(pos_t *data, size_t size) } while (--n); return true; } - -static inline bool -matchfinder_rebase_avx2(pos_t *data, size_t size) -{ - __m256i v, *p; - size_t n; - - if ((size % sizeof(__m256i) * 4 != 0)) - return false; - - if (sizeof(pos_t) == 2) - v = _mm256_set1_epi16((pos_t)-MATCHFINDER_WINDOW_SIZE); - else if (sizeof(pos_t) == 4) - v = _mm256_set1_epi32((pos_t)-MATCHFINDER_WINDOW_SIZE); - else - return false; - - p = (__m256i *)data; - n = size / (sizeof(__m256i) * 4); - do { - /* PADDSW: Add Packed Signed Integers With Signed Saturation */ - p[0] = _mm256_adds_epi16(p[0], v); - p[1] = _mm256_adds_epi16(p[1], v); - p[2] = _mm256_adds_epi16(p[2], v); - p[3] = _mm256_adds_epi16(p[3], v); - p += 4; - } while (--n); - return true; -} diff --git a/include/wimlib/matchfinder_common.h b/include/wimlib/matchfinder_common.h index 66a966a3..15543edd 100644 --- a/include/wimlib/matchfinder_common.h +++ b/include/wimlib/matchfinder_common.h @@ -3,30 +3,11 @@ * * Common code for Lempel-Ziv matchfinding. * - * Copyright (c) 2014 Eric Biggers. All rights reserved. + * Author: Eric Biggers + * Year: 2014, 2015 * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR - * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, - * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR - * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF - * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * The author dedicates this file to the public domain. + * You can do whatever you want with this file. 
*/ #ifndef _MATCHFINDER_COMMON_H @@ -36,20 +17,42 @@ #include -#ifndef MATCHFINDER_WINDOW_ORDER -# error "MATCHFINDER_WINDOW_ORDER must be defined!" +#ifndef MATCHFINDER_MAX_WINDOW_ORDER +# error "MATCHFINDER_MAX_WINDOW_ORDER must be defined!" #endif -#ifndef MATCHFINDER_IS_SLIDING -# error "MATCHFINDER_IS_SLIDING must be defined!" +#if MATCHFINDER_MAX_WINDOW_ORDER <= 16 +typedef u16 pos_t; +#else +typedef u32 pos_t; #endif -#define MATCHFINDER_WINDOW_SIZE ((size_t)1 << MATCHFINDER_WINDOW_ORDER) +#if MATCHFINDER_MAX_WINDOW_ORDER != 16 && MATCHFINDER_MAX_WINDOW_ORDER != 32 + +/* Not all the bits of the position type are needed, so the sign bit can be + * reserved to mean "out of bounds". */ +#define MATCHFINDER_NULL ((pos_t)-1) + +static inline bool +matchfinder_node_valid(pos_t node) +{ + return !(node & ((pos_t)1 << (sizeof(pos_t) * 8 - 1))); +} -#if MATCHFINDER_IS_SLIDING -# include "matchfinder_sliding.h" #else -# include "matchfinder_nonsliding.h" + +/* All bits of the position type are needed, so use 0 to mean "out of bounds". + * This prevents the beginning of the buffer from matching anything; however, + * this doesn't matter much. */ + +#define MATCHFINDER_NULL ((pos_t)0) + +static inline bool +matchfinder_node_valid(pos_t node) +{ + return node != 0; +} + #endif #define MATCHFINDER_ALIGNMENT 8 @@ -86,7 +89,7 @@ static inline bool matchfinder_memset_init_okay(void) { /* All bytes must match in order to use memset. */ - const pos_t v = MATCHFINDER_INITVAL; + const pos_t v = MATCHFINDER_NULL; if (sizeof(pos_t) == 2) return (u8)v == (u8)(v >> 8); if (sizeof(pos_t) == 4) @@ -119,73 +122,12 @@ matchfinder_init(pos_t *data, size_t num_entries) #endif if (matchfinder_memset_init_okay()) { - memset(data, (u8)MATCHFINDER_INITVAL, size); + memset(data, (u8)MATCHFINDER_NULL, size); return; } for (size_t i = 0; i < num_entries; i++) - data[i] = MATCHFINDER_INITVAL; -} - -#if MATCHFINDER_IS_SLIDING -/* - * Slide the matchfinder by WINDOW_SIZE bytes. - * - * This must be called just after each WINDOW_SIZE bytes have been run through - * the matchfinder. - * - * This will subtract WINDOW_SIZE bytes from each entry in the array specified. - * The effect is that all entries are updated to be relative to the current - * position, rather than the position WINDOW_SIZE bytes prior. - * - * Underflow is detected and replaced with signed saturation. This ensures that - * once the sliding window has passed over a position, that position forever - * remains out of bounds. - * - * The array passed in must contain all matchfinder data that is - * position-relative. Concretely, this will include the hash table as well as - * the table of positions that is used to link together the sequences in each - * hash bucket. Note that in the latter table, the links are 1-ary in the case - * of "hash chains", and 2-ary in the case of "binary trees". In either case, - * the links need to be rebased in the same way. - */ -static inline void -matchfinder_rebase(pos_t *data, size_t num_entries) -{ - const size_t size = num_entries * sizeof(data[0]); - -#ifdef __AVX2__ - if (matchfinder_rebase_avx2(data, size)) - return; -#endif - -#ifdef __SSE2__ - if (matchfinder_rebase_sse2(data, size)) - return; -#endif - - if (MATCHFINDER_WINDOW_SIZE == 32768) { - /* Branchless version for 32768 byte windows. If the value was - * already negative, clear all bits except the sign bit; this - * changes the value to -32768. Otherwise, set the sign bit; - * this is equivalent to subtracting 32768. 
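To make the two "null position" conventions above concrete, the following check is a sketch assuming a build with MATCHFINDER_MAX_WINDOW_ORDER defined to 15 (so pos_t is 16 bits with a spare sign bit) and assuming the usual wimlib type headers are on the include path.

#include <assert.h>

#define MATCHFINDER_MAX_WINDOW_ORDER 15
#include "wimlib/matchfinder_common.h"

static void
check_null_convention(void)
{
	/* 15-bit max window order: pos_t is u16 and (pos_t)-1 marks "null". */
	assert(sizeof(pos_t) == 2);
	assert(MATCHFINDER_NULL == (pos_t)0xFFFF);
	assert(!matchfinder_node_valid(MATCHFINDER_NULL));

	/* Every position 0 .. 0x7FFF is representable and valid... */
	assert(matchfinder_node_valid(0));
	assert(matchfinder_node_valid(0x7FFF));

	/* ...whereas with MATCHFINDER_MAX_WINDOW_ORDER == 16, all 16 bits
	 * would be needed for positions, MATCHFINDER_NULL would be 0, and
	 * position 0 would simply never be matched. */
}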
*/ - for (size_t i = 0; i < num_entries; i++) { - u16 v = data[i]; - u16 sign_bit = v & 0x8000; - v &= sign_bit - ((sign_bit >> 15) ^ 1); - v |= 0x8000; - data[i] = v; - } - return; - } - - for (size_t i = 0; i < num_entries; i++) { - if (data[i] >= 0) - data[i] -= (pos_t)-MATCHFINDER_WINDOW_SIZE; - else - data[i] = (pos_t)-MATCHFINDER_WINDOW_SIZE; - } + data[i] = MATCHFINDER_NULL; } -#endif /* MATCHFINDER_IS_SLIDING */ #endif /* _MATCHFINDER_COMMON_H */ diff --git a/include/wimlib/matchfinder_nonsliding.h b/include/wimlib/matchfinder_nonsliding.h deleted file mode 100644 index e08f4614..00000000 --- a/include/wimlib/matchfinder_nonsliding.h +++ /dev/null @@ -1,47 +0,0 @@ -/* - * matchfinder_nonsliding.h - * - * Definitions for nonsliding window matchfinders. - * - * "Nonsliding window" means that any prior sequence can be matched. - */ - -#if MATCHFINDER_WINDOW_ORDER <= 16 -typedef u16 pos_t; -#else -typedef u32 pos_t; -#endif - -#if MATCHFINDER_WINDOW_ORDER != 16 && MATCHFINDER_WINDOW_ORDER != 32 - -/* Not all the bits of the position type are needed, so the sign bit can be - * reserved to mean "out of bounds". */ -#define MATCHFINDER_INITVAL ((pos_t)-1) - -static inline bool -matchfinder_match_in_window(pos_t cur_match, const u8 *in_base, const u8 *in_next) -{ - return !(cur_match & ((pos_t)1 << (sizeof(pos_t) * 8 - 1))); -} - -#else - -/* All bits of the position type are needed, so use 0 to mean "out of bounds". - * This prevents the beginning of the buffer from matching anything; however, - * this doesn't matter much. */ - -#define MATCHFINDER_INITVAL ((pos_t)0) - -static inline bool -matchfinder_match_in_window(pos_t cur_match, const u8 *in_base, const u8 *in_next) -{ - return cur_match != 0; -} - -#endif - -static inline pos_t -matchfinder_slot_for_match(pos_t cur_match) -{ - return cur_match; -} diff --git a/include/wimlib/matchfinder_sliding.h b/include/wimlib/matchfinder_sliding.h deleted file mode 100644 index 4b8a5159..00000000 --- a/include/wimlib/matchfinder_sliding.h +++ /dev/null @@ -1,30 +0,0 @@ -/* - * matchfinder_sliding.h - * - * Definitions for sliding window matchfinders. - * - * "Sliding window" means that only sequences beginning in the most recent - * MATCHFINDER_WINDOW_SIZE bytes can be matched. - */ - -#if MATCHFINDER_WINDOW_ORDER <= 15 -typedef s16 pos_t; -#else -typedef s32 pos_t; -#endif - -#define MATCHFINDER_INITVAL ((pos_t)-MATCHFINDER_WINDOW_SIZE) - -/* In the sliding window case, positions are stored relative to 'in_base'. */ - -static inline bool -matchfinder_match_in_window(pos_t cur_match, const u8 *in_base, const u8 *in_next) -{ - return cur_match > (pos_t)((in_next - in_base) - MATCHFINDER_WINDOW_SIZE); -} - -static inline pos_t -matchfinder_slot_for_match(pos_t cur_match) -{ - return cur_match & (MATCHFINDER_WINDOW_SIZE - 1); -} diff --git a/include/wimlib/matchfinder_sse2.h b/include/wimlib/matchfinder_sse2.h index cc276002..9b0b080b 100644 --- a/include/wimlib/matchfinder_sse2.h +++ b/include/wimlib/matchfinder_sse2.h @@ -2,6 +2,12 @@ * matchfinder_sse2.h * * Matchfinding routines optimized for Intel SSE2 (Streaming SIMD Extensions). + * + * Author: Eric Biggers + * Year: 2014, 2015 + * + * The author dedicates this file to the public domain. + * You can do whatever you want with this file. 
*/ #include @@ -16,9 +22,9 @@ matchfinder_init_sse2(pos_t *data, size_t size) return false; if (sizeof(pos_t) == 2) - v = _mm_set1_epi16(MATCHFINDER_INITVAL); + v = _mm_set1_epi16((u16)MATCHFINDER_NULL); else if (sizeof(pos_t) == 4) - v = _mm_set1_epi32(MATCHFINDER_INITVAL); + v = _mm_set1_epi32((u32)MATCHFINDER_NULL); else return false; @@ -33,32 +39,3 @@ matchfinder_init_sse2(pos_t *data, size_t size) } while (--n); return true; } - -static inline bool -matchfinder_rebase_sse2(pos_t *data, size_t size) -{ - __m128i v, *p; - size_t n; - - if ((size % sizeof(__m128i) * 4 != 0)) - return false; - - if (sizeof(pos_t) == 2) - v = _mm_set1_epi16((pos_t)-MATCHFINDER_WINDOW_SIZE); - else if (sizeof(pos_t) == 4) - v = _mm_set1_epi32((pos_t)-MATCHFINDER_WINDOW_SIZE); - else - return false; - - p = (__m128i *)data; - n = size / (sizeof(__m128i) * 4); - do { - /* PADDSW: Add Packed Signed Integers With Signed Saturation */ - p[0] = _mm_adds_epi16(p[0], v); - p[1] = _mm_adds_epi16(p[1], v); - p[2] = _mm_adds_epi16(p[2], v); - p[3] = _mm_adds_epi16(p[3], v); - p += 4; - } while (--n); - return true; -} diff --git a/include/wimlib/unaligned.h b/include/wimlib/unaligned.h index 5f920e75..9bd2a55f 100644 --- a/include/wimlib/unaligned.h +++ b/include/wimlib/unaligned.h @@ -3,6 +3,9 @@ * * Inline functions for unaligned memory accesses. * + * Author: Eric Biggers + * Year: 2014, 2015 + * * The author dedicates this file to the public domain. * You can do whatever you want with this file. */ diff --git a/src/lz_binary_trees.c b/src/lz_binary_trees.c deleted file mode 100644 index fa866b89..00000000 --- a/src/lz_binary_trees.c +++ /dev/null @@ -1,613 +0,0 @@ -/* - * lz_binary_trees.c - * - * Binary tree match-finder for Lempel-Ziv compression. - * - * Copyright (c) 2014 Eric Biggers. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR - * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, - * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR - * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF - * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -/* - * Note: the binary tree search/update algorithm is based on LzFind.c from - * 7-Zip, which was written by Igor Pavlov and released into the public domain. - */ - -#ifdef HAVE_CONFIG_H -# include "config.h" -#endif - -#include "wimlib/lz_extend.h" -#include "wimlib/lz_hash3.h" -#include "wimlib/lz_mf.h" -#include "wimlib/util.h" - -#include - -/* log2 of the number of buckets in the hash table. This can be changed. 
*/ -#define LZ_BT_HASH_ORDER 16 - -#define LZ_BT_HASH_LEN (1 << LZ_BT_HASH_ORDER) - -/* Number of entries in the digram table. - * - * Note: You rarely get length-2 matches if you use length-3 hashing. But - * since binary trees are typically used for higher compression ratios than hash - * chains, it is helpful for this match-finder to find length-2 matches as well. - * Therefore this match-finder also uses a digram table to find length-2 matches - * when the minimum match length is 2. */ -#define LZ_BT_DIGRAM_TAB_LEN (256 * 256) - -struct lz_bt { - struct lz_mf base; - u32 *hash_tab; - u32 *digram_tab; - u32 *child_tab; - u32 next_hash; - u16 next_digram; -}; - -static inline u32 -lz_bt_hash(const u8 *p) -{ - return lz_hash(p, LZ_BT_HASH_ORDER); -} - -static void -lz_bt_set_default_params(struct lz_mf_params *params) -{ - if (params->min_match_len == 0) - params->min_match_len = 2; - - if (params->max_match_len == 0) - params->max_match_len = UINT32_MAX; - - if (params->max_search_depth == 0) - params->max_search_depth = 50; - - if (params->nice_match_len == 0) - params->nice_match_len = 24; - - if (params->nice_match_len < params->min_match_len) - params->nice_match_len = params->min_match_len; - - if (params->nice_match_len > params->max_match_len) - params->nice_match_len = params->max_match_len; -} - -static bool -lz_bt_params_valid(const struct lz_mf_params *params) -{ - return true; -} - -static u64 -lz_bt_get_needed_memory(u32 max_window_size) -{ - u64 len = 0; - - len += LZ_BT_HASH_LEN; /* hash_tab */ - len += LZ_BT_DIGRAM_TAB_LEN; /* digram_tab */ - len += 2 * (u64)max_window_size; /* child_tab */ - - return len * sizeof(u32); -} - -static bool -lz_bt_init(struct lz_mf *_mf) -{ - struct lz_bt *mf = (struct lz_bt *)_mf; - struct lz_mf_params *params = &mf->base.params; - size_t len = 0; - - lz_bt_set_default_params(params); - - /* Allocate space for 'hash_tab', 'digram_tab', and 'child_tab'. */ - - len += LZ_BT_HASH_LEN; - if (params->min_match_len == 2) - len += LZ_BT_DIGRAM_TAB_LEN; - len += 2 * params->max_window_size; - - mf->hash_tab = MALLOC(len * sizeof(u32)); - if (!mf->hash_tab) - return false; - - if (params->min_match_len == 2) { - mf->digram_tab = mf->hash_tab + LZ_BT_HASH_LEN; - mf->child_tab = mf->digram_tab + LZ_BT_DIGRAM_TAB_LEN; - } else { - mf->child_tab = mf->hash_tab + LZ_BT_HASH_LEN; - } - - return true; -} - -static void -lz_bt_load_window(struct lz_mf *_mf, const u8 window[], u32 size) -{ - struct lz_bt *mf = (struct lz_bt *)_mf; - size_t clear_len; - - /* Clear hash_tab and digram_tab. - * Note: child_tab need not be cleared. */ - clear_len = LZ_BT_HASH_LEN; - if (mf->digram_tab) - clear_len += LZ_BT_DIGRAM_TAB_LEN; - memset(mf->hash_tab, 0, clear_len * sizeof(u32)); -} - -/* - * Search the binary tree of the current hash code for matches. At the same - * time, update this tree to add the current position in the window. - * - * @window - * The window being searched. - * @cur_pos - * The current position in the window. - * @child_tab - * Table of child pointers for the binary tree. The children of the node - * for position 'i' in the window are child_tab[i * 2] and child_tab[i*2 + - * 1]. Zero is reserved for the 'null' value (no child). Consequently, we - * don't recognize matches beginning at position 0. In fact, the node for - * position 0 in the window will not be used at all, which is just as well - * because we use 0-based indices which don't work for position 0. 
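
/*
 * Illustrative sketch (not part of the patch): the digram-table idea noted
 * above for finding length-2 matches. Every possible 2-byte value maps to the
 * most recent window position at which it occurred. (The real matchfinder
 * additionally skips length-2 matches that extend to length 3, since those
 * are found by the binary tree search.) Names are made up for the example.
 */
#include <stdint.h>

#define DEMO_DIGRAM_TAB_LEN (256 * 256)

/* Record 'cur_pos' as the latest occurrence of its leading digram, and return
 * the offset of a length-2 match against the previous occurrence, or 0 if
 * this digram has not been seen before. */
static uint32_t
digram_match_demo(const uint8_t *window, uint32_t cur_pos,
		  uint32_t digram_tab[DEMO_DIGRAM_TAB_LEN])
{
	uint16_t digram = (uint16_t)window[cur_pos] |
			  ((uint16_t)window[cur_pos + 1] << 8);
	uint32_t prev_pos = digram_tab[digram];	/* 0 means "not seen yet" */

	digram_tab[digram] = cur_pos;
	return prev_pos ? cur_pos - prev_pos : 0;
}
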
- * @cur_match - * The position in the window at which the binary tree for the current hash - * code is rooted. This can be 0, which indicates that the binary tree for - * the current hash code is empty. - * @min_len - * Ignore matches shorter than this length. This must be at least 1. - * @nice_len - * Stop searching if a match of this length or longer is found. This must - * be less than or equal to @max_len. - * @max_len - * Maximum length of matches to return. This can be longer than @nice_len, - * in which case a match of length @nice_len will still cause the search to - * be terminated, but the match will be extended up to @max_len bytes - * first. - * @max_search_depth - * Stop if we reach this depth in the binary tree. - * @matches - * The array in which to produce the matches. The matches will be produced - * in order of increasing length and increasing offset. No more than one - * match shall have any given length, nor shall any match be shorter than - * @min_len, nor shall any match be longer than @max_len, nor shall any two - * matches have the same offset. - * - * Returns a pointer to the next free slot in @matches. - */ -static struct lz_match * -do_search(const u8 window[restrict], - const u32 cur_pos, - u32 child_tab[restrict], - u32 cur_match, - const u32 min_len, - const u32 nice_len, - const u32 max_len, - const u32 max_search_depth, - struct lz_match *lz_matchptr) -{ - /* - * Here's my explanation of how this code actually works. Beware: this - * algorithm is a *lot* trickier than searching for matches via hash - * chains. But it can be significantly better, especially when doing - * "optimal" parsing, which is why it gets used, e.g. in LZMA as well as - * here. - * - * --------------------------------------------------------------------- - * - * Data structure - * - * Basically, there is not just one binary tree, but rather one binary - * tree per hash code. For a given hash code, the binary tree indexes - * previous positions in the window that have that same hash code. The - * key for each node is the "string", or byte sequence, beginning at the - * corresponding position in the window. - * - * Each tree maintains the invariant that if node C is a child of node - * P, then the window position represented by node C is smaller than - * ("left of") the window position represented by node P. Equivalently, - * while descending into a tree, the match distances ("offsets") from - * the current position are non-decreasing --- actually strictly - * increasing, because each node represents a unique position. - * - * In addition, not all previous positions sharing the same hash code - * will necessarily be represented in each binary tree; see the - * "Updating" section. - * - * --------------------------------------------------------------------- - * - * Searching - * - * Suppose we want to search for LZ77-style matches with the string - * beginning at the current window position and extending for @max_len - * bytes. To do this, we can search for this string in the binary tree - * for this string's hash code. Each node visited during the search is - * a potential match. This method will find the matches efficiently - * because they will converge on the current string, due to the nature - * of the binary search. 
- * - * Naively, when visiting a node that represents a match of length N, we - * must compare N + 1 bytes in order to determine the length of that - * match and the lexicographic ordering of that match relative to the - * current string (which determines whether we need to step left or - * right into the next level of the tree, as per the standard binary - * tree search algorithm). However, as an optimization, we need not - * explicitly examine the full length of the match at each node. To see - * that this is true, suppose that we examine a node during the search, - * and we find that the corresponding match is less (alt. greater) than - * the current string. Then, because of how binary tree search - * operates, the match must be lexicographically greater (alt. lesser) - * than any ancestor node that corresponded to a match lexicographically - * lesser (alt. greater) than the current string. Therefore, the match - * must be at least as long as the match for any such ancestor node. - * Therefore, the lengths of lexicographically-lesser (alt. greater) - * matches must be non-decreasing as they are encountered by the tree - * search. - * - * Using this observation, we can maintain two variables, 'best_lt_len' - * and 'best_gt_len', that represent the length of the longest - * lexicographically lesser and greater, respectively, match that has - * been examined so far. Then, when examining a new match, we need - * only start comparing at the index min(best_lt_len, best_gt_len) byte. - * Note that we cannot know beforehand whether the match will be - * lexicographically lesser or greater, hence the need for taking the - * minimum of these two lengths. - * - * As noted earlier, as we descend into the tree, the potential matches - * will have strictly increasing offsets. To make things faster for - * higher-level parsing / match-choosing code, we do not want to return - * a shorter match that has a larger offset than a longer match. This - * is because a longer match can always be truncated to a shorter match - * if needed, and smaller offsets usually (depending on the compression - * format) take fewer bits to encode than larger offsets. - * Consequently, we keep a potential match only if it is longer than the - * previous longest match that has been found. This has the added - * advantage of producing the array of matches sorted by strictly - * increasing lengths as well as strictly decreasing offsets. - * - * In degenerate cases, the binary tree might become severely - * unbalanced. To prevent excessive running times, we stop immediately - * (and return any matches that happen to have been found so far) if the - * current depth exceeds @max_search_depth. Note that this cutoff can - * occur before the longest match has been found, which is usually bad - * for the compression ratio. - * - * --------------------------------------------------------------------- - * - * Updating - * - * I've explained how to find matches by searching the binary tree of - * the current hash code. But how do we get the binary tree in the - * first place? Since the tree is built incrementally, the real - * question is how do we update the tree to "add" the current window - * position. - * - * The tree maintains the invariant that a node's parent always has a - * larger position (a.k.a. smaller match offset) than itself. - * Therefore, the root node must always have the largest position; and - * since the current position is larger than any previous position, the - * current position must become the root of the tree. 
- * - * A correct, but silly, approach is to simply add the previous root as - * a child of the new root, using either the left or right child pointer - * depending on the lexicographic ordering of the strings. This works, - * but it really just produces a linked list, so it's not sufficient. - * - * Instead, we can initially mark the new root's left child pointer as - * "pending (less than)" and its right child pointer as "pending - * (greater than)". Then, during the search, when we examine a match - * that is lexicographically less than the current string, we link the - * "pending (less than)" pointer to the node of that match, then set the - * right child pointer of *that* node as "pending (less than)". - * Similarly, when we examine a match that is lexicographically greater - * than the current string, we link the "pending (greater than)" pointer - * to the node of that match, then set the left child pointer of *that* - * node as "pending (greater than)". - * - * If the search terminates before the current string is found (up to a - * precision of @nice_len bytes), then we set "pending (less than)" and - * "pending (greater than)" to point to nothing. Alternatively, if the - * search terminates due to finding the current string (up to a - * precision of @nice_len bytes), then we set "pending (less than)" and - * "pending (greater than)" to point to the appropriate children of that - * match. - * - * Why does this work? Well, we can think of it this way: the "pending - * (less than)" pointer is reserved for the next match we find that is - * lexicographically *less than* the current string, and the "pending - * (greater than)" pointer is reserved for the next match we find that - * is lexicographically *greater than* the current string. This - * explains why when we find a match that is lexicographically less than - * the current string, we set the "pending (less than)" pointer to point - * to that match. And the reason we change "pending (less than)" to the - * right pointer of the match in that case is because we're walking down - * into that subtree, and the next match lexicographically *less than* - * the current string is guaranteed to be lexicographically *greater - * than* that match, so it should be set as the right subtree of that - * match. But the next match in that subtree that is lexicographically - * *greater than* the current string will need to be moved to the - * "pending (greater than)" pointer farther up the tree. - * - * It's complicated, but it should make sense if you think about it. - * The algorithm basically just moves subtrees into the correct - * locations as it walks down the tree for the search. But also, if the - * algorithm actually finds a match of length @nice_len with the current - * string, it no longer needs that match node and can discard it. The - * algorithm also will discard nodes if the search terminates due to the - * depth limit. For these reasons, the binary tree might not, in fact, - * contain all valid positions. 
- */ - - u32 best_lt_len = 0; - u32 best_gt_len = 0; - u32 best_len = min_len - 1; - u32 *pending_lt_ptr = &child_tab[cur_pos * 2 + 0]; - u32 *pending_gt_ptr = &child_tab[cur_pos * 2 + 1]; - const u8 * const strptr = &window[cur_pos]; - u32 depth_remaining = max_search_depth; - - for (;;) { - const u8 *matchptr; - u32 len; - - if (cur_match == 0 || depth_remaining-- == 0) { - *pending_lt_ptr = 0; - *pending_gt_ptr = 0; - return lz_matchptr; - } - - matchptr = &window[cur_match]; - len = min(best_lt_len, best_gt_len); - - if (matchptr[len] == strptr[len]) { - - len = lz_extend(strptr, matchptr, len + 1, max_len); - - if (len > best_len) { - best_len = len; - - *lz_matchptr++ = (struct lz_match) { - .len = len, - .offset = strptr - matchptr, - }; - - if (len >= nice_len) { - *pending_lt_ptr = child_tab[cur_match * 2 + 0]; - *pending_gt_ptr = child_tab[cur_match * 2 + 1]; - return lz_matchptr; - } - } - } - - if (matchptr[len] < strptr[len]) { - *pending_lt_ptr = cur_match; - pending_lt_ptr = &child_tab[cur_match * 2 + 1]; - cur_match = *pending_lt_ptr; - best_lt_len = len; - } else { - *pending_gt_ptr = cur_match; - pending_gt_ptr = &child_tab[cur_match * 2 + 0]; - cur_match = *pending_gt_ptr; - best_gt_len = len; - } - } -} - -static u32 -lz_bt_get_matches(struct lz_mf *_mf, struct lz_match matches[]) -{ - struct lz_bt *mf = (struct lz_bt *)_mf; - const u8 * const window = mf->base.cur_window; - const u32 cur_pos = mf->base.cur_window_pos++; - const u32 bytes_remaining = mf->base.cur_window_size - cur_pos; - u32 min_len; - const u32 max_len = min(bytes_remaining, mf->base.params.max_match_len); - const u32 nice_len = min(max_len, mf->base.params.nice_match_len); - u32 hash; - u32 cur_match; - struct lz_match *lz_matchptr = matches; - - if (unlikely(bytes_remaining < LZ_HASH_REQUIRED_NBYTES + 1)) - return 0; - - if (mf->digram_tab) { - /* Search the digram table for a length 2 match. */ - - const u16 digram = mf->next_digram; - mf->next_digram = load_u16_unaligned(&window[cur_pos + 1]); - prefetch(&mf->digram_tab[mf->next_digram]); - cur_match = mf->digram_tab[digram]; - mf->digram_tab[digram] = cur_pos; - - /* We're only interested in matches of length exactly 2, since - * those won't be found during the binary tree search. - * - * Note: it's possible to extend this match as much as possible, - * then use its length plus 1 as min_len for the binary tree - * search. However I found this actually *reduced* performance - * slightly, evidently because the binary tree still needs to be - * searched/updated starting from the root in either case. */ - if (cur_match != 0 && window[cur_match + 2] != window[cur_pos + 2]) { - *lz_matchptr++ = (struct lz_match) { - .len = 2, - .offset = cur_pos - cur_match, - }; - } - min_len = 3; - } else { - min_len = mf->base.params.min_match_len; - } - - hash = mf->next_hash; - mf->next_hash = lz_bt_hash(&window[cur_pos + 1]); - prefetch(&mf->hash_tab[mf->next_hash]); - cur_match = mf->hash_tab[hash]; - mf->hash_tab[hash] = cur_pos; - - /* Search the binary tree of 'hash' for matches while re-rooting it at - * the current position. */ - lz_matchptr = do_search(window, - cur_pos, - mf->child_tab, - cur_match, - min_len, - nice_len, - max_len, - mf->base.params.max_search_depth, - lz_matchptr); - - /* Return the number of matches found. */ - return lz_matchptr - matches; -} - -/* This is very similar to do_search(), but it does not save any matches. - * See do_search() for explanatory comments. 
*/ -static void -do_skip(const u8 window[restrict], - const u32 cur_pos, - u32 child_tab[restrict], - u32 cur_match, - const u32 nice_len, - const u32 max_search_depth) -{ - u32 best_lt_len = 0; - u32 best_gt_len = 0; - u32 *pending_lt_ptr = &child_tab[cur_pos * 2 + 0]; - u32 *pending_gt_ptr = &child_tab[cur_pos * 2 + 1]; - const u8 * const strptr = &window[cur_pos]; - u32 depth_remaining = max_search_depth; - for (;;) { - const u8 *matchptr; - u32 len; - - if (cur_match == 0 || depth_remaining-- == 0) { - *pending_lt_ptr = 0; - *pending_gt_ptr = 0; - return; - } - - matchptr = &window[cur_match]; - len = min(best_lt_len, best_gt_len); - - if (matchptr[len] == strptr[len]) { - len = lz_extend(strptr, matchptr, len + 1, nice_len); - if (len == nice_len) { - *pending_lt_ptr = child_tab[cur_match * 2 + 0]; - *pending_gt_ptr = child_tab[cur_match * 2 + 1]; - return; - } - } - if (matchptr[len] < strptr[len]) { - *pending_lt_ptr = cur_match; - pending_lt_ptr = &child_tab[cur_match * 2 + 1]; - cur_match = *pending_lt_ptr; - best_lt_len = len; - } else { - *pending_gt_ptr = cur_match; - pending_gt_ptr = &child_tab[cur_match * 2 + 0]; - cur_match = *pending_gt_ptr; - best_gt_len = len; - } - } -} - -static void -lz_bt_skip_positions(struct lz_mf *_mf, u32 n) -{ - struct lz_bt *mf = (struct lz_bt *)_mf; - const u8 * const window = mf->base.cur_window; - u32 cur_pos = mf->base.cur_window_pos; - u32 end_pos = cur_pos + n; - u32 bytes_remaining = mf->base.cur_window_size - cur_pos; - u32 hash; - u32 next_hash; - u32 cur_match; - u16 digram; - u16 next_digram; - - mf->base.cur_window_pos = end_pos; - - if (unlikely(bytes_remaining < n + (LZ_HASH_REQUIRED_NBYTES + 1) - 1)) { - /* Nearing end of window. */ - if (unlikely(bytes_remaining < (LZ_HASH_REQUIRED_NBYTES + 1))) - return; - - end_pos = cur_pos + bytes_remaining - (LZ_HASH_REQUIRED_NBYTES + 1) + 1; - } - - next_hash = mf->next_hash; - next_digram = mf->next_digram; - do { - if (mf->digram_tab) { - digram = next_digram; - next_digram = load_u16_unaligned(&window[cur_pos + 1]); - mf->digram_tab[digram] = cur_pos; - } - - hash = next_hash; - next_hash = lz_bt_hash(&window[cur_pos + 1]); - cur_match = mf->hash_tab[hash]; - mf->hash_tab[hash] = cur_pos; - - /* Update the binary tree for the appropriate hash code. */ - do_skip(window, - cur_pos, - mf->child_tab, - cur_match, - min(bytes_remaining, mf->base.params.nice_match_len), - mf->base.params.max_search_depth); - - bytes_remaining--; - } while (++cur_pos != end_pos); - - if (mf->digram_tab) { - prefetch(&mf->digram_tab[next_digram]); - mf->next_digram = next_digram; - } - - prefetch(&mf->hash_tab[next_hash]); - mf->next_hash = next_hash; -} - -static void -lz_bt_destroy(struct lz_mf *_mf) -{ - struct lz_bt *mf = (struct lz_bt *)_mf; - - FREE(mf->hash_tab); - /* mf->hash_tab shares storage with mf->digram_tab and mf->child_tab. */ -} - -const struct lz_mf_ops lz_binary_trees_ops = { - .params_valid = lz_bt_params_valid, - .get_needed_memory = lz_bt_get_needed_memory, - .init = lz_bt_init, - .load_window = lz_bt_load_window, - .get_matches = lz_bt_get_matches, - .skip_positions = lz_bt_skip_positions, - .destroy = lz_bt_destroy, - .struct_size = sizeof(struct lz_bt), -}; diff --git a/src/lz_hash_chains.c b/src/lz_hash_chains.c deleted file mode 100644 index c116f125..00000000 --- a/src/lz_hash_chains.c +++ /dev/null @@ -1,294 +0,0 @@ -/* - * lz_hash_chains.c - * - * Hash chain match-finder for Lempel-Ziv compression. - * - * Copyright (c) 2014 Eric Biggers. All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR - * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, - * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR - * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF - * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#ifdef HAVE_CONFIG_H -# include "config.h" -#endif - -#include "wimlib/lz_extend.h" -#include "wimlib/lz_hash3.h" -#include "wimlib/lz_mf.h" -#include "wimlib/util.h" - -#include - -/* log2 of the number of buckets in the hash table. This can be changed. */ -#define LZ_HC_HASH_ORDER 15 - -#define LZ_HC_HASH_LEN (1 << LZ_HC_HASH_ORDER) - -struct lz_hc { - struct lz_mf base; - u32 *hash_tab; /* followed by 'prev_tab' in memory */ - u32 next_hash; -}; - -static inline u32 -lz_hc_hash(const u8 *p) -{ - return lz_hash(p, LZ_HC_HASH_ORDER); -} - -static void -lz_hc_set_default_params(struct lz_mf_params *params) -{ - if (params->min_match_len < LZ_HASH_NBYTES) - params->min_match_len = LZ_HASH_NBYTES; - - if (params->max_match_len == 0) - params->max_match_len = UINT32_MAX; - - if (params->max_search_depth == 0) - params->max_search_depth = 50; - - if (params->nice_match_len == 0) - params->nice_match_len = 24; - - if (params->nice_match_len < params->min_match_len) - params->nice_match_len = params->min_match_len; - - if (params->nice_match_len > params->max_match_len) - params->nice_match_len = params->max_match_len; -} - -static bool -lz_hc_params_valid(const struct lz_mf_params *_params) -{ - struct lz_mf_params params = *_params; - - lz_hc_set_default_params(¶ms); - - return (params.min_match_len <= params.max_match_len); -} - -static u64 -lz_hc_get_needed_memory(u32 max_window_size) -{ - u64 len = 0; - - len += LZ_HC_HASH_LEN; - len += max_window_size; - - return len * sizeof(u32); -} - -static bool -lz_hc_init(struct lz_mf *_mf) -{ - struct lz_hc *mf = (struct lz_hc *)_mf; - - lz_hc_set_default_params(&mf->base.params); - - mf->hash_tab = MALLOC(lz_hc_get_needed_memory(mf->base.params.max_window_size)); - if (!mf->hash_tab) - return false; - - return true; -} - -static void -lz_hc_load_window(struct lz_mf *_mf, const u8 window[], u32 size) -{ - struct lz_hc *mf = (struct lz_hc *)_mf; - - memset(mf->hash_tab, 0, LZ_HC_HASH_LEN * sizeof(u32)); -} - -static u32 -lz_hc_get_matches(struct lz_mf *_mf, struct lz_match matches[]) -{ - struct lz_hc *mf = (struct lz_hc *)_mf; - const u8 * const window = mf->base.cur_window; - const u32 cur_pos = mf->base.cur_window_pos++; - const u8 * const strptr = 
&window[cur_pos]; - const u32 bytes_remaining = mf->base.cur_window_size - cur_pos; - u32 * const prev_tab = mf->hash_tab + LZ_HC_HASH_LEN; - const u32 max_len = min(bytes_remaining, mf->base.params.max_match_len); - const u32 nice_len = min(max_len, mf->base.params.nice_match_len); - u32 best_len = mf->base.params.min_match_len - 1; - u32 depth_remaining = mf->base.params.max_search_depth; - struct lz_match *lz_matchptr = matches; - u32 hash; - u32 cur_match; - u32 sequence; - - if (unlikely(bytes_remaining < LZ_HASH_REQUIRED_NBYTES + 1)) - return 0; - - /* Insert the current position into the appropriate hash chain and set - * 'cur_match' to the previous head. - * - * For a slight performance improvement, we do each hash calculation one - * position in advance and prefetch the necessary hash table entry. */ - - hash = mf->next_hash; - mf->next_hash = lz_hc_hash(strptr + 1); - prefetch(&mf->hash_tab[mf->next_hash]); - cur_match = mf->hash_tab[hash]; - mf->hash_tab[hash] = cur_pos; - prev_tab[cur_pos] = cur_match; - - /* Ensure we can find a match of at least the requested length. */ - if (unlikely(best_len >= max_len)) - return 0; - - if (UNALIGNED_ACCESS_IS_FAST) - sequence = load_u24_unaligned(strptr); - - /* Search the appropriate hash chain for matches. */ - for (; cur_match && depth_remaining--; cur_match = prev_tab[cur_match]) { - - const u8 * const matchptr = &window[cur_match]; - u32 len; - - /* Considering the potential match at 'matchptr': is it longer - * than 'best_len'? - * - * The bytes at index 'best_len' are the most likely to differ, - * so check them first. */ - if (matchptr[best_len] != strptr[best_len]) - goto next_match; - - if (UNALIGNED_ACCESS_IS_FAST) { - if (load_u24_unaligned(matchptr) != sequence) - goto next_match; - - len = lz_extend(strptr, matchptr, 3, max_len); - - if (len > best_len) { - best_len = len; - - *lz_matchptr++ = (struct lz_match) { - .len = best_len, - .offset = strptr - matchptr, - }; - - if (best_len >= nice_len) - break; - } - } else { - - /* The bytes at indices 'best_len - 1' and '0' are less - * important to check separately. But doing so still - * gives a slight performance improvement, at least on - * x86_64, probably because they create separate - * branches for the CPU to predict independently of the - * branches in the main comparison loops. - */ - if (matchptr[best_len - 1] != strptr[best_len - 1] || - matchptr[0] != strptr[0]) - goto next_match; - - for (len = 1; len < best_len - 1; len++) - if (matchptr[len] != strptr[len]) - goto next_match; - - /* The match is the longest found so far --- at least - * 'best_len' + 1 bytes. Continue extending it. */ - - if (++best_len != max_len && - strptr[best_len] == matchptr[best_len]) - while (++best_len != max_len) - if (strptr[best_len] != matchptr[best_len]) - break; - - /* Record the match. */ - *lz_matchptr++ = (struct lz_match) { - .len = best_len, - .offset = strptr - matchptr, - }; - - /* Terminate the search if 'nice_len' was reached. */ - if (best_len >= nice_len) - break; - } - - next_match: - /* Continue to next match in the chain. 
*/ - ; - } - - return lz_matchptr - matches; -} - -static void -lz_hc_skip_positions(struct lz_mf *_mf, u32 n) -{ - struct lz_hc *mf = (struct lz_hc *)_mf; - u32 * const hash_tab = mf->hash_tab; - u32 * const prev_tab = hash_tab + LZ_HC_HASH_LEN; - const u8 * const window = mf->base.cur_window; - u32 cur_pos = mf->base.cur_window_pos; - u32 end_pos = cur_pos + n; - const u32 bytes_remaining = mf->base.cur_window_size - cur_pos; - u32 hash; - u32 next_hash; - - mf->base.cur_window_pos = end_pos; - - if (unlikely(bytes_remaining < n + (LZ_HASH_REQUIRED_NBYTES + 1) - 1)) { - /* Nearing end of window. */ - if (unlikely(bytes_remaining < (LZ_HASH_REQUIRED_NBYTES + 1))) - return; - - end_pos = cur_pos + bytes_remaining - (LZ_HASH_REQUIRED_NBYTES + 1) + 1; - } - - next_hash = mf->next_hash; - do { - hash = next_hash; - next_hash = lz_hc_hash(&window[cur_pos + 1]); - prev_tab[cur_pos] = hash_tab[hash]; - hash_tab[hash] = cur_pos; - } while (++cur_pos != end_pos); - - prefetch(&hash_tab[next_hash]); - mf->next_hash = next_hash; -} - -static void -lz_hc_destroy(struct lz_mf *_mf) -{ - struct lz_hc *mf = (struct lz_hc *)_mf; - - FREE(mf->hash_tab); -} - -const struct lz_mf_ops lz_hash_chains_ops = { - .params_valid = lz_hc_params_valid, - .get_needed_memory = lz_hc_get_needed_memory, - .init = lz_hc_init, - .load_window = lz_hc_load_window, - .get_matches = lz_hc_get_matches, - .skip_positions = lz_hc_skip_positions, - .destroy = lz_hc_destroy, - .struct_size = sizeof(struct lz_hc), -}; diff --git a/src/lz_mf.c b/src/lz_mf.c index f9061827..ee7c80d8 100644 --- a/src/lz_mf.c +++ b/src/lz_mf.c @@ -39,33 +39,16 @@ /* Available match-finding algorithms. */ static const struct lz_mf_ops *mf_ops[] = { - [LZ_MF_NULL] = &lz_null_ops, - [LZ_MF_HASH_CHAINS] = &lz_hash_chains_ops, - [LZ_MF_BINARY_TREES] = &lz_binary_trees_ops, [LZ_MF_LCP_INTERVAL_TREE] = &lz_lcp_interval_tree_ops, [LZ_MF_LINKED_SUFFIX_ARRAY] = &lz_linked_suffix_array_ops, }; -/* - * Automatically select a match-finding algorithm to use, in the case that the - * user did not specify one. - */ static const struct lz_mf_ops * -select_mf_ops(enum lz_mf_algo algorithm, u32 max_window_size) +get_mf_ops(enum lz_mf_algo algorithm) { - if (algorithm == LZ_MF_DEFAULT) { - if (max_window_size <= 32768) - algorithm = LZ_MF_HASH_CHAINS; - else if (max_window_size <= 2097152) - algorithm = LZ_MF_BINARY_TREES; - else if (max_window_size <= 33554432) - algorithm = LZ_MF_LCP_INTERVAL_TREE; - else - algorithm = LZ_MF_LINKED_SUFFIX_ARRAY; - } - if ((int)algorithm < 0 || (int)algorithm >= ARRAY_LEN(mf_ops)) + if ((unsigned int)algorithm >= ARRAY_LEN(mf_ops)) return NULL; - return mf_ops[(int)algorithm]; + return mf_ops[(unsigned int)algorithm]; } /* @@ -83,7 +66,7 @@ lz_mf_get_needed_memory(enum lz_mf_algo algorithm, u32 max_window_size) { const struct lz_mf_ops *ops; - ops = select_mf_ops(algorithm, max_window_size); + ops = get_mf_ops(algorithm); if (!ops) return 0; return ops->struct_size + ops->get_needed_memory(max_window_size); @@ -97,8 +80,8 @@ lz_mf_params_valid(const struct lz_mf_params *params) { const struct lz_mf_ops *ops; - /* Require that a valid algorithm, or LZ_MF_DEFAULT, be specified. */ - ops = select_mf_ops(params->algorithm, params->max_window_size); + /* Require that a valid algorithm be specified. */ + ops = get_mf_ops(params->algorithm); if (!ops) return false; @@ -165,7 +148,7 @@ lz_mf_alloc(const struct lz_mf_params *params) /* Get the match-finder operations structure. 
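
/*
 * Illustrative sketch (not part of the patch): the core insertion step of the
 * hash-chain matchfinder removed above. hash_tab[] holds the most recent
 * position for each hash bucket, and prev_tab[] links every position to the
 * previous position that hashed to the same bucket, so walking prev_tab[]
 * from the returned head visits candidate matches from nearest to farthest.
 * Names are made up for the example.
 */
#include <stdint.h>

static uint32_t
hc_insert_demo(uint32_t hash_tab[], uint32_t prev_tab[],
	       uint32_t hash, uint32_t cur_pos)
{
	uint32_t head = hash_tab[hash];	/* most recent earlier position, or 0 */

	prev_tab[cur_pos] = head;	/* link the current position in */
	hash_tab[hash] = cur_pos;	/* it becomes the new chain head */
	return head;			/* start of the search chain */
}
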
Since we just validated * the parameters, this is guaranteed to return a valid structure. */ - ops = select_mf_ops(params->algorithm, params->max_window_size); + ops = get_mf_ops(params->algorithm); LZ_ASSERT(ops != NULL); /* Allocate memory for the match-finder structure. */ diff --git a/src/lz_null.c b/src/lz_null.c deleted file mode 100644 index 474ab13d..00000000 --- a/src/lz_null.c +++ /dev/null @@ -1,94 +0,0 @@ -/* - * lz_null.c - * - * Dummy "match-finder" for Lempel-Ziv compression. - * - * Copyright (c) 2014 Eric Biggers. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR - * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, - * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR - * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF - * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#ifdef HAVE_CONFIG_H -# include "config.h" -#endif - -#include "wimlib/lz_mf.h" - -static bool -lz_null_params_valid(const struct lz_mf_params *_params) -{ - return true; -} - -static u64 -lz_null_get_needed_memory(u32 max_window_size) -{ - return 0; -} - -static bool -lz_null_init(struct lz_mf *mf) -{ - if (mf->params.min_match_len == 0) - mf->params.min_match_len = 2; - - if (mf->params.max_match_len == 0) - mf->params.max_match_len = mf->params.max_window_size; - - return true; -} - -static void -lz_null_load_window(struct lz_mf *mf, const u8 window[], u32 size) -{ -} - -static u32 -lz_null_get_matches(struct lz_mf *mf, struct lz_match matches[]) -{ - mf->cur_window_pos++; - return 0; -} - -static void -lz_null_skip_positions(struct lz_mf *mf, u32 n) -{ - mf->cur_window_pos += n; -} - -static void -lz_null_destroy(struct lz_mf *mf) -{ -} - -const struct lz_mf_ops lz_null_ops = { - .params_valid = lz_null_params_valid, - .get_needed_memory = lz_null_get_needed_memory, - .init = lz_null_init, - .load_window = lz_null_load_window, - .get_matches = lz_null_get_matches, - .skip_positions = lz_null_skip_positions, - .destroy = lz_null_destroy, - .struct_size = sizeof(struct lz_mf), -}; diff --git a/src/lzms_compress.c b/src/lzms_compress.c index 096862e5..ded8c0bb 100644 --- a/src/lzms_compress.c +++ b/src/lzms_compress.c @@ -1422,9 +1422,7 @@ lzms_build_mf_params(const struct lzms_compressor_params *lzms_params, memset(mf_params, 0, sizeof(*mf_params)); /* Choose an appropriate match-finding algorithm. 
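
/*
 * Illustrative sketch (not part of the patch): with the automatic selection
 * removed from lz_mf.c, a caller now chooses an algorithm explicitly, as the
 * lzms_compress.c hunk below does. Only names visible in this patch are used;
 * the declarations are assumed to come from "wimlib/lz_mf.h", and the
 * remaining parameter fields are simply left zero so the match-finder applies
 * its own defaults.
 */
static struct lz_mf *
mf_alloc_demo(u32 max_window_size)
{
	struct lz_mf_params params = { 0 };

	params.algorithm = (max_window_size <= 33554432) ?
			   LZ_MF_LCP_INTERVAL_TREE : LZ_MF_LINKED_SUFFIX_ARRAY;
	params.max_window_size = max_window_size;

	if (!lz_mf_params_valid(&params))
		return NULL;
	return lz_mf_alloc(&params);
}
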
*/ - if (max_window_size <= 2097152) - mf_params->algorithm = LZ_MF_BINARY_TREES; - else if (max_window_size <= 33554432) + if (max_window_size <= 33554432) mf_params->algorithm = LZ_MF_LCP_INTERVAL_TREE; else mf_params->algorithm = LZ_MF_LINKED_SUFFIX_ARRAY; diff --git a/src/lzx_common.c b/src/lzx_common.c index bd2e097f..0993f607 100644 --- a/src/lzx_common.c +++ b/src/lzx_common.c @@ -3,7 +3,7 @@ */ /* - * Copyright (C) 2012, 2013, 2014 Eric Biggers + * Copyright (C) 2012, 2013, 2014, 2015 Eric Biggers * * This file is free software; you can redistribute it and/or modify it under * the terms of the GNU Lesser General Public License as published by the Free @@ -41,7 +41,7 @@ /* Mapping: offset slot => first match offset that uses that offset slot. */ -const u32 lzx_offset_slot_base[LZX_MAX_OFFSET_SLOTS] = { +const u32 lzx_offset_slot_base[LZX_MAX_OFFSET_SLOTS + 1] = { 0 , 1 , 2 , 3 , 4 , /* 0 --- 4 */ 6 , 8 , 12 , 16 , 24 , /* 5 --- 9 */ 32 , 48 , 64 , 96 , 128 , /* 10 --- 14 */ @@ -52,12 +52,12 @@ const u32 lzx_offset_slot_base[LZX_MAX_OFFSET_SLOTS] = { 196608 , 262144 , 393216 , 524288 , 655360 , /* 35 --- 39 */ 786432 , 917504 , 1048576, 1179648, 1310720, /* 40 --- 44 */ 1441792, 1572864, 1703936, 1835008, 1966080, /* 45 --- 49 */ - 2097152 /* 50 */ + 2097152 /* extra */ }; /* Mapping: offset slot => how many extra bits must be read and added to the * corresponding offset slot base to decode the match offset. */ -const u8 lzx_extra_offset_bits[LZX_MAX_OFFSET_SLOTS] = { +const u8 lzx_extra_offset_bits[LZX_MAX_OFFSET_SLOTS + 1] = { 0 , 0 , 0 , 0 , 1 , 1 , 2 , 2 , 3 , 3 , 4 , 4 , 5 , 5 , 6 , @@ -71,60 +71,45 @@ const u8 lzx_extra_offset_bits[LZX_MAX_OFFSET_SLOTS] = { 17 }; -/* Round the specified compression block size (not LZX block size) up to the - * next valid LZX window size, and return its order (log2). Or, if the block - * size is 0 or greater than the largest valid LZX window size, return 0. */ +/* Round the specified buffer size up to the next valid LZX window size, and + * return its order (log2). Or, if the buffer size is 0 or greater than the + * largest valid LZX window size, return 0. */ unsigned -lzx_get_window_order(size_t max_block_size) +lzx_get_window_order(size_t max_bufsize) { unsigned order; - if (max_block_size == 0 || max_block_size > LZX_MAX_WINDOW_SIZE) + if (max_bufsize == 0 || max_bufsize > LZX_MAX_WINDOW_SIZE) return 0; - order = fls32(max_block_size); + order = fls32(max_bufsize); - if (((u32)1 << order) != max_block_size) + if (((u32)1 << order) != max_bufsize) order++; return max(order, LZX_MIN_WINDOW_ORDER); } +unsigned +lzx_get_num_offset_slots(unsigned window_order) +{ + /* Note: one would expect that the maximum match offset would be + * 'window_size - LZX_MIN_MATCH_LEN', which would occur if the first two + * bytes were to match the last two bytes. However, the format + * disallows this case. This reduces the number of needed offset slots + * by 1. */ + u32 window_size = (u32)1 << window_order; + u32 max_offset = window_size - LZX_MIN_MATCH_LEN - 1; + return 1 + lzx_get_offset_slot(max_offset); +} + /* Given a valid LZX window order, return the number of symbols that will exist * in the main Huffman code. 
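
/*
 * Worked example (not part of the patch) for lzx_get_num_offset_slots()
 * above, assuming LZX_MIN_MATCH_LEN == 2: for window_order == 15 the window
 * is 32768 bytes, so max_offset = 32768 - 2 - 1 = 32765, which falls in
 * offset slot 29 (base 24576; the next base is 32768), giving 1 + 29 = 30
 * offset slots. With 8 length headers per offset slot, the main alphabet for
 * that window size works out to 256 + 30 * 8 = 496 symbols.
 */
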
*/ unsigned lzx_get_num_main_syms(unsigned window_order) { - u32 window_size = (u32)1 << window_order; - - /* NOTE: the calculation *should* be as follows: - * - * u32 max_offset = window_size - LZX_MIN_MATCH_LEN; - * u32 max_adjusted_offset = max_offset + LZX_OFFSET_OFFSET; - * u32 num_offset_slots = 1 + lzx_get_offset_slot_raw(max_adjusted_offset); - * - * However since LZX_MIN_MATCH_LEN == LZX_OFFSET_OFFSET, we would get - * max_adjusted_offset == window_size, which would bump the number of - * offset slots up by 1 since every valid LZX window size is equal to a - * offset slot base value. The format doesn't do this, and instead - * disallows matches with minimum length and maximum offset. This sets - * max_adjusted_offset = window_size - 1, so instead we must calculate: - * - * num_offset_slots = 1 + lzx_get_offset_slot_raw(window_size - 1); - * - * ... which is the same as - * - * num_offset_slots = lzx_get_offset_slot_raw(window_size); - * - * ... since every valid window size is equal to an offset base value. - */ - unsigned num_offset_slots = lzx_get_offset_slot_raw(window_size); - - /* Now calculate the number of main symbols as LZX_NUM_CHARS literal - * symbols, plus 8 symbols per offset slot (since there are 8 possible - * length headers, and we need all (offset slot, length header) - * combinations). */ - return LZX_NUM_CHARS + (num_offset_slots << 3); + return LZX_NUM_CHARS + (lzx_get_num_offset_slots(window_order) * + LZX_NUM_LEN_HEADERS); } static void diff --git a/src/lzx_compress.c b/src/lzx_compress.c index 6032f1e2..45bb019b 100644 --- a/src/lzx_compress.c +++ b/src/lzx_compress.c @@ -5,7 +5,7 @@ */ /* - * Copyright (C) 2012, 2013, 2014 Eric Biggers + * Copyright (C) 2012, 2013, 2014, 2015 Eric Biggers * * This file is free software; you can redistribute it and/or modify it under * the terms of the GNU Lesser General Public License as published by the Free @@ -33,8 +33,7 @@ * * This file may need some slight modifications to be used outside of the WIM * format. In particular, in other situations the LZX block header might be - * slightly different, and a sliding window rather than a fixed-size window - * might be required. + * slightly different, and sliding window support might be required. * * Note: LZX is a compression format derived from DEFLATE, the format used by * zlib and gzip. Both LZX and DEFLATE use LZ77 matching and Huffman coding. @@ -65,29 +64,79 @@ # include "config.h" #endif -#include "wimlib/compress_common.h" -#include "wimlib/compressor_ops.h" -#include "wimlib/endianness.h" -#include "wimlib/error.h" -#include "wimlib/lz_mf.h" -#include "wimlib/lz_repsearch.h" -#include "wimlib/lzx_common.h" -#include "wimlib/util.h" +/* + * Start a new LZX block (with new Huffman codes) after this many bytes. + * + * Note: actual block sizes may slightly exceed this value. + * + * TODO: recursive splitting and cost evaluation might be good for an extremely + * high compression mode, but otherwise it is almost always far too slow for how + * much it helps. Perhaps some sort of heuristic would be useful? + */ +#define LZX_DIV_BLOCK_SIZE 32768 -#include -#include +/* + * LZX_CACHE_PER_POS is the number of lz_match structures to reserve in the + * match cache for each byte position. This value should be high enough so that + * nearly the time, all matches found in a given block can fit in the match + * cache. However, fallback behavior on cache overflow is still required. 
+ */ +#define LZX_CACHE_PER_POS 6 -#define LZX_OPTIM_ARRAY_LENGTH 4096 +#define LZX_CACHE_LEN (LZX_DIV_BLOCK_SIZE * (LZX_CACHE_PER_POS + 1)) -#define LZX_DIV_BLOCK_SIZE 32768 +#define LZX_MAX_MATCHES_PER_POS LZX_NUM_LENS + +/* + * LZX_BIT_COST is a scaling factor that represents the cost to output one bit. + * THis makes it possible to consider fractional bit costs. + * + * Note: this is only useful as a statistical trick for when the true costs are + * unknown. In reality, each token in LZX requires a whole number of bits to + * output. + */ +#define LZX_BIT_COST 16 + +/* + * Consideration of aligned offset costs is disabled for now, due to + * insufficient benefit gained from the time spent. + */ +#define LZX_CONSIDER_ALIGNED_COSTS 0 + +/* + * The maximum compression level at which we use the faster algorithm. + */ +#define LZX_MAX_FAST_LEVEL 34 + +/* + * LZX_HASH2_ORDER is the log base 2 of the number of entries in the hash table + * for finding length 2 matches. This can be as high as 16 (in which case the + * hash function is trivial), but using a smaller hash table actually speeds up + * compression due to reduced cache pressure. + */ +#define LZX_HASH2_ORDER 12 +#define LZX_HASH2_LENGTH (1UL << LZX_HASH2_ORDER) -#define LZX_CACHE_PER_POS 8 +#include "wimlib/lzx_common.h" + +/* + * The maximum allowed window order for the matchfinder. + */ +#define MATCHFINDER_MAX_WINDOW_ORDER LZX_MAX_WINDOW_ORDER -#define LZX_MAX_MATCHES_PER_POS (LZX_MAX_MATCH_LEN - LZX_MIN_MATCH_LEN + 1) +#include -#define LZX_CACHE_LEN (LZX_DIV_BLOCK_SIZE * (LZX_CACHE_PER_POS + 1)) +#include "wimlib/bt_matchfinder.h" +#include "wimlib/compress_common.h" +#include "wimlib/compressor_ops.h" +#include "wimlib/endianness.h" +#include "wimlib/error.h" +#include "wimlib/hc_matchfinder.h" +#include "wimlib/lz_extend.h" +#include "wimlib/unaligned.h" +#include "wimlib/util.h" -struct lzx_compressor; +struct lzx_output_bitstream; /* Codewords for the LZX Huffman codes. */ struct lzx_codewords { @@ -104,11 +153,23 @@ struct lzx_lens { u8 aligned[LZX_ALIGNEDCODE_NUM_SYMBOLS]; }; -/* Estimated cost, in bits, to output each symbol in the LZX Huffman codes. */ +/* Cost model for near-optimal parsing */ struct lzx_costs { - u8 main[LZX_MAINCODE_MAX_NUM_SYMBOLS]; - u8 len[LZX_LENCODE_NUM_SYMBOLS]; - u8 aligned[LZX_ALIGNEDCODE_NUM_SYMBOLS]; + + /* 'match_cost[offset_slot][len - LZX_MIN_MATCH_LEN]' is the cost for a + * length 'len' match that has an offset belonging to 'offset_slot'. */ + u32 match_cost[LZX_MAX_OFFSET_SLOTS][LZX_NUM_LENS]; + + /* Cost for each symbol in the main code */ + u32 main[LZX_MAINCODE_MAX_NUM_SYMBOLS]; + + /* Cost for each symbol in the length code */ + u32 len[LZX_LENCODE_NUM_SYMBOLS]; + +#if LZX_CONSIDER_ALIGNED_COSTS + /* Cost for each symbol in the aligned code */ + u32 aligned[LZX_ALIGNEDCODE_NUM_SYMBOLS]; +#endif }; /* Codewords and lengths for the LZX Huffman codes. */ @@ -134,33 +195,24 @@ struct lzx_item { u64 data; }; -/* Internal compression parameters */ -struct lzx_compressor_params { - u32 (*choose_items_for_block)(struct lzx_compressor *, u32, u32); - u32 num_optim_passes; - enum lz_mf_algo mf_algo; - u32 min_match_length; - u32 nice_match_length; - u32 max_search_depth; -}; - /* - * Match chooser position data: + * This structure represents a byte position in the input buffer and a node in + * the graph of possible match/literal choices. * - * An array of these structures is used during the near-optimal match-choosing - * algorithm. 
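
/*
 * Illustrative sketch (not part of the patch): what the LZX_BIT_COST scaling
 * defined above means in practice. Costs are kept in 1/16-bit units so that
 * fractional estimates survive integer arithmetic. Names are made up for the
 * example.
 */
#include <assert.h>
#include <stdint.h>

static void
bit_cost_demo(void)
{
	const uint32_t BIT = 16;			/* LZX_BIT_COST */

	uint32_t literal_cost = (uint32_t)(5.5 * BIT);	/* 5.5 bits -> 88 */
	uint32_t extra_bits_cost = 3 * BIT;		/* 3 whole bits -> 48 */

	/* The summed path cost stays in scaled units: 8.5 bits == 136. */
	assert(literal_cost + extra_bits_cost == 136);
}
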
They correspond to consecutive positions in the window and are - * used to keep track of the cost to reach each position, and the match/literal - * choices that need to be chosen to reach that position. + * Logically, each incoming edge to this node is labeled with a literal or a + * match that can be taken to reach this position from an earlier position; and + * each outgoing edge from this node is labeled with a literal or a match that + * can be taken to advance from this position to a later position. */ -struct lzx_mc_pos_data { +struct lzx_optimum_node { /* The cost, in bits, of the lowest-cost path that has been found to * reach this position. This can change as progressively lower cost * paths are found to reach this position. */ u32 cost; -#define MC_INFINITE_COST UINT32_MAX - /* The match or literal that was taken to reach this position. This can + /* + * The match or literal that was taken to reach this position. This can * change as progressively lower cost paths are found to reach this * position. * @@ -175,105 +227,190 @@ struct lzx_mc_pos_data { * Repeat offset matches: * Low bits are the match length, high bits are the queue index. */ - u32 mc_item_data; -#define MC_OFFSET_SHIFT 9 -#define MC_LEN_MASK ((1 << MC_OFFSET_SHIFT) - 1) + u32 item; +#define OPTIMUM_OFFSET_SHIFT 9 +#define OPTIMUM_LEN_MASK ((1 << OPTIMUM_OFFSET_SHIFT) - 1) +} _aligned_attribute(8); - /* The state of the LZX recent match offsets queue at this position. - * This is filled in lazily, only after the minimum-cost path to this - * position is found. - * - * Note: the way we handle this adaptive state in the "minimum-cost" - * parse is actually only an approximation. It's possible for the - * globally optimal, minimum cost path to contain a prefix, ending at a - * position, where that path prefix is *not* the minimum cost path to - * that position. This can happen if such a path prefix results in a - * different adaptive state which results in lower costs later. We do - * not solve this problem; we only consider the lowest cost to reach - * each position, which seems to be an acceptable approximation. */ - struct lzx_lru_queue queue _aligned_attribute(16); - -} _aligned_attribute(16); - -/* State of the LZX compressor */ -struct lzx_compressor { +/* + * Least-recently-used queue for match offsets. + * + * This is represented as a 64-bit integer for efficiency. There are three + * offsets of 21 bits each. Bit 64 is garbage. + */ +struct lzx_lru_queue { + u64 R; +}; - /* Internal compression parameters */ - struct lzx_compressor_params params; +#define LZX_QUEUE64_OFFSET_SHIFT 21 +#define LZX_QUEUE64_OFFSET_MASK (((u64)1 << LZX_QUEUE64_OFFSET_SHIFT) - 1) - /* The preprocessed buffer of data being compressed */ - u8 *cur_window; +#define LZX_QUEUE64_R0_SHIFT (0 * LZX_QUEUE64_OFFSET_SHIFT) +#define LZX_QUEUE64_R1_SHIFT (1 * LZX_QUEUE64_OFFSET_SHIFT) +#define LZX_QUEUE64_R2_SHIFT (2 * LZX_QUEUE64_OFFSET_SHIFT) - /* Number of bytes of data to be compressed, which is the number of - * bytes of data in @cur_window that are actually valid. */ - u32 cur_window_size; +#define LZX_QUEUE64_R0_MASK (LZX_QUEUE64_OFFSET_MASK << LZX_QUEUE64_R0_SHIFT) +#define LZX_QUEUE64_R1_MASK (LZX_QUEUE64_OFFSET_MASK << LZX_QUEUE64_R1_SHIFT) +#define LZX_QUEUE64_R2_MASK (LZX_QUEUE64_OFFSET_MASK << LZX_QUEUE64_R2_SHIFT) - /* log2 order of the LZX window size for LZ match offset encoding - * purposes. Will be >= LZX_MIN_WINDOW_ORDER and <= - * LZX_MAX_WINDOW_ORDER. 
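
/*
 * Illustrative sketch (not part of the patch): packing and unpacking a
 * repeat-offset entry in the 'item' field described above, where the low 9
 * bits hold the match length and the high bits hold the repeat queue index.
 */
#include <assert.h>
#include <stdint.h>

static void
optimum_item_demo(void)
{
	const unsigned OFFSET_SHIFT = 9;		/* OPTIMUM_OFFSET_SHIFT */
	const uint32_t LEN_MASK = (1 << OFFSET_SHIFT) - 1;

	uint32_t item = (2u << OFFSET_SHIFT) | 17;	/* queue index 2, length 17 */

	assert((item & LEN_MASK) == 17);		/* match length */
	assert((item >> OFFSET_SHIFT) == 2);		/* repeat queue index */
}
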
- * - * Note: 1 << @window_order is normally equal to @max_window_size, - * a.k.a. the allocated size of @cur_window, but it will be greater than - * @max_window_size in the event that the compressor was created with a - * non-power-of-2 block size. (See lzx_get_window_order().) */ +static inline void +lzx_lru_queue_init(struct lzx_lru_queue *queue) +{ + queue->R = ((u64)1 << LZX_QUEUE64_R0_SHIFT) | + ((u64)1 << LZX_QUEUE64_R1_SHIFT) | + ((u64)1 << LZX_QUEUE64_R2_SHIFT); +} + +static inline u64 +lzx_lru_queue_R0(struct lzx_lru_queue queue) +{ + return (queue.R >> LZX_QUEUE64_R0_SHIFT) & LZX_QUEUE64_OFFSET_MASK; +} + +static inline u64 +lzx_lru_queue_R1(struct lzx_lru_queue queue) +{ + return (queue.R >> LZX_QUEUE64_R1_SHIFT) & LZX_QUEUE64_OFFSET_MASK; +} + +static inline u64 +lzx_lru_queue_R2(struct lzx_lru_queue queue) +{ + return (queue.R >> LZX_QUEUE64_R2_SHIFT) & LZX_QUEUE64_OFFSET_MASK; +} + +/* Push a match offset onto the front (most recently used) end of the queue. */ +static inline struct lzx_lru_queue +lzx_lru_queue_push(struct lzx_lru_queue queue, u32 offset) +{ + return (struct lzx_lru_queue) { + .R = (queue.R << LZX_QUEUE64_OFFSET_SHIFT) | offset, + }; +} + +/* Pop a match offset off the front (most recently used) end of the queue. */ +static inline u32 +lzx_lru_queue_pop(struct lzx_lru_queue *queue_p) +{ + u32 offset = queue_p->R & LZX_QUEUE64_OFFSET_MASK; + queue_p->R >>= LZX_QUEUE64_OFFSET_SHIFT; + return offset; +} + +/* Swap a match offset to the front of the queue. */ +static inline struct lzx_lru_queue +lzx_lru_queue_swap(struct lzx_lru_queue queue, unsigned idx) +{ + if (idx == 0) + return queue; + + if (idx == 1) + return (struct lzx_lru_queue) { + .R = (lzx_lru_queue_R1(queue) << LZX_QUEUE64_R0_SHIFT) | + (lzx_lru_queue_R0(queue) << LZX_QUEUE64_R1_SHIFT) | + (queue.R & LZX_QUEUE64_R2_MASK), + }; + + return (struct lzx_lru_queue) { + .R = (lzx_lru_queue_R2(queue) << LZX_QUEUE64_R0_SHIFT) | + (queue.R & LZX_QUEUE64_R1_MASK) | + (lzx_lru_queue_R0(queue) << LZX_QUEUE64_R2_SHIFT), + }; +} + +/* The main LZX compressor structure */ +struct lzx_compressor { + + /* The "nice" match length: if a match of this length is found, then + * choose it immediately without further consideration. */ + unsigned nice_match_length; + + /* The maximum search depth: consider at most this many potential + * matches at each position. */ + unsigned max_search_depth; + + /* The log base 2 of the LZX window size for LZ match offset encoding + * purposes. This will be >= LZX_MIN_WINDOW_ORDER and <= + * LZX_MAX_WINDOW_ORDER. */ unsigned window_order; - /* Number of symbols in the main alphabet. This depends on + /* The number of symbols in the main alphabet. This depends on * @window_order, since @window_order determines the maximum possible - * offset. It does not, however, depend on the *actual* size of the - * current data buffer being processed, which might be less than 1 << - * @window_order. */ + * offset. */ unsigned num_main_syms; - /* Lempel-Ziv match-finder */ - struct lz_mf *mf; + /* Number of optimization passes per block */ + unsigned num_optim_passes; - /* Match-finder wrapper functions and data for near-optimal parsing. - * - * When doing more than one match-choosing pass over the data, matches - * found by the match-finder are cached to achieve a slight speedup when - * the same matches are needed on subsequent passes. This is suboptimal - * because different matches may be preferred with different cost - * models, but it is a very worthwhile speedup. 
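
/*
 * Illustrative sketch (not part of the patch): how the packed lzx_lru_queue
 * helpers above behave. Each push shifts the three 21-bit offsets down one
 * slot, and a swap moves an older offset back to the front.
 */
static void
lru_queue_demo(void)
{
	struct lzx_lru_queue q;

	lzx_lru_queue_init(&q);			/* R0 == 1,   R1 == 1,   R2 == 1 */

	q = lzx_lru_queue_push(q, 100);		/* R0 == 100, R1 == 1,   R2 == 1 */
	q = lzx_lru_queue_push(q, 7);		/* R0 == 7,   R1 == 100, R2 == 1 */

	q = lzx_lru_queue_swap(q, 2);		/* R0 == 1,   R1 == 100, R2 == 7 */

	/* lzx_lru_queue_pop(&q) would now return 1 and leave R0 == 100. */
}
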
*/ - unsigned (*get_matches_func)(struct lzx_compressor *, const struct lz_match **); - void (*skip_bytes_func)(struct lzx_compressor *, unsigned n); - u32 match_window_pos; - u32 match_window_end; - struct lz_match *cached_matches; - struct lz_match *cache_ptr; - struct lz_match *cache_limit; - - /* Position data for near-optimal parsing. */ - struct lzx_mc_pos_data optimum[LZX_OPTIM_ARRAY_LENGTH + LZX_MAX_MATCH_LEN]; - - /* The cost model currently being used for near-optimal parsing. */ - struct lzx_costs costs; - - /* The current match offset LRU queue. */ - struct lzx_lru_queue queue; + /* The preprocessed buffer of data being compressed */ + u8 *in_buffer; - /* Frequency counters for the current block. */ - struct lzx_freqs freqs; + /* The number of bytes of data to be compressed, which is the number of + * bytes of data in @in_buffer that are actually valid. */ + size_t in_nbytes; - /* The Huffman codes for the current and previous blocks. */ - struct lzx_codes codes[2]; + /* Pointer to the compress() implementation chosen at allocation time */ + void (*impl)(struct lzx_compressor *, struct lzx_output_bitstream *); - /* Which 'struct lzx_codes' is being used for the current block. The - * other was used for the previous block (if this isn't the first - * block). */ - unsigned int codes_index; + /* The Huffman symbol frequency counters for the current block. */ + struct lzx_freqs freqs; - /* Dummy lengths that are always 0. */ - struct lzx_lens zero_lens; + /* The Huffman codes for the current and previous blocks. The one with + * index 'codes_index' is for the current block, and the other one is + * for the previous block. */ + struct lzx_codes codes[2]; + unsigned codes_index; - /* Matches/literals that were chosen for the current block. */ - struct lzx_item chosen_items[LZX_DIV_BLOCK_SIZE]; + /* The match/literal sequence the algorithm chose for the current block. + */ + struct lzx_item chosen_items[LZX_DIV_BLOCK_SIZE + LZX_MAX_MATCH_LEN + 1]; /* Table mapping match offset => offset slot for small offsets */ #define LZX_NUM_FAST_OFFSETS 32768 u8 offset_slot_fast[LZX_NUM_FAST_OFFSETS]; + + union { + /* Data for greedy or lazy parsing */ + struct { + /* Hash chains matchfinder (MUST BE LAST!!!) */ + struct hc_matchfinder hc_mf; + }; + + /* Data for near-optimal parsing */ + struct { + /* The graph nodes for the current block */ + struct lzx_optimum_node optimum_nodes[LZX_DIV_BLOCK_SIZE + + LZX_MAX_MATCH_LEN + 1]; + + /* The cost model for the current block */ + struct lzx_costs costs; + + /* Cached matches for the current block */ + struct lz_match match_cache[LZX_CACHE_LEN + 1 + + LZX_MAX_MATCHES_PER_POS]; + struct lz_match *cache_overflow_mark; + + /* Hash table for finding length 2 matches */ + pos_t hash2_tab[LZX_HASH2_LENGTH] + _aligned_attribute(MATCHFINDER_ALIGNMENT); + + /* Binary trees matchfinder (MUST BE LAST!!!) */ + struct bt_matchfinder bt_mf; + }; + }; }; +/* Compute a hash value for the next 2 bytes of uncompressed data. */ +static inline u32 +lz_hash_2_bytes(const u8 *in_next) +{ + u16 next_2_bytes = load_u16_unaligned(in_next); + if (LZX_HASH2_ORDER == 16) + return next_2_bytes; + else + return lz_hash(next_2_bytes, LZX_HASH2_ORDER); +} + /* * Structure to keep track of the current state of sending bits to the * compressed output buffer. @@ -310,7 +447,7 @@ struct lzx_output_bitstream { * Size of @buffer, in bytes. 
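
/*
 * Illustrative sketch (not part of the patch): one plausible way the hash2
 * table and lz_hash_2_bytes() above can report a length-2 match during
 * near-optimal parsing. The hash is lossy when LZX_HASH2_ORDER < 16, so the
 * two bytes are verified before the match is used. The local names here are
 * made up for the example.
 */
static u32
hash2_match_demo(struct lzx_compressor *c, const u8 *in_next, u32 cur_pos)
{
	u32 hash2 = lz_hash_2_bytes(in_next);
	u32 prev_pos = c->hash2_tab[hash2];	/* last position with this hash */

	c->hash2_tab[hash2] = cur_pos;

	if (prev_pos != 0 &&
	    load_u16_unaligned(&c->in_buffer[prev_pos]) ==
	    load_u16_unaligned(in_next))
		return cur_pos - prev_pos;	/* offset of a length-2 match */
	return 0;				/* no usable length-2 match */
}
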
*/ static void -lzx_init_output(struct lzx_output_bitstream *os, void *buffer, u32 size) +lzx_init_output(struct lzx_output_bitstream *os, void *buffer, size_t size) { os->bitbuf = 0; os->bitcount = 0; @@ -335,8 +472,8 @@ lzx_init_output(struct lzx_output_bitstream *os, void *buffer, u32 size) */ static inline void lzx_write_varbits(struct lzx_output_bitstream *os, - const u32 bits, const unsigned int num_bits, - const unsigned int max_num_bits) + const u32 bits, const unsigned num_bits, + const unsigned max_num_bits) { /* This code is optimized for LZX, which never needs to write more than * 17 bits at once. */ @@ -375,7 +512,7 @@ lzx_write_varbits(struct lzx_output_bitstream *os, * lzx_write_varbits(). */ static inline void lzx_write_bits(struct lzx_output_bitstream *os, - const u32 bits, const unsigned int num_bits) + const u32 bits, const unsigned num_bits) { lzx_write_varbits(os, bits, num_bits, num_bits); } @@ -401,10 +538,12 @@ lzx_flush_output(struct lzx_output_bitstream *os) * This takes as input the frequency tables for each code and produces as output * a set of tables that map symbols to codewords and codeword lengths. */ static void -lzx_make_huffman_codes(const struct lzx_freqs *freqs, struct lzx_codes *codes, - unsigned num_main_syms) +lzx_make_huffman_codes(struct lzx_compressor *c) { - make_canonical_huffman_code(num_main_syms, + const struct lzx_freqs *freqs = &c->freqs; + struct lzx_codes *codes = &c->codes[c->codes_index]; + + make_canonical_huffman_code(c->num_main_syms, LZX_MAX_MAIN_CODEWORD_LEN, freqs->main, codes->lens.main, @@ -423,6 +562,13 @@ lzx_make_huffman_codes(const struct lzx_freqs *freqs, struct lzx_codes *codes, codes->codewords.aligned); } +/* Reset the symbol frequencies for the LZX Huffman codes. */ +static void +lzx_reset_symbol_frequencies(struct lzx_compressor *c) +{ + memset(&c->freqs, 0, sizeof(c->freqs)); +} + static unsigned lzx_compute_precode_items(const u8 lens[restrict], const u8 prev_lens[restrict], @@ -634,17 +780,19 @@ lzx_write_item(struct lzx_output_bitstream *os, struct lzx_item item, extra_bits = data >> 23; - /*if (block_type == LZX_BLOCKTYPE_ALIGNED && num_extra_bits >= 3) {*/ - if ((num_extra_bits & ones_if_aligned) >= 3) { + if ((num_extra_bits & ones_if_aligned) >= LZX_NUM_ALIGNED_OFFSET_BITS) { /* Aligned offset blocks: The low 3 bits of the extra offset * bits are Huffman-encoded using the aligned offset code. The * remaining bits are output literally. */ - lzx_write_varbits(os, extra_bits >> 3, num_extra_bits - 3, 14); + lzx_write_varbits(os, extra_bits >> LZX_NUM_ALIGNED_OFFSET_BITS, + num_extra_bits - LZX_NUM_ALIGNED_OFFSET_BITS, + 17 - LZX_NUM_ALIGNED_OFFSET_BITS); - lzx_write_varbits(os, codes->codewords.aligned[extra_bits & 7], - codes->lens.aligned[extra_bits & 7], + lzx_write_varbits(os, + codes->codewords.aligned[extra_bits & LZX_ALIGNED_OFFSET_BITMASK], + codes->lens.aligned[extra_bits & LZX_ALIGNED_OFFSET_BITMASK], LZX_MAX_ALIGNED_CODEWORD_LEN); } else { /* Verbatim blocks, or fewer than 3 extra bits: All extra @@ -682,13 +830,12 @@ lzx_write_items(struct lzx_output_bitstream *os, int block_type, lzx_write_item(os, items[i], ones_if_aligned, codes); } -/* Write an LZX aligned offset or verbatim block to the output bitstream. 
*/ static void lzx_write_compressed_block(int block_type, u32 block_size, unsigned window_order, unsigned num_main_syms, - struct lzx_item * chosen_items, + const struct lzx_item chosen_items[], u32 num_chosen_items, const struct lzx_codes * codes, const struct lzx_lens * prev_lens, @@ -752,223 +899,70 @@ lzx_write_compressed_block(int block_type, lzx_write_items(os, block_type, chosen_items, num_chosen_items, codes); } -/* Don't allow matches to span the end of an LZX block. */ -static inline unsigned -maybe_truncate_matches(struct lz_match matches[], unsigned num_matches, - struct lzx_compressor *c) -{ - if (c->match_window_end < c->cur_window_size && num_matches != 0) { - u32 limit = c->match_window_end - c->match_window_pos; - - if (limit >= LZX_MIN_MATCH_LEN) { - - unsigned i = num_matches - 1; - do { - if (matches[i].len >= limit) { - matches[i].len = limit; - - /* Truncation might produce multiple - * matches with length 'limit'. Keep at - * most 1. */ - num_matches = i + 1; - } - } while (i--); - } else { - num_matches = 0; - } - } - return num_matches; -} - -static unsigned -lzx_get_matches_fillcache_singleblock(struct lzx_compressor *c, - const struct lz_match **matches_ret) -{ - struct lz_match *cache_ptr; - struct lz_match *matches; - unsigned num_matches; - - cache_ptr = c->cache_ptr; - matches = cache_ptr + 1; - if (likely(cache_ptr <= c->cache_limit)) { - num_matches = lz_mf_get_matches(c->mf, matches); - cache_ptr->len = num_matches; - c->cache_ptr = matches + num_matches; - } else { - num_matches = 0; - } - c->match_window_pos++; - *matches_ret = matches; - return num_matches; -} - -static unsigned -lzx_get_matches_fillcache_multiblock(struct lzx_compressor *c, - const struct lz_match **matches_ret) +/* Given the frequencies of symbols in an LZX-compressed block and the + * corresponding Huffman codes, return LZX_BLOCKTYPE_ALIGNED or + * LZX_BLOCKTYPE_VERBATIM if an aligned offset or verbatim block, respectively, + * will take fewer bits to output. */ +static int +lzx_choose_verbatim_or_aligned(const struct lzx_freqs * freqs, + const struct lzx_codes * codes) { - struct lz_match *cache_ptr; - struct lz_match *matches; - unsigned num_matches; - - cache_ptr = c->cache_ptr; - matches = cache_ptr + 1; - if (likely(cache_ptr <= c->cache_limit)) { - num_matches = lz_mf_get_matches(c->mf, matches); - num_matches = maybe_truncate_matches(matches, num_matches, c); - cache_ptr->len = num_matches; - c->cache_ptr = matches + num_matches; - } else { - num_matches = 0; - } - c->match_window_pos++; - *matches_ret = matches; - return num_matches; -} + u32 aligned_cost = 0; + u32 verbatim_cost = 0; -static unsigned -lzx_get_matches_usecache(struct lzx_compressor *c, - const struct lz_match **matches_ret) -{ - struct lz_match *cache_ptr; - struct lz_match *matches; - unsigned num_matches; - - cache_ptr = c->cache_ptr; - matches = cache_ptr + 1; - if (cache_ptr <= c->cache_limit) { - num_matches = cache_ptr->len; - c->cache_ptr = matches + num_matches; - } else { - num_matches = 0; + /* A verbatim block requires 3 bits in each place that an aligned symbol + * would be used in an aligned offset block. 
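+ *
+ * As a rough worked example (illustrative only, assuming the usual LZX
+ * parameters of 8 aligned-offset symbols whose code lengths are themselves
+ * sent as 3-bit values): if a block contains 100 aligned-symbol occurrences
+ * and the aligned Huffman code would give each of them a 2-bit codeword, then
+ *
+ *     verbatim_cost = 3 * 100          = 300 bits
+ *     aligned_cost  = 2 * 100 + 3 * 8  = 224 bits
+ *
+ * so the aligned block type would be chosen.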
*/ + for (unsigned i = 0; i < LZX_ALIGNEDCODE_NUM_SYMBOLS; i++) { + verbatim_cost += LZX_NUM_ALIGNED_OFFSET_BITS * freqs->aligned[i]; + aligned_cost += codes->lens.aligned[i] * freqs->aligned[i]; } - c->match_window_pos++; - *matches_ret = matches; - return num_matches; -} - -static unsigned -lzx_get_matches_usecache_nocheck(struct lzx_compressor *c, - const struct lz_match **matches_ret) -{ - struct lz_match *cache_ptr; - struct lz_match *matches; - unsigned num_matches; - - cache_ptr = c->cache_ptr; - matches = cache_ptr + 1; - num_matches = cache_ptr->len; - c->cache_ptr = matches + num_matches; - c->match_window_pos++; - *matches_ret = matches; - return num_matches; -} -static unsigned -lzx_get_matches_nocache_singleblock(struct lzx_compressor *c, - const struct lz_match **matches_ret) -{ - struct lz_match *matches; - unsigned num_matches; - - matches = c->cache_ptr; - num_matches = lz_mf_get_matches(c->mf, matches); - c->match_window_pos++; - *matches_ret = matches; - return num_matches; -} + /* Account for output of the aligned offset code. */ + aligned_cost += LZX_ALIGNEDCODE_ELEMENT_SIZE * LZX_ALIGNEDCODE_NUM_SYMBOLS; -static unsigned -lzx_get_matches_nocache_multiblock(struct lzx_compressor *c, - const struct lz_match **matches_ret) -{ - struct lz_match *matches; - unsigned num_matches; - - matches = c->cache_ptr; - num_matches = lz_mf_get_matches(c->mf, matches); - num_matches = maybe_truncate_matches(matches, num_matches, c); - c->match_window_pos++; - *matches_ret = matches; - return num_matches; + if (aligned_cost < verbatim_cost) + return LZX_BLOCKTYPE_ALIGNED; + else + return LZX_BLOCKTYPE_VERBATIM; } /* - * Find matches at the next position in the window. - * - * This uses a wrapper function around the underlying match-finder. + * Finish an LZX block: * - * Returns the number of matches found and sets *matches_ret to point to the - * matches array. The matches will be sorted by strictly increasing length and - * offset. 
+ * - build the Huffman codes + * - decide whether to output the block as VERBATIM or ALIGNED + * - output the block + * - swap the indices of the current and previous Huffman codes */ -static inline unsigned -lzx_get_matches(struct lzx_compressor *c, const struct lz_match **matches_ret) -{ - return (*c->get_matches_func)(c, matches_ret); -} - -static void -lzx_skip_bytes_fillcache(struct lzx_compressor *c, unsigned n) -{ - struct lz_match *cache_ptr; - - cache_ptr = c->cache_ptr; - c->match_window_pos += n; - lz_mf_skip_positions(c->mf, n); - if (cache_ptr <= c->cache_limit) { - do { - cache_ptr->len = 0; - cache_ptr += 1; - } while (--n && cache_ptr <= c->cache_limit); - } - c->cache_ptr = cache_ptr; -} - -static void -lzx_skip_bytes_usecache(struct lzx_compressor *c, unsigned n) -{ - struct lz_match *cache_ptr; - - cache_ptr = c->cache_ptr; - c->match_window_pos += n; - if (cache_ptr <= c->cache_limit) { - do { - cache_ptr += 1 + cache_ptr->len; - } while (--n && cache_ptr <= c->cache_limit); - } - c->cache_ptr = cache_ptr; -} - static void -lzx_skip_bytes_usecache_nocheck(struct lzx_compressor *c, unsigned n) +lzx_finish_block(struct lzx_compressor *c, struct lzx_output_bitstream *os, + u32 block_size, u32 num_chosen_items) { - struct lz_match *cache_ptr; - - cache_ptr = c->cache_ptr; - c->match_window_pos += n; - do { - cache_ptr += 1 + cache_ptr->len; - } while (--n); - c->cache_ptr = cache_ptr; -} + int block_type; -static void -lzx_skip_bytes_nocache(struct lzx_compressor *c, unsigned n) -{ - c->match_window_pos += n; - lz_mf_skip_positions(c->mf, n); + lzx_make_huffman_codes(c); + + block_type = lzx_choose_verbatim_or_aligned(&c->freqs, + &c->codes[c->codes_index]); + lzx_write_compressed_block(block_type, + block_size, + c->window_order, + c->num_main_syms, + c->chosen_items, + num_chosen_items, + &c->codes[c->codes_index], + &c->codes[c->codes_index ^ 1].lens, + os); + c->codes_index ^= 1; } -/* - * Skip the specified number of positions in the window (don't search for - * matches at them). - * - * This uses a wrapper function around the underlying match-finder. - */ -static inline void -lzx_skip_bytes(struct lzx_compressor *c, unsigned n) +/* Return the offset slot for the specified offset, which must be + * less than LZX_NUM_FAST_OFFSETS. */ +static inline unsigned +lzx_get_offset_slot_fast(struct lzx_compressor *c, u32 offset) { - return (*c->skip_bytes_func)(c, n); + LZX_ASSERT(offset < LZX_NUM_FAST_OFFSETS); + return c->offset_slot_fast[offset]; } /* Tally, and optionally record, the specified literal byte. 
*/ @@ -976,7 +970,7 @@ static inline void lzx_declare_literal(struct lzx_compressor *c, unsigned literal, struct lzx_item **next_chosen_item) { - unsigned main_symbol = literal; + unsigned main_symbol = lzx_main_symbol_for_literal(literal); c->freqs.main[main_symbol]++; @@ -994,8 +988,8 @@ lzx_declare_repeat_offset_match(struct lzx_compressor *c, struct lzx_item **next_chosen_item) { unsigned len_header; - unsigned main_symbol; unsigned len_symbol; + unsigned main_symbol; if (len - LZX_MIN_MATCH_LEN < LZX_NUM_PRIMARY_LENS) { len_header = len - LZX_MIN_MATCH_LEN; @@ -1006,7 +1000,7 @@ lzx_declare_repeat_offset_match(struct lzx_compressor *c, c->freqs.len[len_symbol]++; } - main_symbol = LZX_NUM_CHARS + ((rep_index << 3) | len_header); + main_symbol = lzx_main_symbol_for_match(rep_index, len_header); c->freqs.main[main_symbol]++; @@ -1023,8 +1017,8 @@ lzx_declare_explicit_offset_match(struct lzx_compressor *c, unsigned len, u32 of struct lzx_item **next_chosen_item) { unsigned len_header; - unsigned main_symbol; unsigned len_symbol; + unsigned main_symbol; unsigned offset_slot; unsigned num_extra_bits; u32 extra_bits; @@ -1038,22 +1032,27 @@ lzx_declare_explicit_offset_match(struct lzx_compressor *c, unsigned len, u32 of c->freqs.len[len_symbol]++; } - offset_slot = lzx_get_offset_slot_raw(offset + LZX_OFFSET_OFFSET); + offset_slot = (offset < LZX_NUM_FAST_OFFSETS) ? + lzx_get_offset_slot_fast(c, offset) : + lzx_get_offset_slot(offset); - main_symbol = LZX_NUM_CHARS + ((offset_slot << 3) | len_header); + main_symbol = lzx_main_symbol_for_match(offset_slot, len_header); c->freqs.main[main_symbol]++; - if (offset_slot >= 8) - c->freqs.aligned[(offset + LZX_OFFSET_OFFSET) & 7]++; + num_extra_bits = lzx_extra_offset_bits[offset_slot]; - if (next_chosen_item) { + if (num_extra_bits >= LZX_NUM_ALIGNED_OFFSET_BITS) + c->freqs.aligned[(offset + LZX_OFFSET_ADJUSTMENT) & + LZX_ALIGNED_OFFSET_BITMASK]++; - num_extra_bits = lzx_extra_offset_bits[offset_slot]; + if (next_chosen_item) { - extra_bits = (offset + LZX_OFFSET_OFFSET) - + extra_bits = (offset + LZX_OFFSET_ADJUSTMENT) - lzx_offset_slot_base[offset_slot]; + BUILD_BUG_ON(LZX_MAINCODE_MAX_NUM_SYMBOLS > (1 << 10)); + BUILD_BUG_ON(LZX_LENCODE_NUM_SYMBOLS > (1 << 8)); *(*next_chosen_item)++ = (struct lzx_item) { .data = (u64)main_symbol | ((u64)len_symbol << 10) | @@ -1063,13 +1062,14 @@ lzx_declare_explicit_offset_match(struct lzx_compressor *c, unsigned len, u32 of } } + /* Tally, and optionally record, the specified match or literal. */ static inline void -lzx_declare_item(struct lzx_compressor *c, u32 mc_item_data, +lzx_declare_item(struct lzx_compressor *c, u32 item, struct lzx_item **next_chosen_item) { - u32 len = mc_item_data & MC_LEN_MASK; - u32 offset_data = mc_item_data >> MC_OFFSET_SHIFT; + u32 len = item & OPTIMUM_LEN_MASK; + u32 offset_data = item >> OPTIMUM_OFFSET_SHIFT; if (len == 1) lzx_declare_literal(c, offset_data, next_chosen_item); @@ -1078,641 +1078,636 @@ lzx_declare_item(struct lzx_compressor *c, u32 mc_item_data, next_chosen_item); else lzx_declare_explicit_offset_match(c, len, - offset_data - LZX_OFFSET_OFFSET, + offset_data - LZX_OFFSET_ADJUSTMENT, next_chosen_item); } static inline void lzx_record_item_list(struct lzx_compressor *c, - struct lzx_mc_pos_data *cur_optimum_ptr, + struct lzx_optimum_node *cur_node, struct lzx_item **next_chosen_item) { - struct lzx_mc_pos_data *end_optimum_ptr; + struct lzx_optimum_node *end_node; u32 saved_item; u32 item; /* The list is currently in reverse order (last item to first item). 
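* (A sketch of how the backwards links work, inferred from the code below:
* each node's 'item' stores, in its low OPTIMUM_LEN_MASK bits, the length of
* the match or literal that *ends* at that node, so stepping back by that
* length reaches the node where the previous item ends. For example, if a
* 4-byte block were parsed as a 3-byte match followed by a literal, node[4]
* would hold the literal and node[3] the match; walking back 1 then 3
* positions reaches node[0].)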
* Reverse it. */ - end_optimum_ptr = cur_optimum_ptr; - saved_item = cur_optimum_ptr->mc_item_data; + end_node = cur_node; + saved_item = cur_node->item; do { item = saved_item; - cur_optimum_ptr -= item & MC_LEN_MASK; - saved_item = cur_optimum_ptr->mc_item_data; - cur_optimum_ptr->mc_item_data = item; - } while (cur_optimum_ptr != c->optimum); + cur_node -= item & OPTIMUM_LEN_MASK; + saved_item = cur_node->item; + cur_node->item = item; + } while (cur_node != c->optimum_nodes); /* Walk the list of items from beginning to end, tallying and recording * each item. */ do { - lzx_declare_item(c, cur_optimum_ptr->mc_item_data, next_chosen_item); - cur_optimum_ptr += (cur_optimum_ptr->mc_item_data) & MC_LEN_MASK; - } while (cur_optimum_ptr != end_optimum_ptr); + lzx_declare_item(c, cur_node->item, next_chosen_item); + cur_node += (cur_node->item) & OPTIMUM_LEN_MASK; + } while (cur_node != end_node); } static inline void -lzx_tally_item_list(struct lzx_compressor *c, struct lzx_mc_pos_data *cur_optimum_ptr) +lzx_tally_item_list(struct lzx_compressor *c, struct lzx_optimum_node *cur_node) { /* Since we're just tallying the items, we don't need to reverse the * list. Processing the items in reverse order is fine. */ do { - lzx_declare_item(c, cur_optimum_ptr->mc_item_data, NULL); - cur_optimum_ptr -= (cur_optimum_ptr->mc_item_data & MC_LEN_MASK); - } while (cur_optimum_ptr != c->optimum); -} - -/* Tally, and optionally (if next_chosen_item != NULL) record, in order, all - * items in the current list of items found by the match-chooser. */ -static void -lzx_declare_item_list(struct lzx_compressor *c, struct lzx_mc_pos_data *cur_optimum_ptr, - struct lzx_item **next_chosen_item) -{ - if (next_chosen_item) - lzx_record_item_list(c, cur_optimum_ptr, next_chosen_item); - else - lzx_tally_item_list(c, cur_optimum_ptr); + lzx_declare_item(c, cur_node->item, NULL); + cur_node -= (cur_node->item & OPTIMUM_LEN_MASK); + } while (cur_node != c->optimum_nodes); } -/* Set the cost model @c->costs from the Huffman codeword lengths specified in - * @lens. +/* + * Find an inexpensive path through the graph of possible match/literal choices + * for the current block. The nodes of the graph are + * c->optimum_nodes[0...block_size]. They correspond directly to the bytes in + * the current block, plus one extra node for end-of-block. The edges of the + * graph are matches and literals. The goal is to find the minimum cost path + * from 'c->optimum_nodes[0]' to 'c->optimum_nodes[block_size]'. * - * The cost model and codeword lengths are almost the same thing, but the - * Huffman codewords with length 0 correspond to symbols with zero frequency - * that still need to be assigned actual costs. The specific values assigned - * are arbitrary, but they should be fairly high (near the maximum codeword - * length) to take into account the fact that uses of these symbols are expected - * to be rare. */ -static void -lzx_set_costs(struct lzx_compressor *c, const struct lzx_lens * lens) + * The algorithm works forwards, starting at 'c->optimum_nodes[0]' and + * proceeding forwards one node at a time. At each node, a selection of matches + * (len >= 2), as well as the literal byte (len = 1), is considered. An item of + * length 'len' provides a new path to reach the node 'len' bytes later. If + * such a path is the lowest cost found so far to reach that later node, then + * that later node is updated with the new path. 
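+ *
+ * Schematically, each step is a standard shortest-path relaxation (a
+ * simplified sketch of the loop below, ignoring the recent offsets queue):
+ *
+ *     for each item of length 'len' available at node 'i':
+ *         if (cost[i] + item_cost < cost[i + len]) {
+ *             cost[i + len] = cost[i] + item_cost;
+ *             item[i + len] = this item;
+ *         }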
+ *
+ * Note that although this algorithm is based on minimum cost path search, due
+ * to various simplifying assumptions the result is not guaranteed to be the
+ * true minimum cost, or "optimal", path over the graph of all valid LZX
+ * representations of this block.
+ *
+ * Also, note that because of the presence of the recent offsets queue (which is
+ * a type of adaptive state), the algorithm cannot work backwards and compute
+ * "cost to end" instead of "cost to beginning". Furthermore, the way the
+ * algorithm handles this adaptive state in the "minimum-cost" parse is actually
+ * only an approximation. It's possible for the globally optimal, minimum cost
+ * path to contain a prefix, ending at a position, where that path prefix is
+ * *not* the minimum cost path to that position. This can happen if such a path
+ * prefix results in a different adaptive state which results in lower costs
+ * later. The algorithm does not solve this problem; it only considers the
+ * lowest cost to reach each individual position.
+ */
+static struct lzx_lru_queue
+lzx_find_min_cost_path(struct lzx_compressor * const restrict c,
+ const u8 * const restrict block_begin,
+ const u32 block_size,
+ const struct lzx_lru_queue initial_queue)
 {
- unsigned i;
+ struct lzx_optimum_node *cur_node = c->optimum_nodes;
+ struct lzx_optimum_node * const end_node = &c->optimum_nodes[block_size];
+ struct lz_match *cache_ptr = c->match_cache;
+ const u8 *in_next = block_begin;
+ const u8 * const block_end = block_begin + block_size;
+
+ /* Instead of storing the match offset LRU queues in the
+ * 'lzx_optimum_node' structures, we save memory (and cache lines) by
+ * storing them in a smaller array. This works because the algorithm
+ * only requires a limited history of the adaptive state. Once a given
+ * state is more than LZX_MAX_MATCH_LEN bytes behind the current node,
+ * it is no longer needed. */
+ struct lzx_lru_queue queues[512];
+
+ BUILD_BUG_ON(ARRAY_LEN(queues) < LZX_MAX_MATCH_LEN + 1);
+#define QUEUE(in) (queues[(uintptr_t)(in) % ARRAY_LEN(queues)])
+
+ /* Initially, the cost to reach each node is "infinity". */
+ memset(c->optimum_nodes, 0xFF,
+ (block_size + 1) * sizeof(c->optimum_nodes[0]));
+
+ QUEUE(block_begin) = initial_queue;
+
+ /* The following loop runs 'block_size' iterations, one per node. */
+ do {
+ unsigned num_matches;
+ unsigned literal;
+ u32 cost;

- /* Main code */
- for (i = 0; i < c->num_main_syms; i++)
- c->costs.main[i] = lens->main[i] ? lens->main[i] : 15;
+ /*
+ * A selection of matches for the block was already saved in
+ * memory so that we don't have to run the uncompressed data
+ * through the matchfinder on every optimization pass. However,
+ * we still search for repeat offset matches during each
+ * optimization pass because we cannot predict the state of the
+ * recent offsets queue. But as a heuristic, we don't bother
+ * searching for repeat offset matches if the general-purpose
+ * matchfinder failed to find any matches.
+ *
+ * Note that a match of length n at some offset implies there is
+ * also a match of length l for LZX_MIN_MATCH_LEN <= l <= n at
+ * that same offset. In other words, we don't necessarily need
+ * to use the full length of a match. The key heuristic that
+ * saves a significant amount of time is that for each
+ * distinct length, we only consider the smallest offset for
+ * which that length is available. This heuristic also applies
+ * to repeat offsets, which we order specially: R0 < R1 < R2 <
+ * any explicit offset. 
Of course, this heuristic may
+ * produce suboptimal results because offset slots in LZX are
+ * subject to entropy encoding, but in practice this is a useful
+ * heuristic.
+ */

- /* Length code */
- for (i = 0; i < LZX_LENCODE_NUM_SYMBOLS; i++)
- c->costs.len[i] = lens->len[i] ? lens->len[i] : 15;
+ num_matches = cache_ptr->length;
+ cache_ptr++;

- /* Aligned offset code */
- for (i = 0; i < LZX_ALIGNEDCODE_NUM_SYMBOLS; i++)
- c->costs.aligned[i] = lens->aligned[i] ? lens->aligned[i] : 7;
-}
+ if (num_matches) {
+ struct lz_match *end_matches = cache_ptr + num_matches;
+ unsigned next_len = LZX_MIN_MATCH_LEN;
+ unsigned max_len = min(block_end - in_next, LZX_MAX_MATCH_LEN);
+ const u8 *matchptr;
+
+ /* Consider R0 match */
+ matchptr = in_next - lzx_lru_queue_R0(QUEUE(in_next));
+ if (load_u16_unaligned(matchptr) != load_u16_unaligned(in_next))
+ goto R0_done;
+ BUILD_BUG_ON(LZX_MIN_MATCH_LEN != 2);
+ do {
+ u32 cost = cur_node->cost +
+ c->costs.match_cost[0][
+ next_len - LZX_MIN_MATCH_LEN];
+ if (cost <= (cur_node + next_len)->cost) {
+ (cur_node + next_len)->cost = cost;
+ (cur_node + next_len)->item =
+ (0 << OPTIMUM_OFFSET_SHIFT) | next_len;
+ }
+ if (unlikely(++next_len > max_len)) {
+ cache_ptr = end_matches;
+ goto done_matches;
+ }
+ } while (in_next[next_len - 1] == matchptr[next_len - 1]);
+
+ R0_done:
+
+ /* Consider R1 match */
+ matchptr = in_next - lzx_lru_queue_R1(QUEUE(in_next));
+ if (load_u16_unaligned(matchptr) != load_u16_unaligned(in_next))
+ goto R1_done;
+ if (matchptr[next_len - 1] != in_next[next_len - 1])
+ goto R1_done;
+ for (unsigned len = 2; len < next_len - 1; len++)
+ if (matchptr[len] != in_next[len])
+ goto R1_done;
+ do {
+ u32 cost = cur_node->cost +
+ c->costs.match_cost[1][
+ next_len - LZX_MIN_MATCH_LEN];
+ if (cost <= (cur_node + next_len)->cost) {
+ (cur_node + next_len)->cost = cost;
+ (cur_node + next_len)->item =
+ (1 << OPTIMUM_OFFSET_SHIFT) | next_len;
+ }
+ if (unlikely(++next_len > max_len)) {
+ cache_ptr = end_matches;
+ goto done_matches;
+ }
+ } while (in_next[next_len - 1] == matchptr[next_len - 1]);
+
+ R1_done:
+
+ /* Consider R2 match */
+ matchptr = in_next - lzx_lru_queue_R2(QUEUE(in_next));
+ if (load_u16_unaligned(matchptr) != load_u16_unaligned(in_next))
+ goto R2_done;
+ if (matchptr[next_len - 1] != in_next[next_len - 1])
+ goto R2_done;
+ for (unsigned len = 2; len < next_len - 1; len++)
+ if (matchptr[len] != in_next[len])
+ goto R2_done;
+ do {
+ u32 cost = cur_node->cost +
+ c->costs.match_cost[2][
+ next_len - LZX_MIN_MATCH_LEN];
+ if (cost <= (cur_node + next_len)->cost) {
+ (cur_node + next_len)->cost = cost;
+ (cur_node + next_len)->item =
+ (2 << OPTIMUM_OFFSET_SHIFT) | next_len;
+ }
+ if (unlikely(++next_len > max_len)) {
+ cache_ptr = end_matches;
+ goto done_matches;
+ }
+ } while (in_next[next_len - 1] == matchptr[next_len - 1]);

-/* Set default LZX Huffman symbol costs to bootstrap the iterative optimization
- * algorithm. 
*/ -static void -lzx_set_default_costs(struct lzx_costs * costs, unsigned num_main_syms) -{ - unsigned i; + R2_done: - /* Main code (part 1): Literal symbols */ - for (i = 0; i < LZX_NUM_CHARS; i++) - costs->main[i] = 8; + while (next_len > cache_ptr->length) + if (++cache_ptr == end_matches) + goto done_matches; - /* Main code (part 2): Match header symbols */ - for (; i < num_main_syms; i++) - costs->main[i] = 10; + /* Consider explicit offset matches */ + do { + u32 offset = cache_ptr->offset; + u32 offset_data = offset + LZX_OFFSET_ADJUSTMENT; + unsigned offset_slot = (offset < LZX_NUM_FAST_OFFSETS) ? + lzx_get_offset_slot_fast(c, offset) : + lzx_get_offset_slot(offset); + do { + u32 cost = cur_node->cost + + c->costs.match_cost[offset_slot][ + next_len - LZX_MIN_MATCH_LEN]; + #if LZX_CONSIDER_ALIGNED_COSTS + if (lzx_extra_offset_bits[offset_slot] >= + LZX_NUM_ALIGNED_OFFSET_BITS) + cost += c->costs.aligned[offset_data & + LZX_ALIGNED_OFFSET_BITMASK]; + #endif + if (cost < (cur_node + next_len)->cost) { + (cur_node + next_len)->cost = cost; + (cur_node + next_len)->item = + (offset_data << OPTIMUM_OFFSET_SHIFT) | next_len; + } + } while (++next_len <= cache_ptr->length); + } while (++cache_ptr != end_matches); + } - /* Length code */ - for (i = 0; i < LZX_LENCODE_NUM_SYMBOLS; i++) - costs->len[i] = 8; + done_matches: - /* Aligned offset code */ - for (i = 0; i < LZX_ALIGNEDCODE_NUM_SYMBOLS; i++) - costs->aligned[i] = 3; -} + /* Consider coding a literal. -/* Return the cost, in bits, to output a literal byte using the specified cost - * model. */ -static inline u32 -lzx_literal_cost(unsigned literal, const struct lzx_costs * costs) -{ - return costs->main[literal]; -} + * To avoid an extra branch, actually checking the preferability + * of coding the literal is integrated into the queue update + * code below. */ + literal = *in_next++; + cost = cur_node->cost + + c->costs.main[lzx_main_symbol_for_literal(literal)]; -/* Return the cost, in bits, to output a match of the specified length and - * offset slot using the specified cost model. Does not take into account - * extra offset bits. */ -static inline u32 -lzx_match_cost_raw(unsigned len, unsigned offset_slot, - const struct lzx_costs *costs) -{ - u32 cost; - unsigned len_header; - unsigned main_symbol; + /* Advance to the next position. */ + cur_node++; - if (len - LZX_MIN_MATCH_LEN < LZX_NUM_PRIMARY_LENS) { - len_header = len - LZX_MIN_MATCH_LEN; - cost = 0; - } else { - len_header = LZX_NUM_PRIMARY_LENS; - - /* Account for length symbol. */ - cost = costs->len[len - LZX_MIN_MATCH_LEN - LZX_NUM_PRIMARY_LENS]; - } + /* The lowest-cost path to the current position is now known. + * Finalize the recent offsets queue that results from taking + * this lowest-cost path. */ - /* Account for main symbol. */ - main_symbol = LZX_NUM_CHARS + ((offset_slot << 3) | len_header); - cost += costs->main[main_symbol]; + if (cost <= cur_node->cost) { + /* Literal: queue remains unchanged. */ + cur_node->cost = cost; + cur_node->item = (literal << OPTIMUM_OFFSET_SHIFT) | 1; + QUEUE(in_next) = QUEUE(in_next - 1); + } else { + /* Match: queue update is needed. 
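+ * As an illustration of the two cases (values are made up): if the queue
+ * reaching this item were (R0, R1, R2) = (1, 4, 8), then choosing an
+ * explicit-offset match with offset 20 gives (20, 1, 4) via
+ * lzx_lru_queue_push(), while re-using the R1 entry gives (4, 1, 8) via
+ * lzx_lru_queue_swap().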
*/ + unsigned len = cur_node->item & OPTIMUM_LEN_MASK; + u32 offset_data = cur_node->item >> OPTIMUM_OFFSET_SHIFT; + if (offset_data >= LZX_NUM_RECENT_OFFSETS) { + /* Explicit offset match: insert offset at front */ + QUEUE(in_next) = + lzx_lru_queue_push(QUEUE(in_next - len), + offset_data - LZX_OFFSET_ADJUSTMENT); + } else { + /* Repeat offset match: swap offset to front */ + QUEUE(in_next) = + lzx_lru_queue_swap(QUEUE(in_next - len), + offset_data); + } + } + } while (cur_node != end_node); - return cost; + /* Return the match offset queue at the end of the minimum-cost path. */ + return QUEUE(block_end); } -/* Equivalent to lzx_match_cost_raw(), but assumes the length is small enough - * that it doesn't require a length symbol. */ -static inline u32 -lzx_match_cost_raw_smalllen(unsigned len, unsigned offset_slot, - const struct lzx_costs *costs) +/* Given the costs for the main and length codewords, compute 'match_costs'. */ +static void +lzx_compute_match_costs(struct lzx_compressor *c) { - LZX_ASSERT(len < LZX_MIN_MATCH_LEN + LZX_NUM_PRIMARY_LENS); - return costs->main[LZX_NUM_CHARS + - ((offset_slot << 3) | (len - LZX_MIN_MATCH_LEN))]; -} + unsigned num_offset_slots = lzx_get_num_offset_slots(c->window_order); + struct lzx_costs *costs = &c->costs; -/* - * Consider coding the match at repeat offset index @rep_idx. Consider each - * length from the minimum (2) to the full match length (@rep_len). - */ -static inline void -lzx_consider_repeat_offset_match(struct lzx_compressor *c, - struct lzx_mc_pos_data *cur_optimum_ptr, - unsigned rep_len, unsigned rep_idx) -{ - u32 base_cost = cur_optimum_ptr->cost; - u32 cost; - unsigned len; + for (unsigned offset_slot = 0; offset_slot < num_offset_slots; offset_slot++) { -#if 1 /* Optimized version */ + u32 extra_cost = (u32)lzx_extra_offset_bits[offset_slot] * LZX_BIT_COST; + unsigned main_symbol = lzx_main_symbol_for_match(offset_slot, 0); + unsigned i; - if (rep_len < LZX_MIN_MATCH_LEN + LZX_NUM_PRIMARY_LENS) { - /* All lengths being considered are small. */ - len = 2; - do { - cost = base_cost + - lzx_match_cost_raw_smalllen(len, rep_idx, &c->costs); - if (cost < (cur_optimum_ptr + len)->cost) { - (cur_optimum_ptr + len)->mc_item_data = - (rep_idx << MC_OFFSET_SHIFT) | len; - (cur_optimum_ptr + len)->cost = cost; - } - } while (++len <= rep_len); - } else { - /* Some lengths being considered are small, and some are big. - * Start with the optimized loop for small lengths, then switch - * to the optimized loop for big lengths. */ - len = 2; - do { - cost = base_cost + - lzx_match_cost_raw_smalllen(len, rep_idx, &c->costs); - if (cost < (cur_optimum_ptr + len)->cost) { - (cur_optimum_ptr + len)->mc_item_data = - (rep_idx << MC_OFFSET_SHIFT) | len; - (cur_optimum_ptr + len)->cost = cost; - } - } while (++len < LZX_MIN_MATCH_LEN + LZX_NUM_PRIMARY_LENS); + #if LZX_CONSIDER_ALIGNED_COSTS + if (lzx_extra_offset_bits[offset_slot] >= LZX_NUM_ALIGNED_OFFSET_BITS) + extra_cost -= LZX_NUM_ALIGNED_OFFSET_BITS * LZX_BIT_COST; + #endif - /* The main symbol is now fixed. 
*/ - base_cost += c->costs.main[LZX_NUM_CHARS + - ((rep_idx << 3) | LZX_NUM_PRIMARY_LENS)]; - do { - cost = base_cost + - c->costs.len[len - LZX_MIN_MATCH_LEN - - LZX_NUM_PRIMARY_LENS]; - if (cost < (cur_optimum_ptr + len)->cost) { - (cur_optimum_ptr + len)->mc_item_data = - (rep_idx << MC_OFFSET_SHIFT) | len; - (cur_optimum_ptr + len)->cost = cost; - } - } while (++len <= rep_len); - } + for (i = 0; i < LZX_NUM_PRIMARY_LENS; i++) + costs->match_cost[offset_slot][i] = + costs->main[main_symbol++] + extra_cost; -#else /* Unoptimized version */ + extra_cost += costs->main[main_symbol]; - len = 2; - do { - cost = base_cost + - lzx_match_cost_raw(len, rep_idx, &c->costs); - if (cost < (cur_optimum_ptr + len)->cost) { - (cur_optimum_ptr + len)->mc_item_data = - (rep_idx << MC_OFFSET_SHIFT) | len; - (cur_optimum_ptr + len)->cost = cost; - } - } while (++len <= rep_len); -#endif + for (; i < LZX_NUM_LENS; i++) + costs->match_cost[offset_slot][i] = + costs->len[i - LZX_NUM_PRIMARY_LENS] + extra_cost; + } } -/* - * Consider coding each match in @matches as an explicit offset match. - * - * @matches must be sorted by strictly increasing length and strictly - * increasing offset. This is guaranteed by the match-finder. - * - * We consider each length from the minimum (2) to the longest - * (matches[num_matches - 1].len). For each length, we consider only - * the smallest offset for which that length is available. Although - * this is not guaranteed to be optimal due to the possibility of a - * larger offset costing less than a smaller offset to code, this is a - * very useful heuristic. - */ -static inline void -lzx_consider_explicit_offset_matches(struct lzx_compressor *c, - struct lzx_mc_pos_data *cur_optimum_ptr, - const struct lz_match matches[], - unsigned num_matches) +/* Set default LZX Huffman symbol costs to bootstrap the iterative optimization + * algorithm. */ +static void +lzx_set_default_costs(struct lzx_compressor *c, const u8 *block, u32 block_size) { - LZX_ASSERT(num_matches > 0); + u32 i; + bool have_byte[256]; + unsigned num_used_bytes; - unsigned i; - unsigned len; - unsigned offset_slot; - u32 position_cost; - u32 cost; - u32 offset_data; + /* The costs below are hard coded to use a scaling factor of 16. */ + BUILD_BUG_ON(LZX_BIT_COST != 16); + /* + * Heuristics: + * + * - Use smaller initial costs for literal symbols when the input buffer + * contains fewer distinct bytes. + * + * - Assume that match symbols are more costly than literal symbols. + * + * - Assume that length symbols for shorter lengths are less costly than + * length symbols for longer lengths. + */ -#if 1 /* Optimized version */ + for (i = 0; i < 256; i++) + have_byte[i] = false; - if (matches[num_matches - 1].offset < LZX_NUM_FAST_OFFSETS) { + for (i = 0; i < block_size; i++) + have_byte[block[i]] = true; - /* - * Offset is small; the offset slot can be looked up directly in - * c->offset_slot_fast. - * - * Additional optimizations: - * - * - Since the offset is small, it falls in the exponential part - * of the offset slot bases and the number of extra offset - * bits can be calculated directly as (offset_slot >> 1) - 1. - * - * - Just consider the number of extra offset bits; don't - * account for the aligned offset code. Usually this has - * almost no effect on the compression ratio. - * - * - Start out in a loop optimized for small lengths. When the - * length becomes high enough that a length symbol will be - * needed, jump into a loop optimized for big lengths. 
- */ + num_used_bytes = 0; + for (i = 0; i < 256; i++) + num_used_bytes += have_byte[i]; - LZX_ASSERT(offset_slot <= 37); /* for extra bits formula */ + for (i = 0; i < 256; i++) + c->costs.main[i] = 140 - (256 - num_used_bytes) / 4; - len = 2; - i = 0; - do { - offset_slot = c->offset_slot_fast[matches[i].offset]; - position_cost = cur_optimum_ptr->cost + - ((offset_slot >> 1) - 1); - offset_data = matches[i].offset + LZX_OFFSET_OFFSET; - do { - if (len >= LZX_MIN_MATCH_LEN + LZX_NUM_PRIMARY_LENS) - goto biglen; - cost = position_cost + - lzx_match_cost_raw_smalllen(len, offset_slot, - &c->costs); - if (cost < (cur_optimum_ptr + len)->cost) { - (cur_optimum_ptr + len)->cost = cost; - (cur_optimum_ptr + len)->mc_item_data = - (offset_data << MC_OFFSET_SHIFT) | len; - } - } while (++len <= matches[i].len); - } while (++i != num_matches); + for (; i < c->num_main_syms; i++) + c->costs.main[i] = 170; - return; + for (i = 0; i < LZX_LENCODE_NUM_SYMBOLS; i++) + c->costs.len[i] = 103 + (i / 4); - do { - offset_slot = c->offset_slot_fast[matches[i].offset]; - biglen: - position_cost = cur_optimum_ptr->cost + - ((offset_slot >> 1) - 1) + - c->costs.main[LZX_NUM_CHARS + - ((offset_slot << 3) | - LZX_NUM_PRIMARY_LENS)]; - offset_data = matches[i].offset + LZX_OFFSET_OFFSET; - do { - cost = position_cost + - c->costs.len[len - LZX_MIN_MATCH_LEN - - LZX_NUM_PRIMARY_LENS]; - if (cost < (cur_optimum_ptr + len)->cost) { - (cur_optimum_ptr + len)->cost = cost; - (cur_optimum_ptr + len)->mc_item_data = - (offset_data << MC_OFFSET_SHIFT) | len; - } - } while (++len <= matches[i].len); - } while (++i != num_matches); - } else { - len = 2; - i = 0; - do { - offset_data = matches[i].offset + LZX_OFFSET_OFFSET; - offset_slot = lzx_get_offset_slot_raw(offset_data); - position_cost = cur_optimum_ptr->cost + - lzx_extra_offset_bits[offset_slot]; - do { - cost = position_cost + - lzx_match_cost_raw(len, offset_slot, &c->costs); - if (cost < (cur_optimum_ptr + len)->cost) { - (cur_optimum_ptr + len)->cost = cost; - (cur_optimum_ptr + len)->mc_item_data = - (offset_data << MC_OFFSET_SHIFT) | len; - } - } while (++len <= matches[i].len); - } while (++i != num_matches); - } +#if LZX_CONSIDER_ALIGNED_COSTS + for (i = 0; i < LZX_ALIGNEDCODE_NUM_SYMBOLS; i++) + c->costs.aligned[i] = LZX_NUM_ALIGNED_OFFSET_BITS * LZX_BIT_COST; +#endif -#else /* Unoptimized version */ + lzx_compute_match_costs(c); +} - unsigned num_extra_bits; +/* Update the current cost model to reflect the computed Huffman codes. */ +static void +lzx_update_costs(struct lzx_compressor *c) +{ + unsigned i; + const struct lzx_lens *lens = &c->codes[c->codes_index].lens; - len = 2; - i = 0; - do { - offset_data = matches[i].offset + LZX_OFFSET_OFFSET; - position_cost = cur_optimum_ptr->cost; - offset_slot = lzx_get_offset_slot_raw(offset_data); - num_extra_bits = lzx_extra_offset_bits[offset_slot]; - if (num_extra_bits >= 3) { - position_cost += num_extra_bits - 3; - position_cost += c->costs.aligned[offset_data & 7]; - } else { - position_cost += num_extra_bits; - } - do { - cost = position_cost + - lzx_match_cost_raw(len, offset_slot, &c->costs); - if (cost < (cur_optimum_ptr + len)->cost) { - (cur_optimum_ptr + len)->cost = cost; - (cur_optimum_ptr + len)->mc_item_data = - (offset_data << MC_OFFSET_SHIFT) | len; - } - } while (++len <= matches[i].len); - } while (++i != num_matches); + for (i = 0; i < c->num_main_syms; i++) + c->costs.main[i] = (lens->main[i] ? 
lens->main[i] : 15) * LZX_BIT_COST; + + for (i = 0; i < LZX_LENCODE_NUM_SYMBOLS; i++) + c->costs.len[i] = (lens->len[i] ? lens->len[i] : 15) * LZX_BIT_COST; + +#if LZX_CONSIDER_ALIGNED_COSTS + for (i = 0; i < LZX_ALIGNEDCODE_NUM_SYMBOLS; i++) + c->costs.aligned[i] = (lens->aligned[i] ? lens->aligned[i] : 7) * LZX_BIT_COST; #endif + + lzx_compute_match_costs(c); } -/* - * Search for repeat offset matches with the current position. - */ -static inline unsigned -lzx_repsearch(const u8 * const strptr, const u32 bytes_remaining, - const struct lzx_lru_queue *queue, unsigned *rep_max_idx_ret) +static struct lzx_lru_queue +lzx_optimize_and_write_block(struct lzx_compressor *c, + struct lzx_output_bitstream *os, + const u8 *block_begin, const u32 block_size, + const struct lzx_lru_queue initial_queue) { - BUILD_BUG_ON(LZX_NUM_RECENT_OFFSETS != 3); - return lz_repsearch3(strptr, min(bytes_remaining, LZX_MAX_MATCH_LEN), - queue->R, rep_max_idx_ret); + unsigned num_passes_remaining = c->num_optim_passes; + struct lzx_item *next_chosen_item; + struct lzx_lru_queue new_queue; + + /* The first optimization pass uses a default cost model. Each + * additional optimization pass uses a cost model derived from the + * Huffman code computed in the previous pass. */ + + lzx_set_default_costs(c, block_begin, block_size); + lzx_reset_symbol_frequencies(c); + do { + new_queue = lzx_find_min_cost_path(c, block_begin, block_size, + initial_queue); + if (num_passes_remaining > 1) { + lzx_tally_item_list(c, c->optimum_nodes + block_size); + lzx_make_huffman_codes(c); + lzx_update_costs(c); + lzx_reset_symbol_frequencies(c); + } + } while (--num_passes_remaining); + + next_chosen_item = c->chosen_items; + lzx_record_item_list(c, c->optimum_nodes + block_size, &next_chosen_item); + lzx_finish_block(c, os, block_size, next_chosen_item - c->chosen_items); + return new_queue; } /* - * The main near-optimal parsing routine. - * - * Briefly, the algorithm does an approximate minimum-cost path search to find a - * "near-optimal" sequence of matches and literals to output, based on the - * current cost model. The algorithm steps forward, position by position (byte - * by byte), and updates the minimum cost path to reach each later position that - * can be reached using a match or literal from the current position. This is - * essentially Dijkstra's algorithm in disguise: the graph nodes are positions, - * the graph edges are possible matches/literals to code, and the cost of each - * edge is the estimated number of bits that will be required to output the - * corresponding match or literal. But one difference is that we actually - * compute the lowest-cost path in pieces, where each piece is terminated when - * there are no choices to be made. - * - * This function will run this algorithm on the portion of the window from - * &c->cur_window[c->match_window_pos] to &c->cur_window[c->match_window_end]. + * This is the "near-optimal" LZX compressor. * - * On entry, c->queue must be the current state of the match offset LRU queue, - * and c->costs must be the current cost model to use for Huffman symbols. + * For each block, it performs a relatively thorough graph search to find an + * inexpensive (in terms of compressed size) way to output that block. * - * On exit, c->queue will be the state that the LRU queue would be in if the - * chosen items were to be coded. - * - * If next_chosen_item != NULL, then all items chosen will be recorded (saved in - * the chosen_items array). 
Otherwise, all items chosen will only be tallied - * (symbol frequencies tallied in c->freqs). + * Note: there are actually many things this algorithm leaves on the table in + * terms of compression ratio. So although it may be "near-optimal", it is + * certainly not "optimal". The goal is not to produce the optimal compression + * ratio, which for LZX is probably impossible within any practical amount of + * time, but rather to produce a compression ratio significantly better than a + * simpler "greedy" or "lazy" parse while still being relatively fast. */ static void -lzx_optim_pass(struct lzx_compressor *c, struct lzx_item **next_chosen_item) +lzx_compress_near_optimal(struct lzx_compressor *c, + struct lzx_output_bitstream *os) { - const u8 *block_end; - struct lzx_lru_queue *begin_queue; - const u8 *window_ptr; - struct lzx_mc_pos_data *cur_optimum_ptr; - struct lzx_mc_pos_data *end_optimum_ptr; - const struct lz_match *matches; - unsigned num_matches; - unsigned longest_len; - unsigned rep_max_len; - unsigned rep_max_idx; - unsigned literal; - unsigned len; - u32 cost; - u32 offset_data; - - block_end = &c->cur_window[c->match_window_end]; - begin_queue = &c->queue; -begin: - /* Start building a new list of items, which will correspond to the next - * piece of the overall minimum-cost path. - * - * *begin_queue is the current state of the match offset LRU queue. */ - - window_ptr = &c->cur_window[c->match_window_pos]; - - if (window_ptr == block_end) { - c->queue = *begin_queue; - return; - } - - cur_optimum_ptr = c->optimum; - cur_optimum_ptr->cost = 0; - cur_optimum_ptr->queue = *begin_queue; - - end_optimum_ptr = cur_optimum_ptr; + const u8 * const in_begin = c->in_buffer; + const u8 * in_next = in_begin; + const u8 * const in_end = in_begin + c->in_nbytes; + unsigned max_len = LZX_MAX_MATCH_LEN; + unsigned nice_len = min(c->nice_match_length, max_len); + u32 next_hash; + struct lzx_lru_queue queue; - /* The following loop runs once for each per byte in the window, except - * in a couple shortcut cases. */ - for (;;) { + bt_matchfinder_init(&c->bt_mf); + matchfinder_init(c->hash2_tab, LZX_HASH2_LENGTH); + next_hash = bt_matchfinder_hash_3_bytes(in_next); + lzx_lru_queue_init(&queue); - /* Find explicit offset matches with the current position. */ - num_matches = lzx_get_matches(c, &matches); + do { + /* Starting a new block */ + const u8 * const in_block_begin = in_next; + const u8 * const in_block_end = + in_next + min(LZX_DIV_BLOCK_SIZE, in_end - in_next); - if (num_matches) { - /* - * Find the longest repeat offset match with the current - * position. - * - * Heuristics: - * - * - Only search for repeat offset matches if the - * match-finder already found at least one match. - * - * - Only consider the longest repeat offset match. It - * seems to be rare for the optimal parse to include a - * repeat offset match that doesn't have the longest - * length (allowing for the possibility that not all - * of that length is actually used). - */ - rep_max_len = lzx_repsearch(window_ptr, - block_end - window_ptr, - &cur_optimum_ptr->queue, - &rep_max_idx); - - if (rep_max_len) { - /* If there's a very long repeat offset match, - * choose it immediately. 
*/ - if (rep_max_len >= c->params.nice_match_length) { - - swap(cur_optimum_ptr->queue.R[0], - cur_optimum_ptr->queue.R[rep_max_idx]); - begin_queue = &cur_optimum_ptr->queue; - - cur_optimum_ptr += rep_max_len; - cur_optimum_ptr->mc_item_data = - (rep_max_idx << MC_OFFSET_SHIFT) | - rep_max_len; - - lzx_skip_bytes(c, rep_max_len - 1); - break; + /* Run the block through the matchfinder and cache the matches. */ + struct lz_match *cache_ptr = c->match_cache; + do { + struct lz_match *lz_matchptr; + u32 hash2; + pos_t cur_match; + unsigned best_len; + + /* If approaching the end of the input buffer, adjust + * 'max_len' and 'nice_len' accordingly. */ + if (unlikely(max_len > in_end - in_next)) { + max_len = in_end - in_next; + nice_len = min(max_len, nice_len); + + /* This extra check is needed to ensure that + * reading the next 3 bytes when looking for a + * length 2 match is valid. In addition, we + * cannot allow ourselves to find a length 2 + * match of the very last two bytes with the + * very first two bytes, since such a match has + * an offset too large to be represented. */ + if (unlikely(max_len < + max(LZ_HASH_REQUIRED_NBYTES, 3))) + { + in_next++; + cache_ptr->length = 0; + cache_ptr++; + continue; } - - /* If reaching any positions for the first time, - * initialize their costs to "infinity". */ - while (end_optimum_ptr < cur_optimum_ptr + rep_max_len) - (++end_optimum_ptr)->cost = MC_INFINITE_COST; - - /* Consider coding a repeat offset match. */ - lzx_consider_repeat_offset_match(c, - cur_optimum_ptr, - rep_max_len, - rep_max_idx); } - longest_len = matches[num_matches - 1].len; - - /* If there's a very long explicit offset match, choose - * it immediately. */ - if (longest_len >= c->params.nice_match_length) { - - cur_optimum_ptr->queue.R[2] = - cur_optimum_ptr->queue.R[1]; - cur_optimum_ptr->queue.R[1] = - cur_optimum_ptr->queue.R[0]; - cur_optimum_ptr->queue.R[0] = - matches[num_matches - 1].offset; - begin_queue = &cur_optimum_ptr->queue; - - offset_data = matches[num_matches - 1].offset + - LZX_OFFSET_OFFSET; - cur_optimum_ptr += longest_len; - cur_optimum_ptr->mc_item_data = - (offset_data << MC_OFFSET_SHIFT) | - longest_len; - - lzx_skip_bytes(c, longest_len - 1); - break; + lz_matchptr = cache_ptr + 1; + + /* Check for a length 2 match. */ + hash2 = lz_hash_2_bytes(in_next); + cur_match = c->hash2_tab[hash2]; + c->hash2_tab[hash2] = in_next - in_begin; + if (matchfinder_node_valid(cur_match) && + (LZX_HASH2_ORDER == 16 || + load_u16_unaligned(&in_begin[cur_match]) == + load_u16_unaligned(in_next)) && + in_begin[cur_match + 2] != in_next[2]) + { + lz_matchptr->length = 2; + lz_matchptr->offset = in_next - &in_begin[cur_match]; + lz_matchptr++; } - /* If reaching any positions for the first time, - * initialize their costs to "infinity". */ - while (end_optimum_ptr < cur_optimum_ptr + longest_len) - (++end_optimum_ptr)->cost = MC_INFINITE_COST; + /* Check for matches of length >= 3. */ + lz_matchptr = bt_matchfinder_get_matches(&c->bt_mf, + in_begin, + in_next, + 3, + max_len, + nice_len, + c->max_search_depth, + &next_hash, + &best_len, + lz_matchptr); + in_next++; + cache_ptr->length = lz_matchptr - (cache_ptr + 1); + cache_ptr = lz_matchptr; - /* Consider coding an explicit offset match. */ - lzx_consider_explicit_offset_matches(c, cur_optimum_ptr, - matches, num_matches); - } else { - /* No matches found. The only choice at this position - * is to code a literal. */ - - if (end_optimum_ptr == cur_optimum_ptr) { - #if 1 - /* Optimization for single literals. 
*/ - if (likely(cur_optimum_ptr == c->optimum)) { - lzx_declare_literal(c, *window_ptr++, - next_chosen_item); - if (window_ptr == block_end) { - c->queue = cur_optimum_ptr->queue; - return; + /* + * If there was a very long match found, then don't + * cache any matches for the bytes covered by that + * match. This avoids degenerate behavior when + * compressing highly redundant data, where the number + * of matches can be very large. + * + * This heuristic doesn't actually hurt the compression + * ratio very much. If there's a long match, then the + * data must be highly compressible, so it doesn't + * matter as much what we do. + */ + if (best_len >= nice_len) { + --best_len; + do { + if (unlikely(max_len > in_end - in_next)) { + max_len = in_end - in_next; + nice_len = min(max_len, nice_len); + if (unlikely(max_len < + max(LZ_HASH_REQUIRED_NBYTES, 3))) + { + in_next++; + cache_ptr->length = 0; + cache_ptr++; + continue; + } } - continue; - } - #endif - (++end_optimum_ptr)->cost = MC_INFINITE_COST; + c->hash2_tab[lz_hash_2_bytes(in_next)] = + in_next - in_begin; + bt_matchfinder_skip_position(&c->bt_mf, + in_begin, + in_next, + in_end, + nice_len, + c->max_search_depth, + &next_hash); + in_next++; + cache_ptr->length = 0; + cache_ptr++; + } while (--best_len); } - } + } while (in_next < in_block_end && + likely(cache_ptr < c->cache_overflow_mark)); - /* Consider coding a literal. + /* We've finished running the block through the matchfinder. + * Now choose a match/literal sequence and write the block. */ - * To avoid an extra unpredictable brench, actually checking the - * preferability of coding a literal is integrated into the - * queue update code below. */ - literal = *window_ptr++; - cost = cur_optimum_ptr->cost + lzx_literal_cost(literal, &c->costs); + queue = lzx_optimize_and_write_block(c, os, in_block_begin, + in_next - in_block_begin, + queue); + } while (in_next != in_end); +} - /* Advance to the next position. */ - cur_optimum_ptr++; +/* + * Given a pointer to the current byte sequence and the current list of recent + * match offsets, find the longest repeat offset match. + * + * If no match of at least 2 bytes is found, then return 0. + * + * If a match of at least 2 bytes is found, then return its length and set + * *rep_max_idx_ret to the index of its offset in @queue. +*/ +static unsigned +lzx_find_longest_repeat_offset_match(const u8 * const in_next, + const u32 bytes_remaining, + struct lzx_lru_queue queue, + unsigned *rep_max_idx_ret) +{ + BUILD_BUG_ON(LZX_NUM_RECENT_OFFSETS != 3); + LZX_ASSERT(bytes_remaining >= 2); - /* The lowest-cost path to the current position is now known. - * Finalize the recent offsets queue that results from taking - * this lowest-cost path. */ + const unsigned max_len = min(bytes_remaining, LZX_MAX_MATCH_LEN); + const u16 next_2_bytes = load_u16_unaligned(in_next); + const u8 *matchptr; + unsigned rep_max_len; + unsigned rep_max_idx; + unsigned rep_len; - if (cost < cur_optimum_ptr->cost) { - /* Literal: queue remains unchanged. */ - cur_optimum_ptr->cost = cost; - cur_optimum_ptr->mc_item_data = (literal << MC_OFFSET_SHIFT) | 1; - cur_optimum_ptr->queue = (cur_optimum_ptr - 1)->queue; - } else { - /* Match: queue update is needed. 
*/ - len = cur_optimum_ptr->mc_item_data & MC_LEN_MASK; - offset_data = cur_optimum_ptr->mc_item_data >> MC_OFFSET_SHIFT; - if (offset_data >= LZX_NUM_RECENT_OFFSETS) { - /* Explicit offset match: offset is inserted at front */ - cur_optimum_ptr->queue.R[0] = offset_data - LZX_OFFSET_OFFSET; - cur_optimum_ptr->queue.R[1] = (cur_optimum_ptr - len)->queue.R[0]; - cur_optimum_ptr->queue.R[2] = (cur_optimum_ptr - len)->queue.R[1]; - } else { - /* Repeat offset match: offset is swapped to front */ - cur_optimum_ptr->queue = (cur_optimum_ptr - len)->queue; - swap(cur_optimum_ptr->queue.R[0], - cur_optimum_ptr->queue.R[offset_data]); - } + matchptr = in_next - lzx_lru_queue_pop(&queue); + if (load_u16_unaligned(matchptr) == next_2_bytes) + rep_max_len = lz_extend(in_next, matchptr, 2, max_len); + else + rep_max_len = 0; + rep_max_idx = 0; + + matchptr = in_next - lzx_lru_queue_pop(&queue); + if (load_u16_unaligned(matchptr) == next_2_bytes) { + rep_len = lz_extend(in_next, matchptr, 2, max_len); + if (rep_len > rep_max_len) { + rep_max_len = rep_len; + rep_max_idx = 1; } + } - /* - * This loop will terminate when either of the following - * conditions is true: - * - * (1) cur_optimum_ptr == end_optimum_ptr - * - * There are no paths that extend beyond the current - * position. In this case, any path to a later position - * must pass through the current position, so we can go - * ahead and choose the list of items that led to this - * position. - * - * (2) cur_optimum_ptr == &c->optimum[LZX_OPTIM_ARRAY_LENGTH] - * - * This bounds the number of times the algorithm can step - * forward before it is guaranteed to start choosing items. - * This limits the memory usage. But - * LZX_OPTIM_ARRAY_LENGTH is high enough that on most - * inputs this limit is never reached. - * - * Note: no check for end-of-block is needed because - * end-of-block will trigger condition (1). - */ - if (cur_optimum_ptr == end_optimum_ptr || - cur_optimum_ptr == &c->optimum[LZX_OPTIM_ARRAY_LENGTH]) - { - begin_queue = &cur_optimum_ptr->queue; - break; + matchptr = in_next - lzx_lru_queue_pop(&queue); + if (load_u16_unaligned(matchptr) == next_2_bytes) { + rep_len = lz_extend(in_next, matchptr, 2, max_len); + if (rep_len > rep_max_len) { + rep_max_len = rep_len; + rep_max_idx = 2; } } - /* Choose the current list of items that constitute the minimum-cost - * path to the current position. */ - lzx_declare_item_list(c, cur_optimum_ptr, next_chosen_item); - goto begin; + *rep_max_idx_ret = rep_max_idx; + return rep_max_len; } /* Fast heuristic scoring for lazy parsing: how "good" is this match? */ @@ -1721,300 +1716,212 @@ lzx_explicit_offset_match_score(unsigned len, u32 adjusted_offset) { unsigned score = len; - if (adjusted_offset < 2048) + if (adjusted_offset < 4096) score++; - if (adjusted_offset < 1024) + if (adjusted_offset < 256) score++; return score; } static inline unsigned -lzx_repeat_offset_match_score(unsigned len, unsigned slot) +lzx_repeat_offset_match_score(unsigned rep_len, unsigned rep_idx) { - return len + 3; + return rep_len + 3; } -/* Lazy parsing */ -static u32 -lzx_choose_lazy_items_for_block(struct lzx_compressor *c, - u32 block_start_pos, u32 block_size) +/* This is the "lazy" LZX compressor. 
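+ *
+ * Roughly (a sketch of the loop below; the exact thresholds are in the code):
+ * at each position it finds the longest match, and unless that match is
+ * already long enough to take immediately, it also checks the next position
+ * before committing:
+ *
+ *     cur  = longest_match(pos);
+ *     next = longest_match(pos + 1);
+ *     if (next is better than cur)
+ *         emit a literal at 'pos' and continue from 'pos + 1';
+ *     else
+ *         emit 'cur' and skip past it;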
*/ +static void +lzx_compress_lazy(struct lzx_compressor *c, struct lzx_output_bitstream *os) { - const u8 *window_ptr; - const u8 *block_end; - struct lz_mf *mf; - struct lz_match *matches; - unsigned num_matches; - unsigned cur_len; - u32 cur_offset_data; - unsigned cur_score; - unsigned rep_max_len; - unsigned rep_max_idx; - unsigned rep_score; - unsigned prev_len; - unsigned prev_score; - u32 prev_offset_data; - unsigned skip_len; - struct lzx_item *next_chosen_item; - - window_ptr = &c->cur_window[block_start_pos]; - block_end = window_ptr + block_size; - matches = c->cached_matches; - mf = c->mf; - next_chosen_item = c->chosen_items; - - prev_len = 0; - prev_offset_data = 0; - prev_score = 0; - - while (window_ptr != block_end) { + const u8 * const in_begin = c->in_buffer; + const u8 * in_next = in_begin; + const u8 * const in_end = in_begin + c->in_nbytes; + unsigned max_len = LZX_MAX_MATCH_LEN; + unsigned nice_len = min(c->nice_match_length, max_len); + struct lzx_lru_queue queue; - /* Find explicit offset matches with the current position. */ - num_matches = lz_mf_get_matches(mf, matches); - window_ptr++; + hc_matchfinder_init(&c->hc_mf); + lzx_lru_queue_init(&queue); - if (num_matches == 0 || - (matches[num_matches - 1].len == 3 && - matches[num_matches - 1].offset >= 8192 - LZX_OFFSET_OFFSET && - matches[num_matches - 1].offset != c->queue.R[0] && - matches[num_matches - 1].offset != c->queue.R[1] && - matches[num_matches - 1].offset != c->queue.R[2])) - { - /* No match found, or the only match found was a distant - * length 3 match. Output the previous match if there - * is one; otherwise output a literal. */ + do { + /* Starting a new block */ + + const u8 * const in_block_begin = in_next; + const u8 * const in_block_end = + in_next + min(LZX_DIV_BLOCK_SIZE, in_end - in_next); + struct lzx_item *next_chosen_item = c->chosen_items; + unsigned cur_len; + u32 cur_offset; + u32 cur_offset_data; + unsigned cur_score; + unsigned next_len; + u32 next_offset; + u32 next_offset_data; + unsigned next_score; + unsigned rep_max_len; + unsigned rep_max_idx; + unsigned rep_score; + unsigned skip_len; + + lzx_reset_symbol_frequencies(c); - no_match_found: + do { + if (unlikely(max_len > in_end - in_next)) { + max_len = in_end - in_next; + nice_len = min(max_len, nice_len); + } - if (prev_len) { - skip_len = prev_len - 2; - goto output_prev_match; - } else { - lzx_declare_literal(c, *(window_ptr - 1), + /* Find the longest match at the current position. */ + + cur_len = hc_matchfinder_longest_match(&c->hc_mf, + in_begin, + in_next, + 2, + max_len, + nice_len, + c->max_search_depth, + &cur_offset); + if (cur_len < 3 || + (cur_len == 3 && + cur_offset >= 8192 - LZX_OFFSET_ADJUSTMENT && + cur_offset != lzx_lru_queue_R0(queue) && + cur_offset != lzx_lru_queue_R1(queue) && + cur_offset != lzx_lru_queue_R2(queue))) + { + /* There was no match found, or the only match found + * was a distant length 3 match. Output a literal. */ + lzx_declare_literal(c, *in_next++, &next_chosen_item); continue; } - } - - /* Find the longest repeat offset match with the current - * position. 
*/ - if (likely(block_end - (window_ptr - 1) >= 2)) { - rep_max_len = lzx_repsearch((window_ptr - 1), - block_end - (window_ptr - 1), - &c->queue, &rep_max_idx); - } else { - rep_max_len = 0; - } - - cur_len = matches[num_matches - 1].len; - cur_offset_data = matches[num_matches - 1].offset + LZX_OFFSET_OFFSET; - cur_score = lzx_explicit_offset_match_score(cur_len, cur_offset_data); - - /* Select the better of the explicit and repeat offset matches. */ - if (rep_max_len >= 3 && - (rep_score = lzx_repeat_offset_match_score(rep_max_len, - rep_max_idx)) >= cur_score) - { - cur_len = rep_max_len; - cur_offset_data = rep_max_idx; - cur_score = rep_score; - } - - if (unlikely(cur_len > block_end - (window_ptr - 1))) { - /* Nearing end of block. */ - cur_len = block_end - (window_ptr - 1); - if (cur_len < 3) - goto no_match_found; - } - - if (prev_len == 0 || cur_score > prev_score) { - /* No previous match, or the current match is better - * than the previous match. - * - * If there's a previous match, then output a literal in - * its place. - * - * In both cases, if the current match is very long, - * then output it immediately. Otherwise, attempt a - * lazy match by waiting to see if there's a better - * match at the next position. */ - if (prev_len) - lzx_declare_literal(c, *(window_ptr - 2), &next_chosen_item); - - prev_len = cur_len; - prev_offset_data = cur_offset_data; - prev_score = cur_score; - - if (prev_len >= c->params.nice_match_length) { - skip_len = prev_len - 1; - goto output_prev_match; + if (cur_offset == lzx_lru_queue_R0(queue)) { + in_next++; + cur_offset_data = 0; + skip_len = cur_len - 1; + goto choose_cur_match; } - continue; - } - - /* Current match is not better than the previous match, so - * output the previous match. */ - - skip_len = prev_len - 2; - - output_prev_match: - if (prev_offset_data < LZX_NUM_RECENT_OFFSETS) { - lzx_declare_repeat_offset_match(c, prev_len, - prev_offset_data, - &next_chosen_item); - swap(c->queue.R[0], c->queue.R[prev_offset_data]); - } else { - lzx_declare_explicit_offset_match(c, prev_len, - prev_offset_data - LZX_OFFSET_OFFSET, - &next_chosen_item); - c->queue.R[2] = c->queue.R[1]; - c->queue.R[1] = c->queue.R[0]; - c->queue.R[0] = prev_offset_data - LZX_OFFSET_OFFSET; - } - lz_mf_skip_positions(mf, skip_len); - window_ptr += skip_len; - prev_len = 0; - } - - return next_chosen_item - c->chosen_items; -} - -/* Given the frequencies of symbols in an LZX-compressed block and the - * corresponding Huffman codes, return LZX_BLOCKTYPE_ALIGNED or - * LZX_BLOCKTYPE_VERBATIM if an aligned offset or verbatim block, respectively, - * will take fewer bits to output. */ -static int -lzx_choose_verbatim_or_aligned(const struct lzx_freqs * freqs, - const struct lzx_codes * codes) -{ - u32 aligned_cost = 0; - u32 verbatim_cost = 0; - - /* A verbatim block requires 3 bits in each place that an aligned symbol - * would be used in an aligned offset block. */ - for (unsigned i = 0; i < LZX_ALIGNEDCODE_NUM_SYMBOLS; i++) { - verbatim_cost += 3 * freqs->aligned[i]; - aligned_cost += codes->lens.aligned[i] * freqs->aligned[i]; - } - - /* Account for output of the aligned offset code. 
*/ - aligned_cost += LZX_ALIGNEDCODE_ELEMENT_SIZE * LZX_ALIGNEDCODE_NUM_SYMBOLS; - - if (aligned_cost < verbatim_cost) - return LZX_BLOCKTYPE_ALIGNED; - else - return LZX_BLOCKTYPE_VERBATIM; -} - -/* Near-optimal parsing */ -static u32 -lzx_choose_near_optimal_items_for_block(struct lzx_compressor *c, - u32 block_start_pos, u32 block_size) -{ - u32 num_passes_remaining = c->params.num_optim_passes; - struct lzx_lru_queue orig_queue; - struct lzx_item *next_chosen_item; - struct lzx_item **next_chosen_item_ptr; - - /* Choose appropriate match-finder wrapper functions. */ - if (num_passes_remaining > 1) { - if (block_size == c->cur_window_size) - c->get_matches_func = lzx_get_matches_fillcache_singleblock; - else - c->get_matches_func = lzx_get_matches_fillcache_multiblock; - c->skip_bytes_func = lzx_skip_bytes_fillcache; - } else { - if (block_size == c->cur_window_size) - c->get_matches_func = lzx_get_matches_nocache_singleblock; - else - c->get_matches_func = lzx_get_matches_nocache_multiblock; - c->skip_bytes_func = lzx_skip_bytes_nocache; - } - /* No matches will extend beyond the end of the block. */ - c->match_window_end = block_start_pos + block_size; + cur_offset_data = cur_offset + LZX_OFFSET_ADJUSTMENT; + cur_score = lzx_explicit_offset_match_score(cur_len, cur_offset_data); + + /* Consider a repeat offset match */ + rep_max_len = lzx_find_longest_repeat_offset_match(in_next, + in_end - in_next, + queue, + &rep_max_idx); + in_next++; + + if (rep_max_len >= 3 && + (rep_score = lzx_repeat_offset_match_score(rep_max_len, + rep_max_idx)) >= cur_score) + { + cur_len = rep_max_len; + cur_offset_data = rep_max_idx; + skip_len = rep_max_len - 1; + goto choose_cur_match; + } - /* The first optimization pass will use a default cost model. Each - * additional optimization pass will use a cost model computed from the - * previous pass. - * - * To improve performance we only generate the array containing the - * matches and literals in intermediate form on the final pass. For - * earlier passes, tallying symbol frequencies is sufficient. */ - lzx_set_default_costs(&c->costs, c->num_main_syms); + have_cur_match: - next_chosen_item_ptr = NULL; - orig_queue = c->queue; - do { - /* Reset the match-finder wrapper. */ - c->match_window_pos = block_start_pos; - c->cache_ptr = c->cached_matches; - - if (num_passes_remaining == 1) { - /* Last pass: actually generate the items. */ - next_chosen_item = c->chosen_items; - next_chosen_item_ptr = &next_chosen_item; - } + /* We have a match at the current position. */ - /* Choose the items. */ - lzx_optim_pass(c, next_chosen_item_ptr); + /* If we have a very long match, choose it immediately. */ + if (cur_len >= nice_len) { + skip_len = cur_len - 1; + goto choose_cur_match; + } - if (num_passes_remaining > 1) { - /* This isn't the last pass. */ + /* See if there's a better match at the next position. */ - /* Make the Huffman codes from the symbol frequencies. */ - lzx_make_huffman_codes(&c->freqs, &c->codes[c->codes_index], - c->num_main_syms); + if (unlikely(max_len > in_end - in_next)) { + max_len = in_end - in_next; + nice_len = min(max_len, nice_len); + } - /* Update symbol costs. */ - lzx_set_costs(c, &c->codes[c->codes_index].lens); + next_len = hc_matchfinder_longest_match(&c->hc_mf, + in_begin, + in_next, + cur_len - 2, + max_len, + nice_len, + c->max_search_depth / 2, + &next_offset); + + if (next_len <= cur_len - 2) { + in_next++; + skip_len = cur_len - 2; + goto choose_cur_match; + } - /* Reset symbol frequencies. 
*/ - memset(&c->freqs, 0, sizeof(c->freqs)); + next_offset_data = next_offset + LZX_OFFSET_ADJUSTMENT; + next_score = lzx_explicit_offset_match_score(next_len, next_offset_data); + + rep_max_len = lzx_find_longest_repeat_offset_match(in_next, + in_end - in_next, + queue, + &rep_max_idx); + in_next++; + + if (rep_max_len >= 3 && + (rep_score = lzx_repeat_offset_match_score(rep_max_len, + rep_max_idx)) >= next_score) + { + + if (rep_score > cur_score) { + /* The next match is better, and it's a + * repeat offset match. */ + lzx_declare_literal(c, *(in_next - 2), + &next_chosen_item); + cur_len = rep_max_len; + cur_offset_data = rep_max_idx; + skip_len = cur_len - 1; + goto choose_cur_match; + } + } else { + if (next_score > cur_score) { + /* The next match is better, and it's an + * explicit offset match. */ + lzx_declare_literal(c, *(in_next - 2), + &next_chosen_item); + cur_len = next_len; + cur_offset_data = next_offset_data; + cur_score = next_score; + goto have_cur_match; + } + } - /* Reset the match offset LRU queue to what it was at - * the beginning of the block. */ - c->queue = orig_queue; + /* The original match was better. */ + skip_len = cur_len - 2; - /* Choose appopriate match-finder wrapper functions. */ - if (c->cache_ptr <= c->cache_limit) { - c->get_matches_func = lzx_get_matches_usecache_nocheck; - c->skip_bytes_func = lzx_skip_bytes_usecache_nocheck; + choose_cur_match: + if (cur_offset_data < LZX_NUM_RECENT_OFFSETS) { + lzx_declare_repeat_offset_match(c, cur_len, + cur_offset_data, + &next_chosen_item); + queue = lzx_lru_queue_swap(queue, cur_offset_data); } else { - c->get_matches_func = lzx_get_matches_usecache; - c->skip_bytes_func = lzx_skip_bytes_usecache; + lzx_declare_explicit_offset_match(c, cur_len, + cur_offset_data - LZX_OFFSET_ADJUSTMENT, + &next_chosen_item); + queue = lzx_lru_queue_push(queue, cur_offset_data - LZX_OFFSET_ADJUSTMENT); } - } - } while (--num_passes_remaining); - /* Return the number of items chosen. */ - return next_chosen_item - c->chosen_items; -} - -/* - * Choose the matches/literals with which to output the block of data beginning - * at '&c->cur_window[block_start_pos]' and extending for 'block_size' bytes. - * - * The frequences of the Huffman symbols in the block will be tallied in - * 'c->freqs'. - * - * 'c->queue' must specify the state of the queue at the beginning of this block. - * This function will update it to the state of the queue at the end of this - * block. - * - * Returns the number of matches/literals that were chosen and written to - * 'c->chosen_items' in the 'struct lzx_item' intermediate representation. - */ -static u32 -lzx_choose_items_for_block(struct lzx_compressor *c, - u32 block_start_pos, u32 block_size) -{ - return (*c->params.choose_items_for_block)(c, block_start_pos, block_size); + hc_matchfinder_skip_positions(&c->hc_mf, + in_begin, + in_next, + in_end, + skip_len); + in_next += skip_len; + } while (in_next < in_block_end); + + lzx_finish_block(c, os, in_next - in_block_begin, + next_chosen_item - c->chosen_items); + } while (in_next != in_end); } -/* Initialize c->offset_slot_fast. 
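
The lzx_lru_queue_swap()/lzx_lru_queue_push() calls in choose_cur_match above
implement the usual LZX recent-offsets bookkeeping.  An array-based sketch of
those semantics follows; the new compressor's actual by-value queue
representation is not shown in this hunk, and the decompressor's three-entry
struct appears later in this patch.

    #include <stdint.h>

    struct lru_queue {
            uint32_t R[3];
    };

    static void
    lru_queue_init(struct lru_queue *q)
    {
            for (unsigned i = 0; i < 3; i++)
                    q->R[i] = 1;    /* LZX's initial recent offsets are all 1 */
    }

    /* Explicit offset match: the new offset is pushed onto the front and the
     * oldest entry falls off the end. */
    static void
    lru_queue_push(struct lru_queue *q, uint32_t offset)
    {
            q->R[2] = q->R[1];
            q->R[1] = q->R[0];
            q->R[0] = offset;
    }

    /* Repeat offset match: the matched slot is swapped to the front rather
     * than shifted, mirroring the swap done in the old compressor code. */
    static void
    lru_queue_swap(struct lru_queue *q, unsigned idx)
    {
            uint32_t tmp = q->R[0];

            q->R[0] = q->R[idx];
            q->R[idx] = tmp;
    }
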
*/ static void lzx_init_offset_slot_fast(struct lzx_compressor *c) { @@ -2022,292 +1929,146 @@ lzx_init_offset_slot_fast(struct lzx_compressor *c) for (u32 offset = 0; offset < LZX_NUM_FAST_OFFSETS; offset++) { - while (offset + LZX_OFFSET_OFFSET >= lzx_offset_slot_base[slot + 1]) + while (offset + LZX_OFFSET_ADJUSTMENT >= lzx_offset_slot_base[slot + 1]) slot++; c->offset_slot_fast[offset] = slot; } } -/* Set internal compression parameters for the specified compression level and - * maximum window size. */ -static void -lzx_build_params(unsigned int compression_level, u32 max_window_size, - struct lzx_compressor_params *lzx_params) +static size_t +lzx_get_compressor_size(size_t max_bufsize, unsigned compression_level) { - if (compression_level < 25) { - - /* Fast compression: Use lazy parsing. */ - - lzx_params->choose_items_for_block = lzx_choose_lazy_items_for_block; - lzx_params->num_optim_passes = 1; - - /* When lazy parsing, the hash chain match-finding algorithm is - * fastest unless the window is too large. - * - * TODO: something like hash arrays would actually be better - * than binary trees on large windows. */ - if (max_window_size <= 262144) - lzx_params->mf_algo = LZ_MF_HASH_CHAINS; - else - lzx_params->mf_algo = LZ_MF_BINARY_TREES; - - /* When lazy parsing, don't bother with length 2 matches. */ - lzx_params->min_match_length = 3; - - /* Scale nice_match_length and max_search_depth with the - * compression level. */ - lzx_params->nice_match_length = 25 + compression_level * 2; - lzx_params->max_search_depth = 25 + compression_level; + if (compression_level <= LZX_MAX_FAST_LEVEL) { + return offsetof(struct lzx_compressor, hc_mf) + + hc_matchfinder_size(max_bufsize); } else { - - /* Normal / high compression: Use near-optimal parsing. */ - - lzx_params->choose_items_for_block = lzx_choose_near_optimal_items_for_block; - - /* Set a number of optimization passes appropriate for the - * compression level. */ - - lzx_params->num_optim_passes = 1; - - if (compression_level >= 40) - lzx_params->num_optim_passes++; - - /* Use more optimization passes for higher compression levels. - * But the more passes there are, the less they help --- so - * don't add them linearly. */ - if (compression_level >= 70) { - lzx_params->num_optim_passes++; - if (compression_level >= 100) - lzx_params->num_optim_passes++; - if (compression_level >= 150) - lzx_params->num_optim_passes++; - if (compression_level >= 200) - lzx_params->num_optim_passes++; - if (compression_level >= 300) - lzx_params->num_optim_passes++; - } - - /* When doing near-optimal parsing, the hash chain match-finding - * algorithm is good if the window size is small and we're only - * doing one optimization pass. Otherwise, the binary tree - * algorithm is the way to go. */ - if (max_window_size <= 32768 && lzx_params->num_optim_passes == 1) - lzx_params->mf_algo = LZ_MF_HASH_CHAINS; - else - lzx_params->mf_algo = LZ_MF_BINARY_TREES; - - /* When doing near-optimal parsing, allow length 2 matches if - * the compression level is sufficiently high. */ - if (compression_level >= 45) - lzx_params->min_match_length = 2; - else - lzx_params->min_match_length = 3; - - /* Scale nice_match_length and max_search_depth with the - * compression level. 
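
lzx_init_offset_slot_fast() precomputes an offset-to-slot table so the hot
path does a table load instead of a search.  A minimal sketch of that
table-building idea, with made-up slot bases and table size (not the real
lzx_offset_slot_base[] contents):

    #include <stdio.h>

    #define NUM_FAST_OFFSETS 32     /* illustrative */

    /* Minimum value belonging to each slot (illustrative).  The final entry
     * is a sentinel larger than any fast offset. */
    static const unsigned slot_base[] = { 0, 1, 2, 3, 4, 6, 8, 12, 16, 24, 1000 };

    static unsigned char offset_slot_fast[NUM_FAST_OFFSETS];

    static void
    init_offset_slot_fast(void)
    {
            unsigned slot = 0;

            /* Offsets are visited in increasing order, so 'slot' only ever
             * advances. */
            for (unsigned offset = 0; offset < NUM_FAST_OFFSETS; offset++) {
                    while (offset >= slot_base[slot + 1])
                            slot++;
                    offset_slot_fast[offset] = slot;
            }
    }

    int
    main(void)
    {
            init_offset_slot_fast();
            printf("offset 5 -> slot %u\n", offset_slot_fast[5]);   /* slot 4 */
            printf("offset 20 -> slot %u\n", offset_slot_fast[20]); /* slot 8 */
            return 0;
    }
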
*/ - lzx_params->nice_match_length = min(((u64)compression_level * 32) / 50, - LZX_MAX_MATCH_LEN); - lzx_params->max_search_depth = min(((u64)compression_level * 50) / 50, - LZX_MAX_MATCH_LEN); + return offsetof(struct lzx_compressor, bt_mf) + + bt_matchfinder_size(max_bufsize); } } -/* Given the internal compression parameters and maximum window size, build the - * Lempel-Ziv match-finder parameters. */ -static void -lzx_build_mf_params(const struct lzx_compressor_params *lzx_params, - u32 max_window_size, struct lz_mf_params *mf_params) -{ - memset(mf_params, 0, sizeof(*mf_params)); - - mf_params->algorithm = lzx_params->mf_algo; - mf_params->max_window_size = max_window_size; - mf_params->min_match_len = lzx_params->min_match_length; - mf_params->max_match_len = LZX_MAX_MATCH_LEN; - mf_params->max_search_depth = lzx_params->max_search_depth; - mf_params->nice_match_len = lzx_params->nice_match_length; -} - -static void -lzx_free_compressor(void *_c); - static u64 -lzx_get_needed_memory(size_t max_block_size, unsigned int compression_level) +lzx_get_needed_memory(size_t max_bufsize, unsigned compression_level) { - struct lzx_compressor_params params; u64 size = 0; - unsigned window_order; - u32 max_window_size; - window_order = lzx_get_window_order(max_block_size); - if (window_order == 0) + if (max_bufsize > LZX_MAX_WINDOW_SIZE) return 0; - max_window_size = max_block_size; - lzx_build_params(compression_level, max_window_size, ¶ms); - - size += sizeof(struct lzx_compressor); - - /* cur_window */ - size += max_window_size; - - /* mf */ - size += lz_mf_get_needed_memory(params.mf_algo, max_window_size); - - /* cached_matches */ - if (params.num_optim_passes > 1) - size += LZX_CACHE_LEN * sizeof(struct lz_match); - else - size += LZX_MAX_MATCHES_PER_POS * sizeof(struct lz_match); + size += lzx_get_compressor_size(max_bufsize, compression_level); + size += max_bufsize; /* in_buffer */ return size; } static int -lzx_create_compressor(size_t max_block_size, unsigned int compression_level, +lzx_create_compressor(size_t max_bufsize, unsigned compression_level, void **c_ret) { - struct lzx_compressor *c; - struct lzx_compressor_params params; - struct lz_mf_params mf_params; unsigned window_order; - u32 max_window_size; + struct lzx_compressor *c; - window_order = lzx_get_window_order(max_block_size); + window_order = lzx_get_window_order(max_bufsize); if (window_order == 0) return WIMLIB_ERR_INVALID_PARAM; - max_window_size = max_block_size; - - lzx_build_params(compression_level, max_window_size, ¶ms); - lzx_build_mf_params(¶ms, max_window_size, &mf_params); - if (!lz_mf_params_valid(&mf_params)) - return WIMLIB_ERR_INVALID_PARAM; - c = CALLOC(1, sizeof(struct lzx_compressor)); + c = ALIGNED_MALLOC(lzx_get_compressor_size(max_bufsize, + compression_level), + MATCHFINDER_ALIGNMENT); if (!c) - goto oom; + goto oom0; - c->params = params; c->num_main_syms = lzx_get_num_main_syms(window_order); c->window_order = window_order; - /* The window is allocated as 16-byte aligned to speed up memcpy() and - * enable lzx_e8_filter() optimization on x86_64. 
*/ - c->cur_window = ALIGNED_MALLOC(max_window_size, 16); - if (!c->cur_window) - goto oom; - - c->mf = lz_mf_alloc(&mf_params); - if (!c->mf) - goto oom; - - if (params.num_optim_passes > 1) { - c->cached_matches = MALLOC(LZX_CACHE_LEN * - sizeof(struct lz_match)); - if (!c->cached_matches) - goto oom; - c->cache_limit = c->cached_matches + LZX_CACHE_LEN - - (LZX_MAX_MATCHES_PER_POS + 1); + c->in_buffer = MALLOC(max_bufsize); + if (!c->in_buffer) + goto oom1; + + if (compression_level <= LZX_MAX_FAST_LEVEL) { + + /* Fast compression: Use lazy parsing. */ + + c->impl = lzx_compress_lazy; + c->max_search_depth = (36 * compression_level) / 20; + c->nice_match_length = min((72 * compression_level) / 20, + LZX_MAX_MATCH_LEN); + } else { - c->cached_matches = MALLOC(LZX_MAX_MATCHES_PER_POS * - sizeof(struct lz_match)); - if (!c->cached_matches) - goto oom; + + /* Normal / high compression: Use near-optimal parsing. */ + + c->impl = lzx_compress_near_optimal; + + /* Scale nice_match_length and max_search_depth with the + * compression level. */ + c->max_search_depth = (24 * compression_level) / 50; + c->nice_match_length = min((32 * compression_level) / 50, + LZX_MAX_MATCH_LEN); + + /* Set a number of optimization passes appropriate for the + * compression level. */ + + c->num_optim_passes = 1; + + if (compression_level >= 45) + c->num_optim_passes++; + + /* Use more optimization passes for higher compression levels. + * But the more passes there are, the less they help --- so + * don't add them linearly. */ + if (compression_level >= 70) { + c->num_optim_passes++; + if (compression_level >= 100) + c->num_optim_passes++; + if (compression_level >= 150) + c->num_optim_passes++; + if (compression_level >= 200) + c->num_optim_passes++; + if (compression_level >= 300) + c->num_optim_passes++; + } + + c->cache_overflow_mark = &c->match_cache[LZX_CACHE_LEN]; } lzx_init_offset_slot_fast(c); - *c_ret = c; return 0; -oom: - lzx_free_compressor(c); +oom1: + ALIGNED_FREE(c); +oom0: return WIMLIB_ERR_NOMEM; } static size_t -lzx_compress(const void *uncompressed_data, size_t uncompressed_size, - void *compressed_data, size_t compressed_size_avail, void *_c) +lzx_compress(const void *in, size_t in_nbytes, + void *out, size_t out_nbytes_avail, void *_c) { struct lzx_compressor *c = _c; struct lzx_output_bitstream os; - u32 num_chosen_items; - const struct lzx_lens *prev_lens; - u32 block_start_pos; - u32 block_size; - int block_type; - /* Don't bother compressing very small inputs. */ - if (uncompressed_size < 100) + /* Don't bother trying to compress very small inputs. */ + if (in_nbytes < 100) return 0; - /* The input data must be preprocessed. To avoid changing the original - * input data, copy it to a temporary buffer. */ - memcpy(c->cur_window, uncompressed_data, uncompressed_size); - c->cur_window_size = uncompressed_size; - - /* Preprocess the data. */ - lzx_do_e8_preprocessing(c->cur_window, c->cur_window_size); - - /* Load the window into the match-finder. */ - lz_mf_load_window(c->mf, c->cur_window, c->cur_window_size); - - /* Initialize the match offset LRU queue. */ - lzx_lru_queue_init(&c->queue); - - /* Initialize the output bitstream. */ - lzx_init_output(&os, compressed_data, compressed_size_avail); + /* Copy the input data into the internal buffer and preprocess it. */ + memcpy(c->in_buffer, in, in_nbytes); + c->in_nbytes = in_nbytes; + lzx_do_e8_preprocessing(c->in_buffer, in_nbytes); - /* Compress the data block by block. 
- * - * TODO: The compression ratio could be slightly improved by performing - * data-dependent block splitting instead of using fixed-size blocks. - * Doing so well is a computationally hard problem, however. */ - block_start_pos = 0; + /* Initially, the previous Huffman codeword lengths are all zeroes. */ c->codes_index = 0; - prev_lens = &c->zero_lens; - do { - /* Compute the block size. */ - block_size = min(LZX_DIV_BLOCK_SIZE, - uncompressed_size - block_start_pos); - - /* Reset symbol frequencies. */ - memset(&c->freqs, 0, sizeof(c->freqs)); + memset(&c->codes[1].lens, 0, sizeof(struct lzx_lens)); - /* Prepare the matches/literals for the block. */ - num_chosen_items = lzx_choose_items_for_block(c, - block_start_pos, - block_size); - - /* Make the Huffman codes from the symbol frequencies. */ - lzx_make_huffman_codes(&c->freqs, &c->codes[c->codes_index], - c->num_main_syms); - - /* Choose the best block type. - * - * Note: we currently don't consider uncompressed blocks. */ - block_type = lzx_choose_verbatim_or_aligned(&c->freqs, - &c->codes[c->codes_index]); - - /* Write the compressed block to the output buffer. */ - lzx_write_compressed_block(block_type, - block_size, - c->window_order, - c->num_main_syms, - c->chosen_items, - num_chosen_items, - &c->codes[c->codes_index], - prev_lens, - &os); - - /* The current codeword lengths become the previous lengths. */ - prev_lens = &c->codes[c->codes_index].lens; - c->codes_index ^= 1; - - block_start_pos += block_size; + /* Initialize the output bitstream. */ + lzx_init_output(&os, out, out_nbytes_avail); - } while (block_start_pos != uncompressed_size); + /* Call the compression level-specific compress() function. */ + (*c->impl)(c, &os); + /* Flush the output bitstream and return the compressed size or 0. */ return lzx_flush_output(&os); } @@ -2316,12 +2077,8 @@ lzx_free_compressor(void *_c) { struct lzx_compressor *c = _c; - if (c) { - ALIGNED_FREE(c->cur_window); - lz_mf_free(c->mf); - FREE(c->cached_matches); - FREE(c); - } + FREE(c->in_buffer); + ALIGNED_FREE(c); } const struct compressor_ops lzx_compressor_ops = { diff --git a/src/lzx_decompress.c b/src/lzx_decompress.c index 57b32890..67fbea94 100644 --- a/src/lzx_decompress.c +++ b/src/lzx_decompress.c @@ -5,7 +5,7 @@ */ /* - * Copyright (C) 2012, 2013, 2014 Eric Biggers + * Copyright (C) 2012, 2013, 2014, 2015 Eric Biggers * * This file is free software; you can redistribute it and/or modify it under * the terms of the GNU Lesser General Public License as published by the Free @@ -91,6 +91,18 @@ struct lzx_tables { u8 alignedcode_lens[LZX_ALIGNEDCODE_NUM_SYMBOLS]; } _aligned_attribute(DECODE_TABLE_ALIGNMENT); +/* Least-recently used queue for match offsets. */ +struct lzx_lru_queue { + u32 R[LZX_NUM_RECENT_OFFSETS]; +}; + +static inline void +lzx_lru_queue_init(struct lzx_lru_queue *queue) +{ + for (unsigned i = 0; i < LZX_NUM_RECENT_OFFSETS; i++) + queue->R[i] = 1; +} + /* The main LZX decompressor structure. * * Note: we keep track of most of the decompression state outside this @@ -443,15 +455,15 @@ lzx_decompress_block(int block_type, u32 block_size, /* Decode the length header and offset slot. */ mainsym -= LZX_NUM_CHARS; - match_len = mainsym & 0x7; - offset_slot = mainsym >> 3; + match_len = mainsym % LZX_NUM_LEN_HEADERS; + offset_slot = mainsym / LZX_NUM_LEN_HEADERS; /* If needed, read a length symbol to decode the full length. 
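
The rewritten length-header/offset-slot decode just above is equivalent to the
old masking and shifting, since LZX_NUM_LEN_HEADERS is 8 ('% 8' is '& 0x7' and
'/ 8' is '>> 3' for unsigned operands).  A small worked example using the
standard LZX constant values:

    #include <stdio.h>

    #define LZX_NUM_CHARS           256
    #define LZX_NUM_LEN_HEADERS     8
    #define LZX_NUM_PRIMARY_LENS    7   /* header 7 means "read a length symbol" */
    #define LZX_MIN_MATCH_LEN       2

    int
    main(void)
    {
            /* Example: main symbol 287 is a match symbol. */
            unsigned mainsym = 287;
            unsigned sym = mainsym - LZX_NUM_CHARS;             /* 31 */
            unsigned match_len = sym % LZX_NUM_LEN_HEADERS;     /* 7, same as (sym & 0x7) */
            unsigned offset_slot = sym / LZX_NUM_LEN_HEADERS;   /* 3, same as (sym >> 3) */

            if (match_len == LZX_NUM_PRIMARY_LENS)
                    printf("offset slot %u; length comes from the length code\n",
                           offset_slot);
            else
                    printf("offset slot %u; match length %u\n",
                           offset_slot, match_len + LZX_MIN_MATCH_LEN);
            return 0;
    }
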
*/ - if (match_len == 0x7) + if (match_len == LZX_NUM_PRIMARY_LENS) match_len += read_huffsym_using_lencode(istream, tables); match_len += LZX_MIN_MATCH_LEN; - if (offset_slot <= 2) { + if (offset_slot < LZX_NUM_RECENT_OFFSETS) { /* Repeat offset */ /* Note: This isn't a real LRU queue, since using the R2 @@ -475,18 +487,22 @@ lzx_decompress_block(int block_type, u32 block_size, * each offset are encoded using the aligned offset * code. Otherwise, all the extra bits are literal. */ - /*if (block_type == LZX_BLOCKTYPE_ALIGNED && num_extra_bits >= 3) {*/ - if ((num_extra_bits & ones_if_aligned) >= 3) { - match_offset += bitstream_read_bits(istream, num_extra_bits - 3) << 3; + if ((num_extra_bits & ones_if_aligned) >= LZX_NUM_ALIGNED_OFFSET_BITS) { + match_offset += + bitstream_read_bits(istream, + num_extra_bits - + LZX_NUM_ALIGNED_OFFSET_BITS) + << LZX_NUM_ALIGNED_OFFSET_BITS; match_offset += read_huffsym_using_alignedcode(istream, tables); } else { match_offset += bitstream_read_bits(istream, num_extra_bits); } /* Adjust the offset. */ - match_offset -= LZX_OFFSET_OFFSET; + match_offset -= LZX_OFFSET_ADJUSTMENT; /* Update the match offset LRU queue. */ + BUILD_BUG_ON(LZX_NUM_RECENT_OFFSETS != 3); queue->R[2] = queue->R[1]; queue->R[1] = queue->R[0]; queue->R[0] = match_offset; diff --git a/src/xpress_compress.c b/src/xpress_compress.c index 04105ae0..65c4b026 100644 --- a/src/xpress_compress.c +++ b/src/xpress_compress.c @@ -46,19 +46,18 @@ #define MIN_LEVEL_FOR_NEAR_OPTIMAL 60 /* - * The window order for the matchfinder. This must be the base 2 logarithm of - * the maximum buffer size. + * The maximum window order for the matchfinder. This must be the base 2 + * logarithm of the maximum buffer size. */ -#define MATCHFINDER_WINDOW_ORDER 16 +#define MATCHFINDER_MAX_WINDOW_ORDER 16 /* - * Although XPRESS can potentially use a sliding window, it isn't well suited - * for large buffers of data because there is no way to reset the Huffman code. - * Therefore, we only allow buffers in which there is no restriction on match - * offsets (no sliding window). This simplifies the code and allows some + * Note: although XPRESS can potentially use a sliding window, it isn't well + * suited for large buffers of data because there is no way to reset the Huffman + * code. Therefore, we only allow buffers in which there is no restriction on + * match offsets (no sliding window). This simplifies the code and allows some * optimizations. */ -#define MATCHFINDER_IS_SLIDING 0 #include @@ -122,21 +121,21 @@ struct xpress_compressor { union { /* Data for greedy or lazy parsing */ struct { - struct hc_matchfinder hc_mf; struct xpress_item *chosen_items; - u8 nonoptimal_end[0]; + struct hc_matchfinder hc_mf; + /* hc_mf must be last! */ }; #if SUPPORT_NEAR_OPTIMAL_PARSING /* Data for near-optimal parsing */ struct { - struct bt_matchfinder bt_mf; struct xpress_optimum_node *optimum_nodes; struct lz_match *match_cache; struct lz_match *cache_overflow_mark; unsigned num_optim_passes; u32 costs[XPRESS_NUM_SYMBOLS]; - u8 optimal_end[0]; + struct bt_matchfinder bt_mf; + /* bt_mf must be last! 
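
The "must be last!" comments exist because the compressor is allocated as
offsetof(struct, mf) plus the matchfinder's runtime size (see the
*_get_compressor_size() helpers in this patch), so the matchfinder acts as a
variable-length tail.  A generic sketch of that allocation pattern; the struct,
field names, and size formula are illustrative, and the real code uses
ALIGNED_MALLOC with MATCHFINDER_ALIGNMENT rather than plain malloc():

    #include <stddef.h>
    #include <stdint.h>
    #include <stdlib.h>

    struct compressor {
            size_t max_bufsize;
            unsigned level;
            /* ... other fixed-size fields ... */
            uint32_t mf_table[];    /* matchfinder storage; must be last! */
    };

    static size_t
    matchfinder_size(size_t max_bufsize)
    {
            /* Illustrative: one 32-bit link per input position. */
            return max_bufsize * sizeof(uint32_t);
    }

    static struct compressor *
    compressor_alloc(size_t max_bufsize)
    {
            /* One allocation covers the fixed fields plus the runtime-sized
             * tail, which is why the matchfinder member has to come last. */
            struct compressor *c = malloc(offsetof(struct compressor, mf_table) +
                                          matchfinder_size(max_bufsize));
            if (c)
                    c->max_bufsize = max_bufsize;
            return c;
    }
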
*/ }; #endif }; @@ -527,9 +526,9 @@ xpress_compress_greedy(struct xpress_compressor * restrict c, const void * restrict in, size_t in_nbytes, void * restrict out, size_t out_nbytes_avail) { - const u8 * const in_base = in; - const u8 * in_next = in_base; - const u8 * const in_end = in_base + in_nbytes; + const u8 * const in_begin = in; + const u8 * in_next = in_begin; + const u8 * const in_end = in_begin + in_nbytes; struct xpress_item *next_chosen_item = c->chosen_items; unsigned len_3_too_far; @@ -545,7 +544,7 @@ xpress_compress_greedy(struct xpress_compressor * restrict c, unsigned offset; length = hc_matchfinder_longest_match(&c->hc_mf, - in_base, + in_begin, in_next, XPRESS_MIN_MATCH_LEN - 1, in_end - in_next, @@ -560,7 +559,7 @@ xpress_compress_greedy(struct xpress_compressor * restrict c, xpress_record_match(c, length, offset); in_next += 1; hc_matchfinder_skip_positions(&c->hc_mf, - in_base, + in_begin, in_next, in_end, length - 1); @@ -587,9 +586,9 @@ xpress_compress_lazy(struct xpress_compressor * restrict c, const void * restrict in, size_t in_nbytes, void * restrict out, size_t out_nbytes_avail) { - const u8 * const in_base = in; - const u8 * in_next = in_base; - const u8 * const in_end = in_base + in_nbytes; + const u8 * const in_begin = in; + const u8 * in_next = in_begin; + const u8 * const in_end = in_begin + in_nbytes; struct xpress_item *next_chosen_item = c->chosen_items; unsigned len_3_too_far; @@ -608,7 +607,7 @@ xpress_compress_lazy(struct xpress_compressor * restrict c, /* Find the longest match at the current position. */ cur_len = hc_matchfinder_longest_match(&c->hc_mf, - in_base, + in_begin, in_next, XPRESS_MIN_MATCH_LEN - 1, in_end - in_next, @@ -637,7 +636,7 @@ xpress_compress_lazy(struct xpress_compressor * restrict c, xpress_record_match(c, cur_len, cur_offset); hc_matchfinder_skip_positions(&c->hc_mf, - in_base, + in_begin, in_next, in_end, cur_len - 1); @@ -662,7 +661,7 @@ xpress_compress_lazy(struct xpress_compressor * restrict c, * longest_match() inlined at each. */ next_len = hc_matchfinder_longest_match(&c->hc_mf, - in_base, + in_begin, in_next, cur_len, in_end - in_next, @@ -685,7 +684,7 @@ xpress_compress_lazy(struct xpress_compressor * restrict c, *next_chosen_item++ = xpress_record_match(c, cur_len, cur_offset); hc_matchfinder_skip_positions(&c->hc_mf, - in_base, + in_begin, in_next, in_end, cur_len - 2); @@ -899,16 +898,18 @@ static struct lz_match * xpress_find_matches(struct xpress_compressor * restrict c, const void * restrict in, size_t in_nbytes) { - const u8 * const in_base = in; - const u8 *in_next = in_base; - const u8 * const in_end = in_base + in_nbytes; + const u8 * const in_begin = in; + const u8 *in_next = in_begin; + const u8 * const in_end = in_begin + in_nbytes; struct lz_match *cache_ptr = c->match_cache; - unsigned long prev_hash = 0; + u32 next_hash; bt_matchfinder_init(&c->bt_mf); + next_hash = bt_matchfinder_hash_3_bytes(in_next); do { - unsigned num_matches; + struct lz_match *matches; + unsigned best_len; /* If we've found so many matches that the cache might overflow * if we keep finding more, then stop finding matches. This @@ -922,56 +923,53 @@ xpress_find_matches(struct xpress_compressor * restrict c, return cache_ptr; } + matches = cache_ptr; + /* Find matches with the current position using the binary tree * matchfinder and save them in the next available slots in * the match cache. 
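
The match cache used here is kept bounded by reserving a fixed number of
entries per input position and checking an overflow mark before each position
so match finding can stop early instead of overrunning the buffer.  A sketch
of that layout; the names and the reserve value are illustrative:

    #include <stdint.h>
    #include <stdlib.h>

    #define CACHE_RESERVE_PER_POS   8       /* illustrative */
    #define MAX_MATCH_LEN           65      /* illustrative slack */

    struct lz_match {
            uint32_t length;
            uint32_t offset;
    };

    struct match_cache {
            struct lz_match *entries;
            struct lz_match *overflow_mark;
    };

    static int
    match_cache_init(struct match_cache *mc, size_t bufsize)
    {
            /* Room for the per-position reserve, one terminator entry per
             * position, and slack for one position's worth of extra matches. */
            size_t nentries = bufsize * CACHE_RESERVE_PER_POS +
                              bufsize + MAX_MATCH_LEN;

            mc->entries = malloc(nentries * sizeof(struct lz_match));
            if (!mc->entries)
                    return -1;
            /* Once the write pointer passes this mark, stop finding matches. */
            mc->overflow_mark = &mc->entries[bufsize * CACHE_RESERVE_PER_POS];
            return 0;
    }
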
*/ - num_matches = + cache_ptr = bt_matchfinder_get_matches(&c->bt_mf, - in_base, + in_begin, in_next, XPRESS_MIN_MATCH_LEN, in_end - in_next, min(in_end - in_next, c->nice_match_length), c->max_search_depth, - &prev_hash, + &next_hash, + &best_len, cache_ptr); - cache_ptr += num_matches; - cache_ptr->length = num_matches; + cache_ptr->length = cache_ptr - matches; cache_ptr->offset = *in_next; in_next++; cache_ptr++; - if (num_matches) { - /* - * If there was a very long match found, then don't - * cache any matches for the bytes covered by that - * match. This avoids degenerate behavior when - * compressing highly redundant data, where the number - * of matches can be very large. - * - * This heuristic doesn't actually hurt the compression - * ratio very much. If there's a long match, then the - * data must be highly compressible, so it doesn't - * matter as much what we do. - */ - unsigned best_len = cache_ptr[-2].length; - if (best_len >= c->nice_match_length) { - --best_len; - do { - bt_matchfinder_skip_position(&c->bt_mf, - in_base, - in_next, - in_end, - min(in_end - in_next, - c->nice_match_length), - c->max_search_depth, - &prev_hash); - - cache_ptr->length = 0; - cache_ptr->offset = *in_next++; - cache_ptr++; - } while (--best_len); - } + /* + * If there was a very long match found, then don't cache any + * matches for the bytes covered by that match. This avoids + * degenerate behavior when compressing highly redundant data, + * where the number of matches can be very large. + * + * This heuristic doesn't actually hurt the compression ratio + * very much. If there's a long match, then the data must be + * highly compressible, so it doesn't matter as much what we do. + */ + if (best_len >= c->nice_match_length) { + --best_len; + do { + bt_matchfinder_skip_position(&c->bt_mf, + in_begin, + in_next, + in_end, + min(in_end - in_next, + c->nice_match_length), + c->max_search_depth, + &next_hash); + + cache_ptr->length = 0; + cache_ptr->offset = *in_next++; + cache_ptr++; + } while (--best_len); } } while (in_next != in_end); @@ -1019,23 +1017,39 @@ xpress_compress_near_optimal(struct xpress_compressor * restrict c, #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */ +static size_t +xpress_get_compressor_size(size_t max_bufsize, unsigned compression_level) +{ +#if SUPPORT_NEAR_OPTIMAL_PARSING + if (compression_level >= MIN_LEVEL_FOR_NEAR_OPTIMAL) + return offsetof(struct xpress_compressor, bt_mf) + + bt_matchfinder_size(max_bufsize); +#endif + + return offsetof(struct xpress_compressor, hc_mf) + + hc_matchfinder_size(max_bufsize); +} + static u64 xpress_get_needed_memory(size_t max_bufsize, unsigned compression_level) { - size_t size = 0; + u64 size = 0; if (max_bufsize > XPRESS_MAX_BUFSIZE) return 0; + size += xpress_get_compressor_size(max_bufsize, compression_level); + if (compression_level < MIN_LEVEL_FOR_NEAR_OPTIMAL || !SUPPORT_NEAR_OPTIMAL_PARSING) { - size += offsetof(struct xpress_compressor, nonoptimal_end); + /* chosen_items */ size += max_bufsize * sizeof(struct xpress_item); } #if SUPPORT_NEAR_OPTIMAL_PARSING else { - size += offsetof(struct xpress_compressor, optimal_end); + /* optimum_nodes */ size += (max_bufsize + 1) * sizeof(struct xpress_optimum_node); + /* match_cache */ size += ((max_bufsize * CACHE_RESERVE_PER_POS) + XPRESS_MAX_MATCH_LEN + max_bufsize) * sizeof(struct lz_match); @@ -1053,49 +1067,31 @@ xpress_create_compressor(size_t max_bufsize, unsigned compression_level, if (max_bufsize > XPRESS_MAX_BUFSIZE) return WIMLIB_ERR_INVALID_PARAM; - if (compression_level < 30) 
{ - c = ALIGNED_MALLOC(offsetof(struct xpress_compressor, - nonoptimal_end), - MATCHFINDER_ALIGNMENT); - if (!c) - return WIMLIB_ERR_NOMEM; - c->impl = xpress_compress_greedy; - c->max_search_depth = (compression_level * 24) / 16; - c->nice_match_length = (compression_level * 48) / 16; - c->chosen_items = MALLOC(max_bufsize * sizeof(struct xpress_item)); - if (!c->chosen_items) { - ALIGNED_FREE(c); - return WIMLIB_ERR_NOMEM; - } - } else if (compression_level < MIN_LEVEL_FOR_NEAR_OPTIMAL || - !SUPPORT_NEAR_OPTIMAL_PARSING) + c = ALIGNED_MALLOC(xpress_get_compressor_size(max_bufsize, compression_level), + MATCHFINDER_ALIGNMENT); + if (!c) + goto oom0; + + if (compression_level < MIN_LEVEL_FOR_NEAR_OPTIMAL || + !SUPPORT_NEAR_OPTIMAL_PARSING) { - c = ALIGNED_MALLOC(offsetof(struct xpress_compressor, - nonoptimal_end), - MATCHFINDER_ALIGNMENT); - if (!c) - return WIMLIB_ERR_NOMEM; - - c->impl = xpress_compress_lazy; - c->max_search_depth = (compression_level * 24) / 32; - c->nice_match_length = (compression_level * 48) / 32; + c->chosen_items = MALLOC(max_bufsize * sizeof(struct xpress_item)); - if (!c->chosen_items) { - ALIGNED_FREE(c); - return WIMLIB_ERR_NOMEM; + if (!c->chosen_items) + goto oom1; + + if (compression_level < 30) { + c->impl = xpress_compress_greedy; + c->max_search_depth = (compression_level * 24) / 16; + c->nice_match_length = (compression_level * 48) / 16; + } else { + c->impl = xpress_compress_lazy; + c->max_search_depth = (compression_level * 24) / 32; + c->nice_match_length = (compression_level * 48) / 32; } } #if SUPPORT_NEAR_OPTIMAL_PARSING else { - c = ALIGNED_MALLOC(offsetof(struct xpress_compressor, - optimal_end), - MATCHFINDER_ALIGNMENT); - if (!c) - return WIMLIB_ERR_NOMEM; - c->impl = xpress_compress_near_optimal; - c->max_search_depth = (compression_level * 32) / 100; - c->nice_match_length = (compression_level * 50) / 100; - c->num_optim_passes = compression_level / 40; c->optimum_nodes = MALLOC((max_bufsize + 1) * sizeof(struct xpress_optimum_node)); @@ -1105,16 +1101,25 @@ xpress_create_compressor(size_t max_bufsize, unsigned compression_level, if (!c->optimum_nodes || !c->match_cache) { FREE(c->optimum_nodes); FREE(c->match_cache); - ALIGNED_FREE(c); - return WIMLIB_ERR_NOMEM; + goto oom1; } c->cache_overflow_mark = &c->match_cache[max_bufsize * CACHE_RESERVE_PER_POS]; + + c->impl = xpress_compress_near_optimal; + c->max_search_depth = (compression_level * 32) / 100; + c->nice_match_length = (compression_level * 50) / 100; + c->num_optim_passes = compression_level / 40; } #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */ *c_ret = c; return 0; + +oom1: + ALIGNED_FREE(c); +oom0: + return WIMLIB_ERR_NOMEM; } static size_t @@ -1123,6 +1128,10 @@ xpress_compress(const void *in, size_t in_nbytes, { struct xpress_compressor *c = _c; + /* Don't bother trying to compress very small inputs. */ + if (in_nbytes < 25) + return 0; + if (out_nbytes_avail <= XPRESS_NUM_SYMBOLS / 2 + 4) return 0; @@ -1136,16 +1145,14 @@ xpress_free_compressor(void *_c) { struct xpress_compressor *c = _c; - if (c) { - #if SUPPORT_NEAR_OPTIMAL_PARSING - if (c->impl == xpress_compress_near_optimal) { - FREE(c->optimum_nodes); - FREE(c->match_cache); - } else - #endif - FREE(c->chosen_items); - ALIGNED_FREE(c); - } +#if SUPPORT_NEAR_OPTIMAL_PARSING + if (c->impl == xpress_compress_near_optimal) { + FREE(c->optimum_nodes); + FREE(c->match_cache); + } else +#endif + FREE(c->chosen_items); + ALIGNED_FREE(c); } const struct compressor_ops xpress_compressor_ops = {