From: Eric Biggers Date: Fri, 29 Aug 2014 03:31:13 +0000 (-0500) Subject: Factor out lz_repsearch() and also use for LZMS X-Git-Tag: v1.7.2~46 X-Git-Url: https://wimlib.net/git/?p=wimlib;a=commitdiff_plain;h=2915014b741493083571954e10c8add9dc09a522 Factor out lz_repsearch() and also use for LZMS --- diff --git a/Makefile.am b/Makefile.am index 1ce745a0..18c7f316 100644 --- a/Makefile.am +++ b/Makefile.am @@ -47,6 +47,7 @@ libwim_la_SOURCES = \ src/lz_linked_suffix_array.c \ src/lz_mf.c \ src/lz_null.c \ + src/lz_repsearch.c \ src/lz_suffix_array_utils.c \ src/lzms-common.c \ src/lzms-compress.c \ @@ -106,6 +107,7 @@ libwim_la_SOURCES = \ include/wimlib/lz_hash3.h \ include/wimlib/lz_mf.h \ include/wimlib/lz_mf_ops.h \ + include/wimlib/lz_repsearch.h \ include/wimlib/lz_suffix_array_utils.h \ include/wimlib/lzms.h \ include/wimlib/lzx.h \ diff --git a/include/wimlib/lz_repsearch.h b/include/wimlib/lz_repsearch.h new file mode 100644 index 00000000..fe59558b --- /dev/null +++ b/include/wimlib/lz_repsearch.h @@ -0,0 +1,55 @@ +/* + * lz_repsearch.h + * + * Fast searching for repeat offset matches. + * + * The author dedicates this file to the public domain. + * You can do whatever you want with this file. + */ + +#ifndef _LZ_REPSEARCH_H +#define _LZ_REPSEARCH_H + +#include "wimlib/lz_extend.h" +#include "wimlib/util.h" + +extern u32 +lz_extend_repmatch(const u8 *strptr, const u8 *matchptr, u32 max_len); + +/* + * Find the longest repeat offset match. + * + * If no match of at least 2 bytes is found, then return 0. + * + * If a match of at least 2 bytes is found, then return its length and set + * *slot_ret to the index of its offset in @queue. + */ +static inline u32 +lz_repsearch(const u8 * const strptr, const u32 bytes_remaining, + const u32 max_match_len, const u32 repeat_offsets[], + const unsigned num_repeat_offsets, unsigned *slot_ret) +{ + u32 best_len = 0; + + if (likely(bytes_remaining >= 2)) { + const u32 max_len = min(max_match_len, bytes_remaining); + const u16 str = *(const u16 *)strptr; + + for (unsigned i = 0; i < num_repeat_offsets; i++) { + const u8 * const matchptr = strptr - repeat_offsets[i]; + + /* Check the first two bytes. If they match, then + * extend the match to its full length. */ + if (*(const u16 *)matchptr == str) { + const u32 len = lz_extend_repmatch(strptr, matchptr, max_len); + if (len > best_len) { + best_len = len; + *slot_ret = i; + } + } + } + } + return best_len; +} + +#endif /* _LZ_REPSEARCH_H */ diff --git a/src/lz_repsearch.c b/src/lz_repsearch.c new file mode 100644 index 00000000..2f78a61f --- /dev/null +++ b/src/lz_repsearch.c @@ -0,0 +1,18 @@ +/* + * lz_repsearch.c + * + * Fast searching for repeat offset matches. + * + * The author dedicates this file to the public domain. + * You can do whatever you want with this file. + * + */ + +#include "wimlib/lz_repsearch.h" +#include "wimlib/lz_extend.h" + +u32 +lz_extend_repmatch(const u8 *strptr, const u8 *matchptr, u32 max_len) +{ + return lz_extend(strptr, matchptr, 2, max_len); +} diff --git a/src/lzms-compress.c b/src/lzms-compress.c index 21c82ea8..8ad19bc0 100644 --- a/src/lzms-compress.c +++ b/src/lzms-compress.c @@ -41,6 +41,7 @@ #include "wimlib/endianness.h" #include "wimlib/error.h" #include "wimlib/lz_mf.h" +#include "wimlib/lz_repsearch.h" #include "wimlib/lzms.h" #include "wimlib/util.h" @@ -840,6 +841,20 @@ lzms_get_lz_match_cost(struct lzms_compressor *ctx, lzms_get_length_cost(&ctx->length_encoder, length); } +static inline u32 +lzms_repsearch(const u8 * const strptr, const u32 bytes_remaining, + const struct lzms_lz_lru_queues *queue, u32 *offset_ret) +{ + u32 len; + unsigned slot = 0; + + len = lz_repsearch(strptr, bytes_remaining, UINT32_MAX, + queue->recent_offsets, LZMS_NUM_RECENT_OFFSETS, &slot); + *offset_ret = queue->recent_offsets[slot]; + return len; +} + + static struct lz_match lzms_match_chooser_reverse_list(struct lzms_compressor *ctx, unsigned cur_pos) { @@ -899,21 +914,12 @@ lzms_get_near_optimal_item(struct lzms_compressor *ctx) ctx->optimum_cur_idx = 0; ctx->optimum_end_idx = 0; - longest_rep_len = ctx->params.min_match_length - 1; if (lz_mf_get_position(ctx->mf) >= LZMS_MAX_INIT_RECENT_OFFSET) { - u32 limit = lz_mf_get_bytes_remaining(ctx->mf); - for (int i = 0; i < LZMS_NUM_RECENT_OFFSETS; i++) { - u32 offset = ctx->lru.lz.recent_offsets[i]; - const u8 *strptr = lz_mf_get_window_ptr(ctx->mf); - const u8 *matchptr = strptr - offset; - u32 len = 0; - while (len < limit && strptr[len] == matchptr[len]) - len++; - if (len > longest_rep_len) { - longest_rep_len = len; - longest_rep_offset = offset; - } - } + longest_rep_len = lzms_repsearch(lz_mf_get_window_ptr(ctx->mf), + lz_mf_get_bytes_remaining(ctx->mf), + &ctx->lru.lz, &longest_rep_offset); + } else { + longest_rep_len = 0; } if (longest_rep_len >= ctx->params.nice_match_length) { @@ -972,7 +978,7 @@ lzms_get_near_optimal_item(struct lzms_compressor *ctx) } end_pos = longest_len; - if (longest_rep_len >= ctx->params.min_match_length) { + if (longest_rep_len) { struct lzms_adaptive_state state; u32 cost; @@ -1002,21 +1008,13 @@ lzms_get_near_optimal_item(struct lzms_compressor *ctx) if (cur_pos == end_pos || cur_pos == ctx->params.optim_array_length) return lzms_match_chooser_reverse_list(ctx, cur_pos); - longest_rep_len = ctx->params.min_match_length - 1; if (lz_mf_get_position(ctx->mf) >= LZMS_MAX_INIT_RECENT_OFFSET) { - u32 limit = lz_mf_get_bytes_remaining(ctx->mf); - for (int i = 0; i < LZMS_NUM_RECENT_OFFSETS; i++) { - u32 offset = ctx->optimum[cur_pos].state.lru.recent_offsets[i]; - const u8 *strptr = lz_mf_get_window_ptr(ctx->mf); - const u8 *matchptr = strptr - offset; - u32 len = 0; - while (len < limit && strptr[len] == matchptr[len]) - len++; - if (len > longest_rep_len) { - longest_rep_len = len; - longest_rep_offset = offset; - } - } + longest_rep_len = lzms_repsearch(lz_mf_get_window_ptr(ctx->mf), + lz_mf_get_bytes_remaining(ctx->mf), + &ctx->optimum[cur_pos].state.lru, + &longest_rep_offset); + } else { + longest_rep_len = 0; } if (longest_rep_len >= ctx->params.nice_match_length) { diff --git a/src/lzx-compress.c b/src/lzx-compress.c index 9851faa4..cf5ea48e 100644 --- a/src/lzx-compress.c +++ b/src/lzx-compress.c @@ -198,8 +198,8 @@ #include "wimlib/compress_common.h" #include "wimlib/endianness.h" #include "wimlib/error.h" -#include "wimlib/lz_extend.h" #include "wimlib/lz_mf.h" +#include "wimlib/lz_repsearch.h" #include "wimlib/lzx.h" #include "wimlib/util.h" #include @@ -1501,28 +1501,9 @@ static inline u32 lzx_repsearch(const u8 * const strptr, const u32 bytes_remaining, const struct lzx_lru_queue *queue, unsigned *slot_ret) { - u32 best_len = 0; - BUILD_BUG_ON(LZX_MIN_MATCH_LEN != 2); - if (likely(bytes_remaining >= 2)) { - const u32 max_len = min(LZX_MAX_MATCH_LEN, bytes_remaining); - const u16 str = *(const u16 *)strptr; - for (unsigned i = 0; i < LZX_NUM_RECENT_OFFSETS; i++) { - const u8 * const matchptr = strptr - queue->R[i]; - - /* Check the first two bytes. If they match, then - * extend the match to its full length. */ - if (*(const u16 *)matchptr == str) { - const u32 len = lz_extend(strptr, matchptr, - 2, max_len); - if (len > best_len) { - best_len = len; - *slot_ret = i; - } - } - } - } - return best_len; + return lz_repsearch(strptr, bytes_remaining, LZX_MAX_MATCH_LEN, + queue->R, LZX_NUM_RECENT_OFFSETS, slot_ret); } /*