From 8895e346ab7a4df65c980ba435d1e5d1c2654d3f Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 14 Feb 2015 10:47:56 -0600 Subject: [PATCH] Heuristic sorting of streams for solid compression --- Makefile.am | 2 + include/wimlib/lookup_table.h | 19 ++- include/wimlib/solid.h | 9 ++ include/wimlib/wim.h | 3 + src/lookup_table.c | 2 +- src/solid.c | 245 ++++++++++++++++++++++++++++++++++ src/wim.c | 21 ++- src/write.c | 38 ++++-- 8 files changed, 318 insertions(+), 21 deletions(-) create mode 100644 include/wimlib/solid.h create mode 100644 src/solid.c diff --git a/Makefile.am b/Makefile.am index 21b40bcf..10511f78 100644 --- a/Makefile.am +++ b/Makefile.am @@ -76,6 +76,7 @@ libwim_la_SOURCES = \ src/reference.c \ src/security.c \ src/sha1.c \ + src/solid.c \ src/split.c \ src/reparse.c \ src/tagged_items.c \ @@ -140,6 +141,7 @@ libwim_la_SOURCES = \ include/wimlib/security.h \ include/wimlib/security_descriptor.h \ include/wimlib/sha1.h \ + include/wimlib/solid.h \ include/wimlib/textfile.h \ include/wimlib/timestamp.h \ include/wimlib/types.h \ diff --git a/include/wimlib/lookup_table.h b/include/wimlib/lookup_table.h index 9c727507..4158f157 100644 --- a/include/wimlib/lookup_table.h +++ b/include/wimlib/lookup_table.h @@ -213,9 +213,19 @@ struct wim_lookup_table_entry { /* Links streams being written to the WIM. */ struct list_head write_streams_list; - /* Metadata for this stream in the WIM being written. - */ - struct wim_reshdr out_reshdr; + union { + /* Metadata for this stream in the WIM being + * written. */ + struct wim_reshdr out_reshdr; + + struct { + /* Name under which this stream is being + * sorted; used only when sorting + * streams for solid compression. */ + utf16lechar *solid_sort_name; + size_t solid_sort_name_nbytes; + }; + }; }; /* Used temporarily during extraction. This is an array of @@ -339,6 +349,9 @@ extern int sort_stream_list_by_sequential_order(struct list_head *stream_list, size_t list_head_offset); +extern int +cmp_streams_by_sequential_order(const void *p1, const void *p2); + /* Utility functions */ extern int diff --git a/include/wimlib/solid.h b/include/wimlib/solid.h new file mode 100644 index 00000000..064f27fe --- /dev/null +++ b/include/wimlib/solid.h @@ -0,0 +1,9 @@ +#ifndef _WIMLIB_SOLID_H +#define _WIMLIB_SOLID_H + +struct list_head; + +extern int +sort_stream_list_for_solid_compression(struct list_head *stream_list); + +#endif /* _WIMLIB_SOLID_H */ diff --git a/include/wimlib/wim.h b/include/wimlib/wim.h index f02bf8c7..544ce180 100644 --- a/include/wimlib/wim.h +++ b/include/wimlib/wim.h @@ -204,6 +204,9 @@ write_wim_header_flags(u32 hdr_flags, struct filedes *out_fd); extern int select_wim_image(WIMStruct *wim, int image); +extern void +deselect_current_wim_image(WIMStruct *wim); + extern int for_image(WIMStruct *wim, int image, int (*visitor)(WIMStruct *)); diff --git a/src/lookup_table.c b/src/lookup_table.c index ec793205..26f231e4 100644 --- a/src/lookup_table.c +++ b/src/lookup_table.c @@ -415,7 +415,7 @@ for_lookup_table_entry(struct wim_lookup_table *table, * per-resource location order. For example, resources in WIM files are sorted * primarily by part number, then secondarily by offset, as to implement optimal * reading of either a standalone or split WIM. */ -static int +int cmp_streams_by_sequential_order(const void *p1, const void *p2) { const struct wim_lookup_table_entry *lte1, *lte2; diff --git a/src/solid.c b/src/solid.c new file mode 100644 index 00000000..4b791ea1 --- /dev/null +++ b/src/solid.c @@ -0,0 +1,245 @@ +/* + * solid.c + * + * Heuristic sorting of streams to optimize solid compression. + */ + +/* + * Copyright (C) 2015 Eric Biggers + * + * This file is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the Free + * Software Foundation; either version 3 of the License, or (at your option) any + * later version. + * + * This file is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more + * details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this file; if not, see http://www.gnu.org/licenses/. + */ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include "wimlib/dentry.h" +#include "wimlib/encoding.h" +#include "wimlib/endianness.h" +#include "wimlib/lookup_table.h" +#include "wimlib/metadata.h" +#include "wimlib/paths.h" +#include "wimlib/solid.h" +#include "wimlib/unaligned.h" + +static const utf16lechar * +get_extension(const utf16lechar *name, size_t nbytes) +{ + const utf16lechar *p = name + (nbytes / sizeof(utf16lechar)); + for (;;) { + if (p == name) + return NULL; + if (*(p - 1) == cpu_to_le16('/') || *(p - 1) == cpu_to_le16('\\')) + return NULL; + if (*(p - 1) == cpu_to_le16('.')) + return p; + p--; + } +} + +/* + * Sort order for solid compression: + * + * 1. Streams without sort names + * - sorted by sequential order + * 2. Streams with sort names: + * a. Streams whose sort name does not have an extension + * - sorted by sort name + * b. Streams whose sort name has an extension + * - sorted primarily by extension (case insensitive), + * secondarily by sort name (case insensitive) + */ +static int +cmp_streams_by_solid_sort_name(const void *p1, const void *p2) +{ + const struct wim_lookup_table_entry *lte1, *lte2; + + lte1 = *(const struct wim_lookup_table_entry **)p1; + lte2 = *(const struct wim_lookup_table_entry **)p2; + + if (lte1->solid_sort_name) { + if (!lte2->solid_sort_name) + return 1; + const utf16lechar *extension1 = get_extension(lte1->solid_sort_name, + lte1->solid_sort_name_nbytes); + const utf16lechar *extension2 = get_extension(lte2->solid_sort_name, + lte2->solid_sort_name_nbytes); + if (extension1) { + if (!extension2) + return 1; + int res = cmp_utf16le_strings(extension1, + utf16le_strlen(extension1) / sizeof(utf16lechar), + extension2, + utf16le_strlen(extension2) / sizeof(utf16lechar), + true); /* case insensitive */ + if (res) + return res; + } else { + if (extension2) + return -1; + } + int res = cmp_utf16le_strings(lte1->solid_sort_name, + lte1->solid_sort_name_nbytes / sizeof(utf16lechar), + lte2->solid_sort_name, + lte2->solid_sort_name_nbytes / sizeof(utf16lechar), + true); /* case insensitive */ + if (res) + return res; + } else { + if (lte2->solid_sort_name) + return -1; + } + return cmp_streams_by_sequential_order(p1, p2); +} + +static void +lte_set_solid_sort_name_from_inode(struct wim_lookup_table_entry *lte, + const struct wim_inode *inode) +{ + const struct wim_dentry *dentry; + const utf16lechar *best_name = NULL; + size_t best_name_nbytes = SIZE_MAX; + + if (lte->solid_sort_name) /* Sort name already set? */ + return; + + /* If this file has multiple names, choose the shortest one. */ + inode_for_each_dentry(dentry, inode) { + if (dentry->file_name_nbytes < best_name_nbytes) { + best_name = dentry->file_name; + best_name_nbytes = dentry->file_name_nbytes; + } + } + lte->solid_sort_name = utf16le_dupz(best_name, best_name_nbytes); + lte->solid_sort_name_nbytes = best_name_nbytes; +} + +struct temp_lookup_table { + struct hlist_head *table; + size_t capacity; +}; + +static int +dentry_fill_in_solid_sort_names(struct wim_dentry *dentry, void *_lookup_table) +{ + const struct temp_lookup_table *lookup_table = _lookup_table; + const struct wim_inode *inode = dentry->d_inode; + const u8 *hash; + struct hlist_head *head; + struct hlist_node *cur; + struct wim_lookup_table_entry *lte; + + hash = inode_unnamed_stream_hash(inode); + head = &lookup_table->table[load_size_t_unaligned(hash) % + lookup_table->capacity]; + hlist_for_each_entry(lte, cur, head, hash_list_2) { + if (hashes_equal(hash, lte->hash)) { + lte_set_solid_sort_name_from_inode(lte, inode); + break; + } + } + return 0; +} + +static int +image_fill_in_solid_sort_names(WIMStruct *wim) +{ + return for_dentry_in_tree(wim_get_current_root_dentry(wim), + dentry_fill_in_solid_sort_names, + wim->private); +} + +int +sort_stream_list_for_solid_compression(struct list_head *stream_list) +{ + size_t num_streams = 0; + struct temp_lookup_table lookup_table; + WIMStruct *wims[128]; + int num_wims = 0; + struct wim_lookup_table_entry *lte; + int ret; + + /* Count the number of streams to be written. */ + list_for_each_entry(lte, stream_list, write_streams_list) + num_streams++; + + /* Allocate a temporary hash table for mapping stream hash => stream */ + lookup_table.capacity = num_streams; + lookup_table.table = CALLOC(lookup_table.capacity, + sizeof(lookup_table.table[0])); + if (!lookup_table.table) + return WIMLIB_ERR_NOMEM; + + /* + * For each stream to be written: + * - Reset the sort name + * - If it's in non-solid WIM resource, then save the WIMStruct. + * - If it's in a file on disk, then set its sort name from that. + */ + list_for_each_entry(lte, stream_list, write_streams_list) { + lte->solid_sort_name = NULL; + lte->solid_sort_name_nbytes = 0; + switch (lte->resource_location) { + case RESOURCE_IN_WIM: + if (lte->size != lte->rspec->uncompressed_size) + continue; + for (int i = 0; i < num_wims; i++) + if (lte->rspec->wim == wims[i]) + goto found_wim; + if (num_wims >= ARRAY_LEN(wims)) + continue; + wims[num_wims++] = lte->rspec->wim; + found_wim: + hlist_add_head(<e->hash_list_2, + &lookup_table.table[load_size_t_unaligned(lte->hash) % + lookup_table.capacity]); + break; + case RESOURCE_IN_FILE_ON_DISK: + #ifdef __WIN32__ + case RESOURCE_IN_WINNT_FILE_ON_DISK: + #endif + lte_set_solid_sort_name_from_inode(lte, lte->file_inode); + break; + default: + break; + } + } + + /* For each WIMStruct that was found, search for dentry references to + * each stream and fill in the sort name this way. This is useful e.g. + * when exporting a solid WIM file from a non-solid WIM file. */ + for (int i = 0; i < num_wims; i++) { + if (!wim_has_metadata(wims[i])) + continue; + wims[i]->private = &lookup_table; + ret = for_image(wims[i], WIMLIB_ALL_IMAGES, + image_fill_in_solid_sort_names); + if (ret) + goto out_free_lookup_table; + deselect_current_wim_image(wims[i]); + } + + ret = sort_stream_list(stream_list, + offsetof(struct wim_lookup_table_entry, + write_streams_list), + cmp_streams_by_solid_sort_name); + +out_free_solid_sort_names: + list_for_each_entry(lte, stream_list, write_streams_list) + FREE(lte->solid_sort_name); +out_free_lookup_table: + FREE(lookup_table.table); + return ret; +} diff --git a/src/wim.c b/src/wim.c index c9116dfb..27708e23 100644 --- a/src/wim.c +++ b/src/wim.c @@ -343,13 +343,7 @@ select_wim_image(WIMStruct *wim, int image) /* If a valid image is currently selected, its metadata can be freed if * it has not been modified. */ - if (wim->current_image != WIMLIB_NO_IMAGE) { - imd = wim_get_current_image_metadata(wim); - if (!imd->modified) { - wimlib_assert(list_empty(&imd->unhashed_streams)); - destroy_image_metadata(imd, NULL, false); - } - } + deselect_current_wim_image(wim); wim->current_image = image; imd = wim_get_current_image_metadata(wim); if (imd->root_dentry || imd->modified) { @@ -362,6 +356,19 @@ select_wim_image(WIMStruct *wim, int image) return ret; } +void +deselect_current_wim_image(WIMStruct *wim) +{ + struct wim_image_metadata *imd; + if (wim->current_image == WIMLIB_NO_IMAGE) + return; + imd = wim_get_current_image_metadata(wim); + if (!imd->modified) { + wimlib_assert(list_empty(&imd->unhashed_streams)); + destroy_image_metadata(imd, NULL, false); + } + wim->current_image = WIMLIB_NO_IMAGE; +} /* API function documented in wimlib.h */ WIMLIBAPI const tchar * diff --git a/src/write.c b/src/write.c index aab567f8..fcaa52e4 100644 --- a/src/write.c +++ b/src/write.c @@ -51,6 +51,7 @@ #include "wimlib/paths.h" #include "wimlib/progress.h" #include "wimlib/resource.h" +#include "wimlib/solid.h" #ifdef __WIN32__ # include "wimlib/win32.h" /* win32_rename_replacement() */ #endif @@ -1561,14 +1562,6 @@ write_stream_list(struct list_head *stream_list, memset(&ctx, 0, sizeof(ctx)); - /* Pre-sorting the streams is required for compute_stream_list_stats(). - * Afterwards, read_stream_list() need not sort them again. */ - ret = sort_stream_list_by_sequential_order(stream_list, - offsetof(struct wim_lookup_table_entry, - write_streams_list)); - if (ret) - return ret; - ctx.out_fd = out_fd; ctx.lookup_table = lookup_table; ctx.out_ctype = out_ctype; @@ -1576,6 +1569,33 @@ write_stream_list(struct list_head *stream_list, ctx.write_resource_flags = write_resource_flags; ctx.filter_ctx = filter_ctx; + /* + * We normally sort the streams to write by a "sequential" order that is + * optimized for reading. But when using solid compression, we instead + * sort the streams by file extension and file name (when applicable; + * and we don't do this for streams from solid resources) so that + * similar files are grouped together, which improves the compression + * ratio. This is somewhat of a hack since a stream does not + * necessarily correspond one-to-one with a filename, nor is there any + * guarantee that two files with similar names or extensions are + * actually similar in content. A potential TODO is to sort the streams + * based on some measure of similarity of their actual contents. + */ + + ret = sort_stream_list_by_sequential_order(stream_list, + offsetof(struct wim_lookup_table_entry, + write_streams_list)); + if (ret) + return ret; + + compute_stream_list_stats(stream_list, &ctx); + + if (write_resource_flags & WRITE_RESOURCE_FLAG_SOLID) { + ret = sort_stream_list_for_solid_compression(stream_list); + if (unlikely(ret)) + WARNING("Failed to sort streams for solid compression. Continuing anyways."); + } + if (out_ctype != WIMLIB_COMPRESSION_TYPE_NONE) { wimlib_assert(out_chunk_size != 0); if (out_chunk_size <= STACK_MAX) { @@ -1590,8 +1610,6 @@ write_stream_list(struct list_head *stream_list, } ctx.chunk_buf_filled = 0; - compute_stream_list_stats(stream_list, &ctx); - ctx.progress_data.progfunc = progfunc; ctx.progress_data.progctx = progctx; -- 2.43.0