4 * A blob table maps SHA-1 message digests to "blobs", which are nonempty
5 * sequences of binary data. Within a WIM file, blobs are single-instanced.
7 * This file also contains code to read and write the corresponding on-disk
8 * representation of this table in the WIM file format.
12 * Copyright (C) 2012, 2013, 2014, 2015 Eric Biggers
14 * This file is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 3 of the License, or (at your option) any
19 * This file is distributed in the hope that it will be useful, but WITHOUT
20 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
21 * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
24 * You should have received a copy of the GNU Lesser General Public License
25 * along with this file; if not, see http://www.gnu.org/licenses/.
34 #include <unistd.h> /* for unlink() */
36 #include "wimlib/assert.h"
37 #include "wimlib/blob_table.h"
38 #include "wimlib/encoding.h"
39 #include "wimlib/endianness.h"
40 #include "wimlib/error.h"
41 #include "wimlib/metadata.h"
42 #include "wimlib/ntfs_3g.h"
43 #include "wimlib/resource.h"
44 #include "wimlib/unaligned.h"
45 #include "wimlib/util.h"
46 #include "wimlib/write.h"
48 /* A hash table mapping SHA-1 message digests to blob descriptors */
/* Hash buckets.  A descriptor is chained into bucket
 * (hash_short % capacity); see blob_table_insert_raw(). */
50 struct hlist_head *array;
/*
 * Create an empty blob table with @capacity hash buckets.
 *
 * The table structure comes from MALLOC() and the bucket array from CALLOC(),
 * so every bucket starts out zeroed.  The error message below is logged, with
 * the requested capacity, when memory for the table cannot be obtained.
 */
56 new_blob_table(size_t capacity)
58 struct blob_table *table;
59 struct hlist_head *array;
61 table = MALLOC(sizeof(struct blob_table));
65 array = CALLOC(capacity, sizeof(array[0]));
72 table->capacity = capacity;
77 ERROR("Failed to allocate memory for blob table "
78 "with capacity %zu", capacity);
/* for_blob_in_table() visitor used by free_blob_table(): frees @blob.  The
 * second argument is not used. */
83 do_free_blob_descriptor(struct blob_descriptor *blob, void *_ignore)
85 free_blob_descriptor(blob);
/* Tear down @table: every blob descriptor still linked into it is passed to
 * do_free_blob_descriptor(). */
90 free_blob_table(struct blob_table *table)
93 for_blob_in_table(table, do_free_blob_descriptor, NULL);
/*
 * Allocate a zero-filled blob descriptor with CALLOC().
 *
 * No explicit assignment of blob_location is needed: the BUILD_BUG_ON()
 * guarantees at compile time that BLOB_NONEXISTENT is 0, so a zeroed
 * descriptor already has that location.
 */
99 struct blob_descriptor *
100 new_blob_descriptor(void)
102 struct blob_descriptor *blob;
104 blob = CALLOC(1, sizeof(struct blob_descriptor));
110 /* blob->blob_location = BLOB_NONEXISTENT */
111 BUILD_BUG_ON(BLOB_NONEXISTENT != 0);
/*
 * Make a copy of the blob descriptor @old.
 *
 * The descriptor is first duplicated bytewise with memdup().  Location data
 * that the copy would otherwise share with @old is then handled according
 * to blob_location:
 *   - a clone that references a resource descriptor is linked into that
 *     descriptor's blob_list;
 *   - for the file-backed locations, the path is duplicated with TSTRDUP()
 *     (file_on_disk and staging_file_name occupy the same storage, as the
 *     BUILD_BUG_ON() checks);
 *   - an attached buffer is duplicated (old->size bytes);
 *   - an NTFS location is duplicated together with its path and, when
 *     attr_name_nchars is nonzero, its attribute name.
 *
 * NOTE(review): the trailing free_blob_descriptor(new) looks like the shared
 * failure path for the duplication steps above; confirm against the full
 * function.
 */
116 struct blob_descriptor *
117 clone_blob_descriptor(const struct blob_descriptor *old)
119 struct blob_descriptor *new;
121 new = memdup(old, sizeof(struct blob_descriptor));
125 switch (new->blob_location) {
127 list_add(&new->rdesc_node, &new->rdesc->blob_list);
130 case BLOB_IN_FILE_ON_DISK:
132 case BLOB_IN_WINNT_FILE_ON_DISK:
133 case BLOB_WIN32_ENCRYPTED:
136 case BLOB_IN_STAGING_FILE:
137 BUILD_BUG_ON((void*)&old->file_on_disk !=
138 (void*)&old->staging_file_name);
140 new->file_on_disk = TSTRDUP(old->file_on_disk);
141 if (new->file_on_disk == NULL)
144 case BLOB_IN_ATTACHED_BUFFER:
145 new->attached_buffer = memdup(old->attached_buffer, old->size);
146 if (new->attached_buffer == NULL)
150 case BLOB_IN_NTFS_VOLUME:
152 struct ntfs_location *loc;
153 loc = memdup(old->ntfs_loc, sizeof(struct ntfs_location));
157 loc->attr_name = NULL;
159 loc->path = STRDUP(old->ntfs_loc->path);
160 if (loc->path == NULL)
162 if (loc->attr_name_nchars != 0) {
163 loc->attr_name = utf16le_dup(old->ntfs_loc->attr_name);
164 if (loc->attr_name == NULL)
176 free_blob_descriptor(new);
/*
 * Release whatever location-specific data @blob holds, selected by
 * blob->blob_location:
 *   - a blob attached to a resource descriptor is removed from that
 *     descriptor's blob_list, and the list is then tested for emptiness;
 *   - for the file-backed, staging-file and attached-buffer locations a single
 *     FREE(blob->file_on_disk) suffices, because the BUILD_BUG_ON() checks
 *     assert that file_on_disk, staging_file_name and attached_buffer all
 *     occupy the same storage;
 *   - an NTFS location (if set) is freed together with its path and
 *     attribute name.
 */
181 blob_release_location(struct blob_descriptor *blob)
183 switch (blob->blob_location) {
185 list_del(&blob->rdesc_node);
186 if (list_empty(&blob->rdesc->blob_list))
189 case BLOB_IN_FILE_ON_DISK:
191 case BLOB_IN_WINNT_FILE_ON_DISK:
192 case BLOB_WIN32_ENCRYPTED:
195 case BLOB_IN_STAGING_FILE:
196 BUILD_BUG_ON((void*)&blob->file_on_disk !=
197 (void*)&blob->staging_file_name);
199 case BLOB_IN_ATTACHED_BUFFER:
200 BUILD_BUG_ON((void*)&blob->file_on_disk !=
201 (void*)&blob->attached_buffer);
202 FREE(blob->file_on_disk);
205 case BLOB_IN_NTFS_VOLUME:
206 if (blob->ntfs_loc) {
207 FREE(blob->ntfs_loc->path);
208 FREE(blob->ntfs_loc->attr_name);
209 FREE(blob->ntfs_loc);
/* Dispose of @blob; its location-specific data is released through
 * blob_release_location(). */
219 free_blob_descriptor(struct blob_descriptor *blob)
222 blob_release_location(blob);
/* Answers true only for blobs whose location is BLOB_IN_WIM; the comment above
 * blob_decrement_refcnt() explains why such blobs are kept. */
227 /* Should this blob be retained even if it has no references? */
229 should_retain_blob(const struct blob_descriptor *blob)
231 return blob->blob_location == BLOB_IN_WIM;
/* Free @blob unless should_retain_blob() reports that it must be kept. */
235 finalize_blob(struct blob_descriptor *blob)
237 if (!should_retain_blob(blob))
238 free_blob_descriptor(blob);
242 * Decrements the reference count of the specified blob, which must be either
243 * (a) unhashed, or (b) inserted in the specified blob table.
245 * If the blob's reference count reaches 0, we may unlink it from @table and
246 * free it. However, we retain blobs with 0 reference count that originated
247 * from WIM files (BLOB_IN_WIM). We do this for two reasons:
249 * 1. This prevents information about valid blobs in a WIM file --- blobs which
250 * will continue to be present after appending to the WIM file --- from being
251 * lost merely because we dropped all references to them.
253 * 2. Blob reference counts we read from WIM files can't be trusted. It's
254 * possible that a WIM has reference counts that are too low; WIMGAPI
255 * sometimes creates WIMs where this is the case. It's also possible that
256 * blobs have been referenced from an external WIM; those blobs can
257 * potentially have any reference count at all, either lower or higher than
258 * would be expected for this WIM ("this WIM" meaning the owner of @table) if
259 * it were a standalone WIM.
261 * So we can't take the reference counts too seriously. But at least, we do
262 * recalculate by default when writing a new WIM file.
/*
 * Step-by-step summary of the body:
 *   1. A reference count that is already 0 is special-cased up front.
 *   2. Otherwise the count is decremented; a nonzero result needs no further
 *      action.
 *   3. When the count reaches 0, an unhashed blob is removed from the
 *      unhashed list and, if its data lives in a staging file, that file is
 *      unlinked relative to staging_dir_fd (the unlinkat() result is not
 *      checked).  A blob that is not to be retained is unlinked from @table;
 *      blob_table_unlink() asserts the blob is hashed, so this presumably
 *      applies to the hashed case only.
 *   4. The final test on num_opened_fds decides what happens once no file
 *      descriptors remain open on the blob.
 */
265 blob_decrement_refcnt(struct blob_descriptor *blob, struct blob_table *table)
267 if (unlikely(blob->refcnt == 0)) /* See comment above */
270 if (--blob->refcnt != 0)
273 if (blob->unhashed) {
274 list_del(&blob->unhashed_list);
276 /* If the blob has been extracted to a staging file for a FUSE
277 * mount, unlink the staging file. (Note that there still may
278 * be open file descriptors to it.) */
279 if (blob->blob_location == BLOB_IN_STAGING_FILE)
280 unlinkat(blob->staging_dir_fd,
281 blob->staging_file_name, 0);
284 if (!should_retain_blob(blob))
285 blob_table_unlink(table, blob);
288 /* If FUSE mounts are enabled, then don't actually free the blob
289 * descriptor until the last file descriptor to it has been closed. */
291 if (blob->num_opened_fds == 0)
/* Drop one open-file-descriptor count from @blob (asserting that the count is
 * nonzero beforehand).  The condition below singles out the moment at which
 * both the open-descriptor count and the reference count are 0. */
298 blob_decrement_num_opened_fds(struct blob_descriptor *blob)
300 wimlib_assert(blob->num_opened_fds != 0);
302 if (--blob->num_opened_fds == 0 && blob->refcnt == 0)
/* Link @blob at the head of the bucket selected by
 * (blob->hash_short % table->capacity).  The blob count is maintained by the
 * caller; see blob_table_insert(). */
308 blob_table_insert_raw(struct blob_table *table, struct blob_descriptor *blob)
310 size_t i = blob->hash_short % table->capacity;
312 hlist_add_head(&blob->hash_list, &table->array[i]);
/*
 * Double the number of buckets in @table and rehash its contents.
 *
 * A zeroed array of twice the old capacity is allocated; the table's array
 * and capacity fields are switched over only after the allocation has been
 * checked.  Every descriptor is then removed from its old chain and
 * reinserted with blob_table_insert_raw(), which picks the bucket using the
 * new capacity.  The _safe iterator is used because entries are unlinked
 * while the chain is being walked.
 */
316 enlarge_blob_table(struct blob_table *table)
318 size_t old_capacity, new_capacity;
319 struct hlist_head *old_array, *new_array;
320 struct blob_descriptor *blob;
321 struct hlist_node *cur, *tmp;
324 old_capacity = table->capacity;
325 new_capacity = old_capacity * 2;
326 new_array = CALLOC(new_capacity, sizeof(struct hlist_head));
327 if (new_array == NULL)
329 old_array = table->array;
330 table->array = new_array;
331 table->capacity = new_capacity;
333 for (i = 0; i < old_capacity; i++) {
334 hlist_for_each_entry_safe(blob, cur, tmp, &old_array[i], hash_list) {
335 hlist_del(&blob->hash_list);
336 blob_table_insert_raw(table, blob);
/* After linking the descriptor, the blob count is incremented; once it exceeds
 * the number of buckets the table is grown with enlarge_blob_table(). */
342 /* Insert a blob descriptor into the blob table. */
344 blob_table_insert(struct blob_table *table, struct blob_descriptor *blob)
346 blob_table_insert_raw(table, blob);
347 if (++table->num_blobs > table->capacity)
348 enlarge_blob_table(table);
/* Preconditions, both asserted: @blob is hashed (not on the unhashed list) and
 * the table currently holds at least one blob. */
351 /* Unlinks a blob descriptor from the blob table; does not free it. */
353 blob_table_unlink(struct blob_table *table, struct blob_descriptor *blob)
355 wimlib_assert(!blob->unhashed);
356 wimlib_assert(table->num_blobs != 0);
358 hlist_del(&blob->hash_list);
/* The bucket index is the value loaded (unaligned) from the start of @hash,
 * reduced modulo the table capacity; descriptors chained in that bucket are
 * then compared against @hash with hashes_equal(). */
362 /* Given a SHA-1 message digest, return the corresponding blob descriptor from
363 * the specified blob table, or NULL if there is none. */
364 struct blob_descriptor *
365 lookup_blob(const struct blob_table *table, const u8 *hash)
368 struct blob_descriptor *blob;
369 struct hlist_node *pos;
371 i = load_size_t_unaligned(hash) % table->capacity;
372 hlist_for_each_entry(blob, pos, &table->array[i], hash_list)
373 if (hashes_equal(hash, blob->hash))
/* @arg is passed through unchanged to every @visitor call.  Buckets are walked
 * in index order using the _safe chain iterator; free_blob_table() uses this
 * function with a visitor that frees each descriptor it is given. */
378 /* Call a function on all blob descriptors in the specified blob table. Stop
379 * early and return nonzero if any call to the function returns nonzero. */
381 for_blob_in_table(struct blob_table *table,
382 int (*visitor)(struct blob_descriptor *, void *), void *arg)
384 struct blob_descriptor *blob;
385 struct hlist_node *pos, *tmp;
388 for (size_t i = 0; i < table->capacity; i++) {
389 hlist_for_each_entry_safe(blob, pos, tmp, &table->array[i],
392 ret = visitor(blob, arg);
401 * This is a qsort() callback that sorts blobs into an order optimized for
402 * reading. Sorting is done primarily by blob location, then secondarily by a
403 * location-dependent order. For example, blobs in WIM resources are sorted
404 * such that the underlying WIM files will be read sequentially. This is
405 * especially important for WIM files containing solid resources.
/*
 * @p1 and @p2 each point to an array element that is itself a pointer to a
 * blob descriptor.
 *
 * Comparison keys, in order:
 *   1. blob_location (numeric difference of the enum values);
 *   2. for blobs that reference a resource descriptor: the owning WIM's GUID
 *      (memcmp), then its part number, then the resource's offset_in_wim,
 *      then the blob's offset_in_res;
 *   3. for the file-backed locations: tstrcmp() of the path;
 *   4. for blobs in an NTFS volume: tstrcmp() of the NTFS path;
 *   5. any other location: no further ordering.
 */
408 cmp_blobs_by_sequential_order(const void *p1, const void *p2)
410 const struct blob_descriptor *blob1, *blob2;
412 WIMStruct *wim1, *wim2;
414 blob1 = *(const struct blob_descriptor**)p1;
415 blob2 = *(const struct blob_descriptor**)p2;
417 v = (int)blob1->blob_location - (int)blob2->blob_location;
419 /* Different resource locations? */
423 switch (blob1->blob_location) {
425 wim1 = blob1->rdesc->wim;
426 wim2 = blob2->rdesc->wim;
428 /* Different (possibly split) WIMs? */
430 v = memcmp(wim1->hdr.guid, wim2->hdr.guid, WIM_GUID_LEN);
435 /* Different part numbers in the same WIM? */
436 v = (int)wim1->hdr.part_number - (int)wim2->hdr.part_number;
440 if (blob1->rdesc->offset_in_wim != blob2->rdesc->offset_in_wim)
441 return cmp_u64(blob1->rdesc->offset_in_wim,
442 blob2->rdesc->offset_in_wim);
444 return cmp_u64(blob1->offset_in_res, blob2->offset_in_res);
446 case BLOB_IN_FILE_ON_DISK:
448 case BLOB_IN_STAGING_FILE:
451 case BLOB_IN_WINNT_FILE_ON_DISK:
452 case BLOB_WIN32_ENCRYPTED:
454 /* Compare files by path: just a heuristic that will place files
455 * in the same directory next to each other. */
456 return tstrcmp(blob1->file_on_disk, blob2->file_on_disk);
458 case BLOB_IN_NTFS_VOLUME:
459 return tstrcmp(blob1->ntfs_loc->path, blob2->ntfs_loc->path);
462 /* No additional sorting order defined for this resource
463 * location (e.g. BLOB_IN_ATTACHED_BUFFER); simply compare
464 * everything equal to each other. */
/*
 * Sort a linked list of blob descriptors using qsort().
 *
 * @blob_list         Head of the list to sort; it is rebuilt in sorted order.
 * @list_head_offset  Byte offset, within struct blob_descriptor, of the
 *                    list_head member through which the descriptors are
 *                    linked into @blob_list.
 * @compar            qsort() comparator; its arguments point to elements of
 *                    an array of 'struct blob_descriptor *'.
 *
 * The list nodes are walked once to fill a temporary array of descriptor
 * pointers (each recovered by subtracting @list_head_offset from the node
 * address), the array is sorted, and the list is re-initialized and refilled
 * from the array with list_add_tail().
 *
 * WIMLIB_ERR_NOMEM is returned on the allocation-failure path for the
 * temporary array.
 */
470 sort_blob_list(struct list_head *blob_list, size_t list_head_offset,
471 int (*compar)(const void *, const void*))
473 struct list_head *cur;
474 struct blob_descriptor **array;
477 size_t num_blobs = 0;
479 list_for_each(cur, blob_list)
485 array_size = num_blobs * sizeof(array[0]);
486 array = MALLOC(array_size);
488 return WIMLIB_ERR_NOMEM;
490 cur = blob_list->next;
491 for (i = 0; i < num_blobs; i++) {
492 array[i] = (struct blob_descriptor*)((u8*)cur - list_head_offset);
496 qsort(array, num_blobs, sizeof(array[0]), compar);
498 INIT_LIST_HEAD(blob_list);
499 for (i = 0; i < num_blobs; i++) {
500 list_add_tail((struct list_head*)
501 ((u8*)array[i] + list_head_offset), blob_list);
/* Thin wrapper: calls sort_blob_list() with cmp_blobs_by_sequential_order as
 * the comparator and returns its result. */
507 /* Sort the specified list of blobs in an order optimized for sequential
510 sort_blob_list_by_sequential_order(struct list_head *blob_list,
511 size_t list_head_offset)
513 return sort_blob_list(blob_list, list_head_offset,
514 cmp_blobs_by_sequential_order);
/* for_blob_in_table() visitor used to gather descriptors into an array.  @_pp
 * is the address of a cursor into an array of 'struct blob_descriptor *'.
 * NOTE(review): the caller asserts afterwards that the cursor has advanced by
 * exactly num_blobs slots, so this presumably stores @blob at the cursor and
 * advances it by one. */
518 add_blob_to_array(struct blob_descriptor *blob, void *_pp)
520 struct blob_descriptor ***pp = _pp;
/* Implementation: a temporary array of table->num_blobs descriptor pointers is
 * filled via for_blob_in_table()/add_blob_to_array(), sorted with
 * cmp_blobs_by_sequential_order(), and @visitor is then called on each element
 * in sorted order with @arg.  WIMLIB_ERR_NOMEM is returned on the
 * allocation-failure path for the temporary array. */
525 /* Iterate through the blob descriptors in the specified blob table in an order
526 * optimized for sequential reading. */
528 for_blob_in_table_sorted_by_sequential_order(struct blob_table *table,
529 int (*visitor)(struct blob_descriptor *, void *),
532 struct blob_descriptor **blob_array, **p;
533 size_t num_blobs = table->num_blobs;
536 blob_array = MALLOC(num_blobs * sizeof(blob_array[0]));
538 return WIMLIB_ERR_NOMEM;
540 for_blob_in_table(table, add_blob_to_array, &p);
542 wimlib_assert(p == blob_array + num_blobs);
544 qsort(blob_array, num_blobs, sizeof(blob_array[0]),
545 cmp_blobs_by_sequential_order);
547 for (size_t i = 0; i < num_blobs; i++) {
548 ret = visitor(blob_array[i], arg);
/* The part number and reference count are stored little-endian: readers in
 * this file convert them with le16_to_cpu()/le32_to_cpu(), and
 * write_blob_descriptor() stores them with cpu_to_le16()/cpu_to_le32(). */
556 /* On-disk format of a blob descriptor in a WIM file.
558 * Note: if the WIM file contains solid resource(s), then this structure is
559 * sometimes overloaded to describe a "resource" rather than a "blob". See the
560 * code for details. */
561 struct blob_descriptor_disk {
563 /* Size, offset, and flags of the blob. */
564 struct wim_reshdr_disk reshdr;
566 /* Which part of the split WIM this blob is in; indexed from 1. */
569 /* Reference count of this blob over all WIM images. (But see comment
570 * above blob_decrement_refcnt().) */
573 /* SHA-1 message digest of the uncompressed data of this blob, or all
574 * zeroes if this blob is of zero length. */
575 u8 hash[SHA1_HASH_SIZE];
/* An entry is classified from its decoded resource header: an entry without
 * WIM_RESHDR_FLAG_SOLID ends the run, and an entry whose uncompressed_size
 * equals SOLID_RESOURCE_MAGIC_NUMBER describes a resource rather than a blob.
 * NOTE(review): @max is presumably the upper bound on the number of entries
 * examined (the caller passes the count of entries remaining in the table). */
578 /* Given a nonempty run of consecutive blob descriptors with the SOLID flag set,
579 * count how many specify resources (as opposed to blobs within those
582 * Returns the resulting count. */
584 count_solid_resources(const struct blob_descriptor_disk *entries, size_t max)
588 struct wim_reshdr reshdr;
590 get_wim_reshdr(&(entries++)->reshdr, &reshdr);
592 if (!(reshdr.flags & WIM_RESHDR_FLAG_SOLID)) {
593 /* Run was terminated by a stand-alone blob entry. */
597 if (reshdr.uncompressed_size == SOLID_RESOURCE_MAGIC_NUMBER) {
598 /* This is a resource entry. */
606 * Given a run of consecutive blob descriptors with the SOLID flag set and
607 * having @num_rdescs resource entries, load resource information from them into
608 * the resource descriptors in the @rdescs array.
610 * Returns 0 on success, or a nonzero error code on failure.
/*
 * Per-resource procedure, repeated @num_rdescs times:
 *   1. Advance through @entries until an entry whose uncompressed_size equals
 *      SOLID_RESOURCE_MAGIC_NUMBER is found; that entry describes the next
 *      solid resource.
 *   2. Initialize the resource descriptor from that header with
 *      wim_res_hdr_to_desc().
 *   3. Read a 'struct alt_chunk_table_header_disk' from the WIM file at the
 *      resource's offset_in_wim, and take the real uncompressed size,
 *      compression format and chunk size from it (all converted from
 *      little-endian).  A failed read is reported with the offset.
 *
 * The compression format number read from disk is stored directly as the
 * compression type; the BUILD_BUG_ON() checks pin the numeric values of the
 * WIMLIB_COMPRESSION_TYPE_* constants that this relies on.
 */
613 do_load_solid_info(WIMStruct *wim, struct wim_resource_descriptor **rdescs,
615 const struct blob_descriptor_disk *entries)
617 for (size_t i = 0; i < num_rdescs; i++) {
618 struct wim_reshdr reshdr;
619 struct alt_chunk_table_header_disk hdr;
620 struct wim_resource_descriptor *rdesc;
623 /* Advance to next resource entry. */
626 get_wim_reshdr(&(entries++)->reshdr, &reshdr);
627 } while (reshdr.uncompressed_size != SOLID_RESOURCE_MAGIC_NUMBER);
631 wim_res_hdr_to_desc(&reshdr, wim, rdesc);
633 /* For solid resources, the uncompressed size, compression type,
634 * and chunk size are stored in the resource itself, not in the
637 ret = full_pread(&wim->in_fd, &hdr,
638 sizeof(hdr), reshdr.offset_in_wim);
640 ERROR("Failed to read header of solid resource "
641 "(offset_in_wim=%"PRIu64")",
642 reshdr.offset_in_wim);
646 rdesc->uncompressed_size = le64_to_cpu(hdr.res_usize);
648 /* Compression format numbers must be the same as in
649 * WIMGAPI to be compatible here. */
650 BUILD_BUG_ON(WIMLIB_COMPRESSION_TYPE_NONE != 0);
651 BUILD_BUG_ON(WIMLIB_COMPRESSION_TYPE_XPRESS != 1);
652 BUILD_BUG_ON(WIMLIB_COMPRESSION_TYPE_LZX != 2);
653 BUILD_BUG_ON(WIMLIB_COMPRESSION_TYPE_LZMS != 3);
654 rdesc->compression_type = le32_to_cpu(hdr.compression_format);
656 rdesc->chunk_size = le32_to_cpu(hdr.chunk_size);
658 DEBUG("Solid resource %zu/%zu: %"PRIu64" => %"PRIu64" "
659 "(%"TS"/%"PRIu32") @ +%"PRIu64"",
661 rdesc->uncompressed_size,
663 wimlib_get_compression_type_string(rdesc->compression_type),
665 rdesc->offset_in_wim);
671 * Given a nonempty run of consecutive blob descriptors with the SOLID flag set,
672 * allocate a 'struct wim_resource_descriptor' for each resource within that
675 * Returns 0 on success, or a nonzero error code on failure.
676 * Returns the pointers and count in *rdescs_ret and *num_rdescs_ret.
/*
 * Implementation outline:
 *   - count_solid_resources() determines how many resources the run holds;
 *   - a zeroed array of that many descriptor pointers is allocated, followed
 *     by one 'struct wim_resource_descriptor' per slot;
 *   - do_load_solid_info() fills the descriptors from the on-disk entries;
 *   - on success the array and its length are stored through @rdescs_ret and
 *     @num_rdescs_ret.
 *
 * Both failure paths shown (descriptor allocation and do_load_solid_info())
 * jump to out_free_rdescs, whose loop walks all @num_rdescs slots; because the
 * pointer array comes from CALLOC(), slots never allocated are NULL there.
 */
679 load_solid_info(WIMStruct *wim,
680 const struct blob_descriptor_disk *entries,
681 size_t num_remaining_entries,
682 struct wim_resource_descriptor ***rdescs_ret,
683 size_t *num_rdescs_ret)
686 struct wim_resource_descriptor **rdescs;
690 num_rdescs = count_solid_resources(entries, num_remaining_entries);
691 rdescs = CALLOC(num_rdescs, sizeof(rdescs[0]));
693 return WIMLIB_ERR_NOMEM;
695 for (i = 0; i < num_rdescs; i++) {
696 rdescs[i] = MALLOC(sizeof(struct wim_resource_descriptor));
698 ret = WIMLIB_ERR_NOMEM;
699 goto out_free_rdescs;
703 ret = do_load_solid_info(wim, rdescs, num_rdescs, entries);
705 goto out_free_rdescs;
707 *rdescs_ret = rdescs;
708 *num_rdescs_ret = num_rdescs;
712 for (i = 0; i < num_rdescs; i++)
718 /* Given a 'struct blob_descriptor' allocated for an on-disk blob descriptor
719 * with the SOLID flag set, try to assign it to a resource in the current solid
/*
 * For a blob entry inside a solid run, @reshdr is interpreted differently from
 * a normal entry: size_in_wim is taken as the blob's size, and offset_in_wim
 * is treated as a logical offset into the uncompressed data of the run's
 * resources laid end to end.  The loop therefore walks @rdescs in order,
 * subtracting each resource's uncompressed_size from the offset, until the
 * blob [offset, offset + size) fits entirely inside one resource; the blob is
 * then attached to that resource at the remaining offset.
 *
 * If no resource in the run can contain the blob, an error is logged and
 * WIMLIB_ERR_INVALID_LOOKUP_TABLE_ENTRY is returned.
 *
 * NOTE(review): 'offset + blob->size' could wrap around for a crafted entry;
 * validate_resource() later rejects blobs whose offset_in_res + size wraps,
 * which appears to cover that case -- confirm.
 */
722 assign_blob_to_solid_resource(const struct wim_reshdr *reshdr,
723 struct blob_descriptor *blob,
724 struct wim_resource_descriptor **rdescs,
727 u64 offset = reshdr->offset_in_wim;
729 /* XXX: This linear search will be slow in the degenerate case where the
730 * number of solid resources in the run is huge. */
731 blob->size = reshdr->size_in_wim;
732 blob->flags = reshdr->flags;
733 for (size_t i = 0; i < num_rdescs; i++) {
734 if (offset + blob->size <= rdescs[i]->uncompressed_size) {
735 blob->offset_in_res = offset;
736 blob_set_is_located_in_wim_resource(blob, rdescs[i]);
739 offset -= rdescs[i]->uncompressed_size;
741 ERROR("blob could not be assigned to a solid resource");
742 return WIMLIB_ERR_INVALID_LOOKUP_TABLE_ENTRY;
/* Clean up the descriptor array of a solid run.  Only descriptors whose
 * blob_list is empty (no blob attached) are selected by the test below.
 * NOTE(review): presumably those are the ones freed here, since a descriptor
 * with blobs attached is still referenced -- confirm. */
746 free_solid_rdescs(struct wim_resource_descriptor **rdescs, size_t num_rdescs)
749 for (size_t i = 0; i < num_rdescs; i++)
750 if (list_empty(&rdescs[i]->blob_list))
/* qsort() comparator for arrays of 'struct blob_descriptor *': orders the
 * descriptors by their offset_in_res, as compared by cmp_u64(). */
757 cmp_blobs_by_offset_in_res(const void *p1, const void *p2)
759 const struct blob_descriptor *blob1, *blob2;
761 blob1 = *(const struct blob_descriptor**)p1;
762 blob2 = *(const struct blob_descriptor**)p2;
764 return cmp_u64(blob1->offset_in_res, blob2->offset_in_res);
/*
 * Checks performed, in order:
 *   1. offset_in_wim + size_in_wim of the resource must not wrap around.
 *   2. For every blob attached to the resource, offset_in_res + size must not
 *      wrap around and must not exceed the resource's uncompressed_size.
 *      While walking, the end of the previous blob is tracked to notice blobs
 *      that do not start at or after it.
 *   3. If the blobs were not in increasing, non-overlapping order, the list
 *      is sorted by offset_in_res (sort_blob_list()) and walked again; a blob
 *      that still starts before the end of its predecessor means the blobs
 *      overlap.
 *
 * Both error labels log a message and return
 * WIMLIB_ERR_INVALID_LOOKUP_TABLE_ENTRY.
 */
767 /* Validate the size and location of a WIM resource. */
769 validate_resource(struct wim_resource_descriptor *rdesc)
771 struct blob_descriptor *blob;
773 u64 expected_next_offset;
776 /* Verify that the resource itself has a valid offset and size. */
777 if (rdesc->offset_in_wim + rdesc->size_in_wim < rdesc->size_in_wim)
778 goto invalid_due_to_overflow;
780 /* Verify that each blob in the resource has a valid offset and size.
782 expected_next_offset = 0;
783 out_of_order = false;
784 list_for_each_entry(blob, &rdesc->blob_list, rdesc_node) {
785 if (blob->offset_in_res + blob->size < blob->size ||
786 blob->offset_in_res + blob->size > rdesc->uncompressed_size)
787 goto invalid_due_to_overflow;
789 if (blob->offset_in_res >= expected_next_offset)
790 expected_next_offset = blob->offset_in_res + blob->size;
795 /* If the blobs were not located at strictly increasing positions (not
796 * allowing for overlap), sort them. Then make sure that none overlap.
799 ret = sort_blob_list(&rdesc->blob_list,
800 offsetof(struct blob_descriptor,
802 cmp_blobs_by_offset_in_res);
806 expected_next_offset = 0;
807 list_for_each_entry(blob, &rdesc->blob_list, rdesc_node) {
808 if (blob->offset_in_res >= expected_next_offset)
809 expected_next_offset = blob->offset_in_res + blob->size;
811 goto invalid_due_to_overlap;
817 invalid_due_to_overflow:
818 ERROR("Invalid blob table (offset overflow)");
819 return WIMLIB_ERR_INVALID_LOOKUP_TABLE_ENTRY;
821 invalid_due_to_overlap:
822 ERROR("Invalid blob table (blobs in solid resource overlap)");
823 return WIMLIB_ERR_INVALID_LOOKUP_TABLE_ENTRY;
/* Complete a solid run: each resource descriptor in @rdescs is checked with
 * validate_resource(), and the array is handed to free_solid_rdescs(). */
827 finish_solid_rdescs(struct wim_resource_descriptor **rdescs, size_t num_rdescs)
830 for (size_t i = 0; i < num_rdescs; i++) {
831 ret = validate_resource(rdescs[i]);
835 free_solid_rdescs(rdescs, num_rdescs);
840 * read_blob_table() -
842 * Read the blob table from a WIM file. Usually, each entry in this table
843 * describes a "blob", or equivalently a "resource", that the WIM file contains,
844 * along with its location and SHA-1 message digest. Descriptors for
845 * non-metadata blobs will be saved in the in-memory blob table
846 * (wim->blob_table), whereas descriptors for metadata blobs will be saved in a
847 * special location per-image (the wim->image_metadata array).
849 * However, in WIM_VERSION_SOLID (3584) WIMs, a resource may contain multiple
850 * blobs that are compressed together. Such a resource is called a "solid
851 * resource". Solid resources are still described in the on-disk "blob table",
852 * although the format is not the most logical. A consecutive sequence of
853 * entries that all have flag WIM_RESHDR_FLAG_SOLID (0x10) set is a "solid run".
854 * A solid run describes a set of solid resources, each of which contains a set
855 * of blobs. In a solid run, a 'struct wim_reshdr_disk' with 'uncompressed_size
856 * = SOLID_RESOURCE_MAGIC_NUMBER (0x100000000)' specifies a solid resource,
857 * whereas any other 'struct wim_reshdr_disk' specifies a blob within a solid
858 * resource. There are some oddities in how we need to determine which solid
859 * resource a blob is actually in; see the code for details.
861 * Possible return values:
862 * WIMLIB_ERR_SUCCESS (0)
863 * WIMLIB_ERR_INVALID_LOOKUP_TABLE_ENTRY
866 * Or an error code caused by failure to read the blob table from the WIM
/*
 * Outline of the body:
 *   - The entry count is the blob table resource's uncompressed size divided
 *     by sizeof(struct blob_descriptor_disk); the table is read into a buffer
 *     with wim_reshdr_to_data().
 *   - The in-memory hash table is created with (2 * num_entries + 1) buckets.
 *   - Each on-disk entry gets a new blob descriptor, which is bound either to
 *     a resource of the current solid run or to its own freshly allocated
 *     resource descriptor.  It is then dropped (zero hash, wrong part number,
 *     unusable metadata entry, duplicate hash), stored as an image's
 *     metadata_blob, or inserted into the table.
 *   - After the loop a solid run still open is finished, the header's
 *     image_count is lowered if fewer metadata resources than images were
 *     found, and warnings summarize ignored entries.
 *   - On success the table is published in wim->blob_table.  The failure path
 *     at the end releases the current solid descriptors, the current blob and
 *     the partially built table.
 */
870 read_blob_table(WIMStruct *wim)
875 struct blob_table *table = NULL;
876 struct blob_descriptor *cur_blob = NULL;
877 size_t num_duplicate_blobs = 0;
878 size_t num_wrong_part_blobs = 0;
880 struct wim_resource_descriptor **cur_solid_rdescs = NULL;
881 size_t cur_num_solid_rdescs = 0;
883 DEBUG("Reading blob table.");
885 /* Calculate the number of entries in the blob table. */
886 num_entries = wim->hdr.blob_table_reshdr.uncompressed_size /
887 sizeof(struct blob_descriptor_disk);
889 /* Read the blob table into a buffer. */
890 ret = wim_reshdr_to_data(&wim->hdr.blob_table_reshdr, wim, &buf);
894 /* Allocate a hash table to map SHA-1 message digests into blob
895 * descriptors. This is the in-memory "blob table". */
896 table = new_blob_table(num_entries * 2 + 1);
900 /* Allocate and initialize blob descriptors from the raw blob table
902 for (size_t i = 0; i < num_entries; i++) {
903 const struct blob_descriptor_disk *disk_entry =
904 &((const struct blob_descriptor_disk*)buf)[i];
905 struct wim_reshdr reshdr;
908 /* Get the resource header */
909 get_wim_reshdr(&disk_entry->reshdr, &reshdr);
911 DEBUG("reshdr: size_in_wim=%"PRIu64", "
912 "uncompressed_size=%"PRIu64", "
913 "offset_in_wim=%"PRIu64", "
915 reshdr.size_in_wim, reshdr.uncompressed_size,
916 reshdr.offset_in_wim, reshdr.flags);
918 /* Ignore SOLID flag if it isn't supposed to be used in this WIM
920 if (wim->hdr.wim_version == WIM_VERSION_DEFAULT)
921 reshdr.flags &= ~WIM_RESHDR_FLAG_SOLID;
923 /* Allocate a new 'struct blob_descriptor'. */
924 cur_blob = new_blob_descriptor();
928 /* Get the part number, reference count, and hash. */
929 part_number = le16_to_cpu(disk_entry->part_number);
930 cur_blob->refcnt = le32_to_cpu(disk_entry->refcnt);
931 copy_hash(cur_blob->hash, disk_entry->hash);
933 if (reshdr.flags & WIM_RESHDR_FLAG_SOLID) {
937 if (!cur_solid_rdescs) {
938 /* Starting new run */
939 ret = load_solid_info(wim, disk_entry,
942 &cur_num_solid_rdescs);
947 if (reshdr.uncompressed_size == SOLID_RESOURCE_MAGIC_NUMBER) {
948 /* Resource entry, not blob entry */
949 goto free_cur_blob_and_continue;
954 ret = assign_blob_to_solid_resource(&reshdr,
957 cur_num_solid_rdescs);
962 /* Normal blob/resource entry; SOLID not set. */
964 struct wim_resource_descriptor *rdesc;
966 if (unlikely(cur_solid_rdescs)) {
967 /* This entry terminated a solid run. */
968 ret = finish_solid_rdescs(cur_solid_rdescs,
969 cur_num_solid_rdescs);
970 cur_solid_rdescs = NULL;
975 /* How to handle an uncompressed resource with its
976 * uncompressed size different from its compressed size?
978 * Based on a simple test, WIMGAPI seems to handle this
981 * if (size_in_wim > uncompressed_size) {
982 * Ignore uncompressed_size; use size_in_wim
985 * Honor uncompressed_size, but treat the part of
986 * the file data above size_in_wim as all zeros.
989 * So we will do the same. */
990 if (unlikely(!(reshdr.flags &
991 WIM_RESHDR_FLAG_COMPRESSED) &&
992 (reshdr.size_in_wim >
993 reshdr.uncompressed_size)))
995 reshdr.uncompressed_size = reshdr.size_in_wim;
998 /* Set up a resource descriptor for this blob. */
1000 rdesc = MALLOC(sizeof(struct wim_resource_descriptor));
1004 wim_res_hdr_to_desc(&reshdr, wim, rdesc);
1006 cur_blob->offset_in_res = 0;
1007 cur_blob->size = reshdr.uncompressed_size;
1008 cur_blob->flags = reshdr.flags;
1010 blob_set_is_located_in_wim_resource(cur_blob, rdesc);
1013 /* cur_blob is now a blob bound to a resource. */
1015 /* Ignore entries with all zeroes in the hash field. */
1016 if (is_zero_hash(cur_blob->hash))
1017 goto free_cur_blob_and_continue;
1019 /* Verify that the part number matches that of the underlying
1021 if (part_number != wim->hdr.part_number) {
1022 num_wrong_part_blobs++;
1023 goto free_cur_blob_and_continue;
1026 if (reshdr.flags & WIM_RESHDR_FLAG_METADATA) {
1028 /* Blob table entry for a metadata resource. */
1030 /* Metadata entries with no references must be ignored.
1031 * See, for example, the WinPE WIMs from the WAIK v2.1.
1033 if (cur_blob->refcnt == 0)
1034 goto free_cur_blob_and_continue;
1036 if (cur_blob->refcnt != 1) {
1037 /* We don't currently support this case due to
1038 * the complications of multiple images sharing
1039 * the same metadata resource or a metadata
1040 * resource also being referenced by files. */
1041 ERROR("Found metadata resource with refcnt != 1");
1042 ret = WIMLIB_ERR_INVALID_LOOKUP_TABLE_ENTRY;
1046 if (wim->hdr.part_number != 1) {
1047 WARNING("Ignoring metadata resource found in a "
1048 "non-first part of the split WIM");
1049 goto free_cur_blob_and_continue;
1052 /* The number of entries in the blob table with
1053 * WIM_RESHDR_FLAG_METADATA set should be the same as
1054 * the image_count field in the WIM header. */
1055 if (image_index == wim->hdr.image_count) {
1056 WARNING("Found more metadata resources than images");
1057 goto free_cur_blob_and_continue;
1060 /* Notice very carefully: We are assigning the metadata
1061 * resources to images in the same order in which their
1062 * blob table entries occur on disk. (This is also the
1063 * behavior of Microsoft's software.) In particular,
1064 * this overrides the actual locations of the metadata
1065 * resources themselves in the WIM file as well as any
1066 * information written in the XML data. */
1067 DEBUG("Found metadata resource for image %"PRIu32" at "
1068 "offset %"PRIu64".",
1070 reshdr.offset_in_wim);
1072 wim->image_metadata[image_index++]->metadata_blob = cur_blob;
1074 /* Blob table entry for a non-metadata blob. */
1076 /* Ignore this blob if it's a duplicate. */
1077 if (lookup_blob(table, cur_blob->hash)) {
1078 num_duplicate_blobs++;
1079 goto free_cur_blob_and_continue;
1082 /* Insert the blob into the in-memory blob table, keyed
1083 * by its SHA-1 message digest. */
1084 blob_table_insert(table, cur_blob);
1089 free_cur_blob_and_continue:
1090 if (cur_solid_rdescs &&
1091 cur_blob->blob_location == BLOB_IN_WIM)
1092 blob_unset_is_located_in_wim_resource(cur_blob);
1093 free_blob_descriptor(cur_blob);
1097 if (cur_solid_rdescs) {
1098 /* End of blob table terminated a solid run. */
1099 ret = finish_solid_rdescs(cur_solid_rdescs, cur_num_solid_rdescs);
1100 cur_solid_rdescs = NULL;
1105 if (wim->hdr.part_number == 1 && image_index != wim->hdr.image_count) {
1106 WARNING("Could not find metadata resources for all images");
1107 for (u32 i = image_index; i < wim->hdr.image_count; i++)
1108 put_image_metadata(wim->image_metadata[i], NULL);
1109 wim->hdr.image_count = image_index;
1112 if (num_duplicate_blobs > 0)
1113 WARNING("Ignoring %zu duplicate blobs", num_duplicate_blobs);
1115 if (num_wrong_part_blobs > 0) {
1116 WARNING("Ignoring %zu blobs with wrong part number",
1117 num_wrong_part_blobs);
1120 DEBUG("Done reading blob table.");
1121 wim->blob_table = table;
1126 ERROR("Not enough memory to read blob table!");
1127 ret = WIMLIB_ERR_NOMEM;
1129 free_solid_rdescs(cur_solid_rdescs, cur_num_solid_rdescs);
1130 free_blob_descriptor(cur_blob);
1131 free_blob_table(table);
/* Fill in one on-disk blob table entry at @disk_entry: the resource header is
 * encoded with put_wim_reshdr(), @part_number and @refcnt are stored
 * little-endian, and @hash is copied into the entry. */
1138 write_blob_descriptor(struct blob_descriptor_disk *disk_entry,
1139 const struct wim_reshdr *out_reshdr,
1140 u16 part_number, u32 refcnt, const u8 *hash)
1142 put_wim_reshdr(out_reshdr, &disk_entry->reshdr);
1143 disk_entry->part_number = cpu_to_le16(part_number);
1144 disk_entry->refcnt = cpu_to_le32(refcnt);
1145 copy_hash(disk_entry->hash, hash);
/*
 * Build the on-disk blob table for the blobs in @blob_list (linked through
 * blob_table_list) in a temporary buffer, then write it out as a resource.
 *
 * Two passes are made over the list:
 *   1. Size computation: one entry per blob, plus one extra entry each time a
 *      solid blob's out_res_offset_in_wim differs from that of the previous
 *      solid resource seen (i.e. a new solid resource begins).
 *   2. Emission: for a solid blob that starts a new solid resource, a resource
 *      entry is written first (uncompressed_size set to
 *      SOLID_RESOURCE_MAGIC_NUMBER, flags WIM_RESHDR_FLAG_SOLID, refcnt 1,
 *      zero hash).  The blob's own entry follows, with its offset shifted by
 *      the summed uncompressed sizes of the preceding solid resources.
 *      Non-solid blobs are written with their out_reshdr unchanged.
 *
 * An assertion checks that pass 2 produced exactly the number of bytes
 * computed in pass 1.  WIMLIB_ERR_NOMEM is returned if the temporary buffer
 * cannot be allocated.
 */
1148 /* Note: the list of blob descriptors must be sorted so that all entries for the
1149 * same solid resource are consecutive. In addition, blob descriptors with
1150 * WIM_RESHDR_FLAG_METADATA set must be in the same order as the indices of the
1151 * underlying images. */
1153 write_blob_table_from_blob_list(struct list_head *blob_list,
1154 struct filedes *out_fd,
1156 struct wim_reshdr *out_reshdr,
1157 int write_resource_flags)
1160 struct blob_descriptor *blob;
1161 struct blob_descriptor_disk *table_buf;
1162 struct blob_descriptor_disk *table_buf_ptr;
1164 u64 prev_res_offset_in_wim = ~0ULL;
1165 u64 prev_uncompressed_size;
1169 list_for_each_entry(blob, blob_list, blob_table_list) {
1170 table_size += sizeof(struct blob_descriptor_disk);
1172 if (blob->out_reshdr.flags & WIM_RESHDR_FLAG_SOLID &&
1173 blob->out_res_offset_in_wim != prev_res_offset_in_wim)
1175 table_size += sizeof(struct blob_descriptor_disk);
1176 prev_res_offset_in_wim = blob->out_res_offset_in_wim;
1180 DEBUG("Writing WIM blob table (size=%zu, offset=%"PRIu64")",
1181 table_size, out_fd->offset);
1183 table_buf = MALLOC(table_size);
1184 if (table_buf == NULL) {
1185 ERROR("Failed to allocate %zu bytes for temporary blob table",
1187 return WIMLIB_ERR_NOMEM;
1189 table_buf_ptr = table_buf;
1191 prev_res_offset_in_wim = ~0ULL;
1192 prev_uncompressed_size = 0;
1194 list_for_each_entry(blob, blob_list, blob_table_list) {
1195 if (blob->out_reshdr.flags & WIM_RESHDR_FLAG_SOLID) {
1196 struct wim_reshdr tmp_reshdr;
1198 /* Eww. When WIMGAPI sees multiple solid resources, it
1199 * expects the offsets to be adjusted as if there were
1200 * really only one solid resource. */
1202 if (blob->out_res_offset_in_wim != prev_res_offset_in_wim) {
1203 /* Put the resource entry for solid resource */
1204 tmp_reshdr.offset_in_wim = blob->out_res_offset_in_wim;
1205 tmp_reshdr.size_in_wim = blob->out_res_size_in_wim;
1206 tmp_reshdr.uncompressed_size = SOLID_RESOURCE_MAGIC_NUMBER;
1207 tmp_reshdr.flags = WIM_RESHDR_FLAG_SOLID;
1209 write_blob_descriptor(table_buf_ptr++, &tmp_reshdr,
1210 part_number, 1, zero_hash);
1212 logical_offset += prev_uncompressed_size;
1214 prev_res_offset_in_wim = blob->out_res_offset_in_wim;
1215 prev_uncompressed_size = blob->out_res_uncompressed_size;
1217 tmp_reshdr = blob->out_reshdr;
1218 tmp_reshdr.offset_in_wim += logical_offset;
1219 write_blob_descriptor(table_buf_ptr++, &tmp_reshdr,
1220 part_number, blob->out_refcnt, blob->hash);
1222 write_blob_descriptor(table_buf_ptr++, &blob->out_reshdr,
1223 part_number, blob->out_refcnt, blob->hash);
1227 wimlib_assert((u8*)table_buf_ptr - (u8*)table_buf == table_size);
1229 /* Write the blob table uncompressed. Although wimlib can handle a
1230 * compressed blob table, MS software cannot. */
1231 ret = write_wim_resource_from_buffer(table_buf,
1233 WIM_RESHDR_FLAG_METADATA,
1235 WIMLIB_COMPRESSION_TYPE_NONE,
1239 write_resource_flags);
1241 DEBUG("ret=%d", ret);
/* Details: the SHA-1 of the @size bytes at @buffer is computed and looked up
 * in @blob_table.  A hit is reused (its size is asserted to equal @size).
 * Otherwise a new descriptor is created that owns a private copy of the data
 * (memdup), is marked BLOB_IN_ATTACHED_BUFFER, receives the hash, and is
 * inserted into @blob_table.  If the copy cannot be made, the new descriptor
 * is freed again. */
1245 /* Allocate a blob descriptor for the contents of the buffer, or re-use an
1246 * existing descriptor in @blob_table for an identical blob. */
1247 struct blob_descriptor *
1248 new_blob_from_data_buffer(const void *buffer, size_t size,
1249 struct blob_table *blob_table)
1251 u8 hash[SHA1_HASH_SIZE];
1252 struct blob_descriptor *blob, *existing_blob;
1254 sha1_buffer(buffer, size, hash);
1255 existing_blob = lookup_blob(blob_table, hash);
1256 if (existing_blob) {
1257 wimlib_assert(existing_blob->size == size);
1258 blob = existing_blob;
1262 blob = new_blob_descriptor();
1265 buffer_copy = memdup(buffer, size);
1266 if (buffer_copy == NULL) {
1267 free_blob_descriptor(blob);
1270 blob->blob_location = BLOB_IN_ATTACHED_BUFFER;
1271 blob->attached_buffer = buffer_copy;
1273 copy_hash(blob->hash, hash);
1274 blob_table_insert(blob_table, blob);
1280 * Calculate the SHA-1 message digest of a blob and move its descriptor from the
1281 * list of unhashed blobs to the blob table, possibly joining it with an
1287 * The blob table in which the blob needs to be indexed
1289 * On success, a pointer to the resulting blob descriptor is written to
1290 * this location. This will be the same as @blob if it was inserted into
1291 * the blob table, or different if a duplicate blob was found.
1293 * Returns 0 on success; nonzero if there is an error reading the blob data.
1296 hash_unhashed_blob(struct blob_descriptor *blob, struct blob_table *blob_table,
1297 struct blob_descriptor **blob_ret)
1300 struct blob_descriptor *duplicate_blob;
1301 struct blob_descriptor **back_ptr;
/* Only a blob that is still marked as unhashed may be passed in. */
1303 wimlib_assert(blob->unhashed);
1305 /* back_ptr must be saved because @back_inode and @back_stream_id are in
1306 * union with the SHA-1 message digest and will no longer be valid once
1307 * the SHA-1 has been calculated. */
1308 back_ptr = retrieve_pointer_to_unhashed_blob(blob);
/* Calculate the SHA-1 of the blob; the lookup below reads blob->hash.
 * NOTE(review): the handling of a nonzero @ret is not visible in this
 * excerpt. */
1310 ret = sha1_blob(blob);
/* Unlink the blob from the list it is on via its unhashed_list member. */
1314 list_del(&blob->unhashed_list);
1317 /* Look for a duplicate blob */
1318 duplicate_blob = lookup_blob(blob_table, blob->hash);
1319 if (duplicate_blob) {
1320 /* We have a duplicate blob. Transfer the reference counts from
1321 * this blob to the duplicate and update the reference to this
1322 * blob (from a stream) to point to the duplicate. The caller
1323 * is responsible for freeing @blob if needed. */
1324 wimlib_assert(duplicate_blob->size == blob->size);
1325 duplicate_blob->refcnt += blob->refcnt;
/* Redirect the saved back-pointer, then continue with the duplicate. */
1327 *back_ptr = duplicate_blob;
1328 blob = duplicate_blob;
1330 /* No duplicate blob, so we need to insert this blob into the
1331 * blob table and treat it as a hashed blob. */
1332 blob_table_insert(blob_table, blob);
/* Fill @wentry (a struct wimlib_resource_entry) from the fields of @blob.
 * @wentry is zeroed first, so every field not assigned below stays zero. */
1339 blob_to_wimlib_resource_entry(const struct blob_descriptor *blob,
1340 struct wimlib_resource_entry *wentry)
1342 memset(wentry, 0, sizeof(*wentry));
1344 wentry->uncompressed_size = blob->size;
/* The fields below that are read through blob->rdesc apply to a blob whose
 * location is a WIM file. */
1345 if (blob->blob_location == BLOB_IN_WIM) {
1346 wentry->part_number = blob->rdesc->wim->hdr.part_number;
/* For a blob flagged as solid, the reported offset is offset_in_res.
 * NOTE(review): the line separating this from the next two assignments
 * (presumably an `else`) is not visible in this excerpt. */
1347 if (blob->flags & WIM_RESHDR_FLAG_SOLID) {
1348 wentry->offset = blob->offset_in_res;
1350 wentry->compressed_size = blob->rdesc->size_in_wim;
1351 wentry->offset = blob->rdesc->offset_in_wim;
/* The raw_resource_* fields always describe the containing resource. */
1353 wentry->raw_resource_offset_in_wim = blob->rdesc->offset_in_wim;
1354 wentry->raw_resource_compressed_size = blob->rdesc->size_in_wim;
1355 wentry->raw_resource_uncompressed_size = blob->rdesc->uncompressed_size;
1357 copy_hash(wentry->sha1_hash, blob->hash);
1358 wentry->reference_count = blob->refcnt;
/* Each field below reports whether the corresponding WIM_RESHDR_FLAG_* bit
 * is set in blob->flags; the solid flag is reported as `packed`. */
1359 wentry->is_compressed = (blob->flags & WIM_RESHDR_FLAG_COMPRESSED) != 0;
1360 wentry->is_metadata = (blob->flags & WIM_RESHDR_FLAG_METADATA) != 0;
1361 wentry->is_free = (blob->flags & WIM_RESHDR_FLAG_FREE) != 0;
1362 wentry->is_spanned = (blob->flags & WIM_RESHDR_FLAG_SPANNED) != 0;
1363 wentry->packed = (blob->flags & WIM_RESHDR_FLAG_SOLID) != 0;
/* Context that do_iterate_blob() receives through its void pointer
 * argument. */
1366 struct iterate_blob_context {
/* Callback that do_iterate_blob() invokes with the converted entry. */
1367 wimlib_iterate_lookup_table_callback_t cb;
/* Per-blob visitor: convert @blob to a struct wimlib_resource_entry and hand
 * it to the callback stored in the context.  @_ctx must point to a
 * struct iterate_blob_context.  Returns whatever the callback returns. */
1372 do_iterate_blob(struct blob_descriptor *blob, void *_ctx)
1374 struct iterate_blob_context *ctx = _ctx;
/* The entry is a local, so it is only valid for the duration of the call. */
1375 struct wimlib_resource_entry entry;
1377 blob_to_wimlib_resource_entry(blob, &entry);
1378 return (*ctx->cb)(&entry, ctx->user_ctx);
1381 /* API function documented in wimlib.h
 *
 * Visits blobs of @wim through do_iterate_blob(), which forwards each one to
 * @cb together with @user_ctx: first the metadata blob of every image (only
 * when wim_has_metadata(wim) is true), then every blob reached by
 * for_blob_in_table() on wim->blob_table.  The result of that final walk is
 * returned.
 *
 * NOTE(review): the condition guarding the WIMLIB_ERR_INVALID_PARAM return
 * and the handling of @ret inside the image loop are not visible in this
 * excerpt. */
1383 wimlib_iterate_lookup_table(WIMStruct *wim, int flags,
1384 wimlib_iterate_lookup_table_callback_t cb,
1388 return WIMLIB_ERR_INVALID_PARAM;
/* Bundle the caller's arguments for do_iterate_blob(). */
1390 struct iterate_blob_context ctx = {
1392 .user_ctx = user_ctx,
/* Metadata blobs are reached through wim->image_metadata[], one per image. */
1394 if (wim_has_metadata(wim)) {
1396 for (int i = 0; i < wim->hdr.image_count; i++) {
1397 ret = do_iterate_blob(wim->image_metadata[i]->metadata_blob,
1403 return for_blob_in_table(wim->blob_table, do_iterate_blob, &ctx);