4 * A blob table maps SHA-1 message digests to "blobs", which are nonempty
5 * sequences of binary data. Within a WIM file, blobs are single-instanced.
7 * This file also contains code to read and write the corresponding on-disk
8 * representation of this table in the WIM file format.
12 * Copyright (C) 2012-2016 Eric Biggers
14 * This file is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 3 of the License, or (at your option) any
19 * This file is distributed in the hope that it will be useful, but WITHOUT
20 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
21 * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
24 * You should have received a copy of the GNU Lesser General Public License
25 * along with this file; if not, see https://www.gnu.org/licenses/.
34 #include <unistd.h> /* for unlink() */
36 #include "wimlib/assert.h"
37 #include "wimlib/bitops.h"
38 #include "wimlib/blob_table.h"
39 #include "wimlib/dentry.h"
40 #include "wimlib/encoding.h"
41 #include "wimlib/endianness.h"
42 #include "wimlib/error.h"
43 #include "wimlib/metadata.h"
44 #include "wimlib/ntfs_3g.h"
45 #include "wimlib/resource.h"
46 #include "wimlib/unaligned.h"
47 #include "wimlib/util.h"
48 #include "wimlib/win32.h"
49 #include "wimlib/write.h"
51 /* A hash table mapping SHA-1 message digests to blob descriptors */
/* NOTE(review): the 'struct blob_table {' opener and at least one member
 * (a blob count, referenced as table->num_blobs elsewhere in this file)
 * appear to be elided from this excerpt — confirm against the full file. */
53 struct hlist_head *array; /* bucket array; length is mask + 1 */
55 size_t mask; /* capacity - 1; capacity is a power of 2 */
/* Allocate a new blob table whose capacity is @capacity rounded up to a power
 * of 2.  On success returns the table; on allocation failure logs an error
 * and (presumably — the error path is partially elided here) returns NULL. */
59 new_blob_table(size_t capacity)
61 struct blob_table *table;
62 struct hlist_head *array;
64 capacity = roundup_pow_of_2(capacity);
66 table = MALLOC(sizeof(struct blob_table));
/* CALLOC zero-initializes the buckets, i.e. all hlist heads start empty. */
70 array = CALLOC(capacity, sizeof(array[0]));
77 table->mask = capacity - 1;
82 ERROR("Failed to allocate memory for blob table "
83 "with capacity %zu", capacity);
/* for_blob_in_table() visitor that frees each blob descriptor; the second
 * argument is unused. */
88 do_free_blob_descriptor(struct blob_descriptor *blob, void *_ignore)
90 free_blob_descriptor(blob);
/* Free the blob table, including all blob descriptors it contains.
 * (Freeing of table->array and the table itself is elided in this excerpt.) */
95 free_blob_table(struct blob_table *table)
98 for_blob_in_table(table, do_free_blob_descriptor, NULL);
/* Allocate a new, zeroed blob descriptor.  Zero-initialization via CALLOC is
 * what sets blob_location to BLOB_NONEXISTENT — the STATIC_ASSERT guards
 * that assumption.  Returns NULL on allocation failure. */
104 struct blob_descriptor *
105 new_blob_descriptor(void)
107 STATIC_ASSERT(BLOB_NONEXISTENT == 0);
108 return CALLOC(1, sizeof(struct blob_descriptor));
/* Make a deep copy of a blob descriptor.  The shallow copy (memdup) is then
 * fixed up per blob_location so the clone owns its own location data.
 * On any allocation failure the partially-built clone is freed and
 * (presumably) NULL is returned — the failure labels are elided here. */
111 struct blob_descriptor *
112 clone_blob_descriptor(const struct blob_descriptor *old)
114 struct blob_descriptor *new;
116 new = memdup(old, sizeof(struct blob_descriptor));
120 switch (new->blob_location) {
/* BLOB_IN_WIM case (label elided): link the clone onto the resource
 * descriptor's blob list alongside the original. */
122 list_add(&new->rdesc_node, &new->rdesc->blob_list);
125 case BLOB_IN_FILE_ON_DISK:
127 case BLOB_IN_STAGING_FILE:
/* These two cases share code because file_on_disk and staging_file_name
 * are members of the same union — verified at compile time. */
128 STATIC_ASSERT((void*)&old->file_on_disk ==
129 (void*)&old->staging_file_name);
131 new->file_on_disk = TSTRDUP(old->file_on_disk);
132 if (new->file_on_disk == NULL)
136 case BLOB_IN_WINDOWS_FILE:
137 new->windows_file = clone_windows_file(old->windows_file);
140 case BLOB_IN_ATTACHED_BUFFER:
141 new->attached_buffer = memdup(old->attached_buffer, old->size);
142 if (new->attached_buffer == NULL)
146 case BLOB_IN_NTFS_VOLUME:
147 new->ntfs_loc = clone_ntfs_location(old->ntfs_loc);
/* Error path: free whatever was duplicated so far. */
156 free_blob_descriptor(new);
160 /* Release a blob descriptor from its location, if any, and set its new location
161 * to BLOB_NONEXISTENT. */
/* Free/detach whatever location-specific data the blob owns, then mark it
 * BLOB_NONEXISTENT.  Mirrors the per-location ownership set up in
 * clone_blob_descriptor(). */
163 blob_release_location(struct blob_descriptor *blob)
165 switch (blob->blob_location) {
/* BLOB_IN_WIM case (label elided): unlink from the resource's blob list;
 * if this was the last blob in the resource, drop a reference on the
 * owning WIMStruct (resource descriptor cleanup is elided here). */
167 struct wim_resource_descriptor *rdesc = blob->rdesc;
169 list_del(&blob->rdesc_node);
170 if (list_empty(&rdesc->blob_list)) {
171 wim_decrement_refcnt(rdesc->wim);
176 case BLOB_IN_FILE_ON_DISK:
178 case BLOB_IN_STAGING_FILE:
179 STATIC_ASSERT((void*)&blob->file_on_disk ==
180 (void*)&blob->staging_file_name);
182 case BLOB_IN_ATTACHED_BUFFER:
/* All three string/buffer locations alias the same union member, so one
 * FREE covers them — verified by the STATIC_ASSERTs. */
183 STATIC_ASSERT((void*)&blob->file_on_disk ==
184 (void*)&blob->attached_buffer);
185 FREE(blob->file_on_disk);
188 case BLOB_IN_WINDOWS_FILE:
189 free_windows_file(blob->windows_file);
193 case BLOB_IN_NTFS_VOLUME:
194 free_ntfs_location(blob->ntfs_loc);
198 blob->blob_location = BLOB_NONEXISTENT;
/* Release the blob's location data and free the descriptor itself
 * (the FREE of @blob is elided in this excerpt; safe on NULL per the
 * file's conventions — confirm in full source). */
202 free_blob_descriptor(struct blob_descriptor *blob)
205 blob_release_location(blob);
210 /* Should this blob be retained even if it has no references?  WIM-backed
 * blobs are kept; see the long rationale above blob_decrement_refcnt(). */
212 should_retain_blob(const struct blob_descriptor *blob)
214 return blob->blob_location == BLOB_IN_WIM;
/* Dispose of a blob that has reached refcnt 0 (and has no open fds),
 * unless it must be retained because it lives in a WIM file. */
218 finalize_blob(struct blob_descriptor *blob)
220 if (!should_retain_blob(blob))
221 free_blob_descriptor(blob);
225 * Decrements the reference count of the specified blob, which must be either
226 * (a) unhashed, or (b) inserted in the specified blob table.
228 * If the blob's reference count reaches 0, we may unlink it from @table and
229 * free it. However, we retain blobs with 0 reference count that originated
230 * from WIM files (BLOB_IN_WIM). We do this for two reasons:
232 * 1. This prevents information about valid blobs in a WIM file --- blobs which
233 * will continue to be present after appending to the WIM file --- from being
234 * lost merely because we dropped all references to them.
236 * 2. Blob reference counts we read from WIM files can't be trusted. It's
237 * possible that a WIM has reference counts that are too low; WIMGAPI
238 * sometimes creates WIMs where this is the case. It's also possible that
239 * blobs have been referenced from an external WIM; those blobs can
240 * potentially have any reference count at all, either lower or higher than
241 * would be expected for this WIM ("this WIM" meaning the owner of @table) if
242 * it were a standalone WIM.
244 * So we can't take the reference counts too seriously. But at least, we do
245 * recalculate by default when writing a new WIM file.
/* Convenience wrapper: decrement the blob's reference count by exactly 1.
 * See the long comment above for the full semantics. */
248 blob_decrement_refcnt(struct blob_descriptor *blob, struct blob_table *table)
250 blob_subtract_refcnt(blob, table, 1);
/* Subtract @count from the blob's reference count, clamping at 0 rather than
 * underflowing (on-disk refcounts are untrustworthy; see comment above).
 * When the count reaches 0: unhashed blobs are removed from their unhashed
 * list (and any FUSE staging file unlinked); hashed blobs are unlinked from
 * @table unless WIM-backed; the descriptor is freed unless FUSE still holds
 * open file descriptors to it. */
254 blob_subtract_refcnt(struct blob_descriptor *blob, struct blob_table *table,
257 if (unlikely(blob->refcnt < count)) {
258 blob->refcnt = 0; /* See comment above */
262 blob->refcnt -= count;
264 if (blob->refcnt != 0)
267 if (blob->unhashed) {
268 list_del(&blob->unhashed_list);
270 /* If the blob has been extracted to a staging file for a FUSE
271 * mount, unlink the staging file. (Note that there still may
272 * be open file descriptors to it.) */
273 if (blob->blob_location == BLOB_IN_STAGING_FILE)
274 unlinkat(blob->staging_dir_fd,
275 blob->staging_file_name, 0);
/* Hashed-blob path (else branch elided in this excerpt). */
278 if (!should_retain_blob(blob))
279 blob_table_unlink(table, blob);
282 /* If FUSE mounts are enabled, then don't actually free the blob
283 * descriptor until the last file descriptor to it has been closed. */
285 if (blob->num_opened_fds == 0)
/* FUSE support: note that one file descriptor to this blob was closed.
 * When the last fd closes on an already-unreferenced blob, finalize it
 * (free unless WIM-backed). */
292 blob_decrement_num_opened_fds(struct blob_descriptor *blob)
294 wimlib_assert(blob->num_opened_fds != 0);
296 if (--blob->num_opened_fds == 0 && blob->refcnt == 0)
/* Hash-insert only: place the blob at the head of its bucket, chosen from the
 * low bits of its SHA-1 digest.  Does not update the blob count or trigger
 * table growth — see blob_table_insert(). */
302 blob_table_insert_raw(struct blob_table *table, struct blob_descriptor *blob)
304 size_t i = blob->hash_short & table->mask;
306 hlist_add_head(&blob->hash_list, &table->array[i]);
/* Double the bucket count and rehash every blob into the new array.
 * On CALLOC failure the table is simply left at its old size (growth is
 * best-effort; the old-array FREE is elided in this excerpt). */
310 enlarge_blob_table(struct blob_table *table)
312 size_t old_capacity, new_capacity;
313 struct hlist_head *old_array, *new_array;
314 struct blob_descriptor *blob;
315 struct hlist_node *tmp;
318 old_capacity = table->mask + 1;
319 new_capacity = old_capacity * 2;
320 new_array = CALLOC(new_capacity, sizeof(struct hlist_head));
321 if (new_array == NULL)
/* Swap in the new array/mask first so blob_table_insert_raw() below
 * hashes into the new buckets. */
323 old_array = table->array;
324 table->array = new_array;
325 table->mask = new_capacity - 1;
/* _safe iteration required: insertion relinks each node. */
327 for (i = 0; i < old_capacity; i++)
328 hlist_for_each_entry_safe(blob, tmp, &old_array[i], hash_list)
329 blob_table_insert_raw(table, blob);
333 /* Insert a blob descriptor into the blob table.  Grows the table when the
 * blob count exceeds the bucket count (load factor > 1). */
335 blob_table_insert(struct blob_table *table, struct blob_descriptor *blob)
337 blob_table_insert_raw(table, blob);
338 if (table->num_blobs++ > table->mask)
339 enlarge_blob_table(table);
342 /* Unlinks a blob descriptor from the blob table; does not free it.
 * Only hashed blobs may be unlinked (unhashed blobs are never in the table). */
344 blob_table_unlink(struct blob_table *table, struct blob_descriptor *blob)
346 wimlib_assert(!blob->unhashed);
347 wimlib_assert(table->num_blobs != 0);
349 hlist_del(&blob->hash_list);
353 /* Given a SHA-1 message digest, return the corresponding blob descriptor from
354 * the specified blob table, or NULL if there is none. */
355 struct blob_descriptor *
356 lookup_blob(const struct blob_table *table, const u8 *hash)
359 struct blob_descriptor *blob;
/* Bucket index comes from the digest's leading bytes, loaded unaligned;
 * this must match the hash_short indexing in blob_table_insert_raw(). */
361 i = load_size_t_unaligned(hash) & table->mask;
362 hlist_for_each_entry(blob, &table->array[i], hash_list)
363 if (hashes_equal(hash, blob->hash))
368 /* Call a function on all blob descriptors in the specified blob table. Stop
369 * early and return nonzero if any call to the function returns nonzero. */
371 for_blob_in_table(struct blob_table *table,
372 int (*visitor)(struct blob_descriptor *, void *), void *arg)
374 struct blob_descriptor *blob;
375 struct hlist_node *tmp;
/* _safe iteration so the visitor may free or unlink the current blob. */
378 for (size_t i = 0; i <= table->mask; i++) {
379 hlist_for_each_entry_safe(blob, tmp, &table->array[i],
382 ret = visitor(blob, arg);
391 * This is a qsort() callback that sorts blobs into an order optimized for
392 * reading. Sorting is done primarily by blob location, then secondarily by a
393 * location-dependent order. For example, blobs in WIM resources are sorted
394 * such that the underlying WIM files will be read sequentially. This is
395 * especially important for WIM files containing solid resources.
/* qsort() comparator: order blobs primarily by blob_location, secondarily by a
 * location-specific key chosen to make reads sequential (see comment above). */
398 cmp_blobs_by_sequential_order(const void *p1, const void *p2)
400 const struct blob_descriptor *blob1, *blob2;
402 WIMStruct *wim1, *wim2;
404 blob1 = *(const struct blob_descriptor**)p1;
405 blob2 = *(const struct blob_descriptor**)p2;
407 v = (int)blob1->blob_location - (int)blob2->blob_location;
409 /* Different locations? Note: "unsafe compaction mode" requires that
410 * blobs in WIMs sort before all others. For the logic here to ensure
411 * this, BLOB_IN_WIM must have the lowest value among all defined
412 * blob_locations. Statically verify that the enum values haven't
414 STATIC_ASSERT(BLOB_NONEXISTENT == 0 && BLOB_IN_WIM == 1);
418 switch (blob1->blob_location) {
/* BLOB_IN_WIM case (label elided): order by WIM file, then by resource
 * offset, then by offset within a solid resource. */
420 wim1 = blob1->rdesc->wim;
421 wim2 = blob2->rdesc->wim;
423 /* Different WIM files? */
426 /* Resources from the WIM file currently being compacted
427 * (if any) must always sort first. */
428 v = (int)wim2->being_compacted - (int)wim1->being_compacted;
432 /* Different split WIMs? */
433 v = cmp_guids(wim1->hdr.guid, wim2->hdr.guid);
437 /* Different part numbers in the same split WIM? */
438 v = (int)wim1->hdr.part_number - (int)wim2->hdr.part_number;
442 /* Probably two WIMStructs for the same on-disk file.
443 * Just sort by pointer. */
444 return wim1 < wim2 ? -1 : 1;
449 /* Sort by increasing resource offset */
450 if (blob1->rdesc->offset_in_wim != blob2->rdesc->offset_in_wim)
451 return cmp_u64(blob1->rdesc->offset_in_wim,
452 blob2->rdesc->offset_in_wim);
454 /* The blobs are in the same solid resource. Sort by increasing
455 * offset in the resource. */
456 return cmp_u64(blob1->offset_in_res, blob2->offset_in_res);
458 case BLOB_IN_FILE_ON_DISK:
460 case BLOB_IN_STAGING_FILE:
462 /* Compare files by path: just a heuristic that will place files
463 * in the same directory next to each other. */
464 return tstrcmp(blob1->file_on_disk, blob2->file_on_disk);
466 case BLOB_IN_WINDOWS_FILE:
467 return cmp_windows_files(blob1->windows_file, blob2->windows_file);
470 case BLOB_IN_NTFS_VOLUME:
471 return cmp_ntfs_locations(blob1->ntfs_loc, blob2->ntfs_loc);
474 /* No additional sorting order defined for this resource
475 * location (e.g. BLOB_IN_ATTACHED_BUFFER); simply compare
476 * everything equal to each other. */
/* Sort an intrusive list of blob descriptors with @compar.  @list_head_offset
 * is the byte offset of the list_head member inside struct blob_descriptor,
 * so the same routine can sort lists threaded through different members.
 * The list is copied into a temporary pointer array, qsort()ed, then rebuilt.
 * Returns 0, or WIMLIB_ERR_NOMEM if the temporary array can't be allocated. */
482 sort_blob_list(struct list_head *blob_list, size_t list_head_offset,
483 int (*compar)(const void *, const void*))
485 struct list_head *cur;
486 struct blob_descriptor **array;
489 size_t num_blobs = 0;
/* First pass: count the entries to size the array. */
491 list_for_each(cur, blob_list)
497 array_size = num_blobs * sizeof(array[0]);
498 array = MALLOC(array_size);
500 return WIMLIB_ERR_NOMEM;
/* Second pass: convert each node pointer back to its containing
 * blob_descriptor (manual container_of via @list_head_offset). */
502 cur = blob_list->next;
503 for (i = 0; i < num_blobs; i++) {
504 array[i] = (struct blob_descriptor*)((u8*)cur - list_head_offset);
508 qsort(array, num_blobs, sizeof(array[0]), compar);
/* Rebuild the list in sorted order. */
510 INIT_LIST_HEAD(blob_list);
511 for (i = 0; i < num_blobs; i++) {
512 list_add_tail((struct list_head*)
513 ((u8*)array[i] + list_head_offset), blob_list);
519 /* Sort the specified list of blobs in an order optimized for sequential
 * reading; thin wrapper over sort_blob_list(). */
522 sort_blob_list_by_sequential_order(struct list_head *blob_list,
523 size_t list_head_offset)
525 return sort_blob_list(blob_list, list_head_offset,
526 cmp_blobs_by_sequential_order);
/* for_blob_in_table() visitor: append @blob to an array via a moving
 * write cursor (@_pp points to the cursor; the store/advance is elided
 * in this excerpt). */
530 add_blob_to_array(struct blob_descriptor *blob, void *_pp)
532 struct blob_descriptor ***pp = _pp;
537 /* Iterate through the blob descriptors in the specified blob table in an order
538 * optimized for sequential reading. */
540 for_blob_in_table_sorted_by_sequential_order(struct blob_table *table,
541 int (*visitor)(struct blob_descriptor *, void *),
544 struct blob_descriptor **blob_array, **p;
545 size_t num_blobs = table->num_blobs;
548 blob_array = MALLOC(num_blobs * sizeof(blob_array[0]));
550 return WIMLIB_ERR_NOMEM;
/* Collect every blob into the array, then sort it for sequential reads. */
552 for_blob_in_table(table, add_blob_to_array, &p);
554 wimlib_assert(p == blob_array + num_blobs);
556 qsort(blob_array, num_blobs, sizeof(blob_array[0]),
557 cmp_blobs_by_sequential_order);
/* Visit in sorted order; early-exit on nonzero return (handling elided). */
559 for (size_t i = 0; i < num_blobs; i++) {
560 ret = visitor(blob_array[i], arg);
568 /* On-disk format of a blob descriptor in a WIM file.
570 * Note: if the WIM file contains solid resource(s), then this structure is
571 * sometimes overloaded to describe a "resource" rather than a "blob". See the
572 * code for details. */
573 struct blob_descriptor_disk {
575 /* Size, offset, and flags of the blob. */
576 struct wim_reshdr_disk reshdr;
578 /* Which part of the split WIM this blob is in; indexed from 1. */
581 /* Reference count of this blob over all WIM images. (But see comment
582 * above blob_decrement_refcnt().) */
585 /* SHA-1 message digest of the uncompressed data of this blob, or all
586 * zeroes if this blob is of zero length. */
587 u8 hash[SHA1_HASH_SIZE];
/* packed: this struct must match the on-disk layout exactly, byte for
 * byte, with no compiler-inserted padding. */
588 } __attribute__((packed));
590 /* Given a nonempty run of consecutive blob descriptors with the SOLID flag set,
591 * count how many specify resources (as opposed to blobs within those
594 * Returns the resulting count. */
/* Count the resource entries (uncompressed_size == SOLID_RESOURCE_MAGIC_NUMBER)
 * in a run of SOLID-flagged on-disk entries, stopping at the first non-SOLID
 * entry or after @max entries.  (Loop framing is elided in this excerpt.) */
596 count_solid_resources(const struct blob_descriptor_disk *entries, size_t max)
600 struct wim_reshdr reshdr;
602 get_wim_reshdr(&(entries++)->reshdr, &reshdr);
604 if (!(reshdr.flags & WIM_RESHDR_FLAG_SOLID)) {
605 /* Run was terminated by a stand-alone blob entry. */
609 if (reshdr.uncompressed_size == SOLID_RESOURCE_MAGIC_NUMBER) {
610 /* This is a resource entry. */
618 * Given a run of consecutive blob descriptors with the SOLID flag set and
619 * having @num_rdescs resource entries, load resource information from them into
620 * the resource descriptors in the @rdescs array.
622 * Returns 0 on success, or a nonzero error code on failure.
/* For each of the @num_rdescs solid resources in the run, initialize the
 * corresponding wim_resource_descriptor from its on-disk entry, then read
 * the alternate chunk table header from inside the resource to recover the
 * true uncompressed size, compression format, and chunk size. */
625 do_load_solid_info(WIMStruct *wim, struct wim_resource_descriptor **rdescs,
627 const struct blob_descriptor_disk *entries)
629 for (size_t i = 0; i < num_rdescs; i++) {
630 struct wim_reshdr reshdr;
631 struct alt_chunk_table_header_disk hdr;
632 struct wim_resource_descriptor *rdesc;
635 /* Advance to next resource entry. */
638 get_wim_reshdr(&(entries++)->reshdr, &reshdr);
639 } while (reshdr.uncompressed_size != SOLID_RESOURCE_MAGIC_NUMBER);
643 wim_reshdr_to_desc(&reshdr, wim, rdesc);
645 /* For solid resources, the uncompressed size, compression type,
646 * and chunk size are stored in the resource itself, not in the
649 ret = full_pread(&wim->in_fd, &hdr,
650 sizeof(hdr), reshdr.offset_in_wim);
652 ERROR("Failed to read header of solid resource "
653 "(offset_in_wim=%"PRIu64")",
654 reshdr.offset_in_wim);
658 rdesc->uncompressed_size = le64_to_cpu(hdr.res_usize);
660 /* Compression format numbers must be the same as in
661 * WIMGAPI to be compatible here. */
662 STATIC_ASSERT(WIMLIB_COMPRESSION_TYPE_NONE == 0);
663 STATIC_ASSERT(WIMLIB_COMPRESSION_TYPE_XPRESS == 1);
664 STATIC_ASSERT(WIMLIB_COMPRESSION_TYPE_LZX == 2);
665 STATIC_ASSERT(WIMLIB_COMPRESSION_TYPE_LZMS == 3);
666 rdesc->compression_type = le32_to_cpu(hdr.compression_format);
667 rdesc->chunk_size = le32_to_cpu(hdr.chunk_size);
673 * Given a nonempty run of consecutive blob descriptors with the SOLID flag set,
674 * allocate a 'struct wim_resource_descriptor' for each resource within that
677 * Returns 0 on success, or a nonzero error code on failure.
678 * Returns the pointers and count in *rdescs_ret and *num_rdescs_ret.
/* Allocate one wim_resource_descriptor per solid resource in the run starting
 * at @entries, populate them via do_load_solid_info(), and take one WIMStruct
 * reference per resource.  Outputs the array and count via the _ret params.
 * Returns 0 or a nonzero error code; on failure all allocations are freed. */
681 load_solid_info(WIMStruct *wim,
682 const struct blob_descriptor_disk *entries,
683 size_t num_remaining_entries,
684 struct wim_resource_descriptor ***rdescs_ret,
685 size_t *num_rdescs_ret)
688 struct wim_resource_descriptor **rdescs;
692 num_rdescs = count_solid_resources(entries, num_remaining_entries);
693 rdescs = CALLOC(num_rdescs, sizeof(rdescs[0]));
695 return WIMLIB_ERR_NOMEM;
697 for (i = 0; i < num_rdescs; i++) {
698 rdescs[i] = MALLOC(sizeof(struct wim_resource_descriptor));
700 ret = WIMLIB_ERR_NOMEM;
701 goto out_free_rdescs;
705 ret = do_load_solid_info(wim, rdescs, num_rdescs, entries);
707 goto out_free_rdescs;
/* Each resource descriptor holds a reference on the WIMStruct. */
709 wim->refcnt += num_rdescs;
711 *rdescs_ret = rdescs;
712 *num_rdescs_ret = num_rdescs;
/* out_free_rdescs (label elided): free each descriptor and the array. */
716 for (i = 0; i < num_rdescs; i++)
722 /* Given a 'struct blob_descriptor' allocated for an on-disk blob descriptor
723 * with the SOLID flag set, try to assign it to resource in the current solid
/* The on-disk offset of a SOLID blob entry is relative to the concatenation of
 * all solid resources in the run, so walk the resources in order, subtracting
 * each resource's uncompressed size until the blob fits inside one; then bind
 * the blob to that resource.  Fails if no resource can contain the blob. */
726 assign_blob_to_solid_resource(const struct wim_reshdr *reshdr,
727 struct blob_descriptor *blob,
728 struct wim_resource_descriptor **rdescs,
731 u64 offset = reshdr->offset_in_wim;
733 /* XXX: This linear search will be slow in the degenerate case where the
734 * number of solid resources in the run is huge. */
/* For SOLID blob entries, size_in_wim holds the blob's uncompressed size. */
735 blob->size = reshdr->size_in_wim;
736 for (size_t i = 0; i < num_rdescs; i++) {
737 if (offset + blob->size <= rdescs[i]->uncompressed_size) {
738 blob_set_is_located_in_wim_resource(blob, rdescs[i], offset);
741 offset -= rdescs[i]->uncompressed_size;
743 ERROR("blob could not be assigned to a solid resource");
744 return WIMLIB_ERR_INVALID_LOOKUP_TABLE_ENTRY;
/* Free solid resource descriptors that ended up with no blobs assigned to
 * them, dropping the WIMStruct reference taken in load_solid_info(); then
 * free the array (freeing of each rdesc/array is elided in this excerpt).
 * Descriptors with assigned blobs stay owned by those blobs. */
748 free_solid_rdescs(struct wim_resource_descriptor **rdescs, size_t num_rdescs)
751 for (size_t i = 0; i < num_rdescs; i++) {
752 if (list_empty(&rdescs[i]->blob_list)) {
753 rdescs[i]->wim->refcnt--;
/* qsort() comparator: order blobs by increasing offset within their (shared)
 * solid resource.  Used by validate_resource(). */
762 cmp_blobs_by_offset_in_res(const void *p1, const void *p2)
764 const struct blob_descriptor *blob1, *blob2;
766 blob1 = *(const struct blob_descriptor**)p1;
767 blob2 = *(const struct blob_descriptor**)p2;
769 return cmp_u64(blob1->offset_in_res, blob2->offset_in_res);
772 /* Validate the size and location of a WIM resource.  Checks for arithmetic
 * overflow in the resource's own extent, then verifies each contained blob
 * lies within the resource and that no two blobs overlap (sorting them by
 * offset first if they weren't already in increasing order). */
774 validate_resource(struct wim_resource_descriptor *rdesc)
776 struct blob_descriptor *blob;
778 u64 expected_next_offset;
781 /* Verify that the resource itself has a valid offset and size. */
/* Unsigned overflow check: a + b < b iff the addition wrapped. */
782 if (rdesc->offset_in_wim + rdesc->size_in_wim < rdesc->size_in_wim)
783 goto invalid_due_to_overflow;
785 /* Verify that each blob in the resource has a valid offset and size.
787 expected_next_offset = 0;
788 out_of_order = false;
789 list_for_each_entry(blob, &rdesc->blob_list, rdesc_node) {
790 if (blob->offset_in_res + blob->size < blob->size ||
791 blob->offset_in_res + blob->size > rdesc->uncompressed_size)
792 goto invalid_due_to_overflow;
794 if (blob->offset_in_res >= expected_next_offset)
795 expected_next_offset = blob->offset_in_res + blob->size;
/* else: mark out_of_order (line elided in this excerpt). */
800 /* If the blobs were not located at strictly increasing positions (not
801 * allowing for overlap), sort them. Then make sure that none overlap.
804 ret = sort_blob_list(&rdesc->blob_list,
805 offsetof(struct blob_descriptor,
807 cmp_blobs_by_offset_in_res);
/* Re-scan the now-sorted list; any blob starting before the previous
 * blob's end is a genuine overlap. */
811 expected_next_offset = 0;
812 list_for_each_entry(blob, &rdesc->blob_list, rdesc_node) {
813 if (blob->offset_in_res >= expected_next_offset)
814 expected_next_offset = blob->offset_in_res + blob->size;
816 goto invalid_due_to_overlap;
822 invalid_due_to_overflow:
823 ERROR("Invalid blob table (offset overflow)");
824 return WIMLIB_ERR_INVALID_LOOKUP_TABLE_ENTRY;
826 invalid_due_to_overlap:
827 ERROR("Invalid blob table (blobs in solid resource overlap)");
828 return WIMLIB_ERR_INVALID_LOOKUP_TABLE_ENTRY;
/* Called when a solid run ends: validate every resource in the run, then
 * release any descriptors that ended up unused. */
832 finish_solid_rdescs(struct wim_resource_descriptor **rdescs, size_t num_rdescs)
835 for (size_t i = 0; i < num_rdescs; i++) {
836 ret = validate_resource(rdescs[i]);
840 free_solid_rdescs(rdescs, num_rdescs);
845 * read_blob_table() -
847 * Read the blob table from a WIM file. Usually, each entry in this table
848 * describes a "blob", or equivalently a "resource", that the WIM file contains,
849 * along with its location and SHA-1 message digest. Descriptors for
850 * non-metadata blobs will be saved in the in-memory blob table
851 * (wim->blob_table), whereas descriptors for metadata blobs will be saved in a
852 * special location per-image (the wim->image_metadata array).
854 * However, in WIM_VERSION_SOLID (3584) WIMs, a resource may contain multiple
855 * blobs that are compressed together. Such a resource is called a "solid
856 * resource". Solid resources are still described in the on-disk "blob table",
857 * although the format is not the most logical. A consecutive sequence of
858 * entries that all have flag WIM_RESHDR_FLAG_SOLID (0x10) set is a "solid run".
859 * A solid run describes a set of solid resources, each of which contains a set
860 * of blobs. In a solid run, a 'struct wim_reshdr_disk' with 'uncompressed_size
861 * = SOLID_RESOURCE_MAGIC_NUMBER (0x100000000)' specifies a solid resource,
862 * whereas any other 'struct wim_reshdr_disk' specifies a blob within a solid
863 * resource. There are some oddities in how we need to determine which solid
864 * resource a blob is actually in; see the code for details.
866 * Possible return values:
867 * WIMLIB_ERR_SUCCESS (0)
868 * WIMLIB_ERR_INVALID_LOOKUP_TABLE_ENTRY
871 * Or an error code caused by failure to read the blob table from the WIM
/* Parse the on-disk blob table into wim->blob_table and the per-image
 * metadata array.  See the long comment above for the format, especially
 * the handling of solid runs.  NOTE(review): several lines (declarations,
 * gotos, loop closers) are elided from this excerpt; comments below describe
 * only what is visible. */
875 read_blob_table(WIMStruct *wim)
880 struct blob_table *table = NULL;
881 struct blob_descriptor *cur_blob = NULL;
882 size_t num_duplicate_blobs = 0;
883 size_t num_empty_blobs = 0;
884 size_t num_wrong_part_blobs = 0;
/* Non-NULL while we are inside a solid run. */
886 struct wim_resource_descriptor **cur_solid_rdescs = NULL;
887 size_t cur_num_solid_rdescs = 0;
889 /* Calculate the number of entries in the blob table. */
890 num_entries = wim->hdr.blob_table_reshdr.uncompressed_size /
891 sizeof(struct blob_descriptor_disk);
893 /* Read the blob table into a buffer. */
894 ret = wim_reshdr_to_data(&wim->hdr.blob_table_reshdr, wim, &buf);
898 /* Allocate a hash table to map SHA-1 message digests into blob
899 * descriptors. This is the in-memory "blob table". */
900 table = new_blob_table(num_entries);
904 /* Allocate and initialize blob descriptors from the raw blob table
906 for (size_t i = 0; i < num_entries; i++) {
907 const struct blob_descriptor_disk *disk_entry =
908 &((const struct blob_descriptor_disk*)buf)[i];
909 struct wim_reshdr reshdr;
912 /* Get the resource header */
913 get_wim_reshdr(&disk_entry->reshdr, &reshdr);
915 /* Ignore SOLID flag if it isn't supposed to be used in this WIM
917 if (wim->hdr.wim_version == WIM_VERSION_DEFAULT)
918 reshdr.flags &= ~WIM_RESHDR_FLAG_SOLID;
920 /* Allocate a new 'struct blob_descriptor'. */
921 cur_blob = new_blob_descriptor();
925 /* Get the part number, reference count, and hash. */
926 part_number = le16_to_cpu(disk_entry->part_number);
927 cur_blob->refcnt = le32_to_cpu(disk_entry->refcnt);
928 copy_hash(cur_blob->hash, disk_entry->hash);
930 if (reshdr.flags & WIM_RESHDR_FLAG_SOLID) {
934 if (!cur_solid_rdescs) {
935 /* Starting new run */
936 ret = load_solid_info(wim, disk_entry,
939 &cur_num_solid_rdescs);
944 if (reshdr.uncompressed_size == SOLID_RESOURCE_MAGIC_NUMBER) {
945 /* Resource entry, not blob entry */
946 goto free_cur_blob_and_continue;
951 ret = assign_blob_to_solid_resource(&reshdr,
954 cur_num_solid_rdescs);
959 /* Normal blob/resource entry; SOLID not set. */
961 struct wim_resource_descriptor *rdesc;
963 if (unlikely(cur_solid_rdescs)) {
964 /* This entry terminated a solid run. */
965 ret = finish_solid_rdescs(cur_solid_rdescs,
966 cur_num_solid_rdescs);
967 cur_solid_rdescs = NULL;
/* Sanity check: an uncompressed resource must have equal on-disk
 * and uncompressed sizes. */
972 if (unlikely(!(reshdr.flags & WIM_RESHDR_FLAG_COMPRESSED) &&
973 (reshdr.size_in_wim != reshdr.uncompressed_size)))
975 ERROR("Uncompressed resource has "
976 "size_in_wim != uncompressed_size");
977 ret = WIMLIB_ERR_INVALID_LOOKUP_TABLE_ENTRY;
981 /* Set up a resource descriptor for this blob. */
983 rdesc = MALLOC(sizeof(struct wim_resource_descriptor));
987 wim_reshdr_to_desc_and_blob(&reshdr, wim, rdesc, cur_blob);
991 /* cur_blob is now a blob bound to a resource. */
993 /* Ignore entries with all zeroes in the hash field. */
994 if (unlikely(is_zero_hash(cur_blob->hash)))
995 goto free_cur_blob_and_continue;
997 /* Verify that the blob has nonzero size. */
998 if (unlikely(cur_blob->size == 0)) {
1000 goto free_cur_blob_and_continue;
1003 /* Verify that the part number matches that of the underlying
1005 if (unlikely(part_number != wim->hdr.part_number)) {
1006 num_wrong_part_blobs++;
1007 goto free_cur_blob_and_continue;
1010 if (reshdr.flags & WIM_RESHDR_FLAG_METADATA) {
1011 /* Blob table entry for a metadata resource. */
1013 /* Metadata entries with no references must be ignored.
1014 * See, for example, the WinPE WIMs from the WAIK v2.1.
1016 if (cur_blob->refcnt == 0)
1017 goto free_cur_blob_and_continue;
1019 if (cur_blob->refcnt != 1) {
1020 /* We don't currently support this case due to
1021 * the complications of multiple images sharing
1022 * the same metadata resource or a metadata
1023 * resource also being referenced by files. */
1024 ERROR("Found metadata resource with refcnt != 1");
1025 ret = WIMLIB_ERR_INVALID_LOOKUP_TABLE_ENTRY;
1029 if (reshdr.flags & WIM_RESHDR_FLAG_SOLID) {
1030 ERROR("Image metadata in solid resources "
1032 ret = WIMLIB_ERR_INVALID_LOOKUP_TABLE_ENTRY;
1036 if (wim->hdr.part_number != 1) {
1037 WARNING("Ignoring metadata resource found in a "
1038 "non-first part of the split WIM");
1039 goto free_cur_blob_and_continue;
1042 /* The number of entries in the blob table with
1043 * WIM_RESHDR_FLAG_METADATA set should be the same as
1044 * the image_count field in the WIM header. */
1045 if (image_index == wim->hdr.image_count) {
1046 WARNING("Found more metadata resources than images");
1047 goto free_cur_blob_and_continue;
1050 /* Notice very carefully: We are assigning the metadata
1051 * resources to images in the same order in which their
1052 * blob table entries occur on disk. (This is also the
1053 * behavior of Microsoft's software.) In particular,
1054 * this overrides the actual locations of the metadata
1055 * resources themselves in the WIM file as well as any
1056 * information written in the XML data. */
1057 wim->image_metadata[image_index] = new_unloaded_image_metadata(cur_blob);
1058 if (!wim->image_metadata[image_index])
1062 /* Blob table entry for a non-metadata blob. */
1064 /* Ignore this blob if it's a duplicate. */
1065 if (lookup_blob(table, cur_blob->hash)) {
1066 num_duplicate_blobs++;
1067 goto free_cur_blob_and_continue;
1070 /* Insert the blob into the in-memory blob table, keyed
1071 * by its SHA-1 message digest. */
1072 blob_table_insert(table, cur_blob);
1077 free_cur_blob_and_continue:
/* A blob bound to a solid resource must be detached from it before
 * being freed, so the rdesc's blob_list stays consistent. */
1078 if (cur_solid_rdescs &&
1079 cur_blob->blob_location == BLOB_IN_WIM)
1080 blob_unset_is_located_in_wim_resource(cur_blob);
1081 free_blob_descriptor(cur_blob);
1085 if (cur_solid_rdescs) {
1086 /* End of blob table terminated a solid run. */
1087 ret = finish_solid_rdescs(cur_solid_rdescs, cur_num_solid_rdescs);
1088 cur_solid_rdescs = NULL;
/* Tolerate WIMs whose header overstates the image count. */
1093 if (wim->hdr.part_number == 1 && image_index != wim->hdr.image_count) {
1094 WARNING("Could not find metadata resources for all images");
1095 wim->hdr.image_count = image_index;
1098 if (num_duplicate_blobs > 0)
1099 WARNING("Ignoring %zu duplicate blobs", num_duplicate_blobs);
1101 if (num_empty_blobs > 0)
1102 WARNING("Ignoring %zu empty blobs", num_empty_blobs);
1104 if (num_wrong_part_blobs > 0) {
1105 WARNING("Ignoring %zu blobs with wrong part number",
1106 num_wrong_part_blobs);
1109 wim->blob_table = table;
/* Out-of-memory / error cleanup labels (partially elided). */
1114 ERROR("Not enough memory to read blob table!");
1115 ret = WIMLIB_ERR_NOMEM;
1117 free_solid_rdescs(cur_solid_rdescs, cur_num_solid_rdescs);
1118 free_blob_descriptor(cur_blob);
1119 free_blob_table(table);
/* Serialize one blob table entry to its on-disk (little-endian, packed)
 * representation in @disk_entry. */
1126 write_blob_descriptor(struct blob_descriptor_disk *disk_entry,
1127 const struct wim_reshdr *out_reshdr,
1128 u16 part_number, u32 refcnt, const u8 *hash)
1130 put_wim_reshdr(out_reshdr, &disk_entry->reshdr);
1131 disk_entry->part_number = cpu_to_le16(part_number);
1132 disk_entry->refcnt = cpu_to_le32(refcnt);
1133 copy_hash(disk_entry->hash, hash);
1136 /* Note: the list of blob descriptors must be sorted so that all entries for the
1137 * same solid resource are consecutive. In addition, blob descriptors for
1138 * metadata resources must be in the same order as the indices of the underlying
/* Serialize the blob table for @blob_list and write it to @out_fd as an
 * uncompressed WIM resource, filling in @out_reshdr.  Each new solid resource
 * in the (pre-sorted) list gets an extra "resource entry" with the
 * SOLID_RESOURCE_MAGIC_NUMBER sentinel, and blob offsets are rebased onto a
 * single logical solid resource for WIMGAPI compatibility. */
1141 write_blob_table_from_blob_list(struct list_head *blob_list,
1142 struct filedes *out_fd,
1144 struct wim_reshdr *out_reshdr,
1145 int write_resource_flags)
1148 struct blob_descriptor *blob;
1149 struct blob_descriptor_disk *table_buf;
1150 struct blob_descriptor_disk *table_buf_ptr;
1152 u64 prev_res_offset_in_wim = ~0ULL;
1153 u64 prev_uncompressed_size;
/* First pass: size the buffer — one entry per blob, plus one extra
 * resource entry each time a new solid resource begins. */
1157 list_for_each_entry(blob, blob_list, blob_table_list) {
1158 table_size += sizeof(struct blob_descriptor_disk);
1160 if (blob->out_reshdr.flags & WIM_RESHDR_FLAG_SOLID &&
1161 blob->out_res_offset_in_wim != prev_res_offset_in_wim)
1163 table_size += sizeof(struct blob_descriptor_disk);
1164 prev_res_offset_in_wim = blob->out_res_offset_in_wim;
1168 table_buf = MALLOC(table_size);
1169 if (table_buf == NULL) {
1170 ERROR("Failed to allocate %zu bytes for temporary blob table",
1172 return WIMLIB_ERR_NOMEM;
1174 table_buf_ptr = table_buf;
1176 prev_res_offset_in_wim = ~0ULL;
1177 prev_uncompressed_size = 0;
/* Second pass: emit the entries. */
1179 list_for_each_entry(blob, blob_list, blob_table_list) {
1180 if (blob->out_reshdr.flags & WIM_RESHDR_FLAG_SOLID) {
1181 struct wim_reshdr tmp_reshdr;
1183 /* Eww. When WIMGAPI sees multiple solid resources, it
1184 * expects the offsets to be adjusted as if there were
1185 * really only one solid resource. */
1187 if (blob->out_res_offset_in_wim != prev_res_offset_in_wim) {
1188 /* Put the resource entry for solid resource */
1189 tmp_reshdr.offset_in_wim = blob->out_res_offset_in_wim;
1190 tmp_reshdr.size_in_wim = blob->out_res_size_in_wim;
1191 tmp_reshdr.uncompressed_size = SOLID_RESOURCE_MAGIC_NUMBER;
1192 tmp_reshdr.flags = WIM_RESHDR_FLAG_SOLID;
/* Resource sentinel entries use refcnt 1 and an all-zero hash. */
1194 write_blob_descriptor(table_buf_ptr++, &tmp_reshdr,
1195 part_number, 1, zero_hash);
1197 logical_offset += prev_uncompressed_size;
1199 prev_res_offset_in_wim = blob->out_res_offset_in_wim;
1200 prev_uncompressed_size = blob->out_res_uncompressed_size;
/* Rebase the blob's offset onto the single logical resource. */
1202 tmp_reshdr = blob->out_reshdr;
1203 tmp_reshdr.offset_in_wim += logical_offset;
1204 write_blob_descriptor(table_buf_ptr++, &tmp_reshdr,
1205 part_number, blob->out_refcnt, blob->hash);
1207 write_blob_descriptor(table_buf_ptr++, &blob->out_reshdr,
1208 part_number, blob->out_refcnt, blob->hash);
/* The emitted bytes must exactly match the first-pass size estimate. */
1212 wimlib_assert((u8*)table_buf_ptr - (u8*)table_buf == table_size);
1214 /* Write the blob table uncompressed. Although wimlib can handle a
1215 * compressed blob table, MS software cannot. */
1216 ret = write_wim_resource_from_buffer(table_buf,
1220 WIMLIB_COMPRESSION_TYPE_NONE,
1224 write_resource_flags);
1229 /* Allocate a blob descriptor for the contents of the buffer, or re-use an
1230 * existing descriptor in @blob_table for an identical blob. */
1231 struct blob_descriptor *
1232 new_blob_from_data_buffer(const void *buffer, size_t size,
1233 struct blob_table *blob_table)
1235 u8 hash[SHA1_HASH_SIZE];
1236 struct blob_descriptor *blob;
1239 sha1(buffer, size, hash);
/* De-duplicate by content: if an identical blob is already in the table,
 * return it (refcnt handling elided in this excerpt). */
1241 blob = lookup_blob(blob_table, hash);
1245 blob = new_blob_descriptor();
/* The blob takes ownership of a private copy of the caller's buffer. */
1249 buffer_copy = memdup(buffer, size);
1251 free_blob_descriptor(blob);
1254 blob_set_is_located_in_attached_buffer(blob, buffer_copy, size);
1255 copy_hash(blob->hash, hash);
1256 blob_table_insert(blob_table, blob);
/*
 * Move a just-hashed blob descriptor into the blob table, or merge it with
 * an existing duplicate.  Returns the descriptor callers should use from now
 * on: the pre-existing duplicate when one is found, otherwise (per the
 * comment below) @blob itself after insertion.
 * NOTE(review): closing braces and the final return of @blob are not visible
 * in this extract.
 */
1260 struct blob_descriptor *
1261 after_blob_hashed(struct blob_descriptor *blob,
1262 struct blob_descriptor **back_ptr,
1263 struct blob_table *blob_table, struct wim_inode *inode)
1265 struct blob_descriptor *duplicate_blob;
/* The blob now has a valid hash, so it no longer belongs on the unhashed
 * list. */
1267 list_del(&blob->unhashed_list);
1270 /* Look for a duplicate blob */
1271 duplicate_blob = lookup_blob(blob_table, blob->hash);
1272 if (duplicate_blob) {
1273 /* We have a duplicate blob. Transfer the reference counts from
1274 * this blob to the duplicate and update the reference to this
1275 * blob (from a stream) to point to the duplicate. The caller
1276 * is responsible for freeing @blob if needed. */
/* Equal SHA-1 digests but unequal sizes indicate a hash collision (or an
 * inconsistent read); warn loudly, since the data will be wrong. */
1277 if (duplicate_blob->size != blob->size) {
1278 tchar hash_str[SHA1_HASH_STRING_LEN];
1280 sprint_hash(blob->hash, hash_str);
1281 WARNING("SHA-1 collision at \"%"TS"\"\n"
1282 " (hash=%"TS", size=%"PRIu64", other_size=%"PRIu64").\n"
1283 " File will be corrupted!",
1284 inode_any_full_path(inode), hash_str,
1285 blob->size, duplicate_blob->size);
/* Fold this blob's references into the duplicate, then repoint the
 * caller's back-reference at the duplicate. */
1287 duplicate_blob->refcnt += blob->refcnt;
1289 *back_ptr = duplicate_blob;
1290 return duplicate_blob;
1292 /* No duplicate blob, so we need to insert this blob into the
1293 * blob table and treat it as a hashed blob. */
1294 blob_table_insert(blob_table, blob);
1300 * Calculate the SHA-1 message digest of a blob and move its descriptor from the
1301 * list of unhashed blobs to the blob table, possibly joining it with an
/* NOTE(review): parts of this documentation comment and the function body
 * (e.g. the error-return path checking @ret after sha1_blob()) are missing
 * from this extract. */
1307 * The blob table in which the blob needs to be indexed
1309 * On success, a pointer to the resulting blob descriptor is written to
1310 * this location. This will be the same as @blob if it was inserted into
1311 * the blob table, or different if a duplicate blob was found.
1313 * Returns 0 on success; nonzero if there is an error reading the blob data.
1316 hash_unhashed_blob(struct blob_descriptor *blob, struct blob_table *blob_table,
1317 struct blob_descriptor **blob_ret)
1319 struct blob_descriptor **back_ptr;
1320 struct wim_inode *inode;
/* Capture the back-pointer and the owning inode before hashing;
 * after_blob_hashed() needs both. */
1323 back_ptr = retrieve_pointer_to_unhashed_blob(blob);
1324 inode = blob->back_inode;
/* Read the blob's data and compute its SHA-1 message digest. */
1326 ret = sha1_blob(blob);
/* Index the now-hashed blob (or the duplicate it merged into) and report
 * the resulting descriptor to the caller. */
1330 *blob_ret = after_blob_hashed(blob, back_ptr, blob_table, inode);
/*
 * Translate an internal blob descriptor into the public
 * struct wimlib_resource_entry exposed through the library API.
 * NOTE(review): the return-type line, braces, and the "else" between the
 * solid and non-solid cases are not visible in this extract.
 */
1335 blob_to_wimlib_resource_entry(const struct blob_descriptor *blob,
1336 struct wimlib_resource_entry *wentry)
/* Start from a fully zeroed entry so any field not set below reads as
 * 0/false. */
1338 memset(wentry, 0, sizeof(*wentry));
1340 wentry->uncompressed_size = blob->size;
/* Only blobs stored inside a WIM file have on-disk resource information
 * to report. */
1341 if (blob->blob_location == BLOB_IN_WIM) {
1342 unsigned res_flags = blob->rdesc->flags;
1344 wentry->part_number = blob->rdesc->wim->hdr.part_number;
1345 if (res_flags & WIM_RESHDR_FLAG_SOLID) {
/* Solid resource: the blob lives at an offset within a shared
 * resource. */
1346 wentry->offset = blob->offset_in_res;
/* Non-solid case: the blob is its own resource, so the resource
 * descriptor's size/offset apply to the blob directly. */
1348 wentry->compressed_size = blob->rdesc->size_in_wim;
1349 wentry->offset = blob->rdesc->offset_in_wim;
/* Geometry of the raw (containing) resource in the WIM file. */
1351 wentry->raw_resource_offset_in_wim = blob->rdesc->offset_in_wim;
1352 wentry->raw_resource_compressed_size = blob->rdesc->size_in_wim;
1353 wentry->raw_resource_uncompressed_size = blob->rdesc->uncompressed_size;
/* Decompose the resource header flags into individual booleans for the
 * public entry. */
1355 wentry->is_compressed = (res_flags & WIM_RESHDR_FLAG_COMPRESSED) != 0;
1356 wentry->is_free = (res_flags & WIM_RESHDR_FLAG_FREE) != 0;
1357 wentry->is_spanned = (res_flags & WIM_RESHDR_FLAG_SPANNED) != 0;
1358 wentry->packed = (res_flags & WIM_RESHDR_FLAG_SOLID) != 0;
/* An unhashed blob has no valid SHA-1 yet; leave the hash zeroed from the
 * memset above. */
1360 if (!blob->unhashed)
1361 copy_hash(wentry->sha1_hash, blob->hash);
1362 wentry->reference_count = blob->refcnt;
1363 wentry->is_metadata = blob->is_metadata;
/* Context threaded through blob iteration to the user-supplied callback.
 * NOTE(review): the user_ctx member is not visible in this extract but is
 * referenced below — presumably declared here; confirm against the full
 * file. */
1366 struct iterate_blob_context {
1367 wimlib_iterate_lookup_table_callback_t cb;
/* Adapter: convert one blob descriptor into a public resource entry and
 * forward it to the user's callback. */
1372 do_iterate_blob(struct blob_descriptor *blob, void *_ctx)
1374 struct iterate_blob_context *ctx = _ctx;
1375 struct wimlib_resource_entry entry;
1377 blob_to_wimlib_resource_entry(blob, &entry);
/* Propagate the callback's return value; presumably a nonzero value aborts
 * the iteration — confirm in for_blob_in_table(). */
1378 return (*ctx->cb)(&entry, ctx->user_ctx);
1381 /* API function documented in wimlib.h */
/* NOTE(review): several lines of this function are not visible in this
 * extract — the parameter validation preceding the INVALID_PARAM return,
 * the .cb initializer of @ctx, and the error checks following each
 * do_iterate_blob() call. */
1383 wimlib_iterate_lookup_table(WIMStruct *wim, int flags,
1384 wimlib_iterate_lookup_table_callback_t cb,
1388 return WIMLIB_ERR_INVALID_PARAM;
/* Bundle the user's callback and context for do_iterate_blob(). */
1390 struct iterate_blob_context ctx = {
1392 .user_ctx = user_ctx,
/* When image metadata is loaded, also visit each image's metadata blob and
 * its still-unhashed blobs, which are tracked outside the main blob
 * table. */
1394 if (wim_has_metadata(wim)) {
1396 for (int i = 0; i < wim->hdr.image_count; i++) {
1397 struct blob_descriptor *blob;
1398 struct wim_image_metadata *imd = wim->image_metadata[i];
1400 ret = do_iterate_blob(imd->metadata_blob, &ctx);
1403 image_for_each_unhashed_blob(blob, imd) {
1404 ret = do_iterate_blob(blob, &ctx);
/* Finally, visit every hashed blob indexed in the WIM's blob table. */
1410 return for_blob_in_table(wim->blob_table, do_iterate_blob, &ctx);