4 * Read uncompressed and compressed metadata and file resources from a WIM file.
8 * Copyright (C) 2012, 2013 Eric Biggers
10 * This file is part of wimlib, a library for working with WIM files.
12 * wimlib is free software; you can redistribute it and/or modify it under the
13 * terms of the GNU General Public License as published by the Free Software
14 * Foundation; either version 3 of the License, or (at your option) any later
17 * wimlib is distributed in the hope that it will be useful, but WITHOUT ANY
18 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
19 * A PARTICULAR PURPOSE. See the GNU General Public License for more details.
21 * You should have received a copy of the GNU General Public License along with
22 * wimlib; if not, see http://www.gnu.org/licenses/.
25 #include "wimlib_internal.h"
27 #include "lookup_table.h"
28 #include "buffer_io.h"
46 * Reads all or part of a compressed WIM resource.
48 * Returns zero on success, nonzero on failure.
/*
 * read_compressed_resource(): read the uncompressed byte range starting at
 * `offset` and spanning `len` bytes out of a chunk-compressed WIM resource.
 *
 * Visible steps:
 *   1. Pick the LZX or XPRESS decompressor based on resource_ctype.
 *   2. Work out which WIM_CHUNK_SIZE-sized chunks cover the requested range.
 *   3. Read only the needed chunk-table entries with full_pread() and widen
 *      them (from 4- or 8-byte little-endian values) into chunk_offsets[].
 *   4. For every needed chunk, either read it as stored data (compressed
 *      size equals uncompressed size) or read and decompress it, then pass
 *      the wanted part to cb() or advance out_p through the output buffer.
 *
 * Error codes visible below: WIMLIB_ERR_NOMEM, WIMLIB_ERR_DECOMPRESSION and
 * WIMLIB_ERR_READ.
 *
 * NOTE(review): this listing is missing lines (part of the parameter list,
 * braces, else branches, goto/cleanup statements).  The names offset, len,
 * resource_offset, resource_ctype, ctx_or_buf, out_p, start_offset,
 * end_offset and ret are used below without a visible declaration;
 * presumably they are parameters or locals declared on the missing lines.
 */
51 read_compressed_resource(int in_fd,
52 u64 resource_compressed_size,
53 u64 resource_uncompressed_size,
58 consume_data_callback_t cb,
67 int (*decompress)(const void *, unsigned, void *, unsigned);
68 /* Set the appropriate decompress function. */
69 if (resource_ctype == WIMLIB_COMPRESSION_TYPE_LZX)
70 decompress = wimlib_lzx_decompress;
72 decompress = wimlib_xpress_decompress;
74 /* The structure of a compressed resource consists of a table of chunk
75 * offsets followed by the chunks themselves. Each chunk consists of
76 * compressed data, and there is one chunk for each WIM_CHUNK_SIZE =
77 * 32768 bytes of the uncompressed file, with the last chunk having any
80 * The chunk offsets are measured relative to the end of the chunk
81 * table. The first chunk is omitted from the table in the WIM file
82 * because its offset is implicitly given by the fact that it directly
83 * follows the chunk table and therefore must have an offset of 0.
86 /* Calculate how many chunks the resource consists of in its entirety.
88 u64 num_chunks = (resource_uncompressed_size + WIM_CHUNK_SIZE - 1) /
90 /* As mentioned, the first chunk has no entry in the chunk table. */
91 u64 num_chunk_entries = num_chunks - 1;
94 /* The index of the chunk that the read starts at. */
95 u64 start_chunk = offset / WIM_CHUNK_SIZE;
96 /* The byte offset at which the read starts, within the start chunk. */
97 u64 start_chunk_offset = offset % WIM_CHUNK_SIZE;
/* NOTE(review): with len == 0 the expression offset + len - 1 names the byte
 * before offset, and wraps around when offset is 0 as well.  No guard for a
 * zero-length read is visible in this listing - confirm one exists. */
99 /* The index of the chunk that contains the last byte of the read. */
100 u64 end_chunk = (offset + len - 1) / WIM_CHUNK_SIZE;
101 /* The byte offset of the last byte of the read, within the end chunk */
102 u64 end_chunk_offset = (offset + len - 1) % WIM_CHUNK_SIZE;
104 /* Number of chunks that are actually needed to read the requested part
106 u64 num_needed_chunks = end_chunk - start_chunk + 1;
108 /* If the end chunk is not the last chunk, an extra chunk entry is
109 * needed because we need to know the offset of the chunk after the last
110 * chunk read to figure out the size of the last read chunk. */
111 if (end_chunk != num_chunks - 1)
114 /* Allocate the chunk table. It will only contain offsets for the
115 * chunks that are actually needed for this read. */
/* Tables with fewer than 1000 entries are placed on the stack with alloca();
 * larger ones come from malloc().  chunk_offsets_malloced records which case
 * applies; it is tested again near the end of the function, presumably so
 * that only the heap allocation is freed. */
117 bool chunk_offsets_malloced;
118 if (num_needed_chunks < 1000) {
119 chunk_offsets = alloca(num_needed_chunks * sizeof(u64));
120 chunk_offsets_malloced = false;
122 chunk_offsets = malloc(num_needed_chunks * sizeof(u64));
123 if (!chunk_offsets) {
124 ERROR("Failed to allocate chunk table "
125 "with %"PRIu64" entries", num_needed_chunks);
126 return WIMLIB_ERR_NOMEM;
128 chunk_offsets_malloced = true;
131 /* Set the implicit offset of the first chunk if it is included in the
134 * Note: M$'s documentation includes a picture that shows the first
135 * chunk starting right after the chunk entry table, labeled as offset
136 * 0x10. However, in the actual file format, the offset is measured
137 * from the end of the chunk entry table, so the first chunk has an
139 if (start_chunk == 0)
140 chunk_offsets[0] = 0;
142 /* According to M$'s documentation, if the uncompressed size of
143 * the file is greater than 4 GB, the chunk entries are 8-byte
144 * integers. Otherwise, they are 4-byte integers. */
145 u64 chunk_entry_size = (resource_uncompressed_size >= (u64)1 << 32) ?
148 /* Size of the full chunk table in the WIM file. */
149 u64 chunk_table_size = chunk_entry_size * num_chunk_entries;
151 /* Read the needed chunk offsets from the table in the WIM file. */
153 /* Index, in the WIM file, of the first needed entry in the
155 u64 start_table_idx = (start_chunk == 0) ? 0 : start_chunk - 1;
157 /* Number of entries we need to actually read from the chunk
158 * table (excludes the implicit first chunk). */
159 u64 num_needed_chunk_entries = (start_chunk == 0) ?
160 num_needed_chunks - 1 : num_needed_chunks;
162 /* Skip over unneeded chunk table entries. */
163 u64 file_offset_of_needed_chunk_entries = resource_offset +
164 start_table_idx * chunk_entry_size;
/* NOTE(review): both operands of the product below are u64 but the result is
 * stored in a size_t; if size_t is narrower than u64 the value could be
 * truncated for very large reads. */
166 /* Number of bytes we need to read from the chunk table. */
167 size_t size = num_needed_chunk_entries * chunk_entry_size;
169 /* Read the raw data into the end of the chunk_offsets array to
170 * avoid allocating another array. */
/* The raw entries end exactly where chunk_offsets[] ends, so the widening
 * loops below, which read forward from chunk_tab_buf and write forward from
 * the start of chunk_offsets[], appear never to overwrite an entry that has
 * not been read yet.  NOTE(review): subtracting from a void pointer relies
 * on the GNU C extension that treats sizeof(void) as 1. */
171 void *chunk_tab_buf = (void*)&chunk_offsets[num_needed_chunks] - size;
173 if (full_pread(in_fd, chunk_tab_buf, size,
174 file_offset_of_needed_chunk_entries) != size)
177 /* Now fill in chunk_offsets from the entries we have read in
180 u64 *chunk_tab_p = chunk_offsets;
181 if (start_chunk == 0)
184 if (chunk_entry_size == 4) {
185 u32 *entries = (u32*)chunk_tab_buf;
186 while (num_needed_chunk_entries--)
187 *chunk_tab_p++ = le32_to_cpu(*entries++);
189 u64 *entries = (u64*)chunk_tab_buf;
190 while (num_needed_chunk_entries--)
191 *chunk_tab_p++ = le64_to_cpu(*entries++);
194 /* Done with the chunk table now. We must now seek to the first chunk
195 * that is needed for the read. */
197 u64 cur_read_offset = resource_offset + chunk_table_size + chunk_offsets[0];
199 /* Pointer to current position in the output buffer for uncompressed
200 * data. Alternatively, if using a callback function, we repeatedly
201 * fill a temporary buffer to feed data into the callback function. */
204 out_p = alloca(WIM_CHUNK_SIZE);
208 /* Buffer for compressed data. While most compressed chunks will have a
209 * size much less than WIM_CHUNK_SIZE, WIM_CHUNK_SIZE - 1 is the maximum
210 * size in the worst-case. This assumption is valid only if chunks that
211 * happen to compress to more than the uncompressed size (i.e. a
212 * sequence of random bytes) are always stored uncompressed. But this seems
213 * to be the case in M$'s WIM files, even though it is undocumented. */
/* NOTE(review): compressed_chunk_size below is computed from offsets read
 * from the file.  No check that it fits into this WIM_CHUNK_SIZE - 1 byte
 * buffer is visible in this listing - confirm that a malformed chunk table
 * cannot make the later full_pread() overrun compressed_buf. */
214 void *compressed_buf = alloca(WIM_CHUNK_SIZE - 1);
216 /* Decompress all the chunks. */
217 for (u64 i = start_chunk; i <= end_chunk; i++) {
219 /* Calculate the sizes of the compressed chunk and of the
220 * uncompressed chunk. */
221 unsigned compressed_chunk_size;
222 unsigned uncompressed_chunk_size;
223 if (i != num_chunks - 1) {
224 /* All the chunks except the last one in the resource
225 * expand to WIM_CHUNK_SIZE uncompressed, and the amount
226 * of compressed data for the chunk is given by the
227 * difference of offsets in the chunk offset table. */
/* chunk_offsets[] is indexed relative to start_chunk; the u64 difference is
 * narrowed to unsigned on assignment. */
228 compressed_chunk_size = chunk_offsets[i + 1 - start_chunk] -
229 chunk_offsets[i - start_chunk];
230 uncompressed_chunk_size = WIM_CHUNK_SIZE;
232 /* The last compressed chunk consists of the remaining
233 * bytes in the file resource, and the last uncompressed
234 * chunk has size equal to however many bytes are left-
235 * that is, the remainder of the uncompressed size when
236 * divided by WIM_CHUNK_SIZE.
238 * Note that the resource_compressed_size includes the
239 * chunk table, so the size of it must be subtracted. */
240 compressed_chunk_size = resource_compressed_size -
242 chunk_offsets[i - start_chunk];
244 uncompressed_chunk_size = resource_uncompressed_size %
247 /* If the remainder is 0, the last chunk actually
248 * uncompresses to a full WIM_CHUNK_SIZE bytes. */
249 if (uncompressed_chunk_size == 0)
250 uncompressed_chunk_size = WIM_CHUNK_SIZE;
253 /* Figure out how much of this chunk we actually need to read */
255 if (i == start_chunk)
256 start_offset = start_chunk_offset;
261 end_offset = end_chunk_offset;
263 end_offset = WIM_CHUNK_SIZE - 1;
265 unsigned partial_chunk_size = end_offset + 1 - start_offset;
266 bool is_partial_chunk = (partial_chunk_size != uncompressed_chunk_size);
268 /* This is undocumented, but chunks can be uncompressed. This
269 * appears to always be the case when the compressed chunk size
270 * is equal to the uncompressed chunk size. */
271 if (compressed_chunk_size == uncompressed_chunk_size) {
272 /* Uncompressed chunk */
273 if (full_pread(in_fd,
274 cb ? out_p + start_offset : out_p,
276 cur_read_offset + start_offset) != partial_chunk_size)
281 /* Compressed chunk */
283 /* Read the compressed data into compressed_buf. */
284 if (full_pread(in_fd,
286 compressed_chunk_size,
287 cur_read_offset) != compressed_chunk_size)
292 /* For partial chunks and when writing directly to a
293 * buffer, we must buffer the uncompressed data because
294 * we don't need all of it. */
295 if (is_partial_chunk && !cb) {
/* Variable-length array on the stack; uncompressed_chunk_size is at most
 * WIM_CHUNK_SIZE according to the assignments above. */
296 u8 uncompressed_buf[uncompressed_chunk_size];
298 ret = decompress(compressed_buf,
299 compressed_chunk_size,
301 uncompressed_chunk_size);
303 ret = WIMLIB_ERR_DECOMPRESSION;
306 memcpy(out_p, uncompressed_buf + start_offset,
309 ret = decompress(compressed_buf,
310 compressed_chunk_size,
312 uncompressed_chunk_size);
314 ret = WIMLIB_ERR_DECOMPRESSION;
320 /* Feed the data to the callback function */
321 ret = cb(out_p + start_offset,
322 partial_chunk_size, ctx_or_buf);
326 /* No callback function provided; we are writing
327 * directly to a buffer. Advance the pointer into this
328 * buffer by the number of uncompressed bytes that were
330 out_p += partial_chunk_size;
332 cur_read_offset += compressed_chunk_size;
337 if (chunk_offsets_malloced)
342 ERROR_WITH_ERRNO("Error reading compressed file resource");
343 ret = WIMLIB_ERR_READ;
347 /* Reads the contents of a struct resource_entry, as represented in the on-disk
348 * format, from the memory pointed to by @p, and fills in the fields of @entry.
349 * A pointer to the byte after the memory read at @p is returned. */
/*
 * get_resource_entry(): decode one on-disk resource entry starting at p.
 *
 * Fields are consumed in this order: a 56-bit size, an 8-bit flags value,
 * a 64-bit offset and a 64-bit original_size.  If either of the top two
 * bits of offset or original_size is set, a warning is printed and the
 * value is masked down to its low 62 bits.
 *
 * NOTE(review): the declarations of the locals `size` and `flags`, the
 * store of `size` into *entry and the return statement are not visible in
 * this listing; presumably they sit on the missing lines.
 */
351 get_resource_entry(const void *p, struct resource_entry *entry)
356 p = get_u56(p, &size);
357 p = get_u8(p, &flags);
359 entry->flags = flags;
361 /* offset and original_size are truncated to 62 bits to avoid possible
362 * overflows, when converting to a signed 64-bit integer (off_t) or when
363 * adding size or original_size. This is okay since no one would ever
364 * actually have a WIM bigger than 4611686018427387903 bytes... */
365 p = get_u64(p, &entry->offset);
/* 0xc000000000000000 selects the top two bits; 0x3fffffffffffffff keeps the
 * remaining 62. */
366 if (entry->offset & 0xc000000000000000ULL) {
367 WARNING("Truncating offset in resource entry");
368 entry->offset &= 0x3fffffffffffffffULL;
370 p = get_u64(p, &entry->original_size);
371 if (entry->original_size & 0xc000000000000000ULL) {
372 WARNING("Truncating original_size in resource entry");
373 entry->original_size &= 0x3fffffffffffffffULL;
378 /* Copies the struct resource_entry @entry to the memory pointed to by @p in the
379 * on-disk format. A pointer to the byte after the memory written at @p is
/*
 * put_resource_entry(): serialize *entry into the memory at p.
 *
 * Fields are written in this order: size (56 bits), flags (8 bits), offset
 * (64 bits), original_size (64 bits).  Each put_*() call returns the
 * advanced write position, which is stored back into p.
 *
 * NOTE(review): the return type and the return statement are not visible in
 * this listing.
 */
382 put_resource_entry(void *p, const struct resource_entry *entry)
384 p = put_u56(p, entry->size);
385 p = put_u8(p, entry->flags);
386 p = put_u64(p, entry->offset);
387 p = put_u64(p, entry->original_size);
/*
 * read_partial_wim_resource(): read `size` bytes starting at `offset` from
 * a resource that is stored inside a WIM file.
 *
 * The resource must have resource_location == RESOURCE_IN_WIM (asserted).
 * If the entry has WIM_RESHDR_FLAG_COMPRESSED set and the caller did not
 * pass WIMLIB_RESOURCE_FLAG_RAW, the work is delegated to
 * read_compressed_resource().  Otherwise the bytes are read as stored,
 * starting at resource_entry.offset + offset: either in pieces of at most
 * WIM_CHUNK_SIZE bytes handed to cb(), or in one full_pread() straight into
 * ctx_or_buf.
 *
 * A short read is reported as WIMLIB_ERR_READ.
 *
 * NOTE(review): part of the parameter list and the declarations of `wim`,
 * `in_fd` and `ret` are not visible in this listing.
 */
392 read_partial_wim_resource(const struct wim_lookup_table_entry *lte,
394 consume_data_callback_t cb,
403 wimlib_assert(lte->resource_location == RESOURCE_IN_WIM);
408 if (lte->resource_entry.flags & WIM_RESHDR_FLAG_COMPRESSED &&
409 !(flags & WIMLIB_RESOURCE_FLAG_RAW))
411 ret = read_compressed_resource(in_fd,
412 lte->resource_entry.size,
413 lte->resource_entry.original_size,
414 lte->resource_entry.offset,
415 wimlib_get_compression_type(wim),
421 offset += lte->resource_entry.offset;
/* NOTE(review): buf is a variable-length array sized min(WIM_CHUNK_SIZE,
 * size).  If size can be 0 on this path the array length would be zero,
 * which C does not allow for a VLA - confirm callers never pass 0. */
423 /* Send data to callback function */
424 u8 buf[min(WIM_CHUNK_SIZE, size)];
426 size_t bytes_to_read = min(WIM_CHUNK_SIZE, size);
427 size_t bytes_read = full_pread(in_fd, buf,
428 bytes_to_read, offset);
429 if (bytes_read != bytes_to_read)
431 ret = cb(buf, bytes_read, ctx_or_buf);
435 offset += bytes_read;
438 /* Send data directly to a buffer */
439 if (full_pread(in_fd, ctx_or_buf, size, offset) != size)
446 ERROR_WITH_ERRNO("Error reading data from WIM");
447 ret = WIMLIB_ERR_READ;
/*
 * read_partial_wim_resource_into_buf(): read `size` bytes at `offset` of a
 * WIM resource directly into `buf`.
 *
 * Thin wrapper around read_partial_wim_resource() with no callback (NULL)
 * and flags set to 0; its result is returned unchanged.
 */
458 read_partial_wim_resource_into_buf(const struct wim_lookup_table_entry *lte,
459 size_t size, u64 offset, void *buf)
461 return read_partial_wim_resource(lte, size, NULL, buf, 0, offset);
/*
 * read_wim_resource_prefix(): read the first `size` bytes of a WIM
 * resource.
 *
 * Forwards to read_partial_wim_resource() with a starting offset of 0,
 * passing cb, ctx_or_buf and flags through unchanged.
 *
 * NOTE(review): part of the parameter list is not visible in this listing.
 */
465 read_wim_resource_prefix(const struct wim_lookup_table_entry *lte,
467 consume_data_callback_t cb,
471 return read_partial_wim_resource(lte, size, cb, ctx_or_buf, flags, 0);
/*
 * read_file_on_disk_prefix(): read the first `size` bytes of a resource
 * whose data lives in an ordinary file named by lte->file_on_disk.
 *
 * The file is opened read-only; a failed open is reported and returns
 * WIMLIB_ERR_OPEN.  With a callback, data is read in pieces of at most
 * WIM_CHUNK_SIZE bytes and each piece is passed to cb(); without one, a
 * single full_read() fills ctx_or_buf.  A short read is reported as
 * WIMLIB_ERR_READ.
 *
 * NOTE(review): part of the parameter list, the declarations of fd, ret and
 * bytes_read, the loop construct and the close of fd are not visible in
 * this listing.  As in the WIM path, buf is a variable-length array that
 * would have length zero if size were 0.
 */
477 read_file_on_disk_prefix(const struct wim_lookup_table_entry *lte,
479 consume_data_callback_t cb,
483 const tchar *filename = lte->file_on_disk;
488 fd = open(filename, O_RDONLY);
490 ERROR_WITH_ERRNO("Can't open \"%"TS"\"", filename);
491 return WIMLIB_ERR_OPEN;
494 /* Send data to callback function */
495 u8 buf[min(WIM_CHUNK_SIZE, size)];
496 size_t bytes_to_read;
498 bytes_to_read = min(WIM_CHUNK_SIZE, size);
499 bytes_read = full_read(fd, buf, bytes_to_read);
500 if (bytes_read != bytes_to_read)
502 ret = cb(buf, bytes_read, ctx_or_buf);
508 /* Send data directly to a buffer */
509 bytes_read = full_read(fd, ctx_or_buf, size);
510 if (bytes_read != size)
516 ERROR_WITH_ERRNO("Error reading \"%"TS"\"", filename);
517 ret = WIMLIB_ERR_READ;
522 #endif /* !__WIN32__ */
/*
 * read_buffer_prefix(): supply the first `size` bytes of a resource whose
 * data is already in memory at lte->attached_buffer.
 *
 * With a callback, the data is passed to cb() in pieces of at most
 * WIM_CHUNK_SIZE bytes; without one, it is copied into ctx_or_buf with a
 * single memcpy().  The flags argument is named _ignored_flags and is not
 * referenced in the visible lines.
 *
 * NOTE(review): the loop construct, the advance of inbuf/size and the
 * return statements are not visible in this listing.
 */
525 read_buffer_prefix(const struct wim_lookup_table_entry *lte,
526 u64 size, consume_data_callback_t cb,
527 void *ctx_or_buf, int _ignored_flags)
529 const void *inbuf = lte->attached_buffer;
534 size_t chunk_size = min(WIM_CHUNK_SIZE, size);
535 ret = cb(inbuf, chunk_size, ctx_or_buf);
542 memcpy(ctx_or_buf, inbuf, size);
/*
 * Function-pointer type for the per-location "read prefix" routines that
 * read_resource_prefix() stores in its handlers[] table.
 *
 * NOTE(review): only part of the parameter list is visible in this listing.
 */
547 typedef int (*read_resource_prefix_handler_t)(const struct wim_lookup_table_entry *lte,
549 consume_data_callback_t cb,
554 * Read the first @size bytes from a generic "resource", which may be located in
555 * the WIM (compressed or uncompressed), in an external file, or directly in an
558 * Feed the data either to a callback function (cb != NULL, passing it
559 * ctx_or_buf), or write it directly into a buffer (cb == NULL, ctx_or_buf
560 * specifies the buffer, which must have room for @size bytes).
562 * When using a callback function, it is called with chunks up to 32768 bytes in
563 * size until the resource is exhausted.
565 * If the resource is located in a WIM file, @flags can be:
566 * * WIMLIB_RESOURCE_FLAG_THREADSAFE_READ if it must be safe to access the resource
567 * concurrently by multiple threads.
568 * * WIMLIB_RESOURCE_FLAG_RAW if the raw compressed data is to be supplied
569 * instead of the uncompressed data.
570 * Otherwise, the @flags are ignored.
/*
 * read_resource_prefix(): read the first `size` bytes of a resource,
 * wherever it is stored.
 *
 * A static table maps each resource_location value to the routine that
 * knows how to read that kind of resource.  The function asserts that
 * lte->resource_location is inside the table and has a non-NULL handler,
 * then calls that handler with all arguments passed through and returns
 * its result.
 *
 * NOTE(review): the end of the parameter list is not visible, and some
 * table entries are presumably wrapped in preprocessor conditionals on
 * lines missing from this listing.
 */
573 read_resource_prefix(const struct wim_lookup_table_entry *lte,
574 u64 size, consume_data_callback_t cb, void *ctx_or_buf,
577 static const read_resource_prefix_handler_t handlers[] = {
578 [RESOURCE_IN_WIM] = read_wim_resource_prefix,
580 [RESOURCE_IN_FILE_ON_DISK] = read_file_on_disk_prefix,
582 [RESOURCE_IN_ATTACHED_BUFFER] = read_buffer_prefix,
584 [RESOURCE_IN_STAGING_FILE] = read_file_on_disk_prefix,
587 [RESOURCE_IN_NTFS_VOLUME] = read_ntfs_file_prefix,
590 [RESOURCE_WIN32] = read_win32_file_prefix,
591 [RESOURCE_WIN32_ENCRYPTED] = read_win32_encrypted_file_prefix,
594 wimlib_assert(lte->resource_location < ARRAY_LEN(handlers)
595 && handlers[lte->resource_location] != NULL);
596 return handlers[lte->resource_location](lte, size, cb, ctx_or_buf, flags);
/*
 * read_full_resource_into_buf(): read an entire resource into `buf`.
 *
 * Calls read_resource_prefix() with size wim_resource_size(lte), no
 * callback and flags 0, and returns its result.
 *
 * NOTE(review): the rest of the parameter list (which declares buf) is not
 * visible in this listing.
 */
600 read_full_resource_into_buf(const struct wim_lookup_table_entry *lte,
603 return read_resource_prefix(lte, wim_resource_size(lte), NULL, buf, 0);
/*
 * Members of struct extract_ctx, the context handed to
 * extract_chunk_sha1_wrapper(): the caller's real chunk callback and the
 * argument to pass to it.
 *
 * NOTE(review): the struct header and the sha_ctx member used elsewhere in
 * this file are not visible in this listing.
 */
608 consume_data_callback_t extract_chunk;
609 void *extract_chunk_arg;
/*
 * extract_chunk_sha1_wrapper(): chunk callback that hashes and forwards.
 *
 * Feeds the chunk into the SHA1 context held in the struct extract_ctx,
 * then calls the wrapped extract_chunk callback with the same chunk and
 * returns that callback's result.
 */
613 extract_chunk_sha1_wrapper(const void *chunk, size_t chunk_size,
616 struct extract_ctx *ctx = _ctx;
618 sha1_update(&ctx->sha_ctx, chunk, chunk_size);
619 return ctx->extract_chunk(chunk, chunk_size, ctx->extract_chunk_arg);
622 /* Extracts the first @size bytes of a WIM resource to somewhere. In the
623 * process, the SHA1 message digest of the resource is checked if the full
624 * resource is being extracted.
626 * @extract_chunk is a function that is called to extract each chunk of the
/*
 * extract_wim_resource(): pass the first `size` bytes of a resource to
 * extract_chunk(), verifying the SHA1 digest when the whole resource is
 * requested.
 *
 * When size equals wim_resource_size(lte), the data is routed through
 * extract_chunk_sha1_wrapper() so a digest is accumulated on the way; the
 * final digest is compared with lte->hash and a mismatch yields
 * WIMLIB_ERR_INVALID_RESOURCE_HASH (with diagnostics when
 * ENABLE_ERROR_MESSAGES is defined).  For any other size the data goes
 * straight to extract_chunk() and no digest check is made.
 *
 * NOTE(review): the `size` parameter line, the declaration of ret, the
 * checks on ret and the return statement are not visible in this listing.
 */
629 extract_wim_resource(const struct wim_lookup_table_entry *lte,
631 consume_data_callback_t extract_chunk,
632 void *extract_chunk_arg)
635 if (size == wim_resource_size(lte)) {
637 struct extract_ctx ctx;
638 ctx.extract_chunk = extract_chunk;
639 ctx.extract_chunk_arg = extract_chunk_arg;
640 sha1_init(&ctx.sha_ctx);
641 ret = read_resource_prefix(lte, size,
642 extract_chunk_sha1_wrapper,
645 u8 hash[SHA1_HASH_SIZE];
646 sha1_final(hash, &ctx.sha_ctx);
647 if (!hashes_equal(hash, lte->hash)) {
648 #ifdef ENABLE_ERROR_MESSAGES
649 ERROR("Invalid SHA1 message digest "
650 "on the following WIM resource:");
651 print_lookup_table_entry(lte, stderr);
652 if (lte->resource_location == RESOURCE_IN_WIM)
653 ERROR("The WIM file appears to be corrupt!");
655 ret = WIMLIB_ERR_INVALID_RESOURCE_HASH;
660 ret = read_resource_prefix(lte, size, extract_chunk,
661 extract_chunk_arg, 0);
/*
 * extract_wim_chunk_to_fd(): chunk callback that writes the chunk to a file
 * descriptor.
 *
 * _fd_p points to the int file descriptor.  The chunk is written with
 * full_write(); on the error path a message is printed and
 * WIMLIB_ERR_WRITE is returned.
 *
 * NOTE(review): the condition that tests the full_write() result and the
 * success return are not visible in this listing.
 */
667 extract_wim_chunk_to_fd(const void *buf, size_t len, void *_fd_p)
669 int fd = *(int*)_fd_p;
670 ssize_t ret = full_write(fd, buf, len);
672 ERROR_WITH_ERRNO("Error writing to file descriptor");
673 return WIMLIB_ERR_WRITE;
/*
 * extract_wim_resource_to_fd(): extract the first `size` bytes of a
 * resource to the file descriptor `fd`.
 *
 * Calls extract_wim_resource() with extract_wim_chunk_to_fd as the chunk
 * callback and the address of fd as its argument.
 *
 * NOTE(review): the rest of the parameter list (declaring fd and size) is
 * not visible in this listing.
 */
680 extract_wim_resource_to_fd(const struct wim_lookup_table_entry *lte,
683 return extract_wim_resource(lte, size, extract_wim_chunk_to_fd, &fd);
/*
 * sha1_chunk(): chunk callback that feeds the chunk into the SHA1 context
 * passed as `ctx`.
 *
 * NOTE(review): the return statement is not visible in this listing.
 */
688 sha1_chunk(const void *buf, size_t len, void *ctx)
690 sha1_update(ctx, buf, len);
694 /* Calculate the SHA1 message digest of a stream. */
/*
 * sha1_resource(): compute the SHA1 digest of a whole resource and store it
 * in lte->hash.
 *
 * The full resource (wim_resource_size(lte) bytes) is read through
 * read_resource_prefix() with sha1_chunk as the callback, and the digest is
 * then finalized into lte->hash.
 *
 * NOTE(review): the declaration and initialization of sha_ctx, the check
 * on ret before finalizing and the return statement are not visible in
 * this listing.
 */
696 sha1_resource(struct wim_lookup_table_entry *lte)
702 ret = read_resource_prefix(lte, wim_resource_size(lte),
703 sha1_chunk, &sha_ctx, 0);
705 sha1_final(lte->hash, &sha_ctx);
710 * Copies the file resource specified by the lookup table entry @lte from the
711 * input WIM to the output WIM that has its file descriptor given by
712 * ((WIMStruct*)wim)->out_fd.
714 * The output_resource_entry, out_refcnt, and part_number fields of @lte are
717 * (This function is confusing and should be refactored somehow.)
/*
 * copy_resource(): write the resource described by lte to the output WIM.
 *
 * The resource is written to w->out_fd with write_wim_resource(), keeping
 * the compression type reported by wim_resource_compression_type(lte); the
 * location of the written copy is stored in lte->output_resource_entry.
 * Afterwards lte->out_refcnt is set to lte->refcnt and lte->part_number to
 * the part number from the output WIM header.
 *
 * Fix: the fourth argument had been garbled into "<e->..." (an "&lt"
 * sequence decoded as an HTML entity); it is restored to the address of
 * lte->output_resource_entry.
 *
 * NOTE(review): the declarations of w and ret, the check on ret and the
 * return statement are not visible in this listing; w is presumably the
 * wim argument converted to a WIMStruct pointer.
 */
720 copy_resource(struct wim_lookup_table_entry *lte, void *wim)
725 ret = write_wim_resource(lte, w->out_fd,
726 wim_resource_compression_type(lte),
727 &lte->output_resource_entry, 0);
729 lte->out_refcnt = lte->refcnt;
730 lte->part_number = w->hdr.part_number;