4 * Read uncompressed and compressed metadata and file resources from a WIM file.
8 * Copyright (C) 2012, 2013 Eric Biggers
10 * This file is part of wimlib, a library for working with WIM files.
12 * wimlib is free software; you can redistribute it and/or modify it under the
13 * terms of the GNU General Public License as published by the Free Software
14 * Foundation; either version 3 of the License, or (at your option) any later
17 * wimlib is distributed in the hope that it will be useful, but WITHOUT ANY
18 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
19 * A PARTICULAR PURPOSE. See the GNU General Public License for more details.
21 * You should have received a copy of the GNU General Public License along with
22 * wimlib; if not, see http://www.gnu.org/licenses/.
30 #include "wimlib/buffer_io.h"
31 #include "wimlib/dentry.h"
32 #include "wimlib/error.h"
33 #include "wimlib/file_io.h"
34 #include "wimlib/lookup_table.h"
35 #include "wimlib/resource.h"
36 #include "wimlib/sha1.h"
39 /* for read_win32_file_prefix(), read_win32_encrypted_file_prefix() */
40 # include "wimlib/win32.h"
44 /* for read_ntfs_file_prefix() */
45 # include "wimlib/ntfs_3g.h"
58 * Reads all or part of a compressed WIM resource.
60 * Returns zero on success, nonzero on failure.
63 read_compressed_resource(int in_fd,
64 u64 resource_compressed_size,
65 u64 resource_uncompressed_size,
70 consume_data_callback_t cb,
79 int (*decompress)(const void *, unsigned, void *, unsigned);
80 /* Set the appropriate decompress function. */
81 if (resource_ctype == WIMLIB_COMPRESSION_TYPE_LZX)
82 decompress = wimlib_lzx_decompress;
84 decompress = wimlib_xpress_decompress;
86 /* The structure of a compressed resource consists of a table of chunk
87 * offsets followed by the chunks themselves. Each chunk consists of
88 * compressed data, and there is one chunk for each WIM_CHUNK_SIZE =
89 * 32768 bytes of the uncompressed file, with the last chunk having any
92 * The chunk offsets are measured relative to the end of the chunk
93 * table. The first chunk is omitted from the table in the WIM file
94 * because its offset is implicitly given by the fact that it directly
95 * follows the chunk table and therefore must have an offset of 0.
98 /* Calculate how many chunks the resource consists of in its entirety.
100 u64 num_chunks = (resource_uncompressed_size + WIM_CHUNK_SIZE - 1) /
102 /* As mentioned, the first chunk has no entry in the chunk table. */
103 u64 num_chunk_entries = num_chunks - 1;
106 /* The index of the chunk that the read starts at. */
107 u64 start_chunk = offset / WIM_CHUNK_SIZE;
108 /* The byte offset at which the read starts, within the start chunk. */
109 u64 start_chunk_offset = offset % WIM_CHUNK_SIZE;
111 /* The index of the chunk that contains the last byte of the read. */
112 u64 end_chunk = (offset + len - 1) / WIM_CHUNK_SIZE;
113 /* The byte offset of the last byte of the read, within the end chunk */
114 u64 end_chunk_offset = (offset + len - 1) % WIM_CHUNK_SIZE;
116 /* Number of chunks that are actually needed to read the requested part
118 u64 num_needed_chunks = end_chunk - start_chunk + 1;
120 /* If the end chunk is not the last chunk, an extra chunk entry is
121 * needed because we need to know the offset of the chunk after the last
122 * chunk read to figure out the size of the last read chunk. */
123 if (end_chunk != num_chunks - 1)
126 /* Allocate the chunk table. It will only contain offsets for the
127 * chunks that are actually needed for this read. */
129 bool chunk_offsets_malloced;
130 if (num_needed_chunks < 1000) {
131 chunk_offsets = alloca(num_needed_chunks * sizeof(u64));
132 chunk_offsets_malloced = false;
134 chunk_offsets = malloc(num_needed_chunks * sizeof(u64));
135 if (!chunk_offsets) {
136 ERROR("Failed to allocate chunk table "
137 "with %"PRIu64" entries", num_needed_chunks);
138 return WIMLIB_ERR_NOMEM;
140 chunk_offsets_malloced = true;
143 /* Set the implicit offset of the first chunk if it is included in the
146 * Note: M$'s documentation includes a picture that shows the first
147 * chunk starting right after the chunk entry table, labeled as offset
148 * 0x10. However, in the actual file format, the offset is measured
149 * from the end of the chunk entry table, so the first chunk has an
151 if (start_chunk == 0)
152 chunk_offsets[0] = 0;
154 /* According to M$'s documentation, if the uncompressed size of
155 * the file is greater than 4 GB, the chunk entries are 8-byte
156 * integers. Otherwise, they are 4-byte integers. */
157 u64 chunk_entry_size = (resource_uncompressed_size >= (u64)1 << 32) ?
160 /* Size of the full chunk table in the WIM file. */
161 u64 chunk_table_size = chunk_entry_size * num_chunk_entries;
163 /* Read the needed chunk offsets from the table in the WIM file. */
165 /* Index, in the WIM file, of the first needed entry in the
167 u64 start_table_idx = (start_chunk == 0) ? 0 : start_chunk - 1;
169 /* Number of entries we need to actually read from the chunk
170 * table (excludes the implicit first chunk). */
171 u64 num_needed_chunk_entries = (start_chunk == 0) ?
172 num_needed_chunks - 1 : num_needed_chunks;
174 /* Skip over unneeded chunk table entries. */
175 u64 file_offset_of_needed_chunk_entries = resource_offset +
176 start_table_idx * chunk_entry_size;
178 /* Number of bytes we need to read from the chunk table. */
179 size_t size = num_needed_chunk_entries * chunk_entry_size;
181 /* Read the raw data into the end of the chunk_offsets array to
182 * avoid allocating another array. */
183 void *chunk_tab_buf = (void*)&chunk_offsets[num_needed_chunks] - size;
185 if (full_pread(in_fd, chunk_tab_buf, size,
186 file_offset_of_needed_chunk_entries) != size)
189 /* Now fill in chunk_offsets from the entries we have read in
192 u64 *chunk_tab_p = chunk_offsets;
193 if (start_chunk == 0)
196 if (chunk_entry_size == 4) {
197 u32 *entries = (u32*)chunk_tab_buf;
198 while (num_needed_chunk_entries--)
199 *chunk_tab_p++ = le32_to_cpu(*entries++);
201 u64 *entries = (u64*)chunk_tab_buf;
202 while (num_needed_chunk_entries--)
203 *chunk_tab_p++ = le64_to_cpu(*entries++);
206 /* Done with the chunk table now. We must now seek to the first chunk
207 * that is needed for the read. */
209 u64 cur_read_offset = resource_offset + chunk_table_size + chunk_offsets[0];
211 /* Pointer to current position in the output buffer for uncompressed
212 * data. Alternatively, if using a callback function, we repeatedly
213 * fill a temporary buffer to feed data into the callback function. */
216 out_p = alloca(WIM_CHUNK_SIZE);
220 /* Buffer for compressed data. While most compressed chunks will have a
221 * size much less than WIM_CHUNK_SIZE, WIM_CHUNK_SIZE - 1 is the maximum
222 * size in the worst-case. This assumption is valid only if chunks that
223 * happen to compress to more than the uncompressed size (i.e. a
224 * sequence of random bytes) are always stored uncompressed. But this seems
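225 * to be the case in M$'s WIM files, even though it is undocumented.
 * NOTE(review): compressed_chunk_size is computed from the on-disk chunk
 * table, and no visible check bounds it by WIM_CHUNK_SIZE - 1 before the
 * compressed data is read into compressed_buf; confirm a corrupt WIM cannot
 * overflow this buffer. */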
226 void *compressed_buf = alloca(WIM_CHUNK_SIZE - 1);
228 /* Decompress all the chunks. */
229 for (u64 i = start_chunk; i <= end_chunk; i++) {
231 /* Calculate the sizes of the compressed chunk and of the
232 * uncompressed chunk. */
233 unsigned compressed_chunk_size;
234 unsigned uncompressed_chunk_size;
235 if (i != num_chunks - 1) {
236 /* All the chunks except the last one in the resource
237 * expand to WIM_CHUNK_SIZE uncompressed, and the amount
238 * of compressed data for the chunk is given by the
239 * difference of offsets in the chunk offset table. */
240 compressed_chunk_size = chunk_offsets[i + 1 - start_chunk] -
241 chunk_offsets[i - start_chunk];
242 uncompressed_chunk_size = WIM_CHUNK_SIZE;
244 /* The last compressed chunk consists of the remaining
245 * bytes in the file resource, and the last uncompressed
246 * chunk has size equal to however many bytes are left-
247 * that is, the remainder of the uncompressed size when
248 * divided by WIM_CHUNK_SIZE.
250 * Note that the resource_compressed_size includes the
251 * chunk table, so the size of it must be subtracted. */
252 compressed_chunk_size = resource_compressed_size -
254 chunk_offsets[i - start_chunk];
256 uncompressed_chunk_size = resource_uncompressed_size %
259 /* If the remainder is 0, the last chunk actually
260 * uncompresses to a full WIM_CHUNK_SIZE bytes. */
261 if (uncompressed_chunk_size == 0)
262 uncompressed_chunk_size = WIM_CHUNK_SIZE;
265 /* Figure out how much of this chunk we actually need to read */
267 if (i == start_chunk)
268 start_offset = start_chunk_offset;
273 end_offset = end_chunk_offset;
275 end_offset = WIM_CHUNK_SIZE - 1;
277 unsigned partial_chunk_size = end_offset + 1 - start_offset;
278 bool is_partial_chunk = (partial_chunk_size != uncompressed_chunk_size);
280 /* This is undocumented, but chunks can be uncompressed. This
281 * appears to always be the case when the compressed chunk size
282 * is equal to the uncompressed chunk size. */
283 if (compressed_chunk_size == uncompressed_chunk_size) {
284 /* Uncompressed chunk */
285 if (full_pread(in_fd,
286 cb ? out_p + start_offset : out_p,
288 cur_read_offset + start_offset) != partial_chunk_size)
293 /* Compressed chunk */
295 /* Read the compressed data into compressed_buf. */
296 if (full_pread(in_fd,
298 compressed_chunk_size,
299 cur_read_offset) != compressed_chunk_size)
304 /* For partial chunks and when writing directly to a
305 * buffer, we must buffer the uncompressed data because
306 * we don't need all of it. */
307 if (is_partial_chunk && !cb) {
308 u8 uncompressed_buf[uncompressed_chunk_size];
310 ret = decompress(compressed_buf,
311 compressed_chunk_size,
313 uncompressed_chunk_size);
315 ret = WIMLIB_ERR_DECOMPRESSION;
318 memcpy(out_p, uncompressed_buf + start_offset,
321 ret = decompress(compressed_buf,
322 compressed_chunk_size,
324 uncompressed_chunk_size);
326 ret = WIMLIB_ERR_DECOMPRESSION;
332 /* Feed the data to the callback function */
333 ret = cb(out_p + start_offset,
334 partial_chunk_size, ctx_or_buf);
338 /* No callback function provided; we are writing
339 * directly to a buffer. Advance the pointer into this
340 * buffer by the number of uncompressed bytes that were
342 out_p += partial_chunk_size;
344 cur_read_offset += compressed_chunk_size;
349 if (chunk_offsets_malloced)
354 ERROR_WITH_ERRNO("Error reading compressed file resource");
355 ret = WIMLIB_ERR_READ;
359 /* Reads the contents of a struct resource_entry, as represented in the on-disk
360 * format, from the memory pointed to by @p, and fills in the fields of @entry.
361 * A pointer to the byte after the memory read at @p is returned. */
363 get_resource_entry(const void *p, struct resource_entry *entry)
368 p = get_u56(p, &size);
369 p = get_u8(p, &flags);
371 entry->flags = flags;
373 /* offset and original_size are truncated to 62 bits to avoid possible
374 * overflows, when converting to a signed 64-bit integer (off_t) or when
375 * adding size or original_size. This is okay since no one would ever
376 * actually have a WIM bigger than 4611686018427387903 bytes... */
377 p = get_u64(p, &entry->offset);
378 if (entry->offset & 0xc000000000000000ULL) {
379 WARNING("Truncating offset in resource entry");
380 entry->offset &= 0x3fffffffffffffffULL;
382 p = get_u64(p, &entry->original_size);
383 if (entry->original_size & 0xc000000000000000ULL) {
384 WARNING("Truncating original_size in resource entry");
385 entry->original_size &= 0x3fffffffffffffffULL;
390 /* Copies the struct resource_entry @entry to the memory pointed to by @p in the
391 * on-disk format. A pointer to the byte after the memory written at @p is
394 put_resource_entry(void *p, const struct resource_entry *entry)
396 p = put_u56(p, entry->size);
397 p = put_u8(p, entry->flags);
398 p = put_u64(p, entry->offset);
399 p = put_u64(p, entry->original_size);
404 read_partial_wim_resource(const struct wim_lookup_table_entry *lte,
406 consume_data_callback_t cb,
415 wimlib_assert(lte->resource_location == RESOURCE_IN_WIM);
420 if (lte->resource_entry.flags & WIM_RESHDR_FLAG_COMPRESSED &&
421 !(flags & WIMLIB_RESOURCE_FLAG_RAW))
423 ret = read_compressed_resource(in_fd,
424 lte->resource_entry.size,
425 lte->resource_entry.original_size,
426 lte->resource_entry.offset,
427 wimlib_get_compression_type(wim),
433 offset += lte->resource_entry.offset;
435 /* Send data to callback function */
436 u8 buf[min(WIM_CHUNK_SIZE, size)];
438 size_t bytes_to_read = min(WIM_CHUNK_SIZE, size);
439 size_t bytes_read = full_pread(in_fd, buf,
440 bytes_to_read, offset);
441 if (bytes_read != bytes_to_read)
443 ret = cb(buf, bytes_read, ctx_or_buf);
447 offset += bytes_read;
450 /* Send data directly to a buffer */
451 if (full_pread(in_fd, ctx_or_buf, size, offset) != size)
458 ERROR_WITH_ERRNO("Error reading data from WIM");
459 ret = WIMLIB_ERR_READ;
/* Reads @size bytes, starting at @offset within the WIM resource described by
 * @lte, directly into @buf.  This is a thin wrapper around
 * read_partial_wim_resource() that passes no callback (NULL) and no flags (0)
 * and returns that function's result unchanged. */
470 read_partial_wim_resource_into_buf(const struct wim_lookup_table_entry *lte,
471 size_t size, u64 offset, void *buf)
473 return read_partial_wim_resource(lte, size, NULL, buf, 0, offset);
/* Reads the first @size bytes of the WIM resource described by @lte by calling
 * read_partial_wim_resource() with an offset of 0.  @cb, @ctx_or_buf and @flags
 * are forwarded unchanged, and the callee's result is returned. */
477 read_wim_resource_prefix(const struct wim_lookup_table_entry *lte,
479 consume_data_callback_t cb,
483 return read_partial_wim_resource(lte, size, cb, ctx_or_buf, flags, 0);
/* Reads the first @size bytes of the file named by lte->file_on_disk.
 *
 * The file is opened read-only; if that fails, the error is logged and
 * WIMLIB_ERR_OPEN is returned.  With a callback, data is read into a stack
 * buffer in pieces of at most WIM_CHUNK_SIZE bytes and each piece is passed to
 * @cb along with @ctx_or_buf.  Without a callback, @size bytes are read
 * straight into @ctx_or_buf.  A short read is logged and reported as
 * WIMLIB_ERR_READ. */
489 read_file_on_disk_prefix(const struct wim_lookup_table_entry *lte,
491 consume_data_callback_t cb,
495 const tchar *filename = lte->file_on_disk;
500 fd = open(filename, O_RDONLY);
502 ERROR_WITH_ERRNO("Can't open \"%"TS"\"", filename);
503 return WIMLIB_ERR_OPEN;
506 /* Send data to callback function */
/* NOTE(review): this is a variable-length array sized by @size; if callers can
 * pass size == 0 it would have zero length - confirm that cannot happen. */
507 u8 buf[min(WIM_CHUNK_SIZE, size)];
508 size_t bytes_to_read;
510 bytes_to_read = min(WIM_CHUNK_SIZE, size);
511 bytes_read = full_read(fd, buf, bytes_to_read);
512 if (bytes_read != bytes_to_read)
514 ret = cb(buf, bytes_read, ctx_or_buf);
520 /* Send data directly to a buffer */
521 bytes_read = full_read(fd, ctx_or_buf, size);
522 if (bytes_read != size)
528 ERROR_WITH_ERRNO("Error reading \"%"TS"\"", filename);
529 ret = WIMLIB_ERR_READ;
534 #endif /* !__WIN32__ */
/* Reads the first @size bytes of the buffer attached to @lte
 * (lte->attached_buffer).
 *
 * With a callback, the data is handed to @cb in pieces of at most
 * WIM_CHUNK_SIZE bytes, together with @ctx_or_buf.  Without a callback, @size
 * bytes are copied into @ctx_or_buf with memcpy().  The flags argument is
 * unused, as its name indicates. */
537 read_buffer_prefix(const struct wim_lookup_table_entry *lte,
538 u64 size, consume_data_callback_t cb,
539 void *ctx_or_buf, int _ignored_flags)
541 const void *inbuf = lte->attached_buffer;
546 size_t chunk_size = min(WIM_CHUNK_SIZE, size);
547 ret = cb(inbuf, chunk_size, ctx_or_buf);
554 memcpy(ctx_or_buf, inbuf, size);
559 typedef int (*read_resource_prefix_handler_t)(const struct wim_lookup_table_entry *lte,
561 consume_data_callback_t cb,
566 * Read the first @size bytes from a generic "resource", which may be located in
567 * the WIM (compressed or uncompressed), in an external file, or directly in an
 * attached in-memory buffer.
570 * Feed the data either to a callback function (cb != NULL, passing it
571 * ctx_or_buf), or write it directly into a buffer (cb == NULL, ctx_or_buf
572 * specifies the buffer, which must have room for @size bytes).
574 * When using a callback function, it is called with chunks up to 32768 bytes in
575 * size until the resource is exhausted.
577 * If the resource is located in a WIM file, @flags can be:
578 * * WIMLIB_RESOURCE_FLAG_RAW if the raw compressed data is to be supplied
579 * instead of the uncompressed data.
580 * Otherwise, the @flags are ignored.
583 read_resource_prefix(const struct wim_lookup_table_entry *lte,
584 u64 size, consume_data_callback_t cb, void *ctx_or_buf,
587 static const read_resource_prefix_handler_t handlers[] = {
588 [RESOURCE_IN_WIM] = read_wim_resource_prefix,
590 [RESOURCE_IN_FILE_ON_DISK] = read_file_on_disk_prefix,
592 [RESOURCE_IN_ATTACHED_BUFFER] = read_buffer_prefix,
594 [RESOURCE_IN_STAGING_FILE] = read_file_on_disk_prefix,
597 [RESOURCE_IN_NTFS_VOLUME] = read_ntfs_file_prefix,
600 [RESOURCE_WIN32] = read_win32_file_prefix,
601 [RESOURCE_WIN32_ENCRYPTED] = read_win32_encrypted_file_prefix,
604 wimlib_assert(lte->resource_location < ARRAY_LEN(handlers)
605 && handlers[lte->resource_location] != NULL);
606 return handlers[lte->resource_location](lte, size, cb, ctx_or_buf, flags);
/* Reads the entire resource described by @lte, that is wim_resource_size(lte)
 * bytes, into @buf by calling read_resource_prefix() with no callback and no
 * flags.  @buf must have room for that many bytes.  Returns the result of
 * read_resource_prefix(). */
610 read_full_resource_into_buf(const struct wim_lookup_table_entry *lte,
613 return read_resource_prefix(lte, wim_resource_size(lte), NULL, buf, 0);
618 consume_data_callback_t extract_chunk;
619 void *extract_chunk_arg;
/* Chunk callback used while extracting a full resource: folds @chunk into the
 * running SHA1 state in the struct extract_ctx passed as @_ctx, then forwards
 * the same chunk to ctx->extract_chunk with ctx->extract_chunk_arg and returns
 * that callback's result. */
623 extract_chunk_sha1_wrapper(const void *chunk, size_t chunk_size,
626 struct extract_ctx *ctx = _ctx;
628 sha1_update(&ctx->sha_ctx, chunk, chunk_size);
629 return ctx->extract_chunk(chunk, chunk_size, ctx->extract_chunk_arg);
632 /* Extracts the first @size bytes of a WIM resource to somewhere. In the
633 * process, the SHA1 message digest of the resource is checked if the full
634 * resource is being extracted.
636 * @extract_chunk is a function that is called to extract each chunk of the
639 extract_wim_resource(const struct wim_lookup_table_entry *lte,
641 consume_data_callback_t extract_chunk,
642 void *extract_chunk_arg)
645 if (size == wim_resource_size(lte)) {
647 struct extract_ctx ctx;
648 ctx.extract_chunk = extract_chunk;
649 ctx.extract_chunk_arg = extract_chunk_arg;
650 sha1_init(&ctx.sha_ctx);
651 ret = read_resource_prefix(lte, size,
652 extract_chunk_sha1_wrapper,
655 u8 hash[SHA1_HASH_SIZE];
656 sha1_final(hash, &ctx.sha_ctx);
657 if (!hashes_equal(hash, lte->hash)) {
658 #ifdef ENABLE_ERROR_MESSAGES
659 ERROR("Invalid SHA1 message digest "
660 "on the following WIM resource:");
661 print_lookup_table_entry(lte, stderr);
662 if (lte->resource_location == RESOURCE_IN_WIM)
663 ERROR("The WIM file appears to be corrupt!");
665 ret = WIMLIB_ERR_INVALID_RESOURCE_HASH;
670 ret = read_resource_prefix(lte, size, extract_chunk,
671 extract_chunk_arg, 0);
677 extract_wim_chunk_to_fd(const void *buf, size_t len, void *_fd_p)
679 int fd = *(int*)_fd_p;
680 ssize_t ret = full_write(fd, buf, len);
682 ERROR_WITH_ERRNO("Error writing to file descriptor");
683 return WIMLIB_ERR_WRITE;
690 extract_wim_resource_to_fd(const struct wim_lookup_table_entry *lte,
693 return extract_wim_resource(lte, size, extract_wim_chunk_to_fd, &fd);
698 sha1_chunk(const void *buf, size_t len, void *ctx)
700 sha1_update(ctx, buf, len);
704 /* Calculate the SHA1 message digest of a stream. */
/* Streams all wim_resource_size(lte) bytes of the resource through
 * read_resource_prefix() with sha1_chunk as the callback, so each chunk is
 * folded into sha_ctx, and stores the final digest in lte->hash.
 * NOTE(review): presumably sha1_final() is only reached when the read
 * succeeded and the read result is returned - confirm against the full file. */
706 sha1_resource(struct wim_lookup_table_entry *lte)
712 ret = read_resource_prefix(lte, wim_resource_size(lte),
713 sha1_chunk, &sha_ctx, 0);
715 sha1_final(lte->hash, &sha_ctx);
720 * Copies the file resource specified by the lookup table entry @lte from the
721 * input WIM to the output WIM that has its file descriptor given by
722 * ((WIMStruct*)wim)->out_fd.
724 * The output_resource_entry, out_refcnt, and part_number fields of @lte are updated.
727 * (This function is confusing and should be refactored somehow.)
/* Writes the resource described by @lte to w->out_fd with write_wim_resource(),
 * using the compression type reported by wim_resource_compression_type(lte) and
 * passing the address of lte->output_resource_entry as the output entry.  The
 * out_refcnt and part_number fields of @lte are then set from lte->refcnt and
 * w->hdr.part_number.
 * NOTE(review): w is presumably @wim viewed as a WIMStruct pointer - confirm
 * against the full file. */
730 copy_resource(struct wim_lookup_table_entry *lte, void *wim)
735 ret = write_wim_resource(lte, w->out_fd,
736 wim_resource_compression_type(lte),
737 &lte->output_resource_entry, 0);
739 lte->out_refcnt = lte->refcnt;
740 lte->part_number = w->hdr.part_number;