+prepare_chunk_buffer(struct write_blobs_ctx *ctx)
+{
+ /* While we are unable to get a new chunk buffer due to too many chunks
+ * already outstanding, retrieve and write the next compressed chunk. */
+ while (!(ctx->cur_chunk_buf =
+ ctx->compressor->get_chunk_buffer(ctx->compressor)))
+ {
+ const void *cchunk;
+ u32 csize;
+ u32 usize;
+ bool bret;
+ int ret;
+
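+		/* The compressor could not give us a new buffer, so at least
+		 * one chunk must be outstanding; retrieving its result cannot
+		 * fail. */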
+ bret = ctx->compressor->get_compression_result(ctx->compressor,
+ &cchunk,
+ &csize,
+ &usize);
+ wimlib_assert(bret);
+
+ ret = write_chunk(ctx, cchunk, csize, usize);
+ if (ret)
+ return ret;
+ }
+ return 0;
+}
+
+/* Process the next chunk of data to be written to a WIM resource. */
+static int
+write_blob_process_chunk(const void *chunk, size_t size, void *_ctx)
+{
+ struct write_blobs_ctx *ctx = _ctx;
+ int ret;
+ const u8 *chunkptr, *chunkend;
+
+ wimlib_assert(size != 0);
+
+ if (ctx->compressor == NULL) {
+ /* Write chunk uncompressed. */
+ ret = write_chunk(ctx, chunk, size, size);
+ if (ret)
+ return ret;
+ ctx->cur_read_blob_offset += size;
+ return 0;
+ }
+
+	/* Submit the chunk for compression, but take into account that the
+	 * @size in which the chunk was provided may not match the
+	 * @out_chunk_size being used for compression. */
+ chunkptr = chunk;
+ chunkend = chunkptr + size;
+ do {
+ size_t needed_chunk_size;
+ size_t bytes_consumed;
+
+ if (!ctx->cur_chunk_buf) {
+ ret = prepare_chunk_buffer(ctx);
+ if (ret)
+ return ret;
+ }
+
+ if (ctx->write_resource_flags & WRITE_RESOURCE_FLAG_SOLID) {
+ needed_chunk_size = ctx->out_chunk_size;
+ } else {
+ needed_chunk_size = min(ctx->out_chunk_size,
+ ctx->cur_chunk_buf_filled +
+ (ctx->cur_read_blob_size -
+ ctx->cur_read_blob_offset));
+ }
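+		/* For example (non-solid case), with a 32768-byte output chunk
+		 * size and a 40000-byte blob, the first chunk needs the full
+		 * 32768 bytes while the second and final chunk needs only the
+		 * remaining 7232 bytes. */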
+
+ bytes_consumed = min(chunkend - chunkptr,
+ needed_chunk_size - ctx->cur_chunk_buf_filled);
+
+ memcpy(&ctx->cur_chunk_buf[ctx->cur_chunk_buf_filled],
+ chunkptr, bytes_consumed);
+
+ chunkptr += bytes_consumed;
+ ctx->cur_read_blob_offset += bytes_consumed;
+ ctx->cur_chunk_buf_filled += bytes_consumed;
+
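+		/* Once the target size is reached, hand the chunk off to the
+		 * compressor; a new buffer will be acquired on the next
+		 * iteration if more data remains. */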
+ if (ctx->cur_chunk_buf_filled == needed_chunk_size) {
+ ctx->compressor->signal_chunk_filled(ctx->compressor,
+ ctx->cur_chunk_buf_filled);
+ ctx->cur_chunk_buf = NULL;
+ ctx->cur_chunk_buf_filled = 0;
+ }
+ } while (chunkptr != chunkend);
+ return 0;
+}
+
+/* Finish processing a blob for writing. It may not have been completely
+ * written yet, as the chunk_compressor implementation may still have chunks
+ * buffered or being compressed. */
+static int
+write_blob_end_read(struct blob_descriptor *blob, int status, void *_ctx)
+{
+ struct write_blobs_ctx *ctx = _ctx;
+
+ wimlib_assert(ctx->cur_read_blob_offset == ctx->cur_read_blob_size || status);
+
+ if (!blob->will_be_in_output_wim) {
+ /* The blob was a duplicate. Now that its data has finished
+ * being read, it is being discarded in favor of the duplicate
+ * entry. It therefore is no longer needed, and we can fire the
+ * DONE_WITH_FILE callback because the file will not be read
+ * again.
+ *
+ * Note: we can't yet fire DONE_WITH_FILE for non-duplicate
+ * blobs, since it needs to be possible to re-read the file if
+ * it does not compress to less than its original size. */
+ if (!status)
+ status = done_with_blob(blob, ctx);
+ free_blob_descriptor(blob);
+ } else if (!status && blob->unhashed && ctx->blob_table != NULL) {
+ /* The blob was not a duplicate and was previously unhashed.
+ * Since we passed COMPUTE_MISSING_BLOB_HASHES to
+ * read_blob_list(), blob->hash is now computed and valid. So
+ * turn this blob into a "hashed" blob. */
+ list_del(&blob->unhashed_list);
+ blob_table_insert(ctx->blob_table, blob);
+ blob->unhashed = 0;
+ }
+ return status;
+}
+
+/* Compute statistics about a list of blobs that will be written.
+ *
+ * Assumes the blobs are sorted such that all blobs located in each distinct
+ * WIM (identified by its WIMStruct) are grouped together. */
+static void
+compute_blob_list_stats(struct list_head *blob_list,
+ struct write_blobs_ctx *ctx)
+{
+ struct blob_descriptor *blob;
+ u64 total_bytes = 0;
+ u64 num_blobs = 0;
+ u64 total_parts = 0;
+ WIMStruct *prev_wim_part = NULL;
+
+ list_for_each_entry(blob, blob_list, write_blobs_list) {
+ num_blobs++;
+ total_bytes += blob->size;
+ if (blob->blob_location == BLOB_IN_WIM) {
+ if (prev_wim_part != blob->rdesc->wim) {
+ prev_wim_part = blob->rdesc->wim;
+ total_parts++;
+ }
+ }
+ }
+ ctx->progress_data.progress.write_streams.total_bytes = total_bytes;
+ ctx->progress_data.progress.write_streams.total_streams = num_blobs;
+ ctx->progress_data.progress.write_streams.completed_bytes = 0;
+ ctx->progress_data.progress.write_streams.completed_streams = 0;
+ ctx->progress_data.progress.write_streams.compression_type = ctx->out_ctype;
+ ctx->progress_data.progress.write_streams.total_parts = total_parts;
+ ctx->progress_data.progress.write_streams.completed_parts = 0;
+ ctx->progress_data.next_progress = 0;
+}
+
+/* Find blobs in @blob_list that can be copied to the output WIM in raw form
+ * rather than compressed. Delete these blobs from @blob_list and move them to
+ * @raw_copy_blobs. Return the total uncompressed size of the blobs that need
+ * to be compressed. */
+static u64
+find_raw_copy_blobs(struct list_head *blob_list, int write_resource_flags,
+ int out_ctype, u32 out_chunk_size,
+ struct list_head *raw_copy_blobs)
+{
+ struct blob_descriptor *blob, *tmp;
+ u64 num_nonraw_bytes = 0;
+
+ INIT_LIST_HEAD(raw_copy_blobs);
+
+ /* Initialize temporary raw_copy_ok flag. */
+ list_for_each_entry(blob, blob_list, write_blobs_list)
+ if (blob->blob_location == BLOB_IN_WIM)
+ blob->rdesc->raw_copy_ok = 0;
+
+ list_for_each_entry_safe(blob, tmp, blob_list, write_blobs_list) {
+ if (can_raw_copy(blob, write_resource_flags,
+ out_ctype, out_chunk_size))
+ {
+ blob->rdesc->raw_copy_ok = 1;
+ list_move_tail(&blob->write_blobs_list, raw_copy_blobs);
+ } else {
+ num_nonraw_bytes += blob->size;
+ }
+ }
+
+ return num_nonraw_bytes;
+}
+
+/* Copy a raw compressed resource located in another WIM file to the WIM file
+ * being written. */
+static int
+write_raw_copy_resource(struct wim_resource_descriptor *in_rdesc,
+ struct filedes *out_fd)
+{
+ u64 cur_read_offset;
+ u64 end_read_offset;
+ u8 buf[BUFFER_SIZE];
+ size_t bytes_to_read;
+ int ret;
+ struct filedes *in_fd;
+ struct blob_descriptor *blob;
+ u64 out_offset_in_wim;
+
+ /* Copy the raw data. */
+ cur_read_offset = in_rdesc->offset_in_wim;
+ end_read_offset = cur_read_offset + in_rdesc->size_in_wim;
+
+ out_offset_in_wim = out_fd->offset;
+
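+	/* In a pipable WIM, the raw resource data is preceded by a
+	 * 'struct pwm_blob_hdr'.  Back up the read offset so that header is
+	 * copied as well, and account for it in the offset recorded for the
+	 * output blob. */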
+ if (in_rdesc->is_pipable) {
+ if (cur_read_offset < sizeof(struct pwm_blob_hdr))
+ return WIMLIB_ERR_INVALID_PIPABLE_WIM;
+ cur_read_offset -= sizeof(struct pwm_blob_hdr);
+ out_offset_in_wim += sizeof(struct pwm_blob_hdr);
+ }
+ in_fd = &in_rdesc->wim->in_fd;
+ wimlib_assert(cur_read_offset != end_read_offset);
+	do {
+ bytes_to_read = min(sizeof(buf), end_read_offset - cur_read_offset);
+
+ ret = full_pread(in_fd, buf, bytes_to_read, cur_read_offset);
+ if (ret)
+ return ret;
+
+ ret = full_write(out_fd, buf, bytes_to_read);
+ if (ret)
+ return ret;
+
+ cur_read_offset += bytes_to_read;
+
+ } while (cur_read_offset != end_read_offset);
+
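+	/* Record the new location of the resource in each blob that shares it
+	 * and will be in the output WIM. */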
+ list_for_each_entry(blob, &in_rdesc->blob_list, rdesc_node) {
+ if (blob->will_be_in_output_wim) {
+ blob_set_out_reshdr_for_reuse(blob);
+ if (in_rdesc->flags & WIM_RESHDR_FLAG_SOLID)
+ blob->out_res_offset_in_wim = out_offset_in_wim;
+ else
+ blob->out_reshdr.offset_in_wim = out_offset_in_wim;
+ }
+ }
+ return 0;
+}
+
+/* Copy a list of raw compressed resources located in other WIM file(s) to the
+ * WIM file being written. */
+static int
+write_raw_copy_resources(struct list_head *raw_copy_blobs,
+ struct filedes *out_fd,
+ struct write_blobs_progress_data *progress_data)
+{
+ struct blob_descriptor *blob;
+ int ret;
+
+ list_for_each_entry(blob, raw_copy_blobs, write_blobs_list)
+ blob->rdesc->raw_copy_ok = 1;
+
+ list_for_each_entry(blob, raw_copy_blobs, write_blobs_list) {
+ if (blob->rdesc->raw_copy_ok) {
+ /* Write each solid resource only one time. */
+ ret = write_raw_copy_resource(blob->rdesc, out_fd);
+ if (ret)
+ return ret;
+ blob->rdesc->raw_copy_ok = 0;
+ }
+ ret = do_write_blobs_progress(progress_data, blob->size,
+ 1, false);
+ if (ret)
+ return ret;
+ }
+ return 0;
+}
+
+/* Wait for and write all chunks pending in the compressor. */
+static int
+finish_remaining_chunks(struct write_blobs_ctx *ctx)
+{
+ const void *cdata;
+ u32 csize;
+ u32 usize;
+ int ret;
+
+ if (ctx->compressor == NULL)
+ return 0;
+
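+	/* A partially filled chunk buffer holds the final chunk of data;
+	 * submit it before draining the compressor. */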
+ if (ctx->cur_chunk_buf_filled != 0) {
+ ctx->compressor->signal_chunk_filled(ctx->compressor,
+ ctx->cur_chunk_buf_filled);
+ }
+
+ while (ctx->compressor->get_compression_result(ctx->compressor, &cdata,
+ &csize, &usize))
+ {
+ ret = write_chunk(ctx, cdata, csize, usize);
+ if (ret)
+ return ret;
+ }
+ return 0;
+}
+
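+/* Assert that every blob in @blob_list is nonempty and is marked to be written
+ * to the output WIM. */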
+static void
+validate_blob_list(struct list_head *blob_list)
+{
+ struct blob_descriptor *blob;
+
+ list_for_each_entry(blob, blob_list, write_blobs_list) {
+ wimlib_assert(blob->will_be_in_output_wim);
+ wimlib_assert(blob->size != 0);
+ }
+}
+
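+/* Return true if the blob's data is sourced from a file on disk (on Windows,
+ * possibly via the NT namespace or as an encrypted file) rather than from,
+ * e.g., a WIM resource or an in-memory buffer. */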
+static inline bool
+blob_is_in_file(const struct blob_descriptor *blob)
+{
+ return blob->blob_location == BLOB_IN_FILE_ON_DISK
+#ifdef __WIN32__
+ || blob->blob_location == BLOB_IN_WINNT_FILE_ON_DISK
+ || blob->blob_location == BLOB_WIN32_ENCRYPTED
+#endif
+ ;
+}
+
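+/* Set up the auxiliary information needed to generate "done with file"
+ * notifications: flag each blob whose data is sourced from a file, then count,
+ * per inode, how many such streams remain to be read. */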
+static void
+init_done_with_file_info(struct list_head *blob_list)
+{
+ struct blob_descriptor *blob;
+
+ list_for_each_entry(blob, blob_list, write_blobs_list) {
+ if (blob_is_in_file(blob)) {
+ blob->file_inode->i_num_remaining_streams = 0;
+ blob->may_send_done_with_file = 1;
+ } else {
+ blob->may_send_done_with_file = 0;
+ }
+ }
+
+ list_for_each_entry(blob, blob_list, write_blobs_list)
+ if (blob->may_send_done_with_file)
+ blob->file_inode->i_num_remaining_streams++;
+}
+
+/*
+ * Write a list of blobs to the output WIM file.
+ *
+ * @blob_list
+ * The list of blobs to write, specified by a list of 'struct blob_descriptor' linked
+ * by the 'write_blobs_list' member.
+ *
+ * @out_fd
+ * The file descriptor, opened for writing, to which to write the blobs.
+ *
+ * @write_resource_flags
+ * Flags to modify how the blobs are written:
+ *
+ * WRITE_RESOURCE_FLAG_RECOMPRESS:
+ * Force compression of all resources, even if they could otherwise
+ * be re-used by copying the raw data, due to being located in a WIM
+ * file with compatible compression parameters.
+ *
+ * WRITE_RESOURCE_FLAG_PIPABLE:
+ * Write the resources in the wimlib-specific pipable format, and
+ * furthermore do so in such a way that no seeking backwards in
+ * @out_fd will be performed (so it may be a pipe).
+ *
+ * WRITE_RESOURCE_FLAG_SOLID:
+ * Combine all the blobs into a single resource rather than writing
+ * them in separate resources. This flag is only valid if the WIM
+ * version number has been, or will be, set to WIM_VERSION_SOLID.
+ * This flag may not be combined with WRITE_RESOURCE_FLAG_PIPABLE.
+ *
+ * @out_ctype
+ * Compression format to use in the output resources, specified as one of
+ * the WIMLIB_COMPRESSION_TYPE_* constants. WIMLIB_COMPRESSION_TYPE_NONE
+ * is allowed.
+ *
+ * @out_chunk_size
+ * Compression chunk size to use in the output resources. It must be a
+ * valid chunk size for the specified compression format @out_ctype, unless
+ * @out_ctype is WIMLIB_COMPRESSION_TYPE_NONE, in which case this parameter
+ * is ignored.
+ *
+ * @num_threads
+ * Number of threads to use to compress data. If 0, a default number of
+ * threads will be chosen. The number of threads still may be decreased
+ * from the specified value if insufficient memory is detected.
+ *
+ * @blob_table
+ * If on-the-fly deduplication of unhashed blobs is desired, this parameter
+ *	must be a pointer to the blob table for the WIMStruct on whose behalf the
+ * blobs are being written. Otherwise, this parameter can be NULL.
+ *
+ * @filter_ctx
+ * If on-the-fly deduplication of unhashed blobs is desired, this parameter
+ * can be a pointer to a context for blob filtering used to detect whether
+ * the duplicate blob has been hard-filtered or not. If no blobs are
+ * hard-filtered or no blobs are unhashed, this parameter can be NULL.
+ *
+ * This function will write the blobs in @blob_list to resources in
+ * consecutive positions in the output WIM file, or to a single solid resource
+ * if WRITE_RESOURCE_FLAG_SOLID was specified in @write_resource_flags. In both
+ * cases, the @out_reshdr of the `struct blob_descriptor' for each blob written will be
+ * updated to specify its location, size, and flags in the output WIM. In the
+ * solid resource case, WIM_RESHDR_FLAG_SOLID will be set in the @flags field of
+ * each @out_reshdr, and furthermore @out_res_offset_in_wim and
+ * @out_res_size_in_wim of each @out_reshdr will be set to the offset and size,
+ * respectively, in the output WIM of the solid resource containing the
+ * corresponding blob.
+ *
+ * Each of the blobs to write may be in any location supported by the
+ * resource-handling code (specifically, read_blob_list()), such as the contents
+ * of an external file that has been logically added to the output WIM, or a blob
+ * in another WIM file that has been imported, or even a blob in the "same" WIM
+ * file of which a modified copy is being written. In the case that a blob is
+ * already in a WIM file and uses compatible compression parameters, by default
+ * this function will re-use the raw data instead of decompressing it, then
+ * recompressing it; however, with WRITE_RESOURCE_FLAG_RECOMPRESS
+ * specified in @write_resource_flags, this is not done.
+ *
+ * Additionally, this function requires that the
+ * @will_be_in_output_wim member be set to 1 on all blobs in @blob_list as well
+ * as any other blobs not in @blob_list that will be in the output WIM file, but
+ * set to 0 on any other blobs in the output WIM's blob table or sharing a solid
+ * resource with a blob in @blob_list.  Furthermore, if on-the-fly
+ * deduplication of blobs is possible, then all blobs in @blob_list must also be
+ * linked by @blob_table_list along with any other blobs that have
+ * @will_be_in_output_wim set.
+ *
+ * This function handles on-the-fly deduplication of blobs for which SHA-1
+ * message digests have not yet been calculated. Such blobs may or may not need
+ * to be written. If @blob_table is non-NULL, then each blob in @blob_list that
+ * has @unhashed set but not @unique_size set is checksummed immediately before
+ * it would otherwise be read for writing in order to determine if it is
+ * identical to another blob already being written or one that would be filtered
+ * out of the output WIM using blob_filtered() with the context @filter_ctx.
+ * Each such duplicate blob will be removed from @blob_list, its reference
+ * count transferred to the pre-existing duplicate blob, and its memory freed;
+ * it will not be written.  Alternatively, if a blob in @blob_list is a
+ * duplicate of any blob in @blob_table that has not been marked for writing or
+ * would not be hard-filtered, it is freed and the pre-existing duplicate is
+ * written instead,
+ * taking ownership of the reference count and slot in the @blob_table_list.
+ *
+ * Returns 0 if every blob was either written successfully or did not need to be
+ * written; otherwise returns a non-zero error code.
+ */
+static int
+write_blob_list(struct list_head *blob_list,
+ struct filedes *out_fd,
+ int write_resource_flags,
+ int out_ctype,
+ u32 out_chunk_size,
+ unsigned num_threads,
+ struct blob_table *blob_table,
+ struct filter_context *filter_ctx,
+ wimlib_progress_func_t progfunc,
+ void *progctx)
+{
+ int ret;
+ struct write_blobs_ctx ctx;
+ struct list_head raw_copy_blobs;
+ u64 num_nonraw_bytes;
+
+ wimlib_assert((write_resource_flags &
+ (WRITE_RESOURCE_FLAG_SOLID |
+ WRITE_RESOURCE_FLAG_PIPABLE)) !=
+ (WRITE_RESOURCE_FLAG_SOLID |
+ WRITE_RESOURCE_FLAG_PIPABLE));
+
+ validate_blob_list(blob_list);
+
+ if (list_empty(blob_list))
+ return 0;
+
+ /* If needed, set auxiliary information so that we can detect when the
+ * library has finished using each external file. */
+ if (unlikely(write_resource_flags & WRITE_RESOURCE_FLAG_SEND_DONE_WITH_FILE))
+ init_done_with_file_info(blob_list);
+
+ memset(&ctx, 0, sizeof(ctx));
+
+ ctx.out_fd = out_fd;
+ ctx.blob_table = blob_table;
+ ctx.out_ctype = out_ctype;
+ ctx.out_chunk_size = out_chunk_size;
+ ctx.write_resource_flags = write_resource_flags;
+ ctx.filter_ctx = filter_ctx;
+
+ /*
+ * We normally sort the blobs to write by a "sequential" order that is
+ * optimized for reading. But when using solid compression, we instead
+ * sort the blobs by file extension and file name (when applicable; and
+ * we don't do this for blobs from solid resources) so that similar
+ * files are grouped together, which improves the compression ratio.
+ * This is somewhat of a hack since a blob does not necessarily
+ * correspond one-to-one with a filename, nor is there any guarantee
+ * that two files with similar names or extensions are actually similar
+ * in content. A potential TODO is to sort the blobs based on some
+ * measure of similarity of their actual contents.
+ */
+
+ ret = sort_blob_list_by_sequential_order(blob_list,
+ offsetof(struct blob_descriptor,
+ write_blobs_list));
+ if (ret)
+ return ret;
+
+ compute_blob_list_stats(blob_list, &ctx);
+
+ if (write_resource_flags & WRITE_RESOURCE_FLAG_SOLID_SORT) {
+ ret = sort_blob_list_for_solid_compression(blob_list);
+ if (unlikely(ret))
+ WARNING("Failed to sort blobs for solid compression. Continuing anyways.");
+ }
+
+ ctx.progress_data.progfunc = progfunc;
+ ctx.progress_data.progctx = progctx;
+
+ num_nonraw_bytes = find_raw_copy_blobs(blob_list, write_resource_flags,
+ out_ctype, out_chunk_size,
+ &raw_copy_blobs);
+
+ /* Copy any compressed resources for which the raw data can be reused
+ * without decompression. */
+ ret = write_raw_copy_resources(&raw_copy_blobs, ctx.out_fd,
+ &ctx.progress_data);
+
+ if (ret || num_nonraw_bytes == 0)
+ goto out_destroy_context;
+
+ /* Unless uncompressed output was required, allocate a chunk_compressor
+ * to do compression. There are serial and parallel implementations of
+ * the chunk_compressor interface. We default to parallel using the
+	 * specified number of threads, unless the upper bound on the number of
+	 * bytes needing to be compressed is less than a heuristic value. */
+ if (out_ctype != WIMLIB_COMPRESSION_TYPE_NONE) {
+
+ #ifdef ENABLE_MULTITHREADED_COMPRESSION
+ if (num_nonraw_bytes > max(2000000, out_chunk_size)) {
+ ret = new_parallel_chunk_compressor(out_ctype,
+ out_chunk_size,
+ num_threads, 0,
+ &ctx.compressor);
+ if (ret > 0) {
+ WARNING("Couldn't create parallel chunk compressor: %"TS".\n"
+ " Falling back to single-threaded compression.",
+ wimlib_get_error_string(ret));
+ }
+ }
+ #endif
+
+ if (ctx.compressor == NULL) {
+ ret = new_serial_chunk_compressor(out_ctype, out_chunk_size,
+ &ctx.compressor);
+ if (ret)
+ goto out_destroy_context;
+ }
+ }
+
+ if (ctx.compressor)
+ ctx.progress_data.progress.write_streams.num_threads = ctx.compressor->num_threads;
+ else
+ ctx.progress_data.progress.write_streams.num_threads = 1;
+
+ INIT_LIST_HEAD(&ctx.blobs_being_compressed);
+ INIT_LIST_HEAD(&ctx.blobs_in_solid_resource);
+
+ ret = call_progress(ctx.progress_data.progfunc,
+ WIMLIB_PROGRESS_MSG_WRITE_STREAMS,
+ &ctx.progress_data.progress,
+ ctx.progress_data.progctx);
+ if (ret)
+ goto out_destroy_context;
+
+ if (write_resource_flags & WRITE_RESOURCE_FLAG_SOLID) {
+ ret = begin_write_resource(&ctx, num_nonraw_bytes);
+ if (ret)
+ goto out_destroy_context;
+ }
+
+	/* Read the list of blobs needing to be compressed, using the specified
+	 * callbacks to process the data as it is read. */
+
+ struct read_blob_callbacks cbs = {
+ .begin_blob = write_blob_begin_read,
+ .consume_chunk = write_blob_process_chunk,
+ .end_blob = write_blob_end_read,
+ .ctx = &ctx,