From 0ecf748e0db6bb4d9a02388b4ea925d8742848b1 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers3@gmail.com>
Date: Mon, 30 Dec 2013 16:58:32 -0600
Subject: [PATCH] Fix completed_streams of write streams progress and update
 docs

---
 include/wimlib.h  |  87 +++++++++++++++++-------
 programs/imagex.c |  18 +++--
 src/write.c       | 165 +++++++++++++++++++++++++++++-----------------
 3 files changed, 180 insertions(+), 90 deletions(-)

diff --git a/include/wimlib.h b/include/wimlib.h
index 119c5870..0fc5fb9f 100644
--- a/include/wimlib.h
+++ b/include/wimlib.h
@@ -491,9 +491,8 @@ enum wimlib_progress_msg {
 	 * ::wimlib_progress_info.scan. */
 	WIMLIB_PROGRESS_MSG_SCAN_END,
 
-	/**
-	 * File resources are currently being written to the WIM.
-	 * @p info will point to ::wimlib_progress_info.write_streams. */
+	/** File resources ("streams") are currently being written to the WIM.
+	 * @p info will point to ::wimlib_progress_info.write_streams.  */
 	WIMLIB_PROGRESS_MSG_WRITE_STREAMS,
 
 	/**
@@ -561,43 +560,83 @@ union wimlib_progress_info {
 	/* N.B. I wanted these to be anonymous structs, but Doxygen won't
 	 * document them if they aren't given a name... */
 
-	/** Valid on messages ::WIMLIB_PROGRESS_MSG_WRITE_STREAMS. */
+	/** Valid on the message ::WIMLIB_PROGRESS_MSG_WRITE_STREAMS.  This is
+	 * the primary message for tracking the progress of writing a WIM file.
+	 */
 	struct wimlib_progress_info_write_streams {
-		/** Number of bytes that are going to be written for all the
-		 * streams combined.  This is the amount in uncompressed data.
-		 * (The actual number of bytes will be less if the data is being
-		 * written compressed.) */
+		/** Total number of uncompressed bytes of stream data being
+		 * written.  This can be thought of as the total uncompressed
+		 * size of the files being archived, with some caveats.  WIM
+		 * files use single-instance streams, so the size provided here
+		 * only counts distinct streams, except for the following
+		 * exception: the size provided here may include the sizes of
+		 * all newly added (e.g. with wimlib_add_image()) streams,
+		 * pending automatic de-duplication during the write operation
+		 * itself.  When each such stream de-duplication occurs, this
+		 * number will be decreased by the size of the duplicate stream
+		 * that need not be written.
+		 *
+		 * In the case of a wimlib_overwrite() that the library opted to
+		 * perform in-place, both @p total_streams and @p total_bytes
+		 * will only count the streams actually being written and not
+		 * pre-existing streams in the WIM file.  */
 		uint64_t total_bytes;
 
-		/** Number of streams that are going to be written. */
+		/** Total number of streams being written.  This can be thought
+		 * of as the total number of files being archived, with some
+		 * caveats.  In general, a single file or directory may contain
+		 * multiple data streams, each of which will be represented
+		 * separately in this number.  Furthermore, WIM files use
+		 * single-instance streams, so the stream count provided here
+		 * only counts distinct streams, except for the following
+		 * exception: the stream count provided here may include newly
+		 * added (e.g. with wimlib_add_image()) streams, pending
+		 * automatic de-duplication during the write operation itself.
+		 * When each such stream de-duplication occurs, this number will
+		 * be decreased by 1 to account for the duplicate stream that
+		 * need not be written.  */
 		uint64_t total_streams;
 
-		/** Number of uncompressed bytes that have been written so far.
-		 * Will be 0 initially, and equal to @p total_bytes at the end.
-		 * */
+		/** Number of uncompressed bytes of stream data that have been
+		 * written so far.  This number will be 0 initially, and will be
+		 * equal to @p total_bytes at the end of the write operation.
+		 * Note that @p total_bytes (but not @p completed_bytes) may
+		 * decrease throughout the write operation due to the discovery
+		 * of stream duplications.  */
 		uint64_t completed_bytes;
 
-		/** Number of streams that have been written.  Will be 0
-		 * initially, and equal to @p total_streams at the end. */
+		/** Number of streams that have been written so far.  This
+		 * number will be 0 initially, and will be equal to @p
+		 * total_streams at the end of the write operation.  Note that
+		 * @p total_streams (but not @p completed_streams) may decrease
+		 * throughout the write operation due to the discovery of stream
+		 * duplications.
+		 *
+		 * For applications that wish to calculate a simple "percent
+		 * complete" for the write operation, it will likely be more
+		 * accurate to calculate the percentage from @p completed_bytes
+		 * and @p total_bytes rather than @p completed_streams and
+		 * @p total_streams because the time for the operation to
+		 * complete is mainly determined by the number of bytes that
+		 * need to be read, compressed, and written, not just the number
+		 * of files being archived.  */
 		uint64_t completed_streams;
 
-		/** Number of threads that are being used to compress resources
-		 * (if applicable).  */
-		unsigned num_threads;
+		/** Number of threads that are being used to compress streams,
+		 * or 1 if streams are being written uncompressed.  */
+		uint32_t num_threads;
 
-		/** The compression type being used to write the streams; either
-		 * ::WIMLIB_COMPRESSION_TYPE_NONE,
-		 * ::WIMLIB_COMPRESSION_TYPE_XPRESS, or
-		 * ::WIMLIB_COMPRESSION_TYPE_LZX. */
-		int	 compression_type;
+		/** The compression type being used to write the streams, as one
+		 * of the ::wimlib_compression_type constants.  */
+		int32_t	 compression_type;
 
 		/** Number of split WIM parts from which streams are being
 		 * written (may be 0 if irrelevant).  */
-		unsigned total_parts;
+		uint32_t total_parts;
 
 		/** Number of split WIM parts from which streams have been
 		 * written (may be 0 if irrelevant).  */
-		unsigned completed_parts;
+		uint32_t completed_parts;
 	} write_streams;
 
 	/** Valid on messages ::WIMLIB_PROGRESS_MSG_SCAN_BEGIN,
diff --git a/programs/imagex.c b/programs/imagex.c
index 39ed2742..9569a255 100644
--- a/programs/imagex.c
+++ b/programs/imagex.c
@@ -1101,16 +1101,22 @@ imagex_progress_func(enum wimlib_progress_msg msg,
 		return 0;
 	switch (msg) {
 	case WIMLIB_PROGRESS_MSG_WRITE_STREAMS:
+		{
+			static bool first = false;
+			if (!first) {
+				imagex_printf(T("Writing %"TS"-compressed data "
+						"using %u thread%"TS"\n"),
+					      wimlib_get_compression_type_string(
+							info->write_streams.compression_type),
+					info->write_streams.num_threads,
+					(info->write_streams.num_threads == 1) ? T("") : T("s"));
+				first = true;
+			}
+		}
 		unit_shift = get_unit(info->write_streams.total_bytes, &unit_name);
 		percent_done = TO_PERCENT(info->write_streams.completed_bytes,
 					  info->write_streams.total_bytes);
 
-		if (info->write_streams.completed_streams == 0) {
-			imagex_printf(T("Writing %"TS"-compressed data using %u thread%"TS"\n"),
-				wimlib_get_compression_type_string(info->write_streams.compression_type),
-				info->write_streams.num_threads,
-				(info->write_streams.num_threads == 1) ? T("") : T("s"));
-		}
 		if (info->write_streams.total_parts <= 1) {
 			imagex_printf(T("\r%"PRIu64" %"TS" of %"PRIu64" %"TS" (uncompressed) "
 				"written (%u%% done)"),
diff --git a/src/write.c b/src/write.c
index bcf282ba..7e597e18 100644
--- a/src/write.c
+++ b/src/write.c
@@ -282,23 +282,27 @@ struct write_streams_progress_data {
 
 static void
 do_write_streams_progress(struct write_streams_progress_data *progress_data,
-			  u64 size,
-			  bool discarded,
-			  struct wim_lookup_table_entry *cur_stream)
+			  struct wim_lookup_table_entry *cur_stream,
+			  u64 complete_size,
+			  u32 complete_count,
+			  bool discarded)
 {
 	union wimlib_progress_info *progress = &progress_data->progress;
 	bool new_wim_part;
 
 	if (discarded) {
-		progress->write_streams.total_bytes -= size;
+		progress->write_streams.total_bytes -= complete_size;
+		progress->write_streams.total_streams -= complete_count;
 		if (progress_data->next_progress != ~(uint64_t)0 &&
 		    progress_data->next_progress > progress->write_streams.total_bytes)
 		{
 			progress_data->next_progress = progress->write_streams.total_bytes;
 		}
 	} else {
-		progress->write_streams.completed_bytes += size;
+		progress->write_streams.completed_bytes += complete_size;
+		progress->write_streams.completed_streams += complete_count;
 	}
+
 	new_wim_part = false;
 	if (cur_stream->resource_location == RESOURCE_IN_WIM &&
 	    cur_stream->rspec->wim != progress_data->prev_wim_part)
@@ -309,7 +313,7 @@ do_write_streams_progress(struct write_streams_progress_data *progress_data,
 		}
 		progress_data->prev_wim_part = cur_stream->rspec->wim;
 	}
-	progress->write_streams.completed_streams++;
+
 	if (progress_data->progress_func
 	    && (progress->write_streams.completed_bytes >= progress_data->next_progress
 		|| new_wim_part))
@@ -368,20 +372,24 @@ struct write_streams_ctx {
 	/* List of streams that currently have chunks being compressed.  */
 	struct list_head pending_streams;
 
+	/* List of streams in the resource pack.  Streams are moved here from
+	 * @pending_streams only when writing a packed resource.  */
+	struct list_head pack_streams;
+
 	/* Set to true if the stream currently being read was a duplicate, and
 	 * therefore the corresponding stream entry needs to be freed once the
 	 * read finishes.  (In this case we add the duplicate entry to
 	 * pending_streams rather than the entry being read.)  */
 	bool stream_was_duplicate;
 
-	/* Current uncompressed offset in the resource being read.  */
-	u64 cur_read_res_offset;
+	/* Current uncompressed offset in the stream being read.  */
+	u64 cur_read_stream_offset;
 
-	/* Uncompressed size of the resource currently being read.  */
-	u64 cur_read_res_size;
+	/* Uncompressed size of the stream currently being read.  */
+	u64 cur_read_stream_size;
 
-	/* Current uncompressed offset in the resource being written.  */
-	u64 cur_write_res_offset;
+	/* Current uncompressed offset in the stream being written.  */
+	u64 cur_write_stream_offset;
 
 	/* Uncompressed size of resource currently being written.  */
 	u64 cur_write_res_size;
@@ -489,7 +497,7 @@ begin_write_resource(struct write_streams_ctx *ctx, u64 res_expected_size)
 	/* Output file descriptor is now positioned at the offset at which to
 	 * write the first chunk of the resource.  */
 	ctx->chunks_start_offset = ctx->out_fd->offset;
-	ctx->cur_write_res_offset = 0;
+	ctx->cur_write_stream_offset = 0;
 	ctx->cur_write_res_size = res_expected_size;
 	return 0;
 }
@@ -607,7 +615,8 @@ end_write_resource(struct write_streams_ctx *ctx, struct wim_reshdr *out_reshdr)
 	u64 res_uncompressed_size;
 	u64 res_offset_in_wim;
 
-	wimlib_assert(ctx->cur_write_res_size == ctx->cur_write_res_offset);
+	wimlib_assert(ctx->cur_write_stream_offset == ctx->cur_write_res_size ||
+		      (ctx->write_resource_flags & WRITE_RESOURCE_FLAG_PACK_STREAMS));
 	res_uncompressed_size = ctx->cur_write_res_size;
 
 	if (ctx->compressor) {
@@ -637,8 +646,8 @@ write_stream_begin_read(struct wim_lookup_table_entry *lte,
 
 	wimlib_assert(lte->size > 0);
 
-	ctx->cur_read_res_offset = 0;
-	ctx->cur_read_res_size = lte->size;
+	ctx->cur_read_stream_offset = 0;
+	ctx->cur_read_stream_size = lte->size;
 
 	/* As an optimization, we allow some streams to be "unhashed", meaning
 	 * their SHA1 message digests are unknown.  This is the case with
@@ -676,7 +685,8 @@ write_stream_begin_read(struct wim_lookup_table_entry *lte,
 				DEBUG("Discarding duplicate stream of "
 				      "length %"PRIu64, lte->size);
 				do_write_streams_progress(&ctx->progress_data,
-							  lte->size, true, lte);
+							  lte, lte->size,
+							  1, true);
 				list_del(&lte->write_streams_list);
 				list_del(&lte->lookup_table_list);
 				if (lte_new->will_be_in_output_wim)
@@ -766,11 +776,13 @@ write_chunk(struct write_streams_ctx *ctx, const void *cchunk,
 	int ret;
 
 	struct wim_lookup_table_entry *lte;
+	u32 completed_stream_count;
+	u32 completed_size;
 
 	lte = list_entry(ctx->pending_streams.next,
 			 struct wim_lookup_table_entry, write_streams_list);
 
-	if (ctx->cur_write_res_offset == 0 &&
+	if (ctx->cur_write_stream_offset == 0 &&
 	    !(ctx->write_resource_flags & WRITE_RESOURCE_FLAG_PACK_STREAMS))
 	{
 		/* Starting to write a new stream in non-packed mode.  */
@@ -817,52 +829,83 @@ write_chunk(struct write_streams_ctx *ctx, const void *cchunk,
 	if (ret)
 		goto error;
 
-	ctx->cur_write_res_offset += usize;
+	ctx->cur_write_stream_offset += usize;
 
-	do_write_streams_progress(&ctx->progress_data,
-				  usize, false, lte);
+	completed_size = usize;
+	completed_stream_count = 0;
+	if (ctx->write_resource_flags & WRITE_RESOURCE_FLAG_PACK_STREAMS) {
+		/* Wrote chunk in packed mode.  It may have finished multiple
+		 * streams.  */
+		while (ctx->cur_write_stream_offset > lte->size) {
+			struct wim_lookup_table_entry *next;
 
-	if (ctx->cur_write_res_offset == ctx->cur_write_res_size &&
-	    !(ctx->write_resource_flags & WRITE_RESOURCE_FLAG_PACK_STREAMS))
-	{
-		wimlib_assert(ctx->cur_write_res_offset == lte->size);
+			ctx->cur_write_stream_offset -= lte->size;
 
-		/* Finished writing a stream in non-packed mode.  */
+			wimlib_assert(!list_is_singular(&ctx->pending_streams) &&
+				      !list_empty(&ctx->pending_streams));
+			next = list_entry(lte->write_streams_list.next,
+					  struct wim_lookup_table_entry,
+					  write_streams_list);
+			list_move_tail(&lte->write_streams_list,
+				       &ctx->pack_streams);
+			lte = next;
+			completed_stream_count++;
+		}
+		if (ctx->cur_write_stream_offset == lte->size) {
+			ctx->cur_write_stream_offset = 0;
+			list_move_tail(&lte->write_streams_list,
+				       &ctx->pack_streams);
+			completed_stream_count++;
+		}
+	} else {
+		/* Wrote chunk in non-packed mode.  It may have finished a
+		 * stream.  */
+		if (ctx->cur_write_stream_offset == lte->size) {
 
-		ret = end_write_resource(ctx, &lte->out_reshdr);
-		if (ret)
-			return ret;
+			completed_stream_count++;
 
-		lte->out_reshdr.flags = filter_resource_flags(lte->flags);
-		if (ctx->compressor != NULL)
-			lte->out_reshdr.flags |= WIM_RESHDR_FLAG_COMPRESSED;
+			list_del(&lte->write_streams_list);
 
-		if (ctx->compressor != NULL &&
-		    lte->out_reshdr.size_in_wim >= lte->out_reshdr.uncompressed_size &&
-		    !(ctx->write_resource_flags & WRITE_RESOURCE_FLAG_PIPABLE) &&
-		    !(lte->flags & WIM_RESHDR_FLAG_PACKED_STREAMS))
-		{
-			/* Stream did not compress to less than its original
-			 * size.  If we're not writing a pipable WIM (which
-			 * could mean the output file descriptor is
-			 * non-seekable), and the stream isn't located in a
-			 * resource pack (which would make reading it again
-			 * costly), truncate the file to the start of the stream
-			 * and write it uncompressed instead.  */
-			DEBUG("Stream of size %"PRIu64" did not compress to "
-			      "less than original size; writing uncompressed.",
-			      lte->size);
-			ret = write_stream_uncompressed(lte, ctx->out_fd);
+			wimlib_assert(ctx->cur_write_stream_offset ==
+				      ctx->cur_write_res_size);
+
+			ret = end_write_resource(ctx, &lte->out_reshdr);
 			if (ret)
 				return ret;
-		}
 
-		wimlib_assert(lte->out_reshdr.uncompressed_size == lte->size);
+			lte->out_reshdr.flags = filter_resource_flags(lte->flags);
+			if (ctx->compressor != NULL)
+				lte->out_reshdr.flags |= WIM_RESHDR_FLAG_COMPRESSED;
 
-		list_del(&lte->write_streams_list);
-		ctx->cur_write_res_offset = 0;
+			if (ctx->compressor != NULL &&
+			    lte->out_reshdr.size_in_wim >= lte->out_reshdr.uncompressed_size &&
+			    !(ctx->write_resource_flags & WRITE_RESOURCE_FLAG_PIPABLE) &&
+			    !(lte->flags & WIM_RESHDR_FLAG_PACKED_STREAMS))
+			{
+				/* Stream did not compress to less than its original
+				 * size.  If we're not writing a pipable WIM (which
+				 * could mean the output file descriptor is
+				 * non-seekable), and the stream isn't located in a
+				 * resource pack (which would make reading it again
+				 * costly), truncate the file to the start of the stream
+				 * and write it uncompressed instead.  */
+				DEBUG("Stream of size %"PRIu64" did not compress to "
+				      "less than original size; writing uncompressed.",
+				      lte->size);
+				ret = write_stream_uncompressed(lte, ctx->out_fd);
+				if (ret)
+					return ret;
+			}
+			wimlib_assert(lte->out_reshdr.uncompressed_size == lte->size);
+
+			ctx->cur_write_stream_offset = 0;
+		}
 	}
 
+	do_write_streams_progress(&ctx->progress_data, lte,
+				  completed_size, completed_stream_count,
+				  false);
+
 	return 0;
 
 error:
@@ -911,7 +954,7 @@ write_stream_process_chunk(const void *chunk, size_t size, void *_ctx)
 		 ret = write_chunk(ctx, chunk, size, size);
 		 if (ret)
 			 return ret;
-		 ctx->cur_read_res_offset += size;
+		 ctx->cur_read_stream_offset += size;
 		 return 0;
 	}
 
@@ -929,8 +972,8 @@ write_stream_process_chunk(const void *chunk, size_t size, void *_ctx)
 		} else {
 			u64 res_bytes_remaining;
 
-			res_bytes_remaining = ctx->cur_read_res_size -
-					      ctx->cur_read_res_offset;
+			res_bytes_remaining = ctx->cur_read_stream_size -
+					      ctx->cur_read_stream_offset;
 			needed_chunk_size = min(ctx->out_chunk_size,
 						ctx->chunk_buf_filled +
 							res_bytes_remaining);
@@ -942,7 +985,7 @@ write_stream_process_chunk(const void *chunk, size_t size, void *_ctx)
 			/* No intermediate buffering needed.  */
 			resized_chunk = chunkptr;
 			chunkptr += needed_chunk_size;
-			ctx->cur_read_res_offset += needed_chunk_size;
+			ctx->cur_read_stream_offset += needed_chunk_size;
 		} else {
 			/* Intermediate buffering needed.  */
 			size_t bytes_consumed;
@@ -954,7 +997,7 @@ write_stream_process_chunk(const void *chunk, size_t size, void *_ctx)
 			       chunkptr, bytes_consumed);
 
 			chunkptr += bytes_consumed;
-			ctx->cur_read_res_offset += bytes_consumed;
+			ctx->cur_read_stream_offset += bytes_consumed;
 			ctx->chunk_buf_filled += bytes_consumed;
 			if (ctx->chunk_buf_filled == needed_chunk_size) {
 				resized_chunk = ctx->chunk_buf;
@@ -982,7 +1025,7 @@ write_stream_end_read(struct wim_lookup_table_entry *lte, int status, void *_ctx
 {
 	struct write_streams_ctx *ctx = _ctx;
 	if (status == 0)
-		wimlib_assert(ctx->cur_read_res_offset == ctx->cur_read_res_size);
+		wimlib_assert(ctx->cur_read_stream_offset == ctx->cur_read_stream_size);
 	if (ctx->stream_was_duplicate) {
 		free_lookup_table_entry(lte);
 	} else if (lte->unhashed && ctx->lookup_table != NULL) {
@@ -1144,7 +1187,8 @@ write_raw_copy_resources(struct list_head *raw_copy_resources,
 		ret = write_raw_copy_resource(lte->rspec, out_fd);
 		if (ret)
 			return ret;
-		do_write_streams_progress(progress_data, lte->size, false, lte);
+		do_write_streams_progress(progress_data, lte, lte->size,
+					  1, false);
 	}
 	return 0;
 }
@@ -1427,6 +1471,7 @@ write_stream_list(struct list_head *stream_list,
 	      ctx.progress_data.progress.write_streams.num_threads);
 
 	INIT_LIST_HEAD(&ctx.pending_streams);
+	INIT_LIST_HEAD(&ctx.pack_streams);
 
 	if (ctx.progress_data.progress_func) {
 		(*ctx.progress_data.progress_func)(WIMLIB_PROGRESS_MSG_WRITE_STREAMS,
@@ -1480,7 +1525,7 @@ write_stream_list(struct list_head *stream_list,
 		      reshdr.uncompressed_size);
 
 		offset_in_res = 0;
-		list_for_each_entry(lte, &ctx.pending_streams, write_streams_list) {
+		list_for_each_entry(lte, &ctx.pack_streams, write_streams_list) {
 			lte->out_reshdr.size_in_wim = lte->size;
 			lte->out_reshdr.flags = filter_resource_flags(lte->flags);
 			lte->out_reshdr.flags |= WIM_RESHDR_FLAG_PACKED_STREAMS;
-- 
2.43.0