/*
 * compress_parallel.c
 *
 * Compress chunks of data (parallel version).
 */
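
/*
 * Overview: the main thread fills "messages" (batches of up to
 * MAX_CHUNKS_PER_MSG chunks) and places them on chunks_to_compress_queue.  A
 * pool of worker threads takes messages from that queue, compresses each
 * chunk, and places the finished messages on compressed_chunks_queue, from
 * which the main thread retrieves the results in submission order.
 */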

/*
 * Copyright (C) 2013-2023 Eric Biggers
 *
 * This file is free software; you can redistribute it and/or modify it under
 * the terms of the GNU Lesser General Public License as published by the Free
 * Software Foundation; either version 3 of the License, or (at your option) any
 * later version.
 *
 * This file is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
 * details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this file; if not, see https://www.gnu.org/licenses/.
 */

#ifdef HAVE_CONFIG_H
#  include "config.h"
#endif

#include <errno.h>
#include <stdlib.h>
#include <string.h>

#include "wimlib/assert.h"
#include "wimlib/chunk_compressor.h"
#include "wimlib/error.h"
#include "wimlib/list.h"
#include "wimlib/threads.h"
#include "wimlib/util.h"

struct message_queue {
        struct list_head list;
        struct mutex lock;
        struct condvar msg_avail_cond;
        struct condvar space_avail_cond;
        bool terminating;
};

struct compressor_thread_data {
        struct thread thread;
        struct message_queue *chunks_to_compress_queue;
        struct message_queue *compressed_chunks_queue;
        struct wimlib_compressor *compressor;
};

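/* Maximum number of chunks that a single message can carry.  Batching several
 * chunks per message reduces how often the threads must take the queue locks. */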
#define MAX_CHUNKS_PER_MSG 16

struct message {
        u8 *uncompressed_chunks[MAX_CHUNKS_PER_MSG];
        u8 *compressed_chunks[MAX_CHUNKS_PER_MSG];
        u32 uncompressed_chunk_sizes[MAX_CHUNKS_PER_MSG];
        u32 compressed_chunk_sizes[MAX_CHUNKS_PER_MSG];
        size_t num_filled_chunks;
        size_t num_alloc_chunks;
        struct list_head list;
        bool complete;
        struct list_head submission_list;
};

struct parallel_chunk_compressor {
        struct chunk_compressor base;

        struct message_queue chunks_to_compress_queue;
        struct message_queue compressed_chunks_queue;
        struct compressor_thread_data *thread_data;
        unsigned num_thread_data;
        unsigned num_started_threads;

        struct message *msgs;
        size_t num_messages;

        struct list_head available_msgs;
        struct list_head submitted_msgs;
        struct message *next_submit_msg;
        struct message *next_ready_msg;
        size_t next_chunk_idx;
};

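/*
 * Example usage --- an illustrative sketch, not code from wimlib itself.  It
 * shows how a caller might drive the chunk_compressor interface built below.
 * has_more_input(), read_next_chunk(), and write_compressed_chunk() are
 * hypothetical helpers standing in for the caller's I/O, and error handling
 * is omitted.
 *
 *	struct chunk_compressor *c;
 *	const void *cdata;
 *	u32 csize, usize;
 *
 *	if (new_parallel_chunk_compressor(WIMLIB_COMPRESSION_TYPE_LZX,
 *					  32768, 0, 0, &c) != 0)
 *		return;		// fall back to a serial compressor
 *
 *	while (has_more_input()) {
 *		void *buf = c->get_chunk_buffer(c);
 *		if (buf == NULL) {
 *			// All buffers in flight; free one by consuming a result.
 *			c->get_compression_result(c, &cdata, &csize, &usize);
 *			write_compressed_chunk(cdata, csize, usize);
 *			continue;
 *		}
 *		c->signal_chunk_filled(c, read_next_chunk(buf, c->out_chunk_size));
 *	}
 *
 *	// Drain the rest; this also submits any partially filled message.
 *	while (c->get_compression_result(c, &cdata, &csize, &usize))
 *		write_compressed_chunk(cdata, csize, usize);
 *
 *	c->destroy(c);
 */
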
static int
message_queue_init(struct message_queue *q)
{
        if (!mutex_init(&q->lock))
                goto err;
        if (!condvar_init(&q->msg_avail_cond))
                goto err_destroy_lock;
        if (!condvar_init(&q->space_avail_cond))
                goto err_destroy_msg_avail_cond;
        INIT_LIST_HEAD(&q->list);
        return 0;

err_destroy_msg_avail_cond:
        condvar_destroy(&q->msg_avail_cond);
err_destroy_lock:
        mutex_destroy(&q->lock);
err:
        return WIMLIB_ERR_NOMEM;
}

static void
message_queue_destroy(struct message_queue *q)
{
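        /* The queue was initialized only if INIT_LIST_HEAD() ran in
         * message_queue_init(); the containing structure was zeroed by
         * CALLOC(), so a NULL list head means "never initialized". */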
112         if (q->list.next != NULL) {
113                 mutex_destroy(&q->lock);
114                 condvar_destroy(&q->msg_avail_cond);
115                 condvar_destroy(&q->space_avail_cond);
116         }
117 }
118
119 static void
120 message_queue_put(struct message_queue *q, struct message *msg)
121 {
122         mutex_lock(&q->lock);
123         list_add_tail(&msg->list, &q->list);
124         condvar_signal(&q->msg_avail_cond);
125         mutex_unlock(&q->lock);
126 }
127
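/* Remove and return the oldest message in the queue, blocking until one is
 * available.  Returns NULL if the queue is being terminated, which tells the
 * worker threads to exit. */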
static struct message *
message_queue_get(struct message_queue *q)
{
        struct message *msg;

        mutex_lock(&q->lock);
        while (list_empty(&q->list) && !q->terminating)
                condvar_wait(&q->msg_avail_cond, &q->lock);
        if (!q->terminating) {
                msg = list_entry(q->list.next, struct message, list);
                list_del(&msg->list);
        } else
                msg = NULL;
        mutex_unlock(&q->lock);
        return msg;
}

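/* Mark the queue as terminating and wake all threads blocked in
 * message_queue_get(). */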
static void
message_queue_terminate(struct message_queue *q)
{
        mutex_lock(&q->lock);
        q->terminating = true;
        condvar_broadcast(&q->msg_avail_cond);
        mutex_unlock(&q->lock);
}

static int
init_message(struct message *msg, size_t num_chunks, u32 out_chunk_size)
{
        msg->num_alloc_chunks = num_chunks;
        for (size_t i = 0; i < num_chunks; i++) {
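                /* The compressed buffer is 1 byte smaller than the
                 * uncompressed one: a compression result is only kept if it is
                 * strictly smaller than the original chunk. */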
                msg->compressed_chunks[i] = MALLOC(out_chunk_size - 1);
                msg->uncompressed_chunks[i] = MALLOC(out_chunk_size);
                if (msg->compressed_chunks[i] == NULL ||
                    msg->uncompressed_chunks[i] == NULL)
                        return WIMLIB_ERR_NOMEM;
        }
        return 0;
}

static void
destroy_message(struct message *msg)
{
        for (size_t i = 0; i < msg->num_alloc_chunks; i++) {
                FREE(msg->compressed_chunks[i]);
                FREE(msg->uncompressed_chunks[i]);
        }
}

static void
free_messages(struct message *msgs, size_t num_messages)
{
        if (msgs) {
                for (size_t i = 0; i < num_messages; i++)
                        destroy_message(&msgs[i]);
                FREE(msgs);
        }
}

static struct message *
allocate_messages(size_t count, size_t chunks_per_msg, u32 out_chunk_size)
{
        struct message *msgs;

        msgs = CALLOC(count, sizeof(struct message));
        if (msgs == NULL)
                return NULL;
        for (size_t i = 0; i < count; i++) {
                if (init_message(&msgs[i], chunks_per_msg, out_chunk_size)) {
                        free_messages(msgs, count);
                        return NULL;
                }
        }
        return msgs;
}

static void
compress_chunks(struct message *msg, struct wimlib_compressor *compressor)
{
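        /* wimlib_compress() returns the compressed size, or 0 if the data
         * could not be compressed to the given buffer size.  Passing
         * usize - 1 as the output buffer size means a chunk counts as
         * compressed only if the result is strictly smaller than the input;
         * otherwise get_compression_result() falls back to the uncompressed
         * data. */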
        for (size_t i = 0; i < msg->num_filled_chunks; i++) {
                wimlib_assert(msg->uncompressed_chunk_sizes[i] != 0);
                msg->compressed_chunk_sizes[i] =
                        wimlib_compress(msg->uncompressed_chunks[i],
                                        msg->uncompressed_chunk_sizes[i],
                                        msg->compressed_chunks[i],
                                        msg->uncompressed_chunk_sizes[i] - 1,
                                        compressor);
        }
}

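/* Each worker thread repeatedly takes a message from the "to compress" queue,
 * compresses its chunks, and puts it on the "compressed" queue, until the
 * queue is terminated. */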
static void *
compressor_thread_proc(void *arg)
{
        struct compressor_thread_data *params = arg;
        struct message *msg;

        while ((msg = message_queue_get(params->chunks_to_compress_queue)) != NULL) {
                compress_chunks(msg, params->compressor);
                message_queue_put(params->compressed_chunks_queue, msg);
        }
        return NULL;
}

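/* Tear down the compressor.  This is safe to call on a partially constructed
 * context (as done on the error paths of new_parallel_chunk_compressor()),
 * since the context was zeroed by CALLOC(). */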
static void
parallel_chunk_compressor_destroy(struct chunk_compressor *_ctx)
{
        struct parallel_chunk_compressor *ctx = (struct parallel_chunk_compressor *)_ctx;
        unsigned i;

        if (ctx == NULL)
                return;

        if (ctx->num_started_threads != 0) {
                message_queue_terminate(&ctx->chunks_to_compress_queue);

                for (i = 0; i < ctx->num_started_threads; i++)
                        thread_join(&ctx->thread_data[i].thread);
        }

        message_queue_destroy(&ctx->chunks_to_compress_queue);
        message_queue_destroy(&ctx->compressed_chunks_queue);

        if (ctx->thread_data != NULL)
                for (i = 0; i < ctx->num_thread_data; i++)
                        wimlib_free_compressor(ctx->thread_data[i].compressor);

        FREE(ctx->thread_data);

        free_messages(ctx->msgs, ctx->num_messages);

        FREE(ctx);
}

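/* Send the message currently being filled to the worker threads, and remember
 * it on submitted_msgs so its results can later be retrieved in order. */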
static void
submit_compression_msg(struct parallel_chunk_compressor *ctx)
{
        struct message *msg = ctx->next_submit_msg;

        msg->complete = false;
        list_add_tail(&msg->submission_list, &ctx->submitted_msgs);
        message_queue_put(&ctx->chunks_to_compress_queue, msg);
        ctx->next_submit_msg = NULL;
}

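/* Return a buffer into which the caller may write the next uncompressed
 * chunk, or NULL if all buffers are in flight (in which case the caller must
 * retrieve some compression results first). */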
static void *
parallel_chunk_compressor_get_chunk_buffer(struct chunk_compressor *_ctx)
{
        struct parallel_chunk_compressor *ctx = (struct parallel_chunk_compressor *)_ctx;
        struct message *msg;

        if (ctx->next_submit_msg) {
                msg = ctx->next_submit_msg;
        } else {
                if (list_empty(&ctx->available_msgs))
                        return NULL;

                msg = list_entry(ctx->available_msgs.next, struct message, list);
                list_del(&msg->list);
                ctx->next_submit_msg = msg;
                msg->num_filled_chunks = 0;
        }

        return msg->uncompressed_chunks[msg->num_filled_chunks];
}

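/* Record the size of the chunk just written into the buffer returned by
 * get_chunk_buffer(), and submit the current message once it is full. */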
static void
parallel_chunk_compressor_signal_chunk_filled(struct chunk_compressor *_ctx, u32 usize)
{
        struct parallel_chunk_compressor *ctx = (struct parallel_chunk_compressor *)_ctx;
        struct message *msg;

        wimlib_assert(usize > 0);
        wimlib_assert(usize <= ctx->base.out_chunk_size);
        wimlib_assert(ctx->next_submit_msg);

        msg = ctx->next_submit_msg;
        msg->uncompressed_chunk_sizes[msg->num_filled_chunks] = usize;
        if (++msg->num_filled_chunks == msg->num_alloc_chunks)
                submit_compression_msg(ctx);
}

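/* Retrieve the next chunk's compression result, in the order the chunks were
 * submitted.  Returns false if no chunks are pending.  Any partially filled
 * message is submitted first, so no chunk is ever left behind. */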
static bool
parallel_chunk_compressor_get_compression_result(struct chunk_compressor *_ctx,
                                                 const void **cdata_ret, u32 *csize_ret,
                                                 u32 *usize_ret)
{
        struct parallel_chunk_compressor *ctx = (struct parallel_chunk_compressor *)_ctx;
        struct message *msg;

        if (ctx->next_submit_msg)
                submit_compression_msg(ctx);

        if (ctx->next_ready_msg) {
                msg = ctx->next_ready_msg;
        } else {
                if (list_empty(&ctx->submitted_msgs))
                        return false;

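                /* Mark messages complete as they come off the compressed
                 * queue until the oldest submitted message is complete; this
                 * enforces in-order retrieval even though the worker threads
                 * may finish out of order. */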
                while (!(msg = list_entry(ctx->submitted_msgs.next,
                                          struct message,
                                          submission_list))->complete)
                        message_queue_get(&ctx->compressed_chunks_queue)->complete = true;

                ctx->next_ready_msg = msg;
                ctx->next_chunk_idx = 0;
        }

        if (msg->compressed_chunk_sizes[ctx->next_chunk_idx]) {
                *cdata_ret = msg->compressed_chunks[ctx->next_chunk_idx];
                *csize_ret = msg->compressed_chunk_sizes[ctx->next_chunk_idx];
        } else {
                *cdata_ret = msg->uncompressed_chunks[ctx->next_chunk_idx];
                *csize_ret = msg->uncompressed_chunk_sizes[ctx->next_chunk_idx];
        }
        *usize_ret = msg->uncompressed_chunk_sizes[ctx->next_chunk_idx];

        if (++ctx->next_chunk_idx == msg->num_filled_chunks) {
                list_del(&msg->submission_list);
                list_add_tail(&msg->list, &ctx->available_msgs);
                ctx->next_ready_msg = NULL;
        }
        return true;
}

int
new_parallel_chunk_compressor(int out_ctype, u32 out_chunk_size,
                              unsigned num_threads, u64 max_memory,
                              struct chunk_compressor **compressor_ret)
{
        u64 approx_mem_required;
        size_t chunks_per_msg;
        size_t msgs_per_thread;
        struct parallel_chunk_compressor *ctx;
        unsigned i;
        int ret;
        unsigned desired_num_threads;

        wimlib_assert(out_chunk_size > 0);

        if (num_threads == 0)
                num_threads = get_available_cpus();

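        /* A negative return value (here and below) tells the caller that
         * parallel compression is not worthwhile in this configuration and
         * that it should fall back to a serial chunk compressor; it is not an
         * error. */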
        if (num_threads == 1)
                return -1;

        if (max_memory == 0)
                max_memory = get_available_memory();

        desired_num_threads = num_threads;

        if (out_chunk_size < ((u32)1 << 23)) {
                /* Relatively small chunks.  Use 2 messages per thread, each
                 * with at least 2 chunks.  Use more chunks per message if there
                 * are lots of threads and/or the chunks are very small.  */
                chunks_per_msg = 2;
                chunks_per_msg += num_threads * (65536 / out_chunk_size) / 16;
                chunks_per_msg = max(chunks_per_msg, 2);
                chunks_per_msg = min(chunks_per_msg, MAX_CHUNKS_PER_MSG);
                msgs_per_thread = 2;
        } else {
                /* Big chunks: Just have one buffer per thread --- more would
                 * just waste memory.  */
                chunks_per_msg = 1;
                msgs_per_thread = 1;
        }
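        /* Estimate the memory needed for the chosen parameters; if it exceeds
         * max_memory, scale back --- first fewer chunks per message, then
         * fewer messages per thread, then fewer threads. */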
        for (;;) {
                approx_mem_required =
                        (u64)chunks_per_msg *
                        (u64)msgs_per_thread *
                        (u64)num_threads *
                        (u64)out_chunk_size
                        + out_chunk_size
                        + 1000000
                        + num_threads * wimlib_get_compressor_needed_memory(out_ctype,
                                                                            out_chunk_size,
                                                                            0);
                if (approx_mem_required <= max_memory)
                        break;

                if (chunks_per_msg > 1)
                        chunks_per_msg--;
                else if (msgs_per_thread > 1)
                        msgs_per_thread--;
                else if (num_threads > 1)
                        num_threads--;
                else
                        break;
        }

        if (num_threads < desired_num_threads) {
                WARNING("Wanted to use %u threads, but limiting to %u "
                        "to fit in available memory!",
                        desired_num_threads, num_threads);
        }

        if (num_threads == 1)
                return -2;

        ret = WIMLIB_ERR_NOMEM;
        ctx = CALLOC(1, sizeof(*ctx));
        if (ctx == NULL)
                goto err;

        ctx->base.out_ctype = out_ctype;
        ctx->base.out_chunk_size = out_chunk_size;
        ctx->base.destroy = parallel_chunk_compressor_destroy;
        ctx->base.get_chunk_buffer = parallel_chunk_compressor_get_chunk_buffer;
        ctx->base.signal_chunk_filled = parallel_chunk_compressor_signal_chunk_filled;
        ctx->base.get_compression_result = parallel_chunk_compressor_get_compression_result;

        ctx->num_thread_data = num_threads;

        ret = message_queue_init(&ctx->chunks_to_compress_queue);
        if (ret)
                goto err;

        ret = message_queue_init(&ctx->compressed_chunks_queue);
        if (ret)
                goto err;

        ret = WIMLIB_ERR_NOMEM;
        ctx->thread_data = CALLOC(num_threads, sizeof(ctx->thread_data[0]));
        if (ctx->thread_data == NULL)
                goto err;

        for (i = 0; i < num_threads; i++) {
                struct compressor_thread_data *dat;

                dat = &ctx->thread_data[i];

                dat->chunks_to_compress_queue = &ctx->chunks_to_compress_queue;
                dat->compressed_chunks_queue = &ctx->compressed_chunks_queue;
                ret = wimlib_create_compressor(out_ctype, out_chunk_size,
                                               WIMLIB_COMPRESSOR_FLAG_DESTRUCTIVE,
                                               &dat->compressor);
                if (ret)
                        goto err;
        }

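        /* Start the worker threads.  If at least two threads were started,
         * tolerate failure to start the rest and proceed with what we have;
         * otherwise parallel compression is pointless, so fail. */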
        for (ctx->num_started_threads = 0;
             ctx->num_started_threads < num_threads;
             ctx->num_started_threads++)
        {
                if (!thread_create(&ctx->thread_data[ctx->num_started_threads].thread,
                                   compressor_thread_proc,
                                   &ctx->thread_data[ctx->num_started_threads]))
                {
                        ret = WIMLIB_ERR_NOMEM;
                        if (ctx->num_started_threads >= 2)
                                break;
                        goto err;
                }
        }

        ctx->base.num_threads = ctx->num_started_threads;

        ret = WIMLIB_ERR_NOMEM;
        ctx->num_messages = ctx->num_started_threads * msgs_per_thread;
        ctx->msgs = allocate_messages(ctx->num_messages,
                                      chunks_per_msg, out_chunk_size);
        if (ctx->msgs == NULL)
                goto err;

        INIT_LIST_HEAD(&ctx->available_msgs);
        for (size_t i = 0; i < ctx->num_messages; i++)
                list_add_tail(&ctx->msgs[i].list, &ctx->available_msgs);

        INIT_LIST_HEAD(&ctx->submitted_msgs);

        *compressor_ret = &ctx->base;
        return 0;

err:
        parallel_chunk_compressor_destroy(&ctx->base);
        return ret;
}