/*
 * compress_parallel.c
 *
 * Compress chunks of data (parallel version).
 */

/*
 * Copyright (C) 2013 Eric Biggers
 *
 * This file is free software; you can redistribute it and/or modify it under
 * the terms of the GNU Lesser General Public License as published by the Free
 * Software Foundation; either version 3 of the License, or (at your option) any
 * later version.
 *
 * This file is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
 * details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this file; if not, see http://www.gnu.org/licenses/.
 */

#ifdef HAVE_CONFIG_H
#  include "config.h"
#endif

#ifdef ENABLE_MULTITHREADED_COMPRESSION

#include <errno.h>
#include <limits.h>
#include <pthread.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#ifdef HAVE_SYS_SYSCTL_H
#  include <sys/sysctl.h>
#endif

#include "wimlib/assert.h"
#include "wimlib/chunk_compressor.h"
#include "wimlib/error.h"
#include "wimlib/list.h"
#include "wimlib/util.h"
#ifdef __WIN32__
#  include "wimlib/win32.h" /* win32_get_number_of_processors() */
#endif

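/*
 * This file provides a "chunk compressor" that distributes compression work
 * over multiple threads.  The main thread fills in messages, each of which
 * holds up to MAX_CHUNKS_PER_MSG chunks of uncompressed data, and places them
 * on the chunks_to_compress_queue.  The compressor threads take messages off
 * that queue, compress their chunks, and place them on the
 * compressed_chunks_queue, from which the main thread retrieves the results
 * in submission order.
 *
 * A sketch of how a caller is expected to drive the generic 'struct
 * chunk_compressor' interface (the actual callers live elsewhere in wimlib):
 *
 *	while ((buf = c->get_chunk_buffer(c)) != NULL) {
 *		... fill buf with up to c->out_chunk_size bytes ...
 *		c->signal_chunk_filled(c, usize);
 *	}
 *	while (c->get_compression_result(c, &cdata, &csize, &usize))
 *		... consume the (possibly compressed) chunk ...
 */
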
struct message_queue {
        struct list_head list;
        pthread_mutex_t lock;
        pthread_cond_t msg_avail_cond;
        pthread_cond_t space_avail_cond;
        bool terminating;
};

struct compressor_thread_data {
        pthread_t thread;
        struct message_queue *chunks_to_compress_queue;
        struct message_queue *compressed_chunks_queue;
        struct wimlib_compressor *compressor;
};

#define MAX_CHUNKS_PER_MSG 16

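/* A batch of chunks that is passed between the main thread and the compressor
 * threads as a single unit.  Batching several small chunks per message
 * amortizes the queueing and locking overhead.  */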
struct message {
        u8 *uncompressed_chunks[MAX_CHUNKS_PER_MSG];
        u8 *compressed_chunks[MAX_CHUNKS_PER_MSG];
        u32 uncompressed_chunk_sizes[MAX_CHUNKS_PER_MSG];
        u32 compressed_chunk_sizes[MAX_CHUNKS_PER_MSG];
        size_t num_filled_chunks;
        size_t num_alloc_chunks;
        struct list_head list;
        bool complete;
        struct list_head submission_list;
};

struct parallel_chunk_compressor {
        struct chunk_compressor base;

        struct message_queue chunks_to_compress_queue;
        struct message_queue compressed_chunks_queue;
        struct compressor_thread_data *thread_data;
        unsigned num_thread_data;
        unsigned num_started_threads;

        struct message *msgs;
        size_t num_messages;

        struct list_head available_msgs;
        struct list_head submitted_msgs;
        struct message *next_submit_msg;
        struct message *next_ready_msg;
        size_t next_chunk_idx;
};

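/* Return the number of online processors, or 1 if it cannot be determined.  */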
static unsigned
get_default_num_threads(void)
{
        long n;
#ifdef __WIN32__
        n = win32_get_number_of_processors();
#else
        n = sysconf(_SC_NPROCESSORS_ONLN);
#endif
        if (n < 1 || n >= UINT_MAX) {
                WARNING("Failed to determine number of processors; assuming 1.");
                return 1;
        }
        return n;
}

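/* Return the amount of available memory in bytes, falling back to 1 GiB if it
 * cannot be determined.  */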
static u64
get_avail_memory(void)
{
#ifdef __WIN32__
        u64 phys_bytes = win32_get_avail_memory();
        if (phys_bytes == 0)
                goto default_size;
        return phys_bytes;
#elif defined(_SC_PAGESIZE) && defined(_SC_PHYS_PAGES)
        long page_size = sysconf(_SC_PAGESIZE);
        long num_pages = sysconf(_SC_PHYS_PAGES);
        if (page_size <= 0 || num_pages <= 0)
                goto default_size;
        return ((u64)page_size * (u64)num_pages);
#else
        int mib[2] = {CTL_HW, HW_MEMSIZE};
        u64 memsize;
        size_t len = sizeof(memsize);
        if (sysctl(mib, ARRAY_LEN(mib), &memsize, &len, NULL, 0) < 0 || len != 8)
                goto default_size;
        return memsize;
#endif

default_size:
        WARNING("Failed to determine available memory; assuming 1 GiB");
        return 1ULL << 30;
}

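/* Initialize a message queue: a mutex-protected list of messages, with
 * condition variables for signaling.  Returns 0 on success or
 * WIMLIB_ERR_NOMEM on failure.  */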
static int
message_queue_init(struct message_queue *q)
{
        if (pthread_mutex_init(&q->lock, NULL)) {
                ERROR_WITH_ERRNO("Failed to initialize mutex");
                goto err;
        }
        if (pthread_cond_init(&q->msg_avail_cond, NULL)) {
                ERROR_WITH_ERRNO("Failed to initialize condition variable");
                goto err_destroy_lock;
        }
        if (pthread_cond_init(&q->space_avail_cond, NULL)) {
                ERROR_WITH_ERRNO("Failed to initialize condition variable");
                goto err_destroy_msg_avail_cond;
        }
        INIT_LIST_HEAD(&q->list);
        return 0;

err_destroy_msg_avail_cond:
        pthread_cond_destroy(&q->msg_avail_cond);
err_destroy_lock:
        pthread_mutex_destroy(&q->lock);
err:
        return WIMLIB_ERR_NOMEM;
}

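/* Destroy a message queue, if it was initialized.  (q->list.next is nonnull
 * only if INIT_LIST_HEAD() was reached in message_queue_init(), since the
 * containing structure was zeroed by CALLOC().)  */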
static void
message_queue_destroy(struct message_queue *q)
{
        if (q->list.next != NULL) {
                pthread_mutex_destroy(&q->lock);
                pthread_cond_destroy(&q->msg_avail_cond);
                pthread_cond_destroy(&q->space_avail_cond);
        }
}

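/* Add a message to the tail of the queue and wake up one waiting thread.  */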
static void
message_queue_put(struct message_queue *q, struct message *msg)
{
        pthread_mutex_lock(&q->lock);
        list_add_tail(&msg->list, &q->list);
        pthread_cond_signal(&q->msg_avail_cond);
        pthread_mutex_unlock(&q->lock);
}

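/* Remove and return the message at the head of the queue, blocking if needed
 * until one is available.  Returns NULL if the queue has been terminated.  */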
static struct message *
message_queue_get(struct message_queue *q)
{
        struct message *msg;

        pthread_mutex_lock(&q->lock);
        while (list_empty(&q->list) && !q->terminating)
                pthread_cond_wait(&q->msg_avail_cond, &q->lock);
        if (!q->terminating) {
                msg = list_entry(q->list.next, struct message, list);
                list_del(&msg->list);
        } else
                msg = NULL;
        pthread_mutex_unlock(&q->lock);
        return msg;
}

static void
message_queue_terminate(struct message_queue *q)
{
        pthread_mutex_lock(&q->lock);
        q->terminating = true;
        pthread_cond_broadcast(&q->msg_avail_cond);
        pthread_mutex_unlock(&q->lock);
}

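/* Allocate the chunk buffers for a message.  Each compressed chunk buffer is
 * 1 byte smaller than the output chunk size, so that a chunk which cannot be
 * compressed to fewer bytes than its uncompressed size is detected and kept
 * uncompressed.  */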
static int
init_message(struct message *msg, size_t num_chunks, u32 out_chunk_size)
{
        msg->num_alloc_chunks = num_chunks;
        for (size_t i = 0; i < num_chunks; i++) {
                msg->compressed_chunks[i] = MALLOC(out_chunk_size - 1);
                msg->uncompressed_chunks[i] = MALLOC(out_chunk_size);
                if (msg->compressed_chunks[i] == NULL ||
                    msg->uncompressed_chunks[i] == NULL)
                        return WIMLIB_ERR_NOMEM;
        }
        return 0;
}

static void
destroy_message(struct message *msg)
{
        for (size_t i = 0; i < msg->num_alloc_chunks; i++) {
                FREE(msg->compressed_chunks[i]);
                FREE(msg->uncompressed_chunks[i]);
        }
}

static void
free_messages(struct message *msgs, size_t num_messages)
{
        if (msgs) {
                for (size_t i = 0; i < num_messages; i++)
                        destroy_message(&msgs[i]);
                FREE(msgs);
        }
}

static struct message *
allocate_messages(size_t count, size_t chunks_per_msg, u32 out_chunk_size)
{
        struct message *msgs;

        msgs = CALLOC(count, sizeof(struct message));
        if (msgs == NULL)
                return NULL;
        for (size_t i = 0; i < count; i++) {
                if (init_message(&msgs[i], chunks_per_msg, out_chunk_size)) {
                        free_messages(msgs, count);
                        return NULL;
                }
        }
        return msgs;
}

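/* Compress each filled chunk in the message.  The available output space for
 * each chunk is 1 byte less than its uncompressed size, so a resulting
 * compressed size of 0 indicates a chunk that did not compress and will be
 * kept uncompressed.  */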
static void
compress_chunks(struct message *msg, struct wimlib_compressor *compressor)
{
        for (size_t i = 0; i < msg->num_filled_chunks; i++) {
                wimlib_assert(msg->uncompressed_chunk_sizes[i] != 0);
                msg->compressed_chunk_sizes[i] =
                        wimlib_compress(msg->uncompressed_chunks[i],
                                        msg->uncompressed_chunk_sizes[i],
                                        msg->compressed_chunks[i],
                                        msg->uncompressed_chunk_sizes[i] - 1,
                                        compressor);
        }
}

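/* Entry point for each compressor thread: repeatedly take a message off the
 * input queue, compress its chunks, and place it on the output queue, until
 * the input queue is terminated.  */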
static void *
compressor_thread_proc(void *arg)
{
        struct compressor_thread_data *params = arg;
        struct message *msg;

        while ((msg = message_queue_get(params->chunks_to_compress_queue)) != NULL) {
                compress_chunks(msg, params->compressor);
                message_queue_put(params->compressed_chunks_queue, msg);
        }
        return NULL;
}

static void
parallel_chunk_compressor_destroy(struct chunk_compressor *_ctx)
{
        struct parallel_chunk_compressor *ctx = (struct parallel_chunk_compressor *)_ctx;
        unsigned i;

        if (ctx == NULL)
                return;

        if (ctx->num_started_threads != 0) {
                DEBUG("Terminating %u compressor threads", ctx->num_started_threads);
                message_queue_terminate(&ctx->chunks_to_compress_queue);

                for (i = 0; i < ctx->num_started_threads; i++)
                        pthread_join(ctx->thread_data[i].thread, NULL);
        }

        message_queue_destroy(&ctx->chunks_to_compress_queue);
        message_queue_destroy(&ctx->compressed_chunks_queue);

        if (ctx->thread_data != NULL)
                for (i = 0; i < ctx->num_thread_data; i++)
                        wimlib_free_compressor(ctx->thread_data[i].compressor);

        FREE(ctx->thread_data);

        free_messages(ctx->msgs, ctx->num_messages);

        FREE(ctx);
}

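/* Hand the message currently being filled off to the compressor threads.  */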
static void
submit_compression_msg(struct parallel_chunk_compressor *ctx)
{
        struct message *msg = ctx->next_submit_msg;

        msg->complete = false;
        list_add_tail(&msg->submission_list, &ctx->submitted_msgs);
        message_queue_put(&ctx->chunks_to_compress_queue, msg);
        ctx->next_submit_msg = NULL;
}

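/* Return a buffer into which the caller may write the next uncompressed
 * chunk, or NULL if no message is currently available (in which case the
 * caller must first retrieve some compression results).  */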
static void *
parallel_chunk_compressor_get_chunk_buffer(struct chunk_compressor *_ctx)
{
        struct parallel_chunk_compressor *ctx = (struct parallel_chunk_compressor *)_ctx;
        struct message *msg;

        if (ctx->next_submit_msg) {
                msg = ctx->next_submit_msg;
        } else {
                if (list_empty(&ctx->available_msgs))
                        return NULL;

                msg = list_entry(ctx->available_msgs.next, struct message, list);
                list_del(&msg->list);
                ctx->next_submit_msg = msg;
                msg->num_filled_chunks = 0;
        }

        return msg->uncompressed_chunks[msg->num_filled_chunks];
}

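/* Record that the buffer returned by the last get_chunk_buffer() call was
 * filled with @usize bytes.  Once all chunk buffers of the current message
 * are filled, the message is submitted for compression.  */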
static void
parallel_chunk_compressor_signal_chunk_filled(struct chunk_compressor *_ctx, u32 usize)
{
        struct parallel_chunk_compressor *ctx = (struct parallel_chunk_compressor *)_ctx;
        struct message *msg;

        wimlib_assert(usize > 0);
        wimlib_assert(usize <= ctx->base.out_chunk_size);
        wimlib_assert(ctx->next_submit_msg);

        msg = ctx->next_submit_msg;
        msg->uncompressed_chunk_sizes[msg->num_filled_chunks] = usize;
        if (++msg->num_filled_chunks == msg->num_alloc_chunks)
                submit_compression_msg(ctx);
}

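/* Retrieve the next chunk, in submission order, waiting if necessary for its
 * compression to complete.  Returns false if no chunks are pending.  The
 * chunk is returned in compressed form only if that actually saved space;
 * otherwise the uncompressed data is returned.  */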
static bool
parallel_chunk_compressor_get_compression_result(struct chunk_compressor *_ctx,
                                                 const void **cdata_ret, u32 *csize_ret,
                                                 u32 *usize_ret)
{
        struct parallel_chunk_compressor *ctx = (struct parallel_chunk_compressor *)_ctx;
        struct message *msg;

        if (ctx->next_submit_msg)
                submit_compression_msg(ctx);

        if (ctx->next_ready_msg) {
                msg = ctx->next_ready_msg;
        } else {
                if (list_empty(&ctx->submitted_msgs))
                        return false;

                while (!(msg = list_entry(ctx->submitted_msgs.next,
                                          struct message,
                                          submission_list))->complete)
                        message_queue_get(&ctx->compressed_chunks_queue)->complete = true;

                ctx->next_ready_msg = msg;
                ctx->next_chunk_idx = 0;
        }

        if (msg->compressed_chunk_sizes[ctx->next_chunk_idx]) {
                *cdata_ret = msg->compressed_chunks[ctx->next_chunk_idx];
                *csize_ret = msg->compressed_chunk_sizes[ctx->next_chunk_idx];
        } else {
                *cdata_ret = msg->uncompressed_chunks[ctx->next_chunk_idx];
                *csize_ret = msg->uncompressed_chunk_sizes[ctx->next_chunk_idx];
        }
        *usize_ret = msg->uncompressed_chunk_sizes[ctx->next_chunk_idx];

        if (++ctx->next_chunk_idx == msg->num_filled_chunks) {
                list_del(&msg->submission_list);
                list_add_tail(&msg->list, &ctx->available_msgs);
                ctx->next_ready_msg = NULL;
        }
        return true;
}

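/* Create a parallel chunk compressor for the specified compression type and
 * output chunk size, using @num_threads threads (a default based on the
 * number of processors if 0) and at most @max_memory bytes of memory (the
 * amount of available memory if 0).  Returns 0 on success, a positive
 * WIMLIB_ERR_* code on failure, or a negative value if a parallel compressor
 * is not worthwhile (e.g. only 1 thread would be used), in which case the
 * caller should fall back to serial compression.  */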
int
new_parallel_chunk_compressor(int out_ctype, u32 out_chunk_size,
                              unsigned num_threads, u64 max_memory,
                              struct chunk_compressor **compressor_ret)
{
        u64 approx_mem_required;
        size_t chunks_per_msg;
        size_t msgs_per_thread;
        struct parallel_chunk_compressor *ctx;
        unsigned i;
        int ret;
        unsigned desired_num_threads;

        wimlib_assert(out_chunk_size > 0);

        if (num_threads == 0)
                num_threads = get_default_num_threads();

        if (num_threads == 1) {
                DEBUG("Only 1 thread; not bothering with "
                      "parallel chunk compressor.");
                return -1;
        }

        if (max_memory == 0)
                max_memory = get_avail_memory();

        desired_num_threads = num_threads;

        if (out_chunk_size < ((u32)1 << 23)) {
                /* Relatively small chunks.  Use 2 messages per thread, each
                 * with at least 2 chunks.  Use more chunks per message if there
                 * are lots of threads and/or the chunks are very small.  */
                chunks_per_msg = 2;
                chunks_per_msg += num_threads * (65536 / out_chunk_size) / 16;
                chunks_per_msg = max(chunks_per_msg, 2);
                chunks_per_msg = min(chunks_per_msg, MAX_CHUNKS_PER_MSG);
                msgs_per_thread = 2;
        } else {
                /* Big chunks: Just have one buffer per thread --- more would
                 * just waste memory.  */
                chunks_per_msg = 1;
                msgs_per_thread = 1;
        }
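        /* Shrink the proposed allocation until its estimated memory usage
         * fits in max_memory: first reduce the chunks per message, then the
         * messages per thread, then the number of threads.  */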
        for (;;) {
                approx_mem_required =
                        (u64)chunks_per_msg *
                        (u64)msgs_per_thread *
                        (u64)num_threads *
                        (u64)out_chunk_size
                        + out_chunk_size
                        + 1000000
                        + num_threads * wimlib_get_compressor_needed_memory(out_ctype,
                                                                            out_chunk_size,
                                                                            0);
                if (approx_mem_required <= max_memory)
                        break;

                if (chunks_per_msg > 1)
                        chunks_per_msg--;
                else if (msgs_per_thread > 1)
                        msgs_per_thread--;
                else if (num_threads > 1)
                        num_threads--;
                else
                        break;
        }

        if (num_threads < desired_num_threads) {
                WARNING("Wanted to use %u threads, but limiting to %u "
                        "to fit in available memory!",
                        desired_num_threads, num_threads);
        }

        if (num_threads == 1) {
                DEBUG("Only 1 thread; not bothering with "
                      "parallel chunk compressor.");
                return -2;
        }

        ret = WIMLIB_ERR_NOMEM;
        ctx = CALLOC(1, sizeof(*ctx));
        if (ctx == NULL)
                goto err;

        ctx->base.out_ctype = out_ctype;
        ctx->base.out_chunk_size = out_chunk_size;
        ctx->base.destroy = parallel_chunk_compressor_destroy;
        ctx->base.get_chunk_buffer = parallel_chunk_compressor_get_chunk_buffer;
        ctx->base.signal_chunk_filled = parallel_chunk_compressor_signal_chunk_filled;
        ctx->base.get_compression_result = parallel_chunk_compressor_get_compression_result;

        ctx->num_thread_data = num_threads;

        ret = message_queue_init(&ctx->chunks_to_compress_queue);
        if (ret)
                goto err;

        ret = message_queue_init(&ctx->compressed_chunks_queue);
        if (ret)
                goto err;

        ret = WIMLIB_ERR_NOMEM;
        ctx->thread_data = CALLOC(num_threads, sizeof(ctx->thread_data[0]));
        if (ctx->thread_data == NULL)
                goto err;

        for (i = 0; i < num_threads; i++) {
                struct compressor_thread_data *dat;

                dat = &ctx->thread_data[i];

                dat->chunks_to_compress_queue = &ctx->chunks_to_compress_queue;
                dat->compressed_chunks_queue = &ctx->compressed_chunks_queue;
                ret = wimlib_create_compressor(out_ctype, out_chunk_size, 0,
                                               &dat->compressor);
                if (ret)
                        goto err;
        }

        for (ctx->num_started_threads = 0;
             ctx->num_started_threads < num_threads;
             ctx->num_started_threads++)
        {
                DEBUG("pthread_create thread %u of %u",
                      ctx->num_started_threads + 1, num_threads);
                ret = pthread_create(&ctx->thread_data[ctx->num_started_threads].thread,
                                     NULL,
                                     compressor_thread_proc,
                                     &ctx->thread_data[ctx->num_started_threads]);
                if (ret) {
                        errno = ret;
                        WARNING_WITH_ERRNO("Failed to create compressor thread %u of %u",
                                           ctx->num_started_threads + 1,
                                           num_threads);
                        ret = WIMLIB_ERR_NOMEM;
                        if (ctx->num_started_threads >= 2)
                                break;
                        goto err;
                }
        }

        ctx->base.num_threads = ctx->num_started_threads;

        ret = WIMLIB_ERR_NOMEM;
        ctx->num_messages = ctx->num_started_threads * msgs_per_thread;
        ctx->msgs = allocate_messages(ctx->num_messages,
                                      chunks_per_msg, out_chunk_size);
        if (ctx->msgs == NULL)
                goto err;

        INIT_LIST_HEAD(&ctx->available_msgs);
        for (size_t i = 0; i < ctx->num_messages; i++)
                list_add_tail(&ctx->msgs[i].list, &ctx->available_msgs);

        INIT_LIST_HEAD(&ctx->submitted_msgs);

        *compressor_ret = &ctx->base;
        return 0;

err:
        parallel_chunk_compressor_destroy(&ctx->base);
        return ret;
}

#endif /* ENABLE_MULTITHREADED_COMPRESSION */