wimlib.net Git - wimlib/commitdiff
Add support for a data recovery mode
author: Eric Biggers <ebiggers3@gmail.com>
Wed, 7 Apr 2021 04:31:48 +0000 (21:31 -0700)
committer: Eric Biggers <ebiggers3@gmail.com>
Wed, 7 Apr 2021 04:31:48 +0000 (21:31 -0700)
Add support for extracting file data even if it is corrupted (i.e. if
its hash doesn't match or some of its chunks can't be decompressed).

This isn't recommended for general use, but it could be useful for
recovering data from a corrupted WIM archive.

19 files changed:
Makefile.am
NEWS
doc/man1/wimapply.1
doc/man1/wimextract.1
include/wimlib.h
include/wimlib/blob_table.h
include/wimlib/ntfs_3g.h
include/wimlib/resource.h
include/wimlib/win32.h
programs/imagex.c
src/extract.c
src/ntfs-3g_capture.c
src/resource.c
src/win32_capture.c
src/write.c
tests/test-imagex-capture_and_apply
tests/wims/README
tests/wims/corrupted_file_1.wim [new file with mode: 0644]
tests/wims/corrupted_file_2.wim [new file with mode: 0644]

diff --git a/Makefile.am b/Makefile.am
index 85b038b530654a6410c5a376e11c184c8f70a8c1..391b09ca96bb1f7518582e08f46c62d355579476 100644 (file)
@@ -340,7 +340,8 @@ EXTRA_DIST +=                                       \
        tests/security_descriptor_1.base64      \
        tests/security_descriptor_1.bin         \
        tests/security_descriptor_2.base64      \
-       tests/security_descriptor_2.bin
+       tests/security_descriptor_2.bin         \
+       tests/wims
 
 if WINDOWS_NATIVE_BUILD
 # Tests are run manually for Windows builds.
diff --git a/NEWS b/NEWS
index 3ac38c16da0e3fa4049727b737627ea3c45ce472..69f8ce0302f7b995eacf0e1e96a99a331558b3c5 100644 (file)
--- a/NEWS
+++ b/NEWS
@@ -2,6 +2,12 @@ Version 1.13.4-BETA1:
        wimsplit now prints progress messages regularly rather than just once
        per WIM part.
 
+       Added support for a data recovery mode which causes files to be
+       extracted even if they are corrupted.  The option is --recover-data for
+       wimapply and wimextract, and WIMLIB_EXTRACT_FLAG_RECOVER_DATA for the
+       library.  Note that this option won't help with all types of corruption;
+       some types of corruption will still cause a fatal error.
+
 Version 1.13.3:
        On Windows, improved performance of capturing an entire drive in some
        cases.
diff --git a/doc/man1/wimapply.1 b/doc/man1/wimapply.1
index 4ae339f05e582a7b306c05f63b0507d79b745806..ebd8f7eb7991c2f677f20e0385169052c1a4c895 100644 (file)
@@ -355,15 +355,22 @@ files matching any of the patterns in this section will not be compressed.
 In addition, wimlib has a hardcoded list of files for which it knows, for
 compatibility with the Windows bootloader, to override the requested compression
 format.
+.TP
+\fB--recover-data\fR
+If a file is corrupted (its stored hash doesn't match its actual hash, or some
+parts of it can't be decompressed), extract the corrupted file anyway with a
+warning, rather than aborting with an error.  This may be useful to recover data
+if a WIM archive was corrupted.  Note that recovering data is not guaranteed to
+succeed, as it depends on the type of corruption that occurred.
 .SH NOTES
 \fIData integrity\fR: WIM files include checksums of file data.  To detect
 accidental (non-malicious) data corruption, wimlib calculates the checksum of
 every file it extracts and issues an error if it does not have the expected
-value.  (This default behavior seems equivalent to the \fB/verify\fR option of
-ImageX.)  In addition, a WIM file can include an integrity table (extra
-checksums) over the raw data of the entire WIM file.  For performance reasons
-wimlib does not check the integrity table by default, but the \fB--check\fR
-option can be passed to make it do so.
+value, unless the \fB--recover-data\fR option is given.  (This default behavior
+seems equivalent to the \fB/verify\fR option of ImageX.)  In addition, a WIM
+file can include an integrity table (extra checksums) over the raw data of the
+entire WIM file.  For performance reasons wimlib does not check the integrity
+table by default, but the \fB--check\fR option can be passed to make it do so.
 .PP
 \fIESD files\fR: wimlib can extract files from solid-compressed WIMs, or "ESD"
 (.esd) files, just like from normal WIM (.wim) files.  However, Microsoft
diff --git a/doc/man1/wimextract.1 b/doc/man1/wimextract.1
index c402d6443dfe4e5e4e02f200569338f97fd1e24e..01e87d2d90501b047030b8e7913e3b9d78dc5a6a 100644 (file)
@@ -152,6 +152,9 @@ See the documentation for this option to \fBwimapply\fR(1).
 .TP
 \fB--compact\fR=\fIFORMAT\fR
 See the documentation for this option to \fBwimapply\fR(1).
+.TP
+\fB--recover-data\fR
+See the documentation for this option to \fBwimapply\fR(1).
 .SH NOTES
 See \fBwimapply\fR(1) for information about what data and metadata are extracted
 on UNIX-like systems versus on Windows.
diff --git a/include/wimlib.h b/include/wimlib.h
index 47c357d064672834d12f2d2508c69efe628ea4f2..b1aff6297efae7a7cdbdee4b1cff3c61ca71d12a 100644 (file)
@@ -1928,6 +1928,10 @@ typedef int (*wimlib_iterate_lookup_table_callback_t)(const struct wimlib_resour
  * wimlib_extract_paths() when passed multiple paths.  */
 #define WIMLIB_EXTRACT_FLAG_NTFS                       0x00000001
 
+/** Since wimlib v1.13.4: Don't consider corrupted files to be an error.  Just
+ * extract them in whatever form we can.  */
+#define WIMLIB_EXTRACT_FLAG_RECOVER_DATA               0x00000002
+
 /** UNIX-like systems only:  Extract UNIX-specific metadata captured with
  * ::WIMLIB_ADD_FLAG_UNIX_DATA.  */
 #define WIMLIB_EXTRACT_FLAG_UNIX_DATA                  0x00000020
diff --git a/include/wimlib/blob_table.h b/include/wimlib/blob_table.h
index 610db2419e2f9b96c8c0b079f8830d937848109f..f15c61175c45d86785b8e0f65d9f44aac0053af9 100644 (file)
@@ -141,6 +141,9 @@ struct blob_descriptor {
        /* 1 iff the SHA-1 message digest of this blob is unknown.  */
        u16 unhashed : 1;
 
+       /* 1 iff this blob has failed its checksum.  */
+       u16 corrupted : 1;
+
        /* Temporary fields used when writing blobs; set as documented for
         * prepare_blob_list_for_write().  */
        u16 unique_size : 1;
diff --git a/include/wimlib/ntfs_3g.h b/include/wimlib/ntfs_3g.h
index 7909e43e73f0e0fe0111e9841a4be09152c2040a..e2ed6d6bbcef1ab231cc706c33526171a1bdfa6f 100644 (file)
@@ -11,7 +11,8 @@ struct ntfs_location;
 
 extern int
 read_ntfs_attribute_prefix(const struct blob_descriptor *blob, u64 size,
-                          const struct consume_chunk_callback *cb);
+                          const struct consume_chunk_callback *cb,
+                          bool recover_data);
 
 extern struct ntfs_location *
 clone_ntfs_location(const struct ntfs_location *loc);
diff --git a/include/wimlib/resource.h b/include/wimlib/resource.h
index 05ceed70b88b7b24e96f45296e324698cd14321c..94e43e5fe45aece4feacabb058e86bb28b067faf 100644 (file)
@@ -273,6 +273,7 @@ call_end_blob(struct blob_descriptor *blob, int status,
 #define VERIFY_BLOB_HASHES             0x1
 #define COMPUTE_MISSING_BLOB_HASHES    0x2
 #define BLOB_LIST_ALREADY_SORTED       0x4
+#define RECOVER_DATA                   0x8
 
 extern int
 read_blob_list(struct list_head *blob_list, size_t list_head_offset,
@@ -280,18 +281,19 @@ read_blob_list(struct list_head *blob_list, size_t list_head_offset,
 
 extern int
 read_blob_with_cbs(struct blob_descriptor *blob,
-                  const struct read_blob_callbacks *cbs);
+                  const struct read_blob_callbacks *cbs, bool recover_data);
 
 extern int
 read_blob_with_sha1(struct blob_descriptor *blob,
-                   const struct read_blob_callbacks *cbs);
+                   const struct read_blob_callbacks *cbs, bool recover_data);
 
 extern int
 extract_blob_prefix_to_fd(struct blob_descriptor *blob, u64 size,
                          struct filedes *fd);
 
 extern int
-extract_blob_to_fd(struct blob_descriptor *blob, struct filedes *fd);
+extract_blob_to_fd(struct blob_descriptor *blob, struct filedes *fd,
+                  bool recover_data);
 
 /* Miscellaneous blob functions.  */
 
diff --git a/include/wimlib/win32.h b/include/wimlib/win32.h
index 94511a872b28f10072dcd9e6c36876f1fa0110bf..71a77107a837f5678ae16a20f2b4ac8c164c96cb 100644 (file)
@@ -25,7 +25,8 @@ cmp_windows_files(const struct windows_file *file1,
 
 extern int
 read_windows_file_prefix(const struct blob_descriptor *blob, u64 size,
-                        const struct consume_chunk_callback *cb);
+                        const struct consume_chunk_callback *cb,
+                        bool recover_data);
 
 extern int
 win32_global_init(int init_flags);
diff --git a/programs/imagex.c b/programs/imagex.c
index dd43d75c059e494d79f95c5d15371c4f4dd19414..59181eb94be60d322aea334dbcd5601b36b9fc03 100644 (file)
@@ -6,7 +6,7 @@
  */
 
 /*
- * Copyright (C) 2012-2018 Eric Biggers
+ * Copyright (C) 2012-2021 Eric Biggers
  *
  * This program is free software: you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -202,6 +202,7 @@ enum {
        IMAGEX_PRESERVE_DIR_STRUCTURE_OPTION,
        IMAGEX_REBUILD_OPTION,
        IMAGEX_RECOMPRESS_OPTION,
+       IMAGEX_RECOVER_DATA_OPTION,
        IMAGEX_RECURSIVE_OPTION,
        IMAGEX_REF_OPTION,
        IMAGEX_RPFIX_OPTION,
@@ -239,6 +240,7 @@ static const struct option apply_options[] = {
        {T("include-invalid-names"), no_argument,       NULL, IMAGEX_INCLUDE_INVALID_NAMES_OPTION},
        {T("wimboot"),     no_argument,       NULL, IMAGEX_WIMBOOT_OPTION},
        {T("compact"),     required_argument, NULL, IMAGEX_COMPACT_OPTION},
+       {T("recover-data"), no_argument,      NULL, IMAGEX_RECOVER_DATA_OPTION},
        {NULL, 0, NULL, 0},
 };
 
@@ -336,6 +338,7 @@ static const struct option extract_options[] = {
        {T("preserve-dir-structure"), no_argument, NULL, IMAGEX_PRESERVE_DIR_STRUCTURE_OPTION},
        {T("wimboot"),     no_argument,       NULL, IMAGEX_WIMBOOT_OPTION},
        {T("compact"),     required_argument, NULL, IMAGEX_COMPACT_OPTION},
+       {T("recover-data"), no_argument,      NULL, IMAGEX_RECOVER_DATA_OPTION},
        {NULL, 0, NULL, 0},
 };
 
@@ -1747,6 +1750,9 @@ imagex_apply(int argc, tchar **argv, int cmd)
                        if (ret)
                                goto out_free_refglobs;
                        break;
+               case IMAGEX_RECOVER_DATA_OPTION:
+                       extract_flags |= WIMLIB_EXTRACT_FLAG_RECOVER_DATA;
+                       break;
                default:
                        goto out_usage;
                }
@@ -3294,6 +3300,9 @@ imagex_extract(int argc, tchar **argv, int cmd)
                        if (ret)
                                goto out_free_refglobs;
                        break;
+               case IMAGEX_RECOVER_DATA_OPTION:
+                       extract_flags |= WIMLIB_EXTRACT_FLAG_RECOVER_DATA;
+                       break;
                default:
                        goto out_usage;
                }
@@ -4484,7 +4493,7 @@ T(
 "                    [--check] [--ref=\"GLOB\"] [--no-acls] [--strict-acls]\n"
 "                    [--no-attributes] [--rpfix] [--norpfix]\n"
 "                    [--include-invalid-names] [--wimboot] [--unix-data]\n"
-"                    [--compact=FORMAT]\n"
+"                    [--compact=FORMAT] [--recover-data]\n"
 ),
 [CMD_CAPTURE] =
 T(
@@ -4517,8 +4526,8 @@ T(
 "    %"TS" WIMFILE IMAGE [(PATH | @LISTFILE)...]\n"
 "                    [--check] [--ref=\"GLOB\"] [--dest-dir=CMD_DIR]\n"
 "                    [--to-stdout] [--no-acls] [--strict-acls]\n"
-"                    [--no-attributes] [--include-invalid-names]\n"
-"                    [--no-globs] [--nullglob] [--preserve-dir-structure]\n"
+"                    [--no-attributes] [--include-invalid-names] [--no-globs]\n"
+"                    [--nullglob] [--preserve-dir-structure] [--recover-data]\n"
 ),
 [CMD_INFO] =
 T(
@@ -4602,7 +4611,7 @@ version(void)
        static const tchar * const fmt =
        T(
 "wimlib-imagex " PACKAGE_VERSION " (using wimlib %"TS")\n"
-"Copyright (C) 2012-2018 Eric Biggers\n"
+"Copyright (C) 2012-2021 Eric Biggers\n"
 "License GPLv3+; GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>.\n"
 "This is free software: you are free to change and redistribute it.\n"
 "There is NO WARRANTY, to the extent permitted by law.\n"
diff --git a/src/extract.c b/src/extract.c
index 49564f4f20fcb266498a7af04b10b853ea094061..0e5d4ec4c900c1aab13d23bdd4f905c51afb6b8e 100644 (file)
@@ -72,6 +72,7 @@
 /* Keep in sync with wimlib.h  */
 #define WIMLIB_EXTRACT_MASK_PUBLIC                             \
        (WIMLIB_EXTRACT_FLAG_NTFS                       |       \
+        WIMLIB_EXTRACT_FLAG_RECOVER_DATA               |       \
         WIMLIB_EXTRACT_FLAG_UNIX_DATA                  |       \
         WIMLIB_EXTRACT_FLAG_NO_ACLS                    |       \
         WIMLIB_EXTRACT_FLAG_STRICT_ACLS                |       \
@@ -310,7 +311,9 @@ read_blobs_from_pipe(struct apply_ctx *ctx, const struct read_blob_callbacks *cb
                    && (blob->out_refcnt))
                {
                        wim_reshdr_to_desc_and_blob(&reshdr, ctx->wim, &rdesc, blob);
-                       ret = read_blob_with_sha1(blob, cbs);
+                       ret = read_blob_with_sha1(blob, cbs,
+                                                 ctx->extract_flags &
+                                                 WIMLIB_EXTRACT_FLAG_RECOVER_DATA);
                        blob_unset_is_located_in_wim_resource(blob);
                        if (ret)
                                return ret;
@@ -504,18 +507,39 @@ extract_from_tmpfile(const tchar *tmpfile_name,
 
        for (u32 i = 0; i < orig_blob->out_refcnt; i++) {
                tmpfile_blob.inline_blob_extraction_targets[0] = targets[i];
-               ret = read_blob_with_cbs(&tmpfile_blob, cbs);
+               ret = read_blob_with_cbs(&tmpfile_blob, cbs, false);
                if (ret)
                        return ret;
        }
        return 0;
 }
 
+static void
+warn_about_corrupted_file(struct wim_dentry *dentry,
+                         const struct wim_inode_stream *stream)
+{
+       WARNING("Corruption in %s\"%"TS"\"!  Extracting anyway since data recovery mode is enabled.",
+               stream_is_unnamed_data_stream(stream) ? "" : "alternate stream of ",
+               dentry_full_path(dentry));
+}
+
 static int
 end_extract_blob(struct blob_descriptor *blob, int status, void *_ctx)
 {
        struct apply_ctx *ctx = _ctx;
 
+       if ((ctx->extract_flags & WIMLIB_EXTRACT_FLAG_RECOVER_DATA) &&
+           !status && blob->corrupted) {
+               const struct blob_extraction_target *targets =
+                       blob_extraction_targets(blob);
+               for (u32 i = 0; i < blob->out_refcnt; i++) {
+                       struct wim_dentry *dentry =
+                               inode_first_extraction_dentry(targets[i].inode);
+
+                       warn_about_corrupted_file(dentry, targets[i].stream);
+               }
+       }
+
        if (unlikely(filedes_valid(&ctx->tmpfile_fd))) {
                filedes_close(&ctx->tmpfile_fd);
                if (!status)
@@ -560,10 +584,15 @@ extract_blob_list(struct apply_ctx *ctx, const struct read_blob_callbacks *cbs)
        if (ctx->extract_flags & WIMLIB_EXTRACT_FLAG_FROM_PIPE) {
                return read_blobs_from_pipe(ctx, &wrapper_cbs);
        } else {
+               int flags = VERIFY_BLOB_HASHES;
+
+               if (ctx->extract_flags & WIMLIB_EXTRACT_FLAG_RECOVER_DATA)
+                       flags |= RECOVER_DATA;
+
                return read_blob_list(&ctx->blob_list,
                                      offsetof(struct blob_descriptor,
                                               extraction_list),
-                                     &wrapper_cbs, VERIFY_BLOB_HASHES);
+                                     &wrapper_cbs, flags);
        }
 }
 
@@ -574,11 +603,13 @@ extract_blob_list(struct apply_ctx *ctx, const struct read_blob_callbacks *cbs)
  * unnamed data stream only.  */
 static int
 extract_dentry_to_stdout(struct wim_dentry *dentry,
-                        const struct blob_table *blob_table)
+                        const struct blob_table *blob_table, int extract_flags)
 {
        struct wim_inode *inode = dentry->d_inode;
        struct blob_descriptor *blob;
        struct filedes _stdout;
+       bool recover = (extract_flags & WIMLIB_EXTRACT_FLAG_RECOVER_DATA);
+       int ret;
 
        if (inode->i_attributes & (FILE_ATTRIBUTE_REPARSE_POINT |
                                   FILE_ATTRIBUTE_DIRECTORY |
@@ -598,15 +629,23 @@ extract_dentry_to_stdout(struct wim_dentry *dentry,
        }
 
        filedes_init(&_stdout, STDOUT_FILENO);
-       return extract_blob_to_fd(blob, &_stdout);
+       ret = extract_blob_to_fd(blob, &_stdout, recover);
+       if (ret)
+               return ret;
+       if (recover && blob->corrupted)
+               warn_about_corrupted_file(dentry,
+                                         inode_get_unnamed_data_stream(inode));
+       return 0;
 }
 
 static int
 extract_dentries_to_stdout(struct wim_dentry **dentries, size_t num_dentries,
-                          const struct blob_table *blob_table)
+                          const struct blob_table *blob_table,
+                          int extract_flags)
 {
        for (size_t i = 0; i < num_dentries; i++) {
-               int ret = extract_dentry_to_stdout(dentries[i], blob_table);
+               int ret = extract_dentry_to_stdout(dentries[i], blob_table,
+                                                  extract_flags);
                if (ret)
                        return ret;
        }
@@ -1446,7 +1485,8 @@ extract_trees(WIMStruct *wim, struct wim_dentry **trees, size_t num_trees,
 
        if (extract_flags & WIMLIB_EXTRACT_FLAG_TO_STDOUT) {
                ret = extract_dentries_to_stdout(trees, num_trees,
-                                                wim->blob_table);
+                                                wim->blob_table,
+                                                extract_flags);
                goto out;
        }
 
diff --git a/src/ntfs-3g_capture.c b/src/ntfs-3g_capture.c
index bc8fed710cb898b303b5cbd4e3d9ac6eafcf4e15..1745cbd1f6dff8308bb5759d77d2a70234b6931d 100644 (file)
@@ -117,7 +117,8 @@ open_ntfs_attr(ntfs_inode *ni, const struct ntfs_location *loc)
 
 int
 read_ntfs_attribute_prefix(const struct blob_descriptor *blob, u64 size,
-                          const struct consume_chunk_callback *cb)
+                          const struct consume_chunk_callback *cb,
+                          bool recover_data)
 {
        const struct ntfs_location *loc = blob->ntfs_loc;
        ntfs_volume *vol = loc->volume->vol;
diff --git a/src/resource.c b/src/resource.c
index 9054b55efaf46e058a145231cf79e4fd462c2e4d..8b6139ad4dbf65da2a0dc32a4c1043c59f90f667 100644 (file)
@@ -83,6 +83,34 @@ struct data_range {
        u64 size;
 };
 
+static int
+decompress_chunk(const void *cbuf, u32 chunk_csize, u8 *ubuf, u32 chunk_usize,
+                struct wimlib_decompressor *decompressor, bool recover_data)
+{
+       int res = wimlib_decompress(cbuf, chunk_csize, ubuf, chunk_usize,
+                                   decompressor);
+       if (likely(res == 0))
+               return 0;
+
+       if (recover_data) {
+               WARNING("Failed to decompress data!  Continuing anyway since data recovery mode is enabled.");
+
+               /* Continue on with *something*.  In the worst case just use a
+                * zeroed buffer.  But, try to fill as much of it with
+                * decompressed data as we can.  This works because if the
+                * corruption isn't located right at the beginning of the
+                * compressed chunk, wimlib_decompress() may write some correct
+                * output at the beginning even if it fails later.  */
+               memset(ubuf, 0, chunk_usize);
+               (void)wimlib_decompress(cbuf, chunk_csize, ubuf,
+                                       chunk_usize, decompressor);
+               return 0;
+       }
+       ERROR("Failed to decompress data!");
+       errno = EINVAL;
+       return WIMLIB_ERR_DECOMPRESSION;
+}
+
 /*
  * Read data from a compressed WIM resource.
  *
@@ -98,6 +126,9 @@ struct data_range {
  *     the data being read.  Each call provides the next chunk of the requested
  *     data, uncompressed.  Each chunk will be nonempty and will not cross
  *     range boundaries but otherwise will be of unspecified size.
+ * @recover_data
+ *     If a chunk can't be fully decompressed due to being corrupted, continue
+ *     with whatever data can be recovered rather than return an error.
  *
  * Possible return values:
  *
@@ -114,7 +145,8 @@ static int
 read_compressed_wim_resource(const struct wim_resource_descriptor * const rdesc,
                             const struct data_range * const ranges,
                             const size_t num_ranges,
-                            const struct consume_chunk_callback *cb)
+                            const struct consume_chunk_callback *cb,
+                            bool recover_data)
 {
        int ret;
        u64 *chunk_offsets = NULL;
@@ -446,17 +478,12 @@ read_compressed_wim_resource(const struct wim_resource_descriptor * const rdesc,
                                goto read_error;
 
                        if (read_buf == cbuf) {
-                               ret = wimlib_decompress(cbuf,
-                                                       chunk_csize,
-                                                       ubuf,
-                                                       chunk_usize,
-                                                       decompressor);
-                               if (unlikely(ret)) {
-                                       ERROR("Failed to decompress data!");
-                                       ret = WIMLIB_ERR_DECOMPRESSION;
-                                       errno = EINVAL;
+                               ret = decompress_chunk(cbuf, chunk_csize,
+                                                      ubuf, chunk_usize,
+                                                      decompressor,
+                                                      recover_data);
+                               if (unlikely(ret))
                                        goto out_cleanup;
-                               }
                        }
                        cur_read_offset += chunk_csize;
 
@@ -592,7 +619,8 @@ bufferer_cb(const void *chunk, size_t size, void *_ctx)
 static int
 read_partial_wim_resource(const struct wim_resource_descriptor *rdesc,
                          const u64 offset, const u64 size,
-                         const struct consume_chunk_callback *cb)
+                         const struct consume_chunk_callback *cb,
+                         bool recover_data)
 {
        if (rdesc->flags & (WIM_RESHDR_FLAG_COMPRESSED |
                            WIM_RESHDR_FLAG_SOLID))
@@ -604,7 +632,8 @@ read_partial_wim_resource(const struct wim_resource_descriptor *rdesc,
                        .offset = offset,
                        .size = size,
                };
-               return read_compressed_wim_resource(rdesc, &range, 1, cb);
+               return read_compressed_wim_resource(rdesc, &range, 1, cb,
+                                                   recover_data);
        }
 
        /* Uncompressed resource  */
@@ -626,7 +655,7 @@ read_partial_wim_blob_into_buf(const struct blob_descriptor *blob,
        return read_partial_wim_resource(blob->rdesc,
                                         blob->offset_in_res + offset,
                                         size,
-                                        &cb);
+                                        &cb, false);
 }
 
 static int
@@ -643,15 +672,15 @@ skip_wim_resource(const struct wim_resource_descriptor *rdesc)
                .func = noop_cb,
        };
        return read_partial_wim_resource(rdesc, 0,
-                                        rdesc->uncompressed_size, &cb);
+                                        rdesc->uncompressed_size, &cb, false);
 }
 
 static int
 read_wim_blob_prefix(const struct blob_descriptor *blob, u64 size,
-                    const struct consume_chunk_callback *cb)
+                    const struct consume_chunk_callback *cb, bool recover_data)
 {
        return read_partial_wim_resource(blob->rdesc, blob->offset_in_res,
-                                        size, cb);
+                                        size, cb, recover_data);
 }
 
 /* This function handles reading blob data that is located in an external file,
@@ -664,7 +693,8 @@ read_wim_blob_prefix(const struct blob_descriptor *blob, u64 size,
  * encrypted), so Windows uses its own code for its equivalent case.  */
 static int
 read_file_on_disk_prefix(const struct blob_descriptor *blob, u64 size,
-                        const struct consume_chunk_callback *cb)
+                        const struct consume_chunk_callback *cb,
+                        bool recover_data)
 {
        int ret;
        int raw_fd;
@@ -684,7 +714,8 @@ read_file_on_disk_prefix(const struct blob_descriptor *blob, u64 size,
 #ifdef WITH_FUSE
 static int
 read_staging_file_prefix(const struct blob_descriptor *blob, u64 size,
-                        const struct consume_chunk_callback *cb)
+                        const struct consume_chunk_callback *cb,
+                        bool recover_data)
 {
        int raw_fd;
        struct filedes fd;
@@ -708,7 +739,8 @@ read_staging_file_prefix(const struct blob_descriptor *blob, u64 size,
  * already located in an in-memory buffer.  */
 static int
 read_buffer_prefix(const struct blob_descriptor *blob,
-                  u64 size, const struct consume_chunk_callback *cb)
+                  u64 size, const struct consume_chunk_callback *cb,
+                  bool recover_data)
 {
        if (unlikely(!size))
                return 0;
@@ -717,7 +749,8 @@ read_buffer_prefix(const struct blob_descriptor *blob,
 
 typedef int (*read_blob_prefix_handler_t)(const struct blob_descriptor *blob,
                                          u64 size,
-                                         const struct consume_chunk_callback *cb);
+                                         const struct consume_chunk_callback *cb,
+                                         bool recover_data);
 
 /*
  * Read the first @size bytes from a generic "blob", which may be located in any
@@ -728,11 +761,12 @@ typedef int (*read_blob_prefix_handler_t)(const struct blob_descriptor *blob,
  * Returns 0 on success; nonzero on error.  A nonzero value will be returned if
  * the blob data cannot be successfully read (for a number of different reasons,
  * depending on the blob location), or if @cb returned nonzero in which case
- * that error code will be returned.
+ * that error code will be returned.  If @recover_data is true, then errors
+ * decompressing chunks in WIM resources will be ignored.
  */
 static int
 read_blob_prefix(const struct blob_descriptor *blob, u64 size,
-                const struct consume_chunk_callback *cb)
+                const struct consume_chunk_callback *cb, bool recover_data)
 {
        static const read_blob_prefix_handler_t handlers[] = {
                [BLOB_IN_WIM] = read_wim_blob_prefix,
@@ -751,7 +785,7 @@ read_blob_prefix(const struct blob_descriptor *blob, u64 size,
        wimlib_assert(blob->blob_location < ARRAY_LEN(handlers)
                      && handlers[blob->blob_location] != NULL);
        wimlib_assert(size <= blob->size);
-       return handlers[blob->blob_location](blob, size, cb);
+       return handlers[blob->blob_location](blob, size, cb, recover_data);
 }
 
 struct blob_chunk_ctx {
@@ -775,7 +809,7 @@ consume_blob_chunk(const void *chunk, size_t size, void *_ctx)
  * callbacks (all of which are optional).  */
 int
 read_blob_with_cbs(struct blob_descriptor *blob,
-                  const struct read_blob_callbacks *cbs)
+                  const struct read_blob_callbacks *cbs, bool recover_data)
 {
        int ret;
        struct blob_chunk_ctx ctx = {
@@ -792,7 +826,7 @@ read_blob_with_cbs(struct blob_descriptor *blob,
        if (unlikely(ret))
                return ret;
 
-       ret = read_blob_prefix(blob, blob->size, &cb);
+       ret = read_blob_prefix(blob, blob->size, &cb, recover_data);
 
        return call_end_blob(blob, ret, cbs);
 }
@@ -807,7 +841,7 @@ read_blob_into_buf(const struct blob_descriptor *blob, void *buf)
                .func   = bufferer_cb,
                .ctx    = &buf,
        };
-       return read_blob_prefix(blob, blob->size, &cb);
+       return read_blob_prefix(blob, blob->size, &cb, false);
 }
 
 /* Retrieve the full uncompressed data of the specified blob.  A buffer large
@@ -955,6 +989,7 @@ hasher_begin_blob(struct blob_descriptor *blob, void *_ctx)
        struct hasher_context *ctx = _ctx;
 
        sha1_init(&ctx->sha_ctx);
+       blob->corrupted = 0;
 
        return call_begin_blob(blob, &ctx->cbs);
 }
@@ -977,8 +1012,8 @@ hasher_continue_blob(const struct blob_descriptor *blob, u64 offset,
 }
 
 static int
-report_sha1_mismatch_error(const struct blob_descriptor *blob,
-                          const u8 actual_hash[SHA1_HASH_SIZE])
+report_sha1_mismatch(struct blob_descriptor *blob,
+                    const u8 actual_hash[SHA1_HASH_SIZE], bool recover_data)
 {
        tchar expected_hashstr[SHA1_HASH_SIZE * 2 + 1];
        tchar actual_hashstr[SHA1_HASH_SIZE * 2 + 1];
@@ -989,6 +1024,8 @@ report_sha1_mismatch_error(const struct blob_descriptor *blob,
        sprint_hash(blob->hash, expected_hashstr);
        sprint_hash(actual_hash, actual_hashstr);
 
+       blob->corrupted = 1;
+
        if (blob_is_in_file(blob)) {
                ERROR("A file was concurrently modified!\n"
                      "        Path: \"%"TS"\"\n"
@@ -997,18 +1034,21 @@ report_sha1_mismatch_error(const struct blob_descriptor *blob,
                      blob_file_path(blob), expected_hashstr, actual_hashstr);
                return WIMLIB_ERR_CONCURRENT_MODIFICATION_DETECTED;
        } else if (blob->blob_location == BLOB_IN_WIM) {
+       #ifdef ENABLE_ERROR_MESSAGES
                const struct wim_resource_descriptor *rdesc = blob->rdesc;
-               ERROR("A WIM resource is corrupted!\n"
-                     "        WIM file: \"%"TS"\"\n"
-                     "        Blob uncompressed size: %"PRIu64"\n"
-                     "        Resource offset in WIM: %"PRIu64"\n"
-                     "        Resource uncompressed size: %"PRIu64"\n"
-                     "        Resource size in WIM: %"PRIu64"\n"
-                     "        Resource flags: 0x%x%"TS"\n"
-                     "        Resource compression type: %"TS"\n"
-                     "        Resource compression chunk size: %"PRIu32"\n"
-                     "        Expected SHA-1: %"TS"\n"
-                     "        Actual SHA-1: %"TS"\n",
+
+               (recover_data ? wimlib_warning : wimlib_error)(
+                     T("A WIM resource is corrupted!\n"
+                       "        WIM file: \"%"TS"\"\n"
+                       "        Blob uncompressed size: %"PRIu64"\n"
+                       "        Resource offset in WIM: %"PRIu64"\n"
+                       "        Resource uncompressed size: %"PRIu64"\n"
+                       "        Resource size in WIM: %"PRIu64"\n"
+                       "        Resource flags: 0x%x%"TS"\n"
+                       "        Resource compression type: %"TS"\n"
+                       "        Resource compression chunk size: %"PRIu32"\n"
+                       "        Expected SHA-1: %"TS"\n"
+                       "        Actual SHA-1: %"TS"\n"),
                      rdesc->wim->filename,
                      blob->size,
                      rdesc->offset_in_wim,
@@ -1020,6 +1060,9 @@ report_sha1_mismatch_error(const struct blob_descriptor *blob,
                                                rdesc->compression_type),
                      rdesc->chunk_size,
                      expected_hashstr, actual_hashstr);
+       #endif /* ENABLE_ERROR_MESSAGES */
+               if (recover_data)
+                       return 0;
                return WIMLIB_ERR_INVALID_RESOURCE_HASH;
        } else {
                ERROR("File data was concurrently modified!\n"
@@ -1058,7 +1101,8 @@ hasher_end_blob(struct blob_descriptor *blob, int status, void *_ctx)
        } else if ((ctx->flags & VERIFY_BLOB_HASHES) &&
                   unlikely(!hashes_equal(hash, blob->hash)))
        {
-               ret = report_sha1_mismatch_error(blob, hash);
+               ret = report_sha1_mismatch(blob, hash,
+                                          ctx->flags & RECOVER_DATA);
                goto out_next_cb;
        }
        ret = 0;
@@ -1071,10 +1115,11 @@ out_next_cb:
  * SHA-1 message digest of the blob.  */
 int
 read_blob_with_sha1(struct blob_descriptor *blob,
-                   const struct read_blob_callbacks *cbs)
+                   const struct read_blob_callbacks *cbs, bool recover_data)
 {
        struct hasher_context hasher_ctx = {
-               .flags = VERIFY_BLOB_HASHES | COMPUTE_MISSING_BLOB_HASHES,
+               .flags = VERIFY_BLOB_HASHES | COMPUTE_MISSING_BLOB_HASHES |
+                        (recover_data ? RECOVER_DATA : 0),
                .cbs = *cbs,
        };
        struct read_blob_callbacks hasher_cbs = {
@@ -1083,7 +1128,7 @@ read_blob_with_sha1(struct blob_descriptor *blob,
                .end_blob       = hasher_end_blob,
                .ctx            = &hasher_ctx,
        };
-       return read_blob_with_cbs(blob, &hasher_cbs);
+       return read_blob_with_cbs(blob, &hasher_cbs, recover_data);
 }
 
 static int
@@ -1091,7 +1136,8 @@ read_blobs_in_solid_resource(struct blob_descriptor *first_blob,
                             struct blob_descriptor *last_blob,
                             size_t blob_count,
                             size_t list_head_offset,
-                            const struct read_blob_callbacks *sink_cbs)
+                            const struct read_blob_callbacks *sink_cbs,
+                            bool recover_data)
 {
        struct data_range *ranges;
        bool ranges_malloced;
@@ -1141,7 +1187,7 @@ read_blobs_in_solid_resource(struct blob_descriptor *first_blob,
        };
 
        ret = read_compressed_wim_resource(first_blob->rdesc, ranges,
-                                          blob_count, &cb);
+                                          blob_count, &cb, recover_data);
 
        if (ranges_malloced)
                FREE(ranges);
@@ -1178,7 +1224,8 @@ oom:
  *             For all blobs being read that have already had SHA-1 message
  *             digests computed, calculate the SHA-1 message digest of the read
  *             data and compare it with the previously computed value.  If they
- *             do not match, return WIMLIB_ERR_INVALID_RESOURCE_HASH.
+ *             do not match, return WIMLIB_ERR_INVALID_RESOURCE_HASH (unless
+ *             RECOVER_DATA is also set, in which case just issue a warning).
  *
  *     COMPUTE_MISSING_BLOB_HASHES
  *             For all blobs being read that have not yet had their SHA-1
@@ -1188,6 +1235,9 @@ oom:
  *     BLOB_LIST_ALREADY_SORTED
  *             @blob_list is already sorted in sequential order for reading.
  *
+ *     RECOVER_DATA
+ *             Don't consider corrupted blob data to be an error.
+ *
  * The callback functions are allowed to delete the current blob from the list
  * if necessary.
  *
@@ -1273,14 +1323,15 @@ read_blob_list(struct list_head *blob_list, size_t list_head_offset,
                                ret = read_blobs_in_solid_resource(blob, blob_last,
                                                                   blob_count,
                                                                   list_head_offset,
-                                                                  sink_cbs);
+                                                                  sink_cbs,
+                                                                  flags & RECOVER_DATA);
                                if (ret)
                                        return ret;
                                continue;
                        }
                }
 
-               ret = read_blob_with_cbs(blob, sink_cbs);
+               ret = read_blob_with_cbs(blob, sink_cbs, flags & RECOVER_DATA);
                if (unlikely(ret && ret != BEGIN_BLOB_STATUS_SKIP_BLOB))
                        return ret;
        }
@@ -1314,19 +1365,20 @@ extract_blob_prefix_to_fd(struct blob_descriptor *blob, u64 size,
                .func   = extract_chunk_to_fd,
                .ctx    = fd,
        };
-       return read_blob_prefix(blob, size, &cb);
+       return read_blob_prefix(blob, size, &cb, false);
 }
 
 /* Extract the full uncompressed contents of the specified blob to the specified
  * file descriptor.  This checks the SHA-1 message digest.  */
 int
-extract_blob_to_fd(struct blob_descriptor *blob, struct filedes *fd)
+extract_blob_to_fd(struct blob_descriptor *blob, struct filedes *fd,
+                  bool recover_data)
 {
        struct read_blob_callbacks cbs = {
                .continue_blob  = extract_blob_chunk_to_fd,
                .ctx            = fd,
        };
-       return read_blob_with_sha1(blob, &cbs);
+       return read_blob_with_sha1(blob, &cbs, recover_data);
 }
 
 /* Calculate the SHA-1 message digest of a blob and store it in @blob->hash.  */
@@ -1335,7 +1387,7 @@ sha1_blob(struct blob_descriptor *blob)
 {
        static const struct read_blob_callbacks cbs = {
        };
-       return read_blob_with_sha1(blob, &cbs);
+       return read_blob_with_sha1(blob, &cbs, false);
 }
 
 /*
index bc05d70779459c59871530c88cae5585d4ead1f5..1b088de959463e3aef96be15f4eafb4b5448c2b6 100644 (file)
@@ -488,7 +488,8 @@ read_win32_encrypted_file_prefix(const wchar_t *path, bool is_dir, u64 size,
  * described by @blob.  */
 int
 read_windows_file_prefix(const struct blob_descriptor *blob, u64 size,
-                        const struct consume_chunk_callback *cb)
+                        const struct consume_chunk_callback *cb,
+                        bool recover_data)
 {
        const struct windows_file *file = blob->windows_file;
 
index bca5e862a2f1e2549bb471c9a66855b4714f0216..e28069ebba707acd49ad53e5aae3c9ade3a01b8a 100644 (file)
@@ -767,7 +767,7 @@ write_blob_uncompressed(struct blob_descriptor *blob, struct filedes *out_fd)
        if (filedes_seek(out_fd, begin_offset) == -1)
                return 0;
 
-       ret = extract_blob_to_fd(blob, out_fd);
+       ret = extract_blob_to_fd(blob, out_fd, false);
        if (ret) {
                /* Error reading the uncompressed data.  */
                if (out_fd->offset == begin_offset &&
index fe8cdca0bf103ff1c2ed30503f38a868c0afed99..24503729d9ab931684581e2ce449da3b6cd811a8 100755 (executable)
@@ -142,6 +142,20 @@ mkdir in.dir out.dir
 
 . $srcdir/tests/common_tests.sh
 
+# Test the data recovery mode
+__msg "Testing data recovery mode"
+for file in corrupted_file_1.wim corrupted_file_2.wim; do
+       rm -rf out.dir
+       wimapply $srcdir/tests/wims/$file 1 out.dir 2>/dev/null && \
+               error "Applying $file in default mode unexpectedly succeeded"
+       rm -rf out.dir
+       wimapply --recover-data $srcdir/tests/wims/$file 1 out.dir || \
+               error "Applying $file in data recovery mode unexpectedly failed"
+       if [ ! -e out.dir/file ]; then
+               error "Recovered file not found"
+       fi
+done
+
 # Make sure exclusion list works
 __msg "Testing default capture configuration file"
 touch in.dir/hiberfil.sys
index 07cfc6bbb85d62d32518ed71a6d6e7c27409ef9d..e766f12002a230f5a9487e94e6165bc089810b44 100644 (file)
@@ -1,5 +1,9 @@
 Some fun files:
 
+corrupted_file_1.wim:  This WIM contains a file whose SHA-1 digest doesn't match.
+
+corrupted_file_2.wim:  This WIM contains a file that fails to decompress.
+
 cyclic.wim:  This WIM has an image with a cyclic directory structure and should be
 detected as invalid.
 
diff --git a/tests/wims/corrupted_file_1.wim b/tests/wims/corrupted_file_1.wim
new file mode 100644 (file)
index 0000000..4bb087e
Binary files /dev/null and b/tests/wims/corrupted_file_1.wim differ
diff --git a/tests/wims/corrupted_file_2.wim b/tests/wims/corrupted_file_2.wim
new file mode 100644 (file)
index 0000000..68b5854
Binary files /dev/null and b/tests/wims/corrupted_file_2.wim differ