/*
 * scan.c - Helper routines for directory tree scans
 */

/*
 * Copyright (C) 2013-2017 Eric Biggers
 *
 * This file is free software; you can redistribute it and/or modify it under
 * the terms of the GNU Lesser General Public License as published by the Free
 * Software Foundation; either version 3 of the License, or (at your option) any
 * later version.
 *
 * This file is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
 * details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this file; if not, see https://www.gnu.org/licenses/.
 */

#include "wimlib/blob_table.h"
#include "wimlib/dentry.h"
#include "wimlib/error.h"
#include "wimlib/paths.h"
#include "wimlib/pattern.h"
#include "wimlib/progress.h"
#include "wimlib/scan.h"
#include "wimlib/textfile.h"

38 * Tally a file (or directory) that has been scanned for a capture operation,
39 * and possibly call the progress function provided by the library user.
42 * Current path, flags, optional progress function, and progress data for
45 * Status of the scanned file.
47 * If @status is WIMLIB_SCAN_DENTRY_OK, this is a pointer to the WIM inode
48 * that has been created for the scanned file. The first time the file is
49 * seen, inode->i_nlink will be 1. On subsequent visits of the same inode
50 * via additional hard links, inode->i_nlink will be greater than 1.
53 do_scan_progress(struct scan_params *params, int status,
54 const struct wim_inode *inode)
60 case WIMLIB_SCAN_DENTRY_OK:
61 if (!(params->add_flags & WIMLIB_ADD_FLAG_VERBOSE))
64 case WIMLIB_SCAN_DENTRY_UNSUPPORTED:
65 case WIMLIB_SCAN_DENTRY_EXCLUDED:
66 case WIMLIB_SCAN_DENTRY_FIXED_SYMLINK:
67 case WIMLIB_SCAN_DENTRY_NOT_FIXED_SYMLINK:
68 if (!(params->add_flags & WIMLIB_ADD_FLAG_EXCLUDE_VERBOSE))
72 params->progress.scan.cur_path = params->cur_path;
73 params->progress.scan.status = status;
74 if (status == WIMLIB_SCAN_DENTRY_OK) {
76 /* The first time the inode is seen, tally all its streams. */
77 if (inode->i_nlink == 1) {
78 for (unsigned i = 0; i < inode->i_num_streams; i++) {
79 const struct blob_descriptor *blob =
80 stream_blob_resolved(&inode->i_streams[i]);
82 params->progress.scan.num_bytes_scanned += blob->size;
86 /* Tally the file itself, counting every hard link. It's
87 * debatable whether every link should be counted, but counting
88 * every link makes the statistics consistent with the ones
89 * placed in the FILECOUNT and DIRCOUNT elements of the WIM
90 * file's XML document. It also avoids possible user confusion
91 * if the number of files reported were to be lower than that
92 * displayed by some other software such as file browsers. */
93 if (inode_is_directory(inode))
94 params->progress.scan.num_dirs_scanned++;
96 params->progress.scan.num_nondirs_scanned++;
99 /* Call the user-provided progress function. */
101 cookie = progress_get_win32_path(params->progress.scan.cur_path);
102 ret = call_progress(params->progfunc, WIMLIB_PROGRESS_MSG_SCAN_DENTRY,
103 ¶ms->progress, params->progctx);
104 progress_put_win32_path(cookie);
109 * Given a null-terminated pathname pattern @pat that has been read from line
110 * @line_no of the file @path, validate and canonicalize the pattern.
112 * On success, returns 0.
113 * On failure, returns WIMLIB_ERR_INVALID_CAPTURE_CONFIG.
114 * In either case, @pat may have been modified in-place (and possibly
118 mangle_pat(tchar *pat, const tchar *path, unsigned long line_no)
120 if (!is_any_path_separator(pat[0]) &&
121 pat[0] != T('\0') && pat[1] == T(':'))
123 /* Pattern begins with drive letter. */
125 if (!is_any_path_separator(pat[2])) {
126 /* Something like c:file, which is actually a path
127 * relative to the current working directory on the c:
128 * drive. We require paths with drive letters to be
130 ERROR("%"TS":%lu: Invalid pattern \"%"TS"\":\n"
131 " Patterns including drive letters must be absolute!\n"
132 " Maybe try \"%"TC":%"TC"%"TS"\"?\n",
134 pat[0], OS_PREFERRED_PATH_SEPARATOR, &pat[2]);
135 return WIMLIB_ERR_INVALID_CAPTURE_CONFIG;
138 WARNING("%"TS":%lu: Pattern \"%"TS"\" starts with a drive "
139 "letter, which is being removed.",
142 /* Strip the drive letter. */
143 tmemmove(pat, pat + 2, tstrlen(pat + 2) + 1);
146 /* Collapse consecutive path separators, and translate both / and \ into
147 * / (UNIX) or \ (Windows).
149 * Note: we expect that this function produces patterns that can be used
150 * for both filesystem paths and WIM paths, so the desired path
151 * separators must be the same. */
152 STATIC_ASSERT(OS_PREFERRED_PATH_SEPARATOR == WIM_PATH_SEPARATOR);
153 do_canonicalize_path(pat, pat);
155 /* Relative patterns can only match file names, so they must be
156 * single-component only. */
157 if (pat[0] != OS_PREFERRED_PATH_SEPARATOR &&
158 tstrchr(pat, OS_PREFERRED_PATH_SEPARATOR))
160 ERROR("%"TS":%lu: Invalid pattern \"%"TS"\":\n"
161 " Relative patterns can only include one path component!\n"
162 " Maybe try \"%"TC"%"TS"\"?",
163 path, line_no, pat, OS_PREFERRED_PATH_SEPARATOR, pat);
164 return WIMLIB_ERR_INVALID_CAPTURE_CONFIG;
171 * Read, parse, and validate a capture configuration file from either an on-disk
172 * file or an in-memory buffer.
174 * To read from a file, specify @config_file, and use NULL for @buf.
175 * To read from a buffer, specify @buf and @bufsize.
177 * @config must be initialized to all 0's.
179 * On success, 0 will be returned, and the resulting capture configuration will
180 * be stored in @config.
182 * On failure, a positive error code will be returned, and the contents of
183 * @config will be invalidated.
186 read_capture_config(const tchar *config_file, const void *buf,
187 size_t bufsize, struct capture_config *config)
191 /* [PrepopulateList] is used for apply, not capture. But since we do
192 * understand it, recognize it, thereby avoiding the unrecognized
193 * section warning, but discard the resulting strings.
195 * We currently ignore [CompressionExclusionList] and
196 * [CompressionFolderList]. This is a known issue that doesn't seem to
197 * have any real consequences, so don't issue warnings about not
198 * recognizing those sections. */
199 STRING_LIST(prepopulate_pats);
200 STRING_LIST(compression_exclusion_pats);
201 STRING_LIST(compression_folder_pats);
203 struct text_file_section sections[] = {
205 &config->exclusion_pats},
206 {T("ExclusionException"),
207 &config->exclusion_exception_pats},
208 {T("PrepopulateList"),
210 {T("CompressionExclusionList"),
211 &compression_exclusion_pats},
212 {T("CompressionFolderList"),
213 &compression_folder_pats},
217 ret = load_text_file(config_file, buf, bufsize, &mem,
218 sections, ARRAY_LEN(sections),
219 LOAD_TEXT_FILE_REMOVE_QUOTES, mangle_pat);
221 ERROR("Failed to load capture configuration file \"%"TS"\"",
224 case WIMLIB_ERR_INVALID_UTF8_STRING:
225 case WIMLIB_ERR_INVALID_UTF16_STRING:
226 ERROR("Note: the capture configuration file must be "
227 "valid UTF-8 or UTF-16LE");
228 ret = WIMLIB_ERR_INVALID_CAPTURE_CONFIG;
230 case WIMLIB_ERR_OPEN:
231 case WIMLIB_ERR_STAT:
232 case WIMLIB_ERR_NOMEM:
233 case WIMLIB_ERR_READ:
234 ret = WIMLIB_ERR_UNABLE_TO_READ_CAPTURE_CONFIG;
240 FREE(prepopulate_pats.strings);
241 FREE(compression_exclusion_pats.strings);
242 FREE(compression_folder_pats.strings);
249 destroy_capture_config(struct capture_config *config)
251 FREE(config->exclusion_pats.strings);
252 FREE(config->exclusion_exception_pats.strings);
257 * Determine whether @path matches any of the patterns in @list.
258 * Path separators in @path must be WIM_PATH_SEPARATOR.
261 match_pattern_list(const tchar *path, const struct string_list *list,
264 for (size_t i = 0; i < list->num_strings; i++)
265 if (match_path(path, list->strings[i], match_flags))
271 * Determine if a file should be excluded from capture.
273 * This function tests exclusions from both possible sources of exclusions:
275 * (1) The capture configuration file
276 * (2) The user-provided progress function
278 * params->root_path_nchars must have been set beforehand. Example for UNIX: if
279 * the capture root directory is "foobar/subdir", then all paths will be
280 * provided starting with "foobar/subdir", so params->root_path_nchars must have
281 * been set to strlen("foobar/subdir") so that the appropriate path suffix can
282 * be matched against the patterns in the exclusion list.
286 * = 0 if not excluded and no error
287 * > 0 (wimlib error code) if error
290 try_exclude(const struct scan_params *params)
294 if (params->config) {
295 const tchar *path = params->cur_path + params->root_path_nchars;
296 if (match_pattern_list(path, ¶ms->config->exclusion_pats,
297 MATCH_RECURSIVELY) &&
298 !match_pattern_list(path, ¶ms->config->exclusion_exception_pats,
299 MATCH_RECURSIVELY | MATCH_ANCESTORS))
303 if (unlikely(params->add_flags & WIMLIB_ADD_FLAG_TEST_FILE_EXCLUSION)) {
305 union wimlib_progress_info info;
308 info.test_file_exclusion.path = params->cur_path;
309 info.test_file_exclusion.will_exclude = false;
311 cookie = progress_get_win32_path(info.test_file_exclusion.path);
313 ret = call_progress(params->progfunc, WIMLIB_PROGRESS_MSG_TEST_FILE_EXCLUSION,
314 &info, params->progctx);
316 progress_put_win32_path(cookie);
320 if (info.test_file_exclusion.will_exclude)
328 * Determine whether a directory entry of the specified name should be ignored.
329 * This is a lower level function which runs prior to try_exclude(). It handles
330 * the standard '.' and '..' entries, which show up in directory listings but
331 * should not be archived. It also checks for odd filenames that usually should
332 * not exist but could cause problems if archiving them were to be attempted.
335 should_ignore_filename(const tchar *name, const int name_nchars)
337 if (name_nchars <= 0) {
338 WARNING("Ignoring empty filename");
342 if (name[0] == T('.') &&
343 (name_nchars == 1 || (name_nchars == 2 && name[1] == T('.'))))
346 for (int i = 0; i < name_nchars; i++) {
347 if (name[i] == T('\0')) {
348 WARNING("Ignoring filename containing embedded null character");
351 if (name[i] == OS_PREFERRED_PATH_SEPARATOR) {
352 WARNING("Ignoring filename containing embedded path separator");
360 /* Attach a newly scanned directory tree to its parent directory, with duplicate
363 attach_scanned_tree(struct wim_dentry *parent, struct wim_dentry *child,
364 struct blob_table *blob_table)
366 struct wim_dentry *duplicate;
368 if (child && (duplicate = dentry_add_child(parent, child))) {
369 WARNING("Duplicate file path: \"%"TS"\". Only capturing "
370 "the first version.", dentry_full_path(duplicate));
371 free_dentry_tree(child, blob_table);
375 /* Set the path at which the directory tree scan is beginning. */
377 pathbuf_init(struct scan_params *params, const tchar *root_path)
379 size_t nchars = tstrlen(root_path);
380 size_t alloc_nchars = nchars + 1 + 1024;
382 params->cur_path = MALLOC(alloc_nchars * sizeof(tchar));
383 if (!params->cur_path)
384 return WIMLIB_ERR_NOMEM;
385 tmemcpy(params->cur_path, root_path, nchars + 1);
386 params->cur_path_nchars = nchars;
387 params->cur_path_alloc_nchars = alloc_nchars;
388 params->root_path_nchars = nchars;
393 * Append a filename to the current path.
395 * If successful, returns a pointer to the filename component and sets
396 * *orig_path_nchars_ret to the old path length, which can be restored later
397 * using pathbuf_truncate(). Otherwise returns NULL (out of memory).
400 pathbuf_append_name(struct scan_params *params, const tchar *name,
401 size_t name_nchars, size_t *orig_path_nchars_ret)
403 size_t path_nchars = params->cur_path_nchars;
404 size_t required_nchars = path_nchars + 1 + name_nchars + 1;
405 tchar *buf = params->cur_path;
407 if (unlikely(required_nchars > params->cur_path_alloc_nchars)) {
408 required_nchars += 1024;
409 buf = REALLOC(buf, required_nchars * sizeof(tchar));
412 params->cur_path = buf;
413 params->cur_path_alloc_nchars = required_nchars;
415 *orig_path_nchars_ret = path_nchars;
418 * Add the slash, but not if it will be a duplicate (which can happen if
419 * the path to the capture root directory ends in a slash), because
420 * on Windows duplicate slashes sometimes don't work as expected.
422 if (path_nchars && buf[path_nchars - 1] != OS_PREFERRED_PATH_SEPARATOR)
423 buf[path_nchars++] = OS_PREFERRED_PATH_SEPARATOR;
425 tmemcpy(&buf[path_nchars], name, name_nchars);
426 path_nchars += name_nchars;
427 buf[path_nchars] = T('\0');
428 params->cur_path_nchars = path_nchars;
429 return &buf[path_nchars - name_nchars];
432 /* Truncate the current path to the specified number of characters. */
434 pathbuf_truncate(struct scan_params *params, size_t nchars)
436 wimlib_assert(nchars <= params->cur_path_nchars);
437 params->cur_path[nchars] = T('\0');
438 params->cur_path_nchars = nchars;