wimlib.net Git - wimlib/blob - src/textfile.c

   1 /*
   2  * textfile.c
   3  */
   4
   5 /*
   6  * Copyright (C) 2014 Eric Biggers
   7  *
   8  * This file is free software; you can redistribute it and/or modify it under
   9  * the terms of the GNU Lesser General Public License as published by the Free
  10  * Software Foundation; either version 3 of the License, or (at your option) any
  11  * later version.
  12  *
  13  * This file is distributed in the hope that it will be useful, but WITHOUT
  14  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  15  * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
  16  * details.
  17  *
  18  * You should have received a copy of the GNU Lesser General Public License
  19  * along with this file; if not, see https://www.gnu.org/licenses/.
  20  */
  21
  22 #ifdef HAVE_CONFIG_H
  23 #  include "config.h"
  24 #endif
  25
  26 #include <ctype.h>
  27 #include <errno.h>
  28 #include <fcntl.h>
  29 #include <string.h>
  30 #include <sys/stat.h>
  31 #include <unistd.h>
  32
  33 #include "wimlib/encoding.h"
  34 #include "wimlib/error.h"
  35 #include "wimlib/file_io.h"
  36 #include "wimlib/textfile.h"
  37 #include "wimlib/util.h"
  38
  39 static int
  40 stdin_get_contents(void **buf_ret, size_t *bufsize_ret)
  41 {
  42         char *buf = NULL;
  43         size_t filled = 0;
  44         size_t capacity = 0;
  45
  46         do {
  47                 size_t new_capacity = (capacity * 2) + 256;
  48                 char *new_buf;
  49
  50                 if (new_capacity <= capacity ||
  51                     !(new_buf = REALLOC(buf, new_capacity))) {
  52                         ERROR("Too much data sent on stdin!");
  53                         FREE(buf);
  54                         return WIMLIB_ERR_INVALID_PARAM;
  55                 }
  56                 buf = new_buf;
  57                 capacity = new_capacity;
  58                 filled += fread(&buf[filled], 1, capacity - filled, stdin);
  59         } while (filled == capacity);
  60
  61         if (!feof(stdin)) {
  62                 ERROR_WITH_ERRNO("Error reading stdin");
  63                 FREE(buf);
  64                 return WIMLIB_ERR_READ;
  65         }
  66         *buf_ret = buf;
  67         *bufsize_ret = filled;
  68         return 0;
  69 }
  70
  71 static int
  72 read_file_contents(const tchar *path, void **buf_ret, size_t *bufsize_ret)
  73 {
  74         int raw_fd;
  75         struct filedes fd;
  76         struct stat st;
  77         void *buf;
  78         int ret;
  79         int errno_save;
  80
  81         raw_fd = topen(path, O_RDONLY | O_BINARY);
  82         if (raw_fd < 0) {
  83                 ERROR_WITH_ERRNO("Can't open \"%"TS"\"", path);
  84                 return WIMLIB_ERR_OPEN;
  85         }
  86         if (fstat(raw_fd, &st)) {
  87                 ERROR_WITH_ERRNO("Can't stat \"%"TS"\"", path);
  88                 close(raw_fd);
  89                 return WIMLIB_ERR_STAT;
  90         }
  91         if ((size_t)st.st_size != st.st_size ||
  92             (buf = MALLOC(st.st_size)) == NULL)
  93         {
  94                 close(raw_fd);
  95                 ERROR("Not enough memory to read \"%"TS"\"", path);
  96                 return WIMLIB_ERR_NOMEM;
  97         }
  98
  99         filedes_init(&fd, raw_fd);
 100         ret = full_read(&fd, buf, st.st_size);
 101         errno_save = errno;
 102         filedes_close(&fd);
 103         errno = errno_save;
 104         if (ret) {
 105                 ERROR_WITH_ERRNO("Error reading \"%"TS"\"", path);
 106                 FREE(buf);
 107                 return ret;
 108         }
 109
 110         *buf_ret = buf;
 111         *bufsize_ret = st.st_size;
 112         return 0;
 113 }
 114
 115 static int
 116 translate_text_buffer(const u8 *buf_raw, size_t bufsize_raw,
 117                       tchar **tstr_ret, size_t *tstr_nchars_ret)
 118 {
 119         size_t offset_raw;
 120         bool utf8;
 121         tchar *buf_tstr;
 122         size_t bufsize_tstr;
 123         int ret;
 124
 125         /* Guess the encoding: UTF-8 or UTF-16LE.  (Something weirder and you're
 126          * out of luck, sorry...)  */
 127         if (bufsize_raw >= 2 &&
 128             buf_raw[0] == 0xFF &&
 129             buf_raw[1] == 0xFE)
 130         {
 131                 utf8 = false;
 132                 offset_raw = 2;
 133         }
 134         else if (bufsize_raw >= 2 &&
 135                  buf_raw[0] <= 0x7F &&
 136                  buf_raw[1] == 0x00)
 137         {
 138                 utf8 = false;
 139                 offset_raw = 0;
 140         }
 141         else if (bufsize_raw >= 3 &&
 142                  buf_raw[0] == 0xEF &&
 143                  buf_raw[1] == 0xBB &&
 144                  buf_raw[2] == 0xBF)
 145         {
 146                 utf8 = true;
 147                 offset_raw = 3;
 148         }
 149         else
 150         {
 151                 utf8 = true;
 152                 offset_raw = 0;
 153         }
 154
 155         if (utf8) {
 156                 ret = utf8_to_tstr((const char *)(buf_raw + offset_raw),
 157                                    bufsize_raw - offset_raw,
 158                                    &buf_tstr, &bufsize_tstr);
 159         } else {
 160                 ret = utf16le_to_tstr((const utf16lechar *)(buf_raw + offset_raw),
 161                                       bufsize_raw - offset_raw,
 162                                       &buf_tstr, &bufsize_tstr);
 163         }
 164         if (ret)
 165                 return ret;
 166
 167         *tstr_ret = buf_tstr;
 168         *tstr_nchars_ret = bufsize_tstr / sizeof(tchar);
 169         return 0;
 170 }
 171
 172 static int
 173 string_list_append(struct string_list *list, tchar *str)
 174 {
 175         size_t num_alloc_strings = list->num_alloc_strings;
 176
 177         if (list->num_strings == num_alloc_strings) {
 178                 tchar **new_strings;
 179
 180                 num_alloc_strings = max(num_alloc_strings * 3 / 2,
 181                                         num_alloc_strings + 4);
 182                 new_strings = REALLOC(list->strings,
 183                                       sizeof(list->strings[0]) * num_alloc_strings);
 184                 if (!new_strings)
 185                         return WIMLIB_ERR_NOMEM;
 186                 list->strings = new_strings;
 187                 list->num_alloc_strings = num_alloc_strings;
 188         }
 189         list->strings[list->num_strings++] = str;
 190         return 0;
 191 }
 192
 193 #define NOT_IN_SECTION          -1
 194 #define IN_UNKNOWN_SECTION      -2
 195
 196 static int
 197 parse_text_file(const tchar *path, tchar *buf, size_t buflen,
 198                 const struct text_file_section *pos_sections,
 199                 int num_pos_sections, int flags, line_mangle_t mangle_line)
 200 {
 201         int current_section = NOT_IN_SECTION;
 202         bool have_named_sections = false;
 203         tchar *p;
 204         tchar *nl;
 205         unsigned long line_no = 1;
 206
 207         for (int i = 0; i < num_pos_sections; i++) {
 208                 if (*pos_sections[i].name)
 209                         have_named_sections = true;
 210                 else
 211                         current_section = i;
 212         }
 213
 214         for (p = buf; p != buf + buflen; p = nl + 1, line_no++) {
 215                 tchar *line_begin, *line_end;
 216                 size_t line_len;
 217                 int ret;
 218
 219                 nl = tmemchr(p, T('\n'), buf + buflen - p);
 220                 if (!nl)
 221                         break;
 222
 223                 line_begin = p;
 224                 line_end = nl;
 225
 226                 /* Ignore leading whitespace.  */
 227                 while (line_begin < nl && istspace(*line_begin))
 228                         line_begin++;
 229
 230                 /* Ignore trailing whitespace.  */
 231                 while (line_end > line_begin && istspace(*(line_end - 1)))
 232                         line_end--;
 233
 234                 line_len = line_end - line_begin;
 235
 236                 /* Ignore comments and empty lines.  */
 237                 if (line_len == 0 || *line_begin == T(';') || *line_begin == T('#'))
 238                         continue;
 239
 240                 line_begin[line_len] = T('\0');
 241
 242                 /* Check for beginning of new section.  */
 243                 if (line_begin[0] == T('[') &&
 244                     line_begin[line_len - 1] == T(']') &&
 245                     have_named_sections)
 246                 {
 247                         line_begin[line_len - 1] = T('\0');
 248                         current_section = IN_UNKNOWN_SECTION;
 249                         for (int i = 0; i < num_pos_sections; i++) {
 250                                 if (!tstrcmp(line_begin + 1,
 251                                              pos_sections[i].name))
 252                                 {
 253                                         current_section = i;
 254                                         break;
 255                                 }
 256                         }
 257                         line_begin[line_len - 1] = T(']');
 258                         if (current_section < 0) {
 259                                 if (!(flags & LOAD_TEXT_FILE_NO_WARNINGS)) {
 260                                         WARNING("%"TS":%lu: Unrecognized section \"%"TS"\"",
 261                                                 path, line_no, line_begin);
 262                                 }
 263                         }
 264                         continue;
 265                 }
 266
 267                 if (current_section < 0) {
 268                         if (current_section == NOT_IN_SECTION) {
 269                                 if (!(flags & LOAD_TEXT_FILE_NO_WARNINGS)) {
 270                                         WARNING("%"TS":%lu: Not in a bracketed section!",
 271                                                 path, line_no);
 272                                 }
 273                         }
 274                         continue;
 275                 }
 276
 277                 if (flags & LOAD_TEXT_FILE_REMOVE_QUOTES) {
 278                         if (line_begin[0] == T('"') || line_begin[0] == T('\'')) {
 279                                 tchar quote = line_begin[0];
 280                                 if (line_len >= 2 &&
 281                                     line_begin[line_len - 1] == quote)
 282                                 {
 283                                         line_begin++;
 284                                         line_len -= 2;
 285                                         line_begin[line_len] = T('\0');
 286                                 }
 287                         }
 288                 }
 289
 290                 if (mangle_line) {
 291                         ret = (*mangle_line)(line_begin, path, line_no);
 292                         if (ret)
 293                                 return ret;
 294                 }
 295
 296                 ret = string_list_append(pos_sections[current_section].strings,
 297                                          line_begin);
 298                 if (ret)
 299                         return ret;
 300         }
 301         return 0;
 302 }
 303
 304 /**
 305  * load_text_file -
 306  *
 307  * Read and parse lines from a text file given as an on-disk file, standard
 308  * input, or a buffer.  The file may contain sections, like in an INI file.
 309  *
 310  * @path
 311  *      If @buf is NULL, then either the path to the file on-disk to read, or
 312  *      NULL to read from standard input.  Else, a dummy name for the buffer.
 313  * @buf
 314  *      If NULL, the data will be read from the @path file.  Otherwise the data
 315  *      will be read from this buffer.
 316  * @bufsize
 317  *      Length of buffer in bytes; ignored if @buf is NULL.
 318  * @mem_ret
 319  *      On success, a pointer to a buffer backing the parsed lines is stored
 320  *      here.  This must be freed after the parsed lines are done being used.
 321  * @pos_sections
 322  *      Specifications of allowed sections in the file.  Each such specification
 323  *      consists of the name of the section (e.g. [ExclusionList], like in the
 324  *      INI file format), along with a pointer to the list of lines parsed for
 325  *      that section.  Use an empty name to indicate the destination of lines
 326  *      not in any section.  Each list must be initialized to empty.
 327  * @num_pos_sections
 328  *      Number of entries in the @pos_sections array.
 329  * @flags
 330  *      Flags: LOAD_TEXT_FILE_* flags.
 331  * @mangle_line
 332  *      Optional callback to validate and/or modify each line being read.
 333  *
 334  * Returns 0 on success; nonzero on failure.
 335  *
 336  * Unknown sections are ignored, but a warning is printed for each, unless
 337  * LOAD_TEXT_FILE_NO_WARNINGS is specified.
 338  */
 339 int
 340 load_text_file(const tchar *path, const void *buf, size_t bufsize,
 341                void **mem_ret,
 342                const struct text_file_section *pos_sections,
 343                int num_pos_sections,
 344                int flags, line_mangle_t mangle_line)
 345 {
 346         int ret;
 347         bool is_filemode = (buf == NULL);
 348         bool is_stdin = (is_filemode && path == NULL);
 349         tchar *tstr;
 350         size_t tstr_nchars;
 351
 352         if (is_stdin && !(flags & LOAD_TEXT_FILE_ALLOW_STDIN))
 353                 return WIMLIB_ERR_INVALID_PARAM;
 354
 355         if (is_filemode) {
 356                 if (is_stdin)
 357                         ret = stdin_get_contents((void **)&buf, &bufsize);
 358                 else
 359                         ret = read_file_contents(path, (void **)&buf, &bufsize);
 360                 if (ret)
 361                         return ret;
 362         }
 363
 364         ret = translate_text_buffer(buf, bufsize, &tstr, &tstr_nchars);
 365         if (is_filemode)
 366                 FREE((void *)buf);
 367         if (ret)
 368                 return ret;
 369
 370         tstr[tstr_nchars++] = T('\n');
 371
 372         ret = parse_text_file(is_stdin ? T("<stdin>") : path,
 373                               tstr, tstr_nchars, pos_sections,
 374                               num_pos_sections, flags, mangle_line);
 375         if (ret) {
 376                 for (int i = 0; i < num_pos_sections; i++)
 377                         FREE(pos_sections[i].strings->strings);
 378                 FREE(tstr);
 379                 return ret;
 380         }
 381
 382         *mem_ret = tstr;
 383         return 0;
 384 }
 385
 386 /* API function documented in wimlib.h */
 387 WIMLIBAPI int
 388 wimlib_load_text_file(const tchar *path,
 389                       tchar **tstr_ret, size_t *tstr_nchars_ret)
 390 {
 391         void *buf;
 392         size_t bufsize;
 393         int ret;
 394
 395         if (path == NULL || (path[0] == '-' && path[1] == '\0'))
 396                 ret = stdin_get_contents(&buf, &bufsize);
 397         else
 398                 ret = read_file_contents(path, &buf, &bufsize);
 399         if (ret)
 400                 return ret;
 401         ret = translate_text_buffer(buf, bufsize, tstr_ret, tstr_nchars_ret);
 402         FREE(buf);
 403         return ret;
 404 }