wimlib.net Git - wimlib/blob - src/textfile.c

   1 /*
   2  * textfile.c
   3  */
   4
   5 /*
   6  * Copyright (C) 2014 Eric Biggers
   7  *
   8  * This file is part of wimlib, a library for working with WIM files.
   9  *
  10  * wimlib is free software; you can redistribute it and/or modify it under the
  11  * terms of the GNU General Public License as published by the Free
  12  * Software Foundation; either version 3 of the License, or (at your option)
  13  * any later version.
  14  *
  15  * wimlib is distributed in the hope that it will be useful, but WITHOUT ANY
  16  * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
  17  * A PARTICULAR PURPOSE. See the GNU General Public License for more
  18  * details.
  19  *
  20  * You should have received a copy of the GNU General Public License
  21  * along with wimlib; if not, see http://www.gnu.org/licenses/.
  22  */
  23
  24 #ifdef HAVE_CONFIG_H
  25 #  include "config.h"
  26 #endif
  27
  28 #include "wimlib/assert.h"
  29 #include "wimlib/encoding.h"
  30 #include "wimlib/error.h"
  31 #include "wimlib/file_io.h"
  32 #include "wimlib/textfile.h"
  33 #include "wimlib/util.h"
  34
  35 #include <ctype.h>
  36 #include <errno.h>
  37 #include <fcntl.h>
  38 #include <string.h>
  39 #include <sys/stat.h>
  40 #include <unistd.h>
  41
  42 static int
  43 read_file_contents(const tchar *path, void **buf_ret, size_t *bufsize_ret)
  44 {
  45         int raw_fd;
  46         struct filedes fd;
  47         struct stat st;
  48         void *buf;
  49         int ret;
  50         int errno_save;
  51
  52         if (!path || !*path)
  53                 return WIMLIB_ERR_INVALID_PARAM;
  54
  55         raw_fd = topen(path, O_RDONLY | O_BINARY);
  56         if (raw_fd < 0) {
  57                 ERROR_WITH_ERRNO("Can't open \"%"TS"\"", path);
  58                 return WIMLIB_ERR_OPEN;
  59         }
  60         if (fstat(raw_fd, &st)) {
  61                 ERROR_WITH_ERRNO("Can't stat \"%"TS"\"", path);
  62                 close(raw_fd);
  63                 return WIMLIB_ERR_STAT;
  64         }
  65         if ((size_t)st.st_size != st.st_size ||
  66             (buf = MALLOC(st.st_size)) == NULL)
  67         {
  68                 close(raw_fd);
  69                 ERROR("Not enough memory to read \"%"TS"\"", path);
  70                 return WIMLIB_ERR_NOMEM;
  71         }
  72
  73         filedes_init(&fd, raw_fd);
  74         ret = full_read(&fd, buf, st.st_size);
  75         errno_save = errno;
  76         filedes_close(&fd);
  77         errno = errno_save;
  78         if (ret) {
  79                 ERROR_WITH_ERRNO("Error reading \"%"TS"\"", path);
  80                 FREE(buf);
  81                 return ret;
  82         }
  83
  84         *buf_ret = buf;
  85         *bufsize_ret = st.st_size;
  86         return 0;
  87 }
  88
  89 static int
  90 translate_text_buffer(const u8 *buf_raw, size_t bufsize_raw,
  91                       tchar **tstr_ret, size_t *tstr_nchars_ret)
  92 {
  93         size_t offset_raw;
  94         bool utf8;
  95         tchar *buf_tstr;
  96         size_t bufsize_tstr;
  97         int ret;
  98
  99         /* Guess the encoding: UTF-8 or UTF-16LE.  (Something weirder and you're
 100          * out of luck, sorry...)  */
 101         if (bufsize_raw >= 2 &&
 102             buf_raw[0] == 0xFF &&
 103             buf_raw[1] == 0xFE)
 104         {
 105                 utf8 = false;
 106                 offset_raw = 2;
 107         }
 108         else if (bufsize_raw >= 2 &&
 109                  buf_raw[0] <= 0x7F &&
 110                  buf_raw[1] == 0x00)
 111         {
 112                 utf8 = false;
 113                 offset_raw = 0;
 114         }
 115         else if (bufsize_raw >= 3 &&
 116                  buf_raw[0] == 0xEF &&
 117                  buf_raw[1] == 0xBB &&
 118                  buf_raw[2] == 0xBF)
 119         {
 120                 utf8 = true;
 121                 offset_raw = 3;
 122         }
 123         else
 124         {
 125                 utf8 = true;
 126                 offset_raw = 0;
 127         }
 128
 129         if (utf8) {
 130                 ret = utf8_to_tstr((const char *)(buf_raw + offset_raw),
 131                                    bufsize_raw - offset_raw,
 132                                    &buf_tstr, &bufsize_tstr);
 133         } else {
 134         #if TCHAR_IS_UTF16LE
 135                 bufsize_tstr = bufsize_raw - offset_raw;
 136                 buf_tstr = MALLOC(bufsize_tstr + 2);
 137                 if (buf_tstr) {
 138                         memcpy(buf_tstr, buf_raw + offset_raw, bufsize_tstr);
 139                         ((u8*)buf_tstr)[bufsize_tstr + 0] = 0;
 140                         ((u8*)buf_tstr)[bufsize_tstr + 1] = 0;
 141                         ret = 0;
 142                 } else {
 143                         ret = WIMLIB_ERR_NOMEM;
 144                 }
 145         #else
 146                 ret = utf16le_to_tstr((const utf16lechar *)(buf_raw + offset_raw),
 147                                       bufsize_raw - offset_raw,
 148                                       &buf_tstr, &bufsize_tstr);
 149         #endif
 150         }
 151         if (ret)
 152                 return ret;
 153
 154         *tstr_ret = buf_tstr;
 155         *tstr_nchars_ret = bufsize_tstr / sizeof(tchar);
 156         return 0;
 157 }
 158
 159 static int
 160 string_set_append(struct string_set *set, tchar *str)
 161 {
 162         size_t num_alloc_strings = set->num_alloc_strings;
 163
 164         if (set->num_strings == num_alloc_strings) {
 165                 tchar **new_strings;
 166
 167                 num_alloc_strings = max(num_alloc_strings * 3 / 2,
 168                                         num_alloc_strings + 4);
 169                 new_strings = REALLOC(set->strings,
 170                                       sizeof(set->strings[0]) * num_alloc_strings);
 171                 if (!new_strings)
 172                         return WIMLIB_ERR_NOMEM;
 173                 set->strings = new_strings;
 174                 set->num_alloc_strings = num_alloc_strings;
 175         }
 176         set->strings[set->num_strings++] = str;
 177         return 0;
 178 }
 179
 180 #define NOT_IN_SECTION          -1
 181 #define IN_UNKNOWN_SECTION      -2
 182
 183 static int
 184 parse_text_file(const tchar *path, tchar *buf, size_t buflen,
 185                 const struct text_file_section *pos_sections,
 186                 int num_pos_sections, int flags, line_mangle_t mangle_line)
 187 {
 188         int current_section = NOT_IN_SECTION;
 189         bool have_named_sections = false;
 190         tchar *p;
 191         tchar *nl;
 192         unsigned long line_no = 1;
 193
 194         for (int i = 0; i < num_pos_sections; i++) {
 195                 if (*pos_sections[i].name)
 196                         have_named_sections = true;
 197                 else
 198                         current_section = i;
 199         }
 200
 201         for (p = buf; p != buf + buflen; p = nl + 1, line_no++) {
 202                 tchar *line_begin, *line_end;
 203                 size_t line_len;
 204                 int ret;
 205
 206                 nl = tmemchr(p, T('\n'), buf + buflen - p);
 207                 if (!nl)
 208                         break;
 209
 210                 line_begin = p;
 211                 line_end = nl;
 212
 213                 /* Ignore leading whitespace.  */
 214                 while (line_begin < nl && istspace(*line_begin))
 215                         line_begin++;
 216
 217                 /* Ignore trailing whitespace.  */
 218                 while (line_end > line_begin && istspace(*(line_end - 1)))
 219                         line_end--;
 220
 221                 line_len = line_end - line_begin;
 222
 223                 /* Ignore comments and empty lines.  */
 224                 if (line_len == 0 || *line_begin == T(';') || *line_begin == T('#'))
 225                         continue;
 226
 227                 line_begin[line_len] = T('\0');
 228
 229                 /* Check for beginning of new section.  */
 230                 if (line_begin[0] == T('[') &&
 231                     line_begin[line_len - 1] == T(']') &&
 232                     have_named_sections)
 233                 {
 234                         line_begin[line_len - 1] = T('\0');
 235                         current_section = IN_UNKNOWN_SECTION;
 236                         for (int i = 0; i < num_pos_sections; i++) {
 237                                 if (!tstrcmp(line_begin + 1,
 238                                              pos_sections[i].name))
 239                                 {
 240                                         current_section = i;
 241                                         break;
 242                                 }
 243                         }
 244                         line_begin[line_len - 1] = T(']');
 245                         if (current_section < 0) {
 246                                 if (!(flags & LOAD_TEXT_FILE_NO_WARNINGS)) {
 247                                         WARNING("%"TS":%lu: Unrecognized section \"%"TS"\"",
 248                                                 path, line_no, line_begin);
 249                                 }
 250                         }
 251                         continue;
 252                 }
 253
 254                 if (current_section < 0) {
 255                         if (current_section == NOT_IN_SECTION) {
 256                                 if (!(flags & LOAD_TEXT_FILE_NO_WARNINGS)) {
 257                                         WARNING("%"TS":%lu: Not in a bracketed section!",
 258                                                 path, line_no);
 259                                 }
 260                         }
 261                         continue;
 262                 }
 263
 264                 if (flags & LOAD_TEXT_FILE_REMOVE_QUOTES) {
 265                         if (line_begin[0] == T('"') || line_begin[0] == T('\'')) {
 266                                 tchar quote = line_begin[0];
 267                                 if (line_len >= 2 &&
 268                                     line_begin[line_len - 1] == quote)
 269                                 {
 270                                         line_begin++;
 271                                         line_len -= 2;
 272                                         line_begin[line_len] = T('\0');
 273                                 }
 274                         }
 275                 }
 276
 277                 if (mangle_line) {
 278                         ret = (*mangle_line)(line_begin, path, line_no);
 279                         if (ret)
 280                                 return ret;
 281                 }
 282
 283                 ret = string_set_append(pos_sections[current_section].strings,
 284                                         line_begin);
 285                 if (ret)
 286                         return ret;
 287         }
 288         return 0;
 289 }
 290
 291 /**
 292  * do_load_text_file -
 293  *
 294  * Read and parse lines from a text file from an on-disk file or a buffer.
 295  * The file may contain sections, like in an INI file.
 296  *
 297  * @path
 298  *      Path to the file on disk to read, or a dummy name for the buffer.
 299  * @buf
 300  *      If NULL, the data will be read from the @path file.  Otherwise the data
 301  *      will be read from this buffer, which must be newline-terminated.
 302  * @buflen
 303  *      Length of buffer in bytes; ignored if @buf is NULL.
 304  * @buf_ret
 305  *      On success, a pointer to a buffer backing the parsed lines is stored
 306  *      here.  If @buf is not NULL, this will be @buf.  Otherwise, this will be
 307  *      an allocated buffer that must be freed when finished with the lines.
 308  * @pos_sections
 309  *      Specifications of allowed sections in the file.  Each such specification
 310  *      consists of the name of the section (e.g. [ExclusionList], like in the
 311  *      INI file format), along with a pointer to the list of lines parsed for
 312  *      that section.  Use an empty name to indicate the destination of lines
 313  *      not in any section.
 314  * @num_pos_sections
 315  *      Length of @pos_sections array.
 316  * @flags
 317  *      LOAD_TEXT_FILE_REMOVE_QUOTES or 0.
 318  * @mangle_line
 319  *      Optional callback to modify each line being read.
 320  *
 321  * Returns 0 on success or a positive error code on failure.
 322  *
 323  * Unknown sections are ignored (warning printed).
 324  */
 325 int
 326 do_load_text_file(const tchar *path,
 327                   const void *buf, size_t bufsize,
 328                   void **mem_ret,
 329                   const struct text_file_section *pos_sections,
 330                   int num_pos_sections,
 331                   int flags,
 332                   line_mangle_t mangle_line)
 333 {
 334         int ret;
 335         bool pathmode = (buf == NULL);
 336         tchar *tstr;
 337         size_t tstr_nchars;
 338
 339         if (pathmode) {
 340                 ret = read_file_contents(path, (void **)&buf, &bufsize);
 341                 if (ret)
 342                         return ret;
 343         }
 344
 345         ret = translate_text_buffer(buf, bufsize, &tstr, &tstr_nchars);
 346         if (pathmode)
 347                 FREE((void *)buf);
 348         if (ret)
 349                 return ret;
 350
 351         tstr[tstr_nchars++] = T('\n');
 352
 353         ret = parse_text_file(path, tstr, tstr_nchars, pos_sections,
 354                               num_pos_sections, flags, mangle_line);
 355         if (ret) {
 356                 for (int i = 0; i < num_pos_sections; i++)
 357                         FREE(pos_sections[i].strings->strings);
 358                 FREE(tstr);
 359                 return ret;
 360         }
 361
 362         *mem_ret = tstr;
 363         return 0;
 364 }