pintos-os.org Git - pspp/blob - lib/regcomp.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    This program is free software; you can redistribute it and/or modify
   7    it under the terms of the GNU General Public License as published by
   8    the Free Software Foundation; either version 2, or (at your option)
   9    any later version.
  10
  11    This program is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14    GNU General Public License for more details.
  15
  16    You should have received a copy of the GNU General Public License along
  17    with this program; if not, write to the Free Software Foundation,
  18    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
  19
  20 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
  21                                           int length, reg_syntax_t syntax);
  22 static void re_compile_fastmap_iter (regex_t *bufp,
  23                                      const re_dfastate_t *init_state,
  24                                      char *fastmap);
  25 static reg_errcode_t init_dfa (re_dfa_t *dfa, int pat_len);
  26 #ifdef RE_ENABLE_I18N
  27 static void free_charset (re_charset_t *cset);
  28 #endif /* RE_ENABLE_I18N */
  29 static void free_workarea_compile (regex_t *preg);
  30 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
  31 #ifdef RE_ENABLE_I18N
  32 static void optimize_utf8 (re_dfa_t *dfa);
  33 #endif
  34 static reg_errcode_t analyze (regex_t *preg);
  35 static reg_errcode_t preorder (bin_tree_t *root,
  36                                reg_errcode_t (fn (void *, bin_tree_t *)),
  37                                void *extra);
  38 static reg_errcode_t postorder (bin_tree_t *root,
  39                                 reg_errcode_t (fn (void *, bin_tree_t *)),
  40                                 void *extra);
  41 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
  42 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
  43 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
  44                                  bin_tree_t *node);
  45 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
  46 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
  47 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
  48 static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint);
  49 static int search_duplicated_node (re_dfa_t *dfa, int org_node,
  50                                    unsigned int constraint);
  51 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
  52 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
  53                                          int node, int root);
  54 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
  55 static int fetch_number (re_string_t *input, re_token_t *token,
  56                          reg_syntax_t syntax);
  57 static int peek_token (re_token_t *token, re_string_t *input,
  58                         reg_syntax_t syntax);
  59 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
  60                           reg_syntax_t syntax, reg_errcode_t *err);
  61 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
  62                                   re_token_t *token, reg_syntax_t syntax,
  63                                   int nest, reg_errcode_t *err);
  64 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
  65                                  re_token_t *token, reg_syntax_t syntax,
  66                                  int nest, reg_errcode_t *err);
  67 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
  68                                      re_token_t *token, reg_syntax_t syntax,
  69                                      int nest, reg_errcode_t *err);
  70 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
  71                                   re_token_t *token, reg_syntax_t syntax,
  72                                   int nest, reg_errcode_t *err);
  73 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
  74                                  re_dfa_t *dfa, re_token_t *token,
  75                                  reg_syntax_t syntax, reg_errcode_t *err);
  76 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
  77                                       re_token_t *token, reg_syntax_t syntax,
  78                                       reg_errcode_t *err);
  79 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
  80                                             re_string_t *regexp,
  81                                             re_token_t *token, int token_len,
  82                                             re_dfa_t *dfa,
  83                                             reg_syntax_t syntax,
  84                                             int accept_hyphen);
  85 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
  86                                           re_string_t *regexp,
  87                                           re_token_t *token);
  88 #ifdef RE_ENABLE_I18N
  89 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
  90                                         re_charset_t *mbcset,
  91                                         int *equiv_class_alloc,
  92                                         const unsigned char *name);
  93 static reg_errcode_t build_charclass (unsigned REG_TRANSLATE_TYPE trans,
  94                                       re_bitset_ptr_t sbcset,
  95                                       re_charset_t *mbcset,
  96                                       int *char_class_alloc,
  97                                       const unsigned char *class_name,
  98                                       reg_syntax_t syntax);
  99 #else  /* not RE_ENABLE_I18N */
 100 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
 101                                         const unsigned char *name);
 102 static reg_errcode_t build_charclass (unsigned REG_TRANSLATE_TYPE trans,
 103                                       re_bitset_ptr_t sbcset,
 104                                       const unsigned char *class_name,
 105                                       reg_syntax_t syntax);
 106 #endif /* not RE_ENABLE_I18N */
 107 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
 108                                        unsigned REG_TRANSLATE_TYPE trans,
 109                                        const unsigned char *class_name,
 110                                        const unsigned char *extra,
 111                                        int non_match, reg_errcode_t *err);
 112 static bin_tree_t *create_tree (re_dfa_t *dfa,
 113                                 bin_tree_t *left, bin_tree_t *right,
 114                                 re_token_type_t type);
 115 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
 116                                       bin_tree_t *left, bin_tree_t *right,
 117                                       const re_token_t *token);
 118 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
 119 static void free_token (re_token_t *node);
 120 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
 121 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
 122 \f
 123 /* This table gives an error message for each of the error codes listed
 124    in regex.h.  Obviously the order here has to be same as there.
 125    POSIX doesn't require that we do anything for REG_NOERROR,
 126    but why not be nice?  */
 127
 128 const char __re_error_msgid[] attribute_hidden =
 129   {
 130 #define REG_NOERROR_IDX 0
 131     gettext_noop ("Success")    /* REG_NOERROR */
 132     "\0"
 133 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
 134     gettext_noop ("No match")   /* REG_NOMATCH */
 135     "\0"
 136 #define REG_BADPAT_IDX  (REG_NOMATCH_IDX + sizeof "No match")
 137     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
 138     "\0"
 139 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
 140     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
 141     "\0"
 142 #define REG_ECTYPE_IDX  (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
 143     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
 144     "\0"
 145 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
 146     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
 147     "\0"
 148 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
 149     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
 150     "\0"
 151 #define REG_EBRACK_IDX  (REG_ESUBREG_IDX + sizeof "Invalid back reference")
 152     gettext_noop ("Unmatched [ or [^")  /* REG_EBRACK */
 153     "\0"
 154 #define REG_EPAREN_IDX  (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
 155     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
 156     "\0"
 157 #define REG_EBRACE_IDX  (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
 158     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
 159     "\0"
 160 #define REG_BADBR_IDX   (REG_EBRACE_IDX + sizeof "Unmatched \\{")
 161     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
 162     "\0"
 163 #define REG_ERANGE_IDX  (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
 164     gettext_noop ("Invalid range end")  /* REG_ERANGE */
 165     "\0"
 166 #define REG_ESPACE_IDX  (REG_ERANGE_IDX + sizeof "Invalid range end")
 167     gettext_noop ("Memory exhausted") /* REG_ESPACE */
 168     "\0"
 169 #define REG_BADRPT_IDX  (REG_ESPACE_IDX + sizeof "Memory exhausted")
 170     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
 171     "\0"
 172 #define REG_EEND_IDX    (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
 173     gettext_noop ("Premature end of regular expression") /* REG_EEND */
 174     "\0"
 175 #define REG_ESIZE_IDX   (REG_EEND_IDX + sizeof "Premature end of regular expression")
 176     gettext_noop ("Regular expression too big") /* REG_ESIZE */
 177     "\0"
 178 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
 179     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
 180   };
 181
 182 const size_t __re_error_msgid_idx[] attribute_hidden =
 183   {
 184     REG_NOERROR_IDX,
 185     REG_NOMATCH_IDX,
 186     REG_BADPAT_IDX,
 187     REG_ECOLLATE_IDX,
 188     REG_ECTYPE_IDX,
 189     REG_EESCAPE_IDX,
 190     REG_ESUBREG_IDX,
 191     REG_EBRACK_IDX,
 192     REG_EPAREN_IDX,
 193     REG_EBRACE_IDX,
 194     REG_BADBR_IDX,
 195     REG_ERANGE_IDX,
 196     REG_ESPACE_IDX,
 197     REG_BADRPT_IDX,
 198     REG_EEND_IDX,
 199     REG_ESIZE_IDX,
 200     REG_ERPAREN_IDX
 201   };
 202 \f
 203 /* Entry points for GNU code.  */
 204
 205 /* re_compile_pattern is the GNU regular expression compiler: it
 206    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
 207    Returns 0 if the pattern was valid, otherwise an error string.
 208
 209    Assumes the `re_allocated' (and perhaps `re_buffer') and `translate' fields
 210    are set in BUFP on entry.  */
 211
 212 const char *
 213 re_compile_pattern (const char *pattern, size_t length,
 214                     struct re_pattern_buffer *bufp)
 215 {
 216   reg_errcode_t ret;
 217
 218   /* And GNU code determines whether or not to get register information
 219      by passing null for the REGS argument to re_match, etc., not by
 220      setting re_no_sub, unless REG_NO_SUB is set.  */
 221   bufp->re_no_sub = !!(re_syntax_options & REG_NO_SUB);
 222
 223   /* Match anchors at newline.  */
 224   bufp->re_newline_anchor = 1;
 225
 226   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
 227
 228   if (!ret)
 229     return NULL;
 230   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 231 }
 232 #ifdef _LIBC
 233 weak_alias (__re_compile_pattern, re_compile_pattern)
 234 #endif
 235
 236 /* Set by `re_set_syntax' to the current regexp syntax to recognize.  Can
 237    also be assigned to arbitrarily: each pattern buffer stores its own
 238    syntax, so it can be changed between regex compilations.  */
 239 /* This has no initializer because initialized variables in Emacs
 240    become read-only after dumping.  */
 241 reg_syntax_t re_syntax_options;
 242
 243
 244 /* Specify the precise syntax of regexps for compilation.  This provides
 245    for compatibility for various utilities which historically have
 246    different, incompatible syntaxes.
 247
 248    The argument SYNTAX is a bit mask comprised of the various bits
 249    defined in regex.h.  We return the old syntax.  */
 250
 251 reg_syntax_t
 252 re_set_syntax (reg_syntax_t syntax)
 253 {
 254   reg_syntax_t ret = re_syntax_options;
 255
 256   re_syntax_options = syntax;
 257   return ret;
 258 }
 259 #ifdef _LIBC
 260 weak_alias (__re_set_syntax, re_set_syntax)
 261 #endif
 262
 263 int
 264 re_compile_fastmap (struct re_pattern_buffer *bufp)
 265 {
 266   re_dfa_t *dfa = (re_dfa_t *) bufp->re_buffer;
 267   char *fastmap = bufp->re_fastmap;
 268
 269   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
 270   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
 271   if (dfa->init_state != dfa->init_state_word)
 272     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
 273   if (dfa->init_state != dfa->init_state_nl)
 274     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
 275   if (dfa->init_state != dfa->init_state_begbuf)
 276     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
 277   bufp->re_fastmap_accurate = 1;
 278   return 0;
 279 }
 280 #ifdef _LIBC
 281 weak_alias (__re_compile_fastmap, re_compile_fastmap)
 282 #endif
 283
 284 static inline void
 285 __attribute ((always_inline))
 286 re_set_fastmap (char *fastmap, int icase, int ch)
 287 {
 288   fastmap[ch] = 1;
 289   if (icase)
 290     fastmap[tolower (ch)] = 1;
 291 }
 292
 293 /* Helper function for re_compile_fastmap.
 294    Compile fastmap for the initial_state INIT_STATE.  */
 295
 296 static void
 297 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
 298                          char *fastmap)
 299 {
 300   re_dfa_t *dfa = (re_dfa_t *) bufp->re_buffer;
 301   int node_cnt;
 302   int icase = (dfa->mb_cur_max == 1 && (bufp->re_syntax & REG_IGNORE_CASE));
 303   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
 304     {
 305       int node = init_state->nodes.elems[node_cnt];
 306       re_token_type_t type = dfa->nodes[node].type;
 307
 308       if (type == CHARACTER)
 309         {
 310           re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
 311 #ifdef RE_ENABLE_I18N
 312           if ((bufp->re_syntax & REG_IGNORE_CASE) && dfa->mb_cur_max > 1)
 313             {
 314               unsigned char buf[MB_LEN_MAX];
 315               unsigned char *p;
 316               wchar_t wc;
 317               mbstate_t state;
 318
 319               p = buf;
 320               *p++ = dfa->nodes[node].opr.c;
 321               while (++node < dfa->nodes_len
 322                      && dfa->nodes[node].type == CHARACTER
 323                      && dfa->nodes[node].mb_partial)
 324                 *p++ = dfa->nodes[node].opr.c;
 325               memset (&state, 0, sizeof (state));
 326               if (mbrtowc (&wc, (const char *) buf, p - buf,
 327                            &state) == p - buf
 328                   && (__wcrtomb ((char *) buf, towlower (wc), &state)
 329                       != (size_t) -1))
 330                 re_set_fastmap (fastmap, 0, buf[0]);
 331             }
 332 #endif
 333         }
 334       else if (type == SIMPLE_BRACKET)
 335         {
 336           int i, j, ch;
 337           for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
 338             for (j = 0; j < UINT_BITS; ++j, ++ch)
 339               if (dfa->nodes[node].opr.sbcset[i] & (1u << j))
 340                 re_set_fastmap (fastmap, icase, ch);
 341         }
 342 #ifdef RE_ENABLE_I18N
 343       else if (type == COMPLEX_BRACKET)
 344         {
 345           int i;
 346           re_charset_t *cset = dfa->nodes[node].opr.mbcset;
 347           if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
 348               || cset->nranges || cset->nchar_classes)
 349             {
 350 # ifdef _LIBC
 351               if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
 352                 {
 353                   /* In this case we want to catch the bytes which are
 354                      the first byte of any collation elements.
 355                      e.g. In da_DK, we want to catch 'a' since "aa"
 356                           is a valid collation element, and don't catch
 357                           'b' since 'b' is the only collation element
 358                           which starts from 'b'.  */
 359                   int j, ch;
 360                   const int32_t *table = (const int32_t *)
 361                     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
 362                   for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
 363                     for (j = 0; j < UINT_BITS; ++j, ++ch)
 364                       if (table[ch] < 0)
 365                         re_set_fastmap (fastmap, icase, ch);
 366                 }
 367 # else
 368               if (dfa->mb_cur_max > 1)
 369                 for (i = 0; i < SBC_MAX; ++i)
 370                   if (__btowc (i) == WEOF)
 371                     re_set_fastmap (fastmap, icase, i);
 372 # endif /* not _LIBC */
 373             }
 374           for (i = 0; i < cset->nmbchars; ++i)
 375             {
 376               char buf[256];
 377               mbstate_t state;
 378               memset (&state, '\0', sizeof (state));
 379               if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
 380                 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
 381               if ((bufp->re_syntax & REG_IGNORE_CASE) && dfa->mb_cur_max > 1)
 382                 {
 383                   if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
 384                       != (size_t) -1)
 385                     re_set_fastmap (fastmap, 0, *(unsigned char *) buf);
 386                 }
 387             }
 388         }
 389 #endif /* RE_ENABLE_I18N */
 390       else if (type == OP_PERIOD
 391 #ifdef RE_ENABLE_I18N
 392                || type == OP_UTF8_PERIOD
 393 #endif /* RE_ENABLE_I18N */
 394                || type == END_OF_RE)
 395         {
 396           memset (fastmap, '\1', sizeof (char) * SBC_MAX);
 397           if (type == END_OF_RE)
 398             bufp->re_can_be_null = 1;
 399           return;
 400         }
 401     }
 402 }
 403 \f
 404 /* Entry point for POSIX code.  */
 405 /* regcomp takes a regular expression as a string and compiles it.
 406
 407    PREG is a regex_t *.  We do not expect any fields to be initialized,
 408    since POSIX says we shouldn't.  Thus, we set
 409
 410      `re_buffer' to the compiled pattern;
 411      `re_used' to the length of the compiled pattern;
 412      `re_syntax' to REG_SYNTAX_POSIX_EXTENDED if the
 413        REG_EXTENDED bit in CFLAGS is set; otherwise, to
 414        REG_SYNTAX_POSIX_BASIC;
 415      `re_newline_anchor' to REG_NEWLINE being set in CFLAGS;
 416      `re_fastmap' to an allocated space for the fastmap;
 417      `re_fastmap_accurate' to zero;
 418      `re_nsub' to the number of subexpressions in PATTERN.
 419
 420    PATTERN is the address of the pattern string.
 421
 422    CFLAGS is a series of bits which affect compilation.
 423
 424      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
 425      use POSIX basic syntax.
 426
 427      If REG_NEWLINE is set, then . and [^...] don't match newline.
 428      Also, regexec will try a match beginning after every newline.
 429
 430      If REG_ICASE is set, then we considers upper- and lowercase
 431      versions of letters to be equivalent when matching.
 432
 433      If REG_NOSUB is set, then when PREG is passed to regexec, that
 434      routine will report only success or failure, and nothing about the
 435      registers.
 436
 437    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
 438    the return codes and their meanings.)  */
 439
 440 int
 441 regcomp (regex_t *__restrict preg, const char *__restrict pattern, int cflags)
 442 {
 443   reg_errcode_t ret;
 444   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? REG_SYNTAX_POSIX_EXTENDED
 445                         : REG_SYNTAX_POSIX_BASIC);
 446
 447   preg->re_buffer = NULL;
 448   preg->re_allocated = 0;
 449   preg->re_used = 0;
 450
 451   /* Try to allocate space for the fastmap.  */
 452   preg->re_fastmap = re_malloc (char, SBC_MAX);
 453   if (BE (preg->re_fastmap == NULL, 0))
 454     return REG_ESPACE;
 455
 456   syntax |= (cflags & REG_ICASE) ? REG_IGNORE_CASE : 0;
 457
 458   /* If REG_NEWLINE is set, newlines are treated differently.  */
 459   if (cflags & REG_NEWLINE)
 460     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
 461       syntax &= ~REG_DOT_NEWLINE;
 462       syntax |= REG_HAT_LISTS_NOT_NEWLINE;
 463       /* It also changes the matching behavior.  */
 464       preg->re_newline_anchor = 1;
 465     }
 466   else
 467     preg->re_newline_anchor = 0;
 468   preg->re_no_sub = !!(cflags & REG_NOSUB);
 469   preg->re_translate = NULL;
 470
 471   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
 472
 473   /* POSIX doesn't distinguish between an unmatched open-group and an
 474      unmatched close-group: both are REG_EPAREN.  */
 475   if (ret == REG_ERPAREN)
 476     ret = REG_EPAREN;
 477
 478   /* We have already checked preg->re_fastmap != NULL.  */
 479   if (BE (ret == REG_NOERROR, 1))
 480     /* Compute the fastmap now, since regexec cannot modify the pattern
 481        buffer.  This function never fails in this implementation.  */
 482     (void) re_compile_fastmap (preg);
 483   else
 484     {
 485       /* Some error occurred while compiling the expression.  */
 486       re_free (preg->re_fastmap);
 487       preg->re_fastmap = NULL;
 488     }
 489
 490   return (int) ret;
 491 }
 492 #ifdef _LIBC
 493 weak_alias (__regcomp, regcomp)
 494 #endif
 495
 496 /* Returns a message corresponding to an error code, ERRCODE, returned
 497    from either regcomp or regexec.   We don't use PREG here.  */
 498
 499 size_t
 500 regerror (int errcode, const regex_t *__restrict preg,
 501           char *__restrict errbuf, size_t errbuf_size)
 502 {
 503   const char *msg;
 504   size_t msg_size;
 505
 506   if (BE (errcode < 0
 507           || errcode >= (int) (sizeof (__re_error_msgid_idx)
 508                                / sizeof (__re_error_msgid_idx[0])), 0))
 509     /* Only error codes returned by the rest of the code should be passed
 510        to this routine.  If we are given anything else, or if other regex
 511        code generates an invalid error code, then the program has a bug.
 512        Dump core so we can fix it.  */
 513     abort ();
 514
 515   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
 516
 517   msg_size = strlen (msg) + 1; /* Includes the null.  */
 518
 519   if (BE (errbuf_size != 0, 1))
 520     {
 521       if (BE (msg_size > errbuf_size, 0))
 522         {
 523 #if defined HAVE_MEMPCPY || defined _LIBC
 524           *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
 525 #else
 526           memcpy (errbuf, msg, errbuf_size - 1);
 527           errbuf[errbuf_size - 1] = 0;
 528 #endif
 529         }
 530       else
 531         memcpy (errbuf, msg, msg_size);
 532     }
 533
 534   return msg_size;
 535 }
 536 #ifdef _LIBC
 537 weak_alias (__regerror, regerror)
 538 #endif
 539
 540
 541 #ifdef RE_ENABLE_I18N
 542 /* This static array is used for the map to single-byte characters when
 543    UTF-8 is used.  Otherwise we would allocate memory just to initialize
 544    it the same all the time.  UTF-8 is the preferred encoding so this is
 545    a worthwhile optimization.  */
 546 static const bitset utf8_sb_map =
 547 {
 548   /* Set the first 128 bits.  */
 549 # if UINT_MAX == 0xffffffff
 550   0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
 551 # else
 552 #  error "Add case for new unsigned int size"
 553 # endif
 554 };
 555 #endif
 556
 557
 558 static void
 559 free_dfa_content (re_dfa_t *dfa)
 560 {
 561   int i, j;
 562
 563   if (dfa->nodes)
 564     for (i = 0; i < dfa->nodes_len; ++i)
 565       free_token (dfa->nodes + i);
 566   re_free (dfa->nexts);
 567   for (i = 0; i < dfa->nodes_len; ++i)
 568     {
 569       if (dfa->eclosures != NULL)
 570         re_node_set_free (dfa->eclosures + i);
 571       if (dfa->inveclosures != NULL)
 572         re_node_set_free (dfa->inveclosures + i);
 573       if (dfa->edests != NULL)
 574         re_node_set_free (dfa->edests + i);
 575     }
 576   re_free (dfa->edests);
 577   re_free (dfa->eclosures);
 578   re_free (dfa->inveclosures);
 579   re_free (dfa->nodes);
 580
 581   if (dfa->state_table)
 582     for (i = 0; i <= dfa->state_hash_mask; ++i)
 583       {
 584         struct re_state_table_entry *entry = dfa->state_table + i;
 585         for (j = 0; j < entry->num; ++j)
 586           {
 587             re_dfastate_t *state = entry->array[j];
 588             free_state (state);
 589           }
 590         re_free (entry->array);
 591       }
 592   re_free (dfa->state_table);
 593 #ifdef RE_ENABLE_I18N
 594   if (dfa->sb_char != utf8_sb_map)
 595     re_free (dfa->sb_char);
 596 #endif
 597   re_free (dfa->subexp_map);
 598 #ifdef DEBUG
 599   re_free (dfa->re_str);
 600 #endif
 601
 602   re_free (dfa);
 603 }
 604
 605
 606 /* Free dynamically allocated space used by PREG.  */
 607
 608 void
 609 regfree (regex_t *preg)
 610 {
 611   re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
 612   if (BE (dfa != NULL, 1))
 613     free_dfa_content (dfa);
 614   preg->re_buffer = NULL;
 615   preg->re_allocated = 0;
 616
 617   re_free (preg->re_fastmap);
 618   preg->re_fastmap = NULL;
 619
 620   re_free (preg->re_translate);
 621   preg->re_translate = NULL;
 622 }
 623 #ifdef _LIBC
 624 weak_alias (__regfree, regfree)
 625 #endif
 626 \f
 627 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 628    them unless specifically requested.  */
 629
 630 #if defined _REGEX_RE_COMP || defined _LIBC
 631
 632 /* BSD has one and only one pattern buffer.  */
 633 static struct re_pattern_buffer re_comp_buf;
 634
 635 char *
 636 # ifdef _LIBC
 637 /* Make these definitions weak in libc, so POSIX programs can redefine
 638    these names if they don't use our functions, and still use
 639    regcomp/regexec above without link errors.  */
 640 weak_function
 641 # endif
 642 re_comp (s)
 643      const char *s;
 644 {
 645   reg_errcode_t ret;
 646   char *fastmap;
 647
 648   if (!s)
 649     {
 650       if (!re_comp_buf.re_buffer)
 651         return gettext ("No previous regular expression");
 652       return 0;
 653     }
 654
 655   if (re_comp_buf.re_buffer)
 656     {
 657       fastmap = re_comp_buf.re_fastmap;
 658       re_comp_buf.re_fastmap = NULL;
 659       __regfree (&re_comp_buf);
 660       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
 661       re_comp_buf.re_fastmap = fastmap;
 662     }
 663
 664   if (re_comp_buf.re_fastmap == NULL)
 665     {
 666       re_comp_buf.re_fastmap = (char *) malloc (SBC_MAX);
 667       if (re_comp_buf.re_fastmap == NULL)
 668         return (char *) gettext (__re_error_msgid
 669                                  + __re_error_msgid_idx[(int) REG_ESPACE]);
 670     }
 671
 672   /* Since `re_exec' always passes NULL for the `regs' argument, we
 673      don't need to initialize the pattern buffer fields which affect it.  */
 674
 675   /* Match anchors at newlines.  */
 676   re_comp_buf.re_newline_anchor = 1;
 677
 678   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
 679
 680   if (!ret)
 681     return NULL;
 682
 683   /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
 684   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 685 }
 686
 687 #ifdef _LIBC
 688 libc_freeres_fn (free_mem)
 689 {
 690   __regfree (&re_comp_buf);
 691 }
 692 #endif
 693
 694 #endif /* _REGEX_RE_COMP */
 695 \f
 696 /* Internal entry point.
 697    Compile the regular expression PATTERN, whose length is LENGTH.
 698    SYNTAX indicate regular expression's syntax.  */
 699
 700 static reg_errcode_t
 701 re_compile_internal (regex_t *preg, const char * pattern, int length,
 702                      reg_syntax_t syntax)
 703 {
 704   reg_errcode_t err = REG_NOERROR;
 705   re_dfa_t *dfa;
 706   re_string_t regexp;
 707
 708   /* Initialize the pattern buffer.  */
 709   preg->re_fastmap_accurate = 0;
 710   preg->re_syntax = syntax;
 711   preg->re_not_bol = preg->re_not_eol = 0;
 712   preg->re_used = 0;
 713   preg->re_nsub = 0;
 714   preg->re_can_be_null = 0;
 715   preg->re_regs_allocated = REG_UNALLOCATED;
 716
 717   /* Initialize the dfa.  */
 718   dfa = (re_dfa_t *) preg->re_buffer;
 719   if (BE (preg->re_allocated < sizeof (re_dfa_t), 0))
 720     {
 721       /* If zero allocated, but buffer is non-null, try to realloc
 722          enough space.  This loses if buffer's address is bogus, but
 723          that is the user's responsibility.  If buffer is null this
 724          is a simple allocation.  */
 725       dfa = re_realloc (preg->re_buffer, re_dfa_t, 1);
 726       if (dfa == NULL)
 727         return REG_ESPACE;
 728       preg->re_allocated = sizeof (re_dfa_t);
 729       preg->re_buffer = (unsigned char *) dfa;
 730     }
 731   preg->re_used = sizeof (re_dfa_t);
 732
 733   __libc_lock_init (dfa->lock);
 734
 735   err = init_dfa (dfa, length);
 736   if (BE (err != REG_NOERROR, 0))
 737     {
 738       free_dfa_content (dfa);
 739       preg->re_buffer = NULL;
 740       preg->re_allocated = 0;
 741       return err;
 742     }
 743 #ifdef DEBUG
 744   dfa->re_str = re_malloc (char, length + 1);
 745   strncpy (dfa->re_str, pattern, length + 1);
 746 #endif
 747
 748   err = re_string_construct (&regexp, pattern, length, preg->re_translate,
 749                              syntax & REG_IGNORE_CASE, dfa);
 750   if (BE (err != REG_NOERROR, 0))
 751     {
 752     re_compile_internal_free_return:
 753       free_workarea_compile (preg);
 754       re_string_destruct (&regexp);
 755       free_dfa_content (dfa);
 756       preg->re_buffer = NULL;
 757       preg->re_allocated = 0;
 758       return err;
 759     }
 760
 761   /* Parse the regular expression, and build a structure tree.  */
 762   preg->re_nsub = 0;
 763   dfa->str_tree = parse (&regexp, preg, syntax, &err);
 764   if (BE (dfa->str_tree == NULL, 0))
 765     goto re_compile_internal_free_return;
 766
 767   /* Analyze the tree and create the nfa.  */
 768   err = analyze (preg);
 769   if (BE (err != REG_NOERROR, 0))
 770     goto re_compile_internal_free_return;
 771
 772 #ifdef RE_ENABLE_I18N
 773   /* If possible, do searching in single byte encoding to speed things up.  */
 774   if (dfa->is_utf8 && !(syntax & REG_IGNORE_CASE) && preg->re_translate == NULL)
 775     optimize_utf8 (dfa);
 776 #endif
 777
 778   /* Then create the initial state of the dfa.  */
 779   err = create_initial_state (dfa);
 780
 781   /* Release work areas.  */
 782   free_workarea_compile (preg);
 783   re_string_destruct (&regexp);
 784
 785   if (BE (err != REG_NOERROR, 0))
 786     {
 787       free_dfa_content (dfa);
 788       preg->re_buffer = NULL;
 789       preg->re_allocated = 0;
 790     }
 791
 792   return err;
 793 }
 794
 795 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
 796    as the initial length of some arrays.  */
 797
 798 static reg_errcode_t
 799 init_dfa (re_dfa_t *dfa, int pat_len)
 800 {
 801   unsigned int table_size;
 802 #ifndef _LIBC
 803   char *codeset_name;
 804 #endif
 805
 806   memset (dfa, '\0', sizeof (re_dfa_t));
 807
 808   /* Force allocation of str_tree_storage the first time.  */
 809   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 810
 811   dfa->nodes_alloc = pat_len + 1;
 812   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
 813
 814   /*  table_size = 2 ^ ceil(log pat_len) */
 815   for (table_size = 1; ; table_size <<= 1)
 816     if (table_size > pat_len)
 817       break;
 818
 819   dfa->state_table = re_calloc (struct re_state_table_entry, table_size);
 820   dfa->state_hash_mask = table_size - 1;
 821
 822   dfa->mb_cur_max = MB_CUR_MAX;
 823 #ifdef _LIBC
 824   if (dfa->mb_cur_max == 6
 825       && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
 826     dfa->is_utf8 = 1;
 827   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
 828                        != 0);
 829 #else
 830 # ifdef HAVE_LANGINFO_CODESET
 831   codeset_name = nl_langinfo (CODESET);
 832 # else
 833   codeset_name = getenv ("LC_ALL");
 834   if (codeset_name == NULL || codeset_name[0] == '\0')
 835     codeset_name = getenv ("LC_CTYPE");
 836   if (codeset_name == NULL || codeset_name[0] == '\0')
 837     codeset_name = getenv ("LANG");
 838   if (codeset_name == NULL)
 839     codeset_name = "";
 840   else if (strchr (codeset_name, '.') !=  NULL)
 841     codeset_name = strchr (codeset_name, '.') + 1;
 842 # endif
 843
 844   if (strcasecmp (codeset_name, "UTF-8") == 0
 845       || strcasecmp (codeset_name, "UTF8") == 0)
 846     dfa->is_utf8 = 1;
 847
 848   /* We check exhaustively in the loop below if this charset is a
 849      superset of ASCII.  */
 850   dfa->map_notascii = 0;
 851 #endif
 852
 853 #ifdef RE_ENABLE_I18N
 854   if (dfa->mb_cur_max > 1)
 855     {
 856       if (dfa->is_utf8)
 857         dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
 858       else
 859         {
 860           int i, j, ch;
 861
 862           dfa->sb_char = re_calloc (unsigned int, BITSET_UINTS);
 863           if (BE (dfa->sb_char == NULL, 0))
 864             return REG_ESPACE;
 865
 866           /* Clear all bits by, then set those corresponding to single
 867              byte chars.  */
 868           bitset_empty (dfa->sb_char);
 869
 870           for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
 871             for (j = 0; j < UINT_BITS; ++j, ++ch)
 872               {
 873                 wint_t wch = __btowc (ch);
 874                 if (wch != WEOF)
 875                   dfa->sb_char[i] |= 1u << j;
 876 # ifndef _LIBC
 877                 if (isascii (ch) && wch != ch)
 878                   dfa->map_notascii = 1;
 879 # endif
 880               }
 881         }
 882     }
 883 #endif
 884
 885   if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
 886     return REG_ESPACE;
 887   return REG_NOERROR;
 888 }
 889
 890 /* Initialize WORD_CHAR table, which indicate which character is
 891    "word".  In this case "word" means that it is the word construction
 892    character used by some operators like "\<", "\>", etc.  */
 893
 894 static void
 895 init_word_char (re_dfa_t *dfa)
 896 {
 897   int i, j, ch;
 898   dfa->word_ops_used = 1;
 899   for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
 900     for (j = 0; j < UINT_BITS; ++j, ++ch)
 901       if (isalnum (ch) || ch == '_')
 902         dfa->word_char[i] |= 1u << j;
 903 }
 904
 905 /* Free the work area which are only used while compiling.  */
 906
 907 static void
 908 free_workarea_compile (regex_t *preg)
 909 {
 910   re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
 911   bin_tree_storage_t *storage, *next;
 912   for (storage = dfa->str_tree_storage; storage; storage = next)
 913     {
 914       next = storage->next;
 915       re_free (storage);
 916     }
 917   dfa->str_tree_storage = NULL;
 918   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 919   dfa->str_tree = NULL;
 920   re_free (dfa->org_indices);
 921   dfa->org_indices = NULL;
 922 }
 923
 924 /* Create initial states for all contexts.  */
 925
 926 static reg_errcode_t
 927 create_initial_state (re_dfa_t *dfa)
 928 {
 929   int first, i;
 930   reg_errcode_t err;
 931   re_node_set init_nodes;
 932
 933   /* Initial states have the epsilon closure of the node which is
 934      the first node of the regular expression.  */
 935   first = dfa->str_tree->first->node_idx;
 936   dfa->init_node = first;
 937   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
 938   if (BE (err != REG_NOERROR, 0))
 939     return err;
 940
 941   /* The back-references which are in initial states can epsilon transit,
 942      since in this case all of the subexpressions can be null.
 943      Then we add epsilon closures of the nodes which are the next nodes of
 944      the back-references.  */
 945   if (dfa->nbackref > 0)
 946     for (i = 0; i < init_nodes.nelem; ++i)
 947       {
 948         int node_idx = init_nodes.elems[i];
 949         re_token_type_t type = dfa->nodes[node_idx].type;
 950
 951         int clexp_idx;
 952         if (type != OP_BACK_REF)
 953           continue;
 954         for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
 955           {
 956             re_token_t *clexp_node;
 957             clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
 958             if (clexp_node->type == OP_CLOSE_SUBEXP
 959                 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
 960               break;
 961           }
 962         if (clexp_idx == init_nodes.nelem)
 963           continue;
 964
 965         if (type == OP_BACK_REF)
 966           {
 967             int dest_idx = dfa->edests[node_idx].elems[0];
 968             if (!re_node_set_contains (&init_nodes, dest_idx))
 969               {
 970                 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
 971                 i = 0;
 972               }
 973           }
 974       }
 975
 976   /* It must be the first time to invoke acquire_state.  */
 977   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
 978   /* We don't check ERR here, since the initial state must not be NULL.  */
 979   if (BE (dfa->init_state == NULL, 0))
 980     return err;
 981   if (dfa->init_state->has_constraint)
 982     {
 983       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
 984                                                        CONTEXT_WORD);
 985       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
 986                                                      CONTEXT_NEWLINE);
 987       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
 988                                                          &init_nodes,
 989                                                          CONTEXT_NEWLINE
 990                                                          | CONTEXT_BEGBUF);
 991       if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
 992               || dfa->init_state_begbuf == NULL, 0))
 993         return err;
 994     }
 995   else
 996     dfa->init_state_word = dfa->init_state_nl
 997       = dfa->init_state_begbuf = dfa->init_state;
 998
 999   re_node_set_free (&init_nodes);
1000   return REG_NOERROR;
1001 }
1002 \f
1003 #ifdef RE_ENABLE_I18N
1004 /* If it is possible to do searching in single byte encoding instead of UTF-8
1005    to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1006    DFA nodes where needed.  */
1007
1008 static void
1009 optimize_utf8 (re_dfa_t *dfa)
1010 {
1011   int node, i, mb_chars = 0, has_period = 0;
1012
1013   for (node = 0; node < dfa->nodes_len; ++node)
1014     switch (dfa->nodes[node].type)
1015       {
1016       case CHARACTER:
1017         if (dfa->nodes[node].opr.c >= 0x80)
1018           mb_chars = 1;
1019         break;
1020       case ANCHOR:
1021         switch (dfa->nodes[node].opr.idx)
1022           {
1023           case LINE_FIRST:
1024           case LINE_LAST:
1025           case BUF_FIRST:
1026           case BUF_LAST:
1027             break;
1028           default:
1029             /* Word anchors etc. cannot be handled.  */
1030             return;
1031           }
1032         break;
1033       case OP_PERIOD:
1034         has_period = 1;
1035         break;
1036       case OP_BACK_REF:
1037       case OP_ALT:
1038       case END_OF_RE:
1039       case OP_DUP_ASTERISK:
1040       case OP_OPEN_SUBEXP:
1041       case OP_CLOSE_SUBEXP:
1042         break;
1043       case COMPLEX_BRACKET:
1044         return;
1045       case SIMPLE_BRACKET:
1046         /* Just double check.  */
1047         for (i = 0x80 / UINT_BITS; i < BITSET_UINTS; ++i)
1048           if (dfa->nodes[node].opr.sbcset[i])
1049             return;
1050         break;
1051       default:
1052         abort ();
1053       }
1054
1055   if (mb_chars || has_period)
1056     for (node = 0; node < dfa->nodes_len; ++node)
1057       {
1058         if (dfa->nodes[node].type == CHARACTER
1059             && dfa->nodes[node].opr.c >= 0x80)
1060           dfa->nodes[node].mb_partial = 0;
1061         else if (dfa->nodes[node].type == OP_PERIOD)
1062           dfa->nodes[node].type = OP_UTF8_PERIOD;
1063       }
1064
1065   /* The search can be in single byte locale.  */
1066   dfa->mb_cur_max = 1;
1067   dfa->is_utf8 = 0;
1068   dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1069 }
1070 #endif
1071 \f
1072 /* Analyze the structure tree, and calculate "first", "next", "edest",
1073    "eclosure", and "inveclosure".  */
1074
1075 static reg_errcode_t
1076 analyze (regex_t *preg)
1077 {
1078   re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
1079   reg_errcode_t ret;
1080
1081   /* Allocate arrays.  */
1082   dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1083   dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1084   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1085   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1086   if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1087           || dfa->eclosures == NULL, 0))
1088     return REG_ESPACE;
1089
1090   dfa->subexp_map = re_malloc (int, preg->re_nsub);
1091   if (dfa->subexp_map != NULL)
1092     {
1093       int i;
1094       for (i = 0; i < preg->re_nsub; i++)
1095         dfa->subexp_map[i] = i;
1096       preorder (dfa->str_tree, optimize_subexps, dfa);
1097       for (i = 0; i < preg->re_nsub; i++)
1098         if (dfa->subexp_map[i] != i)
1099           break;
1100       if (i == preg->re_nsub)
1101         {
1102           free (dfa->subexp_map);
1103           dfa->subexp_map = NULL;
1104         }
1105     }
1106
1107   ret = postorder (dfa->str_tree, lower_subexps, preg);
1108   if (BE (ret != REG_NOERROR, 0))
1109     return ret;
1110   ret = postorder (dfa->str_tree, calc_first, dfa);
1111   if (BE (ret != REG_NOERROR, 0))
1112     return ret;
1113   preorder (dfa->str_tree, calc_next, dfa);
1114   ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1115   if (BE (ret != REG_NOERROR, 0))
1116     return ret;
1117   ret = calc_eclosure (dfa);
1118   if (BE (ret != REG_NOERROR, 0))
1119     return ret;
1120
1121   /* We only need this during the prune_impossible_nodes pass in regexec.c;
1122      skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1123   if ((!preg->re_no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1124       || dfa->nbackref)
1125     {
1126       dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1127       if (BE (dfa->inveclosures == NULL, 0))
1128         return REG_ESPACE;
1129       ret = calc_inveclosure (dfa);
1130     }
1131
1132   return ret;
1133 }
1134
1135 /* Our parse trees are very unbalanced, so we cannot use a stack to
1136    implement parse tree visits.  Instead, we use parent pointers and
1137    some hairy code in these two functions.  */
1138 static reg_errcode_t
1139 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1140            void *extra)
1141 {
1142   bin_tree_t *node, *prev;
1143
1144   for (node = root; ; )
1145     {
1146       /* Descend down the tree, preferably to the left (or to the right
1147          if that's the only child).  */
1148       while (node->left || node->right)
1149         if (node->left)
1150           node = node->left;
1151         else
1152           node = node->right;
1153
1154       do
1155         {
1156           reg_errcode_t err = fn (extra, node);
1157           if (BE (err != REG_NOERROR, 0))
1158             return err;
1159           if (node->parent == NULL)
1160             return REG_NOERROR;
1161           prev = node;
1162           node = node->parent;
1163         }
1164       /* Go up while we have a node that is reached from the right.  */
1165       while (node->right == prev || node->right == NULL);
1166       node = node->right;
1167     }
1168 }
1169
1170 static reg_errcode_t
1171 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1172           void *extra)
1173 {
1174   bin_tree_t *node;
1175
1176   for (node = root; ; )
1177     {
1178       reg_errcode_t err = fn (extra, node);
1179       if (BE (err != REG_NOERROR, 0))
1180         return err;
1181
1182       /* Go to the left node, or up and to the right.  */
1183       if (node->left)
1184         node = node->left;
1185       else
1186         {
1187           bin_tree_t *prev = NULL;
1188           while (node->right == prev || node->right == NULL)
1189             {
1190               prev = node;
1191               node = node->parent;
1192               if (!node)
1193                 return REG_NOERROR;
1194             }
1195           node = node->right;
1196         }
1197     }
1198 }
1199
1200 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1201    re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1202    backreferences as well.  Requires a preorder visit.  */
1203 static reg_errcode_t
1204 optimize_subexps (void *extra, bin_tree_t *node)
1205 {
1206   re_dfa_t *dfa = (re_dfa_t *) extra;
1207
1208   if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1209     {
1210       int idx = node->token.opr.idx;
1211       node->token.opr.idx = dfa->subexp_map[idx];
1212       dfa->used_bkref_map |= 1 << node->token.opr.idx;
1213     }
1214
1215   else if (node->token.type == SUBEXP
1216            && node->left && node->left->token.type == SUBEXP)
1217     {
1218       int other_idx = node->left->token.opr.idx;
1219
1220       node->left = node->left->left;
1221       if (node->left)
1222         node->left->parent = node;
1223
1224       dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1225       if (other_idx < CHAR_BIT * sizeof dfa->used_bkref_map)
1226         dfa->used_bkref_map &= ~(1u << other_idx);
1227     }
1228
1229   return REG_NOERROR;
1230 }
1231
1232 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1233    of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1234 static reg_errcode_t
1235 lower_subexps (void *extra, bin_tree_t *node)
1236 {
1237   regex_t *preg = (regex_t *) extra;
1238   reg_errcode_t err = REG_NOERROR;
1239
1240   if (node->left && node->left->token.type == SUBEXP)
1241     {
1242       node->left = lower_subexp (&err, preg, node->left);
1243       if (node->left)
1244         node->left->parent = node;
1245     }
1246   if (node->right && node->right->token.type == SUBEXP)
1247     {
1248       node->right = lower_subexp (&err, preg, node->right);
1249       if (node->right)
1250         node->right->parent = node;
1251     }
1252
1253   return err;
1254 }
1255
1256 static bin_tree_t *
1257 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1258 {
1259   re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
1260   bin_tree_t *body = node->left;
1261   bin_tree_t *op, *cls, *tree1, *tree;
1262
1263   if (preg->re_no_sub
1264       /* We do not optimize empty subexpressions, because otherwise we may
1265          have bad CONCAT nodes with NULL children.  This is obviously not
1266          very common, so we do not lose much.  An example that triggers
1267          this case is the sed "script" /\(\)/x.  */
1268       && node->left != NULL
1269       && (node->token.opr.idx >= CHAR_BIT * sizeof dfa->used_bkref_map
1270           || !(dfa->used_bkref_map & (1u << node->token.opr.idx))))
1271     return node->left;
1272
1273   /* Convert the SUBEXP node to the concatenation of an
1274      OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1275   op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1276   cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1277   tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1278   tree = create_tree (dfa, op, tree1, CONCAT);
1279   if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1280     {
1281       *err = REG_ESPACE;
1282       return NULL;
1283     }
1284
1285   op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1286   op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1287   return tree;
1288 }
1289
1290 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1291    nodes.  Requires a postorder visit.  */
1292 static reg_errcode_t
1293 calc_first (void *extra, bin_tree_t *node)
1294 {
1295   re_dfa_t *dfa = (re_dfa_t *) extra;
1296   if (node->token.type == CONCAT)
1297     {
1298       node->first = node->left->first;
1299       node->node_idx = node->left->node_idx;
1300     }
1301   else
1302     {
1303       node->first = node;
1304       node->node_idx = re_dfa_add_node (dfa, node->token);
1305       if (BE (node->node_idx == -1, 0))
1306         return REG_ESPACE;
1307     }
1308   return REG_NOERROR;
1309 }
1310
1311 /* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1312 static reg_errcode_t
1313 calc_next (void *extra, bin_tree_t *node)
1314 {
1315   switch (node->token.type)
1316     {
1317     case OP_DUP_ASTERISK:
1318       node->left->next = node;
1319       break;
1320     case CONCAT:
1321       node->left->next = node->right->first;
1322       node->right->next = node->next;
1323       break;
1324     default:
1325       if (node->left)
1326         node->left->next = node->next;
1327       if (node->right)
1328         node->right->next = node->next;
1329       break;
1330     }
1331   return REG_NOERROR;
1332 }
1333
1334 /* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1335 static reg_errcode_t
1336 link_nfa_nodes (void *extra, bin_tree_t *node)
1337 {
1338   re_dfa_t *dfa = (re_dfa_t *) extra;
1339   int idx = node->node_idx;
1340   reg_errcode_t err = REG_NOERROR;
1341
1342   switch (node->token.type)
1343     {
1344     case CONCAT:
1345       break;
1346
1347     case END_OF_RE:
1348       assert (node->next == NULL);
1349       break;
1350
1351     case OP_DUP_ASTERISK:
1352     case OP_ALT:
1353       {
1354         int left, right;
1355         dfa->has_plural_match = 1;
1356         if (node->left != NULL)
1357           left = node->left->first->node_idx;
1358         else
1359           left = node->next->node_idx;
1360         if (node->right != NULL)
1361           right = node->right->first->node_idx;
1362         else
1363           right = node->next->node_idx;
1364         assert (left > -1);
1365         assert (right > -1);
1366         err = re_node_set_init_2 (dfa->edests + idx, left, right);
1367       }
1368       break;
1369
1370     case ANCHOR:
1371     case OP_OPEN_SUBEXP:
1372     case OP_CLOSE_SUBEXP:
1373       err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1374       break;
1375
1376     case OP_BACK_REF:
1377       dfa->nexts[idx] = node->next->node_idx;
1378       if (node->token.type == OP_BACK_REF)
1379         re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1380       break;
1381
1382     default:
1383       assert (!IS_EPSILON_NODE (node->token.type));
1384       dfa->nexts[idx] = node->next->node_idx;
1385       break;
1386     }
1387
1388   return err;
1389 }
1390
1391 /* Duplicate the epsilon closure of the node ROOT_NODE.
1392    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1393    to their own constraint.  */
1394
1395 static reg_errcode_t
1396 duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node,
1397                         int root_node, unsigned int init_constraint)
1398 {
1399   int org_node, clone_node, ret;
1400   unsigned int constraint = init_constraint;
1401   for (org_node = top_org_node, clone_node = top_clone_node;;)
1402     {
1403       int org_dest, clone_dest;
1404       if (dfa->nodes[org_node].type == OP_BACK_REF)
1405         {
1406           /* If the back reference epsilon-transit, its destination must
1407              also have the constraint.  Then duplicate the epsilon closure
1408              of the destination of the back reference, and store it in
1409              edests of the back reference.  */
1410           org_dest = dfa->nexts[org_node];
1411           re_node_set_empty (dfa->edests + clone_node);
1412           clone_dest = duplicate_node (dfa, org_dest, constraint);
1413           if (BE (clone_dest == -1, 0))
1414             return REG_ESPACE;
1415           dfa->nexts[clone_node] = dfa->nexts[org_node];
1416           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1417           if (BE (ret < 0, 0))
1418             return REG_ESPACE;
1419         }
1420       else if (dfa->edests[org_node].nelem == 0)
1421         {
1422           /* In case of the node can't epsilon-transit, don't duplicate the
1423              destination and store the original destination as the
1424              destination of the node.  */
1425           dfa->nexts[clone_node] = dfa->nexts[org_node];
1426           break;
1427         }
1428       else if (dfa->edests[org_node].nelem == 1)
1429         {
1430           /* In case of the node can epsilon-transit, and it has only one
1431              destination.  */
1432           org_dest = dfa->edests[org_node].elems[0];
1433           re_node_set_empty (dfa->edests + clone_node);
1434           if (dfa->nodes[org_node].type == ANCHOR)
1435             {
1436               /* In case of the node has another constraint, append it.  */
1437               if (org_node == root_node && clone_node != org_node)
1438                 {
1439                   /* ...but if the node is root_node itself, it means the
1440                      epsilon closure have a loop, then tie it to the
1441                      destination of the root_node.  */
1442                   ret = re_node_set_insert (dfa->edests + clone_node,
1443                                             org_dest);
1444                   if (BE (ret < 0, 0))
1445                     return REG_ESPACE;
1446                   break;
1447                 }
1448               constraint |= dfa->nodes[org_node].opr.ctx_type;
1449             }
1450           clone_dest = duplicate_node (dfa, org_dest, constraint);
1451           if (BE (clone_dest == -1, 0))
1452             return REG_ESPACE;
1453           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1454           if (BE (ret < 0, 0))
1455             return REG_ESPACE;
1456         }
1457       else /* dfa->edests[org_node].nelem == 2 */
1458         {
1459           /* In case of the node can epsilon-transit, and it has two
1460              destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1461           org_dest = dfa->edests[org_node].elems[0];
1462           re_node_set_empty (dfa->edests + clone_node);
1463           /* Search for a duplicated node which satisfies the constraint.  */
1464           clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1465           if (clone_dest == -1)
1466             {
1467               /* There are no such a duplicated node, create a new one.  */
1468               reg_errcode_t err;
1469               clone_dest = duplicate_node (dfa, org_dest, constraint);
1470               if (BE (clone_dest == -1, 0))
1471                 return REG_ESPACE;
1472               ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1473               if (BE (ret < 0, 0))
1474                 return REG_ESPACE;
1475               err = duplicate_node_closure (dfa, org_dest, clone_dest,
1476                                             root_node, constraint);
1477               if (BE (err != REG_NOERROR, 0))
1478                 return err;
1479             }
1480           else
1481             {
1482               /* There are a duplicated node which satisfy the constraint,
1483                  use it to avoid infinite loop.  */
1484               ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1485               if (BE (ret < 0, 0))
1486                 return REG_ESPACE;
1487             }
1488
1489           org_dest = dfa->edests[org_node].elems[1];
1490           clone_dest = duplicate_node (dfa, org_dest, constraint);
1491           if (BE (clone_dest == -1, 0))
1492             return REG_ESPACE;
1493           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1494           if (BE (ret < 0, 0))
1495             return REG_ESPACE;
1496         }
1497       org_node = org_dest;
1498       clone_node = clone_dest;
1499     }
1500   return REG_NOERROR;
1501 }
1502
1503 /* Search for a node which is duplicated from the node ORG_NODE, and
1504    satisfies the constraint CONSTRAINT.  */
1505
1506 static int
1507 search_duplicated_node (re_dfa_t *dfa, int org_node, unsigned int constraint)
1508 {
1509   int idx;
1510   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1511     {
1512       if (org_node == dfa->org_indices[idx]
1513           && constraint == dfa->nodes[idx].constraint)
1514         return idx; /* Found.  */
1515     }
1516   return -1; /* Not found.  */
1517 }
1518
1519 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1520    Return the index of the new node, or -1 if insufficient storage is
1521    available.  */
1522
1523 static int
1524 duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint)
1525 {
1526   int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1527   if (BE (dup_idx != -1, 1))
1528     {
1529       dfa->nodes[dup_idx].constraint = constraint;
1530       if (dfa->nodes[org_idx].type == ANCHOR)
1531         dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
1532       dfa->nodes[dup_idx].duplicated = 1;
1533
1534       /* Store the index of the original node.  */
1535       dfa->org_indices[dup_idx] = org_idx;
1536     }
1537   return dup_idx;
1538 }
1539
1540 static reg_errcode_t
1541 calc_inveclosure (re_dfa_t *dfa)
1542 {
1543   int src, idx, ret;
1544   for (idx = 0; idx < dfa->nodes_len; ++idx)
1545     re_node_set_init_empty (dfa->inveclosures + idx);
1546
1547   for (src = 0; src < dfa->nodes_len; ++src)
1548     {
1549       int *elems = dfa->eclosures[src].elems;
1550       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1551         {
1552           ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1553           if (BE (ret == -1, 0))
1554             return REG_ESPACE;
1555         }
1556     }
1557
1558   return REG_NOERROR;
1559 }
1560
1561 /* Calculate "eclosure" for all the node in DFA.  */
1562
1563 static reg_errcode_t
1564 calc_eclosure (re_dfa_t *dfa)
1565 {
1566   int node_idx, incomplete;
1567 #ifdef DEBUG
1568   assert (dfa->nodes_len > 0);
1569 #endif
1570   incomplete = 0;
1571   /* For each nodes, calculate epsilon closure.  */
1572   for (node_idx = 0; ; ++node_idx)
1573     {
1574       reg_errcode_t err;
1575       re_node_set eclosure_elem;
1576       if (node_idx == dfa->nodes_len)
1577         {
1578           if (!incomplete)
1579             break;
1580           incomplete = 0;
1581           node_idx = 0;
1582         }
1583
1584 #ifdef DEBUG
1585       assert (dfa->eclosures[node_idx].nelem != -1);
1586 #endif
1587
1588       /* If we have already calculated, skip it.  */
1589       if (dfa->eclosures[node_idx].nelem != 0)
1590         continue;
1591       /* Calculate epsilon closure of `node_idx'.  */
1592       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1593       if (BE (err != REG_NOERROR, 0))
1594         return err;
1595
1596       if (dfa->eclosures[node_idx].nelem == 0)
1597         {
1598           incomplete = 1;
1599           re_node_set_free (&eclosure_elem);
1600         }
1601     }
1602   return REG_NOERROR;
1603 }
1604
1605 /* Calculate epsilon closure of NODE.  */
1606
1607 static reg_errcode_t
1608 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root)
1609 {
1610   reg_errcode_t err;
1611   unsigned int constraint;
1612   int i, incomplete;
1613   re_node_set eclosure;
1614   incomplete = 0;
1615   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1616   if (BE (err != REG_NOERROR, 0))
1617     return err;
1618
1619   /* This indicates that we are calculating this node now.
1620      We reference this value to avoid infinite loop.  */
1621   dfa->eclosures[node].nelem = -1;
1622
1623   constraint = ((dfa->nodes[node].type == ANCHOR)
1624                 ? dfa->nodes[node].opr.ctx_type : 0);
1625   /* If the current node has constraints, duplicate all nodes.
1626      Since they must inherit the constraints.  */
1627   if (constraint
1628       && dfa->edests[node].nelem
1629       && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1630     {
1631       int org_node, cur_node;
1632       org_node = cur_node = node;
1633       err = duplicate_node_closure (dfa, node, node, node, constraint);
1634       if (BE (err != REG_NOERROR, 0))
1635         return err;
1636     }
1637
1638   /* Expand each epsilon destination nodes.  */
1639   if (IS_EPSILON_NODE(dfa->nodes[node].type))
1640     for (i = 0; i < dfa->edests[node].nelem; ++i)
1641       {
1642         re_node_set eclosure_elem;
1643         int edest = dfa->edests[node].elems[i];
1644         /* If calculating the epsilon closure of `edest' is in progress,
1645            return intermediate result.  */
1646         if (dfa->eclosures[edest].nelem == -1)
1647           {
1648             incomplete = 1;
1649             continue;
1650           }
1651         /* If we haven't calculated the epsilon closure of `edest' yet,
1652            calculate now. Otherwise use calculated epsilon closure.  */
1653         if (dfa->eclosures[edest].nelem == 0)
1654           {
1655             err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1656             if (BE (err != REG_NOERROR, 0))
1657               return err;
1658           }
1659         else
1660           eclosure_elem = dfa->eclosures[edest];
1661         /* Merge the epsilon closure of `edest'.  */
1662         re_node_set_merge (&eclosure, &eclosure_elem);
1663         /* If the epsilon closure of `edest' is incomplete,
1664            the epsilon closure of this node is also incomplete.  */
1665         if (dfa->eclosures[edest].nelem == 0)
1666           {
1667             incomplete = 1;
1668             re_node_set_free (&eclosure_elem);
1669           }
1670       }
1671
1672   /* Epsilon closures include itself.  */
1673   re_node_set_insert (&eclosure, node);
1674   if (incomplete && !root)
1675     dfa->eclosures[node].nelem = 0;
1676   else
1677     dfa->eclosures[node] = eclosure;
1678   *new_set = eclosure;
1679   return REG_NOERROR;
1680 }
1681 \f
1682 /* Functions for token which are used in the parser.  */
1683
1684 /* Fetch a token from INPUT.
1685    We must not use this function inside bracket expressions.  */
1686
1687 static void
1688 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1689 {
1690   re_string_skip_bytes (input, peek_token (result, input, syntax));
1691 }
1692
1693 /* Peek a token from INPUT, and return the length of the token.
1694    We must not use this function inside bracket expressions.  */
1695
1696 static int
1697 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1698 {
1699   unsigned char c;
1700
1701   if (re_string_eoi (input))
1702     {
1703       token->type = END_OF_RE;
1704       return 0;
1705     }
1706
1707   c = re_string_peek_byte (input, 0);
1708   token->opr.c = c;
1709
1710   token->word_char = 0;
1711 #ifdef RE_ENABLE_I18N
1712   token->mb_partial = 0;
1713   if (input->mb_cur_max > 1 &&
1714       !re_string_first_byte (input, re_string_cur_idx (input)))
1715     {
1716       token->type = CHARACTER;
1717       token->mb_partial = 1;
1718       return 1;
1719     }
1720 #endif
1721   if (c == '\\')
1722     {
1723       unsigned char c2;
1724       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1725         {
1726           token->type = BACK_SLASH;
1727           return 1;
1728         }
1729
1730       c2 = re_string_peek_byte_case (input, 1);
1731       token->opr.c = c2;
1732       token->type = CHARACTER;
1733 #ifdef RE_ENABLE_I18N
1734       if (input->mb_cur_max > 1)
1735         {
1736           wint_t wc = re_string_wchar_at (input,
1737                                           re_string_cur_idx (input) + 1);
1738           token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1739         }
1740       else
1741 #endif
1742         token->word_char = IS_WORD_CHAR (c2) != 0;
1743
1744       switch (c2)
1745         {
1746         case '|':
1747           if (!(syntax & REG_LIMITED_OPS) && !(syntax & REG_NO_BK_VBAR))
1748             token->type = OP_ALT;
1749           break;
1750         case '1': case '2': case '3': case '4': case '5':
1751         case '6': case '7': case '8': case '9':
1752           if (!(syntax & REG_NO_BK_REFS))
1753             {
1754               token->type = OP_BACK_REF;
1755               token->opr.idx = c2 - '1';
1756             }
1757           break;
1758         case '<':
1759           if (!(syntax & REG_NO_GNU_OPS))
1760             {
1761               token->type = ANCHOR;
1762               token->opr.ctx_type = WORD_FIRST;
1763             }
1764           break;
1765         case '>':
1766           if (!(syntax & REG_NO_GNU_OPS))
1767             {
1768               token->type = ANCHOR;
1769               token->opr.ctx_type = WORD_LAST;
1770             }
1771           break;
1772         case 'b':
1773           if (!(syntax & REG_NO_GNU_OPS))
1774             {
1775               token->type = ANCHOR;
1776               token->opr.ctx_type = WORD_DELIM;
1777             }
1778           break;
1779         case 'B':
1780           if (!(syntax & REG_NO_GNU_OPS))
1781             {
1782               token->type = ANCHOR;
1783               token->opr.ctx_type = NOT_WORD_DELIM;
1784             }
1785           break;
1786         case 'w':
1787           if (!(syntax & REG_NO_GNU_OPS))
1788             token->type = OP_WORD;
1789           break;
1790         case 'W':
1791           if (!(syntax & REG_NO_GNU_OPS))
1792             token->type = OP_NOTWORD;
1793           break;
1794         case 's':
1795           if (!(syntax & REG_NO_GNU_OPS))
1796             token->type = OP_SPACE;
1797           break;
1798         case 'S':
1799           if (!(syntax & REG_NO_GNU_OPS))
1800             token->type = OP_NOTSPACE;
1801           break;
1802         case '`':
1803           if (!(syntax & REG_NO_GNU_OPS))
1804             {
1805               token->type = ANCHOR;
1806               token->opr.ctx_type = BUF_FIRST;
1807             }
1808           break;
1809         case '\'':
1810           if (!(syntax & REG_NO_GNU_OPS))
1811             {
1812               token->type = ANCHOR;
1813               token->opr.ctx_type = BUF_LAST;
1814             }
1815           break;
1816         case '(':
1817           if (!(syntax & REG_NO_BK_PARENS))
1818             token->type = OP_OPEN_SUBEXP;
1819           break;
1820         case ')':
1821           if (!(syntax & REG_NO_BK_PARENS))
1822             token->type = OP_CLOSE_SUBEXP;
1823           break;
1824         case '+':
1825           if (!(syntax & REG_LIMITED_OPS) && (syntax & REG_BK_PLUS_QM))
1826             token->type = OP_DUP_PLUS;
1827           break;
1828         case '?':
1829           if (!(syntax & REG_LIMITED_OPS) && (syntax & REG_BK_PLUS_QM))
1830             token->type = OP_DUP_QUESTION;
1831           break;
1832         case '{':
1833           if ((syntax & REG_INTERVALS) && (!(syntax & REG_NO_BK_BRACES)))
1834             token->type = OP_OPEN_DUP_NUM;
1835           break;
1836         case '}':
1837           if ((syntax & REG_INTERVALS) && (!(syntax & REG_NO_BK_BRACES)))
1838             token->type = OP_CLOSE_DUP_NUM;
1839           break;
1840         default:
1841           break;
1842         }
1843       return 2;
1844     }
1845
1846   token->type = CHARACTER;
1847 #ifdef RE_ENABLE_I18N
1848   if (input->mb_cur_max > 1)
1849     {
1850       wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1851       token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1852     }
1853   else
1854 #endif
1855     token->word_char = IS_WORD_CHAR (token->opr.c);
1856
1857   switch (c)
1858     {
1859     case '\n':
1860       if (syntax & REG_NEWLINE_ALT)
1861         token->type = OP_ALT;
1862       break;
1863     case '|':
1864       if (!(syntax & REG_LIMITED_OPS) && (syntax & REG_NO_BK_VBAR))
1865         token->type = OP_ALT;
1866       break;
1867     case '*':
1868       token->type = OP_DUP_ASTERISK;
1869       break;
1870     case '+':
1871       if (!(syntax & REG_LIMITED_OPS) && !(syntax & REG_BK_PLUS_QM))
1872         token->type = OP_DUP_PLUS;
1873       break;
1874     case '?':
1875       if (!(syntax & REG_LIMITED_OPS) && !(syntax & REG_BK_PLUS_QM))
1876         token->type = OP_DUP_QUESTION;
1877       break;
1878     case '{':
1879       if ((syntax & REG_INTERVALS) && (syntax & REG_NO_BK_BRACES))
1880         token->type = OP_OPEN_DUP_NUM;
1881       break;
1882     case '}':
1883       if ((syntax & REG_INTERVALS) && (syntax & REG_NO_BK_BRACES))
1884         token->type = OP_CLOSE_DUP_NUM;
1885       break;
1886     case '(':
1887       if (syntax & REG_NO_BK_PARENS)
1888         token->type = OP_OPEN_SUBEXP;
1889       break;
1890     case ')':
1891       if (syntax & REG_NO_BK_PARENS)
1892         token->type = OP_CLOSE_SUBEXP;
1893       break;
1894     case '[':
1895       token->type = OP_OPEN_BRACKET;
1896       break;
1897     case '.':
1898       token->type = OP_PERIOD;
1899       break;
1900     case '^':
1901       if (!(syntax & (REG_CONTEXT_INDEP_ANCHORS | REG_CARET_ANCHORS_HERE)) &&
1902           re_string_cur_idx (input) != 0)
1903         {
1904           char prev = re_string_peek_byte (input, -1);
1905           if (!(syntax & REG_NEWLINE_ALT) || prev != '\n')
1906             break;
1907         }
1908       token->type = ANCHOR;
1909       token->opr.ctx_type = LINE_FIRST;
1910       break;
1911     case '$':
1912       if (!(syntax & REG_CONTEXT_INDEP_ANCHORS) &&
1913           re_string_cur_idx (input) + 1 != re_string_length (input))
1914         {
1915           re_token_t next;
1916           re_string_skip_bytes (input, 1);
1917           peek_token (&next, input, syntax);
1918           re_string_skip_bytes (input, -1);
1919           if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1920             break;
1921         }
1922       token->type = ANCHOR;
1923       token->opr.ctx_type = LINE_LAST;
1924       break;
1925     default:
1926       break;
1927     }
1928   return 1;
1929 }
1930
1931 /* Peek a token from INPUT, and return the length of the token.
1932    We must not use this function out of bracket expressions.  */
1933
1934 static int
1935 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1936 {
1937   unsigned char c;
1938   if (re_string_eoi (input))
1939     {
1940       token->type = END_OF_RE;
1941       return 0;
1942     }
1943   c = re_string_peek_byte (input, 0);
1944   token->opr.c = c;
1945
1946 #ifdef RE_ENABLE_I18N
1947   if (input->mb_cur_max > 1 &&
1948       !re_string_first_byte (input, re_string_cur_idx (input)))
1949     {
1950       token->type = CHARACTER;
1951       return 1;
1952     }
1953 #endif /* RE_ENABLE_I18N */
1954
1955   if (c == '\\' && (syntax & REG_BACKSLASH_ESCAPE_IN_LISTS)
1956       && re_string_cur_idx (input) + 1 < re_string_length (input))
1957     {
1958       /* In this case, '\' escape a character.  */
1959       unsigned char c2;
1960       re_string_skip_bytes (input, 1);
1961       c2 = re_string_peek_byte (input, 0);
1962       token->opr.c = c2;
1963       token->type = CHARACTER;
1964       return 1;
1965     }
1966   if (c == '[') /* '[' is a special char in a bracket exps.  */
1967     {
1968       unsigned char c2;
1969       int token_len;
1970       if (re_string_cur_idx (input) + 1 < re_string_length (input))
1971         c2 = re_string_peek_byte (input, 1);
1972       else
1973         c2 = 0;
1974       token->opr.c = c2;
1975       token_len = 2;
1976       switch (c2)
1977         {
1978         case '.':
1979           token->type = OP_OPEN_COLL_ELEM;
1980           break;
1981         case '=':
1982           token->type = OP_OPEN_EQUIV_CLASS;
1983           break;
1984         case ':':
1985           if (syntax & REG_CHAR_CLASSES)
1986             {
1987               token->type = OP_OPEN_CHAR_CLASS;
1988               break;
1989             }
1990           /* else fall through.  */
1991         default:
1992           token->type = CHARACTER;
1993           token->opr.c = c;
1994           token_len = 1;
1995           break;
1996         }
1997       return token_len;
1998     }
1999   switch (c)
2000     {
2001     case '-':
2002       token->type = OP_CHARSET_RANGE;
2003       break;
2004     case ']':
2005       token->type = OP_CLOSE_BRACKET;
2006       break;
2007     case '^':
2008       token->type = OP_NON_MATCH_LIST;
2009       break;
2010     default:
2011       token->type = CHARACTER;
2012     }
2013   return 1;
2014 }
2015 \f
2016 /* Functions for parser.  */
2017
2018 /* Entry point of the parser.
2019    Parse the regular expression REGEXP and return the structure tree.
2020    If an error is occured, ERR is set by error code, and return NULL.
2021    This function build the following tree, from regular expression <reg_exp>:
2022            CAT
2023            / \
2024           /   \
2025    <reg_exp>  EOR
2026
2027    CAT means concatenation.
2028    EOR means end of regular expression.  */
2029
2030 static bin_tree_t *
2031 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2032        reg_errcode_t *err)
2033 {
2034   re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
2035   bin_tree_t *tree, *eor, *root;
2036   re_token_t current_token;
2037   dfa->syntax = syntax;
2038   fetch_token (&current_token, regexp, syntax | REG_CARET_ANCHORS_HERE);
2039   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2040   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2041     return NULL;
2042   eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2043   if (tree != NULL)
2044     root = create_tree (dfa, tree, eor, CONCAT);
2045   else
2046     root = eor;
2047   if (BE (eor == NULL || root == NULL, 0))
2048     {
2049       *err = REG_ESPACE;
2050       return NULL;
2051     }
2052   return root;
2053 }
2054
2055 /* This function build the following tree, from regular expression
2056    <branch1>|<branch2>:
2057            ALT
2058            / \
2059           /   \
2060    <branch1> <branch2>
2061
2062    ALT means alternative, which represents the operator `|'.  */
2063
2064 static bin_tree_t *
2065 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2066                reg_syntax_t syntax, int nest, reg_errcode_t *err)
2067 {
2068   re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
2069   bin_tree_t *tree, *branch = NULL;
2070   tree = parse_branch (regexp, preg, token, syntax, nest, err);
2071   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2072     return NULL;
2073
2074   while (token->type == OP_ALT)
2075     {
2076       fetch_token (token, regexp, syntax | REG_CARET_ANCHORS_HERE);
2077       if (token->type != OP_ALT && token->type != END_OF_RE
2078           && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2079         {
2080           branch = parse_branch (regexp, preg, token, syntax, nest, err);
2081           if (BE (*err != REG_NOERROR && branch == NULL, 0))
2082             return NULL;
2083         }
2084       else
2085         branch = NULL;
2086       tree = create_tree (dfa, tree, branch, OP_ALT);
2087       if (BE (tree == NULL, 0))
2088         {
2089           *err = REG_ESPACE;
2090           return NULL;
2091         }
2092     }
2093   return tree;
2094 }
2095
2096 /* This function build the following tree, from regular expression
2097    <exp1><exp2>:
2098         CAT
2099         / \
2100        /   \
2101    <exp1> <exp2>
2102
2103    CAT means concatenation.  */
2104
2105 static bin_tree_t *
2106 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2107               reg_syntax_t syntax, int nest, reg_errcode_t *err)
2108 {
2109   bin_tree_t *tree, *exp;
2110   re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
2111   tree = parse_expression (regexp, preg, token, syntax, nest, err);
2112   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2113     return NULL;
2114
2115   while (token->type != OP_ALT && token->type != END_OF_RE
2116          && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2117     {
2118       exp = parse_expression (regexp, preg, token, syntax, nest, err);
2119       if (BE (*err != REG_NOERROR && exp == NULL, 0))
2120         {
2121           return NULL;
2122         }
2123       if (tree != NULL && exp != NULL)
2124         {
2125           tree = create_tree (dfa, tree, exp, CONCAT);
2126           if (tree == NULL)
2127             {
2128               *err = REG_ESPACE;
2129               return NULL;
2130             }
2131         }
2132       else if (tree == NULL)
2133         tree = exp;
2134       /* Otherwise exp == NULL, we don't need to create new tree.  */
2135     }
2136   return tree;
2137 }
2138
2139 /* This function build the following tree, from regular expression a*:
2140          *
2141          |
2142          a
2143 */
2144
2145 static bin_tree_t *
2146 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2147                   reg_syntax_t syntax, int nest, reg_errcode_t *err)
2148 {
2149   re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
2150   bin_tree_t *tree;
2151   switch (token->type)
2152     {
2153     case CHARACTER:
2154       tree = create_token_tree (dfa, NULL, NULL, token);
2155       if (BE (tree == NULL, 0))
2156         {
2157           *err = REG_ESPACE;
2158           return NULL;
2159         }
2160 #ifdef RE_ENABLE_I18N
2161       if (dfa->mb_cur_max > 1)
2162         {
2163           while (!re_string_eoi (regexp)
2164                  && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2165             {
2166               bin_tree_t *mbc_remain;
2167               fetch_token (token, regexp, syntax);
2168               mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2169               tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2170               if (BE (mbc_remain == NULL || tree == NULL, 0))
2171                 {
2172                   *err = REG_ESPACE;
2173                   return NULL;
2174                 }
2175             }
2176         }
2177 #endif
2178       break;
2179     case OP_OPEN_SUBEXP:
2180       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2181       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2182         return NULL;
2183       break;
2184     case OP_OPEN_BRACKET:
2185       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2186       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2187         return NULL;
2188       break;
2189     case OP_BACK_REF:
2190       if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2191         {
2192           *err = REG_ESUBREG;
2193           return NULL;
2194         }
2195       dfa->used_bkref_map |= 1 << token->opr.idx;
2196       tree = create_token_tree (dfa, NULL, NULL, token);
2197       if (BE (tree == NULL, 0))
2198         {
2199           *err = REG_ESPACE;
2200           return NULL;
2201         }
2202       ++dfa->nbackref;
2203       dfa->has_mb_node = 1;
2204       break;
2205     case OP_OPEN_DUP_NUM:
2206       if (syntax & REG_CONTEXT_INVALID_DUP)
2207         {
2208           *err = REG_BADRPT;
2209           return NULL;
2210         }
2211       /* FALLTHROUGH */
2212     case OP_DUP_ASTERISK:
2213     case OP_DUP_PLUS:
2214     case OP_DUP_QUESTION:
2215       if (syntax & REG_CONTEXT_INVALID_OPS)
2216         {
2217           *err = REG_BADRPT;
2218           return NULL;
2219         }
2220       else if (syntax & REG_CONTEXT_INDEP_OPS)
2221         {
2222           fetch_token (token, regexp, syntax);
2223           return parse_expression (regexp, preg, token, syntax, nest, err);
2224         }
2225       /* else fall through  */
2226     case OP_CLOSE_SUBEXP:
2227       if ((token->type == OP_CLOSE_SUBEXP) &&
2228           !(syntax & REG_UNMATCHED_RIGHT_PAREN_ORD))
2229         {
2230           *err = REG_ERPAREN;
2231           return NULL;
2232         }
2233       /* else fall through  */
2234     case OP_CLOSE_DUP_NUM:
2235       /* We treat it as a normal character.  */
2236
2237       /* Then we can these characters as normal characters.  */
2238       token->type = CHARACTER;
2239       /* mb_partial and word_char bits should be initialized already
2240          by peek_token.  */
2241       tree = create_token_tree (dfa, NULL, NULL, token);
2242       if (BE (tree == NULL, 0))
2243         {
2244           *err = REG_ESPACE;
2245           return NULL;
2246         }
2247       break;
2248     case ANCHOR:
2249       if ((token->opr.ctx_type
2250            & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2251           && dfa->word_ops_used == 0)
2252         init_word_char (dfa);
2253       if (token->opr.ctx_type == WORD_DELIM
2254           || token->opr.ctx_type == NOT_WORD_DELIM)
2255         {
2256           bin_tree_t *tree_first, *tree_last;
2257           if (token->opr.ctx_type == WORD_DELIM)
2258             {
2259               token->opr.ctx_type = WORD_FIRST;
2260               tree_first = create_token_tree (dfa, NULL, NULL, token);
2261               token->opr.ctx_type = WORD_LAST;
2262             }
2263           else
2264             {
2265               token->opr.ctx_type = INSIDE_WORD;
2266               tree_first = create_token_tree (dfa, NULL, NULL, token);
2267               token->opr.ctx_type = INSIDE_NOTWORD;
2268             }
2269           tree_last = create_token_tree (dfa, NULL, NULL, token);
2270           tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2271           if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2272             {
2273               *err = REG_ESPACE;
2274               return NULL;
2275             }
2276         }
2277       else
2278         {
2279           tree = create_token_tree (dfa, NULL, NULL, token);
2280           if (BE (tree == NULL, 0))
2281             {
2282               *err = REG_ESPACE;
2283               return NULL;
2284             }
2285         }
2286       /* We must return here, since ANCHORs can't be followed
2287          by repetition operators.
2288          eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2289              it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2290       fetch_token (token, regexp, syntax);
2291       return tree;
2292     case OP_PERIOD:
2293       tree = create_token_tree (dfa, NULL, NULL, token);
2294       if (BE (tree == NULL, 0))
2295         {
2296           *err = REG_ESPACE;
2297           return NULL;
2298         }
2299       if (dfa->mb_cur_max > 1)
2300         dfa->has_mb_node = 1;
2301       break;
2302     case OP_WORD:
2303     case OP_NOTWORD:
2304       tree = build_charclass_op (dfa, regexp->trans,
2305                                  (const unsigned char *) "alnum",
2306                                  (const unsigned char *) "_",
2307                                  token->type == OP_NOTWORD, err);
2308       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2309         return NULL;
2310       break;
2311     case OP_SPACE:
2312     case OP_NOTSPACE:
2313       tree = build_charclass_op (dfa, regexp->trans,
2314                                  (const unsigned char *) "space",
2315                                  (const unsigned char *) "",
2316                                  token->type == OP_NOTSPACE, err);
2317       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2318         return NULL;
2319       break;
2320     case OP_ALT:
2321     case END_OF_RE:
2322       return NULL;
2323     case BACK_SLASH:
2324       *err = REG_EESCAPE;
2325       return NULL;
2326     default:
2327       /* Must not happen?  */
2328 #ifdef DEBUG
2329       assert (0);
2330 #endif
2331       return NULL;
2332     }
2333   fetch_token (token, regexp, syntax);
2334
2335   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2336          || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2337     {
2338       tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2339       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2340         return NULL;
2341       /* In BRE consecutive duplications are not allowed.  */
2342       if ((syntax & REG_CONTEXT_INVALID_DUP)
2343           && (token->type == OP_DUP_ASTERISK
2344               || token->type == OP_OPEN_DUP_NUM))
2345         {
2346           *err = REG_BADRPT;
2347           return NULL;
2348         }
2349     }
2350
2351   return tree;
2352 }
2353
2354 /* This function build the following tree, from regular expression
2355    (<reg_exp>):
2356          SUBEXP
2357             |
2358         <reg_exp>
2359 */
2360
2361 static bin_tree_t *
2362 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2363                reg_syntax_t syntax, int nest, reg_errcode_t *err)
2364 {
2365   re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
2366   bin_tree_t *tree;
2367   size_t cur_nsub;
2368   cur_nsub = preg->re_nsub++;
2369
2370   fetch_token (token, regexp, syntax | REG_CARET_ANCHORS_HERE);
2371
2372   /* The subexpression may be a null string.  */
2373   if (token->type == OP_CLOSE_SUBEXP)
2374     tree = NULL;
2375   else
2376     {
2377       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2378       if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2379         *err = REG_EPAREN;
2380       if (BE (*err != REG_NOERROR, 0))
2381         return NULL;
2382     }
2383
2384   if (cur_nsub <= '9' - '1')
2385     dfa->completed_bkref_map |= 1 << cur_nsub;
2386
2387   tree = create_tree (dfa, tree, NULL, SUBEXP);
2388   if (BE (tree == NULL, 0))
2389     {
2390       *err = REG_ESPACE;
2391       return NULL;
2392     }
2393   tree->token.opr.idx = cur_nsub;
2394   return tree;
2395 }
2396
2397 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2398
2399 static bin_tree_t *
2400 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2401               re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2402 {
2403   bin_tree_t *tree = NULL, *old_tree = NULL;
2404   int i, start, end, start_idx = re_string_cur_idx (regexp);
2405   re_token_t start_token = *token;
2406
2407   if (token->type == OP_OPEN_DUP_NUM)
2408     {
2409       end = 0;
2410       start = fetch_number (regexp, token, syntax);
2411       if (start == -1)
2412         {
2413           if (token->type == CHARACTER && token->opr.c == ',')
2414             start = 0; /* We treat "{,m}" as "{0,m}".  */
2415           else
2416             {
2417               *err = REG_BADBR; /* <re>{} is invalid.  */
2418               return NULL;
2419             }
2420         }
2421       if (BE (start != -2, 1))
2422         {
2423           /* We treat "{n}" as "{n,n}".  */
2424           end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2425                  : ((token->type == CHARACTER && token->opr.c == ',')
2426                     ? fetch_number (regexp, token, syntax) : -2));
2427         }
2428       if (BE (start == -2 || end == -2, 0))
2429         {
2430           /* Invalid sequence.  */
2431           if (BE (!(syntax & REG_INVALID_INTERVAL_ORD), 0))
2432             {
2433               if (token->type == END_OF_RE)
2434                 *err = REG_EBRACE;
2435               else
2436                 *err = REG_BADBR;
2437
2438               return NULL;
2439             }
2440
2441           /* If the syntax bit is set, rollback.  */
2442           re_string_set_index (regexp, start_idx);
2443           *token = start_token;
2444           token->type = CHARACTER;
2445           /* mb_partial and word_char bits should be already initialized by
2446              peek_token.  */
2447           return elem;
2448         }
2449
2450       if (BE (end != -1 && start > end, 0))
2451         {
2452           /* First number greater than second.  */
2453           *err = REG_BADBR;
2454           return NULL;
2455         }
2456     }
2457   else
2458     {
2459       start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2460       end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2461     }
2462
2463   fetch_token (token, regexp, syntax);
2464
2465   if (BE (elem == NULL, 0))
2466     return NULL;
2467   if (BE (start == 0 && end == 0, 0))
2468     {
2469       postorder (elem, free_tree, NULL);
2470       return NULL;
2471     }
2472
2473   /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2474   if (BE (start > 0, 0))
2475     {
2476       tree = elem;
2477       for (i = 2; i <= start; ++i)
2478         {
2479           elem = duplicate_tree (elem, dfa);
2480           tree = create_tree (dfa, tree, elem, CONCAT);
2481           if (BE (elem == NULL || tree == NULL, 0))
2482             goto parse_dup_op_espace;
2483         }
2484
2485       if (start == end)
2486         return tree;
2487
2488       /* Duplicate ELEM before it is marked optional.  */
2489       elem = duplicate_tree (elem, dfa);
2490       old_tree = tree;
2491     }
2492   else
2493     old_tree = NULL;
2494
2495   if (elem->token.type == SUBEXP)
2496     postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2497
2498   tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2499   if (BE (tree == NULL, 0))
2500     goto parse_dup_op_espace;
2501
2502   /* This loop is actually executed only when end != -1,
2503      to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2504      already created the start+1-th copy.  */
2505   for (i = start + 2; i <= end; ++i)
2506     {
2507       elem = duplicate_tree (elem, dfa);
2508       tree = create_tree (dfa, tree, elem, CONCAT);
2509       if (BE (elem == NULL || tree == NULL, 0))
2510         goto parse_dup_op_espace;
2511
2512       tree = create_tree (dfa, tree, NULL, OP_ALT);
2513       if (BE (tree == NULL, 0))
2514         goto parse_dup_op_espace;
2515     }
2516
2517   if (old_tree)
2518     tree = create_tree (dfa, old_tree, tree, CONCAT);
2519
2520   return tree;
2521
2522  parse_dup_op_espace:
2523   *err = REG_ESPACE;
2524   return NULL;
2525 }
2526
2527 /* Size of the names for collating symbol/equivalence_class/character_class.
2528    I'm not sure, but maybe enough.  */
2529 #define BRACKET_NAME_BUF_SIZE 32
2530
2531 #ifndef _LIBC
2532   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2533      Build the range expression which starts from START_ELEM, and ends
2534      at END_ELEM.  The result are written to MBCSET and SBCSET.
2535      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2536      mbcset->range_ends, is a pointer argument sinse we may
2537      update it.  */
2538
2539 static reg_errcode_t
2540 build_range_exp (re_bitset_ptr_t sbcset,
2541 # ifdef RE_ENABLE_I18N
2542                  re_charset_t *mbcset, int *range_alloc,
2543 # endif
2544                  bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2545 {
2546   unsigned int start_ch, end_ch;
2547   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2548   if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2549           || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2550           0))
2551     return REG_ERANGE;
2552
2553   /* We can handle no multi character collating elements without libc
2554      support.  */
2555   if (BE ((start_elem->type == COLL_SYM
2556            && strlen ((char *) start_elem->opr.name) > 1)
2557           || (end_elem->type == COLL_SYM
2558               && strlen ((char *) end_elem->opr.name) > 1), 0))
2559     return REG_ECOLLATE;
2560
2561 # ifdef RE_ENABLE_I18N
2562   {
2563     wchar_t wc;
2564     wint_t start_wc, end_wc;
2565     wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2566
2567     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2568                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2569                    : 0));
2570     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2571               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2572                  : 0));
2573     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2574                 ? __btowc (start_ch) : start_elem->opr.wch);
2575     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2576               ? __btowc (end_ch) : end_elem->opr.wch);
2577     if (start_wc == WEOF || end_wc == WEOF)
2578       return REG_ECOLLATE;
2579     cmp_buf[0] = start_wc;
2580     cmp_buf[4] = end_wc;
2581     if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2582       return REG_ERANGE;
2583
2584     /* Got valid collation sequence values, add them as a new entry.
2585        However, for !_LIBC we have no collation elements: if the
2586        character set is single byte, the single byte character set
2587        that we build below suffices.  parse_bracket_exp passes
2588        no MBCSET if dfa->mb_cur_max == 1.  */
2589     if (mbcset)
2590       {
2591         /* Check the space of the arrays.  */
2592         if (BE (*range_alloc == mbcset->nranges, 0))
2593           {
2594             /* There is not enough space, need realloc.  */
2595             wchar_t *new_array_start, *new_array_end;
2596             int new_nranges;
2597
2598             /* +1 in case of mbcset->nranges is 0.  */
2599             new_nranges = 2 * mbcset->nranges + 1;
2600             /* Use realloc since mbcset->range_starts and mbcset->range_ends
2601                are NULL if *range_alloc == 0.  */
2602             new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2603                                           new_nranges);
2604             new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2605                                         new_nranges);
2606
2607             if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2608               return REG_ESPACE;
2609
2610             mbcset->range_starts = new_array_start;
2611             mbcset->range_ends = new_array_end;
2612             *range_alloc = new_nranges;
2613           }
2614
2615         mbcset->range_starts[mbcset->nranges] = start_wc;
2616         mbcset->range_ends[mbcset->nranges++] = end_wc;
2617       }
2618
2619     /* Build the table for single byte characters.  */
2620     for (wc = 0; wc < SBC_MAX; ++wc)
2621       {
2622         cmp_buf[2] = wc;
2623         if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2624             && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2625           bitset_set (sbcset, wc);
2626       }
2627   }
2628 # else /* not RE_ENABLE_I18N */
2629   {
2630     unsigned int ch;
2631     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2632                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2633                    : 0));
2634     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2635               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2636                  : 0));
2637     if (start_ch > end_ch)
2638       return REG_ERANGE;
2639     /* Build the table for single byte characters.  */
2640     for (ch = 0; ch < SBC_MAX; ++ch)
2641       if (start_ch <= ch  && ch <= end_ch)
2642         bitset_set (sbcset, ch);
2643   }
2644 # endif /* not RE_ENABLE_I18N */
2645   return REG_NOERROR;
2646 }
2647 #endif /* not _LIBC */
2648
2649 #ifndef _LIBC
2650 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2651    Build the collating element which is represented by NAME.
2652    The result are written to MBCSET and SBCSET.
2653    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2654    pointer argument since we may update it.  */
2655
2656 static reg_errcode_t
2657 build_collating_symbol (re_bitset_ptr_t sbcset,
2658 # ifdef RE_ENABLE_I18N
2659                         re_charset_t *mbcset, int *coll_sym_alloc,
2660 # endif
2661                         const unsigned char *name)
2662 {
2663   size_t name_len = strlen ((const char *) name);
2664   if (BE (name_len != 1, 0))
2665     return REG_ECOLLATE;
2666   else
2667     {
2668       bitset_set (sbcset, name[0]);
2669       return REG_NOERROR;
2670     }
2671 }
2672 #endif /* not _LIBC */
2673
2674 /* This function parse bracket expression like "[abc]", "[a-c]",
2675    "[[.a-a.]]" etc.  */
2676
2677 static bin_tree_t *
2678 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2679                    reg_syntax_t syntax, reg_errcode_t *err)
2680 {
2681 #ifdef _LIBC
2682   const unsigned char *collseqmb;
2683   const char *collseqwc;
2684   uint32_t nrules;
2685   int32_t table_size;
2686   const int32_t *symb_table;
2687   const unsigned char *extra;
2688
2689   /* Local function for parse_bracket_exp used in _LIBC environement.
2690      Seek the collating symbol entry correspondings to NAME.
2691      Return the index of the symbol in the SYMB_TABLE.  */
2692
2693   auto inline int32_t
2694   __attribute ((always_inline))
2695   seek_collating_symbol_entry (name, name_len)
2696          const unsigned char *name;
2697          size_t name_len;
2698     {
2699       int32_t hash = elem_hash ((const char *) name, name_len);
2700       int32_t elem = hash % table_size;
2701       int32_t second = hash % (table_size - 2);
2702       while (symb_table[2 * elem] != 0)
2703         {
2704           /* First compare the hashing value.  */
2705           if (symb_table[2 * elem] == hash
2706               /* Compare the length of the name.  */
2707               && name_len == extra[symb_table[2 * elem + 1]]
2708               /* Compare the name.  */
2709               && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2710                          name_len) == 0)
2711             {
2712               /* Yep, this is the entry.  */
2713               break;
2714             }
2715
2716           /* Next entry.  */
2717           elem += second;
2718         }
2719       return elem;
2720     }
2721
2722   /* Local function for parse_bracket_exp used in _LIBC environement.
2723      Look up the collation sequence value of BR_ELEM.
2724      Return the value if succeeded, UINT_MAX otherwise.  */
2725
2726   auto inline unsigned int
2727   __attribute ((always_inline))
2728   lookup_collation_sequence_value (br_elem)
2729          bracket_elem_t *br_elem;
2730     {
2731       if (br_elem->type == SB_CHAR)
2732         {
2733           /*
2734           if (MB_CUR_MAX == 1)
2735           */
2736           if (nrules == 0)
2737             return collseqmb[br_elem->opr.ch];
2738           else
2739             {
2740               wint_t wc = __btowc (br_elem->opr.ch);
2741               return __collseq_table_lookup (collseqwc, wc);
2742             }
2743         }
2744       else if (br_elem->type == MB_CHAR)
2745         {
2746           return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2747         }
2748       else if (br_elem->type == COLL_SYM)
2749         {
2750           size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2751           if (nrules != 0)
2752             {
2753               int32_t elem, idx;
2754               elem = seek_collating_symbol_entry (br_elem->opr.name,
2755                                                   sym_name_len);
2756               if (symb_table[2 * elem] != 0)
2757                 {
2758                   /* We found the entry.  */
2759                   idx = symb_table[2 * elem + 1];
2760                   /* Skip the name of collating element name.  */
2761                   idx += 1 + extra[idx];
2762                   /* Skip the byte sequence of the collating element.  */
2763                   idx += 1 + extra[idx];
2764                   /* Adjust for the alignment.  */
2765                   idx = (idx + 3) & ~3;
2766                   /* Skip the multibyte collation sequence value.  */
2767                   idx += sizeof (unsigned int);
2768                   /* Skip the wide char sequence of the collating element.  */
2769                   idx += sizeof (unsigned int) *
2770                     (1 + *(unsigned int *) (extra + idx));
2771                   /* Return the collation sequence value.  */
2772                   return *(unsigned int *) (extra + idx);
2773                 }
2774               else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2775                 {
2776                   /* No valid character.  Match it as a single byte
2777                      character.  */
2778                   return collseqmb[br_elem->opr.name[0]];
2779                 }
2780             }
2781           else if (sym_name_len == 1)
2782             return collseqmb[br_elem->opr.name[0]];
2783         }
2784       return UINT_MAX;
2785     }
2786
2787   /* Local function for parse_bracket_exp used in _LIBC environement.
2788      Build the range expression which starts from START_ELEM, and ends
2789      at END_ELEM.  The result are written to MBCSET and SBCSET.
2790      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2791      mbcset->range_ends, is a pointer argument sinse we may
2792      update it.  */
2793
2794   auto inline reg_errcode_t
2795   __attribute ((always_inline))
2796   build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2797          re_charset_t *mbcset;
2798          int *range_alloc;
2799          re_bitset_ptr_t sbcset;
2800          bracket_elem_t *start_elem, *end_elem;
2801     {
2802       unsigned int ch;
2803       uint32_t start_collseq;
2804       uint32_t end_collseq;
2805
2806       /* Equivalence Classes and Character Classes can't be a range
2807          start/end.  */
2808       if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2809               || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2810               0))
2811         return REG_ERANGE;
2812
2813       start_collseq = lookup_collation_sequence_value (start_elem);
2814       end_collseq = lookup_collation_sequence_value (end_elem);
2815       /* Check start/end collation sequence values.  */
2816       if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2817         return REG_ECOLLATE;
2818       if (BE ((syntax & REG_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2819         return REG_ERANGE;
2820
2821       /* Got valid collation sequence values, add them as a new entry.
2822          However, if we have no collation elements, and the character set
2823          is single byte, the single byte character set that we
2824          build below suffices. */
2825       if (nrules > 0 || dfa->mb_cur_max > 1)
2826         {
2827           /* Check the space of the arrays.  */
2828           if (BE (*range_alloc == mbcset->nranges, 0))
2829             {
2830               /* There is not enough space, need realloc.  */
2831               uint32_t *new_array_start;
2832               uint32_t *new_array_end;
2833               int new_nranges;
2834
2835               /* +1 in case of mbcset->nranges is 0.  */
2836               new_nranges = 2 * mbcset->nranges + 1;
2837               new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2838                                             new_nranges);
2839               new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2840                                           new_nranges);
2841
2842               if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2843                 return REG_ESPACE;
2844
2845               mbcset->range_starts = new_array_start;
2846               mbcset->range_ends = new_array_end;
2847               *range_alloc = new_nranges;
2848             }
2849
2850           mbcset->range_starts[mbcset->nranges] = start_collseq;
2851           mbcset->range_ends[mbcset->nranges++] = end_collseq;
2852         }
2853
2854       /* Build the table for single byte characters.  */
2855       for (ch = 0; ch < SBC_MAX; ch++)
2856         {
2857           uint32_t ch_collseq;
2858           /*
2859           if (MB_CUR_MAX == 1)
2860           */
2861           if (nrules == 0)
2862             ch_collseq = collseqmb[ch];
2863           else
2864             ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2865           if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2866             bitset_set (sbcset, ch);
2867         }
2868       return REG_NOERROR;
2869     }
2870
2871   /* Local function for parse_bracket_exp used in _LIBC environement.
2872      Build the collating element which is represented by NAME.
2873      The result are written to MBCSET and SBCSET.
2874      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2875      pointer argument sinse we may update it.  */
2876
2877   auto inline reg_errcode_t
2878   __attribute ((always_inline))
2879   build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2880          re_charset_t *mbcset;
2881          int *coll_sym_alloc;
2882          re_bitset_ptr_t sbcset;
2883          const unsigned char *name;
2884     {
2885       int32_t elem, idx;
2886       size_t name_len = strlen ((const char *) name);
2887       if (nrules != 0)
2888         {
2889           elem = seek_collating_symbol_entry (name, name_len);
2890           if (symb_table[2 * elem] != 0)
2891             {
2892               /* We found the entry.  */
2893               idx = symb_table[2 * elem + 1];
2894               /* Skip the name of collating element name.  */
2895               idx += 1 + extra[idx];
2896             }
2897           else if (symb_table[2 * elem] == 0 && name_len == 1)
2898             {
2899               /* No valid character, treat it as a normal
2900                  character.  */
2901               bitset_set (sbcset, name[0]);
2902               return REG_NOERROR;
2903             }
2904           else
2905             return REG_ECOLLATE;
2906
2907           /* Got valid collation sequence, add it as a new entry.  */
2908           /* Check the space of the arrays.  */
2909           if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
2910             {
2911               /* Not enough, realloc it.  */
2912               /* +1 in case of mbcset->ncoll_syms is 0.  */
2913               int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2914               /* Use realloc since mbcset->coll_syms is NULL
2915                  if *alloc == 0.  */
2916               int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2917                                                    new_coll_sym_alloc);
2918               if (BE (new_coll_syms == NULL, 0))
2919                 return REG_ESPACE;
2920               mbcset->coll_syms = new_coll_syms;
2921               *coll_sym_alloc = new_coll_sym_alloc;
2922             }
2923           mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
2924           return REG_NOERROR;
2925         }
2926       else
2927         {
2928           if (BE (name_len != 1, 0))
2929             return REG_ECOLLATE;
2930           else
2931             {
2932               bitset_set (sbcset, name[0]);
2933               return REG_NOERROR;
2934             }
2935         }
2936     }
2937 #endif
2938
2939   re_token_t br_token;
2940   re_bitset_ptr_t sbcset;
2941 #ifdef RE_ENABLE_I18N
2942   re_charset_t *mbcset;
2943   int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
2944   int equiv_class_alloc = 0, char_class_alloc = 0;
2945 #endif /* not RE_ENABLE_I18N */
2946   int non_match = 0;
2947   bin_tree_t *work_tree;
2948   int token_len;
2949   int first_round = 1;
2950 #ifdef _LIBC
2951   collseqmb = (const unsigned char *)
2952     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
2953   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
2954   if (nrules)
2955     {
2956       /*
2957       if (MB_CUR_MAX > 1)
2958       */
2959         collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
2960       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
2961       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
2962                                                   _NL_COLLATE_SYMB_TABLEMB);
2963       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
2964                                                    _NL_COLLATE_SYMB_EXTRAMB);
2965     }
2966 #endif
2967   sbcset = re_calloc (unsigned int, BITSET_UINTS);
2968 #ifdef RE_ENABLE_I18N
2969   mbcset = re_calloc (re_charset_t, 1);
2970 #endif /* RE_ENABLE_I18N */
2971 #ifdef RE_ENABLE_I18N
2972   if (BE (sbcset == NULL || mbcset == NULL, 0))
2973 #else
2974   if (BE (sbcset == NULL, 0))
2975 #endif /* RE_ENABLE_I18N */
2976     {
2977       *err = REG_ESPACE;
2978       return NULL;
2979     }
2980
2981   token_len = peek_token_bracket (token, regexp, syntax);
2982   if (BE (token->type == END_OF_RE, 0))
2983     {
2984       *err = REG_BADPAT;
2985       goto parse_bracket_exp_free_return;
2986     }
2987   if (token->type == OP_NON_MATCH_LIST)
2988     {
2989 #ifdef RE_ENABLE_I18N
2990       mbcset->non_match = 1;
2991 #endif /* not RE_ENABLE_I18N */
2992       non_match = 1;
2993       if (syntax & REG_HAT_LISTS_NOT_NEWLINE)
2994         bitset_set (sbcset, '\0');
2995       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
2996       token_len = peek_token_bracket (token, regexp, syntax);
2997       if (BE (token->type == END_OF_RE, 0))
2998         {
2999           *err = REG_BADPAT;
3000           goto parse_bracket_exp_free_return;
3001         }
3002     }
3003
3004   /* We treat the first ']' as a normal character.  */
3005   if (token->type == OP_CLOSE_BRACKET)
3006     token->type = CHARACTER;
3007
3008   while (1)
3009     {
3010       bracket_elem_t start_elem, end_elem;
3011       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3012       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3013       reg_errcode_t ret;
3014       int token_len2 = 0, is_range_exp = 0;
3015       re_token_t token2;
3016
3017       start_elem.opr.name = start_name_buf;
3018       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3019                                    syntax, first_round);
3020       if (BE (ret != REG_NOERROR, 0))
3021         {
3022           *err = ret;
3023           goto parse_bracket_exp_free_return;
3024         }
3025       first_round = 0;
3026
3027       /* Get information about the next token.  We need it in any case.  */
3028       token_len = peek_token_bracket (token, regexp, syntax);
3029
3030       /* Do not check for ranges if we know they are not allowed.  */
3031       if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3032         {
3033           if (BE (token->type == END_OF_RE, 0))
3034             {
3035               *err = REG_EBRACK;
3036               goto parse_bracket_exp_free_return;
3037             }
3038           if (token->type == OP_CHARSET_RANGE)
3039             {
3040               re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3041               token_len2 = peek_token_bracket (&token2, regexp, syntax);
3042               if (BE (token2.type == END_OF_RE, 0))
3043                 {
3044                   *err = REG_EBRACK;
3045                   goto parse_bracket_exp_free_return;
3046                 }
3047               if (token2.type == OP_CLOSE_BRACKET)
3048                 {
3049                   /* We treat the last '-' as a normal character.  */
3050                   re_string_skip_bytes (regexp, -token_len);
3051                   token->type = CHARACTER;
3052                 }
3053               else
3054                 is_range_exp = 1;
3055             }
3056         }
3057
3058       if (is_range_exp == 1)
3059         {
3060           end_elem.opr.name = end_name_buf;
3061           ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3062                                        dfa, syntax, 1);
3063           if (BE (ret != REG_NOERROR, 0))
3064             {
3065               *err = ret;
3066               goto parse_bracket_exp_free_return;
3067             }
3068
3069           token_len = peek_token_bracket (token, regexp, syntax);
3070
3071 #ifdef _LIBC
3072           *err = build_range_exp (sbcset, mbcset, &range_alloc,
3073                                   &start_elem, &end_elem);
3074 #else
3075 # ifdef RE_ENABLE_I18N
3076           *err = build_range_exp (sbcset,
3077                                   dfa->mb_cur_max > 1 ? mbcset : NULL,
3078                                   &range_alloc, &start_elem, &end_elem);
3079 # else
3080           *err = build_range_exp (sbcset, &start_elem, &end_elem);
3081 # endif
3082 #endif /* RE_ENABLE_I18N */
3083           if (BE (*err != REG_NOERROR, 0))
3084             goto parse_bracket_exp_free_return;
3085         }
3086       else
3087         {
3088           switch (start_elem.type)
3089             {
3090             case SB_CHAR:
3091               bitset_set (sbcset, start_elem.opr.ch);
3092               break;
3093 #ifdef RE_ENABLE_I18N
3094             case MB_CHAR:
3095               /* Check whether the array has enough space.  */
3096               if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3097                 {
3098                   wchar_t *new_mbchars;
3099                   /* Not enough, realloc it.  */
3100                   /* +1 in case of mbcset->nmbchars is 0.  */
3101                   mbchar_alloc = 2 * mbcset->nmbchars + 1;
3102                   /* Use realloc since array is NULL if *alloc == 0.  */
3103                   new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3104                                             mbchar_alloc);
3105                   if (BE (new_mbchars == NULL, 0))
3106                     goto parse_bracket_exp_espace;
3107                   mbcset->mbchars = new_mbchars;
3108                 }
3109               mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3110               break;
3111 #endif /* RE_ENABLE_I18N */
3112             case EQUIV_CLASS:
3113               *err = build_equiv_class (sbcset,
3114 #ifdef RE_ENABLE_I18N
3115                                         mbcset, &equiv_class_alloc,
3116 #endif /* RE_ENABLE_I18N */
3117                                         start_elem.opr.name);
3118               if (BE (*err != REG_NOERROR, 0))
3119                 goto parse_bracket_exp_free_return;
3120               break;
3121             case COLL_SYM:
3122               *err = build_collating_symbol (sbcset,
3123 #ifdef RE_ENABLE_I18N
3124                                              mbcset, &coll_sym_alloc,
3125 #endif /* RE_ENABLE_I18N */
3126                                              start_elem.opr.name);
3127               if (BE (*err != REG_NOERROR, 0))
3128                 goto parse_bracket_exp_free_return;
3129               break;
3130             case CHAR_CLASS:
3131               *err = build_charclass (regexp->trans, sbcset,
3132 #ifdef RE_ENABLE_I18N
3133                                       mbcset, &char_class_alloc,
3134 #endif /* RE_ENABLE_I18N */
3135                                       start_elem.opr.name, syntax);
3136               if (BE (*err != REG_NOERROR, 0))
3137                goto parse_bracket_exp_free_return;
3138               break;
3139             default:
3140               assert (0);
3141               break;
3142             }
3143         }
3144       if (BE (token->type == END_OF_RE, 0))
3145         {
3146           *err = REG_EBRACK;
3147           goto parse_bracket_exp_free_return;
3148         }
3149       if (token->type == OP_CLOSE_BRACKET)
3150         break;
3151     }
3152
3153   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3154
3155   /* If it is non-matching list.  */
3156   if (non_match)
3157     bitset_not (sbcset);
3158
3159 #ifdef RE_ENABLE_I18N
3160   /* Ensure only single byte characters are set.  */
3161   if (dfa->mb_cur_max > 1)
3162     bitset_mask (sbcset, dfa->sb_char);
3163
3164   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3165       || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3166                                                      || mbcset->non_match)))
3167     {
3168       bin_tree_t *mbc_tree;
3169       int sbc_idx;
3170       /* Build a tree for complex bracket.  */
3171       dfa->has_mb_node = 1;
3172       br_token.type = COMPLEX_BRACKET;
3173       br_token.opr.mbcset = mbcset;
3174       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3175       if (BE (mbc_tree == NULL, 0))
3176         goto parse_bracket_exp_espace;
3177       for (sbc_idx = 0; sbc_idx < BITSET_UINTS; ++sbc_idx)
3178         if (sbcset[sbc_idx])
3179           break;
3180       /* If there are no bits set in sbcset, there is no point
3181          of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3182       if (sbc_idx < BITSET_UINTS)
3183         {
3184           /* Build a tree for simple bracket.  */
3185           br_token.type = SIMPLE_BRACKET;
3186           br_token.opr.sbcset = sbcset;
3187           work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3188           if (BE (work_tree == NULL, 0))
3189             goto parse_bracket_exp_espace;
3190
3191           /* Then join them by ALT node.  */
3192           work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3193           if (BE (work_tree == NULL, 0))
3194             goto parse_bracket_exp_espace;
3195         }
3196       else
3197         {
3198           re_free (sbcset);
3199           work_tree = mbc_tree;
3200         }
3201     }
3202   else
3203 #endif /* not RE_ENABLE_I18N */
3204     {
3205 #ifdef RE_ENABLE_I18N
3206       free_charset (mbcset);
3207 #endif
3208       /* Build a tree for simple bracket.  */
3209       br_token.type = SIMPLE_BRACKET;
3210       br_token.opr.sbcset = sbcset;
3211       work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3212       if (BE (work_tree == NULL, 0))
3213         goto parse_bracket_exp_espace;
3214     }
3215   return work_tree;
3216
3217  parse_bracket_exp_espace:
3218   *err = REG_ESPACE;
3219  parse_bracket_exp_free_return:
3220   re_free (sbcset);
3221 #ifdef RE_ENABLE_I18N
3222   free_charset (mbcset);
3223 #endif /* RE_ENABLE_I18N */
3224   return NULL;
3225 }
3226
3227 /* Parse an element in the bracket expression.  */
3228
3229 static reg_errcode_t
3230 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3231                        re_token_t *token, int token_len, re_dfa_t *dfa,
3232                        reg_syntax_t syntax, int accept_hyphen)
3233 {
3234 #ifdef RE_ENABLE_I18N
3235   int cur_char_size;
3236   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3237   if (cur_char_size > 1)
3238     {
3239       elem->type = MB_CHAR;
3240       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3241       re_string_skip_bytes (regexp, cur_char_size);
3242       return REG_NOERROR;
3243     }
3244 #endif /* RE_ENABLE_I18N */
3245   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3246   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3247       || token->type == OP_OPEN_EQUIV_CLASS)
3248     return parse_bracket_symbol (elem, regexp, token);
3249   if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3250     {
3251       /* A '-' must only appear as anything but a range indicator before
3252          the closing bracket.  Everything else is an error.  */
3253       re_token_t token2;
3254       (void) peek_token_bracket (&token2, regexp, syntax);
3255       if (token2.type != OP_CLOSE_BRACKET)
3256         /* The actual error value is not standardized since this whole
3257            case is undefined.  But ERANGE makes good sense.  */
3258         return REG_ERANGE;
3259     }
3260   elem->type = SB_CHAR;
3261   elem->opr.ch = token->opr.c;
3262   return REG_NOERROR;
3263 }
3264
3265 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3266    such as [:<character_class>:], [.<collating_element>.], and
3267    [=<equivalent_class>=].  */
3268
3269 static reg_errcode_t
3270 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3271                       re_token_t *token)
3272 {
3273   unsigned char ch, delim = token->opr.c;
3274   int i = 0;
3275   if (re_string_eoi(regexp))
3276     return REG_EBRACK;
3277   for (;; ++i)
3278     {
3279       if (i >= BRACKET_NAME_BUF_SIZE)
3280         return REG_EBRACK;
3281       if (token->type == OP_OPEN_CHAR_CLASS)
3282         ch = re_string_fetch_byte_case (regexp);
3283       else
3284         ch = re_string_fetch_byte (regexp);
3285       if (re_string_eoi(regexp))
3286         return REG_EBRACK;
3287       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3288         break;
3289       elem->opr.name[i] = ch;
3290     }
3291   re_string_skip_bytes (regexp, 1);
3292   elem->opr.name[i] = '\0';
3293   switch (token->type)
3294     {
3295     case OP_OPEN_COLL_ELEM:
3296       elem->type = COLL_SYM;
3297       break;
3298     case OP_OPEN_EQUIV_CLASS:
3299       elem->type = EQUIV_CLASS;
3300       break;
3301     case OP_OPEN_CHAR_CLASS:
3302       elem->type = CHAR_CLASS;
3303       break;
3304     default:
3305       break;
3306     }
3307   return REG_NOERROR;
3308 }
3309
3310   /* Helper function for parse_bracket_exp.
3311      Build the equivalence class which is represented by NAME.
3312      The result are written to MBCSET and SBCSET.
3313      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3314      is a pointer argument sinse we may update it.  */
3315
3316 static reg_errcode_t
3317 build_equiv_class (re_bitset_ptr_t sbcset,
3318 #ifdef RE_ENABLE_I18N
3319                    re_charset_t *mbcset, int *equiv_class_alloc,
3320 #endif
3321                    const unsigned char *name)
3322 {
3323 #if defined _LIBC
3324   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3325   if (nrules != 0)
3326     {
3327       const int32_t *table, *indirect;
3328       const unsigned char *weights, *extra, *cp;
3329       unsigned char char_buf[2];
3330       int32_t idx1, idx2;
3331       unsigned int ch;
3332       size_t len;
3333       /* This #include defines a local function!  */
3334 # include <locale/weight.h>
3335       /* Calculate the index for equivalence class.  */
3336       cp = name;
3337       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3338       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3339                                                _NL_COLLATE_WEIGHTMB);
3340       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3341                                                    _NL_COLLATE_EXTRAMB);
3342       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3343                                                 _NL_COLLATE_INDIRECTMB);
3344       idx1 = findidx (&cp);
3345       if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3346         /* This isn't a valid character.  */
3347         return REG_ECOLLATE;
3348
3349       /* Build single byte matcing table for this equivalence class.  */
3350       char_buf[1] = (unsigned char) '\0';
3351       len = weights[idx1];
3352       for (ch = 0; ch < SBC_MAX; ++ch)
3353         {
3354           char_buf[0] = ch;
3355           cp = char_buf;
3356           idx2 = findidx (&cp);
3357 /*
3358           idx2 = table[ch];
3359 */
3360           if (idx2 == 0)
3361             /* This isn't a valid character.  */
3362             continue;
3363           if (len == weights[idx2])
3364             {
3365               int cnt = 0;
3366               while (cnt <= len &&
3367                      weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3368                 ++cnt;
3369
3370               if (cnt > len)
3371                 bitset_set (sbcset, ch);
3372             }
3373         }
3374       /* Check whether the array has enough space.  */
3375       if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3376         {
3377           /* Not enough, realloc it.  */
3378           /* +1 in case of mbcset->nequiv_classes is 0.  */
3379           int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3380           /* Use realloc since the array is NULL if *alloc == 0.  */
3381           int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3382                                                    int32_t,
3383                                                    new_equiv_class_alloc);
3384           if (BE (new_equiv_classes == NULL, 0))
3385             return REG_ESPACE;
3386           mbcset->equiv_classes = new_equiv_classes;
3387           *equiv_class_alloc = new_equiv_class_alloc;
3388         }
3389       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3390     }
3391   else
3392 #endif /* _LIBC */
3393     {
3394       if (BE (strlen ((const char *) name) != 1, 0))
3395         return REG_ECOLLATE;
3396       bitset_set (sbcset, *name);
3397     }
3398   return REG_NOERROR;
3399 }
3400
3401   /* Helper function for parse_bracket_exp.
3402      Build the character class which is represented by NAME.
3403      The result are written to MBCSET and SBCSET.
3404      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3405      is a pointer argument sinse we may update it.  */
3406
3407 static reg_errcode_t
3408 build_charclass (unsigned REG_TRANSLATE_TYPE trans, re_bitset_ptr_t sbcset,
3409 #ifdef RE_ENABLE_I18N
3410                  re_charset_t *mbcset, int *char_class_alloc,
3411 #endif
3412                  const unsigned char *class_name, reg_syntax_t syntax)
3413 {
3414   int i;
3415   const char *name = (const char *) class_name;
3416
3417   /* In case of REG_ICASE "upper" and "lower" match the both of
3418      upper and lower cases.  */
3419   if ((syntax & REG_IGNORE_CASE)
3420       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3421     name = "alpha";
3422
3423 #ifdef RE_ENABLE_I18N
3424   /* Check the space of the arrays.  */
3425   if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3426     {
3427       /* Not enough, realloc it.  */
3428       /* +1 in case of mbcset->nchar_classes is 0.  */
3429       int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3430       /* Use realloc since array is NULL if *alloc == 0.  */
3431       wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3432                                                new_char_class_alloc);
3433       if (BE (new_char_classes == NULL, 0))
3434         return REG_ESPACE;
3435       mbcset->char_classes = new_char_classes;
3436       *char_class_alloc = new_char_class_alloc;
3437     }
3438   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3439 #endif /* RE_ENABLE_I18N */
3440
3441 #define BUILD_CHARCLASS_LOOP(ctype_func)        \
3442     for (i = 0; i < SBC_MAX; ++i)               \
3443       {                                         \
3444         if (ctype_func (i))                     \
3445           {                                     \
3446             int ch = trans ? trans[i] : i;      \
3447             bitset_set (sbcset, ch);            \
3448           }                                     \
3449       }
3450
3451   if (strcmp (name, "alnum") == 0)
3452     BUILD_CHARCLASS_LOOP (isalnum)
3453   else if (strcmp (name, "cntrl") == 0)
3454     BUILD_CHARCLASS_LOOP (iscntrl)
3455   else if (strcmp (name, "lower") == 0)
3456     BUILD_CHARCLASS_LOOP (islower)
3457   else if (strcmp (name, "space") == 0)
3458     BUILD_CHARCLASS_LOOP (isspace)
3459   else if (strcmp (name, "alpha") == 0)
3460     BUILD_CHARCLASS_LOOP (isalpha)
3461   else if (strcmp (name, "digit") == 0)
3462     BUILD_CHARCLASS_LOOP (isdigit)
3463   else if (strcmp (name, "print") == 0)
3464     BUILD_CHARCLASS_LOOP (isprint)
3465   else if (strcmp (name, "upper") == 0)
3466     BUILD_CHARCLASS_LOOP (isupper)
3467   else if (strcmp (name, "blank") == 0)
3468     BUILD_CHARCLASS_LOOP (isblank)
3469   else if (strcmp (name, "graph") == 0)
3470     BUILD_CHARCLASS_LOOP (isgraph)
3471   else if (strcmp (name, "punct") == 0)
3472     BUILD_CHARCLASS_LOOP (ispunct)
3473   else if (strcmp (name, "xdigit") == 0)
3474     BUILD_CHARCLASS_LOOP (isxdigit)
3475   else
3476     return REG_ECTYPE;
3477
3478   return REG_NOERROR;
3479 }
3480
3481 static bin_tree_t *
3482 build_charclass_op (re_dfa_t *dfa, unsigned REG_TRANSLATE_TYPE trans,
3483                     const unsigned char *class_name,
3484                     const unsigned char *extra,
3485                     int non_match, reg_errcode_t *err)
3486 {
3487   re_bitset_ptr_t sbcset;
3488 #ifdef RE_ENABLE_I18N
3489   re_charset_t *mbcset;
3490   int alloc = 0;
3491 #endif /* not RE_ENABLE_I18N */
3492   reg_errcode_t ret;
3493   re_token_t br_token;
3494   bin_tree_t *tree;
3495
3496   sbcset = re_calloc (unsigned int, BITSET_UINTS);
3497 #ifdef RE_ENABLE_I18N
3498   mbcset = re_calloc (re_charset_t, 1);
3499 #endif /* RE_ENABLE_I18N */
3500
3501 #ifdef RE_ENABLE_I18N
3502   if (BE (sbcset == NULL || mbcset == NULL, 0))
3503 #else /* not RE_ENABLE_I18N */
3504   if (BE (sbcset == NULL, 0))
3505 #endif /* not RE_ENABLE_I18N */
3506     {
3507       *err = REG_ESPACE;
3508       return NULL;
3509     }
3510
3511   if (non_match)
3512     {
3513 #ifdef RE_ENABLE_I18N
3514       /*
3515       if (syntax & REG_HAT_LISTS_NOT_NEWLINE)
3516         bitset_set(cset->sbcset, '\0');
3517       */
3518       mbcset->non_match = 1;
3519 #endif /* not RE_ENABLE_I18N */
3520     }
3521
3522   /* We don't care the syntax in this case.  */
3523   ret = build_charclass (trans, sbcset,
3524 #ifdef RE_ENABLE_I18N
3525                          mbcset, &alloc,
3526 #endif /* RE_ENABLE_I18N */
3527                          class_name, 0);
3528
3529   if (BE (ret != REG_NOERROR, 0))
3530     {
3531       re_free (sbcset);
3532 #ifdef RE_ENABLE_I18N
3533       free_charset (mbcset);
3534 #endif /* RE_ENABLE_I18N */
3535       *err = ret;
3536       return NULL;
3537     }
3538   /* \w match '_' also.  */
3539   for (; *extra; extra++)
3540     bitset_set (sbcset, *extra);
3541
3542   /* If it is non-matching list.  */
3543   if (non_match)
3544     bitset_not (sbcset);
3545
3546 #ifdef RE_ENABLE_I18N
3547   /* Ensure only single byte characters are set.  */
3548   if (dfa->mb_cur_max > 1)
3549     bitset_mask (sbcset, dfa->sb_char);
3550 #endif
3551
3552   /* Build a tree for simple bracket.  */
3553   br_token.type = SIMPLE_BRACKET;
3554   br_token.opr.sbcset = sbcset;
3555   tree = create_token_tree (dfa, NULL, NULL, &br_token);
3556   if (BE (tree == NULL, 0))
3557     goto build_word_op_espace;
3558
3559 #ifdef RE_ENABLE_I18N
3560   if (dfa->mb_cur_max > 1)
3561     {
3562       bin_tree_t *mbc_tree;
3563       /* Build a tree for complex bracket.  */
3564       br_token.type = COMPLEX_BRACKET;
3565       br_token.opr.mbcset = mbcset;
3566       dfa->has_mb_node = 1;
3567       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3568       if (BE (mbc_tree == NULL, 0))
3569         goto build_word_op_espace;
3570       /* Then join them by ALT node.  */
3571       tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3572       if (BE (mbc_tree != NULL, 1))
3573         return tree;
3574     }
3575   else
3576     {
3577       free_charset (mbcset);
3578       return tree;
3579     }
3580 #else /* not RE_ENABLE_I18N */
3581   return tree;
3582 #endif /* not RE_ENABLE_I18N */
3583
3584  build_word_op_espace:
3585   re_free (sbcset);
3586 #ifdef RE_ENABLE_I18N
3587   free_charset (mbcset);
3588 #endif /* RE_ENABLE_I18N */
3589   *err = REG_ESPACE;
3590   return NULL;
3591 }
3592
3593 /* This is intended for the expressions like "a{1,3}".
3594    Fetch a number from `input', and return the number.
3595    Return -1, if the number field is empty like "{,1}".
3596    Return -2, If an error is occured.  */
3597
3598 static int
3599 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3600 {
3601   int num = -1;
3602   unsigned char c;
3603   while (1)
3604     {
3605       fetch_token (token, input, syntax);
3606       c = token->opr.c;
3607       if (BE (token->type == END_OF_RE, 0))
3608         return -2;
3609       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3610         break;
3611       num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3612              ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3613       num = (num > REG_DUP_MAX) ? -2 : num;
3614     }
3615   return num;
3616 }
3617 \f
3618 #ifdef RE_ENABLE_I18N
3619 static void
3620 free_charset (re_charset_t *cset)
3621 {
3622   re_free (cset->mbchars);
3623 # ifdef _LIBC
3624   re_free (cset->coll_syms);
3625   re_free (cset->equiv_classes);
3626   re_free (cset->range_starts);
3627   re_free (cset->range_ends);
3628 # endif
3629   re_free (cset->char_classes);
3630   re_free (cset);
3631 }
3632 #endif /* RE_ENABLE_I18N */
3633 \f
3634 /* Functions for binary tree operation.  */
3635
3636 /* Create a tree node.  */
3637
3638 static bin_tree_t *
3639 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3640              re_token_type_t type)
3641 {
3642   re_token_t t;
3643   t.type = type;
3644   return create_token_tree (dfa, left, right, &t);
3645 }
3646
3647 static bin_tree_t *
3648 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3649                    const re_token_t *token)
3650 {
3651   bin_tree_t *tree;
3652   if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3653     {
3654       bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3655
3656       if (storage == NULL)
3657         return NULL;
3658       storage->next = dfa->str_tree_storage;
3659       dfa->str_tree_storage = storage;
3660       dfa->str_tree_storage_idx = 0;
3661     }
3662   tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3663
3664   tree->parent = NULL;
3665   tree->left = left;
3666   tree->right = right;
3667   tree->token = *token;
3668   tree->token.duplicated = 0;
3669   tree->token.opt_subexp = 0;
3670   tree->first = NULL;
3671   tree->next = NULL;
3672   tree->node_idx = -1;
3673
3674   if (left != NULL)
3675     left->parent = tree;
3676   if (right != NULL)
3677     right->parent = tree;
3678   return tree;
3679 }
3680
3681 /* Mark the tree SRC as an optional subexpression.
3682    To be called from preorder or postorder.  */
3683
3684 static reg_errcode_t
3685 mark_opt_subexp (void *extra, bin_tree_t *node)
3686 {
3687   int idx = (int) (long) extra;
3688   if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3689     node->token.opt_subexp = 1;
3690
3691   return REG_NOERROR;
3692 }
3693
3694 /* Free the allocated memory inside NODE. */
3695
3696 static void
3697 free_token (re_token_t *node)
3698 {
3699 #ifdef RE_ENABLE_I18N
3700   if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3701     free_charset (node->opr.mbcset);
3702   else
3703 #endif /* RE_ENABLE_I18N */
3704     if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3705       re_free (node->opr.sbcset);
3706 }
3707
3708 /* Worker function for tree walking.  Free the allocated memory inside NODE
3709    and its children. */
3710
3711 static reg_errcode_t
3712 free_tree (void *extra, bin_tree_t *node)
3713 {
3714   free_token (&node->token);
3715   return REG_NOERROR;
3716 }
3717
3718
3719 /* Duplicate the node SRC, and return new node.  This is a preorder
3720    visit similar to the one implemented by the generic visitor, but
3721    we need more infrastructure to maintain two parallel trees --- so,
3722    it's easier to duplicate.  */
3723
3724 static bin_tree_t *
3725 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3726 {
3727   const bin_tree_t *node;
3728   bin_tree_t *dup_root;
3729   bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3730
3731   for (node = root; ; )
3732     {
3733       /* Create a new tree and link it back to the current parent.  */
3734       *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3735       if (*p_new == NULL)
3736         return NULL;
3737       (*p_new)->parent = dup_node;
3738       (*p_new)->token.duplicated = 1;
3739       dup_node = *p_new;
3740
3741       /* Go to the left node, or up and to the right.  */
3742       if (node->left)
3743         {
3744           node = node->left;
3745           p_new = &dup_node->left;
3746         }
3747       else
3748         {
3749           const bin_tree_t *prev = NULL;
3750           while (node->right == prev || node->right == NULL)
3751             {
3752               prev = node;
3753               node = node->parent;
3754               dup_node = dup_node->parent;
3755               if (!node)
3756                 return dup_root;
3757             }
3758           node = node->right;
3759           p_new = &dup_node->right;
3760         }
3761     }
3762 }