pintos-os.org Git - pspp/blob - lib/regexec.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    This program is free software; you can redistribute it and/or modify
   7    it under the terms of the GNU General Public License as published by
   8    the Free Software Foundation; either version 2, or (at your option)
   9    any later version.
  10
  11    This program is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14    GNU General Public License for more details.
  15
  16    You should have received a copy of the GNU General Public License along
  17    with this program; if not, write to the Free Software Foundation,
  18    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
  19
  20 static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
  21                                      int n) internal_function;
  22 static void match_ctx_clean (re_match_context_t *mctx) internal_function;
  23 static void match_ctx_free (re_match_context_t *cache) internal_function;
  24 static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, int node,
  25                                           int str_idx, int from, int to)
  26      internal_function;
  27 static int search_cur_bkref_entry (re_match_context_t *mctx, int str_idx)
  28      internal_function;
  29 static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, int node,
  30                                            int str_idx) internal_function;
  31 static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
  32                                                    int node, int str_idx)
  33      internal_function;
  34 static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
  35                            re_dfastate_t **limited_sts, int last_node,
  36                            int last_str_idx)
  37      internal_function;
  38 static reg_errcode_t re_search_internal (const regex_t *preg,
  39                                          const char *string, int length,
  40                                          int start, int range, int stop,
  41                                          size_t nmatch, regmatch_t pmatch[],
  42                                          int eflags) internal_function;
  43 static int re_search_2_stub (struct re_pattern_buffer *bufp,
  44                              const char *string1, int length1,
  45                              const char *string2, int length2,
  46                              int start, int range, struct re_registers *regs,
  47                              int stop, int ret_len) internal_function;
  48 static int re_search_stub (struct re_pattern_buffer *bufp,
  49                            const char *string, int length, int start,
  50                            int range, int stop, struct re_registers *regs,
  51                            int ret_len) internal_function;
  52 static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
  53                               int nregs, int regs_allocated) internal_function;
  54 static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx)
  55      internal_function;
  56 static int check_matching (re_match_context_t *mctx, int fl_longest_match,
  57                            int *p_match_first)
  58      internal_function;
  59 static int check_halt_state_context (const re_match_context_t *mctx,
  60                                      const re_dfastate_t *state, int idx)
  61      internal_function;
  62 static void update_regs (re_dfa_t *dfa, regmatch_t *pmatch,
  63                          regmatch_t *prev_idx_match, int cur_node,
  64                          int cur_idx, int nmatch) internal_function;
  65 static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
  66                                       int str_idx, int dest_node, int nregs,
  67                                       regmatch_t *regs,
  68                                       re_node_set *eps_via_nodes) internal_function;
  69 static reg_errcode_t set_regs (const regex_t *preg,
  70                                const re_match_context_t *mctx,
  71                                size_t nmatch, regmatch_t *pmatch,
  72                                int fl_backtrack) internal_function;
  73 static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs) internal_function;
  74
  75 #ifdef RE_ENABLE_I18N
  76 static int sift_states_iter_mb (const re_match_context_t *mctx,
  77                                 re_sift_context_t *sctx,
  78                                 int node_idx, int str_idx, int max_str_idx) internal_function;
  79 #endif /* RE_ENABLE_I18N */
  80 static reg_errcode_t sift_states_backward (re_match_context_t *mctx,
  81                                            re_sift_context_t *sctx) internal_function;
  82 static reg_errcode_t build_sifted_states (re_match_context_t *mctx,
  83                                           re_sift_context_t *sctx, int str_idx,
  84                                           re_node_set *cur_dest) internal_function;
  85 static reg_errcode_t update_cur_sifted_state (re_match_context_t *mctx,
  86                                               re_sift_context_t *sctx,
  87                                               int str_idx,
  88                                               re_node_set *dest_nodes) internal_function;
  89 static reg_errcode_t add_epsilon_src_nodes (re_dfa_t *dfa,
  90                                             re_node_set *dest_nodes,
  91                                             const re_node_set *candidates) internal_function;
  92 static int check_dst_limits (re_match_context_t *mctx, re_node_set *limits,
  93                              int dst_node, int dst_idx, int src_node,
  94                              int src_idx) internal_function;
  95 static int check_dst_limits_calc_pos_1 (re_match_context_t *mctx,
  96                                         int boundaries, int subexp_idx,
  97                                         int from_node, int bkref_idx) internal_function;
  98 static int check_dst_limits_calc_pos (re_match_context_t *mctx,
  99                                       int limit, int subexp_idx,
 100                                       int node, int str_idx,
 101                                       int bkref_idx) internal_function;
 102 static reg_errcode_t check_subexp_limits (re_dfa_t *dfa,
 103                                           re_node_set *dest_nodes,
 104                                           const re_node_set *candidates,
 105                                           re_node_set *limits,
 106                                           struct re_backref_cache_entry *bkref_ents,
 107                                           int str_idx) internal_function;
 108 static reg_errcode_t sift_states_bkref (re_match_context_t *mctx,
 109                                         re_sift_context_t *sctx,
 110                                         int str_idx, const re_node_set *candidates) internal_function;
 111 static reg_errcode_t merge_state_array (re_dfa_t *dfa, re_dfastate_t **dst,
 112                                         re_dfastate_t **src, int num) internal_function;
 113 static re_dfastate_t *find_recover_state (reg_errcode_t *err,
 114                                          re_match_context_t *mctx) internal_function;
 115 static re_dfastate_t *transit_state (reg_errcode_t *err,
 116                                      re_match_context_t *mctx,
 117                                      re_dfastate_t *state) internal_function;
 118 static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
 119                                             re_match_context_t *mctx,
 120                                             re_dfastate_t *next_state) internal_function;
 121 static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
 122                                                 re_node_set *cur_nodes,
 123                                                 int str_idx) internal_function;
 124 #if 0
 125 static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
 126                                         re_match_context_t *mctx,
 127                                         re_dfastate_t *pstate) internal_function;
 128 #endif
 129 #ifdef RE_ENABLE_I18N
 130 static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
 131                                        re_dfastate_t *pstate) internal_function;
 132 #endif /* RE_ENABLE_I18N */
 133 static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
 134                                           const re_node_set *nodes) internal_function;
 135 static reg_errcode_t get_subexp (re_match_context_t *mctx,
 136                                  int bkref_node, int bkref_str_idx) internal_function;
 137 static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
 138                                      const re_sub_match_top_t *sub_top,
 139                                      re_sub_match_last_t *sub_last,
 140                                      int bkref_node, int bkref_str) internal_function;
 141 static int find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
 142                              int subexp_idx, int type) internal_function;
 143 static reg_errcode_t check_arrival (re_match_context_t *mctx,
 144                                     state_array_t *path, int top_node,
 145                                     int top_str, int last_node, int last_str,
 146                                     int type) internal_function;
 147 static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
 148                                                    int str_idx,
 149                                                    re_node_set *cur_nodes,
 150                                                    re_node_set *next_nodes) internal_function;
 151 static reg_errcode_t check_arrival_expand_ecl (re_dfa_t *dfa,
 152                                                re_node_set *cur_nodes,
 153                                                int ex_subexp, int type) internal_function;
 154 static reg_errcode_t check_arrival_expand_ecl_sub (re_dfa_t *dfa,
 155                                                    re_node_set *dst_nodes,
 156                                                    int target, int ex_subexp,
 157                                                    int type) internal_function;
 158 static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
 159                                          re_node_set *cur_nodes, int cur_str,
 160                                          int subexp_num, int type) internal_function;
 161 static int build_trtable (re_dfa_t *dfa,
 162                           re_dfastate_t *state) internal_function;
 163 #ifdef RE_ENABLE_I18N
 164 static int check_node_accept_bytes (re_dfa_t *dfa, int node_idx,
 165                                     const re_string_t *input, int idx) internal_function;
 166 # ifdef _LIBC
 167 static unsigned int find_collation_sequence_value (const unsigned char *mbs,
 168                                                    size_t name_len) internal_function;
 169 # endif /* _LIBC */
 170 #endif /* RE_ENABLE_I18N */
 171 static int group_nodes_into_DFAstates (re_dfa_t *dfa,
 172                                        const re_dfastate_t *state,
 173                                        re_node_set *states_node,
 174                                        bitset *states_ch) internal_function;
 175 static int check_node_accept (const re_match_context_t *mctx,
 176                               const re_token_t *node, int idx) internal_function;
 177 static reg_errcode_t extend_buffers (re_match_context_t *mctx) internal_function;
 178 \f
 179 /* Entry point for POSIX code.  */
 180
 181 /* regexec searches for a given pattern, specified by PREG, in the
 182    string STRING.
 183
 184    If NMATCH is zero or REG_NOSUB was set in the cflags argument to
 185    `regcomp', we ignore PMATCH.  Otherwise, we assume PMATCH has at
 186    least NMATCH elements, and we set them to the offsets of the
 187    corresponding matched substrings.
 188
 189    EFLAGS specifies `execution flags' which affect matching: if
 190    REG_NOTBOL is set, then ^ does not match at the beginning of the
 191    string; if REG_NOTEOL is set, then $ does not match at the end.
 192
 193    We return 0 if we find a match and REG_NOMATCH if not.  */
 194
 195 int
 196 regexec (const regex_t *__restrict preg, const char *__restrict string,
 197          size_t nmatch, regmatch_t pmatch[], int eflags)
 198 {
 199   reg_errcode_t err;
 200   int start, length;
 201 #ifdef _LIBC
 202   re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
 203 #endif
 204
 205   if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
 206     return REG_BADPAT;
 207
 208   if (eflags & REG_STARTEND)
 209     {
 210       start = pmatch[0].rm_so;
 211       length = pmatch[0].rm_eo;
 212     }
 213   else
 214     {
 215       start = 0;
 216       length = strlen (string);
 217     }
 218
 219   __libc_lock_lock (dfa->lock);
 220   if (preg->re_no_sub)
 221     err = re_search_internal (preg, string, length, start, length - start,
 222                               length, 0, NULL, eflags);
 223   else
 224     err = re_search_internal (preg, string, length, start, length - start,
 225                               length, nmatch, pmatch, eflags);
 226   __libc_lock_unlock (dfa->lock);
 227   return err != REG_NOERROR;
 228 }
 229
 230 #ifdef _LIBC
 231 # include <shlib-compat.h>
 232 versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
 233
 234 # if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
 235 __typeof__ (__regexec) __compat_regexec;
 236
 237 int
 238 attribute_compat_text_section
 239 __compat_regexec (const regex_t *__restrict preg,
 240                   const char *__restrict string, size_t nmatch,
 241                   regmatch_t pmatch[], int eflags)
 242 {
 243   return regexec (preg, string, nmatch, pmatch,
 244                   eflags & (REG_NOTBOL | REG_NOTEOL));
 245 }
 246 compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
 247 # endif
 248 #endif
 249
 250 /* Entry points for GNU code.  */
 251
 252 /* re_match, re_search, re_match_2, re_search_2
 253
 254    The former two functions operate on STRING with length LENGTH,
 255    while the later two operate on concatenation of STRING1 and STRING2
 256    with lengths LENGTH1 and LENGTH2, respectively.
 257
 258    re_match() matches the compiled pattern in BUFP against the string,
 259    starting at index START.
 260
 261    re_search() first tries matching at index START, then it tries to match
 262    starting from index START + 1, and so on.  The last start position tried
 263    is START + RANGE.  (Thus RANGE = 0 forces re_search to operate the same
 264    way as re_match().)
 265
 266    The parameter STOP of re_{match,search}_2 specifies that no match exceeding
 267    the first STOP characters of the concatenation of the strings should be
 268    concerned.
 269
 270    If REGS is not NULL, and BUFP->re_no_sub is not set, the offsets of the match
 271    and all groups is stroed in REGS.  (For the "_2" variants, the offsets are
 272    computed relative to the concatenation, not relative to the individual
 273    strings.)
 274
 275    On success, re_match* functions return the length of the match, re_search*
 276    return the position of the start of the match.  Return value -1 means no
 277    match was found and -2 indicates an internal error.  */
 278
 279 int
 280 re_match (struct re_pattern_buffer *bufp, const char *string,
 281           int length, int start, struct re_registers *regs)
 282 {
 283   return re_search_stub (bufp, string, length, start, 0, length, regs, 1);
 284 }
 285 #ifdef _LIBC
 286 weak_alias (__re_match, re_match)
 287 #endif
 288
 289 int
 290 re_search (struct re_pattern_buffer *bufp, const char *string,
 291            int length, int start, int range, struct re_registers *regs)
 292 {
 293   return re_search_stub (bufp, string, length, start, range, length, regs, 0);
 294 }
 295 #ifdef _LIBC
 296 weak_alias (__re_search, re_search)
 297 #endif
 298
 299 int
 300 re_match_2 (struct re_pattern_buffer *bufp,
 301             const char *string1, int length1,
 302             const char *string2, int length2,
 303             int start, struct re_registers *regs, int stop)
 304 {
 305   return re_search_2_stub (bufp, string1, length1, string2, length2,
 306                            start, 0, regs, stop, 1);
 307 }
 308 #ifdef _LIBC
 309 weak_alias (__re_match_2, re_match_2)
 310 #endif
 311
 312 int
 313 re_search_2 (struct re_pattern_buffer *bufp,
 314              const char *string1, int length1,
 315              const char *string2, int length2,
 316              int start, int range, struct re_registers *regs, int stop)
 317 {
 318   return re_search_2_stub (bufp, string1, length1, string2, length2,
 319                            start, range, regs, stop, 0);
 320 }
 321 #ifdef _LIBC
 322 weak_alias (__re_search_2, re_search_2)
 323 #endif
 324
 325 static int
 326 internal_function
 327 re_search_2_stub (struct re_pattern_buffer *bufp,
 328                   const char *string1, int length1,
 329                   const char *string2, int length2,
 330                   int start, int range, struct re_registers *regs, int stop,
 331                   int ret_len)
 332 {
 333   const char *str;
 334   int rval;
 335   int len = length1 + length2;
 336   int free_str = 0;
 337
 338   if (BE (length1 < 0 || length2 < 0 || stop < 0, 0))
 339     return -2;
 340
 341   /* Concatenate the strings.  */
 342   if (length2 > 0)
 343     if (length1 > 0)
 344       {
 345         char *s = re_malloc (char, len);
 346
 347         if (BE (s == NULL, 0))
 348           return -2;
 349         memcpy (s, string1, length1);
 350         memcpy (s + length1, string2, length2);
 351         str = s;
 352         free_str = 1;
 353       }
 354     else
 355       str = string2;
 356   else
 357     str = string1;
 358
 359   rval = re_search_stub (bufp, str, len, start, range, stop, regs,
 360                          ret_len);
 361   if (free_str)
 362     re_free ((char *) str);
 363   return rval;
 364 }
 365
 366 /* The parameters have the same meaning as those of re_search.
 367    Additional parameters:
 368    If RET_LEN is nonzero the length of the match is returned (re_match style);
 369    otherwise the position of the match is returned.  */
 370
 371 static int
 372 internal_function
 373 re_search_stub (struct re_pattern_buffer *bufp,
 374                 const char *string, int length,
 375                 int start, int range, int stop, struct re_registers *regs,
 376                 int ret_len)
 377 {
 378   reg_errcode_t result;
 379   regmatch_t *pmatch;
 380   int nregs, rval;
 381   int eflags = 0;
 382 #ifdef _LIBC
 383   re_dfa_t *dfa = (re_dfa_t *) bufp->re_buffer;
 384 #endif
 385
 386   /* Check for out-of-range.  */
 387   if (BE (start < 0 || start > length, 0))
 388     return -1;
 389   if (BE (start + range > length, 0))
 390     range = length - start;
 391   else if (BE (start + range < 0, 0))
 392     range = -start;
 393
 394   __libc_lock_lock (dfa->lock);
 395
 396   eflags |= (bufp->re_not_bol) ? REG_NOTBOL : 0;
 397   eflags |= (bufp->re_not_eol) ? REG_NOTEOL : 0;
 398
 399   /* Compile fastmap if we haven't yet.  */
 400   if (range > 0 && bufp->re_fastmap != NULL && !bufp->re_fastmap_accurate)
 401     re_compile_fastmap (bufp);
 402
 403   if (BE (bufp->re_no_sub, 0))
 404     regs = NULL;
 405
 406   /* We need at least 1 register.  */
 407   if (regs == NULL)
 408     nregs = 1;
 409   else if (BE (bufp->re_regs_allocated == REG_FIXED
 410                && regs->rm_num_regs < bufp->re_nsub + 1, 0))
 411     {
 412       nregs = regs->rm_num_regs;
 413       if (BE (nregs < 1, 0))
 414         {
 415           /* Nothing can be copied to regs.  */
 416           regs = NULL;
 417           nregs = 1;
 418         }
 419     }
 420   else
 421     nregs = bufp->re_nsub + 1;
 422   pmatch = re_malloc (regmatch_t, nregs);
 423   if (BE (pmatch == NULL, 0))
 424     {
 425       rval = -2;
 426       goto out;
 427     }
 428
 429   result = re_search_internal (bufp, string, length, start, range, stop,
 430                                nregs, pmatch, eflags);
 431
 432   rval = 0;
 433
 434   /* I hope we needn't fill ther regs with -1's when no match was found.  */
 435   if (result != REG_NOERROR)
 436     rval = -1;
 437   else if (regs != NULL)
 438     {
 439       /* If caller wants register contents data back, copy them.  */
 440       bufp->re_regs_allocated = re_copy_regs (regs, pmatch, nregs,
 441                                               bufp->re_regs_allocated);
 442       if (BE (bufp->re_regs_allocated == REG_UNALLOCATED, 0))
 443         rval = -2;
 444     }
 445
 446   if (BE (rval == 0, 1))
 447     {
 448       if (ret_len)
 449         {
 450           assert (pmatch[0].rm_so == start);
 451           rval = pmatch[0].rm_eo - start;
 452         }
 453       else
 454         rval = pmatch[0].rm_so;
 455     }
 456   re_free (pmatch);
 457  out:
 458   __libc_lock_unlock (dfa->lock);
 459   return rval;
 460 }
 461
 462 static unsigned
 463 internal_function
 464 re_copy_regs (struct re_registers *regs, regmatch_t *pmatch, int nregs,
 465               int regs_allocated)
 466 {
 467   int rval = REG_REALLOCATE;
 468   int i;
 469   int need_regs = nregs + 1;
 470   /* We need one extra element beyond `rm_num_regs' for the `-1' marker GNU code
 471      uses.  */
 472
 473   /* Have the register data arrays been allocated?  */
 474   if (regs_allocated == REG_UNALLOCATED)
 475     { /* No.  So allocate them with malloc.  */
 476       regs->rm_start = re_malloc (regoff_t, need_regs);
 477       regs->rm_end = re_malloc (regoff_t, need_regs);
 478       if (BE (regs->rm_start == NULL, 0) || BE (regs->rm_end == NULL, 0))
 479         return REG_UNALLOCATED;
 480       regs->rm_num_regs = need_regs;
 481     }
 482   else if (regs_allocated == REG_REALLOCATE)
 483     { /* Yes.  If we need more elements than were already
 484          allocated, reallocate them.  If we need fewer, just
 485          leave it alone.  */
 486       if (BE (need_regs > regs->rm_num_regs, 0))
 487         {
 488           regoff_t *new_start =
 489             re_realloc (regs->rm_start, regoff_t, need_regs);
 490           regoff_t *new_end = re_realloc (regs->rm_end, regoff_t, need_regs);
 491           if (BE (new_start == NULL, 0) || BE (new_end == NULL, 0))
 492             return REG_UNALLOCATED;
 493           regs->rm_start = new_start;
 494           regs->rm_end = new_end;
 495           regs->rm_num_regs = need_regs;
 496         }
 497     }
 498   else
 499     {
 500       assert (regs_allocated == REG_FIXED);
 501       /* This function may not be called with REG_FIXED and nregs too big.  */
 502       assert (regs->rm_num_regs >= nregs);
 503       rval = REG_FIXED;
 504     }
 505
 506   /* Copy the regs.  */
 507   for (i = 0; i < nregs; ++i)
 508     {
 509       regs->rm_start[i] = pmatch[i].rm_so;
 510       regs->rm_end[i] = pmatch[i].rm_eo;
 511     }
 512   for ( ; i < regs->rm_num_regs; ++i)
 513     regs->rm_start[i] = regs->rm_end[i] = -1;
 514
 515   return rval;
 516 }
 517
 518 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
 519    ENDS.  Subsequent matches using PATTERN_BUFFER and REGS will use
 520    this memory for recording register information.  STARTS and ENDS
 521    must be allocated using the malloc library routine, and must each
 522    be at least NUM_REGS * sizeof (regoff_t) bytes long.
 523
 524    If NUM_REGS == 0, then subsequent matches should allocate their own
 525    register data.
 526
 527    Unless this function is called, the first search or match using
 528    PATTERN_BUFFER will allocate its own register data, without
 529    freeing the old data.  */
 530
 531 void
 532 re_set_registers (struct re_pattern_buffer *bufp, struct re_registers *regs,
 533                   unsigned int num_regs, regoff_t *starts, regoff_t *ends)
 534 {
 535   if (num_regs)
 536     {
 537       bufp->re_regs_allocated = REG_REALLOCATE;
 538       regs->rm_num_regs = num_regs;
 539       regs->rm_start = starts;
 540       regs->rm_end = ends;
 541     }
 542   else
 543     {
 544       bufp->re_regs_allocated = REG_UNALLOCATED;
 545       regs->rm_num_regs = 0;
 546       regs->rm_start = regs->rm_end = NULL;
 547     }
 548 }
 549 #ifdef _LIBC
 550 weak_alias (__re_set_registers, re_set_registers)
 551 #endif
 552 \f
 553 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 554    them unless specifically requested.  */
 555
 556 #if defined _REGEX_RE_COMP || defined _LIBC
 557 int
 558 # ifdef _LIBC
 559 weak_function
 560 # endif
 561 re_exec (s)
 562      const char *s;
 563 {
 564   return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
 565 }
 566 #endif /* _REGEX_RE_COMP */
 567 \f
 568 /* Internal entry point.  */
 569
 570 /* Searches for a compiled pattern PREG in the string STRING, whose
 571    length is LENGTH.  NMATCH, PMATCH, and EFLAGS have the same
 572    mingings with regexec.  START, and RANGE have the same meanings
 573    with re_search.
 574    Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
 575    otherwise return the error code.
 576    Note: We assume front end functions already check ranges.
 577    (START + RANGE >= 0 && START + RANGE <= LENGTH)  */
 578
 579 static reg_errcode_t
 580 internal_function
 581 re_search_internal (const regex_t *preg,
 582                     const char *string, int length,
 583                     int start, int range, int stop,
 584                     size_t nmatch, regmatch_t pmatch[],
 585                     int eflags)
 586 {
 587   reg_errcode_t err;
 588   re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
 589   int left_lim, right_lim, incr;
 590   int fl_longest_match, match_first, match_kind, match_last = -1;
 591   int extra_nmatch;
 592   int sb, ch;
 593 #if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
 594   re_match_context_t mctx = { .dfa = dfa };
 595 #else
 596   re_match_context_t mctx;
 597 #endif
 598   char *fastmap = (preg->re_fastmap != NULL && preg->re_fastmap_accurate
 599                    && range && !preg->re_can_be_null) ? preg->re_fastmap : NULL;
 600   unsigned REG_TRANSLATE_TYPE t =
 601     (unsigned REG_TRANSLATE_TYPE) preg->re_translate;
 602
 603 #if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
 604   memset (&mctx, '\0', sizeof (re_match_context_t));
 605   mctx.dfa = dfa;
 606 #endif
 607
 608   extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
 609   nmatch -= extra_nmatch;
 610
 611   /* Check if the DFA haven't been compiled.  */
 612   if (BE (preg->re_used == 0 || dfa->init_state == NULL
 613           || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
 614           || dfa->init_state_begbuf == NULL, 0))
 615     return REG_NOMATCH;
 616
 617 #ifdef DEBUG
 618   /* We assume front-end functions already check them.  */
 619   assert (start + range >= 0 && start + range <= length);
 620 #endif
 621
 622   /* If initial states with non-begbuf contexts have no elements,
 623      the regex must be anchored.  If preg->re_newline_anchor is set,
 624      we'll never use init_state_nl, so do not check it.  */
 625   if (dfa->init_state->nodes.nelem == 0
 626       && dfa->init_state_word->nodes.nelem == 0
 627       && (dfa->init_state_nl->nodes.nelem == 0
 628           || !preg->re_newline_anchor))
 629     {
 630       if (start != 0 && start + range != 0)
 631         return REG_NOMATCH;
 632       start = range = 0;
 633     }
 634
 635   /* We must check the longest matching, if nmatch > 0.  */
 636   fl_longest_match = (nmatch != 0 || dfa->nbackref);
 637
 638   err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
 639                             preg->re_translate,
 640                             preg->re_syntax & REG_IGNORE_CASE, dfa);
 641   if (BE (err != REG_NOERROR, 0))
 642     goto free_return;
 643   mctx.input.stop = stop;
 644   mctx.input.raw_stop = stop;
 645   mctx.input.newline_anchor = preg->re_newline_anchor;
 646
 647   err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
 648   if (BE (err != REG_NOERROR, 0))
 649     goto free_return;
 650
 651   /* We will log all the DFA states through which the dfa pass,
 652      if nmatch > 1, or this dfa has "multibyte node", which is a
 653      back-reference or a node which can accept multibyte character or
 654      multi character collating element.  */
 655   if (nmatch > 1 || dfa->has_mb_node)
 656     {
 657       mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
 658       if (BE (mctx.state_log == NULL, 0))
 659         {
 660           err = REG_ESPACE;
 661           goto free_return;
 662         }
 663     }
 664   else
 665     mctx.state_log = NULL;
 666
 667   match_first = start;
 668   mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
 669                            : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
 670
 671   /* Check incrementally whether of not the input string match.  */
 672   incr = (range < 0) ? -1 : 1;
 673   left_lim = (range < 0) ? start + range : start;
 674   right_lim = (range < 0) ? start : start + range;
 675   sb = dfa->mb_cur_max == 1;
 676   match_kind =
 677     (fastmap
 678      ? ((sb || !(preg->re_syntax & REG_IGNORE_CASE || t) ? 4 : 0)
 679         | (range >= 0 ? 2 : 0)
 680         | (t != NULL ? 1 : 0))
 681      : 8);
 682
 683   for (;; match_first += incr)
 684     {
 685       err = REG_NOMATCH;
 686       if (match_first < left_lim || right_lim < match_first)
 687         goto free_return;
 688
 689       /* Advance as rapidly as possible through the string, until we
 690          find a plausible place to start matching.  This may be done
 691          with varying efficiency, so there are various possibilities:
 692          only the most common of them are specialized, in order to
 693          save on code size.  We use a switch statement for speed.  */
 694       switch (match_kind)
 695         {
 696         case 8:
 697           /* No fastmap.  */
 698           break;
 699
 700         case 7:
 701           /* Fastmap with single-byte translation, match forward.  */
 702           while (BE (match_first < right_lim, 1)
 703                  && !fastmap[t[(unsigned char) string[match_first]]])
 704             ++match_first;
 705           goto forward_match_found_start_or_reached_end;
 706
 707         case 6:
 708           /* Fastmap without translation, match forward.  */
 709           while (BE (match_first < right_lim, 1)
 710                  && !fastmap[(unsigned char) string[match_first]])
 711             ++match_first;
 712
 713         forward_match_found_start_or_reached_end:
 714           if (BE (match_first == right_lim, 0))
 715             {
 716               ch = match_first >= length
 717                        ? 0 : (unsigned char) string[match_first];
 718               if (!fastmap[t ? t[ch] : ch])
 719                 goto free_return;
 720             }
 721           break;
 722
 723         case 4:
 724         case 5:
 725           /* Fastmap without multi-byte translation, match backwards.  */
 726           while (match_first >= left_lim)
 727             {
 728               ch = match_first >= length
 729                        ? 0 : (unsigned char) string[match_first];
 730               if (fastmap[t ? t[ch] : ch])
 731                 break;
 732               --match_first;
 733             }
 734           if (match_first < left_lim)
 735             goto free_return;
 736           break;
 737
 738         default:
 739           /* In this case, we can't determine easily the current byte,
 740              since it might be a component byte of a multibyte
 741              character.  Then we use the constructed buffer instead.  */
 742           for (;;)
 743             {
 744               /* If MATCH_FIRST is out of the valid range, reconstruct the
 745                  buffers.  */
 746               unsigned int offset = match_first - mctx.input.raw_mbs_idx;
 747               if (BE (offset >= (unsigned int) mctx.input.valid_raw_len, 0))
 748                 {
 749                   err = re_string_reconstruct (&mctx.input, match_first,
 750                                                eflags);
 751                   if (BE (err != REG_NOERROR, 0))
 752                     goto free_return;
 753
 754                   offset = match_first - mctx.input.raw_mbs_idx;
 755                 }
 756               /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
 757                  Note that MATCH_FIRST must not be smaller than 0.  */
 758               ch = (match_first >= length
 759                     ? 0 : re_string_byte_at (&mctx.input, offset));
 760               if (fastmap[ch])
 761                 break;
 762               match_first += incr;
 763               if (match_first < left_lim || match_first > right_lim)
 764                 {
 765                   err = REG_NOMATCH;
 766                   goto free_return;
 767                 }
 768             }
 769           break;
 770         }
 771
 772       /* Reconstruct the buffers so that the matcher can assume that
 773          the matching starts from the beginning of the buffer.  */
 774       err = re_string_reconstruct (&mctx.input, match_first, eflags);
 775       if (BE (err != REG_NOERROR, 0))
 776         goto free_return;
 777
 778 #ifdef RE_ENABLE_I18N
 779      /* Don't consider this char as a possible match start if it part,
 780         yet isn't the head, of a multibyte character.  */
 781       if (!sb && !re_string_first_byte (&mctx.input, 0))
 782         continue;
 783 #endif
 784
 785       /* It seems to be appropriate one, then use the matcher.  */
 786       /* We assume that the matching starts from 0.  */
 787       mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
 788       match_last = check_matching (&mctx, fl_longest_match,
 789                                    range >= 0 ? &match_first : NULL);
 790       if (match_last != -1)
 791         {
 792           if (BE (match_last == -2, 0))
 793             {
 794               err = REG_ESPACE;
 795               goto free_return;
 796             }
 797           else
 798             {
 799               mctx.match_last = match_last;
 800               if ((!preg->re_no_sub && nmatch > 1) || dfa->nbackref)
 801                 {
 802                   re_dfastate_t *pstate = mctx.state_log[match_last];
 803                   mctx.last_node = check_halt_state_context (&mctx, pstate,
 804                                                              match_last);
 805                 }
 806               if ((!preg->re_no_sub && nmatch > 1 && dfa->has_plural_match)
 807                   || dfa->nbackref)
 808                 {
 809                   err = prune_impossible_nodes (&mctx);
 810                   if (err == REG_NOERROR)
 811                     break;
 812                   if (BE (err != REG_NOMATCH, 0))
 813                     goto free_return;
 814                   match_last = -1;
 815                 }
 816               else
 817                 break; /* We found a match.  */
 818             }
 819         }
 820
 821       match_ctx_clean (&mctx);
 822     }
 823
 824 #ifdef DEBUG
 825   assert (match_last != -1);
 826   assert (err == REG_NOERROR);
 827 #endif
 828
 829   /* Set pmatch[] if we need.  */
 830   if (nmatch > 0)
 831     {
 832       int reg_idx;
 833
 834       /* Initialize registers.  */
 835       for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
 836         pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
 837
 838       /* Set the points where matching start/end.  */
 839       pmatch[0].rm_so = 0;
 840       pmatch[0].rm_eo = mctx.match_last;
 841
 842       if (!preg->re_no_sub && nmatch > 1)
 843         {
 844           err = set_regs (preg, &mctx, nmatch, pmatch,
 845                           dfa->has_plural_match && dfa->nbackref > 0);
 846           if (BE (err != REG_NOERROR, 0))
 847             goto free_return;
 848         }
 849
 850       /* At last, add the offset to the each registers, since we slided
 851          the buffers so that we could assume that the matching starts
 852          from 0.  */
 853       for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
 854         if (pmatch[reg_idx].rm_so != -1)
 855           {
 856 #ifdef RE_ENABLE_I18N
 857             if (BE (mctx.input.offsets_needed != 0, 0))
 858               {
 859                 if (pmatch[reg_idx].rm_so == mctx.input.valid_len)
 860                   pmatch[reg_idx].rm_so += mctx.input.valid_raw_len - mctx.input.valid_len;
 861                 else
 862                   pmatch[reg_idx].rm_so = mctx.input.offsets[pmatch[reg_idx].rm_so];
 863                 if (pmatch[reg_idx].rm_eo == mctx.input.valid_len)
 864                   pmatch[reg_idx].rm_eo += mctx.input.valid_raw_len - mctx.input.valid_len;
 865                 else
 866                   pmatch[reg_idx].rm_eo = mctx.input.offsets[pmatch[reg_idx].rm_eo];
 867               }
 868 #else
 869             assert (mctx.input.offsets_needed == 0);
 870 #endif
 871             pmatch[reg_idx].rm_so += match_first;
 872             pmatch[reg_idx].rm_eo += match_first;
 873           }
 874       for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
 875         {
 876           pmatch[nmatch + reg_idx].rm_so = -1;
 877           pmatch[nmatch + reg_idx].rm_eo = -1;
 878         }
 879
 880       if (dfa->subexp_map)
 881         for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
 882           if (dfa->subexp_map[reg_idx] != reg_idx)
 883             {
 884               pmatch[reg_idx + 1].rm_so
 885                 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
 886               pmatch[reg_idx + 1].rm_eo
 887                 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
 888             }
 889     }
 890
 891  free_return:
 892   re_free (mctx.state_log);
 893   if (dfa->nbackref)
 894     match_ctx_free (&mctx);
 895   re_string_destruct (&mctx.input);
 896   return err;
 897 }
 898
 899 static reg_errcode_t
 900 internal_function
 901 prune_impossible_nodes (re_match_context_t *mctx)
 902 {
 903   re_dfa_t *const dfa = mctx->dfa;
 904   int halt_node, match_last;
 905   reg_errcode_t ret;
 906   re_dfastate_t **sifted_states;
 907   re_dfastate_t **lim_states = NULL;
 908   re_sift_context_t sctx;
 909 #ifdef DEBUG
 910   assert (mctx->state_log != NULL);
 911 #endif
 912   match_last = mctx->match_last;
 913   halt_node = mctx->last_node;
 914   sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
 915   if (BE (sifted_states == NULL, 0))
 916     {
 917       ret = REG_ESPACE;
 918       goto free_return;
 919     }
 920   if (dfa->nbackref)
 921     {
 922       lim_states = re_malloc (re_dfastate_t *, match_last + 1);
 923       if (BE (lim_states == NULL, 0))
 924         {
 925           ret = REG_ESPACE;
 926           goto free_return;
 927         }
 928       while (1)
 929         {
 930           memset (lim_states, '\0',
 931                   sizeof (re_dfastate_t *) * (match_last + 1));
 932           sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
 933                          match_last);
 934           ret = sift_states_backward (mctx, &sctx);
 935           re_node_set_free (&sctx.limits);
 936           if (BE (ret != REG_NOERROR, 0))
 937               goto free_return;
 938           if (sifted_states[0] != NULL || lim_states[0] != NULL)
 939             break;
 940           do
 941             {
 942               --match_last;
 943               if (match_last < 0)
 944                 {
 945                   ret = REG_NOMATCH;
 946                   goto free_return;
 947                 }
 948             } while (mctx->state_log[match_last] == NULL
 949                      || !mctx->state_log[match_last]->halt);
 950           halt_node = check_halt_state_context (mctx,
 951                                                 mctx->state_log[match_last],
 952                                                 match_last);
 953         }
 954       ret = merge_state_array (dfa, sifted_states, lim_states,
 955                                match_last + 1);
 956       re_free (lim_states);
 957       lim_states = NULL;
 958       if (BE (ret != REG_NOERROR, 0))
 959         goto free_return;
 960     }
 961   else
 962     {
 963       sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
 964       ret = sift_states_backward (mctx, &sctx);
 965       re_node_set_free (&sctx.limits);
 966       if (BE (ret != REG_NOERROR, 0))
 967         goto free_return;
 968     }
 969   re_free (mctx->state_log);
 970   mctx->state_log = sifted_states;
 971   sifted_states = NULL;
 972   mctx->last_node = halt_node;
 973   mctx->match_last = match_last;
 974   ret = REG_NOERROR;
 975  free_return:
 976   re_free (sifted_states);
 977   re_free (lim_states);
 978   return ret;
 979 }
 980
 981 /* Acquire an initial state and return it.
 982    We must select appropriate initial state depending on the context,
 983    since initial states may have constraints like "\<", "^", etc..  */
 984
 985 static inline re_dfastate_t *
 986 __attribute ((always_inline)) internal_function
 987 acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx,
 988                             int idx)
 989 {
 990   re_dfa_t *const dfa = mctx->dfa;
 991   if (dfa->init_state->has_constraint)
 992     {
 993       unsigned int context;
 994       context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
 995       if (IS_WORD_CONTEXT (context))
 996         return dfa->init_state_word;
 997       else if (IS_ORDINARY_CONTEXT (context))
 998         return dfa->init_state;
 999       else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
1000         return dfa->init_state_begbuf;
1001       else if (IS_NEWLINE_CONTEXT (context))
1002         return dfa->init_state_nl;
1003       else if (IS_BEGBUF_CONTEXT (context))
1004         {
1005           /* It is relatively rare case, then calculate on demand.  */
1006           return re_acquire_state_context (err, dfa,
1007                                            dfa->init_state->entrance_nodes,
1008                                            context);
1009         }
1010       else
1011         /* Must not happen?  */
1012         return dfa->init_state;
1013     }
1014   else
1015     return dfa->init_state;
1016 }
1017
1018 /* Check whether the regular expression match input string INPUT or not,
1019    and return the index where the matching end, return -1 if not match,
1020    or return -2 in case of an error.
1021    FL_LONGEST_MATCH means we want the POSIX longest matching.
1022    If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1023    next place where we may want to try matching.
1024    Note that the matcher assume that the maching starts from the current
1025    index of the buffer.  */
1026
1027 static int
1028 internal_function
1029 check_matching (re_match_context_t *mctx, int fl_longest_match,
1030                 int *p_match_first)
1031 {
1032   re_dfa_t *const dfa = mctx->dfa;
1033   reg_errcode_t err;
1034   int match = 0;
1035   int match_last = -1;
1036   int cur_str_idx = re_string_cur_idx (&mctx->input);
1037   re_dfastate_t *cur_state;
1038   int at_init_state = p_match_first != NULL;
1039   int next_start_idx = cur_str_idx;
1040
1041   err = REG_NOERROR;
1042   cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1043   /* An initial state must not be NULL (invalid).  */
1044   if (BE (cur_state == NULL, 0))
1045     {
1046       assert (err == REG_ESPACE);
1047       return -2;
1048     }
1049
1050   if (mctx->state_log != NULL)
1051     {
1052       mctx->state_log[cur_str_idx] = cur_state;
1053
1054       /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1055          later.  E.g. Processing back references.  */
1056       if (BE (dfa->nbackref, 0))
1057         {
1058           at_init_state = 0;
1059           err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1060           if (BE (err != REG_NOERROR, 0))
1061             return err;
1062
1063           if (cur_state->has_backref)
1064             {
1065               err = transit_state_bkref (mctx, &cur_state->nodes);
1066               if (BE (err != REG_NOERROR, 0))
1067                 return err;
1068             }
1069         }
1070     }
1071
1072   /* If the RE accepts NULL string.  */
1073   if (BE (cur_state->halt, 0))
1074     {
1075       if (!cur_state->has_constraint
1076           || check_halt_state_context (mctx, cur_state, cur_str_idx))
1077         {
1078           if (!fl_longest_match)
1079             return cur_str_idx;
1080           else
1081             {
1082               match_last = cur_str_idx;
1083               match = 1;
1084             }
1085         }
1086     }
1087
1088   while (!re_string_eoi (&mctx->input))
1089     {
1090       re_dfastate_t *old_state = cur_state;
1091       int next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1092
1093       if (BE (next_char_idx >= mctx->input.bufs_len, 0)
1094           || (BE (next_char_idx >= mctx->input.valid_len, 0)
1095               && mctx->input.valid_len < mctx->input.len))
1096         {
1097           err = extend_buffers (mctx);
1098           if (BE (err != REG_NOERROR, 0))
1099             {
1100               assert (err == REG_ESPACE);
1101               return -2;
1102             }
1103         }
1104
1105       cur_state = transit_state (&err, mctx, cur_state);
1106       if (mctx->state_log != NULL)
1107         cur_state = merge_state_with_log (&err, mctx, cur_state);
1108
1109       if (cur_state == NULL)
1110         {
1111           /* Reached the invalid state or an error.  Try to recover a valid
1112              state using the state log, if available and if we have not
1113              already found a valid (even if not the longest) match.  */
1114           if (BE (err != REG_NOERROR, 0))
1115             return -2;
1116
1117           if (mctx->state_log == NULL
1118               || (match && !fl_longest_match)
1119               || (cur_state = find_recover_state (&err, mctx)) == NULL)
1120             break;
1121         }
1122
1123       if (BE (at_init_state, 0))
1124         {
1125           if (old_state == cur_state)
1126             next_start_idx = next_char_idx;
1127           else
1128             at_init_state = 0;
1129         }
1130
1131       if (cur_state->halt)
1132         {
1133           /* Reached a halt state.
1134              Check the halt state can satisfy the current context.  */
1135           if (!cur_state->has_constraint
1136               || check_halt_state_context (mctx, cur_state,
1137                                            re_string_cur_idx (&mctx->input)))
1138             {
1139               /* We found an appropriate halt state.  */
1140               match_last = re_string_cur_idx (&mctx->input);
1141               match = 1;
1142
1143               /* We found a match, do not modify match_first below.  */
1144               p_match_first = NULL;
1145               if (!fl_longest_match)
1146                 break;
1147             }
1148         }
1149     }
1150
1151   if (p_match_first)
1152     *p_match_first += next_start_idx;
1153
1154   return match_last;
1155 }
1156
1157 /* Check NODE match the current context.  */
1158
1159 static int
1160 internal_function
1161 check_halt_node_context (const re_dfa_t *dfa, int node, unsigned int context)
1162 {
1163   re_token_type_t type = dfa->nodes[node].type;
1164   unsigned int constraint = dfa->nodes[node].constraint;
1165   if (type != END_OF_RE)
1166     return 0;
1167   if (!constraint)
1168     return 1;
1169   if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1170     return 0;
1171   return 1;
1172 }
1173
1174 /* Check the halt state STATE match the current context.
1175    Return 0 if not match, if the node, STATE has, is a halt node and
1176    match the context, return the node.  */
1177
1178 static int
1179 internal_function
1180 check_halt_state_context (const re_match_context_t *mctx,
1181                           const re_dfastate_t *state, int idx)
1182 {
1183   int i;
1184   unsigned int context;
1185 #ifdef DEBUG
1186   assert (state->halt);
1187 #endif
1188   context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1189   for (i = 0; i < state->nodes.nelem; ++i)
1190     if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1191       return state->nodes.elems[i];
1192   return 0;
1193 }
1194
1195 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1196    corresponding to the DFA).
1197    Return the destination node, and update EPS_VIA_NODES, return -1 in case
1198    of errors.  */
1199
1200 static int
1201 internal_function
1202 proceed_next_node (const re_match_context_t *mctx,
1203                    int nregs, regmatch_t *regs, int *pidx, int node,
1204                    re_node_set *eps_via_nodes, struct re_fail_stack_t *fs)
1205 {
1206   re_dfa_t *const dfa = mctx->dfa;
1207   int i, err;
1208   if (IS_EPSILON_NODE (dfa->nodes[node].type))
1209     {
1210       re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1211       re_node_set *edests = &dfa->edests[node];
1212       int dest_node;
1213       err = re_node_set_insert (eps_via_nodes, node);
1214       if (BE (err < 0, 0))
1215         return -2;
1216       /* Pick up a valid destination, or return -1 if none is found.  */
1217       for (dest_node = -1, i = 0; i < edests->nelem; ++i)
1218         {
1219           int candidate = edests->elems[i];
1220           if (!re_node_set_contains (cur_nodes, candidate))
1221             continue;
1222           if (dest_node == -1)
1223             dest_node = candidate;
1224
1225           else
1226             {
1227               /* In order to avoid infinite loop like "(a*)*", return the second
1228                  epsilon-transition if the first was already considered.  */
1229               if (re_node_set_contains (eps_via_nodes, dest_node))
1230                 return candidate;
1231
1232               /* Otherwise, push the second epsilon-transition on the fail stack.  */
1233               else if (fs != NULL
1234                        && push_fail_stack (fs, *pidx, candidate, nregs, regs,
1235                                            eps_via_nodes))
1236                 return -2;
1237
1238               /* We know we are going to exit.  */
1239               break;
1240             }
1241         }
1242       return dest_node;
1243     }
1244   else
1245     {
1246       int naccepted = 0;
1247       re_token_type_t type = dfa->nodes[node].type;
1248
1249 #ifdef RE_ENABLE_I18N
1250       if (dfa->nodes[node].accept_mb)
1251         naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1252       else
1253 #endif /* RE_ENABLE_I18N */
1254       if (type == OP_BACK_REF)
1255         {
1256           int subexp_idx = dfa->nodes[node].opr.idx + 1;
1257           naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1258           if (fs != NULL)
1259             {
1260               if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1261                 return -1;
1262               else if (naccepted)
1263                 {
1264                   char *buf = (char *) re_string_get_buffer (&mctx->input);
1265                   if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1266                               naccepted) != 0)
1267                     return -1;
1268                 }
1269             }
1270
1271           if (naccepted == 0)
1272             {
1273               int dest_node;
1274               err = re_node_set_insert (eps_via_nodes, node);
1275               if (BE (err < 0, 0))
1276                 return -2;
1277               dest_node = dfa->edests[node].elems[0];
1278               if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1279                                         dest_node))
1280                 return dest_node;
1281             }
1282         }
1283
1284       if (naccepted != 0
1285           || check_node_accept (mctx, dfa->nodes + node, *pidx))
1286         {
1287           int dest_node = dfa->nexts[node];
1288           *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1289           if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1290                      || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1291                                                dest_node)))
1292             return -1;
1293           re_node_set_empty (eps_via_nodes);
1294           return dest_node;
1295         }
1296     }
1297   return -1;
1298 }
1299
1300 static reg_errcode_t
1301 internal_function
1302 push_fail_stack (struct re_fail_stack_t *fs, int str_idx, int dest_node,
1303                  int nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
1304 {
1305   reg_errcode_t err;
1306   int num = fs->num++;
1307   if (fs->num == fs->alloc)
1308     {
1309       struct re_fail_stack_ent_t *new_array =
1310         re_realloc (fs->stack, struct re_fail_stack_ent_t, fs->alloc * 2);
1311       if (new_array == NULL)
1312         return REG_ESPACE;
1313       fs->alloc *= 2;
1314       fs->stack = new_array;
1315     }
1316   fs->stack[num].idx = str_idx;
1317   fs->stack[num].node = dest_node;
1318   fs->stack[num].regs = re_malloc (regmatch_t, nregs);
1319   if (fs->stack[num].regs == NULL)
1320     return REG_ESPACE;
1321   memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1322   err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1323   return err;
1324 }
1325
1326 static int
1327 internal_function
1328 pop_fail_stack (struct re_fail_stack_t *fs, int *pidx,
1329                 int nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
1330 {
1331   int num = --fs->num;
1332   assert (num >= 0);
1333   *pidx = fs->stack[num].idx;
1334   memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1335   re_node_set_free (eps_via_nodes);
1336   re_free (fs->stack[num].regs);
1337   *eps_via_nodes = fs->stack[num].eps_via_nodes;
1338   return fs->stack[num].node;
1339 }
1340
1341 /* Set the positions where the subexpressions are starts/ends to registers
1342    PMATCH.
1343    Note: We assume that pmatch[0] is already set, and
1344    pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch.  */
1345
1346 static reg_errcode_t
1347 internal_function
1348 set_regs (const regex_t *preg, const re_match_context_t *mctx,
1349           size_t nmatch, regmatch_t *pmatch, int fl_backtrack)
1350 {
1351   re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
1352   int idx, cur_node;
1353   re_node_set eps_via_nodes;
1354   struct re_fail_stack_t *fs;
1355   struct re_fail_stack_t fs_body = { 0, 2, NULL };
1356   regmatch_t *prev_idx_match;
1357   int prev_idx_match_malloced = 0;
1358
1359 #ifdef DEBUG
1360   assert (nmatch > 1);
1361   assert (mctx->state_log != NULL);
1362 #endif
1363   if (fl_backtrack)
1364     {
1365       fs = &fs_body;
1366       fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1367       if (fs->stack == NULL)
1368         return REG_ESPACE;
1369     }
1370   else
1371     fs = NULL;
1372
1373   cur_node = dfa->init_node;
1374   re_node_set_init_empty (&eps_via_nodes);
1375
1376   if (__libc_use_alloca (nmatch * sizeof (regmatch_t)))
1377     prev_idx_match = (regmatch_t *) alloca (nmatch * sizeof (regmatch_t));
1378   else
1379     {
1380       prev_idx_match = re_malloc (regmatch_t, nmatch);
1381       if (prev_idx_match == NULL)
1382         {
1383           free_fail_stack_return (fs);
1384           return REG_ESPACE;
1385         }
1386       prev_idx_match_malloced = 1;
1387     }
1388   memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1389
1390   for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1391     {
1392       update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
1393
1394       if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1395         {
1396           int reg_idx;
1397           if (fs)
1398             {
1399               for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1400                 if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1401                   break;
1402               if (reg_idx == nmatch)
1403                 {
1404                   re_node_set_free (&eps_via_nodes);
1405                   if (prev_idx_match_malloced)
1406                     re_free (prev_idx_match);
1407                   return free_fail_stack_return (fs);
1408                 }
1409               cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1410                                          &eps_via_nodes);
1411             }
1412           else
1413             {
1414               re_node_set_free (&eps_via_nodes);
1415               if (prev_idx_match_malloced)
1416                 re_free (prev_idx_match);
1417               return REG_NOERROR;
1418             }
1419         }
1420
1421       /* Proceed to next node.  */
1422       cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
1423                                     &eps_via_nodes, fs);
1424
1425       if (BE (cur_node < 0, 0))
1426         {
1427           if (BE (cur_node == -2, 0))
1428             {
1429               re_node_set_free (&eps_via_nodes);
1430               if (prev_idx_match_malloced)
1431                 re_free (prev_idx_match);
1432               free_fail_stack_return (fs);
1433               return REG_ESPACE;
1434             }
1435           if (fs)
1436             cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1437                                        &eps_via_nodes);
1438           else
1439             {
1440               re_node_set_free (&eps_via_nodes);
1441               if (prev_idx_match_malloced)
1442                 re_free (prev_idx_match);
1443               return REG_NOMATCH;
1444             }
1445         }
1446     }
1447   re_node_set_free (&eps_via_nodes);
1448   if (prev_idx_match_malloced)
1449     re_free (prev_idx_match);
1450   return free_fail_stack_return (fs);
1451 }
1452
1453 static reg_errcode_t
1454 internal_function
1455 free_fail_stack_return (struct re_fail_stack_t *fs)
1456 {
1457   if (fs)
1458     {
1459       int fs_idx;
1460       for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1461         {
1462           re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1463           re_free (fs->stack[fs_idx].regs);
1464         }
1465       re_free (fs->stack);
1466     }
1467   return REG_NOERROR;
1468 }
1469
1470 static void
1471 internal_function
1472 update_regs (re_dfa_t *dfa, regmatch_t *pmatch, regmatch_t *prev_idx_match,
1473              int cur_node, int cur_idx, int nmatch)
1474 {
1475   int type = dfa->nodes[cur_node].type;
1476   if (type == OP_OPEN_SUBEXP)
1477     {
1478       int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1479
1480       /* We are at the first node of this sub expression.  */
1481       if (reg_num < nmatch)
1482         {
1483           pmatch[reg_num].rm_so = cur_idx;
1484           pmatch[reg_num].rm_eo = -1;
1485         }
1486     }
1487   else if (type == OP_CLOSE_SUBEXP)
1488     {
1489       int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1490       if (reg_num < nmatch)
1491         {
1492           /* We are at the last node of this sub expression.  */
1493           if (pmatch[reg_num].rm_so < cur_idx)
1494             {
1495               pmatch[reg_num].rm_eo = cur_idx;
1496               /* This is a non-empty match or we are not inside an optional
1497                  subexpression.  Accept this right away.  */
1498               memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1499             }
1500           else
1501             {
1502               if (dfa->nodes[cur_node].opt_subexp
1503                   && prev_idx_match[reg_num].rm_so != -1)
1504                 /* We transited through an empty match for an optional
1505                    subexpression, like (a?)*, and this is not the subexp's
1506                    first match.  Copy back the old content of the registers
1507                    so that matches of an inner subexpression are undone as
1508                    well, like in ((a?))*.  */
1509                 memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1510               else
1511                 /* We completed a subexpression, but it may be part of
1512                    an optional one, so do not update PREV_IDX_MATCH.  */
1513                 pmatch[reg_num].rm_eo = cur_idx;
1514             }
1515         }
1516     }
1517 }
1518
1519 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1520    and sift the nodes in each states according to the following rules.
1521    Updated state_log will be wrote to STATE_LOG.
1522
1523    Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1524      1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1525         If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1526         the LAST_NODE, we throw away the node `a'.
1527      2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1528         string `s' and transit to `b':
1529         i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1530            away the node `a'.
1531         ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1532             thrown away, we throw away the node `a'.
1533      3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1534         i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1535            node `a'.
1536         ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1537             we throw away the node `a'.  */
1538
1539 #define STATE_NODE_CONTAINS(state,node) \
1540   ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1541
1542 static reg_errcode_t
1543 internal_function
1544 sift_states_backward (re_match_context_t *mctx, re_sift_context_t *sctx)
1545 {
1546   reg_errcode_t err;
1547   int null_cnt = 0;
1548   int str_idx = sctx->last_str_idx;
1549   re_node_set cur_dest;
1550
1551 #ifdef DEBUG
1552   assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1553 #endif
1554
1555   /* Build sifted state_log[str_idx].  It has the nodes which can epsilon
1556      transit to the last_node and the last_node itself.  */
1557   err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1558   if (BE (err != REG_NOERROR, 0))
1559     return err;
1560   err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1561   if (BE (err != REG_NOERROR, 0))
1562     goto free_return;
1563
1564   /* Then check each states in the state_log.  */
1565   while (str_idx > 0)
1566     {
1567       /* Update counters.  */
1568       null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1569       if (null_cnt > mctx->max_mb_elem_len)
1570         {
1571           memset (sctx->sifted_states, '\0',
1572                   sizeof (re_dfastate_t *) * str_idx);
1573           re_node_set_free (&cur_dest);
1574           return REG_NOERROR;
1575         }
1576       re_node_set_empty (&cur_dest);
1577       --str_idx;
1578
1579       if (mctx->state_log[str_idx])
1580         {
1581           err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1582           if (BE (err != REG_NOERROR, 0))
1583             goto free_return;
1584         }
1585
1586       /* Add all the nodes which satisfy the following conditions:
1587          - It can epsilon transit to a node in CUR_DEST.
1588          - It is in CUR_SRC.
1589          And update state_log.  */
1590       err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1591       if (BE (err != REG_NOERROR, 0))
1592         goto free_return;
1593     }
1594   err = REG_NOERROR;
1595  free_return:
1596   re_node_set_free (&cur_dest);
1597   return err;
1598 }
1599
1600 static reg_errcode_t
1601 internal_function
1602 build_sifted_states (re_match_context_t *mctx, re_sift_context_t *sctx,
1603                      int str_idx, re_node_set *cur_dest)
1604 {
1605   re_dfa_t *const dfa = mctx->dfa;
1606   re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
1607   int i;
1608
1609   /* Then build the next sifted state.
1610      We build the next sifted state on `cur_dest', and update
1611      `sifted_states[str_idx]' with `cur_dest'.
1612      Note:
1613      `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1614      `cur_src' points the node_set of the old `state_log[str_idx]'
1615      (with the epsilon nodes pre-filtered out).  */
1616   for (i = 0; i < cur_src->nelem; i++)
1617     {
1618       int prev_node = cur_src->elems[i];
1619       int naccepted = 0;
1620       int ret;
1621
1622 #ifdef DEBUG
1623       re_token_type_t type = dfa->nodes[prev_node].type;
1624       assert (!IS_EPSILON_NODE (type));
1625 #endif
1626 #ifdef RE_ENABLE_I18N
1627       /* If the node may accept `multi byte'.  */
1628       if (dfa->nodes[prev_node].accept_mb)
1629         naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1630                                          str_idx, sctx->last_str_idx);
1631 #endif /* RE_ENABLE_I18N */
1632
1633       /* We don't check backreferences here.
1634          See update_cur_sifted_state().  */
1635       if (!naccepted
1636           && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1637           && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1638                                   dfa->nexts[prev_node]))
1639         naccepted = 1;
1640
1641       if (naccepted == 0)
1642         continue;
1643
1644       if (sctx->limits.nelem)
1645         {
1646           int to_idx = str_idx + naccepted;
1647           if (check_dst_limits (mctx, &sctx->limits,
1648                                 dfa->nexts[prev_node], to_idx,
1649                                 prev_node, str_idx))
1650             continue;
1651         }
1652       ret = re_node_set_insert (cur_dest, prev_node);
1653       if (BE (ret == -1, 0))
1654         return REG_ESPACE;
1655     }
1656
1657   return REG_NOERROR;
1658 }
1659
1660 /* Helper functions.  */
1661
1662 static reg_errcode_t
1663 internal_function
1664 clean_state_log_if_needed (re_match_context_t *mctx, int next_state_log_idx)
1665 {
1666   int top = mctx->state_log_top;
1667
1668   if (next_state_log_idx >= mctx->input.bufs_len
1669       || (next_state_log_idx >= mctx->input.valid_len
1670           && mctx->input.valid_len < mctx->input.len))
1671     {
1672       reg_errcode_t err;
1673       err = extend_buffers (mctx);
1674       if (BE (err != REG_NOERROR, 0))
1675         return err;
1676     }
1677
1678   if (top < next_state_log_idx)
1679     {
1680       memset (mctx->state_log + top + 1, '\0',
1681               sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1682       mctx->state_log_top = next_state_log_idx;
1683     }
1684   return REG_NOERROR;
1685 }
1686
1687 static reg_errcode_t
1688 internal_function
1689 merge_state_array (re_dfa_t *dfa, re_dfastate_t **dst, re_dfastate_t **src,
1690                    int num)
1691 {
1692   int st_idx;
1693   reg_errcode_t err;
1694   for (st_idx = 0; st_idx < num; ++st_idx)
1695     {
1696       if (dst[st_idx] == NULL)
1697         dst[st_idx] = src[st_idx];
1698       else if (src[st_idx] != NULL)
1699         {
1700           re_node_set merged_set;
1701           err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1702                                         &src[st_idx]->nodes);
1703           if (BE (err != REG_NOERROR, 0))
1704             return err;
1705           dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1706           re_node_set_free (&merged_set);
1707           if (BE (err != REG_NOERROR, 0))
1708             return err;
1709         }
1710     }
1711   return REG_NOERROR;
1712 }
1713
1714 static reg_errcode_t
1715 internal_function
1716 update_cur_sifted_state (re_match_context_t *mctx, re_sift_context_t *sctx,
1717                          int str_idx, re_node_set *dest_nodes)
1718 {
1719   re_dfa_t *const dfa = mctx->dfa;
1720   reg_errcode_t err;
1721   const re_node_set *candidates;
1722   candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1723                 : &mctx->state_log[str_idx]->nodes);
1724
1725   if (dest_nodes->nelem == 0)
1726     sctx->sifted_states[str_idx] = NULL;
1727   else
1728     {
1729       if (candidates)
1730         {
1731           /* At first, add the nodes which can epsilon transit to a node in
1732              DEST_NODE.  */
1733           err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1734           if (BE (err != REG_NOERROR, 0))
1735             return err;
1736
1737           /* Then, check the limitations in the current sift_context.  */
1738           if (sctx->limits.nelem)
1739             {
1740               err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1741                                          mctx->bkref_ents, str_idx);
1742               if (BE (err != REG_NOERROR, 0))
1743                 return err;
1744             }
1745         }
1746
1747       sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1748       if (BE (err != REG_NOERROR, 0))
1749         return err;
1750     }
1751
1752   if (candidates && mctx->state_log[str_idx]->has_backref)
1753     {
1754       err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1755       if (BE (err != REG_NOERROR, 0))
1756         return err;
1757     }
1758   return REG_NOERROR;
1759 }
1760
1761 static reg_errcode_t
1762 internal_function
1763 add_epsilon_src_nodes (re_dfa_t *dfa, re_node_set *dest_nodes,
1764                        const re_node_set *candidates)
1765 {
1766   reg_errcode_t err = REG_NOERROR;
1767   int i;
1768
1769   re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
1770   if (BE (err != REG_NOERROR, 0))
1771     return err;
1772
1773   if (!state->inveclosure.alloc)
1774     {
1775       err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
1776       if (BE (err != REG_NOERROR, 0))
1777         return REG_ESPACE;
1778       for (i = 0; i < dest_nodes->nelem; i++)
1779         re_node_set_merge (&state->inveclosure,
1780                            dfa->inveclosures + dest_nodes->elems[i]);
1781     }
1782   return re_node_set_add_intersect (dest_nodes, candidates,
1783                                     &state->inveclosure);
1784 }
1785
1786 static reg_errcode_t
1787 internal_function
1788 sub_epsilon_src_nodes (re_dfa_t *dfa, int node, re_node_set *dest_nodes,
1789                        const re_node_set *candidates)
1790 {
1791     int ecl_idx;
1792     reg_errcode_t err;
1793     re_node_set *inv_eclosure = dfa->inveclosures + node;
1794     re_node_set except_nodes;
1795     re_node_set_init_empty (&except_nodes);
1796     for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1797       {
1798         int cur_node = inv_eclosure->elems[ecl_idx];
1799         if (cur_node == node)
1800           continue;
1801         if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1802           {
1803             int edst1 = dfa->edests[cur_node].elems[0];
1804             int edst2 = ((dfa->edests[cur_node].nelem > 1)
1805                          ? dfa->edests[cur_node].elems[1] : -1);
1806             if ((!re_node_set_contains (inv_eclosure, edst1)
1807                  && re_node_set_contains (dest_nodes, edst1))
1808                 || (edst2 > 0
1809                     && !re_node_set_contains (inv_eclosure, edst2)
1810                     && re_node_set_contains (dest_nodes, edst2)))
1811               {
1812                 err = re_node_set_add_intersect (&except_nodes, candidates,
1813                                                  dfa->inveclosures + cur_node);
1814                 if (BE (err != REG_NOERROR, 0))
1815                   {
1816                     re_node_set_free (&except_nodes);
1817                     return err;
1818                   }
1819               }
1820           }
1821       }
1822     for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1823       {
1824         int cur_node = inv_eclosure->elems[ecl_idx];
1825         if (!re_node_set_contains (&except_nodes, cur_node))
1826           {
1827             int idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1828             re_node_set_remove_at (dest_nodes, idx);
1829           }
1830       }
1831     re_node_set_free (&except_nodes);
1832     return REG_NOERROR;
1833 }
1834
1835 static int
1836 internal_function
1837 check_dst_limits (re_match_context_t *mctx, re_node_set *limits,
1838                   int dst_node, int dst_idx, int src_node, int src_idx)
1839 {
1840   re_dfa_t *const dfa = mctx->dfa;
1841   int lim_idx, src_pos, dst_pos;
1842
1843   int dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1844   int src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1845   for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1846     {
1847       int subexp_idx;
1848       struct re_backref_cache_entry *ent;
1849       ent = mctx->bkref_ents + limits->elems[lim_idx];
1850       subexp_idx = dfa->nodes[ent->node].opr.idx;
1851
1852       dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1853                                            subexp_idx, dst_node, dst_idx,
1854                                            dst_bkref_idx);
1855       src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1856                                            subexp_idx, src_node, src_idx,
1857                                            src_bkref_idx);
1858
1859       /* In case of:
1860          <src> <dst> ( <subexp> )
1861          ( <subexp> ) <src> <dst>
1862          ( <subexp1> <src> <subexp2> <dst> <subexp3> )  */
1863       if (src_pos == dst_pos)
1864         continue; /* This is unrelated limitation.  */
1865       else
1866         return 1;
1867     }
1868   return 0;
1869 }
1870
1871 static int
1872 internal_function
1873 check_dst_limits_calc_pos_1 (re_match_context_t *mctx, int boundaries,
1874                              int subexp_idx, int from_node, int bkref_idx)
1875 {
1876   re_dfa_t *const dfa = mctx->dfa;
1877   re_node_set *eclosures = dfa->eclosures + from_node;
1878   int node_idx;
1879
1880   /* Else, we are on the boundary: examine the nodes on the epsilon
1881      closure.  */
1882   for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1883     {
1884       int node = eclosures->elems[node_idx];
1885       switch (dfa->nodes[node].type)
1886         {
1887         case OP_BACK_REF:
1888           if (bkref_idx != -1)
1889             {
1890               struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1891               do
1892                 {
1893                   int dst, cpos;
1894
1895                   if (ent->node != node)
1896                     continue;
1897
1898                   if (subexp_idx
1899                       < CHAR_BIT * sizeof ent->eps_reachable_subexps_map
1900                       && !(ent->eps_reachable_subexps_map & (1u << subexp_idx)))
1901                     continue;
1902
1903                   /* Recurse trying to reach the OP_OPEN_SUBEXP and
1904                      OP_CLOSE_SUBEXP cases below.  But, if the
1905                      destination node is the same node as the source
1906                      node, don't recurse because it would cause an
1907                      infinite loop: a regex that exhibits this behavior
1908                      is ()\1*\1*  */
1909                   dst = dfa->edests[node].elems[0];
1910                   if (dst == from_node)
1911                     {
1912                       if (boundaries & 1)
1913                         return -1;
1914                       else /* if (boundaries & 2) */
1915                         return 0;
1916                     }
1917
1918                   cpos =
1919                     check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1920                                                  dst, bkref_idx);
1921                   if (cpos == -1 /* && (boundaries & 1) */)
1922                     return -1;
1923                   if (cpos == 0 && (boundaries & 2))
1924                     return 0;
1925
1926                   if (subexp_idx
1927                       < CHAR_BIT * sizeof ent->eps_reachable_subexps_map)
1928                     ent->eps_reachable_subexps_map &= ~(1u << subexp_idx);
1929                 }
1930               while (ent++->more);
1931             }
1932           break;
1933
1934         case OP_OPEN_SUBEXP:
1935           if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
1936             return -1;
1937           break;
1938
1939         case OP_CLOSE_SUBEXP:
1940           if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
1941             return 0;
1942           break;
1943
1944         default:
1945             break;
1946         }
1947     }
1948
1949   return (boundaries & 2) ? 1 : 0;
1950 }
1951
1952 static int
1953 internal_function
1954 check_dst_limits_calc_pos (re_match_context_t *mctx, int limit, int subexp_idx,
1955                            int from_node, int str_idx, int bkref_idx)
1956 {
1957   struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
1958   int boundaries;
1959
1960   /* If we are outside the range of the subexpression, return -1 or 1.  */
1961   if (str_idx < lim->subexp_from)
1962     return -1;
1963
1964   if (lim->subexp_to < str_idx)
1965     return 1;
1966
1967   /* If we are within the subexpression, return 0.  */
1968   boundaries = (str_idx == lim->subexp_from);
1969   boundaries |= (str_idx == lim->subexp_to) << 1;
1970   if (boundaries == 0)
1971     return 0;
1972
1973   /* Else, examine epsilon closure.  */
1974   return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1975                                       from_node, bkref_idx);
1976 }
1977
1978 /* Check the limitations of sub expressions LIMITS, and remove the nodes
1979    which are against limitations from DEST_NODES. */
1980
1981 static reg_errcode_t
1982 internal_function
1983 check_subexp_limits (re_dfa_t *dfa, re_node_set *dest_nodes,
1984                      const re_node_set *candidates, re_node_set *limits,
1985                      struct re_backref_cache_entry *bkref_ents, int str_idx)
1986 {
1987   reg_errcode_t err;
1988   int node_idx, lim_idx;
1989
1990   for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1991     {
1992       int subexp_idx;
1993       struct re_backref_cache_entry *ent;
1994       ent = bkref_ents + limits->elems[lim_idx];
1995
1996       if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
1997         continue; /* This is unrelated limitation.  */
1998
1999       subexp_idx = dfa->nodes[ent->node].opr.idx;
2000       if (ent->subexp_to == str_idx)
2001         {
2002           int ops_node = -1;
2003           int cls_node = -1;
2004           for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2005             {
2006               int node = dest_nodes->elems[node_idx];
2007               re_token_type_t type = dfa->nodes[node].type;
2008               if (type == OP_OPEN_SUBEXP
2009                   && subexp_idx == dfa->nodes[node].opr.idx)
2010                 ops_node = node;
2011               else if (type == OP_CLOSE_SUBEXP
2012                        && subexp_idx == dfa->nodes[node].opr.idx)
2013                 cls_node = node;
2014             }
2015
2016           /* Check the limitation of the open subexpression.  */
2017           /* Note that (ent->subexp_to = str_idx != ent->subexp_from).  */
2018           if (ops_node >= 0)
2019             {
2020               err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
2021                                            candidates);
2022               if (BE (err != REG_NOERROR, 0))
2023                 return err;
2024             }
2025
2026           /* Check the limitation of the close subexpression.  */
2027           if (cls_node >= 0)
2028             for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2029               {
2030                 int node = dest_nodes->elems[node_idx];
2031                 if (!re_node_set_contains (dfa->inveclosures + node,
2032                                            cls_node)
2033                     && !re_node_set_contains (dfa->eclosures + node,
2034                                               cls_node))
2035                   {
2036                     /* It is against this limitation.
2037                        Remove it form the current sifted state.  */
2038                     err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2039                                                  candidates);
2040                     if (BE (err != REG_NOERROR, 0))
2041                       return err;
2042                     --node_idx;
2043                   }
2044               }
2045         }
2046       else /* (ent->subexp_to != str_idx)  */
2047         {
2048           for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2049             {
2050               int node = dest_nodes->elems[node_idx];
2051               re_token_type_t type = dfa->nodes[node].type;
2052               if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2053                 {
2054                   if (subexp_idx != dfa->nodes[node].opr.idx)
2055                     continue;
2056                   /* It is against this limitation.
2057                      Remove it form the current sifted state.  */
2058                   err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2059                                                candidates);
2060                   if (BE (err != REG_NOERROR, 0))
2061                     return err;
2062                 }
2063             }
2064         }
2065     }
2066   return REG_NOERROR;
2067 }
2068
2069 static reg_errcode_t
2070 internal_function
2071 sift_states_bkref (re_match_context_t *mctx, re_sift_context_t *sctx,
2072                    int str_idx, const re_node_set *candidates)
2073 {
2074   re_dfa_t *const dfa = mctx->dfa;
2075   reg_errcode_t err;
2076   int node_idx, node;
2077   re_sift_context_t local_sctx;
2078   int first_idx = search_cur_bkref_entry (mctx, str_idx);
2079
2080   if (first_idx == -1)
2081     return REG_NOERROR;
2082
2083   local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized.  */
2084
2085   for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2086     {
2087       int enabled_idx;
2088       re_token_type_t type;
2089       struct re_backref_cache_entry *entry;
2090       node = candidates->elems[node_idx];
2091       type = dfa->nodes[node].type;
2092       /* Avoid infinite loop for the REs like "()\1+".  */
2093       if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2094         continue;
2095       if (type != OP_BACK_REF)
2096         continue;
2097
2098       entry = mctx->bkref_ents + first_idx;
2099       enabled_idx = first_idx;
2100       do
2101         {
2102           int subexp_len, to_idx, dst_node, ret;
2103           re_dfastate_t *cur_state;
2104
2105           if (entry->node != node)
2106             continue;
2107           subexp_len = entry->subexp_to - entry->subexp_from;
2108           to_idx = str_idx + subexp_len;
2109           dst_node = (subexp_len ? dfa->nexts[node]
2110                       : dfa->edests[node].elems[0]);
2111
2112           if (to_idx > sctx->last_str_idx
2113               || sctx->sifted_states[to_idx] == NULL
2114               || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2115               || check_dst_limits (mctx, &sctx->limits, node,
2116                                    str_idx, dst_node, to_idx))
2117             continue;
2118
2119           if (local_sctx.sifted_states == NULL)
2120             {
2121               local_sctx = *sctx;
2122               err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2123               if (BE (err != REG_NOERROR, 0))
2124                 goto free_return;
2125             }
2126           local_sctx.last_node = node;
2127           local_sctx.last_str_idx = str_idx;
2128           ret = re_node_set_insert (&local_sctx.limits, enabled_idx);
2129           if (BE (ret < 0, 0))
2130             {
2131               err = REG_ESPACE;
2132               goto free_return;
2133             }
2134           cur_state = local_sctx.sifted_states[str_idx];
2135           err = sift_states_backward (mctx, &local_sctx);
2136           if (BE (err != REG_NOERROR, 0))
2137             goto free_return;
2138           if (sctx->limited_states != NULL)
2139             {
2140               err = merge_state_array (dfa, sctx->limited_states,
2141                                        local_sctx.sifted_states,
2142                                        str_idx + 1);
2143               if (BE (err != REG_NOERROR, 0))
2144                 goto free_return;
2145             }
2146           local_sctx.sifted_states[str_idx] = cur_state;
2147           re_node_set_remove (&local_sctx.limits, enabled_idx);
2148
2149           /* mctx->bkref_ents may have changed, reload the pointer.  */
2150           entry = mctx->bkref_ents + enabled_idx;
2151         }
2152       while (enabled_idx++, entry++->more);
2153     }
2154   err = REG_NOERROR;
2155  free_return:
2156   if (local_sctx.sifted_states != NULL)
2157     {
2158       re_node_set_free (&local_sctx.limits);
2159     }
2160
2161   return err;
2162 }
2163
2164
2165 #ifdef RE_ENABLE_I18N
2166 static int
2167 internal_function
2168 sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx,
2169                      int node_idx, int str_idx, int max_str_idx)
2170 {
2171   re_dfa_t *const dfa = mctx->dfa;
2172   int naccepted;
2173   /* Check the node can accept `multi byte'.  */
2174   naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2175   if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
2176       !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2177                             dfa->nexts[node_idx]))
2178     /* The node can't accept the `multi byte', or the
2179        destination was already thrown away, then the node
2180        could't accept the current input `multi byte'.   */
2181     naccepted = 0;
2182   /* Otherwise, it is sure that the node could accept
2183      `naccepted' bytes input.  */
2184   return naccepted;
2185 }
2186 #endif /* RE_ENABLE_I18N */
2187
2188 \f
2189 /* Functions for state transition.  */
2190
2191 /* Return the next state to which the current state STATE will transit by
2192    accepting the current input byte, and update STATE_LOG if necessary.
2193    If STATE can accept a multibyte char/collating element/back reference
2194    update the destination of STATE_LOG.  */
2195
2196 static re_dfastate_t *
2197 internal_function
2198 transit_state (reg_errcode_t *err, re_match_context_t *mctx,
2199                re_dfastate_t *state)
2200 {
2201   re_dfastate_t **trtable;
2202   unsigned char ch;
2203
2204 #ifdef RE_ENABLE_I18N
2205   /* If the current state can accept multibyte.  */
2206   if (BE (state->accept_mb, 0))
2207     {
2208       *err = transit_state_mb (mctx, state);
2209       if (BE (*err != REG_NOERROR, 0))
2210         return NULL;
2211     }
2212 #endif /* RE_ENABLE_I18N */
2213
2214   /* Then decide the next state with the single byte.  */
2215 #if 0
2216   if (0)
2217     /* don't use transition table  */
2218     return transit_state_sb (err, mctx, state);
2219 #endif
2220
2221   /* Use transition table  */
2222   ch = re_string_fetch_byte (&mctx->input);
2223   for (;;)
2224     {
2225       trtable = state->trtable;
2226       if (BE (trtable != NULL, 1))
2227         return trtable[ch];
2228
2229       trtable = state->word_trtable;
2230       if (BE (trtable != NULL, 1))
2231         {
2232           unsigned int context;
2233           context
2234             = re_string_context_at (&mctx->input,
2235                                     re_string_cur_idx (&mctx->input) - 1,
2236                                     mctx->eflags);
2237           if (IS_WORD_CONTEXT (context))
2238             return trtable[ch + SBC_MAX];
2239           else
2240             return trtable[ch];
2241         }
2242
2243       if (!build_trtable (mctx->dfa, state))
2244         {
2245           *err = REG_ESPACE;
2246           return NULL;
2247         }
2248
2249       /* Retry, we now have a transition table.  */
2250     }
2251 }
2252
2253 /* Update the state_log if we need */
2254 re_dfastate_t *
2255 internal_function
2256 merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx,
2257                       re_dfastate_t *next_state)
2258 {
2259   re_dfa_t *const dfa = mctx->dfa;
2260   int cur_idx = re_string_cur_idx (&mctx->input);
2261
2262   if (cur_idx > mctx->state_log_top)
2263     {
2264       mctx->state_log[cur_idx] = next_state;
2265       mctx->state_log_top = cur_idx;
2266     }
2267   else if (mctx->state_log[cur_idx] == 0)
2268     {
2269       mctx->state_log[cur_idx] = next_state;
2270     }
2271   else
2272     {
2273       re_dfastate_t *pstate;
2274       unsigned int context;
2275       re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2276       /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2277          the destination of a multibyte char/collating element/
2278          back reference.  Then the next state is the union set of
2279          these destinations and the results of the transition table.  */
2280       pstate = mctx->state_log[cur_idx];
2281       log_nodes = pstate->entrance_nodes;
2282       if (next_state != NULL)
2283         {
2284           table_nodes = next_state->entrance_nodes;
2285           *err = re_node_set_init_union (&next_nodes, table_nodes,
2286                                              log_nodes);
2287           if (BE (*err != REG_NOERROR, 0))
2288             return NULL;
2289         }
2290       else
2291         next_nodes = *log_nodes;
2292       /* Note: We already add the nodes of the initial state,
2293          then we don't need to add them here.  */
2294
2295       context = re_string_context_at (&mctx->input,
2296                                       re_string_cur_idx (&mctx->input) - 1,
2297                                       mctx->eflags);
2298       next_state = mctx->state_log[cur_idx]
2299         = re_acquire_state_context (err, dfa, &next_nodes, context);
2300       /* We don't need to check errors here, since the return value of
2301          this function is next_state and ERR is already set.  */
2302
2303       if (table_nodes != NULL)
2304         re_node_set_free (&next_nodes);
2305     }
2306
2307   if (BE (dfa->nbackref, 0) && next_state != NULL)
2308     {
2309       /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2310          later.  We must check them here, since the back references in the
2311          next state might use them.  */
2312       *err = check_subexp_matching_top (mctx, &next_state->nodes,
2313                                         cur_idx);
2314       if (BE (*err != REG_NOERROR, 0))
2315         return NULL;
2316
2317       /* If the next state has back references.  */
2318       if (next_state->has_backref)
2319         {
2320           *err = transit_state_bkref (mctx, &next_state->nodes);
2321           if (BE (*err != REG_NOERROR, 0))
2322             return NULL;
2323           next_state = mctx->state_log[cur_idx];
2324         }
2325     }
2326
2327   return next_state;
2328 }
2329
2330 /* Skip bytes in the input that correspond to part of a
2331    multi-byte match, then look in the log for a state
2332    from which to restart matching.  */
2333 static re_dfastate_t *
2334 internal_function
2335 find_recover_state (reg_errcode_t *err, re_match_context_t *mctx)
2336 {
2337   re_dfastate_t *cur_state = NULL;
2338   do
2339     {
2340       int max = mctx->state_log_top;
2341       int cur_str_idx = re_string_cur_idx (&mctx->input);
2342
2343       do
2344         {
2345           if (++cur_str_idx > max)
2346             return NULL;
2347           re_string_skip_bytes (&mctx->input, 1);
2348         }
2349       while (mctx->state_log[cur_str_idx] == NULL);
2350
2351       cur_state = merge_state_with_log (err, mctx, NULL);
2352     }
2353   while (*err == REG_NOERROR && cur_state == NULL);
2354   return cur_state;
2355 }
2356
2357 /* Helper functions for transit_state.  */
2358
2359 /* From the node set CUR_NODES, pick up the nodes whose types are
2360    OP_OPEN_SUBEXP and which have corresponding back references in the regular
2361    expression. And register them to use them later for evaluating the
2362    correspoding back references.  */
2363
2364 static reg_errcode_t
2365 internal_function
2366 check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes,
2367                            int str_idx)
2368 {
2369   re_dfa_t *const dfa = mctx->dfa;
2370   int node_idx;
2371   reg_errcode_t err;
2372
2373   /* TODO: This isn't efficient.
2374            Because there might be more than one nodes whose types are
2375            OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2376            nodes.
2377            E.g. RE: (a){2}  */
2378   for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2379     {
2380       int node = cur_nodes->elems[node_idx];
2381       if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2382           && dfa->nodes[node].opr.idx < CHAR_BIT * sizeof dfa->used_bkref_map
2383           && dfa->used_bkref_map & (1u << dfa->nodes[node].opr.idx))
2384         {
2385           err = match_ctx_add_subtop (mctx, node, str_idx);
2386           if (BE (err != REG_NOERROR, 0))
2387             return err;
2388         }
2389     }
2390   return REG_NOERROR;
2391 }
2392
2393 #if 0
2394 /* Return the next state to which the current state STATE will transit by
2395    accepting the current input byte.  */
2396
2397 static re_dfastate_t *
2398 transit_state_sb (err, mctx, state)
2399      reg_errcode_t *err;
2400      re_match_context_t *mctx;
2401      re_dfastate_t *state;
2402 {
2403   re_dfa_t *const dfa = mctx->dfa;
2404   re_node_set next_nodes;
2405   re_dfastate_t *next_state;
2406   int node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
2407   unsigned int context;
2408
2409   *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2410   if (BE (*err != REG_NOERROR, 0))
2411     return NULL;
2412   for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2413     {
2414       int cur_node = state->nodes.elems[node_cnt];
2415       if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
2416         {
2417           *err = re_node_set_merge (&next_nodes,
2418                                     dfa->eclosures + dfa->nexts[cur_node]);
2419           if (BE (*err != REG_NOERROR, 0))
2420             {
2421               re_node_set_free (&next_nodes);
2422               return NULL;
2423             }
2424         }
2425     }
2426   context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
2427   next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2428   /* We don't need to check errors here, since the return value of
2429      this function is next_state and ERR is already set.  */
2430
2431   re_node_set_free (&next_nodes);
2432   re_string_skip_bytes (&mctx->input, 1);
2433   return next_state;
2434 }
2435 #endif
2436
2437 #ifdef RE_ENABLE_I18N
2438 static reg_errcode_t
2439 internal_function
2440 transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate)
2441 {
2442   re_dfa_t *const dfa = mctx->dfa;
2443   reg_errcode_t err;
2444   int i;
2445
2446   for (i = 0; i < pstate->nodes.nelem; ++i)
2447     {
2448       re_node_set dest_nodes, *new_nodes;
2449       int cur_node_idx = pstate->nodes.elems[i];
2450       int naccepted, dest_idx;
2451       unsigned int context;
2452       re_dfastate_t *dest_state;
2453
2454       if (!dfa->nodes[cur_node_idx].accept_mb)
2455         continue;
2456
2457       if (dfa->nodes[cur_node_idx].constraint)
2458         {
2459           context = re_string_context_at (&mctx->input,
2460                                           re_string_cur_idx (&mctx->input),
2461                                           mctx->eflags);
2462           if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2463                                            context))
2464             continue;
2465         }
2466
2467       /* How many bytes the node can accept?  */
2468       naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2469                                            re_string_cur_idx (&mctx->input));
2470       if (naccepted == 0)
2471         continue;
2472
2473       /* The node can accepts `naccepted' bytes.  */
2474       dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2475       mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2476                                : mctx->max_mb_elem_len);
2477       err = clean_state_log_if_needed (mctx, dest_idx);
2478       if (BE (err != REG_NOERROR, 0))
2479         return err;
2480 #ifdef DEBUG
2481       assert (dfa->nexts[cur_node_idx] != -1);
2482 #endif
2483       new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];
2484
2485       dest_state = mctx->state_log[dest_idx];
2486       if (dest_state == NULL)
2487         dest_nodes = *new_nodes;
2488       else
2489         {
2490           err = re_node_set_init_union (&dest_nodes,
2491                                         dest_state->entrance_nodes, new_nodes);
2492           if (BE (err != REG_NOERROR, 0))
2493             return err;
2494         }
2495       context = re_string_context_at (&mctx->input, dest_idx - 1, mctx->eflags);
2496       mctx->state_log[dest_idx]
2497         = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2498       if (dest_state != NULL)
2499         re_node_set_free (&dest_nodes);
2500       if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
2501         return err;
2502     }
2503   return REG_NOERROR;
2504 }
2505 #endif /* RE_ENABLE_I18N */
2506
2507 static reg_errcode_t
2508 internal_function
2509 transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes)
2510 {
2511   re_dfa_t *const dfa = mctx->dfa;
2512   reg_errcode_t err;
2513   int i;
2514   int cur_str_idx = re_string_cur_idx (&mctx->input);
2515
2516   for (i = 0; i < nodes->nelem; ++i)
2517     {
2518       int dest_str_idx, prev_nelem, bkc_idx;
2519       int node_idx = nodes->elems[i];
2520       unsigned int context;
2521       const re_token_t *node = dfa->nodes + node_idx;
2522       re_node_set *new_dest_nodes;
2523
2524       /* Check whether `node' is a backreference or not.  */
2525       if (node->type != OP_BACK_REF)
2526         continue;
2527
2528       if (node->constraint)
2529         {
2530           context = re_string_context_at (&mctx->input, cur_str_idx,
2531                                           mctx->eflags);
2532           if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2533             continue;
2534         }
2535
2536       /* `node' is a backreference.
2537          Check the substring which the substring matched.  */
2538       bkc_idx = mctx->nbkref_ents;
2539       err = get_subexp (mctx, node_idx, cur_str_idx);
2540       if (BE (err != REG_NOERROR, 0))
2541         goto free_return;
2542
2543       /* And add the epsilon closures (which is `new_dest_nodes') of
2544          the backreference to appropriate state_log.  */
2545 #ifdef DEBUG
2546       assert (dfa->nexts[node_idx] != -1);
2547 #endif
2548       for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2549         {
2550           int subexp_len;
2551           re_dfastate_t *dest_state;
2552           struct re_backref_cache_entry *bkref_ent;
2553           bkref_ent = mctx->bkref_ents + bkc_idx;
2554           if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2555             continue;
2556           subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2557           new_dest_nodes = (subexp_len == 0
2558                             ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2559                             : dfa->eclosures + dfa->nexts[node_idx]);
2560           dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2561                           - bkref_ent->subexp_from);
2562           context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2563                                           mctx->eflags);
2564           dest_state = mctx->state_log[dest_str_idx];
2565           prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2566                         : mctx->state_log[cur_str_idx]->nodes.nelem);
2567           /* Add `new_dest_node' to state_log.  */
2568           if (dest_state == NULL)
2569             {
2570               mctx->state_log[dest_str_idx]
2571                 = re_acquire_state_context (&err, dfa, new_dest_nodes,
2572                                             context);
2573               if (BE (mctx->state_log[dest_str_idx] == NULL
2574                       && err != REG_NOERROR, 0))
2575                 goto free_return;
2576             }
2577           else
2578             {
2579               re_node_set dest_nodes;
2580               err = re_node_set_init_union (&dest_nodes,
2581                                             dest_state->entrance_nodes,
2582                                             new_dest_nodes);
2583               if (BE (err != REG_NOERROR, 0))
2584                 {
2585                   re_node_set_free (&dest_nodes);
2586                   goto free_return;
2587                 }
2588               mctx->state_log[dest_str_idx]
2589                 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2590               re_node_set_free (&dest_nodes);
2591               if (BE (mctx->state_log[dest_str_idx] == NULL
2592                       && err != REG_NOERROR, 0))
2593                 goto free_return;
2594             }
2595           /* We need to check recursively if the backreference can epsilon
2596              transit.  */
2597           if (subexp_len == 0
2598               && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2599             {
2600               err = check_subexp_matching_top (mctx, new_dest_nodes,
2601                                                cur_str_idx);
2602               if (BE (err != REG_NOERROR, 0))
2603                 goto free_return;
2604               err = transit_state_bkref (mctx, new_dest_nodes);
2605               if (BE (err != REG_NOERROR, 0))
2606                 goto free_return;
2607             }
2608         }
2609     }
2610   err = REG_NOERROR;
2611  free_return:
2612   return err;
2613 }
2614
2615 /* Enumerate all the candidates which the backreference BKREF_NODE can match
2616    at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2617    Note that we might collect inappropriate candidates here.
2618    However, the cost of checking them strictly here is too high, then we
2619    delay these checking for prune_impossible_nodes().  */
2620
2621 static reg_errcode_t
2622 internal_function
2623 get_subexp (re_match_context_t *mctx, int bkref_node, int bkref_str_idx)
2624 {
2625   re_dfa_t *const dfa = mctx->dfa;
2626   int subexp_num, sub_top_idx;
2627   const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2628   /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX.  */
2629   int cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2630   if (cache_idx != -1)
2631     {
2632       const struct re_backref_cache_entry *entry = mctx->bkref_ents + cache_idx;
2633       do
2634         if (entry->node == bkref_node)
2635           return REG_NOERROR; /* We already checked it.  */
2636       while (entry++->more);
2637     }
2638
2639   subexp_num = dfa->nodes[bkref_node].opr.idx;
2640
2641   /* For each sub expression  */
2642   for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2643     {
2644       reg_errcode_t err;
2645       re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2646       re_sub_match_last_t *sub_last;
2647       int sub_last_idx, sl_str, bkref_str_off;
2648
2649       if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2650         continue; /* It isn't related.  */
2651
2652       sl_str = sub_top->str_idx;
2653       bkref_str_off = bkref_str_idx;
2654       /* At first, check the last node of sub expressions we already
2655          evaluated.  */
2656       for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2657         {
2658           int sl_str_diff;
2659           sub_last = sub_top->lasts[sub_last_idx];
2660           sl_str_diff = sub_last->str_idx - sl_str;
2661           /* The matched string by the sub expression match with the substring
2662              at the back reference?  */
2663           if (sl_str_diff > 0)
2664             {
2665               if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))
2666                 {
2667                   /* Not enough chars for a successful match.  */
2668                   if (bkref_str_off + sl_str_diff > mctx->input.len)
2669                     break;
2670
2671                   err = clean_state_log_if_needed (mctx,
2672                                                    bkref_str_off
2673                                                    + sl_str_diff);
2674                   if (BE (err != REG_NOERROR, 0))
2675                     return err;
2676                   buf = (const char *) re_string_get_buffer (&mctx->input);
2677                 }
2678               if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2679                 break; /* We don't need to search this sub expression any more.  */
2680             }
2681           bkref_str_off += sl_str_diff;
2682           sl_str += sl_str_diff;
2683           err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2684                                 bkref_str_idx);
2685
2686           /* Reload buf, since the preceding call might have reallocated
2687              the buffer.  */
2688           buf = (const char *) re_string_get_buffer (&mctx->input);
2689
2690           if (err == REG_NOMATCH)
2691             continue;
2692           if (BE (err != REG_NOERROR, 0))
2693             return err;
2694         }
2695
2696       if (sub_last_idx < sub_top->nlasts)
2697         continue;
2698       if (sub_last_idx > 0)
2699         ++sl_str;
2700       /* Then, search for the other last nodes of the sub expression.  */
2701       for (; sl_str <= bkref_str_idx; ++sl_str)
2702         {
2703           int cls_node, sl_str_off;
2704           const re_node_set *nodes;
2705           sl_str_off = sl_str - sub_top->str_idx;
2706           /* The matched string by the sub expression match with the substring
2707              at the back reference?  */
2708           if (sl_str_off > 0)
2709             {
2710               if (BE (bkref_str_off >= mctx->input.valid_len, 0))
2711                 {
2712                   /* If we are at the end of the input, we cannot match.  */
2713                   if (bkref_str_off >= mctx->input.len)
2714                     break;
2715
2716                   err = extend_buffers (mctx);
2717                   if (BE (err != REG_NOERROR, 0))
2718                     return err;
2719
2720                   buf = (const char *) re_string_get_buffer (&mctx->input);
2721                 }
2722               if (buf [bkref_str_off++] != buf[sl_str - 1])
2723                 break; /* We don't need to search this sub expression
2724                           any more.  */
2725             }
2726           if (mctx->state_log[sl_str] == NULL)
2727             continue;
2728           /* Does this state have a ')' of the sub expression?  */
2729           nodes = &mctx->state_log[sl_str]->nodes;
2730           cls_node = find_subexp_node (dfa, nodes, subexp_num, OP_CLOSE_SUBEXP);
2731           if (cls_node == -1)
2732             continue; /* No.  */
2733           if (sub_top->path == NULL)
2734             {
2735               sub_top->path = re_calloc (state_array_t,
2736                                          sl_str - sub_top->str_idx + 1);
2737               if (sub_top->path == NULL)
2738                 return REG_ESPACE;
2739             }
2740           /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2741              in the current context?  */
2742           err = check_arrival (mctx, sub_top->path, sub_top->node,
2743                                sub_top->str_idx, cls_node, sl_str, OP_CLOSE_SUBEXP);
2744           if (err == REG_NOMATCH)
2745               continue;
2746           if (BE (err != REG_NOERROR, 0))
2747               return err;
2748           sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2749           if (BE (sub_last == NULL, 0))
2750             return REG_ESPACE;
2751           err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2752                                 bkref_str_idx);
2753           if (err == REG_NOMATCH)
2754             continue;
2755         }
2756     }
2757   return REG_NOERROR;
2758 }
2759
2760 /* Helper functions for get_subexp().  */
2761
2762 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2763    If it can arrive, register the sub expression expressed with SUB_TOP
2764    and SUB_LAST.  */
2765
2766 static reg_errcode_t
2767 internal_function
2768 get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top,
2769                 re_sub_match_last_t *sub_last, int bkref_node, int bkref_str)
2770 {
2771   reg_errcode_t err;
2772   int to_idx;
2773   /* Can the subexpression arrive the back reference?  */
2774   err = check_arrival (mctx, &sub_last->path, sub_last->node,
2775                        sub_last->str_idx, bkref_node, bkref_str, OP_OPEN_SUBEXP);
2776   if (err != REG_NOERROR)
2777     return err;
2778   err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2779                              sub_last->str_idx);
2780   if (BE (err != REG_NOERROR, 0))
2781     return err;
2782   to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2783   return clean_state_log_if_needed (mctx, to_idx);
2784 }
2785
2786 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2787    Search '(' if FL_OPEN, or search ')' otherwise.
2788    TODO: This function isn't efficient...
2789          Because there might be more than one nodes whose types are
2790          OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2791          nodes.
2792          E.g. RE: (a){2}  */
2793
2794 static int
2795 internal_function
2796 find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
2797                   int subexp_idx, int type)
2798 {
2799   int cls_idx;
2800   for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2801     {
2802       int cls_node = nodes->elems[cls_idx];
2803       const re_token_t *node = dfa->nodes + cls_node;
2804       if (node->type == type
2805           && node->opr.idx == subexp_idx)
2806         return cls_node;
2807     }
2808   return -1;
2809 }
2810
2811 /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2812    LAST_NODE at LAST_STR.  We record the path onto PATH since it will be
2813    heavily reused.
2814    Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise.  */
2815
2816 static reg_errcode_t
2817 internal_function
2818 check_arrival (re_match_context_t *mctx, state_array_t *path,
2819                int top_node, int top_str, int last_node, int last_str,
2820                int type)
2821 {
2822   re_dfa_t *const dfa = mctx->dfa;
2823   reg_errcode_t err;
2824   int subexp_num, backup_cur_idx, str_idx, null_cnt;
2825   re_dfastate_t *cur_state = NULL;
2826   re_node_set *cur_nodes, next_nodes;
2827   re_dfastate_t **backup_state_log;
2828   unsigned int context;
2829
2830   subexp_num = dfa->nodes[top_node].opr.idx;
2831   /* Extend the buffer if we need.  */
2832   if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
2833     {
2834       re_dfastate_t **new_array;
2835       int old_alloc = path->alloc;
2836       path->alloc += last_str + mctx->max_mb_elem_len + 1;
2837       new_array = re_realloc (path->array, re_dfastate_t *, path->alloc);
2838       if (new_array == NULL)
2839         {
2840           path->alloc = old_alloc;
2841           return REG_ESPACE;
2842         }
2843       path->array = new_array;
2844       memset (new_array + old_alloc, '\0',
2845               sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2846     }
2847
2848   str_idx = path->next_idx == 0 ? top_str : path->next_idx;
2849
2850   /* Temporary modify MCTX.  */
2851   backup_state_log = mctx->state_log;
2852   backup_cur_idx = mctx->input.cur_idx;
2853   mctx->state_log = path->array;
2854   mctx->input.cur_idx = str_idx;
2855
2856   /* Setup initial node set.  */
2857   context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2858   if (str_idx == top_str)
2859     {
2860       err = re_node_set_init_1 (&next_nodes, top_node);
2861       if (BE (err != REG_NOERROR, 0))
2862         return err;
2863       err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2864       if (BE (err != REG_NOERROR, 0))
2865         {
2866           re_node_set_free (&next_nodes);
2867           return err;
2868         }
2869     }
2870   else
2871     {
2872       cur_state = mctx->state_log[str_idx];
2873       if (cur_state && cur_state->has_backref)
2874         {
2875           err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2876           if (BE ( err != REG_NOERROR, 0))
2877             return err;
2878         }
2879       else
2880         re_node_set_init_empty (&next_nodes);
2881     }
2882   if (str_idx == top_str || (cur_state && cur_state->has_backref))
2883     {
2884       if (next_nodes.nelem)
2885         {
2886           err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2887                                     subexp_num, type);
2888           if (BE ( err != REG_NOERROR, 0))
2889             {
2890               re_node_set_free (&next_nodes);
2891               return err;
2892             }
2893         }
2894       cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2895       if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2896         {
2897           re_node_set_free (&next_nodes);
2898           return err;
2899         }
2900       mctx->state_log[str_idx] = cur_state;
2901     }
2902
2903   for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2904     {
2905       re_node_set_empty (&next_nodes);
2906       if (mctx->state_log[str_idx + 1])
2907         {
2908           err = re_node_set_merge (&next_nodes,
2909                                    &mctx->state_log[str_idx + 1]->nodes);
2910           if (BE (err != REG_NOERROR, 0))
2911             {
2912               re_node_set_free (&next_nodes);
2913               return err;
2914             }
2915         }
2916       if (cur_state)
2917         {
2918           err = check_arrival_add_next_nodes (mctx, str_idx,
2919                                               &cur_state->non_eps_nodes, &next_nodes);
2920           if (BE (err != REG_NOERROR, 0))
2921             {
2922               re_node_set_free (&next_nodes);
2923               return err;
2924             }
2925         }
2926       ++str_idx;
2927       if (next_nodes.nelem)
2928         {
2929           err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2930           if (BE (err != REG_NOERROR, 0))
2931             {
2932               re_node_set_free (&next_nodes);
2933               return err;
2934             }
2935           err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2936                                     subexp_num, type);
2937           if (BE ( err != REG_NOERROR, 0))
2938             {
2939               re_node_set_free (&next_nodes);
2940               return err;
2941             }
2942         }
2943       context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2944       cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2945       if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2946         {
2947           re_node_set_free (&next_nodes);
2948           return err;
2949         }
2950       mctx->state_log[str_idx] = cur_state;
2951       null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
2952     }
2953   re_node_set_free (&next_nodes);
2954   cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
2955                : &mctx->state_log[last_str]->nodes);
2956   path->next_idx = str_idx;
2957
2958   /* Fix MCTX.  */
2959   mctx->state_log = backup_state_log;
2960   mctx->input.cur_idx = backup_cur_idx;
2961
2962   /* Then check the current node set has the node LAST_NODE.  */
2963   if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
2964     return REG_NOERROR;
2965
2966   return REG_NOMATCH;
2967 }
2968
2969 /* Helper functions for check_arrival.  */
2970
2971 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
2972    to NEXT_NODES.
2973    TODO: This function is similar to the functions transit_state*(),
2974          however this function has many additional works.
2975          Can't we unify them?  */
2976
2977 static reg_errcode_t
2978 internal_function
2979 check_arrival_add_next_nodes (re_match_context_t *mctx, int str_idx,
2980                               re_node_set *cur_nodes,
2981                               re_node_set *next_nodes)
2982 {
2983   re_dfa_t *const dfa = mctx->dfa;
2984   int result;
2985   int cur_idx;
2986   reg_errcode_t err;
2987   re_node_set union_set;
2988   re_node_set_init_empty (&union_set);
2989   for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
2990     {
2991       int naccepted = 0;
2992       int cur_node = cur_nodes->elems[cur_idx];
2993 #ifdef DEBUG
2994       re_token_type_t type = dfa->nodes[cur_node].type;
2995       assert (!IS_EPSILON_NODE (type));
2996 #endif
2997 #ifdef RE_ENABLE_I18N
2998       /* If the node may accept `multi byte'.  */
2999       if (dfa->nodes[cur_node].accept_mb)
3000         {
3001           naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
3002                                                str_idx);
3003           if (naccepted > 1)
3004             {
3005               re_dfastate_t *dest_state;
3006               int next_node = dfa->nexts[cur_node];
3007               int next_idx = str_idx + naccepted;
3008               dest_state = mctx->state_log[next_idx];
3009               re_node_set_empty (&union_set);
3010               if (dest_state)
3011                 {
3012                   err = re_node_set_merge (&union_set, &dest_state->nodes);
3013                   if (BE (err != REG_NOERROR, 0))
3014                     {
3015                       re_node_set_free (&union_set);
3016                       return err;
3017                     }
3018                 }
3019               result = re_node_set_insert (&union_set, next_node);
3020               if (BE (result < 0, 0))
3021                 {
3022                   re_node_set_free (&union_set);
3023                   return REG_ESPACE;
3024                 }
3025               mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3026                                                             &union_set);
3027               if (BE (mctx->state_log[next_idx] == NULL
3028                       && err != REG_NOERROR, 0))
3029                 {
3030                   re_node_set_free (&union_set);
3031                   return err;
3032                 }
3033             }
3034         }
3035 #endif /* RE_ENABLE_I18N */
3036       if (naccepted
3037           || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3038         {
3039           result = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3040           if (BE (result < 0, 0))
3041             {
3042               re_node_set_free (&union_set);
3043               return REG_ESPACE;
3044             }
3045         }
3046     }
3047   re_node_set_free (&union_set);
3048   return REG_NOERROR;
3049 }
3050
3051 /* For all the nodes in CUR_NODES, add the epsilon closures of them to
3052    CUR_NODES, however exclude the nodes which are:
3053     - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3054     - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3055 */
3056
3057 static reg_errcode_t
3058 internal_function
3059 check_arrival_expand_ecl (re_dfa_t *dfa, re_node_set *cur_nodes,
3060                           int ex_subexp, int type)
3061 {
3062   reg_errcode_t err;
3063   int idx, outside_node;
3064   re_node_set new_nodes;
3065 #ifdef DEBUG
3066   assert (cur_nodes->nelem);
3067 #endif
3068   err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3069   if (BE (err != REG_NOERROR, 0))
3070     return err;
3071   /* Create a new node set NEW_NODES with the nodes which are epsilon
3072      closures of the node in CUR_NODES.  */
3073
3074   for (idx = 0; idx < cur_nodes->nelem; ++idx)
3075     {
3076       int cur_node = cur_nodes->elems[idx];
3077       re_node_set *eclosure = dfa->eclosures + cur_node;
3078       outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3079       if (outside_node == -1)
3080         {
3081           /* There are no problematic nodes, just merge them.  */
3082           err = re_node_set_merge (&new_nodes, eclosure);
3083           if (BE (err != REG_NOERROR, 0))
3084             {
3085               re_node_set_free (&new_nodes);
3086               return err;
3087             }
3088         }
3089       else
3090         {
3091           /* There are problematic nodes, re-calculate incrementally.  */
3092           err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3093                                               ex_subexp, type);
3094           if (BE (err != REG_NOERROR, 0))
3095             {
3096               re_node_set_free (&new_nodes);
3097               return err;
3098             }
3099         }
3100     }
3101   re_node_set_free (cur_nodes);
3102   *cur_nodes = new_nodes;
3103   return REG_NOERROR;
3104 }
3105
3106 /* Helper function for check_arrival_expand_ecl.
3107    Check incrementally the epsilon closure of TARGET, and if it isn't
3108    problematic append it to DST_NODES.  */
3109
3110 static reg_errcode_t
3111 internal_function
3112 check_arrival_expand_ecl_sub (re_dfa_t *dfa, re_node_set *dst_nodes,
3113                               int target, int ex_subexp, int type)
3114 {
3115   int cur_node;
3116   for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3117     {
3118       int err;
3119
3120       if (dfa->nodes[cur_node].type == type
3121           && dfa->nodes[cur_node].opr.idx == ex_subexp)
3122         {
3123           if (type == OP_CLOSE_SUBEXP)
3124             {
3125               err = re_node_set_insert (dst_nodes, cur_node);
3126               if (BE (err == -1, 0))
3127                 return REG_ESPACE;
3128             }
3129           break;
3130         }
3131       err = re_node_set_insert (dst_nodes, cur_node);
3132       if (BE (err == -1, 0))
3133         return REG_ESPACE;
3134       if (dfa->edests[cur_node].nelem == 0)
3135         break;
3136       if (dfa->edests[cur_node].nelem == 2)
3137         {
3138           reg_errcode_t ret =
3139             check_arrival_expand_ecl_sub (dfa, dst_nodes,
3140                                           dfa->edests[cur_node].elems[1],
3141                                           ex_subexp, type);
3142           if (BE (ret != REG_NOERROR, 0))
3143             return ret;
3144         }
3145       cur_node = dfa->edests[cur_node].elems[0];
3146     }
3147   return REG_NOERROR;
3148 }
3149
3150
3151 /* For all the back references in the current state, calculate the
3152    destination of the back references by the appropriate entry
3153    in MCTX->BKREF_ENTS.  */
3154
3155 static reg_errcode_t
3156 internal_function
3157 expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes,
3158                     int cur_str, int subexp_num, int type)
3159 {
3160   re_dfa_t *const dfa = mctx->dfa;
3161   reg_errcode_t err;
3162   int cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3163   struct re_backref_cache_entry *ent;
3164
3165   if (cache_idx_start == -1)
3166     return REG_NOERROR;
3167
3168  restart:
3169   ent = mctx->bkref_ents + cache_idx_start;
3170   do
3171     {
3172       int to_idx, next_node;
3173
3174       /* Is this entry ENT is appropriate?  */
3175       if (!re_node_set_contains (cur_nodes, ent->node))
3176         continue; /* No.  */
3177
3178       to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3179       /* Calculate the destination of the back reference, and append it
3180          to MCTX->STATE_LOG.  */
3181       if (to_idx == cur_str)
3182         {
3183           /* The backreference did epsilon transit, we must re-check all the
3184              node in the current state.  */
3185           re_node_set new_dests;
3186           reg_errcode_t err2, err3;
3187           next_node = dfa->edests[ent->node].elems[0];
3188           if (re_node_set_contains (cur_nodes, next_node))
3189             continue;
3190           err = re_node_set_init_1 (&new_dests, next_node);
3191           err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3192           err3 = re_node_set_merge (cur_nodes, &new_dests);
3193           re_node_set_free (&new_dests);
3194           if (BE (err != REG_NOERROR || err2 != REG_NOERROR
3195                   || err3 != REG_NOERROR, 0))
3196             {
3197               err = (err != REG_NOERROR ? err
3198                      : (err2 != REG_NOERROR ? err2 : err3));
3199               return err;
3200             }
3201           /* TODO: It is still inefficient...  */
3202           goto restart;
3203         }
3204       else
3205         {
3206           re_node_set union_set;
3207           next_node = dfa->nexts[ent->node];
3208           if (mctx->state_log[to_idx])
3209             {
3210               int ret;
3211               if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3212                                         next_node))
3213                 continue;
3214               err = re_node_set_init_copy (&union_set,
3215                                            &mctx->state_log[to_idx]->nodes);
3216               ret = re_node_set_insert (&union_set, next_node);
3217               if (BE (err != REG_NOERROR || ret < 0, 0))
3218                 {
3219                   re_node_set_free (&union_set);
3220                   err = err != REG_NOERROR ? err : REG_ESPACE;
3221                   return err;
3222                 }
3223             }
3224           else
3225             {
3226               err = re_node_set_init_1 (&union_set, next_node);
3227               if (BE (err != REG_NOERROR, 0))
3228                 return err;
3229             }
3230           mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3231           re_node_set_free (&union_set);
3232           if (BE (mctx->state_log[to_idx] == NULL
3233                   && err != REG_NOERROR, 0))
3234             return err;
3235         }
3236     }
3237   while (ent++->more);
3238   return REG_NOERROR;
3239 }
3240
3241 /* Build transition table for the state.
3242    Return 1 if succeeded, otherwise return NULL.  */
3243
3244 static int
3245 internal_function
3246 build_trtable (re_dfa_t *dfa, re_dfastate_t *state)
3247 {
3248   reg_errcode_t err;
3249   int i, j, ch, need_word_trtable = 0;
3250   unsigned int elem, mask;
3251   int dests_node_malloced = 0, dest_states_malloced = 0;
3252   int ndests; /* Number of the destination states from `state'.  */
3253   re_dfastate_t **trtable;
3254   re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3255   re_node_set follows, *dests_node;
3256   bitset *dests_ch;
3257   bitset acceptable;
3258
3259   /* We build DFA states which corresponds to the destination nodes
3260      from `state'.  `dests_node[i]' represents the nodes which i-th
3261      destination state contains, and `dests_ch[i]' represents the
3262      characters which i-th destination state accepts.  */
3263   if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX))
3264     dests_node = (re_node_set *)
3265       alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX);
3266   else
3267     {
3268       dests_node = (re_node_set *)
3269         malloc ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX);
3270       if (BE (dests_node == NULL, 0))
3271         return 0;
3272       dests_node_malloced = 1;
3273     }
3274   dests_ch = (bitset *) (dests_node + SBC_MAX);
3275
3276   /* Initialize transiton table.  */
3277   state->word_trtable = state->trtable = NULL;
3278
3279   /* At first, group all nodes belonging to `state' into several
3280      destinations.  */
3281   ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3282   if (BE (ndests <= 0, 0))
3283     {
3284       if (dests_node_malloced)
3285         free (dests_node);
3286       /* Return 0 in case of an error, 1 otherwise.  */
3287       if (ndests == 0)
3288         {
3289           state->trtable = re_calloc (re_dfastate_t *, SBC_MAX);
3290           return 1;
3291         }
3292       return 0;
3293     }
3294
3295   err = re_node_set_alloc (&follows, ndests + 1);
3296   if (BE (err != REG_NOERROR, 0))
3297     goto out_free;
3298
3299   if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX
3300                          + ndests * 3 * sizeof (re_dfastate_t *)))
3301     dest_states = (re_dfastate_t **)
3302       alloca (ndests * 3 * sizeof (re_dfastate_t *));
3303   else
3304     {
3305       dest_states = (re_dfastate_t **)
3306         malloc (ndests * 3 * sizeof (re_dfastate_t *));
3307       if (BE (dest_states == NULL, 0))
3308         {
3309 out_free:
3310           if (dest_states_malloced)
3311             free (dest_states);
3312           re_node_set_free (&follows);
3313           for (i = 0; i < ndests; ++i)
3314             re_node_set_free (dests_node + i);
3315           if (dests_node_malloced)
3316             free (dests_node);
3317           return 0;
3318         }
3319       dest_states_malloced = 1;
3320     }
3321   dest_states_word = dest_states + ndests;
3322   dest_states_nl = dest_states_word + ndests;
3323   bitset_empty (acceptable);
3324
3325   /* Then build the states for all destinations.  */
3326   for (i = 0; i < ndests; ++i)
3327     {
3328       int next_node;
3329       re_node_set_empty (&follows);
3330       /* Merge the follows of this destination states.  */
3331       for (j = 0; j < dests_node[i].nelem; ++j)
3332         {
3333           next_node = dfa->nexts[dests_node[i].elems[j]];
3334           if (next_node != -1)
3335             {
3336               err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3337               if (BE (err != REG_NOERROR, 0))
3338                 goto out_free;
3339             }
3340         }
3341       dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3342       if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
3343         goto out_free;
3344       /* If the new state has context constraint,
3345          build appropriate states for these contexts.  */
3346       if (dest_states[i]->has_constraint)
3347         {
3348           dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3349                                                           CONTEXT_WORD);
3350           if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
3351             goto out_free;
3352
3353           if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
3354             need_word_trtable = 1;
3355
3356           dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3357                                                         CONTEXT_NEWLINE);
3358           if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
3359             goto out_free;
3360         }
3361       else
3362         {
3363           dest_states_word[i] = dest_states[i];
3364           dest_states_nl[i] = dest_states[i];
3365         }
3366       bitset_merge (acceptable, dests_ch[i]);
3367     }
3368
3369   if (!BE (need_word_trtable, 0))
3370     {
3371       /* We don't care about whether the following character is a word
3372          character, or we are in a single-byte character set so we can
3373          discern by looking at the character code: allocate a
3374          256-entry transition table.  */
3375       trtable = state->trtable = re_calloc (re_dfastate_t *, SBC_MAX);
3376       if (BE (trtable == NULL, 0))
3377         goto out_free;
3378
3379       /* For all characters ch...:  */
3380       for (i = 0; i < BITSET_UINTS; ++i)
3381         for (ch = i * UINT_BITS, elem = acceptable[i], mask = 1;
3382              elem;
3383              mask <<= 1, elem >>= 1, ++ch)
3384           if (BE (elem & 1, 0))
3385             {
3386               /* There must be exactly one destination which accepts
3387                  character ch.  See group_nodes_into_DFAstates.  */
3388               for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3389                 ;
3390
3391               /* j-th destination accepts the word character ch.  */
3392               if (dfa->word_char[i] & mask)
3393                 trtable[ch] = dest_states_word[j];
3394               else
3395                 trtable[ch] = dest_states[j];
3396             }
3397     }
3398   else
3399     {
3400       /* We care about whether the following character is a word
3401          character, and we are in a multi-byte character set: discern
3402          by looking at the character code: build two 256-entry
3403          transition tables, one starting at trtable[0] and one
3404          starting at trtable[SBC_MAX].  */
3405       trtable = state->word_trtable = re_calloc (re_dfastate_t *, 2 * SBC_MAX);
3406       if (BE (trtable == NULL, 0))
3407         goto out_free;
3408
3409       /* For all characters ch...:  */
3410       for (i = 0; i < BITSET_UINTS; ++i)
3411         for (ch = i * UINT_BITS, elem = acceptable[i], mask = 1;
3412              elem;
3413              mask <<= 1, elem >>= 1, ++ch)
3414           if (BE (elem & 1, 0))
3415             {
3416               /* There must be exactly one destination which accepts
3417                  character ch.  See group_nodes_into_DFAstates.  */
3418               for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3419                 ;
3420
3421               /* j-th destination accepts the word character ch.  */
3422               trtable[ch] = dest_states[j];
3423               trtable[ch + SBC_MAX] = dest_states_word[j];
3424             }
3425     }
3426
3427   /* new line */
3428   if (bitset_contain (acceptable, NEWLINE_CHAR))
3429     {
3430       /* The current state accepts newline character.  */
3431       for (j = 0; j < ndests; ++j)
3432         if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3433           {
3434             /* k-th destination accepts newline character.  */
3435             trtable[NEWLINE_CHAR] = dest_states_nl[j];
3436             if (need_word_trtable)
3437               trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3438             /* There must be only one destination which accepts
3439                newline.  See group_nodes_into_DFAstates.  */
3440             break;
3441           }
3442     }
3443
3444   if (dest_states_malloced)
3445     free (dest_states);
3446
3447   re_node_set_free (&follows);
3448   for (i = 0; i < ndests; ++i)
3449     re_node_set_free (dests_node + i);
3450
3451   if (dests_node_malloced)
3452     free (dests_node);
3453
3454   return 1;
3455 }
3456
3457 /* Group all nodes belonging to STATE into several destinations.
3458    Then for all destinations, set the nodes belonging to the destination
3459    to DESTS_NODE[i] and set the characters accepted by the destination
3460    to DEST_CH[i].  This function return the number of destinations.  */
3461
3462 static int
3463 internal_function
3464 group_nodes_into_DFAstates (re_dfa_t *dfa, const re_dfastate_t *state,
3465                             re_node_set *dests_node, bitset *dests_ch)
3466 {
3467   reg_errcode_t err;
3468   int result;
3469   int i, j, k;
3470   int ndests; /* Number of the destinations from `state'.  */
3471   bitset accepts; /* Characters a node can accept.  */
3472   const re_node_set *cur_nodes = &state->nodes;
3473   bitset_empty (accepts);
3474   ndests = 0;
3475
3476   /* For all the nodes belonging to `state',  */
3477   for (i = 0; i < cur_nodes->nelem; ++i)
3478     {
3479       re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3480       re_token_type_t type = node->type;
3481       unsigned int constraint = node->constraint;
3482
3483       /* Enumerate all single byte character this node can accept.  */
3484       if (type == CHARACTER)
3485         bitset_set (accepts, node->opr.c);
3486       else if (type == SIMPLE_BRACKET)
3487         {
3488           bitset_merge (accepts, node->opr.sbcset);
3489         }
3490       else if (type == OP_PERIOD)
3491         {
3492 #ifdef RE_ENABLE_I18N
3493           if (dfa->mb_cur_max > 1)
3494             bitset_merge (accepts, dfa->sb_char);
3495           else
3496 #endif
3497             bitset_set_all (accepts);
3498           if (!(dfa->syntax & REG_DOT_NEWLINE))
3499             bitset_clear (accepts, '\n');
3500           if (dfa->syntax & REG_DOT_NOT_NULL)
3501             bitset_clear (accepts, '\0');
3502         }
3503 #ifdef RE_ENABLE_I18N
3504       else if (type == OP_UTF8_PERIOD)
3505         {
3506           memset (accepts, 255, sizeof (unsigned int) * BITSET_UINTS / 2);
3507           if (!(dfa->syntax & REG_DOT_NEWLINE))
3508             bitset_clear (accepts, '\n');
3509           if (dfa->syntax & REG_DOT_NOT_NULL)
3510             bitset_clear (accepts, '\0');
3511         }
3512 #endif
3513       else
3514         continue;
3515
3516       /* Check the `accepts' and sift the characters which are not
3517          match it the context.  */
3518       if (constraint)
3519         {
3520           if (constraint & NEXT_NEWLINE_CONSTRAINT)
3521             {
3522               int accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3523               bitset_empty (accepts);
3524               if (accepts_newline)
3525                 bitset_set (accepts, NEWLINE_CHAR);
3526               else
3527                 continue;
3528             }
3529           if (constraint & NEXT_ENDBUF_CONSTRAINT)
3530             {
3531               bitset_empty (accepts);
3532               continue;
3533             }
3534
3535           if (constraint & NEXT_WORD_CONSTRAINT)
3536             {
3537               unsigned int any_set = 0;
3538               if (type == CHARACTER && !node->word_char)
3539                 {
3540                   bitset_empty (accepts);
3541                   continue;
3542                 }
3543 #ifdef RE_ENABLE_I18N
3544               if (dfa->mb_cur_max > 1)
3545                 for (j = 0; j < BITSET_UINTS; ++j)
3546                   any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3547               else
3548 #endif
3549                 for (j = 0; j < BITSET_UINTS; ++j)
3550                   any_set |= (accepts[j] &= dfa->word_char[j]);
3551               if (!any_set)
3552                 continue;
3553             }
3554           if (constraint & NEXT_NOTWORD_CONSTRAINT)
3555             {
3556               unsigned int any_set = 0;
3557               if (type == CHARACTER && node->word_char)
3558                 {
3559                   bitset_empty (accepts);
3560                   continue;
3561                 }
3562 #ifdef RE_ENABLE_I18N
3563               if (dfa->mb_cur_max > 1)
3564                 for (j = 0; j < BITSET_UINTS; ++j)
3565                   any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3566               else
3567 #endif
3568                 for (j = 0; j < BITSET_UINTS; ++j)
3569                   any_set |= (accepts[j] &= ~dfa->word_char[j]);
3570               if (!any_set)
3571                 continue;
3572             }
3573         }
3574
3575       /* Then divide `accepts' into DFA states, or create a new
3576          state.  Above, we make sure that accepts is not empty.  */
3577       for (j = 0; j < ndests; ++j)
3578         {
3579           bitset intersec; /* Intersection sets, see below.  */
3580           bitset remains;
3581           /* Flags, see below.  */
3582           int has_intersec, not_subset, not_consumed;
3583
3584           /* Optimization, skip if this state doesn't accept the character.  */
3585           if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3586             continue;
3587
3588           /* Enumerate the intersection set of this state and `accepts'.  */
3589           has_intersec = 0;
3590           for (k = 0; k < BITSET_UINTS; ++k)
3591             has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3592           /* And skip if the intersection set is empty.  */
3593           if (!has_intersec)
3594             continue;
3595
3596           /* Then check if this state is a subset of `accepts'.  */
3597           not_subset = not_consumed = 0;
3598           for (k = 0; k < BITSET_UINTS; ++k)
3599             {
3600               not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3601               not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3602             }
3603
3604           /* If this state isn't a subset of `accepts', create a
3605              new group state, which has the `remains'. */
3606           if (not_subset)
3607             {
3608               bitset_copy (dests_ch[ndests], remains);
3609               bitset_copy (dests_ch[j], intersec);
3610               err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3611               if (BE (err != REG_NOERROR, 0))
3612                 goto error_return;
3613               ++ndests;
3614             }
3615
3616           /* Put the position in the current group. */
3617           result = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3618           if (BE (result < 0, 0))
3619             goto error_return;
3620
3621           /* If all characters are consumed, go to next node. */
3622           if (!not_consumed)
3623             break;
3624         }
3625       /* Some characters remain, create a new group. */
3626       if (j == ndests)
3627         {
3628           bitset_copy (dests_ch[ndests], accepts);
3629           err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3630           if (BE (err != REG_NOERROR, 0))
3631             goto error_return;
3632           ++ndests;
3633           bitset_empty (accepts);
3634         }
3635     }
3636   return ndests;
3637  error_return:
3638   for (j = 0; j < ndests; ++j)
3639     re_node_set_free (dests_node + j);
3640   return -1;
3641 }
3642
3643 #ifdef RE_ENABLE_I18N
3644 /* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3645    Return the number of the bytes the node accepts.
3646    STR_IDX is the current index of the input string.
3647
3648    This function handles the nodes which can accept one character, or
3649    one collating element like '.', '[a-z]', opposite to the other nodes
3650    can only accept one byte.  */
3651
3652 static int
3653 internal_function
3654 check_node_accept_bytes (re_dfa_t *dfa, int node_idx,
3655                          const re_string_t *input, int str_idx)
3656 {
3657   const re_token_t *node = dfa->nodes + node_idx;
3658   int char_len, elem_len;
3659   int i;
3660
3661   if (BE (node->type == OP_UTF8_PERIOD, 0))
3662     {
3663       unsigned char c = re_string_byte_at (input, str_idx), d;
3664       if (BE (c < 0xc2, 1))
3665         return 0;
3666
3667       if (str_idx + 2 > input->len)
3668         return 0;
3669
3670       d = re_string_byte_at (input, str_idx + 1);
3671       if (c < 0xe0)
3672         return (d < 0x80 || d > 0xbf) ? 0 : 2;
3673       else if (c < 0xf0)
3674         {
3675           char_len = 3;
3676           if (c == 0xe0 && d < 0xa0)
3677             return 0;
3678         }
3679       else if (c < 0xf8)
3680         {
3681           char_len = 4;
3682           if (c == 0xf0 && d < 0x90)
3683             return 0;
3684         }
3685       else if (c < 0xfc)
3686         {
3687           char_len = 5;
3688           if (c == 0xf8 && d < 0x88)
3689             return 0;
3690         }
3691       else if (c < 0xfe)
3692         {
3693           char_len = 6;
3694           if (c == 0xfc && d < 0x84)
3695             return 0;
3696         }
3697       else
3698         return 0;
3699
3700       if (str_idx + char_len > input->len)
3701         return 0;
3702
3703       for (i = 1; i < char_len; ++i)
3704         {
3705           d = re_string_byte_at (input, str_idx + i);
3706           if (d < 0x80 || d > 0xbf)
3707             return 0;
3708         }
3709       return char_len;
3710     }
3711
3712   char_len = re_string_char_size_at (input, str_idx);
3713   if (node->type == OP_PERIOD)
3714     {
3715       if (char_len <= 1)
3716         return 0;
3717       /* FIXME: I don't think this if is needed, as both '\n'
3718          and '\0' are char_len == 1.  */
3719       /* '.' accepts any one character except the following two cases.  */
3720       if ((!(dfa->syntax & REG_DOT_NEWLINE) &&
3721            re_string_byte_at (input, str_idx) == '\n') ||
3722           ((dfa->syntax & REG_DOT_NOT_NULL) &&
3723            re_string_byte_at (input, str_idx) == '\0'))
3724         return 0;
3725       return char_len;
3726     }
3727
3728   elem_len = re_string_elem_size_at (input, str_idx);
3729   if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
3730     return 0;
3731
3732   if (node->type == COMPLEX_BRACKET)
3733     {
3734       const re_charset_t *cset = node->opr.mbcset;
3735 # ifdef _LIBC
3736       const unsigned char *pin
3737         = ((const unsigned char *) re_string_get_buffer (input) + str_idx);
3738       int j;
3739       uint32_t nrules;
3740 # endif /* _LIBC */
3741       int match_len = 0;
3742       wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3743                     ? re_string_wchar_at (input, str_idx) : 0);
3744
3745       /* match with multibyte character?  */
3746       for (i = 0; i < cset->nmbchars; ++i)
3747         if (wc == cset->mbchars[i])
3748           {
3749             match_len = char_len;
3750             goto check_node_accept_bytes_match;
3751           }
3752       /* match with character_class?  */
3753       for (i = 0; i < cset->nchar_classes; ++i)
3754         {
3755           wctype_t wt = cset->char_classes[i];
3756           if (__iswctype (wc, wt))
3757             {
3758               match_len = char_len;
3759               goto check_node_accept_bytes_match;
3760             }
3761         }
3762
3763 # ifdef _LIBC
3764       nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3765       if (nrules != 0)
3766         {
3767           unsigned int in_collseq = 0;
3768           const int32_t *table, *indirect;
3769           const unsigned char *weights, *extra;
3770           const char *collseqwc;
3771           int32_t idx;
3772           /* This #include defines a local function!  */
3773 #  include <locale/weight.h>
3774
3775           /* match with collating_symbol?  */
3776           if (cset->ncoll_syms)
3777             extra = (const unsigned char *)
3778               _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3779           for (i = 0; i < cset->ncoll_syms; ++i)
3780             {
3781               const unsigned char *coll_sym = extra + cset->coll_syms[i];
3782               /* Compare the length of input collating element and
3783                  the length of current collating element.  */
3784               if (*coll_sym != elem_len)
3785                 continue;
3786               /* Compare each bytes.  */
3787               for (j = 0; j < *coll_sym; j++)
3788                 if (pin[j] != coll_sym[1 + j])
3789                   break;
3790               if (j == *coll_sym)
3791                 {
3792                   /* Match if every bytes is equal.  */
3793                   match_len = j;
3794                   goto check_node_accept_bytes_match;
3795                 }
3796             }
3797
3798           if (cset->nranges)
3799             {
3800               if (elem_len <= char_len)
3801                 {
3802                   collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3803                   in_collseq = __collseq_table_lookup (collseqwc, wc);
3804                 }
3805               else
3806                 in_collseq = find_collation_sequence_value (pin, elem_len);
3807             }
3808           /* match with range expression?  */
3809           for (i = 0; i < cset->nranges; ++i)
3810             if (cset->range_starts[i] <= in_collseq
3811                 && in_collseq <= cset->range_ends[i])
3812               {
3813                 match_len = elem_len;
3814                 goto check_node_accept_bytes_match;
3815               }
3816
3817           /* match with equivalence_class?  */
3818           if (cset->nequiv_classes)
3819             {
3820               const unsigned char *cp = pin;
3821               table = (const int32_t *)
3822                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3823               weights = (const unsigned char *)
3824                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3825               extra = (const unsigned char *)
3826                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3827               indirect = (const int32_t *)
3828                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3829               idx = findidx (&cp);
3830               if (idx > 0)
3831                 for (i = 0; i < cset->nequiv_classes; ++i)
3832                   {
3833                     int32_t equiv_class_idx = cset->equiv_classes[i];
3834                     size_t weight_len = weights[idx];
3835                     if (weight_len == weights[equiv_class_idx])
3836                       {
3837                         int cnt = 0;
3838                         while (cnt <= weight_len
3839                                && (weights[equiv_class_idx + 1 + cnt]
3840                                    == weights[idx + 1 + cnt]))
3841                           ++cnt;
3842                         if (cnt > weight_len)
3843                           {
3844                             match_len = elem_len;
3845                             goto check_node_accept_bytes_match;
3846                           }
3847                       }
3848                   }
3849             }
3850         }
3851       else
3852 # endif /* _LIBC */
3853         {
3854           /* match with range expression?  */
3855 #if __GNUC__ >= 2
3856           wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'};
3857 #else
3858           wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
3859           cmp_buf[2] = wc;
3860 #endif
3861           for (i = 0; i < cset->nranges; ++i)
3862             {
3863               cmp_buf[0] = cset->range_starts[i];
3864               cmp_buf[4] = cset->range_ends[i];
3865               if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
3866                   && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
3867                 {
3868                   match_len = char_len;
3869                   goto check_node_accept_bytes_match;
3870                 }
3871             }
3872         }
3873     check_node_accept_bytes_match:
3874       if (!cset->non_match)
3875         return match_len;
3876       else
3877         {
3878           if (match_len > 0)
3879             return 0;
3880           else
3881             return (elem_len > char_len) ? elem_len : char_len;
3882         }
3883     }
3884   return 0;
3885 }
3886
3887 # ifdef _LIBC
3888 static unsigned int
3889 find_collation_sequence_value (mbs, mbs_len)
3890     const unsigned char *mbs;
3891     size_t mbs_len;
3892 {
3893   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3894   if (nrules == 0)
3895     {
3896       if (mbs_len == 1)
3897         {
3898           /* No valid character.  Match it as a single byte character.  */
3899           const unsigned char *collseq = (const unsigned char *)
3900             _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3901           return collseq[mbs[0]];
3902         }
3903       return UINT_MAX;
3904     }
3905   else
3906     {
3907       int32_t idx;
3908       const unsigned char *extra = (const unsigned char *)
3909         _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3910       int32_t extrasize = (const unsigned char *)
3911         _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
3912
3913       for (idx = 0; idx < extrasize;)
3914         {
3915           int mbs_cnt, found = 0;
3916           int32_t elem_mbs_len;
3917           /* Skip the name of collating element name.  */
3918           idx = idx + extra[idx] + 1;
3919           elem_mbs_len = extra[idx++];
3920           if (mbs_len == elem_mbs_len)
3921             {
3922               for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
3923                 if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
3924                   break;
3925               if (mbs_cnt == elem_mbs_len)
3926                 /* Found the entry.  */
3927                 found = 1;
3928             }
3929           /* Skip the byte sequence of the collating element.  */
3930           idx += elem_mbs_len;
3931           /* Adjust for the alignment.  */
3932           idx = (idx + 3) & ~3;
3933           /* Skip the collation sequence value.  */
3934           idx += sizeof (uint32_t);
3935           /* Skip the wide char sequence of the collating element.  */
3936           idx = idx + sizeof (uint32_t) * (extra[idx] + 1);
3937           /* If we found the entry, return the sequence value.  */
3938           if (found)
3939             return *(uint32_t *) (extra + idx);
3940           /* Skip the collation sequence value.  */
3941           idx += sizeof (uint32_t);
3942         }
3943       return UINT_MAX;
3944     }
3945 }
3946 # endif /* _LIBC */
3947 #endif /* RE_ENABLE_I18N */
3948
3949 /* Check whether the node accepts the byte which is IDX-th
3950    byte of the INPUT.  */
3951
3952 static int
3953 internal_function
3954 check_node_accept (const re_match_context_t *mctx, const re_token_t *node,
3955                    int idx)
3956 {
3957   unsigned char ch;
3958   ch = re_string_byte_at (&mctx->input, idx);
3959   switch (node->type)
3960     {
3961     case CHARACTER:
3962       if (node->opr.c != ch)
3963         return 0;
3964       break;
3965
3966     case SIMPLE_BRACKET:
3967       if (!bitset_contain (node->opr.sbcset, ch))
3968         return 0;
3969       break;
3970
3971 #ifdef RE_ENABLE_I18N
3972     case OP_UTF8_PERIOD:
3973       if (ch >= 0x80)
3974         return 0;
3975       /* FALLTHROUGH */
3976 #endif
3977     case OP_PERIOD:
3978       if ((ch == '\n' && !(mctx->dfa->syntax & REG_DOT_NEWLINE))
3979           || (ch == '\0' && (mctx->dfa->syntax & REG_DOT_NOT_NULL)))
3980         return 0;
3981       break;
3982
3983     default:
3984       return 0;
3985     }
3986
3987   if (node->constraint)
3988     {
3989       /* The node has constraints.  Check whether the current context
3990          satisfies the constraints.  */
3991       unsigned int context = re_string_context_at (&mctx->input, idx,
3992                                                    mctx->eflags);
3993       if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
3994         return 0;
3995     }
3996
3997   return 1;
3998 }
3999
4000 /* Extend the buffers, if the buffers have run out.  */
4001
4002 static reg_errcode_t
4003 internal_function
4004 extend_buffers (re_match_context_t *mctx)
4005 {
4006   reg_errcode_t ret;
4007   re_string_t *pstr = &mctx->input;
4008
4009   /* Double the lengthes of the buffers.  */
4010   ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
4011   if (BE (ret != REG_NOERROR, 0))
4012     return ret;
4013
4014   if (mctx->state_log != NULL)
4015     {
4016       /* And double the length of state_log.  */
4017       /* XXX We have no indication of the size of this buffer.  If this
4018          allocation fail we have no indication that the state_log array
4019          does not have the right size.  */
4020       re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
4021                                               pstr->bufs_len + 1);
4022       if (BE (new_array == NULL, 0))
4023         return REG_ESPACE;
4024       mctx->state_log = new_array;
4025     }
4026
4027   /* Then reconstruct the buffers.  */
4028   if (pstr->icase)
4029     {
4030 #ifdef RE_ENABLE_I18N
4031       if (pstr->mb_cur_max > 1)
4032         {
4033           ret = build_wcs_upper_buffer (pstr);
4034           if (BE (ret != REG_NOERROR, 0))
4035             return ret;
4036         }
4037       else
4038 #endif /* RE_ENABLE_I18N  */
4039         build_upper_buffer (pstr);
4040     }
4041   else
4042     {
4043 #ifdef RE_ENABLE_I18N
4044       if (pstr->mb_cur_max > 1)
4045         build_wcs_buffer (pstr);
4046       else
4047 #endif /* RE_ENABLE_I18N  */
4048         {
4049           if (pstr->trans != NULL)
4050             re_string_translate_buffer (pstr);
4051         }
4052     }
4053   return REG_NOERROR;
4054 }
4055
4056 \f
4057 /* Functions for matching context.  */
4058
4059 /* Initialize MCTX.  */
4060
4061 static reg_errcode_t
4062 internal_function
4063 match_ctx_init (re_match_context_t *mctx, int eflags, int n)
4064 {
4065   mctx->eflags = eflags;
4066   mctx->match_last = -1;
4067   if (n > 0)
4068     {
4069       mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4070       mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4071       if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
4072         return REG_ESPACE;
4073     }
4074   /* Already zero-ed by the caller.
4075      else
4076        mctx->bkref_ents = NULL;
4077      mctx->nbkref_ents = 0;
4078      mctx->nsub_tops = 0;  */
4079   mctx->abkref_ents = n;
4080   mctx->max_mb_elem_len = 1;
4081   mctx->asub_tops = n;
4082   return REG_NOERROR;
4083 }
4084
4085 /* Clean the entries which depend on the current input in MCTX.
4086    This function must be invoked when the matcher changes the start index
4087    of the input, or changes the input string.  */
4088
4089 static void
4090 internal_function
4091 match_ctx_clean (re_match_context_t *mctx)
4092 {
4093   int st_idx;
4094   for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4095     {
4096       int sl_idx;
4097       re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4098       for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4099         {
4100           re_sub_match_last_t *last = top->lasts[sl_idx];
4101           re_free (last->path.array);
4102           re_free (last);
4103         }
4104       re_free (top->lasts);
4105       if (top->path)
4106         {
4107           re_free (top->path->array);
4108           re_free (top->path);
4109         }
4110       free (top);
4111     }
4112
4113   mctx->nsub_tops = 0;
4114   mctx->nbkref_ents = 0;
4115 }
4116
4117 /* Free all the memory associated with MCTX.  */
4118
4119 static void
4120 internal_function
4121 match_ctx_free (re_match_context_t *mctx)
4122 {
4123   /* First, free all the memory associated with MCTX->SUB_TOPS.  */
4124   match_ctx_clean (mctx);
4125   re_free (mctx->sub_tops);
4126   re_free (mctx->bkref_ents);
4127 }
4128
4129 /* Add a new backreference entry to MCTX.
4130    Note that we assume that caller never call this function with duplicate
4131    entry, and call with STR_IDX which isn't smaller than any existing entry.
4132 */
4133
4134 static reg_errcode_t
4135 internal_function
4136 match_ctx_add_entry (re_match_context_t *mctx, int node, int str_idx,
4137                      int from, int to)
4138 {
4139   if (mctx->nbkref_ents >= mctx->abkref_ents)
4140     {
4141       struct re_backref_cache_entry* new_entry;
4142       new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4143                               mctx->abkref_ents * 2);
4144       if (BE (new_entry == NULL, 0))
4145         {
4146           re_free (mctx->bkref_ents);
4147           return REG_ESPACE;
4148         }
4149       mctx->bkref_ents = new_entry;
4150       memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4151               sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4152       mctx->abkref_ents *= 2;
4153     }
4154   if (mctx->nbkref_ents > 0
4155       && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4156     mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4157
4158   mctx->bkref_ents[mctx->nbkref_ents].node = node;
4159   mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4160   mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4161   mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4162
4163   /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4164      If bit N is clear, means that this entry won't epsilon-transition to
4165      an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression.  If
4166      it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4167      such node.
4168
4169      A backreference does not epsilon-transition unless it is empty, so set
4170      to all zeros if FROM != TO.  */
4171   mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4172     = (from == to ? -1 : 0);
4173
4174   mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4175   if (mctx->max_mb_elem_len < to - from)
4176     mctx->max_mb_elem_len = to - from;
4177   return REG_NOERROR;
4178 }
4179
4180 /* Search for the first entry which has the same str_idx, or -1 if none is
4181    found.  Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX.  */
4182
4183 static int
4184 internal_function
4185 search_cur_bkref_entry (re_match_context_t *mctx, int str_idx)
4186 {
4187   int left, right, mid, last;
4188   last = right = mctx->nbkref_ents;
4189   for (left = 0; left < right;)
4190     {
4191       mid = (left + right) / 2;
4192       if (mctx->bkref_ents[mid].str_idx < str_idx)
4193         left = mid + 1;
4194       else
4195         right = mid;
4196     }
4197   if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4198     return left;
4199   else
4200     return -1;
4201 }
4202
4203 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4204    at STR_IDX.  */
4205
4206 static reg_errcode_t
4207 internal_function
4208 match_ctx_add_subtop (re_match_context_t *mctx, int node, int str_idx)
4209 {
4210 #ifdef DEBUG
4211   assert (mctx->sub_tops != NULL);
4212   assert (mctx->asub_tops > 0);
4213 #endif
4214   if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
4215     {
4216       int new_asub_tops = mctx->asub_tops * 2;
4217       re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4218                                                    re_sub_match_top_t *,
4219                                                    new_asub_tops);
4220       if (BE (new_array == NULL, 0))
4221         return REG_ESPACE;
4222       mctx->sub_tops = new_array;
4223       mctx->asub_tops = new_asub_tops;
4224     }
4225   mctx->sub_tops[mctx->nsub_tops] = re_calloc (re_sub_match_top_t, 1);
4226   if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0))
4227     return REG_ESPACE;
4228   mctx->sub_tops[mctx->nsub_tops]->node = node;
4229   mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4230   return REG_NOERROR;
4231 }
4232
4233 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4234    at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP.  */
4235
4236 static re_sub_match_last_t *
4237 internal_function
4238 match_ctx_add_sublast (re_sub_match_top_t *subtop, int node, int str_idx)
4239 {
4240   re_sub_match_last_t *new_entry;
4241   if (BE (subtop->nlasts == subtop->alasts, 0))
4242     {
4243       int new_alasts = 2 * subtop->alasts + 1;
4244       re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4245                                                     re_sub_match_last_t *,
4246                                                     new_alasts);
4247       if (BE (new_array == NULL, 0))
4248         return NULL;
4249       subtop->lasts = new_array;
4250       subtop->alasts = new_alasts;
4251     }
4252   new_entry = re_calloc (re_sub_match_last_t, 1);
4253   if (BE (new_entry != NULL, 1))
4254     {
4255       subtop->lasts[subtop->nlasts] = new_entry;
4256       new_entry->node = node;
4257       new_entry->str_idx = str_idx;
4258       ++subtop->nlasts;
4259     }
4260   return new_entry;
4261 }
4262
4263 static void
4264 internal_function
4265 sift_ctx_init (re_sift_context_t *sctx,
4266                re_dfastate_t **sifted_sts,
4267                re_dfastate_t **limited_sts,
4268                int last_node, int last_str_idx)
4269 {
4270   sctx->sifted_states = sifted_sts;
4271   sctx->limited_states = limited_sts;
4272   sctx->last_node = last_node;
4273   sctx->last_str_idx = last_str_idx;
4274   re_node_set_init_empty (&sctx->limits);
4275 }