pintos-os.org Git - pspp/blob - lib/regexec.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    This program is free software; you can redistribute it and/or modify
   7    it under the terms of the GNU General Public License as published by
   8    the Free Software Foundation; either version 2, or (at your option)
   9    any later version.
  10
  11    This program is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14    GNU General Public License for more details.
  15
  16    You should have received a copy of the GNU General Public License along
  17    with this program; if not, write to the Free Software Foundation,
  18    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
  19
  20 static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
  21                                      int n) internal_function;
  22 static void match_ctx_clean (re_match_context_t *mctx) internal_function;
  23 static void match_ctx_free (re_match_context_t *cache) internal_function;
  24 static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, int node,
  25                                           int str_idx, int from, int to)
  26      internal_function;
  27 static int search_cur_bkref_entry (re_match_context_t *mctx, int str_idx)
  28      internal_function;
  29 static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, int node,
  30                                            int str_idx) internal_function;
  31 static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
  32                                                    int node, int str_idx)
  33      internal_function;
  34 static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
  35                            re_dfastate_t **limited_sts, int last_node,
  36                            int last_str_idx)
  37      internal_function;
  38 static reg_errcode_t re_search_internal (const regex_t *preg,
  39                                          const char *string, int length,
  40                                          int start, int range, int stop,
  41                                          size_t nmatch, regmatch_t pmatch[],
  42                                          int eflags) internal_function;
  43 static int re_search_2_stub (struct re_pattern_buffer *bufp,
  44                              const char *string1, int length1,
  45                              const char *string2, int length2,
  46                              int start, int range, struct re_registers *regs,
  47                              int stop, int ret_len) internal_function;
  48 static int re_search_stub (struct re_pattern_buffer *bufp,
  49                            const char *string, int length, int start,
  50                            int range, int stop, struct re_registers *regs,
  51                            int ret_len) internal_function;
  52 static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
  53                               int nregs, int regs_allocated) internal_function;
  54 static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx)
  55      internal_function;
  56 static int check_matching (re_match_context_t *mctx, int fl_longest_match,
  57                            int *p_match_first)
  58      internal_function;
  59 static int check_halt_state_context (const re_match_context_t *mctx,
  60                                      const re_dfastate_t *state, int idx)
  61      internal_function;
  62 static void update_regs (re_dfa_t *dfa, regmatch_t *pmatch,
  63                          regmatch_t *prev_idx_match, int cur_node,
  64                          int cur_idx, int nmatch) internal_function;
  65 static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
  66                                       int str_idx, int dest_node, int nregs,
  67                                       regmatch_t *regs,
  68                                       re_node_set *eps_via_nodes) internal_function;
  69 static reg_errcode_t set_regs (const regex_t *preg,
  70                                const re_match_context_t *mctx,
  71                                size_t nmatch, regmatch_t *pmatch,
  72                                int fl_backtrack) internal_function;
  73 static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs) internal_function;
  74
  75 #ifdef RE_ENABLE_I18N
  76 static int sift_states_iter_mb (const re_match_context_t *mctx,
  77                                 re_sift_context_t *sctx,
  78                                 int node_idx, int str_idx, int max_str_idx) internal_function;
  79 #endif /* RE_ENABLE_I18N */
  80 static reg_errcode_t sift_states_backward (re_match_context_t *mctx,
  81                                            re_sift_context_t *sctx) internal_function;
  82 static reg_errcode_t build_sifted_states (re_match_context_t *mctx,
  83                                           re_sift_context_t *sctx, int str_idx,
  84                                           re_node_set *cur_dest) internal_function;
  85 static reg_errcode_t update_cur_sifted_state (re_match_context_t *mctx,
  86                                               re_sift_context_t *sctx,
  87                                               int str_idx,
  88                                               re_node_set *dest_nodes) internal_function;
  89 static reg_errcode_t add_epsilon_src_nodes (re_dfa_t *dfa,
  90                                             re_node_set *dest_nodes,
  91                                             const re_node_set *candidates) internal_function;
  92 static int check_dst_limits (re_match_context_t *mctx, re_node_set *limits,
  93                              int dst_node, int dst_idx, int src_node,
  94                              int src_idx) internal_function;
  95 static int check_dst_limits_calc_pos_1 (re_match_context_t *mctx,
  96                                         int boundaries, int subexp_idx,
  97                                         int from_node, int bkref_idx) internal_function;
  98 static int check_dst_limits_calc_pos (re_match_context_t *mctx,
  99                                       int limit, int subexp_idx,
 100                                       int node, int str_idx,
 101                                       int bkref_idx) internal_function;
 102 static reg_errcode_t check_subexp_limits (re_dfa_t *dfa,
 103                                           re_node_set *dest_nodes,
 104                                           const re_node_set *candidates,
 105                                           re_node_set *limits,
 106                                           struct re_backref_cache_entry *bkref_ents,
 107                                           int str_idx) internal_function;
 108 static reg_errcode_t sift_states_bkref (re_match_context_t *mctx,
 109                                         re_sift_context_t *sctx,
 110                                         int str_idx, const re_node_set *candidates) internal_function;
 111 static reg_errcode_t merge_state_array (re_dfa_t *dfa, re_dfastate_t **dst,
 112                                         re_dfastate_t **src, int num) internal_function;
 113 static re_dfastate_t *find_recover_state (reg_errcode_t *err,
 114                                          re_match_context_t *mctx) internal_function;
 115 static re_dfastate_t *transit_state (reg_errcode_t *err,
 116                                      re_match_context_t *mctx,
 117                                      re_dfastate_t *state) internal_function;
 118 static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
 119                                             re_match_context_t *mctx,
 120                                             re_dfastate_t *next_state) internal_function;
 121 static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
 122                                                 re_node_set *cur_nodes,
 123                                                 int str_idx) internal_function;
 124 #if 0
 125 static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
 126                                         re_match_context_t *mctx,
 127                                         re_dfastate_t *pstate) internal_function;
 128 #endif
 129 #ifdef RE_ENABLE_I18N
 130 static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
 131                                        re_dfastate_t *pstate) internal_function;
 132 #endif /* RE_ENABLE_I18N */
 133 static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
 134                                           const re_node_set *nodes) internal_function;
 135 static reg_errcode_t get_subexp (re_match_context_t *mctx,
 136                                  int bkref_node, int bkref_str_idx) internal_function;
 137 static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
 138                                      const re_sub_match_top_t *sub_top,
 139                                      re_sub_match_last_t *sub_last,
 140                                      int bkref_node, int bkref_str) internal_function;
 141 static int find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
 142                              int subexp_idx, int type) internal_function;
 143 static reg_errcode_t check_arrival (re_match_context_t *mctx,
 144                                     state_array_t *path, int top_node,
 145                                     int top_str, int last_node, int last_str,
 146                                     int type) internal_function;
 147 static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
 148                                                    int str_idx,
 149                                                    re_node_set *cur_nodes,
 150                                                    re_node_set *next_nodes) internal_function;
 151 static reg_errcode_t check_arrival_expand_ecl (re_dfa_t *dfa,
 152                                                re_node_set *cur_nodes,
 153                                                int ex_subexp, int type) internal_function;
 154 static reg_errcode_t check_arrival_expand_ecl_sub (re_dfa_t *dfa,
 155                                                    re_node_set *dst_nodes,
 156                                                    int target, int ex_subexp,
 157                                                    int type) internal_function;
 158 static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
 159                                          re_node_set *cur_nodes, int cur_str,
 160                                          int subexp_num, int type) internal_function;
 161 static int build_trtable (re_dfa_t *dfa,
 162                           re_dfastate_t *state) internal_function;
 163 #ifdef RE_ENABLE_I18N
 164 static int check_node_accept_bytes (re_dfa_t *dfa, int node_idx,
 165                                     const re_string_t *input, int idx) internal_function;
 166 # ifdef _LIBC
 167 static unsigned int find_collation_sequence_value (const unsigned char *mbs,
 168                                                    size_t name_len) internal_function;
 169 # endif /* _LIBC */
 170 #endif /* RE_ENABLE_I18N */
 171 static int group_nodes_into_DFAstates (re_dfa_t *dfa,
 172                                        const re_dfastate_t *state,
 173                                        re_node_set *states_node,
 174                                        bitset *states_ch) internal_function;
 175 static int check_node_accept (const re_match_context_t *mctx,
 176                               const re_token_t *node, int idx) internal_function;
 177 static reg_errcode_t extend_buffers (re_match_context_t *mctx) internal_function;
 178 \f
 179 /* Entry point for POSIX code.  */
 180
 181 /* regexec searches for a given pattern, specified by PREG, in the
 182    string STRING.
 183
 184    If NMATCH is zero or REG_NOSUB was set in the cflags argument to
 185    `regcomp', we ignore PMATCH.  Otherwise, we assume PMATCH has at
 186    least NMATCH elements, and we set them to the offsets of the
 187    corresponding matched substrings.
 188
 189    EFLAGS specifies `execution flags' which affect matching: if
 190    REG_NOTBOL is set, then ^ does not match at the beginning of the
 191    string; if REG_NOTEOL is set, then $ does not match at the end.
 192
 193    We return 0 if we find a match and REG_NOMATCH if not.  */
 194
 195 int
 196 regexec (const regex_t *__restrict preg, const char *__restrict string,
 197          size_t nmatch, regmatch_t pmatch[], int eflags)
 198 {
 199   reg_errcode_t err;
 200   int start, length;
 201 #ifdef _LIBC
 202   re_dfa_t *dfa = (re_dfa_t *)preg->buffer;
 203 #endif
 204
 205   if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
 206     return REG_BADPAT;
 207
 208   if (eflags & REG_STARTEND)
 209     {
 210       start = pmatch[0].rm_so;
 211       length = pmatch[0].rm_eo;
 212     }
 213   else
 214     {
 215       start = 0;
 216       length = strlen (string);
 217     }
 218
 219   __libc_lock_lock (dfa->lock);
 220   if (preg->no_sub)
 221     err = re_search_internal (preg, string, length, start, length - start,
 222                               length, 0, NULL, eflags);
 223   else
 224     err = re_search_internal (preg, string, length, start, length - start,
 225                               length, nmatch, pmatch, eflags);
 226   __libc_lock_unlock (dfa->lock);
 227   return err != REG_NOERROR;
 228 }
 229
 230 #ifdef _LIBC
 231 # include <shlib-compat.h>
 232 versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
 233
 234 # if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
 235 __typeof__ (__regexec) __compat_regexec;
 236
 237 int
 238 attribute_compat_text_section
 239 __compat_regexec (const regex_t *__restrict preg,
 240                   const char *__restrict string, size_t nmatch,
 241                   regmatch_t pmatch[], int eflags)
 242 {
 243   return regexec (preg, string, nmatch, pmatch,
 244                   eflags & (REG_NOTBOL | REG_NOTEOL));
 245 }
 246 compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
 247 # endif
 248 #endif
 249
 250 /* Entry points for GNU code.  */
 251
 252 /* re_match, re_search, re_match_2, re_search_2
 253
 254    The former two functions operate on STRING with length LENGTH,
 255    while the later two operate on concatenation of STRING1 and STRING2
 256    with lengths LENGTH1 and LENGTH2, respectively.
 257
 258    re_match() matches the compiled pattern in BUFP against the string,
 259    starting at index START.
 260
 261    re_search() first tries matching at index START, then it tries to match
 262    starting from index START + 1, and so on.  The last start position tried
 263    is START + RANGE.  (Thus RANGE = 0 forces re_search to operate the same
 264    way as re_match().)
 265
 266    The parameter STOP of re_{match,search}_2 specifies that no match exceeding
 267    the first STOP characters of the concatenation of the strings should be
 268    concerned.
 269
 270    If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
 271    and all groups is stroed in REGS.  (For the "_2" variants, the offsets are
 272    computed relative to the concatenation, not relative to the individual
 273    strings.)
 274
 275    On success, re_match* functions return the length of the match, re_search*
 276    return the position of the start of the match.  Return value -1 means no
 277    match was found and -2 indicates an internal error.  */
 278
 279 int
 280 re_match (struct re_pattern_buffer *bufp, const char *string,
 281           int length, int start, struct re_registers *regs)
 282 {
 283   return re_search_stub (bufp, string, length, start, 0, length, regs, 1);
 284 }
 285 #ifdef _LIBC
 286 weak_alias (__re_match, re_match)
 287 #endif
 288
 289 int
 290 re_search (struct re_pattern_buffer *bufp, const char *string,
 291            int length, int start, int range, struct re_registers *regs)
 292 {
 293   return re_search_stub (bufp, string, length, start, range, length, regs, 0);
 294 }
 295 #ifdef _LIBC
 296 weak_alias (__re_search, re_search)
 297 #endif
 298
 299 int
 300 re_match_2 (struct re_pattern_buffer *bufp,
 301             const char *string1, int length1,
 302             const char *string2, int length2,
 303             int start, struct re_registers *regs, int stop)
 304 {
 305   return re_search_2_stub (bufp, string1, length1, string2, length2,
 306                            start, 0, regs, stop, 1);
 307 }
 308 #ifdef _LIBC
 309 weak_alias (__re_match_2, re_match_2)
 310 #endif
 311
 312 int
 313 re_search_2 (struct re_pattern_buffer *bufp,
 314              const char *string1, int length1,
 315              const char *string2, int length2,
 316              int start, int range, struct re_registers *regs, int stop)
 317 {
 318   return re_search_2_stub (bufp, string1, length1, string2, length2,
 319                            start, range, regs, stop, 0);
 320 }
 321 #ifdef _LIBC
 322 weak_alias (__re_search_2, re_search_2)
 323 #endif
 324
 325 static int
 326 internal_function
 327 re_search_2_stub (struct re_pattern_buffer *bufp,
 328                   const char *string1, int length1,
 329                   const char *string2, int length2,
 330                   int start, int range, struct re_registers *regs, int stop,
 331                   int ret_len)
 332 {
 333   const char *str;
 334   int rval;
 335   int len = length1 + length2;
 336   int free_str = 0;
 337
 338   if (BE (length1 < 0 || length2 < 0 || stop < 0, 0))
 339     return -2;
 340
 341   /* Concatenate the strings.  */
 342   if (length2 > 0)
 343     if (length1 > 0)
 344       {
 345         char *s = re_malloc (char, len);
 346
 347         if (BE (s == NULL, 0))
 348           return -2;
 349         memcpy (s, string1, length1);
 350         memcpy (s + length1, string2, length2);
 351         str = s;
 352         free_str = 1;
 353       }
 354     else
 355       str = string2;
 356   else
 357     str = string1;
 358
 359   rval = re_search_stub (bufp, str, len, start, range, stop, regs,
 360                          ret_len);
 361   if (free_str)
 362     re_free ((char *) str);
 363   return rval;
 364 }
 365
 366 /* The parameters have the same meaning as those of re_search.
 367    Additional parameters:
 368    If RET_LEN is nonzero the length of the match is returned (re_match style);
 369    otherwise the position of the match is returned.  */
 370
 371 static int
 372 internal_function
 373 re_search_stub (struct re_pattern_buffer *bufp,
 374                 const char *string, int length,
 375                 int start, int range, int stop, struct re_registers *regs,
 376                 int ret_len)
 377 {
 378   reg_errcode_t result;
 379   regmatch_t *pmatch;
 380   int nregs, rval;
 381   int eflags = 0;
 382 #ifdef _LIBC
 383   re_dfa_t *dfa = (re_dfa_t *)bufp->buffer;
 384 #endif
 385
 386   /* Check for out-of-range.  */
 387   if (BE (start < 0 || start > length, 0))
 388     return -1;
 389   if (BE (start + range > length, 0))
 390     range = length - start;
 391   else if (BE (start + range < 0, 0))
 392     range = -start;
 393
 394   __libc_lock_lock (dfa->lock);
 395
 396   eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
 397   eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
 398
 399   /* Compile fastmap if we haven't yet.  */
 400   if (range > 0 && bufp->fastmap != NULL && !bufp->fastmap_accurate)
 401     re_compile_fastmap (bufp);
 402
 403   if (BE (bufp->no_sub, 0))
 404     regs = NULL;
 405
 406   /* We need at least 1 register.  */
 407   if (regs == NULL)
 408     nregs = 1;
 409   else if (BE (bufp->regs_allocated == REGS_FIXED &&
 410                regs->num_regs < bufp->re_nsub + 1, 0))
 411     {
 412       nregs = regs->num_regs;
 413       if (BE (nregs < 1, 0))
 414         {
 415           /* Nothing can be copied to regs.  */
 416           regs = NULL;
 417           nregs = 1;
 418         }
 419     }
 420   else
 421     nregs = bufp->re_nsub + 1;
 422   pmatch = re_malloc (regmatch_t, nregs);
 423   if (BE (pmatch == NULL, 0))
 424     {
 425       rval = -2;
 426       goto out;
 427     }
 428
 429   result = re_search_internal (bufp, string, length, start, range, stop,
 430                                nregs, pmatch, eflags);
 431
 432   rval = 0;
 433
 434   /* I hope we needn't fill ther regs with -1's when no match was found.  */
 435   if (result != REG_NOERROR)
 436     rval = -1;
 437   else if (regs != NULL)
 438     {
 439       /* If caller wants register contents data back, copy them.  */
 440       bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
 441                                            bufp->regs_allocated);
 442       if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0))
 443         rval = -2;
 444     }
 445
 446   if (BE (rval == 0, 1))
 447     {
 448       if (ret_len)
 449         {
 450           assert (pmatch[0].rm_so == start);
 451           rval = pmatch[0].rm_eo - start;
 452         }
 453       else
 454         rval = pmatch[0].rm_so;
 455     }
 456   re_free (pmatch);
 457  out:
 458   __libc_lock_unlock (dfa->lock);
 459   return rval;
 460 }
 461
 462 static unsigned
 463 internal_function
 464 re_copy_regs (struct re_registers *regs, regmatch_t *pmatch, int nregs,
 465               int regs_allocated)
 466 {
 467   int rval = REGS_REALLOCATE;
 468   int i;
 469   int need_regs = nregs + 1;
 470   /* We need one extra element beyond `num_regs' for the `-1' marker GNU code
 471      uses.  */
 472
 473   /* Have the register data arrays been allocated?  */
 474   if (regs_allocated == REGS_UNALLOCATED)
 475     { /* No.  So allocate them with malloc.  */
 476       regs->start = re_malloc (regoff_t, need_regs);
 477       regs->end = re_malloc (regoff_t, need_regs);
 478       if (BE (regs->start == NULL, 0) || BE (regs->end == NULL, 0))
 479         return REGS_UNALLOCATED;
 480       regs->num_regs = need_regs;
 481     }
 482   else if (regs_allocated == REGS_REALLOCATE)
 483     { /* Yes.  If we need more elements than were already
 484          allocated, reallocate them.  If we need fewer, just
 485          leave it alone.  */
 486       if (BE (need_regs > regs->num_regs, 0))
 487         {
 488           regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
 489           regoff_t *new_end = re_realloc (regs->end, regoff_t, need_regs);
 490           if (BE (new_start == NULL, 0) || BE (new_end == NULL, 0))
 491             return REGS_UNALLOCATED;
 492           regs->start = new_start;
 493           regs->end = new_end;
 494           regs->num_regs = need_regs;
 495         }
 496     }
 497   else
 498     {
 499       assert (regs_allocated == REGS_FIXED);
 500       /* This function may not be called with REGS_FIXED and nregs too big.  */
 501       assert (regs->num_regs >= nregs);
 502       rval = REGS_FIXED;
 503     }
 504
 505   /* Copy the regs.  */
 506   for (i = 0; i < nregs; ++i)
 507     {
 508       regs->start[i] = pmatch[i].rm_so;
 509       regs->end[i] = pmatch[i].rm_eo;
 510     }
 511   for ( ; i < regs->num_regs; ++i)
 512     regs->start[i] = regs->end[i] = -1;
 513
 514   return rval;
 515 }
 516
 517 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
 518    ENDS.  Subsequent matches using PATTERN_BUFFER and REGS will use
 519    this memory for recording register information.  STARTS and ENDS
 520    must be allocated using the malloc library routine, and must each
 521    be at least NUM_REGS * sizeof (regoff_t) bytes long.
 522
 523    If NUM_REGS == 0, then subsequent matches should allocate their own
 524    register data.
 525
 526    Unless this function is called, the first search or match using
 527    PATTERN_BUFFER will allocate its own register data, without
 528    freeing the old data.  */
 529
 530 void
 531 re_set_registers (struct re_pattern_buffer *bufp, struct re_registers *regs,
 532                   unsigned int num_regs, regoff_t *starts, regoff_t *ends)
 533 {
 534   if (num_regs)
 535     {
 536       bufp->regs_allocated = REGS_REALLOCATE;
 537       regs->num_regs = num_regs;
 538       regs->start = starts;
 539       regs->end = ends;
 540     }
 541   else
 542     {
 543       bufp->regs_allocated = REGS_UNALLOCATED;
 544       regs->num_regs = 0;
 545       regs->start = regs->end = (regoff_t *) 0;
 546     }
 547 }
 548 #ifdef _LIBC
 549 weak_alias (__re_set_registers, re_set_registers)
 550 #endif
 551 \f
 552 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 553    them unless specifically requested.  */
 554
 555 #if defined _REGEX_RE_COMP || defined _LIBC
 556 int
 557 # ifdef _LIBC
 558 weak_function
 559 # endif
 560 re_exec (s)
 561      const char *s;
 562 {
 563   return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
 564 }
 565 #endif /* _REGEX_RE_COMP */
 566 \f
 567 /* Internal entry point.  */
 568
 569 /* Searches for a compiled pattern PREG in the string STRING, whose
 570    length is LENGTH.  NMATCH, PMATCH, and EFLAGS have the same
 571    mingings with regexec.  START, and RANGE have the same meanings
 572    with re_search.
 573    Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
 574    otherwise return the error code.
 575    Note: We assume front end functions already check ranges.
 576    (START + RANGE >= 0 && START + RANGE <= LENGTH)  */
 577
 578 static reg_errcode_t
 579 internal_function
 580 re_search_internal (const regex_t *preg,
 581                     const char *string, int length,
 582                     int start, int range, int stop,
 583                     size_t nmatch, regmatch_t pmatch[],
 584                     int eflags)
 585 {
 586   reg_errcode_t err;
 587   re_dfa_t *dfa = (re_dfa_t *)preg->buffer;
 588   int left_lim, right_lim, incr;
 589   int fl_longest_match, match_first, match_kind, match_last = -1;
 590   int extra_nmatch;
 591   int sb, ch;
 592 #if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
 593   re_match_context_t mctx = { .dfa = dfa };
 594 #else
 595   re_match_context_t mctx;
 596 #endif
 597   char *fastmap = (preg->fastmap != NULL && preg->fastmap_accurate
 598                    && range && !preg->can_be_null) ? preg->fastmap : NULL;
 599   unsigned RE_TRANSLATE_TYPE t = (unsigned RE_TRANSLATE_TYPE) preg->translate;
 600
 601 #if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
 602   memset (&mctx, '\0', sizeof (re_match_context_t));
 603   mctx.dfa = dfa;
 604 #endif
 605
 606   extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
 607   nmatch -= extra_nmatch;
 608
 609   /* Check if the DFA haven't been compiled.  */
 610   if (BE (preg->used == 0 || dfa->init_state == NULL
 611           || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
 612           || dfa->init_state_begbuf == NULL, 0))
 613     return REG_NOMATCH;
 614
 615 #ifdef DEBUG
 616   /* We assume front-end functions already check them.  */
 617   assert (start + range >= 0 && start + range <= length);
 618 #endif
 619
 620   /* If initial states with non-begbuf contexts have no elements,
 621      the regex must be anchored.  If preg->newline_anchor is set,
 622      we'll never use init_state_nl, so do not check it.  */
 623   if (dfa->init_state->nodes.nelem == 0
 624       && dfa->init_state_word->nodes.nelem == 0
 625       && (dfa->init_state_nl->nodes.nelem == 0
 626           || !preg->newline_anchor))
 627     {
 628       if (start != 0 && start + range != 0)
 629         return REG_NOMATCH;
 630       start = range = 0;
 631     }
 632
 633   /* We must check the longest matching, if nmatch > 0.  */
 634   fl_longest_match = (nmatch != 0 || dfa->nbackref);
 635
 636   err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
 637                             preg->translate, preg->syntax & RE_ICASE, dfa);
 638   if (BE (err != REG_NOERROR, 0))
 639     goto free_return;
 640   mctx.input.stop = stop;
 641   mctx.input.raw_stop = stop;
 642   mctx.input.newline_anchor = preg->newline_anchor;
 643
 644   err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
 645   if (BE (err != REG_NOERROR, 0))
 646     goto free_return;
 647
 648   /* We will log all the DFA states through which the dfa pass,
 649      if nmatch > 1, or this dfa has "multibyte node", which is a
 650      back-reference or a node which can accept multibyte character or
 651      multi character collating element.  */
 652   if (nmatch > 1 || dfa->has_mb_node)
 653     {
 654       mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
 655       if (BE (mctx.state_log == NULL, 0))
 656         {
 657           err = REG_ESPACE;
 658           goto free_return;
 659         }
 660     }
 661   else
 662     mctx.state_log = NULL;
 663
 664   match_first = start;
 665   mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
 666                            : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
 667
 668   /* Check incrementally whether of not the input string match.  */
 669   incr = (range < 0) ? -1 : 1;
 670   left_lim = (range < 0) ? start + range : start;
 671   right_lim = (range < 0) ? start : start + range;
 672   sb = dfa->mb_cur_max == 1;
 673   match_kind =
 674     (fastmap
 675      ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
 676         | (range >= 0 ? 2 : 0)
 677         | (t != NULL ? 1 : 0))
 678      : 8);
 679
 680   for (;; match_first += incr)
 681     {
 682       err = REG_NOMATCH;
 683       if (match_first < left_lim || right_lim < match_first)
 684         goto free_return;
 685
 686       /* Advance as rapidly as possible through the string, until we
 687          find a plausible place to start matching.  This may be done
 688          with varying efficiency, so there are various possibilities:
 689          only the most common of them are specialized, in order to
 690          save on code size.  We use a switch statement for speed.  */
 691       switch (match_kind)
 692         {
 693         case 8:
 694           /* No fastmap.  */
 695           break;
 696
 697         case 7:
 698           /* Fastmap with single-byte translation, match forward.  */
 699           while (BE (match_first < right_lim, 1)
 700                  && !fastmap[t[(unsigned char) string[match_first]]])
 701             ++match_first;
 702           goto forward_match_found_start_or_reached_end;
 703
 704         case 6:
 705           /* Fastmap without translation, match forward.  */
 706           while (BE (match_first < right_lim, 1)
 707                  && !fastmap[(unsigned char) string[match_first]])
 708             ++match_first;
 709
 710         forward_match_found_start_or_reached_end:
 711           if (BE (match_first == right_lim, 0))
 712             {
 713               ch = match_first >= length
 714                        ? 0 : (unsigned char) string[match_first];
 715               if (!fastmap[t ? t[ch] : ch])
 716                 goto free_return;
 717             }
 718           break;
 719
 720         case 4:
 721         case 5:
 722           /* Fastmap without multi-byte translation, match backwards.  */
 723           while (match_first >= left_lim)
 724             {
 725               ch = match_first >= length
 726                        ? 0 : (unsigned char) string[match_first];
 727               if (fastmap[t ? t[ch] : ch])
 728                 break;
 729               --match_first;
 730             }
 731           if (match_first < left_lim)
 732             goto free_return;
 733           break;
 734
 735         default:
 736           /* In this case, we can't determine easily the current byte,
 737              since it might be a component byte of a multibyte
 738              character.  Then we use the constructed buffer instead.  */
 739           for (;;)
 740             {
 741               /* If MATCH_FIRST is out of the valid range, reconstruct the
 742                  buffers.  */
 743               unsigned int offset = match_first - mctx.input.raw_mbs_idx;
 744               if (BE (offset >= (unsigned int) mctx.input.valid_raw_len, 0))
 745                 {
 746                   err = re_string_reconstruct (&mctx.input, match_first,
 747                                                eflags);
 748                   if (BE (err != REG_NOERROR, 0))
 749                     goto free_return;
 750
 751                   offset = match_first - mctx.input.raw_mbs_idx;
 752                 }
 753               /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
 754                  Note that MATCH_FIRST must not be smaller than 0.  */
 755               ch = (match_first >= length
 756                     ? 0 : re_string_byte_at (&mctx.input, offset));
 757               if (fastmap[ch])
 758                 break;
 759               match_first += incr;
 760               if (match_first < left_lim || match_first > right_lim)
 761                 {
 762                   err = REG_NOMATCH;
 763                   goto free_return;
 764                 }
 765             }
 766           break;
 767         }
 768
 769       /* Reconstruct the buffers so that the matcher can assume that
 770          the matching starts from the beginning of the buffer.  */
 771       err = re_string_reconstruct (&mctx.input, match_first, eflags);
 772       if (BE (err != REG_NOERROR, 0))
 773         goto free_return;
 774
 775 #ifdef RE_ENABLE_I18N
 776      /* Don't consider this char as a possible match start if it part,
 777         yet isn't the head, of a multibyte character.  */
 778       if (!sb && !re_string_first_byte (&mctx.input, 0))
 779         continue;
 780 #endif
 781
 782       /* It seems to be appropriate one, then use the matcher.  */
 783       /* We assume that the matching starts from 0.  */
 784       mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
 785       match_last = check_matching (&mctx, fl_longest_match,
 786                                    range >= 0 ? &match_first : NULL);
 787       if (match_last != -1)
 788         {
 789           if (BE (match_last == -2, 0))
 790             {
 791               err = REG_ESPACE;
 792               goto free_return;
 793             }
 794           else
 795             {
 796               mctx.match_last = match_last;
 797               if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
 798                 {
 799                   re_dfastate_t *pstate = mctx.state_log[match_last];
 800                   mctx.last_node = check_halt_state_context (&mctx, pstate,
 801                                                              match_last);
 802                 }
 803               if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
 804                   || dfa->nbackref)
 805                 {
 806                   err = prune_impossible_nodes (&mctx);
 807                   if (err == REG_NOERROR)
 808                     break;
 809                   if (BE (err != REG_NOMATCH, 0))
 810                     goto free_return;
 811                   match_last = -1;
 812                 }
 813               else
 814                 break; /* We found a match.  */
 815             }
 816         }
 817
 818       match_ctx_clean (&mctx);
 819     }
 820
 821 #ifdef DEBUG
 822   assert (match_last != -1);
 823   assert (err == REG_NOERROR);
 824 #endif
 825
 826   /* Set pmatch[] if we need.  */
 827   if (nmatch > 0)
 828     {
 829       int reg_idx;
 830
 831       /* Initialize registers.  */
 832       for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
 833         pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
 834
 835       /* Set the points where matching start/end.  */
 836       pmatch[0].rm_so = 0;
 837       pmatch[0].rm_eo = mctx.match_last;
 838
 839       if (!preg->no_sub && nmatch > 1)
 840         {
 841           err = set_regs (preg, &mctx, nmatch, pmatch,
 842                           dfa->has_plural_match && dfa->nbackref > 0);
 843           if (BE (err != REG_NOERROR, 0))
 844             goto free_return;
 845         }
 846
 847       /* At last, add the offset to the each registers, since we slided
 848          the buffers so that we could assume that the matching starts
 849          from 0.  */
 850       for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
 851         if (pmatch[reg_idx].rm_so != -1)
 852           {
 853 #ifdef RE_ENABLE_I18N
 854             if (BE (mctx.input.offsets_needed != 0, 0))
 855               {
 856                 if (pmatch[reg_idx].rm_so == mctx.input.valid_len)
 857                   pmatch[reg_idx].rm_so += mctx.input.valid_raw_len - mctx.input.valid_len;
 858                 else
 859                   pmatch[reg_idx].rm_so = mctx.input.offsets[pmatch[reg_idx].rm_so];
 860                 if (pmatch[reg_idx].rm_eo == mctx.input.valid_len)
 861                   pmatch[reg_idx].rm_eo += mctx.input.valid_raw_len - mctx.input.valid_len;
 862                 else
 863                   pmatch[reg_idx].rm_eo = mctx.input.offsets[pmatch[reg_idx].rm_eo];
 864               }
 865 #else
 866             assert (mctx.input.offsets_needed == 0);
 867 #endif
 868             pmatch[reg_idx].rm_so += match_first;
 869             pmatch[reg_idx].rm_eo += match_first;
 870           }
 871       for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
 872         {
 873           pmatch[nmatch + reg_idx].rm_so = -1;
 874           pmatch[nmatch + reg_idx].rm_eo = -1;
 875         }
 876
 877       if (dfa->subexp_map)
 878         for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
 879           if (dfa->subexp_map[reg_idx] != reg_idx)
 880             {
 881               pmatch[reg_idx + 1].rm_so
 882                 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
 883               pmatch[reg_idx + 1].rm_eo
 884                 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
 885             }
 886     }
 887
 888  free_return:
 889   re_free (mctx.state_log);
 890   if (dfa->nbackref)
 891     match_ctx_free (&mctx);
 892   re_string_destruct (&mctx.input);
 893   return err;
 894 }
 895
 896 static reg_errcode_t
 897 internal_function
 898 prune_impossible_nodes (re_match_context_t *mctx)
 899 {
 900   re_dfa_t *const dfa = mctx->dfa;
 901   int halt_node, match_last;
 902   reg_errcode_t ret;
 903   re_dfastate_t **sifted_states;
 904   re_dfastate_t **lim_states = NULL;
 905   re_sift_context_t sctx;
 906 #ifdef DEBUG
 907   assert (mctx->state_log != NULL);
 908 #endif
 909   match_last = mctx->match_last;
 910   halt_node = mctx->last_node;
 911   sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
 912   if (BE (sifted_states == NULL, 0))
 913     {
 914       ret = REG_ESPACE;
 915       goto free_return;
 916     }
 917   if (dfa->nbackref)
 918     {
 919       lim_states = re_malloc (re_dfastate_t *, match_last + 1);
 920       if (BE (lim_states == NULL, 0))
 921         {
 922           ret = REG_ESPACE;
 923           goto free_return;
 924         }
 925       while (1)
 926         {
 927           memset (lim_states, '\0',
 928                   sizeof (re_dfastate_t *) * (match_last + 1));
 929           sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
 930                          match_last);
 931           ret = sift_states_backward (mctx, &sctx);
 932           re_node_set_free (&sctx.limits);
 933           if (BE (ret != REG_NOERROR, 0))
 934               goto free_return;
 935           if (sifted_states[0] != NULL || lim_states[0] != NULL)
 936             break;
 937           do
 938             {
 939               --match_last;
 940               if (match_last < 0)
 941                 {
 942                   ret = REG_NOMATCH;
 943                   goto free_return;
 944                 }
 945             } while (mctx->state_log[match_last] == NULL
 946                      || !mctx->state_log[match_last]->halt);
 947           halt_node = check_halt_state_context (mctx,
 948                                                 mctx->state_log[match_last],
 949                                                 match_last);
 950         }
 951       ret = merge_state_array (dfa, sifted_states, lim_states,
 952                                match_last + 1);
 953       re_free (lim_states);
 954       lim_states = NULL;
 955       if (BE (ret != REG_NOERROR, 0))
 956         goto free_return;
 957     }
 958   else
 959     {
 960       sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
 961       ret = sift_states_backward (mctx, &sctx);
 962       re_node_set_free (&sctx.limits);
 963       if (BE (ret != REG_NOERROR, 0))
 964         goto free_return;
 965     }
 966   re_free (mctx->state_log);
 967   mctx->state_log = sifted_states;
 968   sifted_states = NULL;
 969   mctx->last_node = halt_node;
 970   mctx->match_last = match_last;
 971   ret = REG_NOERROR;
 972  free_return:
 973   re_free (sifted_states);
 974   re_free (lim_states);
 975   return ret;
 976 }
 977
 978 /* Acquire an initial state and return it.
 979    We must select appropriate initial state depending on the context,
 980    since initial states may have constraints like "\<", "^", etc..  */
 981
 982 static inline re_dfastate_t *
 983 __attribute ((always_inline)) internal_function
 984 acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx,
 985                             int idx)
 986 {
 987   re_dfa_t *const dfa = mctx->dfa;
 988   if (dfa->init_state->has_constraint)
 989     {
 990       unsigned int context;
 991       context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
 992       if (IS_WORD_CONTEXT (context))
 993         return dfa->init_state_word;
 994       else if (IS_ORDINARY_CONTEXT (context))
 995         return dfa->init_state;
 996       else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
 997         return dfa->init_state_begbuf;
 998       else if (IS_NEWLINE_CONTEXT (context))
 999         return dfa->init_state_nl;
1000       else if (IS_BEGBUF_CONTEXT (context))
1001         {
1002           /* It is relatively rare case, then calculate on demand.  */
1003           return re_acquire_state_context (err, dfa,
1004                                            dfa->init_state->entrance_nodes,
1005                                            context);
1006         }
1007       else
1008         /* Must not happen?  */
1009         return dfa->init_state;
1010     }
1011   else
1012     return dfa->init_state;
1013 }
1014
1015 /* Check whether the regular expression match input string INPUT or not,
1016    and return the index where the matching end, return -1 if not match,
1017    or return -2 in case of an error.
1018    FL_LONGEST_MATCH means we want the POSIX longest matching.
1019    If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1020    next place where we may want to try matching.
1021    Note that the matcher assume that the maching starts from the current
1022    index of the buffer.  */
1023
1024 static int
1025 internal_function
1026 check_matching (re_match_context_t *mctx, int fl_longest_match,
1027                 int *p_match_first)
1028 {
1029   re_dfa_t *const dfa = mctx->dfa;
1030   reg_errcode_t err;
1031   int match = 0;
1032   int match_last = -1;
1033   int cur_str_idx = re_string_cur_idx (&mctx->input);
1034   re_dfastate_t *cur_state;
1035   int at_init_state = p_match_first != NULL;
1036   int next_start_idx = cur_str_idx;
1037
1038   err = REG_NOERROR;
1039   cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1040   /* An initial state must not be NULL (invalid).  */
1041   if (BE (cur_state == NULL, 0))
1042     {
1043       assert (err == REG_ESPACE);
1044       return -2;
1045     }
1046
1047   if (mctx->state_log != NULL)
1048     {
1049       mctx->state_log[cur_str_idx] = cur_state;
1050
1051       /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1052          later.  E.g. Processing back references.  */
1053       if (BE (dfa->nbackref, 0))
1054         {
1055           at_init_state = 0;
1056           err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1057           if (BE (err != REG_NOERROR, 0))
1058             return err;
1059
1060           if (cur_state->has_backref)
1061             {
1062               err = transit_state_bkref (mctx, &cur_state->nodes);
1063               if (BE (err != REG_NOERROR, 0))
1064                 return err;
1065             }
1066         }
1067     }
1068
1069   /* If the RE accepts NULL string.  */
1070   if (BE (cur_state->halt, 0))
1071     {
1072       if (!cur_state->has_constraint
1073           || check_halt_state_context (mctx, cur_state, cur_str_idx))
1074         {
1075           if (!fl_longest_match)
1076             return cur_str_idx;
1077           else
1078             {
1079               match_last = cur_str_idx;
1080               match = 1;
1081             }
1082         }
1083     }
1084
1085   while (!re_string_eoi (&mctx->input))
1086     {
1087       re_dfastate_t *old_state = cur_state;
1088       int next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1089
1090       if (BE (next_char_idx >= mctx->input.bufs_len, 0)
1091           || (BE (next_char_idx >= mctx->input.valid_len, 0)
1092               && mctx->input.valid_len < mctx->input.len))
1093         {
1094           err = extend_buffers (mctx);
1095           if (BE (err != REG_NOERROR, 0))
1096             {
1097               assert (err == REG_ESPACE);
1098               return -2;
1099             }
1100         }
1101
1102       cur_state = transit_state (&err, mctx, cur_state);
1103       if (mctx->state_log != NULL)
1104         cur_state = merge_state_with_log (&err, mctx, cur_state);
1105
1106       if (cur_state == NULL)
1107         {
1108           /* Reached the invalid state or an error.  Try to recover a valid
1109              state using the state log, if available and if we have not
1110              already found a valid (even if not the longest) match.  */
1111           if (BE (err != REG_NOERROR, 0))
1112             return -2;
1113
1114           if (mctx->state_log == NULL
1115               || (match && !fl_longest_match)
1116               || (cur_state = find_recover_state (&err, mctx)) == NULL)
1117             break;
1118         }
1119
1120       if (BE (at_init_state, 0))
1121         {
1122           if (old_state == cur_state)
1123             next_start_idx = next_char_idx;
1124           else
1125             at_init_state = 0;
1126         }
1127
1128       if (cur_state->halt)
1129         {
1130           /* Reached a halt state.
1131              Check the halt state can satisfy the current context.  */
1132           if (!cur_state->has_constraint
1133               || check_halt_state_context (mctx, cur_state,
1134                                            re_string_cur_idx (&mctx->input)))
1135             {
1136               /* We found an appropriate halt state.  */
1137               match_last = re_string_cur_idx (&mctx->input);
1138               match = 1;
1139
1140               /* We found a match, do not modify match_first below.  */
1141               p_match_first = NULL;
1142               if (!fl_longest_match)
1143                 break;
1144             }
1145         }
1146     }
1147
1148   if (p_match_first)
1149     *p_match_first += next_start_idx;
1150
1151   return match_last;
1152 }
1153
1154 /* Check NODE match the current context.  */
1155
1156 static int
1157 internal_function
1158 check_halt_node_context (const re_dfa_t *dfa, int node, unsigned int context)
1159 {
1160   re_token_type_t type = dfa->nodes[node].type;
1161   unsigned int constraint = dfa->nodes[node].constraint;
1162   if (type != END_OF_RE)
1163     return 0;
1164   if (!constraint)
1165     return 1;
1166   if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1167     return 0;
1168   return 1;
1169 }
1170
1171 /* Check the halt state STATE match the current context.
1172    Return 0 if not match, if the node, STATE has, is a halt node and
1173    match the context, return the node.  */
1174
1175 static int
1176 internal_function
1177 check_halt_state_context (const re_match_context_t *mctx,
1178                           const re_dfastate_t *state, int idx)
1179 {
1180   int i;
1181   unsigned int context;
1182 #ifdef DEBUG
1183   assert (state->halt);
1184 #endif
1185   context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1186   for (i = 0; i < state->nodes.nelem; ++i)
1187     if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1188       return state->nodes.elems[i];
1189   return 0;
1190 }
1191
1192 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1193    corresponding to the DFA).
1194    Return the destination node, and update EPS_VIA_NODES, return -1 in case
1195    of errors.  */
1196
1197 static int
1198 internal_function
1199 proceed_next_node (const re_match_context_t *mctx,
1200                    int nregs, regmatch_t *regs, int *pidx, int node,
1201                    re_node_set *eps_via_nodes, struct re_fail_stack_t *fs)
1202 {
1203   re_dfa_t *const dfa = mctx->dfa;
1204   int i, err;
1205   if (IS_EPSILON_NODE (dfa->nodes[node].type))
1206     {
1207       re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1208       re_node_set *edests = &dfa->edests[node];
1209       int dest_node;
1210       err = re_node_set_insert (eps_via_nodes, node);
1211       if (BE (err < 0, 0))
1212         return -2;
1213       /* Pick up a valid destination, or return -1 if none is found.  */
1214       for (dest_node = -1, i = 0; i < edests->nelem; ++i)
1215         {
1216           int candidate = edests->elems[i];
1217           if (!re_node_set_contains (cur_nodes, candidate))
1218             continue;
1219           if (dest_node == -1)
1220             dest_node = candidate;
1221
1222           else
1223             {
1224               /* In order to avoid infinite loop like "(a*)*", return the second
1225                  epsilon-transition if the first was already considered.  */
1226               if (re_node_set_contains (eps_via_nodes, dest_node))
1227                 return candidate;
1228
1229               /* Otherwise, push the second epsilon-transition on the fail stack.  */
1230               else if (fs != NULL
1231                        && push_fail_stack (fs, *pidx, candidate, nregs, regs,
1232                                            eps_via_nodes))
1233                 return -2;
1234
1235               /* We know we are going to exit.  */
1236               break;
1237             }
1238         }
1239       return dest_node;
1240     }
1241   else
1242     {
1243       int naccepted = 0;
1244       re_token_type_t type = dfa->nodes[node].type;
1245
1246 #ifdef RE_ENABLE_I18N
1247       if (dfa->nodes[node].accept_mb)
1248         naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1249       else
1250 #endif /* RE_ENABLE_I18N */
1251       if (type == OP_BACK_REF)
1252         {
1253           int subexp_idx = dfa->nodes[node].opr.idx + 1;
1254           naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1255           if (fs != NULL)
1256             {
1257               if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1258                 return -1;
1259               else if (naccepted)
1260                 {
1261                   char *buf = (char *) re_string_get_buffer (&mctx->input);
1262                   if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1263                               naccepted) != 0)
1264                     return -1;
1265                 }
1266             }
1267
1268           if (naccepted == 0)
1269             {
1270               int dest_node;
1271               err = re_node_set_insert (eps_via_nodes, node);
1272               if (BE (err < 0, 0))
1273                 return -2;
1274               dest_node = dfa->edests[node].elems[0];
1275               if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1276                                         dest_node))
1277                 return dest_node;
1278             }
1279         }
1280
1281       if (naccepted != 0
1282           || check_node_accept (mctx, dfa->nodes + node, *pidx))
1283         {
1284           int dest_node = dfa->nexts[node];
1285           *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1286           if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1287                      || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1288                                                dest_node)))
1289             return -1;
1290           re_node_set_empty (eps_via_nodes);
1291           return dest_node;
1292         }
1293     }
1294   return -1;
1295 }
1296
1297 static reg_errcode_t
1298 internal_function
1299 push_fail_stack (struct re_fail_stack_t *fs, int str_idx, int dest_node,
1300                  int nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
1301 {
1302   reg_errcode_t err;
1303   int num = fs->num++;
1304   if (fs->num == fs->alloc)
1305     {
1306       struct re_fail_stack_ent_t *new_array;
1307       new_array = realloc (fs->stack, (sizeof (struct re_fail_stack_ent_t)
1308                                        * fs->alloc * 2));
1309       if (new_array == NULL)
1310         return REG_ESPACE;
1311       fs->alloc *= 2;
1312       fs->stack = new_array;
1313     }
1314   fs->stack[num].idx = str_idx;
1315   fs->stack[num].node = dest_node;
1316   fs->stack[num].regs = re_malloc (regmatch_t, nregs);
1317   if (fs->stack[num].regs == NULL)
1318     return REG_ESPACE;
1319   memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1320   err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1321   return err;
1322 }
1323
1324 static int
1325 internal_function
1326 pop_fail_stack (struct re_fail_stack_t *fs, int *pidx,
1327                 int nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
1328 {
1329   int num = --fs->num;
1330   assert (num >= 0);
1331   *pidx = fs->stack[num].idx;
1332   memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1333   re_node_set_free (eps_via_nodes);
1334   re_free (fs->stack[num].regs);
1335   *eps_via_nodes = fs->stack[num].eps_via_nodes;
1336   return fs->stack[num].node;
1337 }
1338
1339 /* Set the positions where the subexpressions are starts/ends to registers
1340    PMATCH.
1341    Note: We assume that pmatch[0] is already set, and
1342    pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch.  */
1343
1344 static reg_errcode_t
1345 internal_function
1346 set_regs (const regex_t *preg, const re_match_context_t *mctx,
1347           size_t nmatch, regmatch_t *pmatch, int fl_backtrack)
1348 {
1349   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1350   int idx, cur_node;
1351   re_node_set eps_via_nodes;
1352   struct re_fail_stack_t *fs;
1353   struct re_fail_stack_t fs_body = { 0, 2, NULL };
1354   regmatch_t *prev_idx_match;
1355
1356 #ifdef DEBUG
1357   assert (nmatch > 1);
1358   assert (mctx->state_log != NULL);
1359 #endif
1360   if (fl_backtrack)
1361     {
1362       fs = &fs_body;
1363       fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1364       if (fs->stack == NULL)
1365         return REG_ESPACE;
1366     }
1367   else
1368     fs = NULL;
1369
1370   cur_node = dfa->init_node;
1371   re_node_set_init_empty (&eps_via_nodes);
1372
1373   prev_idx_match = (regmatch_t *) alloca (sizeof (regmatch_t) * nmatch);
1374   memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1375
1376   for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1377     {
1378       update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
1379
1380       if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1381         {
1382           int reg_idx;
1383           if (fs)
1384             {
1385               for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1386                 if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1387                   break;
1388               if (reg_idx == nmatch)
1389                 {
1390                   re_node_set_free (&eps_via_nodes);
1391                   return free_fail_stack_return (fs);
1392                 }
1393               cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1394                                          &eps_via_nodes);
1395             }
1396           else
1397             {
1398               re_node_set_free (&eps_via_nodes);
1399               return REG_NOERROR;
1400             }
1401         }
1402
1403       /* Proceed to next node.  */
1404       cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
1405                                     &eps_via_nodes, fs);
1406
1407       if (BE (cur_node < 0, 0))
1408         {
1409           if (BE (cur_node == -2, 0))
1410             {
1411               re_node_set_free (&eps_via_nodes);
1412               free_fail_stack_return (fs);
1413               return REG_ESPACE;
1414             }
1415           if (fs)
1416             cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1417                                        &eps_via_nodes);
1418           else
1419             {
1420               re_node_set_free (&eps_via_nodes);
1421               return REG_NOMATCH;
1422             }
1423         }
1424     }
1425   re_node_set_free (&eps_via_nodes);
1426   return free_fail_stack_return (fs);
1427 }
1428
1429 static reg_errcode_t
1430 internal_function
1431 free_fail_stack_return (struct re_fail_stack_t *fs)
1432 {
1433   if (fs)
1434     {
1435       int fs_idx;
1436       for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1437         {
1438           re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1439           re_free (fs->stack[fs_idx].regs);
1440         }
1441       re_free (fs->stack);
1442     }
1443   return REG_NOERROR;
1444 }
1445
1446 static void
1447 internal_function
1448 update_regs (re_dfa_t *dfa, regmatch_t *pmatch, regmatch_t *prev_idx_match,
1449              int cur_node, int cur_idx, int nmatch)
1450 {
1451   int type = dfa->nodes[cur_node].type;
1452   if (type == OP_OPEN_SUBEXP)
1453     {
1454       int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1455
1456       /* We are at the first node of this sub expression.  */
1457       if (reg_num < nmatch)
1458         {
1459           pmatch[reg_num].rm_so = cur_idx;
1460           pmatch[reg_num].rm_eo = -1;
1461         }
1462     }
1463   else if (type == OP_CLOSE_SUBEXP)
1464     {
1465       int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1466       if (reg_num < nmatch)
1467         {
1468           /* We are at the last node of this sub expression.  */
1469           if (pmatch[reg_num].rm_so < cur_idx)
1470             {
1471               pmatch[reg_num].rm_eo = cur_idx;
1472               /* This is a non-empty match or we are not inside an optional
1473                  subexpression.  Accept this right away.  */
1474               memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1475             }
1476           else
1477             {
1478               if (dfa->nodes[cur_node].opt_subexp
1479                   && prev_idx_match[reg_num].rm_so != -1)
1480                 /* We transited through an empty match for an optional
1481                    subexpression, like (a?)*, and this is not the subexp's
1482                    first match.  Copy back the old content of the registers
1483                    so that matches of an inner subexpression are undone as
1484                    well, like in ((a?))*.  */
1485                 memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1486               else
1487                 /* We completed a subexpression, but it may be part of
1488                    an optional one, so do not update PREV_IDX_MATCH.  */
1489                 pmatch[reg_num].rm_eo = cur_idx;
1490             }
1491         }
1492     }
1493 }
1494
1495 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1496    and sift the nodes in each states according to the following rules.
1497    Updated state_log will be wrote to STATE_LOG.
1498
1499    Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1500      1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1501         If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1502         the LAST_NODE, we throw away the node `a'.
1503      2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1504         string `s' and transit to `b':
1505         i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1506            away the node `a'.
1507         ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1508             thrown away, we throw away the node `a'.
1509      3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1510         i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1511            node `a'.
1512         ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1513             we throw away the node `a'.  */
1514
1515 #define STATE_NODE_CONTAINS(state,node) \
1516   ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1517
1518 static reg_errcode_t
1519 internal_function
1520 sift_states_backward (re_match_context_t *mctx, re_sift_context_t *sctx)
1521 {
1522   reg_errcode_t err;
1523   int null_cnt = 0;
1524   int str_idx = sctx->last_str_idx;
1525   re_node_set cur_dest;
1526
1527 #ifdef DEBUG
1528   assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1529 #endif
1530
1531   /* Build sifted state_log[str_idx].  It has the nodes which can epsilon
1532      transit to the last_node and the last_node itself.  */
1533   err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1534   if (BE (err != REG_NOERROR, 0))
1535     return err;
1536   err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1537   if (BE (err != REG_NOERROR, 0))
1538     goto free_return;
1539
1540   /* Then check each states in the state_log.  */
1541   while (str_idx > 0)
1542     {
1543       /* Update counters.  */
1544       null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1545       if (null_cnt > mctx->max_mb_elem_len)
1546         {
1547           memset (sctx->sifted_states, '\0',
1548                   sizeof (re_dfastate_t *) * str_idx);
1549           re_node_set_free (&cur_dest);
1550           return REG_NOERROR;
1551         }
1552       re_node_set_empty (&cur_dest);
1553       --str_idx;
1554
1555       if (mctx->state_log[str_idx])
1556         {
1557           err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1558           if (BE (err != REG_NOERROR, 0))
1559             goto free_return;
1560         }
1561
1562       /* Add all the nodes which satisfy the following conditions:
1563          - It can epsilon transit to a node in CUR_DEST.
1564          - It is in CUR_SRC.
1565          And update state_log.  */
1566       err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1567       if (BE (err != REG_NOERROR, 0))
1568         goto free_return;
1569     }
1570   err = REG_NOERROR;
1571  free_return:
1572   re_node_set_free (&cur_dest);
1573   return err;
1574 }
1575
1576 static reg_errcode_t
1577 internal_function
1578 build_sifted_states (re_match_context_t *mctx, re_sift_context_t *sctx,
1579                      int str_idx, re_node_set *cur_dest)
1580 {
1581   re_dfa_t *const dfa = mctx->dfa;
1582   re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
1583   int i;
1584
1585   /* Then build the next sifted state.
1586      We build the next sifted state on `cur_dest', and update
1587      `sifted_states[str_idx]' with `cur_dest'.
1588      Note:
1589      `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1590      `cur_src' points the node_set of the old `state_log[str_idx]'
1591      (with the epsilon nodes pre-filtered out).  */
1592   for (i = 0; i < cur_src->nelem; i++)
1593     {
1594       int prev_node = cur_src->elems[i];
1595       int naccepted = 0;
1596       int ret;
1597
1598 #ifdef DEBUG
1599       re_token_type_t type = dfa->nodes[prev_node].type;
1600       assert (!IS_EPSILON_NODE (type));
1601 #endif
1602 #ifdef RE_ENABLE_I18N
1603       /* If the node may accept `multi byte'.  */
1604       if (dfa->nodes[prev_node].accept_mb)
1605         naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1606                                          str_idx, sctx->last_str_idx);
1607 #endif /* RE_ENABLE_I18N */
1608
1609       /* We don't check backreferences here.
1610          See update_cur_sifted_state().  */
1611       if (!naccepted
1612           && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1613           && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1614                                   dfa->nexts[prev_node]))
1615         naccepted = 1;
1616
1617       if (naccepted == 0)
1618         continue;
1619
1620       if (sctx->limits.nelem)
1621         {
1622           int to_idx = str_idx + naccepted;
1623           if (check_dst_limits (mctx, &sctx->limits,
1624                                 dfa->nexts[prev_node], to_idx,
1625                                 prev_node, str_idx))
1626             continue;
1627         }
1628       ret = re_node_set_insert (cur_dest, prev_node);
1629       if (BE (ret == -1, 0))
1630         return REG_ESPACE;
1631     }
1632
1633   return REG_NOERROR;
1634 }
1635
1636 /* Helper functions.  */
1637
1638 static reg_errcode_t
1639 internal_function
1640 clean_state_log_if_needed (re_match_context_t *mctx, int next_state_log_idx)
1641 {
1642   int top = mctx->state_log_top;
1643
1644   if (next_state_log_idx >= mctx->input.bufs_len
1645       || (next_state_log_idx >= mctx->input.valid_len
1646           && mctx->input.valid_len < mctx->input.len))
1647     {
1648       reg_errcode_t err;
1649       err = extend_buffers (mctx);
1650       if (BE (err != REG_NOERROR, 0))
1651         return err;
1652     }
1653
1654   if (top < next_state_log_idx)
1655     {
1656       memset (mctx->state_log + top + 1, '\0',
1657               sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1658       mctx->state_log_top = next_state_log_idx;
1659     }
1660   return REG_NOERROR;
1661 }
1662
1663 static reg_errcode_t
1664 internal_function
1665 merge_state_array (re_dfa_t *dfa, re_dfastate_t **dst, re_dfastate_t **src,
1666                    int num)
1667 {
1668   int st_idx;
1669   reg_errcode_t err;
1670   for (st_idx = 0; st_idx < num; ++st_idx)
1671     {
1672       if (dst[st_idx] == NULL)
1673         dst[st_idx] = src[st_idx];
1674       else if (src[st_idx] != NULL)
1675         {
1676           re_node_set merged_set;
1677           err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1678                                         &src[st_idx]->nodes);
1679           if (BE (err != REG_NOERROR, 0))
1680             return err;
1681           dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1682           re_node_set_free (&merged_set);
1683           if (BE (err != REG_NOERROR, 0))
1684             return err;
1685         }
1686     }
1687   return REG_NOERROR;
1688 }
1689
1690 static reg_errcode_t
1691 internal_function
1692 update_cur_sifted_state (re_match_context_t *mctx, re_sift_context_t *sctx,
1693                          int str_idx, re_node_set *dest_nodes)
1694 {
1695   re_dfa_t *const dfa = mctx->dfa;
1696   reg_errcode_t err;
1697   const re_node_set *candidates;
1698   candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1699                 : &mctx->state_log[str_idx]->nodes);
1700
1701   if (dest_nodes->nelem == 0)
1702     sctx->sifted_states[str_idx] = NULL;
1703   else
1704     {
1705       if (candidates)
1706         {
1707           /* At first, add the nodes which can epsilon transit to a node in
1708              DEST_NODE.  */
1709           err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1710           if (BE (err != REG_NOERROR, 0))
1711             return err;
1712
1713           /* Then, check the limitations in the current sift_context.  */
1714           if (sctx->limits.nelem)
1715             {
1716               err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1717                                          mctx->bkref_ents, str_idx);
1718               if (BE (err != REG_NOERROR, 0))
1719                 return err;
1720             }
1721         }
1722
1723       sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1724       if (BE (err != REG_NOERROR, 0))
1725         return err;
1726     }
1727
1728   if (candidates && mctx->state_log[str_idx]->has_backref)
1729     {
1730       err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1731       if (BE (err != REG_NOERROR, 0))
1732         return err;
1733     }
1734   return REG_NOERROR;
1735 }
1736
1737 static reg_errcode_t
1738 internal_function
1739 add_epsilon_src_nodes (re_dfa_t *dfa, re_node_set *dest_nodes,
1740                        const re_node_set *candidates)
1741 {
1742   reg_errcode_t err = REG_NOERROR;
1743   int i;
1744
1745   re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
1746   if (BE (err != REG_NOERROR, 0))
1747     return err;
1748
1749   if (!state->inveclosure.alloc)
1750     {
1751       err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
1752       if (BE (err != REG_NOERROR, 0))
1753         return REG_ESPACE;
1754       for (i = 0; i < dest_nodes->nelem; i++)
1755         re_node_set_merge (&state->inveclosure,
1756                            dfa->inveclosures + dest_nodes->elems[i]);
1757     }
1758   return re_node_set_add_intersect (dest_nodes, candidates,
1759                                     &state->inveclosure);
1760 }
1761
1762 static reg_errcode_t
1763 internal_function
1764 sub_epsilon_src_nodes (re_dfa_t *dfa, int node, re_node_set *dest_nodes,
1765                        const re_node_set *candidates)
1766 {
1767     int ecl_idx;
1768     reg_errcode_t err;
1769     re_node_set *inv_eclosure = dfa->inveclosures + node;
1770     re_node_set except_nodes;
1771     re_node_set_init_empty (&except_nodes);
1772     for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1773       {
1774         int cur_node = inv_eclosure->elems[ecl_idx];
1775         if (cur_node == node)
1776           continue;
1777         if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1778           {
1779             int edst1 = dfa->edests[cur_node].elems[0];
1780             int edst2 = ((dfa->edests[cur_node].nelem > 1)
1781                          ? dfa->edests[cur_node].elems[1] : -1);
1782             if ((!re_node_set_contains (inv_eclosure, edst1)
1783                  && re_node_set_contains (dest_nodes, edst1))
1784                 || (edst2 > 0
1785                     && !re_node_set_contains (inv_eclosure, edst2)
1786                     && re_node_set_contains (dest_nodes, edst2)))
1787               {
1788                 err = re_node_set_add_intersect (&except_nodes, candidates,
1789                                                  dfa->inveclosures + cur_node);
1790                 if (BE (err != REG_NOERROR, 0))
1791                   {
1792                     re_node_set_free (&except_nodes);
1793                     return err;
1794                   }
1795               }
1796           }
1797       }
1798     for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1799       {
1800         int cur_node = inv_eclosure->elems[ecl_idx];
1801         if (!re_node_set_contains (&except_nodes, cur_node))
1802           {
1803             int idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1804             re_node_set_remove_at (dest_nodes, idx);
1805           }
1806       }
1807     re_node_set_free (&except_nodes);
1808     return REG_NOERROR;
1809 }
1810
1811 static int
1812 internal_function
1813 check_dst_limits (re_match_context_t *mctx, re_node_set *limits,
1814                   int dst_node, int dst_idx, int src_node, int src_idx)
1815 {
1816   re_dfa_t *const dfa = mctx->dfa;
1817   int lim_idx, src_pos, dst_pos;
1818
1819   int dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1820   int src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1821   for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1822     {
1823       int subexp_idx;
1824       struct re_backref_cache_entry *ent;
1825       ent = mctx->bkref_ents + limits->elems[lim_idx];
1826       subexp_idx = dfa->nodes[ent->node].opr.idx;
1827
1828       dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1829                                            subexp_idx, dst_node, dst_idx,
1830                                            dst_bkref_idx);
1831       src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1832                                            subexp_idx, src_node, src_idx,
1833                                            src_bkref_idx);
1834
1835       /* In case of:
1836          <src> <dst> ( <subexp> )
1837          ( <subexp> ) <src> <dst>
1838          ( <subexp1> <src> <subexp2> <dst> <subexp3> )  */
1839       if (src_pos == dst_pos)
1840         continue; /* This is unrelated limitation.  */
1841       else
1842         return 1;
1843     }
1844   return 0;
1845 }
1846
1847 static int
1848 internal_function
1849 check_dst_limits_calc_pos_1 (re_match_context_t *mctx, int boundaries,
1850                              int subexp_idx, int from_node, int bkref_idx)
1851 {
1852   re_dfa_t *const dfa = mctx->dfa;
1853   re_node_set *eclosures = dfa->eclosures + from_node;
1854   int node_idx;
1855
1856   /* Else, we are on the boundary: examine the nodes on the epsilon
1857      closure.  */
1858   for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1859     {
1860       int node = eclosures->elems[node_idx];
1861       switch (dfa->nodes[node].type)
1862         {
1863         case OP_BACK_REF:
1864           if (bkref_idx != -1)
1865             {
1866               struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1867               do
1868                 {
1869                   int dst, cpos;
1870
1871                   if (ent->node != node)
1872                     continue;
1873
1874                   if (subexp_idx <= 8 * sizeof (ent->eps_reachable_subexps_map)
1875                       && !(ent->eps_reachable_subexps_map & (1 << subexp_idx)))
1876                     continue;
1877
1878                   /* Recurse trying to reach the OP_OPEN_SUBEXP and
1879                      OP_CLOSE_SUBEXP cases below.  But, if the
1880                      destination node is the same node as the source
1881                      node, don't recurse because it would cause an
1882                      infinite loop: a regex that exhibits this behavior
1883                      is ()\1*\1*  */
1884                   dst = dfa->edests[node].elems[0];
1885                   if (dst == from_node)
1886                     {
1887                       if (boundaries & 1)
1888                         return -1;
1889                       else /* if (boundaries & 2) */
1890                         return 0;
1891                     }
1892
1893                   cpos =
1894                     check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1895                                                  dst, bkref_idx);
1896                   if (cpos == -1 /* && (boundaries & 1) */)
1897                     return -1;
1898                   if (cpos == 0 && (boundaries & 2))
1899                     return 0;
1900
1901                   ent->eps_reachable_subexps_map &= ~(1 << subexp_idx);
1902                 }
1903               while (ent++->more);
1904             }
1905           break;
1906
1907         case OP_OPEN_SUBEXP:
1908           if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
1909             return -1;
1910           break;
1911
1912         case OP_CLOSE_SUBEXP:
1913           if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
1914             return 0;
1915           break;
1916
1917         default:
1918             break;
1919         }
1920     }
1921
1922   return (boundaries & 2) ? 1 : 0;
1923 }
1924
1925 static int
1926 internal_function
1927 check_dst_limits_calc_pos (re_match_context_t *mctx, int limit, int subexp_idx,
1928                            int from_node, int str_idx, int bkref_idx)
1929 {
1930   struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
1931   int boundaries;
1932
1933   /* If we are outside the range of the subexpression, return -1 or 1.  */
1934   if (str_idx < lim->subexp_from)
1935     return -1;
1936
1937   if (lim->subexp_to < str_idx)
1938     return 1;
1939
1940   /* If we are within the subexpression, return 0.  */
1941   boundaries = (str_idx == lim->subexp_from);
1942   boundaries |= (str_idx == lim->subexp_to) << 1;
1943   if (boundaries == 0)
1944     return 0;
1945
1946   /* Else, examine epsilon closure.  */
1947   return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1948                                       from_node, bkref_idx);
1949 }
1950
1951 /* Check the limitations of sub expressions LIMITS, and remove the nodes
1952    which are against limitations from DEST_NODES. */
1953
1954 static reg_errcode_t
1955 internal_function
1956 check_subexp_limits (re_dfa_t *dfa, re_node_set *dest_nodes,
1957                      const re_node_set *candidates, re_node_set *limits,
1958                      struct re_backref_cache_entry *bkref_ents, int str_idx)
1959 {
1960   reg_errcode_t err;
1961   int node_idx, lim_idx;
1962
1963   for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1964     {
1965       int subexp_idx;
1966       struct re_backref_cache_entry *ent;
1967       ent = bkref_ents + limits->elems[lim_idx];
1968
1969       if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
1970         continue; /* This is unrelated limitation.  */
1971
1972       subexp_idx = dfa->nodes[ent->node].opr.idx;
1973       if (ent->subexp_to == str_idx)
1974         {
1975           int ops_node = -1;
1976           int cls_node = -1;
1977           for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
1978             {
1979               int node = dest_nodes->elems[node_idx];
1980               re_token_type_t type = dfa->nodes[node].type;
1981               if (type == OP_OPEN_SUBEXP
1982                   && subexp_idx == dfa->nodes[node].opr.idx)
1983                 ops_node = node;
1984               else if (type == OP_CLOSE_SUBEXP
1985                        && subexp_idx == dfa->nodes[node].opr.idx)
1986                 cls_node = node;
1987             }
1988
1989           /* Check the limitation of the open subexpression.  */
1990           /* Note that (ent->subexp_to = str_idx != ent->subexp_from).  */
1991           if (ops_node >= 0)
1992             {
1993               err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
1994                                            candidates);
1995               if (BE (err != REG_NOERROR, 0))
1996                 return err;
1997             }
1998
1999           /* Check the limitation of the close subexpression.  */
2000           if (cls_node >= 0)
2001             for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2002               {
2003                 int node = dest_nodes->elems[node_idx];
2004                 if (!re_node_set_contains (dfa->inveclosures + node,
2005                                            cls_node)
2006                     && !re_node_set_contains (dfa->eclosures + node,
2007                                               cls_node))
2008                   {
2009                     /* It is against this limitation.
2010                        Remove it form the current sifted state.  */
2011                     err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2012                                                  candidates);
2013                     if (BE (err != REG_NOERROR, 0))
2014                       return err;
2015                     --node_idx;
2016                   }
2017               }
2018         }
2019       else /* (ent->subexp_to != str_idx)  */
2020         {
2021           for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2022             {
2023               int node = dest_nodes->elems[node_idx];
2024               re_token_type_t type = dfa->nodes[node].type;
2025               if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2026                 {
2027                   if (subexp_idx != dfa->nodes[node].opr.idx)
2028                     continue;
2029                   /* It is against this limitation.
2030                      Remove it form the current sifted state.  */
2031                   err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2032                                                candidates);
2033                   if (BE (err != REG_NOERROR, 0))
2034                     return err;
2035                 }
2036             }
2037         }
2038     }
2039   return REG_NOERROR;
2040 }
2041
2042 static reg_errcode_t
2043 internal_function
2044 sift_states_bkref (re_match_context_t *mctx, re_sift_context_t *sctx,
2045                    int str_idx, const re_node_set *candidates)
2046 {
2047   re_dfa_t *const dfa = mctx->dfa;
2048   reg_errcode_t err;
2049   int node_idx, node;
2050   re_sift_context_t local_sctx;
2051   int first_idx = search_cur_bkref_entry (mctx, str_idx);
2052
2053   if (first_idx == -1)
2054     return REG_NOERROR;
2055
2056   local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized.  */
2057
2058   for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2059     {
2060       int enabled_idx;
2061       re_token_type_t type;
2062       struct re_backref_cache_entry *entry;
2063       node = candidates->elems[node_idx];
2064       type = dfa->nodes[node].type;
2065       /* Avoid infinite loop for the REs like "()\1+".  */
2066       if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2067         continue;
2068       if (type != OP_BACK_REF)
2069         continue;
2070
2071       entry = mctx->bkref_ents + first_idx;
2072       enabled_idx = first_idx;
2073       do
2074         {
2075           int subexp_len, to_idx, dst_node, ret;
2076           re_dfastate_t *cur_state;
2077
2078           if (entry->node != node)
2079             continue;
2080           subexp_len = entry->subexp_to - entry->subexp_from;
2081           to_idx = str_idx + subexp_len;
2082           dst_node = (subexp_len ? dfa->nexts[node]
2083                       : dfa->edests[node].elems[0]);
2084
2085           if (to_idx > sctx->last_str_idx
2086               || sctx->sifted_states[to_idx] == NULL
2087               || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2088               || check_dst_limits (mctx, &sctx->limits, node,
2089                                    str_idx, dst_node, to_idx))
2090             continue;
2091
2092           if (local_sctx.sifted_states == NULL)
2093             {
2094               local_sctx = *sctx;
2095               err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2096               if (BE (err != REG_NOERROR, 0))
2097                 goto free_return;
2098             }
2099           local_sctx.last_node = node;
2100           local_sctx.last_str_idx = str_idx;
2101           ret = re_node_set_insert (&local_sctx.limits, enabled_idx);
2102           if (BE (ret < 0, 0))
2103             {
2104               err = REG_ESPACE;
2105               goto free_return;
2106             }
2107           cur_state = local_sctx.sifted_states[str_idx];
2108           err = sift_states_backward (mctx, &local_sctx);
2109           if (BE (err != REG_NOERROR, 0))
2110             goto free_return;
2111           if (sctx->limited_states != NULL)
2112             {
2113               err = merge_state_array (dfa, sctx->limited_states,
2114                                        local_sctx.sifted_states,
2115                                        str_idx + 1);
2116               if (BE (err != REG_NOERROR, 0))
2117                 goto free_return;
2118             }
2119           local_sctx.sifted_states[str_idx] = cur_state;
2120           re_node_set_remove (&local_sctx.limits, enabled_idx);
2121
2122           /* mctx->bkref_ents may have changed, reload the pointer.  */
2123           entry = mctx->bkref_ents + enabled_idx;
2124         }
2125       while (enabled_idx++, entry++->more);
2126     }
2127   err = REG_NOERROR;
2128  free_return:
2129   if (local_sctx.sifted_states != NULL)
2130     {
2131       re_node_set_free (&local_sctx.limits);
2132     }
2133
2134   return err;
2135 }
2136
2137
2138 #ifdef RE_ENABLE_I18N
2139 static int
2140 internal_function
2141 sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx,
2142                      int node_idx, int str_idx, int max_str_idx)
2143 {
2144   re_dfa_t *const dfa = mctx->dfa;
2145   int naccepted;
2146   /* Check the node can accept `multi byte'.  */
2147   naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2148   if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
2149       !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2150                             dfa->nexts[node_idx]))
2151     /* The node can't accept the `multi byte', or the
2152        destination was already thrown away, then the node
2153        could't accept the current input `multi byte'.   */
2154     naccepted = 0;
2155   /* Otherwise, it is sure that the node could accept
2156      `naccepted' bytes input.  */
2157   return naccepted;
2158 }
2159 #endif /* RE_ENABLE_I18N */
2160
2161 \f
2162 /* Functions for state transition.  */
2163
2164 /* Return the next state to which the current state STATE will transit by
2165    accepting the current input byte, and update STATE_LOG if necessary.
2166    If STATE can accept a multibyte char/collating element/back reference
2167    update the destination of STATE_LOG.  */
2168
2169 static re_dfastate_t *
2170 internal_function
2171 transit_state (reg_errcode_t *err, re_match_context_t *mctx,
2172                re_dfastate_t *state)
2173 {
2174   re_dfastate_t **trtable;
2175   unsigned char ch;
2176
2177 #ifdef RE_ENABLE_I18N
2178   /* If the current state can accept multibyte.  */
2179   if (BE (state->accept_mb, 0))
2180     {
2181       *err = transit_state_mb (mctx, state);
2182       if (BE (*err != REG_NOERROR, 0))
2183         return NULL;
2184     }
2185 #endif /* RE_ENABLE_I18N */
2186
2187   /* Then decide the next state with the single byte.  */
2188 #if 0
2189   if (0)
2190     /* don't use transition table  */
2191     return transit_state_sb (err, mctx, state);
2192 #endif
2193
2194   /* Use transition table  */
2195   ch = re_string_fetch_byte (&mctx->input);
2196   for (;;)
2197     {
2198       trtable = state->trtable;
2199       if (BE (trtable != NULL, 1))
2200         return trtable[ch];
2201
2202       trtable = state->word_trtable;
2203       if (BE (trtable != NULL, 1))
2204         {
2205           unsigned int context;
2206           context
2207             = re_string_context_at (&mctx->input,
2208                                     re_string_cur_idx (&mctx->input) - 1,
2209                                     mctx->eflags);
2210           if (IS_WORD_CONTEXT (context))
2211             return trtable[ch + SBC_MAX];
2212           else
2213             return trtable[ch];
2214         }
2215
2216       if (!build_trtable (mctx->dfa, state))
2217         {
2218           *err = REG_ESPACE;
2219           return NULL;
2220         }
2221
2222       /* Retry, we now have a transition table.  */
2223     }
2224 }
2225
2226 /* Update the state_log if we need */
2227 re_dfastate_t *
2228 internal_function
2229 merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx,
2230                       re_dfastate_t *next_state)
2231 {
2232   re_dfa_t *const dfa = mctx->dfa;
2233   int cur_idx = re_string_cur_idx (&mctx->input);
2234
2235   if (cur_idx > mctx->state_log_top)
2236     {
2237       mctx->state_log[cur_idx] = next_state;
2238       mctx->state_log_top = cur_idx;
2239     }
2240   else if (mctx->state_log[cur_idx] == 0)
2241     {
2242       mctx->state_log[cur_idx] = next_state;
2243     }
2244   else
2245     {
2246       re_dfastate_t *pstate;
2247       unsigned int context;
2248       re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2249       /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2250          the destination of a multibyte char/collating element/
2251          back reference.  Then the next state is the union set of
2252          these destinations and the results of the transition table.  */
2253       pstate = mctx->state_log[cur_idx];
2254       log_nodes = pstate->entrance_nodes;
2255       if (next_state != NULL)
2256         {
2257           table_nodes = next_state->entrance_nodes;
2258           *err = re_node_set_init_union (&next_nodes, table_nodes,
2259                                              log_nodes);
2260           if (BE (*err != REG_NOERROR, 0))
2261             return NULL;
2262         }
2263       else
2264         next_nodes = *log_nodes;
2265       /* Note: We already add the nodes of the initial state,
2266          then we don't need to add them here.  */
2267
2268       context = re_string_context_at (&mctx->input,
2269                                       re_string_cur_idx (&mctx->input) - 1,
2270                                       mctx->eflags);
2271       next_state = mctx->state_log[cur_idx]
2272         = re_acquire_state_context (err, dfa, &next_nodes, context);
2273       /* We don't need to check errors here, since the return value of
2274          this function is next_state and ERR is already set.  */
2275
2276       if (table_nodes != NULL)
2277         re_node_set_free (&next_nodes);
2278     }
2279
2280   if (BE (dfa->nbackref, 0) && next_state != NULL)
2281     {
2282       /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2283          later.  We must check them here, since the back references in the
2284          next state might use them.  */
2285       *err = check_subexp_matching_top (mctx, &next_state->nodes,
2286                                         cur_idx);
2287       if (BE (*err != REG_NOERROR, 0))
2288         return NULL;
2289
2290       /* If the next state has back references.  */
2291       if (next_state->has_backref)
2292         {
2293           *err = transit_state_bkref (mctx, &next_state->nodes);
2294           if (BE (*err != REG_NOERROR, 0))
2295             return NULL;
2296           next_state = mctx->state_log[cur_idx];
2297         }
2298     }
2299
2300   return next_state;
2301 }
2302
2303 /* Skip bytes in the input that correspond to part of a
2304    multi-byte match, then look in the log for a state
2305    from which to restart matching.  */
2306 static re_dfastate_t *
2307 internal_function
2308 find_recover_state (reg_errcode_t *err, re_match_context_t *mctx)
2309 {
2310   re_dfastate_t *cur_state = NULL;
2311   do
2312     {
2313       int max = mctx->state_log_top;
2314       int cur_str_idx = re_string_cur_idx (&mctx->input);
2315
2316       do
2317         {
2318           if (++cur_str_idx > max)
2319             return NULL;
2320           re_string_skip_bytes (&mctx->input, 1);
2321         }
2322       while (mctx->state_log[cur_str_idx] == NULL);
2323
2324       cur_state = merge_state_with_log (err, mctx, NULL);
2325     }
2326   while (err == REG_NOERROR && cur_state == NULL);
2327   return cur_state;
2328 }
2329
2330 /* Helper functions for transit_state.  */
2331
2332 /* From the node set CUR_NODES, pick up the nodes whose types are
2333    OP_OPEN_SUBEXP and which have corresponding back references in the regular
2334    expression. And register them to use them later for evaluating the
2335    correspoding back references.  */
2336
2337 static reg_errcode_t
2338 internal_function
2339 check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes,
2340                            int str_idx)
2341 {
2342   re_dfa_t *const dfa = mctx->dfa;
2343   int node_idx;
2344   reg_errcode_t err;
2345
2346   /* TODO: This isn't efficient.
2347            Because there might be more than one nodes whose types are
2348            OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2349            nodes.
2350            E.g. RE: (a){2}  */
2351   for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2352     {
2353       int node = cur_nodes->elems[node_idx];
2354       if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2355           && dfa->nodes[node].opr.idx < (8 * sizeof (dfa->used_bkref_map))
2356           && dfa->used_bkref_map & (1 << dfa->nodes[node].opr.idx))
2357         {
2358           err = match_ctx_add_subtop (mctx, node, str_idx);
2359           if (BE (err != REG_NOERROR, 0))
2360             return err;
2361         }
2362     }
2363   return REG_NOERROR;
2364 }
2365
2366 #if 0
2367 /* Return the next state to which the current state STATE will transit by
2368    accepting the current input byte.  */
2369
2370 static re_dfastate_t *
2371 transit_state_sb (err, mctx, state)
2372      reg_errcode_t *err;
2373      re_match_context_t *mctx;
2374      re_dfastate_t *state;
2375 {
2376   re_dfa_t *const dfa = mctx->dfa;
2377   re_node_set next_nodes;
2378   re_dfastate_t *next_state;
2379   int node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
2380   unsigned int context;
2381
2382   *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2383   if (BE (*err != REG_NOERROR, 0))
2384     return NULL;
2385   for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2386     {
2387       int cur_node = state->nodes.elems[node_cnt];
2388       if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
2389         {
2390           *err = re_node_set_merge (&next_nodes,
2391                                     dfa->eclosures + dfa->nexts[cur_node]);
2392           if (BE (*err != REG_NOERROR, 0))
2393             {
2394               re_node_set_free (&next_nodes);
2395               return NULL;
2396             }
2397         }
2398     }
2399   context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
2400   next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2401   /* We don't need to check errors here, since the return value of
2402      this function is next_state and ERR is already set.  */
2403
2404   re_node_set_free (&next_nodes);
2405   re_string_skip_bytes (&mctx->input, 1);
2406   return next_state;
2407 }
2408 #endif
2409
2410 #ifdef RE_ENABLE_I18N
2411 static reg_errcode_t
2412 internal_function
2413 transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate)
2414 {
2415   re_dfa_t *const dfa = mctx->dfa;
2416   reg_errcode_t err;
2417   int i;
2418
2419   for (i = 0; i < pstate->nodes.nelem; ++i)
2420     {
2421       re_node_set dest_nodes, *new_nodes;
2422       int cur_node_idx = pstate->nodes.elems[i];
2423       int naccepted, dest_idx;
2424       unsigned int context;
2425       re_dfastate_t *dest_state;
2426
2427       if (!dfa->nodes[cur_node_idx].accept_mb)
2428         continue;
2429
2430       if (dfa->nodes[cur_node_idx].constraint)
2431         {
2432           context = re_string_context_at (&mctx->input,
2433                                           re_string_cur_idx (&mctx->input),
2434                                           mctx->eflags);
2435           if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2436                                            context))
2437             continue;
2438         }
2439
2440       /* How many bytes the node can accept?  */
2441       naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2442                                            re_string_cur_idx (&mctx->input));
2443       if (naccepted == 0)
2444         continue;
2445
2446       /* The node can accepts `naccepted' bytes.  */
2447       dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2448       mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2449                                : mctx->max_mb_elem_len);
2450       err = clean_state_log_if_needed (mctx, dest_idx);
2451       if (BE (err != REG_NOERROR, 0))
2452         return err;
2453 #ifdef DEBUG
2454       assert (dfa->nexts[cur_node_idx] != -1);
2455 #endif
2456       new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];
2457
2458       dest_state = mctx->state_log[dest_idx];
2459       if (dest_state == NULL)
2460         dest_nodes = *new_nodes;
2461       else
2462         {
2463           err = re_node_set_init_union (&dest_nodes,
2464                                         dest_state->entrance_nodes, new_nodes);
2465           if (BE (err != REG_NOERROR, 0))
2466             return err;
2467         }
2468       context = re_string_context_at (&mctx->input, dest_idx - 1, mctx->eflags);
2469       mctx->state_log[dest_idx]
2470         = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2471       if (dest_state != NULL)
2472         re_node_set_free (&dest_nodes);
2473       if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
2474         return err;
2475     }
2476   return REG_NOERROR;
2477 }
2478 #endif /* RE_ENABLE_I18N */
2479
2480 static reg_errcode_t
2481 internal_function
2482 transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes)
2483 {
2484   re_dfa_t *const dfa = mctx->dfa;
2485   reg_errcode_t err;
2486   int i;
2487   int cur_str_idx = re_string_cur_idx (&mctx->input);
2488
2489   for (i = 0; i < nodes->nelem; ++i)
2490     {
2491       int dest_str_idx, prev_nelem, bkc_idx;
2492       int node_idx = nodes->elems[i];
2493       unsigned int context;
2494       const re_token_t *node = dfa->nodes + node_idx;
2495       re_node_set *new_dest_nodes;
2496
2497       /* Check whether `node' is a backreference or not.  */
2498       if (node->type != OP_BACK_REF)
2499         continue;
2500
2501       if (node->constraint)
2502         {
2503           context = re_string_context_at (&mctx->input, cur_str_idx,
2504                                           mctx->eflags);
2505           if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2506             continue;
2507         }
2508
2509       /* `node' is a backreference.
2510          Check the substring which the substring matched.  */
2511       bkc_idx = mctx->nbkref_ents;
2512       err = get_subexp (mctx, node_idx, cur_str_idx);
2513       if (BE (err != REG_NOERROR, 0))
2514         goto free_return;
2515
2516       /* And add the epsilon closures (which is `new_dest_nodes') of
2517          the backreference to appropriate state_log.  */
2518 #ifdef DEBUG
2519       assert (dfa->nexts[node_idx] != -1);
2520 #endif
2521       for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2522         {
2523           int subexp_len;
2524           re_dfastate_t *dest_state;
2525           struct re_backref_cache_entry *bkref_ent;
2526           bkref_ent = mctx->bkref_ents + bkc_idx;
2527           if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2528             continue;
2529           subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2530           new_dest_nodes = (subexp_len == 0
2531                             ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2532                             : dfa->eclosures + dfa->nexts[node_idx]);
2533           dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2534                           - bkref_ent->subexp_from);
2535           context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2536                                           mctx->eflags);
2537           dest_state = mctx->state_log[dest_str_idx];
2538           prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2539                         : mctx->state_log[cur_str_idx]->nodes.nelem);
2540           /* Add `new_dest_node' to state_log.  */
2541           if (dest_state == NULL)
2542             {
2543               mctx->state_log[dest_str_idx]
2544                 = re_acquire_state_context (&err, dfa, new_dest_nodes,
2545                                             context);
2546               if (BE (mctx->state_log[dest_str_idx] == NULL
2547                       && err != REG_NOERROR, 0))
2548                 goto free_return;
2549             }
2550           else
2551             {
2552               re_node_set dest_nodes;
2553               err = re_node_set_init_union (&dest_nodes,
2554                                             dest_state->entrance_nodes,
2555                                             new_dest_nodes);
2556               if (BE (err != REG_NOERROR, 0))
2557                 {
2558                   re_node_set_free (&dest_nodes);
2559                   goto free_return;
2560                 }
2561               mctx->state_log[dest_str_idx]
2562                 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2563               re_node_set_free (&dest_nodes);
2564               if (BE (mctx->state_log[dest_str_idx] == NULL
2565                       && err != REG_NOERROR, 0))
2566                 goto free_return;
2567             }
2568           /* We need to check recursively if the backreference can epsilon
2569              transit.  */
2570           if (subexp_len == 0
2571               && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2572             {
2573               err = check_subexp_matching_top (mctx, new_dest_nodes,
2574                                                cur_str_idx);
2575               if (BE (err != REG_NOERROR, 0))
2576                 goto free_return;
2577               err = transit_state_bkref (mctx, new_dest_nodes);
2578               if (BE (err != REG_NOERROR, 0))
2579                 goto free_return;
2580             }
2581         }
2582     }
2583   err = REG_NOERROR;
2584  free_return:
2585   return err;
2586 }
2587
2588 /* Enumerate all the candidates which the backreference BKREF_NODE can match
2589    at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2590    Note that we might collect inappropriate candidates here.
2591    However, the cost of checking them strictly here is too high, then we
2592    delay these checking for prune_impossible_nodes().  */
2593
2594 static reg_errcode_t
2595 internal_function
2596 get_subexp (re_match_context_t *mctx, int bkref_node, int bkref_str_idx)
2597 {
2598   re_dfa_t *const dfa = mctx->dfa;
2599   int subexp_num, sub_top_idx;
2600   const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2601   /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX.  */
2602   int cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2603   if (cache_idx != -1)
2604     {
2605       const struct re_backref_cache_entry *entry = mctx->bkref_ents + cache_idx;
2606       do
2607         if (entry->node == bkref_node)
2608           return REG_NOERROR; /* We already checked it.  */
2609       while (entry++->more);
2610     }
2611
2612   subexp_num = dfa->nodes[bkref_node].opr.idx;
2613
2614   /* For each sub expression  */
2615   for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2616     {
2617       reg_errcode_t err;
2618       re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2619       re_sub_match_last_t *sub_last;
2620       int sub_last_idx, sl_str, bkref_str_off;
2621
2622       if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2623         continue; /* It isn't related.  */
2624
2625       sl_str = sub_top->str_idx;
2626       bkref_str_off = bkref_str_idx;
2627       /* At first, check the last node of sub expressions we already
2628          evaluated.  */
2629       for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2630         {
2631           int sl_str_diff;
2632           sub_last = sub_top->lasts[sub_last_idx];
2633           sl_str_diff = sub_last->str_idx - sl_str;
2634           /* The matched string by the sub expression match with the substring
2635              at the back reference?  */
2636           if (sl_str_diff > 0)
2637             {
2638               if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))
2639                 {
2640                   /* Not enough chars for a successful match.  */
2641                   if (bkref_str_off + sl_str_diff > mctx->input.len)
2642                     break;
2643
2644                   err = clean_state_log_if_needed (mctx,
2645                                                    bkref_str_off
2646                                                    + sl_str_diff);
2647                   if (BE (err != REG_NOERROR, 0))
2648                     return err;
2649                   buf = (const char *) re_string_get_buffer (&mctx->input);
2650                 }
2651               if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2652                 break; /* We don't need to search this sub expression any more.  */
2653             }
2654           bkref_str_off += sl_str_diff;
2655           sl_str += sl_str_diff;
2656           err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2657                                 bkref_str_idx);
2658
2659           /* Reload buf, since the preceding call might have reallocated
2660              the buffer.  */
2661           buf = (const char *) re_string_get_buffer (&mctx->input);
2662
2663           if (err == REG_NOMATCH)
2664             continue;
2665           if (BE (err != REG_NOERROR, 0))
2666             return err;
2667         }
2668
2669       if (sub_last_idx < sub_top->nlasts)
2670         continue;
2671       if (sub_last_idx > 0)
2672         ++sl_str;
2673       /* Then, search for the other last nodes of the sub expression.  */
2674       for (; sl_str <= bkref_str_idx; ++sl_str)
2675         {
2676           int cls_node, sl_str_off;
2677           const re_node_set *nodes;
2678           sl_str_off = sl_str - sub_top->str_idx;
2679           /* The matched string by the sub expression match with the substring
2680              at the back reference?  */
2681           if (sl_str_off > 0)
2682             {
2683               if (BE (bkref_str_off >= mctx->input.valid_len, 0))
2684                 {
2685                   /* If we are at the end of the input, we cannot match.  */
2686                   if (bkref_str_off >= mctx->input.len)
2687                     break;
2688
2689                   err = extend_buffers (mctx);
2690                   if (BE (err != REG_NOERROR, 0))
2691                     return err;
2692
2693                   buf = (const char *) re_string_get_buffer (&mctx->input);
2694                 }
2695               if (buf [bkref_str_off++] != buf[sl_str - 1])
2696                 break; /* We don't need to search this sub expression
2697                           any more.  */
2698             }
2699           if (mctx->state_log[sl_str] == NULL)
2700             continue;
2701           /* Does this state have a ')' of the sub expression?  */
2702           nodes = &mctx->state_log[sl_str]->nodes;
2703           cls_node = find_subexp_node (dfa, nodes, subexp_num, OP_CLOSE_SUBEXP);
2704           if (cls_node == -1)
2705             continue; /* No.  */
2706           if (sub_top->path == NULL)
2707             {
2708               sub_top->path = calloc (sizeof (state_array_t),
2709                                       sl_str - sub_top->str_idx + 1);
2710               if (sub_top->path == NULL)
2711                 return REG_ESPACE;
2712             }
2713           /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2714              in the current context?  */
2715           err = check_arrival (mctx, sub_top->path, sub_top->node,
2716                                sub_top->str_idx, cls_node, sl_str, OP_CLOSE_SUBEXP);
2717           if (err == REG_NOMATCH)
2718               continue;
2719           if (BE (err != REG_NOERROR, 0))
2720               return err;
2721           sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2722           if (BE (sub_last == NULL, 0))
2723             return REG_ESPACE;
2724           err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2725                                 bkref_str_idx);
2726           if (err == REG_NOMATCH)
2727             continue;
2728         }
2729     }
2730   return REG_NOERROR;
2731 }
2732
2733 /* Helper functions for get_subexp().  */
2734
2735 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2736    If it can arrive, register the sub expression expressed with SUB_TOP
2737    and SUB_LAST.  */
2738
2739 static reg_errcode_t
2740 internal_function
2741 get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top,
2742                 re_sub_match_last_t *sub_last, int bkref_node, int bkref_str)
2743 {
2744   reg_errcode_t err;
2745   int to_idx;
2746   /* Can the subexpression arrive the back reference?  */
2747   err = check_arrival (mctx, &sub_last->path, sub_last->node,
2748                        sub_last->str_idx, bkref_node, bkref_str, OP_OPEN_SUBEXP);
2749   if (err != REG_NOERROR)
2750     return err;
2751   err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2752                              sub_last->str_idx);
2753   if (BE (err != REG_NOERROR, 0))
2754     return err;
2755   to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2756   return clean_state_log_if_needed (mctx, to_idx);
2757 }
2758
2759 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2760    Search '(' if FL_OPEN, or search ')' otherwise.
2761    TODO: This function isn't efficient...
2762          Because there might be more than one nodes whose types are
2763          OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2764          nodes.
2765          E.g. RE: (a){2}  */
2766
2767 static int
2768 internal_function
2769 find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
2770                   int subexp_idx, int type)
2771 {
2772   int cls_idx;
2773   for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2774     {
2775       int cls_node = nodes->elems[cls_idx];
2776       const re_token_t *node = dfa->nodes + cls_node;
2777       if (node->type == type
2778           && node->opr.idx == subexp_idx)
2779         return cls_node;
2780     }
2781   return -1;
2782 }
2783
2784 /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2785    LAST_NODE at LAST_STR.  We record the path onto PATH since it will be
2786    heavily reused.
2787    Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise.  */
2788
2789 static reg_errcode_t
2790 internal_function
2791 check_arrival (re_match_context_t *mctx, state_array_t *path,
2792                int top_node, int top_str, int last_node, int last_str,
2793                int type)
2794 {
2795   re_dfa_t *const dfa = mctx->dfa;
2796   reg_errcode_t err;
2797   int subexp_num, backup_cur_idx, str_idx, null_cnt;
2798   re_dfastate_t *cur_state = NULL;
2799   re_node_set *cur_nodes, next_nodes;
2800   re_dfastate_t **backup_state_log;
2801   unsigned int context;
2802
2803   subexp_num = dfa->nodes[top_node].opr.idx;
2804   /* Extend the buffer if we need.  */
2805   if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
2806     {
2807       re_dfastate_t **new_array;
2808       int old_alloc = path->alloc;
2809       path->alloc += last_str + mctx->max_mb_elem_len + 1;
2810       new_array = re_realloc (path->array, re_dfastate_t *, path->alloc);
2811       if (new_array == NULL)
2812         {
2813           path->alloc = old_alloc;
2814           return REG_ESPACE;
2815         }
2816       path->array = new_array;
2817       memset (new_array + old_alloc, '\0',
2818               sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2819     }
2820
2821   str_idx = path->next_idx == 0 ? top_str : path->next_idx;
2822
2823   /* Temporary modify MCTX.  */
2824   backup_state_log = mctx->state_log;
2825   backup_cur_idx = mctx->input.cur_idx;
2826   mctx->state_log = path->array;
2827   mctx->input.cur_idx = str_idx;
2828
2829   /* Setup initial node set.  */
2830   context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2831   if (str_idx == top_str)
2832     {
2833       err = re_node_set_init_1 (&next_nodes, top_node);
2834       if (BE (err != REG_NOERROR, 0))
2835         return err;
2836       err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2837       if (BE (err != REG_NOERROR, 0))
2838         {
2839           re_node_set_free (&next_nodes);
2840           return err;
2841         }
2842     }
2843   else
2844     {
2845       cur_state = mctx->state_log[str_idx];
2846       if (cur_state && cur_state->has_backref)
2847         {
2848           err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2849           if (BE ( err != REG_NOERROR, 0))
2850             return err;
2851         }
2852       else
2853         re_node_set_init_empty (&next_nodes);
2854     }
2855   if (str_idx == top_str || (cur_state && cur_state->has_backref))
2856     {
2857       if (next_nodes.nelem)
2858         {
2859           err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2860                                     subexp_num, type);
2861           if (BE ( err != REG_NOERROR, 0))
2862             {
2863               re_node_set_free (&next_nodes);
2864               return err;
2865             }
2866         }
2867       cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2868       if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2869         {
2870           re_node_set_free (&next_nodes);
2871           return err;
2872         }
2873       mctx->state_log[str_idx] = cur_state;
2874     }
2875
2876   for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2877     {
2878       re_node_set_empty (&next_nodes);
2879       if (mctx->state_log[str_idx + 1])
2880         {
2881           err = re_node_set_merge (&next_nodes,
2882                                    &mctx->state_log[str_idx + 1]->nodes);
2883           if (BE (err != REG_NOERROR, 0))
2884             {
2885               re_node_set_free (&next_nodes);
2886               return err;
2887             }
2888         }
2889       if (cur_state)
2890         {
2891           err = check_arrival_add_next_nodes (mctx, str_idx,
2892                                               &cur_state->non_eps_nodes, &next_nodes);
2893           if (BE (err != REG_NOERROR, 0))
2894             {
2895               re_node_set_free (&next_nodes);
2896               return err;
2897             }
2898         }
2899       ++str_idx;
2900       if (next_nodes.nelem)
2901         {
2902           err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2903           if (BE (err != REG_NOERROR, 0))
2904             {
2905               re_node_set_free (&next_nodes);
2906               return err;
2907             }
2908           err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2909                                     subexp_num, type);
2910           if (BE ( err != REG_NOERROR, 0))
2911             {
2912               re_node_set_free (&next_nodes);
2913               return err;
2914             }
2915         }
2916       context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2917       cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2918       if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2919         {
2920           re_node_set_free (&next_nodes);
2921           return err;
2922         }
2923       mctx->state_log[str_idx] = cur_state;
2924       null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
2925     }
2926   re_node_set_free (&next_nodes);
2927   cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
2928                : &mctx->state_log[last_str]->nodes);
2929   path->next_idx = str_idx;
2930
2931   /* Fix MCTX.  */
2932   mctx->state_log = backup_state_log;
2933   mctx->input.cur_idx = backup_cur_idx;
2934
2935   /* Then check the current node set has the node LAST_NODE.  */
2936   if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
2937     return REG_NOERROR;
2938
2939   return REG_NOMATCH;
2940 }
2941
2942 /* Helper functions for check_arrival.  */
2943
2944 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
2945    to NEXT_NODES.
2946    TODO: This function is similar to the functions transit_state*(),
2947          however this function has many additional works.
2948          Can't we unify them?  */
2949
2950 static reg_errcode_t
2951 internal_function
2952 check_arrival_add_next_nodes (re_match_context_t *mctx, int str_idx,
2953                               re_node_set *cur_nodes,
2954                               re_node_set *next_nodes)
2955 {
2956   re_dfa_t *const dfa = mctx->dfa;
2957   int result;
2958   int cur_idx;
2959   reg_errcode_t err;
2960   re_node_set union_set;
2961   re_node_set_init_empty (&union_set);
2962   for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
2963     {
2964       int naccepted = 0;
2965       int cur_node = cur_nodes->elems[cur_idx];
2966 #ifdef DEBUG
2967       re_token_type_t type = dfa->nodes[cur_node].type;
2968       assert (!IS_EPSILON_NODE (type));
2969 #endif
2970 #ifdef RE_ENABLE_I18N
2971       /* If the node may accept `multi byte'.  */
2972       if (dfa->nodes[cur_node].accept_mb)
2973         {
2974           naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
2975                                                str_idx);
2976           if (naccepted > 1)
2977             {
2978               re_dfastate_t *dest_state;
2979               int next_node = dfa->nexts[cur_node];
2980               int next_idx = str_idx + naccepted;
2981               dest_state = mctx->state_log[next_idx];
2982               re_node_set_empty (&union_set);
2983               if (dest_state)
2984                 {
2985                   err = re_node_set_merge (&union_set, &dest_state->nodes);
2986                   if (BE (err != REG_NOERROR, 0))
2987                     {
2988                       re_node_set_free (&union_set);
2989                       return err;
2990                     }
2991                 }
2992               result = re_node_set_insert (&union_set, next_node);
2993               if (BE (result < 0, 0))
2994                 {
2995                   re_node_set_free (&union_set);
2996                   return REG_ESPACE;
2997                 }
2998               mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
2999                                                             &union_set);
3000               if (BE (mctx->state_log[next_idx] == NULL
3001                       && err != REG_NOERROR, 0))
3002                 {
3003                   re_node_set_free (&union_set);
3004                   return err;
3005                 }
3006             }
3007         }
3008 #endif /* RE_ENABLE_I18N */
3009       if (naccepted
3010           || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3011         {
3012           result = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3013           if (BE (result < 0, 0))
3014             {
3015               re_node_set_free (&union_set);
3016               return REG_ESPACE;
3017             }
3018         }
3019     }
3020   re_node_set_free (&union_set);
3021   return REG_NOERROR;
3022 }
3023
3024 /* For all the nodes in CUR_NODES, add the epsilon closures of them to
3025    CUR_NODES, however exclude the nodes which are:
3026     - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3027     - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3028 */
3029
3030 static reg_errcode_t
3031 internal_function
3032 check_arrival_expand_ecl (re_dfa_t *dfa, re_node_set *cur_nodes,
3033                           int ex_subexp, int type)
3034 {
3035   reg_errcode_t err;
3036   int idx, outside_node;
3037   re_node_set new_nodes;
3038 #ifdef DEBUG
3039   assert (cur_nodes->nelem);
3040 #endif
3041   err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3042   if (BE (err != REG_NOERROR, 0))
3043     return err;
3044   /* Create a new node set NEW_NODES with the nodes which are epsilon
3045      closures of the node in CUR_NODES.  */
3046
3047   for (idx = 0; idx < cur_nodes->nelem; ++idx)
3048     {
3049       int cur_node = cur_nodes->elems[idx];
3050       re_node_set *eclosure = dfa->eclosures + cur_node;
3051       outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3052       if (outside_node == -1)
3053         {
3054           /* There are no problematic nodes, just merge them.  */
3055           err = re_node_set_merge (&new_nodes, eclosure);
3056           if (BE (err != REG_NOERROR, 0))
3057             {
3058               re_node_set_free (&new_nodes);
3059               return err;
3060             }
3061         }
3062       else
3063         {
3064           /* There are problematic nodes, re-calculate incrementally.  */
3065           err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3066                                               ex_subexp, type);
3067           if (BE (err != REG_NOERROR, 0))
3068             {
3069               re_node_set_free (&new_nodes);
3070               return err;
3071             }
3072         }
3073     }
3074   re_node_set_free (cur_nodes);
3075   *cur_nodes = new_nodes;
3076   return REG_NOERROR;
3077 }
3078
3079 /* Helper function for check_arrival_expand_ecl.
3080    Check incrementally the epsilon closure of TARGET, and if it isn't
3081    problematic append it to DST_NODES.  */
3082
3083 static reg_errcode_t
3084 internal_function
3085 check_arrival_expand_ecl_sub (re_dfa_t *dfa, re_node_set *dst_nodes,
3086                               int target, int ex_subexp, int type)
3087 {
3088   int cur_node;
3089   for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3090     {
3091       int err;
3092
3093       if (dfa->nodes[cur_node].type == type
3094           && dfa->nodes[cur_node].opr.idx == ex_subexp)
3095         {
3096           if (type == OP_CLOSE_SUBEXP)
3097             {
3098               err = re_node_set_insert (dst_nodes, cur_node);
3099               if (BE (err == -1, 0))
3100                 return REG_ESPACE;
3101             }
3102           break;
3103         }
3104       err = re_node_set_insert (dst_nodes, cur_node);
3105       if (BE (err == -1, 0))
3106         return REG_ESPACE;
3107       if (dfa->edests[cur_node].nelem == 0)
3108         break;
3109       if (dfa->edests[cur_node].nelem == 2)
3110         {
3111           err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3112                                               dfa->edests[cur_node].elems[1],
3113                                               ex_subexp, type);
3114           if (BE (err != REG_NOERROR, 0))
3115             return err;
3116         }
3117       cur_node = dfa->edests[cur_node].elems[0];
3118     }
3119   return REG_NOERROR;
3120 }
3121
3122
3123 /* For all the back references in the current state, calculate the
3124    destination of the back references by the appropriate entry
3125    in MCTX->BKREF_ENTS.  */
3126
3127 static reg_errcode_t
3128 internal_function
3129 expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes,
3130                     int cur_str, int subexp_num, int type)
3131 {
3132   re_dfa_t *const dfa = mctx->dfa;
3133   reg_errcode_t err;
3134   int cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3135   struct re_backref_cache_entry *ent;
3136
3137   if (cache_idx_start == -1)
3138     return REG_NOERROR;
3139
3140  restart:
3141   ent = mctx->bkref_ents + cache_idx_start;
3142   do
3143     {
3144       int to_idx, next_node;
3145
3146       /* Is this entry ENT is appropriate?  */
3147       if (!re_node_set_contains (cur_nodes, ent->node))
3148         continue; /* No.  */
3149
3150       to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3151       /* Calculate the destination of the back reference, and append it
3152          to MCTX->STATE_LOG.  */
3153       if (to_idx == cur_str)
3154         {
3155           /* The backreference did epsilon transit, we must re-check all the
3156              node in the current state.  */
3157           re_node_set new_dests;
3158           reg_errcode_t err2, err3;
3159           next_node = dfa->edests[ent->node].elems[0];
3160           if (re_node_set_contains (cur_nodes, next_node))
3161             continue;
3162           err = re_node_set_init_1 (&new_dests, next_node);
3163           err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3164           err3 = re_node_set_merge (cur_nodes, &new_dests);
3165           re_node_set_free (&new_dests);
3166           if (BE (err != REG_NOERROR || err2 != REG_NOERROR
3167                   || err3 != REG_NOERROR, 0))
3168             {
3169               err = (err != REG_NOERROR ? err
3170                      : (err2 != REG_NOERROR ? err2 : err3));
3171               return err;
3172             }
3173           /* TODO: It is still inefficient...  */
3174           goto restart;
3175         }
3176       else
3177         {
3178           re_node_set union_set;
3179           next_node = dfa->nexts[ent->node];
3180           if (mctx->state_log[to_idx])
3181             {
3182               int ret;
3183               if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3184                                         next_node))
3185                 continue;
3186               err = re_node_set_init_copy (&union_set,
3187                                            &mctx->state_log[to_idx]->nodes);
3188               ret = re_node_set_insert (&union_set, next_node);
3189               if (BE (err != REG_NOERROR || ret < 0, 0))
3190                 {
3191                   re_node_set_free (&union_set);
3192                   err = err != REG_NOERROR ? err : REG_ESPACE;
3193                   return err;
3194                 }
3195             }
3196           else
3197             {
3198               err = re_node_set_init_1 (&union_set, next_node);
3199               if (BE (err != REG_NOERROR, 0))
3200                 return err;
3201             }
3202           mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3203           re_node_set_free (&union_set);
3204           if (BE (mctx->state_log[to_idx] == NULL
3205                   && err != REG_NOERROR, 0))
3206             return err;
3207         }
3208     }
3209   while (ent++->more);
3210   return REG_NOERROR;
3211 }
3212
3213 /* Build transition table for the state.
3214    Return 1 if succeeded, otherwise return NULL.  */
3215
3216 static int
3217 internal_function
3218 build_trtable (re_dfa_t *dfa, re_dfastate_t *state)
3219 {
3220   reg_errcode_t err;
3221   int i, j, ch, need_word_trtable = 0;
3222   unsigned int elem, mask;
3223   int dests_node_malloced = 0, dest_states_malloced = 0;
3224   int ndests; /* Number of the destination states from `state'.  */
3225   re_dfastate_t **trtable;
3226   re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3227   re_node_set follows, *dests_node;
3228   bitset *dests_ch;
3229   bitset acceptable;
3230
3231   /* We build DFA states which corresponds to the destination nodes
3232      from `state'.  `dests_node[i]' represents the nodes which i-th
3233      destination state contains, and `dests_ch[i]' represents the
3234      characters which i-th destination state accepts.  */
3235 #ifdef _LIBC
3236   if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX))
3237     dests_node = (re_node_set *)
3238       alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX);
3239   else
3240 #endif
3241     {
3242       dests_node = (re_node_set *)
3243         malloc ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX);
3244       if (BE (dests_node == NULL, 0))
3245         return 0;
3246       dests_node_malloced = 1;
3247     }
3248   dests_ch = (bitset *) (dests_node + SBC_MAX);
3249
3250   /* Initialize transiton table.  */
3251   state->word_trtable = state->trtable = NULL;
3252
3253   /* At first, group all nodes belonging to `state' into several
3254      destinations.  */
3255   ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3256   if (BE (ndests <= 0, 0))
3257     {
3258       if (dests_node_malloced)
3259         free (dests_node);
3260       /* Return 0 in case of an error, 1 otherwise.  */
3261       if (ndests == 0)
3262         {
3263           state->trtable = (re_dfastate_t **)
3264             calloc (sizeof (re_dfastate_t *), SBC_MAX);
3265           return 1;
3266         }
3267       return 0;
3268     }
3269
3270   err = re_node_set_alloc (&follows, ndests + 1);
3271   if (BE (err != REG_NOERROR, 0))
3272     goto out_free;
3273
3274 #ifdef _LIBC
3275   if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX
3276                          + ndests * 3 * sizeof (re_dfastate_t *)))
3277     dest_states = (re_dfastate_t **)
3278       alloca (ndests * 3 * sizeof (re_dfastate_t *));
3279   else
3280 #endif
3281     {
3282       dest_states = (re_dfastate_t **)
3283         malloc (ndests * 3 * sizeof (re_dfastate_t *));
3284       if (BE (dest_states == NULL, 0))
3285         {
3286 out_free:
3287           if (dest_states_malloced)
3288             free (dest_states);
3289           re_node_set_free (&follows);
3290           for (i = 0; i < ndests; ++i)
3291             re_node_set_free (dests_node + i);
3292           if (dests_node_malloced)
3293             free (dests_node);
3294           return 0;
3295         }
3296       dest_states_malloced = 1;
3297     }
3298   dest_states_word = dest_states + ndests;
3299   dest_states_nl = dest_states_word + ndests;
3300   bitset_empty (acceptable);
3301
3302   /* Then build the states for all destinations.  */
3303   for (i = 0; i < ndests; ++i)
3304     {
3305       int next_node;
3306       re_node_set_empty (&follows);
3307       /* Merge the follows of this destination states.  */
3308       for (j = 0; j < dests_node[i].nelem; ++j)
3309         {
3310           next_node = dfa->nexts[dests_node[i].elems[j]];
3311           if (next_node != -1)
3312             {
3313               err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3314               if (BE (err != REG_NOERROR, 0))
3315                 goto out_free;
3316             }
3317         }
3318       dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3319       if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
3320         goto out_free;
3321       /* If the new state has context constraint,
3322          build appropriate states for these contexts.  */
3323       if (dest_states[i]->has_constraint)
3324         {
3325           dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3326                                                           CONTEXT_WORD);
3327           if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
3328             goto out_free;
3329
3330           if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
3331             need_word_trtable = 1;
3332
3333           dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3334                                                         CONTEXT_NEWLINE);
3335           if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
3336             goto out_free;
3337         }
3338       else
3339         {
3340           dest_states_word[i] = dest_states[i];
3341           dest_states_nl[i] = dest_states[i];
3342         }
3343       bitset_merge (acceptable, dests_ch[i]);
3344     }
3345
3346   if (!BE (need_word_trtable, 0))
3347     {
3348       /* We don't care about whether the following character is a word
3349          character, or we are in a single-byte character set so we can
3350          discern by looking at the character code: allocate a
3351          256-entry transition table.  */
3352       trtable = state->trtable =
3353         (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
3354       if (BE (trtable == NULL, 0))
3355         goto out_free;
3356
3357       /* For all characters ch...:  */
3358       for (i = 0; i < BITSET_UINTS; ++i)
3359         for (ch = i * UINT_BITS, elem = acceptable[i], mask = 1;
3360              elem;
3361              mask <<= 1, elem >>= 1, ++ch)
3362           if (BE (elem & 1, 0))
3363             {
3364               /* There must be exactly one destination which accepts
3365                  character ch.  See group_nodes_into_DFAstates.  */
3366               for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3367                 ;
3368
3369               /* j-th destination accepts the word character ch.  */
3370               if (dfa->word_char[i] & mask)
3371                 trtable[ch] = dest_states_word[j];
3372               else
3373                 trtable[ch] = dest_states[j];
3374             }
3375     }
3376   else
3377     {
3378       /* We care about whether the following character is a word
3379          character, and we are in a multi-byte character set: discern
3380          by looking at the character code: build two 256-entry
3381          transition tables, one starting at trtable[0] and one
3382          starting at trtable[SBC_MAX].  */
3383       trtable = state->word_trtable =
3384         (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX);
3385       if (BE (trtable == NULL, 0))
3386         goto out_free;
3387
3388       /* For all characters ch...:  */
3389       for (i = 0; i < BITSET_UINTS; ++i)
3390         for (ch = i * UINT_BITS, elem = acceptable[i], mask = 1;
3391              elem;
3392              mask <<= 1, elem >>= 1, ++ch)
3393           if (BE (elem & 1, 0))
3394             {
3395               /* There must be exactly one destination which accepts
3396                  character ch.  See group_nodes_into_DFAstates.  */
3397               for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3398                 ;
3399
3400               /* j-th destination accepts the word character ch.  */
3401               trtable[ch] = dest_states[j];
3402               trtable[ch + SBC_MAX] = dest_states_word[j];
3403             }
3404     }
3405
3406   /* new line */
3407   if (bitset_contain (acceptable, NEWLINE_CHAR))
3408     {
3409       /* The current state accepts newline character.  */
3410       for (j = 0; j < ndests; ++j)
3411         if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3412           {
3413             /* k-th destination accepts newline character.  */
3414             trtable[NEWLINE_CHAR] = dest_states_nl[j];
3415             if (need_word_trtable)
3416               trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3417             /* There must be only one destination which accepts
3418                newline.  See group_nodes_into_DFAstates.  */
3419             break;
3420           }
3421     }
3422
3423   if (dest_states_malloced)
3424     free (dest_states);
3425
3426   re_node_set_free (&follows);
3427   for (i = 0; i < ndests; ++i)
3428     re_node_set_free (dests_node + i);
3429
3430   if (dests_node_malloced)
3431     free (dests_node);
3432
3433   return 1;
3434 }
3435
3436 /* Group all nodes belonging to STATE into several destinations.
3437    Then for all destinations, set the nodes belonging to the destination
3438    to DESTS_NODE[i] and set the characters accepted by the destination
3439    to DEST_CH[i].  This function return the number of destinations.  */
3440
3441 static int
3442 internal_function
3443 group_nodes_into_DFAstates (re_dfa_t *dfa, const re_dfastate_t *state,
3444                             re_node_set *dests_node, bitset *dests_ch)
3445 {
3446   reg_errcode_t err;
3447   int result;
3448   int i, j, k;
3449   int ndests; /* Number of the destinations from `state'.  */
3450   bitset accepts; /* Characters a node can accept.  */
3451   const re_node_set *cur_nodes = &state->nodes;
3452   bitset_empty (accepts);
3453   ndests = 0;
3454
3455   /* For all the nodes belonging to `state',  */
3456   for (i = 0; i < cur_nodes->nelem; ++i)
3457     {
3458       re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3459       re_token_type_t type = node->type;
3460       unsigned int constraint = node->constraint;
3461
3462       /* Enumerate all single byte character this node can accept.  */
3463       if (type == CHARACTER)
3464         bitset_set (accepts, node->opr.c);
3465       else if (type == SIMPLE_BRACKET)
3466         {
3467           bitset_merge (accepts, node->opr.sbcset);
3468         }
3469       else if (type == OP_PERIOD)
3470         {
3471 #ifdef RE_ENABLE_I18N
3472           if (dfa->mb_cur_max > 1)
3473             bitset_merge (accepts, dfa->sb_char);
3474           else
3475 #endif
3476             bitset_set_all (accepts);
3477           if (!(dfa->syntax & RE_DOT_NEWLINE))
3478             bitset_clear (accepts, '\n');
3479           if (dfa->syntax & RE_DOT_NOT_NULL)
3480             bitset_clear (accepts, '\0');
3481         }
3482 #ifdef RE_ENABLE_I18N
3483       else if (type == OP_UTF8_PERIOD)
3484         {
3485           memset (accepts, 255, sizeof (unsigned int) * BITSET_UINTS / 2);
3486           if (!(dfa->syntax & RE_DOT_NEWLINE))
3487             bitset_clear (accepts, '\n');
3488           if (dfa->syntax & RE_DOT_NOT_NULL)
3489             bitset_clear (accepts, '\0');
3490         }
3491 #endif
3492       else
3493         continue;
3494
3495       /* Check the `accepts' and sift the characters which are not
3496          match it the context.  */
3497       if (constraint)
3498         {
3499           if (constraint & NEXT_NEWLINE_CONSTRAINT)
3500             {
3501               int accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3502               bitset_empty (accepts);
3503               if (accepts_newline)
3504                 bitset_set (accepts, NEWLINE_CHAR);
3505               else
3506                 continue;
3507             }
3508           if (constraint & NEXT_ENDBUF_CONSTRAINT)
3509             {
3510               bitset_empty (accepts);
3511               continue;
3512             }
3513
3514           if (constraint & NEXT_WORD_CONSTRAINT)
3515             {
3516               unsigned int any_set = 0;
3517               if (type == CHARACTER && !node->word_char)
3518                 {
3519                   bitset_empty (accepts);
3520                   continue;
3521                 }
3522 #ifdef RE_ENABLE_I18N
3523               if (dfa->mb_cur_max > 1)
3524                 for (j = 0; j < BITSET_UINTS; ++j)
3525                   any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3526               else
3527 #endif
3528                 for (j = 0; j < BITSET_UINTS; ++j)
3529                   any_set |= (accepts[j] &= dfa->word_char[j]);
3530               if (!any_set)
3531                 continue;
3532             }
3533           if (constraint & NEXT_NOTWORD_CONSTRAINT)
3534             {
3535               unsigned int any_set = 0;
3536               if (type == CHARACTER && node->word_char)
3537                 {
3538                   bitset_empty (accepts);
3539                   continue;
3540                 }
3541 #ifdef RE_ENABLE_I18N
3542               if (dfa->mb_cur_max > 1)
3543                 for (j = 0; j < BITSET_UINTS; ++j)
3544                   any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3545               else
3546 #endif
3547                 for (j = 0; j < BITSET_UINTS; ++j)
3548                   any_set |= (accepts[j] &= ~dfa->word_char[j]);
3549               if (!any_set)
3550                 continue;
3551             }
3552         }
3553
3554       /* Then divide `accepts' into DFA states, or create a new
3555          state.  Above, we make sure that accepts is not empty.  */
3556       for (j = 0; j < ndests; ++j)
3557         {
3558           bitset intersec; /* Intersection sets, see below.  */
3559           bitset remains;
3560           /* Flags, see below.  */
3561           int has_intersec, not_subset, not_consumed;
3562
3563           /* Optimization, skip if this state doesn't accept the character.  */
3564           if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3565             continue;
3566
3567           /* Enumerate the intersection set of this state and `accepts'.  */
3568           has_intersec = 0;
3569           for (k = 0; k < BITSET_UINTS; ++k)
3570             has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3571           /* And skip if the intersection set is empty.  */
3572           if (!has_intersec)
3573             continue;
3574
3575           /* Then check if this state is a subset of `accepts'.  */
3576           not_subset = not_consumed = 0;
3577           for (k = 0; k < BITSET_UINTS; ++k)
3578             {
3579               not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3580               not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3581             }
3582
3583           /* If this state isn't a subset of `accepts', create a
3584              new group state, which has the `remains'. */
3585           if (not_subset)
3586             {
3587               bitset_copy (dests_ch[ndests], remains);
3588               bitset_copy (dests_ch[j], intersec);
3589               err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3590               if (BE (err != REG_NOERROR, 0))
3591                 goto error_return;
3592               ++ndests;
3593             }
3594
3595           /* Put the position in the current group. */
3596           result = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3597           if (BE (result < 0, 0))
3598             goto error_return;
3599
3600           /* If all characters are consumed, go to next node. */
3601           if (!not_consumed)
3602             break;
3603         }
3604       /* Some characters remain, create a new group. */
3605       if (j == ndests)
3606         {
3607           bitset_copy (dests_ch[ndests], accepts);
3608           err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3609           if (BE (err != REG_NOERROR, 0))
3610             goto error_return;
3611           ++ndests;
3612           bitset_empty (accepts);
3613         }
3614     }
3615   return ndests;
3616  error_return:
3617   for (j = 0; j < ndests; ++j)
3618     re_node_set_free (dests_node + j);
3619   return -1;
3620 }
3621
3622 #ifdef RE_ENABLE_I18N
3623 /* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3624    Return the number of the bytes the node accepts.
3625    STR_IDX is the current index of the input string.
3626
3627    This function handles the nodes which can accept one character, or
3628    one collating element like '.', '[a-z]', opposite to the other nodes
3629    can only accept one byte.  */
3630
3631 static int
3632 internal_function
3633 check_node_accept_bytes (re_dfa_t *dfa, int node_idx,
3634                          const re_string_t *input, int str_idx)
3635 {
3636   const re_token_t *node = dfa->nodes + node_idx;
3637   int char_len, elem_len;
3638   int i;
3639
3640   if (BE (node->type == OP_UTF8_PERIOD, 0))
3641     {
3642       unsigned char c = re_string_byte_at (input, str_idx), d;
3643       if (BE (c < 0xc2, 1))
3644         return 0;
3645
3646       if (str_idx + 2 > input->len)
3647         return 0;
3648
3649       d = re_string_byte_at (input, str_idx + 1);
3650       if (c < 0xe0)
3651         return (d < 0x80 || d > 0xbf) ? 0 : 2;
3652       else if (c < 0xf0)
3653         {
3654           char_len = 3;
3655           if (c == 0xe0 && d < 0xa0)
3656             return 0;
3657         }
3658       else if (c < 0xf8)
3659         {
3660           char_len = 4;
3661           if (c == 0xf0 && d < 0x90)
3662             return 0;
3663         }
3664       else if (c < 0xfc)
3665         {
3666           char_len = 5;
3667           if (c == 0xf8 && d < 0x88)
3668             return 0;
3669         }
3670       else if (c < 0xfe)
3671         {
3672           char_len = 6;
3673           if (c == 0xfc && d < 0x84)
3674             return 0;
3675         }
3676       else
3677         return 0;
3678
3679       if (str_idx + char_len > input->len)
3680         return 0;
3681
3682       for (i = 1; i < char_len; ++i)
3683         {
3684           d = re_string_byte_at (input, str_idx + i);
3685           if (d < 0x80 || d > 0xbf)
3686             return 0;
3687         }
3688       return char_len;
3689     }
3690
3691   char_len = re_string_char_size_at (input, str_idx);
3692   if (node->type == OP_PERIOD)
3693     {
3694       if (char_len <= 1)
3695         return 0;
3696       /* FIXME: I don't think this if is needed, as both '\n'
3697          and '\0' are char_len == 1.  */
3698       /* '.' accepts any one character except the following two cases.  */
3699       if ((!(dfa->syntax & RE_DOT_NEWLINE) &&
3700            re_string_byte_at (input, str_idx) == '\n') ||
3701           ((dfa->syntax & RE_DOT_NOT_NULL) &&
3702            re_string_byte_at (input, str_idx) == '\0'))
3703         return 0;
3704       return char_len;
3705     }
3706
3707   elem_len = re_string_elem_size_at (input, str_idx);
3708   if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
3709     return 0;
3710
3711   if (node->type == COMPLEX_BRACKET)
3712     {
3713       const re_charset_t *cset = node->opr.mbcset;
3714 # ifdef _LIBC
3715       const unsigned char *pin
3716         = ((const unsigned char *) re_string_get_buffer (input) + str_idx);
3717       int j;
3718       uint32_t nrules;
3719 # endif /* _LIBC */
3720       int match_len = 0;
3721       wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3722                     ? re_string_wchar_at (input, str_idx) : 0);
3723
3724       /* match with multibyte character?  */
3725       for (i = 0; i < cset->nmbchars; ++i)
3726         if (wc == cset->mbchars[i])
3727           {
3728             match_len = char_len;
3729             goto check_node_accept_bytes_match;
3730           }
3731       /* match with character_class?  */
3732       for (i = 0; i < cset->nchar_classes; ++i)
3733         {
3734           wctype_t wt = cset->char_classes[i];
3735           if (__iswctype (wc, wt))
3736             {
3737               match_len = char_len;
3738               goto check_node_accept_bytes_match;
3739             }
3740         }
3741
3742 # ifdef _LIBC
3743       nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3744       if (nrules != 0)
3745         {
3746           unsigned int in_collseq = 0;
3747           const int32_t *table, *indirect;
3748           const unsigned char *weights, *extra;
3749           const char *collseqwc;
3750           int32_t idx;
3751           /* This #include defines a local function!  */
3752 #  include <locale/weight.h>
3753
3754           /* match with collating_symbol?  */
3755           if (cset->ncoll_syms)
3756             extra = (const unsigned char *)
3757               _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3758           for (i = 0; i < cset->ncoll_syms; ++i)
3759             {
3760               const unsigned char *coll_sym = extra + cset->coll_syms[i];
3761               /* Compare the length of input collating element and
3762                  the length of current collating element.  */
3763               if (*coll_sym != elem_len)
3764                 continue;
3765               /* Compare each bytes.  */
3766               for (j = 0; j < *coll_sym; j++)
3767                 if (pin[j] != coll_sym[1 + j])
3768                   break;
3769               if (j == *coll_sym)
3770                 {
3771                   /* Match if every bytes is equal.  */
3772                   match_len = j;
3773                   goto check_node_accept_bytes_match;
3774                 }
3775             }
3776
3777           if (cset->nranges)
3778             {
3779               if (elem_len <= char_len)
3780                 {
3781                   collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3782                   in_collseq = __collseq_table_lookup (collseqwc, wc);
3783                 }
3784               else
3785                 in_collseq = find_collation_sequence_value (pin, elem_len);
3786             }
3787           /* match with range expression?  */
3788           for (i = 0; i < cset->nranges; ++i)
3789             if (cset->range_starts[i] <= in_collseq
3790                 && in_collseq <= cset->range_ends[i])
3791               {
3792                 match_len = elem_len;
3793                 goto check_node_accept_bytes_match;
3794               }
3795
3796           /* match with equivalence_class?  */
3797           if (cset->nequiv_classes)
3798             {
3799               const unsigned char *cp = pin;
3800               table = (const int32_t *)
3801                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3802               weights = (const unsigned char *)
3803                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3804               extra = (const unsigned char *)
3805                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3806               indirect = (const int32_t *)
3807                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3808               idx = findidx (&cp);
3809               if (idx > 0)
3810                 for (i = 0; i < cset->nequiv_classes; ++i)
3811                   {
3812                     int32_t equiv_class_idx = cset->equiv_classes[i];
3813                     size_t weight_len = weights[idx];
3814                     if (weight_len == weights[equiv_class_idx])
3815                       {
3816                         int cnt = 0;
3817                         while (cnt <= weight_len
3818                                && (weights[equiv_class_idx + 1 + cnt]
3819                                    == weights[idx + 1 + cnt]))
3820                           ++cnt;
3821                         if (cnt > weight_len)
3822                           {
3823                             match_len = elem_len;
3824                             goto check_node_accept_bytes_match;
3825                           }
3826                       }
3827                   }
3828             }
3829         }
3830       else
3831 # endif /* _LIBC */
3832         {
3833           /* match with range expression?  */
3834 #if __GNUC__ >= 2
3835           wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'};
3836 #else
3837           wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
3838           cmp_buf[2] = wc;
3839 #endif
3840           for (i = 0; i < cset->nranges; ++i)
3841             {
3842               cmp_buf[0] = cset->range_starts[i];
3843               cmp_buf[4] = cset->range_ends[i];
3844               if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
3845                   && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
3846                 {
3847                   match_len = char_len;
3848                   goto check_node_accept_bytes_match;
3849                 }
3850             }
3851         }
3852     check_node_accept_bytes_match:
3853       if (!cset->non_match)
3854         return match_len;
3855       else
3856         {
3857           if (match_len > 0)
3858             return 0;
3859           else
3860             return (elem_len > char_len) ? elem_len : char_len;
3861         }
3862     }
3863   return 0;
3864 }
3865
3866 # ifdef _LIBC
3867 static unsigned int
3868 find_collation_sequence_value (mbs, mbs_len)
3869     const unsigned char *mbs;
3870     size_t mbs_len;
3871 {
3872   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3873   if (nrules == 0)
3874     {
3875       if (mbs_len == 1)
3876         {
3877           /* No valid character.  Match it as a single byte character.  */
3878           const unsigned char *collseq = (const unsigned char *)
3879             _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3880           return collseq[mbs[0]];
3881         }
3882       return UINT_MAX;
3883     }
3884   else
3885     {
3886       int32_t idx;
3887       const unsigned char *extra = (const unsigned char *)
3888         _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3889       int32_t extrasize = (const unsigned char *)
3890         _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
3891
3892       for (idx = 0; idx < extrasize;)
3893         {
3894           int mbs_cnt, found = 0;
3895           int32_t elem_mbs_len;
3896           /* Skip the name of collating element name.  */
3897           idx = idx + extra[idx] + 1;
3898           elem_mbs_len = extra[idx++];
3899           if (mbs_len == elem_mbs_len)
3900             {
3901               for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
3902                 if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
3903                   break;
3904               if (mbs_cnt == elem_mbs_len)
3905                 /* Found the entry.  */
3906                 found = 1;
3907             }
3908           /* Skip the byte sequence of the collating element.  */
3909           idx += elem_mbs_len;
3910           /* Adjust for the alignment.  */
3911           idx = (idx + 3) & ~3;
3912           /* Skip the collation sequence value.  */
3913           idx += sizeof (uint32_t);
3914           /* Skip the wide char sequence of the collating element.  */
3915           idx = idx + sizeof (uint32_t) * (extra[idx] + 1);
3916           /* If we found the entry, return the sequence value.  */
3917           if (found)
3918             return *(uint32_t *) (extra + idx);
3919           /* Skip the collation sequence value.  */
3920           idx += sizeof (uint32_t);
3921         }
3922       return UINT_MAX;
3923     }
3924 }
3925 # endif /* _LIBC */
3926 #endif /* RE_ENABLE_I18N */
3927
3928 /* Check whether the node accepts the byte which is IDX-th
3929    byte of the INPUT.  */
3930
3931 static int
3932 internal_function
3933 check_node_accept (const re_match_context_t *mctx, const re_token_t *node,
3934                    int idx)
3935 {
3936   unsigned char ch;
3937   ch = re_string_byte_at (&mctx->input, idx);
3938   switch (node->type)
3939     {
3940     case CHARACTER:
3941       if (node->opr.c != ch)
3942         return 0;
3943       break;
3944
3945     case SIMPLE_BRACKET:
3946       if (!bitset_contain (node->opr.sbcset, ch))
3947         return 0;
3948       break;
3949
3950 #ifdef RE_ENABLE_I18N
3951     case OP_UTF8_PERIOD:
3952       if (ch >= 0x80)
3953         return 0;
3954       /* FALLTHROUGH */
3955 #endif
3956     case OP_PERIOD:
3957       if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE))
3958           || (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL)))
3959         return 0;
3960       break;
3961
3962     default:
3963       return 0;
3964     }
3965
3966   if (node->constraint)
3967     {
3968       /* The node has constraints.  Check whether the current context
3969          satisfies the constraints.  */
3970       unsigned int context = re_string_context_at (&mctx->input, idx,
3971                                                    mctx->eflags);
3972       if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
3973         return 0;
3974     }
3975
3976   return 1;
3977 }
3978
3979 /* Extend the buffers, if the buffers have run out.  */
3980
3981 static reg_errcode_t
3982 internal_function
3983 extend_buffers (re_match_context_t *mctx)
3984 {
3985   reg_errcode_t ret;
3986   re_string_t *pstr = &mctx->input;
3987
3988   /* Double the lengthes of the buffers.  */
3989   ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
3990   if (BE (ret != REG_NOERROR, 0))
3991     return ret;
3992
3993   if (mctx->state_log != NULL)
3994     {
3995       /* And double the length of state_log.  */
3996       /* XXX We have no indication of the size of this buffer.  If this
3997          allocation fail we have no indication that the state_log array
3998          does not have the right size.  */
3999       re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
4000                                               pstr->bufs_len + 1);
4001       if (BE (new_array == NULL, 0))
4002         return REG_ESPACE;
4003       mctx->state_log = new_array;
4004     }
4005
4006   /* Then reconstruct the buffers.  */
4007   if (pstr->icase)
4008     {
4009 #ifdef RE_ENABLE_I18N
4010       if (pstr->mb_cur_max > 1)
4011         {
4012           ret = build_wcs_upper_buffer (pstr);
4013           if (BE (ret != REG_NOERROR, 0))
4014             return ret;
4015         }
4016       else
4017 #endif /* RE_ENABLE_I18N  */
4018         build_upper_buffer (pstr);
4019     }
4020   else
4021     {
4022 #ifdef RE_ENABLE_I18N
4023       if (pstr->mb_cur_max > 1)
4024         build_wcs_buffer (pstr);
4025       else
4026 #endif /* RE_ENABLE_I18N  */
4027         {
4028           if (pstr->trans != NULL)
4029             re_string_translate_buffer (pstr);
4030         }
4031     }
4032   return REG_NOERROR;
4033 }
4034
4035 \f
4036 /* Functions for matching context.  */
4037
4038 /* Initialize MCTX.  */
4039
4040 static reg_errcode_t
4041 internal_function
4042 match_ctx_init (re_match_context_t *mctx, int eflags, int n)
4043 {
4044   mctx->eflags = eflags;
4045   mctx->match_last = -1;
4046   if (n > 0)
4047     {
4048       mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4049       mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4050       if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
4051         return REG_ESPACE;
4052     }
4053   /* Already zero-ed by the caller.
4054      else
4055        mctx->bkref_ents = NULL;
4056      mctx->nbkref_ents = 0;
4057      mctx->nsub_tops = 0;  */
4058   mctx->abkref_ents = n;
4059   mctx->max_mb_elem_len = 1;
4060   mctx->asub_tops = n;
4061   return REG_NOERROR;
4062 }
4063
4064 /* Clean the entries which depend on the current input in MCTX.
4065    This function must be invoked when the matcher changes the start index
4066    of the input, or changes the input string.  */
4067
4068 static void
4069 internal_function
4070 match_ctx_clean (re_match_context_t *mctx)
4071 {
4072   int st_idx;
4073   for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4074     {
4075       int sl_idx;
4076       re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4077       for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4078         {
4079           re_sub_match_last_t *last = top->lasts[sl_idx];
4080           re_free (last->path.array);
4081           re_free (last);
4082         }
4083       re_free (top->lasts);
4084       if (top->path)
4085         {
4086           re_free (top->path->array);
4087           re_free (top->path);
4088         }
4089       free (top);
4090     }
4091
4092   mctx->nsub_tops = 0;
4093   mctx->nbkref_ents = 0;
4094 }
4095
4096 /* Free all the memory associated with MCTX.  */
4097
4098 static void
4099 internal_function
4100 match_ctx_free (re_match_context_t *mctx)
4101 {
4102   /* First, free all the memory associated with MCTX->SUB_TOPS.  */
4103   match_ctx_clean (mctx);
4104   re_free (mctx->sub_tops);
4105   re_free (mctx->bkref_ents);
4106 }
4107
4108 /* Add a new backreference entry to MCTX.
4109    Note that we assume that caller never call this function with duplicate
4110    entry, and call with STR_IDX which isn't smaller than any existing entry.
4111 */
4112
4113 static reg_errcode_t
4114 internal_function
4115 match_ctx_add_entry (re_match_context_t *mctx, int node, int str_idx,
4116                      int from, int to)
4117 {
4118   if (mctx->nbkref_ents >= mctx->abkref_ents)
4119     {
4120       struct re_backref_cache_entry* new_entry;
4121       new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4122                               mctx->abkref_ents * 2);
4123       if (BE (new_entry == NULL, 0))
4124         {
4125           re_free (mctx->bkref_ents);
4126           return REG_ESPACE;
4127         }
4128       mctx->bkref_ents = new_entry;
4129       memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4130               sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4131       mctx->abkref_ents *= 2;
4132     }
4133   if (mctx->nbkref_ents > 0
4134       && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4135     mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4136
4137   mctx->bkref_ents[mctx->nbkref_ents].node = node;
4138   mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4139   mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4140   mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4141
4142   /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4143      If bit N is clear, means that this entry won't epsilon-transition to
4144      an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression.  If
4145      it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4146      such node.
4147
4148      A backreference does not epsilon-transition unless it is empty, so set
4149      to all zeros if FROM != TO.  */
4150   mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4151     = (from == to ? ~0 : 0);
4152
4153   mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4154   if (mctx->max_mb_elem_len < to - from)
4155     mctx->max_mb_elem_len = to - from;
4156   return REG_NOERROR;
4157 }
4158
4159 /* Search for the first entry which has the same str_idx, or -1 if none is
4160    found.  Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX.  */
4161
4162 static int
4163 internal_function
4164 search_cur_bkref_entry (re_match_context_t *mctx, int str_idx)
4165 {
4166   int left, right, mid, last;
4167   last = right = mctx->nbkref_ents;
4168   for (left = 0; left < right;)
4169     {
4170       mid = (left + right) / 2;
4171       if (mctx->bkref_ents[mid].str_idx < str_idx)
4172         left = mid + 1;
4173       else
4174         right = mid;
4175     }
4176   if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4177     return left;
4178   else
4179     return -1;
4180 }
4181
4182 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4183    at STR_IDX.  */
4184
4185 static reg_errcode_t
4186 internal_function
4187 match_ctx_add_subtop (re_match_context_t *mctx, int node, int str_idx)
4188 {
4189 #ifdef DEBUG
4190   assert (mctx->sub_tops != NULL);
4191   assert (mctx->asub_tops > 0);
4192 #endif
4193   if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
4194     {
4195       int new_asub_tops = mctx->asub_tops * 2;
4196       re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4197                                                    re_sub_match_top_t *,
4198                                                    new_asub_tops);
4199       if (BE (new_array == NULL, 0))
4200         return REG_ESPACE;
4201       mctx->sub_tops = new_array;
4202       mctx->asub_tops = new_asub_tops;
4203     }
4204   mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
4205   if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0))
4206     return REG_ESPACE;
4207   mctx->sub_tops[mctx->nsub_tops]->node = node;
4208   mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4209   return REG_NOERROR;
4210 }
4211
4212 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4213    at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP.  */
4214
4215 static re_sub_match_last_t *
4216 internal_function
4217 match_ctx_add_sublast (re_sub_match_top_t *subtop, int node, int str_idx)
4218 {
4219   re_sub_match_last_t *new_entry;
4220   if (BE (subtop->nlasts == subtop->alasts, 0))
4221     {
4222       int new_alasts = 2 * subtop->alasts + 1;
4223       re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4224                                                     re_sub_match_last_t *,
4225                                                     new_alasts);
4226       if (BE (new_array == NULL, 0))
4227         return NULL;
4228       subtop->lasts = new_array;
4229       subtop->alasts = new_alasts;
4230     }
4231   new_entry = calloc (1, sizeof (re_sub_match_last_t));
4232   if (BE (new_entry != NULL, 1))
4233     {
4234       subtop->lasts[subtop->nlasts] = new_entry;
4235       new_entry->node = node;
4236       new_entry->str_idx = str_idx;
4237       ++subtop->nlasts;
4238     }
4239   return new_entry;
4240 }
4241
4242 static void
4243 internal_function
4244 sift_ctx_init (re_sift_context_t *sctx,
4245                re_dfastate_t **sifted_sts,
4246                re_dfastate_t **limited_sts,
4247                int last_node, int last_str_idx)
4248 {
4249   sctx->sifted_states = sifted_sts;
4250   sctx->limited_states = limited_sts;
4251   sctx->last_node = last_node;
4252   sctx->last_str_idx = last_str_idx;
4253   re_node_set_init_empty (&sctx->limits);
4254 }