1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License along
17 with this program; if not, write to the Free Software Foundation,
18 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
20 static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
21 Idx n) internal_function;
22 static void match_ctx_clean (re_match_context_t *mctx) internal_function;
23 static void match_ctx_free (re_match_context_t *cache) internal_function;
24 static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, Idx node,
25 Idx str_idx, Idx from, Idx to)
27 static Idx search_cur_bkref_entry (const re_match_context_t *mctx, Idx str_idx)
29 static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, Idx node,
30 Idx str_idx) internal_function;
31 static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
32 Idx node, Idx str_idx)
34 static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
35 re_dfastate_t **limited_sts, Idx last_node,
38 static reg_errcode_t re_search_internal (const regex_t *preg,
39 const char *string, Idx length,
40 Idx start, Idx last_start, Idx stop,
41 size_t nmatch, regmatch_t pmatch[],
42 int eflags) internal_function;
43 static regoff_t re_search_2_stub (struct re_pattern_buffer *bufp,
44 const char *string1, Idx length1,
45 const char *string2, Idx length2,
46 Idx start, regoff_t range,
47 struct re_registers *regs,
48 Idx stop, bool ret_len) internal_function;
49 static regoff_t re_search_stub (struct re_pattern_buffer *bufp,
50 const char *string, Idx length, Idx start,
51 regoff_t range, Idx stop,
52 struct re_registers *regs,
53 bool ret_len) internal_function;
54 static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
55 Idx nregs, int regs_allocated) internal_function;
56 static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx)
58 static Idx check_matching (re_match_context_t *mctx, bool fl_longest_match,
61 static Idx check_halt_state_context (const re_match_context_t *mctx,
62 const re_dfastate_t *state, Idx idx)
64 static void update_regs (re_dfa_t *dfa, regmatch_t *pmatch,
65 regmatch_t *prev_idx_match, Idx cur_node,
66 Idx cur_idx, Idx nmatch) internal_function;
67 static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
68 Idx str_idx, Idx dest_node, Idx nregs,
70 re_node_set *eps_via_nodes) internal_function;
71 static reg_errcode_t set_regs (const regex_t *preg,
72 const re_match_context_t *mctx,
73 size_t nmatch, regmatch_t *pmatch,
74 bool fl_backtrack) internal_function;
75 static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs) internal_function;
78 static int sift_states_iter_mb (const re_match_context_t *mctx,
79 re_sift_context_t *sctx,
80 Idx node_idx, Idx str_idx, Idx max_str_idx) internal_function;
81 #endif /* RE_ENABLE_I18N */
82 static reg_errcode_t sift_states_backward (re_match_context_t *mctx,
83 re_sift_context_t *sctx) internal_function;
84 static reg_errcode_t build_sifted_states (re_match_context_t *mctx,
85 re_sift_context_t *sctx, Idx str_idx,
86 re_node_set *cur_dest) internal_function;
87 static reg_errcode_t update_cur_sifted_state (re_match_context_t *mctx,
88 re_sift_context_t *sctx,
90 re_node_set *dest_nodes) internal_function;
91 static reg_errcode_t add_epsilon_src_nodes (re_dfa_t *dfa,
92 re_node_set *dest_nodes,
93 const re_node_set *candidates) internal_function;
94 static bool check_dst_limits (const re_match_context_t *mctx,
95 const re_node_set *limits,
96 Idx dst_node, Idx dst_idx, Idx src_node,
97 Idx src_idx) internal_function;
98 static int check_dst_limits_calc_pos_1 (const re_match_context_t *mctx,
99 int boundaries, Idx subexp_idx,
100 Idx from_node, Idx bkref_idx) internal_function;
101 static int check_dst_limits_calc_pos (const re_match_context_t *mctx,
102 Idx limit, Idx subexp_idx,
103 Idx node, Idx str_idx,
104 Idx bkref_idx) internal_function;
105 static reg_errcode_t check_subexp_limits (re_dfa_t *dfa,
106 re_node_set *dest_nodes,
107 const re_node_set *candidates,
109 struct re_backref_cache_entry *bkref_ents,
110 Idx str_idx) internal_function;
111 static reg_errcode_t sift_states_bkref (re_match_context_t *mctx,
112 re_sift_context_t *sctx,
113 Idx str_idx, const re_node_set *candidates) internal_function;
114 static reg_errcode_t merge_state_array (re_dfa_t *dfa, re_dfastate_t **dst,
115 re_dfastate_t **src, Idx num) internal_function;
116 static re_dfastate_t *find_recover_state (reg_errcode_t *err,
117 re_match_context_t *mctx) internal_function;
118 static re_dfastate_t *transit_state (reg_errcode_t *err,
119 re_match_context_t *mctx,
120 re_dfastate_t *state) internal_function;
121 static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
122 re_match_context_t *mctx,
123 re_dfastate_t *next_state) internal_function;
124 static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
125 re_node_set *cur_nodes,
126 Idx str_idx) internal_function;
128 static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
129 re_match_context_t *mctx,
130 re_dfastate_t *pstate) internal_function;
132 #ifdef RE_ENABLE_I18N
133 static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
134 re_dfastate_t *pstate) internal_function;
135 #endif /* RE_ENABLE_I18N */
136 static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
137 const re_node_set *nodes) internal_function;
138 static reg_errcode_t get_subexp (re_match_context_t *mctx,
139 Idx bkref_node, Idx bkref_str_idx) internal_function;
140 static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
141 const re_sub_match_top_t *sub_top,
142 re_sub_match_last_t *sub_last,
143 Idx bkref_node, Idx bkref_str) internal_function;
144 static Idx find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
145 Idx subexp_idx, int type) internal_function;
146 static reg_errcode_t check_arrival (re_match_context_t *mctx,
147 state_array_t *path, Idx top_node,
148 Idx top_str, Idx last_node, Idx last_str,
149 int type) internal_function;
150 static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
152 re_node_set *cur_nodes,
153 re_node_set *next_nodes) internal_function;
154 static reg_errcode_t check_arrival_expand_ecl (re_dfa_t *dfa,
155 re_node_set *cur_nodes,
156 Idx ex_subexp, int type) internal_function;
157 static reg_errcode_t check_arrival_expand_ecl_sub (re_dfa_t *dfa,
158 re_node_set *dst_nodes,
159 Idx target, Idx ex_subexp,
160 int type) internal_function;
161 static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
162 re_node_set *cur_nodes, Idx cur_str,
163 Idx subexp_num, int type) internal_function;
164 static bool build_trtable (re_dfa_t *dfa,
165 re_dfastate_t *state) internal_function;
166 #ifdef RE_ENABLE_I18N
167 static int check_node_accept_bytes (re_dfa_t *dfa, Idx node_idx,
168 const re_string_t *input, Idx idx) internal_function;
170 static unsigned int find_collation_sequence_value (const unsigned char *mbs,
171 size_t name_len) internal_function;
173 #endif /* RE_ENABLE_I18N */
174 static Idx group_nodes_into_DFAstates (const re_dfa_t *dfa,
175 const re_dfastate_t *state,
176 re_node_set *states_node,
177 bitset *states_ch) internal_function;
178 static bool check_node_accept (const re_match_context_t *mctx,
179 const re_token_t *node, Idx idx)
181 static reg_errcode_t extend_buffers (re_match_context_t *mctx) internal_function;
183 /* Entry point for POSIX code. */
185 /* regexec searches for a given pattern, specified by PREG, in the
188 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
189 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
190 least NMATCH elements, and we set them to the offsets of the
191 corresponding matched substrings.
193 EFLAGS specifies `execution flags' which affect matching: if
194 REG_NOTBOL is set, then ^ does not match at the beginning of the
195 string; if REG_NOTEOL is set, then $ does not match at the end.
197 We return 0 if we find a match and REG_NOMATCH if not. */
200 regexec (const regex_t *__restrict preg, const char *__restrict string,
201 size_t nmatch, regmatch_t pmatch[], int eflags)
206 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
209 if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
212 if (eflags & REG_STARTEND)
214 start = pmatch[0].rm_so;
215 length = pmatch[0].rm_eo;
220 length = strlen (string);
223 __libc_lock_lock (dfa->lock);
225 err = re_search_internal (preg, string, length, start, length,
226 length, 0, NULL, eflags);
228 err = re_search_internal (preg, string, length, start, length,
229 length, nmatch, pmatch, eflags);
230 __libc_lock_unlock (dfa->lock);
231 return err != REG_NOERROR;
235 # include <shlib-compat.h>
236 versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
238 # if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
239 __typeof__ (__regexec) __compat_regexec;
242 attribute_compat_text_section
243 __compat_regexec (const regex_t *__restrict preg,
244 const char *__restrict string, size_t nmatch,
245 regmatch_t pmatch[], int eflags)
247 return regexec (preg, string, nmatch, pmatch,
248 eflags & (REG_NOTBOL | REG_NOTEOL));
250 compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
254 /* Entry points for GNU code. */
256 /* re_match, re_search, re_match_2, re_search_2
258 The former two functions operate on STRING with length LENGTH,
259 while the later two operate on concatenation of STRING1 and STRING2
260 with lengths LENGTH1 and LENGTH2, respectively.
262 re_match() matches the compiled pattern in BUFP against the string,
263 starting at index START.
265 re_search() first tries matching at index START, then it tries to match
266 starting from index START + 1, and so on. The last start position tried
267 is START + RANGE. (Thus RANGE = 0 forces re_search to operate the same
270 The parameter STOP of re_{match,search}_2 specifies that no match exceeding
271 the first STOP characters of the concatenation of the strings should be
274 If REGS is not NULL, and BUFP->re_no_sub is not set, the offsets of the match
275 and all groups is stroed in REGS. (For the "_2" variants, the offsets are
276 computed relative to the concatenation, not relative to the individual
279 On success, re_match* functions return the length of the match, re_search*
280 return the position of the start of the match. Return value -1 means no
281 match was found and -2 indicates an internal error. */
284 re_match (struct re_pattern_buffer *bufp, const char *string,
285 Idx length, Idx start, struct re_registers *regs)
287 return re_search_stub (bufp, string, length, start, 0, length, regs, true);
290 weak_alias (__re_match, re_match)
294 re_search (struct re_pattern_buffer *bufp, const char *string,
295 Idx length, Idx start, regoff_t range, struct re_registers *regs)
297 return re_search_stub (bufp, string, length, start, range, length, regs,
301 weak_alias (__re_search, re_search)
305 re_match_2 (struct re_pattern_buffer *bufp,
306 const char *string1, Idx length1,
307 const char *string2, Idx length2,
308 Idx start, struct re_registers *regs, Idx stop)
310 return re_search_2_stub (bufp, string1, length1, string2, length2,
311 start, 0, regs, stop, true);
314 weak_alias (__re_match_2, re_match_2)
318 re_search_2 (struct re_pattern_buffer *bufp,
319 const char *string1, Idx length1,
320 const char *string2, Idx length2,
321 Idx start, regoff_t range, struct re_registers *regs, Idx stop)
323 return re_search_2_stub (bufp, string1, length1, string2, length2,
324 start, range, regs, stop, false);
327 weak_alias (__re_search_2, re_search_2)
332 re_search_2_stub (struct re_pattern_buffer *bufp,
333 const char *string1, Idx length1,
334 const char *string2, Idx length2,
335 Idx start, regoff_t range, struct re_registers *regs,
336 Idx stop, bool ret_len)
340 Idx len = length1 + length2;
343 if (BE (length1 < 0 || length2 < 0 || stop < 0 || len < length1, 0))
346 /* Concatenate the strings. */
350 s = re_malloc (char, len);
352 if (BE (s == NULL, 0))
354 memcpy (s, string1, length1);
355 memcpy (s + length1, string2, length2);
363 rval = re_search_stub (bufp, str, len, start, range, stop, regs,
369 /* The parameters have the same meaning as those of re_search.
370 Additional parameters:
371 If RET_LEN is true the length of the match is returned (re_match style);
372 otherwise the position of the match is returned. */
376 re_search_stub (struct re_pattern_buffer *bufp,
377 const char *string, Idx length,
378 Idx start, regoff_t range, Idx stop, struct re_registers *regs,
381 reg_errcode_t result;
387 re_dfa_t *dfa = (re_dfa_t *) bufp->re_buffer;
389 Idx last_start = start + range;
391 /* Check for out-of-range. */
392 if (BE (start < 0 || start > length, 0))
394 if (sizeof start < sizeof range)
396 regoff_t length_offset = length;
397 regoff_t start_offset = start;
398 if (BE (length_offset - start_offset < range, 0))
400 else if (BE (range < - start_offset, 0))
405 if (BE ((last_start < start) != (range < 0), 0))
407 /* Overflow occurred when computing last_start; substitute
408 the extreme value. */
409 last_start = range < 0 ? 0 : length;
413 if (BE (length < last_start, 0))
415 else if (BE (last_start < 0, 0))
420 __libc_lock_lock (dfa->lock);
422 eflags |= (bufp->re_not_bol) ? REG_NOTBOL : 0;
423 eflags |= (bufp->re_not_eol) ? REG_NOTEOL : 0;
425 /* Compile fastmap if we haven't yet. */
426 if (start < last_start && bufp->re_fastmap != NULL
427 && !bufp->re_fastmap_accurate)
428 re_compile_fastmap (bufp);
430 if (BE (bufp->re_no_sub, 0))
433 /* We need at least 1 register. */
436 else if (BE (bufp->re_regs_allocated == REG_FIXED
437 && regs->rm_num_regs <= bufp->re_nsub, 0))
439 nregs = regs->rm_num_regs;
440 if (BE (nregs < 1, 0))
442 /* Nothing can be copied to regs. */
448 nregs = bufp->re_nsub + 1;
449 pmatch = re_xmalloc (regmatch_t, nregs);
450 if (BE (pmatch == NULL, 0))
456 result = re_search_internal (bufp, string, length, start, last_start, stop,
457 nregs, pmatch, eflags);
461 /* I hope we needn't fill ther regs with -1's when no match was found. */
462 if (result != REG_NOERROR)
464 else if (regs != NULL)
466 /* If caller wants register contents data back, copy them. */
467 bufp->re_regs_allocated = re_copy_regs (regs, pmatch, nregs,
468 bufp->re_regs_allocated);
469 if (BE (bufp->re_regs_allocated == REG_UNALLOCATED, 0))
473 if (BE (rval == 0, 1))
477 assert (pmatch[0].rm_so == start);
478 rval = pmatch[0].rm_eo - start;
481 rval = pmatch[0].rm_so;
485 __libc_lock_unlock (dfa->lock);
491 re_copy_regs (struct re_registers *regs, regmatch_t *pmatch, Idx nregs,
494 int rval = REG_REALLOCATE;
496 Idx need_regs = nregs + 1;
497 /* We need one extra element beyond `rm_num_regs' for the `-1' marker GNU code
500 /* Have the register data arrays been allocated? */
501 if (regs_allocated == REG_UNALLOCATED)
502 { /* No. So allocate them with malloc. */
503 regs->rm_start = re_xmalloc (regoff_t, need_regs);
504 regs->rm_end = re_malloc (regoff_t, need_regs);
505 if (BE (regs->rm_start == NULL, 0) || BE (regs->rm_end == NULL, 0))
506 return REG_UNALLOCATED;
507 regs->rm_num_regs = need_regs;
509 else if (regs_allocated == REG_REALLOCATE)
510 { /* Yes. If we need more elements than were already
511 allocated, reallocate them. If we need fewer, just
513 if (BE (need_regs > regs->rm_num_regs, 0))
515 regoff_t *new_start =
516 re_xrealloc (regs->rm_start, regoff_t, need_regs);
517 regoff_t *new_end = re_realloc (regs->rm_end, regoff_t, need_regs);
518 if (BE (new_start == NULL, 0) || BE (new_end == NULL, 0))
519 return REG_UNALLOCATED;
520 regs->rm_start = new_start;
521 regs->rm_end = new_end;
522 regs->rm_num_regs = need_regs;
527 assert (regs_allocated == REG_FIXED);
528 /* This function may not be called with REG_FIXED and nregs too big. */
529 assert (regs->rm_num_regs >= nregs);
534 for (i = 0; i < nregs; ++i)
536 regs->rm_start[i] = pmatch[i].rm_so;
537 regs->rm_end[i] = pmatch[i].rm_eo;
539 for ( ; i < regs->rm_num_regs; ++i)
540 regs->rm_start[i] = regs->rm_end[i] = -1;
545 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
546 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
547 this memory for recording register information. STARTS and ENDS
548 must be allocated using the malloc library routine, and must each
549 be at least NUM_REGS * sizeof (regoff_t) bytes long.
551 If NUM_REGS == 0, then subsequent matches should allocate their own
554 Unless this function is called, the first search or match using
555 PATTERN_BUFFER will allocate its own register data, without
556 freeing the old data. */
559 re_set_registers (struct re_pattern_buffer *bufp, struct re_registers *regs,
560 __re_size_t num_regs, regoff_t *starts, regoff_t *ends)
564 bufp->re_regs_allocated = REG_REALLOCATE;
565 regs->rm_num_regs = num_regs;
566 regs->rm_start = starts;
571 bufp->re_regs_allocated = REG_UNALLOCATED;
572 regs->rm_num_regs = 0;
573 regs->rm_start = regs->rm_end = NULL;
577 weak_alias (__re_set_registers, re_set_registers)
580 /* Entry points compatible with 4.2 BSD regex library. We don't define
581 them unless specifically requested. */
583 #if defined _REGEX_RE_COMP || defined _LIBC
588 re_exec (const char *s)
590 return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
592 #endif /* _REGEX_RE_COMP */
594 /* Internal entry point. */
596 /* Searches for a compiled pattern PREG in the string STRING, whose
597 length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same
598 meaning as with regexec. LAST_START is START + RANGE, where
599 START and RANGE have the same meaning as with re_search.
600 Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
601 otherwise return the error code.
602 Note: We assume front end functions already check ranges.
603 (0 <= LAST_START && LAST_START <= LENGTH) */
607 re_search_internal (const regex_t *preg,
608 const char *string, Idx length,
609 Idx start, Idx last_start, Idx stop,
610 size_t nmatch, regmatch_t pmatch[],
614 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
615 Idx left_lim, right_lim;
617 bool fl_longest_match;
619 Idx match_first, match_last = REG_MISSING;
623 #if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
624 re_match_context_t mctx = { .dfa = dfa };
626 re_match_context_t mctx;
628 char *fastmap = ((preg->re_fastmap != NULL && preg->re_fastmap_accurate
629 && start != last_start && !preg->re_can_be_null)
630 ? preg->re_fastmap : NULL);
631 unsigned REG_TRANSLATE_TYPE t =
632 (unsigned REG_TRANSLATE_TYPE) preg->re_translate;
634 #if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
635 memset (&mctx, '\0', sizeof (re_match_context_t));
639 extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
640 nmatch -= extra_nmatch;
642 /* Check if the DFA haven't been compiled. */
643 if (BE (preg->re_used == 0 || dfa->init_state == NULL
644 || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
645 || dfa->init_state_begbuf == NULL, 0))
649 /* We assume front-end functions already check them. */
650 assert (0 <= last_start && last_start <= length);
653 /* If initial states with non-begbuf contexts have no elements,
654 the regex must be anchored. If preg->re_newline_anchor is set,
655 we'll never use init_state_nl, so do not check it. */
656 if (dfa->init_state->nodes.nelem == 0
657 && dfa->init_state_word->nodes.nelem == 0
658 && (dfa->init_state_nl->nodes.nelem == 0
659 || !preg->re_newline_anchor))
661 if (start != 0 && last_start != 0)
663 start = last_start = 0;
666 /* We must check the longest matching, if nmatch > 0. */
667 fl_longest_match = (nmatch != 0 || dfa->nbackref);
669 err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
671 preg->re_syntax & REG_IGNORE_CASE, dfa);
672 if (BE (err != REG_NOERROR, 0))
674 mctx.input.stop = stop;
675 mctx.input.raw_stop = stop;
676 mctx.input.newline_anchor = preg->re_newline_anchor;
678 err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
679 if (BE (err != REG_NOERROR, 0))
682 /* We will log all the DFA states through which the dfa pass,
683 if nmatch > 1, or this dfa has "multibyte node", which is a
684 back-reference or a node which can accept multibyte character or
685 multi character collating element. */
686 if (nmatch > 1 || dfa->has_mb_node)
688 mctx.state_log = re_xmalloc (re_dfastate_t *, mctx.input.bufs_len + 1);
689 if (BE (mctx.state_log == NULL, 0))
696 mctx.state_log = NULL;
699 mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
700 : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
702 /* Check incrementally whether of not the input string match. */
703 incr = (last_start < start) ? -1 : 1;
704 left_lim = (last_start < start) ? last_start : start;
705 right_lim = (last_start < start) ? start : last_start;
706 sb = dfa->mb_cur_max == 1;
709 ? ((sb || !(preg->re_syntax & REG_IGNORE_CASE || t) ? 4 : 0)
710 | (start <= last_start ? 2 : 0)
711 | (t != NULL ? 1 : 0))
714 for (;; match_first += incr)
717 if (match_first < left_lim || right_lim < match_first)
720 /* Advance as rapidly as possible through the string, until we
721 find a plausible place to start matching. This may be done
722 with varying efficiency, so there are various possibilities:
723 only the most common of them are specialized, in order to
724 save on code size. We use a switch statement for speed. */
732 /* Fastmap with single-byte translation, match forward. */
733 while (BE (match_first < right_lim, 1)
734 && !fastmap[t[(unsigned char) string[match_first]]])
736 goto forward_match_found_start_or_reached_end;
739 /* Fastmap without translation, match forward. */
740 while (BE (match_first < right_lim, 1)
741 && !fastmap[(unsigned char) string[match_first]])
744 forward_match_found_start_or_reached_end:
745 if (BE (match_first == right_lim, 0))
747 ch = match_first >= length
748 ? 0 : (unsigned char) string[match_first];
749 if (!fastmap[t ? t[ch] : ch])
756 /* Fastmap without multi-byte translation, match backwards. */
757 while (match_first >= left_lim)
759 ch = match_first >= length
760 ? 0 : (unsigned char) string[match_first];
761 if (fastmap[t ? t[ch] : ch])
765 if (match_first < left_lim)
770 /* In this case, we can't determine easily the current byte,
771 since it might be a component byte of a multibyte
772 character. Then we use the constructed buffer instead. */
775 /* If MATCH_FIRST is out of the valid range, reconstruct the
777 __re_size_t offset = match_first - mctx.input.raw_mbs_idx;
778 if (BE (offset >= (__re_size_t) mctx.input.valid_raw_len, 0))
780 err = re_string_reconstruct (&mctx.input, match_first,
782 if (BE (err != REG_NOERROR, 0))
785 offset = match_first - mctx.input.raw_mbs_idx;
787 /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
788 Note that MATCH_FIRST must not be smaller than 0. */
789 ch = (match_first >= length
790 ? 0 : re_string_byte_at (&mctx.input, offset));
794 if (match_first < left_lim || match_first > right_lim)
803 /* Reconstruct the buffers so that the matcher can assume that
804 the matching starts from the beginning of the buffer. */
805 err = re_string_reconstruct (&mctx.input, match_first, eflags);
806 if (BE (err != REG_NOERROR, 0))
809 #ifdef RE_ENABLE_I18N
810 /* Don't consider this char as a possible match start if it part,
811 yet isn't the head, of a multibyte character. */
812 if (!sb && !re_string_first_byte (&mctx.input, 0))
816 /* It seems to be appropriate one, then use the matcher. */
817 /* We assume that the matching starts from 0. */
818 mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
819 match_last = check_matching (&mctx, fl_longest_match,
820 start <= last_start ? &match_first : NULL);
821 if (match_last != REG_MISSING)
823 if (BE (match_last == REG_ERROR, 0))
830 mctx.match_last = match_last;
831 if ((!preg->re_no_sub && nmatch > 1) || dfa->nbackref)
833 re_dfastate_t *pstate = mctx.state_log[match_last];
834 mctx.last_node = check_halt_state_context (&mctx, pstate,
837 if ((!preg->re_no_sub && nmatch > 1 && dfa->has_plural_match)
840 err = prune_impossible_nodes (&mctx);
841 if (err == REG_NOERROR)
843 if (BE (err != REG_NOMATCH, 0))
845 match_last = REG_MISSING;
848 break; /* We found a match. */
852 match_ctx_clean (&mctx);
856 assert (match_last != REG_MISSING);
857 assert (err == REG_NOERROR);
860 /* Set pmatch[] if we need. */
865 /* Initialize registers. */
866 for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
867 pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
869 /* Set the points where matching start/end. */
871 pmatch[0].rm_eo = mctx.match_last;
872 /* FIXME: This function should fail if mctx.match_last exceeds
873 the maximum possible regoff_t value. We need a new error
874 code REG_OVERFLOW. */
876 if (!preg->re_no_sub && nmatch > 1)
878 err = set_regs (preg, &mctx, nmatch, pmatch,
879 dfa->has_plural_match && dfa->nbackref > 0);
880 if (BE (err != REG_NOERROR, 0))
884 /* At last, add the offset to the each registers, since we slided
885 the buffers so that we could assume that the matching starts
887 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
888 if (pmatch[reg_idx].rm_so != -1)
890 #ifdef RE_ENABLE_I18N
891 if (BE (mctx.input.offsets_needed != 0, 0))
893 pmatch[reg_idx].rm_so =
894 (pmatch[reg_idx].rm_so == mctx.input.valid_len
895 ? mctx.input.valid_raw_len
896 : mctx.input.offsets[pmatch[reg_idx].rm_so]);
897 pmatch[reg_idx].rm_eo =
898 (pmatch[reg_idx].rm_eo == mctx.input.valid_len
899 ? mctx.input.valid_raw_len
900 : mctx.input.offsets[pmatch[reg_idx].rm_eo]);
903 assert (mctx.input.offsets_needed == 0);
905 pmatch[reg_idx].rm_so += match_first;
906 pmatch[reg_idx].rm_eo += match_first;
908 for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
910 pmatch[nmatch + reg_idx].rm_so = -1;
911 pmatch[nmatch + reg_idx].rm_eo = -1;
915 for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
916 if (dfa->subexp_map[reg_idx] != reg_idx)
918 pmatch[reg_idx + 1].rm_so
919 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
920 pmatch[reg_idx + 1].rm_eo
921 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
926 re_free (mctx.state_log);
928 match_ctx_free (&mctx);
929 re_string_destruct (&mctx.input);
935 prune_impossible_nodes (re_match_context_t *mctx)
937 re_dfa_t *const dfa = mctx->dfa;
938 Idx halt_node, match_last;
940 re_dfastate_t **sifted_states;
941 re_dfastate_t **lim_states = NULL;
942 re_sift_context_t sctx;
944 assert (mctx->state_log != NULL);
946 match_last = mctx->match_last;
947 halt_node = mctx->last_node;
948 sifted_states = re_xmalloc (re_dfastate_t *, match_last + 1);
949 if (BE (sifted_states == NULL, 0))
956 lim_states = re_xmalloc (re_dfastate_t *, match_last + 1);
957 if (BE (lim_states == NULL, 0))
964 memset (lim_states, '\0',
965 sizeof (re_dfastate_t *) * (match_last + 1));
966 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
968 ret = sift_states_backward (mctx, &sctx);
969 re_node_set_free (&sctx.limits);
970 if (BE (ret != REG_NOERROR, 0))
972 if (sifted_states[0] != NULL || lim_states[0] != NULL)
977 if (! REG_VALID_INDEX (match_last))
982 } while (mctx->state_log[match_last] == NULL
983 || !mctx->state_log[match_last]->halt);
984 halt_node = check_halt_state_context (mctx,
985 mctx->state_log[match_last],
988 ret = merge_state_array (dfa, sifted_states, lim_states,
990 re_free (lim_states);
992 if (BE (ret != REG_NOERROR, 0))
997 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
998 ret = sift_states_backward (mctx, &sctx);
999 re_node_set_free (&sctx.limits);
1000 if (BE (ret != REG_NOERROR, 0))
1003 re_free (mctx->state_log);
1004 mctx->state_log = sifted_states;
1005 sifted_states = NULL;
1006 mctx->last_node = halt_node;
1007 mctx->match_last = match_last;
1010 re_free (sifted_states);
1011 re_free (lim_states);
1015 /* Acquire an initial state and return it.
1016 We must select appropriate initial state depending on the context,
1017 since initial states may have constraints like "\<", "^", etc.. */
1019 static inline re_dfastate_t *
1020 __attribute ((always_inline)) internal_function
1021 acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx,
1024 re_dfa_t *const dfa = mctx->dfa;
1025 if (dfa->init_state->has_constraint)
1027 unsigned int context;
1028 context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
1029 if (IS_WORD_CONTEXT (context))
1030 return dfa->init_state_word;
1031 else if (IS_ORDINARY_CONTEXT (context))
1032 return dfa->init_state;
1033 else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
1034 return dfa->init_state_begbuf;
1035 else if (IS_NEWLINE_CONTEXT (context))
1036 return dfa->init_state_nl;
1037 else if (IS_BEGBUF_CONTEXT (context))
1039 /* It is relatively rare case, then calculate on demand. */
1040 return re_acquire_state_context (err, dfa,
1041 dfa->init_state->entrance_nodes,
1045 /* Must not happen? */
1046 return dfa->init_state;
1049 return dfa->init_state;
1052 /* Check whether the regular expression match input string INPUT or not,
1053 and return the index where the matching end. Return REG_MISSING if
1054 there is no match, and return REG_ERROR in case of an error.
1055 FL_LONGEST_MATCH means we want the POSIX longest matching.
1056 If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1057 next place where we may want to try matching.
1058 Note that the matcher assume that the maching starts from the current
1059 index of the buffer. */
1063 check_matching (re_match_context_t *mctx, bool fl_longest_match,
1066 re_dfa_t *const dfa = mctx->dfa;
1069 Idx match_last = REG_MISSING;
1070 Idx cur_str_idx = re_string_cur_idx (&mctx->input);
1071 re_dfastate_t *cur_state;
1072 bool at_init_state = p_match_first != NULL;
1073 Idx next_start_idx = cur_str_idx;
1076 cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1077 /* An initial state must not be NULL (invalid). */
1078 if (BE (cur_state == NULL, 0))
1080 assert (err == REG_ESPACE);
1084 if (mctx->state_log != NULL)
1086 mctx->state_log[cur_str_idx] = cur_state;
1088 /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1089 later. E.g. Processing back references. */
1090 if (BE (dfa->nbackref, 0))
1092 at_init_state = false;
1093 err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1094 if (BE (err != REG_NOERROR, 0))
1097 if (cur_state->has_backref)
1099 err = transit_state_bkref (mctx, &cur_state->nodes);
1100 if (BE (err != REG_NOERROR, 0))
1106 /* If the RE accepts NULL string. */
1107 if (BE (cur_state->halt, 0))
1109 if (!cur_state->has_constraint
1110 || check_halt_state_context (mctx, cur_state, cur_str_idx))
1112 if (!fl_longest_match)
1116 match_last = cur_str_idx;
1122 while (!re_string_eoi (&mctx->input))
1124 re_dfastate_t *old_state = cur_state;
1125 Idx next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1127 if (BE (next_char_idx >= mctx->input.bufs_len, 0)
1128 || (BE (next_char_idx >= mctx->input.valid_len, 0)
1129 && mctx->input.valid_len < mctx->input.len))
1131 err = extend_buffers (mctx);
1132 if (BE (err != REG_NOERROR, 0))
1134 assert (err == REG_ESPACE);
1139 cur_state = transit_state (&err, mctx, cur_state);
1140 if (mctx->state_log != NULL)
1141 cur_state = merge_state_with_log (&err, mctx, cur_state);
1143 if (cur_state == NULL)
1145 /* Reached the invalid state or an error. Try to recover a valid
1146 state using the state log, if available and if we have not
1147 already found a valid (even if not the longest) match. */
1148 if (BE (err != REG_NOERROR, 0))
1151 if (mctx->state_log == NULL
1152 || (match && !fl_longest_match)
1153 || (cur_state = find_recover_state (&err, mctx)) == NULL)
1157 if (BE (at_init_state, 0))
1159 if (old_state == cur_state)
1160 next_start_idx = next_char_idx;
1162 at_init_state = false;
1165 if (cur_state->halt)
1167 /* Reached a halt state.
1168 Check the halt state can satisfy the current context. */
1169 if (!cur_state->has_constraint
1170 || check_halt_state_context (mctx, cur_state,
1171 re_string_cur_idx (&mctx->input)))
1173 /* We found an appropriate halt state. */
1174 match_last = re_string_cur_idx (&mctx->input);
1177 /* We found a match, do not modify match_first below. */
1178 p_match_first = NULL;
1179 if (!fl_longest_match)
1186 *p_match_first += next_start_idx;
1191 /* Check NODE match the current context. */
1195 check_halt_node_context (const re_dfa_t *dfa, Idx node, unsigned int context)
1197 re_token_type_t type = dfa->nodes[node].type;
1198 unsigned int constraint = dfa->nodes[node].constraint;
1199 if (type != END_OF_RE)
1203 if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1208 /* Check the halt state STATE match the current context.
1209 Return 0 if not match, if the node, STATE has, is a halt node and
1210 match the context, return the node. */
1214 check_halt_state_context (const re_match_context_t *mctx,
1215 const re_dfastate_t *state, Idx idx)
1218 unsigned int context;
1220 assert (state->halt);
1222 context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1223 for (i = 0; i < state->nodes.nelem; ++i)
1224 if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1225 return state->nodes.elems[i];
1229 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1230 corresponding to the DFA).
1231 Return the destination node, and update EPS_VIA_NODES;
1232 return REG_MISSING in case of errors. */
1236 proceed_next_node (const re_match_context_t *mctx,
1237 Idx nregs, regmatch_t *regs, Idx *pidx, Idx node,
1238 re_node_set *eps_via_nodes, struct re_fail_stack_t *fs)
1240 re_dfa_t *const dfa = mctx->dfa;
1243 if (IS_EPSILON_NODE (dfa->nodes[node].type))
1245 re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1246 re_node_set *edests = &dfa->edests[node];
1248 ok = re_node_set_insert (eps_via_nodes, node);
1251 /* Pick up a valid destination, or return REG_MISSING if none
1253 for (dest_node = REG_MISSING, i = 0; i < edests->nelem; ++i)
1255 Idx candidate = edests->elems[i];
1256 if (!re_node_set_contains (cur_nodes, candidate))
1258 if (dest_node == REG_MISSING)
1259 dest_node = candidate;
1263 /* In order to avoid infinite loop like "(a*)*", return the second
1264 epsilon-transition if the first was already considered. */
1265 if (re_node_set_contains (eps_via_nodes, dest_node))
1268 /* Otherwise, push the second epsilon-transition on the fail stack. */
1270 && push_fail_stack (fs, *pidx, candidate, nregs, regs,
1274 /* We know we are going to exit. */
1283 re_token_type_t type = dfa->nodes[node].type;
1285 #ifdef RE_ENABLE_I18N
1286 if (dfa->nodes[node].accept_mb)
1287 naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1289 #endif /* RE_ENABLE_I18N */
1290 if (type == OP_BACK_REF)
1292 Idx subexp_idx = dfa->nodes[node].opr.idx + 1;
1293 naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1296 if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1300 char *buf = (char *) re_string_get_buffer (&mctx->input);
1301 if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1310 ok = re_node_set_insert (eps_via_nodes, node);
1313 dest_node = dfa->edests[node].elems[0];
1314 if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1321 || check_node_accept (mctx, dfa->nodes + node, *pidx))
1323 Idx dest_node = dfa->nexts[node];
1324 *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1325 if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1326 || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1329 re_node_set_empty (eps_via_nodes);
1336 static reg_errcode_t
1338 push_fail_stack (struct re_fail_stack_t *fs, Idx str_idx, Idx dest_node,
1339 Idx nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
1342 Idx num = fs->num++;
1343 if (fs->num == fs->alloc)
1345 struct re_fail_stack_ent_t *new_array =
1346 re_x2realloc (fs->stack, struct re_fail_stack_ent_t, &fs->alloc);
1347 if (new_array == NULL)
1349 fs->stack = new_array;
1351 fs->stack[num].idx = str_idx;
1352 fs->stack[num].node = dest_node;
1353 fs->stack[num].regs = re_xmalloc (regmatch_t, nregs);
1354 if (fs->stack[num].regs == NULL)
1356 memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1357 err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1363 pop_fail_stack (struct re_fail_stack_t *fs, Idx *pidx,
1364 Idx nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
1366 Idx num = --fs->num;
1367 assert (REG_VALID_INDEX (num));
1368 *pidx = fs->stack[num].idx;
1369 memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1370 re_node_set_free (eps_via_nodes);
1371 re_free (fs->stack[num].regs);
1372 *eps_via_nodes = fs->stack[num].eps_via_nodes;
1373 return fs->stack[num].node;
1376 /* Set the positions where the subexpressions are starts/ends to registers
1378 Note: We assume that pmatch[0] is already set, and
1379 pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch. */
1381 static reg_errcode_t
1383 set_regs (const regex_t *preg, const re_match_context_t *mctx,
1384 size_t nmatch, regmatch_t *pmatch, bool fl_backtrack)
1386 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
1388 re_node_set eps_via_nodes;
1389 struct re_fail_stack_t *fs;
1390 struct re_fail_stack_t fs_body = { 0, 2, NULL };
1391 regmatch_t *prev_idx_match;
1392 bool prev_idx_match_malloced = false;
1395 assert (nmatch > 1);
1396 assert (mctx->state_log != NULL);
1401 fs->stack = re_xmalloc (struct re_fail_stack_ent_t, fs->alloc);
1402 if (fs->stack == NULL)
1408 cur_node = dfa->init_node;
1409 re_node_set_init_empty (&eps_via_nodes);
1411 if (re_alloc_oversized (nmatch, sizeof (regmatch_t)))
1413 free_fail_stack_return (fs);
1416 if (__libc_use_alloca (nmatch * sizeof (regmatch_t)))
1417 prev_idx_match = (regmatch_t *) alloca (nmatch * sizeof (regmatch_t));
1420 prev_idx_match = re_malloc (regmatch_t, nmatch);
1421 if (prev_idx_match == NULL)
1423 free_fail_stack_return (fs);
1426 prev_idx_match_malloced = true;
1428 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1430 for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1432 update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
1434 if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1439 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1440 if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1442 if (reg_idx == nmatch)
1444 re_node_set_free (&eps_via_nodes);
1445 if (prev_idx_match_malloced)
1446 re_free (prev_idx_match);
1447 return free_fail_stack_return (fs);
1449 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1454 re_node_set_free (&eps_via_nodes);
1455 if (prev_idx_match_malloced)
1456 re_free (prev_idx_match);
1461 /* Proceed to next node. */
1462 cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
1463 &eps_via_nodes, fs);
1465 if (BE (! REG_VALID_INDEX (cur_node), 0))
1467 if (BE (cur_node == REG_ERROR, 0))
1469 re_node_set_free (&eps_via_nodes);
1470 if (prev_idx_match_malloced)
1471 re_free (prev_idx_match);
1472 free_fail_stack_return (fs);
1476 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1480 re_node_set_free (&eps_via_nodes);
1481 if (prev_idx_match_malloced)
1482 re_free (prev_idx_match);
1487 re_node_set_free (&eps_via_nodes);
1488 if (prev_idx_match_malloced)
1489 re_free (prev_idx_match);
1490 return free_fail_stack_return (fs);
1493 static reg_errcode_t
1495 free_fail_stack_return (struct re_fail_stack_t *fs)
1500 for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1502 re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1503 re_free (fs->stack[fs_idx].regs);
1505 re_free (fs->stack);
1512 update_regs (re_dfa_t *dfa, regmatch_t *pmatch, regmatch_t *prev_idx_match,
1513 Idx cur_node, Idx cur_idx, Idx nmatch)
1515 int type = dfa->nodes[cur_node].type;
1516 if (type == OP_OPEN_SUBEXP)
1518 Idx reg_num = dfa->nodes[cur_node].opr.idx + 1;
1520 /* We are at the first node of this sub expression. */
1521 if (reg_num < nmatch)
1523 pmatch[reg_num].rm_so = cur_idx;
1524 pmatch[reg_num].rm_eo = -1;
1527 else if (type == OP_CLOSE_SUBEXP)
1529 Idx reg_num = dfa->nodes[cur_node].opr.idx + 1;
1530 if (reg_num < nmatch)
1532 /* We are at the last node of this sub expression. */
1533 if (pmatch[reg_num].rm_so < cur_idx)
1535 pmatch[reg_num].rm_eo = cur_idx;
1536 /* This is a non-empty match or we are not inside an optional
1537 subexpression. Accept this right away. */
1538 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1542 if (dfa->nodes[cur_node].opt_subexp
1543 && prev_idx_match[reg_num].rm_so != -1)
1544 /* We transited through an empty match for an optional
1545 subexpression, like (a?)*, and this is not the subexp's
1546 first match. Copy back the old content of the registers
1547 so that matches of an inner subexpression are undone as
1548 well, like in ((a?))*. */
1549 memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1551 /* We completed a subexpression, but it may be part of
1552 an optional one, so do not update PREV_IDX_MATCH. */
1553 pmatch[reg_num].rm_eo = cur_idx;
1559 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1560 and sift the nodes in each states according to the following rules.
1561 Updated state_log will be wrote to STATE_LOG.
1563 Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1564 1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1565 If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1566 the LAST_NODE, we throw away the node `a'.
1567 2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1568 string `s' and transit to `b':
1569 i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1571 ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1572 thrown away, we throw away the node `a'.
1573 3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1574 i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1576 ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1577 we throw away the node `a'. */
1579 #define STATE_NODE_CONTAINS(state,node) \
1580 ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1582 static reg_errcode_t
1584 sift_states_backward (re_match_context_t *mctx, re_sift_context_t *sctx)
1588 Idx str_idx = sctx->last_str_idx;
1589 re_node_set cur_dest;
1592 assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1595 /* Build sifted state_log[str_idx]. It has the nodes which can epsilon
1596 transit to the last_node and the last_node itself. */
1597 err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1598 if (BE (err != REG_NOERROR, 0))
1600 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1601 if (BE (err != REG_NOERROR, 0))
1604 /* Then check each states in the state_log. */
1607 /* Update counters. */
1608 null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1609 if (null_cnt > mctx->max_mb_elem_len)
1611 memset (sctx->sifted_states, '\0',
1612 sizeof (re_dfastate_t *) * str_idx);
1613 re_node_set_free (&cur_dest);
1616 re_node_set_empty (&cur_dest);
1619 if (mctx->state_log[str_idx])
1621 err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1622 if (BE (err != REG_NOERROR, 0))
1626 /* Add all the nodes which satisfy the following conditions:
1627 - It can epsilon transit to a node in CUR_DEST.
1629 And update state_log. */
1630 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1631 if (BE (err != REG_NOERROR, 0))
1636 re_node_set_free (&cur_dest);
1640 static reg_errcode_t
1642 build_sifted_states (re_match_context_t *mctx, re_sift_context_t *sctx,
1643 Idx str_idx, re_node_set *cur_dest)
1645 re_dfa_t *const dfa = mctx->dfa;
1646 re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
1649 /* Then build the next sifted state.
1650 We build the next sifted state on `cur_dest', and update
1651 `sifted_states[str_idx]' with `cur_dest'.
1653 `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1654 `cur_src' points the node_set of the old `state_log[str_idx]'
1655 (with the epsilon nodes pre-filtered out). */
1656 for (i = 0; i < cur_src->nelem; i++)
1658 Idx prev_node = cur_src->elems[i];
1663 re_token_type_t type = dfa->nodes[prev_node].type;
1664 assert (!IS_EPSILON_NODE (type));
1666 #ifdef RE_ENABLE_I18N
1667 /* If the node may accept `multi byte'. */
1668 if (dfa->nodes[prev_node].accept_mb)
1669 naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1670 str_idx, sctx->last_str_idx);
1671 #endif /* RE_ENABLE_I18N */
1673 /* We don't check backreferences here.
1674 See update_cur_sifted_state(). */
1676 && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1677 && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1678 dfa->nexts[prev_node]))
1684 if (sctx->limits.nelem)
1686 Idx to_idx = str_idx + naccepted;
1687 if (check_dst_limits (mctx, &sctx->limits,
1688 dfa->nexts[prev_node], to_idx,
1689 prev_node, str_idx))
1692 ok = re_node_set_insert (cur_dest, prev_node);
1700 /* Helper functions. */
1702 static reg_errcode_t
1704 clean_state_log_if_needed (re_match_context_t *mctx, Idx next_state_log_idx)
1706 Idx top = mctx->state_log_top;
1708 if (next_state_log_idx >= mctx->input.bufs_len
1709 || (next_state_log_idx >= mctx->input.valid_len
1710 && mctx->input.valid_len < mctx->input.len))
1713 err = extend_buffers (mctx);
1714 if (BE (err != REG_NOERROR, 0))
1718 if (top < next_state_log_idx)
1720 memset (mctx->state_log + top + 1, '\0',
1721 sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1722 mctx->state_log_top = next_state_log_idx;
1727 static reg_errcode_t
1729 merge_state_array (re_dfa_t *dfa, re_dfastate_t **dst, re_dfastate_t **src,
1734 for (st_idx = 0; st_idx < num; ++st_idx)
1736 if (dst[st_idx] == NULL)
1737 dst[st_idx] = src[st_idx];
1738 else if (src[st_idx] != NULL)
1740 re_node_set merged_set;
1741 err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1742 &src[st_idx]->nodes);
1743 if (BE (err != REG_NOERROR, 0))
1745 dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1746 re_node_set_free (&merged_set);
1747 if (BE (err != REG_NOERROR, 0))
1754 static reg_errcode_t
1756 update_cur_sifted_state (re_match_context_t *mctx, re_sift_context_t *sctx,
1757 Idx str_idx, re_node_set *dest_nodes)
1759 re_dfa_t *const dfa = mctx->dfa;
1761 const re_node_set *candidates;
1762 candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1763 : &mctx->state_log[str_idx]->nodes);
1765 if (dest_nodes->nelem == 0)
1766 sctx->sifted_states[str_idx] = NULL;
1771 /* At first, add the nodes which can epsilon transit to a node in
1773 err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1774 if (BE (err != REG_NOERROR, 0))
1777 /* Then, check the limitations in the current sift_context. */
1778 if (sctx->limits.nelem)
1780 err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1781 mctx->bkref_ents, str_idx);
1782 if (BE (err != REG_NOERROR, 0))
1787 sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1788 if (BE (err != REG_NOERROR, 0))
1792 if (candidates && mctx->state_log[str_idx]->has_backref)
1794 err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1795 if (BE (err != REG_NOERROR, 0))
1801 static reg_errcode_t
1803 add_epsilon_src_nodes (re_dfa_t *dfa, re_node_set *dest_nodes,
1804 const re_node_set *candidates)
1806 reg_errcode_t err = REG_NOERROR;
1809 re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
1810 if (BE (err != REG_NOERROR, 0))
1813 if (!state->inveclosure.alloc)
1815 err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
1816 if (BE (err != REG_NOERROR, 0))
1818 for (i = 0; i < dest_nodes->nelem; i++)
1819 re_node_set_merge (&state->inveclosure,
1820 dfa->inveclosures + dest_nodes->elems[i]);
1822 return re_node_set_add_intersect (dest_nodes, candidates,
1823 &state->inveclosure);
1826 static reg_errcode_t
1828 sub_epsilon_src_nodes (re_dfa_t *dfa, Idx node, re_node_set *dest_nodes,
1829 const re_node_set *candidates)
1833 re_node_set *inv_eclosure = dfa->inveclosures + node;
1834 re_node_set except_nodes;
1835 re_node_set_init_empty (&except_nodes);
1836 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1838 Idx cur_node = inv_eclosure->elems[ecl_idx];
1839 if (cur_node == node)
1841 if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1843 Idx edst1 = dfa->edests[cur_node].elems[0];
1844 Idx edst2 = ((dfa->edests[cur_node].nelem > 1)
1845 ? dfa->edests[cur_node].elems[1] : REG_MISSING);
1846 if ((!re_node_set_contains (inv_eclosure, edst1)
1847 && re_node_set_contains (dest_nodes, edst1))
1848 || (REG_VALID_NONZERO_INDEX (edst2)
1849 && !re_node_set_contains (inv_eclosure, edst2)
1850 && re_node_set_contains (dest_nodes, edst2)))
1852 err = re_node_set_add_intersect (&except_nodes, candidates,
1853 dfa->inveclosures + cur_node);
1854 if (BE (err != REG_NOERROR, 0))
1856 re_node_set_free (&except_nodes);
1862 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1864 Idx cur_node = inv_eclosure->elems[ecl_idx];
1865 if (!re_node_set_contains (&except_nodes, cur_node))
1867 Idx idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1868 re_node_set_remove_at (dest_nodes, idx);
1871 re_node_set_free (&except_nodes);
1877 check_dst_limits (const re_match_context_t *mctx, const re_node_set *limits,
1878 Idx dst_node, Idx dst_idx, Idx src_node, Idx src_idx)
1880 re_dfa_t *const dfa = mctx->dfa;
1881 Idx lim_idx, src_pos, dst_pos;
1883 Idx dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1884 Idx src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1885 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1888 struct re_backref_cache_entry *ent;
1889 ent = mctx->bkref_ents + limits->elems[lim_idx];
1890 subexp_idx = dfa->nodes[ent->node].opr.idx;
1892 dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1893 subexp_idx, dst_node, dst_idx,
1895 src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1896 subexp_idx, src_node, src_idx,
1900 <src> <dst> ( <subexp> )
1901 ( <subexp> ) <src> <dst>
1902 ( <subexp1> <src> <subexp2> <dst> <subexp3> ) */
1903 if (src_pos == dst_pos)
1904 continue; /* This is unrelated limitation. */
1913 check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, int boundaries,
1914 Idx subexp_idx, Idx from_node, Idx bkref_idx)
1916 re_dfa_t *const dfa = mctx->dfa;
1917 re_node_set *eclosures = dfa->eclosures + from_node;
1920 /* Else, we are on the boundary: examine the nodes on the epsilon
1922 for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1924 Idx node = eclosures->elems[node_idx];
1925 switch (dfa->nodes[node].type)
1928 if (bkref_idx != REG_MISSING)
1930 struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1936 if (ent->node != node)
1939 if (subexp_idx < BITSET_WORD_BITS
1940 && !(ent->eps_reachable_subexps_map
1941 & ((bitset_word) 1 << subexp_idx)))
1944 /* Recurse trying to reach the OP_OPEN_SUBEXP and
1945 OP_CLOSE_SUBEXP cases below. But, if the
1946 destination node is the same node as the source
1947 node, don't recurse because it would cause an
1948 infinite loop: a regex that exhibits this behavior
1950 dst = dfa->edests[node].elems[0];
1951 if (dst == from_node)
1955 else /* if (boundaries & 2) */
1960 check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1962 if (cpos == -1 /* && (boundaries & 1) */)
1964 if (cpos == 0 && (boundaries & 2))
1967 if (subexp_idx < BITSET_WORD_BITS)
1968 ent->eps_reachable_subexps_map &=
1969 ~ ((bitset_word) 1 << subexp_idx);
1971 while (ent++->more);
1975 case OP_OPEN_SUBEXP:
1976 if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
1980 case OP_CLOSE_SUBEXP:
1981 if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
1990 return (boundaries & 2) ? 1 : 0;
1995 check_dst_limits_calc_pos (const re_match_context_t *mctx,
1996 Idx limit, Idx subexp_idx,
1997 Idx from_node, Idx str_idx, Idx bkref_idx)
1999 struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
2002 /* If we are outside the range of the subexpression, return -1 or 1. */
2003 if (str_idx < lim->subexp_from)
2006 if (lim->subexp_to < str_idx)
2009 /* If we are within the subexpression, return 0. */
2010 boundaries = (str_idx == lim->subexp_from);
2011 boundaries |= (str_idx == lim->subexp_to) << 1;
2012 if (boundaries == 0)
2015 /* Else, examine epsilon closure. */
2016 return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
2017 from_node, bkref_idx);
2020 /* Check the limitations of sub expressions LIMITS, and remove the nodes
2021 which are against limitations from DEST_NODES. */
2023 static reg_errcode_t
2025 check_subexp_limits (re_dfa_t *dfa, re_node_set *dest_nodes,
2026 const re_node_set *candidates, re_node_set *limits,
2027 struct re_backref_cache_entry *bkref_ents, Idx str_idx)
2030 Idx node_idx, lim_idx;
2032 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
2035 struct re_backref_cache_entry *ent;
2036 ent = bkref_ents + limits->elems[lim_idx];
2038 if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
2039 continue; /* This is unrelated limitation. */
2041 subexp_idx = dfa->nodes[ent->node].opr.idx;
2042 if (ent->subexp_to == str_idx)
2044 Idx ops_node = REG_MISSING;
2045 Idx cls_node = REG_MISSING;
2046 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2048 Idx node = dest_nodes->elems[node_idx];
2049 re_token_type_t type = dfa->nodes[node].type;
2050 if (type == OP_OPEN_SUBEXP
2051 && subexp_idx == dfa->nodes[node].opr.idx)
2053 else if (type == OP_CLOSE_SUBEXP
2054 && subexp_idx == dfa->nodes[node].opr.idx)
2058 /* Check the limitation of the open subexpression. */
2059 /* Note that (ent->subexp_to = str_idx != ent->subexp_from). */
2060 if (REG_VALID_INDEX (ops_node))
2062 err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
2064 if (BE (err != REG_NOERROR, 0))
2068 /* Check the limitation of the close subexpression. */
2069 if (REG_VALID_INDEX (cls_node))
2070 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2072 Idx node = dest_nodes->elems[node_idx];
2073 if (!re_node_set_contains (dfa->inveclosures + node,
2075 && !re_node_set_contains (dfa->eclosures + node,
2078 /* It is against this limitation.
2079 Remove it form the current sifted state. */
2080 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2082 if (BE (err != REG_NOERROR, 0))
2088 else /* (ent->subexp_to != str_idx) */
2090 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2092 Idx node = dest_nodes->elems[node_idx];
2093 re_token_type_t type = dfa->nodes[node].type;
2094 if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2096 if (subexp_idx != dfa->nodes[node].opr.idx)
2098 /* It is against this limitation.
2099 Remove it form the current sifted state. */
2100 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2102 if (BE (err != REG_NOERROR, 0))
2111 static reg_errcode_t
2113 sift_states_bkref (re_match_context_t *mctx, re_sift_context_t *sctx,
2114 Idx str_idx, const re_node_set *candidates)
2116 re_dfa_t *const dfa = mctx->dfa;
2119 re_sift_context_t local_sctx;
2120 Idx first_idx = search_cur_bkref_entry (mctx, str_idx);
2122 if (first_idx == REG_MISSING)
2125 local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized. */
2127 for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2130 re_token_type_t type;
2131 struct re_backref_cache_entry *entry;
2132 node = candidates->elems[node_idx];
2133 type = dfa->nodes[node].type;
2134 /* Avoid infinite loop for the REs like "()\1+". */
2135 if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2137 if (type != OP_BACK_REF)
2140 entry = mctx->bkref_ents + first_idx;
2141 enabled_idx = first_idx;
2145 Idx subexp_len, to_idx, dst_node;
2146 re_dfastate_t *cur_state;
2148 if (entry->node != node)
2150 subexp_len = entry->subexp_to - entry->subexp_from;
2151 to_idx = str_idx + subexp_len;
2152 dst_node = (subexp_len ? dfa->nexts[node]
2153 : dfa->edests[node].elems[0]);
2155 if (to_idx > sctx->last_str_idx
2156 || sctx->sifted_states[to_idx] == NULL
2157 || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2158 || check_dst_limits (mctx, &sctx->limits, node,
2159 str_idx, dst_node, to_idx))
2162 if (local_sctx.sifted_states == NULL)
2165 err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2166 if (BE (err != REG_NOERROR, 0))
2169 local_sctx.last_node = node;
2170 local_sctx.last_str_idx = str_idx;
2171 ok = re_node_set_insert (&local_sctx.limits, enabled_idx);
2177 cur_state = local_sctx.sifted_states[str_idx];
2178 err = sift_states_backward (mctx, &local_sctx);
2179 if (BE (err != REG_NOERROR, 0))
2181 if (sctx->limited_states != NULL)
2183 err = merge_state_array (dfa, sctx->limited_states,
2184 local_sctx.sifted_states,
2186 if (BE (err != REG_NOERROR, 0))
2189 local_sctx.sifted_states[str_idx] = cur_state;
2190 re_node_set_remove (&local_sctx.limits, enabled_idx);
2192 /* mctx->bkref_ents may have changed, reload the pointer. */
2193 entry = mctx->bkref_ents + enabled_idx;
2195 while (enabled_idx++, entry++->more);
2199 if (local_sctx.sifted_states != NULL)
2201 re_node_set_free (&local_sctx.limits);
2208 #ifdef RE_ENABLE_I18N
2211 sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx,
2212 Idx node_idx, Idx str_idx, Idx max_str_idx)
2214 re_dfa_t *const dfa = mctx->dfa;
2216 /* Check the node can accept `multi byte'. */
2217 naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2218 if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
2219 !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2220 dfa->nexts[node_idx]))
2221 /* The node can't accept the `multi byte', or the
2222 destination was already thrown away, then the node
2223 could't accept the current input `multi byte'. */
2225 /* Otherwise, it is sure that the node could accept
2226 `naccepted' bytes input. */
2229 #endif /* RE_ENABLE_I18N */
2232 /* Functions for state transition. */
2234 /* Return the next state to which the current state STATE will transit by
2235 accepting the current input byte, and update STATE_LOG if necessary.
2236 If STATE can accept a multibyte char/collating element/back reference
2237 update the destination of STATE_LOG. */
2239 static re_dfastate_t *
2241 transit_state (reg_errcode_t *err, re_match_context_t *mctx,
2242 re_dfastate_t *state)
2244 re_dfastate_t **trtable;
2247 #ifdef RE_ENABLE_I18N
2248 /* If the current state can accept multibyte. */
2249 if (BE (state->accept_mb, 0))
2251 *err = transit_state_mb (mctx, state);
2252 if (BE (*err != REG_NOERROR, 0))
2255 #endif /* RE_ENABLE_I18N */
2257 /* Then decide the next state with the single byte. */
2260 /* don't use transition table */
2261 return transit_state_sb (err, mctx, state);
2264 /* Use transition table */
2265 ch = re_string_fetch_byte (&mctx->input);
2268 trtable = state->trtable;
2269 if (BE (trtable != NULL, 1))
2272 trtable = state->word_trtable;
2273 if (BE (trtable != NULL, 1))
2275 unsigned int context;
2277 = re_string_context_at (&mctx->input,
2278 re_string_cur_idx (&mctx->input) - 1,
2280 if (IS_WORD_CONTEXT (context))
2281 return trtable[ch + SBC_MAX];
2286 if (!build_trtable (mctx->dfa, state))
2292 /* Retry, we now have a transition table. */
2296 /* Update the state_log if we need */
2299 merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx,
2300 re_dfastate_t *next_state)
2302 re_dfa_t *const dfa = mctx->dfa;
2303 Idx cur_idx = re_string_cur_idx (&mctx->input);
2305 if (cur_idx > mctx->state_log_top)
2307 mctx->state_log[cur_idx] = next_state;
2308 mctx->state_log_top = cur_idx;
2310 else if (mctx->state_log[cur_idx] == 0)
2312 mctx->state_log[cur_idx] = next_state;
2316 re_dfastate_t *pstate;
2317 unsigned int context;
2318 re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2319 /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2320 the destination of a multibyte char/collating element/
2321 back reference. Then the next state is the union set of
2322 these destinations and the results of the transition table. */
2323 pstate = mctx->state_log[cur_idx];
2324 log_nodes = pstate->entrance_nodes;
2325 if (next_state != NULL)
2327 table_nodes = next_state->entrance_nodes;
2328 *err = re_node_set_init_union (&next_nodes, table_nodes,
2330 if (BE (*err != REG_NOERROR, 0))
2334 next_nodes = *log_nodes;
2335 /* Note: We already add the nodes of the initial state,
2336 then we don't need to add them here. */
2338 context = re_string_context_at (&mctx->input,
2339 re_string_cur_idx (&mctx->input) - 1,
2341 next_state = mctx->state_log[cur_idx]
2342 = re_acquire_state_context (err, dfa, &next_nodes, context);
2343 /* We don't need to check errors here, since the return value of
2344 this function is next_state and ERR is already set. */
2346 if (table_nodes != NULL)
2347 re_node_set_free (&next_nodes);
2350 if (BE (dfa->nbackref, 0) && next_state != NULL)
2352 /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2353 later. We must check them here, since the back references in the
2354 next state might use them. */
2355 *err = check_subexp_matching_top (mctx, &next_state->nodes,
2357 if (BE (*err != REG_NOERROR, 0))
2360 /* If the next state has back references. */
2361 if (next_state->has_backref)
2363 *err = transit_state_bkref (mctx, &next_state->nodes);
2364 if (BE (*err != REG_NOERROR, 0))
2366 next_state = mctx->state_log[cur_idx];
2373 /* Skip bytes in the input that correspond to part of a
2374 multi-byte match, then look in the log for a state
2375 from which to restart matching. */
2376 static re_dfastate_t *
2378 find_recover_state (reg_errcode_t *err, re_match_context_t *mctx)
2380 re_dfastate_t *cur_state = NULL;
2383 Idx max = mctx->state_log_top;
2384 Idx cur_str_idx = re_string_cur_idx (&mctx->input);
2388 if (++cur_str_idx > max)
2390 re_string_skip_bytes (&mctx->input, 1);
2392 while (mctx->state_log[cur_str_idx] == NULL);
2394 cur_state = merge_state_with_log (err, mctx, NULL);
2396 while (*err == REG_NOERROR && cur_state == NULL);
2400 /* Helper functions for transit_state. */
2402 /* From the node set CUR_NODES, pick up the nodes whose types are
2403 OP_OPEN_SUBEXP and which have corresponding back references in the regular
2404 expression. And register them to use them later for evaluating the
2405 correspoding back references. */
2407 static reg_errcode_t
2409 check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes,
2412 re_dfa_t *const dfa = mctx->dfa;
2416 /* TODO: This isn't efficient.
2417 Because there might be more than one nodes whose types are
2418 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2421 for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2423 Idx node = cur_nodes->elems[node_idx];
2424 if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2425 && dfa->nodes[node].opr.idx < BITSET_WORD_BITS
2426 && (dfa->used_bkref_map
2427 & ((bitset_word) 1 << dfa->nodes[node].opr.idx)))
2429 err = match_ctx_add_subtop (mctx, node, str_idx);
2430 if (BE (err != REG_NOERROR, 0))
2438 /* Return the next state to which the current state STATE will transit by
2439 accepting the current input byte. */
2441 static re_dfastate_t *
2442 transit_state_sb (reg_errcode_t *err, re_match_context_t *mctx,
2443 re_dfastate_t *state)
2445 re_dfa_t *const dfa = mctx->dfa;
2446 re_node_set next_nodes;
2447 re_dfastate_t *next_state;
2448 Idx node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
2449 unsigned int context;
2451 *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2452 if (BE (*err != REG_NOERROR, 0))
2454 for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2456 Idx cur_node = state->nodes.elems[node_cnt];
2457 if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
2459 *err = re_node_set_merge (&next_nodes,
2460 dfa->eclosures + dfa->nexts[cur_node]);
2461 if (BE (*err != REG_NOERROR, 0))
2463 re_node_set_free (&next_nodes);
2468 context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
2469 next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2470 /* We don't need to check errors here, since the return value of
2471 this function is next_state and ERR is already set. */
2473 re_node_set_free (&next_nodes);
2474 re_string_skip_bytes (&mctx->input, 1);
2479 #ifdef RE_ENABLE_I18N
2480 static reg_errcode_t
2482 transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate)
2484 re_dfa_t *const dfa = mctx->dfa;
2488 for (i = 0; i < pstate->nodes.nelem; ++i)
2490 re_node_set dest_nodes, *new_nodes;
2491 Idx cur_node_idx = pstate->nodes.elems[i];
2494 unsigned int context;
2495 re_dfastate_t *dest_state;
2497 if (!dfa->nodes[cur_node_idx].accept_mb)
2500 if (dfa->nodes[cur_node_idx].constraint)
2502 context = re_string_context_at (&mctx->input,
2503 re_string_cur_idx (&mctx->input),
2505 if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2510 /* How many bytes the node can accept? */
2511 naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2512 re_string_cur_idx (&mctx->input));
2516 /* The node can accepts `naccepted' bytes. */
2517 dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2518 mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2519 : mctx->max_mb_elem_len);
2520 err = clean_state_log_if_needed (mctx, dest_idx);
2521 if (BE (err != REG_NOERROR, 0))
2524 assert (dfa->nexts[cur_node_idx] != REG_MISSING);
2526 new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];
2528 dest_state = mctx->state_log[dest_idx];
2529 if (dest_state == NULL)
2530 dest_nodes = *new_nodes;
2533 err = re_node_set_init_union (&dest_nodes,
2534 dest_state->entrance_nodes, new_nodes);
2535 if (BE (err != REG_NOERROR, 0))
2538 context = re_string_context_at (&mctx->input, dest_idx - 1, mctx->eflags);
2539 mctx->state_log[dest_idx]
2540 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2541 if (dest_state != NULL)
2542 re_node_set_free (&dest_nodes);
2543 if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
2548 #endif /* RE_ENABLE_I18N */
2550 static reg_errcode_t
2552 transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes)
2554 re_dfa_t *const dfa = mctx->dfa;
2557 Idx cur_str_idx = re_string_cur_idx (&mctx->input);
2559 for (i = 0; i < nodes->nelem; ++i)
2561 Idx dest_str_idx, prev_nelem, bkc_idx;
2562 Idx node_idx = nodes->elems[i];
2563 unsigned int context;
2564 const re_token_t *node = dfa->nodes + node_idx;
2565 re_node_set *new_dest_nodes;
2567 /* Check whether `node' is a backreference or not. */
2568 if (node->type != OP_BACK_REF)
2571 if (node->constraint)
2573 context = re_string_context_at (&mctx->input, cur_str_idx,
2575 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2579 /* `node' is a backreference.
2580 Check the substring which the substring matched. */
2581 bkc_idx = mctx->nbkref_ents;
2582 err = get_subexp (mctx, node_idx, cur_str_idx);
2583 if (BE (err != REG_NOERROR, 0))
2586 /* And add the epsilon closures (which is `new_dest_nodes') of
2587 the backreference to appropriate state_log. */
2589 assert (dfa->nexts[node_idx] != REG_MISSING);
2591 for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2594 re_dfastate_t *dest_state;
2595 struct re_backref_cache_entry *bkref_ent;
2596 bkref_ent = mctx->bkref_ents + bkc_idx;
2597 if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2599 subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2600 new_dest_nodes = (subexp_len == 0
2601 ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2602 : dfa->eclosures + dfa->nexts[node_idx]);
2603 dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2604 - bkref_ent->subexp_from);
2605 context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2607 dest_state = mctx->state_log[dest_str_idx];
2608 prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2609 : mctx->state_log[cur_str_idx]->nodes.nelem);
2610 /* Add `new_dest_node' to state_log. */
2611 if (dest_state == NULL)
2613 mctx->state_log[dest_str_idx]
2614 = re_acquire_state_context (&err, dfa, new_dest_nodes,
2616 if (BE (mctx->state_log[dest_str_idx] == NULL
2617 && err != REG_NOERROR, 0))
2622 re_node_set dest_nodes;
2623 err = re_node_set_init_union (&dest_nodes,
2624 dest_state->entrance_nodes,
2626 if (BE (err != REG_NOERROR, 0))
2628 re_node_set_free (&dest_nodes);
2631 mctx->state_log[dest_str_idx]
2632 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2633 re_node_set_free (&dest_nodes);
2634 if (BE (mctx->state_log[dest_str_idx] == NULL
2635 && err != REG_NOERROR, 0))
2638 /* We need to check recursively if the backreference can epsilon
2641 && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2643 err = check_subexp_matching_top (mctx, new_dest_nodes,
2645 if (BE (err != REG_NOERROR, 0))
2647 err = transit_state_bkref (mctx, new_dest_nodes);
2648 if (BE (err != REG_NOERROR, 0))
2658 /* Enumerate all the candidates which the backreference BKREF_NODE can match
2659 at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2660 Note that we might collect inappropriate candidates here.
2661 However, the cost of checking them strictly here is too high, then we
2662 delay these checking for prune_impossible_nodes(). */
2664 static reg_errcode_t
2666 get_subexp (re_match_context_t *mctx, Idx bkref_node, Idx bkref_str_idx)
2668 re_dfa_t *const dfa = mctx->dfa;
2669 Idx subexp_num, sub_top_idx;
2670 const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2671 /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX. */
2672 Idx cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2673 if (cache_idx != REG_MISSING)
2675 const struct re_backref_cache_entry *entry = mctx->bkref_ents + cache_idx;
2677 if (entry->node == bkref_node)
2678 return REG_NOERROR; /* We already checked it. */
2679 while (entry++->more);
2682 subexp_num = dfa->nodes[bkref_node].opr.idx;
2684 /* For each sub expression */
2685 for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2688 re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2689 re_sub_match_last_t *sub_last;
2690 Idx sub_last_idx, sl_str, bkref_str_off;
2692 if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2693 continue; /* It isn't related. */
2695 sl_str = sub_top->str_idx;
2696 bkref_str_off = bkref_str_idx;
2697 /* At first, check the last node of sub expressions we already
2699 for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2701 regoff_t sl_str_diff;
2702 sub_last = sub_top->lasts[sub_last_idx];
2703 sl_str_diff = sub_last->str_idx - sl_str;
2704 /* The matched string by the sub expression match with the substring
2705 at the back reference? */
2706 if (sl_str_diff > 0)
2708 if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))
2710 /* Not enough chars for a successful match. */
2711 if (bkref_str_off + sl_str_diff > mctx->input.len)
2714 err = clean_state_log_if_needed (mctx,
2717 if (BE (err != REG_NOERROR, 0))
2719 buf = (const char *) re_string_get_buffer (&mctx->input);
2721 if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2722 break; /* We don't need to search this sub expression any more. */
2724 bkref_str_off += sl_str_diff;
2725 sl_str += sl_str_diff;
2726 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2729 /* Reload buf, since the preceding call might have reallocated
2731 buf = (const char *) re_string_get_buffer (&mctx->input);
2733 if (err == REG_NOMATCH)
2735 if (BE (err != REG_NOERROR, 0))
2739 if (sub_last_idx < sub_top->nlasts)
2741 if (sub_last_idx > 0)
2743 /* Then, search for the other last nodes of the sub expression. */
2744 for (; sl_str <= bkref_str_idx; ++sl_str)
2747 regoff_t sl_str_off;
2748 const re_node_set *nodes;
2749 sl_str_off = sl_str - sub_top->str_idx;
2750 /* The matched string by the sub expression match with the substring
2751 at the back reference? */
2754 if (BE (bkref_str_off >= mctx->input.valid_len, 0))
2756 /* If we are at the end of the input, we cannot match. */
2757 if (bkref_str_off >= mctx->input.len)
2760 err = extend_buffers (mctx);
2761 if (BE (err != REG_NOERROR, 0))
2764 buf = (const char *) re_string_get_buffer (&mctx->input);
2766 if (buf [bkref_str_off++] != buf[sl_str - 1])
2767 break; /* We don't need to search this sub expression
2770 if (mctx->state_log[sl_str] == NULL)
2772 /* Does this state have a ')' of the sub expression? */
2773 nodes = &mctx->state_log[sl_str]->nodes;
2774 cls_node = find_subexp_node (dfa, nodes, subexp_num, OP_CLOSE_SUBEXP);
2775 if (cls_node == REG_MISSING)
2777 if (sub_top->path == NULL)
2779 sub_top->path = re_calloc (state_array_t,
2780 sl_str - sub_top->str_idx + 1);
2781 if (sub_top->path == NULL)
2784 /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2785 in the current context? */
2786 err = check_arrival (mctx, sub_top->path, sub_top->node,
2787 sub_top->str_idx, cls_node, sl_str, OP_CLOSE_SUBEXP);
2788 if (err == REG_NOMATCH)
2790 if (BE (err != REG_NOERROR, 0))
2792 sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2793 if (BE (sub_last == NULL, 0))
2795 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2797 if (err == REG_NOMATCH)
2804 /* Helper functions for get_subexp(). */
2806 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2807 If it can arrive, register the sub expression expressed with SUB_TOP
2810 static reg_errcode_t
2812 get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top,
2813 re_sub_match_last_t *sub_last, Idx bkref_node, Idx bkref_str)
2817 /* Can the subexpression arrive the back reference? */
2818 err = check_arrival (mctx, &sub_last->path, sub_last->node,
2819 sub_last->str_idx, bkref_node, bkref_str, OP_OPEN_SUBEXP);
2820 if (err != REG_NOERROR)
2822 err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2824 if (BE (err != REG_NOERROR, 0))
2826 to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2827 return clean_state_log_if_needed (mctx, to_idx);
2830 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2831 Search '(' if FL_OPEN, or search ')' otherwise.
2832 TODO: This function isn't efficient...
2833 Because there might be more than one nodes whose types are
2834 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2840 find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
2841 Idx subexp_idx, int type)
2844 for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2846 Idx cls_node = nodes->elems[cls_idx];
2847 const re_token_t *node = dfa->nodes + cls_node;
2848 if (node->type == type
2849 && node->opr.idx == subexp_idx)
2855 /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2856 LAST_NODE at LAST_STR. We record the path onto PATH since it will be
2858 Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise. */
2860 static reg_errcode_t
2862 check_arrival (re_match_context_t *mctx, state_array_t *path,
2863 Idx top_node, Idx top_str, Idx last_node, Idx last_str,
2866 re_dfa_t *const dfa = mctx->dfa;
2868 Idx subexp_num, backup_cur_idx, str_idx, null_cnt;
2869 re_dfastate_t *cur_state = NULL;
2870 re_node_set *cur_nodes, next_nodes;
2871 re_dfastate_t **backup_state_log;
2872 unsigned int context;
2874 subexp_num = dfa->nodes[top_node].opr.idx;
2875 /* Extend the buffer if we need. */
2876 if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
2878 re_dfastate_t **new_array;
2879 Idx old_alloc = path->alloc;
2880 Idx new_alloc = old_alloc + last_str + mctx->max_mb_elem_len + 1;
2881 if (BE (new_alloc < old_alloc, 0))
2883 new_array = re_xrealloc (path->array, re_dfastate_t *, new_alloc);
2884 if (BE (new_array == NULL, 0))
2886 path->array = new_array;
2887 path->alloc = new_alloc;
2888 memset (new_array + old_alloc, '\0',
2889 sizeof (re_dfastate_t *) * (new_alloc - old_alloc));
2892 str_idx = path->next_idx == 0 ? top_str : path->next_idx;
2894 /* Temporary modify MCTX. */
2895 backup_state_log = mctx->state_log;
2896 backup_cur_idx = mctx->input.cur_idx;
2897 mctx->state_log = path->array;
2898 mctx->input.cur_idx = str_idx;
2900 /* Setup initial node set. */
2901 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2902 if (str_idx == top_str)
2904 err = re_node_set_init_1 (&next_nodes, top_node);
2905 if (BE (err != REG_NOERROR, 0))
2907 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2908 if (BE (err != REG_NOERROR, 0))
2910 re_node_set_free (&next_nodes);
2916 cur_state = mctx->state_log[str_idx];
2917 if (cur_state && cur_state->has_backref)
2919 err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2920 if (BE ( err != REG_NOERROR, 0))
2924 re_node_set_init_empty (&next_nodes);
2926 if (str_idx == top_str || (cur_state && cur_state->has_backref))
2928 if (next_nodes.nelem)
2930 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2932 if (BE ( err != REG_NOERROR, 0))
2934 re_node_set_free (&next_nodes);
2938 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2939 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2941 re_node_set_free (&next_nodes);
2944 mctx->state_log[str_idx] = cur_state;
2947 for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2949 re_node_set_empty (&next_nodes);
2950 if (mctx->state_log[str_idx + 1])
2952 err = re_node_set_merge (&next_nodes,
2953 &mctx->state_log[str_idx + 1]->nodes);
2954 if (BE (err != REG_NOERROR, 0))
2956 re_node_set_free (&next_nodes);
2962 err = check_arrival_add_next_nodes (mctx, str_idx,
2963 &cur_state->non_eps_nodes, &next_nodes);
2964 if (BE (err != REG_NOERROR, 0))
2966 re_node_set_free (&next_nodes);
2971 if (next_nodes.nelem)
2973 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2974 if (BE (err != REG_NOERROR, 0))
2976 re_node_set_free (&next_nodes);
2979 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2981 if (BE ( err != REG_NOERROR, 0))
2983 re_node_set_free (&next_nodes);
2987 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2988 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2989 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2991 re_node_set_free (&next_nodes);
2994 mctx->state_log[str_idx] = cur_state;
2995 null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
2997 re_node_set_free (&next_nodes);
2998 cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
2999 : &mctx->state_log[last_str]->nodes);
3000 path->next_idx = str_idx;
3003 mctx->state_log = backup_state_log;
3004 mctx->input.cur_idx = backup_cur_idx;
3006 /* Then check the current node set has the node LAST_NODE. */
3007 if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
3013 /* Helper functions for check_arrival. */
3015 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
3017 TODO: This function is similar to the functions transit_state*(),
3018 however this function has many additional works.
3019 Can't we unify them? */
3021 static reg_errcode_t
3023 check_arrival_add_next_nodes (re_match_context_t *mctx, Idx str_idx,
3024 re_node_set *cur_nodes,
3025 re_node_set *next_nodes)
3027 re_dfa_t *const dfa = mctx->dfa;
3031 re_node_set union_set;
3032 re_node_set_init_empty (&union_set);
3033 for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
3036 Idx cur_node = cur_nodes->elems[cur_idx];
3038 re_token_type_t type = dfa->nodes[cur_node].type;
3039 assert (!IS_EPSILON_NODE (type));
3041 #ifdef RE_ENABLE_I18N
3042 /* If the node may accept `multi byte'. */
3043 if (dfa->nodes[cur_node].accept_mb)
3045 naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
3049 re_dfastate_t *dest_state;
3050 Idx next_node = dfa->nexts[cur_node];
3051 Idx next_idx = str_idx + naccepted;
3052 dest_state = mctx->state_log[next_idx];
3053 re_node_set_empty (&union_set);
3056 err = re_node_set_merge (&union_set, &dest_state->nodes);
3057 if (BE (err != REG_NOERROR, 0))
3059 re_node_set_free (&union_set);
3063 ok = re_node_set_insert (&union_set, next_node);
3066 re_node_set_free (&union_set);
3069 mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3071 if (BE (mctx->state_log[next_idx] == NULL
3072 && err != REG_NOERROR, 0))
3074 re_node_set_free (&union_set);
3079 #endif /* RE_ENABLE_I18N */
3081 || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3083 ok = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3086 re_node_set_free (&union_set);
3091 re_node_set_free (&union_set);
3095 /* For all the nodes in CUR_NODES, add the epsilon closures of them to
3096 CUR_NODES, however exclude the nodes which are:
3097 - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3098 - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3101 static reg_errcode_t
3103 check_arrival_expand_ecl (re_dfa_t *dfa, re_node_set *cur_nodes,
3104 Idx ex_subexp, int type)
3107 Idx idx, outside_node;
3108 re_node_set new_nodes;
3110 assert (cur_nodes->nelem);
3112 err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3113 if (BE (err != REG_NOERROR, 0))
3115 /* Create a new node set NEW_NODES with the nodes which are epsilon
3116 closures of the node in CUR_NODES. */
3118 for (idx = 0; idx < cur_nodes->nelem; ++idx)
3120 Idx cur_node = cur_nodes->elems[idx];
3121 re_node_set *eclosure = dfa->eclosures + cur_node;
3122 outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3123 if (outside_node == REG_MISSING)
3125 /* There are no problematic nodes, just merge them. */
3126 err = re_node_set_merge (&new_nodes, eclosure);
3127 if (BE (err != REG_NOERROR, 0))
3129 re_node_set_free (&new_nodes);
3135 /* There are problematic nodes, re-calculate incrementally. */
3136 err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3138 if (BE (err != REG_NOERROR, 0))
3140 re_node_set_free (&new_nodes);
3145 re_node_set_free (cur_nodes);
3146 *cur_nodes = new_nodes;
3150 /* Helper function for check_arrival_expand_ecl.
3151 Check incrementally the epsilon closure of TARGET, and if it isn't
3152 problematic append it to DST_NODES. */
3154 static reg_errcode_t
3156 check_arrival_expand_ecl_sub (re_dfa_t *dfa, re_node_set *dst_nodes,
3157 Idx target, Idx ex_subexp, int type)
3160 for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3164 if (dfa->nodes[cur_node].type == type
3165 && dfa->nodes[cur_node].opr.idx == ex_subexp)
3167 if (type == OP_CLOSE_SUBEXP)
3169 ok = re_node_set_insert (dst_nodes, cur_node);
3175 ok = re_node_set_insert (dst_nodes, cur_node);
3178 if (dfa->edests[cur_node].nelem == 0)
3180 if (dfa->edests[cur_node].nelem == 2)
3183 check_arrival_expand_ecl_sub (dfa, dst_nodes,
3184 dfa->edests[cur_node].elems[1],
3186 if (BE (ret != REG_NOERROR, 0))
3189 cur_node = dfa->edests[cur_node].elems[0];
3195 /* For all the back references in the current state, calculate the
3196 destination of the back references by the appropriate entry
3197 in MCTX->BKREF_ENTS. */
3199 static reg_errcode_t
3201 expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes,
3202 Idx cur_str, Idx subexp_num, int type)
3204 re_dfa_t *const dfa = mctx->dfa;
3206 Idx cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3207 struct re_backref_cache_entry *ent;
3209 if (cache_idx_start == REG_MISSING)
3213 ent = mctx->bkref_ents + cache_idx_start;
3216 Idx to_idx, next_node;
3218 /* Is this entry ENT is appropriate? */
3219 if (!re_node_set_contains (cur_nodes, ent->node))
3222 to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3223 /* Calculate the destination of the back reference, and append it
3224 to MCTX->STATE_LOG. */
3225 if (to_idx == cur_str)
3227 /* The backreference did epsilon transit, we must re-check all the
3228 node in the current state. */
3229 re_node_set new_dests;
3230 reg_errcode_t err2, err3;
3231 next_node = dfa->edests[ent->node].elems[0];
3232 if (re_node_set_contains (cur_nodes, next_node))
3234 err = re_node_set_init_1 (&new_dests, next_node);
3235 err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3236 err3 = re_node_set_merge (cur_nodes, &new_dests);
3237 re_node_set_free (&new_dests);
3238 if (BE (err != REG_NOERROR || err2 != REG_NOERROR
3239 || err3 != REG_NOERROR, 0))
3241 err = (err != REG_NOERROR ? err
3242 : (err2 != REG_NOERROR ? err2 : err3));
3245 /* TODO: It is still inefficient... */
3250 re_node_set union_set;
3251 next_node = dfa->nexts[ent->node];
3252 if (mctx->state_log[to_idx])
3255 if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3258 err = re_node_set_init_copy (&union_set,
3259 &mctx->state_log[to_idx]->nodes);
3260 ok = re_node_set_insert (&union_set, next_node);
3261 if (BE (err != REG_NOERROR || ! ok, 0))
3263 re_node_set_free (&union_set);
3264 err = err != REG_NOERROR ? err : REG_ESPACE;
3270 err = re_node_set_init_1 (&union_set, next_node);
3271 if (BE (err != REG_NOERROR, 0))
3274 mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3275 re_node_set_free (&union_set);
3276 if (BE (mctx->state_log[to_idx] == NULL
3277 && err != REG_NOERROR, 0))
3281 while (ent++->more);
3285 /* Build transition table for the state.
3286 Return true if successful. */
3290 build_trtable (re_dfa_t *dfa, re_dfastate_t *state)
3295 bool need_word_trtable = false;
3296 bitset_word elem, mask;
3297 bool dests_node_malloced = false, dest_states_malloced = false;
3298 Idx ndests; /* Number of the destination states from `state'. */
3299 re_dfastate_t **trtable;
3300 re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3301 re_node_set follows, *dests_node;
3307 re_node_set dests_node[SBC_MAX];
3308 bitset dests_ch[SBC_MAX];
3311 /* We build DFA states which corresponds to the destination nodes
3312 from `state'. `dests_node[i]' represents the nodes which i-th
3313 destination state contains, and `dests_ch[i]' represents the
3314 characters which i-th destination state accepts. */
3315 if (__libc_use_alloca (sizeof (struct dests_alloc)))
3316 dests_alloc = (struct dests_alloc *) alloca (sizeof dests_alloc[0]);
3319 dests_alloc = re_malloc (struct dests_alloc, 1);
3320 if (BE (dests_alloc == NULL, 0))
3322 dests_node_malloced = true;
3324 dests_node = dests_alloc->dests_node;
3325 dests_ch = dests_alloc->dests_ch;
3327 /* Initialize transiton table. */
3328 state->word_trtable = state->trtable = NULL;
3330 /* At first, group all nodes belonging to `state' into several
3332 ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3333 if (BE (! REG_VALID_NONZERO_INDEX (ndests), 0))
3335 if (dests_node_malloced)
3339 state->trtable = re_calloc (re_dfastate_t *, SBC_MAX);
3345 err = re_node_set_alloc (&follows, ndests + 1);
3346 if (BE (err != REG_NOERROR, 0))
3349 /* Avoid arithmetic overflow in size calculation. */
3350 if (BE (((SIZE_MAX - (sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX)
3351 / (3 * sizeof (re_dfastate_t *)))
3355 if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX
3356 + ndests * 3 * sizeof (re_dfastate_t *)))
3357 dest_states = (re_dfastate_t **)
3358 alloca (ndests * 3 * sizeof (re_dfastate_t *));
3361 dest_states = (re_dfastate_t **)
3362 malloc (ndests * 3 * sizeof (re_dfastate_t *));
3363 if (BE (dest_states == NULL, 0))
3366 if (dest_states_malloced)
3368 re_node_set_free (&follows);
3369 for (i = 0; i < ndests; ++i)
3370 re_node_set_free (dests_node + i);
3371 if (dests_node_malloced)
3375 dest_states_malloced = true;
3377 dest_states_word = dest_states + ndests;
3378 dest_states_nl = dest_states_word + ndests;
3379 bitset_empty (acceptable);
3381 /* Then build the states for all destinations. */
3382 for (i = 0; i < ndests; ++i)
3385 re_node_set_empty (&follows);
3386 /* Merge the follows of this destination states. */
3387 for (j = 0; j < dests_node[i].nelem; ++j)
3389 next_node = dfa->nexts[dests_node[i].elems[j]];
3390 if (next_node != REG_MISSING)
3392 err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3393 if (BE (err != REG_NOERROR, 0))
3397 dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3398 if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
3400 /* If the new state has context constraint,
3401 build appropriate states for these contexts. */
3402 if (dest_states[i]->has_constraint)
3404 dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3406 if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
3409 if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
3410 need_word_trtable = true;
3412 dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3414 if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
3419 dest_states_word[i] = dest_states[i];
3420 dest_states_nl[i] = dest_states[i];
3422 bitset_merge (acceptable, dests_ch[i]);
3425 if (!BE (need_word_trtable, 0))
3427 /* We don't care about whether the following character is a word
3428 character, or we are in a single-byte character set so we can
3429 discern by looking at the character code: allocate a
3430 256-entry transition table. */
3431 trtable = state->trtable = re_calloc (re_dfastate_t *, SBC_MAX);
3432 if (BE (trtable == NULL, 0))
3435 /* For all characters ch...: */
3436 for (i = 0; i < BITSET_WORDS; ++i)
3437 for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3439 mask <<= 1, elem >>= 1, ++ch)
3440 if (BE (elem & 1, 0))
3442 /* There must be exactly one destination which accepts
3443 character ch. See group_nodes_into_DFAstates. */
3444 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3447 /* j-th destination accepts the word character ch. */
3448 if (dfa->word_char[i] & mask)
3449 trtable[ch] = dest_states_word[j];
3451 trtable[ch] = dest_states[j];
3456 /* We care about whether the following character is a word
3457 character, and we are in a multi-byte character set: discern
3458 by looking at the character code: build two 256-entry
3459 transition tables, one starting at trtable[0] and one
3460 starting at trtable[SBC_MAX]. */
3461 trtable = state->word_trtable = re_calloc (re_dfastate_t *, 2 * SBC_MAX);
3462 if (BE (trtable == NULL, 0))
3465 /* For all characters ch...: */
3466 for (i = 0; i < BITSET_WORDS; ++i)
3467 for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3469 mask <<= 1, elem >>= 1, ++ch)
3470 if (BE (elem & 1, 0))
3472 /* There must be exactly one destination which accepts
3473 character ch. See group_nodes_into_DFAstates. */
3474 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3477 /* j-th destination accepts the word character ch. */
3478 trtable[ch] = dest_states[j];
3479 trtable[ch + SBC_MAX] = dest_states_word[j];
3484 if (bitset_contain (acceptable, NEWLINE_CHAR))
3486 /* The current state accepts newline character. */
3487 for (j = 0; j < ndests; ++j)
3488 if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3490 /* k-th destination accepts newline character. */
3491 trtable[NEWLINE_CHAR] = dest_states_nl[j];
3492 if (need_word_trtable)
3493 trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3494 /* There must be only one destination which accepts
3495 newline. See group_nodes_into_DFAstates. */
3500 if (dest_states_malloced)
3503 re_node_set_free (&follows);
3504 for (i = 0; i < ndests; ++i)
3505 re_node_set_free (dests_node + i);
3507 if (dests_node_malloced)
3513 /* Group all nodes belonging to STATE into several destinations.
3514 Then for all destinations, set the nodes belonging to the destination
3515 to DESTS_NODE[i] and set the characters accepted by the destination
3516 to DEST_CH[i]. This function return the number of destinations. */
3520 group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state,
3521 re_node_set *dests_node, bitset *dests_ch)
3526 Idx ndests; /* Number of the destinations from `state'. */
3527 bitset accepts; /* Characters a node can accept. */
3528 const re_node_set *cur_nodes = &state->nodes;
3529 bitset_empty (accepts);
3532 /* For all the nodes belonging to `state', */
3533 for (i = 0; i < cur_nodes->nelem; ++i)
3535 re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3536 re_token_type_t type = node->type;
3537 unsigned int constraint = node->constraint;
3539 /* Enumerate all single byte character this node can accept. */
3540 if (type == CHARACTER)
3541 bitset_set (accepts, node->opr.c);
3542 else if (type == SIMPLE_BRACKET)
3544 bitset_merge (accepts, node->opr.sbcset);
3546 else if (type == OP_PERIOD)
3548 #ifdef RE_ENABLE_I18N
3549 if (dfa->mb_cur_max > 1)
3550 bitset_merge (accepts, dfa->sb_char);
3553 bitset_set_all (accepts);
3554 if (!(dfa->syntax & REG_DOT_NEWLINE))
3555 bitset_clear (accepts, '\n');
3556 if (dfa->syntax & REG_DOT_NOT_NULL)
3557 bitset_clear (accepts, '\0');
3559 #ifdef RE_ENABLE_I18N
3560 else if (type == OP_UTF8_PERIOD)
3562 if (SBC_MAX / 2 % BITSET_WORD_BITS == 0)
3563 memset (accepts, -1, sizeof accepts / 2);
3565 bitset_merge (accepts, utf8_sb_map);
3566 if (!(dfa->syntax & REG_DOT_NEWLINE))
3567 bitset_clear (accepts, '\n');
3568 if (dfa->syntax & REG_DOT_NOT_NULL)
3569 bitset_clear (accepts, '\0');
3575 /* Check the `accepts' and sift the characters which are not
3576 match it the context. */
3579 if (constraint & NEXT_NEWLINE_CONSTRAINT)
3581 bool accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3582 bitset_empty (accepts);
3583 if (accepts_newline)
3584 bitset_set (accepts, NEWLINE_CHAR);
3588 if (constraint & NEXT_ENDBUF_CONSTRAINT)
3590 bitset_empty (accepts);
3594 if (constraint & NEXT_WORD_CONSTRAINT)
3596 bitset_word any_set = 0;
3597 if (type == CHARACTER && !node->word_char)
3599 bitset_empty (accepts);
3602 #ifdef RE_ENABLE_I18N
3603 if (dfa->mb_cur_max > 1)
3604 for (j = 0; j < BITSET_WORDS; ++j)
3605 any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3608 for (j = 0; j < BITSET_WORDS; ++j)
3609 any_set |= (accepts[j] &= dfa->word_char[j]);
3613 if (constraint & NEXT_NOTWORD_CONSTRAINT)
3615 bitset_word any_set = 0;
3616 if (type == CHARACTER && node->word_char)
3618 bitset_empty (accepts);
3621 #ifdef RE_ENABLE_I18N
3622 if (dfa->mb_cur_max > 1)
3623 for (j = 0; j < BITSET_WORDS; ++j)
3624 any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3627 for (j = 0; j < BITSET_WORDS; ++j)
3628 any_set |= (accepts[j] &= ~dfa->word_char[j]);
3634 /* Then divide `accepts' into DFA states, or create a new
3635 state. Above, we make sure that accepts is not empty. */
3636 for (j = 0; j < ndests; ++j)
3638 bitset intersec; /* Intersection sets, see below. */
3640 /* Flags, see below. */
3641 bitset_word has_intersec, not_subset, not_consumed;
3643 /* Optimization, skip if this state doesn't accept the character. */
3644 if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3647 /* Enumerate the intersection set of this state and `accepts'. */
3649 for (k = 0; k < BITSET_WORDS; ++k)
3650 has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3651 /* And skip if the intersection set is empty. */
3655 /* Then check if this state is a subset of `accepts'. */
3656 not_subset = not_consumed = 0;
3657 for (k = 0; k < BITSET_WORDS; ++k)
3659 not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3660 not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3663 /* If this state isn't a subset of `accepts', create a
3664 new group state, which has the `remains'. */
3667 bitset_copy (dests_ch[ndests], remains);
3668 bitset_copy (dests_ch[j], intersec);
3669 err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3670 if (BE (err != REG_NOERROR, 0))
3675 /* Put the position in the current group. */
3676 ok = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3680 /* If all characters are consumed, go to next node. */
3684 /* Some characters remain, create a new group. */
3687 bitset_copy (dests_ch[ndests], accepts);
3688 err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3689 if (BE (err != REG_NOERROR, 0))
3692 bitset_empty (accepts);
3697 for (j = 0; j < ndests; ++j)
3698 re_node_set_free (dests_node + j);
3702 #ifdef RE_ENABLE_I18N
3703 /* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3704 Return the number of the bytes the node accepts.
3705 STR_IDX is the current index of the input string.
3707 This function handles the nodes which can accept one character, or
3708 one collating element like '.', '[a-z]', opposite to the other nodes
3709 can only accept one byte. */
3713 check_node_accept_bytes (re_dfa_t *dfa, Idx node_idx,
3714 const re_string_t *input, Idx str_idx)
3716 const re_token_t *node = dfa->nodes + node_idx;
3717 int char_len, elem_len;
3720 if (BE (node->type == OP_UTF8_PERIOD, 0))
3722 unsigned char c = re_string_byte_at (input, str_idx), d;
3723 if (BE (c < 0xc2, 1))
3726 if (str_idx + 2 > input->len)
3729 d = re_string_byte_at (input, str_idx + 1);
3731 return (d < 0x80 || d > 0xbf) ? 0 : 2;
3735 if (c == 0xe0 && d < 0xa0)
3741 if (c == 0xf0 && d < 0x90)
3747 if (c == 0xf8 && d < 0x88)
3753 if (c == 0xfc && d < 0x84)
3759 if (str_idx + char_len > input->len)
3762 for (i = 1; i < char_len; ++i)
3764 d = re_string_byte_at (input, str_idx + i);
3765 if (d < 0x80 || d > 0xbf)
3771 char_len = re_string_char_size_at (input, str_idx);
3772 if (node->type == OP_PERIOD)
3776 /* FIXME: I don't think this if is needed, as both '\n'
3777 and '\0' are char_len == 1. */
3778 /* '.' accepts any one character except the following two cases. */
3779 if ((!(dfa->syntax & REG_DOT_NEWLINE) &&
3780 re_string_byte_at (input, str_idx) == '\n') ||
3781 ((dfa->syntax & REG_DOT_NOT_NULL) &&
3782 re_string_byte_at (input, str_idx) == '\0'))
3787 elem_len = re_string_elem_size_at (input, str_idx);
3788 if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
3791 if (node->type == COMPLEX_BRACKET)
3793 const re_charset_t *cset = node->opr.mbcset;
3795 const unsigned char *pin
3796 = ((const unsigned char *) re_string_get_buffer (input) + str_idx);
3801 wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3802 ? re_string_wchar_at (input, str_idx) : 0);
3804 /* match with multibyte character? */
3805 for (i = 0; i < cset->nmbchars; ++i)
3806 if (wc == cset->mbchars[i])
3808 match_len = char_len;
3809 goto check_node_accept_bytes_match;
3811 /* match with character_class? */
3812 for (i = 0; i < cset->nchar_classes; ++i)
3814 wctype_t wt = cset->char_classes[i];
3815 if (__iswctype (wc, wt))
3817 match_len = char_len;
3818 goto check_node_accept_bytes_match;
3823 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3826 unsigned int in_collseq = 0;
3827 const int32_t *table, *indirect;
3828 const unsigned char *weights, *extra;
3829 const char *collseqwc;
3831 /* This #include defines a local function! */
3832 # include <locale/weight.h>
3834 /* match with collating_symbol? */
3835 if (cset->ncoll_syms)
3836 extra = (const unsigned char *)
3837 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3838 for (i = 0; i < cset->ncoll_syms; ++i)
3840 const unsigned char *coll_sym = extra + cset->coll_syms[i];
3841 /* Compare the length of input collating element and
3842 the length of current collating element. */
3843 if (*coll_sym != elem_len)
3845 /* Compare each bytes. */
3846 for (j = 0; j < *coll_sym; j++)
3847 if (pin[j] != coll_sym[1 + j])
3851 /* Match if every bytes is equal. */
3853 goto check_node_accept_bytes_match;
3859 if (elem_len <= char_len)
3861 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3862 in_collseq = __collseq_table_lookup (collseqwc, wc);
3865 in_collseq = find_collation_sequence_value (pin, elem_len);
3867 /* match with range expression? */
3868 for (i = 0; i < cset->nranges; ++i)
3869 if (cset->range_starts[i] <= in_collseq
3870 && in_collseq <= cset->range_ends[i])
3872 match_len = elem_len;
3873 goto check_node_accept_bytes_match;
3876 /* match with equivalence_class? */
3877 if (cset->nequiv_classes)
3879 const unsigned char *cp = pin;
3880 table = (const int32_t *)
3881 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3882 weights = (const unsigned char *)
3883 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3884 extra = (const unsigned char *)
3885 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3886 indirect = (const int32_t *)
3887 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3888 idx = findidx (&cp);
3890 for (i = 0; i < cset->nequiv_classes; ++i)
3892 int32_t equiv_class_idx = cset->equiv_classes[i];
3893 size_t weight_len = weights[idx];
3894 if (weight_len == weights[equiv_class_idx])
3897 while (cnt <= weight_len
3898 && (weights[equiv_class_idx + 1 + cnt]
3899 == weights[idx + 1 + cnt]))
3901 if (cnt > weight_len)
3903 match_len = elem_len;
3904 goto check_node_accept_bytes_match;
3913 /* match with range expression? */
3915 wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'};
3917 wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
3920 for (i = 0; i < cset->nranges; ++i)
3922 cmp_buf[0] = cset->range_starts[i];
3923 cmp_buf[4] = cset->range_ends[i];
3924 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
3925 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
3927 match_len = char_len;
3928 goto check_node_accept_bytes_match;
3932 check_node_accept_bytes_match:
3933 if (!cset->non_match)
3940 return (elem_len > char_len) ? elem_len : char_len;
3948 find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len)
3950 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3955 /* No valid character. Match it as a single byte character. */
3956 const unsigned char *collseq = (const unsigned char *)
3957 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3958 return collseq[mbs[0]];
3965 const unsigned char *extra = (const unsigned char *)
3966 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3967 int32_t extrasize = (const unsigned char *)
3968 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
3970 for (idx = 0; idx < extrasize;)
3974 int32_t elem_mbs_len;
3975 /* Skip the name of collating element name. */
3976 idx = idx + extra[idx] + 1;
3977 elem_mbs_len = extra[idx++];
3978 if (mbs_len == elem_mbs_len)
3980 for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
3981 if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
3983 if (mbs_cnt == elem_mbs_len)
3984 /* Found the entry. */
3987 /* Skip the byte sequence of the collating element. */
3988 idx += elem_mbs_len;
3989 /* Adjust for the alignment. */
3990 idx = (idx + 3) & ~3;
3991 /* Skip the collation sequence value. */
3992 idx += sizeof (uint32_t);
3993 /* Skip the wide char sequence of the collating element. */
3994 idx = idx + sizeof (uint32_t) * (extra[idx] + 1);
3995 /* If we found the entry, return the sequence value. */
3997 return *(uint32_t *) (extra + idx);
3998 /* Skip the collation sequence value. */
3999 idx += sizeof (uint32_t);
4005 #endif /* RE_ENABLE_I18N */
4007 /* Check whether the node accepts the byte which is IDX-th
4008 byte of the INPUT. */
4012 check_node_accept (const re_match_context_t *mctx, const re_token_t *node,
4016 ch = re_string_byte_at (&mctx->input, idx);
4020 if (node->opr.c != ch)
4024 case SIMPLE_BRACKET:
4025 if (!bitset_contain (node->opr.sbcset, ch))
4029 #ifdef RE_ENABLE_I18N
4030 case OP_UTF8_PERIOD:
4036 if ((ch == '\n' && !(mctx->dfa->syntax & REG_DOT_NEWLINE))
4037 || (ch == '\0' && (mctx->dfa->syntax & REG_DOT_NOT_NULL)))
4045 if (node->constraint)
4047 /* The node has constraints. Check whether the current context
4048 satisfies the constraints. */
4049 unsigned int context = re_string_context_at (&mctx->input, idx,
4051 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
4058 /* Extend the buffers, if the buffers have run out. */
4060 static reg_errcode_t
4062 extend_buffers (re_match_context_t *mctx)
4065 re_string_t *pstr = &mctx->input;
4067 /* Double the lengthes of the buffers. */
4068 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
4069 if (BE (ret != REG_NOERROR, 0))
4072 if (mctx->state_log != NULL)
4074 /* And double the length of state_log. */
4075 /* XXX We have no indication of the size of this buffer. If this
4076 allocation fail we have no indication that the state_log array
4077 does not have the right size. */
4078 re_dfastate_t **new_array = re_xrealloc (mctx->state_log, re_dfastate_t *,
4079 pstr->bufs_len + 1);
4080 if (BE (new_array == NULL, 0))
4082 mctx->state_log = new_array;
4085 /* Then reconstruct the buffers. */
4088 #ifdef RE_ENABLE_I18N
4089 if (pstr->mb_cur_max > 1)
4091 ret = build_wcs_upper_buffer (pstr);
4092 if (BE (ret != REG_NOERROR, 0))
4096 #endif /* RE_ENABLE_I18N */
4097 build_upper_buffer (pstr);
4101 #ifdef RE_ENABLE_I18N
4102 if (pstr->mb_cur_max > 1)
4103 build_wcs_buffer (pstr);
4105 #endif /* RE_ENABLE_I18N */
4107 if (pstr->trans != NULL)
4108 re_string_translate_buffer (pstr);
4115 /* Functions for matching context. */
4117 /* Initialize MCTX. */
4119 static reg_errcode_t
4121 match_ctx_init (re_match_context_t *mctx, int eflags, Idx n)
4123 mctx->eflags = eflags;
4124 mctx->match_last = REG_MISSING;
4127 mctx->bkref_ents = re_xmalloc (struct re_backref_cache_entry, n);
4128 mctx->sub_tops = re_xmalloc (re_sub_match_top_t *, n);
4129 if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
4132 /* Already zero-ed by the caller.
4134 mctx->bkref_ents = NULL;
4135 mctx->nbkref_ents = 0;
4136 mctx->nsub_tops = 0; */
4137 mctx->abkref_ents = n;
4138 mctx->max_mb_elem_len = 1;
4139 mctx->asub_tops = n;
4143 /* Clean the entries which depend on the current input in MCTX.
4144 This function must be invoked when the matcher changes the start index
4145 of the input, or changes the input string. */
4149 match_ctx_clean (re_match_context_t *mctx)
4152 for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4155 re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4156 for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4158 re_sub_match_last_t *last = top->lasts[sl_idx];
4159 re_free (last->path.array);
4162 re_free (top->lasts);
4165 re_free (top->path->array);
4166 re_free (top->path);
4171 mctx->nsub_tops = 0;
4172 mctx->nbkref_ents = 0;
4175 /* Free all the memory associated with MCTX. */
4179 match_ctx_free (re_match_context_t *mctx)
4181 /* First, free all the memory associated with MCTX->SUB_TOPS. */
4182 match_ctx_clean (mctx);
4183 re_free (mctx->sub_tops);
4184 re_free (mctx->bkref_ents);
4187 /* Add a new backreference entry to MCTX.
4188 Note that we assume that caller never call this function with duplicate
4189 entry, and call with STR_IDX which isn't smaller than any existing entry.
4192 static reg_errcode_t
4194 match_ctx_add_entry (re_match_context_t *mctx, Idx node, Idx str_idx,
4197 if (mctx->nbkref_ents >= mctx->abkref_ents)
4199 struct re_backref_cache_entry* new_entry;
4200 new_entry = re_x2realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4201 &mctx->abkref_ents);
4202 if (BE (new_entry == NULL, 0))
4204 re_free (mctx->bkref_ents);
4207 mctx->bkref_ents = new_entry;
4208 memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4209 (sizeof (struct re_backref_cache_entry)
4210 * (mctx->abkref_ents - mctx->nbkref_ents)));
4212 if (mctx->nbkref_ents > 0
4213 && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4214 mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4216 mctx->bkref_ents[mctx->nbkref_ents].node = node;
4217 mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4218 mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4219 mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4221 /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4222 If bit N is clear, means that this entry won't epsilon-transition to
4223 an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression. If
4224 it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4227 A backreference does not epsilon-transition unless it is empty, so set
4228 to all zeros if FROM != TO. */
4229 mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4230 = (from == to ? -1 : 0);
4232 mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4233 if (mctx->max_mb_elem_len < to - from)
4234 mctx->max_mb_elem_len = to - from;
4238 /* Return the first entry with the same str_idx, or REG_MISSING if none is
4239 found. Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX. */
4243 search_cur_bkref_entry (const re_match_context_t *mctx, Idx str_idx)
4245 Idx left, right, mid, last;
4246 last = right = mctx->nbkref_ents;
4247 for (left = 0; left < right;)
4249 mid = (left + right) / 2;
4250 if (mctx->bkref_ents[mid].str_idx < str_idx)
4255 if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4261 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4264 static reg_errcode_t
4266 match_ctx_add_subtop (re_match_context_t *mctx, Idx node, Idx str_idx)
4269 assert (mctx->sub_tops != NULL);
4270 assert (mctx->asub_tops > 0);
4272 if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
4274 Idx new_asub_tops = mctx->asub_tops;
4275 re_sub_match_top_t **new_array = re_x2realloc (mctx->sub_tops,
4276 re_sub_match_top_t *,
4278 if (BE (new_array == NULL, 0))
4280 mctx->sub_tops = new_array;
4281 mctx->asub_tops = new_asub_tops;
4283 mctx->sub_tops[mctx->nsub_tops] = re_calloc (re_sub_match_top_t, 1);
4284 if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0))
4286 mctx->sub_tops[mctx->nsub_tops]->node = node;
4287 mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4291 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4292 at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP. */
4294 static re_sub_match_last_t *
4296 match_ctx_add_sublast (re_sub_match_top_t *subtop, Idx node, Idx str_idx)
4298 re_sub_match_last_t *new_entry;
4299 if (BE (subtop->nlasts == subtop->alasts, 0))
4301 Idx new_alasts = subtop->alasts;
4302 re_sub_match_last_t **new_array = re_x2realloc (subtop->lasts,
4303 re_sub_match_last_t *,
4305 if (BE (new_array == NULL, 0))
4307 subtop->lasts = new_array;
4308 subtop->alasts = new_alasts;
4310 new_entry = re_calloc (re_sub_match_last_t, 1);
4311 if (BE (new_entry != NULL, 1))
4313 subtop->lasts[subtop->nlasts] = new_entry;
4314 new_entry->node = node;
4315 new_entry->str_idx = str_idx;
4323 sift_ctx_init (re_sift_context_t *sctx,
4324 re_dfastate_t **sifted_sts,
4325 re_dfastate_t **limited_sts,
4326 Idx last_node, Idx last_str_idx)
4328 sctx->sifted_states = sifted_sts;
4329 sctx->limited_states = limited_sts;
4330 sctx->last_node = last_node;
4331 sctx->last_str_idx = last_str_idx;
4332 re_node_set_init_empty (&sctx->limits);