1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010 Free
3 Software Foundation, Inc.
4 This file is part of the GNU C Library.
5 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
7 This program is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2, or (at your option)
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License along
18 with this program; if not, write to the Free Software Foundation,
19 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
21 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
22 size_t length, reg_syntax_t syntax);
23 static void re_compile_fastmap_iter (regex_t *bufp,
24 const re_dfastate_t *init_state,
26 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
28 static void free_charset (re_charset_t *cset);
29 #endif /* RE_ENABLE_I18N */
30 static void free_workarea_compile (regex_t *preg);
31 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
33 static void optimize_utf8 (re_dfa_t *dfa);
35 static reg_errcode_t analyze (regex_t *preg);
36 static reg_errcode_t preorder (bin_tree_t *root,
37 reg_errcode_t (fn (void *, bin_tree_t *)),
39 static reg_errcode_t postorder (bin_tree_t *root,
40 reg_errcode_t (fn (void *, bin_tree_t *)),
42 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
43 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
44 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
46 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
47 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
48 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
49 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
50 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
51 unsigned int constraint);
52 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
53 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
55 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
56 static Idx fetch_number (re_string_t *input, re_token_t *token,
58 static int peek_token (re_token_t *token, re_string_t *input,
59 reg_syntax_t syntax) internal_function;
60 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
61 reg_syntax_t syntax, reg_errcode_t *err);
62 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
63 re_token_t *token, reg_syntax_t syntax,
64 Idx nest, reg_errcode_t *err);
65 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
66 re_token_t *token, reg_syntax_t syntax,
67 Idx nest, reg_errcode_t *err);
68 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
69 re_token_t *token, reg_syntax_t syntax,
70 Idx nest, reg_errcode_t *err);
71 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
72 re_token_t *token, reg_syntax_t syntax,
73 Idx nest, reg_errcode_t *err);
74 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
75 re_dfa_t *dfa, re_token_t *token,
76 reg_syntax_t syntax, reg_errcode_t *err);
77 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
78 re_token_t *token, reg_syntax_t syntax,
80 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
82 re_token_t *token, int token_len,
86 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
90 static reg_errcode_t build_equiv_class (bitset_t sbcset,
92 Idx *equiv_class_alloc,
93 const unsigned char *name);
94 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
97 Idx *char_class_alloc,
98 const unsigned char *class_name,
100 #else /* not RE_ENABLE_I18N */
101 static reg_errcode_t build_equiv_class (bitset_t sbcset,
102 const unsigned char *name);
103 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
105 const unsigned char *class_name,
106 reg_syntax_t syntax);
107 #endif /* not RE_ENABLE_I18N */
108 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
109 RE_TRANSLATE_TYPE trans,
110 const unsigned char *class_name,
111 const unsigned char *extra,
112 bool non_match, reg_errcode_t *err);
113 static bin_tree_t *create_tree (re_dfa_t *dfa,
114 bin_tree_t *left, bin_tree_t *right,
115 re_token_type_t type);
116 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
117 bin_tree_t *left, bin_tree_t *right,
118 const re_token_t *token);
119 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
120 static void free_token (re_token_t *node);
121 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
122 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
124 /* This table gives an error message for each of the error codes listed
125 in regex.h. Obviously the order here has to be same as there.
126 POSIX doesn't require that we do anything for REG_NOERROR,
127 but why not be nice? */
129 static const char __re_error_msgid[] =
131 #define REG_NOERROR_IDX 0
132 gettext_noop ("Success") /* REG_NOERROR */
134 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
135 gettext_noop ("No match") /* REG_NOMATCH */
137 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
138 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
140 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
141 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
143 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
144 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
146 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
147 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
149 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
150 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
152 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
153 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
155 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
156 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
158 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
159 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
161 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
162 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
164 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
165 gettext_noop ("Invalid range end") /* REG_ERANGE */
167 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
168 gettext_noop ("Memory exhausted") /* REG_ESPACE */
170 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
171 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
173 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
174 gettext_noop ("Premature end of regular expression") /* REG_EEND */
176 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
177 gettext_noop ("Regular expression too big") /* REG_ESIZE */
179 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
180 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
183 static const size_t __re_error_msgid_idx[] =
204 /* Entry points for GNU code. */
206 /* re_compile_pattern is the GNU regular expression compiler: it
207 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
208 Returns 0 if the pattern was valid, otherwise an error string.
210 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
211 are set in BUFP on entry. */
215 re_compile_pattern (pattern, length, bufp)
218 struct re_pattern_buffer *bufp;
219 #else /* size_t might promote */
221 re_compile_pattern (const char *pattern, size_t length,
222 struct re_pattern_buffer *bufp)
227 /* And GNU code determines whether or not to get register information
228 by passing null for the REGS argument to re_match, etc., not by
229 setting no_sub, unless RE_NO_SUB is set. */
230 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
232 /* Match anchors at newline. */
233 bufp->newline_anchor = 1;
235 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
239 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
242 weak_alias (__re_compile_pattern, re_compile_pattern)
245 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
246 also be assigned to arbitrarily: each pattern buffer stores its own
247 syntax, so it can be changed between regex compilations. */
248 /* This has no initializer because initialized variables in Emacs
249 become read-only after dumping. */
250 reg_syntax_t re_syntax_options;
253 /* Specify the precise syntax of regexps for compilation. This provides
254 for compatibility for various utilities which historically have
255 different, incompatible syntaxes.
257 The argument SYNTAX is a bit mask comprised of the various bits
258 defined in regex.h. We return the old syntax. */
261 re_set_syntax (syntax)
264 reg_syntax_t ret = re_syntax_options;
266 re_syntax_options = syntax;
270 weak_alias (__re_set_syntax, re_set_syntax)
274 re_compile_fastmap (bufp)
275 struct re_pattern_buffer *bufp;
277 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
278 char *fastmap = bufp->fastmap;
280 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
281 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
282 if (dfa->init_state != dfa->init_state_word)
283 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
284 if (dfa->init_state != dfa->init_state_nl)
285 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
286 if (dfa->init_state != dfa->init_state_begbuf)
287 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
288 bufp->fastmap_accurate = 1;
292 weak_alias (__re_compile_fastmap, re_compile_fastmap)
296 __attribute ((always_inline))
297 re_set_fastmap (char *fastmap, bool icase, int ch)
301 fastmap[tolower (ch)] = 1;
304 /* Helper function for re_compile_fastmap.
305 Compile fastmap for the initial_state INIT_STATE. */
308 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
311 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
313 bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
314 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
316 Idx node = init_state->nodes.elems[node_cnt];
317 re_token_type_t type = dfa->nodes[node].type;
319 if (type == CHARACTER)
321 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
322 #ifdef RE_ENABLE_I18N
323 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
325 unsigned char buf[MB_LEN_MAX];
331 *p++ = dfa->nodes[node].opr.c;
332 while (++node < dfa->nodes_len
333 && dfa->nodes[node].type == CHARACTER
334 && dfa->nodes[node].mb_partial)
335 *p++ = dfa->nodes[node].opr.c;
336 memset (&state, '\0', sizeof (state));
337 if (__mbrtowc (&wc, (const char *) buf, p - buf,
339 && (__wcrtomb ((char *) buf, towlower (wc), &state)
341 re_set_fastmap (fastmap, false, buf[0]);
345 else if (type == SIMPLE_BRACKET)
348 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
351 bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
352 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
353 if (w & ((bitset_word_t) 1 << j))
354 re_set_fastmap (fastmap, icase, ch);
357 #ifdef RE_ENABLE_I18N
358 else if (type == COMPLEX_BRACKET)
360 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
364 /* See if we have to try all bytes which start multiple collation
366 e.g. In da_DK, we want to catch 'a' since "aa" is a valid
367 collation element, and don't catch 'b' since 'b' is
368 the only collation element which starts from 'b' (and
369 it is caught by SIMPLE_BRACKET). */
370 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
371 && (cset->ncoll_syms || cset->nranges))
373 const int32_t *table = (const int32_t *)
374 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
375 for (i = 0; i < SBC_MAX; ++i)
377 re_set_fastmap (fastmap, icase, i);
381 /* See if we have to start the match at all multibyte characters,
382 i.e. where we would not find an invalid sequence. This only
383 applies to multibyte character sets; for single byte character
384 sets, the SIMPLE_BRACKET again suffices. */
385 if (dfa->mb_cur_max > 1
386 && (cset->nchar_classes || cset->non_match || cset->nranges
388 || cset->nequiv_classes
396 memset (&mbs, 0, sizeof (mbs));
397 if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
398 re_set_fastmap (fastmap, false, (int) c);
405 /* ... Else catch all bytes which can start the mbchars. */
406 for (i = 0; i < cset->nmbchars; ++i)
410 memset (&state, '\0', sizeof (state));
411 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
412 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
413 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
415 if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
417 re_set_fastmap (fastmap, false, *(unsigned char *) buf);
422 #endif /* RE_ENABLE_I18N */
423 else if (type == OP_PERIOD
424 #ifdef RE_ENABLE_I18N
425 || type == OP_UTF8_PERIOD
426 #endif /* RE_ENABLE_I18N */
427 || type == END_OF_RE)
429 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
430 if (type == END_OF_RE)
431 bufp->can_be_null = 1;
437 /* Entry point for POSIX code. */
438 /* regcomp takes a regular expression as a string and compiles it.
440 PREG is a regex_t *. We do not expect any fields to be initialized,
441 since POSIX says we shouldn't. Thus, we set
443 `buffer' to the compiled pattern;
444 `used' to the length of the compiled pattern;
445 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
446 REG_EXTENDED bit in CFLAGS is set; otherwise, to
447 RE_SYNTAX_POSIX_BASIC;
448 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
449 `fastmap' to an allocated space for the fastmap;
450 `fastmap_accurate' to zero;
451 `re_nsub' to the number of subexpressions in PATTERN.
453 PATTERN is the address of the pattern string.
455 CFLAGS is a series of bits which affect compilation.
457 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
458 use POSIX basic syntax.
460 If REG_NEWLINE is set, then . and [^...] don't match newline.
461 Also, regexec will try a match beginning after every newline.
463 If REG_ICASE is set, then we considers upper- and lowercase
464 versions of letters to be equivalent when matching.
466 If REG_NOSUB is set, then when PREG is passed to regexec, that
467 routine will report only success or failure, and nothing about the
470 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
471 the return codes and their meanings.) */
474 regcomp (preg, pattern, cflags)
475 regex_t *_Restrict_ preg;
476 const char *_Restrict_ pattern;
480 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
481 : RE_SYNTAX_POSIX_BASIC);
487 /* Try to allocate space for the fastmap. */
488 preg->fastmap = re_malloc (char, SBC_MAX);
489 if (BE (preg->fastmap == NULL, 0))
492 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
494 /* If REG_NEWLINE is set, newlines are treated differently. */
495 if (cflags & REG_NEWLINE)
496 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
497 syntax &= ~RE_DOT_NEWLINE;
498 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
499 /* It also changes the matching behavior. */
500 preg->newline_anchor = 1;
503 preg->newline_anchor = 0;
504 preg->no_sub = !!(cflags & REG_NOSUB);
505 preg->translate = NULL;
507 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
509 /* POSIX doesn't distinguish between an unmatched open-group and an
510 unmatched close-group: both are REG_EPAREN. */
511 if (ret == REG_ERPAREN)
514 /* We have already checked preg->fastmap != NULL. */
515 if (BE (ret == REG_NOERROR, 1))
516 /* Compute the fastmap now, since regexec cannot modify the pattern
517 buffer. This function never fails in this implementation. */
518 (void) re_compile_fastmap (preg);
521 /* Some error occurred while compiling the expression. */
522 re_free (preg->fastmap);
523 preg->fastmap = NULL;
529 weak_alias (__regcomp, regcomp)
532 /* Returns a message corresponding to an error code, ERRCODE, returned
533 from either regcomp or regexec. We don't use PREG here. */
537 regerror (errcode, preg, errbuf, errbuf_size)
539 const regex_t *_Restrict_ preg;
540 char *_Restrict_ errbuf;
542 #else /* size_t might promote */
544 regerror (int errcode, const regex_t *_Restrict_ preg,
545 char *_Restrict_ errbuf, size_t errbuf_size)
552 || errcode >= (int) (sizeof (__re_error_msgid_idx)
553 / sizeof (__re_error_msgid_idx[0])), 0))
554 /* Only error codes returned by the rest of the code should be passed
555 to this routine. If we are given anything else, or if other regex
556 code generates an invalid error code, then the program has a bug.
557 Dump core so we can fix it. */
560 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
562 msg_size = strlen (msg) + 1; /* Includes the null. */
564 if (BE (errbuf_size != 0, 1))
566 size_t cpy_size = msg_size;
567 if (BE (msg_size > errbuf_size, 0))
569 cpy_size = errbuf_size - 1;
570 errbuf[cpy_size] = '\0';
572 memcpy (errbuf, msg, cpy_size);
578 weak_alias (__regerror, regerror)
582 #ifdef RE_ENABLE_I18N
583 /* This static array is used for the map to single-byte characters when
584 UTF-8 is used. Otherwise we would allocate memory just to initialize
585 it the same all the time. UTF-8 is the preferred encoding so this is
586 a worthwhile optimization. */
587 static const bitset_t utf8_sb_map =
589 /* Set the first 128 bits. */
590 # if 4 * BITSET_WORD_BITS < ASCII_CHARS
591 # error "bitset_word_t is narrower than 32 bits"
592 # elif 3 * BITSET_WORD_BITS < ASCII_CHARS
593 BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
594 # elif 2 * BITSET_WORD_BITS < ASCII_CHARS
595 BITSET_WORD_MAX, BITSET_WORD_MAX,
596 # elif 1 * BITSET_WORD_BITS < ASCII_CHARS
600 >> (SBC_MAX % BITSET_WORD_BITS == 0
602 : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
608 free_dfa_content (re_dfa_t *dfa)
613 for (i = 0; i < dfa->nodes_len; ++i)
614 free_token (dfa->nodes + i);
615 re_free (dfa->nexts);
616 for (i = 0; i < dfa->nodes_len; ++i)
618 if (dfa->eclosures != NULL)
619 re_node_set_free (dfa->eclosures + i);
620 if (dfa->inveclosures != NULL)
621 re_node_set_free (dfa->inveclosures + i);
622 if (dfa->edests != NULL)
623 re_node_set_free (dfa->edests + i);
625 re_free (dfa->edests);
626 re_free (dfa->eclosures);
627 re_free (dfa->inveclosures);
628 re_free (dfa->nodes);
630 if (dfa->state_table)
631 for (i = 0; i <= dfa->state_hash_mask; ++i)
633 struct re_state_table_entry *entry = dfa->state_table + i;
634 for (j = 0; j < entry->num; ++j)
636 re_dfastate_t *state = entry->array[j];
639 re_free (entry->array);
641 re_free (dfa->state_table);
642 #ifdef RE_ENABLE_I18N
643 if (dfa->sb_char != utf8_sb_map)
644 re_free (dfa->sb_char);
646 re_free (dfa->subexp_map);
648 re_free (dfa->re_str);
655 /* Free dynamically allocated space used by PREG. */
661 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
662 if (BE (dfa != NULL, 1))
663 free_dfa_content (dfa);
667 re_free (preg->fastmap);
668 preg->fastmap = NULL;
670 re_free (preg->translate);
671 preg->translate = NULL;
674 weak_alias (__regfree, regfree)
677 /* Entry points compatible with 4.2 BSD regex library. We don't define
678 them unless specifically requested. */
680 #if defined _REGEX_RE_COMP || defined _LIBC
682 /* BSD has one and only one pattern buffer. */
683 static struct re_pattern_buffer re_comp_buf;
687 /* Make these definitions weak in libc, so POSIX programs can redefine
688 these names if they don't use our functions, and still use
689 regcomp/regexec above without link errors. */
700 if (!re_comp_buf.buffer)
701 return gettext ("No previous regular expression");
705 if (re_comp_buf.buffer)
707 fastmap = re_comp_buf.fastmap;
708 re_comp_buf.fastmap = NULL;
709 __regfree (&re_comp_buf);
710 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
711 re_comp_buf.fastmap = fastmap;
714 if (re_comp_buf.fastmap == NULL)
716 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
717 if (re_comp_buf.fastmap == NULL)
718 return (char *) gettext (__re_error_msgid
719 + __re_error_msgid_idx[(int) REG_ESPACE]);
722 /* Since `re_exec' always passes NULL for the `regs' argument, we
723 don't need to initialize the pattern buffer fields which affect it. */
725 /* Match anchors at newlines. */
726 re_comp_buf.newline_anchor = 1;
728 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
733 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
734 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
738 libc_freeres_fn (free_mem)
740 __regfree (&re_comp_buf);
744 #endif /* _REGEX_RE_COMP */
746 /* Internal entry point.
747 Compile the regular expression PATTERN, whose length is LENGTH.
748 SYNTAX indicate regular expression's syntax. */
751 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
754 reg_errcode_t err = REG_NOERROR;
758 /* Initialize the pattern buffer. */
759 preg->fastmap_accurate = 0;
760 preg->syntax = syntax;
761 preg->not_bol = preg->not_eol = 0;
764 preg->can_be_null = 0;
765 preg->regs_allocated = REGS_UNALLOCATED;
767 /* Initialize the dfa. */
768 dfa = (re_dfa_t *) preg->buffer;
769 if (BE (preg->allocated < sizeof (re_dfa_t), 0))
771 /* If zero allocated, but buffer is non-null, try to realloc
772 enough space. This loses if buffer's address is bogus, but
773 that is the user's responsibility. If ->buffer is NULL this
774 is a simple allocation. */
775 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
778 preg->allocated = sizeof (re_dfa_t);
779 preg->buffer = (unsigned char *) dfa;
781 preg->used = sizeof (re_dfa_t);
783 err = init_dfa (dfa, length);
784 if (BE (err != REG_NOERROR, 0))
786 free_dfa_content (dfa);
792 /* Note: length+1 will not overflow since it is checked in init_dfa. */
793 dfa->re_str = re_malloc (char, length + 1);
794 strncpy (dfa->re_str, pattern, length + 1);
797 __libc_lock_init (dfa->lock);
799 err = re_string_construct (®exp, pattern, length, preg->translate,
800 (syntax & RE_ICASE) != 0, dfa);
801 if (BE (err != REG_NOERROR, 0))
803 re_compile_internal_free_return:
804 free_workarea_compile (preg);
805 re_string_destruct (®exp);
806 free_dfa_content (dfa);
812 /* Parse the regular expression, and build a structure tree. */
814 dfa->str_tree = parse (®exp, preg, syntax, &err);
815 if (BE (dfa->str_tree == NULL, 0))
816 goto re_compile_internal_free_return;
818 /* Analyze the tree and create the nfa. */
819 err = analyze (preg);
820 if (BE (err != REG_NOERROR, 0))
821 goto re_compile_internal_free_return;
823 #ifdef RE_ENABLE_I18N
824 /* If possible, do searching in single byte encoding to speed things up. */
825 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
829 /* Then create the initial state of the dfa. */
830 err = create_initial_state (dfa);
832 /* Release work areas. */
833 free_workarea_compile (preg);
834 re_string_destruct (®exp);
836 if (BE (err != REG_NOERROR, 0))
838 free_dfa_content (dfa);
846 /* Initialize DFA. We use the length of the regular expression PAT_LEN
847 as the initial length of some arrays. */
850 init_dfa (re_dfa_t *dfa, size_t pat_len)
852 __re_size_t table_size;
856 #ifdef RE_ENABLE_I18N
857 size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
859 size_t max_i18n_object_size = 0;
861 size_t max_object_size =
862 MAX (sizeof (struct re_state_table_entry),
863 MAX (sizeof (re_token_t),
864 MAX (sizeof (re_node_set),
865 MAX (sizeof (regmatch_t),
866 max_i18n_object_size))));
868 memset (dfa, '\0', sizeof (re_dfa_t));
870 /* Force allocation of str_tree_storage the first time. */
871 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
873 /* Avoid overflows. The extra "/ 2" is for the table_size doubling
874 calculation below, and for similar doubling calculations
875 elsewhere. And it's <= rather than <, because some of the
876 doubling calculations add 1 afterwards. */
877 if (BE (SIZE_MAX / max_object_size / 2 <= pat_len, 0))
880 dfa->nodes_alloc = pat_len + 1;
881 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
883 /* table_size = 2 ^ ceil(log pat_len) */
884 for (table_size = 1; ; table_size <<= 1)
885 if (table_size > pat_len)
888 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
889 dfa->state_hash_mask = table_size - 1;
891 dfa->mb_cur_max = MB_CUR_MAX;
893 if (dfa->mb_cur_max == 6
894 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
896 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
899 codeset_name = nl_langinfo (CODESET);
900 if (strcasecmp (codeset_name, "UTF-8") == 0
901 || strcasecmp (codeset_name, "UTF8") == 0)
904 /* We check exhaustively in the loop below if this charset is a
905 superset of ASCII. */
906 dfa->map_notascii = 0;
909 #ifdef RE_ENABLE_I18N
910 if (dfa->mb_cur_max > 1)
913 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
918 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
919 if (BE (dfa->sb_char == NULL, 0))
922 /* Set the bits corresponding to single byte chars. */
923 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
924 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
926 wint_t wch = __btowc (ch);
928 dfa->sb_char[i] |= (bitset_word_t) 1 << j;
930 if (isascii (ch) && wch != ch)
931 dfa->map_notascii = 1;
938 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
943 /* Initialize WORD_CHAR table, which indicate which character is
944 "word". In this case "word" means that it is the word construction
945 character used by some operators like "\<", "\>", etc. */
949 init_word_char (re_dfa_t *dfa)
952 dfa->word_ops_used = 1;
953 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
954 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
955 if (isalnum (ch) || ch == '_')
956 dfa->word_char[i] |= (bitset_word_t) 1 << j;
959 /* Free the work area which are only used while compiling. */
962 free_workarea_compile (regex_t *preg)
964 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
965 bin_tree_storage_t *storage, *next;
966 for (storage = dfa->str_tree_storage; storage; storage = next)
968 next = storage->next;
971 dfa->str_tree_storage = NULL;
972 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
973 dfa->str_tree = NULL;
974 re_free (dfa->org_indices);
975 dfa->org_indices = NULL;
978 /* Create initial states for all contexts. */
981 create_initial_state (re_dfa_t *dfa)
985 re_node_set init_nodes;
987 /* Initial states have the epsilon closure of the node which is
988 the first node of the regular expression. */
989 first = dfa->str_tree->first->node_idx;
990 dfa->init_node = first;
991 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
992 if (BE (err != REG_NOERROR, 0))
995 /* The back-references which are in initial states can epsilon transit,
996 since in this case all of the subexpressions can be null.
997 Then we add epsilon closures of the nodes which are the next nodes of
998 the back-references. */
999 if (dfa->nbackref > 0)
1000 for (i = 0; i < init_nodes.nelem; ++i)
1002 Idx node_idx = init_nodes.elems[i];
1003 re_token_type_t type = dfa->nodes[node_idx].type;
1006 if (type != OP_BACK_REF)
1008 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1010 re_token_t *clexp_node;
1011 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1012 if (clexp_node->type == OP_CLOSE_SUBEXP
1013 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1016 if (clexp_idx == init_nodes.nelem)
1019 if (type == OP_BACK_REF)
1021 Idx dest_idx = dfa->edests[node_idx].elems[0];
1022 if (!re_node_set_contains (&init_nodes, dest_idx))
1024 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1030 /* It must be the first time to invoke acquire_state. */
1031 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1032 /* We don't check ERR here, since the initial state must not be NULL. */
1033 if (BE (dfa->init_state == NULL, 0))
1035 if (dfa->init_state->has_constraint)
1037 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1039 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1041 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1045 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1046 || dfa->init_state_begbuf == NULL, 0))
1050 dfa->init_state_word = dfa->init_state_nl
1051 = dfa->init_state_begbuf = dfa->init_state;
1053 re_node_set_free (&init_nodes);
1057 #ifdef RE_ENABLE_I18N
1058 /* If it is possible to do searching in single byte encoding instead of UTF-8
1059 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1060 DFA nodes where needed. */
1063 optimize_utf8 (re_dfa_t *dfa)
1067 bool mb_chars = false;
1068 bool has_period = false;
1070 for (node = 0; node < dfa->nodes_len; ++node)
1071 switch (dfa->nodes[node].type)
1074 if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1078 switch (dfa->nodes[node].opr.ctx_type)
1086 /* Word anchors etc. cannot be handled. It's okay to test
1087 opr.ctx_type since constraints (for all DFA nodes) are
1088 created by ORing one or more opr.ctx_type values. */
1098 case OP_DUP_ASTERISK:
1099 case OP_OPEN_SUBEXP:
1100 case OP_CLOSE_SUBEXP:
1102 case COMPLEX_BRACKET:
1104 case SIMPLE_BRACKET:
1105 /* Just double check. */
1107 int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1109 : BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1110 for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1112 if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1122 if (mb_chars || has_period)
1123 for (node = 0; node < dfa->nodes_len; ++node)
1125 if (dfa->nodes[node].type == CHARACTER
1126 && dfa->nodes[node].opr.c >= ASCII_CHARS)
1127 dfa->nodes[node].mb_partial = 0;
1128 else if (dfa->nodes[node].type == OP_PERIOD)
1129 dfa->nodes[node].type = OP_UTF8_PERIOD;
1132 /* The search can be in single byte locale. */
1133 dfa->mb_cur_max = 1;
1135 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1139 /* Analyze the structure tree, and calculate "first", "next", "edest",
1140 "eclosure", and "inveclosure". */
1142 static reg_errcode_t
1143 analyze (regex_t *preg)
1145 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1148 /* Allocate arrays. */
1149 dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1150 dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1151 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1152 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1153 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1154 || dfa->eclosures == NULL, 0))
1157 dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1158 if (dfa->subexp_map != NULL)
1161 for (i = 0; i < preg->re_nsub; i++)
1162 dfa->subexp_map[i] = i;
1163 preorder (dfa->str_tree, optimize_subexps, dfa);
1164 for (i = 0; i < preg->re_nsub; i++)
1165 if (dfa->subexp_map[i] != i)
1167 if (i == preg->re_nsub)
1169 free (dfa->subexp_map);
1170 dfa->subexp_map = NULL;
1174 ret = postorder (dfa->str_tree, lower_subexps, preg);
1175 if (BE (ret != REG_NOERROR, 0))
1177 ret = postorder (dfa->str_tree, calc_first, dfa);
1178 if (BE (ret != REG_NOERROR, 0))
1180 preorder (dfa->str_tree, calc_next, dfa);
1181 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1182 if (BE (ret != REG_NOERROR, 0))
1184 ret = calc_eclosure (dfa);
1185 if (BE (ret != REG_NOERROR, 0))
1188 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1189 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1190 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1193 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1194 if (BE (dfa->inveclosures == NULL, 0))
1196 ret = calc_inveclosure (dfa);
1202 /* Our parse trees are very unbalanced, so we cannot use a stack to
1203 implement parse tree visits. Instead, we use parent pointers and
1204 some hairy code in these two functions. */
1205 static reg_errcode_t
1206 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1209 bin_tree_t *node, *prev;
1211 for (node = root; ; )
1213 /* Descend down the tree, preferably to the left (or to the right
1214 if that's the only child). */
1215 while (node->left || node->right)
1223 reg_errcode_t err = fn (extra, node);
1224 if (BE (err != REG_NOERROR, 0))
1226 if (node->parent == NULL)
1229 node = node->parent;
1231 /* Go up while we have a node that is reached from the right. */
1232 while (node->right == prev || node->right == NULL);
1237 static reg_errcode_t
1238 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1243 for (node = root; ; )
1245 reg_errcode_t err = fn (extra, node);
1246 if (BE (err != REG_NOERROR, 0))
1249 /* Go to the left node, or up and to the right. */
1254 bin_tree_t *prev = NULL;
1255 while (node->right == prev || node->right == NULL)
1258 node = node->parent;
1267 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1268 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1269 backreferences as well. Requires a preorder visit. */
1270 static reg_errcode_t
1271 optimize_subexps (void *extra, bin_tree_t *node)
1273 re_dfa_t *dfa = (re_dfa_t *) extra;
1275 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1277 int idx = node->token.opr.idx;
1278 node->token.opr.idx = dfa->subexp_map[idx];
1279 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1282 else if (node->token.type == SUBEXP
1283 && node->left && node->left->token.type == SUBEXP)
1285 Idx other_idx = node->left->token.opr.idx;
1287 node->left = node->left->left;
1289 node->left->parent = node;
1291 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1292 if (other_idx < BITSET_WORD_BITS)
1293 dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1299 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1300 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1301 static reg_errcode_t
1302 lower_subexps (void *extra, bin_tree_t *node)
1304 regex_t *preg = (regex_t *) extra;
1305 reg_errcode_t err = REG_NOERROR;
1307 if (node->left && node->left->token.type == SUBEXP)
1309 node->left = lower_subexp (&err, preg, node->left);
1311 node->left->parent = node;
1313 if (node->right && node->right->token.type == SUBEXP)
1315 node->right = lower_subexp (&err, preg, node->right);
1317 node->right->parent = node;
1324 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1326 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1327 bin_tree_t *body = node->left;
1328 bin_tree_t *op, *cls, *tree1, *tree;
1331 /* We do not optimize empty subexpressions, because otherwise we may
1332 have bad CONCAT nodes with NULL children. This is obviously not
1333 very common, so we do not lose much. An example that triggers
1334 this case is the sed "script" /\(\)/x. */
1335 && node->left != NULL
1336 && (node->token.opr.idx >= BITSET_WORD_BITS
1337 || !(dfa->used_bkref_map
1338 & ((bitset_word_t) 1 << node->token.opr.idx))))
1341 /* Convert the SUBEXP node to the concatenation of an
1342 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1343 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1344 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1345 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1346 tree = create_tree (dfa, op, tree1, CONCAT);
1347 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1353 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1354 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1358 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1359 nodes. Requires a postorder visit. */
1360 static reg_errcode_t
1361 calc_first (void *extra, bin_tree_t *node)
1363 re_dfa_t *dfa = (re_dfa_t *) extra;
1364 if (node->token.type == CONCAT)
1366 node->first = node->left->first;
1367 node->node_idx = node->left->node_idx;
1372 node->node_idx = re_dfa_add_node (dfa, node->token);
1373 if (BE (node->node_idx == REG_MISSING, 0))
1375 if (node->token.type == ANCHOR)
1376 dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1381 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1382 static reg_errcode_t
1383 calc_next (void *extra, bin_tree_t *node)
1385 switch (node->token.type)
1387 case OP_DUP_ASTERISK:
1388 node->left->next = node;
1391 node->left->next = node->right->first;
1392 node->right->next = node->next;
1396 node->left->next = node->next;
1398 node->right->next = node->next;
1404 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1405 static reg_errcode_t
1406 link_nfa_nodes (void *extra, bin_tree_t *node)
1408 re_dfa_t *dfa = (re_dfa_t *) extra;
1409 Idx idx = node->node_idx;
1410 reg_errcode_t err = REG_NOERROR;
1412 switch (node->token.type)
1418 assert (node->next == NULL);
1421 case OP_DUP_ASTERISK:
1425 dfa->has_plural_match = 1;
1426 if (node->left != NULL)
1427 left = node->left->first->node_idx;
1429 left = node->next->node_idx;
1430 if (node->right != NULL)
1431 right = node->right->first->node_idx;
1433 right = node->next->node_idx;
1434 assert (REG_VALID_INDEX (left));
1435 assert (REG_VALID_INDEX (right));
1436 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1441 case OP_OPEN_SUBEXP:
1442 case OP_CLOSE_SUBEXP:
1443 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1447 dfa->nexts[idx] = node->next->node_idx;
1448 if (node->token.type == OP_BACK_REF)
1449 re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1453 assert (!IS_EPSILON_NODE (node->token.type));
1454 dfa->nexts[idx] = node->next->node_idx;
1461 /* Duplicate the epsilon closure of the node ROOT_NODE.
1462 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1463 to their own constraint. */
1465 static reg_errcode_t
1467 duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1468 Idx root_node, unsigned int init_constraint)
1470 Idx org_node, clone_node;
1472 unsigned int constraint = init_constraint;
1473 for (org_node = top_org_node, clone_node = top_clone_node;;)
1475 Idx org_dest, clone_dest;
1476 if (dfa->nodes[org_node].type == OP_BACK_REF)
1478 /* If the back reference epsilon-transit, its destination must
1479 also have the constraint. Then duplicate the epsilon closure
1480 of the destination of the back reference, and store it in
1481 edests of the back reference. */
1482 org_dest = dfa->nexts[org_node];
1483 re_node_set_empty (dfa->edests + clone_node);
1484 clone_dest = duplicate_node (dfa, org_dest, constraint);
1485 if (BE (clone_dest == REG_MISSING, 0))
1487 dfa->nexts[clone_node] = dfa->nexts[org_node];
1488 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1492 else if (dfa->edests[org_node].nelem == 0)
1494 /* In case of the node can't epsilon-transit, don't duplicate the
1495 destination and store the original destination as the
1496 destination of the node. */
1497 dfa->nexts[clone_node] = dfa->nexts[org_node];
1500 else if (dfa->edests[org_node].nelem == 1)
1502 /* In case of the node can epsilon-transit, and it has only one
1504 org_dest = dfa->edests[org_node].elems[0];
1505 re_node_set_empty (dfa->edests + clone_node);
1506 /* If the node is root_node itself, it means the epsilon closure
1507 has a loop. Then tie it to the destination of the root_node. */
1508 if (org_node == root_node && clone_node != org_node)
1510 ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1515 /* In case the node has another constraint, append it. */
1516 constraint |= dfa->nodes[org_node].constraint;
1517 clone_dest = duplicate_node (dfa, org_dest, constraint);
1518 if (BE (clone_dest == REG_MISSING, 0))
1520 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1524 else /* dfa->edests[org_node].nelem == 2 */
1526 /* In case of the node can epsilon-transit, and it has two
1527 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1528 org_dest = dfa->edests[org_node].elems[0];
1529 re_node_set_empty (dfa->edests + clone_node);
1530 /* Search for a duplicated node which satisfies the constraint. */
1531 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1532 if (clone_dest == REG_MISSING)
1534 /* There is no such duplicated node, create a new one. */
1536 clone_dest = duplicate_node (dfa, org_dest, constraint);
1537 if (BE (clone_dest == REG_MISSING, 0))
1539 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1542 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1543 root_node, constraint);
1544 if (BE (err != REG_NOERROR, 0))
1549 /* There is a duplicated node which satisfies the constraint,
1550 use it to avoid infinite loop. */
1551 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1556 org_dest = dfa->edests[org_node].elems[1];
1557 clone_dest = duplicate_node (dfa, org_dest, constraint);
1558 if (BE (clone_dest == REG_MISSING, 0))
1560 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1564 org_node = org_dest;
1565 clone_node = clone_dest;
1570 /* Search for a node which is duplicated from the node ORG_NODE, and
1571 satisfies the constraint CONSTRAINT. */
1574 search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1575 unsigned int constraint)
1578 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1580 if (org_node == dfa->org_indices[idx]
1581 && constraint == dfa->nodes[idx].constraint)
1582 return idx; /* Found. */
1584 return REG_MISSING; /* Not found. */
1587 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1588 Return the index of the new node, or REG_MISSING if insufficient storage is
1592 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1594 Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1595 if (BE (dup_idx != REG_MISSING, 1))
1597 dfa->nodes[dup_idx].constraint = constraint;
1598 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1599 dfa->nodes[dup_idx].duplicated = 1;
1601 /* Store the index of the original node. */
1602 dfa->org_indices[dup_idx] = org_idx;
1607 static reg_errcode_t
1608 calc_inveclosure (re_dfa_t *dfa)
1612 for (idx = 0; idx < dfa->nodes_len; ++idx)
1613 re_node_set_init_empty (dfa->inveclosures + idx);
1615 for (src = 0; src < dfa->nodes_len; ++src)
1617 Idx *elems = dfa->eclosures[src].elems;
1618 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1620 ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1629 /* Calculate "eclosure" for all the node in DFA. */
1631 static reg_errcode_t
1632 calc_eclosure (re_dfa_t *dfa)
1637 assert (dfa->nodes_len > 0);
1640 /* For each nodes, calculate epsilon closure. */
1641 for (node_idx = 0; ; ++node_idx)
1644 re_node_set eclosure_elem;
1645 if (node_idx == dfa->nodes_len)
1654 assert (dfa->eclosures[node_idx].nelem != REG_MISSING);
1657 /* If we have already calculated, skip it. */
1658 if (dfa->eclosures[node_idx].nelem != 0)
1660 /* Calculate epsilon closure of `node_idx'. */
1661 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1662 if (BE (err != REG_NOERROR, 0))
1665 if (dfa->eclosures[node_idx].nelem == 0)
1668 re_node_set_free (&eclosure_elem);
1674 /* Calculate epsilon closure of NODE. */
1676 static reg_errcode_t
1677 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1683 re_node_set eclosure;
1685 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1686 if (BE (err != REG_NOERROR, 0))
1689 /* This indicates that we are calculating this node now.
1690 We reference this value to avoid infinite loop. */
1691 dfa->eclosures[node].nelem = REG_MISSING;
1693 /* If the current node has constraints, duplicate all nodes
1694 since they must inherit the constraints. */
1695 if (dfa->nodes[node].constraint
1696 && dfa->edests[node].nelem
1697 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1699 err = duplicate_node_closure (dfa, node, node, node,
1700 dfa->nodes[node].constraint);
1701 if (BE (err != REG_NOERROR, 0))
1705 /* Expand each epsilon destination nodes. */
1706 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1707 for (i = 0; i < dfa->edests[node].nelem; ++i)
1709 re_node_set eclosure_elem;
1710 Idx edest = dfa->edests[node].elems[i];
1711 /* If calculating the epsilon closure of `edest' is in progress,
1712 return intermediate result. */
1713 if (dfa->eclosures[edest].nelem == REG_MISSING)
1718 /* If we haven't calculated the epsilon closure of `edest' yet,
1719 calculate now. Otherwise use calculated epsilon closure. */
1720 if (dfa->eclosures[edest].nelem == 0)
1722 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1723 if (BE (err != REG_NOERROR, 0))
1727 eclosure_elem = dfa->eclosures[edest];
1728 /* Merge the epsilon closure of `edest'. */
1729 re_node_set_merge (&eclosure, &eclosure_elem);
1730 /* If the epsilon closure of `edest' is incomplete,
1731 the epsilon closure of this node is also incomplete. */
1732 if (dfa->eclosures[edest].nelem == 0)
1735 re_node_set_free (&eclosure_elem);
1739 /* Epsilon closures include itself. */
1740 ok = re_node_set_insert (&eclosure, node);
1743 if (incomplete && !root)
1744 dfa->eclosures[node].nelem = 0;
1746 dfa->eclosures[node] = eclosure;
1747 *new_set = eclosure;
1751 /* Functions for token which are used in the parser. */
1753 /* Fetch a token from INPUT.
1754 We must not use this function inside bracket expressions. */
1758 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1760 re_string_skip_bytes (input, peek_token (result, input, syntax));
1763 /* Peek a token from INPUT, and return the length of the token.
1764 We must not use this function inside bracket expressions. */
1768 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1772 if (re_string_eoi (input))
1774 token->type = END_OF_RE;
1778 c = re_string_peek_byte (input, 0);
1781 token->word_char = 0;
1782 #ifdef RE_ENABLE_I18N
1783 token->mb_partial = 0;
1784 if (input->mb_cur_max > 1 &&
1785 !re_string_first_byte (input, re_string_cur_idx (input)))
1787 token->type = CHARACTER;
1788 token->mb_partial = 1;
1795 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1797 token->type = BACK_SLASH;
1801 c2 = re_string_peek_byte_case (input, 1);
1803 token->type = CHARACTER;
1804 #ifdef RE_ENABLE_I18N
1805 if (input->mb_cur_max > 1)
1807 wint_t wc = re_string_wchar_at (input,
1808 re_string_cur_idx (input) + 1);
1809 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1813 token->word_char = IS_WORD_CHAR (c2) != 0;
1818 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1819 token->type = OP_ALT;
1821 case '1': case '2': case '3': case '4': case '5':
1822 case '6': case '7': case '8': case '9':
1823 if (!(syntax & RE_NO_BK_REFS))
1825 token->type = OP_BACK_REF;
1826 token->opr.idx = c2 - '1';
1830 if (!(syntax & RE_NO_GNU_OPS))
1832 token->type = ANCHOR;
1833 token->opr.ctx_type = WORD_FIRST;
1837 if (!(syntax & RE_NO_GNU_OPS))
1839 token->type = ANCHOR;
1840 token->opr.ctx_type = WORD_LAST;
1844 if (!(syntax & RE_NO_GNU_OPS))
1846 token->type = ANCHOR;
1847 token->opr.ctx_type = WORD_DELIM;
1851 if (!(syntax & RE_NO_GNU_OPS))
1853 token->type = ANCHOR;
1854 token->opr.ctx_type = NOT_WORD_DELIM;
1858 if (!(syntax & RE_NO_GNU_OPS))
1859 token->type = OP_WORD;
1862 if (!(syntax & RE_NO_GNU_OPS))
1863 token->type = OP_NOTWORD;
1866 if (!(syntax & RE_NO_GNU_OPS))
1867 token->type = OP_SPACE;
1870 if (!(syntax & RE_NO_GNU_OPS))
1871 token->type = OP_NOTSPACE;
1874 if (!(syntax & RE_NO_GNU_OPS))
1876 token->type = ANCHOR;
1877 token->opr.ctx_type = BUF_FIRST;
1881 if (!(syntax & RE_NO_GNU_OPS))
1883 token->type = ANCHOR;
1884 token->opr.ctx_type = BUF_LAST;
1888 if (!(syntax & RE_NO_BK_PARENS))
1889 token->type = OP_OPEN_SUBEXP;
1892 if (!(syntax & RE_NO_BK_PARENS))
1893 token->type = OP_CLOSE_SUBEXP;
1896 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1897 token->type = OP_DUP_PLUS;
1900 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1901 token->type = OP_DUP_QUESTION;
1904 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1905 token->type = OP_OPEN_DUP_NUM;
1908 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1909 token->type = OP_CLOSE_DUP_NUM;
1917 token->type = CHARACTER;
1918 #ifdef RE_ENABLE_I18N
1919 if (input->mb_cur_max > 1)
1921 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1922 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1926 token->word_char = IS_WORD_CHAR (token->opr.c);
1931 if (syntax & RE_NEWLINE_ALT)
1932 token->type = OP_ALT;
1935 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1936 token->type = OP_ALT;
1939 token->type = OP_DUP_ASTERISK;
1942 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1943 token->type = OP_DUP_PLUS;
1946 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1947 token->type = OP_DUP_QUESTION;
1950 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1951 token->type = OP_OPEN_DUP_NUM;
1954 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1955 token->type = OP_CLOSE_DUP_NUM;
1958 if (syntax & RE_NO_BK_PARENS)
1959 token->type = OP_OPEN_SUBEXP;
1962 if (syntax & RE_NO_BK_PARENS)
1963 token->type = OP_CLOSE_SUBEXP;
1966 token->type = OP_OPEN_BRACKET;
1969 token->type = OP_PERIOD;
1972 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1973 re_string_cur_idx (input) != 0)
1975 char prev = re_string_peek_byte (input, -1);
1976 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1979 token->type = ANCHOR;
1980 token->opr.ctx_type = LINE_FIRST;
1983 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1984 re_string_cur_idx (input) + 1 != re_string_length (input))
1987 re_string_skip_bytes (input, 1);
1988 peek_token (&next, input, syntax);
1989 re_string_skip_bytes (input, -1);
1990 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1993 token->type = ANCHOR;
1994 token->opr.ctx_type = LINE_LAST;
2002 /* Peek a token from INPUT, and return the length of the token.
2003 We must not use this function out of bracket expressions. */
2007 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2010 if (re_string_eoi (input))
2012 token->type = END_OF_RE;
2015 c = re_string_peek_byte (input, 0);
2018 #ifdef RE_ENABLE_I18N
2019 if (input->mb_cur_max > 1 &&
2020 !re_string_first_byte (input, re_string_cur_idx (input)))
2022 token->type = CHARACTER;
2025 #endif /* RE_ENABLE_I18N */
2027 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2028 && re_string_cur_idx (input) + 1 < re_string_length (input))
2030 /* In this case, '\' escape a character. */
2032 re_string_skip_bytes (input, 1);
2033 c2 = re_string_peek_byte (input, 0);
2035 token->type = CHARACTER;
2038 if (c == '[') /* '[' is a special char in a bracket exps. */
2042 if (re_string_cur_idx (input) + 1 < re_string_length (input))
2043 c2 = re_string_peek_byte (input, 1);
2051 token->type = OP_OPEN_COLL_ELEM;
2054 token->type = OP_OPEN_EQUIV_CLASS;
2057 if (syntax & RE_CHAR_CLASSES)
2059 token->type = OP_OPEN_CHAR_CLASS;
2062 /* else fall through. */
2064 token->type = CHARACTER;
2074 token->type = OP_CHARSET_RANGE;
2077 token->type = OP_CLOSE_BRACKET;
2080 token->type = OP_NON_MATCH_LIST;
2083 token->type = CHARACTER;
2088 /* Functions for parser. */
2090 /* Entry point of the parser.
2091 Parse the regular expression REGEXP and return the structure tree.
2092 If an error is occured, ERR is set by error code, and return NULL.
2093 This function build the following tree, from regular expression <reg_exp>:
2099 CAT means concatenation.
2100 EOR means end of regular expression. */
2103 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2106 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2107 bin_tree_t *tree, *eor, *root;
2108 re_token_t current_token;
2109 dfa->syntax = syntax;
2110 fetch_token (¤t_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2111 tree = parse_reg_exp (regexp, preg, ¤t_token, syntax, 0, err);
2112 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2114 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2116 root = create_tree (dfa, tree, eor, CONCAT);
2119 if (BE (eor == NULL || root == NULL, 0))
2127 /* This function build the following tree, from regular expression
2128 <branch1>|<branch2>:
2134 ALT means alternative, which represents the operator `|'. */
2137 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2138 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2140 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2141 bin_tree_t *tree, *branch = NULL;
2142 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2143 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2146 while (token->type == OP_ALT)
2148 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2149 if (token->type != OP_ALT && token->type != END_OF_RE
2150 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2152 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2153 if (BE (*err != REG_NOERROR && branch == NULL, 0))
2158 tree = create_tree (dfa, tree, branch, OP_ALT);
2159 if (BE (tree == NULL, 0))
2168 /* This function build the following tree, from regular expression
2175 CAT means concatenation. */
2178 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2179 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2181 bin_tree_t *tree, *expr;
2182 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2183 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2184 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2187 while (token->type != OP_ALT && token->type != END_OF_RE
2188 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2190 expr = parse_expression (regexp, preg, token, syntax, nest, err);
2191 if (BE (*err != REG_NOERROR && expr == NULL, 0))
2195 if (tree != NULL && expr != NULL)
2197 tree = create_tree (dfa, tree, expr, CONCAT);
2204 else if (tree == NULL)
2206 /* Otherwise expr == NULL, we don't need to create new tree. */
2211 /* This function build the following tree, from regular expression a*:
2218 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2219 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2221 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2223 switch (token->type)
2226 tree = create_token_tree (dfa, NULL, NULL, token);
2227 if (BE (tree == NULL, 0))
2232 #ifdef RE_ENABLE_I18N
2233 if (dfa->mb_cur_max > 1)
2235 while (!re_string_eoi (regexp)
2236 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2238 bin_tree_t *mbc_remain;
2239 fetch_token (token, regexp, syntax);
2240 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2241 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2242 if (BE (mbc_remain == NULL || tree == NULL, 0))
2251 case OP_OPEN_SUBEXP:
2252 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2253 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2256 case OP_OPEN_BRACKET:
2257 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2258 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2262 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2267 dfa->used_bkref_map |= 1 << token->opr.idx;
2268 tree = create_token_tree (dfa, NULL, NULL, token);
2269 if (BE (tree == NULL, 0))
2275 dfa->has_mb_node = 1;
2277 case OP_OPEN_DUP_NUM:
2278 if (syntax & RE_CONTEXT_INVALID_DUP)
2284 case OP_DUP_ASTERISK:
2286 case OP_DUP_QUESTION:
2287 if (syntax & RE_CONTEXT_INVALID_OPS)
2292 else if (syntax & RE_CONTEXT_INDEP_OPS)
2294 fetch_token (token, regexp, syntax);
2295 return parse_expression (regexp, preg, token, syntax, nest, err);
2297 /* else fall through */
2298 case OP_CLOSE_SUBEXP:
2299 if ((token->type == OP_CLOSE_SUBEXP) &&
2300 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2305 /* else fall through */
2306 case OP_CLOSE_DUP_NUM:
2307 /* We treat it as a normal character. */
2309 /* Then we can these characters as normal characters. */
2310 token->type = CHARACTER;
2311 /* mb_partial and word_char bits should be initialized already
2313 tree = create_token_tree (dfa, NULL, NULL, token);
2314 if (BE (tree == NULL, 0))
2321 if ((token->opr.ctx_type
2322 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2323 && dfa->word_ops_used == 0)
2324 init_word_char (dfa);
2325 if (token->opr.ctx_type == WORD_DELIM
2326 || token->opr.ctx_type == NOT_WORD_DELIM)
2328 bin_tree_t *tree_first, *tree_last;
2329 if (token->opr.ctx_type == WORD_DELIM)
2331 token->opr.ctx_type = WORD_FIRST;
2332 tree_first = create_token_tree (dfa, NULL, NULL, token);
2333 token->opr.ctx_type = WORD_LAST;
2337 token->opr.ctx_type = INSIDE_WORD;
2338 tree_first = create_token_tree (dfa, NULL, NULL, token);
2339 token->opr.ctx_type = INSIDE_NOTWORD;
2341 tree_last = create_token_tree (dfa, NULL, NULL, token);
2342 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2343 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2351 tree = create_token_tree (dfa, NULL, NULL, token);
2352 if (BE (tree == NULL, 0))
2358 /* We must return here, since ANCHORs can't be followed
2359 by repetition operators.
2360 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2361 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2362 fetch_token (token, regexp, syntax);
2365 tree = create_token_tree (dfa, NULL, NULL, token);
2366 if (BE (tree == NULL, 0))
2371 if (dfa->mb_cur_max > 1)
2372 dfa->has_mb_node = 1;
2376 tree = build_charclass_op (dfa, regexp->trans,
2377 (const unsigned char *) "alnum",
2378 (const unsigned char *) "_",
2379 token->type == OP_NOTWORD, err);
2380 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2385 tree = build_charclass_op (dfa, regexp->trans,
2386 (const unsigned char *) "space",
2387 (const unsigned char *) "",
2388 token->type == OP_NOTSPACE, err);
2389 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2399 /* Must not happen? */
2405 fetch_token (token, regexp, syntax);
2407 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2408 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2410 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2411 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2413 /* In BRE consecutive duplications are not allowed. */
2414 if ((syntax & RE_CONTEXT_INVALID_DUP)
2415 && (token->type == OP_DUP_ASTERISK
2416 || token->type == OP_OPEN_DUP_NUM))
2426 /* This function build the following tree, from regular expression
2434 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2435 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2437 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2440 cur_nsub = preg->re_nsub++;
2442 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2444 /* The subexpression may be a null string. */
2445 if (token->type == OP_CLOSE_SUBEXP)
2449 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2450 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2452 if (BE (*err != REG_NOERROR, 0))
2456 if (cur_nsub <= '9' - '1')
2457 dfa->completed_bkref_map |= 1 << cur_nsub;
2459 tree = create_tree (dfa, tree, NULL, SUBEXP);
2460 if (BE (tree == NULL, 0))
2465 tree->token.opr.idx = cur_nsub;
2469 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2472 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2473 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2475 bin_tree_t *tree = NULL, *old_tree = NULL;
2476 Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2477 re_token_t start_token = *token;
2479 if (token->type == OP_OPEN_DUP_NUM)
2482 start = fetch_number (regexp, token, syntax);
2483 if (start == REG_MISSING)
2485 if (token->type == CHARACTER && token->opr.c == ',')
2486 start = 0; /* We treat "{,m}" as "{0,m}". */
2489 *err = REG_BADBR; /* <re>{} is invalid. */
2493 if (BE (start != REG_ERROR, 1))
2495 /* We treat "{n}" as "{n,n}". */
2496 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2497 : ((token->type == CHARACTER && token->opr.c == ',')
2498 ? fetch_number (regexp, token, syntax) : REG_ERROR));
2500 if (BE (start == REG_ERROR || end == REG_ERROR, 0))
2502 /* Invalid sequence. */
2503 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2505 if (token->type == END_OF_RE)
2513 /* If the syntax bit is set, rollback. */
2514 re_string_set_index (regexp, start_idx);
2515 *token = start_token;
2516 token->type = CHARACTER;
2517 /* mb_partial and word_char bits should be already initialized by
2522 if (BE ((end != REG_MISSING && start > end)
2523 || token->type != OP_CLOSE_DUP_NUM, 0))
2525 /* First number greater than second. */
2532 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2533 end = (token->type == OP_DUP_QUESTION) ? 1 : REG_MISSING;
2536 fetch_token (token, regexp, syntax);
2538 if (BE (elem == NULL, 0))
2540 if (BE (start == 0 && end == 0, 0))
2542 postorder (elem, free_tree, NULL);
2546 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2547 if (BE (start > 0, 0))
2550 for (i = 2; i <= start; ++i)
2552 elem = duplicate_tree (elem, dfa);
2553 tree = create_tree (dfa, tree, elem, CONCAT);
2554 if (BE (elem == NULL || tree == NULL, 0))
2555 goto parse_dup_op_espace;
2561 /* Duplicate ELEM before it is marked optional. */
2562 elem = duplicate_tree (elem, dfa);
2568 if (elem->token.type == SUBEXP)
2569 postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2571 tree = create_tree (dfa, elem, NULL,
2572 (end == REG_MISSING ? OP_DUP_ASTERISK : OP_ALT));
2573 if (BE (tree == NULL, 0))
2574 goto parse_dup_op_espace;
2576 /* This loop is actually executed only when end != REG_MISSING,
2577 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2578 already created the start+1-th copy. */
2579 if ((Idx) -1 < 0 || end != REG_MISSING)
2580 for (i = start + 2; i <= end; ++i)
2582 elem = duplicate_tree (elem, dfa);
2583 tree = create_tree (dfa, tree, elem, CONCAT);
2584 if (BE (elem == NULL || tree == NULL, 0))
2585 goto parse_dup_op_espace;
2587 tree = create_tree (dfa, tree, NULL, OP_ALT);
2588 if (BE (tree == NULL, 0))
2589 goto parse_dup_op_espace;
2593 tree = create_tree (dfa, old_tree, tree, CONCAT);
2597 parse_dup_op_espace:
2602 /* Size of the names for collating symbol/equivalence_class/character_class.
2603 I'm not sure, but maybe enough. */
2604 #define BRACKET_NAME_BUF_SIZE 32
2607 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2608 Build the range expression which starts from START_ELEM, and ends
2609 at END_ELEM. The result are written to MBCSET and SBCSET.
2610 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2611 mbcset->range_ends, is a pointer argument sinse we may
2614 static reg_errcode_t
2616 # ifdef RE_ENABLE_I18N
2617 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, Idx *range_alloc,
2618 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2619 # else /* not RE_ENABLE_I18N */
2620 build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
2621 bracket_elem_t *end_elem)
2622 # endif /* not RE_ENABLE_I18N */
2624 unsigned int start_ch, end_ch;
2625 /* Equivalence Classes and Character Classes can't be a range start/end. */
2626 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2627 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2631 /* We can handle no multi character collating elements without libc
2633 if (BE ((start_elem->type == COLL_SYM
2634 && strlen ((char *) start_elem->opr.name) > 1)
2635 || (end_elem->type == COLL_SYM
2636 && strlen ((char *) end_elem->opr.name) > 1), 0))
2637 return REG_ECOLLATE;
2639 # ifdef RE_ENABLE_I18N
2644 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2646 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2647 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2649 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2650 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2652 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2653 ? __btowc (start_ch) : start_elem->opr.wch);
2654 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2655 ? __btowc (end_ch) : end_elem->opr.wch);
2656 if (start_wc == WEOF || end_wc == WEOF)
2657 return REG_ECOLLATE;
2658 cmp_buf[0] = start_wc;
2659 cmp_buf[4] = end_wc;
2660 if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2663 /* Got valid collation sequence values, add them as a new entry.
2664 However, for !_LIBC we have no collation elements: if the
2665 character set is single byte, the single byte character set
2666 that we build below suffices. parse_bracket_exp passes
2667 no MBCSET if dfa->mb_cur_max == 1. */
2670 /* Check the space of the arrays. */
2671 if (BE (*range_alloc == mbcset->nranges, 0))
2673 /* There is not enough space, need realloc. */
2674 wchar_t *new_array_start, *new_array_end;
2677 /* +1 in case of mbcset->nranges is 0. */
2678 new_nranges = 2 * mbcset->nranges + 1;
2679 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2680 are NULL if *range_alloc == 0. */
2681 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2683 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2686 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2689 mbcset->range_starts = new_array_start;
2690 mbcset->range_ends = new_array_end;
2691 *range_alloc = new_nranges;
2694 mbcset->range_starts[mbcset->nranges] = start_wc;
2695 mbcset->range_ends[mbcset->nranges++] = end_wc;
2698 /* Build the table for single byte characters. */
2699 for (wc = 0; wc < SBC_MAX; ++wc)
2702 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2703 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2704 bitset_set (sbcset, wc);
2707 # else /* not RE_ENABLE_I18N */
2710 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2711 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2713 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2714 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2716 if (start_ch > end_ch)
2718 /* Build the table for single byte characters. */
2719 for (ch = 0; ch < SBC_MAX; ++ch)
2720 if (start_ch <= ch && ch <= end_ch)
2721 bitset_set (sbcset, ch);
2723 # endif /* not RE_ENABLE_I18N */
2726 #endif /* not _LIBC */
2729 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2730 Build the collating element which is represented by NAME.
2731 The result are written to MBCSET and SBCSET.
2732 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2733 pointer argument since we may update it. */
2735 static reg_errcode_t
2737 build_collating_symbol (bitset_t sbcset,
2738 # ifdef RE_ENABLE_I18N
2739 re_charset_t *mbcset, Idx *coll_sym_alloc,
2741 const unsigned char *name)
2743 size_t name_len = strlen ((const char *) name);
2744 if (BE (name_len != 1, 0))
2745 return REG_ECOLLATE;
2748 bitset_set (sbcset, name[0]);
2752 #endif /* not _LIBC */
2754 /* This function parse bracket expression like "[abc]", "[a-c]",
2758 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2759 reg_syntax_t syntax, reg_errcode_t *err)
2762 const unsigned char *collseqmb;
2763 const char *collseqwc;
2766 const int32_t *symb_table;
2767 const unsigned char *extra;
2769 /* Local function for parse_bracket_exp used in _LIBC environement.
2770 Seek the collating symbol entry correspondings to NAME.
2771 Return the index of the symbol in the SYMB_TABLE. */
2774 __attribute ((always_inline))
2775 seek_collating_symbol_entry (name, name_len)
2776 const unsigned char *name;
2779 int32_t hash = elem_hash ((const char *) name, name_len);
2780 int32_t elem = hash % table_size;
2781 if (symb_table[2 * elem] != 0)
2783 int32_t second = hash % (table_size - 2) + 1;
2787 /* First compare the hashing value. */
2788 if (symb_table[2 * elem] == hash
2789 /* Compare the length of the name. */
2790 && name_len == extra[symb_table[2 * elem + 1]]
2791 /* Compare the name. */
2792 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2795 /* Yep, this is the entry. */
2802 while (symb_table[2 * elem] != 0);
2807 /* Local function for parse_bracket_exp used in _LIBC environement.
2808 Look up the collation sequence value of BR_ELEM.
2809 Return the value if succeeded, UINT_MAX otherwise. */
2811 auto inline unsigned int
2812 __attribute ((always_inline))
2813 lookup_collation_sequence_value (br_elem)
2814 bracket_elem_t *br_elem;
2816 if (br_elem->type == SB_CHAR)
2819 if (MB_CUR_MAX == 1)
2822 return collseqmb[br_elem->opr.ch];
2825 wint_t wc = __btowc (br_elem->opr.ch);
2826 return __collseq_table_lookup (collseqwc, wc);
2829 else if (br_elem->type == MB_CHAR)
2831 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2833 else if (br_elem->type == COLL_SYM)
2835 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2839 elem = seek_collating_symbol_entry (br_elem->opr.name,
2841 if (symb_table[2 * elem] != 0)
2843 /* We found the entry. */
2844 idx = symb_table[2 * elem + 1];
2845 /* Skip the name of collating element name. */
2846 idx += 1 + extra[idx];
2847 /* Skip the byte sequence of the collating element. */
2848 idx += 1 + extra[idx];
2849 /* Adjust for the alignment. */
2850 idx = (idx + 3) & ~3;
2851 /* Skip the multibyte collation sequence value. */
2852 idx += sizeof (unsigned int);
2853 /* Skip the wide char sequence of the collating element. */
2854 idx += sizeof (unsigned int) *
2855 (1 + *(unsigned int *) (extra + idx));
2856 /* Return the collation sequence value. */
2857 return *(unsigned int *) (extra + idx);
2859 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2861 /* No valid character. Match it as a single byte
2863 return collseqmb[br_elem->opr.name[0]];
2866 else if (sym_name_len == 1)
2867 return collseqmb[br_elem->opr.name[0]];
2872 /* Local function for parse_bracket_exp used in _LIBC environement.
2873 Build the range expression which starts from START_ELEM, and ends
2874 at END_ELEM. The result are written to MBCSET and SBCSET.
2875 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2876 mbcset->range_ends, is a pointer argument sinse we may
2879 auto inline reg_errcode_t
2880 __attribute ((always_inline))
2881 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2882 re_charset_t *mbcset;
2885 bracket_elem_t *start_elem, *end_elem;
2888 uint32_t start_collseq;
2889 uint32_t end_collseq;
2891 /* Equivalence Classes and Character Classes can't be a range
2893 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2894 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2898 start_collseq = lookup_collation_sequence_value (start_elem);
2899 end_collseq = lookup_collation_sequence_value (end_elem);
2900 /* Check start/end collation sequence values. */
2901 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2902 return REG_ECOLLATE;
2903 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2906 /* Got valid collation sequence values, add them as a new entry.
2907 However, if we have no collation elements, and the character set
2908 is single byte, the single byte character set that we
2909 build below suffices. */
2910 if (nrules > 0 || dfa->mb_cur_max > 1)
2912 /* Check the space of the arrays. */
2913 if (BE (*range_alloc == mbcset->nranges, 0))
2915 /* There is not enough space, need realloc. */
2916 uint32_t *new_array_start;
2917 uint32_t *new_array_end;
2920 /* +1 in case of mbcset->nranges is 0. */
2921 new_nranges = 2 * mbcset->nranges + 1;
2922 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2924 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2927 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2930 mbcset->range_starts = new_array_start;
2931 mbcset->range_ends = new_array_end;
2932 *range_alloc = new_nranges;
2935 mbcset->range_starts[mbcset->nranges] = start_collseq;
2936 mbcset->range_ends[mbcset->nranges++] = end_collseq;
2939 /* Build the table for single byte characters. */
2940 for (ch = 0; ch < SBC_MAX; ch++)
2942 uint32_t ch_collseq;
2944 if (MB_CUR_MAX == 1)
2947 ch_collseq = collseqmb[ch];
2949 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2950 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2951 bitset_set (sbcset, ch);
2956 /* Local function for parse_bracket_exp used in _LIBC environement.
2957 Build the collating element which is represented by NAME.
2958 The result are written to MBCSET and SBCSET.
2959 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2960 pointer argument sinse we may update it. */
2962 auto inline reg_errcode_t
2963 __attribute ((always_inline))
2964 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2965 re_charset_t *mbcset;
2966 Idx *coll_sym_alloc;
2968 const unsigned char *name;
2971 size_t name_len = strlen ((const char *) name);
2974 elem = seek_collating_symbol_entry (name, name_len);
2975 if (symb_table[2 * elem] != 0)
2977 /* We found the entry. */
2978 idx = symb_table[2 * elem + 1];
2979 /* Skip the name of collating element name. */
2980 idx += 1 + extra[idx];
2982 else if (symb_table[2 * elem] == 0 && name_len == 1)
2984 /* No valid character, treat it as a normal
2986 bitset_set (sbcset, name[0]);
2990 return REG_ECOLLATE;
2992 /* Got valid collation sequence, add it as a new entry. */
2993 /* Check the space of the arrays. */
2994 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
2996 /* Not enough, realloc it. */
2997 /* +1 in case of mbcset->ncoll_syms is 0. */
2998 Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2999 /* Use realloc since mbcset->coll_syms is NULL
3001 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3002 new_coll_sym_alloc);
3003 if (BE (new_coll_syms == NULL, 0))
3005 mbcset->coll_syms = new_coll_syms;
3006 *coll_sym_alloc = new_coll_sym_alloc;
3008 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3013 if (BE (name_len != 1, 0))
3014 return REG_ECOLLATE;
3017 bitset_set (sbcset, name[0]);
3024 re_token_t br_token;
3025 re_bitset_ptr_t sbcset;
3026 #ifdef RE_ENABLE_I18N
3027 re_charset_t *mbcset;
3028 Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3029 Idx equiv_class_alloc = 0, char_class_alloc = 0;
3030 #endif /* not RE_ENABLE_I18N */
3031 bool non_match = false;
3032 bin_tree_t *work_tree;
3034 bool first_round = true;
3036 collseqmb = (const unsigned char *)
3037 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3038 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3044 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3045 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3046 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3047 _NL_COLLATE_SYMB_TABLEMB);
3048 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3049 _NL_COLLATE_SYMB_EXTRAMB);
3052 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3053 #ifdef RE_ENABLE_I18N
3054 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3055 #endif /* RE_ENABLE_I18N */
3056 #ifdef RE_ENABLE_I18N
3057 if (BE (sbcset == NULL || mbcset == NULL, 0))
3059 if (BE (sbcset == NULL, 0))
3060 #endif /* RE_ENABLE_I18N */
3066 token_len = peek_token_bracket (token, regexp, syntax);
3067 if (BE (token->type == END_OF_RE, 0))
3070 goto parse_bracket_exp_free_return;
3072 if (token->type == OP_NON_MATCH_LIST)
3074 #ifdef RE_ENABLE_I18N
3075 mbcset->non_match = 1;
3076 #endif /* not RE_ENABLE_I18N */
3078 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3079 bitset_set (sbcset, '\n');
3080 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3081 token_len = peek_token_bracket (token, regexp, syntax);
3082 if (BE (token->type == END_OF_RE, 0))
3085 goto parse_bracket_exp_free_return;
3089 /* We treat the first ']' as a normal character. */
3090 if (token->type == OP_CLOSE_BRACKET)
3091 token->type = CHARACTER;
3095 bracket_elem_t start_elem, end_elem;
3096 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3097 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3100 bool is_range_exp = false;
3103 start_elem.opr.name = start_name_buf;
3104 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3105 syntax, first_round);
3106 if (BE (ret != REG_NOERROR, 0))
3109 goto parse_bracket_exp_free_return;
3111 first_round = false;
3113 /* Get information about the next token. We need it in any case. */
3114 token_len = peek_token_bracket (token, regexp, syntax);
3116 /* Do not check for ranges if we know they are not allowed. */
3117 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3119 if (BE (token->type == END_OF_RE, 0))
3122 goto parse_bracket_exp_free_return;
3124 if (token->type == OP_CHARSET_RANGE)
3126 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3127 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3128 if (BE (token2.type == END_OF_RE, 0))
3131 goto parse_bracket_exp_free_return;
3133 if (token2.type == OP_CLOSE_BRACKET)
3135 /* We treat the last '-' as a normal character. */
3136 re_string_skip_bytes (regexp, -token_len);
3137 token->type = CHARACTER;
3140 is_range_exp = true;
3144 if (is_range_exp == true)
3146 end_elem.opr.name = end_name_buf;
3147 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3149 if (BE (ret != REG_NOERROR, 0))
3152 goto parse_bracket_exp_free_return;
3155 token_len = peek_token_bracket (token, regexp, syntax);
3158 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3159 &start_elem, &end_elem);
3161 # ifdef RE_ENABLE_I18N
3162 *err = build_range_exp (sbcset,
3163 dfa->mb_cur_max > 1 ? mbcset : NULL,
3164 &range_alloc, &start_elem, &end_elem);
3166 *err = build_range_exp (sbcset, &start_elem, &end_elem);
3168 #endif /* RE_ENABLE_I18N */
3169 if (BE (*err != REG_NOERROR, 0))
3170 goto parse_bracket_exp_free_return;
3174 switch (start_elem.type)
3177 bitset_set (sbcset, start_elem.opr.ch);
3179 #ifdef RE_ENABLE_I18N
3181 /* Check whether the array has enough space. */
3182 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3184 wchar_t *new_mbchars;
3185 /* Not enough, realloc it. */
3186 /* +1 in case of mbcset->nmbchars is 0. */
3187 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3188 /* Use realloc since array is NULL if *alloc == 0. */
3189 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3191 if (BE (new_mbchars == NULL, 0))
3192 goto parse_bracket_exp_espace;
3193 mbcset->mbchars = new_mbchars;
3195 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3197 #endif /* RE_ENABLE_I18N */
3199 *err = build_equiv_class (sbcset,
3200 #ifdef RE_ENABLE_I18N
3201 mbcset, &equiv_class_alloc,
3202 #endif /* RE_ENABLE_I18N */
3203 start_elem.opr.name);
3204 if (BE (*err != REG_NOERROR, 0))
3205 goto parse_bracket_exp_free_return;
3208 *err = build_collating_symbol (sbcset,
3209 #ifdef RE_ENABLE_I18N
3210 mbcset, &coll_sym_alloc,
3211 #endif /* RE_ENABLE_I18N */
3212 start_elem.opr.name);
3213 if (BE (*err != REG_NOERROR, 0))
3214 goto parse_bracket_exp_free_return;
3217 *err = build_charclass (regexp->trans, sbcset,
3218 #ifdef RE_ENABLE_I18N
3219 mbcset, &char_class_alloc,
3220 #endif /* RE_ENABLE_I18N */
3221 start_elem.opr.name, syntax);
3222 if (BE (*err != REG_NOERROR, 0))
3223 goto parse_bracket_exp_free_return;
3230 if (BE (token->type == END_OF_RE, 0))
3233 goto parse_bracket_exp_free_return;
3235 if (token->type == OP_CLOSE_BRACKET)
3239 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3241 /* If it is non-matching list. */
3243 bitset_not (sbcset);
3245 #ifdef RE_ENABLE_I18N
3246 /* Ensure only single byte characters are set. */
3247 if (dfa->mb_cur_max > 1)
3248 bitset_mask (sbcset, dfa->sb_char);
3250 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3251 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3252 || mbcset->non_match)))
3254 bin_tree_t *mbc_tree;
3256 /* Build a tree for complex bracket. */
3257 dfa->has_mb_node = 1;
3258 br_token.type = COMPLEX_BRACKET;
3259 br_token.opr.mbcset = mbcset;
3260 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3261 if (BE (mbc_tree == NULL, 0))
3262 goto parse_bracket_exp_espace;
3263 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3264 if (sbcset[sbc_idx])
3266 /* If there are no bits set in sbcset, there is no point
3267 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3268 if (sbc_idx < BITSET_WORDS)
3270 /* Build a tree for simple bracket. */
3271 br_token.type = SIMPLE_BRACKET;
3272 br_token.opr.sbcset = sbcset;
3273 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3274 if (BE (work_tree == NULL, 0))
3275 goto parse_bracket_exp_espace;
3277 /* Then join them by ALT node. */
3278 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3279 if (BE (work_tree == NULL, 0))
3280 goto parse_bracket_exp_espace;
3285 work_tree = mbc_tree;
3289 #endif /* not RE_ENABLE_I18N */
3291 #ifdef RE_ENABLE_I18N
3292 free_charset (mbcset);
3294 /* Build a tree for simple bracket. */
3295 br_token.type = SIMPLE_BRACKET;
3296 br_token.opr.sbcset = sbcset;
3297 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3298 if (BE (work_tree == NULL, 0))
3299 goto parse_bracket_exp_espace;
3303 parse_bracket_exp_espace:
3305 parse_bracket_exp_free_return:
3307 #ifdef RE_ENABLE_I18N
3308 free_charset (mbcset);
3309 #endif /* RE_ENABLE_I18N */
3313 /* Parse an element in the bracket expression. */
3315 static reg_errcode_t
3316 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3317 re_token_t *token, int token_len, re_dfa_t *dfa,
3318 reg_syntax_t syntax, bool accept_hyphen)
3320 #ifdef RE_ENABLE_I18N
3322 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3323 if (cur_char_size > 1)
3325 elem->type = MB_CHAR;
3326 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3327 re_string_skip_bytes (regexp, cur_char_size);
3330 #endif /* RE_ENABLE_I18N */
3331 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3332 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3333 || token->type == OP_OPEN_EQUIV_CLASS)
3334 return parse_bracket_symbol (elem, regexp, token);
3335 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3337 /* A '-' must only appear as anything but a range indicator before
3338 the closing bracket. Everything else is an error. */
3340 (void) peek_token_bracket (&token2, regexp, syntax);
3341 if (token2.type != OP_CLOSE_BRACKET)
3342 /* The actual error value is not standardized since this whole
3343 case is undefined. But ERANGE makes good sense. */
3346 elem->type = SB_CHAR;
3347 elem->opr.ch = token->opr.c;
3351 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3352 such as [:<character_class>:], [.<collating_element>.], and
3353 [=<equivalent_class>=]. */
3355 static reg_errcode_t
3356 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3359 unsigned char ch, delim = token->opr.c;
3361 if (re_string_eoi(regexp))
3365 if (i >= BRACKET_NAME_BUF_SIZE)
3367 if (token->type == OP_OPEN_CHAR_CLASS)
3368 ch = re_string_fetch_byte_case (regexp);
3370 ch = re_string_fetch_byte (regexp);
3371 if (re_string_eoi(regexp))
3373 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3375 elem->opr.name[i] = ch;
3377 re_string_skip_bytes (regexp, 1);
3378 elem->opr.name[i] = '\0';
3379 switch (token->type)
3381 case OP_OPEN_COLL_ELEM:
3382 elem->type = COLL_SYM;
3384 case OP_OPEN_EQUIV_CLASS:
3385 elem->type = EQUIV_CLASS;
3387 case OP_OPEN_CHAR_CLASS:
3388 elem->type = CHAR_CLASS;
3396 /* Helper function for parse_bracket_exp.
3397 Build the equivalence class which is represented by NAME.
3398 The result are written to MBCSET and SBCSET.
3399 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3400 is a pointer argument sinse we may update it. */
3402 static reg_errcode_t
3403 #ifdef RE_ENABLE_I18N
3404 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3405 Idx *equiv_class_alloc, const unsigned char *name)
3406 #else /* not RE_ENABLE_I18N */
3407 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3408 #endif /* not RE_ENABLE_I18N */
3411 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3414 const int32_t *table, *indirect;
3415 const unsigned char *weights, *extra, *cp;
3416 unsigned char char_buf[2];
3420 /* This #include defines a local function! */
3421 # include <locale/weight.h>
3422 /* Calculate the index for equivalence class. */
3424 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3425 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3426 _NL_COLLATE_WEIGHTMB);
3427 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3428 _NL_COLLATE_EXTRAMB);
3429 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3430 _NL_COLLATE_INDIRECTMB);
3431 idx1 = findidx (&cp);
3432 if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3433 /* This isn't a valid character. */
3434 return REG_ECOLLATE;
3436 /* Build single byte matcing table for this equivalence class. */
3437 char_buf[1] = (unsigned char) '\0';
3438 len = weights[idx1];
3439 for (ch = 0; ch < SBC_MAX; ++ch)
3443 idx2 = findidx (&cp);
3448 /* This isn't a valid character. */
3450 if (len == weights[idx2])
3453 while (cnt <= len &&
3454 weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3458 bitset_set (sbcset, ch);
3461 /* Check whether the array has enough space. */
3462 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3464 /* Not enough, realloc it. */
3465 /* +1 in case of mbcset->nequiv_classes is 0. */
3466 Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3467 /* Use realloc since the array is NULL if *alloc == 0. */
3468 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3470 new_equiv_class_alloc);
3471 if (BE (new_equiv_classes == NULL, 0))
3473 mbcset->equiv_classes = new_equiv_classes;
3474 *equiv_class_alloc = new_equiv_class_alloc;
3476 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3481 if (BE (strlen ((const char *) name) != 1, 0))
3482 return REG_ECOLLATE;
3483 bitset_set (sbcset, *name);
3488 /* Helper function for parse_bracket_exp.
3489 Build the character class which is represented by NAME.
3490 The result are written to MBCSET and SBCSET.
3491 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3492 is a pointer argument sinse we may update it. */
3494 static reg_errcode_t
3495 #ifdef RE_ENABLE_I18N
3496 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3497 re_charset_t *mbcset, Idx *char_class_alloc,
3498 const unsigned char *class_name, reg_syntax_t syntax)
3499 #else /* not RE_ENABLE_I18N */
3500 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3501 const unsigned char *class_name, reg_syntax_t syntax)
3502 #endif /* not RE_ENABLE_I18N */
3505 const char *name = (const char *) class_name;
3507 /* In case of REG_ICASE "upper" and "lower" match the both of
3508 upper and lower cases. */
3509 if ((syntax & RE_ICASE)
3510 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3513 #ifdef RE_ENABLE_I18N
3514 /* Check the space of the arrays. */
3515 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3517 /* Not enough, realloc it. */
3518 /* +1 in case of mbcset->nchar_classes is 0. */
3519 Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3520 /* Use realloc since array is NULL if *alloc == 0. */
3521 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3522 new_char_class_alloc);
3523 if (BE (new_char_classes == NULL, 0))
3525 mbcset->char_classes = new_char_classes;
3526 *char_class_alloc = new_char_class_alloc;
3528 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3529 #endif /* RE_ENABLE_I18N */
3531 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3533 if (BE (trans != NULL, 0)) \
3535 for (i = 0; i < SBC_MAX; ++i) \
3536 if (ctype_func (i)) \
3537 bitset_set (sbcset, trans[i]); \
3541 for (i = 0; i < SBC_MAX; ++i) \
3542 if (ctype_func (i)) \
3543 bitset_set (sbcset, i); \
3547 if (strcmp (name, "alnum") == 0)
3548 BUILD_CHARCLASS_LOOP (isalnum);
3549 else if (strcmp (name, "cntrl") == 0)
3550 BUILD_CHARCLASS_LOOP (iscntrl);
3551 else if (strcmp (name, "lower") == 0)
3552 BUILD_CHARCLASS_LOOP (islower);
3553 else if (strcmp (name, "space") == 0)
3554 BUILD_CHARCLASS_LOOP (isspace);
3555 else if (strcmp (name, "alpha") == 0)
3556 BUILD_CHARCLASS_LOOP (isalpha);
3557 else if (strcmp (name, "digit") == 0)
3558 BUILD_CHARCLASS_LOOP (isdigit);
3559 else if (strcmp (name, "print") == 0)
3560 BUILD_CHARCLASS_LOOP (isprint);
3561 else if (strcmp (name, "upper") == 0)
3562 BUILD_CHARCLASS_LOOP (isupper);
3563 else if (strcmp (name, "blank") == 0)
3564 BUILD_CHARCLASS_LOOP (isblank);
3565 else if (strcmp (name, "graph") == 0)
3566 BUILD_CHARCLASS_LOOP (isgraph);
3567 else if (strcmp (name, "punct") == 0)
3568 BUILD_CHARCLASS_LOOP (ispunct);
3569 else if (strcmp (name, "xdigit") == 0)
3570 BUILD_CHARCLASS_LOOP (isxdigit);
3578 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3579 const unsigned char *class_name,
3580 const unsigned char *extra, bool non_match,
3583 re_bitset_ptr_t sbcset;
3584 #ifdef RE_ENABLE_I18N
3585 re_charset_t *mbcset;
3587 #endif /* not RE_ENABLE_I18N */
3589 re_token_t br_token;
3592 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3593 #ifdef RE_ENABLE_I18N
3594 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3595 #endif /* RE_ENABLE_I18N */
3597 #ifdef RE_ENABLE_I18N
3598 if (BE (sbcset == NULL || mbcset == NULL, 0))
3599 #else /* not RE_ENABLE_I18N */
3600 if (BE (sbcset == NULL, 0))
3601 #endif /* not RE_ENABLE_I18N */
3609 #ifdef RE_ENABLE_I18N
3610 mbcset->non_match = 1;
3611 #endif /* not RE_ENABLE_I18N */
3614 /* We don't care the syntax in this case. */
3615 ret = build_charclass (trans, sbcset,
3616 #ifdef RE_ENABLE_I18N
3618 #endif /* RE_ENABLE_I18N */
3621 if (BE (ret != REG_NOERROR, 0))
3624 #ifdef RE_ENABLE_I18N
3625 free_charset (mbcset);
3626 #endif /* RE_ENABLE_I18N */
3630 /* \w match '_' also. */
3631 for (; *extra; extra++)
3632 bitset_set (sbcset, *extra);
3634 /* If it is non-matching list. */
3636 bitset_not (sbcset);
3638 #ifdef RE_ENABLE_I18N
3639 /* Ensure only single byte characters are set. */
3640 if (dfa->mb_cur_max > 1)
3641 bitset_mask (sbcset, dfa->sb_char);
3644 /* Build a tree for simple bracket. */
3645 br_token.type = SIMPLE_BRACKET;
3646 br_token.opr.sbcset = sbcset;
3647 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3648 if (BE (tree == NULL, 0))
3649 goto build_word_op_espace;
3651 #ifdef RE_ENABLE_I18N
3652 if (dfa->mb_cur_max > 1)
3654 bin_tree_t *mbc_tree;
3655 /* Build a tree for complex bracket. */
3656 br_token.type = COMPLEX_BRACKET;
3657 br_token.opr.mbcset = mbcset;
3658 dfa->has_mb_node = 1;
3659 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3660 if (BE (mbc_tree == NULL, 0))
3661 goto build_word_op_espace;
3662 /* Then join them by ALT node. */
3663 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3664 if (BE (mbc_tree != NULL, 1))
3669 free_charset (mbcset);
3672 #else /* not RE_ENABLE_I18N */
3674 #endif /* not RE_ENABLE_I18N */
3676 build_word_op_espace:
3678 #ifdef RE_ENABLE_I18N
3679 free_charset (mbcset);
3680 #endif /* RE_ENABLE_I18N */
3685 /* This is intended for the expressions like "a{1,3}".
3686 Fetch a number from `input', and return the number.
3687 Return REG_MISSING if the number field is empty like "{,1}".
3688 Return REG_ERROR if an error occurred. */
3691 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3693 Idx num = REG_MISSING;
3697 fetch_token (token, input, syntax);
3699 if (BE (token->type == END_OF_RE, 0))
3701 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3703 num = ((token->type != CHARACTER || c < '0' || '9' < c
3704 || num == REG_ERROR)
3706 : ((num == REG_MISSING) ? c - '0' : num * 10 + c - '0'));
3707 num = (num > RE_DUP_MAX) ? REG_ERROR : num;
3712 #ifdef RE_ENABLE_I18N
3714 free_charset (re_charset_t *cset)
3716 re_free (cset->mbchars);
3718 re_free (cset->coll_syms);
3719 re_free (cset->equiv_classes);
3720 re_free (cset->range_starts);
3721 re_free (cset->range_ends);
3723 re_free (cset->char_classes);
3726 #endif /* RE_ENABLE_I18N */
3728 /* Functions for binary tree operation. */
3730 /* Create a tree node. */
3733 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3734 re_token_type_t type)
3738 return create_token_tree (dfa, left, right, &t);
3742 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3743 const re_token_t *token)
3746 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3748 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3750 if (storage == NULL)
3752 storage->next = dfa->str_tree_storage;
3753 dfa->str_tree_storage = storage;
3754 dfa->str_tree_storage_idx = 0;
3756 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3758 tree->parent = NULL;
3760 tree->right = right;
3761 tree->token = *token;
3762 tree->token.duplicated = 0;
3763 tree->token.opt_subexp = 0;
3766 tree->node_idx = REG_MISSING;
3769 left->parent = tree;
3771 right->parent = tree;
3775 /* Mark the tree SRC as an optional subexpression.
3776 To be called from preorder or postorder. */
3778 static reg_errcode_t
3779 mark_opt_subexp (void *extra, bin_tree_t *node)
3781 Idx idx = (Idx) (long) extra;
3782 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3783 node->token.opt_subexp = 1;
3788 /* Free the allocated memory inside NODE. */
3791 free_token (re_token_t *node)
3793 #ifdef RE_ENABLE_I18N
3794 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3795 free_charset (node->opr.mbcset);
3797 #endif /* RE_ENABLE_I18N */
3798 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3799 re_free (node->opr.sbcset);
3802 /* Worker function for tree walking. Free the allocated memory inside NODE
3803 and its children. */
3805 static reg_errcode_t
3806 free_tree (void *extra, bin_tree_t *node)
3808 free_token (&node->token);
3813 /* Duplicate the node SRC, and return new node. This is a preorder
3814 visit similar to the one implemented by the generic visitor, but
3815 we need more infrastructure to maintain two parallel trees --- so,
3816 it's easier to duplicate. */
3819 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3821 const bin_tree_t *node;
3822 bin_tree_t *dup_root;
3823 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3825 for (node = root; ; )
3827 /* Create a new tree and link it back to the current parent. */
3828 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3831 (*p_new)->parent = dup_node;
3832 (*p_new)->token.duplicated = 1;
3835 /* Go to the left node, or up and to the right. */
3839 p_new = &dup_node->left;
3843 const bin_tree_t *prev = NULL;
3844 while (node->right == prev || node->right == NULL)
3847 node = node->parent;
3848 dup_node = dup_node->parent;
3853 p_new = &dup_node->right;