1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License along
17 with this program; if not, write to the Free Software Foundation,
18 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
21 # if defined __GNUC__ && defined __GNUC_MINOR__
22 # define __GNUC_PREREQ(maj, min) \
23 ((__GNUC__ << 16) + __GNUC_MINOR__ >= ((maj) << 16) + (min))
25 # define __GNUC_PREREQ(maj, min) 0
29 #if !__GNUC_PREREQ (3, 1)
30 # define always_inline
33 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
34 Idx length, reg_syntax_t syntax);
35 static void re_compile_fastmap_iter (regex_t *bufp,
36 const re_dfastate_t *init_state,
38 static reg_errcode_t init_dfa (re_dfa_t *dfa, Idx pat_len);
40 static void free_charset (re_charset_t *cset);
41 #endif /* RE_ENABLE_I18N */
42 static void free_workarea_compile (regex_t *preg);
43 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
45 static void optimize_utf8 (re_dfa_t *dfa);
47 static reg_errcode_t analyze (regex_t *preg);
48 static reg_errcode_t preorder (bin_tree_t *root,
49 reg_errcode_t (fn (void *, bin_tree_t *)),
51 static reg_errcode_t postorder (bin_tree_t *root,
52 reg_errcode_t (fn (void *, bin_tree_t *)),
54 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
55 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
56 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
58 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
59 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
60 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
61 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
62 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
63 unsigned int constraint);
64 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
65 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
67 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
68 static Idx fetch_number (re_string_t *input, re_token_t *token,
70 static int peek_token (re_token_t *token, re_string_t *input,
72 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
73 reg_syntax_t syntax, reg_errcode_t *err);
74 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
75 re_token_t *token, reg_syntax_t syntax,
76 Idx nest, reg_errcode_t *err);
77 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
78 re_token_t *token, reg_syntax_t syntax,
79 Idx nest, reg_errcode_t *err);
80 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
81 re_token_t *token, reg_syntax_t syntax,
82 Idx nest, reg_errcode_t *err);
83 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
84 re_token_t *token, reg_syntax_t syntax,
85 Idx nest, reg_errcode_t *err);
86 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
87 re_dfa_t *dfa, re_token_t *token,
88 reg_syntax_t syntax, reg_errcode_t *err);
89 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
90 re_token_t *token, reg_syntax_t syntax,
92 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
94 re_token_t *token, int token_len,
98 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
101 #ifdef RE_ENABLE_I18N
102 static reg_errcode_t build_equiv_class (bitset sbcset,
103 re_charset_t *mbcset,
104 Idx *equiv_class_alloc,
105 const unsigned char *name);
106 static reg_errcode_t build_charclass (unsigned REG_TRANSLATE_TYPE trans,
108 re_charset_t *mbcset,
109 Idx *char_class_alloc,
110 const unsigned char *class_name,
111 reg_syntax_t syntax);
112 #else /* not RE_ENABLE_I18N */
113 static reg_errcode_t build_equiv_class (bitset sbcset,
114 const unsigned char *name);
115 static reg_errcode_t build_charclass (unsigned REG_TRANSLATE_TYPE trans,
117 const unsigned char *class_name,
118 reg_syntax_t syntax);
119 #endif /* not RE_ENABLE_I18N */
120 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
121 unsigned REG_TRANSLATE_TYPE trans,
122 const unsigned char *class_name,
123 const unsigned char *extra,
124 bool non_match, reg_errcode_t *err);
125 static bin_tree_t *create_tree (re_dfa_t *dfa,
126 bin_tree_t *left, bin_tree_t *right,
127 re_token_type_t type);
128 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
129 bin_tree_t *left, bin_tree_t *right,
130 const re_token_t *token);
131 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
132 static void free_token (re_token_t *node);
133 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
134 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
136 /* This table gives an error message for each of the error codes listed
137 in regex.h. Obviously the order here has to be same as there.
138 POSIX doesn't require that we do anything for REG_NOERROR,
139 but why not be nice? */
141 const char __re_error_msgid[] attribute_hidden =
143 #define REG_NOERROR_IDX 0
144 gettext_noop ("Success") /* REG_NOERROR */
146 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
147 gettext_noop ("No match") /* REG_NOMATCH */
149 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
150 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
152 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
153 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
155 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
156 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
158 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
159 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
161 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
162 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
164 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
165 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
167 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
168 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
170 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
171 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
173 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
174 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
176 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
177 gettext_noop ("Invalid range end") /* REG_ERANGE */
179 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
180 gettext_noop ("Memory exhausted") /* REG_ESPACE */
182 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
183 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
185 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
186 gettext_noop ("Premature end of regular expression") /* REG_EEND */
188 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
189 gettext_noop ("Regular expression too big") /* REG_ESIZE */
191 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
192 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
195 const size_t __re_error_msgid_idx[] attribute_hidden =
216 /* Entry points for GNU code. */
218 /* re_compile_pattern is the GNU regular expression compiler: it
219 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
220 Returns 0 if the pattern was valid, otherwise an error string.
222 Assumes the `re_allocated' (and perhaps `re_buffer') and `translate' fields
223 are set in BUFP on entry. */
226 re_compile_pattern (const char *pattern, size_t length,
227 struct re_pattern_buffer *bufp)
231 /* And GNU code determines whether or not to get register information
232 by passing null for the REGS argument to re_match, etc., not by
233 setting re_no_sub, unless REG_NO_SUB is set. */
234 bufp->re_no_sub = !!(re_syntax_options & REG_NO_SUB);
236 /* Match anchors at newline. */
237 bufp->re_newline_anchor = 1;
239 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
243 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
246 weak_alias (__re_compile_pattern, re_compile_pattern)
249 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
250 also be assigned to arbitrarily: each pattern buffer stores its own
251 syntax, so it can be changed between regex compilations. */
252 /* This has no initializer because initialized variables in Emacs
253 become read-only after dumping. */
254 reg_syntax_t re_syntax_options;
257 /* Specify the precise syntax of regexps for compilation. This provides
258 for compatibility for various utilities which historically have
259 different, incompatible syntaxes.
261 The argument SYNTAX is a bit mask comprised of the various bits
262 defined in regex.h. We return the old syntax. */
265 re_set_syntax (reg_syntax_t syntax)
267 reg_syntax_t ret = re_syntax_options;
269 re_syntax_options = syntax;
273 weak_alias (__re_set_syntax, re_set_syntax)
277 re_compile_fastmap (struct re_pattern_buffer *bufp)
279 re_dfa_t *dfa = (re_dfa_t *) bufp->re_buffer;
280 char *fastmap = bufp->re_fastmap;
282 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
283 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
284 if (dfa->init_state != dfa->init_state_word)
285 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
286 if (dfa->init_state != dfa->init_state_nl)
287 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
288 if (dfa->init_state != dfa->init_state_begbuf)
289 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
290 bufp->re_fastmap_accurate = 1;
294 weak_alias (__re_compile_fastmap, re_compile_fastmap)
298 __attribute ((always_inline))
299 re_set_fastmap (char *fastmap, bool icase, int ch)
303 fastmap[tolower (ch)] = 1;
306 /* Helper function for re_compile_fastmap.
307 Compile fastmap for the initial_state INIT_STATE. */
310 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
313 re_dfa_t *dfa = (re_dfa_t *) bufp->re_buffer;
315 bool icase = (dfa->mb_cur_max == 1 && (bufp->re_syntax & REG_IGNORE_CASE));
316 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
318 Idx node = init_state->nodes.elems[node_cnt];
319 re_token_type_t type = dfa->nodes[node].type;
321 if (type == CHARACTER)
323 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
324 #ifdef RE_ENABLE_I18N
325 if ((bufp->re_syntax & REG_IGNORE_CASE) && dfa->mb_cur_max > 1)
327 unsigned char buf[MB_LEN_MAX];
333 *p++ = dfa->nodes[node].opr.c;
334 while (++node < dfa->nodes_len
335 && dfa->nodes[node].type == CHARACTER
336 && dfa->nodes[node].mb_partial)
337 *p++ = dfa->nodes[node].opr.c;
338 memset (&state, 0, sizeof (state));
339 if (mbrtowc (&wc, (const char *) buf, p - buf,
341 && (__wcrtomb ((char *) buf, towlower (wc), &state)
343 re_set_fastmap (fastmap, false, buf[0]);
347 else if (type == SIMPLE_BRACKET)
350 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
351 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
352 if (dfa->nodes[node].opr.sbcset[i] & ((bitset_word) 1 << j))
353 re_set_fastmap (fastmap, icase, ch);
355 #ifdef RE_ENABLE_I18N
356 else if (type == COMPLEX_BRACKET)
359 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
360 if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
361 || cset->nranges || cset->nchar_classes)
364 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
366 /* In this case we want to catch the bytes which are
367 the first byte of any collation elements.
368 e.g. In da_DK, we want to catch 'a' since "aa"
369 is a valid collation element, and don't catch
370 'b' since 'b' is the only collation element
371 which starts from 'b'. */
372 const int32_t *table = (const int32_t *)
373 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
374 for (i = 0; i < SBC_MAX; ++i)
376 re_set_fastmap (fastmap, icase, i);
379 if (dfa->mb_cur_max > 1)
380 for (i = 0; i < SBC_MAX; ++i)
381 if (__btowc (i) == WEOF)
382 re_set_fastmap (fastmap, icase, i);
383 # endif /* not _LIBC */
385 for (i = 0; i < cset->nmbchars; ++i)
389 memset (&state, '\0', sizeof (state));
390 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
391 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
392 if ((bufp->re_syntax & REG_IGNORE_CASE) && dfa->mb_cur_max > 1)
394 if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
396 re_set_fastmap (fastmap, false, *(unsigned char *) buf);
400 #endif /* RE_ENABLE_I18N */
401 else if (type == OP_PERIOD
402 #ifdef RE_ENABLE_I18N
403 || type == OP_UTF8_PERIOD
404 #endif /* RE_ENABLE_I18N */
405 || type == END_OF_RE)
407 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
408 if (type == END_OF_RE)
409 bufp->re_can_be_null = 1;
415 /* Entry point for POSIX code. */
416 /* regcomp takes a regular expression as a string and compiles it.
418 PREG is a regex_t *. We do not expect any fields to be initialized,
419 since POSIX says we shouldn't. Thus, we set
421 `re_buffer' to the compiled pattern;
422 `re_used' to the length of the compiled pattern;
423 `re_syntax' to REG_SYNTAX_POSIX_EXTENDED if the
424 REG_EXTENDED bit in CFLAGS is set; otherwise, to
425 REG_SYNTAX_POSIX_BASIC;
426 `re_newline_anchor' to REG_NEWLINE being set in CFLAGS;
427 `re_fastmap' to an allocated space for the fastmap;
428 `re_fastmap_accurate' to zero;
429 `re_nsub' to the number of subexpressions in PATTERN.
431 PATTERN is the address of the pattern string.
433 CFLAGS is a series of bits which affect compilation.
435 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
436 use POSIX basic syntax.
438 If REG_NEWLINE is set, then . and [^...] don't match newline.
439 Also, regexec will try a match beginning after every newline.
441 If REG_ICASE is set, then we considers upper- and lowercase
442 versions of letters to be equivalent when matching.
444 If REG_NOSUB is set, then when PREG is passed to regexec, that
445 routine will report only success or failure, and nothing about the
448 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
449 the return codes and their meanings.) */
452 regcomp (regex_t *__restrict preg, const char *__restrict pattern, int cflags)
455 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? REG_SYNTAX_POSIX_EXTENDED
456 : REG_SYNTAX_POSIX_BASIC);
458 preg->re_buffer = NULL;
459 preg->re_allocated = 0;
462 /* Try to allocate space for the fastmap. */
463 preg->re_fastmap = re_malloc (char, SBC_MAX);
464 if (BE (preg->re_fastmap == NULL, 0))
467 syntax |= (cflags & REG_ICASE) ? REG_IGNORE_CASE : 0;
469 /* If REG_NEWLINE is set, newlines are treated differently. */
470 if (cflags & REG_NEWLINE)
471 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
472 syntax &= ~REG_DOT_NEWLINE;
473 syntax |= REG_HAT_LISTS_NOT_NEWLINE;
474 /* It also changes the matching behavior. */
475 preg->re_newline_anchor = 1;
478 preg->re_newline_anchor = 0;
479 preg->re_no_sub = !!(cflags & REG_NOSUB);
480 preg->re_translate = NULL;
482 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
484 /* POSIX doesn't distinguish between an unmatched open-group and an
485 unmatched close-group: both are REG_EPAREN. */
486 if (ret == REG_ERPAREN)
489 /* We have already checked preg->re_fastmap != NULL. */
490 if (BE (ret == REG_NOERROR, 1))
491 /* Compute the fastmap now, since regexec cannot modify the pattern
492 buffer. This function never fails in this implementation. */
493 (void) re_compile_fastmap (preg);
496 /* Some error occurred while compiling the expression. */
497 re_free (preg->re_fastmap);
498 preg->re_fastmap = NULL;
504 weak_alias (__regcomp, regcomp)
507 /* Returns a message corresponding to an error code, ERRCODE, returned
508 from either regcomp or regexec. We don't use PREG here. */
511 regerror (int errcode, const regex_t *__restrict preg,
512 char *__restrict errbuf, size_t errbuf_size)
518 || errcode >= (int) (sizeof (__re_error_msgid_idx)
519 / sizeof (__re_error_msgid_idx[0])), 0))
520 /* Only error codes returned by the rest of the code should be passed
521 to this routine. If we are given anything else, or if other regex
522 code generates an invalid error code, then the program has a bug.
523 Dump core so we can fix it. */
526 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
528 msg_size = strlen (msg) + 1; /* Includes the null. */
530 if (BE (errbuf_size != 0, 1))
532 if (BE (msg_size > errbuf_size, 0))
534 #if defined HAVE_MEMPCPY || defined _LIBC
535 *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
537 memcpy (errbuf, msg, errbuf_size - 1);
538 errbuf[errbuf_size - 1] = 0;
542 memcpy (errbuf, msg, msg_size);
548 weak_alias (__regerror, regerror)
552 #ifdef RE_ENABLE_I18N
553 /* This static array is used for the map to single-byte characters when
554 UTF-8 is used. Otherwise we would allocate memory just to initialize
555 it the same all the time. UTF-8 is the preferred encoding so this is
556 a worthwhile optimization. */
557 static const bitset utf8_sb_map =
559 /* Set the first 128 bits. */
560 # if 2 < BITSET_WORDS
563 # if 4 < BITSET_WORDS
566 # if 6 < BITSET_WORDS
569 # if 8 < BITSET_WORDS
570 # error "Invalid BITSET_WORDS"
573 >> (SBC_MAX % BITSET_WORD_BITS == 0
575 : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
581 free_dfa_content (re_dfa_t *dfa)
586 for (i = 0; i < dfa->nodes_len; ++i)
587 free_token (dfa->nodes + i);
588 re_free (dfa->nexts);
589 for (i = 0; i < dfa->nodes_len; ++i)
591 if (dfa->eclosures != NULL)
592 re_node_set_free (dfa->eclosures + i);
593 if (dfa->inveclosures != NULL)
594 re_node_set_free (dfa->inveclosures + i);
595 if (dfa->edests != NULL)
596 re_node_set_free (dfa->edests + i);
598 re_free (dfa->edests);
599 re_free (dfa->eclosures);
600 re_free (dfa->inveclosures);
601 re_free (dfa->nodes);
603 if (dfa->state_table)
604 for (i = 0; i <= dfa->state_hash_mask; ++i)
606 struct re_state_table_entry *entry = dfa->state_table + i;
607 for (j = 0; j < entry->num; ++j)
609 re_dfastate_t *state = entry->array[j];
612 re_free (entry->array);
614 re_free (dfa->state_table);
615 #ifdef RE_ENABLE_I18N
616 if (dfa->sb_char != utf8_sb_map)
617 re_free (dfa->sb_char);
619 re_free (dfa->subexp_map);
621 re_free (dfa->re_str);
628 /* Free dynamically allocated space used by PREG. */
631 regfree (regex_t *preg)
633 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
634 if (BE (dfa != NULL, 1))
635 free_dfa_content (dfa);
636 preg->re_buffer = NULL;
637 preg->re_allocated = 0;
639 re_free (preg->re_fastmap);
640 preg->re_fastmap = NULL;
642 re_free (preg->re_translate);
643 preg->re_translate = NULL;
646 weak_alias (__regfree, regfree)
649 /* Entry points compatible with 4.2 BSD regex library. We don't define
650 them unless specifically requested. */
652 #if defined _REGEX_RE_COMP || defined _LIBC
654 /* BSD has one and only one pattern buffer. */
655 static struct re_pattern_buffer re_comp_buf;
659 /* Make these definitions weak in libc, so POSIX programs can redefine
660 these names if they don't use our functions, and still use
661 regcomp/regexec above without link errors. */
664 re_comp (const char *s)
671 if (!re_comp_buf.re_buffer)
672 return gettext ("No previous regular expression");
676 if (re_comp_buf.re_buffer)
678 fastmap = re_comp_buf.re_fastmap;
679 re_comp_buf.re_fastmap = NULL;
680 __regfree (&re_comp_buf);
681 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
682 re_comp_buf.re_fastmap = fastmap;
685 if (re_comp_buf.re_fastmap == NULL)
687 re_comp_buf.re_fastmap = (char *) malloc (SBC_MAX);
688 if (re_comp_buf.re_fastmap == NULL)
689 return (char *) gettext (__re_error_msgid
690 + __re_error_msgid_idx[(int) REG_ESPACE]);
693 /* Since `re_exec' always passes NULL for the `regs' argument, we
694 don't need to initialize the pattern buffer fields which affect it. */
696 /* Match anchors at newlines. */
697 re_comp_buf.re_newline_anchor = 1;
699 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
704 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
705 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
709 libc_freeres_fn (free_mem)
711 __regfree (&re_comp_buf);
715 #endif /* _REGEX_RE_COMP */
717 /* Internal entry point.
718 Compile the regular expression PATTERN, whose length is LENGTH.
719 SYNTAX indicate regular expression's syntax. */
722 re_compile_internal (regex_t *preg, const char *pattern, Idx length,
725 reg_errcode_t err = REG_NOERROR;
729 /* Initialize the pattern buffer. */
730 preg->re_fastmap_accurate = 0;
731 preg->re_syntax = syntax;
732 preg->re_not_bol = preg->re_not_eol = 0;
735 preg->re_can_be_null = 0;
736 preg->re_regs_allocated = REG_UNALLOCATED;
738 /* Initialize the dfa. */
739 dfa = (re_dfa_t *) preg->re_buffer;
740 if (BE (preg->re_allocated < sizeof (re_dfa_t), 0))
742 /* If zero allocated, but buffer is non-null, try to realloc
743 enough space. This loses if buffer's address is bogus, but
744 that is the user's responsibility. If buffer is null this
745 is a simple allocation. */
746 dfa = re_realloc (preg->re_buffer, re_dfa_t, 1);
749 preg->re_allocated = sizeof (re_dfa_t);
750 preg->re_buffer = (unsigned char *) dfa;
752 preg->re_used = sizeof (re_dfa_t);
754 __libc_lock_init (dfa->lock);
756 err = init_dfa (dfa, length);
757 if (BE (err != REG_NOERROR, 0))
759 free_dfa_content (dfa);
760 preg->re_buffer = NULL;
761 preg->re_allocated = 0;
765 dfa->re_str = re_malloc (char, length + 1);
766 strncpy (dfa->re_str, pattern, length + 1);
769 err = re_string_construct (®exp, pattern, length, preg->re_translate,
770 syntax & REG_IGNORE_CASE, dfa);
771 if (BE (err != REG_NOERROR, 0))
773 re_compile_internal_free_return:
774 free_workarea_compile (preg);
775 re_string_destruct (®exp);
776 free_dfa_content (dfa);
777 preg->re_buffer = NULL;
778 preg->re_allocated = 0;
782 /* Parse the regular expression, and build a structure tree. */
784 dfa->str_tree = parse (®exp, preg, syntax, &err);
785 if (BE (dfa->str_tree == NULL, 0))
786 goto re_compile_internal_free_return;
788 /* Analyze the tree and create the nfa. */
789 err = analyze (preg);
790 if (BE (err != REG_NOERROR, 0))
791 goto re_compile_internal_free_return;
793 #ifdef RE_ENABLE_I18N
794 /* If possible, do searching in single byte encoding to speed things up. */
795 if (dfa->is_utf8 && !(syntax & REG_IGNORE_CASE) && preg->re_translate == NULL)
799 /* Then create the initial state of the dfa. */
800 err = create_initial_state (dfa);
802 /* Release work areas. */
803 free_workarea_compile (preg);
804 re_string_destruct (®exp);
806 if (BE (err != REG_NOERROR, 0))
808 free_dfa_content (dfa);
809 preg->re_buffer = NULL;
810 preg->re_allocated = 0;
816 /* Initialize DFA. We use the length of the regular expression PAT_LEN
817 as the initial length of some arrays. */
820 init_dfa (re_dfa_t *dfa, Idx pat_len)
822 __re_size_t table_size;
827 memset (dfa, '\0', sizeof (re_dfa_t));
829 /* Force allocation of str_tree_storage the first time. */
830 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
832 dfa->nodes_alloc = pat_len + 1;
833 dfa->nodes = re_xmalloc (re_token_t, dfa->nodes_alloc);
835 /* table_size = 2 ^ ceil(log pat_len) */
836 for (table_size = 1; table_size <= pat_len; table_size <<= 1)
837 if (0 < (Idx) -1 && table_size == 0)
840 dfa->state_table = re_calloc (struct re_state_table_entry, table_size);
841 dfa->state_hash_mask = table_size - 1;
843 dfa->mb_cur_max = MB_CUR_MAX;
845 if (dfa->mb_cur_max == 6
846 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
848 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
851 # ifdef HAVE_LANGINFO_CODESET
852 codeset_name = nl_langinfo (CODESET);
854 codeset_name = getenv ("LC_ALL");
855 if (codeset_name == NULL || codeset_name[0] == '\0')
856 codeset_name = getenv ("LC_CTYPE");
857 if (codeset_name == NULL || codeset_name[0] == '\0')
858 codeset_name = getenv ("LANG");
859 if (codeset_name == NULL)
861 else if (strchr (codeset_name, '.') != NULL)
862 codeset_name = strchr (codeset_name, '.') + 1;
865 if (strcasecmp (codeset_name, "UTF-8") == 0
866 || strcasecmp (codeset_name, "UTF8") == 0)
869 /* We check exhaustively in the loop below if this charset is a
870 superset of ASCII. */
871 dfa->map_notascii = 0;
874 #ifdef RE_ENABLE_I18N
875 if (dfa->mb_cur_max > 1)
878 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
883 dfa->sb_char = re_calloc (bitset_word, BITSET_WORDS);
884 if (BE (dfa->sb_char == NULL, 0))
887 /* Set the bits corresponding to single byte chars. */
888 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
889 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
891 wint_t wch = __btowc (ch);
893 dfa->sb_char[i] |= (bitset_word) 1 << j;
895 if (isascii (ch) && wch != ch)
896 dfa->map_notascii = 1;
903 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
908 /* Initialize WORD_CHAR table, which indicate which character is
909 "word". In this case "word" means that it is the word construction
910 character used by some operators like "\<", "\>", etc. */
913 init_word_char (re_dfa_t *dfa)
916 dfa->word_ops_used = 1;
917 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
918 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
919 if (isalnum (ch) || ch == '_')
920 dfa->word_char[i] |= (bitset_word) 1 << j;
923 /* Free the work area which are only used while compiling. */
926 free_workarea_compile (regex_t *preg)
928 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
929 bin_tree_storage_t *storage, *next;
930 for (storage = dfa->str_tree_storage; storage; storage = next)
932 next = storage->next;
935 dfa->str_tree_storage = NULL;
936 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
937 dfa->str_tree = NULL;
938 re_free (dfa->org_indices);
939 dfa->org_indices = NULL;
942 /* Create initial states for all contexts. */
945 create_initial_state (re_dfa_t *dfa)
949 re_node_set init_nodes;
951 /* Initial states have the epsilon closure of the node which is
952 the first node of the regular expression. */
953 first = dfa->str_tree->first->node_idx;
954 dfa->init_node = first;
955 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
956 if (BE (err != REG_NOERROR, 0))
959 /* The back-references which are in initial states can epsilon transit,
960 since in this case all of the subexpressions can be null.
961 Then we add epsilon closures of the nodes which are the next nodes of
962 the back-references. */
963 if (dfa->nbackref > 0)
964 for (i = 0; i < init_nodes.nelem; ++i)
966 Idx node_idx = init_nodes.elems[i];
967 re_token_type_t type = dfa->nodes[node_idx].type;
970 if (type != OP_BACK_REF)
972 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
974 re_token_t *clexp_node;
975 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
976 if (clexp_node->type == OP_CLOSE_SUBEXP
977 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
980 if (clexp_idx == init_nodes.nelem)
983 if (type == OP_BACK_REF)
985 Idx dest_idx = dfa->edests[node_idx].elems[0];
986 if (!re_node_set_contains (&init_nodes, dest_idx))
988 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
994 /* It must be the first time to invoke acquire_state. */
995 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
996 /* We don't check ERR here, since the initial state must not be NULL. */
997 if (BE (dfa->init_state == NULL, 0))
999 if (dfa->init_state->has_constraint)
1001 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1003 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1005 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1009 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1010 || dfa->init_state_begbuf == NULL, 0))
1014 dfa->init_state_word = dfa->init_state_nl
1015 = dfa->init_state_begbuf = dfa->init_state;
1017 re_node_set_free (&init_nodes);
1021 #ifdef RE_ENABLE_I18N
1022 /* If it is possible to do searching in single byte encoding instead of UTF-8
1023 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1024 DFA nodes where needed. */
1027 optimize_utf8 (re_dfa_t *dfa)
1031 bool mb_chars = false;
1032 bool has_period = false;
1034 for (node = 0; node < dfa->nodes_len; ++node)
1035 switch (dfa->nodes[node].type)
1038 if (dfa->nodes[node].opr.c >= 0x80)
1042 switch (dfa->nodes[node].opr.idx)
1050 /* Word anchors etc. cannot be handled. */
1060 case OP_DUP_ASTERISK:
1061 case OP_OPEN_SUBEXP:
1062 case OP_CLOSE_SUBEXP:
1064 case COMPLEX_BRACKET:
1066 case SIMPLE_BRACKET:
1067 /* Just double check. */
1070 (SBC_MAX / 2 % BITSET_WORD_BITS == 0
1072 : BITSET_WORD_BITS - SBC_MAX / 2 % BITSET_WORD_BITS);
1073 for (i = SBC_MAX / 2 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1075 if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1085 if (mb_chars || has_period)
1086 for (node = 0; node < dfa->nodes_len; ++node)
1088 if (dfa->nodes[node].type == CHARACTER
1089 && dfa->nodes[node].opr.c >= 0x80)
1090 dfa->nodes[node].mb_partial = 0;
1091 else if (dfa->nodes[node].type == OP_PERIOD)
1092 dfa->nodes[node].type = OP_UTF8_PERIOD;
1095 /* The search can be in single byte locale. */
1096 dfa->mb_cur_max = 1;
1098 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1102 /* Analyze the structure tree, and calculate "first", "next", "edest",
1103 "eclosure", and "inveclosure". */
1105 static reg_errcode_t
1106 analyze (regex_t *preg)
1108 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
1111 /* Allocate arrays. */
1112 dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1113 dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1114 dfa->edests = re_xmalloc (re_node_set, dfa->nodes_alloc);
1115 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1116 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1117 || dfa->eclosures == NULL, 0))
1120 dfa->subexp_map = re_xmalloc (Idx, preg->re_nsub);
1121 if (dfa->subexp_map != NULL)
1124 for (i = 0; i < preg->re_nsub; i++)
1125 dfa->subexp_map[i] = i;
1126 preorder (dfa->str_tree, optimize_subexps, dfa);
1127 for (i = 0; i < preg->re_nsub; i++)
1128 if (dfa->subexp_map[i] != i)
1130 if (i == preg->re_nsub)
1132 free (dfa->subexp_map);
1133 dfa->subexp_map = NULL;
1137 ret = postorder (dfa->str_tree, lower_subexps, preg);
1138 if (BE (ret != REG_NOERROR, 0))
1140 ret = postorder (dfa->str_tree, calc_first, dfa);
1141 if (BE (ret != REG_NOERROR, 0))
1143 preorder (dfa->str_tree, calc_next, dfa);
1144 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1145 if (BE (ret != REG_NOERROR, 0))
1147 ret = calc_eclosure (dfa);
1148 if (BE (ret != REG_NOERROR, 0))
1151 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1152 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1153 if ((!preg->re_no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1156 dfa->inveclosures = re_xmalloc (re_node_set, dfa->nodes_len);
1157 if (BE (dfa->inveclosures == NULL, 0))
1159 ret = calc_inveclosure (dfa);
1165 /* Our parse trees are very unbalanced, so we cannot use a stack to
1166 implement parse tree visits. Instead, we use parent pointers and
1167 some hairy code in these two functions. */
1168 static reg_errcode_t
1169 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1172 bin_tree_t *node, *prev;
1174 for (node = root; ; )
1176 /* Descend down the tree, preferably to the left (or to the right
1177 if that's the only child). */
1178 while (node->left || node->right)
1186 reg_errcode_t err = fn (extra, node);
1187 if (BE (err != REG_NOERROR, 0))
1189 if (node->parent == NULL)
1192 node = node->parent;
1194 /* Go up while we have a node that is reached from the right. */
1195 while (node->right == prev || node->right == NULL);
1200 static reg_errcode_t
1201 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1206 for (node = root; ; )
1208 reg_errcode_t err = fn (extra, node);
1209 if (BE (err != REG_NOERROR, 0))
1212 /* Go to the left node, or up and to the right. */
1217 bin_tree_t *prev = NULL;
1218 while (node->right == prev || node->right == NULL)
1221 node = node->parent;
1230 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1231 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1232 backreferences as well. Requires a preorder visit. */
1233 static reg_errcode_t
1234 optimize_subexps (void *extra, bin_tree_t *node)
1236 re_dfa_t *dfa = (re_dfa_t *) extra;
1238 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1240 int idx = node->token.opr.idx;
1241 node->token.opr.idx = dfa->subexp_map[idx];
1242 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1245 else if (node->token.type == SUBEXP
1246 && node->left && node->left->token.type == SUBEXP)
1248 Idx other_idx = node->left->token.opr.idx;
1250 node->left = node->left->left;
1252 node->left->parent = node;
1254 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1255 if (other_idx < BITSET_WORD_BITS)
1256 dfa->used_bkref_map &= ~ ((bitset_word) 1 << other_idx);
1262 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1263 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1264 static reg_errcode_t
1265 lower_subexps (void *extra, bin_tree_t *node)
1267 regex_t *preg = (regex_t *) extra;
1268 reg_errcode_t err = REG_NOERROR;
1270 if (node->left && node->left->token.type == SUBEXP)
1272 node->left = lower_subexp (&err, preg, node->left);
1274 node->left->parent = node;
1276 if (node->right && node->right->token.type == SUBEXP)
1278 node->right = lower_subexp (&err, preg, node->right);
1280 node->right->parent = node;
1287 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1289 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
1290 bin_tree_t *body = node->left;
1291 bin_tree_t *op, *cls, *tree1, *tree;
1294 /* We do not optimize empty subexpressions, because otherwise we may
1295 have bad CONCAT nodes with NULL children. This is obviously not
1296 very common, so we do not lose much. An example that triggers
1297 this case is the sed "script" /\(\)/x. */
1298 && node->left != NULL
1299 && ! (node->token.opr.idx < BITSET_WORD_BITS
1300 && dfa->used_bkref_map & ((bitset_word) 1 << node->token.opr.idx)))
1303 /* Convert the SUBEXP node to the concatenation of an
1304 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1305 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1306 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1307 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1308 tree = create_tree (dfa, op, tree1, CONCAT);
1309 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1315 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1316 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1320 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1321 nodes. Requires a postorder visit. */
1322 static reg_errcode_t
1323 calc_first (void *extra, bin_tree_t *node)
1325 re_dfa_t *dfa = (re_dfa_t *) extra;
1326 if (node->token.type == CONCAT)
1328 node->first = node->left->first;
1329 node->node_idx = node->left->node_idx;
1334 node->node_idx = re_dfa_add_node (dfa, node->token);
1335 if (BE (node->node_idx == REG_MISSING, 0))
1341 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1342 static reg_errcode_t
1343 calc_next (void *extra, bin_tree_t *node)
1345 switch (node->token.type)
1347 case OP_DUP_ASTERISK:
1348 node->left->next = node;
1351 node->left->next = node->right->first;
1352 node->right->next = node->next;
1356 node->left->next = node->next;
1358 node->right->next = node->next;
1364 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1365 static reg_errcode_t
1366 link_nfa_nodes (void *extra, bin_tree_t *node)
1368 re_dfa_t *dfa = (re_dfa_t *) extra;
1369 Idx idx = node->node_idx;
1370 reg_errcode_t err = REG_NOERROR;
1372 switch (node->token.type)
1378 assert (node->next == NULL);
1381 case OP_DUP_ASTERISK:
1385 dfa->has_plural_match = 1;
1386 if (node->left != NULL)
1387 left = node->left->first->node_idx;
1389 left = node->next->node_idx;
1390 if (node->right != NULL)
1391 right = node->right->first->node_idx;
1393 right = node->next->node_idx;
1394 assert (REG_VALID_INDEX (left));
1395 assert (REG_VALID_INDEX (right));
1396 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1401 case OP_OPEN_SUBEXP:
1402 case OP_CLOSE_SUBEXP:
1403 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1407 dfa->nexts[idx] = node->next->node_idx;
1408 if (node->token.type == OP_BACK_REF)
1409 re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1413 assert (!IS_EPSILON_NODE (node->token.type));
1414 dfa->nexts[idx] = node->next->node_idx;
1421 /* Duplicate the epsilon closure of the node ROOT_NODE.
1422 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1423 to their own constraint. */
1425 static reg_errcode_t
1426 duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node,
1427 Idx top_clone_node, Idx root_node,
1428 unsigned int init_constraint)
1430 Idx org_node, clone_node;
1432 unsigned int constraint = init_constraint;
1433 for (org_node = top_org_node, clone_node = top_clone_node;;)
1435 Idx org_dest, clone_dest;
1436 if (dfa->nodes[org_node].type == OP_BACK_REF)
1438 /* If the back reference epsilon-transit, its destination must
1439 also have the constraint. Then duplicate the epsilon closure
1440 of the destination of the back reference, and store it in
1441 edests of the back reference. */
1442 org_dest = dfa->nexts[org_node];
1443 re_node_set_empty (dfa->edests + clone_node);
1444 clone_dest = duplicate_node (dfa, org_dest, constraint);
1445 if (BE (clone_dest == REG_MISSING, 0))
1447 dfa->nexts[clone_node] = dfa->nexts[org_node];
1448 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1452 else if (dfa->edests[org_node].nelem == 0)
1454 /* In case of the node can't epsilon-transit, don't duplicate the
1455 destination and store the original destination as the
1456 destination of the node. */
1457 dfa->nexts[clone_node] = dfa->nexts[org_node];
1460 else if (dfa->edests[org_node].nelem == 1)
1462 /* In case of the node can epsilon-transit, and it has only one
1464 org_dest = dfa->edests[org_node].elems[0];
1465 re_node_set_empty (dfa->edests + clone_node);
1466 if (dfa->nodes[org_node].type == ANCHOR)
1468 /* In case of the node has another constraint, append it. */
1469 if (org_node == root_node && clone_node != org_node)
1471 /* ...but if the node is root_node itself, it means the
1472 epsilon closure have a loop, then tie it to the
1473 destination of the root_node. */
1474 ok = re_node_set_insert (dfa->edests + clone_node,
1480 constraint |= dfa->nodes[org_node].opr.ctx_type;
1482 clone_dest = duplicate_node (dfa, org_dest, constraint);
1483 if (BE (clone_dest == REG_MISSING, 0))
1485 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1489 else /* dfa->edests[org_node].nelem == 2 */
1491 /* In case of the node can epsilon-transit, and it has two
1492 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1493 org_dest = dfa->edests[org_node].elems[0];
1494 re_node_set_empty (dfa->edests + clone_node);
1495 /* Search for a duplicated node which satisfies the constraint. */
1496 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1497 if (clone_dest == REG_MISSING)
1499 /* There are no such a duplicated node, create a new one. */
1501 clone_dest = duplicate_node (dfa, org_dest, constraint);
1502 if (BE (clone_dest == REG_MISSING, 0))
1504 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1507 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1508 root_node, constraint);
1509 if (BE (err != REG_NOERROR, 0))
1514 /* There are a duplicated node which satisfy the constraint,
1515 use it to avoid infinite loop. */
1516 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1521 org_dest = dfa->edests[org_node].elems[1];
1522 clone_dest = duplicate_node (dfa, org_dest, constraint);
1523 if (BE (clone_dest == REG_MISSING, 0))
1525 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1529 org_node = org_dest;
1530 clone_node = clone_dest;
1535 /* Search for a node which is duplicated from the node ORG_NODE, and
1536 satisfies the constraint CONSTRAINT. */
1539 search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1540 unsigned int constraint)
1543 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1545 if (org_node == dfa->org_indices[idx]
1546 && constraint == dfa->nodes[idx].constraint)
1547 return idx; /* Found. */
1549 return REG_MISSING; /* Not found. */
1552 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1553 Return the index of the new node, or REG_MISSING if insufficient storage is
1557 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1559 Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1560 if (BE (dup_idx != REG_MISSING, 1))
1562 dfa->nodes[dup_idx].constraint = constraint;
1563 if (dfa->nodes[org_idx].type == ANCHOR)
1564 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
1565 dfa->nodes[dup_idx].duplicated = 1;
1567 /* Store the index of the original node. */
1568 dfa->org_indices[dup_idx] = org_idx;
1573 static reg_errcode_t
1574 calc_inveclosure (re_dfa_t *dfa)
1578 for (idx = 0; idx < dfa->nodes_len; ++idx)
1579 re_node_set_init_empty (dfa->inveclosures + idx);
1581 for (src = 0; src < dfa->nodes_len; ++src)
1583 Idx *elems = dfa->eclosures[src].elems;
1584 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1586 ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1595 /* Calculate "eclosure" for all the node in DFA. */
1597 static reg_errcode_t
1598 calc_eclosure (re_dfa_t *dfa)
1603 assert (dfa->nodes_len > 0);
1606 /* For each nodes, calculate epsilon closure. */
1607 for (node_idx = 0; ; ++node_idx)
1610 re_node_set eclosure_elem;
1611 if (node_idx == dfa->nodes_len)
1620 assert (dfa->eclosures[node_idx].nelem != REG_MISSING);
1623 /* If we have already calculated, skip it. */
1624 if (dfa->eclosures[node_idx].nelem != 0)
1626 /* Calculate epsilon closure of `node_idx'. */
1627 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1628 if (BE (err != REG_NOERROR, 0))
1631 if (dfa->eclosures[node_idx].nelem == 0)
1634 re_node_set_free (&eclosure_elem);
1640 /* Calculate epsilon closure of NODE. */
1642 static reg_errcode_t
1643 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1646 unsigned int constraint;
1650 re_node_set eclosure;
1652 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1653 if (BE (err != REG_NOERROR, 0))
1656 /* This indicates that we are calculating this node now.
1657 We reference this value to avoid infinite loop. */
1658 dfa->eclosures[node].nelem = REG_MISSING;
1660 constraint = ((dfa->nodes[node].type == ANCHOR)
1661 ? dfa->nodes[node].opr.ctx_type : 0);
1662 /* If the current node has constraints, duplicate all nodes.
1663 Since they must inherit the constraints. */
1665 && dfa->edests[node].nelem
1666 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1668 Idx org_node, cur_node;
1669 org_node = cur_node = node;
1670 err = duplicate_node_closure (dfa, node, node, node, constraint);
1671 if (BE (err != REG_NOERROR, 0))
1675 /* Expand each epsilon destination nodes. */
1676 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1677 for (i = 0; i < dfa->edests[node].nelem; ++i)
1679 re_node_set eclosure_elem;
1680 Idx edest = dfa->edests[node].elems[i];
1681 /* If calculating the epsilon closure of `edest' is in progress,
1682 return intermediate result. */
1683 if (dfa->eclosures[edest].nelem == REG_MISSING)
1688 /* If we haven't calculated the epsilon closure of `edest' yet,
1689 calculate now. Otherwise use calculated epsilon closure. */
1690 if (dfa->eclosures[edest].nelem == 0)
1692 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1693 if (BE (err != REG_NOERROR, 0))
1697 eclosure_elem = dfa->eclosures[edest];
1698 /* Merge the epsilon closure of `edest'. */
1699 re_node_set_merge (&eclosure, &eclosure_elem);
1700 /* If the epsilon closure of `edest' is incomplete,
1701 the epsilon closure of this node is also incomplete. */
1702 if (dfa->eclosures[edest].nelem == 0)
1705 re_node_set_free (&eclosure_elem);
1709 /* Epsilon closures include itself. */
1710 ok = re_node_set_insert (&eclosure, node);
1713 if (incomplete && !root)
1714 dfa->eclosures[node].nelem = 0;
1716 dfa->eclosures[node] = eclosure;
1717 *new_set = eclosure;
1721 /* Functions for token which are used in the parser. */
1723 /* Fetch a token from INPUT.
1724 We must not use this function inside bracket expressions. */
1727 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1729 re_string_skip_bytes (input, peek_token (result, input, syntax));
1732 /* Peek a token from INPUT, and return the length of the token.
1733 We must not use this function inside bracket expressions. */
1736 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1740 if (re_string_eoi (input))
1742 token->type = END_OF_RE;
1746 c = re_string_peek_byte (input, 0);
1749 token->word_char = 0;
1750 #ifdef RE_ENABLE_I18N
1751 token->mb_partial = 0;
1752 if (input->mb_cur_max > 1 &&
1753 !re_string_first_byte (input, re_string_cur_idx (input)))
1755 token->type = CHARACTER;
1756 token->mb_partial = 1;
1763 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1765 token->type = BACK_SLASH;
1769 c2 = re_string_peek_byte_case (input, 1);
1771 token->type = CHARACTER;
1772 #ifdef RE_ENABLE_I18N
1773 if (input->mb_cur_max > 1)
1775 wint_t wc = re_string_wchar_at (input,
1776 re_string_cur_idx (input) + 1);
1777 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1781 token->word_char = IS_WORD_CHAR (c2) != 0;
1786 if (!(syntax & REG_LIMITED_OPS) && !(syntax & REG_NO_BK_VBAR))
1787 token->type = OP_ALT;
1789 case '1': case '2': case '3': case '4': case '5':
1790 case '6': case '7': case '8': case '9':
1791 if (!(syntax & REG_NO_BK_REFS))
1793 token->type = OP_BACK_REF;
1794 token->opr.idx = c2 - '1';
1798 if (!(syntax & REG_NO_GNU_OPS))
1800 token->type = ANCHOR;
1801 token->opr.ctx_type = WORD_FIRST;
1805 if (!(syntax & REG_NO_GNU_OPS))
1807 token->type = ANCHOR;
1808 token->opr.ctx_type = WORD_LAST;
1812 if (!(syntax & REG_NO_GNU_OPS))
1814 token->type = ANCHOR;
1815 token->opr.ctx_type = WORD_DELIM;
1819 if (!(syntax & REG_NO_GNU_OPS))
1821 token->type = ANCHOR;
1822 token->opr.ctx_type = NOT_WORD_DELIM;
1826 if (!(syntax & REG_NO_GNU_OPS))
1827 token->type = OP_WORD;
1830 if (!(syntax & REG_NO_GNU_OPS))
1831 token->type = OP_NOTWORD;
1834 if (!(syntax & REG_NO_GNU_OPS))
1835 token->type = OP_SPACE;
1838 if (!(syntax & REG_NO_GNU_OPS))
1839 token->type = OP_NOTSPACE;
1842 if (!(syntax & REG_NO_GNU_OPS))
1844 token->type = ANCHOR;
1845 token->opr.ctx_type = BUF_FIRST;
1849 if (!(syntax & REG_NO_GNU_OPS))
1851 token->type = ANCHOR;
1852 token->opr.ctx_type = BUF_LAST;
1856 if (!(syntax & REG_NO_BK_PARENS))
1857 token->type = OP_OPEN_SUBEXP;
1860 if (!(syntax & REG_NO_BK_PARENS))
1861 token->type = OP_CLOSE_SUBEXP;
1864 if (!(syntax & REG_LIMITED_OPS) && (syntax & REG_BK_PLUS_QM))
1865 token->type = OP_DUP_PLUS;
1868 if (!(syntax & REG_LIMITED_OPS) && (syntax & REG_BK_PLUS_QM))
1869 token->type = OP_DUP_QUESTION;
1872 if ((syntax & REG_INTERVALS) && (!(syntax & REG_NO_BK_BRACES)))
1873 token->type = OP_OPEN_DUP_NUM;
1876 if ((syntax & REG_INTERVALS) && (!(syntax & REG_NO_BK_BRACES)))
1877 token->type = OP_CLOSE_DUP_NUM;
1885 token->type = CHARACTER;
1886 #ifdef RE_ENABLE_I18N
1887 if (input->mb_cur_max > 1)
1889 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1890 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1894 token->word_char = IS_WORD_CHAR (token->opr.c);
1899 if (syntax & REG_NEWLINE_ALT)
1900 token->type = OP_ALT;
1903 if (!(syntax & REG_LIMITED_OPS) && (syntax & REG_NO_BK_VBAR))
1904 token->type = OP_ALT;
1907 token->type = OP_DUP_ASTERISK;
1910 if (!(syntax & REG_LIMITED_OPS) && !(syntax & REG_BK_PLUS_QM))
1911 token->type = OP_DUP_PLUS;
1914 if (!(syntax & REG_LIMITED_OPS) && !(syntax & REG_BK_PLUS_QM))
1915 token->type = OP_DUP_QUESTION;
1918 if ((syntax & REG_INTERVALS) && (syntax & REG_NO_BK_BRACES))
1919 token->type = OP_OPEN_DUP_NUM;
1922 if ((syntax & REG_INTERVALS) && (syntax & REG_NO_BK_BRACES))
1923 token->type = OP_CLOSE_DUP_NUM;
1926 if (syntax & REG_NO_BK_PARENS)
1927 token->type = OP_OPEN_SUBEXP;
1930 if (syntax & REG_NO_BK_PARENS)
1931 token->type = OP_CLOSE_SUBEXP;
1934 token->type = OP_OPEN_BRACKET;
1937 token->type = OP_PERIOD;
1940 if (!(syntax & (REG_CONTEXT_INDEP_ANCHORS | REG_CARET_ANCHORS_HERE)) &&
1941 re_string_cur_idx (input) != 0)
1943 char prev = re_string_peek_byte (input, -1);
1944 if (!(syntax & REG_NEWLINE_ALT) || prev != '\n')
1947 token->type = ANCHOR;
1948 token->opr.ctx_type = LINE_FIRST;
1951 if (!(syntax & REG_CONTEXT_INDEP_ANCHORS) &&
1952 re_string_cur_idx (input) + 1 != re_string_length (input))
1955 re_string_skip_bytes (input, 1);
1956 peek_token (&next, input, syntax);
1957 re_string_skip_bytes (input, -1);
1958 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1961 token->type = ANCHOR;
1962 token->opr.ctx_type = LINE_LAST;
1970 /* Peek a token from INPUT, and return the length of the token.
1971 We must not use this function out of bracket expressions. */
1974 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1977 if (re_string_eoi (input))
1979 token->type = END_OF_RE;
1982 c = re_string_peek_byte (input, 0);
1985 #ifdef RE_ENABLE_I18N
1986 if (input->mb_cur_max > 1 &&
1987 !re_string_first_byte (input, re_string_cur_idx (input)))
1989 token->type = CHARACTER;
1992 #endif /* RE_ENABLE_I18N */
1994 if (c == '\\' && (syntax & REG_BACKSLASH_ESCAPE_IN_LISTS)
1995 && re_string_cur_idx (input) + 1 < re_string_length (input))
1997 /* In this case, '\' escape a character. */
1999 re_string_skip_bytes (input, 1);
2000 c2 = re_string_peek_byte (input, 0);
2002 token->type = CHARACTER;
2005 if (c == '[') /* '[' is a special char in a bracket exps. */
2009 if (re_string_cur_idx (input) + 1 < re_string_length (input))
2010 c2 = re_string_peek_byte (input, 1);
2018 token->type = OP_OPEN_COLL_ELEM;
2021 token->type = OP_OPEN_EQUIV_CLASS;
2024 if (syntax & REG_CHAR_CLASSES)
2026 token->type = OP_OPEN_CHAR_CLASS;
2029 /* else fall through. */
2031 token->type = CHARACTER;
2041 token->type = OP_CHARSET_RANGE;
2044 token->type = OP_CLOSE_BRACKET;
2047 token->type = OP_NON_MATCH_LIST;
2050 token->type = CHARACTER;
2055 /* Functions for parser. */
2057 /* Entry point of the parser.
2058 Parse the regular expression REGEXP and return the structure tree.
2059 If an error is occured, ERR is set by error code, and return NULL.
2060 This function build the following tree, from regular expression <reg_exp>:
2066 CAT means concatenation.
2067 EOR means end of regular expression. */
2070 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2073 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
2074 bin_tree_t *tree, *eor, *root;
2075 re_token_t current_token;
2076 dfa->syntax = syntax;
2077 fetch_token (¤t_token, regexp, syntax | REG_CARET_ANCHORS_HERE);
2078 tree = parse_reg_exp (regexp, preg, ¤t_token, syntax, 0, err);
2079 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2081 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2083 root = create_tree (dfa, tree, eor, CONCAT);
2086 if (BE (eor == NULL || root == NULL, 0))
2094 /* This function build the following tree, from regular expression
2095 <branch1>|<branch2>:
2101 ALT means alternative, which represents the operator `|'. */
2104 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2105 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2107 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
2108 bin_tree_t *tree, *branch = NULL;
2109 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2110 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2113 while (token->type == OP_ALT)
2115 fetch_token (token, regexp, syntax | REG_CARET_ANCHORS_HERE);
2116 if (token->type != OP_ALT && token->type != END_OF_RE
2117 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2119 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2120 if (BE (*err != REG_NOERROR && branch == NULL, 0))
2125 tree = create_tree (dfa, tree, branch, OP_ALT);
2126 if (BE (tree == NULL, 0))
2135 /* This function build the following tree, from regular expression
2142 CAT means concatenation. */
2145 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2146 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2148 bin_tree_t *tree, *exp;
2149 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
2150 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2151 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2154 while (token->type != OP_ALT && token->type != END_OF_RE
2155 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2157 exp = parse_expression (regexp, preg, token, syntax, nest, err);
2158 if (BE (*err != REG_NOERROR && exp == NULL, 0))
2162 if (tree != NULL && exp != NULL)
2164 tree = create_tree (dfa, tree, exp, CONCAT);
2171 else if (tree == NULL)
2173 /* Otherwise exp == NULL, we don't need to create new tree. */
2178 /* This function build the following tree, from regular expression a*:
2185 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2186 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2188 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
2190 switch (token->type)
2193 tree = create_token_tree (dfa, NULL, NULL, token);
2194 if (BE (tree == NULL, 0))
2199 #ifdef RE_ENABLE_I18N
2200 if (dfa->mb_cur_max > 1)
2202 while (!re_string_eoi (regexp)
2203 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2205 bin_tree_t *mbc_remain;
2206 fetch_token (token, regexp, syntax);
2207 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2208 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2209 if (BE (mbc_remain == NULL || tree == NULL, 0))
2218 case OP_OPEN_SUBEXP:
2219 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2220 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2223 case OP_OPEN_BRACKET:
2224 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2225 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2229 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2234 dfa->used_bkref_map |= 1 << token->opr.idx;
2235 tree = create_token_tree (dfa, NULL, NULL, token);
2236 if (BE (tree == NULL, 0))
2242 dfa->has_mb_node = 1;
2244 case OP_OPEN_DUP_NUM:
2245 if (syntax & REG_CONTEXT_INVALID_DUP)
2251 case OP_DUP_ASTERISK:
2253 case OP_DUP_QUESTION:
2254 if (syntax & REG_CONTEXT_INVALID_OPS)
2259 else if (syntax & REG_CONTEXT_INDEP_OPS)
2261 fetch_token (token, regexp, syntax);
2262 return parse_expression (regexp, preg, token, syntax, nest, err);
2264 /* else fall through */
2265 case OP_CLOSE_SUBEXP:
2266 if ((token->type == OP_CLOSE_SUBEXP) &&
2267 !(syntax & REG_UNMATCHED_RIGHT_PAREN_ORD))
2272 /* else fall through */
2273 case OP_CLOSE_DUP_NUM:
2274 /* We treat it as a normal character. */
2276 /* Then we can these characters as normal characters. */
2277 token->type = CHARACTER;
2278 /* mb_partial and word_char bits should be initialized already
2280 tree = create_token_tree (dfa, NULL, NULL, token);
2281 if (BE (tree == NULL, 0))
2288 if ((token->opr.ctx_type
2289 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2290 && dfa->word_ops_used == 0)
2291 init_word_char (dfa);
2292 if (token->opr.ctx_type == WORD_DELIM
2293 || token->opr.ctx_type == NOT_WORD_DELIM)
2295 bin_tree_t *tree_first, *tree_last;
2296 if (token->opr.ctx_type == WORD_DELIM)
2298 token->opr.ctx_type = WORD_FIRST;
2299 tree_first = create_token_tree (dfa, NULL, NULL, token);
2300 token->opr.ctx_type = WORD_LAST;
2304 token->opr.ctx_type = INSIDE_WORD;
2305 tree_first = create_token_tree (dfa, NULL, NULL, token);
2306 token->opr.ctx_type = INSIDE_NOTWORD;
2308 tree_last = create_token_tree (dfa, NULL, NULL, token);
2309 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2310 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2318 tree = create_token_tree (dfa, NULL, NULL, token);
2319 if (BE (tree == NULL, 0))
2325 /* We must return here, since ANCHORs can't be followed
2326 by repetition operators.
2327 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2328 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2329 fetch_token (token, regexp, syntax);
2332 tree = create_token_tree (dfa, NULL, NULL, token);
2333 if (BE (tree == NULL, 0))
2338 if (dfa->mb_cur_max > 1)
2339 dfa->has_mb_node = 1;
2343 tree = build_charclass_op (dfa, regexp->trans,
2344 (const unsigned char *) "alnum",
2345 (const unsigned char *) "_",
2346 token->type == OP_NOTWORD, err);
2347 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2352 tree = build_charclass_op (dfa, regexp->trans,
2353 (const unsigned char *) "space",
2354 (const unsigned char *) "",
2355 token->type == OP_NOTSPACE, err);
2356 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2366 /* Must not happen? */
2372 fetch_token (token, regexp, syntax);
2374 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2375 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2377 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2378 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2380 /* In BRE consecutive duplications are not allowed. */
2381 if ((syntax & REG_CONTEXT_INVALID_DUP)
2382 && (token->type == OP_DUP_ASTERISK
2383 || token->type == OP_OPEN_DUP_NUM))
2393 /* This function build the following tree, from regular expression
2401 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2402 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2404 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
2407 cur_nsub = preg->re_nsub++;
2409 fetch_token (token, regexp, syntax | REG_CARET_ANCHORS_HERE);
2411 /* The subexpression may be a null string. */
2412 if (token->type == OP_CLOSE_SUBEXP)
2416 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2417 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2419 if (BE (*err != REG_NOERROR, 0))
2423 if (cur_nsub <= '9' - '1')
2424 dfa->completed_bkref_map |= 1 << cur_nsub;
2426 tree = create_tree (dfa, tree, NULL, SUBEXP);
2427 if (BE (tree == NULL, 0))
2432 tree->token.opr.idx = cur_nsub;
2436 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2439 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2440 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2442 bin_tree_t *tree = NULL, *old_tree = NULL;
2443 Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2444 re_token_t start_token = *token;
2446 if (token->type == OP_OPEN_DUP_NUM)
2449 start = fetch_number (regexp, token, syntax);
2450 if (start == REG_MISSING)
2452 if (token->type == CHARACTER && token->opr.c == ',')
2453 start = 0; /* We treat "{,m}" as "{0,m}". */
2456 *err = REG_BADBR; /* <re>{} is invalid. */
2460 if (BE (start != REG_ERROR, 1))
2462 /* We treat "{n}" as "{n,n}". */
2463 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2464 : ((token->type == CHARACTER && token->opr.c == ',')
2465 ? fetch_number (regexp, token, syntax) : REG_ERROR));
2467 if (BE (start == REG_ERROR || end == REG_ERROR, 0))
2469 /* Invalid sequence. */
2470 if (BE (!(syntax & REG_INVALID_INTERVAL_ORD), 0))
2472 if (token->type == END_OF_RE)
2480 /* If the syntax bit is set, rollback. */
2481 re_string_set_index (regexp, start_idx);
2482 *token = start_token;
2483 token->type = CHARACTER;
2484 /* mb_partial and word_char bits should be already initialized by
2489 if (BE (end != REG_MISSING && start > end, 0))
2491 /* First number greater than second. */
2498 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2499 end = (token->type == OP_DUP_QUESTION) ? 1 : REG_MISSING;
2502 fetch_token (token, regexp, syntax);
2504 if (BE (elem == NULL, 0))
2506 if (BE (start == 0 && end == 0, 0))
2508 postorder (elem, free_tree, NULL);
2512 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2513 if (BE (start > 0, 0))
2516 for (i = 2; i <= start; ++i)
2518 elem = duplicate_tree (elem, dfa);
2519 tree = create_tree (dfa, tree, elem, CONCAT);
2520 if (BE (elem == NULL || tree == NULL, 0))
2521 goto parse_dup_op_espace;
2527 /* Duplicate ELEM before it is marked optional. */
2528 elem = duplicate_tree (elem, dfa);
2534 if (elem->token.type == SUBEXP)
2535 postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2537 tree = create_tree (dfa, elem, NULL,
2538 (end == REG_MISSING ? OP_DUP_ASTERISK : OP_ALT));
2539 if (BE (tree == NULL, 0))
2540 goto parse_dup_op_espace;
2542 /* This loop is actually executed only when end != REG_MISSING,
2543 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2544 already created the start+1-th copy. */
2545 if ((Idx) -1 < 0 || end != REG_MISSING)
2546 for (i = start + 2; i <= end; ++i)
2548 elem = duplicate_tree (elem, dfa);
2549 tree = create_tree (dfa, tree, elem, CONCAT);
2550 if (BE (elem == NULL || tree == NULL, 0))
2551 goto parse_dup_op_espace;
2553 tree = create_tree (dfa, tree, NULL, OP_ALT);
2554 if (BE (tree == NULL, 0))
2555 goto parse_dup_op_espace;
2559 tree = create_tree (dfa, old_tree, tree, CONCAT);
2563 parse_dup_op_espace:
2568 /* Size of the names for collating symbol/equivalence_class/character_class.
2569 I'm not sure, but maybe enough. */
2570 #define BRACKET_NAME_BUF_SIZE 32
2573 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2574 Build the range expression which starts from START_ELEM, and ends
2575 at END_ELEM. The result are written to MBCSET and SBCSET.
2576 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2577 mbcset->range_ends, is a pointer argument sinse we may
2580 static reg_errcode_t
2581 build_range_exp (bitset sbcset,
2582 # ifdef RE_ENABLE_I18N
2583 re_charset_t *mbcset, Idx *range_alloc,
2585 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2587 unsigned int start_ch, end_ch;
2588 /* Equivalence Classes and Character Classes can't be a range start/end. */
2589 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2590 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2594 /* We can handle no multi character collating elements without libc
2596 if (BE ((start_elem->type == COLL_SYM
2597 && strlen ((char *) start_elem->opr.name) > 1)
2598 || (end_elem->type == COLL_SYM
2599 && strlen ((char *) end_elem->opr.name) > 1), 0))
2600 return REG_ECOLLATE;
2602 # ifdef RE_ENABLE_I18N
2605 wint_t start_wc, end_wc;
2606 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2608 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2609 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2611 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2612 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2614 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2615 ? __btowc (start_ch) : start_elem->opr.wch);
2616 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2617 ? __btowc (end_ch) : end_elem->opr.wch);
2618 if (start_wc == WEOF || end_wc == WEOF)
2619 return REG_ECOLLATE;
2620 cmp_buf[0] = start_wc;
2621 cmp_buf[4] = end_wc;
2622 if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2625 /* Got valid collation sequence values, add them as a new entry.
2626 However, for !_LIBC we have no collation elements: if the
2627 character set is single byte, the single byte character set
2628 that we build below suffices. parse_bracket_exp passes
2629 no MBCSET if dfa->mb_cur_max == 1. */
2632 /* Check the space of the arrays. */
2633 if (BE (*range_alloc == mbcset->nranges, 0))
2635 /* There is not enough space, need realloc. */
2636 wchar_t *new_array_start, *new_array_end;
2639 new_nranges = mbcset->nranges;
2640 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2641 are NULL if *range_alloc == 0. */
2642 new_array_start = re_x2realloc (mbcset->range_starts, wchar_t,
2644 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2647 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2650 mbcset->range_starts = new_array_start;
2651 mbcset->range_ends = new_array_end;
2652 *range_alloc = new_nranges;
2655 mbcset->range_starts[mbcset->nranges] = start_wc;
2656 mbcset->range_ends[mbcset->nranges++] = end_wc;
2659 /* Build the table for single byte characters. */
2660 for (wc = 0; wc < SBC_MAX; ++wc)
2663 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2664 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2665 bitset_set (sbcset, wc);
2668 # else /* not RE_ENABLE_I18N */
2671 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2672 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2674 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2675 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2677 if (start_ch > end_ch)
2679 /* Build the table for single byte characters. */
2680 for (ch = 0; ch < SBC_MAX; ++ch)
2681 if (start_ch <= ch && ch <= end_ch)
2682 bitset_set (sbcset, ch);
2684 # endif /* not RE_ENABLE_I18N */
2687 #endif /* not _LIBC */
2690 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2691 Build the collating element which is represented by NAME.
2692 The result are written to MBCSET and SBCSET.
2693 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2694 pointer argument since we may update it. */
2696 static reg_errcode_t
2697 build_collating_symbol (bitset sbcset,
2698 # ifdef RE_ENABLE_I18N
2699 re_charset_t *mbcset, Idx *coll_sym_alloc,
2701 const unsigned char *name)
2703 size_t name_len = strlen ((const char *) name);
2704 if (BE (name_len != 1, 0))
2705 return REG_ECOLLATE;
2708 bitset_set (sbcset, name[0]);
2712 #endif /* not _LIBC */
2714 /* This function parse bracket expression like "[abc]", "[a-c]",
2718 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2719 reg_syntax_t syntax, reg_errcode_t *err)
2722 const unsigned char *collseqmb;
2723 const char *collseqwc;
2726 const int32_t *symb_table;
2727 const unsigned char *extra;
2729 /* Local function for parse_bracket_exp used in _LIBC environement.
2730 Seek the collating symbol entry correspondings to NAME.
2731 Return the index of the symbol in the SYMB_TABLE. */
2734 __attribute ((always_inline))
2735 seek_collating_symbol_entry (const unsigned char *name, size_t name_len)
2737 int32_t hash = elem_hash ((const char *) name, name_len);
2738 int32_t elem = hash % table_size;
2739 int32_t second = hash % (table_size - 2);
2740 while (symb_table[2 * elem] != 0)
2742 /* First compare the hashing value. */
2743 if (symb_table[2 * elem] == hash
2744 /* Compare the length of the name. */
2745 && name_len == extra[symb_table[2 * elem + 1]]
2746 /* Compare the name. */
2747 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2750 /* Yep, this is the entry. */
2760 /* Local function for parse_bracket_exp used in _LIBC environement.
2761 Look up the collation sequence value of BR_ELEM.
2762 Return the value if succeeded, UINT_MAX otherwise. */
2764 auto inline unsigned int
2765 __attribute ((always_inline))
2766 lookup_collation_sequence_value (bracket_elem_t *br_elem)
2768 if (br_elem->type == SB_CHAR)
2771 if (MB_CUR_MAX == 1)
2774 return collseqmb[br_elem->opr.ch];
2777 wint_t wc = __btowc (br_elem->opr.ch);
2778 return __collseq_table_lookup (collseqwc, wc);
2781 else if (br_elem->type == MB_CHAR)
2783 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2785 else if (br_elem->type == COLL_SYM)
2787 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2791 elem = seek_collating_symbol_entry (br_elem->opr.name,
2793 if (symb_table[2 * elem] != 0)
2795 /* We found the entry. */
2796 idx = symb_table[2 * elem + 1];
2797 /* Skip the name of collating element name. */
2798 idx += 1 + extra[idx];
2799 /* Skip the byte sequence of the collating element. */
2800 idx += 1 + extra[idx];
2801 /* Adjust for the alignment. */
2802 idx = (idx + 3) & ~3;
2803 /* Skip the multibyte collation sequence value. */
2804 idx += sizeof (unsigned int);
2805 /* Skip the wide char sequence of the collating element. */
2806 idx += sizeof (unsigned int) *
2807 (1 + *(unsigned int *) (extra + idx));
2808 /* Return the collation sequence value. */
2809 return *(unsigned int *) (extra + idx);
2811 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2813 /* No valid character. Match it as a single byte
2815 return collseqmb[br_elem->opr.name[0]];
2818 else if (sym_name_len == 1)
2819 return collseqmb[br_elem->opr.name[0]];
2824 /* Local function for parse_bracket_exp used in _LIBC environement.
2825 Build the range expression which starts from START_ELEM, and ends
2826 at END_ELEM. The result are written to MBCSET and SBCSET.
2827 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2828 mbcset->range_ends, is a pointer argument sinse we may
2831 auto inline reg_errcode_t
2832 __attribute ((always_inline))
2833 build_range_exp (bitset sbcset, re_charset_t *mbcset,
2835 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2838 uint32_t start_collseq;
2839 uint32_t end_collseq;
2841 /* Equivalence Classes and Character Classes can't be a range
2843 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2844 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2848 start_collseq = lookup_collation_sequence_value (start_elem);
2849 end_collseq = lookup_collation_sequence_value (end_elem);
2850 /* Check start/end collation sequence values. */
2851 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2852 return REG_ECOLLATE;
2853 if (BE ((syntax & REG_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2856 /* Got valid collation sequence values, add them as a new entry.
2857 However, if we have no collation elements, and the character set
2858 is single byte, the single byte character set that we
2859 build below suffices. */
2860 if (nrules > 0 || dfa->mb_cur_max > 1)
2862 /* Check the space of the arrays. */
2863 if (BE (*range_alloc == mbcset->nranges, 0))
2865 /* There is not enough space, need realloc. */
2866 uint32_t *new_array_start;
2867 uint32_t *new_array_end;
2870 new_nranges = mbcset->nranges;
2871 new_array_start = re_x2realloc (mbcset->range_starts, uint32_t,
2873 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2876 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2879 mbcset->range_starts = new_array_start;
2880 mbcset->range_ends = new_array_end;
2881 *range_alloc = new_nranges;
2884 mbcset->range_starts[mbcset->nranges] = start_collseq;
2885 mbcset->range_ends[mbcset->nranges++] = end_collseq;
2888 /* Build the table for single byte characters. */
2889 for (ch = 0; ch < SBC_MAX; ch++)
2891 uint32_t ch_collseq;
2893 if (MB_CUR_MAX == 1)
2896 ch_collseq = collseqmb[ch];
2898 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2899 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2900 bitset_set (sbcset, ch);
2905 /* Local function for parse_bracket_exp used in _LIBC environement.
2906 Build the collating element which is represented by NAME.
2907 The result are written to MBCSET and SBCSET.
2908 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2909 pointer argument sinse we may update it. */
2911 auto inline reg_errcode_t
2912 __attribute ((always_inline))
2913 build_collating_symbol (bitset sbcset, re_charset_t *mbcset,
2914 Idx *coll_sym_alloc, const unsigned char *name)
2917 size_t name_len = strlen ((const char *) name);
2920 elem = seek_collating_symbol_entry (name, name_len);
2921 if (symb_table[2 * elem] != 0)
2923 /* We found the entry. */
2924 idx = symb_table[2 * elem + 1];
2925 /* Skip the name of collating element name. */
2926 idx += 1 + extra[idx];
2928 else if (symb_table[2 * elem] == 0 && name_len == 1)
2930 /* No valid character, treat it as a normal
2932 bitset_set (sbcset, name[0]);
2936 return REG_ECOLLATE;
2938 /* Got valid collation sequence, add it as a new entry. */
2939 /* Check the space of the arrays. */
2940 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
2942 /* Not enough, realloc it. */
2943 Idx new_coll_sym_alloc = mbcset->ncoll_syms;
2944 /* Use realloc since mbcset->coll_syms is NULL
2946 int32_t *new_coll_syms = re_x2realloc (mbcset->coll_syms, int32_t,
2947 &new_coll_sym_alloc);
2948 if (BE (new_coll_syms == NULL, 0))
2950 mbcset->coll_syms = new_coll_syms;
2951 *coll_sym_alloc = new_coll_sym_alloc;
2953 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
2958 if (BE (name_len != 1, 0))
2959 return REG_ECOLLATE;
2962 bitset_set (sbcset, name[0]);
2969 re_token_t br_token;
2970 re_bitset_ptr_t sbcset;
2971 #ifdef RE_ENABLE_I18N
2972 re_charset_t *mbcset;
2973 Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
2974 Idx equiv_class_alloc = 0, char_class_alloc = 0;
2975 #endif /* not RE_ENABLE_I18N */
2976 bool non_match = false;
2977 bin_tree_t *work_tree;
2979 bool first_round = true;
2981 collseqmb = (const unsigned char *)
2982 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
2983 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
2989 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
2990 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
2991 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
2992 _NL_COLLATE_SYMB_TABLEMB);
2993 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
2994 _NL_COLLATE_SYMB_EXTRAMB);
2997 sbcset = re_calloc (bitset_word, BITSET_WORDS);
2998 #ifdef RE_ENABLE_I18N
2999 mbcset = re_calloc (re_charset_t, 1);
3000 #endif /* RE_ENABLE_I18N */
3001 #ifdef RE_ENABLE_I18N
3002 if (BE (sbcset == NULL || mbcset == NULL, 0))
3004 if (BE (sbcset == NULL, 0))
3005 #endif /* RE_ENABLE_I18N */
3011 token_len = peek_token_bracket (token, regexp, syntax);
3012 if (BE (token->type == END_OF_RE, 0))
3015 goto parse_bracket_exp_free_return;
3017 if (token->type == OP_NON_MATCH_LIST)
3019 #ifdef RE_ENABLE_I18N
3020 mbcset->non_match = 1;
3021 #endif /* not RE_ENABLE_I18N */
3023 if (syntax & REG_HAT_LISTS_NOT_NEWLINE)
3024 bitset_set (sbcset, '\0');
3025 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3026 token_len = peek_token_bracket (token, regexp, syntax);
3027 if (BE (token->type == END_OF_RE, 0))
3030 goto parse_bracket_exp_free_return;
3034 /* We treat the first ']' as a normal character. */
3035 if (token->type == OP_CLOSE_BRACKET)
3036 token->type = CHARACTER;
3040 bracket_elem_t start_elem, end_elem;
3041 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3042 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3045 bool is_range_exp = false;
3048 start_elem.opr.name = start_name_buf;
3049 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3050 syntax, first_round);
3051 if (BE (ret != REG_NOERROR, 0))
3054 goto parse_bracket_exp_free_return;
3056 first_round = false;
3058 /* Get information about the next token. We need it in any case. */
3059 token_len = peek_token_bracket (token, regexp, syntax);
3061 /* Do not check for ranges if we know they are not allowed. */
3062 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3064 if (BE (token->type == END_OF_RE, 0))
3067 goto parse_bracket_exp_free_return;
3069 if (token->type == OP_CHARSET_RANGE)
3071 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3072 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3073 if (BE (token2.type == END_OF_RE, 0))
3076 goto parse_bracket_exp_free_return;
3078 if (token2.type == OP_CLOSE_BRACKET)
3080 /* We treat the last '-' as a normal character. */
3081 re_string_skip_bytes (regexp, -token_len);
3082 token->type = CHARACTER;
3085 is_range_exp = true;
3089 if (is_range_exp == true)
3091 end_elem.opr.name = end_name_buf;
3092 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3094 if (BE (ret != REG_NOERROR, 0))
3097 goto parse_bracket_exp_free_return;
3100 token_len = peek_token_bracket (token, regexp, syntax);
3103 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3104 &start_elem, &end_elem);
3106 # ifdef RE_ENABLE_I18N
3107 *err = build_range_exp (sbcset,
3108 dfa->mb_cur_max > 1 ? mbcset : NULL,
3109 &range_alloc, &start_elem, &end_elem);
3111 *err = build_range_exp (sbcset, &start_elem, &end_elem);
3113 #endif /* RE_ENABLE_I18N */
3114 if (BE (*err != REG_NOERROR, 0))
3115 goto parse_bracket_exp_free_return;
3119 switch (start_elem.type)
3122 bitset_set (sbcset, start_elem.opr.ch);
3124 #ifdef RE_ENABLE_I18N
3126 /* Check whether the array has enough space. */
3127 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3129 wchar_t *new_mbchars;
3130 /* Not enough, realloc it. */
3131 mbchar_alloc = mbcset->nmbchars;
3132 /* Use realloc since array is NULL if *alloc == 0. */
3133 new_mbchars = re_x2realloc (mbcset->mbchars, wchar_t,
3135 if (BE (new_mbchars == NULL, 0))
3136 goto parse_bracket_exp_espace;
3137 mbcset->mbchars = new_mbchars;
3139 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3141 #endif /* RE_ENABLE_I18N */
3143 *err = build_equiv_class (sbcset,
3144 #ifdef RE_ENABLE_I18N
3145 mbcset, &equiv_class_alloc,
3146 #endif /* RE_ENABLE_I18N */
3147 start_elem.opr.name);
3148 if (BE (*err != REG_NOERROR, 0))
3149 goto parse_bracket_exp_free_return;
3152 *err = build_collating_symbol (sbcset,
3153 #ifdef RE_ENABLE_I18N
3154 mbcset, &coll_sym_alloc,
3155 #endif /* RE_ENABLE_I18N */
3156 start_elem.opr.name);
3157 if (BE (*err != REG_NOERROR, 0))
3158 goto parse_bracket_exp_free_return;
3161 *err = build_charclass (regexp->trans, sbcset,
3162 #ifdef RE_ENABLE_I18N
3163 mbcset, &char_class_alloc,
3164 #endif /* RE_ENABLE_I18N */
3165 start_elem.opr.name, syntax);
3166 if (BE (*err != REG_NOERROR, 0))
3167 goto parse_bracket_exp_free_return;
3174 if (BE (token->type == END_OF_RE, 0))
3177 goto parse_bracket_exp_free_return;
3179 if (token->type == OP_CLOSE_BRACKET)
3183 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3185 /* If it is non-matching list. */
3187 bitset_not (sbcset);
3189 #ifdef RE_ENABLE_I18N
3190 /* Ensure only single byte characters are set. */
3191 if (dfa->mb_cur_max > 1)
3192 bitset_mask (sbcset, dfa->sb_char);
3194 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3195 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3196 || mbcset->non_match)))
3198 bin_tree_t *mbc_tree;
3200 /* Build a tree for complex bracket. */
3201 dfa->has_mb_node = 1;
3202 br_token.type = COMPLEX_BRACKET;
3203 br_token.opr.mbcset = mbcset;
3204 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3205 if (BE (mbc_tree == NULL, 0))
3206 goto parse_bracket_exp_espace;
3207 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3208 if (sbcset[sbc_idx])
3210 /* If there are no bits set in sbcset, there is no point
3211 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3212 if (sbc_idx < BITSET_WORDS)
3214 /* Build a tree for simple bracket. */
3215 br_token.type = SIMPLE_BRACKET;
3216 br_token.opr.sbcset = sbcset;
3217 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3218 if (BE (work_tree == NULL, 0))
3219 goto parse_bracket_exp_espace;
3221 /* Then join them by ALT node. */
3222 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3223 if (BE (work_tree == NULL, 0))
3224 goto parse_bracket_exp_espace;
3229 work_tree = mbc_tree;
3233 #endif /* not RE_ENABLE_I18N */
3235 #ifdef RE_ENABLE_I18N
3236 free_charset (mbcset);
3238 /* Build a tree for simple bracket. */
3239 br_token.type = SIMPLE_BRACKET;
3240 br_token.opr.sbcset = sbcset;
3241 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3242 if (BE (work_tree == NULL, 0))
3243 goto parse_bracket_exp_espace;
3247 parse_bracket_exp_espace:
3249 parse_bracket_exp_free_return:
3251 #ifdef RE_ENABLE_I18N
3252 free_charset (mbcset);
3253 #endif /* RE_ENABLE_I18N */
3257 /* Parse an element in the bracket expression. */
3259 static reg_errcode_t
3260 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3261 re_token_t *token, int token_len, re_dfa_t *dfa,
3262 reg_syntax_t syntax, bool accept_hyphen)
3264 #ifdef RE_ENABLE_I18N
3266 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3267 if (cur_char_size > 1)
3269 elem->type = MB_CHAR;
3270 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3271 re_string_skip_bytes (regexp, cur_char_size);
3274 #endif /* RE_ENABLE_I18N */
3275 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3276 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3277 || token->type == OP_OPEN_EQUIV_CLASS)
3278 return parse_bracket_symbol (elem, regexp, token);
3279 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3281 /* A '-' must only appear as anything but a range indicator before
3282 the closing bracket. Everything else is an error. */
3284 (void) peek_token_bracket (&token2, regexp, syntax);
3285 if (token2.type != OP_CLOSE_BRACKET)
3286 /* The actual error value is not standardized since this whole
3287 case is undefined. But ERANGE makes good sense. */
3290 elem->type = SB_CHAR;
3291 elem->opr.ch = token->opr.c;
3295 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3296 such as [:<character_class>:], [.<collating_element>.], and
3297 [=<equivalent_class>=]. */
3299 static reg_errcode_t
3300 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3303 unsigned char ch, delim = token->opr.c;
3305 if (re_string_eoi(regexp))
3309 if (i >= BRACKET_NAME_BUF_SIZE)
3311 if (token->type == OP_OPEN_CHAR_CLASS)
3312 ch = re_string_fetch_byte_case (regexp);
3314 ch = re_string_fetch_byte (regexp);
3315 if (re_string_eoi(regexp))
3317 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3319 elem->opr.name[i] = ch;
3321 re_string_skip_bytes (regexp, 1);
3322 elem->opr.name[i] = '\0';
3323 switch (token->type)
3325 case OP_OPEN_COLL_ELEM:
3326 elem->type = COLL_SYM;
3328 case OP_OPEN_EQUIV_CLASS:
3329 elem->type = EQUIV_CLASS;
3331 case OP_OPEN_CHAR_CLASS:
3332 elem->type = CHAR_CLASS;
3340 /* Helper function for parse_bracket_exp.
3341 Build the equivalence class which is represented by NAME.
3342 The result are written to MBCSET and SBCSET.
3343 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3344 is a pointer argument sinse we may update it. */
3346 static reg_errcode_t
3347 build_equiv_class (bitset sbcset,
3348 #ifdef RE_ENABLE_I18N
3349 re_charset_t *mbcset, Idx *equiv_class_alloc,
3351 const unsigned char *name)
3354 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3357 const int32_t *table, *indirect;
3358 const unsigned char *weights, *extra, *cp;
3359 unsigned char char_buf[2];
3363 /* This #include defines a local function! */
3364 # include <locale/weight.h>
3365 /* Calculate the index for equivalence class. */
3367 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3368 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3369 _NL_COLLATE_WEIGHTMB);
3370 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3371 _NL_COLLATE_EXTRAMB);
3372 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3373 _NL_COLLATE_INDIRECTMB);
3374 idx1 = findidx (&cp);
3375 if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3376 /* This isn't a valid character. */
3377 return REG_ECOLLATE;
3379 /* Build single byte matcing table for this equivalence class. */
3380 char_buf[1] = (unsigned char) '\0';
3381 len = weights[idx1];
3382 for (ch = 0; ch < SBC_MAX; ++ch)
3386 idx2 = findidx (&cp);
3391 /* This isn't a valid character. */
3393 if (len == weights[idx2])
3396 while (cnt <= len &&
3397 weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3401 bitset_set (sbcset, ch);
3404 /* Check whether the array has enough space. */
3405 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3407 /* Not enough, realloc it. */
3408 Idx new_equiv_class_alloc = mbcset->nequiv_classes;
3409 /* Use realloc since the array is NULL if *alloc == 0. */
3410 int32_t *new_equiv_classes = re_x2realloc (mbcset->equiv_classes,
3412 &new_equiv_class_alloc);
3413 if (BE (new_equiv_classes == NULL, 0))
3415 mbcset->equiv_classes = new_equiv_classes;
3416 *equiv_class_alloc = new_equiv_class_alloc;
3418 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3423 if (BE (strlen ((const char *) name) != 1, 0))
3424 return REG_ECOLLATE;
3425 bitset_set (sbcset, *name);
3430 /* Helper function for parse_bracket_exp.
3431 Build the character class which is represented by NAME.
3432 The result are written to MBCSET and SBCSET.
3433 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3434 is a pointer argument sinse we may update it. */
3436 static reg_errcode_t
3437 build_charclass (unsigned REG_TRANSLATE_TYPE trans, bitset sbcset,
3438 #ifdef RE_ENABLE_I18N
3439 re_charset_t *mbcset, Idx *char_class_alloc,
3441 const unsigned char *class_name, reg_syntax_t syntax)
3444 const char *name = (const char *) class_name;
3446 /* In case of REG_ICASE "upper" and "lower" match the both of
3447 upper and lower cases. */
3448 if ((syntax & REG_IGNORE_CASE)
3449 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3452 #ifdef RE_ENABLE_I18N
3453 /* Check the space of the arrays. */
3454 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3456 /* Not enough, realloc it. */
3457 Idx new_char_class_alloc = mbcset->nchar_classes;
3458 /* Use realloc since array is NULL if *alloc == 0. */
3459 wctype_t *new_char_classes = re_x2realloc (mbcset->char_classes, wctype_t,
3460 &new_char_class_alloc);
3461 if (BE (new_char_classes == NULL, 0))
3463 mbcset->char_classes = new_char_classes;
3464 *char_class_alloc = new_char_class_alloc;
3466 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3467 #endif /* RE_ENABLE_I18N */
3469 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3470 for (i = 0; i < SBC_MAX; ++i) \
3472 if (ctype_func (i)) \
3474 int ch = trans ? trans[i] : i; \
3475 bitset_set (sbcset, ch); \
3479 if (strcmp (name, "alnum") == 0)
3480 BUILD_CHARCLASS_LOOP (isalnum)
3481 else if (strcmp (name, "cntrl") == 0)
3482 BUILD_CHARCLASS_LOOP (iscntrl)
3483 else if (strcmp (name, "lower") == 0)
3484 BUILD_CHARCLASS_LOOP (islower)
3485 else if (strcmp (name, "space") == 0)
3486 BUILD_CHARCLASS_LOOP (isspace)
3487 else if (strcmp (name, "alpha") == 0)
3488 BUILD_CHARCLASS_LOOP (isalpha)
3489 else if (strcmp (name, "digit") == 0)
3490 BUILD_CHARCLASS_LOOP (isdigit)
3491 else if (strcmp (name, "print") == 0)
3492 BUILD_CHARCLASS_LOOP (isprint)
3493 else if (strcmp (name, "upper") == 0)
3494 BUILD_CHARCLASS_LOOP (isupper)
3495 else if (strcmp (name, "blank") == 0)
3496 BUILD_CHARCLASS_LOOP (isblank)
3497 else if (strcmp (name, "graph") == 0)
3498 BUILD_CHARCLASS_LOOP (isgraph)
3499 else if (strcmp (name, "punct") == 0)
3500 BUILD_CHARCLASS_LOOP (ispunct)
3501 else if (strcmp (name, "xdigit") == 0)
3502 BUILD_CHARCLASS_LOOP (isxdigit)
3510 build_charclass_op (re_dfa_t *dfa, unsigned REG_TRANSLATE_TYPE trans,
3511 const unsigned char *class_name,
3512 const unsigned char *extra,
3513 bool non_match, reg_errcode_t *err)
3515 re_bitset_ptr_t sbcset;
3516 #ifdef RE_ENABLE_I18N
3517 re_charset_t *mbcset;
3519 #endif /* not RE_ENABLE_I18N */
3521 re_token_t br_token;
3524 sbcset = re_calloc (bitset_word, BITSET_WORDS);
3525 #ifdef RE_ENABLE_I18N
3526 mbcset = re_calloc (re_charset_t, 1);
3527 #endif /* RE_ENABLE_I18N */
3529 #ifdef RE_ENABLE_I18N
3530 if (BE (sbcset == NULL || mbcset == NULL, 0))
3531 #else /* not RE_ENABLE_I18N */
3532 if (BE (sbcset == NULL, 0))
3533 #endif /* not RE_ENABLE_I18N */
3541 #ifdef RE_ENABLE_I18N
3543 if (syntax & REG_HAT_LISTS_NOT_NEWLINE)
3544 bitset_set(cset->sbcset, '\0');
3546 mbcset->non_match = 1;
3547 #endif /* not RE_ENABLE_I18N */
3550 /* We don't care the syntax in this case. */
3551 ret = build_charclass (trans, sbcset,
3552 #ifdef RE_ENABLE_I18N
3554 #endif /* RE_ENABLE_I18N */
3557 if (BE (ret != REG_NOERROR, 0))
3560 #ifdef RE_ENABLE_I18N
3561 free_charset (mbcset);
3562 #endif /* RE_ENABLE_I18N */
3566 /* \w match '_' also. */
3567 for (; *extra; extra++)
3568 bitset_set (sbcset, *extra);
3570 /* If it is non-matching list. */
3572 bitset_not (sbcset);
3574 #ifdef RE_ENABLE_I18N
3575 /* Ensure only single byte characters are set. */
3576 if (dfa->mb_cur_max > 1)
3577 bitset_mask (sbcset, dfa->sb_char);
3580 /* Build a tree for simple bracket. */
3581 br_token.type = SIMPLE_BRACKET;
3582 br_token.opr.sbcset = sbcset;
3583 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3584 if (BE (tree == NULL, 0))
3585 goto build_word_op_espace;
3587 #ifdef RE_ENABLE_I18N
3588 if (dfa->mb_cur_max > 1)
3590 bin_tree_t *mbc_tree;
3591 /* Build a tree for complex bracket. */
3592 br_token.type = COMPLEX_BRACKET;
3593 br_token.opr.mbcset = mbcset;
3594 dfa->has_mb_node = 1;
3595 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3596 if (BE (mbc_tree == NULL, 0))
3597 goto build_word_op_espace;
3598 /* Then join them by ALT node. */
3599 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3600 if (BE (mbc_tree != NULL, 1))
3605 free_charset (mbcset);
3608 #else /* not RE_ENABLE_I18N */
3610 #endif /* not RE_ENABLE_I18N */
3612 build_word_op_espace:
3614 #ifdef RE_ENABLE_I18N
3615 free_charset (mbcset);
3616 #endif /* RE_ENABLE_I18N */
3621 /* This is intended for the expressions like "a{1,3}".
3622 Fetch a number from `input', and return the number.
3623 Return REG_MISSING if the number field is empty like "{,1}".
3624 Return REG_ERROR if an error occurred. */
3627 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3629 Idx num = REG_MISSING;
3633 fetch_token (token, input, syntax);
3635 if (BE (token->type == END_OF_RE, 0))
3637 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3639 num = ((token->type != CHARACTER || c < '0' || '9' < c
3640 || num == REG_ERROR)
3642 : ((num == REG_MISSING) ? c - '0' : num * 10 + c - '0'));
3643 num = (num > REG_DUP_MAX) ? REG_ERROR : num;
3648 #ifdef RE_ENABLE_I18N
3650 free_charset (re_charset_t *cset)
3652 re_free (cset->mbchars);
3654 re_free (cset->coll_syms);
3655 re_free (cset->equiv_classes);
3656 re_free (cset->range_starts);
3657 re_free (cset->range_ends);
3659 re_free (cset->char_classes);
3662 #endif /* RE_ENABLE_I18N */
3664 /* Functions for binary tree operation. */
3666 /* Create a tree node. */
3669 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3670 re_token_type_t type)
3674 return create_token_tree (dfa, left, right, &t);
3678 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3679 const re_token_t *token)
3682 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3684 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3686 if (storage == NULL)
3688 storage->next = dfa->str_tree_storage;
3689 dfa->str_tree_storage = storage;
3690 dfa->str_tree_storage_idx = 0;
3692 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3694 tree->parent = NULL;
3696 tree->right = right;
3697 tree->token = *token;
3698 tree->token.duplicated = 0;
3699 tree->token.opt_subexp = 0;
3702 tree->node_idx = REG_MISSING;
3705 left->parent = tree;
3707 right->parent = tree;
3711 /* Mark the tree SRC as an optional subexpression.
3712 To be called from preorder or postorder. */
3714 static reg_errcode_t
3715 mark_opt_subexp (void *extra, bin_tree_t *node)
3717 Idx idx = (Idx) (long) extra;
3718 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3719 node->token.opt_subexp = 1;
3724 /* Free the allocated memory inside NODE. */
3727 free_token (re_token_t *node)
3729 #ifdef RE_ENABLE_I18N
3730 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3731 free_charset (node->opr.mbcset);
3733 #endif /* RE_ENABLE_I18N */
3734 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3735 re_free (node->opr.sbcset);
3738 /* Worker function for tree walking. Free the allocated memory inside NODE
3739 and its children. */
3741 static reg_errcode_t
3742 free_tree (void *extra, bin_tree_t *node)
3744 free_token (&node->token);
3749 /* Duplicate the node SRC, and return new node. This is a preorder
3750 visit similar to the one implemented by the generic visitor, but
3751 we need more infrastructure to maintain two parallel trees --- so,
3752 it's easier to duplicate. */
3755 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3757 const bin_tree_t *node;
3758 bin_tree_t *dup_root;
3759 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3761 for (node = root; ; )
3763 /* Create a new tree and link it back to the current parent. */
3764 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3767 (*p_new)->parent = dup_node;
3768 (*p_new)->token.duplicated = 1;
3771 /* Go to the left node, or up and to the right. */
3775 p_new = &dup_node->left;
3779 const bin_tree_t *prev = NULL;
3780 while (node->right == prev || node->right == NULL)
3783 node = node->parent;
3784 dup_node = dup_node->parent;
3789 p_new = &dup_node->right;