1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License along
17 with this program; if not, write to the Free Software Foundation,
18 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
20 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
21 Idx length, reg_syntax_t syntax);
22 static void re_compile_fastmap_iter (regex_t *bufp,
23 const re_dfastate_t *init_state,
25 static reg_errcode_t init_dfa (re_dfa_t *dfa, Idx pat_len);
27 static void free_charset (re_charset_t *cset);
28 #endif /* RE_ENABLE_I18N */
29 static void free_workarea_compile (regex_t *preg);
30 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
32 static void optimize_utf8 (re_dfa_t *dfa);
34 static reg_errcode_t analyze (regex_t *preg);
35 static reg_errcode_t preorder (bin_tree_t *root,
36 reg_errcode_t (fn (void *, bin_tree_t *)),
38 static reg_errcode_t postorder (bin_tree_t *root,
39 reg_errcode_t (fn (void *, bin_tree_t *)),
41 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
42 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
43 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
45 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
46 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
47 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
48 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
49 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
50 unsigned int constraint);
51 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
52 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
54 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
55 static Idx fetch_number (re_string_t *input, re_token_t *token,
57 static int peek_token (re_token_t *token, re_string_t *input,
59 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
60 reg_syntax_t syntax, reg_errcode_t *err);
61 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
62 re_token_t *token, reg_syntax_t syntax,
63 Idx nest, reg_errcode_t *err);
64 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
65 re_token_t *token, reg_syntax_t syntax,
66 Idx nest, reg_errcode_t *err);
67 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
68 re_token_t *token, reg_syntax_t syntax,
69 Idx nest, reg_errcode_t *err);
70 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
71 re_token_t *token, reg_syntax_t syntax,
72 Idx nest, reg_errcode_t *err);
73 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
74 re_dfa_t *dfa, re_token_t *token,
75 reg_syntax_t syntax, reg_errcode_t *err);
76 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
77 re_token_t *token, reg_syntax_t syntax,
79 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
81 re_token_t *token, int token_len,
85 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
89 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
91 Idx *equiv_class_alloc,
92 const unsigned char *name);
93 static reg_errcode_t build_charclass (unsigned REG_TRANSLATE_TYPE trans,
94 re_bitset_ptr_t sbcset,
96 Idx *char_class_alloc,
97 const unsigned char *class_name,
99 #else /* not RE_ENABLE_I18N */
100 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
101 const unsigned char *name);
102 static reg_errcode_t build_charclass (unsigned REG_TRANSLATE_TYPE trans,
103 re_bitset_ptr_t sbcset,
104 const unsigned char *class_name,
105 reg_syntax_t syntax);
106 #endif /* not RE_ENABLE_I18N */
107 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
108 unsigned REG_TRANSLATE_TYPE trans,
109 const unsigned char *class_name,
110 const unsigned char *extra,
111 bool non_match, reg_errcode_t *err);
112 static bin_tree_t *create_tree (re_dfa_t *dfa,
113 bin_tree_t *left, bin_tree_t *right,
114 re_token_type_t type);
115 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
116 bin_tree_t *left, bin_tree_t *right,
117 const re_token_t *token);
118 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
119 static void free_token (re_token_t *node);
120 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
121 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
123 /* This table gives an error message for each of the error codes listed
124 in regex.h. Obviously the order here has to be same as there.
125 POSIX doesn't require that we do anything for REG_NOERROR,
126 but why not be nice? */
128 const char __re_error_msgid[] attribute_hidden =
130 #define REG_NOERROR_IDX 0
131 gettext_noop ("Success") /* REG_NOERROR */
133 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
134 gettext_noop ("No match") /* REG_NOMATCH */
136 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
137 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
139 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
140 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
142 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
143 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
145 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
146 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
148 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
149 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
151 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
152 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
154 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
155 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
157 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
158 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
160 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
161 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
163 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
164 gettext_noop ("Invalid range end") /* REG_ERANGE */
166 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
167 gettext_noop ("Memory exhausted") /* REG_ESPACE */
169 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
170 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
172 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
173 gettext_noop ("Premature end of regular expression") /* REG_EEND */
175 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
176 gettext_noop ("Regular expression too big") /* REG_ESIZE */
178 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
179 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
182 const size_t __re_error_msgid_idx[] attribute_hidden =
203 /* Entry points for GNU code. */
205 /* re_compile_pattern is the GNU regular expression compiler: it
206 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
207 Returns 0 if the pattern was valid, otherwise an error string.
209 Assumes the `re_allocated' (and perhaps `re_buffer') and `translate' fields
210 are set in BUFP on entry. */
213 re_compile_pattern (const char *pattern, size_t length,
214 struct re_pattern_buffer *bufp)
218 /* And GNU code determines whether or not to get register information
219 by passing null for the REGS argument to re_match, etc., not by
220 setting re_no_sub, unless REG_NO_SUB is set. */
221 bufp->re_no_sub = !!(re_syntax_options & REG_NO_SUB);
223 /* Match anchors at newline. */
224 bufp->re_newline_anchor = 1;
226 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
230 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
233 weak_alias (__re_compile_pattern, re_compile_pattern)
236 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
237 also be assigned to arbitrarily: each pattern buffer stores its own
238 syntax, so it can be changed between regex compilations. */
239 /* This has no initializer because initialized variables in Emacs
240 become read-only after dumping. */
241 reg_syntax_t re_syntax_options;
244 /* Specify the precise syntax of regexps for compilation. This provides
245 for compatibility for various utilities which historically have
246 different, incompatible syntaxes.
248 The argument SYNTAX is a bit mask comprised of the various bits
249 defined in regex.h. We return the old syntax. */
252 re_set_syntax (reg_syntax_t syntax)
254 reg_syntax_t ret = re_syntax_options;
256 re_syntax_options = syntax;
260 weak_alias (__re_set_syntax, re_set_syntax)
264 re_compile_fastmap (struct re_pattern_buffer *bufp)
266 re_dfa_t *dfa = (re_dfa_t *) bufp->re_buffer;
267 char *fastmap = bufp->re_fastmap;
269 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
270 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
271 if (dfa->init_state != dfa->init_state_word)
272 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
273 if (dfa->init_state != dfa->init_state_nl)
274 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
275 if (dfa->init_state != dfa->init_state_begbuf)
276 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
277 bufp->re_fastmap_accurate = 1;
281 weak_alias (__re_compile_fastmap, re_compile_fastmap)
285 __attribute ((always_inline))
286 re_set_fastmap (char *fastmap, bool icase, int ch)
290 fastmap[tolower (ch)] = 1;
293 /* Helper function for re_compile_fastmap.
294 Compile fastmap for the initial_state INIT_STATE. */
297 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
300 re_dfa_t *dfa = (re_dfa_t *) bufp->re_buffer;
302 bool icase = (dfa->mb_cur_max == 1 && (bufp->re_syntax & REG_IGNORE_CASE));
303 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
305 Idx node = init_state->nodes.elems[node_cnt];
306 re_token_type_t type = dfa->nodes[node].type;
308 if (type == CHARACTER)
310 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
311 #ifdef RE_ENABLE_I18N
312 if ((bufp->re_syntax & REG_IGNORE_CASE) && dfa->mb_cur_max > 1)
314 unsigned char buf[MB_LEN_MAX];
320 *p++ = dfa->nodes[node].opr.c;
321 while (++node < dfa->nodes_len
322 && dfa->nodes[node].type == CHARACTER
323 && dfa->nodes[node].mb_partial)
324 *p++ = dfa->nodes[node].opr.c;
325 memset (&state, 0, sizeof (state));
326 if (mbrtowc (&wc, (const char *) buf, p - buf,
328 && (__wcrtomb ((char *) buf, towlower (wc), &state)
330 re_set_fastmap (fastmap, false, buf[0]);
334 else if (type == SIMPLE_BRACKET)
337 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
338 for (j = 0; j < UINT_BITS; ++j, ++ch)
339 if (dfa->nodes[node].opr.sbcset[i] & (1u << j))
340 re_set_fastmap (fastmap, icase, ch);
342 #ifdef RE_ENABLE_I18N
343 else if (type == COMPLEX_BRACKET)
346 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
347 if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
348 || cset->nranges || cset->nchar_classes)
351 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
353 /* In this case we want to catch the bytes which are
354 the first byte of any collation elements.
355 e.g. In da_DK, we want to catch 'a' since "aa"
356 is a valid collation element, and don't catch
357 'b' since 'b' is the only collation element
358 which starts from 'b'. */
360 const int32_t *table = (const int32_t *)
361 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
362 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
363 for (j = 0; j < UINT_BITS; ++j, ++ch)
365 re_set_fastmap (fastmap, icase, ch);
368 if (dfa->mb_cur_max > 1)
369 for (i = 0; i < SBC_MAX; ++i)
370 if (__btowc (i) == WEOF)
371 re_set_fastmap (fastmap, icase, i);
372 # endif /* not _LIBC */
374 for (i = 0; i < cset->nmbchars; ++i)
378 memset (&state, '\0', sizeof (state));
379 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
380 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
381 if ((bufp->re_syntax & REG_IGNORE_CASE) && dfa->mb_cur_max > 1)
383 if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
385 re_set_fastmap (fastmap, false, *(unsigned char *) buf);
389 #endif /* RE_ENABLE_I18N */
390 else if (type == OP_PERIOD
391 #ifdef RE_ENABLE_I18N
392 || type == OP_UTF8_PERIOD
393 #endif /* RE_ENABLE_I18N */
394 || type == END_OF_RE)
396 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
397 if (type == END_OF_RE)
398 bufp->re_can_be_null = 1;
404 /* Entry point for POSIX code. */
405 /* regcomp takes a regular expression as a string and compiles it.
407 PREG is a regex_t *. We do not expect any fields to be initialized,
408 since POSIX says we shouldn't. Thus, we set
410 `re_buffer' to the compiled pattern;
411 `re_used' to the length of the compiled pattern;
412 `re_syntax' to REG_SYNTAX_POSIX_EXTENDED if the
413 REG_EXTENDED bit in CFLAGS is set; otherwise, to
414 REG_SYNTAX_POSIX_BASIC;
415 `re_newline_anchor' to REG_NEWLINE being set in CFLAGS;
416 `re_fastmap' to an allocated space for the fastmap;
417 `re_fastmap_accurate' to zero;
418 `re_nsub' to the number of subexpressions in PATTERN.
420 PATTERN is the address of the pattern string.
422 CFLAGS is a series of bits which affect compilation.
424 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
425 use POSIX basic syntax.
427 If REG_NEWLINE is set, then . and [^...] don't match newline.
428 Also, regexec will try a match beginning after every newline.
430 If REG_ICASE is set, then we considers upper- and lowercase
431 versions of letters to be equivalent when matching.
433 If REG_NOSUB is set, then when PREG is passed to regexec, that
434 routine will report only success or failure, and nothing about the
437 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
438 the return codes and their meanings.) */
441 regcomp (regex_t *__restrict preg, const char *__restrict pattern, int cflags)
444 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? REG_SYNTAX_POSIX_EXTENDED
445 : REG_SYNTAX_POSIX_BASIC);
447 preg->re_buffer = NULL;
448 preg->re_allocated = 0;
451 /* Try to allocate space for the fastmap. */
452 preg->re_fastmap = re_malloc (char, SBC_MAX);
453 if (BE (preg->re_fastmap == NULL, 0))
456 syntax |= (cflags & REG_ICASE) ? REG_IGNORE_CASE : 0;
458 /* If REG_NEWLINE is set, newlines are treated differently. */
459 if (cflags & REG_NEWLINE)
460 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
461 syntax &= ~REG_DOT_NEWLINE;
462 syntax |= REG_HAT_LISTS_NOT_NEWLINE;
463 /* It also changes the matching behavior. */
464 preg->re_newline_anchor = 1;
467 preg->re_newline_anchor = 0;
468 preg->re_no_sub = !!(cflags & REG_NOSUB);
469 preg->re_translate = NULL;
471 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
473 /* POSIX doesn't distinguish between an unmatched open-group and an
474 unmatched close-group: both are REG_EPAREN. */
475 if (ret == REG_ERPAREN)
478 /* We have already checked preg->re_fastmap != NULL. */
479 if (BE (ret == REG_NOERROR, 1))
480 /* Compute the fastmap now, since regexec cannot modify the pattern
481 buffer. This function never fails in this implementation. */
482 (void) re_compile_fastmap (preg);
485 /* Some error occurred while compiling the expression. */
486 re_free (preg->re_fastmap);
487 preg->re_fastmap = NULL;
493 weak_alias (__regcomp, regcomp)
496 /* Returns a message corresponding to an error code, ERRCODE, returned
497 from either regcomp or regexec. We don't use PREG here. */
500 regerror (int errcode, const regex_t *__restrict preg,
501 char *__restrict errbuf, size_t errbuf_size)
507 || errcode >= (int) (sizeof (__re_error_msgid_idx)
508 / sizeof (__re_error_msgid_idx[0])), 0))
509 /* Only error codes returned by the rest of the code should be passed
510 to this routine. If we are given anything else, or if other regex
511 code generates an invalid error code, then the program has a bug.
512 Dump core so we can fix it. */
515 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
517 msg_size = strlen (msg) + 1; /* Includes the null. */
519 if (BE (errbuf_size != 0, 1))
521 if (BE (msg_size > errbuf_size, 0))
523 #if defined HAVE_MEMPCPY || defined _LIBC
524 *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
526 memcpy (errbuf, msg, errbuf_size - 1);
527 errbuf[errbuf_size - 1] = 0;
531 memcpy (errbuf, msg, msg_size);
537 weak_alias (__regerror, regerror)
541 #ifdef RE_ENABLE_I18N
542 /* This static array is used for the map to single-byte characters when
543 UTF-8 is used. Otherwise we would allocate memory just to initialize
544 it the same all the time. UTF-8 is the preferred encoding so this is
545 a worthwhile optimization. */
546 static const bitset utf8_sb_map =
548 /* Set the first 128 bits. */
549 # if UINT_MAX == 0xffffffff
550 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
552 # error "Add case for new unsigned int size"
559 free_dfa_content (re_dfa_t *dfa)
564 for (i = 0; i < dfa->nodes_len; ++i)
565 free_token (dfa->nodes + i);
566 re_free (dfa->nexts);
567 for (i = 0; i < dfa->nodes_len; ++i)
569 if (dfa->eclosures != NULL)
570 re_node_set_free (dfa->eclosures + i);
571 if (dfa->inveclosures != NULL)
572 re_node_set_free (dfa->inveclosures + i);
573 if (dfa->edests != NULL)
574 re_node_set_free (dfa->edests + i);
576 re_free (dfa->edests);
577 re_free (dfa->eclosures);
578 re_free (dfa->inveclosures);
579 re_free (dfa->nodes);
581 if (dfa->state_table)
582 for (i = 0; i <= dfa->state_hash_mask; ++i)
584 struct re_state_table_entry *entry = dfa->state_table + i;
585 for (j = 0; j < entry->num; ++j)
587 re_dfastate_t *state = entry->array[j];
590 re_free (entry->array);
592 re_free (dfa->state_table);
593 #ifdef RE_ENABLE_I18N
594 if (dfa->sb_char != utf8_sb_map)
595 re_free (dfa->sb_char);
597 re_free (dfa->subexp_map);
599 re_free (dfa->re_str);
606 /* Free dynamically allocated space used by PREG. */
609 regfree (regex_t *preg)
611 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
612 if (BE (dfa != NULL, 1))
613 free_dfa_content (dfa);
614 preg->re_buffer = NULL;
615 preg->re_allocated = 0;
617 re_free (preg->re_fastmap);
618 preg->re_fastmap = NULL;
620 re_free (preg->re_translate);
621 preg->re_translate = NULL;
624 weak_alias (__regfree, regfree)
627 /* Entry points compatible with 4.2 BSD regex library. We don't define
628 them unless specifically requested. */
630 #if defined _REGEX_RE_COMP || defined _LIBC
632 /* BSD has one and only one pattern buffer. */
633 static struct re_pattern_buffer re_comp_buf;
637 /* Make these definitions weak in libc, so POSIX programs can redefine
638 these names if they don't use our functions, and still use
639 regcomp/regexec above without link errors. */
642 re_comp (const char *s)
649 if (!re_comp_buf.re_buffer)
650 return gettext ("No previous regular expression");
654 if (re_comp_buf.re_buffer)
656 fastmap = re_comp_buf.re_fastmap;
657 re_comp_buf.re_fastmap = NULL;
658 __regfree (&re_comp_buf);
659 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
660 re_comp_buf.re_fastmap = fastmap;
663 if (re_comp_buf.re_fastmap == NULL)
665 re_comp_buf.re_fastmap = (char *) malloc (SBC_MAX);
666 if (re_comp_buf.re_fastmap == NULL)
667 return (char *) gettext (__re_error_msgid
668 + __re_error_msgid_idx[(int) REG_ESPACE]);
671 /* Since `re_exec' always passes NULL for the `regs' argument, we
672 don't need to initialize the pattern buffer fields which affect it. */
674 /* Match anchors at newlines. */
675 re_comp_buf.re_newline_anchor = 1;
677 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
682 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
683 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
687 libc_freeres_fn (free_mem)
689 __regfree (&re_comp_buf);
693 #endif /* _REGEX_RE_COMP */
695 /* Internal entry point.
696 Compile the regular expression PATTERN, whose length is LENGTH.
697 SYNTAX indicate regular expression's syntax. */
700 re_compile_internal (regex_t *preg, const char *pattern, Idx length,
703 reg_errcode_t err = REG_NOERROR;
707 /* Initialize the pattern buffer. */
708 preg->re_fastmap_accurate = 0;
709 preg->re_syntax = syntax;
710 preg->re_not_bol = preg->re_not_eol = 0;
713 preg->re_can_be_null = 0;
714 preg->re_regs_allocated = REG_UNALLOCATED;
716 /* Initialize the dfa. */
717 dfa = (re_dfa_t *) preg->re_buffer;
718 if (BE (preg->re_allocated < sizeof (re_dfa_t), 0))
720 /* If zero allocated, but buffer is non-null, try to realloc
721 enough space. This loses if buffer's address is bogus, but
722 that is the user's responsibility. If buffer is null this
723 is a simple allocation. */
724 dfa = re_realloc (preg->re_buffer, re_dfa_t, 1);
727 preg->re_allocated = sizeof (re_dfa_t);
728 preg->re_buffer = (unsigned char *) dfa;
730 preg->re_used = sizeof (re_dfa_t);
732 __libc_lock_init (dfa->lock);
734 err = init_dfa (dfa, length);
735 if (BE (err != REG_NOERROR, 0))
737 free_dfa_content (dfa);
738 preg->re_buffer = NULL;
739 preg->re_allocated = 0;
743 dfa->re_str = re_malloc (char, length + 1);
744 strncpy (dfa->re_str, pattern, length + 1);
747 err = re_string_construct (®exp, pattern, length, preg->re_translate,
748 syntax & REG_IGNORE_CASE, dfa);
749 if (BE (err != REG_NOERROR, 0))
751 re_compile_internal_free_return:
752 free_workarea_compile (preg);
753 re_string_destruct (®exp);
754 free_dfa_content (dfa);
755 preg->re_buffer = NULL;
756 preg->re_allocated = 0;
760 /* Parse the regular expression, and build a structure tree. */
762 dfa->str_tree = parse (®exp, preg, syntax, &err);
763 if (BE (dfa->str_tree == NULL, 0))
764 goto re_compile_internal_free_return;
766 /* Analyze the tree and create the nfa. */
767 err = analyze (preg);
768 if (BE (err != REG_NOERROR, 0))
769 goto re_compile_internal_free_return;
771 #ifdef RE_ENABLE_I18N
772 /* If possible, do searching in single byte encoding to speed things up. */
773 if (dfa->is_utf8 && !(syntax & REG_IGNORE_CASE) && preg->re_translate == NULL)
777 /* Then create the initial state of the dfa. */
778 err = create_initial_state (dfa);
780 /* Release work areas. */
781 free_workarea_compile (preg);
782 re_string_destruct (®exp);
784 if (BE (err != REG_NOERROR, 0))
786 free_dfa_content (dfa);
787 preg->re_buffer = NULL;
788 preg->re_allocated = 0;
794 /* Initialize DFA. We use the length of the regular expression PAT_LEN
795 as the initial length of some arrays. */
798 init_dfa (re_dfa_t *dfa, Idx pat_len)
800 __re_size_t table_size;
805 memset (dfa, '\0', sizeof (re_dfa_t));
807 /* Force allocation of str_tree_storage the first time. */
808 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
810 dfa->nodes_alloc = pat_len + 1;
811 dfa->nodes = re_xmalloc (re_token_t, dfa->nodes_alloc);
813 /* table_size = 2 ^ ceil(log pat_len) */
814 for (table_size = 1; table_size <= pat_len; table_size <<= 1)
815 if (0 < (Idx) -1 && table_size == 0)
818 dfa->state_table = re_calloc (struct re_state_table_entry, table_size);
819 dfa->state_hash_mask = table_size - 1;
821 dfa->mb_cur_max = MB_CUR_MAX;
823 if (dfa->mb_cur_max == 6
824 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
826 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
829 # ifdef HAVE_LANGINFO_CODESET
830 codeset_name = nl_langinfo (CODESET);
832 codeset_name = getenv ("LC_ALL");
833 if (codeset_name == NULL || codeset_name[0] == '\0')
834 codeset_name = getenv ("LC_CTYPE");
835 if (codeset_name == NULL || codeset_name[0] == '\0')
836 codeset_name = getenv ("LANG");
837 if (codeset_name == NULL)
839 else if (strchr (codeset_name, '.') != NULL)
840 codeset_name = strchr (codeset_name, '.') + 1;
843 if (strcasecmp (codeset_name, "UTF-8") == 0
844 || strcasecmp (codeset_name, "UTF8") == 0)
847 /* We check exhaustively in the loop below if this charset is a
848 superset of ASCII. */
849 dfa->map_notascii = 0;
852 #ifdef RE_ENABLE_I18N
853 if (dfa->mb_cur_max > 1)
856 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
861 dfa->sb_char = re_calloc (unsigned int, BITSET_UINTS);
862 if (BE (dfa->sb_char == NULL, 0))
865 /* Clear all bits by, then set those corresponding to single
867 bitset_empty (dfa->sb_char);
869 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
870 for (j = 0; j < UINT_BITS; ++j, ++ch)
872 wint_t wch = __btowc (ch);
874 dfa->sb_char[i] |= 1u << j;
876 if (isascii (ch) && wch != ch)
877 dfa->map_notascii = 1;
884 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
889 /* Initialize WORD_CHAR table, which indicate which character is
890 "word". In this case "word" means that it is the word construction
891 character used by some operators like "\<", "\>", etc. */
894 init_word_char (re_dfa_t *dfa)
897 dfa->word_ops_used = 1;
898 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
899 for (j = 0; j < UINT_BITS; ++j, ++ch)
900 if (isalnum (ch) || ch == '_')
901 dfa->word_char[i] |= 1u << j;
904 /* Free the work area which are only used while compiling. */
907 free_workarea_compile (regex_t *preg)
909 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
910 bin_tree_storage_t *storage, *next;
911 for (storage = dfa->str_tree_storage; storage; storage = next)
913 next = storage->next;
916 dfa->str_tree_storage = NULL;
917 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
918 dfa->str_tree = NULL;
919 re_free (dfa->org_indices);
920 dfa->org_indices = NULL;
923 /* Create initial states for all contexts. */
926 create_initial_state (re_dfa_t *dfa)
930 re_node_set init_nodes;
932 /* Initial states have the epsilon closure of the node which is
933 the first node of the regular expression. */
934 first = dfa->str_tree->first->node_idx;
935 dfa->init_node = first;
936 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
937 if (BE (err != REG_NOERROR, 0))
940 /* The back-references which are in initial states can epsilon transit,
941 since in this case all of the subexpressions can be null.
942 Then we add epsilon closures of the nodes which are the next nodes of
943 the back-references. */
944 if (dfa->nbackref > 0)
945 for (i = 0; i < init_nodes.nelem; ++i)
947 Idx node_idx = init_nodes.elems[i];
948 re_token_type_t type = dfa->nodes[node_idx].type;
951 if (type != OP_BACK_REF)
953 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
955 re_token_t *clexp_node;
956 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
957 if (clexp_node->type == OP_CLOSE_SUBEXP
958 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
961 if (clexp_idx == init_nodes.nelem)
964 if (type == OP_BACK_REF)
966 Idx dest_idx = dfa->edests[node_idx].elems[0];
967 if (!re_node_set_contains (&init_nodes, dest_idx))
969 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
975 /* It must be the first time to invoke acquire_state. */
976 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
977 /* We don't check ERR here, since the initial state must not be NULL. */
978 if (BE (dfa->init_state == NULL, 0))
980 if (dfa->init_state->has_constraint)
982 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
984 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
986 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
990 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
991 || dfa->init_state_begbuf == NULL, 0))
995 dfa->init_state_word = dfa->init_state_nl
996 = dfa->init_state_begbuf = dfa->init_state;
998 re_node_set_free (&init_nodes);
1002 #ifdef RE_ENABLE_I18N
1003 /* If it is possible to do searching in single byte encoding instead of UTF-8
1004 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1005 DFA nodes where needed. */
1008 optimize_utf8 (re_dfa_t *dfa)
1012 bool mb_chars = false;
1013 bool has_period = false;
1015 for (node = 0; node < dfa->nodes_len; ++node)
1016 switch (dfa->nodes[node].type)
1019 if (dfa->nodes[node].opr.c >= 0x80)
1023 switch (dfa->nodes[node].opr.idx)
1031 /* Word anchors etc. cannot be handled. */
1041 case OP_DUP_ASTERISK:
1042 case OP_OPEN_SUBEXP:
1043 case OP_CLOSE_SUBEXP:
1045 case COMPLEX_BRACKET:
1047 case SIMPLE_BRACKET:
1048 /* Just double check. */
1049 for (i = 0x80 / UINT_BITS; i < BITSET_UINTS; ++i)
1050 if (dfa->nodes[node].opr.sbcset[i])
1057 if (mb_chars || has_period)
1058 for (node = 0; node < dfa->nodes_len; ++node)
1060 if (dfa->nodes[node].type == CHARACTER
1061 && dfa->nodes[node].opr.c >= 0x80)
1062 dfa->nodes[node].mb_partial = 0;
1063 else if (dfa->nodes[node].type == OP_PERIOD)
1064 dfa->nodes[node].type = OP_UTF8_PERIOD;
1067 /* The search can be in single byte locale. */
1068 dfa->mb_cur_max = 1;
1070 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1074 /* Analyze the structure tree, and calculate "first", "next", "edest",
1075 "eclosure", and "inveclosure". */
1077 static reg_errcode_t
1078 analyze (regex_t *preg)
1080 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
1083 /* Allocate arrays. */
1084 dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1085 dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1086 dfa->edests = re_xmalloc (re_node_set, dfa->nodes_alloc);
1087 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1088 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1089 || dfa->eclosures == NULL, 0))
1092 dfa->subexp_map = re_xmalloc (Idx, preg->re_nsub);
1093 if (dfa->subexp_map != NULL)
1096 for (i = 0; i < preg->re_nsub; i++)
1097 dfa->subexp_map[i] = i;
1098 preorder (dfa->str_tree, optimize_subexps, dfa);
1099 for (i = 0; i < preg->re_nsub; i++)
1100 if (dfa->subexp_map[i] != i)
1102 if (i == preg->re_nsub)
1104 free (dfa->subexp_map);
1105 dfa->subexp_map = NULL;
1109 ret = postorder (dfa->str_tree, lower_subexps, preg);
1110 if (BE (ret != REG_NOERROR, 0))
1112 ret = postorder (dfa->str_tree, calc_first, dfa);
1113 if (BE (ret != REG_NOERROR, 0))
1115 preorder (dfa->str_tree, calc_next, dfa);
1116 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1117 if (BE (ret != REG_NOERROR, 0))
1119 ret = calc_eclosure (dfa);
1120 if (BE (ret != REG_NOERROR, 0))
1123 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1124 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1125 if ((!preg->re_no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1128 dfa->inveclosures = re_xmalloc (re_node_set, dfa->nodes_len);
1129 if (BE (dfa->inveclosures == NULL, 0))
1131 ret = calc_inveclosure (dfa);
1137 /* Our parse trees are very unbalanced, so we cannot use a stack to
1138 implement parse tree visits. Instead, we use parent pointers and
1139 some hairy code in these two functions. */
1140 static reg_errcode_t
1141 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1144 bin_tree_t *node, *prev;
1146 for (node = root; ; )
1148 /* Descend down the tree, preferably to the left (or to the right
1149 if that's the only child). */
1150 while (node->left || node->right)
1158 reg_errcode_t err = fn (extra, node);
1159 if (BE (err != REG_NOERROR, 0))
1161 if (node->parent == NULL)
1164 node = node->parent;
1166 /* Go up while we have a node that is reached from the right. */
1167 while (node->right == prev || node->right == NULL);
1172 static reg_errcode_t
1173 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1178 for (node = root; ; )
1180 reg_errcode_t err = fn (extra, node);
1181 if (BE (err != REG_NOERROR, 0))
1184 /* Go to the left node, or up and to the right. */
1189 bin_tree_t *prev = NULL;
1190 while (node->right == prev || node->right == NULL)
1193 node = node->parent;
1202 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1203 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1204 backreferences as well. Requires a preorder visit. */
1205 static reg_errcode_t
1206 optimize_subexps (void *extra, bin_tree_t *node)
1208 re_dfa_t *dfa = (re_dfa_t *) extra;
1210 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1212 int idx = node->token.opr.idx;
1213 node->token.opr.idx = dfa->subexp_map[idx];
1214 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1217 else if (node->token.type == SUBEXP
1218 && node->left && node->left->token.type == SUBEXP)
1220 Idx other_idx = node->left->token.opr.idx;
1222 node->left = node->left->left;
1224 node->left->parent = node;
1226 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1227 if (other_idx < CHAR_BIT * sizeof dfa->used_bkref_map)
1228 dfa->used_bkref_map &= ~(1u << other_idx);
1234 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1235 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1236 static reg_errcode_t
1237 lower_subexps (void *extra, bin_tree_t *node)
1239 regex_t *preg = (regex_t *) extra;
1240 reg_errcode_t err = REG_NOERROR;
1242 if (node->left && node->left->token.type == SUBEXP)
1244 node->left = lower_subexp (&err, preg, node->left);
1246 node->left->parent = node;
1248 if (node->right && node->right->token.type == SUBEXP)
1250 node->right = lower_subexp (&err, preg, node->right);
1252 node->right->parent = node;
1259 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1261 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
1262 bin_tree_t *body = node->left;
1263 bin_tree_t *op, *cls, *tree1, *tree;
1266 /* We do not optimize empty subexpressions, because otherwise we may
1267 have bad CONCAT nodes with NULL children. This is obviously not
1268 very common, so we do not lose much. An example that triggers
1269 this case is the sed "script" /\(\)/x. */
1270 && node->left != NULL
1271 && (node->token.opr.idx >= CHAR_BIT * sizeof dfa->used_bkref_map
1272 || !(dfa->used_bkref_map & (1u << node->token.opr.idx))))
1275 /* Convert the SUBEXP node to the concatenation of an
1276 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1277 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1278 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1279 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1280 tree = create_tree (dfa, op, tree1, CONCAT);
1281 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1287 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1288 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1292 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1293 nodes. Requires a postorder visit. */
1294 static reg_errcode_t
1295 calc_first (void *extra, bin_tree_t *node)
1297 re_dfa_t *dfa = (re_dfa_t *) extra;
1298 if (node->token.type == CONCAT)
1300 node->first = node->left->first;
1301 node->node_idx = node->left->node_idx;
1306 node->node_idx = re_dfa_add_node (dfa, node->token);
1307 if (BE (node->node_idx == REG_MISSING, 0))
1313 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1314 static reg_errcode_t
1315 calc_next (void *extra, bin_tree_t *node)
1317 switch (node->token.type)
1319 case OP_DUP_ASTERISK:
1320 node->left->next = node;
1323 node->left->next = node->right->first;
1324 node->right->next = node->next;
1328 node->left->next = node->next;
1330 node->right->next = node->next;
1336 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1337 static reg_errcode_t
1338 link_nfa_nodes (void *extra, bin_tree_t *node)
1340 re_dfa_t *dfa = (re_dfa_t *) extra;
1341 Idx idx = node->node_idx;
1342 reg_errcode_t err = REG_NOERROR;
1344 switch (node->token.type)
1350 assert (node->next == NULL);
1353 case OP_DUP_ASTERISK:
1357 dfa->has_plural_match = 1;
1358 if (node->left != NULL)
1359 left = node->left->first->node_idx;
1361 left = node->next->node_idx;
1362 if (node->right != NULL)
1363 right = node->right->first->node_idx;
1365 right = node->next->node_idx;
1366 assert (REG_VALID_INDEX (left));
1367 assert (REG_VALID_INDEX (right));
1368 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1373 case OP_OPEN_SUBEXP:
1374 case OP_CLOSE_SUBEXP:
1375 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1379 dfa->nexts[idx] = node->next->node_idx;
1380 if (node->token.type == OP_BACK_REF)
1381 re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1385 assert (!IS_EPSILON_NODE (node->token.type));
1386 dfa->nexts[idx] = node->next->node_idx;
1393 /* Duplicate the epsilon closure of the node ROOT_NODE.
1394 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1395 to their own constraint. */
1397 static reg_errcode_t
1398 duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node,
1399 Idx top_clone_node, Idx root_node,
1400 unsigned int init_constraint)
1402 Idx org_node, clone_node;
1404 unsigned int constraint = init_constraint;
1405 for (org_node = top_org_node, clone_node = top_clone_node;;)
1407 Idx org_dest, clone_dest;
1408 if (dfa->nodes[org_node].type == OP_BACK_REF)
1410 /* If the back reference epsilon-transit, its destination must
1411 also have the constraint. Then duplicate the epsilon closure
1412 of the destination of the back reference, and store it in
1413 edests of the back reference. */
1414 org_dest = dfa->nexts[org_node];
1415 re_node_set_empty (dfa->edests + clone_node);
1416 clone_dest = duplicate_node (dfa, org_dest, constraint);
1417 if (BE (clone_dest == REG_MISSING, 0))
1419 dfa->nexts[clone_node] = dfa->nexts[org_node];
1420 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1424 else if (dfa->edests[org_node].nelem == 0)
1426 /* In case of the node can't epsilon-transit, don't duplicate the
1427 destination and store the original destination as the
1428 destination of the node. */
1429 dfa->nexts[clone_node] = dfa->nexts[org_node];
1432 else if (dfa->edests[org_node].nelem == 1)
1434 /* In case of the node can epsilon-transit, and it has only one
1436 org_dest = dfa->edests[org_node].elems[0];
1437 re_node_set_empty (dfa->edests + clone_node);
1438 if (dfa->nodes[org_node].type == ANCHOR)
1440 /* In case of the node has another constraint, append it. */
1441 if (org_node == root_node && clone_node != org_node)
1443 /* ...but if the node is root_node itself, it means the
1444 epsilon closure have a loop, then tie it to the
1445 destination of the root_node. */
1446 ok = re_node_set_insert (dfa->edests + clone_node,
1452 constraint |= dfa->nodes[org_node].opr.ctx_type;
1454 clone_dest = duplicate_node (dfa, org_dest, constraint);
1455 if (BE (clone_dest == REG_MISSING, 0))
1457 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1461 else /* dfa->edests[org_node].nelem == 2 */
1463 /* In case of the node can epsilon-transit, and it has two
1464 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1465 org_dest = dfa->edests[org_node].elems[0];
1466 re_node_set_empty (dfa->edests + clone_node);
1467 /* Search for a duplicated node which satisfies the constraint. */
1468 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1469 if (clone_dest == REG_MISSING)
1471 /* There are no such a duplicated node, create a new one. */
1473 clone_dest = duplicate_node (dfa, org_dest, constraint);
1474 if (BE (clone_dest == REG_MISSING, 0))
1476 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1479 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1480 root_node, constraint);
1481 if (BE (err != REG_NOERROR, 0))
1486 /* There are a duplicated node which satisfy the constraint,
1487 use it to avoid infinite loop. */
1488 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1493 org_dest = dfa->edests[org_node].elems[1];
1494 clone_dest = duplicate_node (dfa, org_dest, constraint);
1495 if (BE (clone_dest == REG_MISSING, 0))
1497 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1501 org_node = org_dest;
1502 clone_node = clone_dest;
1507 /* Search for a node which is duplicated from the node ORG_NODE, and
1508 satisfies the constraint CONSTRAINT. */
1511 search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1512 unsigned int constraint)
1515 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1517 if (org_node == dfa->org_indices[idx]
1518 && constraint == dfa->nodes[idx].constraint)
1519 return idx; /* Found. */
1521 return REG_MISSING; /* Not found. */
1524 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1525 Return the index of the new node, or REG_MISSING if insufficient storage is
1529 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1531 Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1532 if (BE (dup_idx != REG_MISSING, 1))
1534 dfa->nodes[dup_idx].constraint = constraint;
1535 if (dfa->nodes[org_idx].type == ANCHOR)
1536 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
1537 dfa->nodes[dup_idx].duplicated = 1;
1539 /* Store the index of the original node. */
1540 dfa->org_indices[dup_idx] = org_idx;
1545 static reg_errcode_t
1546 calc_inveclosure (re_dfa_t *dfa)
1550 for (idx = 0; idx < dfa->nodes_len; ++idx)
1551 re_node_set_init_empty (dfa->inveclosures + idx);
1553 for (src = 0; src < dfa->nodes_len; ++src)
1555 Idx *elems = dfa->eclosures[src].elems;
1556 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1558 ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1567 /* Calculate "eclosure" for all the node in DFA. */
1569 static reg_errcode_t
1570 calc_eclosure (re_dfa_t *dfa)
1575 assert (dfa->nodes_len > 0);
1578 /* For each nodes, calculate epsilon closure. */
1579 for (node_idx = 0; ; ++node_idx)
1582 re_node_set eclosure_elem;
1583 if (node_idx == dfa->nodes_len)
1592 assert (dfa->eclosures[node_idx].nelem != REG_MISSING);
1595 /* If we have already calculated, skip it. */
1596 if (dfa->eclosures[node_idx].nelem != 0)
1598 /* Calculate epsilon closure of `node_idx'. */
1599 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1600 if (BE (err != REG_NOERROR, 0))
1603 if (dfa->eclosures[node_idx].nelem == 0)
1606 re_node_set_free (&eclosure_elem);
1612 /* Calculate epsilon closure of NODE. */
1614 static reg_errcode_t
1615 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1618 unsigned int constraint;
1622 re_node_set eclosure;
1624 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1625 if (BE (err != REG_NOERROR, 0))
1628 /* This indicates that we are calculating this node now.
1629 We reference this value to avoid infinite loop. */
1630 dfa->eclosures[node].nelem = REG_MISSING;
1632 constraint = ((dfa->nodes[node].type == ANCHOR)
1633 ? dfa->nodes[node].opr.ctx_type : 0);
1634 /* If the current node has constraints, duplicate all nodes.
1635 Since they must inherit the constraints. */
1637 && dfa->edests[node].nelem
1638 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1640 Idx org_node, cur_node;
1641 org_node = cur_node = node;
1642 err = duplicate_node_closure (dfa, node, node, node, constraint);
1643 if (BE (err != REG_NOERROR, 0))
1647 /* Expand each epsilon destination nodes. */
1648 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1649 for (i = 0; i < dfa->edests[node].nelem; ++i)
1651 re_node_set eclosure_elem;
1652 Idx edest = dfa->edests[node].elems[i];
1653 /* If calculating the epsilon closure of `edest' is in progress,
1654 return intermediate result. */
1655 if (dfa->eclosures[edest].nelem == REG_MISSING)
1660 /* If we haven't calculated the epsilon closure of `edest' yet,
1661 calculate now. Otherwise use calculated epsilon closure. */
1662 if (dfa->eclosures[edest].nelem == 0)
1664 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1665 if (BE (err != REG_NOERROR, 0))
1669 eclosure_elem = dfa->eclosures[edest];
1670 /* Merge the epsilon closure of `edest'. */
1671 re_node_set_merge (&eclosure, &eclosure_elem);
1672 /* If the epsilon closure of `edest' is incomplete,
1673 the epsilon closure of this node is also incomplete. */
1674 if (dfa->eclosures[edest].nelem == 0)
1677 re_node_set_free (&eclosure_elem);
1681 /* Epsilon closures include itself. */
1682 ok = re_node_set_insert (&eclosure, node);
1685 if (incomplete && !root)
1686 dfa->eclosures[node].nelem = 0;
1688 dfa->eclosures[node] = eclosure;
1689 *new_set = eclosure;
1693 /* Functions for token which are used in the parser. */
1695 /* Fetch a token from INPUT.
1696 We must not use this function inside bracket expressions. */
1699 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1701 re_string_skip_bytes (input, peek_token (result, input, syntax));
1704 /* Peek a token from INPUT, and return the length of the token.
1705 We must not use this function inside bracket expressions. */
1708 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1712 if (re_string_eoi (input))
1714 token->type = END_OF_RE;
1718 c = re_string_peek_byte (input, 0);
1721 token->word_char = 0;
1722 #ifdef RE_ENABLE_I18N
1723 token->mb_partial = 0;
1724 if (input->mb_cur_max > 1 &&
1725 !re_string_first_byte (input, re_string_cur_idx (input)))
1727 token->type = CHARACTER;
1728 token->mb_partial = 1;
1735 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1737 token->type = BACK_SLASH;
1741 c2 = re_string_peek_byte_case (input, 1);
1743 token->type = CHARACTER;
1744 #ifdef RE_ENABLE_I18N
1745 if (input->mb_cur_max > 1)
1747 wint_t wc = re_string_wchar_at (input,
1748 re_string_cur_idx (input) + 1);
1749 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1753 token->word_char = IS_WORD_CHAR (c2) != 0;
1758 if (!(syntax & REG_LIMITED_OPS) && !(syntax & REG_NO_BK_VBAR))
1759 token->type = OP_ALT;
1761 case '1': case '2': case '3': case '4': case '5':
1762 case '6': case '7': case '8': case '9':
1763 if (!(syntax & REG_NO_BK_REFS))
1765 token->type = OP_BACK_REF;
1766 token->opr.idx = c2 - '1';
1770 if (!(syntax & REG_NO_GNU_OPS))
1772 token->type = ANCHOR;
1773 token->opr.ctx_type = WORD_FIRST;
1777 if (!(syntax & REG_NO_GNU_OPS))
1779 token->type = ANCHOR;
1780 token->opr.ctx_type = WORD_LAST;
1784 if (!(syntax & REG_NO_GNU_OPS))
1786 token->type = ANCHOR;
1787 token->opr.ctx_type = WORD_DELIM;
1791 if (!(syntax & REG_NO_GNU_OPS))
1793 token->type = ANCHOR;
1794 token->opr.ctx_type = NOT_WORD_DELIM;
1798 if (!(syntax & REG_NO_GNU_OPS))
1799 token->type = OP_WORD;
1802 if (!(syntax & REG_NO_GNU_OPS))
1803 token->type = OP_NOTWORD;
1806 if (!(syntax & REG_NO_GNU_OPS))
1807 token->type = OP_SPACE;
1810 if (!(syntax & REG_NO_GNU_OPS))
1811 token->type = OP_NOTSPACE;
1814 if (!(syntax & REG_NO_GNU_OPS))
1816 token->type = ANCHOR;
1817 token->opr.ctx_type = BUF_FIRST;
1821 if (!(syntax & REG_NO_GNU_OPS))
1823 token->type = ANCHOR;
1824 token->opr.ctx_type = BUF_LAST;
1828 if (!(syntax & REG_NO_BK_PARENS))
1829 token->type = OP_OPEN_SUBEXP;
1832 if (!(syntax & REG_NO_BK_PARENS))
1833 token->type = OP_CLOSE_SUBEXP;
1836 if (!(syntax & REG_LIMITED_OPS) && (syntax & REG_BK_PLUS_QM))
1837 token->type = OP_DUP_PLUS;
1840 if (!(syntax & REG_LIMITED_OPS) && (syntax & REG_BK_PLUS_QM))
1841 token->type = OP_DUP_QUESTION;
1844 if ((syntax & REG_INTERVALS) && (!(syntax & REG_NO_BK_BRACES)))
1845 token->type = OP_OPEN_DUP_NUM;
1848 if ((syntax & REG_INTERVALS) && (!(syntax & REG_NO_BK_BRACES)))
1849 token->type = OP_CLOSE_DUP_NUM;
1857 token->type = CHARACTER;
1858 #ifdef RE_ENABLE_I18N
1859 if (input->mb_cur_max > 1)
1861 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1862 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1866 token->word_char = IS_WORD_CHAR (token->opr.c);
1871 if (syntax & REG_NEWLINE_ALT)
1872 token->type = OP_ALT;
1875 if (!(syntax & REG_LIMITED_OPS) && (syntax & REG_NO_BK_VBAR))
1876 token->type = OP_ALT;
1879 token->type = OP_DUP_ASTERISK;
1882 if (!(syntax & REG_LIMITED_OPS) && !(syntax & REG_BK_PLUS_QM))
1883 token->type = OP_DUP_PLUS;
1886 if (!(syntax & REG_LIMITED_OPS) && !(syntax & REG_BK_PLUS_QM))
1887 token->type = OP_DUP_QUESTION;
1890 if ((syntax & REG_INTERVALS) && (syntax & REG_NO_BK_BRACES))
1891 token->type = OP_OPEN_DUP_NUM;
1894 if ((syntax & REG_INTERVALS) && (syntax & REG_NO_BK_BRACES))
1895 token->type = OP_CLOSE_DUP_NUM;
1898 if (syntax & REG_NO_BK_PARENS)
1899 token->type = OP_OPEN_SUBEXP;
1902 if (syntax & REG_NO_BK_PARENS)
1903 token->type = OP_CLOSE_SUBEXP;
1906 token->type = OP_OPEN_BRACKET;
1909 token->type = OP_PERIOD;
1912 if (!(syntax & (REG_CONTEXT_INDEP_ANCHORS | REG_CARET_ANCHORS_HERE)) &&
1913 re_string_cur_idx (input) != 0)
1915 char prev = re_string_peek_byte (input, -1);
1916 if (!(syntax & REG_NEWLINE_ALT) || prev != '\n')
1919 token->type = ANCHOR;
1920 token->opr.ctx_type = LINE_FIRST;
1923 if (!(syntax & REG_CONTEXT_INDEP_ANCHORS) &&
1924 re_string_cur_idx (input) + 1 != re_string_length (input))
1927 re_string_skip_bytes (input, 1);
1928 peek_token (&next, input, syntax);
1929 re_string_skip_bytes (input, -1);
1930 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1933 token->type = ANCHOR;
1934 token->opr.ctx_type = LINE_LAST;
1942 /* Peek a token from INPUT, and return the length of the token.
1943 We must not use this function out of bracket expressions. */
1946 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1949 if (re_string_eoi (input))
1951 token->type = END_OF_RE;
1954 c = re_string_peek_byte (input, 0);
1957 #ifdef RE_ENABLE_I18N
1958 if (input->mb_cur_max > 1 &&
1959 !re_string_first_byte (input, re_string_cur_idx (input)))
1961 token->type = CHARACTER;
1964 #endif /* RE_ENABLE_I18N */
1966 if (c == '\\' && (syntax & REG_BACKSLASH_ESCAPE_IN_LISTS)
1967 && re_string_cur_idx (input) + 1 < re_string_length (input))
1969 /* In this case, '\' escape a character. */
1971 re_string_skip_bytes (input, 1);
1972 c2 = re_string_peek_byte (input, 0);
1974 token->type = CHARACTER;
1977 if (c == '[') /* '[' is a special char in a bracket exps. */
1981 if (re_string_cur_idx (input) + 1 < re_string_length (input))
1982 c2 = re_string_peek_byte (input, 1);
1990 token->type = OP_OPEN_COLL_ELEM;
1993 token->type = OP_OPEN_EQUIV_CLASS;
1996 if (syntax & REG_CHAR_CLASSES)
1998 token->type = OP_OPEN_CHAR_CLASS;
2001 /* else fall through. */
2003 token->type = CHARACTER;
2013 token->type = OP_CHARSET_RANGE;
2016 token->type = OP_CLOSE_BRACKET;
2019 token->type = OP_NON_MATCH_LIST;
2022 token->type = CHARACTER;
2027 /* Functions for parser. */
2029 /* Entry point of the parser.
2030 Parse the regular expression REGEXP and return the structure tree.
2031 If an error is occured, ERR is set by error code, and return NULL.
2032 This function build the following tree, from regular expression <reg_exp>:
2038 CAT means concatenation.
2039 EOR means end of regular expression. */
2042 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2045 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
2046 bin_tree_t *tree, *eor, *root;
2047 re_token_t current_token;
2048 dfa->syntax = syntax;
2049 fetch_token (¤t_token, regexp, syntax | REG_CARET_ANCHORS_HERE);
2050 tree = parse_reg_exp (regexp, preg, ¤t_token, syntax, 0, err);
2051 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2053 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2055 root = create_tree (dfa, tree, eor, CONCAT);
2058 if (BE (eor == NULL || root == NULL, 0))
2066 /* This function build the following tree, from regular expression
2067 <branch1>|<branch2>:
2073 ALT means alternative, which represents the operator `|'. */
2076 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2077 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2079 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
2080 bin_tree_t *tree, *branch = NULL;
2081 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2082 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2085 while (token->type == OP_ALT)
2087 fetch_token (token, regexp, syntax | REG_CARET_ANCHORS_HERE);
2088 if (token->type != OP_ALT && token->type != END_OF_RE
2089 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2091 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2092 if (BE (*err != REG_NOERROR && branch == NULL, 0))
2097 tree = create_tree (dfa, tree, branch, OP_ALT);
2098 if (BE (tree == NULL, 0))
2107 /* This function build the following tree, from regular expression
2114 CAT means concatenation. */
2117 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2118 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2120 bin_tree_t *tree, *exp;
2121 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
2122 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2123 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2126 while (token->type != OP_ALT && token->type != END_OF_RE
2127 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2129 exp = parse_expression (regexp, preg, token, syntax, nest, err);
2130 if (BE (*err != REG_NOERROR && exp == NULL, 0))
2134 if (tree != NULL && exp != NULL)
2136 tree = create_tree (dfa, tree, exp, CONCAT);
2143 else if (tree == NULL)
2145 /* Otherwise exp == NULL, we don't need to create new tree. */
2150 /* This function build the following tree, from regular expression a*:
2157 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2158 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2160 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
2162 switch (token->type)
2165 tree = create_token_tree (dfa, NULL, NULL, token);
2166 if (BE (tree == NULL, 0))
2171 #ifdef RE_ENABLE_I18N
2172 if (dfa->mb_cur_max > 1)
2174 while (!re_string_eoi (regexp)
2175 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2177 bin_tree_t *mbc_remain;
2178 fetch_token (token, regexp, syntax);
2179 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2180 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2181 if (BE (mbc_remain == NULL || tree == NULL, 0))
2190 case OP_OPEN_SUBEXP:
2191 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2192 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2195 case OP_OPEN_BRACKET:
2196 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2197 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2201 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2206 dfa->used_bkref_map |= 1 << token->opr.idx;
2207 tree = create_token_tree (dfa, NULL, NULL, token);
2208 if (BE (tree == NULL, 0))
2214 dfa->has_mb_node = 1;
2216 case OP_OPEN_DUP_NUM:
2217 if (syntax & REG_CONTEXT_INVALID_DUP)
2223 case OP_DUP_ASTERISK:
2225 case OP_DUP_QUESTION:
2226 if (syntax & REG_CONTEXT_INVALID_OPS)
2231 else if (syntax & REG_CONTEXT_INDEP_OPS)
2233 fetch_token (token, regexp, syntax);
2234 return parse_expression (regexp, preg, token, syntax, nest, err);
2236 /* else fall through */
2237 case OP_CLOSE_SUBEXP:
2238 if ((token->type == OP_CLOSE_SUBEXP) &&
2239 !(syntax & REG_UNMATCHED_RIGHT_PAREN_ORD))
2244 /* else fall through */
2245 case OP_CLOSE_DUP_NUM:
2246 /* We treat it as a normal character. */
2248 /* Then we can these characters as normal characters. */
2249 token->type = CHARACTER;
2250 /* mb_partial and word_char bits should be initialized already
2252 tree = create_token_tree (dfa, NULL, NULL, token);
2253 if (BE (tree == NULL, 0))
2260 if ((token->opr.ctx_type
2261 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2262 && dfa->word_ops_used == 0)
2263 init_word_char (dfa);
2264 if (token->opr.ctx_type == WORD_DELIM
2265 || token->opr.ctx_type == NOT_WORD_DELIM)
2267 bin_tree_t *tree_first, *tree_last;
2268 if (token->opr.ctx_type == WORD_DELIM)
2270 token->opr.ctx_type = WORD_FIRST;
2271 tree_first = create_token_tree (dfa, NULL, NULL, token);
2272 token->opr.ctx_type = WORD_LAST;
2276 token->opr.ctx_type = INSIDE_WORD;
2277 tree_first = create_token_tree (dfa, NULL, NULL, token);
2278 token->opr.ctx_type = INSIDE_NOTWORD;
2280 tree_last = create_token_tree (dfa, NULL, NULL, token);
2281 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2282 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2290 tree = create_token_tree (dfa, NULL, NULL, token);
2291 if (BE (tree == NULL, 0))
2297 /* We must return here, since ANCHORs can't be followed
2298 by repetition operators.
2299 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2300 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2301 fetch_token (token, regexp, syntax);
2304 tree = create_token_tree (dfa, NULL, NULL, token);
2305 if (BE (tree == NULL, 0))
2310 if (dfa->mb_cur_max > 1)
2311 dfa->has_mb_node = 1;
2315 tree = build_charclass_op (dfa, regexp->trans,
2316 (const unsigned char *) "alnum",
2317 (const unsigned char *) "_",
2318 token->type == OP_NOTWORD, err);
2319 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2324 tree = build_charclass_op (dfa, regexp->trans,
2325 (const unsigned char *) "space",
2326 (const unsigned char *) "",
2327 token->type == OP_NOTSPACE, err);
2328 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2338 /* Must not happen? */
2344 fetch_token (token, regexp, syntax);
2346 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2347 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2349 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2350 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2352 /* In BRE consecutive duplications are not allowed. */
2353 if ((syntax & REG_CONTEXT_INVALID_DUP)
2354 && (token->type == OP_DUP_ASTERISK
2355 || token->type == OP_OPEN_DUP_NUM))
2365 /* This function build the following tree, from regular expression
2373 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2374 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2376 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
2379 cur_nsub = preg->re_nsub++;
2381 fetch_token (token, regexp, syntax | REG_CARET_ANCHORS_HERE);
2383 /* The subexpression may be a null string. */
2384 if (token->type == OP_CLOSE_SUBEXP)
2388 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2389 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2391 if (BE (*err != REG_NOERROR, 0))
2395 if (cur_nsub <= '9' - '1')
2396 dfa->completed_bkref_map |= 1 << cur_nsub;
2398 tree = create_tree (dfa, tree, NULL, SUBEXP);
2399 if (BE (tree == NULL, 0))
2404 tree->token.opr.idx = cur_nsub;
2408 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2411 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2412 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2414 bin_tree_t *tree = NULL, *old_tree = NULL;
2415 Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2416 re_token_t start_token = *token;
2418 if (token->type == OP_OPEN_DUP_NUM)
2421 start = fetch_number (regexp, token, syntax);
2422 if (start == REG_MISSING)
2424 if (token->type == CHARACTER && token->opr.c == ',')
2425 start = 0; /* We treat "{,m}" as "{0,m}". */
2428 *err = REG_BADBR; /* <re>{} is invalid. */
2432 if (BE (start != REG_ERROR, 1))
2434 /* We treat "{n}" as "{n,n}". */
2435 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2436 : ((token->type == CHARACTER && token->opr.c == ',')
2437 ? fetch_number (regexp, token, syntax) : REG_ERROR));
2439 if (BE (start == REG_ERROR || end == REG_ERROR, 0))
2441 /* Invalid sequence. */
2442 if (BE (!(syntax & REG_INVALID_INTERVAL_ORD), 0))
2444 if (token->type == END_OF_RE)
2452 /* If the syntax bit is set, rollback. */
2453 re_string_set_index (regexp, start_idx);
2454 *token = start_token;
2455 token->type = CHARACTER;
2456 /* mb_partial and word_char bits should be already initialized by
2461 if (BE (end != REG_MISSING && start > end, 0))
2463 /* First number greater than second. */
2470 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2471 end = (token->type == OP_DUP_QUESTION) ? 1 : REG_MISSING;
2474 fetch_token (token, regexp, syntax);
2476 if (BE (elem == NULL, 0))
2478 if (BE (start == 0 && end == 0, 0))
2480 postorder (elem, free_tree, NULL);
2484 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2485 if (BE (start > 0, 0))
2488 for (i = 2; i <= start; ++i)
2490 elem = duplicate_tree (elem, dfa);
2491 tree = create_tree (dfa, tree, elem, CONCAT);
2492 if (BE (elem == NULL || tree == NULL, 0))
2493 goto parse_dup_op_espace;
2499 /* Duplicate ELEM before it is marked optional. */
2500 elem = duplicate_tree (elem, dfa);
2506 if (elem->token.type == SUBEXP)
2507 postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2509 tree = create_tree (dfa, elem, NULL,
2510 (end == REG_MISSING ? OP_DUP_ASTERISK : OP_ALT));
2511 if (BE (tree == NULL, 0))
2512 goto parse_dup_op_espace;
2514 /* This loop is actually executed only when end != REG_MISSING,
2515 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2516 already created the start+1-th copy. */
2517 if ((Idx) -1 < 0 || end != REG_MISSING)
2518 for (i = start + 2; i <= end; ++i)
2520 elem = duplicate_tree (elem, dfa);
2521 tree = create_tree (dfa, tree, elem, CONCAT);
2522 if (BE (elem == NULL || tree == NULL, 0))
2523 goto parse_dup_op_espace;
2525 tree = create_tree (dfa, tree, NULL, OP_ALT);
2526 if (BE (tree == NULL, 0))
2527 goto parse_dup_op_espace;
2531 tree = create_tree (dfa, old_tree, tree, CONCAT);
2535 parse_dup_op_espace:
2540 /* Size of the names for collating symbol/equivalence_class/character_class.
2541 I'm not sure, but maybe enough. */
2542 #define BRACKET_NAME_BUF_SIZE 32
2545 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2546 Build the range expression which starts from START_ELEM, and ends
2547 at END_ELEM. The result are written to MBCSET and SBCSET.
2548 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2549 mbcset->range_ends, is a pointer argument sinse we may
2552 static reg_errcode_t
2553 build_range_exp (re_bitset_ptr_t sbcset,
2554 # ifdef RE_ENABLE_I18N
2555 re_charset_t *mbcset, Idx *range_alloc,
2557 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2559 unsigned int start_ch, end_ch;
2560 /* Equivalence Classes and Character Classes can't be a range start/end. */
2561 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2562 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2566 /* We can handle no multi character collating elements without libc
2568 if (BE ((start_elem->type == COLL_SYM
2569 && strlen ((char *) start_elem->opr.name) > 1)
2570 || (end_elem->type == COLL_SYM
2571 && strlen ((char *) end_elem->opr.name) > 1), 0))
2572 return REG_ECOLLATE;
2574 # ifdef RE_ENABLE_I18N
2577 wint_t start_wc, end_wc;
2578 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2580 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2581 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2583 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2584 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2586 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2587 ? __btowc (start_ch) : start_elem->opr.wch);
2588 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2589 ? __btowc (end_ch) : end_elem->opr.wch);
2590 if (start_wc == WEOF || end_wc == WEOF)
2591 return REG_ECOLLATE;
2592 cmp_buf[0] = start_wc;
2593 cmp_buf[4] = end_wc;
2594 if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2597 /* Got valid collation sequence values, add them as a new entry.
2598 However, for !_LIBC we have no collation elements: if the
2599 character set is single byte, the single byte character set
2600 that we build below suffices. parse_bracket_exp passes
2601 no MBCSET if dfa->mb_cur_max == 1. */
2604 /* Check the space of the arrays. */
2605 if (BE (*range_alloc == mbcset->nranges, 0))
2607 /* There is not enough space, need realloc. */
2608 wchar_t *new_array_start, *new_array_end;
2611 new_nranges = mbcset->nranges;
2612 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2613 are NULL if *range_alloc == 0. */
2614 new_array_start = re_x2realloc (mbcset->range_starts, wchar_t,
2616 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2619 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2622 mbcset->range_starts = new_array_start;
2623 mbcset->range_ends = new_array_end;
2624 *range_alloc = new_nranges;
2627 mbcset->range_starts[mbcset->nranges] = start_wc;
2628 mbcset->range_ends[mbcset->nranges++] = end_wc;
2631 /* Build the table for single byte characters. */
2632 for (wc = 0; wc < SBC_MAX; ++wc)
2635 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2636 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2637 bitset_set (sbcset, wc);
2640 # else /* not RE_ENABLE_I18N */
2643 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2644 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2646 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2647 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2649 if (start_ch > end_ch)
2651 /* Build the table for single byte characters. */
2652 for (ch = 0; ch < SBC_MAX; ++ch)
2653 if (start_ch <= ch && ch <= end_ch)
2654 bitset_set (sbcset, ch);
2656 # endif /* not RE_ENABLE_I18N */
2659 #endif /* not _LIBC */
2662 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2663 Build the collating element which is represented by NAME.
2664 The result are written to MBCSET and SBCSET.
2665 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2666 pointer argument since we may update it. */
2668 static reg_errcode_t
2669 build_collating_symbol (re_bitset_ptr_t sbcset,
2670 # ifdef RE_ENABLE_I18N
2671 re_charset_t *mbcset, Idx *coll_sym_alloc,
2673 const unsigned char *name)
2675 size_t name_len = strlen ((const char *) name);
2676 if (BE (name_len != 1, 0))
2677 return REG_ECOLLATE;
2680 bitset_set (sbcset, name[0]);
2684 #endif /* not _LIBC */
2686 /* This function parse bracket expression like "[abc]", "[a-c]",
2690 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2691 reg_syntax_t syntax, reg_errcode_t *err)
2694 const unsigned char *collseqmb;
2695 const char *collseqwc;
2698 const int32_t *symb_table;
2699 const unsigned char *extra;
2701 /* Local function for parse_bracket_exp used in _LIBC environement.
2702 Seek the collating symbol entry correspondings to NAME.
2703 Return the index of the symbol in the SYMB_TABLE. */
2706 __attribute ((always_inline))
2707 seek_collating_symbol_entry (const unsigned char *name, size_t name_len)
2709 int32_t hash = elem_hash ((const char *) name, name_len);
2710 int32_t elem = hash % table_size;
2711 int32_t second = hash % (table_size - 2);
2712 while (symb_table[2 * elem] != 0)
2714 /* First compare the hashing value. */
2715 if (symb_table[2 * elem] == hash
2716 /* Compare the length of the name. */
2717 && name_len == extra[symb_table[2 * elem + 1]]
2718 /* Compare the name. */
2719 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2722 /* Yep, this is the entry. */
2732 /* Local function for parse_bracket_exp used in _LIBC environement.
2733 Look up the collation sequence value of BR_ELEM.
2734 Return the value if succeeded, UINT_MAX otherwise. */
2736 auto inline unsigned int
2737 __attribute ((always_inline))
2738 lookup_collation_sequence_value (bracket_elem_t *br_elem)
2740 if (br_elem->type == SB_CHAR)
2743 if (MB_CUR_MAX == 1)
2746 return collseqmb[br_elem->opr.ch];
2749 wint_t wc = __btowc (br_elem->opr.ch);
2750 return __collseq_table_lookup (collseqwc, wc);
2753 else if (br_elem->type == MB_CHAR)
2755 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2757 else if (br_elem->type == COLL_SYM)
2759 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2763 elem = seek_collating_symbol_entry (br_elem->opr.name,
2765 if (symb_table[2 * elem] != 0)
2767 /* We found the entry. */
2768 idx = symb_table[2 * elem + 1];
2769 /* Skip the name of collating element name. */
2770 idx += 1 + extra[idx];
2771 /* Skip the byte sequence of the collating element. */
2772 idx += 1 + extra[idx];
2773 /* Adjust for the alignment. */
2774 idx = (idx + 3) & ~3;
2775 /* Skip the multibyte collation sequence value. */
2776 idx += sizeof (unsigned int);
2777 /* Skip the wide char sequence of the collating element. */
2778 idx += sizeof (unsigned int) *
2779 (1 + *(unsigned int *) (extra + idx));
2780 /* Return the collation sequence value. */
2781 return *(unsigned int *) (extra + idx);
2783 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2785 /* No valid character. Match it as a single byte
2787 return collseqmb[br_elem->opr.name[0]];
2790 else if (sym_name_len == 1)
2791 return collseqmb[br_elem->opr.name[0]];
2796 /* Local function for parse_bracket_exp used in _LIBC environement.
2797 Build the range expression which starts from START_ELEM, and ends
2798 at END_ELEM. The result are written to MBCSET and SBCSET.
2799 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2800 mbcset->range_ends, is a pointer argument sinse we may
2803 auto inline reg_errcode_t
2804 __attribute ((always_inline))
2805 build_range_exp (re_bitset_ptr_t sbcset, re_charset_t *mbcset,
2807 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2810 uint32_t start_collseq;
2811 uint32_t end_collseq;
2813 /* Equivalence Classes and Character Classes can't be a range
2815 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2816 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2820 start_collseq = lookup_collation_sequence_value (start_elem);
2821 end_collseq = lookup_collation_sequence_value (end_elem);
2822 /* Check start/end collation sequence values. */
2823 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2824 return REG_ECOLLATE;
2825 if (BE ((syntax & REG_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2828 /* Got valid collation sequence values, add them as a new entry.
2829 However, if we have no collation elements, and the character set
2830 is single byte, the single byte character set that we
2831 build below suffices. */
2832 if (nrules > 0 || dfa->mb_cur_max > 1)
2834 /* Check the space of the arrays. */
2835 if (BE (*range_alloc == mbcset->nranges, 0))
2837 /* There is not enough space, need realloc. */
2838 uint32_t *new_array_start;
2839 uint32_t *new_array_end;
2842 new_nranges = mbcset->nranges;
2843 new_array_start = re_x2realloc (mbcset->range_starts, uint32_t,
2845 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2848 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2851 mbcset->range_starts = new_array_start;
2852 mbcset->range_ends = new_array_end;
2853 *range_alloc = new_nranges;
2856 mbcset->range_starts[mbcset->nranges] = start_collseq;
2857 mbcset->range_ends[mbcset->nranges++] = end_collseq;
2860 /* Build the table for single byte characters. */
2861 for (ch = 0; ch < SBC_MAX; ch++)
2863 uint32_t ch_collseq;
2865 if (MB_CUR_MAX == 1)
2868 ch_collseq = collseqmb[ch];
2870 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2871 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2872 bitset_set (sbcset, ch);
2877 /* Local function for parse_bracket_exp used in _LIBC environement.
2878 Build the collating element which is represented by NAME.
2879 The result are written to MBCSET and SBCSET.
2880 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2881 pointer argument sinse we may update it. */
2883 auto inline reg_errcode_t
2884 __attribute ((always_inline))
2885 build_collating_symbol (re_bitset_ptr_t sbcset, re_charset_t *mbcset,
2886 Idx *coll_sym_alloc, const unsigned char *name)
2889 size_t name_len = strlen ((const char *) name);
2892 elem = seek_collating_symbol_entry (name, name_len);
2893 if (symb_table[2 * elem] != 0)
2895 /* We found the entry. */
2896 idx = symb_table[2 * elem + 1];
2897 /* Skip the name of collating element name. */
2898 idx += 1 + extra[idx];
2900 else if (symb_table[2 * elem] == 0 && name_len == 1)
2902 /* No valid character, treat it as a normal
2904 bitset_set (sbcset, name[0]);
2908 return REG_ECOLLATE;
2910 /* Got valid collation sequence, add it as a new entry. */
2911 /* Check the space of the arrays. */
2912 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
2914 /* Not enough, realloc it. */
2915 Idx new_coll_sym_alloc = mbcset->ncoll_syms;
2916 /* Use realloc since mbcset->coll_syms is NULL
2918 int32_t *new_coll_syms = re_x2realloc (mbcset->coll_syms, int32_t,
2919 &new_coll_sym_alloc);
2920 if (BE (new_coll_syms == NULL, 0))
2922 mbcset->coll_syms = new_coll_syms;
2923 *coll_sym_alloc = new_coll_sym_alloc;
2925 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
2930 if (BE (name_len != 1, 0))
2931 return REG_ECOLLATE;
2934 bitset_set (sbcset, name[0]);
2941 re_token_t br_token;
2942 re_bitset_ptr_t sbcset;
2943 #ifdef RE_ENABLE_I18N
2944 re_charset_t *mbcset;
2945 Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
2946 Idx equiv_class_alloc = 0, char_class_alloc = 0;
2947 #endif /* not RE_ENABLE_I18N */
2948 bool non_match = false;
2949 bin_tree_t *work_tree;
2951 bool first_round = true;
2953 collseqmb = (const unsigned char *)
2954 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
2955 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
2961 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
2962 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
2963 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
2964 _NL_COLLATE_SYMB_TABLEMB);
2965 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
2966 _NL_COLLATE_SYMB_EXTRAMB);
2969 sbcset = re_calloc (unsigned int, BITSET_UINTS);
2970 #ifdef RE_ENABLE_I18N
2971 mbcset = re_calloc (re_charset_t, 1);
2972 #endif /* RE_ENABLE_I18N */
2973 #ifdef RE_ENABLE_I18N
2974 if (BE (sbcset == NULL || mbcset == NULL, 0))
2976 if (BE (sbcset == NULL, 0))
2977 #endif /* RE_ENABLE_I18N */
2983 token_len = peek_token_bracket (token, regexp, syntax);
2984 if (BE (token->type == END_OF_RE, 0))
2987 goto parse_bracket_exp_free_return;
2989 if (token->type == OP_NON_MATCH_LIST)
2991 #ifdef RE_ENABLE_I18N
2992 mbcset->non_match = 1;
2993 #endif /* not RE_ENABLE_I18N */
2995 if (syntax & REG_HAT_LISTS_NOT_NEWLINE)
2996 bitset_set (sbcset, '\0');
2997 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
2998 token_len = peek_token_bracket (token, regexp, syntax);
2999 if (BE (token->type == END_OF_RE, 0))
3002 goto parse_bracket_exp_free_return;
3006 /* We treat the first ']' as a normal character. */
3007 if (token->type == OP_CLOSE_BRACKET)
3008 token->type = CHARACTER;
3012 bracket_elem_t start_elem, end_elem;
3013 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3014 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3017 bool is_range_exp = false;
3020 start_elem.opr.name = start_name_buf;
3021 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3022 syntax, first_round);
3023 if (BE (ret != REG_NOERROR, 0))
3026 goto parse_bracket_exp_free_return;
3028 first_round = false;
3030 /* Get information about the next token. We need it in any case. */
3031 token_len = peek_token_bracket (token, regexp, syntax);
3033 /* Do not check for ranges if we know they are not allowed. */
3034 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3036 if (BE (token->type == END_OF_RE, 0))
3039 goto parse_bracket_exp_free_return;
3041 if (token->type == OP_CHARSET_RANGE)
3043 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3044 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3045 if (BE (token2.type == END_OF_RE, 0))
3048 goto parse_bracket_exp_free_return;
3050 if (token2.type == OP_CLOSE_BRACKET)
3052 /* We treat the last '-' as a normal character. */
3053 re_string_skip_bytes (regexp, -token_len);
3054 token->type = CHARACTER;
3057 is_range_exp = true;
3061 if (is_range_exp == true)
3063 end_elem.opr.name = end_name_buf;
3064 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3066 if (BE (ret != REG_NOERROR, 0))
3069 goto parse_bracket_exp_free_return;
3072 token_len = peek_token_bracket (token, regexp, syntax);
3075 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3076 &start_elem, &end_elem);
3078 # ifdef RE_ENABLE_I18N
3079 *err = build_range_exp (sbcset,
3080 dfa->mb_cur_max > 1 ? mbcset : NULL,
3081 &range_alloc, &start_elem, &end_elem);
3083 *err = build_range_exp (sbcset, &start_elem, &end_elem);
3085 #endif /* RE_ENABLE_I18N */
3086 if (BE (*err != REG_NOERROR, 0))
3087 goto parse_bracket_exp_free_return;
3091 switch (start_elem.type)
3094 bitset_set (sbcset, start_elem.opr.ch);
3096 #ifdef RE_ENABLE_I18N
3098 /* Check whether the array has enough space. */
3099 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3101 wchar_t *new_mbchars;
3102 /* Not enough, realloc it. */
3103 mbchar_alloc = mbcset->nmbchars;
3104 /* Use realloc since array is NULL if *alloc == 0. */
3105 new_mbchars = re_x2realloc (mbcset->mbchars, wchar_t,
3107 if (BE (new_mbchars == NULL, 0))
3108 goto parse_bracket_exp_espace;
3109 mbcset->mbchars = new_mbchars;
3111 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3113 #endif /* RE_ENABLE_I18N */
3115 *err = build_equiv_class (sbcset,
3116 #ifdef RE_ENABLE_I18N
3117 mbcset, &equiv_class_alloc,
3118 #endif /* RE_ENABLE_I18N */
3119 start_elem.opr.name);
3120 if (BE (*err != REG_NOERROR, 0))
3121 goto parse_bracket_exp_free_return;
3124 *err = build_collating_symbol (sbcset,
3125 #ifdef RE_ENABLE_I18N
3126 mbcset, &coll_sym_alloc,
3127 #endif /* RE_ENABLE_I18N */
3128 start_elem.opr.name);
3129 if (BE (*err != REG_NOERROR, 0))
3130 goto parse_bracket_exp_free_return;
3133 *err = build_charclass (regexp->trans, sbcset,
3134 #ifdef RE_ENABLE_I18N
3135 mbcset, &char_class_alloc,
3136 #endif /* RE_ENABLE_I18N */
3137 start_elem.opr.name, syntax);
3138 if (BE (*err != REG_NOERROR, 0))
3139 goto parse_bracket_exp_free_return;
3146 if (BE (token->type == END_OF_RE, 0))
3149 goto parse_bracket_exp_free_return;
3151 if (token->type == OP_CLOSE_BRACKET)
3155 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3157 /* If it is non-matching list. */
3159 bitset_not (sbcset);
3161 #ifdef RE_ENABLE_I18N
3162 /* Ensure only single byte characters are set. */
3163 if (dfa->mb_cur_max > 1)
3164 bitset_mask (sbcset, dfa->sb_char);
3166 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3167 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3168 || mbcset->non_match)))
3170 bin_tree_t *mbc_tree;
3172 /* Build a tree for complex bracket. */
3173 dfa->has_mb_node = 1;
3174 br_token.type = COMPLEX_BRACKET;
3175 br_token.opr.mbcset = mbcset;
3176 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3177 if (BE (mbc_tree == NULL, 0))
3178 goto parse_bracket_exp_espace;
3179 for (sbc_idx = 0; sbc_idx < BITSET_UINTS; ++sbc_idx)
3180 if (sbcset[sbc_idx])
3182 /* If there are no bits set in sbcset, there is no point
3183 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3184 if (sbc_idx < BITSET_UINTS)
3186 /* Build a tree for simple bracket. */
3187 br_token.type = SIMPLE_BRACKET;
3188 br_token.opr.sbcset = sbcset;
3189 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3190 if (BE (work_tree == NULL, 0))
3191 goto parse_bracket_exp_espace;
3193 /* Then join them by ALT node. */
3194 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3195 if (BE (work_tree == NULL, 0))
3196 goto parse_bracket_exp_espace;
3201 work_tree = mbc_tree;
3205 #endif /* not RE_ENABLE_I18N */
3207 #ifdef RE_ENABLE_I18N
3208 free_charset (mbcset);
3210 /* Build a tree for simple bracket. */
3211 br_token.type = SIMPLE_BRACKET;
3212 br_token.opr.sbcset = sbcset;
3213 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3214 if (BE (work_tree == NULL, 0))
3215 goto parse_bracket_exp_espace;
3219 parse_bracket_exp_espace:
3221 parse_bracket_exp_free_return:
3223 #ifdef RE_ENABLE_I18N
3224 free_charset (mbcset);
3225 #endif /* RE_ENABLE_I18N */
3229 /* Parse an element in the bracket expression. */
3231 static reg_errcode_t
3232 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3233 re_token_t *token, int token_len, re_dfa_t *dfa,
3234 reg_syntax_t syntax, bool accept_hyphen)
3236 #ifdef RE_ENABLE_I18N
3238 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3239 if (cur_char_size > 1)
3241 elem->type = MB_CHAR;
3242 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3243 re_string_skip_bytes (regexp, cur_char_size);
3246 #endif /* RE_ENABLE_I18N */
3247 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3248 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3249 || token->type == OP_OPEN_EQUIV_CLASS)
3250 return parse_bracket_symbol (elem, regexp, token);
3251 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3253 /* A '-' must only appear as anything but a range indicator before
3254 the closing bracket. Everything else is an error. */
3256 (void) peek_token_bracket (&token2, regexp, syntax);
3257 if (token2.type != OP_CLOSE_BRACKET)
3258 /* The actual error value is not standardized since this whole
3259 case is undefined. But ERANGE makes good sense. */
3262 elem->type = SB_CHAR;
3263 elem->opr.ch = token->opr.c;
3267 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3268 such as [:<character_class>:], [.<collating_element>.], and
3269 [=<equivalent_class>=]. */
3271 static reg_errcode_t
3272 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3275 unsigned char ch, delim = token->opr.c;
3277 if (re_string_eoi(regexp))
3281 if (i >= BRACKET_NAME_BUF_SIZE)
3283 if (token->type == OP_OPEN_CHAR_CLASS)
3284 ch = re_string_fetch_byte_case (regexp);
3286 ch = re_string_fetch_byte (regexp);
3287 if (re_string_eoi(regexp))
3289 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3291 elem->opr.name[i] = ch;
3293 re_string_skip_bytes (regexp, 1);
3294 elem->opr.name[i] = '\0';
3295 switch (token->type)
3297 case OP_OPEN_COLL_ELEM:
3298 elem->type = COLL_SYM;
3300 case OP_OPEN_EQUIV_CLASS:
3301 elem->type = EQUIV_CLASS;
3303 case OP_OPEN_CHAR_CLASS:
3304 elem->type = CHAR_CLASS;
3312 /* Helper function for parse_bracket_exp.
3313 Build the equivalence class which is represented by NAME.
3314 The result are written to MBCSET and SBCSET.
3315 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3316 is a pointer argument sinse we may update it. */
3318 static reg_errcode_t
3319 build_equiv_class (re_bitset_ptr_t sbcset,
3320 #ifdef RE_ENABLE_I18N
3321 re_charset_t *mbcset, Idx *equiv_class_alloc,
3323 const unsigned char *name)
3326 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3329 const int32_t *table, *indirect;
3330 const unsigned char *weights, *extra, *cp;
3331 unsigned char char_buf[2];
3335 /* This #include defines a local function! */
3336 # include <locale/weight.h>
3337 /* Calculate the index for equivalence class. */
3339 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3340 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3341 _NL_COLLATE_WEIGHTMB);
3342 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3343 _NL_COLLATE_EXTRAMB);
3344 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3345 _NL_COLLATE_INDIRECTMB);
3346 idx1 = findidx (&cp);
3347 if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3348 /* This isn't a valid character. */
3349 return REG_ECOLLATE;
3351 /* Build single byte matcing table for this equivalence class. */
3352 char_buf[1] = (unsigned char) '\0';
3353 len = weights[idx1];
3354 for (ch = 0; ch < SBC_MAX; ++ch)
3358 idx2 = findidx (&cp);
3363 /* This isn't a valid character. */
3365 if (len == weights[idx2])
3368 while (cnt <= len &&
3369 weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3373 bitset_set (sbcset, ch);
3376 /* Check whether the array has enough space. */
3377 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3379 /* Not enough, realloc it. */
3380 Idx new_equiv_class_alloc = mbcset->nequiv_classes;
3381 /* Use realloc since the array is NULL if *alloc == 0. */
3382 int32_t *new_equiv_classes = re_x2realloc (mbcset->equiv_classes,
3384 &new_equiv_class_alloc);
3385 if (BE (new_equiv_classes == NULL, 0))
3387 mbcset->equiv_classes = new_equiv_classes;
3388 *equiv_class_alloc = new_equiv_class_alloc;
3390 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3395 if (BE (strlen ((const char *) name) != 1, 0))
3396 return REG_ECOLLATE;
3397 bitset_set (sbcset, *name);
3402 /* Helper function for parse_bracket_exp.
3403 Build the character class which is represented by NAME.
3404 The result are written to MBCSET and SBCSET.
3405 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3406 is a pointer argument sinse we may update it. */
3408 static reg_errcode_t
3409 build_charclass (unsigned REG_TRANSLATE_TYPE trans, re_bitset_ptr_t sbcset,
3410 #ifdef RE_ENABLE_I18N
3411 re_charset_t *mbcset, Idx *char_class_alloc,
3413 const unsigned char *class_name, reg_syntax_t syntax)
3416 const char *name = (const char *) class_name;
3418 /* In case of REG_ICASE "upper" and "lower" match the both of
3419 upper and lower cases. */
3420 if ((syntax & REG_IGNORE_CASE)
3421 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3424 #ifdef RE_ENABLE_I18N
3425 /* Check the space of the arrays. */
3426 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3428 /* Not enough, realloc it. */
3429 Idx new_char_class_alloc = mbcset->nchar_classes;
3430 /* Use realloc since array is NULL if *alloc == 0. */
3431 wctype_t *new_char_classes = re_x2realloc (mbcset->char_classes, wctype_t,
3432 &new_char_class_alloc);
3433 if (BE (new_char_classes == NULL, 0))
3435 mbcset->char_classes = new_char_classes;
3436 *char_class_alloc = new_char_class_alloc;
3438 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3439 #endif /* RE_ENABLE_I18N */
3441 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3442 for (i = 0; i < SBC_MAX; ++i) \
3444 if (ctype_func (i)) \
3446 int ch = trans ? trans[i] : i; \
3447 bitset_set (sbcset, ch); \
3451 if (strcmp (name, "alnum") == 0)
3452 BUILD_CHARCLASS_LOOP (isalnum)
3453 else if (strcmp (name, "cntrl") == 0)
3454 BUILD_CHARCLASS_LOOP (iscntrl)
3455 else if (strcmp (name, "lower") == 0)
3456 BUILD_CHARCLASS_LOOP (islower)
3457 else if (strcmp (name, "space") == 0)
3458 BUILD_CHARCLASS_LOOP (isspace)
3459 else if (strcmp (name, "alpha") == 0)
3460 BUILD_CHARCLASS_LOOP (isalpha)
3461 else if (strcmp (name, "digit") == 0)
3462 BUILD_CHARCLASS_LOOP (isdigit)
3463 else if (strcmp (name, "print") == 0)
3464 BUILD_CHARCLASS_LOOP (isprint)
3465 else if (strcmp (name, "upper") == 0)
3466 BUILD_CHARCLASS_LOOP (isupper)
3467 else if (strcmp (name, "blank") == 0)
3468 BUILD_CHARCLASS_LOOP (isblank)
3469 else if (strcmp (name, "graph") == 0)
3470 BUILD_CHARCLASS_LOOP (isgraph)
3471 else if (strcmp (name, "punct") == 0)
3472 BUILD_CHARCLASS_LOOP (ispunct)
3473 else if (strcmp (name, "xdigit") == 0)
3474 BUILD_CHARCLASS_LOOP (isxdigit)
3482 build_charclass_op (re_dfa_t *dfa, unsigned REG_TRANSLATE_TYPE trans,
3483 const unsigned char *class_name,
3484 const unsigned char *extra,
3485 bool non_match, reg_errcode_t *err)
3487 re_bitset_ptr_t sbcset;
3488 #ifdef RE_ENABLE_I18N
3489 re_charset_t *mbcset;
3491 #endif /* not RE_ENABLE_I18N */
3493 re_token_t br_token;
3496 sbcset = re_calloc (unsigned int, BITSET_UINTS);
3497 #ifdef RE_ENABLE_I18N
3498 mbcset = re_calloc (re_charset_t, 1);
3499 #endif /* RE_ENABLE_I18N */
3501 #ifdef RE_ENABLE_I18N
3502 if (BE (sbcset == NULL || mbcset == NULL, 0))
3503 #else /* not RE_ENABLE_I18N */
3504 if (BE (sbcset == NULL, 0))
3505 #endif /* not RE_ENABLE_I18N */
3513 #ifdef RE_ENABLE_I18N
3515 if (syntax & REG_HAT_LISTS_NOT_NEWLINE)
3516 bitset_set(cset->sbcset, '\0');
3518 mbcset->non_match = 1;
3519 #endif /* not RE_ENABLE_I18N */
3522 /* We don't care the syntax in this case. */
3523 ret = build_charclass (trans, sbcset,
3524 #ifdef RE_ENABLE_I18N
3526 #endif /* RE_ENABLE_I18N */
3529 if (BE (ret != REG_NOERROR, 0))
3532 #ifdef RE_ENABLE_I18N
3533 free_charset (mbcset);
3534 #endif /* RE_ENABLE_I18N */
3538 /* \w match '_' also. */
3539 for (; *extra; extra++)
3540 bitset_set (sbcset, *extra);
3542 /* If it is non-matching list. */
3544 bitset_not (sbcset);
3546 #ifdef RE_ENABLE_I18N
3547 /* Ensure only single byte characters are set. */
3548 if (dfa->mb_cur_max > 1)
3549 bitset_mask (sbcset, dfa->sb_char);
3552 /* Build a tree for simple bracket. */
3553 br_token.type = SIMPLE_BRACKET;
3554 br_token.opr.sbcset = sbcset;
3555 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3556 if (BE (tree == NULL, 0))
3557 goto build_word_op_espace;
3559 #ifdef RE_ENABLE_I18N
3560 if (dfa->mb_cur_max > 1)
3562 bin_tree_t *mbc_tree;
3563 /* Build a tree for complex bracket. */
3564 br_token.type = COMPLEX_BRACKET;
3565 br_token.opr.mbcset = mbcset;
3566 dfa->has_mb_node = 1;
3567 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3568 if (BE (mbc_tree == NULL, 0))
3569 goto build_word_op_espace;
3570 /* Then join them by ALT node. */
3571 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3572 if (BE (mbc_tree != NULL, 1))
3577 free_charset (mbcset);
3580 #else /* not RE_ENABLE_I18N */
3582 #endif /* not RE_ENABLE_I18N */
3584 build_word_op_espace:
3586 #ifdef RE_ENABLE_I18N
3587 free_charset (mbcset);
3588 #endif /* RE_ENABLE_I18N */
3593 /* This is intended for the expressions like "a{1,3}".
3594 Fetch a number from `input', and return the number.
3595 Return REG_MISSING if the number field is empty like "{,1}".
3596 Return REG_ERROR if an error occurred. */
3599 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3601 Idx num = REG_MISSING;
3605 fetch_token (token, input, syntax);
3607 if (BE (token->type == END_OF_RE, 0))
3609 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3611 num = ((token->type != CHARACTER || c < '0' || '9' < c
3612 || num == REG_ERROR)
3614 : ((num == REG_MISSING) ? c - '0' : num * 10 + c - '0'));
3615 num = (num > REG_DUP_MAX) ? REG_ERROR : num;
3620 #ifdef RE_ENABLE_I18N
3622 free_charset (re_charset_t *cset)
3624 re_free (cset->mbchars);
3626 re_free (cset->coll_syms);
3627 re_free (cset->equiv_classes);
3628 re_free (cset->range_starts);
3629 re_free (cset->range_ends);
3631 re_free (cset->char_classes);
3634 #endif /* RE_ENABLE_I18N */
3636 /* Functions for binary tree operation. */
3638 /* Create a tree node. */
3641 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3642 re_token_type_t type)
3646 return create_token_tree (dfa, left, right, &t);
3650 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3651 const re_token_t *token)
3654 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3656 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3658 if (storage == NULL)
3660 storage->next = dfa->str_tree_storage;
3661 dfa->str_tree_storage = storage;
3662 dfa->str_tree_storage_idx = 0;
3664 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3666 tree->parent = NULL;
3668 tree->right = right;
3669 tree->token = *token;
3670 tree->token.duplicated = 0;
3671 tree->token.opt_subexp = 0;
3674 tree->node_idx = REG_MISSING;
3677 left->parent = tree;
3679 right->parent = tree;
3683 /* Mark the tree SRC as an optional subexpression.
3684 To be called from preorder or postorder. */
3686 static reg_errcode_t
3687 mark_opt_subexp (void *extra, bin_tree_t *node)
3689 Idx idx = (Idx) (long) extra;
3690 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3691 node->token.opt_subexp = 1;
3696 /* Free the allocated memory inside NODE. */
3699 free_token (re_token_t *node)
3701 #ifdef RE_ENABLE_I18N
3702 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3703 free_charset (node->opr.mbcset);
3705 #endif /* RE_ENABLE_I18N */
3706 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3707 re_free (node->opr.sbcset);
3710 /* Worker function for tree walking. Free the allocated memory inside NODE
3711 and its children. */
3713 static reg_errcode_t
3714 free_tree (void *extra, bin_tree_t *node)
3716 free_token (&node->token);
3721 /* Duplicate the node SRC, and return new node. This is a preorder
3722 visit similar to the one implemented by the generic visitor, but
3723 we need more infrastructure to maintain two parallel trees --- so,
3724 it's easier to duplicate. */
3727 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3729 const bin_tree_t *node;
3730 bin_tree_t *dup_root;
3731 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3733 for (node = root; ; )
3735 /* Create a new tree and link it back to the current parent. */
3736 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3739 (*p_new)->parent = dup_node;
3740 (*p_new)->token.duplicated = 1;
3743 /* Go to the left node, or up and to the right. */
3747 p_new = &dup_node->left;
3751 const bin_tree_t *prev = NULL;
3752 while (node->right == prev || node->right == NULL)
3755 node = node->parent;
3756 dup_node = dup_node->parent;
3761 p_new = &dup_node->right;