1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License along
17 with this program; if not, write to the Free Software Foundation,
18 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
20 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
21 int length, reg_syntax_t syntax);
22 static void re_compile_fastmap_iter (regex_t *bufp,
23 const re_dfastate_t *init_state,
25 static reg_errcode_t init_dfa (re_dfa_t *dfa, int pat_len);
27 static void free_charset (re_charset_t *cset);
28 #endif /* RE_ENABLE_I18N */
29 static void free_workarea_compile (regex_t *preg);
30 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
32 static void optimize_utf8 (re_dfa_t *dfa);
34 static reg_errcode_t analyze (regex_t *preg);
35 static reg_errcode_t preorder (bin_tree_t *root,
36 reg_errcode_t (fn (void *, bin_tree_t *)),
38 static reg_errcode_t postorder (bin_tree_t *root,
39 reg_errcode_t (fn (void *, bin_tree_t *)),
41 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
42 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
43 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
45 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
46 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
47 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
48 static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint);
49 static int search_duplicated_node (re_dfa_t *dfa, int org_node,
50 unsigned int constraint);
51 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
52 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
54 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
55 static int fetch_number (re_string_t *input, re_token_t *token,
57 static int peek_token (re_token_t *token, re_string_t *input,
59 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
60 reg_syntax_t syntax, reg_errcode_t *err);
61 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
62 re_token_t *token, reg_syntax_t syntax,
63 int nest, reg_errcode_t *err);
64 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
65 re_token_t *token, reg_syntax_t syntax,
66 int nest, reg_errcode_t *err);
67 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
68 re_token_t *token, reg_syntax_t syntax,
69 int nest, reg_errcode_t *err);
70 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
71 re_token_t *token, reg_syntax_t syntax,
72 int nest, reg_errcode_t *err);
73 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
74 re_dfa_t *dfa, re_token_t *token,
75 reg_syntax_t syntax, reg_errcode_t *err);
76 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
77 re_token_t *token, reg_syntax_t syntax,
79 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
81 re_token_t *token, int token_len,
85 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
89 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
91 int *equiv_class_alloc,
92 const unsigned char *name);
93 static reg_errcode_t build_charclass (unsigned RE_TRANSLATE_TYPE trans,
94 re_bitset_ptr_t sbcset,
96 int *char_class_alloc,
97 const unsigned char *class_name,
99 #else /* not RE_ENABLE_I18N */
100 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
101 const unsigned char *name);
102 static reg_errcode_t build_charclass (unsigned RE_TRANSLATE_TYPE trans,
103 re_bitset_ptr_t sbcset,
104 const unsigned char *class_name,
105 reg_syntax_t syntax);
106 #endif /* not RE_ENABLE_I18N */
107 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
108 unsigned RE_TRANSLATE_TYPE trans,
109 const unsigned char *class_name,
110 const unsigned char *extra,
111 int non_match, reg_errcode_t *err);
112 static bin_tree_t *create_tree (re_dfa_t *dfa,
113 bin_tree_t *left, bin_tree_t *right,
114 re_token_type_t type);
115 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
116 bin_tree_t *left, bin_tree_t *right,
117 const re_token_t *token);
118 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
119 static void free_token (re_token_t *node);
120 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
121 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
123 /* This table gives an error message for each of the error codes listed
124 in regex.h. Obviously the order here has to be same as there.
125 POSIX doesn't require that we do anything for REG_NOERROR,
126 but why not be nice? */
128 const char __re_error_msgid[] attribute_hidden =
130 #define REG_NOERROR_IDX 0
131 gettext_noop ("Success") /* REG_NOERROR */
133 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
134 gettext_noop ("No match") /* REG_NOMATCH */
136 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
137 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
139 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
140 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
142 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
143 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
145 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
146 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
148 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
149 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
151 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
152 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
154 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
155 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
157 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
158 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
160 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
161 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
163 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
164 gettext_noop ("Invalid range end") /* REG_ERANGE */
166 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
167 gettext_noop ("Memory exhausted") /* REG_ESPACE */
169 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
170 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
172 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
173 gettext_noop ("Premature end of regular expression") /* REG_EEND */
175 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
176 gettext_noop ("Regular expression too big") /* REG_ESIZE */
178 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
179 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
182 const size_t __re_error_msgid_idx[] attribute_hidden =
203 /* Entry points for GNU code. */
205 /* re_compile_pattern is the GNU regular expression compiler: it
206 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
207 Returns 0 if the pattern was valid, otherwise an error string.
209 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
210 are set in BUFP on entry. */
213 re_compile_pattern (const char *pattern, size_t length,
214 struct re_pattern_buffer *bufp)
218 /* And GNU code determines whether or not to get register information
219 by passing null for the REGS argument to re_match, etc., not by
220 setting no_sub, unless RE_NO_SUB is set. */
221 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
223 /* Match anchors at newline. */
224 bufp->newline_anchor = 1;
226 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
230 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
233 weak_alias (__re_compile_pattern, re_compile_pattern)
236 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
237 also be assigned to arbitrarily: each pattern buffer stores its own
238 syntax, so it can be changed between regex compilations. */
239 /* This has no initializer because initialized variables in Emacs
240 become read-only after dumping. */
241 reg_syntax_t re_syntax_options;
244 /* Specify the precise syntax of regexps for compilation. This provides
245 for compatibility for various utilities which historically have
246 different, incompatible syntaxes.
248 The argument SYNTAX is a bit mask comprised of the various bits
249 defined in regex.h. We return the old syntax. */
252 re_set_syntax (reg_syntax_t syntax)
254 reg_syntax_t ret = re_syntax_options;
256 re_syntax_options = syntax;
260 weak_alias (__re_set_syntax, re_set_syntax)
264 re_compile_fastmap (struct re_pattern_buffer *bufp)
266 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
267 char *fastmap = bufp->fastmap;
269 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
270 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
271 if (dfa->init_state != dfa->init_state_word)
272 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
273 if (dfa->init_state != dfa->init_state_nl)
274 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
275 if (dfa->init_state != dfa->init_state_begbuf)
276 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
277 bufp->fastmap_accurate = 1;
281 weak_alias (__re_compile_fastmap, re_compile_fastmap)
285 __attribute ((always_inline))
286 re_set_fastmap (char *fastmap, int icase, int ch)
290 fastmap[tolower (ch)] = 1;
293 /* Helper function for re_compile_fastmap.
294 Compile fastmap for the initial_state INIT_STATE. */
297 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
300 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
302 int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
303 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
305 int node = init_state->nodes.elems[node_cnt];
306 re_token_type_t type = dfa->nodes[node].type;
308 if (type == CHARACTER)
310 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
311 #ifdef RE_ENABLE_I18N
312 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
314 unsigned char *buf = alloca (dfa->mb_cur_max), *p;
319 *p++ = dfa->nodes[node].opr.c;
320 while (++node < dfa->nodes_len
321 && dfa->nodes[node].type == CHARACTER
322 && dfa->nodes[node].mb_partial)
323 *p++ = dfa->nodes[node].opr.c;
324 memset (&state, 0, sizeof (state));
325 if (mbrtowc (&wc, (const char *) buf, p - buf,
327 && (__wcrtomb ((char *) buf, towlower (wc), &state)
329 re_set_fastmap (fastmap, 0, buf[0]);
333 else if (type == SIMPLE_BRACKET)
336 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
337 for (j = 0; j < UINT_BITS; ++j, ++ch)
338 if (dfa->nodes[node].opr.sbcset[i] & (1 << j))
339 re_set_fastmap (fastmap, icase, ch);
341 #ifdef RE_ENABLE_I18N
342 else if (type == COMPLEX_BRACKET)
345 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
346 if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
347 || cset->nranges || cset->nchar_classes)
350 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
352 /* In this case we want to catch the bytes which are
353 the first byte of any collation elements.
354 e.g. In da_DK, we want to catch 'a' since "aa"
355 is a valid collation element, and don't catch
356 'b' since 'b' is the only collation element
357 which starts from 'b'. */
359 const int32_t *table = (const int32_t *)
360 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
361 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
362 for (j = 0; j < UINT_BITS; ++j, ++ch)
364 re_set_fastmap (fastmap, icase, ch);
367 if (dfa->mb_cur_max > 1)
368 for (i = 0; i < SBC_MAX; ++i)
369 if (__btowc (i) == WEOF)
370 re_set_fastmap (fastmap, icase, i);
371 # endif /* not _LIBC */
373 for (i = 0; i < cset->nmbchars; ++i)
377 memset (&state, '\0', sizeof (state));
378 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
379 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
380 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
382 if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
384 re_set_fastmap (fastmap, 0, *(unsigned char *) buf);
388 #endif /* RE_ENABLE_I18N */
389 else if (type == OP_PERIOD
390 #ifdef RE_ENABLE_I18N
391 || type == OP_UTF8_PERIOD
392 #endif /* RE_ENABLE_I18N */
393 || type == END_OF_RE)
395 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
396 if (type == END_OF_RE)
397 bufp->can_be_null = 1;
403 /* Entry point for POSIX code. */
404 /* regcomp takes a regular expression as a string and compiles it.
406 PREG is a regex_t *. We do not expect any fields to be initialized,
407 since POSIX says we shouldn't. Thus, we set
409 `buffer' to the compiled pattern;
410 `used' to the length of the compiled pattern;
411 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
412 REG_EXTENDED bit in CFLAGS is set; otherwise, to
413 RE_SYNTAX_POSIX_BASIC;
414 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
415 `fastmap' to an allocated space for the fastmap;
416 `fastmap_accurate' to zero;
417 `re_nsub' to the number of subexpressions in PATTERN.
419 PATTERN is the address of the pattern string.
421 CFLAGS is a series of bits which affect compilation.
423 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
424 use POSIX basic syntax.
426 If REG_NEWLINE is set, then . and [^...] don't match newline.
427 Also, regexec will try a match beginning after every newline.
429 If REG_ICASE is set, then we considers upper- and lowercase
430 versions of letters to be equivalent when matching.
432 If REG_NOSUB is set, then when PREG is passed to regexec, that
433 routine will report only success or failure, and nothing about the
436 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
437 the return codes and their meanings.) */
440 regcomp (regex_t *__restrict preg, const char *__restrict pattern, int cflags)
443 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
444 : RE_SYNTAX_POSIX_BASIC);
450 /* Try to allocate space for the fastmap. */
451 preg->fastmap = re_malloc (char, SBC_MAX);
452 if (BE (preg->fastmap == NULL, 0))
455 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
457 /* If REG_NEWLINE is set, newlines are treated differently. */
458 if (cflags & REG_NEWLINE)
459 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
460 syntax &= ~RE_DOT_NEWLINE;
461 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
462 /* It also changes the matching behavior. */
463 preg->newline_anchor = 1;
466 preg->newline_anchor = 0;
467 preg->no_sub = !!(cflags & REG_NOSUB);
468 preg->translate = NULL;
470 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
472 /* POSIX doesn't distinguish between an unmatched open-group and an
473 unmatched close-group: both are REG_EPAREN. */
474 if (ret == REG_ERPAREN)
477 /* We have already checked preg->fastmap != NULL. */
478 if (BE (ret == REG_NOERROR, 1))
479 /* Compute the fastmap now, since regexec cannot modify the pattern
480 buffer. This function never fails in this implementation. */
481 (void) re_compile_fastmap (preg);
484 /* Some error occurred while compiling the expression. */
485 re_free (preg->fastmap);
486 preg->fastmap = NULL;
492 weak_alias (__regcomp, regcomp)
495 /* Returns a message corresponding to an error code, ERRCODE, returned
496 from either regcomp or regexec. We don't use PREG here. */
499 regerror (int errcode, const regex_t *preg, char *errbuf, size_t errbuf_size)
505 || errcode >= (int) (sizeof (__re_error_msgid_idx)
506 / sizeof (__re_error_msgid_idx[0])), 0))
507 /* Only error codes returned by the rest of the code should be passed
508 to this routine. If we are given anything else, or if other regex
509 code generates an invalid error code, then the program has a bug.
510 Dump core so we can fix it. */
513 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
515 msg_size = strlen (msg) + 1; /* Includes the null. */
517 if (BE (errbuf_size != 0, 1))
519 if (BE (msg_size > errbuf_size, 0))
521 #if defined HAVE_MEMPCPY || defined _LIBC
522 *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
524 memcpy (errbuf, msg, errbuf_size - 1);
525 errbuf[errbuf_size - 1] = 0;
529 memcpy (errbuf, msg, msg_size);
535 weak_alias (__regerror, regerror)
539 #ifdef RE_ENABLE_I18N
540 /* This static array is used for the map to single-byte characters when
541 UTF-8 is used. Otherwise we would allocate memory just to initialize
542 it the same all the time. UTF-8 is the preferred encoding so this is
543 a worthwhile optimization. */
544 static const bitset utf8_sb_map =
546 /* Set the first 128 bits. */
547 # if UINT_MAX == 0xffffffff
548 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
550 # error "Add case for new unsigned int size"
557 free_dfa_content (re_dfa_t *dfa)
562 for (i = 0; i < dfa->nodes_len; ++i)
563 free_token (dfa->nodes + i);
564 re_free (dfa->nexts);
565 for (i = 0; i < dfa->nodes_len; ++i)
567 if (dfa->eclosures != NULL)
568 re_node_set_free (dfa->eclosures + i);
569 if (dfa->inveclosures != NULL)
570 re_node_set_free (dfa->inveclosures + i);
571 if (dfa->edests != NULL)
572 re_node_set_free (dfa->edests + i);
574 re_free (dfa->edests);
575 re_free (dfa->eclosures);
576 re_free (dfa->inveclosures);
577 re_free (dfa->nodes);
579 if (dfa->state_table)
580 for (i = 0; i <= dfa->state_hash_mask; ++i)
582 struct re_state_table_entry *entry = dfa->state_table + i;
583 for (j = 0; j < entry->num; ++j)
585 re_dfastate_t *state = entry->array[j];
588 re_free (entry->array);
590 re_free (dfa->state_table);
591 #ifdef RE_ENABLE_I18N
592 if (dfa->sb_char != utf8_sb_map)
593 re_free (dfa->sb_char);
595 re_free (dfa->subexp_map);
597 re_free (dfa->re_str);
604 /* Free dynamically allocated space used by PREG. */
607 regfree (regex_t *preg)
609 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
610 if (BE (dfa != NULL, 1))
611 free_dfa_content (dfa);
615 re_free (preg->fastmap);
616 preg->fastmap = NULL;
618 re_free (preg->translate);
619 preg->translate = NULL;
622 weak_alias (__regfree, regfree)
625 /* Entry points compatible with 4.2 BSD regex library. We don't define
626 them unless specifically requested. */
628 #if defined _REGEX_RE_COMP || defined _LIBC
630 /* BSD has one and only one pattern buffer. */
631 static struct re_pattern_buffer re_comp_buf;
635 /* Make these definitions weak in libc, so POSIX programs can redefine
636 these names if they don't use our functions, and still use
637 regcomp/regexec above without link errors. */
648 if (!re_comp_buf.buffer)
649 return gettext ("No previous regular expression");
653 if (re_comp_buf.buffer)
655 fastmap = re_comp_buf.fastmap;
656 re_comp_buf.fastmap = NULL;
657 __regfree (&re_comp_buf);
658 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
659 re_comp_buf.fastmap = fastmap;
662 if (re_comp_buf.fastmap == NULL)
664 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
665 if (re_comp_buf.fastmap == NULL)
666 return (char *) gettext (__re_error_msgid
667 + __re_error_msgid_idx[(int) REG_ESPACE]);
670 /* Since `re_exec' always passes NULL for the `regs' argument, we
671 don't need to initialize the pattern buffer fields which affect it. */
673 /* Match anchors at newlines. */
674 re_comp_buf.newline_anchor = 1;
676 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
681 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
682 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
686 libc_freeres_fn (free_mem)
688 __regfree (&re_comp_buf);
692 #endif /* _REGEX_RE_COMP */
694 /* Internal entry point.
695 Compile the regular expression PATTERN, whose length is LENGTH.
696 SYNTAX indicate regular expression's syntax. */
699 re_compile_internal (regex_t *preg, const char * pattern, int length,
702 reg_errcode_t err = REG_NOERROR;
706 /* Initialize the pattern buffer. */
707 preg->fastmap_accurate = 0;
708 preg->syntax = syntax;
709 preg->not_bol = preg->not_eol = 0;
712 preg->can_be_null = 0;
713 preg->regs_allocated = REGS_UNALLOCATED;
715 /* Initialize the dfa. */
716 dfa = (re_dfa_t *) preg->buffer;
717 if (BE (preg->allocated < sizeof (re_dfa_t), 0))
719 /* If zero allocated, but buffer is non-null, try to realloc
720 enough space. This loses if buffer's address is bogus, but
721 that is the user's responsibility. If ->buffer is NULL this
722 is a simple allocation. */
723 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
726 preg->allocated = sizeof (re_dfa_t);
727 preg->buffer = (unsigned char *) dfa;
729 preg->used = sizeof (re_dfa_t);
731 __libc_lock_init (dfa->lock);
733 err = init_dfa (dfa, length);
734 if (BE (err != REG_NOERROR, 0))
736 free_dfa_content (dfa);
742 dfa->re_str = re_malloc (char, length + 1);
743 strncpy (dfa->re_str, pattern, length + 1);
746 err = re_string_construct (®exp, pattern, length, preg->translate,
747 syntax & RE_ICASE, dfa);
748 if (BE (err != REG_NOERROR, 0))
750 re_compile_internal_free_return:
751 free_workarea_compile (preg);
752 re_string_destruct (®exp);
753 free_dfa_content (dfa);
759 /* Parse the regular expression, and build a structure tree. */
761 dfa->str_tree = parse (®exp, preg, syntax, &err);
762 if (BE (dfa->str_tree == NULL, 0))
763 goto re_compile_internal_free_return;
765 /* Analyze the tree and create the nfa. */
766 err = analyze (preg);
767 if (BE (err != REG_NOERROR, 0))
768 goto re_compile_internal_free_return;
770 #ifdef RE_ENABLE_I18N
771 /* If possible, do searching in single byte encoding to speed things up. */
772 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
776 /* Then create the initial state of the dfa. */
777 err = create_initial_state (dfa);
779 /* Release work areas. */
780 free_workarea_compile (preg);
781 re_string_destruct (®exp);
783 if (BE (err != REG_NOERROR, 0))
785 free_dfa_content (dfa);
793 /* Initialize DFA. We use the length of the regular expression PAT_LEN
794 as the initial length of some arrays. */
797 init_dfa (re_dfa_t *dfa, int pat_len)
804 memset (dfa, '\0', sizeof (re_dfa_t));
806 /* Force allocation of str_tree_storage the first time. */
807 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
809 dfa->nodes_alloc = pat_len + 1;
810 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
812 dfa->states_alloc = pat_len + 1;
814 /* table_size = 2 ^ ceil(log pat_len) */
815 for (table_size = 1; table_size > 0; table_size <<= 1)
816 if (table_size > pat_len)
819 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
820 dfa->state_hash_mask = table_size - 1;
822 dfa->mb_cur_max = MB_CUR_MAX;
824 if (dfa->mb_cur_max == 6
825 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
827 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
830 # ifdef HAVE_LANGINFO_CODESET
831 codeset_name = nl_langinfo (CODESET);
833 codeset_name = getenv ("LC_ALL");
834 if (codeset_name == NULL || codeset_name[0] == '\0')
835 codeset_name = getenv ("LC_CTYPE");
836 if (codeset_name == NULL || codeset_name[0] == '\0')
837 codeset_name = getenv ("LANG");
838 if (codeset_name == NULL)
840 else if (strchr (codeset_name, '.') != NULL)
841 codeset_name = strchr (codeset_name, '.') + 1;
844 if (strcasecmp (codeset_name, "UTF-8") == 0
845 || strcasecmp (codeset_name, "UTF8") == 0)
848 /* We check exhaustively in the loop below if this charset is a
849 superset of ASCII. */
850 dfa->map_notascii = 0;
853 #ifdef RE_ENABLE_I18N
854 if (dfa->mb_cur_max > 1)
857 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
862 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset), 1);
863 if (BE (dfa->sb_char == NULL, 0))
866 /* Clear all bits by, then set those corresponding to single
868 bitset_empty (dfa->sb_char);
870 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
871 for (j = 0; j < UINT_BITS; ++j, ++ch)
873 wint_t wch = __btowc (ch);
875 dfa->sb_char[i] |= 1 << j;
877 if (isascii (ch) && wch != ch)
878 dfa->map_notascii = 1;
885 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
890 /* Initialize WORD_CHAR table, which indicate which character is
891 "word". In this case "word" means that it is the word construction
892 character used by some operators like "\<", "\>", etc. */
895 init_word_char (re_dfa_t *dfa)
898 dfa->word_ops_used = 1;
899 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
900 for (j = 0; j < UINT_BITS; ++j, ++ch)
901 if (isalnum (ch) || ch == '_')
902 dfa->word_char[i] |= 1 << j;
905 /* Free the work area which are only used while compiling. */
908 free_workarea_compile (regex_t *preg)
910 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
911 bin_tree_storage_t *storage, *next;
912 for (storage = dfa->str_tree_storage; storage; storage = next)
914 next = storage->next;
917 dfa->str_tree_storage = NULL;
918 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
919 dfa->str_tree = NULL;
920 re_free (dfa->org_indices);
921 dfa->org_indices = NULL;
924 /* Create initial states for all contexts. */
927 create_initial_state (re_dfa_t *dfa)
931 re_node_set init_nodes;
933 /* Initial states have the epsilon closure of the node which is
934 the first node of the regular expression. */
935 first = dfa->str_tree->first->node_idx;
936 dfa->init_node = first;
937 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
938 if (BE (err != REG_NOERROR, 0))
941 /* The back-references which are in initial states can epsilon transit,
942 since in this case all of the subexpressions can be null.
943 Then we add epsilon closures of the nodes which are the next nodes of
944 the back-references. */
945 if (dfa->nbackref > 0)
946 for (i = 0; i < init_nodes.nelem; ++i)
948 int node_idx = init_nodes.elems[i];
949 re_token_type_t type = dfa->nodes[node_idx].type;
952 if (type != OP_BACK_REF)
954 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
956 re_token_t *clexp_node;
957 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
958 if (clexp_node->type == OP_CLOSE_SUBEXP
959 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
962 if (clexp_idx == init_nodes.nelem)
965 if (type == OP_BACK_REF)
967 int dest_idx = dfa->edests[node_idx].elems[0];
968 if (!re_node_set_contains (&init_nodes, dest_idx))
970 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
976 /* It must be the first time to invoke acquire_state. */
977 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
978 /* We don't check ERR here, since the initial state must not be NULL. */
979 if (BE (dfa->init_state == NULL, 0))
981 if (dfa->init_state->has_constraint)
983 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
985 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
987 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
991 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
992 || dfa->init_state_begbuf == NULL, 0))
996 dfa->init_state_word = dfa->init_state_nl
997 = dfa->init_state_begbuf = dfa->init_state;
999 re_node_set_free (&init_nodes);
1003 #ifdef RE_ENABLE_I18N
1004 /* If it is possible to do searching in single byte encoding instead of UTF-8
1005 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1006 DFA nodes where needed. */
1009 optimize_utf8 (re_dfa_t *dfa)
1011 int node, i, mb_chars = 0, has_period = 0;
1013 for (node = 0; node < dfa->nodes_len; ++node)
1014 switch (dfa->nodes[node].type)
1017 if (dfa->nodes[node].opr.c >= 0x80)
1021 switch (dfa->nodes[node].opr.idx)
1029 /* Word anchors etc. cannot be handled. */
1039 case OP_DUP_ASTERISK:
1040 case OP_OPEN_SUBEXP:
1041 case OP_CLOSE_SUBEXP:
1043 case COMPLEX_BRACKET:
1045 case SIMPLE_BRACKET:
1046 /* Just double check. */
1047 for (i = 0x80 / UINT_BITS; i < BITSET_UINTS; ++i)
1048 if (dfa->nodes[node].opr.sbcset[i])
1055 if (mb_chars || has_period)
1056 for (node = 0; node < dfa->nodes_len; ++node)
1058 if (dfa->nodes[node].type == CHARACTER
1059 && dfa->nodes[node].opr.c >= 0x80)
1060 dfa->nodes[node].mb_partial = 0;
1061 else if (dfa->nodes[node].type == OP_PERIOD)
1062 dfa->nodes[node].type = OP_UTF8_PERIOD;
1065 /* The search can be in single byte locale. */
1066 dfa->mb_cur_max = 1;
1068 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1072 /* Analyze the structure tree, and calculate "first", "next", "edest",
1073 "eclosure", and "inveclosure". */
1075 static reg_errcode_t
1076 analyze (regex_t *preg)
1078 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1081 /* Allocate arrays. */
1082 dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1083 dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1084 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1085 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1086 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1087 || dfa->eclosures == NULL, 0))
1090 dfa->subexp_map = re_malloc (int, preg->re_nsub);
1091 if (dfa->subexp_map != NULL)
1094 for (i = 0; i < preg->re_nsub; i++)
1095 dfa->subexp_map[i] = i;
1096 preorder (dfa->str_tree, optimize_subexps, dfa);
1097 for (i = 0; i < preg->re_nsub; i++)
1098 if (dfa->subexp_map[i] != i)
1100 if (i == preg->re_nsub)
1102 free (dfa->subexp_map);
1103 dfa->subexp_map = NULL;
1107 ret = postorder (dfa->str_tree, lower_subexps, preg);
1108 if (BE (ret != REG_NOERROR, 0))
1110 ret = postorder (dfa->str_tree, calc_first, dfa);
1111 if (BE (ret != REG_NOERROR, 0))
1113 preorder (dfa->str_tree, calc_next, dfa);
1114 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1115 if (BE (ret != REG_NOERROR, 0))
1117 ret = calc_eclosure (dfa);
1118 if (BE (ret != REG_NOERROR, 0))
1121 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1122 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1123 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1126 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1127 if (BE (dfa->inveclosures == NULL, 0))
1129 ret = calc_inveclosure (dfa);
1135 /* Our parse trees are very unbalanced, so we cannot use a stack to
1136 implement parse tree visits. Instead, we use parent pointers and
1137 some hairy code in these two functions. */
1138 static reg_errcode_t
1139 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1142 bin_tree_t *node, *prev;
1144 for (node = root; ; )
1146 /* Descend down the tree, preferably to the left (or to the right
1147 if that's the only child). */
1148 while (node->left || node->right)
1156 reg_errcode_t err = fn (extra, node);
1157 if (BE (err != REG_NOERROR, 0))
1159 if (node->parent == NULL)
1162 node = node->parent;
1164 /* Go up while we have a node that is reached from the right. */
1165 while (node->right == prev || node->right == NULL);
1170 static reg_errcode_t
1171 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1176 for (node = root; ; )
1178 reg_errcode_t err = fn (extra, node);
1179 if (BE (err != REG_NOERROR, 0))
1182 /* Go to the left node, or up and to the right. */
1187 bin_tree_t *prev = NULL;
1188 while (node->right == prev || node->right == NULL)
1191 node = node->parent;
1200 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1201 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1202 backreferences as well. Requires a preorder visit. */
1203 static reg_errcode_t
1204 optimize_subexps (void *extra, bin_tree_t *node)
1206 re_dfa_t *dfa = (re_dfa_t *) extra;
1208 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1210 int idx = node->token.opr.idx;
1211 node->token.opr.idx = dfa->subexp_map[idx];
1212 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1215 else if (node->token.type == SUBEXP
1216 && node->left && node->left->token.type == SUBEXP)
1218 int other_idx = node->left->token.opr.idx;
1220 node->left = node->left->left;
1222 node->left->parent = node;
1224 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1225 if (other_idx < 8 * sizeof (dfa->used_bkref_map))
1226 dfa->used_bkref_map &= ~(1 << other_idx);
1232 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1233 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1234 static reg_errcode_t
1235 lower_subexps (void *extra, bin_tree_t *node)
1237 regex_t *preg = (regex_t *) extra;
1238 reg_errcode_t err = REG_NOERROR;
1240 if (node->left && node->left->token.type == SUBEXP)
1242 node->left = lower_subexp (&err, preg, node->left);
1244 node->left->parent = node;
1246 if (node->right && node->right->token.type == SUBEXP)
1248 node->right = lower_subexp (&err, preg, node->right);
1250 node->right->parent = node;
1257 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1259 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1260 bin_tree_t *body = node->left;
1261 bin_tree_t *op, *cls, *tree1, *tree;
1264 /* We do not optimize empty subexpressions, because otherwise we may
1265 have bad CONCAT nodes with NULL children. This is obviously not
1266 very common, so we do not lose much. An example that triggers
1267 this case is the sed "script" /\(\)/x. */
1268 && node->left != NULL
1269 && (node->token.opr.idx >= 8 * sizeof (dfa->used_bkref_map)
1270 || !(dfa->used_bkref_map & (1 << node->token.opr.idx))))
1273 /* Convert the SUBEXP node to the concatenation of an
1274 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1275 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1276 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1277 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1278 tree = create_tree (dfa, op, tree1, CONCAT);
1279 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1285 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1286 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1290 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1291 nodes. Requires a postorder visit. */
1292 static reg_errcode_t
1293 calc_first (void *extra, bin_tree_t *node)
1295 re_dfa_t *dfa = (re_dfa_t *) extra;
1296 if (node->token.type == CONCAT)
1298 node->first = node->left->first;
1299 node->node_idx = node->left->node_idx;
1304 node->node_idx = re_dfa_add_node (dfa, node->token);
1305 if (BE (node->node_idx == -1, 0))
1311 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1312 static reg_errcode_t
1313 calc_next (void *extra, bin_tree_t *node)
1315 switch (node->token.type)
1317 case OP_DUP_ASTERISK:
1318 node->left->next = node;
1321 node->left->next = node->right->first;
1322 node->right->next = node->next;
1326 node->left->next = node->next;
1328 node->right->next = node->next;
1334 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1335 static reg_errcode_t
1336 link_nfa_nodes (void *extra, bin_tree_t *node)
1338 re_dfa_t *dfa = (re_dfa_t *) extra;
1339 int idx = node->node_idx;
1340 reg_errcode_t err = REG_NOERROR;
1342 switch (node->token.type)
1348 assert (node->next == NULL);
1351 case OP_DUP_ASTERISK:
1355 dfa->has_plural_match = 1;
1356 if (node->left != NULL)
1357 left = node->left->first->node_idx;
1359 left = node->next->node_idx;
1360 if (node->right != NULL)
1361 right = node->right->first->node_idx;
1363 right = node->next->node_idx;
1365 assert (right > -1);
1366 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1371 case OP_OPEN_SUBEXP:
1372 case OP_CLOSE_SUBEXP:
1373 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1377 dfa->nexts[idx] = node->next->node_idx;
1378 if (node->token.type == OP_BACK_REF)
1379 re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1383 assert (!IS_EPSILON_NODE (node->token.type));
1384 dfa->nexts[idx] = node->next->node_idx;
1391 /* Duplicate the epsilon closure of the node ROOT_NODE.
1392 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1393 to their own constraint. */
1395 static reg_errcode_t
1396 duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node,
1397 int root_node, unsigned int init_constraint)
1399 int org_node, clone_node, ret;
1400 unsigned int constraint = init_constraint;
1401 for (org_node = top_org_node, clone_node = top_clone_node;;)
1403 int org_dest, clone_dest;
1404 if (dfa->nodes[org_node].type == OP_BACK_REF)
1406 /* If the back reference epsilon-transit, its destination must
1407 also have the constraint. Then duplicate the epsilon closure
1408 of the destination of the back reference, and store it in
1409 edests of the back reference. */
1410 org_dest = dfa->nexts[org_node];
1411 re_node_set_empty (dfa->edests + clone_node);
1412 clone_dest = duplicate_node (dfa, org_dest, constraint);
1413 if (BE (clone_dest == -1, 0))
1415 dfa->nexts[clone_node] = dfa->nexts[org_node];
1416 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1417 if (BE (ret < 0, 0))
1420 else if (dfa->edests[org_node].nelem == 0)
1422 /* In case of the node can't epsilon-transit, don't duplicate the
1423 destination and store the original destination as the
1424 destination of the node. */
1425 dfa->nexts[clone_node] = dfa->nexts[org_node];
1428 else if (dfa->edests[org_node].nelem == 1)
1430 /* In case of the node can epsilon-transit, and it has only one
1432 org_dest = dfa->edests[org_node].elems[0];
1433 re_node_set_empty (dfa->edests + clone_node);
1434 if (dfa->nodes[org_node].type == ANCHOR)
1436 /* In case of the node has another constraint, append it. */
1437 if (org_node == root_node && clone_node != org_node)
1439 /* ...but if the node is root_node itself, it means the
1440 epsilon closure have a loop, then tie it to the
1441 destination of the root_node. */
1442 ret = re_node_set_insert (dfa->edests + clone_node,
1444 if (BE (ret < 0, 0))
1448 constraint |= dfa->nodes[org_node].opr.ctx_type;
1450 clone_dest = duplicate_node (dfa, org_dest, constraint);
1451 if (BE (clone_dest == -1, 0))
1453 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1454 if (BE (ret < 0, 0))
1457 else /* dfa->edests[org_node].nelem == 2 */
1459 /* In case of the node can epsilon-transit, and it has two
1460 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1461 org_dest = dfa->edests[org_node].elems[0];
1462 re_node_set_empty (dfa->edests + clone_node);
1463 /* Search for a duplicated node which satisfies the constraint. */
1464 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1465 if (clone_dest == -1)
1467 /* There are no such a duplicated node, create a new one. */
1469 clone_dest = duplicate_node (dfa, org_dest, constraint);
1470 if (BE (clone_dest == -1, 0))
1472 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1473 if (BE (ret < 0, 0))
1475 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1476 root_node, constraint);
1477 if (BE (err != REG_NOERROR, 0))
1482 /* There are a duplicated node which satisfy the constraint,
1483 use it to avoid infinite loop. */
1484 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1485 if (BE (ret < 0, 0))
1489 org_dest = dfa->edests[org_node].elems[1];
1490 clone_dest = duplicate_node (dfa, org_dest, constraint);
1491 if (BE (clone_dest == -1, 0))
1493 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1494 if (BE (ret < 0, 0))
1497 org_node = org_dest;
1498 clone_node = clone_dest;
1503 /* Search for a node which is duplicated from the node ORG_NODE, and
1504 satisfies the constraint CONSTRAINT. */
1507 search_duplicated_node (re_dfa_t *dfa, int org_node, unsigned int constraint)
1510 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1512 if (org_node == dfa->org_indices[idx]
1513 && constraint == dfa->nodes[idx].constraint)
1514 return idx; /* Found. */
1516 return -1; /* Not found. */
1519 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1520 Return the index of the new node, or -1 if insufficient storage is
1524 duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint)
1526 int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1527 if (BE (dup_idx != -1, 1))
1529 dfa->nodes[dup_idx].constraint = constraint;
1530 if (dfa->nodes[org_idx].type == ANCHOR)
1531 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
1532 dfa->nodes[dup_idx].duplicated = 1;
1534 /* Store the index of the original node. */
1535 dfa->org_indices[dup_idx] = org_idx;
1540 static reg_errcode_t
1541 calc_inveclosure (re_dfa_t *dfa)
1544 for (idx = 0; idx < dfa->nodes_len; ++idx)
1545 re_node_set_init_empty (dfa->inveclosures + idx);
1547 for (src = 0; src < dfa->nodes_len; ++src)
1549 int *elems = dfa->eclosures[src].elems;
1550 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1552 ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1553 if (BE (ret == -1, 0))
1561 /* Calculate "eclosure" for all the node in DFA. */
1563 static reg_errcode_t
1564 calc_eclosure (re_dfa_t *dfa)
1566 int node_idx, incomplete;
1568 assert (dfa->nodes_len > 0);
1571 /* For each nodes, calculate epsilon closure. */
1572 for (node_idx = 0; ; ++node_idx)
1575 re_node_set eclosure_elem;
1576 if (node_idx == dfa->nodes_len)
1585 assert (dfa->eclosures[node_idx].nelem != -1);
1588 /* If we have already calculated, skip it. */
1589 if (dfa->eclosures[node_idx].nelem != 0)
1591 /* Calculate epsilon closure of `node_idx'. */
1592 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1593 if (BE (err != REG_NOERROR, 0))
1596 if (dfa->eclosures[node_idx].nelem == 0)
1599 re_node_set_free (&eclosure_elem);
1605 /* Calculate epsilon closure of NODE. */
1607 static reg_errcode_t
1608 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root)
1611 unsigned int constraint;
1613 re_node_set eclosure;
1615 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1616 if (BE (err != REG_NOERROR, 0))
1619 /* This indicates that we are calculating this node now.
1620 We reference this value to avoid infinite loop. */
1621 dfa->eclosures[node].nelem = -1;
1623 constraint = ((dfa->nodes[node].type == ANCHOR)
1624 ? dfa->nodes[node].opr.ctx_type : 0);
1625 /* If the current node has constraints, duplicate all nodes.
1626 Since they must inherit the constraints. */
1628 && dfa->edests[node].nelem
1629 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1631 int org_node, cur_node;
1632 org_node = cur_node = node;
1633 err = duplicate_node_closure (dfa, node, node, node, constraint);
1634 if (BE (err != REG_NOERROR, 0))
1638 /* Expand each epsilon destination nodes. */
1639 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1640 for (i = 0; i < dfa->edests[node].nelem; ++i)
1642 re_node_set eclosure_elem;
1643 int edest = dfa->edests[node].elems[i];
1644 /* If calculating the epsilon closure of `edest' is in progress,
1645 return intermediate result. */
1646 if (dfa->eclosures[edest].nelem == -1)
1651 /* If we haven't calculated the epsilon closure of `edest' yet,
1652 calculate now. Otherwise use calculated epsilon closure. */
1653 if (dfa->eclosures[edest].nelem == 0)
1655 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1656 if (BE (err != REG_NOERROR, 0))
1660 eclosure_elem = dfa->eclosures[edest];
1661 /* Merge the epsilon closure of `edest'. */
1662 re_node_set_merge (&eclosure, &eclosure_elem);
1663 /* If the epsilon closure of `edest' is incomplete,
1664 the epsilon closure of this node is also incomplete. */
1665 if (dfa->eclosures[edest].nelem == 0)
1668 re_node_set_free (&eclosure_elem);
1672 /* Epsilon closures include itself. */
1673 re_node_set_insert (&eclosure, node);
1674 if (incomplete && !root)
1675 dfa->eclosures[node].nelem = 0;
1677 dfa->eclosures[node] = eclosure;
1678 *new_set = eclosure;
1682 /* Functions for token which are used in the parser. */
1684 /* Fetch a token from INPUT.
1685 We must not use this function inside bracket expressions. */
1688 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1690 re_string_skip_bytes (input, peek_token (result, input, syntax));
1693 /* Peek a token from INPUT, and return the length of the token.
1694 We must not use this function inside bracket expressions. */
1697 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1701 if (re_string_eoi (input))
1703 token->type = END_OF_RE;
1707 c = re_string_peek_byte (input, 0);
1710 token->word_char = 0;
1711 #ifdef RE_ENABLE_I18N
1712 token->mb_partial = 0;
1713 if (input->mb_cur_max > 1 &&
1714 !re_string_first_byte (input, re_string_cur_idx (input)))
1716 token->type = CHARACTER;
1717 token->mb_partial = 1;
1724 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1726 token->type = BACK_SLASH;
1730 c2 = re_string_peek_byte_case (input, 1);
1732 token->type = CHARACTER;
1733 #ifdef RE_ENABLE_I18N
1734 if (input->mb_cur_max > 1)
1736 wint_t wc = re_string_wchar_at (input,
1737 re_string_cur_idx (input) + 1);
1738 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1742 token->word_char = IS_WORD_CHAR (c2) != 0;
1747 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1748 token->type = OP_ALT;
1750 case '1': case '2': case '3': case '4': case '5':
1751 case '6': case '7': case '8': case '9':
1752 if (!(syntax & RE_NO_BK_REFS))
1754 token->type = OP_BACK_REF;
1755 token->opr.idx = c2 - '1';
1759 if (!(syntax & RE_NO_GNU_OPS))
1761 token->type = ANCHOR;
1762 token->opr.ctx_type = WORD_FIRST;
1766 if (!(syntax & RE_NO_GNU_OPS))
1768 token->type = ANCHOR;
1769 token->opr.ctx_type = WORD_LAST;
1773 if (!(syntax & RE_NO_GNU_OPS))
1775 token->type = ANCHOR;
1776 token->opr.ctx_type = WORD_DELIM;
1780 if (!(syntax & RE_NO_GNU_OPS))
1782 token->type = ANCHOR;
1783 token->opr.ctx_type = NOT_WORD_DELIM;
1787 if (!(syntax & RE_NO_GNU_OPS))
1788 token->type = OP_WORD;
1791 if (!(syntax & RE_NO_GNU_OPS))
1792 token->type = OP_NOTWORD;
1795 if (!(syntax & RE_NO_GNU_OPS))
1796 token->type = OP_SPACE;
1799 if (!(syntax & RE_NO_GNU_OPS))
1800 token->type = OP_NOTSPACE;
1803 if (!(syntax & RE_NO_GNU_OPS))
1805 token->type = ANCHOR;
1806 token->opr.ctx_type = BUF_FIRST;
1810 if (!(syntax & RE_NO_GNU_OPS))
1812 token->type = ANCHOR;
1813 token->opr.ctx_type = BUF_LAST;
1817 if (!(syntax & RE_NO_BK_PARENS))
1818 token->type = OP_OPEN_SUBEXP;
1821 if (!(syntax & RE_NO_BK_PARENS))
1822 token->type = OP_CLOSE_SUBEXP;
1825 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1826 token->type = OP_DUP_PLUS;
1829 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1830 token->type = OP_DUP_QUESTION;
1833 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1834 token->type = OP_OPEN_DUP_NUM;
1837 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1838 token->type = OP_CLOSE_DUP_NUM;
1846 token->type = CHARACTER;
1847 #ifdef RE_ENABLE_I18N
1848 if (input->mb_cur_max > 1)
1850 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1851 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1855 token->word_char = IS_WORD_CHAR (token->opr.c);
1860 if (syntax & RE_NEWLINE_ALT)
1861 token->type = OP_ALT;
1864 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1865 token->type = OP_ALT;
1868 token->type = OP_DUP_ASTERISK;
1871 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1872 token->type = OP_DUP_PLUS;
1875 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1876 token->type = OP_DUP_QUESTION;
1879 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1880 token->type = OP_OPEN_DUP_NUM;
1883 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1884 token->type = OP_CLOSE_DUP_NUM;
1887 if (syntax & RE_NO_BK_PARENS)
1888 token->type = OP_OPEN_SUBEXP;
1891 if (syntax & RE_NO_BK_PARENS)
1892 token->type = OP_CLOSE_SUBEXP;
1895 token->type = OP_OPEN_BRACKET;
1898 token->type = OP_PERIOD;
1901 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1902 re_string_cur_idx (input) != 0)
1904 char prev = re_string_peek_byte (input, -1);
1905 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1908 token->type = ANCHOR;
1909 token->opr.ctx_type = LINE_FIRST;
1912 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1913 re_string_cur_idx (input) + 1 != re_string_length (input))
1916 re_string_skip_bytes (input, 1);
1917 peek_token (&next, input, syntax);
1918 re_string_skip_bytes (input, -1);
1919 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1922 token->type = ANCHOR;
1923 token->opr.ctx_type = LINE_LAST;
1931 /* Peek a token from INPUT, and return the length of the token.
1932 We must not use this function out of bracket expressions. */
1935 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1938 if (re_string_eoi (input))
1940 token->type = END_OF_RE;
1943 c = re_string_peek_byte (input, 0);
1946 #ifdef RE_ENABLE_I18N
1947 if (input->mb_cur_max > 1 &&
1948 !re_string_first_byte (input, re_string_cur_idx (input)))
1950 token->type = CHARACTER;
1953 #endif /* RE_ENABLE_I18N */
1955 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
1956 && re_string_cur_idx (input) + 1 < re_string_length (input))
1958 /* In this case, '\' escape a character. */
1960 re_string_skip_bytes (input, 1);
1961 c2 = re_string_peek_byte (input, 0);
1963 token->type = CHARACTER;
1966 if (c == '[') /* '[' is a special char in a bracket exps. */
1970 if (re_string_cur_idx (input) + 1 < re_string_length (input))
1971 c2 = re_string_peek_byte (input, 1);
1979 token->type = OP_OPEN_COLL_ELEM;
1982 token->type = OP_OPEN_EQUIV_CLASS;
1985 if (syntax & RE_CHAR_CLASSES)
1987 token->type = OP_OPEN_CHAR_CLASS;
1990 /* else fall through. */
1992 token->type = CHARACTER;
2002 token->type = OP_CHARSET_RANGE;
2005 token->type = OP_CLOSE_BRACKET;
2008 token->type = OP_NON_MATCH_LIST;
2011 token->type = CHARACTER;
2016 /* Functions for parser. */
2018 /* Entry point of the parser.
2019 Parse the regular expression REGEXP and return the structure tree.
2020 If an error is occured, ERR is set by error code, and return NULL.
2021 This function build the following tree, from regular expression <reg_exp>:
2027 CAT means concatenation.
2028 EOR means end of regular expression. */
2031 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2034 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2035 bin_tree_t *tree, *eor, *root;
2036 re_token_t current_token;
2037 dfa->syntax = syntax;
2038 fetch_token (¤t_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2039 tree = parse_reg_exp (regexp, preg, ¤t_token, syntax, 0, err);
2040 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2042 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2044 root = create_tree (dfa, tree, eor, CONCAT);
2047 if (BE (eor == NULL || root == NULL, 0))
2055 /* This function build the following tree, from regular expression
2056 <branch1>|<branch2>:
2062 ALT means alternative, which represents the operator `|'. */
2065 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2066 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2068 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2069 bin_tree_t *tree, *branch = NULL;
2070 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2071 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2074 while (token->type == OP_ALT)
2076 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2077 if (token->type != OP_ALT && token->type != END_OF_RE
2078 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2080 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2081 if (BE (*err != REG_NOERROR && branch == NULL, 0))
2086 tree = create_tree (dfa, tree, branch, OP_ALT);
2087 if (BE (tree == NULL, 0))
2096 /* This function build the following tree, from regular expression
2103 CAT means concatenation. */
2106 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2107 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2109 bin_tree_t *tree, *exp;
2110 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2111 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2112 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2115 while (token->type != OP_ALT && token->type != END_OF_RE
2116 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2118 exp = parse_expression (regexp, preg, token, syntax, nest, err);
2119 if (BE (*err != REG_NOERROR && exp == NULL, 0))
2123 if (tree != NULL && exp != NULL)
2125 tree = create_tree (dfa, tree, exp, CONCAT);
2132 else if (tree == NULL)
2134 /* Otherwise exp == NULL, we don't need to create new tree. */
2139 /* This function build the following tree, from regular expression a*:
2146 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2147 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2149 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2151 switch (token->type)
2154 tree = create_token_tree (dfa, NULL, NULL, token);
2155 if (BE (tree == NULL, 0))
2160 #ifdef RE_ENABLE_I18N
2161 if (dfa->mb_cur_max > 1)
2163 while (!re_string_eoi (regexp)
2164 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2166 bin_tree_t *mbc_remain;
2167 fetch_token (token, regexp, syntax);
2168 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2169 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2170 if (BE (mbc_remain == NULL || tree == NULL, 0))
2179 case OP_OPEN_SUBEXP:
2180 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2181 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2184 case OP_OPEN_BRACKET:
2185 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2186 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2190 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2195 dfa->used_bkref_map |= 1 << token->opr.idx;
2196 tree = create_token_tree (dfa, NULL, NULL, token);
2197 if (BE (tree == NULL, 0))
2203 dfa->has_mb_node = 1;
2205 case OP_OPEN_DUP_NUM:
2206 if (syntax & RE_CONTEXT_INVALID_DUP)
2212 case OP_DUP_ASTERISK:
2214 case OP_DUP_QUESTION:
2215 if (syntax & RE_CONTEXT_INVALID_OPS)
2220 else if (syntax & RE_CONTEXT_INDEP_OPS)
2222 fetch_token (token, regexp, syntax);
2223 return parse_expression (regexp, preg, token, syntax, nest, err);
2225 /* else fall through */
2226 case OP_CLOSE_SUBEXP:
2227 if ((token->type == OP_CLOSE_SUBEXP) &&
2228 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2233 /* else fall through */
2234 case OP_CLOSE_DUP_NUM:
2235 /* We treat it as a normal character. */
2237 /* Then we can these characters as normal characters. */
2238 token->type = CHARACTER;
2239 /* mb_partial and word_char bits should be initialized already
2241 tree = create_token_tree (dfa, NULL, NULL, token);
2242 if (BE (tree == NULL, 0))
2249 if ((token->opr.ctx_type
2250 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2251 && dfa->word_ops_used == 0)
2252 init_word_char (dfa);
2253 if (token->opr.ctx_type == WORD_DELIM
2254 || token->opr.ctx_type == NOT_WORD_DELIM)
2256 bin_tree_t *tree_first, *tree_last;
2257 if (token->opr.ctx_type == WORD_DELIM)
2259 token->opr.ctx_type = WORD_FIRST;
2260 tree_first = create_token_tree (dfa, NULL, NULL, token);
2261 token->opr.ctx_type = WORD_LAST;
2265 token->opr.ctx_type = INSIDE_WORD;
2266 tree_first = create_token_tree (dfa, NULL, NULL, token);
2267 token->opr.ctx_type = INSIDE_NOTWORD;
2269 tree_last = create_token_tree (dfa, NULL, NULL, token);
2270 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2271 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2279 tree = create_token_tree (dfa, NULL, NULL, token);
2280 if (BE (tree == NULL, 0))
2286 /* We must return here, since ANCHORs can't be followed
2287 by repetition operators.
2288 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2289 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2290 fetch_token (token, regexp, syntax);
2293 tree = create_token_tree (dfa, NULL, NULL, token);
2294 if (BE (tree == NULL, 0))
2299 if (dfa->mb_cur_max > 1)
2300 dfa->has_mb_node = 1;
2304 tree = build_charclass_op (dfa, regexp->trans,
2305 (const unsigned char *) "alnum",
2306 (const unsigned char *) "_",
2307 token->type == OP_NOTWORD, err);
2308 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2313 tree = build_charclass_op (dfa, regexp->trans,
2314 (const unsigned char *) "space",
2315 (const unsigned char *) "",
2316 token->type == OP_NOTSPACE, err);
2317 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2327 /* Must not happen? */
2333 fetch_token (token, regexp, syntax);
2335 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2336 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2338 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2339 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2341 /* In BRE consecutive duplications are not allowed. */
2342 if ((syntax & RE_CONTEXT_INVALID_DUP)
2343 && (token->type == OP_DUP_ASTERISK
2344 || token->type == OP_OPEN_DUP_NUM))
2354 /* This function build the following tree, from regular expression
2362 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2363 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2365 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2368 cur_nsub = preg->re_nsub++;
2370 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2372 /* The subexpression may be a null string. */
2373 if (token->type == OP_CLOSE_SUBEXP)
2377 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2378 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2380 if (BE (*err != REG_NOERROR, 0))
2383 dfa->completed_bkref_map |= 1 << cur_nsub;
2385 tree = create_tree (dfa, tree, NULL, SUBEXP);
2386 if (BE (tree == NULL, 0))
2391 tree->token.opr.idx = cur_nsub;
2395 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2398 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2399 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2401 bin_tree_t *tree = NULL, *old_tree = NULL;
2402 int i, start, end, start_idx = re_string_cur_idx (regexp);
2403 re_token_t start_token = *token;
2405 if (token->type == OP_OPEN_DUP_NUM)
2408 start = fetch_number (regexp, token, syntax);
2411 if (token->type == CHARACTER && token->opr.c == ',')
2412 start = 0; /* We treat "{,m}" as "{0,m}". */
2415 *err = REG_BADBR; /* <re>{} is invalid. */
2419 if (BE (start != -2, 1))
2421 /* We treat "{n}" as "{n,n}". */
2422 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2423 : ((token->type == CHARACTER && token->opr.c == ',')
2424 ? fetch_number (regexp, token, syntax) : -2));
2426 if (BE (start == -2 || end == -2, 0))
2428 /* Invalid sequence. */
2429 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2431 if (token->type == END_OF_RE)
2439 /* If the syntax bit is set, rollback. */
2440 re_string_set_index (regexp, start_idx);
2441 *token = start_token;
2442 token->type = CHARACTER;
2443 /* mb_partial and word_char bits should be already initialized by
2448 if (BE (end != -1 && start > end, 0))
2450 /* First number greater than second. */
2457 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2458 end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2461 fetch_token (token, regexp, syntax);
2463 if (BE (elem == NULL, 0))
2465 if (BE (start == 0 && end == 0, 0))
2467 postorder (elem, free_tree, NULL);
2471 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2472 if (BE (start > 0, 0))
2475 for (i = 2; i <= start; ++i)
2477 elem = duplicate_tree (elem, dfa);
2478 tree = create_tree (dfa, tree, elem, CONCAT);
2479 if (BE (elem == NULL || tree == NULL, 0))
2480 goto parse_dup_op_espace;
2486 /* Duplicate ELEM before it is marked optional. */
2487 elem = duplicate_tree (elem, dfa);
2493 if (elem->token.type == SUBEXP)
2494 postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2496 tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2497 if (BE (tree == NULL, 0))
2498 goto parse_dup_op_espace;
2500 /* This loop is actually executed only when end != -1,
2501 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2502 already created the start+1-th copy. */
2503 for (i = start + 2; i <= end; ++i)
2505 elem = duplicate_tree (elem, dfa);
2506 tree = create_tree (dfa, tree, elem, CONCAT);
2507 if (BE (elem == NULL || tree == NULL, 0))
2508 goto parse_dup_op_espace;
2510 tree = create_tree (dfa, tree, NULL, OP_ALT);
2511 if (BE (tree == NULL, 0))
2512 goto parse_dup_op_espace;
2516 tree = create_tree (dfa, old_tree, tree, CONCAT);
2520 parse_dup_op_espace:
2525 /* Size of the names for collating symbol/equivalence_class/character_class.
2526 I'm not sure, but maybe enough. */
2527 #define BRACKET_NAME_BUF_SIZE 32
2530 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2531 Build the range expression which starts from START_ELEM, and ends
2532 at END_ELEM. The result are written to MBCSET and SBCSET.
2533 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2534 mbcset->range_ends, is a pointer argument sinse we may
2537 static reg_errcode_t
2538 build_range_exp (re_bitset_ptr_t sbcset,
2539 # ifdef RE_ENABLE_I18N
2540 re_charset_t *mbcset, int *range_alloc,
2542 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2544 unsigned int start_ch, end_ch;
2545 /* Equivalence Classes and Character Classes can't be a range start/end. */
2546 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2547 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2551 /* We can handle no multi character collating elements without libc
2553 if (BE ((start_elem->type == COLL_SYM
2554 && strlen ((char *) start_elem->opr.name) > 1)
2555 || (end_elem->type == COLL_SYM
2556 && strlen ((char *) end_elem->opr.name) > 1), 0))
2557 return REG_ECOLLATE;
2559 # ifdef RE_ENABLE_I18N
2562 wint_t start_wc, end_wc;
2563 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2565 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2566 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2568 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2569 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2571 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2572 ? __btowc (start_ch) : start_elem->opr.wch);
2573 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2574 ? __btowc (end_ch) : end_elem->opr.wch);
2575 if (start_wc == WEOF || end_wc == WEOF)
2576 return REG_ECOLLATE;
2577 cmp_buf[0] = start_wc;
2578 cmp_buf[4] = end_wc;
2579 if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2582 /* Got valid collation sequence values, add them as a new entry.
2583 However, for !_LIBC we have no collation elements: if the
2584 character set is single byte, the single byte character set
2585 that we build below suffices. parse_bracket_exp passes
2586 no MBCSET if dfa->mb_cur_max == 1. */
2589 /* Check the space of the arrays. */
2590 if (BE (*range_alloc == mbcset->nranges, 0))
2592 /* There is not enough space, need realloc. */
2593 wchar_t *new_array_start, *new_array_end;
2596 /* +1 in case of mbcset->nranges is 0. */
2597 new_nranges = 2 * mbcset->nranges + 1;
2598 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2599 are NULL if *range_alloc == 0. */
2600 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2602 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2605 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2608 mbcset->range_starts = new_array_start;
2609 mbcset->range_ends = new_array_end;
2610 *range_alloc = new_nranges;
2613 mbcset->range_starts[mbcset->nranges] = start_wc;
2614 mbcset->range_ends[mbcset->nranges++] = end_wc;
2617 /* Build the table for single byte characters. */
2618 for (wc = 0; wc < SBC_MAX; ++wc)
2621 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2622 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2623 bitset_set (sbcset, wc);
2626 # else /* not RE_ENABLE_I18N */
2629 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2630 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2632 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2633 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2635 if (start_ch > end_ch)
2637 /* Build the table for single byte characters. */
2638 for (ch = 0; ch < SBC_MAX; ++ch)
2639 if (start_ch <= ch && ch <= end_ch)
2640 bitset_set (sbcset, ch);
2642 # endif /* not RE_ENABLE_I18N */
2645 #endif /* not _LIBC */
2648 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2649 Build the collating element which is represented by NAME.
2650 The result are written to MBCSET and SBCSET.
2651 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2652 pointer argument since we may update it. */
2654 static reg_errcode_t
2655 build_collating_symbol (re_bitset_ptr_t sbcset,
2656 # ifdef RE_ENABLE_I18N
2657 re_charset_t *mbcset, int *coll_sym_alloc,
2659 const unsigned char *name)
2661 size_t name_len = strlen ((const char *) name);
2662 if (BE (name_len != 1, 0))
2663 return REG_ECOLLATE;
2666 bitset_set (sbcset, name[0]);
2670 #endif /* not _LIBC */
2672 /* This function parse bracket expression like "[abc]", "[a-c]",
2676 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2677 reg_syntax_t syntax, reg_errcode_t *err)
2680 const unsigned char *collseqmb;
2681 const char *collseqwc;
2684 const int32_t *symb_table;
2685 const unsigned char *extra;
2687 /* Local function for parse_bracket_exp used in _LIBC environement.
2688 Seek the collating symbol entry correspondings to NAME.
2689 Return the index of the symbol in the SYMB_TABLE. */
2692 __attribute ((always_inline))
2693 seek_collating_symbol_entry (name, name_len)
2694 const unsigned char *name;
2697 int32_t hash = elem_hash ((const char *) name, name_len);
2698 int32_t elem = hash % table_size;
2699 int32_t second = hash % (table_size - 2);
2700 while (symb_table[2 * elem] != 0)
2702 /* First compare the hashing value. */
2703 if (symb_table[2 * elem] == hash
2704 /* Compare the length of the name. */
2705 && name_len == extra[symb_table[2 * elem + 1]]
2706 /* Compare the name. */
2707 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2710 /* Yep, this is the entry. */
2720 /* Local function for parse_bracket_exp used in _LIBC environement.
2721 Look up the collation sequence value of BR_ELEM.
2722 Return the value if succeeded, UINT_MAX otherwise. */
2724 auto inline unsigned int
2725 __attribute ((always_inline))
2726 lookup_collation_sequence_value (br_elem)
2727 bracket_elem_t *br_elem;
2729 if (br_elem->type == SB_CHAR)
2732 if (MB_CUR_MAX == 1)
2735 return collseqmb[br_elem->opr.ch];
2738 wint_t wc = __btowc (br_elem->opr.ch);
2739 return __collseq_table_lookup (collseqwc, wc);
2742 else if (br_elem->type == MB_CHAR)
2744 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2746 else if (br_elem->type == COLL_SYM)
2748 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2752 elem = seek_collating_symbol_entry (br_elem->opr.name,
2754 if (symb_table[2 * elem] != 0)
2756 /* We found the entry. */
2757 idx = symb_table[2 * elem + 1];
2758 /* Skip the name of collating element name. */
2759 idx += 1 + extra[idx];
2760 /* Skip the byte sequence of the collating element. */
2761 idx += 1 + extra[idx];
2762 /* Adjust for the alignment. */
2763 idx = (idx + 3) & ~3;
2764 /* Skip the multibyte collation sequence value. */
2765 idx += sizeof (unsigned int);
2766 /* Skip the wide char sequence of the collating element. */
2767 idx += sizeof (unsigned int) *
2768 (1 + *(unsigned int *) (extra + idx));
2769 /* Return the collation sequence value. */
2770 return *(unsigned int *) (extra + idx);
2772 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2774 /* No valid character. Match it as a single byte
2776 return collseqmb[br_elem->opr.name[0]];
2779 else if (sym_name_len == 1)
2780 return collseqmb[br_elem->opr.name[0]];
2785 /* Local function for parse_bracket_exp used in _LIBC environement.
2786 Build the range expression which starts from START_ELEM, and ends
2787 at END_ELEM. The result are written to MBCSET and SBCSET.
2788 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2789 mbcset->range_ends, is a pointer argument sinse we may
2792 auto inline reg_errcode_t
2793 __attribute ((always_inline))
2794 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2795 re_charset_t *mbcset;
2797 re_bitset_ptr_t sbcset;
2798 bracket_elem_t *start_elem, *end_elem;
2801 uint32_t start_collseq;
2802 uint32_t end_collseq;
2804 /* Equivalence Classes and Character Classes can't be a range
2806 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2807 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2811 start_collseq = lookup_collation_sequence_value (start_elem);
2812 end_collseq = lookup_collation_sequence_value (end_elem);
2813 /* Check start/end collation sequence values. */
2814 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2815 return REG_ECOLLATE;
2816 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2819 /* Got valid collation sequence values, add them as a new entry.
2820 However, if we have no collation elements, and the character set
2821 is single byte, the single byte character set that we
2822 build below suffices. */
2823 if (nrules > 0 || dfa->mb_cur_max > 1)
2825 /* Check the space of the arrays. */
2826 if (BE (*range_alloc == mbcset->nranges, 0))
2828 /* There is not enough space, need realloc. */
2829 uint32_t *new_array_start;
2830 uint32_t *new_array_end;
2833 /* +1 in case of mbcset->nranges is 0. */
2834 new_nranges = 2 * mbcset->nranges + 1;
2835 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2837 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2840 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2843 mbcset->range_starts = new_array_start;
2844 mbcset->range_ends = new_array_end;
2845 *range_alloc = new_nranges;
2848 mbcset->range_starts[mbcset->nranges] = start_collseq;
2849 mbcset->range_ends[mbcset->nranges++] = end_collseq;
2852 /* Build the table for single byte characters. */
2853 for (ch = 0; ch < SBC_MAX; ch++)
2855 uint32_t ch_collseq;
2857 if (MB_CUR_MAX == 1)
2860 ch_collseq = collseqmb[ch];
2862 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2863 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2864 bitset_set (sbcset, ch);
2869 /* Local function for parse_bracket_exp used in _LIBC environement.
2870 Build the collating element which is represented by NAME.
2871 The result are written to MBCSET and SBCSET.
2872 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2873 pointer argument sinse we may update it. */
2875 auto inline reg_errcode_t
2876 __attribute ((always_inline))
2877 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2878 re_charset_t *mbcset;
2879 int *coll_sym_alloc;
2880 re_bitset_ptr_t sbcset;
2881 const unsigned char *name;
2884 size_t name_len = strlen ((const char *) name);
2887 elem = seek_collating_symbol_entry (name, name_len);
2888 if (symb_table[2 * elem] != 0)
2890 /* We found the entry. */
2891 idx = symb_table[2 * elem + 1];
2892 /* Skip the name of collating element name. */
2893 idx += 1 + extra[idx];
2895 else if (symb_table[2 * elem] == 0 && name_len == 1)
2897 /* No valid character, treat it as a normal
2899 bitset_set (sbcset, name[0]);
2903 return REG_ECOLLATE;
2905 /* Got valid collation sequence, add it as a new entry. */
2906 /* Check the space of the arrays. */
2907 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
2909 /* Not enough, realloc it. */
2910 /* +1 in case of mbcset->ncoll_syms is 0. */
2911 int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2912 /* Use realloc since mbcset->coll_syms is NULL
2914 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2915 new_coll_sym_alloc);
2916 if (BE (new_coll_syms == NULL, 0))
2918 mbcset->coll_syms = new_coll_syms;
2919 *coll_sym_alloc = new_coll_sym_alloc;
2921 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
2926 if (BE (name_len != 1, 0))
2927 return REG_ECOLLATE;
2930 bitset_set (sbcset, name[0]);
2937 re_token_t br_token;
2938 re_bitset_ptr_t sbcset;
2939 #ifdef RE_ENABLE_I18N
2940 re_charset_t *mbcset;
2941 int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
2942 int equiv_class_alloc = 0, char_class_alloc = 0;
2943 #endif /* not RE_ENABLE_I18N */
2945 bin_tree_t *work_tree;
2947 int first_round = 1;
2949 collseqmb = (const unsigned char *)
2950 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
2951 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
2957 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
2958 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
2959 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
2960 _NL_COLLATE_SYMB_TABLEMB);
2961 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
2962 _NL_COLLATE_SYMB_EXTRAMB);
2965 sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
2966 #ifdef RE_ENABLE_I18N
2967 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
2968 #endif /* RE_ENABLE_I18N */
2969 #ifdef RE_ENABLE_I18N
2970 if (BE (sbcset == NULL || mbcset == NULL, 0))
2972 if (BE (sbcset == NULL, 0))
2973 #endif /* RE_ENABLE_I18N */
2979 token_len = peek_token_bracket (token, regexp, syntax);
2980 if (BE (token->type == END_OF_RE, 0))
2983 goto parse_bracket_exp_free_return;
2985 if (token->type == OP_NON_MATCH_LIST)
2987 #ifdef RE_ENABLE_I18N
2988 mbcset->non_match = 1;
2989 #endif /* not RE_ENABLE_I18N */
2991 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
2992 bitset_set (sbcset, '\0');
2993 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
2994 token_len = peek_token_bracket (token, regexp, syntax);
2995 if (BE (token->type == END_OF_RE, 0))
2998 goto parse_bracket_exp_free_return;
3002 /* We treat the first ']' as a normal character. */
3003 if (token->type == OP_CLOSE_BRACKET)
3004 token->type = CHARACTER;
3008 bracket_elem_t start_elem, end_elem;
3009 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3010 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3012 int token_len2 = 0, is_range_exp = 0;
3015 start_elem.opr.name = start_name_buf;
3016 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3017 syntax, first_round);
3018 if (BE (ret != REG_NOERROR, 0))
3021 goto parse_bracket_exp_free_return;
3025 /* Get information about the next token. We need it in any case. */
3026 token_len = peek_token_bracket (token, regexp, syntax);
3028 /* Do not check for ranges if we know they are not allowed. */
3029 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3031 if (BE (token->type == END_OF_RE, 0))
3034 goto parse_bracket_exp_free_return;
3036 if (token->type == OP_CHARSET_RANGE)
3038 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3039 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3040 if (BE (token2.type == END_OF_RE, 0))
3043 goto parse_bracket_exp_free_return;
3045 if (token2.type == OP_CLOSE_BRACKET)
3047 /* We treat the last '-' as a normal character. */
3048 re_string_skip_bytes (regexp, -token_len);
3049 token->type = CHARACTER;
3056 if (is_range_exp == 1)
3058 end_elem.opr.name = end_name_buf;
3059 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3061 if (BE (ret != REG_NOERROR, 0))
3064 goto parse_bracket_exp_free_return;
3067 token_len = peek_token_bracket (token, regexp, syntax);
3070 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3071 &start_elem, &end_elem);
3073 # ifdef RE_ENABLE_I18N
3074 *err = build_range_exp (sbcset,
3075 dfa->mb_cur_max > 1 ? mbcset : NULL,
3076 &range_alloc, &start_elem, &end_elem);
3078 *err = build_range_exp (sbcset, &start_elem, &end_elem);
3080 #endif /* RE_ENABLE_I18N */
3081 if (BE (*err != REG_NOERROR, 0))
3082 goto parse_bracket_exp_free_return;
3086 switch (start_elem.type)
3089 bitset_set (sbcset, start_elem.opr.ch);
3091 #ifdef RE_ENABLE_I18N
3093 /* Check whether the array has enough space. */
3094 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3096 wchar_t *new_mbchars;
3097 /* Not enough, realloc it. */
3098 /* +1 in case of mbcset->nmbchars is 0. */
3099 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3100 /* Use realloc since array is NULL if *alloc == 0. */
3101 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3103 if (BE (new_mbchars == NULL, 0))
3104 goto parse_bracket_exp_espace;
3105 mbcset->mbchars = new_mbchars;
3107 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3109 #endif /* RE_ENABLE_I18N */
3111 *err = build_equiv_class (sbcset,
3112 #ifdef RE_ENABLE_I18N
3113 mbcset, &equiv_class_alloc,
3114 #endif /* RE_ENABLE_I18N */
3115 start_elem.opr.name);
3116 if (BE (*err != REG_NOERROR, 0))
3117 goto parse_bracket_exp_free_return;
3120 *err = build_collating_symbol (sbcset,
3121 #ifdef RE_ENABLE_I18N
3122 mbcset, &coll_sym_alloc,
3123 #endif /* RE_ENABLE_I18N */
3124 start_elem.opr.name);
3125 if (BE (*err != REG_NOERROR, 0))
3126 goto parse_bracket_exp_free_return;
3129 *err = build_charclass (regexp->trans, sbcset,
3130 #ifdef RE_ENABLE_I18N
3131 mbcset, &char_class_alloc,
3132 #endif /* RE_ENABLE_I18N */
3133 start_elem.opr.name, syntax);
3134 if (BE (*err != REG_NOERROR, 0))
3135 goto parse_bracket_exp_free_return;
3142 if (BE (token->type == END_OF_RE, 0))
3145 goto parse_bracket_exp_free_return;
3147 if (token->type == OP_CLOSE_BRACKET)
3151 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3153 /* If it is non-matching list. */
3155 bitset_not (sbcset);
3157 #ifdef RE_ENABLE_I18N
3158 /* Ensure only single byte characters are set. */
3159 if (dfa->mb_cur_max > 1)
3160 bitset_mask (sbcset, dfa->sb_char);
3162 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3163 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3164 || mbcset->non_match)))
3166 bin_tree_t *mbc_tree;
3168 /* Build a tree for complex bracket. */
3169 dfa->has_mb_node = 1;
3170 br_token.type = COMPLEX_BRACKET;
3171 br_token.opr.mbcset = mbcset;
3172 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3173 if (BE (mbc_tree == NULL, 0))
3174 goto parse_bracket_exp_espace;
3175 for (sbc_idx = 0; sbc_idx < BITSET_UINTS; ++sbc_idx)
3176 if (sbcset[sbc_idx])
3178 /* If there are no bits set in sbcset, there is no point
3179 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3180 if (sbc_idx < BITSET_UINTS)
3182 /* Build a tree for simple bracket. */
3183 br_token.type = SIMPLE_BRACKET;
3184 br_token.opr.sbcset = sbcset;
3185 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3186 if (BE (work_tree == NULL, 0))
3187 goto parse_bracket_exp_espace;
3189 /* Then join them by ALT node. */
3190 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3191 if (BE (work_tree == NULL, 0))
3192 goto parse_bracket_exp_espace;
3197 work_tree = mbc_tree;
3201 #endif /* not RE_ENABLE_I18N */
3203 #ifdef RE_ENABLE_I18N
3204 free_charset (mbcset);
3206 /* Build a tree for simple bracket. */
3207 br_token.type = SIMPLE_BRACKET;
3208 br_token.opr.sbcset = sbcset;
3209 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3210 if (BE (work_tree == NULL, 0))
3211 goto parse_bracket_exp_espace;
3215 parse_bracket_exp_espace:
3217 parse_bracket_exp_free_return:
3219 #ifdef RE_ENABLE_I18N
3220 free_charset (mbcset);
3221 #endif /* RE_ENABLE_I18N */
3225 /* Parse an element in the bracket expression. */
3227 static reg_errcode_t
3228 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3229 re_token_t *token, int token_len, re_dfa_t *dfa,
3230 reg_syntax_t syntax, int accept_hyphen)
3232 #ifdef RE_ENABLE_I18N
3234 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3235 if (cur_char_size > 1)
3237 elem->type = MB_CHAR;
3238 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3239 re_string_skip_bytes (regexp, cur_char_size);
3242 #endif /* RE_ENABLE_I18N */
3243 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3244 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3245 || token->type == OP_OPEN_EQUIV_CLASS)
3246 return parse_bracket_symbol (elem, regexp, token);
3247 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3249 /* A '-' must only appear as anything but a range indicator before
3250 the closing bracket. Everything else is an error. */
3252 (void) peek_token_bracket (&token2, regexp, syntax);
3253 if (token2.type != OP_CLOSE_BRACKET)
3254 /* The actual error value is not standardized since this whole
3255 case is undefined. But ERANGE makes good sense. */
3258 elem->type = SB_CHAR;
3259 elem->opr.ch = token->opr.c;
3263 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3264 such as [:<character_class>:], [.<collating_element>.], and
3265 [=<equivalent_class>=]. */
3267 static reg_errcode_t
3268 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3271 unsigned char ch, delim = token->opr.c;
3273 if (re_string_eoi(regexp))
3277 if (i >= BRACKET_NAME_BUF_SIZE)
3279 if (token->type == OP_OPEN_CHAR_CLASS)
3280 ch = re_string_fetch_byte_case (regexp);
3282 ch = re_string_fetch_byte (regexp);
3283 if (re_string_eoi(regexp))
3285 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3287 elem->opr.name[i] = ch;
3289 re_string_skip_bytes (regexp, 1);
3290 elem->opr.name[i] = '\0';
3291 switch (token->type)
3293 case OP_OPEN_COLL_ELEM:
3294 elem->type = COLL_SYM;
3296 case OP_OPEN_EQUIV_CLASS:
3297 elem->type = EQUIV_CLASS;
3299 case OP_OPEN_CHAR_CLASS:
3300 elem->type = CHAR_CLASS;
3308 /* Helper function for parse_bracket_exp.
3309 Build the equivalence class which is represented by NAME.
3310 The result are written to MBCSET and SBCSET.
3311 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3312 is a pointer argument sinse we may update it. */
3314 static reg_errcode_t
3315 build_equiv_class (re_bitset_ptr_t sbcset,
3316 #ifdef RE_ENABLE_I18N
3317 re_charset_t *mbcset, int *equiv_class_alloc,
3319 const unsigned char *name)
3322 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3325 const int32_t *table, *indirect;
3326 const unsigned char *weights, *extra, *cp;
3327 unsigned char char_buf[2];
3331 /* This #include defines a local function! */
3332 # include <locale/weight.h>
3333 /* Calculate the index for equivalence class. */
3335 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3336 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3337 _NL_COLLATE_WEIGHTMB);
3338 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3339 _NL_COLLATE_EXTRAMB);
3340 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3341 _NL_COLLATE_INDIRECTMB);
3342 idx1 = findidx (&cp);
3343 if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3344 /* This isn't a valid character. */
3345 return REG_ECOLLATE;
3347 /* Build single byte matcing table for this equivalence class. */
3348 char_buf[1] = (unsigned char) '\0';
3349 len = weights[idx1];
3350 for (ch = 0; ch < SBC_MAX; ++ch)
3354 idx2 = findidx (&cp);
3359 /* This isn't a valid character. */
3361 if (len == weights[idx2])
3364 while (cnt <= len &&
3365 weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3369 bitset_set (sbcset, ch);
3372 /* Check whether the array has enough space. */
3373 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3375 /* Not enough, realloc it. */
3376 /* +1 in case of mbcset->nequiv_classes is 0. */
3377 int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3378 /* Use realloc since the array is NULL if *alloc == 0. */
3379 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3381 new_equiv_class_alloc);
3382 if (BE (new_equiv_classes == NULL, 0))
3384 mbcset->equiv_classes = new_equiv_classes;
3385 *equiv_class_alloc = new_equiv_class_alloc;
3387 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3392 if (BE (strlen ((const char *) name) != 1, 0))
3393 return REG_ECOLLATE;
3394 bitset_set (sbcset, *name);
3399 /* Helper function for parse_bracket_exp.
3400 Build the character class which is represented by NAME.
3401 The result are written to MBCSET and SBCSET.
3402 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3403 is a pointer argument sinse we may update it. */
3405 static reg_errcode_t
3406 build_charclass (unsigned RE_TRANSLATE_TYPE trans, re_bitset_ptr_t sbcset,
3407 #ifdef RE_ENABLE_I18N
3408 re_charset_t *mbcset, int *char_class_alloc,
3410 const unsigned char *class_name, reg_syntax_t syntax)
3413 const char *name = (const char *) class_name;
3415 /* In case of REG_ICASE "upper" and "lower" match the both of
3416 upper and lower cases. */
3417 if ((syntax & RE_ICASE)
3418 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3421 #ifdef RE_ENABLE_I18N
3422 /* Check the space of the arrays. */
3423 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3425 /* Not enough, realloc it. */
3426 /* +1 in case of mbcset->nchar_classes is 0. */
3427 int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3428 /* Use realloc since array is NULL if *alloc == 0. */
3429 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3430 new_char_class_alloc);
3431 if (BE (new_char_classes == NULL, 0))
3433 mbcset->char_classes = new_char_classes;
3434 *char_class_alloc = new_char_class_alloc;
3436 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3437 #endif /* RE_ENABLE_I18N */
3439 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3440 for (i = 0; i < SBC_MAX; ++i) \
3442 if (ctype_func (i)) \
3444 int ch = trans ? trans[i] : i; \
3445 bitset_set (sbcset, ch); \
3449 if (strcmp (name, "alnum") == 0)
3450 BUILD_CHARCLASS_LOOP (isalnum)
3451 else if (strcmp (name, "cntrl") == 0)
3452 BUILD_CHARCLASS_LOOP (iscntrl)
3453 else if (strcmp (name, "lower") == 0)
3454 BUILD_CHARCLASS_LOOP (islower)
3455 else if (strcmp (name, "space") == 0)
3456 BUILD_CHARCLASS_LOOP (isspace)
3457 else if (strcmp (name, "alpha") == 0)
3458 BUILD_CHARCLASS_LOOP (isalpha)
3459 else if (strcmp (name, "digit") == 0)
3460 BUILD_CHARCLASS_LOOP (isdigit)
3461 else if (strcmp (name, "print") == 0)
3462 BUILD_CHARCLASS_LOOP (isprint)
3463 else if (strcmp (name, "upper") == 0)
3464 BUILD_CHARCLASS_LOOP (isupper)
3465 else if (strcmp (name, "blank") == 0)
3466 BUILD_CHARCLASS_LOOP (isblank)
3467 else if (strcmp (name, "graph") == 0)
3468 BUILD_CHARCLASS_LOOP (isgraph)
3469 else if (strcmp (name, "punct") == 0)
3470 BUILD_CHARCLASS_LOOP (ispunct)
3471 else if (strcmp (name, "xdigit") == 0)
3472 BUILD_CHARCLASS_LOOP (isxdigit)
3480 build_charclass_op (re_dfa_t *dfa, unsigned RE_TRANSLATE_TYPE trans,
3481 const unsigned char *class_name,
3482 const unsigned char *extra,
3483 int non_match, reg_errcode_t *err)
3485 re_bitset_ptr_t sbcset;
3486 #ifdef RE_ENABLE_I18N
3487 re_charset_t *mbcset;
3489 #endif /* not RE_ENABLE_I18N */
3491 re_token_t br_token;
3494 sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
3495 #ifdef RE_ENABLE_I18N
3496 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3497 #endif /* RE_ENABLE_I18N */
3499 #ifdef RE_ENABLE_I18N
3500 if (BE (sbcset == NULL || mbcset == NULL, 0))
3501 #else /* not RE_ENABLE_I18N */
3502 if (BE (sbcset == NULL, 0))
3503 #endif /* not RE_ENABLE_I18N */
3511 #ifdef RE_ENABLE_I18N
3513 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3514 bitset_set(cset->sbcset, '\0');
3516 mbcset->non_match = 1;
3517 #endif /* not RE_ENABLE_I18N */
3520 /* We don't care the syntax in this case. */
3521 ret = build_charclass (trans, sbcset,
3522 #ifdef RE_ENABLE_I18N
3524 #endif /* RE_ENABLE_I18N */
3527 if (BE (ret != REG_NOERROR, 0))
3530 #ifdef RE_ENABLE_I18N
3531 free_charset (mbcset);
3532 #endif /* RE_ENABLE_I18N */
3536 /* \w match '_' also. */
3537 for (; *extra; extra++)
3538 bitset_set (sbcset, *extra);
3540 /* If it is non-matching list. */
3542 bitset_not (sbcset);
3544 #ifdef RE_ENABLE_I18N
3545 /* Ensure only single byte characters are set. */
3546 if (dfa->mb_cur_max > 1)
3547 bitset_mask (sbcset, dfa->sb_char);
3550 /* Build a tree for simple bracket. */
3551 br_token.type = SIMPLE_BRACKET;
3552 br_token.opr.sbcset = sbcset;
3553 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3554 if (BE (tree == NULL, 0))
3555 goto build_word_op_espace;
3557 #ifdef RE_ENABLE_I18N
3558 if (dfa->mb_cur_max > 1)
3560 bin_tree_t *mbc_tree;
3561 /* Build a tree for complex bracket. */
3562 br_token.type = COMPLEX_BRACKET;
3563 br_token.opr.mbcset = mbcset;
3564 dfa->has_mb_node = 1;
3565 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3566 if (BE (mbc_tree == NULL, 0))
3567 goto build_word_op_espace;
3568 /* Then join them by ALT node. */
3569 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3570 if (BE (mbc_tree != NULL, 1))
3575 free_charset (mbcset);
3578 #else /* not RE_ENABLE_I18N */
3580 #endif /* not RE_ENABLE_I18N */
3582 build_word_op_espace:
3584 #ifdef RE_ENABLE_I18N
3585 free_charset (mbcset);
3586 #endif /* RE_ENABLE_I18N */
3591 /* This is intended for the expressions like "a{1,3}".
3592 Fetch a number from `input', and return the number.
3593 Return -1, if the number field is empty like "{,1}".
3594 Return -2, If an error is occured. */
3597 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3603 fetch_token (token, input, syntax);
3605 if (BE (token->type == END_OF_RE, 0))
3607 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3609 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3610 ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3611 num = (num > RE_DUP_MAX) ? -2 : num;
3616 #ifdef RE_ENABLE_I18N
3618 free_charset (re_charset_t *cset)
3620 re_free (cset->mbchars);
3622 re_free (cset->coll_syms);
3623 re_free (cset->equiv_classes);
3624 re_free (cset->range_starts);
3625 re_free (cset->range_ends);
3627 re_free (cset->char_classes);
3630 #endif /* RE_ENABLE_I18N */
3632 /* Functions for binary tree operation. */
3634 /* Create a tree node. */
3637 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3638 re_token_type_t type)
3642 return create_token_tree (dfa, left, right, &t);
3646 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3647 const re_token_t *token)
3650 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3652 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3654 if (storage == NULL)
3656 storage->next = dfa->str_tree_storage;
3657 dfa->str_tree_storage = storage;
3658 dfa->str_tree_storage_idx = 0;
3660 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3662 tree->parent = NULL;
3664 tree->right = right;
3665 tree->token = *token;
3666 tree->token.duplicated = 0;
3667 tree->token.opt_subexp = 0;
3670 tree->node_idx = -1;
3673 left->parent = tree;
3675 right->parent = tree;
3679 /* Mark the tree SRC as an optional subexpression.
3680 To be called from preorder or postorder. */
3682 static reg_errcode_t
3683 mark_opt_subexp (void *extra, bin_tree_t *node)
3685 int idx = (int) (long) extra;
3686 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3687 node->token.opt_subexp = 1;
3692 /* Free the allocated memory inside NODE. */
3695 free_token (re_token_t *node)
3697 #ifdef RE_ENABLE_I18N
3698 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3699 free_charset (node->opr.mbcset);
3701 #endif /* RE_ENABLE_I18N */
3702 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3703 re_free (node->opr.sbcset);
3706 /* Worker function for tree walking. Free the allocated memory inside NODE
3707 and its children. */
3709 static reg_errcode_t
3710 free_tree (void *extra, bin_tree_t *node)
3712 free_token (&node->token);
3717 /* Duplicate the node SRC, and return new node. This is a preorder
3718 visit similar to the one implemented by the generic visitor, but
3719 we need more infrastructure to maintain two parallel trees --- so,
3720 it's easier to duplicate. */
3723 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3725 const bin_tree_t *node;
3726 bin_tree_t *dup_root;
3727 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3729 for (node = root; ; )
3731 /* Create a new tree and link it back to the current parent. */
3732 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3735 (*p_new)->parent = dup_node;
3736 (*p_new)->token.duplicated = 1;
3739 /* Go to the left node, or up and to the right. */
3743 p_new = &dup_node->left;
3747 const bin_tree_t *prev = NULL;
3748 while (node->right == prev || node->right == NULL)
3751 node = node->parent;
3752 dup_node = dup_node->parent;
3757 p_new = &dup_node->right;