1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License along
17 with this program; if not, write to the Free Software Foundation,
18 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
20 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
21 int length, reg_syntax_t syntax);
22 static void re_compile_fastmap_iter (regex_t *bufp,
23 const re_dfastate_t *init_state,
25 static reg_errcode_t init_dfa (re_dfa_t *dfa, int pat_len);
27 static void free_charset (re_charset_t *cset);
28 #endif /* RE_ENABLE_I18N */
29 static void free_workarea_compile (regex_t *preg);
30 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
32 static void optimize_utf8 (re_dfa_t *dfa);
34 static reg_errcode_t analyze (regex_t *preg);
35 static reg_errcode_t preorder (bin_tree_t *root,
36 reg_errcode_t (fn (void *, bin_tree_t *)),
38 static reg_errcode_t postorder (bin_tree_t *root,
39 reg_errcode_t (fn (void *, bin_tree_t *)),
41 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
42 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
43 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
45 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
46 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
47 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
48 static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint);
49 static int search_duplicated_node (re_dfa_t *dfa, int org_node,
50 unsigned int constraint);
51 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
52 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
54 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
55 static int fetch_number (re_string_t *input, re_token_t *token,
57 static int peek_token (re_token_t *token, re_string_t *input,
59 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
60 reg_syntax_t syntax, reg_errcode_t *err);
61 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
62 re_token_t *token, reg_syntax_t syntax,
63 int nest, reg_errcode_t *err);
64 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
65 re_token_t *token, reg_syntax_t syntax,
66 int nest, reg_errcode_t *err);
67 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
68 re_token_t *token, reg_syntax_t syntax,
69 int nest, reg_errcode_t *err);
70 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
71 re_token_t *token, reg_syntax_t syntax,
72 int nest, reg_errcode_t *err);
73 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
74 re_dfa_t *dfa, re_token_t *token,
75 reg_syntax_t syntax, reg_errcode_t *err);
76 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
77 re_token_t *token, reg_syntax_t syntax,
79 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
81 re_token_t *token, int token_len,
85 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
89 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
91 int *equiv_class_alloc,
92 const unsigned char *name);
93 static reg_errcode_t build_charclass (unsigned REG_TRANSLATE_TYPE trans,
94 re_bitset_ptr_t sbcset,
96 int *char_class_alloc,
97 const unsigned char *class_name,
99 #else /* not RE_ENABLE_I18N */
100 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
101 const unsigned char *name);
102 static reg_errcode_t build_charclass (unsigned REG_TRANSLATE_TYPE trans,
103 re_bitset_ptr_t sbcset,
104 const unsigned char *class_name,
105 reg_syntax_t syntax);
106 #endif /* not RE_ENABLE_I18N */
107 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
108 unsigned REG_TRANSLATE_TYPE trans,
109 const unsigned char *class_name,
110 const unsigned char *extra,
111 int non_match, reg_errcode_t *err);
112 static bin_tree_t *create_tree (re_dfa_t *dfa,
113 bin_tree_t *left, bin_tree_t *right,
114 re_token_type_t type);
115 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
116 bin_tree_t *left, bin_tree_t *right,
117 const re_token_t *token);
118 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
119 static void free_token (re_token_t *node);
120 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
121 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
123 /* This table gives an error message for each of the error codes listed
124 in regex.h. Obviously the order here has to be same as there.
125 POSIX doesn't require that we do anything for REG_NOERROR,
126 but why not be nice? */
128 const char __re_error_msgid[] attribute_hidden =
130 #define REG_NOERROR_IDX 0
131 gettext_noop ("Success") /* REG_NOERROR */
133 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
134 gettext_noop ("No match") /* REG_NOMATCH */
136 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
137 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
139 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
140 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
142 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
143 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
145 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
146 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
148 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
149 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
151 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
152 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
154 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
155 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
157 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
158 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
160 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
161 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
163 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
164 gettext_noop ("Invalid range end") /* REG_ERANGE */
166 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
167 gettext_noop ("Memory exhausted") /* REG_ESPACE */
169 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
170 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
172 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
173 gettext_noop ("Premature end of regular expression") /* REG_EEND */
175 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
176 gettext_noop ("Regular expression too big") /* REG_ESIZE */
178 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
179 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
182 const size_t __re_error_msgid_idx[] attribute_hidden =
203 /* Entry points for GNU code. */
205 /* re_compile_pattern is the GNU regular expression compiler: it
206 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
207 Returns 0 if the pattern was valid, otherwise an error string.
209 Assumes the `re_allocated' (and perhaps `re_buffer') and `translate' fields
210 are set in BUFP on entry. */
213 re_compile_pattern (const char *pattern, size_t length,
214 struct re_pattern_buffer *bufp)
218 /* And GNU code determines whether or not to get register information
219 by passing null for the REGS argument to re_match, etc., not by
220 setting re_no_sub, unless REG_NO_SUB is set. */
221 bufp->re_no_sub = !!(re_syntax_options & REG_NO_SUB);
223 /* Match anchors at newline. */
224 bufp->re_newline_anchor = 1;
226 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
230 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
233 weak_alias (__re_compile_pattern, re_compile_pattern)
236 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
237 also be assigned to arbitrarily: each pattern buffer stores its own
238 syntax, so it can be changed between regex compilations. */
239 /* This has no initializer because initialized variables in Emacs
240 become read-only after dumping. */
241 reg_syntax_t re_syntax_options;
244 /* Specify the precise syntax of regexps for compilation. This provides
245 for compatibility for various utilities which historically have
246 different, incompatible syntaxes.
248 The argument SYNTAX is a bit mask comprised of the various bits
249 defined in regex.h. We return the old syntax. */
252 re_set_syntax (reg_syntax_t syntax)
254 reg_syntax_t ret = re_syntax_options;
256 re_syntax_options = syntax;
260 weak_alias (__re_set_syntax, re_set_syntax)
264 re_compile_fastmap (struct re_pattern_buffer *bufp)
266 re_dfa_t *dfa = (re_dfa_t *) bufp->re_buffer;
267 char *fastmap = bufp->re_fastmap;
269 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
270 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
271 if (dfa->init_state != dfa->init_state_word)
272 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
273 if (dfa->init_state != dfa->init_state_nl)
274 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
275 if (dfa->init_state != dfa->init_state_begbuf)
276 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
277 bufp->re_fastmap_accurate = 1;
281 weak_alias (__re_compile_fastmap, re_compile_fastmap)
285 __attribute ((always_inline))
286 re_set_fastmap (char *fastmap, int icase, int ch)
290 fastmap[tolower (ch)] = 1;
293 /* Helper function for re_compile_fastmap.
294 Compile fastmap for the initial_state INIT_STATE. */
297 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
300 re_dfa_t *dfa = (re_dfa_t *) bufp->re_buffer;
302 int icase = (dfa->mb_cur_max == 1 && (bufp->re_syntax & REG_IGNORE_CASE));
303 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
305 int node = init_state->nodes.elems[node_cnt];
306 re_token_type_t type = dfa->nodes[node].type;
308 if (type == CHARACTER)
310 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
311 #ifdef RE_ENABLE_I18N
312 if ((bufp->re_syntax & REG_IGNORE_CASE) && dfa->mb_cur_max > 1)
314 unsigned char buf[MB_LEN_MAX];
320 *p++ = dfa->nodes[node].opr.c;
321 while (++node < dfa->nodes_len
322 && dfa->nodes[node].type == CHARACTER
323 && dfa->nodes[node].mb_partial)
324 *p++ = dfa->nodes[node].opr.c;
325 memset (&state, 0, sizeof (state));
326 if (mbrtowc (&wc, (const char *) buf, p - buf,
328 && (__wcrtomb ((char *) buf, towlower (wc), &state)
330 re_set_fastmap (fastmap, 0, buf[0]);
334 else if (type == SIMPLE_BRACKET)
337 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
338 for (j = 0; j < UINT_BITS; ++j, ++ch)
339 if (dfa->nodes[node].opr.sbcset[i] & (1u << j))
340 re_set_fastmap (fastmap, icase, ch);
342 #ifdef RE_ENABLE_I18N
343 else if (type == COMPLEX_BRACKET)
346 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
347 if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
348 || cset->nranges || cset->nchar_classes)
351 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
353 /* In this case we want to catch the bytes which are
354 the first byte of any collation elements.
355 e.g. In da_DK, we want to catch 'a' since "aa"
356 is a valid collation element, and don't catch
357 'b' since 'b' is the only collation element
358 which starts from 'b'. */
360 const int32_t *table = (const int32_t *)
361 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
362 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
363 for (j = 0; j < UINT_BITS; ++j, ++ch)
365 re_set_fastmap (fastmap, icase, ch);
368 if (dfa->mb_cur_max > 1)
369 for (i = 0; i < SBC_MAX; ++i)
370 if (__btowc (i) == WEOF)
371 re_set_fastmap (fastmap, icase, i);
372 # endif /* not _LIBC */
374 for (i = 0; i < cset->nmbchars; ++i)
378 memset (&state, '\0', sizeof (state));
379 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
380 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
381 if ((bufp->re_syntax & REG_IGNORE_CASE) && dfa->mb_cur_max > 1)
383 if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
385 re_set_fastmap (fastmap, 0, *(unsigned char *) buf);
389 #endif /* RE_ENABLE_I18N */
390 else if (type == OP_PERIOD
391 #ifdef RE_ENABLE_I18N
392 || type == OP_UTF8_PERIOD
393 #endif /* RE_ENABLE_I18N */
394 || type == END_OF_RE)
396 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
397 if (type == END_OF_RE)
398 bufp->re_can_be_null = 1;
404 /* Entry point for POSIX code. */
405 /* regcomp takes a regular expression as a string and compiles it.
407 PREG is a regex_t *. We do not expect any fields to be initialized,
408 since POSIX says we shouldn't. Thus, we set
410 `re_buffer' to the compiled pattern;
411 `re_used' to the length of the compiled pattern;
412 `re_syntax' to REG_SYNTAX_POSIX_EXTENDED if the
413 REG_EXTENDED bit in CFLAGS is set; otherwise, to
414 REG_SYNTAX_POSIX_BASIC;
415 `re_newline_anchor' to REG_NEWLINE being set in CFLAGS;
416 `re_fastmap' to an allocated space for the fastmap;
417 `re_fastmap_accurate' to zero;
418 `re_nsub' to the number of subexpressions in PATTERN.
420 PATTERN is the address of the pattern string.
422 CFLAGS is a series of bits which affect compilation.
424 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
425 use POSIX basic syntax.
427 If REG_NEWLINE is set, then . and [^...] don't match newline.
428 Also, regexec will try a match beginning after every newline.
430 If REG_ICASE is set, then we considers upper- and lowercase
431 versions of letters to be equivalent when matching.
433 If REG_NOSUB is set, then when PREG is passed to regexec, that
434 routine will report only success or failure, and nothing about the
437 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
438 the return codes and their meanings.) */
441 regcomp (regex_t *__restrict preg, const char *__restrict pattern, int cflags)
444 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? REG_SYNTAX_POSIX_EXTENDED
445 : REG_SYNTAX_POSIX_BASIC);
447 preg->re_buffer = NULL;
448 preg->re_allocated = 0;
451 /* Try to allocate space for the fastmap. */
452 preg->re_fastmap = re_malloc (char, SBC_MAX);
453 if (BE (preg->re_fastmap == NULL, 0))
456 syntax |= (cflags & REG_ICASE) ? REG_IGNORE_CASE : 0;
458 /* If REG_NEWLINE is set, newlines are treated differently. */
459 if (cflags & REG_NEWLINE)
460 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
461 syntax &= ~REG_DOT_NEWLINE;
462 syntax |= REG_HAT_LISTS_NOT_NEWLINE;
463 /* It also changes the matching behavior. */
464 preg->re_newline_anchor = 1;
467 preg->re_newline_anchor = 0;
468 preg->re_no_sub = !!(cflags & REG_NOSUB);
469 preg->re_translate = NULL;
471 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
473 /* POSIX doesn't distinguish between an unmatched open-group and an
474 unmatched close-group: both are REG_EPAREN. */
475 if (ret == REG_ERPAREN)
478 /* We have already checked preg->re_fastmap != NULL. */
479 if (BE (ret == REG_NOERROR, 1))
480 /* Compute the fastmap now, since regexec cannot modify the pattern
481 buffer. This function never fails in this implementation. */
482 (void) re_compile_fastmap (preg);
485 /* Some error occurred while compiling the expression. */
486 re_free (preg->re_fastmap);
487 preg->re_fastmap = NULL;
493 weak_alias (__regcomp, regcomp)
496 /* Returns a message corresponding to an error code, ERRCODE, returned
497 from either regcomp or regexec. We don't use PREG here. */
500 regerror (int errcode, const regex_t *__restrict preg,
501 char *__restrict errbuf, size_t errbuf_size)
507 || errcode >= (int) (sizeof (__re_error_msgid_idx)
508 / sizeof (__re_error_msgid_idx[0])), 0))
509 /* Only error codes returned by the rest of the code should be passed
510 to this routine. If we are given anything else, or if other regex
511 code generates an invalid error code, then the program has a bug.
512 Dump core so we can fix it. */
515 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
517 msg_size = strlen (msg) + 1; /* Includes the null. */
519 if (BE (errbuf_size != 0, 1))
521 if (BE (msg_size > errbuf_size, 0))
523 #if defined HAVE_MEMPCPY || defined _LIBC
524 *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
526 memcpy (errbuf, msg, errbuf_size - 1);
527 errbuf[errbuf_size - 1] = 0;
531 memcpy (errbuf, msg, msg_size);
537 weak_alias (__regerror, regerror)
541 #ifdef RE_ENABLE_I18N
542 /* This static array is used for the map to single-byte characters when
543 UTF-8 is used. Otherwise we would allocate memory just to initialize
544 it the same all the time. UTF-8 is the preferred encoding so this is
545 a worthwhile optimization. */
546 static const bitset utf8_sb_map =
548 /* Set the first 128 bits. */
549 # if UINT_MAX == 0xffffffff
550 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
552 # error "Add case for new unsigned int size"
559 free_dfa_content (re_dfa_t *dfa)
564 for (i = 0; i < dfa->nodes_len; ++i)
565 free_token (dfa->nodes + i);
566 re_free (dfa->nexts);
567 for (i = 0; i < dfa->nodes_len; ++i)
569 if (dfa->eclosures != NULL)
570 re_node_set_free (dfa->eclosures + i);
571 if (dfa->inveclosures != NULL)
572 re_node_set_free (dfa->inveclosures + i);
573 if (dfa->edests != NULL)
574 re_node_set_free (dfa->edests + i);
576 re_free (dfa->edests);
577 re_free (dfa->eclosures);
578 re_free (dfa->inveclosures);
579 re_free (dfa->nodes);
581 if (dfa->state_table)
582 for (i = 0; i <= dfa->state_hash_mask; ++i)
584 struct re_state_table_entry *entry = dfa->state_table + i;
585 for (j = 0; j < entry->num; ++j)
587 re_dfastate_t *state = entry->array[j];
590 re_free (entry->array);
592 re_free (dfa->state_table);
593 #ifdef RE_ENABLE_I18N
594 if (dfa->sb_char != utf8_sb_map)
595 re_free (dfa->sb_char);
597 re_free (dfa->subexp_map);
599 re_free (dfa->re_str);
606 /* Free dynamically allocated space used by PREG. */
609 regfree (regex_t *preg)
611 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
612 if (BE (dfa != NULL, 1))
613 free_dfa_content (dfa);
614 preg->re_buffer = NULL;
615 preg->re_allocated = 0;
617 re_free (preg->re_fastmap);
618 preg->re_fastmap = NULL;
620 re_free (preg->re_translate);
621 preg->re_translate = NULL;
624 weak_alias (__regfree, regfree)
627 /* Entry points compatible with 4.2 BSD regex library. We don't define
628 them unless specifically requested. */
630 #if defined _REGEX_RE_COMP || defined _LIBC
632 /* BSD has one and only one pattern buffer. */
633 static struct re_pattern_buffer re_comp_buf;
637 /* Make these definitions weak in libc, so POSIX programs can redefine
638 these names if they don't use our functions, and still use
639 regcomp/regexec above without link errors. */
650 if (!re_comp_buf.re_buffer)
651 return gettext ("No previous regular expression");
655 if (re_comp_buf.re_buffer)
657 fastmap = re_comp_buf.re_fastmap;
658 re_comp_buf.re_fastmap = NULL;
659 __regfree (&re_comp_buf);
660 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
661 re_comp_buf.re_fastmap = fastmap;
664 if (re_comp_buf.re_fastmap == NULL)
666 re_comp_buf.re_fastmap = (char *) malloc (SBC_MAX);
667 if (re_comp_buf.re_fastmap == NULL)
668 return (char *) gettext (__re_error_msgid
669 + __re_error_msgid_idx[(int) REG_ESPACE]);
672 /* Since `re_exec' always passes NULL for the `regs' argument, we
673 don't need to initialize the pattern buffer fields which affect it. */
675 /* Match anchors at newlines. */
676 re_comp_buf.re_newline_anchor = 1;
678 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
683 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
684 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
688 libc_freeres_fn (free_mem)
690 __regfree (&re_comp_buf);
694 #endif /* _REGEX_RE_COMP */
696 /* Internal entry point.
697 Compile the regular expression PATTERN, whose length is LENGTH.
698 SYNTAX indicate regular expression's syntax. */
701 re_compile_internal (regex_t *preg, const char * pattern, int length,
704 reg_errcode_t err = REG_NOERROR;
708 /* Initialize the pattern buffer. */
709 preg->re_fastmap_accurate = 0;
710 preg->re_syntax = syntax;
711 preg->re_not_bol = preg->re_not_eol = 0;
714 preg->re_can_be_null = 0;
715 preg->re_regs_allocated = REG_UNALLOCATED;
717 /* Initialize the dfa. */
718 dfa = (re_dfa_t *) preg->re_buffer;
719 if (BE (preg->re_allocated < sizeof (re_dfa_t), 0))
721 /* If zero allocated, but buffer is non-null, try to realloc
722 enough space. This loses if buffer's address is bogus, but
723 that is the user's responsibility. If buffer is null this
724 is a simple allocation. */
725 dfa = re_realloc (preg->re_buffer, re_dfa_t, 1);
728 preg->re_allocated = sizeof (re_dfa_t);
729 preg->re_buffer = (unsigned char *) dfa;
731 preg->re_used = sizeof (re_dfa_t);
733 __libc_lock_init (dfa->lock);
735 err = init_dfa (dfa, length);
736 if (BE (err != REG_NOERROR, 0))
738 free_dfa_content (dfa);
739 preg->re_buffer = NULL;
740 preg->re_allocated = 0;
744 dfa->re_str = re_malloc (char, length + 1);
745 strncpy (dfa->re_str, pattern, length + 1);
748 err = re_string_construct (®exp, pattern, length, preg->re_translate,
749 syntax & REG_IGNORE_CASE, dfa);
750 if (BE (err != REG_NOERROR, 0))
752 re_compile_internal_free_return:
753 free_workarea_compile (preg);
754 re_string_destruct (®exp);
755 free_dfa_content (dfa);
756 preg->re_buffer = NULL;
757 preg->re_allocated = 0;
761 /* Parse the regular expression, and build a structure tree. */
763 dfa->str_tree = parse (®exp, preg, syntax, &err);
764 if (BE (dfa->str_tree == NULL, 0))
765 goto re_compile_internal_free_return;
767 /* Analyze the tree and create the nfa. */
768 err = analyze (preg);
769 if (BE (err != REG_NOERROR, 0))
770 goto re_compile_internal_free_return;
772 #ifdef RE_ENABLE_I18N
773 /* If possible, do searching in single byte encoding to speed things up. */
774 if (dfa->is_utf8 && !(syntax & REG_IGNORE_CASE) && preg->re_translate == NULL)
778 /* Then create the initial state of the dfa. */
779 err = create_initial_state (dfa);
781 /* Release work areas. */
782 free_workarea_compile (preg);
783 re_string_destruct (®exp);
785 if (BE (err != REG_NOERROR, 0))
787 free_dfa_content (dfa);
788 preg->re_buffer = NULL;
789 preg->re_allocated = 0;
795 /* Initialize DFA. We use the length of the regular expression PAT_LEN
796 as the initial length of some arrays. */
799 init_dfa (re_dfa_t *dfa, int pat_len)
801 unsigned int table_size;
806 memset (dfa, '\0', sizeof (re_dfa_t));
808 /* Force allocation of str_tree_storage the first time. */
809 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
811 dfa->nodes_alloc = pat_len + 1;
812 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
814 /* table_size = 2 ^ ceil(log pat_len) */
815 for (table_size = 1; ; table_size <<= 1)
816 if (table_size > pat_len)
819 dfa->state_table = re_calloc (struct re_state_table_entry, table_size);
820 dfa->state_hash_mask = table_size - 1;
822 dfa->mb_cur_max = MB_CUR_MAX;
824 if (dfa->mb_cur_max == 6
825 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
827 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
830 # ifdef HAVE_LANGINFO_CODESET
831 codeset_name = nl_langinfo (CODESET);
833 codeset_name = getenv ("LC_ALL");
834 if (codeset_name == NULL || codeset_name[0] == '\0')
835 codeset_name = getenv ("LC_CTYPE");
836 if (codeset_name == NULL || codeset_name[0] == '\0')
837 codeset_name = getenv ("LANG");
838 if (codeset_name == NULL)
840 else if (strchr (codeset_name, '.') != NULL)
841 codeset_name = strchr (codeset_name, '.') + 1;
844 if (strcasecmp (codeset_name, "UTF-8") == 0
845 || strcasecmp (codeset_name, "UTF8") == 0)
848 /* We check exhaustively in the loop below if this charset is a
849 superset of ASCII. */
850 dfa->map_notascii = 0;
853 #ifdef RE_ENABLE_I18N
854 if (dfa->mb_cur_max > 1)
857 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
862 dfa->sb_char = re_calloc (unsigned int, BITSET_UINTS);
863 if (BE (dfa->sb_char == NULL, 0))
866 /* Clear all bits by, then set those corresponding to single
868 bitset_empty (dfa->sb_char);
870 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
871 for (j = 0; j < UINT_BITS; ++j, ++ch)
873 wint_t wch = __btowc (ch);
875 dfa->sb_char[i] |= 1u << j;
877 if (isascii (ch) && wch != ch)
878 dfa->map_notascii = 1;
885 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
890 /* Initialize WORD_CHAR table, which indicate which character is
891 "word". In this case "word" means that it is the word construction
892 character used by some operators like "\<", "\>", etc. */
895 init_word_char (re_dfa_t *dfa)
898 dfa->word_ops_used = 1;
899 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
900 for (j = 0; j < UINT_BITS; ++j, ++ch)
901 if (isalnum (ch) || ch == '_')
902 dfa->word_char[i] |= 1u << j;
905 /* Free the work area which are only used while compiling. */
908 free_workarea_compile (regex_t *preg)
910 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
911 bin_tree_storage_t *storage, *next;
912 for (storage = dfa->str_tree_storage; storage; storage = next)
914 next = storage->next;
917 dfa->str_tree_storage = NULL;
918 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
919 dfa->str_tree = NULL;
920 re_free (dfa->org_indices);
921 dfa->org_indices = NULL;
924 /* Create initial states for all contexts. */
927 create_initial_state (re_dfa_t *dfa)
931 re_node_set init_nodes;
933 /* Initial states have the epsilon closure of the node which is
934 the first node of the regular expression. */
935 first = dfa->str_tree->first->node_idx;
936 dfa->init_node = first;
937 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
938 if (BE (err != REG_NOERROR, 0))
941 /* The back-references which are in initial states can epsilon transit,
942 since in this case all of the subexpressions can be null.
943 Then we add epsilon closures of the nodes which are the next nodes of
944 the back-references. */
945 if (dfa->nbackref > 0)
946 for (i = 0; i < init_nodes.nelem; ++i)
948 int node_idx = init_nodes.elems[i];
949 re_token_type_t type = dfa->nodes[node_idx].type;
952 if (type != OP_BACK_REF)
954 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
956 re_token_t *clexp_node;
957 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
958 if (clexp_node->type == OP_CLOSE_SUBEXP
959 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
962 if (clexp_idx == init_nodes.nelem)
965 if (type == OP_BACK_REF)
967 int dest_idx = dfa->edests[node_idx].elems[0];
968 if (!re_node_set_contains (&init_nodes, dest_idx))
970 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
976 /* It must be the first time to invoke acquire_state. */
977 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
978 /* We don't check ERR here, since the initial state must not be NULL. */
979 if (BE (dfa->init_state == NULL, 0))
981 if (dfa->init_state->has_constraint)
983 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
985 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
987 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
991 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
992 || dfa->init_state_begbuf == NULL, 0))
996 dfa->init_state_word = dfa->init_state_nl
997 = dfa->init_state_begbuf = dfa->init_state;
999 re_node_set_free (&init_nodes);
1003 #ifdef RE_ENABLE_I18N
1004 /* If it is possible to do searching in single byte encoding instead of UTF-8
1005 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1006 DFA nodes where needed. */
1009 optimize_utf8 (re_dfa_t *dfa)
1011 int node, i, mb_chars = 0, has_period = 0;
1013 for (node = 0; node < dfa->nodes_len; ++node)
1014 switch (dfa->nodes[node].type)
1017 if (dfa->nodes[node].opr.c >= 0x80)
1021 switch (dfa->nodes[node].opr.idx)
1029 /* Word anchors etc. cannot be handled. */
1039 case OP_DUP_ASTERISK:
1040 case OP_OPEN_SUBEXP:
1041 case OP_CLOSE_SUBEXP:
1043 case COMPLEX_BRACKET:
1045 case SIMPLE_BRACKET:
1046 /* Just double check. */
1047 for (i = 0x80 / UINT_BITS; i < BITSET_UINTS; ++i)
1048 if (dfa->nodes[node].opr.sbcset[i])
1055 if (mb_chars || has_period)
1056 for (node = 0; node < dfa->nodes_len; ++node)
1058 if (dfa->nodes[node].type == CHARACTER
1059 && dfa->nodes[node].opr.c >= 0x80)
1060 dfa->nodes[node].mb_partial = 0;
1061 else if (dfa->nodes[node].type == OP_PERIOD)
1062 dfa->nodes[node].type = OP_UTF8_PERIOD;
1065 /* The search can be in single byte locale. */
1066 dfa->mb_cur_max = 1;
1068 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1072 /* Analyze the structure tree, and calculate "first", "next", "edest",
1073 "eclosure", and "inveclosure". */
1075 static reg_errcode_t
1076 analyze (regex_t *preg)
1078 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
1081 /* Allocate arrays. */
1082 dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1083 dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1084 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1085 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1086 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1087 || dfa->eclosures == NULL, 0))
1090 dfa->subexp_map = re_malloc (int, preg->re_nsub);
1091 if (dfa->subexp_map != NULL)
1094 for (i = 0; i < preg->re_nsub; i++)
1095 dfa->subexp_map[i] = i;
1096 preorder (dfa->str_tree, optimize_subexps, dfa);
1097 for (i = 0; i < preg->re_nsub; i++)
1098 if (dfa->subexp_map[i] != i)
1100 if (i == preg->re_nsub)
1102 free (dfa->subexp_map);
1103 dfa->subexp_map = NULL;
1107 ret = postorder (dfa->str_tree, lower_subexps, preg);
1108 if (BE (ret != REG_NOERROR, 0))
1110 ret = postorder (dfa->str_tree, calc_first, dfa);
1111 if (BE (ret != REG_NOERROR, 0))
1113 preorder (dfa->str_tree, calc_next, dfa);
1114 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1115 if (BE (ret != REG_NOERROR, 0))
1117 ret = calc_eclosure (dfa);
1118 if (BE (ret != REG_NOERROR, 0))
1121 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1122 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1123 if ((!preg->re_no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1126 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1127 if (BE (dfa->inveclosures == NULL, 0))
1129 ret = calc_inveclosure (dfa);
1135 /* Our parse trees are very unbalanced, so we cannot use a stack to
1136 implement parse tree visits. Instead, we use parent pointers and
1137 some hairy code in these two functions. */
1138 static reg_errcode_t
1139 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1142 bin_tree_t *node, *prev;
1144 for (node = root; ; )
1146 /* Descend down the tree, preferably to the left (or to the right
1147 if that's the only child). */
1148 while (node->left || node->right)
1156 reg_errcode_t err = fn (extra, node);
1157 if (BE (err != REG_NOERROR, 0))
1159 if (node->parent == NULL)
1162 node = node->parent;
1164 /* Go up while we have a node that is reached from the right. */
1165 while (node->right == prev || node->right == NULL);
1170 static reg_errcode_t
1171 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1176 for (node = root; ; )
1178 reg_errcode_t err = fn (extra, node);
1179 if (BE (err != REG_NOERROR, 0))
1182 /* Go to the left node, or up and to the right. */
1187 bin_tree_t *prev = NULL;
1188 while (node->right == prev || node->right == NULL)
1191 node = node->parent;
1200 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1201 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1202 backreferences as well. Requires a preorder visit. */
1203 static reg_errcode_t
1204 optimize_subexps (void *extra, bin_tree_t *node)
1206 re_dfa_t *dfa = (re_dfa_t *) extra;
1208 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1210 int idx = node->token.opr.idx;
1211 node->token.opr.idx = dfa->subexp_map[idx];
1212 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1215 else if (node->token.type == SUBEXP
1216 && node->left && node->left->token.type == SUBEXP)
1218 int other_idx = node->left->token.opr.idx;
1220 node->left = node->left->left;
1222 node->left->parent = node;
1224 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1225 if (other_idx < CHAR_BIT * sizeof dfa->used_bkref_map)
1226 dfa->used_bkref_map &= ~(1u << other_idx);
1232 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1233 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1234 static reg_errcode_t
1235 lower_subexps (void *extra, bin_tree_t *node)
1237 regex_t *preg = (regex_t *) extra;
1238 reg_errcode_t err = REG_NOERROR;
1240 if (node->left && node->left->token.type == SUBEXP)
1242 node->left = lower_subexp (&err, preg, node->left);
1244 node->left->parent = node;
1246 if (node->right && node->right->token.type == SUBEXP)
1248 node->right = lower_subexp (&err, preg, node->right);
1250 node->right->parent = node;
1257 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1259 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
1260 bin_tree_t *body = node->left;
1261 bin_tree_t *op, *cls, *tree1, *tree;
1264 /* We do not optimize empty subexpressions, because otherwise we may
1265 have bad CONCAT nodes with NULL children. This is obviously not
1266 very common, so we do not lose much. An example that triggers
1267 this case is the sed "script" /\(\)/x. */
1268 && node->left != NULL
1269 && (node->token.opr.idx >= CHAR_BIT * sizeof dfa->used_bkref_map
1270 || !(dfa->used_bkref_map & (1u << node->token.opr.idx))))
1273 /* Convert the SUBEXP node to the concatenation of an
1274 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1275 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1276 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1277 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1278 tree = create_tree (dfa, op, tree1, CONCAT);
1279 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1285 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1286 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1290 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1291 nodes. Requires a postorder visit. */
1292 static reg_errcode_t
1293 calc_first (void *extra, bin_tree_t *node)
1295 re_dfa_t *dfa = (re_dfa_t *) extra;
1296 if (node->token.type == CONCAT)
1298 node->first = node->left->first;
1299 node->node_idx = node->left->node_idx;
1304 node->node_idx = re_dfa_add_node (dfa, node->token);
1305 if (BE (node->node_idx == -1, 0))
1311 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1312 static reg_errcode_t
1313 calc_next (void *extra, bin_tree_t *node)
1315 switch (node->token.type)
1317 case OP_DUP_ASTERISK:
1318 node->left->next = node;
1321 node->left->next = node->right->first;
1322 node->right->next = node->next;
1326 node->left->next = node->next;
1328 node->right->next = node->next;
1334 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1335 static reg_errcode_t
1336 link_nfa_nodes (void *extra, bin_tree_t *node)
1338 re_dfa_t *dfa = (re_dfa_t *) extra;
1339 int idx = node->node_idx;
1340 reg_errcode_t err = REG_NOERROR;
1342 switch (node->token.type)
1348 assert (node->next == NULL);
1351 case OP_DUP_ASTERISK:
1355 dfa->has_plural_match = 1;
1356 if (node->left != NULL)
1357 left = node->left->first->node_idx;
1359 left = node->next->node_idx;
1360 if (node->right != NULL)
1361 right = node->right->first->node_idx;
1363 right = node->next->node_idx;
1365 assert (right > -1);
1366 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1371 case OP_OPEN_SUBEXP:
1372 case OP_CLOSE_SUBEXP:
1373 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1377 dfa->nexts[idx] = node->next->node_idx;
1378 if (node->token.type == OP_BACK_REF)
1379 re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1383 assert (!IS_EPSILON_NODE (node->token.type));
1384 dfa->nexts[idx] = node->next->node_idx;
1391 /* Duplicate the epsilon closure of the node ROOT_NODE.
1392 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1393 to their own constraint. */
1395 static reg_errcode_t
1396 duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node,
1397 int root_node, unsigned int init_constraint)
1399 int org_node, clone_node, ret;
1400 unsigned int constraint = init_constraint;
1401 for (org_node = top_org_node, clone_node = top_clone_node;;)
1403 int org_dest, clone_dest;
1404 if (dfa->nodes[org_node].type == OP_BACK_REF)
1406 /* If the back reference epsilon-transit, its destination must
1407 also have the constraint. Then duplicate the epsilon closure
1408 of the destination of the back reference, and store it in
1409 edests of the back reference. */
1410 org_dest = dfa->nexts[org_node];
1411 re_node_set_empty (dfa->edests + clone_node);
1412 clone_dest = duplicate_node (dfa, org_dest, constraint);
1413 if (BE (clone_dest == -1, 0))
1415 dfa->nexts[clone_node] = dfa->nexts[org_node];
1416 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1417 if (BE (ret < 0, 0))
1420 else if (dfa->edests[org_node].nelem == 0)
1422 /* In case of the node can't epsilon-transit, don't duplicate the
1423 destination and store the original destination as the
1424 destination of the node. */
1425 dfa->nexts[clone_node] = dfa->nexts[org_node];
1428 else if (dfa->edests[org_node].nelem == 1)
1430 /* In case of the node can epsilon-transit, and it has only one
1432 org_dest = dfa->edests[org_node].elems[0];
1433 re_node_set_empty (dfa->edests + clone_node);
1434 if (dfa->nodes[org_node].type == ANCHOR)
1436 /* In case of the node has another constraint, append it. */
1437 if (org_node == root_node && clone_node != org_node)
1439 /* ...but if the node is root_node itself, it means the
1440 epsilon closure have a loop, then tie it to the
1441 destination of the root_node. */
1442 ret = re_node_set_insert (dfa->edests + clone_node,
1444 if (BE (ret < 0, 0))
1448 constraint |= dfa->nodes[org_node].opr.ctx_type;
1450 clone_dest = duplicate_node (dfa, org_dest, constraint);
1451 if (BE (clone_dest == -1, 0))
1453 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1454 if (BE (ret < 0, 0))
1457 else /* dfa->edests[org_node].nelem == 2 */
1459 /* In case of the node can epsilon-transit, and it has two
1460 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1461 org_dest = dfa->edests[org_node].elems[0];
1462 re_node_set_empty (dfa->edests + clone_node);
1463 /* Search for a duplicated node which satisfies the constraint. */
1464 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1465 if (clone_dest == -1)
1467 /* There are no such a duplicated node, create a new one. */
1469 clone_dest = duplicate_node (dfa, org_dest, constraint);
1470 if (BE (clone_dest == -1, 0))
1472 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1473 if (BE (ret < 0, 0))
1475 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1476 root_node, constraint);
1477 if (BE (err != REG_NOERROR, 0))
1482 /* There are a duplicated node which satisfy the constraint,
1483 use it to avoid infinite loop. */
1484 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1485 if (BE (ret < 0, 0))
1489 org_dest = dfa->edests[org_node].elems[1];
1490 clone_dest = duplicate_node (dfa, org_dest, constraint);
1491 if (BE (clone_dest == -1, 0))
1493 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1494 if (BE (ret < 0, 0))
1497 org_node = org_dest;
1498 clone_node = clone_dest;
1503 /* Search for a node which is duplicated from the node ORG_NODE, and
1504 satisfies the constraint CONSTRAINT. */
1507 search_duplicated_node (re_dfa_t *dfa, int org_node, unsigned int constraint)
1510 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1512 if (org_node == dfa->org_indices[idx]
1513 && constraint == dfa->nodes[idx].constraint)
1514 return idx; /* Found. */
1516 return -1; /* Not found. */
1519 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1520 Return the index of the new node, or -1 if insufficient storage is
1524 duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint)
1526 int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1527 if (BE (dup_idx != -1, 1))
1529 dfa->nodes[dup_idx].constraint = constraint;
1530 if (dfa->nodes[org_idx].type == ANCHOR)
1531 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
1532 dfa->nodes[dup_idx].duplicated = 1;
1534 /* Store the index of the original node. */
1535 dfa->org_indices[dup_idx] = org_idx;
1540 static reg_errcode_t
1541 calc_inveclosure (re_dfa_t *dfa)
1544 for (idx = 0; idx < dfa->nodes_len; ++idx)
1545 re_node_set_init_empty (dfa->inveclosures + idx);
1547 for (src = 0; src < dfa->nodes_len; ++src)
1549 int *elems = dfa->eclosures[src].elems;
1550 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1552 ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1553 if (BE (ret == -1, 0))
1561 /* Calculate "eclosure" for all the node in DFA. */
1563 static reg_errcode_t
1564 calc_eclosure (re_dfa_t *dfa)
1566 int node_idx, incomplete;
1568 assert (dfa->nodes_len > 0);
1571 /* For each nodes, calculate epsilon closure. */
1572 for (node_idx = 0; ; ++node_idx)
1575 re_node_set eclosure_elem;
1576 if (node_idx == dfa->nodes_len)
1585 assert (dfa->eclosures[node_idx].nelem != -1);
1588 /* If we have already calculated, skip it. */
1589 if (dfa->eclosures[node_idx].nelem != 0)
1591 /* Calculate epsilon closure of `node_idx'. */
1592 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1593 if (BE (err != REG_NOERROR, 0))
1596 if (dfa->eclosures[node_idx].nelem == 0)
1599 re_node_set_free (&eclosure_elem);
1605 /* Calculate epsilon closure of NODE. */
1607 static reg_errcode_t
1608 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root)
1611 unsigned int constraint;
1613 re_node_set eclosure;
1615 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1616 if (BE (err != REG_NOERROR, 0))
1619 /* This indicates that we are calculating this node now.
1620 We reference this value to avoid infinite loop. */
1621 dfa->eclosures[node].nelem = -1;
1623 constraint = ((dfa->nodes[node].type == ANCHOR)
1624 ? dfa->nodes[node].opr.ctx_type : 0);
1625 /* If the current node has constraints, duplicate all nodes.
1626 Since they must inherit the constraints. */
1628 && dfa->edests[node].nelem
1629 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1631 int org_node, cur_node;
1632 org_node = cur_node = node;
1633 err = duplicate_node_closure (dfa, node, node, node, constraint);
1634 if (BE (err != REG_NOERROR, 0))
1638 /* Expand each epsilon destination nodes. */
1639 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1640 for (i = 0; i < dfa->edests[node].nelem; ++i)
1642 re_node_set eclosure_elem;
1643 int edest = dfa->edests[node].elems[i];
1644 /* If calculating the epsilon closure of `edest' is in progress,
1645 return intermediate result. */
1646 if (dfa->eclosures[edest].nelem == -1)
1651 /* If we haven't calculated the epsilon closure of `edest' yet,
1652 calculate now. Otherwise use calculated epsilon closure. */
1653 if (dfa->eclosures[edest].nelem == 0)
1655 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1656 if (BE (err != REG_NOERROR, 0))
1660 eclosure_elem = dfa->eclosures[edest];
1661 /* Merge the epsilon closure of `edest'. */
1662 re_node_set_merge (&eclosure, &eclosure_elem);
1663 /* If the epsilon closure of `edest' is incomplete,
1664 the epsilon closure of this node is also incomplete. */
1665 if (dfa->eclosures[edest].nelem == 0)
1668 re_node_set_free (&eclosure_elem);
1672 /* Epsilon closures include itself. */
1673 re_node_set_insert (&eclosure, node);
1674 if (incomplete && !root)
1675 dfa->eclosures[node].nelem = 0;
1677 dfa->eclosures[node] = eclosure;
1678 *new_set = eclosure;
1682 /* Functions for token which are used in the parser. */
1684 /* Fetch a token from INPUT.
1685 We must not use this function inside bracket expressions. */
1688 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1690 re_string_skip_bytes (input, peek_token (result, input, syntax));
1693 /* Peek a token from INPUT, and return the length of the token.
1694 We must not use this function inside bracket expressions. */
1697 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1701 if (re_string_eoi (input))
1703 token->type = END_OF_RE;
1707 c = re_string_peek_byte (input, 0);
1710 token->word_char = 0;
1711 #ifdef RE_ENABLE_I18N
1712 token->mb_partial = 0;
1713 if (input->mb_cur_max > 1 &&
1714 !re_string_first_byte (input, re_string_cur_idx (input)))
1716 token->type = CHARACTER;
1717 token->mb_partial = 1;
1724 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1726 token->type = BACK_SLASH;
1730 c2 = re_string_peek_byte_case (input, 1);
1732 token->type = CHARACTER;
1733 #ifdef RE_ENABLE_I18N
1734 if (input->mb_cur_max > 1)
1736 wint_t wc = re_string_wchar_at (input,
1737 re_string_cur_idx (input) + 1);
1738 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1742 token->word_char = IS_WORD_CHAR (c2) != 0;
1747 if (!(syntax & REG_LIMITED_OPS) && !(syntax & REG_NO_BK_VBAR))
1748 token->type = OP_ALT;
1750 case '1': case '2': case '3': case '4': case '5':
1751 case '6': case '7': case '8': case '9':
1752 if (!(syntax & REG_NO_BK_REFS))
1754 token->type = OP_BACK_REF;
1755 token->opr.idx = c2 - '1';
1759 if (!(syntax & REG_NO_GNU_OPS))
1761 token->type = ANCHOR;
1762 token->opr.ctx_type = WORD_FIRST;
1766 if (!(syntax & REG_NO_GNU_OPS))
1768 token->type = ANCHOR;
1769 token->opr.ctx_type = WORD_LAST;
1773 if (!(syntax & REG_NO_GNU_OPS))
1775 token->type = ANCHOR;
1776 token->opr.ctx_type = WORD_DELIM;
1780 if (!(syntax & REG_NO_GNU_OPS))
1782 token->type = ANCHOR;
1783 token->opr.ctx_type = NOT_WORD_DELIM;
1787 if (!(syntax & REG_NO_GNU_OPS))
1788 token->type = OP_WORD;
1791 if (!(syntax & REG_NO_GNU_OPS))
1792 token->type = OP_NOTWORD;
1795 if (!(syntax & REG_NO_GNU_OPS))
1796 token->type = OP_SPACE;
1799 if (!(syntax & REG_NO_GNU_OPS))
1800 token->type = OP_NOTSPACE;
1803 if (!(syntax & REG_NO_GNU_OPS))
1805 token->type = ANCHOR;
1806 token->opr.ctx_type = BUF_FIRST;
1810 if (!(syntax & REG_NO_GNU_OPS))
1812 token->type = ANCHOR;
1813 token->opr.ctx_type = BUF_LAST;
1817 if (!(syntax & REG_NO_BK_PARENS))
1818 token->type = OP_OPEN_SUBEXP;
1821 if (!(syntax & REG_NO_BK_PARENS))
1822 token->type = OP_CLOSE_SUBEXP;
1825 if (!(syntax & REG_LIMITED_OPS) && (syntax & REG_BK_PLUS_QM))
1826 token->type = OP_DUP_PLUS;
1829 if (!(syntax & REG_LIMITED_OPS) && (syntax & REG_BK_PLUS_QM))
1830 token->type = OP_DUP_QUESTION;
1833 if ((syntax & REG_INTERVALS) && (!(syntax & REG_NO_BK_BRACES)))
1834 token->type = OP_OPEN_DUP_NUM;
1837 if ((syntax & REG_INTERVALS) && (!(syntax & REG_NO_BK_BRACES)))
1838 token->type = OP_CLOSE_DUP_NUM;
1846 token->type = CHARACTER;
1847 #ifdef RE_ENABLE_I18N
1848 if (input->mb_cur_max > 1)
1850 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1851 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1855 token->word_char = IS_WORD_CHAR (token->opr.c);
1860 if (syntax & REG_NEWLINE_ALT)
1861 token->type = OP_ALT;
1864 if (!(syntax & REG_LIMITED_OPS) && (syntax & REG_NO_BK_VBAR))
1865 token->type = OP_ALT;
1868 token->type = OP_DUP_ASTERISK;
1871 if (!(syntax & REG_LIMITED_OPS) && !(syntax & REG_BK_PLUS_QM))
1872 token->type = OP_DUP_PLUS;
1875 if (!(syntax & REG_LIMITED_OPS) && !(syntax & REG_BK_PLUS_QM))
1876 token->type = OP_DUP_QUESTION;
1879 if ((syntax & REG_INTERVALS) && (syntax & REG_NO_BK_BRACES))
1880 token->type = OP_OPEN_DUP_NUM;
1883 if ((syntax & REG_INTERVALS) && (syntax & REG_NO_BK_BRACES))
1884 token->type = OP_CLOSE_DUP_NUM;
1887 if (syntax & REG_NO_BK_PARENS)
1888 token->type = OP_OPEN_SUBEXP;
1891 if (syntax & REG_NO_BK_PARENS)
1892 token->type = OP_CLOSE_SUBEXP;
1895 token->type = OP_OPEN_BRACKET;
1898 token->type = OP_PERIOD;
1901 if (!(syntax & (REG_CONTEXT_INDEP_ANCHORS | REG_CARET_ANCHORS_HERE)) &&
1902 re_string_cur_idx (input) != 0)
1904 char prev = re_string_peek_byte (input, -1);
1905 if (!(syntax & REG_NEWLINE_ALT) || prev != '\n')
1908 token->type = ANCHOR;
1909 token->opr.ctx_type = LINE_FIRST;
1912 if (!(syntax & REG_CONTEXT_INDEP_ANCHORS) &&
1913 re_string_cur_idx (input) + 1 != re_string_length (input))
1916 re_string_skip_bytes (input, 1);
1917 peek_token (&next, input, syntax);
1918 re_string_skip_bytes (input, -1);
1919 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1922 token->type = ANCHOR;
1923 token->opr.ctx_type = LINE_LAST;
1931 /* Peek a token from INPUT, and return the length of the token.
1932 We must not use this function out of bracket expressions. */
1935 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1938 if (re_string_eoi (input))
1940 token->type = END_OF_RE;
1943 c = re_string_peek_byte (input, 0);
1946 #ifdef RE_ENABLE_I18N
1947 if (input->mb_cur_max > 1 &&
1948 !re_string_first_byte (input, re_string_cur_idx (input)))
1950 token->type = CHARACTER;
1953 #endif /* RE_ENABLE_I18N */
1955 if (c == '\\' && (syntax & REG_BACKSLASH_ESCAPE_IN_LISTS)
1956 && re_string_cur_idx (input) + 1 < re_string_length (input))
1958 /* In this case, '\' escape a character. */
1960 re_string_skip_bytes (input, 1);
1961 c2 = re_string_peek_byte (input, 0);
1963 token->type = CHARACTER;
1966 if (c == '[') /* '[' is a special char in a bracket exps. */
1970 if (re_string_cur_idx (input) + 1 < re_string_length (input))
1971 c2 = re_string_peek_byte (input, 1);
1979 token->type = OP_OPEN_COLL_ELEM;
1982 token->type = OP_OPEN_EQUIV_CLASS;
1985 if (syntax & REG_CHAR_CLASSES)
1987 token->type = OP_OPEN_CHAR_CLASS;
1990 /* else fall through. */
1992 token->type = CHARACTER;
2002 token->type = OP_CHARSET_RANGE;
2005 token->type = OP_CLOSE_BRACKET;
2008 token->type = OP_NON_MATCH_LIST;
2011 token->type = CHARACTER;
2016 /* Functions for parser. */
2018 /* Entry point of the parser.
2019 Parse the regular expression REGEXP and return the structure tree.
2020 If an error is occured, ERR is set by error code, and return NULL.
2021 This function build the following tree, from regular expression <reg_exp>:
2027 CAT means concatenation.
2028 EOR means end of regular expression. */
2031 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2034 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
2035 bin_tree_t *tree, *eor, *root;
2036 re_token_t current_token;
2037 dfa->syntax = syntax;
2038 fetch_token (¤t_token, regexp, syntax | REG_CARET_ANCHORS_HERE);
2039 tree = parse_reg_exp (regexp, preg, ¤t_token, syntax, 0, err);
2040 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2042 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2044 root = create_tree (dfa, tree, eor, CONCAT);
2047 if (BE (eor == NULL || root == NULL, 0))
2055 /* This function build the following tree, from regular expression
2056 <branch1>|<branch2>:
2062 ALT means alternative, which represents the operator `|'. */
2065 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2066 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2068 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
2069 bin_tree_t *tree, *branch = NULL;
2070 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2071 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2074 while (token->type == OP_ALT)
2076 fetch_token (token, regexp, syntax | REG_CARET_ANCHORS_HERE);
2077 if (token->type != OP_ALT && token->type != END_OF_RE
2078 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2080 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2081 if (BE (*err != REG_NOERROR && branch == NULL, 0))
2086 tree = create_tree (dfa, tree, branch, OP_ALT);
2087 if (BE (tree == NULL, 0))
2096 /* This function build the following tree, from regular expression
2103 CAT means concatenation. */
2106 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2107 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2109 bin_tree_t *tree, *exp;
2110 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
2111 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2112 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2115 while (token->type != OP_ALT && token->type != END_OF_RE
2116 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2118 exp = parse_expression (regexp, preg, token, syntax, nest, err);
2119 if (BE (*err != REG_NOERROR && exp == NULL, 0))
2123 if (tree != NULL && exp != NULL)
2125 tree = create_tree (dfa, tree, exp, CONCAT);
2132 else if (tree == NULL)
2134 /* Otherwise exp == NULL, we don't need to create new tree. */
2139 /* This function build the following tree, from regular expression a*:
2146 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2147 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2149 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
2151 switch (token->type)
2154 tree = create_token_tree (dfa, NULL, NULL, token);
2155 if (BE (tree == NULL, 0))
2160 #ifdef RE_ENABLE_I18N
2161 if (dfa->mb_cur_max > 1)
2163 while (!re_string_eoi (regexp)
2164 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2166 bin_tree_t *mbc_remain;
2167 fetch_token (token, regexp, syntax);
2168 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2169 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2170 if (BE (mbc_remain == NULL || tree == NULL, 0))
2179 case OP_OPEN_SUBEXP:
2180 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2181 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2184 case OP_OPEN_BRACKET:
2185 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2186 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2190 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2195 dfa->used_bkref_map |= 1 << token->opr.idx;
2196 tree = create_token_tree (dfa, NULL, NULL, token);
2197 if (BE (tree == NULL, 0))
2203 dfa->has_mb_node = 1;
2205 case OP_OPEN_DUP_NUM:
2206 if (syntax & REG_CONTEXT_INVALID_DUP)
2212 case OP_DUP_ASTERISK:
2214 case OP_DUP_QUESTION:
2215 if (syntax & REG_CONTEXT_INVALID_OPS)
2220 else if (syntax & REG_CONTEXT_INDEP_OPS)
2222 fetch_token (token, regexp, syntax);
2223 return parse_expression (regexp, preg, token, syntax, nest, err);
2225 /* else fall through */
2226 case OP_CLOSE_SUBEXP:
2227 if ((token->type == OP_CLOSE_SUBEXP) &&
2228 !(syntax & REG_UNMATCHED_RIGHT_PAREN_ORD))
2233 /* else fall through */
2234 case OP_CLOSE_DUP_NUM:
2235 /* We treat it as a normal character. */
2237 /* Then we can these characters as normal characters. */
2238 token->type = CHARACTER;
2239 /* mb_partial and word_char bits should be initialized already
2241 tree = create_token_tree (dfa, NULL, NULL, token);
2242 if (BE (tree == NULL, 0))
2249 if ((token->opr.ctx_type
2250 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2251 && dfa->word_ops_used == 0)
2252 init_word_char (dfa);
2253 if (token->opr.ctx_type == WORD_DELIM
2254 || token->opr.ctx_type == NOT_WORD_DELIM)
2256 bin_tree_t *tree_first, *tree_last;
2257 if (token->opr.ctx_type == WORD_DELIM)
2259 token->opr.ctx_type = WORD_FIRST;
2260 tree_first = create_token_tree (dfa, NULL, NULL, token);
2261 token->opr.ctx_type = WORD_LAST;
2265 token->opr.ctx_type = INSIDE_WORD;
2266 tree_first = create_token_tree (dfa, NULL, NULL, token);
2267 token->opr.ctx_type = INSIDE_NOTWORD;
2269 tree_last = create_token_tree (dfa, NULL, NULL, token);
2270 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2271 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2279 tree = create_token_tree (dfa, NULL, NULL, token);
2280 if (BE (tree == NULL, 0))
2286 /* We must return here, since ANCHORs can't be followed
2287 by repetition operators.
2288 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2289 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2290 fetch_token (token, regexp, syntax);
2293 tree = create_token_tree (dfa, NULL, NULL, token);
2294 if (BE (tree == NULL, 0))
2299 if (dfa->mb_cur_max > 1)
2300 dfa->has_mb_node = 1;
2304 tree = build_charclass_op (dfa, regexp->trans,
2305 (const unsigned char *) "alnum",
2306 (const unsigned char *) "_",
2307 token->type == OP_NOTWORD, err);
2308 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2313 tree = build_charclass_op (dfa, regexp->trans,
2314 (const unsigned char *) "space",
2315 (const unsigned char *) "",
2316 token->type == OP_NOTSPACE, err);
2317 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2327 /* Must not happen? */
2333 fetch_token (token, regexp, syntax);
2335 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2336 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2338 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2339 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2341 /* In BRE consecutive duplications are not allowed. */
2342 if ((syntax & REG_CONTEXT_INVALID_DUP)
2343 && (token->type == OP_DUP_ASTERISK
2344 || token->type == OP_OPEN_DUP_NUM))
2354 /* This function build the following tree, from regular expression
2362 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2363 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2365 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
2368 cur_nsub = preg->re_nsub++;
2370 fetch_token (token, regexp, syntax | REG_CARET_ANCHORS_HERE);
2372 /* The subexpression may be a null string. */
2373 if (token->type == OP_CLOSE_SUBEXP)
2377 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2378 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2380 if (BE (*err != REG_NOERROR, 0))
2384 if (cur_nsub <= '9' - '1')
2385 dfa->completed_bkref_map |= 1 << cur_nsub;
2387 tree = create_tree (dfa, tree, NULL, SUBEXP);
2388 if (BE (tree == NULL, 0))
2393 tree->token.opr.idx = cur_nsub;
2397 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2400 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2401 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2403 bin_tree_t *tree = NULL, *old_tree = NULL;
2404 int i, start, end, start_idx = re_string_cur_idx (regexp);
2405 re_token_t start_token = *token;
2407 if (token->type == OP_OPEN_DUP_NUM)
2410 start = fetch_number (regexp, token, syntax);
2413 if (token->type == CHARACTER && token->opr.c == ',')
2414 start = 0; /* We treat "{,m}" as "{0,m}". */
2417 *err = REG_BADBR; /* <re>{} is invalid. */
2421 if (BE (start != -2, 1))
2423 /* We treat "{n}" as "{n,n}". */
2424 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2425 : ((token->type == CHARACTER && token->opr.c == ',')
2426 ? fetch_number (regexp, token, syntax) : -2));
2428 if (BE (start == -2 || end == -2, 0))
2430 /* Invalid sequence. */
2431 if (BE (!(syntax & REG_INVALID_INTERVAL_ORD), 0))
2433 if (token->type == END_OF_RE)
2441 /* If the syntax bit is set, rollback. */
2442 re_string_set_index (regexp, start_idx);
2443 *token = start_token;
2444 token->type = CHARACTER;
2445 /* mb_partial and word_char bits should be already initialized by
2450 if (BE (end != -1 && start > end, 0))
2452 /* First number greater than second. */
2459 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2460 end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2463 fetch_token (token, regexp, syntax);
2465 if (BE (elem == NULL, 0))
2467 if (BE (start == 0 && end == 0, 0))
2469 postorder (elem, free_tree, NULL);
2473 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2474 if (BE (start > 0, 0))
2477 for (i = 2; i <= start; ++i)
2479 elem = duplicate_tree (elem, dfa);
2480 tree = create_tree (dfa, tree, elem, CONCAT);
2481 if (BE (elem == NULL || tree == NULL, 0))
2482 goto parse_dup_op_espace;
2488 /* Duplicate ELEM before it is marked optional. */
2489 elem = duplicate_tree (elem, dfa);
2495 if (elem->token.type == SUBEXP)
2496 postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2498 tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2499 if (BE (tree == NULL, 0))
2500 goto parse_dup_op_espace;
2502 /* This loop is actually executed only when end != -1,
2503 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2504 already created the start+1-th copy. */
2505 for (i = start + 2; i <= end; ++i)
2507 elem = duplicate_tree (elem, dfa);
2508 tree = create_tree (dfa, tree, elem, CONCAT);
2509 if (BE (elem == NULL || tree == NULL, 0))
2510 goto parse_dup_op_espace;
2512 tree = create_tree (dfa, tree, NULL, OP_ALT);
2513 if (BE (tree == NULL, 0))
2514 goto parse_dup_op_espace;
2518 tree = create_tree (dfa, old_tree, tree, CONCAT);
2522 parse_dup_op_espace:
2527 /* Size of the names for collating symbol/equivalence_class/character_class.
2528 I'm not sure, but maybe enough. */
2529 #define BRACKET_NAME_BUF_SIZE 32
2532 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2533 Build the range expression which starts from START_ELEM, and ends
2534 at END_ELEM. The result are written to MBCSET and SBCSET.
2535 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2536 mbcset->range_ends, is a pointer argument sinse we may
2539 static reg_errcode_t
2540 build_range_exp (re_bitset_ptr_t sbcset,
2541 # ifdef RE_ENABLE_I18N
2542 re_charset_t *mbcset, int *range_alloc,
2544 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2546 unsigned int start_ch, end_ch;
2547 /* Equivalence Classes and Character Classes can't be a range start/end. */
2548 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2549 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2553 /* We can handle no multi character collating elements without libc
2555 if (BE ((start_elem->type == COLL_SYM
2556 && strlen ((char *) start_elem->opr.name) > 1)
2557 || (end_elem->type == COLL_SYM
2558 && strlen ((char *) end_elem->opr.name) > 1), 0))
2559 return REG_ECOLLATE;
2561 # ifdef RE_ENABLE_I18N
2564 wint_t start_wc, end_wc;
2565 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2567 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2568 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2570 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2571 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2573 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2574 ? __btowc (start_ch) : start_elem->opr.wch);
2575 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2576 ? __btowc (end_ch) : end_elem->opr.wch);
2577 if (start_wc == WEOF || end_wc == WEOF)
2578 return REG_ECOLLATE;
2579 cmp_buf[0] = start_wc;
2580 cmp_buf[4] = end_wc;
2581 if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2584 /* Got valid collation sequence values, add them as a new entry.
2585 However, for !_LIBC we have no collation elements: if the
2586 character set is single byte, the single byte character set
2587 that we build below suffices. parse_bracket_exp passes
2588 no MBCSET if dfa->mb_cur_max == 1. */
2591 /* Check the space of the arrays. */
2592 if (BE (*range_alloc == mbcset->nranges, 0))
2594 /* There is not enough space, need realloc. */
2595 wchar_t *new_array_start, *new_array_end;
2598 /* +1 in case of mbcset->nranges is 0. */
2599 new_nranges = 2 * mbcset->nranges + 1;
2600 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2601 are NULL if *range_alloc == 0. */
2602 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2604 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2607 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2610 mbcset->range_starts = new_array_start;
2611 mbcset->range_ends = new_array_end;
2612 *range_alloc = new_nranges;
2615 mbcset->range_starts[mbcset->nranges] = start_wc;
2616 mbcset->range_ends[mbcset->nranges++] = end_wc;
2619 /* Build the table for single byte characters. */
2620 for (wc = 0; wc < SBC_MAX; ++wc)
2623 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2624 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2625 bitset_set (sbcset, wc);
2628 # else /* not RE_ENABLE_I18N */
2631 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2632 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2634 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2635 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2637 if (start_ch > end_ch)
2639 /* Build the table for single byte characters. */
2640 for (ch = 0; ch < SBC_MAX; ++ch)
2641 if (start_ch <= ch && ch <= end_ch)
2642 bitset_set (sbcset, ch);
2644 # endif /* not RE_ENABLE_I18N */
2647 #endif /* not _LIBC */
2650 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2651 Build the collating element which is represented by NAME.
2652 The result are written to MBCSET and SBCSET.
2653 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2654 pointer argument since we may update it. */
2656 static reg_errcode_t
2657 build_collating_symbol (re_bitset_ptr_t sbcset,
2658 # ifdef RE_ENABLE_I18N
2659 re_charset_t *mbcset, int *coll_sym_alloc,
2661 const unsigned char *name)
2663 size_t name_len = strlen ((const char *) name);
2664 if (BE (name_len != 1, 0))
2665 return REG_ECOLLATE;
2668 bitset_set (sbcset, name[0]);
2672 #endif /* not _LIBC */
2674 /* This function parse bracket expression like "[abc]", "[a-c]",
2678 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2679 reg_syntax_t syntax, reg_errcode_t *err)
2682 const unsigned char *collseqmb;
2683 const char *collseqwc;
2686 const int32_t *symb_table;
2687 const unsigned char *extra;
2689 /* Local function for parse_bracket_exp used in _LIBC environement.
2690 Seek the collating symbol entry correspondings to NAME.
2691 Return the index of the symbol in the SYMB_TABLE. */
2694 __attribute ((always_inline))
2695 seek_collating_symbol_entry (name, name_len)
2696 const unsigned char *name;
2699 int32_t hash = elem_hash ((const char *) name, name_len);
2700 int32_t elem = hash % table_size;
2701 int32_t second = hash % (table_size - 2);
2702 while (symb_table[2 * elem] != 0)
2704 /* First compare the hashing value. */
2705 if (symb_table[2 * elem] == hash
2706 /* Compare the length of the name. */
2707 && name_len == extra[symb_table[2 * elem + 1]]
2708 /* Compare the name. */
2709 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2712 /* Yep, this is the entry. */
2722 /* Local function for parse_bracket_exp used in _LIBC environement.
2723 Look up the collation sequence value of BR_ELEM.
2724 Return the value if succeeded, UINT_MAX otherwise. */
2726 auto inline unsigned int
2727 __attribute ((always_inline))
2728 lookup_collation_sequence_value (br_elem)
2729 bracket_elem_t *br_elem;
2731 if (br_elem->type == SB_CHAR)
2734 if (MB_CUR_MAX == 1)
2737 return collseqmb[br_elem->opr.ch];
2740 wint_t wc = __btowc (br_elem->opr.ch);
2741 return __collseq_table_lookup (collseqwc, wc);
2744 else if (br_elem->type == MB_CHAR)
2746 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2748 else if (br_elem->type == COLL_SYM)
2750 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2754 elem = seek_collating_symbol_entry (br_elem->opr.name,
2756 if (symb_table[2 * elem] != 0)
2758 /* We found the entry. */
2759 idx = symb_table[2 * elem + 1];
2760 /* Skip the name of collating element name. */
2761 idx += 1 + extra[idx];
2762 /* Skip the byte sequence of the collating element. */
2763 idx += 1 + extra[idx];
2764 /* Adjust for the alignment. */
2765 idx = (idx + 3) & ~3;
2766 /* Skip the multibyte collation sequence value. */
2767 idx += sizeof (unsigned int);
2768 /* Skip the wide char sequence of the collating element. */
2769 idx += sizeof (unsigned int) *
2770 (1 + *(unsigned int *) (extra + idx));
2771 /* Return the collation sequence value. */
2772 return *(unsigned int *) (extra + idx);
2774 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2776 /* No valid character. Match it as a single byte
2778 return collseqmb[br_elem->opr.name[0]];
2781 else if (sym_name_len == 1)
2782 return collseqmb[br_elem->opr.name[0]];
2787 /* Local function for parse_bracket_exp used in _LIBC environement.
2788 Build the range expression which starts from START_ELEM, and ends
2789 at END_ELEM. The result are written to MBCSET and SBCSET.
2790 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2791 mbcset->range_ends, is a pointer argument sinse we may
2794 auto inline reg_errcode_t
2795 __attribute ((always_inline))
2796 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2797 re_charset_t *mbcset;
2799 re_bitset_ptr_t sbcset;
2800 bracket_elem_t *start_elem, *end_elem;
2803 uint32_t start_collseq;
2804 uint32_t end_collseq;
2806 /* Equivalence Classes and Character Classes can't be a range
2808 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2809 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2813 start_collseq = lookup_collation_sequence_value (start_elem);
2814 end_collseq = lookup_collation_sequence_value (end_elem);
2815 /* Check start/end collation sequence values. */
2816 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2817 return REG_ECOLLATE;
2818 if (BE ((syntax & REG_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2821 /* Got valid collation sequence values, add them as a new entry.
2822 However, if we have no collation elements, and the character set
2823 is single byte, the single byte character set that we
2824 build below suffices. */
2825 if (nrules > 0 || dfa->mb_cur_max > 1)
2827 /* Check the space of the arrays. */
2828 if (BE (*range_alloc == mbcset->nranges, 0))
2830 /* There is not enough space, need realloc. */
2831 uint32_t *new_array_start;
2832 uint32_t *new_array_end;
2835 /* +1 in case of mbcset->nranges is 0. */
2836 new_nranges = 2 * mbcset->nranges + 1;
2837 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2839 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2842 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2845 mbcset->range_starts = new_array_start;
2846 mbcset->range_ends = new_array_end;
2847 *range_alloc = new_nranges;
2850 mbcset->range_starts[mbcset->nranges] = start_collseq;
2851 mbcset->range_ends[mbcset->nranges++] = end_collseq;
2854 /* Build the table for single byte characters. */
2855 for (ch = 0; ch < SBC_MAX; ch++)
2857 uint32_t ch_collseq;
2859 if (MB_CUR_MAX == 1)
2862 ch_collseq = collseqmb[ch];
2864 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2865 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2866 bitset_set (sbcset, ch);
2871 /* Local function for parse_bracket_exp used in _LIBC environement.
2872 Build the collating element which is represented by NAME.
2873 The result are written to MBCSET and SBCSET.
2874 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2875 pointer argument sinse we may update it. */
2877 auto inline reg_errcode_t
2878 __attribute ((always_inline))
2879 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2880 re_charset_t *mbcset;
2881 int *coll_sym_alloc;
2882 re_bitset_ptr_t sbcset;
2883 const unsigned char *name;
2886 size_t name_len = strlen ((const char *) name);
2889 elem = seek_collating_symbol_entry (name, name_len);
2890 if (symb_table[2 * elem] != 0)
2892 /* We found the entry. */
2893 idx = symb_table[2 * elem + 1];
2894 /* Skip the name of collating element name. */
2895 idx += 1 + extra[idx];
2897 else if (symb_table[2 * elem] == 0 && name_len == 1)
2899 /* No valid character, treat it as a normal
2901 bitset_set (sbcset, name[0]);
2905 return REG_ECOLLATE;
2907 /* Got valid collation sequence, add it as a new entry. */
2908 /* Check the space of the arrays. */
2909 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
2911 /* Not enough, realloc it. */
2912 /* +1 in case of mbcset->ncoll_syms is 0. */
2913 int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2914 /* Use realloc since mbcset->coll_syms is NULL
2916 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2917 new_coll_sym_alloc);
2918 if (BE (new_coll_syms == NULL, 0))
2920 mbcset->coll_syms = new_coll_syms;
2921 *coll_sym_alloc = new_coll_sym_alloc;
2923 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
2928 if (BE (name_len != 1, 0))
2929 return REG_ECOLLATE;
2932 bitset_set (sbcset, name[0]);
2939 re_token_t br_token;
2940 re_bitset_ptr_t sbcset;
2941 #ifdef RE_ENABLE_I18N
2942 re_charset_t *mbcset;
2943 int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
2944 int equiv_class_alloc = 0, char_class_alloc = 0;
2945 #endif /* not RE_ENABLE_I18N */
2947 bin_tree_t *work_tree;
2949 int first_round = 1;
2951 collseqmb = (const unsigned char *)
2952 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
2953 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
2959 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
2960 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
2961 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
2962 _NL_COLLATE_SYMB_TABLEMB);
2963 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
2964 _NL_COLLATE_SYMB_EXTRAMB);
2967 sbcset = re_calloc (unsigned int, BITSET_UINTS);
2968 #ifdef RE_ENABLE_I18N
2969 mbcset = re_calloc (re_charset_t, 1);
2970 #endif /* RE_ENABLE_I18N */
2971 #ifdef RE_ENABLE_I18N
2972 if (BE (sbcset == NULL || mbcset == NULL, 0))
2974 if (BE (sbcset == NULL, 0))
2975 #endif /* RE_ENABLE_I18N */
2981 token_len = peek_token_bracket (token, regexp, syntax);
2982 if (BE (token->type == END_OF_RE, 0))
2985 goto parse_bracket_exp_free_return;
2987 if (token->type == OP_NON_MATCH_LIST)
2989 #ifdef RE_ENABLE_I18N
2990 mbcset->non_match = 1;
2991 #endif /* not RE_ENABLE_I18N */
2993 if (syntax & REG_HAT_LISTS_NOT_NEWLINE)
2994 bitset_set (sbcset, '\0');
2995 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
2996 token_len = peek_token_bracket (token, regexp, syntax);
2997 if (BE (token->type == END_OF_RE, 0))
3000 goto parse_bracket_exp_free_return;
3004 /* We treat the first ']' as a normal character. */
3005 if (token->type == OP_CLOSE_BRACKET)
3006 token->type = CHARACTER;
3010 bracket_elem_t start_elem, end_elem;
3011 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3012 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3014 int token_len2 = 0, is_range_exp = 0;
3017 start_elem.opr.name = start_name_buf;
3018 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3019 syntax, first_round);
3020 if (BE (ret != REG_NOERROR, 0))
3023 goto parse_bracket_exp_free_return;
3027 /* Get information about the next token. We need it in any case. */
3028 token_len = peek_token_bracket (token, regexp, syntax);
3030 /* Do not check for ranges if we know they are not allowed. */
3031 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3033 if (BE (token->type == END_OF_RE, 0))
3036 goto parse_bracket_exp_free_return;
3038 if (token->type == OP_CHARSET_RANGE)
3040 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3041 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3042 if (BE (token2.type == END_OF_RE, 0))
3045 goto parse_bracket_exp_free_return;
3047 if (token2.type == OP_CLOSE_BRACKET)
3049 /* We treat the last '-' as a normal character. */
3050 re_string_skip_bytes (regexp, -token_len);
3051 token->type = CHARACTER;
3058 if (is_range_exp == 1)
3060 end_elem.opr.name = end_name_buf;
3061 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3063 if (BE (ret != REG_NOERROR, 0))
3066 goto parse_bracket_exp_free_return;
3069 token_len = peek_token_bracket (token, regexp, syntax);
3072 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3073 &start_elem, &end_elem);
3075 # ifdef RE_ENABLE_I18N
3076 *err = build_range_exp (sbcset,
3077 dfa->mb_cur_max > 1 ? mbcset : NULL,
3078 &range_alloc, &start_elem, &end_elem);
3080 *err = build_range_exp (sbcset, &start_elem, &end_elem);
3082 #endif /* RE_ENABLE_I18N */
3083 if (BE (*err != REG_NOERROR, 0))
3084 goto parse_bracket_exp_free_return;
3088 switch (start_elem.type)
3091 bitset_set (sbcset, start_elem.opr.ch);
3093 #ifdef RE_ENABLE_I18N
3095 /* Check whether the array has enough space. */
3096 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3098 wchar_t *new_mbchars;
3099 /* Not enough, realloc it. */
3100 /* +1 in case of mbcset->nmbchars is 0. */
3101 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3102 /* Use realloc since array is NULL if *alloc == 0. */
3103 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3105 if (BE (new_mbchars == NULL, 0))
3106 goto parse_bracket_exp_espace;
3107 mbcset->mbchars = new_mbchars;
3109 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3111 #endif /* RE_ENABLE_I18N */
3113 *err = build_equiv_class (sbcset,
3114 #ifdef RE_ENABLE_I18N
3115 mbcset, &equiv_class_alloc,
3116 #endif /* RE_ENABLE_I18N */
3117 start_elem.opr.name);
3118 if (BE (*err != REG_NOERROR, 0))
3119 goto parse_bracket_exp_free_return;
3122 *err = build_collating_symbol (sbcset,
3123 #ifdef RE_ENABLE_I18N
3124 mbcset, &coll_sym_alloc,
3125 #endif /* RE_ENABLE_I18N */
3126 start_elem.opr.name);
3127 if (BE (*err != REG_NOERROR, 0))
3128 goto parse_bracket_exp_free_return;
3131 *err = build_charclass (regexp->trans, sbcset,
3132 #ifdef RE_ENABLE_I18N
3133 mbcset, &char_class_alloc,
3134 #endif /* RE_ENABLE_I18N */
3135 start_elem.opr.name, syntax);
3136 if (BE (*err != REG_NOERROR, 0))
3137 goto parse_bracket_exp_free_return;
3144 if (BE (token->type == END_OF_RE, 0))
3147 goto parse_bracket_exp_free_return;
3149 if (token->type == OP_CLOSE_BRACKET)
3153 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3155 /* If it is non-matching list. */
3157 bitset_not (sbcset);
3159 #ifdef RE_ENABLE_I18N
3160 /* Ensure only single byte characters are set. */
3161 if (dfa->mb_cur_max > 1)
3162 bitset_mask (sbcset, dfa->sb_char);
3164 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3165 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3166 || mbcset->non_match)))
3168 bin_tree_t *mbc_tree;
3170 /* Build a tree for complex bracket. */
3171 dfa->has_mb_node = 1;
3172 br_token.type = COMPLEX_BRACKET;
3173 br_token.opr.mbcset = mbcset;
3174 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3175 if (BE (mbc_tree == NULL, 0))
3176 goto parse_bracket_exp_espace;
3177 for (sbc_idx = 0; sbc_idx < BITSET_UINTS; ++sbc_idx)
3178 if (sbcset[sbc_idx])
3180 /* If there are no bits set in sbcset, there is no point
3181 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3182 if (sbc_idx < BITSET_UINTS)
3184 /* Build a tree for simple bracket. */
3185 br_token.type = SIMPLE_BRACKET;
3186 br_token.opr.sbcset = sbcset;
3187 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3188 if (BE (work_tree == NULL, 0))
3189 goto parse_bracket_exp_espace;
3191 /* Then join them by ALT node. */
3192 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3193 if (BE (work_tree == NULL, 0))
3194 goto parse_bracket_exp_espace;
3199 work_tree = mbc_tree;
3203 #endif /* not RE_ENABLE_I18N */
3205 #ifdef RE_ENABLE_I18N
3206 free_charset (mbcset);
3208 /* Build a tree for simple bracket. */
3209 br_token.type = SIMPLE_BRACKET;
3210 br_token.opr.sbcset = sbcset;
3211 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3212 if (BE (work_tree == NULL, 0))
3213 goto parse_bracket_exp_espace;
3217 parse_bracket_exp_espace:
3219 parse_bracket_exp_free_return:
3221 #ifdef RE_ENABLE_I18N
3222 free_charset (mbcset);
3223 #endif /* RE_ENABLE_I18N */
3227 /* Parse an element in the bracket expression. */
3229 static reg_errcode_t
3230 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3231 re_token_t *token, int token_len, re_dfa_t *dfa,
3232 reg_syntax_t syntax, int accept_hyphen)
3234 #ifdef RE_ENABLE_I18N
3236 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3237 if (cur_char_size > 1)
3239 elem->type = MB_CHAR;
3240 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3241 re_string_skip_bytes (regexp, cur_char_size);
3244 #endif /* RE_ENABLE_I18N */
3245 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3246 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3247 || token->type == OP_OPEN_EQUIV_CLASS)
3248 return parse_bracket_symbol (elem, regexp, token);
3249 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3251 /* A '-' must only appear as anything but a range indicator before
3252 the closing bracket. Everything else is an error. */
3254 (void) peek_token_bracket (&token2, regexp, syntax);
3255 if (token2.type != OP_CLOSE_BRACKET)
3256 /* The actual error value is not standardized since this whole
3257 case is undefined. But ERANGE makes good sense. */
3260 elem->type = SB_CHAR;
3261 elem->opr.ch = token->opr.c;
3265 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3266 such as [:<character_class>:], [.<collating_element>.], and
3267 [=<equivalent_class>=]. */
3269 static reg_errcode_t
3270 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3273 unsigned char ch, delim = token->opr.c;
3275 if (re_string_eoi(regexp))
3279 if (i >= BRACKET_NAME_BUF_SIZE)
3281 if (token->type == OP_OPEN_CHAR_CLASS)
3282 ch = re_string_fetch_byte_case (regexp);
3284 ch = re_string_fetch_byte (regexp);
3285 if (re_string_eoi(regexp))
3287 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3289 elem->opr.name[i] = ch;
3291 re_string_skip_bytes (regexp, 1);
3292 elem->opr.name[i] = '\0';
3293 switch (token->type)
3295 case OP_OPEN_COLL_ELEM:
3296 elem->type = COLL_SYM;
3298 case OP_OPEN_EQUIV_CLASS:
3299 elem->type = EQUIV_CLASS;
3301 case OP_OPEN_CHAR_CLASS:
3302 elem->type = CHAR_CLASS;
3310 /* Helper function for parse_bracket_exp.
3311 Build the equivalence class which is represented by NAME.
3312 The result are written to MBCSET and SBCSET.
3313 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3314 is a pointer argument sinse we may update it. */
3316 static reg_errcode_t
3317 build_equiv_class (re_bitset_ptr_t sbcset,
3318 #ifdef RE_ENABLE_I18N
3319 re_charset_t *mbcset, int *equiv_class_alloc,
3321 const unsigned char *name)
3324 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3327 const int32_t *table, *indirect;
3328 const unsigned char *weights, *extra, *cp;
3329 unsigned char char_buf[2];
3333 /* This #include defines a local function! */
3334 # include <locale/weight.h>
3335 /* Calculate the index for equivalence class. */
3337 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3338 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3339 _NL_COLLATE_WEIGHTMB);
3340 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3341 _NL_COLLATE_EXTRAMB);
3342 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3343 _NL_COLLATE_INDIRECTMB);
3344 idx1 = findidx (&cp);
3345 if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3346 /* This isn't a valid character. */
3347 return REG_ECOLLATE;
3349 /* Build single byte matcing table for this equivalence class. */
3350 char_buf[1] = (unsigned char) '\0';
3351 len = weights[idx1];
3352 for (ch = 0; ch < SBC_MAX; ++ch)
3356 idx2 = findidx (&cp);
3361 /* This isn't a valid character. */
3363 if (len == weights[idx2])
3366 while (cnt <= len &&
3367 weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3371 bitset_set (sbcset, ch);
3374 /* Check whether the array has enough space. */
3375 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3377 /* Not enough, realloc it. */
3378 /* +1 in case of mbcset->nequiv_classes is 0. */
3379 int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3380 /* Use realloc since the array is NULL if *alloc == 0. */
3381 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3383 new_equiv_class_alloc);
3384 if (BE (new_equiv_classes == NULL, 0))
3386 mbcset->equiv_classes = new_equiv_classes;
3387 *equiv_class_alloc = new_equiv_class_alloc;
3389 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3394 if (BE (strlen ((const char *) name) != 1, 0))
3395 return REG_ECOLLATE;
3396 bitset_set (sbcset, *name);
3401 /* Helper function for parse_bracket_exp.
3402 Build the character class which is represented by NAME.
3403 The result are written to MBCSET and SBCSET.
3404 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3405 is a pointer argument sinse we may update it. */
3407 static reg_errcode_t
3408 build_charclass (unsigned REG_TRANSLATE_TYPE trans, re_bitset_ptr_t sbcset,
3409 #ifdef RE_ENABLE_I18N
3410 re_charset_t *mbcset, int *char_class_alloc,
3412 const unsigned char *class_name, reg_syntax_t syntax)
3415 const char *name = (const char *) class_name;
3417 /* In case of REG_ICASE "upper" and "lower" match the both of
3418 upper and lower cases. */
3419 if ((syntax & REG_IGNORE_CASE)
3420 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3423 #ifdef RE_ENABLE_I18N
3424 /* Check the space of the arrays. */
3425 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3427 /* Not enough, realloc it. */
3428 /* +1 in case of mbcset->nchar_classes is 0. */
3429 int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3430 /* Use realloc since array is NULL if *alloc == 0. */
3431 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3432 new_char_class_alloc);
3433 if (BE (new_char_classes == NULL, 0))
3435 mbcset->char_classes = new_char_classes;
3436 *char_class_alloc = new_char_class_alloc;
3438 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3439 #endif /* RE_ENABLE_I18N */
3441 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3442 for (i = 0; i < SBC_MAX; ++i) \
3444 if (ctype_func (i)) \
3446 int ch = trans ? trans[i] : i; \
3447 bitset_set (sbcset, ch); \
3451 if (strcmp (name, "alnum") == 0)
3452 BUILD_CHARCLASS_LOOP (isalnum)
3453 else if (strcmp (name, "cntrl") == 0)
3454 BUILD_CHARCLASS_LOOP (iscntrl)
3455 else if (strcmp (name, "lower") == 0)
3456 BUILD_CHARCLASS_LOOP (islower)
3457 else if (strcmp (name, "space") == 0)
3458 BUILD_CHARCLASS_LOOP (isspace)
3459 else if (strcmp (name, "alpha") == 0)
3460 BUILD_CHARCLASS_LOOP (isalpha)
3461 else if (strcmp (name, "digit") == 0)
3462 BUILD_CHARCLASS_LOOP (isdigit)
3463 else if (strcmp (name, "print") == 0)
3464 BUILD_CHARCLASS_LOOP (isprint)
3465 else if (strcmp (name, "upper") == 0)
3466 BUILD_CHARCLASS_LOOP (isupper)
3467 else if (strcmp (name, "blank") == 0)
3468 BUILD_CHARCLASS_LOOP (isblank)
3469 else if (strcmp (name, "graph") == 0)
3470 BUILD_CHARCLASS_LOOP (isgraph)
3471 else if (strcmp (name, "punct") == 0)
3472 BUILD_CHARCLASS_LOOP (ispunct)
3473 else if (strcmp (name, "xdigit") == 0)
3474 BUILD_CHARCLASS_LOOP (isxdigit)
3482 build_charclass_op (re_dfa_t *dfa, unsigned REG_TRANSLATE_TYPE trans,
3483 const unsigned char *class_name,
3484 const unsigned char *extra,
3485 int non_match, reg_errcode_t *err)
3487 re_bitset_ptr_t sbcset;
3488 #ifdef RE_ENABLE_I18N
3489 re_charset_t *mbcset;
3491 #endif /* not RE_ENABLE_I18N */
3493 re_token_t br_token;
3496 sbcset = re_calloc (unsigned int, BITSET_UINTS);
3497 #ifdef RE_ENABLE_I18N
3498 mbcset = re_calloc (re_charset_t, 1);
3499 #endif /* RE_ENABLE_I18N */
3501 #ifdef RE_ENABLE_I18N
3502 if (BE (sbcset == NULL || mbcset == NULL, 0))
3503 #else /* not RE_ENABLE_I18N */
3504 if (BE (sbcset == NULL, 0))
3505 #endif /* not RE_ENABLE_I18N */
3513 #ifdef RE_ENABLE_I18N
3515 if (syntax & REG_HAT_LISTS_NOT_NEWLINE)
3516 bitset_set(cset->sbcset, '\0');
3518 mbcset->non_match = 1;
3519 #endif /* not RE_ENABLE_I18N */
3522 /* We don't care the syntax in this case. */
3523 ret = build_charclass (trans, sbcset,
3524 #ifdef RE_ENABLE_I18N
3526 #endif /* RE_ENABLE_I18N */
3529 if (BE (ret != REG_NOERROR, 0))
3532 #ifdef RE_ENABLE_I18N
3533 free_charset (mbcset);
3534 #endif /* RE_ENABLE_I18N */
3538 /* \w match '_' also. */
3539 for (; *extra; extra++)
3540 bitset_set (sbcset, *extra);
3542 /* If it is non-matching list. */
3544 bitset_not (sbcset);
3546 #ifdef RE_ENABLE_I18N
3547 /* Ensure only single byte characters are set. */
3548 if (dfa->mb_cur_max > 1)
3549 bitset_mask (sbcset, dfa->sb_char);
3552 /* Build a tree for simple bracket. */
3553 br_token.type = SIMPLE_BRACKET;
3554 br_token.opr.sbcset = sbcset;
3555 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3556 if (BE (tree == NULL, 0))
3557 goto build_word_op_espace;
3559 #ifdef RE_ENABLE_I18N
3560 if (dfa->mb_cur_max > 1)
3562 bin_tree_t *mbc_tree;
3563 /* Build a tree for complex bracket. */
3564 br_token.type = COMPLEX_BRACKET;
3565 br_token.opr.mbcset = mbcset;
3566 dfa->has_mb_node = 1;
3567 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3568 if (BE (mbc_tree == NULL, 0))
3569 goto build_word_op_espace;
3570 /* Then join them by ALT node. */
3571 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3572 if (BE (mbc_tree != NULL, 1))
3577 free_charset (mbcset);
3580 #else /* not RE_ENABLE_I18N */
3582 #endif /* not RE_ENABLE_I18N */
3584 build_word_op_espace:
3586 #ifdef RE_ENABLE_I18N
3587 free_charset (mbcset);
3588 #endif /* RE_ENABLE_I18N */
3593 /* This is intended for the expressions like "a{1,3}".
3594 Fetch a number from `input', and return the number.
3595 Return -1, if the number field is empty like "{,1}".
3596 Return -2, If an error is occured. */
3599 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3605 fetch_token (token, input, syntax);
3607 if (BE (token->type == END_OF_RE, 0))
3609 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3611 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3612 ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3613 num = (num > REG_DUP_MAX) ? -2 : num;
3618 #ifdef RE_ENABLE_I18N
3620 free_charset (re_charset_t *cset)
3622 re_free (cset->mbchars);
3624 re_free (cset->coll_syms);
3625 re_free (cset->equiv_classes);
3626 re_free (cset->range_starts);
3627 re_free (cset->range_ends);
3629 re_free (cset->char_classes);
3632 #endif /* RE_ENABLE_I18N */
3634 /* Functions for binary tree operation. */
3636 /* Create a tree node. */
3639 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3640 re_token_type_t type)
3644 return create_token_tree (dfa, left, right, &t);
3648 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3649 const re_token_t *token)
3652 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3654 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3656 if (storage == NULL)
3658 storage->next = dfa->str_tree_storage;
3659 dfa->str_tree_storage = storage;
3660 dfa->str_tree_storage_idx = 0;
3662 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3664 tree->parent = NULL;
3666 tree->right = right;
3667 tree->token = *token;
3668 tree->token.duplicated = 0;
3669 tree->token.opt_subexp = 0;
3672 tree->node_idx = -1;
3675 left->parent = tree;
3677 right->parent = tree;
3681 /* Mark the tree SRC as an optional subexpression.
3682 To be called from preorder or postorder. */
3684 static reg_errcode_t
3685 mark_opt_subexp (void *extra, bin_tree_t *node)
3687 int idx = (int) (long) extra;
3688 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3689 node->token.opt_subexp = 1;
3694 /* Free the allocated memory inside NODE. */
3697 free_token (re_token_t *node)
3699 #ifdef RE_ENABLE_I18N
3700 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3701 free_charset (node->opr.mbcset);
3703 #endif /* RE_ENABLE_I18N */
3704 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3705 re_free (node->opr.sbcset);
3708 /* Worker function for tree walking. Free the allocated memory inside NODE
3709 and its children. */
3711 static reg_errcode_t
3712 free_tree (void *extra, bin_tree_t *node)
3714 free_token (&node->token);
3719 /* Duplicate the node SRC, and return new node. This is a preorder
3720 visit similar to the one implemented by the generic visitor, but
3721 we need more infrastructure to maintain two parallel trees --- so,
3722 it's easier to duplicate. */
3725 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3727 const bin_tree_t *node;
3728 bin_tree_t *dup_root;
3729 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3731 for (node = root; ; )
3733 /* Create a new tree and link it back to the current parent. */
3734 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3737 (*p_new)->parent = dup_node;
3738 (*p_new)->token.duplicated = 1;
3741 /* Go to the left node, or up and to the right. */
3745 p_new = &dup_node->left;
3749 const bin_tree_t *prev = NULL;
3750 while (node->right == prev || node->right == NULL)
3753 node = node->parent;
3754 dup_node = dup_node->parent;
3759 p_new = &dup_node->right;