1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License along
17 with this program; if not, write to the Free Software Foundation,
18 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
20 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
21 Idx length, reg_syntax_t syntax);
22 static void re_compile_fastmap_iter (regex_t *bufp,
23 const re_dfastate_t *init_state,
25 static reg_errcode_t init_dfa (re_dfa_t *dfa, Idx pat_len);
27 static void free_charset (re_charset_t *cset);
28 #endif /* RE_ENABLE_I18N */
29 static void free_workarea_compile (regex_t *preg);
30 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
32 static void optimize_utf8 (re_dfa_t *dfa);
34 static reg_errcode_t analyze (regex_t *preg);
35 static reg_errcode_t preorder (bin_tree_t *root,
36 reg_errcode_t (fn (void *, bin_tree_t *)),
38 static reg_errcode_t postorder (bin_tree_t *root,
39 reg_errcode_t (fn (void *, bin_tree_t *)),
41 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
42 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
43 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
45 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
46 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
47 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
48 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
49 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
50 unsigned int constraint);
51 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
52 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
54 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
55 static Idx fetch_number (re_string_t *input, re_token_t *token,
57 static int peek_token (re_token_t *token, re_string_t *input,
59 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
60 reg_syntax_t syntax, reg_errcode_t *err);
61 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
62 re_token_t *token, reg_syntax_t syntax,
63 Idx nest, reg_errcode_t *err);
64 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
65 re_token_t *token, reg_syntax_t syntax,
66 Idx nest, reg_errcode_t *err);
67 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
68 re_token_t *token, reg_syntax_t syntax,
69 Idx nest, reg_errcode_t *err);
70 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
71 re_token_t *token, reg_syntax_t syntax,
72 Idx nest, reg_errcode_t *err);
73 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
74 re_dfa_t *dfa, re_token_t *token,
75 reg_syntax_t syntax, reg_errcode_t *err);
76 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
77 re_token_t *token, reg_syntax_t syntax,
79 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
81 re_token_t *token, int token_len,
85 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
89 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
91 Idx *equiv_class_alloc,
92 const unsigned char *name);
93 static reg_errcode_t build_charclass (unsigned REG_TRANSLATE_TYPE trans,
94 re_bitset_ptr_t sbcset,
96 Idx *char_class_alloc,
97 const unsigned char *class_name,
99 #else /* not RE_ENABLE_I18N */
100 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
101 const unsigned char *name);
102 static reg_errcode_t build_charclass (unsigned REG_TRANSLATE_TYPE trans,
103 re_bitset_ptr_t sbcset,
104 const unsigned char *class_name,
105 reg_syntax_t syntax);
106 #endif /* not RE_ENABLE_I18N */
107 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
108 unsigned REG_TRANSLATE_TYPE trans,
109 const unsigned char *class_name,
110 const unsigned char *extra,
111 int non_match, reg_errcode_t *err);
112 static bin_tree_t *create_tree (re_dfa_t *dfa,
113 bin_tree_t *left, bin_tree_t *right,
114 re_token_type_t type);
115 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
116 bin_tree_t *left, bin_tree_t *right,
117 const re_token_t *token);
118 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
119 static void free_token (re_token_t *node);
120 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
121 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
123 /* This table gives an error message for each of the error codes listed
124 in regex.h. Obviously the order here has to be same as there.
125 POSIX doesn't require that we do anything for REG_NOERROR,
126 but why not be nice? */
128 const char __re_error_msgid[] attribute_hidden =
130 #define REG_NOERROR_IDX 0
131 gettext_noop ("Success") /* REG_NOERROR */
133 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
134 gettext_noop ("No match") /* REG_NOMATCH */
136 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
137 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
139 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
140 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
142 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
143 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
145 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
146 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
148 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
149 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
151 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
152 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
154 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
155 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
157 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
158 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
160 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
161 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
163 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
164 gettext_noop ("Invalid range end") /* REG_ERANGE */
166 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
167 gettext_noop ("Memory exhausted") /* REG_ESPACE */
169 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
170 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
172 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
173 gettext_noop ("Premature end of regular expression") /* REG_EEND */
175 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
176 gettext_noop ("Regular expression too big") /* REG_ESIZE */
178 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
179 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
182 const size_t __re_error_msgid_idx[] attribute_hidden =
203 /* Entry points for GNU code. */
205 /* re_compile_pattern is the GNU regular expression compiler: it
206 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
207 Returns 0 if the pattern was valid, otherwise an error string.
209 Assumes the `re_allocated' (and perhaps `re_buffer') and `translate' fields
210 are set in BUFP on entry. */
213 re_compile_pattern (const char *pattern, size_t length,
214 struct re_pattern_buffer *bufp)
218 /* And GNU code determines whether or not to get register information
219 by passing null for the REGS argument to re_match, etc., not by
220 setting re_no_sub, unless REG_NO_SUB is set. */
221 bufp->re_no_sub = !!(re_syntax_options & REG_NO_SUB);
223 /* Match anchors at newline. */
224 bufp->re_newline_anchor = 1;
226 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
230 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
233 weak_alias (__re_compile_pattern, re_compile_pattern)
236 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
237 also be assigned to arbitrarily: each pattern buffer stores its own
238 syntax, so it can be changed between regex compilations. */
239 /* This has no initializer because initialized variables in Emacs
240 become read-only after dumping. */
241 reg_syntax_t re_syntax_options;
244 /* Specify the precise syntax of regexps for compilation. This provides
245 for compatibility for various utilities which historically have
246 different, incompatible syntaxes.
248 The argument SYNTAX is a bit mask comprised of the various bits
249 defined in regex.h. We return the old syntax. */
252 re_set_syntax (reg_syntax_t syntax)
254 reg_syntax_t ret = re_syntax_options;
256 re_syntax_options = syntax;
260 weak_alias (__re_set_syntax, re_set_syntax)
264 re_compile_fastmap (struct re_pattern_buffer *bufp)
266 re_dfa_t *dfa = (re_dfa_t *) bufp->re_buffer;
267 char *fastmap = bufp->re_fastmap;
269 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
270 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
271 if (dfa->init_state != dfa->init_state_word)
272 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
273 if (dfa->init_state != dfa->init_state_nl)
274 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
275 if (dfa->init_state != dfa->init_state_begbuf)
276 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
277 bufp->re_fastmap_accurate = 1;
281 weak_alias (__re_compile_fastmap, re_compile_fastmap)
285 __attribute ((always_inline))
286 re_set_fastmap (char *fastmap, int icase, int ch)
290 fastmap[tolower (ch)] = 1;
293 /* Helper function for re_compile_fastmap.
294 Compile fastmap for the initial_state INIT_STATE. */
297 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
300 re_dfa_t *dfa = (re_dfa_t *) bufp->re_buffer;
302 int icase = (dfa->mb_cur_max == 1 && (bufp->re_syntax & REG_IGNORE_CASE));
303 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
305 Idx node = init_state->nodes.elems[node_cnt];
306 re_token_type_t type = dfa->nodes[node].type;
308 if (type == CHARACTER)
310 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
311 #ifdef RE_ENABLE_I18N
312 if ((bufp->re_syntax & REG_IGNORE_CASE) && dfa->mb_cur_max > 1)
314 unsigned char buf[MB_LEN_MAX];
320 *p++ = dfa->nodes[node].opr.c;
321 while (++node < dfa->nodes_len
322 && dfa->nodes[node].type == CHARACTER
323 && dfa->nodes[node].mb_partial)
324 *p++ = dfa->nodes[node].opr.c;
325 memset (&state, 0, sizeof (state));
326 if (mbrtowc (&wc, (const char *) buf, p - buf,
328 && (__wcrtomb ((char *) buf, towlower (wc), &state)
330 re_set_fastmap (fastmap, 0, buf[0]);
334 else if (type == SIMPLE_BRACKET)
337 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
338 for (j = 0; j < UINT_BITS; ++j, ++ch)
339 if (dfa->nodes[node].opr.sbcset[i] & (1u << j))
340 re_set_fastmap (fastmap, icase, ch);
342 #ifdef RE_ENABLE_I18N
343 else if (type == COMPLEX_BRACKET)
346 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
347 if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
348 || cset->nranges || cset->nchar_classes)
351 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
353 /* In this case we want to catch the bytes which are
354 the first byte of any collation elements.
355 e.g. In da_DK, we want to catch 'a' since "aa"
356 is a valid collation element, and don't catch
357 'b' since 'b' is the only collation element
358 which starts from 'b'. */
360 const int32_t *table = (const int32_t *)
361 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
362 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
363 for (j = 0; j < UINT_BITS; ++j, ++ch)
365 re_set_fastmap (fastmap, icase, ch);
368 if (dfa->mb_cur_max > 1)
369 for (i = 0; i < SBC_MAX; ++i)
370 if (__btowc (i) == WEOF)
371 re_set_fastmap (fastmap, icase, i);
372 # endif /* not _LIBC */
374 for (i = 0; i < cset->nmbchars; ++i)
378 memset (&state, '\0', sizeof (state));
379 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
380 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
381 if ((bufp->re_syntax & REG_IGNORE_CASE) && dfa->mb_cur_max > 1)
383 if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
385 re_set_fastmap (fastmap, 0, *(unsigned char *) buf);
389 #endif /* RE_ENABLE_I18N */
390 else if (type == OP_PERIOD
391 #ifdef RE_ENABLE_I18N
392 || type == OP_UTF8_PERIOD
393 #endif /* RE_ENABLE_I18N */
394 || type == END_OF_RE)
396 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
397 if (type == END_OF_RE)
398 bufp->re_can_be_null = 1;
404 /* Entry point for POSIX code. */
405 /* regcomp takes a regular expression as a string and compiles it.
407 PREG is a regex_t *. We do not expect any fields to be initialized,
408 since POSIX says we shouldn't. Thus, we set
410 `re_buffer' to the compiled pattern;
411 `re_used' to the length of the compiled pattern;
412 `re_syntax' to REG_SYNTAX_POSIX_EXTENDED if the
413 REG_EXTENDED bit in CFLAGS is set; otherwise, to
414 REG_SYNTAX_POSIX_BASIC;
415 `re_newline_anchor' to REG_NEWLINE being set in CFLAGS;
416 `re_fastmap' to an allocated space for the fastmap;
417 `re_fastmap_accurate' to zero;
418 `re_nsub' to the number of subexpressions in PATTERN.
420 PATTERN is the address of the pattern string.
422 CFLAGS is a series of bits which affect compilation.
424 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
425 use POSIX basic syntax.
427 If REG_NEWLINE is set, then . and [^...] don't match newline.
428 Also, regexec will try a match beginning after every newline.
430 If REG_ICASE is set, then we considers upper- and lowercase
431 versions of letters to be equivalent when matching.
433 If REG_NOSUB is set, then when PREG is passed to regexec, that
434 routine will report only success or failure, and nothing about the
437 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
438 the return codes and their meanings.) */
441 regcomp (regex_t *__restrict preg, const char *__restrict pattern, int cflags)
444 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? REG_SYNTAX_POSIX_EXTENDED
445 : REG_SYNTAX_POSIX_BASIC);
447 preg->re_buffer = NULL;
448 preg->re_allocated = 0;
451 /* Try to allocate space for the fastmap. */
452 preg->re_fastmap = re_malloc (char, SBC_MAX);
453 if (BE (preg->re_fastmap == NULL, 0))
456 syntax |= (cflags & REG_ICASE) ? REG_IGNORE_CASE : 0;
458 /* If REG_NEWLINE is set, newlines are treated differently. */
459 if (cflags & REG_NEWLINE)
460 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
461 syntax &= ~REG_DOT_NEWLINE;
462 syntax |= REG_HAT_LISTS_NOT_NEWLINE;
463 /* It also changes the matching behavior. */
464 preg->re_newline_anchor = 1;
467 preg->re_newline_anchor = 0;
468 preg->re_no_sub = !!(cflags & REG_NOSUB);
469 preg->re_translate = NULL;
471 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
473 /* POSIX doesn't distinguish between an unmatched open-group and an
474 unmatched close-group: both are REG_EPAREN. */
475 if (ret == REG_ERPAREN)
478 /* We have already checked preg->re_fastmap != NULL. */
479 if (BE (ret == REG_NOERROR, 1))
480 /* Compute the fastmap now, since regexec cannot modify the pattern
481 buffer. This function never fails in this implementation. */
482 (void) re_compile_fastmap (preg);
485 /* Some error occurred while compiling the expression. */
486 re_free (preg->re_fastmap);
487 preg->re_fastmap = NULL;
493 weak_alias (__regcomp, regcomp)
496 /* Returns a message corresponding to an error code, ERRCODE, returned
497 from either regcomp or regexec. We don't use PREG here. */
500 regerror (int errcode, const regex_t *__restrict preg,
501 char *__restrict errbuf, size_t errbuf_size)
507 || errcode >= (int) (sizeof (__re_error_msgid_idx)
508 / sizeof (__re_error_msgid_idx[0])), 0))
509 /* Only error codes returned by the rest of the code should be passed
510 to this routine. If we are given anything else, or if other regex
511 code generates an invalid error code, then the program has a bug.
512 Dump core so we can fix it. */
515 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
517 msg_size = strlen (msg) + 1; /* Includes the null. */
519 if (BE (errbuf_size != 0, 1))
521 if (BE (msg_size > errbuf_size, 0))
523 #if defined HAVE_MEMPCPY || defined _LIBC
524 *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
526 memcpy (errbuf, msg, errbuf_size - 1);
527 errbuf[errbuf_size - 1] = 0;
531 memcpy (errbuf, msg, msg_size);
537 weak_alias (__regerror, regerror)
541 #ifdef RE_ENABLE_I18N
542 /* This static array is used for the map to single-byte characters when
543 UTF-8 is used. Otherwise we would allocate memory just to initialize
544 it the same all the time. UTF-8 is the preferred encoding so this is
545 a worthwhile optimization. */
546 static const bitset utf8_sb_map =
548 /* Set the first 128 bits. */
549 # if UINT_MAX == 0xffffffff
550 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
552 # error "Add case for new unsigned int size"
559 free_dfa_content (re_dfa_t *dfa)
564 for (i = 0; i < dfa->nodes_len; ++i)
565 free_token (dfa->nodes + i);
566 re_free (dfa->nexts);
567 for (i = 0; i < dfa->nodes_len; ++i)
569 if (dfa->eclosures != NULL)
570 re_node_set_free (dfa->eclosures + i);
571 if (dfa->inveclosures != NULL)
572 re_node_set_free (dfa->inveclosures + i);
573 if (dfa->edests != NULL)
574 re_node_set_free (dfa->edests + i);
576 re_free (dfa->edests);
577 re_free (dfa->eclosures);
578 re_free (dfa->inveclosures);
579 re_free (dfa->nodes);
581 if (dfa->state_table)
582 for (i = 0; i <= dfa->state_hash_mask; ++i)
584 struct re_state_table_entry *entry = dfa->state_table + i;
585 for (j = 0; j < entry->num; ++j)
587 re_dfastate_t *state = entry->array[j];
590 re_free (entry->array);
592 re_free (dfa->state_table);
593 #ifdef RE_ENABLE_I18N
594 if (dfa->sb_char != utf8_sb_map)
595 re_free (dfa->sb_char);
597 re_free (dfa->subexp_map);
599 re_free (dfa->re_str);
606 /* Free dynamically allocated space used by PREG. */
609 regfree (regex_t *preg)
611 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
612 if (BE (dfa != NULL, 1))
613 free_dfa_content (dfa);
614 preg->re_buffer = NULL;
615 preg->re_allocated = 0;
617 re_free (preg->re_fastmap);
618 preg->re_fastmap = NULL;
620 re_free (preg->re_translate);
621 preg->re_translate = NULL;
624 weak_alias (__regfree, regfree)
627 /* Entry points compatible with 4.2 BSD regex library. We don't define
628 them unless specifically requested. */
630 #if defined _REGEX_RE_COMP || defined _LIBC
632 /* BSD has one and only one pattern buffer. */
633 static struct re_pattern_buffer re_comp_buf;
637 /* Make these definitions weak in libc, so POSIX programs can redefine
638 these names if they don't use our functions, and still use
639 regcomp/regexec above without link errors. */
642 re_comp (const char *s)
649 if (!re_comp_buf.re_buffer)
650 return gettext ("No previous regular expression");
654 if (re_comp_buf.re_buffer)
656 fastmap = re_comp_buf.re_fastmap;
657 re_comp_buf.re_fastmap = NULL;
658 __regfree (&re_comp_buf);
659 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
660 re_comp_buf.re_fastmap = fastmap;
663 if (re_comp_buf.re_fastmap == NULL)
665 re_comp_buf.re_fastmap = (char *) malloc (SBC_MAX);
666 if (re_comp_buf.re_fastmap == NULL)
667 return (char *) gettext (__re_error_msgid
668 + __re_error_msgid_idx[(int) REG_ESPACE]);
671 /* Since `re_exec' always passes NULL for the `regs' argument, we
672 don't need to initialize the pattern buffer fields which affect it. */
674 /* Match anchors at newlines. */
675 re_comp_buf.re_newline_anchor = 1;
677 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
682 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
683 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
687 libc_freeres_fn (free_mem)
689 __regfree (&re_comp_buf);
693 #endif /* _REGEX_RE_COMP */
695 /* Internal entry point.
696 Compile the regular expression PATTERN, whose length is LENGTH.
697 SYNTAX indicate regular expression's syntax. */
700 re_compile_internal (regex_t *preg, const char *pattern, Idx length,
703 reg_errcode_t err = REG_NOERROR;
707 /* Initialize the pattern buffer. */
708 preg->re_fastmap_accurate = 0;
709 preg->re_syntax = syntax;
710 preg->re_not_bol = preg->re_not_eol = 0;
713 preg->re_can_be_null = 0;
714 preg->re_regs_allocated = REG_UNALLOCATED;
716 /* Initialize the dfa. */
717 dfa = (re_dfa_t *) preg->re_buffer;
718 if (BE (preg->re_allocated < sizeof (re_dfa_t), 0))
720 /* If zero allocated, but buffer is non-null, try to realloc
721 enough space. This loses if buffer's address is bogus, but
722 that is the user's responsibility. If buffer is null this
723 is a simple allocation. */
724 dfa = re_realloc (preg->re_buffer, re_dfa_t, 1);
727 preg->re_allocated = sizeof (re_dfa_t);
728 preg->re_buffer = (unsigned char *) dfa;
730 preg->re_used = sizeof (re_dfa_t);
732 __libc_lock_init (dfa->lock);
734 err = init_dfa (dfa, length);
735 if (BE (err != REG_NOERROR, 0))
737 free_dfa_content (dfa);
738 preg->re_buffer = NULL;
739 preg->re_allocated = 0;
743 dfa->re_str = re_malloc (char, length + 1);
744 strncpy (dfa->re_str, pattern, length + 1);
747 err = re_string_construct (®exp, pattern, length, preg->re_translate,
748 syntax & REG_IGNORE_CASE, dfa);
749 if (BE (err != REG_NOERROR, 0))
751 re_compile_internal_free_return:
752 free_workarea_compile (preg);
753 re_string_destruct (®exp);
754 free_dfa_content (dfa);
755 preg->re_buffer = NULL;
756 preg->re_allocated = 0;
760 /* Parse the regular expression, and build a structure tree. */
762 dfa->str_tree = parse (®exp, preg, syntax, &err);
763 if (BE (dfa->str_tree == NULL, 0))
764 goto re_compile_internal_free_return;
766 /* Analyze the tree and create the nfa. */
767 err = analyze (preg);
768 if (BE (err != REG_NOERROR, 0))
769 goto re_compile_internal_free_return;
771 #ifdef RE_ENABLE_I18N
772 /* If possible, do searching in single byte encoding to speed things up. */
773 if (dfa->is_utf8 && !(syntax & REG_IGNORE_CASE) && preg->re_translate == NULL)
777 /* Then create the initial state of the dfa. */
778 err = create_initial_state (dfa);
780 /* Release work areas. */
781 free_workarea_compile (preg);
782 re_string_destruct (®exp);
784 if (BE (err != REG_NOERROR, 0))
786 free_dfa_content (dfa);
787 preg->re_buffer = NULL;
788 preg->re_allocated = 0;
794 /* Initialize DFA. We use the length of the regular expression PAT_LEN
795 as the initial length of some arrays. */
798 init_dfa (re_dfa_t *dfa, Idx pat_len)
800 __re_size_t table_size;
805 memset (dfa, '\0', sizeof (re_dfa_t));
807 /* Force allocation of str_tree_storage the first time. */
808 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
810 dfa->nodes_alloc = pat_len + 1;
811 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
813 /* table_size = 2 ^ ceil(log pat_len) */
814 for (table_size = 1; table_size <= pat_len; table_size <<= 1)
815 if (0 < (Idx) -1 && table_size == 0)
818 dfa->state_table = re_calloc (struct re_state_table_entry, table_size);
819 dfa->state_hash_mask = table_size - 1;
821 dfa->mb_cur_max = MB_CUR_MAX;
823 if (dfa->mb_cur_max == 6
824 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
826 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
829 # ifdef HAVE_LANGINFO_CODESET
830 codeset_name = nl_langinfo (CODESET);
832 codeset_name = getenv ("LC_ALL");
833 if (codeset_name == NULL || codeset_name[0] == '\0')
834 codeset_name = getenv ("LC_CTYPE");
835 if (codeset_name == NULL || codeset_name[0] == '\0')
836 codeset_name = getenv ("LANG");
837 if (codeset_name == NULL)
839 else if (strchr (codeset_name, '.') != NULL)
840 codeset_name = strchr (codeset_name, '.') + 1;
843 if (strcasecmp (codeset_name, "UTF-8") == 0
844 || strcasecmp (codeset_name, "UTF8") == 0)
847 /* We check exhaustively in the loop below if this charset is a
848 superset of ASCII. */
849 dfa->map_notascii = 0;
852 #ifdef RE_ENABLE_I18N
853 if (dfa->mb_cur_max > 1)
856 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
861 dfa->sb_char = re_calloc (unsigned int, BITSET_UINTS);
862 if (BE (dfa->sb_char == NULL, 0))
865 /* Clear all bits by, then set those corresponding to single
867 bitset_empty (dfa->sb_char);
869 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
870 for (j = 0; j < UINT_BITS; ++j, ++ch)
872 wint_t wch = __btowc (ch);
874 dfa->sb_char[i] |= 1u << j;
876 if (isascii (ch) && wch != ch)
877 dfa->map_notascii = 1;
884 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
889 /* Initialize WORD_CHAR table, which indicate which character is
890 "word". In this case "word" means that it is the word construction
891 character used by some operators like "\<", "\>", etc. */
894 init_word_char (re_dfa_t *dfa)
897 dfa->word_ops_used = 1;
898 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
899 for (j = 0; j < UINT_BITS; ++j, ++ch)
900 if (isalnum (ch) || ch == '_')
901 dfa->word_char[i] |= 1u << j;
904 /* Free the work area which are only used while compiling. */
907 free_workarea_compile (regex_t *preg)
909 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
910 bin_tree_storage_t *storage, *next;
911 for (storage = dfa->str_tree_storage; storage; storage = next)
913 next = storage->next;
916 dfa->str_tree_storage = NULL;
917 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
918 dfa->str_tree = NULL;
919 re_free (dfa->org_indices);
920 dfa->org_indices = NULL;
923 /* Create initial states for all contexts. */
926 create_initial_state (re_dfa_t *dfa)
930 re_node_set init_nodes;
932 /* Initial states have the epsilon closure of the node which is
933 the first node of the regular expression. */
934 first = dfa->str_tree->first->node_idx;
935 dfa->init_node = first;
936 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
937 if (BE (err != REG_NOERROR, 0))
940 /* The back-references which are in initial states can epsilon transit,
941 since in this case all of the subexpressions can be null.
942 Then we add epsilon closures of the nodes which are the next nodes of
943 the back-references. */
944 if (dfa->nbackref > 0)
945 for (i = 0; i < init_nodes.nelem; ++i)
947 Idx node_idx = init_nodes.elems[i];
948 re_token_type_t type = dfa->nodes[node_idx].type;
951 if (type != OP_BACK_REF)
953 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
955 re_token_t *clexp_node;
956 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
957 if (clexp_node->type == OP_CLOSE_SUBEXP
958 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
961 if (clexp_idx == init_nodes.nelem)
964 if (type == OP_BACK_REF)
966 Idx dest_idx = dfa->edests[node_idx].elems[0];
967 if (!re_node_set_contains (&init_nodes, dest_idx))
969 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
975 /* It must be the first time to invoke acquire_state. */
976 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
977 /* We don't check ERR here, since the initial state must not be NULL. */
978 if (BE (dfa->init_state == NULL, 0))
980 if (dfa->init_state->has_constraint)
982 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
984 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
986 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
990 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
991 || dfa->init_state_begbuf == NULL, 0))
995 dfa->init_state_word = dfa->init_state_nl
996 = dfa->init_state_begbuf = dfa->init_state;
998 re_node_set_free (&init_nodes);
1002 #ifdef RE_ENABLE_I18N
1003 /* If it is possible to do searching in single byte encoding instead of UTF-8
1004 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1005 DFA nodes where needed. */
1008 optimize_utf8 (re_dfa_t *dfa)
1011 int i, mb_chars = 0, has_period = 0;
1013 for (node = 0; node < dfa->nodes_len; ++node)
1014 switch (dfa->nodes[node].type)
1017 if (dfa->nodes[node].opr.c >= 0x80)
1021 switch (dfa->nodes[node].opr.idx)
1029 /* Word anchors etc. cannot be handled. */
1039 case OP_DUP_ASTERISK:
1040 case OP_OPEN_SUBEXP:
1041 case OP_CLOSE_SUBEXP:
1043 case COMPLEX_BRACKET:
1045 case SIMPLE_BRACKET:
1046 /* Just double check. */
1047 for (i = 0x80 / UINT_BITS; i < BITSET_UINTS; ++i)
1048 if (dfa->nodes[node].opr.sbcset[i])
1055 if (mb_chars || has_period)
1056 for (node = 0; node < dfa->nodes_len; ++node)
1058 if (dfa->nodes[node].type == CHARACTER
1059 && dfa->nodes[node].opr.c >= 0x80)
1060 dfa->nodes[node].mb_partial = 0;
1061 else if (dfa->nodes[node].type == OP_PERIOD)
1062 dfa->nodes[node].type = OP_UTF8_PERIOD;
1065 /* The search can be in single byte locale. */
1066 dfa->mb_cur_max = 1;
1068 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1072 /* Analyze the structure tree, and calculate "first", "next", "edest",
1073 "eclosure", and "inveclosure". */
1075 static reg_errcode_t
1076 analyze (regex_t *preg)
1078 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
1081 /* Allocate arrays. */
1082 dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1083 dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1084 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1085 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1086 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1087 || dfa->eclosures == NULL, 0))
1090 dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1091 if (dfa->subexp_map != NULL)
1094 for (i = 0; i < preg->re_nsub; i++)
1095 dfa->subexp_map[i] = i;
1096 preorder (dfa->str_tree, optimize_subexps, dfa);
1097 for (i = 0; i < preg->re_nsub; i++)
1098 if (dfa->subexp_map[i] != i)
1100 if (i == preg->re_nsub)
1102 free (dfa->subexp_map);
1103 dfa->subexp_map = NULL;
1107 ret = postorder (dfa->str_tree, lower_subexps, preg);
1108 if (BE (ret != REG_NOERROR, 0))
1110 ret = postorder (dfa->str_tree, calc_first, dfa);
1111 if (BE (ret != REG_NOERROR, 0))
1113 preorder (dfa->str_tree, calc_next, dfa);
1114 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1115 if (BE (ret != REG_NOERROR, 0))
1117 ret = calc_eclosure (dfa);
1118 if (BE (ret != REG_NOERROR, 0))
1121 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1122 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1123 if ((!preg->re_no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1126 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1127 if (BE (dfa->inveclosures == NULL, 0))
1129 ret = calc_inveclosure (dfa);
1135 /* Our parse trees are very unbalanced, so we cannot use a stack to
1136 implement parse tree visits. Instead, we use parent pointers and
1137 some hairy code in these two functions. */
1138 static reg_errcode_t
1139 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1142 bin_tree_t *node, *prev;
1144 for (node = root; ; )
1146 /* Descend down the tree, preferably to the left (or to the right
1147 if that's the only child). */
1148 while (node->left || node->right)
1156 reg_errcode_t err = fn (extra, node);
1157 if (BE (err != REG_NOERROR, 0))
1159 if (node->parent == NULL)
1162 node = node->parent;
1164 /* Go up while we have a node that is reached from the right. */
1165 while (node->right == prev || node->right == NULL);
1170 static reg_errcode_t
1171 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1176 for (node = root; ; )
1178 reg_errcode_t err = fn (extra, node);
1179 if (BE (err != REG_NOERROR, 0))
1182 /* Go to the left node, or up and to the right. */
1187 bin_tree_t *prev = NULL;
1188 while (node->right == prev || node->right == NULL)
1191 node = node->parent;
1200 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1201 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1202 backreferences as well. Requires a preorder visit. */
1203 static reg_errcode_t
1204 optimize_subexps (void *extra, bin_tree_t *node)
1206 re_dfa_t *dfa = (re_dfa_t *) extra;
1208 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1210 int idx = node->token.opr.idx;
1211 node->token.opr.idx = dfa->subexp_map[idx];
1212 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1215 else if (node->token.type == SUBEXP
1216 && node->left && node->left->token.type == SUBEXP)
1218 Idx other_idx = node->left->token.opr.idx;
1220 node->left = node->left->left;
1222 node->left->parent = node;
1224 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1225 if (other_idx < CHAR_BIT * sizeof dfa->used_bkref_map)
1226 dfa->used_bkref_map &= ~(1u << other_idx);
1232 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1233 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1234 static reg_errcode_t
1235 lower_subexps (void *extra, bin_tree_t *node)
1237 regex_t *preg = (regex_t *) extra;
1238 reg_errcode_t err = REG_NOERROR;
1240 if (node->left && node->left->token.type == SUBEXP)
1242 node->left = lower_subexp (&err, preg, node->left);
1244 node->left->parent = node;
1246 if (node->right && node->right->token.type == SUBEXP)
1248 node->right = lower_subexp (&err, preg, node->right);
1250 node->right->parent = node;
1257 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1259 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
1260 bin_tree_t *body = node->left;
1261 bin_tree_t *op, *cls, *tree1, *tree;
1264 /* We do not optimize empty subexpressions, because otherwise we may
1265 have bad CONCAT nodes with NULL children. This is obviously not
1266 very common, so we do not lose much. An example that triggers
1267 this case is the sed "script" /\(\)/x. */
1268 && node->left != NULL
1269 && (node->token.opr.idx >= CHAR_BIT * sizeof dfa->used_bkref_map
1270 || !(dfa->used_bkref_map & (1u << node->token.opr.idx))))
1273 /* Convert the SUBEXP node to the concatenation of an
1274 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1275 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1276 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1277 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1278 tree = create_tree (dfa, op, tree1, CONCAT);
1279 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1285 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1286 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1290 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1291 nodes. Requires a postorder visit. */
1292 static reg_errcode_t
1293 calc_first (void *extra, bin_tree_t *node)
1295 re_dfa_t *dfa = (re_dfa_t *) extra;
1296 if (node->token.type == CONCAT)
1298 node->first = node->left->first;
1299 node->node_idx = node->left->node_idx;
1304 node->node_idx = re_dfa_add_node (dfa, node->token);
1305 if (BE (node->node_idx == REG_MISSING, 0))
1311 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1312 static reg_errcode_t
1313 calc_next (void *extra, bin_tree_t *node)
1315 switch (node->token.type)
1317 case OP_DUP_ASTERISK:
1318 node->left->next = node;
1321 node->left->next = node->right->first;
1322 node->right->next = node->next;
1326 node->left->next = node->next;
1328 node->right->next = node->next;
1334 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1335 static reg_errcode_t
1336 link_nfa_nodes (void *extra, bin_tree_t *node)
1338 re_dfa_t *dfa = (re_dfa_t *) extra;
1339 Idx idx = node->node_idx;
1340 reg_errcode_t err = REG_NOERROR;
1342 switch (node->token.type)
1348 assert (node->next == NULL);
1351 case OP_DUP_ASTERISK:
1355 dfa->has_plural_match = 1;
1356 if (node->left != NULL)
1357 left = node->left->first->node_idx;
1359 left = node->next->node_idx;
1360 if (node->right != NULL)
1361 right = node->right->first->node_idx;
1363 right = node->next->node_idx;
1364 assert (REG_VALID_INDEX (left));
1365 assert (REG_VALID_INDEX (right));
1366 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1371 case OP_OPEN_SUBEXP:
1372 case OP_CLOSE_SUBEXP:
1373 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1377 dfa->nexts[idx] = node->next->node_idx;
1378 if (node->token.type == OP_BACK_REF)
1379 re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1383 assert (!IS_EPSILON_NODE (node->token.type));
1384 dfa->nexts[idx] = node->next->node_idx;
1391 /* Duplicate the epsilon closure of the node ROOT_NODE.
1392 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1393 to their own constraint. */
1395 static reg_errcode_t
1396 duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node,
1397 Idx top_clone_node, Idx root_node,
1398 unsigned int init_constraint)
1400 Idx org_node, clone_node;
1402 unsigned int constraint = init_constraint;
1403 for (org_node = top_org_node, clone_node = top_clone_node;;)
1405 Idx org_dest, clone_dest;
1406 if (dfa->nodes[org_node].type == OP_BACK_REF)
1408 /* If the back reference epsilon-transit, its destination must
1409 also have the constraint. Then duplicate the epsilon closure
1410 of the destination of the back reference, and store it in
1411 edests of the back reference. */
1412 org_dest = dfa->nexts[org_node];
1413 re_node_set_empty (dfa->edests + clone_node);
1414 clone_dest = duplicate_node (dfa, org_dest, constraint);
1415 if (BE (clone_dest == REG_MISSING, 0))
1417 dfa->nexts[clone_node] = dfa->nexts[org_node];
1418 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1419 if (BE (ret < 0, 0))
1422 else if (dfa->edests[org_node].nelem == 0)
1424 /* In case of the node can't epsilon-transit, don't duplicate the
1425 destination and store the original destination as the
1426 destination of the node. */
1427 dfa->nexts[clone_node] = dfa->nexts[org_node];
1430 else if (dfa->edests[org_node].nelem == 1)
1432 /* In case of the node can epsilon-transit, and it has only one
1434 org_dest = dfa->edests[org_node].elems[0];
1435 re_node_set_empty (dfa->edests + clone_node);
1436 if (dfa->nodes[org_node].type == ANCHOR)
1438 /* In case of the node has another constraint, append it. */
1439 if (org_node == root_node && clone_node != org_node)
1441 /* ...but if the node is root_node itself, it means the
1442 epsilon closure have a loop, then tie it to the
1443 destination of the root_node. */
1444 ret = re_node_set_insert (dfa->edests + clone_node,
1446 if (BE (ret < 0, 0))
1450 constraint |= dfa->nodes[org_node].opr.ctx_type;
1452 clone_dest = duplicate_node (dfa, org_dest, constraint);
1453 if (BE (clone_dest == REG_MISSING, 0))
1455 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1456 if (BE (ret < 0, 0))
1459 else /* dfa->edests[org_node].nelem == 2 */
1461 /* In case of the node can epsilon-transit, and it has two
1462 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1463 org_dest = dfa->edests[org_node].elems[0];
1464 re_node_set_empty (dfa->edests + clone_node);
1465 /* Search for a duplicated node which satisfies the constraint. */
1466 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1467 if (clone_dest == REG_MISSING)
1469 /* There are no such a duplicated node, create a new one. */
1471 clone_dest = duplicate_node (dfa, org_dest, constraint);
1472 if (BE (clone_dest == REG_MISSING, 0))
1474 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1475 if (BE (ret < 0, 0))
1477 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1478 root_node, constraint);
1479 if (BE (err != REG_NOERROR, 0))
1484 /* There are a duplicated node which satisfy the constraint,
1485 use it to avoid infinite loop. */
1486 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1487 if (BE (ret < 0, 0))
1491 org_dest = dfa->edests[org_node].elems[1];
1492 clone_dest = duplicate_node (dfa, org_dest, constraint);
1493 if (BE (clone_dest == REG_MISSING, 0))
1495 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1496 if (BE (ret < 0, 0))
1499 org_node = org_dest;
1500 clone_node = clone_dest;
1505 /* Search for a node which is duplicated from the node ORG_NODE, and
1506 satisfies the constraint CONSTRAINT. */
1509 search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1510 unsigned int constraint)
1513 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1515 if (org_node == dfa->org_indices[idx]
1516 && constraint == dfa->nodes[idx].constraint)
1517 return idx; /* Found. */
1519 return REG_MISSING; /* Not found. */
1522 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1523 Return the index of the new node, or REG_MISSING if insufficient storage is
1527 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1529 Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1530 if (BE (dup_idx != REG_MISSING, 1))
1532 dfa->nodes[dup_idx].constraint = constraint;
1533 if (dfa->nodes[org_idx].type == ANCHOR)
1534 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
1535 dfa->nodes[dup_idx].duplicated = 1;
1537 /* Store the index of the original node. */
1538 dfa->org_indices[dup_idx] = org_idx;
1543 static reg_errcode_t
1544 calc_inveclosure (re_dfa_t *dfa)
1548 for (idx = 0; idx < dfa->nodes_len; ++idx)
1549 re_node_set_init_empty (dfa->inveclosures + idx);
1551 for (src = 0; src < dfa->nodes_len; ++src)
1553 Idx *elems = dfa->eclosures[src].elems;
1554 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1556 ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1557 if (BE (ret == REG_MISSING, 0))
1565 /* Calculate "eclosure" for all the node in DFA. */
1567 static reg_errcode_t
1568 calc_eclosure (re_dfa_t *dfa)
1573 assert (dfa->nodes_len > 0);
1576 /* For each nodes, calculate epsilon closure. */
1577 for (node_idx = 0; ; ++node_idx)
1580 re_node_set eclosure_elem;
1581 if (node_idx == dfa->nodes_len)
1590 assert (dfa->eclosures[node_idx].nelem != REG_MISSING);
1593 /* If we have already calculated, skip it. */
1594 if (dfa->eclosures[node_idx].nelem != 0)
1596 /* Calculate epsilon closure of `node_idx'. */
1597 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1598 if (BE (err != REG_NOERROR, 0))
1601 if (dfa->eclosures[node_idx].nelem == 0)
1604 re_node_set_free (&eclosure_elem);
1610 /* Calculate epsilon closure of NODE. */
1612 static reg_errcode_t
1613 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, int root)
1616 unsigned int constraint;
1619 re_node_set eclosure;
1621 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1622 if (BE (err != REG_NOERROR, 0))
1625 /* This indicates that we are calculating this node now.
1626 We reference this value to avoid infinite loop. */
1627 dfa->eclosures[node].nelem = REG_MISSING;
1629 constraint = ((dfa->nodes[node].type == ANCHOR)
1630 ? dfa->nodes[node].opr.ctx_type : 0);
1631 /* If the current node has constraints, duplicate all nodes.
1632 Since they must inherit the constraints. */
1634 && dfa->edests[node].nelem
1635 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1637 Idx org_node, cur_node;
1638 org_node = cur_node = node;
1639 err = duplicate_node_closure (dfa, node, node, node, constraint);
1640 if (BE (err != REG_NOERROR, 0))
1644 /* Expand each epsilon destination nodes. */
1645 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1646 for (i = 0; i < dfa->edests[node].nelem; ++i)
1648 re_node_set eclosure_elem;
1649 Idx edest = dfa->edests[node].elems[i];
1650 /* If calculating the epsilon closure of `edest' is in progress,
1651 return intermediate result. */
1652 if (dfa->eclosures[edest].nelem == REG_MISSING)
1657 /* If we haven't calculated the epsilon closure of `edest' yet,
1658 calculate now. Otherwise use calculated epsilon closure. */
1659 if (dfa->eclosures[edest].nelem == 0)
1661 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1662 if (BE (err != REG_NOERROR, 0))
1666 eclosure_elem = dfa->eclosures[edest];
1667 /* Merge the epsilon closure of `edest'. */
1668 re_node_set_merge (&eclosure, &eclosure_elem);
1669 /* If the epsilon closure of `edest' is incomplete,
1670 the epsilon closure of this node is also incomplete. */
1671 if (dfa->eclosures[edest].nelem == 0)
1674 re_node_set_free (&eclosure_elem);
1678 /* Epsilon closures include itself. */
1679 re_node_set_insert (&eclosure, node);
1680 if (incomplete && !root)
1681 dfa->eclosures[node].nelem = 0;
1683 dfa->eclosures[node] = eclosure;
1684 *new_set = eclosure;
1688 /* Functions for token which are used in the parser. */
1690 /* Fetch a token from INPUT.
1691 We must not use this function inside bracket expressions. */
1694 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1696 re_string_skip_bytes (input, peek_token (result, input, syntax));
1699 /* Peek a token from INPUT, and return the length of the token.
1700 We must not use this function inside bracket expressions. */
1703 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1707 if (re_string_eoi (input))
1709 token->type = END_OF_RE;
1713 c = re_string_peek_byte (input, 0);
1716 token->word_char = 0;
1717 #ifdef RE_ENABLE_I18N
1718 token->mb_partial = 0;
1719 if (input->mb_cur_max > 1 &&
1720 !re_string_first_byte (input, re_string_cur_idx (input)))
1722 token->type = CHARACTER;
1723 token->mb_partial = 1;
1730 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1732 token->type = BACK_SLASH;
1736 c2 = re_string_peek_byte_case (input, 1);
1738 token->type = CHARACTER;
1739 #ifdef RE_ENABLE_I18N
1740 if (input->mb_cur_max > 1)
1742 wint_t wc = re_string_wchar_at (input,
1743 re_string_cur_idx (input) + 1);
1744 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1748 token->word_char = IS_WORD_CHAR (c2) != 0;
1753 if (!(syntax & REG_LIMITED_OPS) && !(syntax & REG_NO_BK_VBAR))
1754 token->type = OP_ALT;
1756 case '1': case '2': case '3': case '4': case '5':
1757 case '6': case '7': case '8': case '9':
1758 if (!(syntax & REG_NO_BK_REFS))
1760 token->type = OP_BACK_REF;
1761 token->opr.idx = c2 - '1';
1765 if (!(syntax & REG_NO_GNU_OPS))
1767 token->type = ANCHOR;
1768 token->opr.ctx_type = WORD_FIRST;
1772 if (!(syntax & REG_NO_GNU_OPS))
1774 token->type = ANCHOR;
1775 token->opr.ctx_type = WORD_LAST;
1779 if (!(syntax & REG_NO_GNU_OPS))
1781 token->type = ANCHOR;
1782 token->opr.ctx_type = WORD_DELIM;
1786 if (!(syntax & REG_NO_GNU_OPS))
1788 token->type = ANCHOR;
1789 token->opr.ctx_type = NOT_WORD_DELIM;
1793 if (!(syntax & REG_NO_GNU_OPS))
1794 token->type = OP_WORD;
1797 if (!(syntax & REG_NO_GNU_OPS))
1798 token->type = OP_NOTWORD;
1801 if (!(syntax & REG_NO_GNU_OPS))
1802 token->type = OP_SPACE;
1805 if (!(syntax & REG_NO_GNU_OPS))
1806 token->type = OP_NOTSPACE;
1809 if (!(syntax & REG_NO_GNU_OPS))
1811 token->type = ANCHOR;
1812 token->opr.ctx_type = BUF_FIRST;
1816 if (!(syntax & REG_NO_GNU_OPS))
1818 token->type = ANCHOR;
1819 token->opr.ctx_type = BUF_LAST;
1823 if (!(syntax & REG_NO_BK_PARENS))
1824 token->type = OP_OPEN_SUBEXP;
1827 if (!(syntax & REG_NO_BK_PARENS))
1828 token->type = OP_CLOSE_SUBEXP;
1831 if (!(syntax & REG_LIMITED_OPS) && (syntax & REG_BK_PLUS_QM))
1832 token->type = OP_DUP_PLUS;
1835 if (!(syntax & REG_LIMITED_OPS) && (syntax & REG_BK_PLUS_QM))
1836 token->type = OP_DUP_QUESTION;
1839 if ((syntax & REG_INTERVALS) && (!(syntax & REG_NO_BK_BRACES)))
1840 token->type = OP_OPEN_DUP_NUM;
1843 if ((syntax & REG_INTERVALS) && (!(syntax & REG_NO_BK_BRACES)))
1844 token->type = OP_CLOSE_DUP_NUM;
1852 token->type = CHARACTER;
1853 #ifdef RE_ENABLE_I18N
1854 if (input->mb_cur_max > 1)
1856 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1857 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1861 token->word_char = IS_WORD_CHAR (token->opr.c);
1866 if (syntax & REG_NEWLINE_ALT)
1867 token->type = OP_ALT;
1870 if (!(syntax & REG_LIMITED_OPS) && (syntax & REG_NO_BK_VBAR))
1871 token->type = OP_ALT;
1874 token->type = OP_DUP_ASTERISK;
1877 if (!(syntax & REG_LIMITED_OPS) && !(syntax & REG_BK_PLUS_QM))
1878 token->type = OP_DUP_PLUS;
1881 if (!(syntax & REG_LIMITED_OPS) && !(syntax & REG_BK_PLUS_QM))
1882 token->type = OP_DUP_QUESTION;
1885 if ((syntax & REG_INTERVALS) && (syntax & REG_NO_BK_BRACES))
1886 token->type = OP_OPEN_DUP_NUM;
1889 if ((syntax & REG_INTERVALS) && (syntax & REG_NO_BK_BRACES))
1890 token->type = OP_CLOSE_DUP_NUM;
1893 if (syntax & REG_NO_BK_PARENS)
1894 token->type = OP_OPEN_SUBEXP;
1897 if (syntax & REG_NO_BK_PARENS)
1898 token->type = OP_CLOSE_SUBEXP;
1901 token->type = OP_OPEN_BRACKET;
1904 token->type = OP_PERIOD;
1907 if (!(syntax & (REG_CONTEXT_INDEP_ANCHORS | REG_CARET_ANCHORS_HERE)) &&
1908 re_string_cur_idx (input) != 0)
1910 char prev = re_string_peek_byte (input, -1);
1911 if (!(syntax & REG_NEWLINE_ALT) || prev != '\n')
1914 token->type = ANCHOR;
1915 token->opr.ctx_type = LINE_FIRST;
1918 if (!(syntax & REG_CONTEXT_INDEP_ANCHORS) &&
1919 re_string_cur_idx (input) + 1 != re_string_length (input))
1922 re_string_skip_bytes (input, 1);
1923 peek_token (&next, input, syntax);
1924 re_string_skip_bytes (input, -1);
1925 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1928 token->type = ANCHOR;
1929 token->opr.ctx_type = LINE_LAST;
1937 /* Peek a token from INPUT, and return the length of the token.
1938 We must not use this function out of bracket expressions. */
1941 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1944 if (re_string_eoi (input))
1946 token->type = END_OF_RE;
1949 c = re_string_peek_byte (input, 0);
1952 #ifdef RE_ENABLE_I18N
1953 if (input->mb_cur_max > 1 &&
1954 !re_string_first_byte (input, re_string_cur_idx (input)))
1956 token->type = CHARACTER;
1959 #endif /* RE_ENABLE_I18N */
1961 if (c == '\\' && (syntax & REG_BACKSLASH_ESCAPE_IN_LISTS)
1962 && re_string_cur_idx (input) + 1 < re_string_length (input))
1964 /* In this case, '\' escape a character. */
1966 re_string_skip_bytes (input, 1);
1967 c2 = re_string_peek_byte (input, 0);
1969 token->type = CHARACTER;
1972 if (c == '[') /* '[' is a special char in a bracket exps. */
1976 if (re_string_cur_idx (input) + 1 < re_string_length (input))
1977 c2 = re_string_peek_byte (input, 1);
1985 token->type = OP_OPEN_COLL_ELEM;
1988 token->type = OP_OPEN_EQUIV_CLASS;
1991 if (syntax & REG_CHAR_CLASSES)
1993 token->type = OP_OPEN_CHAR_CLASS;
1996 /* else fall through. */
1998 token->type = CHARACTER;
2008 token->type = OP_CHARSET_RANGE;
2011 token->type = OP_CLOSE_BRACKET;
2014 token->type = OP_NON_MATCH_LIST;
2017 token->type = CHARACTER;
2022 /* Functions for parser. */
2024 /* Entry point of the parser.
2025 Parse the regular expression REGEXP and return the structure tree.
2026 If an error is occured, ERR is set by error code, and return NULL.
2027 This function build the following tree, from regular expression <reg_exp>:
2033 CAT means concatenation.
2034 EOR means end of regular expression. */
2037 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2040 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
2041 bin_tree_t *tree, *eor, *root;
2042 re_token_t current_token;
2043 dfa->syntax = syntax;
2044 fetch_token (¤t_token, regexp, syntax | REG_CARET_ANCHORS_HERE);
2045 tree = parse_reg_exp (regexp, preg, ¤t_token, syntax, 0, err);
2046 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2048 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2050 root = create_tree (dfa, tree, eor, CONCAT);
2053 if (BE (eor == NULL || root == NULL, 0))
2061 /* This function build the following tree, from regular expression
2062 <branch1>|<branch2>:
2068 ALT means alternative, which represents the operator `|'. */
2071 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2072 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2074 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
2075 bin_tree_t *tree, *branch = NULL;
2076 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2077 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2080 while (token->type == OP_ALT)
2082 fetch_token (token, regexp, syntax | REG_CARET_ANCHORS_HERE);
2083 if (token->type != OP_ALT && token->type != END_OF_RE
2084 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2086 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2087 if (BE (*err != REG_NOERROR && branch == NULL, 0))
2092 tree = create_tree (dfa, tree, branch, OP_ALT);
2093 if (BE (tree == NULL, 0))
2102 /* This function build the following tree, from regular expression
2109 CAT means concatenation. */
2112 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2113 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2115 bin_tree_t *tree, *exp;
2116 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
2117 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2118 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2121 while (token->type != OP_ALT && token->type != END_OF_RE
2122 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2124 exp = parse_expression (regexp, preg, token, syntax, nest, err);
2125 if (BE (*err != REG_NOERROR && exp == NULL, 0))
2129 if (tree != NULL && exp != NULL)
2131 tree = create_tree (dfa, tree, exp, CONCAT);
2138 else if (tree == NULL)
2140 /* Otherwise exp == NULL, we don't need to create new tree. */
2145 /* This function build the following tree, from regular expression a*:
2152 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2153 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2155 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
2157 switch (token->type)
2160 tree = create_token_tree (dfa, NULL, NULL, token);
2161 if (BE (tree == NULL, 0))
2166 #ifdef RE_ENABLE_I18N
2167 if (dfa->mb_cur_max > 1)
2169 while (!re_string_eoi (regexp)
2170 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2172 bin_tree_t *mbc_remain;
2173 fetch_token (token, regexp, syntax);
2174 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2175 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2176 if (BE (mbc_remain == NULL || tree == NULL, 0))
2185 case OP_OPEN_SUBEXP:
2186 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2187 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2190 case OP_OPEN_BRACKET:
2191 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2192 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2196 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2201 dfa->used_bkref_map |= 1 << token->opr.idx;
2202 tree = create_token_tree (dfa, NULL, NULL, token);
2203 if (BE (tree == NULL, 0))
2209 dfa->has_mb_node = 1;
2211 case OP_OPEN_DUP_NUM:
2212 if (syntax & REG_CONTEXT_INVALID_DUP)
2218 case OP_DUP_ASTERISK:
2220 case OP_DUP_QUESTION:
2221 if (syntax & REG_CONTEXT_INVALID_OPS)
2226 else if (syntax & REG_CONTEXT_INDEP_OPS)
2228 fetch_token (token, regexp, syntax);
2229 return parse_expression (regexp, preg, token, syntax, nest, err);
2231 /* else fall through */
2232 case OP_CLOSE_SUBEXP:
2233 if ((token->type == OP_CLOSE_SUBEXP) &&
2234 !(syntax & REG_UNMATCHED_RIGHT_PAREN_ORD))
2239 /* else fall through */
2240 case OP_CLOSE_DUP_NUM:
2241 /* We treat it as a normal character. */
2243 /* Then we can these characters as normal characters. */
2244 token->type = CHARACTER;
2245 /* mb_partial and word_char bits should be initialized already
2247 tree = create_token_tree (dfa, NULL, NULL, token);
2248 if (BE (tree == NULL, 0))
2255 if ((token->opr.ctx_type
2256 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2257 && dfa->word_ops_used == 0)
2258 init_word_char (dfa);
2259 if (token->opr.ctx_type == WORD_DELIM
2260 || token->opr.ctx_type == NOT_WORD_DELIM)
2262 bin_tree_t *tree_first, *tree_last;
2263 if (token->opr.ctx_type == WORD_DELIM)
2265 token->opr.ctx_type = WORD_FIRST;
2266 tree_first = create_token_tree (dfa, NULL, NULL, token);
2267 token->opr.ctx_type = WORD_LAST;
2271 token->opr.ctx_type = INSIDE_WORD;
2272 tree_first = create_token_tree (dfa, NULL, NULL, token);
2273 token->opr.ctx_type = INSIDE_NOTWORD;
2275 tree_last = create_token_tree (dfa, NULL, NULL, token);
2276 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2277 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2285 tree = create_token_tree (dfa, NULL, NULL, token);
2286 if (BE (tree == NULL, 0))
2292 /* We must return here, since ANCHORs can't be followed
2293 by repetition operators.
2294 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2295 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2296 fetch_token (token, regexp, syntax);
2299 tree = create_token_tree (dfa, NULL, NULL, token);
2300 if (BE (tree == NULL, 0))
2305 if (dfa->mb_cur_max > 1)
2306 dfa->has_mb_node = 1;
2310 tree = build_charclass_op (dfa, regexp->trans,
2311 (const unsigned char *) "alnum",
2312 (const unsigned char *) "_",
2313 token->type == OP_NOTWORD, err);
2314 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2319 tree = build_charclass_op (dfa, regexp->trans,
2320 (const unsigned char *) "space",
2321 (const unsigned char *) "",
2322 token->type == OP_NOTSPACE, err);
2323 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2333 /* Must not happen? */
2339 fetch_token (token, regexp, syntax);
2341 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2342 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2344 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2345 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2347 /* In BRE consecutive duplications are not allowed. */
2348 if ((syntax & REG_CONTEXT_INVALID_DUP)
2349 && (token->type == OP_DUP_ASTERISK
2350 || token->type == OP_OPEN_DUP_NUM))
2360 /* This function build the following tree, from regular expression
2368 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2369 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2371 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
2374 cur_nsub = preg->re_nsub++;
2376 fetch_token (token, regexp, syntax | REG_CARET_ANCHORS_HERE);
2378 /* The subexpression may be a null string. */
2379 if (token->type == OP_CLOSE_SUBEXP)
2383 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2384 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2386 if (BE (*err != REG_NOERROR, 0))
2390 if (cur_nsub <= '9' - '1')
2391 dfa->completed_bkref_map |= 1 << cur_nsub;
2393 tree = create_tree (dfa, tree, NULL, SUBEXP);
2394 if (BE (tree == NULL, 0))
2399 tree->token.opr.idx = cur_nsub;
2403 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2406 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2407 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2409 bin_tree_t *tree = NULL, *old_tree = NULL;
2410 Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2411 re_token_t start_token = *token;
2413 if (token->type == OP_OPEN_DUP_NUM)
2416 start = fetch_number (regexp, token, syntax);
2417 if (start == REG_MISSING)
2419 if (token->type == CHARACTER && token->opr.c == ',')
2420 start = 0; /* We treat "{,m}" as "{0,m}". */
2423 *err = REG_BADBR; /* <re>{} is invalid. */
2427 if (BE (start != REG_ERROR, 1))
2429 /* We treat "{n}" as "{n,n}". */
2430 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2431 : ((token->type == CHARACTER && token->opr.c == ',')
2432 ? fetch_number (regexp, token, syntax) : REG_ERROR));
2434 if (BE (start == REG_ERROR || end == REG_ERROR, 0))
2436 /* Invalid sequence. */
2437 if (BE (!(syntax & REG_INVALID_INTERVAL_ORD), 0))
2439 if (token->type == END_OF_RE)
2447 /* If the syntax bit is set, rollback. */
2448 re_string_set_index (regexp, start_idx);
2449 *token = start_token;
2450 token->type = CHARACTER;
2451 /* mb_partial and word_char bits should be already initialized by
2456 if (BE (end != REG_MISSING && start > end, 0))
2458 /* First number greater than second. */
2465 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2466 end = (token->type == OP_DUP_QUESTION) ? 1 : REG_MISSING;
2469 fetch_token (token, regexp, syntax);
2471 if (BE (elem == NULL, 0))
2473 if (BE (start == 0 && end == 0, 0))
2475 postorder (elem, free_tree, NULL);
2479 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2480 if (BE (start > 0, 0))
2483 for (i = 2; i <= start; ++i)
2485 elem = duplicate_tree (elem, dfa);
2486 tree = create_tree (dfa, tree, elem, CONCAT);
2487 if (BE (elem == NULL || tree == NULL, 0))
2488 goto parse_dup_op_espace;
2494 /* Duplicate ELEM before it is marked optional. */
2495 elem = duplicate_tree (elem, dfa);
2501 if (elem->token.type == SUBEXP)
2502 postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2504 tree = create_tree (dfa, elem, NULL,
2505 (end == REG_MISSING ? OP_DUP_ASTERISK : OP_ALT));
2506 if (BE (tree == NULL, 0))
2507 goto parse_dup_op_espace;
2509 /* This loop is actually executed only when end != REG_MISSING,
2510 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2511 already created the start+1-th copy. */
2512 if ((Idx) -1 < 0 || end != REG_MISSING)
2513 for (i = start + 2; i <= end; ++i)
2515 elem = duplicate_tree (elem, dfa);
2516 tree = create_tree (dfa, tree, elem, CONCAT);
2517 if (BE (elem == NULL || tree == NULL, 0))
2518 goto parse_dup_op_espace;
2520 tree = create_tree (dfa, tree, NULL, OP_ALT);
2521 if (BE (tree == NULL, 0))
2522 goto parse_dup_op_espace;
2526 tree = create_tree (dfa, old_tree, tree, CONCAT);
2530 parse_dup_op_espace:
2535 /* Size of the names for collating symbol/equivalence_class/character_class.
2536 I'm not sure, but maybe enough. */
2537 #define BRACKET_NAME_BUF_SIZE 32
2540 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2541 Build the range expression which starts from START_ELEM, and ends
2542 at END_ELEM. The result are written to MBCSET and SBCSET.
2543 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2544 mbcset->range_ends, is a pointer argument sinse we may
2547 static reg_errcode_t
2548 build_range_exp (re_bitset_ptr_t sbcset,
2549 # ifdef RE_ENABLE_I18N
2550 re_charset_t *mbcset, Idx *range_alloc,
2552 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2554 unsigned int start_ch, end_ch;
2555 /* Equivalence Classes and Character Classes can't be a range start/end. */
2556 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2557 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2561 /* We can handle no multi character collating elements without libc
2563 if (BE ((start_elem->type == COLL_SYM
2564 && strlen ((char *) start_elem->opr.name) > 1)
2565 || (end_elem->type == COLL_SYM
2566 && strlen ((char *) end_elem->opr.name) > 1), 0))
2567 return REG_ECOLLATE;
2569 # ifdef RE_ENABLE_I18N
2572 wint_t start_wc, end_wc;
2573 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2575 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2576 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2578 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2579 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2581 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2582 ? __btowc (start_ch) : start_elem->opr.wch);
2583 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2584 ? __btowc (end_ch) : end_elem->opr.wch);
2585 if (start_wc == WEOF || end_wc == WEOF)
2586 return REG_ECOLLATE;
2587 cmp_buf[0] = start_wc;
2588 cmp_buf[4] = end_wc;
2589 if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2592 /* Got valid collation sequence values, add them as a new entry.
2593 However, for !_LIBC we have no collation elements: if the
2594 character set is single byte, the single byte character set
2595 that we build below suffices. parse_bracket_exp passes
2596 no MBCSET if dfa->mb_cur_max == 1. */
2599 /* Check the space of the arrays. */
2600 if (BE (*range_alloc == mbcset->nranges, 0))
2602 /* There is not enough space, need realloc. */
2603 wchar_t *new_array_start, *new_array_end;
2606 /* +1 in case of mbcset->nranges is 0. */
2607 new_nranges = 2 * mbcset->nranges + 1;
2608 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2609 are NULL if *range_alloc == 0. */
2610 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2612 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2615 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2618 mbcset->range_starts = new_array_start;
2619 mbcset->range_ends = new_array_end;
2620 *range_alloc = new_nranges;
2623 mbcset->range_starts[mbcset->nranges] = start_wc;
2624 mbcset->range_ends[mbcset->nranges++] = end_wc;
2627 /* Build the table for single byte characters. */
2628 for (wc = 0; wc < SBC_MAX; ++wc)
2631 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2632 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2633 bitset_set (sbcset, wc);
2636 # else /* not RE_ENABLE_I18N */
2639 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2640 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2642 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2643 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2645 if (start_ch > end_ch)
2647 /* Build the table for single byte characters. */
2648 for (ch = 0; ch < SBC_MAX; ++ch)
2649 if (start_ch <= ch && ch <= end_ch)
2650 bitset_set (sbcset, ch);
2652 # endif /* not RE_ENABLE_I18N */
2655 #endif /* not _LIBC */
2658 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2659 Build the collating element which is represented by NAME.
2660 The result are written to MBCSET and SBCSET.
2661 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2662 pointer argument since we may update it. */
2664 static reg_errcode_t
2665 build_collating_symbol (re_bitset_ptr_t sbcset,
2666 # ifdef RE_ENABLE_I18N
2667 re_charset_t *mbcset, Idx *coll_sym_alloc,
2669 const unsigned char *name)
2671 size_t name_len = strlen ((const char *) name);
2672 if (BE (name_len != 1, 0))
2673 return REG_ECOLLATE;
2676 bitset_set (sbcset, name[0]);
2680 #endif /* not _LIBC */
2682 /* This function parse bracket expression like "[abc]", "[a-c]",
2686 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2687 reg_syntax_t syntax, reg_errcode_t *err)
2690 const unsigned char *collseqmb;
2691 const char *collseqwc;
2694 const int32_t *symb_table;
2695 const unsigned char *extra;
2697 /* Local function for parse_bracket_exp used in _LIBC environement.
2698 Seek the collating symbol entry correspondings to NAME.
2699 Return the index of the symbol in the SYMB_TABLE. */
2702 __attribute ((always_inline))
2703 seek_collating_symbol_entry (const unsigned char *name, size_t name_len)
2705 int32_t hash = elem_hash ((const char *) name, name_len);
2706 int32_t elem = hash % table_size;
2707 int32_t second = hash % (table_size - 2);
2708 while (symb_table[2 * elem] != 0)
2710 /* First compare the hashing value. */
2711 if (symb_table[2 * elem] == hash
2712 /* Compare the length of the name. */
2713 && name_len == extra[symb_table[2 * elem + 1]]
2714 /* Compare the name. */
2715 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2718 /* Yep, this is the entry. */
2728 /* Local function for parse_bracket_exp used in _LIBC environement.
2729 Look up the collation sequence value of BR_ELEM.
2730 Return the value if succeeded, UINT_MAX otherwise. */
2732 auto inline unsigned int
2733 __attribute ((always_inline))
2734 lookup_collation_sequence_value (bracket_elem_t *br_elem)
2736 if (br_elem->type == SB_CHAR)
2739 if (MB_CUR_MAX == 1)
2742 return collseqmb[br_elem->opr.ch];
2745 wint_t wc = __btowc (br_elem->opr.ch);
2746 return __collseq_table_lookup (collseqwc, wc);
2749 else if (br_elem->type == MB_CHAR)
2751 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2753 else if (br_elem->type == COLL_SYM)
2755 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2759 elem = seek_collating_symbol_entry (br_elem->opr.name,
2761 if (symb_table[2 * elem] != 0)
2763 /* We found the entry. */
2764 idx = symb_table[2 * elem + 1];
2765 /* Skip the name of collating element name. */
2766 idx += 1 + extra[idx];
2767 /* Skip the byte sequence of the collating element. */
2768 idx += 1 + extra[idx];
2769 /* Adjust for the alignment. */
2770 idx = (idx + 3) & ~3;
2771 /* Skip the multibyte collation sequence value. */
2772 idx += sizeof (unsigned int);
2773 /* Skip the wide char sequence of the collating element. */
2774 idx += sizeof (unsigned int) *
2775 (1 + *(unsigned int *) (extra + idx));
2776 /* Return the collation sequence value. */
2777 return *(unsigned int *) (extra + idx);
2779 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2781 /* No valid character. Match it as a single byte
2783 return collseqmb[br_elem->opr.name[0]];
2786 else if (sym_name_len == 1)
2787 return collseqmb[br_elem->opr.name[0]];
2792 /* Local function for parse_bracket_exp used in _LIBC environement.
2793 Build the range expression which starts from START_ELEM, and ends
2794 at END_ELEM. The result are written to MBCSET and SBCSET.
2795 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2796 mbcset->range_ends, is a pointer argument sinse we may
2799 auto inline reg_errcode_t
2800 __attribute ((always_inline))
2801 build_range_exp (re_bitset_ptr_t sbcset, re_charset_t *mbcset,
2803 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2806 uint32_t start_collseq;
2807 uint32_t end_collseq;
2809 /* Equivalence Classes and Character Classes can't be a range
2811 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2812 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2816 start_collseq = lookup_collation_sequence_value (start_elem);
2817 end_collseq = lookup_collation_sequence_value (end_elem);
2818 /* Check start/end collation sequence values. */
2819 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2820 return REG_ECOLLATE;
2821 if (BE ((syntax & REG_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2824 /* Got valid collation sequence values, add them as a new entry.
2825 However, if we have no collation elements, and the character set
2826 is single byte, the single byte character set that we
2827 build below suffices. */
2828 if (nrules > 0 || dfa->mb_cur_max > 1)
2830 /* Check the space of the arrays. */
2831 if (BE (*range_alloc == mbcset->nranges, 0))
2833 /* There is not enough space, need realloc. */
2834 uint32_t *new_array_start;
2835 uint32_t *new_array_end;
2838 /* +1 in case of mbcset->nranges is 0. */
2839 new_nranges = 2 * mbcset->nranges + 1;
2840 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2842 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2845 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2848 mbcset->range_starts = new_array_start;
2849 mbcset->range_ends = new_array_end;
2850 *range_alloc = new_nranges;
2853 mbcset->range_starts[mbcset->nranges] = start_collseq;
2854 mbcset->range_ends[mbcset->nranges++] = end_collseq;
2857 /* Build the table for single byte characters. */
2858 for (ch = 0; ch < SBC_MAX; ch++)
2860 uint32_t ch_collseq;
2862 if (MB_CUR_MAX == 1)
2865 ch_collseq = collseqmb[ch];
2867 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2868 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2869 bitset_set (sbcset, ch);
2874 /* Local function for parse_bracket_exp used in _LIBC environement.
2875 Build the collating element which is represented by NAME.
2876 The result are written to MBCSET and SBCSET.
2877 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2878 pointer argument sinse we may update it. */
2880 auto inline reg_errcode_t
2881 __attribute ((always_inline))
2882 build_collating_symbol (re_bitset_ptr_t sbcset, re_charset_t *mbcset,
2883 Idx *coll_sym_alloc, const unsigned char *name)
2886 size_t name_len = strlen ((const char *) name);
2889 elem = seek_collating_symbol_entry (name, name_len);
2890 if (symb_table[2 * elem] != 0)
2892 /* We found the entry. */
2893 idx = symb_table[2 * elem + 1];
2894 /* Skip the name of collating element name. */
2895 idx += 1 + extra[idx];
2897 else if (symb_table[2 * elem] == 0 && name_len == 1)
2899 /* No valid character, treat it as a normal
2901 bitset_set (sbcset, name[0]);
2905 return REG_ECOLLATE;
2907 /* Got valid collation sequence, add it as a new entry. */
2908 /* Check the space of the arrays. */
2909 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
2911 /* Not enough, realloc it. */
2912 /* +1 in case of mbcset->ncoll_syms is 0. */
2913 Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2914 /* Use realloc since mbcset->coll_syms is NULL
2916 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2917 new_coll_sym_alloc);
2918 if (BE (new_coll_syms == NULL, 0))
2920 mbcset->coll_syms = new_coll_syms;
2921 *coll_sym_alloc = new_coll_sym_alloc;
2923 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
2928 if (BE (name_len != 1, 0))
2929 return REG_ECOLLATE;
2932 bitset_set (sbcset, name[0]);
2939 re_token_t br_token;
2940 re_bitset_ptr_t sbcset;
2941 #ifdef RE_ENABLE_I18N
2942 re_charset_t *mbcset;
2943 Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
2944 Idx equiv_class_alloc = 0, char_class_alloc = 0;
2945 #endif /* not RE_ENABLE_I18N */
2947 bin_tree_t *work_tree;
2949 int first_round = 1;
2951 collseqmb = (const unsigned char *)
2952 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
2953 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
2959 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
2960 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
2961 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
2962 _NL_COLLATE_SYMB_TABLEMB);
2963 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
2964 _NL_COLLATE_SYMB_EXTRAMB);
2967 sbcset = re_calloc (unsigned int, BITSET_UINTS);
2968 #ifdef RE_ENABLE_I18N
2969 mbcset = re_calloc (re_charset_t, 1);
2970 #endif /* RE_ENABLE_I18N */
2971 #ifdef RE_ENABLE_I18N
2972 if (BE (sbcset == NULL || mbcset == NULL, 0))
2974 if (BE (sbcset == NULL, 0))
2975 #endif /* RE_ENABLE_I18N */
2981 token_len = peek_token_bracket (token, regexp, syntax);
2982 if (BE (token->type == END_OF_RE, 0))
2985 goto parse_bracket_exp_free_return;
2987 if (token->type == OP_NON_MATCH_LIST)
2989 #ifdef RE_ENABLE_I18N
2990 mbcset->non_match = 1;
2991 #endif /* not RE_ENABLE_I18N */
2993 if (syntax & REG_HAT_LISTS_NOT_NEWLINE)
2994 bitset_set (sbcset, '\0');
2995 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
2996 token_len = peek_token_bracket (token, regexp, syntax);
2997 if (BE (token->type == END_OF_RE, 0))
3000 goto parse_bracket_exp_free_return;
3004 /* We treat the first ']' as a normal character. */
3005 if (token->type == OP_CLOSE_BRACKET)
3006 token->type = CHARACTER;
3010 bracket_elem_t start_elem, end_elem;
3011 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3012 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3014 int token_len2 = 0, is_range_exp = 0;
3017 start_elem.opr.name = start_name_buf;
3018 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3019 syntax, first_round);
3020 if (BE (ret != REG_NOERROR, 0))
3023 goto parse_bracket_exp_free_return;
3027 /* Get information about the next token. We need it in any case. */
3028 token_len = peek_token_bracket (token, regexp, syntax);
3030 /* Do not check for ranges if we know they are not allowed. */
3031 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3033 if (BE (token->type == END_OF_RE, 0))
3036 goto parse_bracket_exp_free_return;
3038 if (token->type == OP_CHARSET_RANGE)
3040 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3041 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3042 if (BE (token2.type == END_OF_RE, 0))
3045 goto parse_bracket_exp_free_return;
3047 if (token2.type == OP_CLOSE_BRACKET)
3049 /* We treat the last '-' as a normal character. */
3050 re_string_skip_bytes (regexp, -token_len);
3051 token->type = CHARACTER;
3058 if (is_range_exp == 1)
3060 end_elem.opr.name = end_name_buf;
3061 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3063 if (BE (ret != REG_NOERROR, 0))
3066 goto parse_bracket_exp_free_return;
3069 token_len = peek_token_bracket (token, regexp, syntax);
3072 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3073 &start_elem, &end_elem);
3075 # ifdef RE_ENABLE_I18N
3076 *err = build_range_exp (sbcset,
3077 dfa->mb_cur_max > 1 ? mbcset : NULL,
3078 &range_alloc, &start_elem, &end_elem);
3080 *err = build_range_exp (sbcset, &start_elem, &end_elem);
3082 #endif /* RE_ENABLE_I18N */
3083 if (BE (*err != REG_NOERROR, 0))
3084 goto parse_bracket_exp_free_return;
3088 switch (start_elem.type)
3091 bitset_set (sbcset, start_elem.opr.ch);
3093 #ifdef RE_ENABLE_I18N
3095 /* Check whether the array has enough space. */
3096 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3098 wchar_t *new_mbchars;
3099 /* Not enough, realloc it. */
3100 /* +1 in case of mbcset->nmbchars is 0. */
3101 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3102 /* Use realloc since array is NULL if *alloc == 0. */
3103 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3105 if (BE (new_mbchars == NULL, 0))
3106 goto parse_bracket_exp_espace;
3107 mbcset->mbchars = new_mbchars;
3109 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3111 #endif /* RE_ENABLE_I18N */
3113 *err = build_equiv_class (sbcset,
3114 #ifdef RE_ENABLE_I18N
3115 mbcset, &equiv_class_alloc,
3116 #endif /* RE_ENABLE_I18N */
3117 start_elem.opr.name);
3118 if (BE (*err != REG_NOERROR, 0))
3119 goto parse_bracket_exp_free_return;
3122 *err = build_collating_symbol (sbcset,
3123 #ifdef RE_ENABLE_I18N
3124 mbcset, &coll_sym_alloc,
3125 #endif /* RE_ENABLE_I18N */
3126 start_elem.opr.name);
3127 if (BE (*err != REG_NOERROR, 0))
3128 goto parse_bracket_exp_free_return;
3131 *err = build_charclass (regexp->trans, sbcset,
3132 #ifdef RE_ENABLE_I18N
3133 mbcset, &char_class_alloc,
3134 #endif /* RE_ENABLE_I18N */
3135 start_elem.opr.name, syntax);
3136 if (BE (*err != REG_NOERROR, 0))
3137 goto parse_bracket_exp_free_return;
3144 if (BE (token->type == END_OF_RE, 0))
3147 goto parse_bracket_exp_free_return;
3149 if (token->type == OP_CLOSE_BRACKET)
3153 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3155 /* If it is non-matching list. */
3157 bitset_not (sbcset);
3159 #ifdef RE_ENABLE_I18N
3160 /* Ensure only single byte characters are set. */
3161 if (dfa->mb_cur_max > 1)
3162 bitset_mask (sbcset, dfa->sb_char);
3164 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3165 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3166 || mbcset->non_match)))
3168 bin_tree_t *mbc_tree;
3170 /* Build a tree for complex bracket. */
3171 dfa->has_mb_node = 1;
3172 br_token.type = COMPLEX_BRACKET;
3173 br_token.opr.mbcset = mbcset;
3174 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3175 if (BE (mbc_tree == NULL, 0))
3176 goto parse_bracket_exp_espace;
3177 for (sbc_idx = 0; sbc_idx < BITSET_UINTS; ++sbc_idx)
3178 if (sbcset[sbc_idx])
3180 /* If there are no bits set in sbcset, there is no point
3181 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3182 if (sbc_idx < BITSET_UINTS)
3184 /* Build a tree for simple bracket. */
3185 br_token.type = SIMPLE_BRACKET;
3186 br_token.opr.sbcset = sbcset;
3187 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3188 if (BE (work_tree == NULL, 0))
3189 goto parse_bracket_exp_espace;
3191 /* Then join them by ALT node. */
3192 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3193 if (BE (work_tree == NULL, 0))
3194 goto parse_bracket_exp_espace;
3199 work_tree = mbc_tree;
3203 #endif /* not RE_ENABLE_I18N */
3205 #ifdef RE_ENABLE_I18N
3206 free_charset (mbcset);
3208 /* Build a tree for simple bracket. */
3209 br_token.type = SIMPLE_BRACKET;
3210 br_token.opr.sbcset = sbcset;
3211 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3212 if (BE (work_tree == NULL, 0))
3213 goto parse_bracket_exp_espace;
3217 parse_bracket_exp_espace:
3219 parse_bracket_exp_free_return:
3221 #ifdef RE_ENABLE_I18N
3222 free_charset (mbcset);
3223 #endif /* RE_ENABLE_I18N */
3227 /* Parse an element in the bracket expression. */
3229 static reg_errcode_t
3230 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3231 re_token_t *token, int token_len, re_dfa_t *dfa,
3232 reg_syntax_t syntax, int accept_hyphen)
3234 #ifdef RE_ENABLE_I18N
3236 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3237 if (cur_char_size > 1)
3239 elem->type = MB_CHAR;
3240 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3241 re_string_skip_bytes (regexp, cur_char_size);
3244 #endif /* RE_ENABLE_I18N */
3245 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3246 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3247 || token->type == OP_OPEN_EQUIV_CLASS)
3248 return parse_bracket_symbol (elem, regexp, token);
3249 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3251 /* A '-' must only appear as anything but a range indicator before
3252 the closing bracket. Everything else is an error. */
3254 (void) peek_token_bracket (&token2, regexp, syntax);
3255 if (token2.type != OP_CLOSE_BRACKET)
3256 /* The actual error value is not standardized since this whole
3257 case is undefined. But ERANGE makes good sense. */
3260 elem->type = SB_CHAR;
3261 elem->opr.ch = token->opr.c;
3265 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3266 such as [:<character_class>:], [.<collating_element>.], and
3267 [=<equivalent_class>=]. */
3269 static reg_errcode_t
3270 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3273 unsigned char ch, delim = token->opr.c;
3275 if (re_string_eoi(regexp))
3279 if (i >= BRACKET_NAME_BUF_SIZE)
3281 if (token->type == OP_OPEN_CHAR_CLASS)
3282 ch = re_string_fetch_byte_case (regexp);
3284 ch = re_string_fetch_byte (regexp);
3285 if (re_string_eoi(regexp))
3287 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3289 elem->opr.name[i] = ch;
3291 re_string_skip_bytes (regexp, 1);
3292 elem->opr.name[i] = '\0';
3293 switch (token->type)
3295 case OP_OPEN_COLL_ELEM:
3296 elem->type = COLL_SYM;
3298 case OP_OPEN_EQUIV_CLASS:
3299 elem->type = EQUIV_CLASS;
3301 case OP_OPEN_CHAR_CLASS:
3302 elem->type = CHAR_CLASS;
3310 /* Helper function for parse_bracket_exp.
3311 Build the equivalence class which is represented by NAME.
3312 The result are written to MBCSET and SBCSET.
3313 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3314 is a pointer argument sinse we may update it. */
3316 static reg_errcode_t
3317 build_equiv_class (re_bitset_ptr_t sbcset,
3318 #ifdef RE_ENABLE_I18N
3319 re_charset_t *mbcset, Idx *equiv_class_alloc,
3321 const unsigned char *name)
3324 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3327 const int32_t *table, *indirect;
3328 const unsigned char *weights, *extra, *cp;
3329 unsigned char char_buf[2];
3333 /* This #include defines a local function! */
3334 # include <locale/weight.h>
3335 /* Calculate the index for equivalence class. */
3337 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3338 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3339 _NL_COLLATE_WEIGHTMB);
3340 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3341 _NL_COLLATE_EXTRAMB);
3342 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3343 _NL_COLLATE_INDIRECTMB);
3344 idx1 = findidx (&cp);
3345 if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3346 /* This isn't a valid character. */
3347 return REG_ECOLLATE;
3349 /* Build single byte matcing table for this equivalence class. */
3350 char_buf[1] = (unsigned char) '\0';
3351 len = weights[idx1];
3352 for (ch = 0; ch < SBC_MAX; ++ch)
3356 idx2 = findidx (&cp);
3361 /* This isn't a valid character. */
3363 if (len == weights[idx2])
3366 while (cnt <= len &&
3367 weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3371 bitset_set (sbcset, ch);
3374 /* Check whether the array has enough space. */
3375 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3377 /* Not enough, realloc it. */
3378 /* +1 in case of mbcset->nequiv_classes is 0. */
3379 Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3380 /* Use realloc since the array is NULL if *alloc == 0. */
3381 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3383 new_equiv_class_alloc);
3384 if (BE (new_equiv_classes == NULL, 0))
3386 mbcset->equiv_classes = new_equiv_classes;
3387 *equiv_class_alloc = new_equiv_class_alloc;
3389 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3394 if (BE (strlen ((const char *) name) != 1, 0))
3395 return REG_ECOLLATE;
3396 bitset_set (sbcset, *name);
3401 /* Helper function for parse_bracket_exp.
3402 Build the character class which is represented by NAME.
3403 The result are written to MBCSET and SBCSET.
3404 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3405 is a pointer argument sinse we may update it. */
3407 static reg_errcode_t
3408 build_charclass (unsigned REG_TRANSLATE_TYPE trans, re_bitset_ptr_t sbcset,
3409 #ifdef RE_ENABLE_I18N
3410 re_charset_t *mbcset, Idx *char_class_alloc,
3412 const unsigned char *class_name, reg_syntax_t syntax)
3415 const char *name = (const char *) class_name;
3417 /* In case of REG_ICASE "upper" and "lower" match the both of
3418 upper and lower cases. */
3419 if ((syntax & REG_IGNORE_CASE)
3420 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3423 #ifdef RE_ENABLE_I18N
3424 /* Check the space of the arrays. */
3425 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3427 /* Not enough, realloc it. */
3428 /* +1 in case of mbcset->nchar_classes is 0. */
3429 Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3430 /* Use realloc since array is NULL if *alloc == 0. */
3431 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3432 new_char_class_alloc);
3433 if (BE (new_char_classes == NULL, 0))
3435 mbcset->char_classes = new_char_classes;
3436 *char_class_alloc = new_char_class_alloc;
3438 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3439 #endif /* RE_ENABLE_I18N */
3441 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3442 for (i = 0; i < SBC_MAX; ++i) \
3444 if (ctype_func (i)) \
3446 int ch = trans ? trans[i] : i; \
3447 bitset_set (sbcset, ch); \
3451 if (strcmp (name, "alnum") == 0)
3452 BUILD_CHARCLASS_LOOP (isalnum)
3453 else if (strcmp (name, "cntrl") == 0)
3454 BUILD_CHARCLASS_LOOP (iscntrl)
3455 else if (strcmp (name, "lower") == 0)
3456 BUILD_CHARCLASS_LOOP (islower)
3457 else if (strcmp (name, "space") == 0)
3458 BUILD_CHARCLASS_LOOP (isspace)
3459 else if (strcmp (name, "alpha") == 0)
3460 BUILD_CHARCLASS_LOOP (isalpha)
3461 else if (strcmp (name, "digit") == 0)
3462 BUILD_CHARCLASS_LOOP (isdigit)
3463 else if (strcmp (name, "print") == 0)
3464 BUILD_CHARCLASS_LOOP (isprint)
3465 else if (strcmp (name, "upper") == 0)
3466 BUILD_CHARCLASS_LOOP (isupper)
3467 else if (strcmp (name, "blank") == 0)
3468 BUILD_CHARCLASS_LOOP (isblank)
3469 else if (strcmp (name, "graph") == 0)
3470 BUILD_CHARCLASS_LOOP (isgraph)
3471 else if (strcmp (name, "punct") == 0)
3472 BUILD_CHARCLASS_LOOP (ispunct)
3473 else if (strcmp (name, "xdigit") == 0)
3474 BUILD_CHARCLASS_LOOP (isxdigit)
3482 build_charclass_op (re_dfa_t *dfa, unsigned REG_TRANSLATE_TYPE trans,
3483 const unsigned char *class_name,
3484 const unsigned char *extra,
3485 int non_match, reg_errcode_t *err)
3487 re_bitset_ptr_t sbcset;
3488 #ifdef RE_ENABLE_I18N
3489 re_charset_t *mbcset;
3491 #endif /* not RE_ENABLE_I18N */
3493 re_token_t br_token;
3496 sbcset = re_calloc (unsigned int, BITSET_UINTS);
3497 #ifdef RE_ENABLE_I18N
3498 mbcset = re_calloc (re_charset_t, 1);
3499 #endif /* RE_ENABLE_I18N */
3501 #ifdef RE_ENABLE_I18N
3502 if (BE (sbcset == NULL || mbcset == NULL, 0))
3503 #else /* not RE_ENABLE_I18N */
3504 if (BE (sbcset == NULL, 0))
3505 #endif /* not RE_ENABLE_I18N */
3513 #ifdef RE_ENABLE_I18N
3515 if (syntax & REG_HAT_LISTS_NOT_NEWLINE)
3516 bitset_set(cset->sbcset, '\0');
3518 mbcset->non_match = 1;
3519 #endif /* not RE_ENABLE_I18N */
3522 /* We don't care the syntax in this case. */
3523 ret = build_charclass (trans, sbcset,
3524 #ifdef RE_ENABLE_I18N
3526 #endif /* RE_ENABLE_I18N */
3529 if (BE (ret != REG_NOERROR, 0))
3532 #ifdef RE_ENABLE_I18N
3533 free_charset (mbcset);
3534 #endif /* RE_ENABLE_I18N */
3538 /* \w match '_' also. */
3539 for (; *extra; extra++)
3540 bitset_set (sbcset, *extra);
3542 /* If it is non-matching list. */
3544 bitset_not (sbcset);
3546 #ifdef RE_ENABLE_I18N
3547 /* Ensure only single byte characters are set. */
3548 if (dfa->mb_cur_max > 1)
3549 bitset_mask (sbcset, dfa->sb_char);
3552 /* Build a tree for simple bracket. */
3553 br_token.type = SIMPLE_BRACKET;
3554 br_token.opr.sbcset = sbcset;
3555 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3556 if (BE (tree == NULL, 0))
3557 goto build_word_op_espace;
3559 #ifdef RE_ENABLE_I18N
3560 if (dfa->mb_cur_max > 1)
3562 bin_tree_t *mbc_tree;
3563 /* Build a tree for complex bracket. */
3564 br_token.type = COMPLEX_BRACKET;
3565 br_token.opr.mbcset = mbcset;
3566 dfa->has_mb_node = 1;
3567 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3568 if (BE (mbc_tree == NULL, 0))
3569 goto build_word_op_espace;
3570 /* Then join them by ALT node. */
3571 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3572 if (BE (mbc_tree != NULL, 1))
3577 free_charset (mbcset);
3580 #else /* not RE_ENABLE_I18N */
3582 #endif /* not RE_ENABLE_I18N */
3584 build_word_op_espace:
3586 #ifdef RE_ENABLE_I18N
3587 free_charset (mbcset);
3588 #endif /* RE_ENABLE_I18N */
3593 /* This is intended for the expressions like "a{1,3}".
3594 Fetch a number from `input', and return the number.
3595 Return REG_MISSING if the number field is empty like "{,1}".
3596 Return REG_ERROR if an error occurred. */
3599 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3601 Idx num = REG_MISSING;
3605 fetch_token (token, input, syntax);
3607 if (BE (token->type == END_OF_RE, 0))
3609 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3611 num = ((token->type != CHARACTER || c < '0' || '9' < c
3612 || num == REG_ERROR)
3614 : ((num == REG_MISSING) ? c - '0' : num * 10 + c - '0'));
3615 num = (num > REG_DUP_MAX) ? REG_ERROR : num;
3620 #ifdef RE_ENABLE_I18N
3622 free_charset (re_charset_t *cset)
3624 re_free (cset->mbchars);
3626 re_free (cset->coll_syms);
3627 re_free (cset->equiv_classes);
3628 re_free (cset->range_starts);
3629 re_free (cset->range_ends);
3631 re_free (cset->char_classes);
3634 #endif /* RE_ENABLE_I18N */
3636 /* Functions for binary tree operation. */
3638 /* Create a tree node. */
3641 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3642 re_token_type_t type)
3646 return create_token_tree (dfa, left, right, &t);
3650 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3651 const re_token_t *token)
3654 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3656 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3658 if (storage == NULL)
3660 storage->next = dfa->str_tree_storage;
3661 dfa->str_tree_storage = storage;
3662 dfa->str_tree_storage_idx = 0;
3664 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3666 tree->parent = NULL;
3668 tree->right = right;
3669 tree->token = *token;
3670 tree->token.duplicated = 0;
3671 tree->token.opt_subexp = 0;
3674 tree->node_idx = REG_MISSING;
3677 left->parent = tree;
3679 right->parent = tree;
3683 /* Mark the tree SRC as an optional subexpression.
3684 To be called from preorder or postorder. */
3686 static reg_errcode_t
3687 mark_opt_subexp (void *extra, bin_tree_t *node)
3689 Idx idx = (Idx) (long) extra;
3690 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3691 node->token.opt_subexp = 1;
3696 /* Free the allocated memory inside NODE. */
3699 free_token (re_token_t *node)
3701 #ifdef RE_ENABLE_I18N
3702 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3703 free_charset (node->opr.mbcset);
3705 #endif /* RE_ENABLE_I18N */
3706 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3707 re_free (node->opr.sbcset);
3710 /* Worker function for tree walking. Free the allocated memory inside NODE
3711 and its children. */
3713 static reg_errcode_t
3714 free_tree (void *extra, bin_tree_t *node)
3716 free_token (&node->token);
3721 /* Duplicate the node SRC, and return new node. This is a preorder
3722 visit similar to the one implemented by the generic visitor, but
3723 we need more infrastructure to maintain two parallel trees --- so,
3724 it's easier to duplicate. */
3727 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3729 const bin_tree_t *node;
3730 bin_tree_t *dup_root;
3731 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3733 for (node = root; ; )
3735 /* Create a new tree and link it back to the current parent. */
3736 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3739 (*p_new)->parent = dup_node;
3740 (*p_new)->token.duplicated = 1;
3743 /* Go to the left node, or up and to the right. */
3747 p_new = &dup_node->left;
3751 const bin_tree_t *prev = NULL;
3752 while (node->right == prev || node->right == NULL)
3755 node = node->parent;
3756 dup_node = dup_node->parent;
3761 p_new = &dup_node->right;