1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010 Free
3 Software Foundation, Inc.
4 This file is part of the GNU C Library.
5 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
7 This program is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2, or (at your option)
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License along
18 with this program; if not, write to the Free Software Foundation,
19 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
21 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
22 size_t length, reg_syntax_t syntax);
23 static void re_compile_fastmap_iter (regex_t *bufp,
24 const re_dfastate_t *init_state,
26 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
28 static void free_charset (re_charset_t *cset);
29 #endif /* RE_ENABLE_I18N */
30 static void free_workarea_compile (regex_t *preg);
31 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
33 static void optimize_utf8 (re_dfa_t *dfa);
35 static reg_errcode_t analyze (regex_t *preg);
36 static reg_errcode_t preorder (bin_tree_t *root,
37 reg_errcode_t (fn (void *, bin_tree_t *)),
39 static reg_errcode_t postorder (bin_tree_t *root,
40 reg_errcode_t (fn (void *, bin_tree_t *)),
42 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
43 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
44 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
46 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
47 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
48 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
49 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
50 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
51 unsigned int constraint);
52 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
53 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
55 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
56 static Idx fetch_number (re_string_t *input, re_token_t *token,
58 static int peek_token (re_token_t *token, re_string_t *input,
59 reg_syntax_t syntax) internal_function;
60 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
61 reg_syntax_t syntax, reg_errcode_t *err);
62 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
63 re_token_t *token, reg_syntax_t syntax,
64 Idx nest, reg_errcode_t *err);
65 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
66 re_token_t *token, reg_syntax_t syntax,
67 Idx nest, reg_errcode_t *err);
68 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
69 re_token_t *token, reg_syntax_t syntax,
70 Idx nest, reg_errcode_t *err);
71 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
72 re_token_t *token, reg_syntax_t syntax,
73 Idx nest, reg_errcode_t *err);
74 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
75 re_dfa_t *dfa, re_token_t *token,
76 reg_syntax_t syntax, reg_errcode_t *err);
77 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
78 re_token_t *token, reg_syntax_t syntax,
80 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
82 re_token_t *token, int token_len,
86 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
90 static reg_errcode_t build_equiv_class (bitset_t sbcset,
92 Idx *equiv_class_alloc,
93 const unsigned char *name);
94 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
97 Idx *char_class_alloc,
98 const unsigned char *class_name,
100 #else /* not RE_ENABLE_I18N */
101 static reg_errcode_t build_equiv_class (bitset_t sbcset,
102 const unsigned char *name);
103 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
105 const unsigned char *class_name,
106 reg_syntax_t syntax);
107 #endif /* not RE_ENABLE_I18N */
108 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
109 RE_TRANSLATE_TYPE trans,
110 const unsigned char *class_name,
111 const unsigned char *extra,
112 bool non_match, reg_errcode_t *err);
113 static bin_tree_t *create_tree (re_dfa_t *dfa,
114 bin_tree_t *left, bin_tree_t *right,
115 re_token_type_t type);
116 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
117 bin_tree_t *left, bin_tree_t *right,
118 const re_token_t *token);
119 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
120 static void free_token (re_token_t *node);
121 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
122 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
124 /* This table gives an error message for each of the error codes listed
125 in regex.h. Obviously the order here has to be same as there.
126 POSIX doesn't require that we do anything for REG_NOERROR,
127 but why not be nice? */
129 static const char __re_error_msgid[] =
131 #define REG_NOERROR_IDX 0
132 gettext_noop ("Success") /* REG_NOERROR */
134 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
135 gettext_noop ("No match") /* REG_NOMATCH */
137 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
138 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
140 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
141 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
143 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
144 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
146 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
147 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
149 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
150 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
152 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
153 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
155 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
156 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
158 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
159 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
161 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
162 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
164 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
165 gettext_noop ("Invalid range end") /* REG_ERANGE */
167 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
168 gettext_noop ("Memory exhausted") /* REG_ESPACE */
170 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
171 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
173 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
174 gettext_noop ("Premature end of regular expression") /* REG_EEND */
176 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
177 gettext_noop ("Regular expression too big") /* REG_ESIZE */
179 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
180 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
183 static const size_t __re_error_msgid_idx[] =
204 /* Entry points for GNU code. */
206 /* re_compile_pattern is the GNU regular expression compiler: it
207 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
208 Returns 0 if the pattern was valid, otherwise an error string.
210 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
211 are set in BUFP on entry. */
215 re_compile_pattern (pattern, length, bufp)
218 struct re_pattern_buffer *bufp;
219 #else /* size_t might promote */
221 re_compile_pattern (const char *pattern, size_t length,
222 struct re_pattern_buffer *bufp)
227 /* And GNU code determines whether or not to get register information
228 by passing null for the REGS argument to re_match, etc., not by
229 setting no_sub, unless RE_NO_SUB is set. */
230 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
232 /* Match anchors at newline. */
233 bufp->newline_anchor = 1;
235 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
239 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
242 weak_alias (__re_compile_pattern, re_compile_pattern)
245 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
246 also be assigned to arbitrarily: each pattern buffer stores its own
247 syntax, so it can be changed between regex compilations. */
248 /* This has no initializer because initialized variables in Emacs
249 become read-only after dumping. */
250 reg_syntax_t re_syntax_options;
253 /* Specify the precise syntax of regexps for compilation. This provides
254 for compatibility for various utilities which historically have
255 different, incompatible syntaxes.
257 The argument SYNTAX is a bit mask comprised of the various bits
258 defined in regex.h. We return the old syntax. */
261 re_set_syntax (syntax)
264 reg_syntax_t ret = re_syntax_options;
266 re_syntax_options = syntax;
270 weak_alias (__re_set_syntax, re_set_syntax)
274 re_compile_fastmap (bufp)
275 struct re_pattern_buffer *bufp;
277 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
278 char *fastmap = bufp->fastmap;
280 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
281 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
282 if (dfa->init_state != dfa->init_state_word)
283 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
284 if (dfa->init_state != dfa->init_state_nl)
285 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
286 if (dfa->init_state != dfa->init_state_begbuf)
287 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
288 bufp->fastmap_accurate = 1;
292 weak_alias (__re_compile_fastmap, re_compile_fastmap)
296 __attribute ((always_inline))
297 re_set_fastmap (char *fastmap, bool icase, int ch)
301 fastmap[tolower (ch)] = 1;
304 /* Helper function for re_compile_fastmap.
305 Compile fastmap for the initial_state INIT_STATE. */
308 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
311 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
313 bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
314 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
316 Idx node = init_state->nodes.elems[node_cnt];
317 re_token_type_t type = dfa->nodes[node].type;
319 if (type == CHARACTER)
321 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
322 #ifdef RE_ENABLE_I18N
323 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
325 unsigned char buf[MB_LEN_MAX];
331 *p++ = dfa->nodes[node].opr.c;
332 while (++node < dfa->nodes_len
333 && dfa->nodes[node].type == CHARACTER
334 && dfa->nodes[node].mb_partial)
335 *p++ = dfa->nodes[node].opr.c;
336 memset (&state, '\0', sizeof (state));
337 if (__mbrtowc (&wc, (const char *) buf, p - buf,
339 && (__wcrtomb ((char *) buf, towlower (wc), &state)
341 re_set_fastmap (fastmap, false, buf[0]);
345 else if (type == SIMPLE_BRACKET)
348 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
351 bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
352 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
353 if (w & ((bitset_word_t) 1 << j))
354 re_set_fastmap (fastmap, icase, ch);
357 #ifdef RE_ENABLE_I18N
358 else if (type == COMPLEX_BRACKET)
360 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
364 /* See if we have to try all bytes which start multiple collation
366 e.g. In da_DK, we want to catch 'a' since "aa" is a valid
367 collation element, and don't catch 'b' since 'b' is
368 the only collation element which starts from 'b' (and
369 it is caught by SIMPLE_BRACKET). */
370 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
371 && (cset->ncoll_syms || cset->nranges))
373 const int32_t *table = (const int32_t *)
374 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
375 for (i = 0; i < SBC_MAX; ++i)
377 re_set_fastmap (fastmap, icase, i);
381 /* See if we have to start the match at all multibyte characters,
382 i.e. where we would not find an invalid sequence. This only
383 applies to multibyte character sets; for single byte character
384 sets, the SIMPLE_BRACKET again suffices. */
385 if (dfa->mb_cur_max > 1
386 && (cset->nchar_classes || cset->non_match || cset->nranges
388 || cset->nequiv_classes
396 memset (&mbs, 0, sizeof (mbs));
397 if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
398 re_set_fastmap (fastmap, false, (int) c);
405 /* ... Else catch all bytes which can start the mbchars. */
406 for (i = 0; i < cset->nmbchars; ++i)
410 memset (&state, '\0', sizeof (state));
411 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
412 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
413 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
415 if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
417 re_set_fastmap (fastmap, false, *(unsigned char *) buf);
422 #endif /* RE_ENABLE_I18N */
423 else if (type == OP_PERIOD
424 #ifdef RE_ENABLE_I18N
425 || type == OP_UTF8_PERIOD
426 #endif /* RE_ENABLE_I18N */
427 || type == END_OF_RE)
429 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
430 if (type == END_OF_RE)
431 bufp->can_be_null = 1;
437 /* Entry point for POSIX code. */
438 /* regcomp takes a regular expression as a string and compiles it.
440 PREG is a regex_t *. We do not expect any fields to be initialized,
441 since POSIX says we shouldn't. Thus, we set
443 `buffer' to the compiled pattern;
444 `used' to the length of the compiled pattern;
445 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
446 REG_EXTENDED bit in CFLAGS is set; otherwise, to
447 RE_SYNTAX_POSIX_BASIC;
448 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
449 `fastmap' to an allocated space for the fastmap;
450 `fastmap_accurate' to zero;
451 `re_nsub' to the number of subexpressions in PATTERN.
453 PATTERN is the address of the pattern string.
455 CFLAGS is a series of bits which affect compilation.
457 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
458 use POSIX basic syntax.
460 If REG_NEWLINE is set, then . and [^...] don't match newline.
461 Also, regexec will try a match beginning after every newline.
463 If REG_ICASE is set, then we considers upper- and lowercase
464 versions of letters to be equivalent when matching.
466 If REG_NOSUB is set, then when PREG is passed to regexec, that
467 routine will report only success or failure, and nothing about the
470 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
471 the return codes and their meanings.) */
474 regcomp (preg, pattern, cflags)
475 regex_t *_Restrict_ preg;
476 const char *_Restrict_ pattern;
480 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
481 : RE_SYNTAX_POSIX_BASIC);
487 /* Try to allocate space for the fastmap. */
488 preg->fastmap = re_malloc (char, SBC_MAX);
489 if (BE (preg->fastmap == NULL, 0))
492 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
494 /* If REG_NEWLINE is set, newlines are treated differently. */
495 if (cflags & REG_NEWLINE)
496 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
497 syntax &= ~RE_DOT_NEWLINE;
498 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
499 /* It also changes the matching behavior. */
500 preg->newline_anchor = 1;
503 preg->newline_anchor = 0;
504 preg->no_sub = !!(cflags & REG_NOSUB);
505 preg->translate = NULL;
507 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
509 /* POSIX doesn't distinguish between an unmatched open-group and an
510 unmatched close-group: both are REG_EPAREN. */
511 if (ret == REG_ERPAREN)
514 /* We have already checked preg->fastmap != NULL. */
515 if (BE (ret == REG_NOERROR, 1))
516 /* Compute the fastmap now, since regexec cannot modify the pattern
517 buffer. This function never fails in this implementation. */
518 (void) re_compile_fastmap (preg);
521 /* Some error occurred while compiling the expression. */
522 re_free (preg->fastmap);
523 preg->fastmap = NULL;
529 weak_alias (__regcomp, regcomp)
532 /* Returns a message corresponding to an error code, ERRCODE, returned
533 from either regcomp or regexec. We don't use PREG here. */
537 regerror (errcode, preg, errbuf, errbuf_size)
539 const regex_t *_Restrict_ preg;
540 char *_Restrict_ errbuf;
542 #else /* size_t might promote */
544 regerror (int errcode, const regex_t *_Restrict_ preg,
545 char *_Restrict_ errbuf, size_t errbuf_size)
552 || errcode >= (int) (sizeof (__re_error_msgid_idx)
553 / sizeof (__re_error_msgid_idx[0])), 0))
554 /* Only error codes returned by the rest of the code should be passed
555 to this routine. If we are given anything else, or if other regex
556 code generates an invalid error code, then the program has a bug.
557 Dump core so we can fix it. */
560 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
562 msg_size = strlen (msg) + 1; /* Includes the null. */
564 if (BE (errbuf_size != 0, 1))
566 size_t cpy_size = msg_size;
567 if (BE (msg_size > errbuf_size, 0))
569 cpy_size = errbuf_size - 1;
570 errbuf[cpy_size] = '\0';
572 memcpy (errbuf, msg, cpy_size);
578 weak_alias (__regerror, regerror)
582 #ifdef RE_ENABLE_I18N
583 /* This static array is used for the map to single-byte characters when
584 UTF-8 is used. Otherwise we would allocate memory just to initialize
585 it the same all the time. UTF-8 is the preferred encoding so this is
586 a worthwhile optimization. */
587 static const bitset_t utf8_sb_map =
589 /* Set the first 128 bits. */
590 # if 4 * BITSET_WORD_BITS < ASCII_CHARS
591 # error "bitset_word_t is narrower than 32 bits"
592 # elif 3 * BITSET_WORD_BITS < ASCII_CHARS
593 BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
594 # elif 2 * BITSET_WORD_BITS < ASCII_CHARS
595 BITSET_WORD_MAX, BITSET_WORD_MAX,
596 # elif 1 * BITSET_WORD_BITS < ASCII_CHARS
600 >> (SBC_MAX % BITSET_WORD_BITS == 0
602 : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
608 free_dfa_content (re_dfa_t *dfa)
613 for (i = 0; i < dfa->nodes_len; ++i)
614 free_token (dfa->nodes + i);
615 re_free (dfa->nexts);
616 for (i = 0; i < dfa->nodes_len; ++i)
618 if (dfa->eclosures != NULL)
619 re_node_set_free (dfa->eclosures + i);
620 if (dfa->inveclosures != NULL)
621 re_node_set_free (dfa->inveclosures + i);
622 if (dfa->edests != NULL)
623 re_node_set_free (dfa->edests + i);
625 re_free (dfa->edests);
626 re_free (dfa->eclosures);
627 re_free (dfa->inveclosures);
628 re_free (dfa->nodes);
630 if (dfa->state_table)
631 for (i = 0; i <= dfa->state_hash_mask; ++i)
633 struct re_state_table_entry *entry = dfa->state_table + i;
634 for (j = 0; j < entry->num; ++j)
636 re_dfastate_t *state = entry->array[j];
639 re_free (entry->array);
641 re_free (dfa->state_table);
642 #ifdef RE_ENABLE_I18N
643 if (dfa->sb_char != utf8_sb_map)
644 re_free (dfa->sb_char);
646 re_free (dfa->subexp_map);
648 re_free (dfa->re_str);
655 /* Free dynamically allocated space used by PREG. */
661 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
662 if (BE (dfa != NULL, 1))
663 free_dfa_content (dfa);
667 re_free (preg->fastmap);
668 preg->fastmap = NULL;
670 re_free (preg->translate);
671 preg->translate = NULL;
674 weak_alias (__regfree, regfree)
677 /* Entry points compatible with 4.2 BSD regex library. We don't define
678 them unless specifically requested. */
680 #if defined _REGEX_RE_COMP || defined _LIBC
682 /* BSD has one and only one pattern buffer. */
683 static struct re_pattern_buffer re_comp_buf;
687 /* Make these definitions weak in libc, so POSIX programs can redefine
688 these names if they don't use our functions, and still use
689 regcomp/regexec above without link errors. */
700 if (!re_comp_buf.buffer)
701 return gettext ("No previous regular expression");
705 if (re_comp_buf.buffer)
707 fastmap = re_comp_buf.fastmap;
708 re_comp_buf.fastmap = NULL;
709 __regfree (&re_comp_buf);
710 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
711 re_comp_buf.fastmap = fastmap;
714 if (re_comp_buf.fastmap == NULL)
716 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
717 if (re_comp_buf.fastmap == NULL)
718 return (char *) gettext (__re_error_msgid
719 + __re_error_msgid_idx[(int) REG_ESPACE]);
722 /* Since `re_exec' always passes NULL for the `regs' argument, we
723 don't need to initialize the pattern buffer fields which affect it. */
725 /* Match anchors at newlines. */
726 re_comp_buf.newline_anchor = 1;
728 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
733 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
734 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
738 libc_freeres_fn (free_mem)
740 __regfree (&re_comp_buf);
744 #endif /* _REGEX_RE_COMP */
746 /* Internal entry point.
747 Compile the regular expression PATTERN, whose length is LENGTH.
748 SYNTAX indicate regular expression's syntax. */
751 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
754 reg_errcode_t err = REG_NOERROR;
758 /* Initialize the pattern buffer. */
759 preg->fastmap_accurate = 0;
760 preg->syntax = syntax;
761 preg->not_bol = preg->not_eol = 0;
764 preg->can_be_null = 0;
765 preg->regs_allocated = REGS_UNALLOCATED;
767 /* Initialize the dfa. */
768 dfa = (re_dfa_t *) preg->buffer;
769 if (BE (preg->allocated < sizeof (re_dfa_t), 0))
771 /* If zero allocated, but buffer is non-null, try to realloc
772 enough space. This loses if buffer's address is bogus, but
773 that is the user's responsibility. If ->buffer is NULL this
774 is a simple allocation. */
775 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
778 preg->allocated = sizeof (re_dfa_t);
779 preg->buffer = (unsigned char *) dfa;
781 preg->used = sizeof (re_dfa_t);
783 err = init_dfa (dfa, length);
784 if (BE (err != REG_NOERROR, 0))
786 free_dfa_content (dfa);
792 /* Note: length+1 will not overflow since it is checked in init_dfa. */
793 dfa->re_str = re_malloc (char, length + 1);
794 strncpy (dfa->re_str, pattern, length + 1);
797 __libc_lock_init (dfa->lock);
799 err = re_string_construct (®exp, pattern, length, preg->translate,
800 (syntax & RE_ICASE) != 0, dfa);
801 if (BE (err != REG_NOERROR, 0))
803 re_compile_internal_free_return:
804 free_workarea_compile (preg);
805 re_string_destruct (®exp);
806 free_dfa_content (dfa);
812 /* Parse the regular expression, and build a structure tree. */
814 dfa->str_tree = parse (®exp, preg, syntax, &err);
815 if (BE (dfa->str_tree == NULL, 0))
816 goto re_compile_internal_free_return;
818 /* Analyze the tree and create the nfa. */
819 err = analyze (preg);
820 if (BE (err != REG_NOERROR, 0))
821 goto re_compile_internal_free_return;
823 #ifdef RE_ENABLE_I18N
824 /* If possible, do searching in single byte encoding to speed things up. */
825 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
829 /* Then create the initial state of the dfa. */
830 err = create_initial_state (dfa);
832 /* Release work areas. */
833 free_workarea_compile (preg);
834 re_string_destruct (®exp);
836 if (BE (err != REG_NOERROR, 0))
838 free_dfa_content (dfa);
846 /* Initialize DFA. We use the length of the regular expression PAT_LEN
847 as the initial length of some arrays. */
850 init_dfa (re_dfa_t *dfa, size_t pat_len)
852 __re_size_t table_size;
856 #ifdef RE_ENABLE_I18N
857 size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
859 size_t max_i18n_object_size = 0;
861 size_t max_object_size =
862 MAX (sizeof (struct re_state_table_entry),
863 MAX (sizeof (re_token_t),
864 MAX (sizeof (re_node_set),
865 MAX (sizeof (regmatch_t),
866 max_i18n_object_size))));
868 memset (dfa, '\0', sizeof (re_dfa_t));
870 /* Force allocation of str_tree_storage the first time. */
871 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
873 /* Avoid overflows. The extra "/ 2" is for the table_size doubling
874 calculation below, and for similar doubling calculations
875 elsewhere. And it's <= rather than <, because some of the
876 doubling calculations add 1 afterwards. */
877 if (BE (SIZE_MAX / max_object_size / 2 <= pat_len, 0))
880 dfa->nodes_alloc = pat_len + 1;
881 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
883 /* table_size = 2 ^ ceil(log pat_len) */
884 for (table_size = 1; ; table_size <<= 1)
885 if (table_size > pat_len)
888 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
889 dfa->state_hash_mask = table_size - 1;
891 dfa->mb_cur_max = MB_CUR_MAX;
893 if (dfa->mb_cur_max == 6
894 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
896 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
899 codeset_name = nl_langinfo (CODESET);
900 if (strcasecmp (codeset_name, "UTF-8") == 0
901 || strcasecmp (codeset_name, "UTF8") == 0)
904 /* We check exhaustively in the loop below if this charset is a
905 superset of ASCII. */
906 dfa->map_notascii = 0;
909 #ifdef RE_ENABLE_I18N
910 if (dfa->mb_cur_max > 1)
913 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
918 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
919 if (BE (dfa->sb_char == NULL, 0))
922 /* Set the bits corresponding to single byte chars. */
923 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
924 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
926 wint_t wch = __btowc (ch);
928 dfa->sb_char[i] |= (bitset_word_t) 1 << j;
930 if (isascii (ch) && wch != ch)
931 dfa->map_notascii = 1;
938 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
943 /* Initialize WORD_CHAR table, which indicate which character is
944 "word". In this case "word" means that it is the word construction
945 character used by some operators like "\<", "\>", etc. */
949 init_word_char (re_dfa_t *dfa)
952 dfa->word_ops_used = 1;
953 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
954 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
955 if (isalnum (ch) || ch == '_')
956 dfa->word_char[i] |= (bitset_word_t) 1 << j;
959 /* Free the work area which are only used while compiling. */
962 free_workarea_compile (regex_t *preg)
964 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
965 bin_tree_storage_t *storage, *next;
966 for (storage = dfa->str_tree_storage; storage; storage = next)
968 next = storage->next;
971 dfa->str_tree_storage = NULL;
972 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
973 dfa->str_tree = NULL;
974 re_free (dfa->org_indices);
975 dfa->org_indices = NULL;
978 /* Create initial states for all contexts. */
981 create_initial_state (re_dfa_t *dfa)
985 re_node_set init_nodes;
987 /* Initial states have the epsilon closure of the node which is
988 the first node of the regular expression. */
989 first = dfa->str_tree->first->node_idx;
990 dfa->init_node = first;
991 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
992 if (BE (err != REG_NOERROR, 0))
995 /* The back-references which are in initial states can epsilon transit,
996 since in this case all of the subexpressions can be null.
997 Then we add epsilon closures of the nodes which are the next nodes of
998 the back-references. */
999 if (dfa->nbackref > 0)
1000 for (i = 0; i < init_nodes.nelem; ++i)
1002 Idx node_idx = init_nodes.elems[i];
1003 re_token_type_t type = dfa->nodes[node_idx].type;
1006 if (type != OP_BACK_REF)
1008 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1010 re_token_t *clexp_node;
1011 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1012 if (clexp_node->type == OP_CLOSE_SUBEXP
1013 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1016 if (clexp_idx == init_nodes.nelem)
1019 if (type == OP_BACK_REF)
1021 Idx dest_idx = dfa->edests[node_idx].elems[0];
1022 if (!re_node_set_contains (&init_nodes, dest_idx))
1024 reg_errcode_t merge_err
1025 = re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1026 if (merge_err != REG_NOERROR)
1033 /* It must be the first time to invoke acquire_state. */
1034 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1035 /* We don't check ERR here, since the initial state must not be NULL. */
1036 if (BE (dfa->init_state == NULL, 0))
1038 if (dfa->init_state->has_constraint)
1040 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1042 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1044 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1048 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1049 || dfa->init_state_begbuf == NULL, 0))
1053 dfa->init_state_word = dfa->init_state_nl
1054 = dfa->init_state_begbuf = dfa->init_state;
1056 re_node_set_free (&init_nodes);
1060 #ifdef RE_ENABLE_I18N
1061 /* If it is possible to do searching in single byte encoding instead of UTF-8
1062 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1063 DFA nodes where needed. */
1066 optimize_utf8 (re_dfa_t *dfa)
1070 bool mb_chars = false;
1071 bool has_period = false;
1073 for (node = 0; node < dfa->nodes_len; ++node)
1074 switch (dfa->nodes[node].type)
1077 if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1081 switch (dfa->nodes[node].opr.ctx_type)
1089 /* Word anchors etc. cannot be handled. It's okay to test
1090 opr.ctx_type since constraints (for all DFA nodes) are
1091 created by ORing one or more opr.ctx_type values. */
1101 case OP_DUP_ASTERISK:
1102 case OP_OPEN_SUBEXP:
1103 case OP_CLOSE_SUBEXP:
1105 case COMPLEX_BRACKET:
1107 case SIMPLE_BRACKET:
1108 /* Just double check. */
1110 int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1112 : BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1113 for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1115 if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1125 if (mb_chars || has_period)
1126 for (node = 0; node < dfa->nodes_len; ++node)
1128 if (dfa->nodes[node].type == CHARACTER
1129 && dfa->nodes[node].opr.c >= ASCII_CHARS)
1130 dfa->nodes[node].mb_partial = 0;
1131 else if (dfa->nodes[node].type == OP_PERIOD)
1132 dfa->nodes[node].type = OP_UTF8_PERIOD;
1135 /* The search can be in single byte locale. */
1136 dfa->mb_cur_max = 1;
1138 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1142 /* Analyze the structure tree, and calculate "first", "next", "edest",
1143 "eclosure", and "inveclosure". */
1145 static reg_errcode_t
1146 analyze (regex_t *preg)
1148 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1151 /* Allocate arrays. */
1152 dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1153 dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1154 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1155 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1156 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1157 || dfa->eclosures == NULL, 0))
1160 dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1161 if (dfa->subexp_map != NULL)
1164 for (i = 0; i < preg->re_nsub; i++)
1165 dfa->subexp_map[i] = i;
1166 preorder (dfa->str_tree, optimize_subexps, dfa);
1167 for (i = 0; i < preg->re_nsub; i++)
1168 if (dfa->subexp_map[i] != i)
1170 if (i == preg->re_nsub)
1172 free (dfa->subexp_map);
1173 dfa->subexp_map = NULL;
1177 ret = postorder (dfa->str_tree, lower_subexps, preg);
1178 if (BE (ret != REG_NOERROR, 0))
1180 ret = postorder (dfa->str_tree, calc_first, dfa);
1181 if (BE (ret != REG_NOERROR, 0))
1183 preorder (dfa->str_tree, calc_next, dfa);
1184 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1185 if (BE (ret != REG_NOERROR, 0))
1187 ret = calc_eclosure (dfa);
1188 if (BE (ret != REG_NOERROR, 0))
1191 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1192 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1193 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1196 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1197 if (BE (dfa->inveclosures == NULL, 0))
1199 ret = calc_inveclosure (dfa);
1205 /* Our parse trees are very unbalanced, so we cannot use a stack to
1206 implement parse tree visits. Instead, we use parent pointers and
1207 some hairy code in these two functions. */
1208 static reg_errcode_t
1209 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1212 bin_tree_t *node, *prev;
1214 for (node = root; ; )
1216 /* Descend down the tree, preferably to the left (or to the right
1217 if that's the only child). */
1218 while (node->left || node->right)
1226 reg_errcode_t err = fn (extra, node);
1227 if (BE (err != REG_NOERROR, 0))
1229 if (node->parent == NULL)
1232 node = node->parent;
1234 /* Go up while we have a node that is reached from the right. */
1235 while (node->right == prev || node->right == NULL);
1240 static reg_errcode_t
1241 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1246 for (node = root; ; )
1248 reg_errcode_t err = fn (extra, node);
1249 if (BE (err != REG_NOERROR, 0))
1252 /* Go to the left node, or up and to the right. */
1257 bin_tree_t *prev = NULL;
1258 while (node->right == prev || node->right == NULL)
1261 node = node->parent;
1270 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1271 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1272 backreferences as well. Requires a preorder visit. */
1273 static reg_errcode_t
1274 optimize_subexps (void *extra, bin_tree_t *node)
1276 re_dfa_t *dfa = (re_dfa_t *) extra;
1278 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1280 int idx = node->token.opr.idx;
1281 node->token.opr.idx = dfa->subexp_map[idx];
1282 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1285 else if (node->token.type == SUBEXP
1286 && node->left && node->left->token.type == SUBEXP)
1288 Idx other_idx = node->left->token.opr.idx;
1290 node->left = node->left->left;
1292 node->left->parent = node;
1294 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1295 if (other_idx < BITSET_WORD_BITS)
1296 dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1302 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1303 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1304 static reg_errcode_t
1305 lower_subexps (void *extra, bin_tree_t *node)
1307 regex_t *preg = (regex_t *) extra;
1308 reg_errcode_t err = REG_NOERROR;
1310 if (node->left && node->left->token.type == SUBEXP)
1312 node->left = lower_subexp (&err, preg, node->left);
1314 node->left->parent = node;
1316 if (node->right && node->right->token.type == SUBEXP)
1318 node->right = lower_subexp (&err, preg, node->right);
1320 node->right->parent = node;
1327 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1329 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1330 bin_tree_t *body = node->left;
1331 bin_tree_t *op, *cls, *tree1, *tree;
1334 /* We do not optimize empty subexpressions, because otherwise we may
1335 have bad CONCAT nodes with NULL children. This is obviously not
1336 very common, so we do not lose much. An example that triggers
1337 this case is the sed "script" /\(\)/x. */
1338 && node->left != NULL
1339 && (node->token.opr.idx >= BITSET_WORD_BITS
1340 || !(dfa->used_bkref_map
1341 & ((bitset_word_t) 1 << node->token.opr.idx))))
1344 /* Convert the SUBEXP node to the concatenation of an
1345 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1346 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1347 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1348 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1349 tree = create_tree (dfa, op, tree1, CONCAT);
1350 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1356 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1357 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1361 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1362 nodes. Requires a postorder visit. */
1363 static reg_errcode_t
1364 calc_first (void *extra, bin_tree_t *node)
1366 re_dfa_t *dfa = (re_dfa_t *) extra;
1367 if (node->token.type == CONCAT)
1369 node->first = node->left->first;
1370 node->node_idx = node->left->node_idx;
1375 node->node_idx = re_dfa_add_node (dfa, node->token);
1376 if (BE (node->node_idx == REG_MISSING, 0))
1378 if (node->token.type == ANCHOR)
1379 dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1384 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1385 static reg_errcode_t
1386 calc_next (void *extra, bin_tree_t *node)
1388 switch (node->token.type)
1390 case OP_DUP_ASTERISK:
1391 node->left->next = node;
1394 node->left->next = node->right->first;
1395 node->right->next = node->next;
1399 node->left->next = node->next;
1401 node->right->next = node->next;
1407 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1408 static reg_errcode_t
1409 link_nfa_nodes (void *extra, bin_tree_t *node)
1411 re_dfa_t *dfa = (re_dfa_t *) extra;
1412 Idx idx = node->node_idx;
1413 reg_errcode_t err = REG_NOERROR;
1415 switch (node->token.type)
1421 assert (node->next == NULL);
1424 case OP_DUP_ASTERISK:
1428 dfa->has_plural_match = 1;
1429 if (node->left != NULL)
1430 left = node->left->first->node_idx;
1432 left = node->next->node_idx;
1433 if (node->right != NULL)
1434 right = node->right->first->node_idx;
1436 right = node->next->node_idx;
1437 assert (REG_VALID_INDEX (left));
1438 assert (REG_VALID_INDEX (right));
1439 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1444 case OP_OPEN_SUBEXP:
1445 case OP_CLOSE_SUBEXP:
1446 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1450 dfa->nexts[idx] = node->next->node_idx;
1451 if (node->token.type == OP_BACK_REF)
1452 err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1456 assert (!IS_EPSILON_NODE (node->token.type));
1457 dfa->nexts[idx] = node->next->node_idx;
1464 /* Duplicate the epsilon closure of the node ROOT_NODE.
1465 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1466 to their own constraint. */
1468 static reg_errcode_t
1470 duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1471 Idx root_node, unsigned int init_constraint)
1473 Idx org_node, clone_node;
1475 unsigned int constraint = init_constraint;
1476 for (org_node = top_org_node, clone_node = top_clone_node;;)
1478 Idx org_dest, clone_dest;
1479 if (dfa->nodes[org_node].type == OP_BACK_REF)
1481 /* If the back reference epsilon-transit, its destination must
1482 also have the constraint. Then duplicate the epsilon closure
1483 of the destination of the back reference, and store it in
1484 edests of the back reference. */
1485 org_dest = dfa->nexts[org_node];
1486 re_node_set_empty (dfa->edests + clone_node);
1487 clone_dest = duplicate_node (dfa, org_dest, constraint);
1488 if (BE (clone_dest == REG_MISSING, 0))
1490 dfa->nexts[clone_node] = dfa->nexts[org_node];
1491 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1495 else if (dfa->edests[org_node].nelem == 0)
1497 /* In case of the node can't epsilon-transit, don't duplicate the
1498 destination and store the original destination as the
1499 destination of the node. */
1500 dfa->nexts[clone_node] = dfa->nexts[org_node];
1503 else if (dfa->edests[org_node].nelem == 1)
1505 /* In case of the node can epsilon-transit, and it has only one
1507 org_dest = dfa->edests[org_node].elems[0];
1508 re_node_set_empty (dfa->edests + clone_node);
1509 /* If the node is root_node itself, it means the epsilon closure
1510 has a loop. Then tie it to the destination of the root_node. */
1511 if (org_node == root_node && clone_node != org_node)
1513 ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1518 /* In case the node has another constraint, append it. */
1519 constraint |= dfa->nodes[org_node].constraint;
1520 clone_dest = duplicate_node (dfa, org_dest, constraint);
1521 if (BE (clone_dest == REG_MISSING, 0))
1523 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1527 else /* dfa->edests[org_node].nelem == 2 */
1529 /* In case of the node can epsilon-transit, and it has two
1530 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1531 org_dest = dfa->edests[org_node].elems[0];
1532 re_node_set_empty (dfa->edests + clone_node);
1533 /* Search for a duplicated node which satisfies the constraint. */
1534 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1535 if (clone_dest == REG_MISSING)
1537 /* There is no such duplicated node, create a new one. */
1539 clone_dest = duplicate_node (dfa, org_dest, constraint);
1540 if (BE (clone_dest == REG_MISSING, 0))
1542 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1545 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1546 root_node, constraint);
1547 if (BE (err != REG_NOERROR, 0))
1552 /* There is a duplicated node which satisfies the constraint,
1553 use it to avoid infinite loop. */
1554 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1559 org_dest = dfa->edests[org_node].elems[1];
1560 clone_dest = duplicate_node (dfa, org_dest, constraint);
1561 if (BE (clone_dest == REG_MISSING, 0))
1563 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1567 org_node = org_dest;
1568 clone_node = clone_dest;
1573 /* Search for a node which is duplicated from the node ORG_NODE, and
1574 satisfies the constraint CONSTRAINT. */
1577 search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1578 unsigned int constraint)
1581 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1583 if (org_node == dfa->org_indices[idx]
1584 && constraint == dfa->nodes[idx].constraint)
1585 return idx; /* Found. */
1587 return REG_MISSING; /* Not found. */
1590 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1591 Return the index of the new node, or REG_MISSING if insufficient storage is
1595 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1597 Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1598 if (BE (dup_idx != REG_MISSING, 1))
1600 dfa->nodes[dup_idx].constraint = constraint;
1601 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1602 dfa->nodes[dup_idx].duplicated = 1;
1604 /* Store the index of the original node. */
1605 dfa->org_indices[dup_idx] = org_idx;
1610 static reg_errcode_t
1611 calc_inveclosure (re_dfa_t *dfa)
1615 for (idx = 0; idx < dfa->nodes_len; ++idx)
1616 re_node_set_init_empty (dfa->inveclosures + idx);
1618 for (src = 0; src < dfa->nodes_len; ++src)
1620 Idx *elems = dfa->eclosures[src].elems;
1621 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1623 ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1632 /* Calculate "eclosure" for all the node in DFA. */
1634 static reg_errcode_t
1635 calc_eclosure (re_dfa_t *dfa)
1640 assert (dfa->nodes_len > 0);
1643 /* For each nodes, calculate epsilon closure. */
1644 for (node_idx = 0; ; ++node_idx)
1647 re_node_set eclosure_elem;
1648 if (node_idx == dfa->nodes_len)
1657 assert (dfa->eclosures[node_idx].nelem != REG_MISSING);
1660 /* If we have already calculated, skip it. */
1661 if (dfa->eclosures[node_idx].nelem != 0)
1663 /* Calculate epsilon closure of `node_idx'. */
1664 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1665 if (BE (err != REG_NOERROR, 0))
1668 if (dfa->eclosures[node_idx].nelem == 0)
1671 re_node_set_free (&eclosure_elem);
1677 /* Calculate epsilon closure of NODE. */
1679 static reg_errcode_t
1680 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1684 re_node_set eclosure;
1686 bool incomplete = false;
1687 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1688 if (BE (err != REG_NOERROR, 0))
1691 /* This indicates that we are calculating this node now.
1692 We reference this value to avoid infinite loop. */
1693 dfa->eclosures[node].nelem = REG_MISSING;
1695 /* If the current node has constraints, duplicate all nodes
1696 since they must inherit the constraints. */
1697 if (dfa->nodes[node].constraint
1698 && dfa->edests[node].nelem
1699 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1701 err = duplicate_node_closure (dfa, node, node, node,
1702 dfa->nodes[node].constraint);
1703 if (BE (err != REG_NOERROR, 0))
1707 /* Expand each epsilon destination nodes. */
1708 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1709 for (i = 0; i < dfa->edests[node].nelem; ++i)
1711 re_node_set eclosure_elem;
1712 Idx edest = dfa->edests[node].elems[i];
1713 /* If calculating the epsilon closure of `edest' is in progress,
1714 return intermediate result. */
1715 if (dfa->eclosures[edest].nelem == REG_MISSING)
1720 /* If we haven't calculated the epsilon closure of `edest' yet,
1721 calculate now. Otherwise use calculated epsilon closure. */
1722 if (dfa->eclosures[edest].nelem == 0)
1724 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1725 if (BE (err != REG_NOERROR, 0))
1729 eclosure_elem = dfa->eclosures[edest];
1730 /* Merge the epsilon closure of `edest'. */
1731 err = re_node_set_merge (&eclosure, &eclosure_elem);
1732 if (BE (err != REG_NOERROR, 0))
1734 /* If the epsilon closure of `edest' is incomplete,
1735 the epsilon closure of this node is also incomplete. */
1736 if (dfa->eclosures[edest].nelem == 0)
1739 re_node_set_free (&eclosure_elem);
1743 /* An epsilon closure includes itself. */
1744 ok = re_node_set_insert (&eclosure, node);
1747 if (incomplete && !root)
1748 dfa->eclosures[node].nelem = 0;
1750 dfa->eclosures[node] = eclosure;
1751 *new_set = eclosure;
1755 /* Functions for token which are used in the parser. */
1757 /* Fetch a token from INPUT.
1758 We must not use this function inside bracket expressions. */
1762 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1764 re_string_skip_bytes (input, peek_token (result, input, syntax));
1767 /* Peek a token from INPUT, and return the length of the token.
1768 We must not use this function inside bracket expressions. */
1772 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1776 if (re_string_eoi (input))
1778 token->type = END_OF_RE;
1782 c = re_string_peek_byte (input, 0);
1785 token->word_char = 0;
1786 #ifdef RE_ENABLE_I18N
1787 token->mb_partial = 0;
1788 if (input->mb_cur_max > 1 &&
1789 !re_string_first_byte (input, re_string_cur_idx (input)))
1791 token->type = CHARACTER;
1792 token->mb_partial = 1;
1799 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1801 token->type = BACK_SLASH;
1805 c2 = re_string_peek_byte_case (input, 1);
1807 token->type = CHARACTER;
1808 #ifdef RE_ENABLE_I18N
1809 if (input->mb_cur_max > 1)
1811 wint_t wc = re_string_wchar_at (input,
1812 re_string_cur_idx (input) + 1);
1813 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1817 token->word_char = IS_WORD_CHAR (c2) != 0;
1822 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1823 token->type = OP_ALT;
1825 case '1': case '2': case '3': case '4': case '5':
1826 case '6': case '7': case '8': case '9':
1827 if (!(syntax & RE_NO_BK_REFS))
1829 token->type = OP_BACK_REF;
1830 token->opr.idx = c2 - '1';
1834 if (!(syntax & RE_NO_GNU_OPS))
1836 token->type = ANCHOR;
1837 token->opr.ctx_type = WORD_FIRST;
1841 if (!(syntax & RE_NO_GNU_OPS))
1843 token->type = ANCHOR;
1844 token->opr.ctx_type = WORD_LAST;
1848 if (!(syntax & RE_NO_GNU_OPS))
1850 token->type = ANCHOR;
1851 token->opr.ctx_type = WORD_DELIM;
1855 if (!(syntax & RE_NO_GNU_OPS))
1857 token->type = ANCHOR;
1858 token->opr.ctx_type = NOT_WORD_DELIM;
1862 if (!(syntax & RE_NO_GNU_OPS))
1863 token->type = OP_WORD;
1866 if (!(syntax & RE_NO_GNU_OPS))
1867 token->type = OP_NOTWORD;
1870 if (!(syntax & RE_NO_GNU_OPS))
1871 token->type = OP_SPACE;
1874 if (!(syntax & RE_NO_GNU_OPS))
1875 token->type = OP_NOTSPACE;
1878 if (!(syntax & RE_NO_GNU_OPS))
1880 token->type = ANCHOR;
1881 token->opr.ctx_type = BUF_FIRST;
1885 if (!(syntax & RE_NO_GNU_OPS))
1887 token->type = ANCHOR;
1888 token->opr.ctx_type = BUF_LAST;
1892 if (!(syntax & RE_NO_BK_PARENS))
1893 token->type = OP_OPEN_SUBEXP;
1896 if (!(syntax & RE_NO_BK_PARENS))
1897 token->type = OP_CLOSE_SUBEXP;
1900 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1901 token->type = OP_DUP_PLUS;
1904 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1905 token->type = OP_DUP_QUESTION;
1908 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1909 token->type = OP_OPEN_DUP_NUM;
1912 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1913 token->type = OP_CLOSE_DUP_NUM;
1921 token->type = CHARACTER;
1922 #ifdef RE_ENABLE_I18N
1923 if (input->mb_cur_max > 1)
1925 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1926 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1930 token->word_char = IS_WORD_CHAR (token->opr.c);
1935 if (syntax & RE_NEWLINE_ALT)
1936 token->type = OP_ALT;
1939 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1940 token->type = OP_ALT;
1943 token->type = OP_DUP_ASTERISK;
1946 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1947 token->type = OP_DUP_PLUS;
1950 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1951 token->type = OP_DUP_QUESTION;
1954 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1955 token->type = OP_OPEN_DUP_NUM;
1958 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1959 token->type = OP_CLOSE_DUP_NUM;
1962 if (syntax & RE_NO_BK_PARENS)
1963 token->type = OP_OPEN_SUBEXP;
1966 if (syntax & RE_NO_BK_PARENS)
1967 token->type = OP_CLOSE_SUBEXP;
1970 token->type = OP_OPEN_BRACKET;
1973 token->type = OP_PERIOD;
1976 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1977 re_string_cur_idx (input) != 0)
1979 char prev = re_string_peek_byte (input, -1);
1980 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1983 token->type = ANCHOR;
1984 token->opr.ctx_type = LINE_FIRST;
1987 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1988 re_string_cur_idx (input) + 1 != re_string_length (input))
1991 re_string_skip_bytes (input, 1);
1992 peek_token (&next, input, syntax);
1993 re_string_skip_bytes (input, -1);
1994 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1997 token->type = ANCHOR;
1998 token->opr.ctx_type = LINE_LAST;
2006 /* Peek a token from INPUT, and return the length of the token.
2007 We must not use this function out of bracket expressions. */
2011 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2014 if (re_string_eoi (input))
2016 token->type = END_OF_RE;
2019 c = re_string_peek_byte (input, 0);
2022 #ifdef RE_ENABLE_I18N
2023 if (input->mb_cur_max > 1 &&
2024 !re_string_first_byte (input, re_string_cur_idx (input)))
2026 token->type = CHARACTER;
2029 #endif /* RE_ENABLE_I18N */
2031 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2032 && re_string_cur_idx (input) + 1 < re_string_length (input))
2034 /* In this case, '\' escape a character. */
2036 re_string_skip_bytes (input, 1);
2037 c2 = re_string_peek_byte (input, 0);
2039 token->type = CHARACTER;
2042 if (c == '[') /* '[' is a special char in a bracket exps. */
2046 if (re_string_cur_idx (input) + 1 < re_string_length (input))
2047 c2 = re_string_peek_byte (input, 1);
2055 token->type = OP_OPEN_COLL_ELEM;
2058 token->type = OP_OPEN_EQUIV_CLASS;
2061 if (syntax & RE_CHAR_CLASSES)
2063 token->type = OP_OPEN_CHAR_CLASS;
2066 /* else fall through. */
2068 token->type = CHARACTER;
2078 token->type = OP_CHARSET_RANGE;
2081 token->type = OP_CLOSE_BRACKET;
2084 token->type = OP_NON_MATCH_LIST;
2087 token->type = CHARACTER;
2092 /* Functions for parser. */
2094 /* Entry point of the parser.
2095 Parse the regular expression REGEXP and return the structure tree.
2096 If an error is occured, ERR is set by error code, and return NULL.
2097 This function build the following tree, from regular expression <reg_exp>:
2103 CAT means concatenation.
2104 EOR means end of regular expression. */
2107 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2110 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2111 bin_tree_t *tree, *eor, *root;
2112 re_token_t current_token;
2113 dfa->syntax = syntax;
2114 fetch_token (¤t_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2115 tree = parse_reg_exp (regexp, preg, ¤t_token, syntax, 0, err);
2116 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2118 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2120 root = create_tree (dfa, tree, eor, CONCAT);
2123 if (BE (eor == NULL || root == NULL, 0))
2131 /* This function build the following tree, from regular expression
2132 <branch1>|<branch2>:
2138 ALT means alternative, which represents the operator `|'. */
2141 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2142 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2144 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2145 bin_tree_t *tree, *branch = NULL;
2146 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2147 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2150 while (token->type == OP_ALT)
2152 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2153 if (token->type != OP_ALT && token->type != END_OF_RE
2154 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2156 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2157 if (BE (*err != REG_NOERROR && branch == NULL, 0))
2162 tree = create_tree (dfa, tree, branch, OP_ALT);
2163 if (BE (tree == NULL, 0))
2172 /* This function build the following tree, from regular expression
2179 CAT means concatenation. */
2182 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2183 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2185 bin_tree_t *tree, *expr;
2186 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2187 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2188 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2191 while (token->type != OP_ALT && token->type != END_OF_RE
2192 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2194 expr = parse_expression (regexp, preg, token, syntax, nest, err);
2195 if (BE (*err != REG_NOERROR && expr == NULL, 0))
2199 if (tree != NULL && expr != NULL)
2201 tree = create_tree (dfa, tree, expr, CONCAT);
2208 else if (tree == NULL)
2210 /* Otherwise expr == NULL, we don't need to create new tree. */
2215 /* This function build the following tree, from regular expression a*:
2222 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2223 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2225 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2227 switch (token->type)
2230 tree = create_token_tree (dfa, NULL, NULL, token);
2231 if (BE (tree == NULL, 0))
2236 #ifdef RE_ENABLE_I18N
2237 if (dfa->mb_cur_max > 1)
2239 while (!re_string_eoi (regexp)
2240 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2242 bin_tree_t *mbc_remain;
2243 fetch_token (token, regexp, syntax);
2244 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2245 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2246 if (BE (mbc_remain == NULL || tree == NULL, 0))
2255 case OP_OPEN_SUBEXP:
2256 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2257 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2260 case OP_OPEN_BRACKET:
2261 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2262 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2266 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2271 dfa->used_bkref_map |= 1 << token->opr.idx;
2272 tree = create_token_tree (dfa, NULL, NULL, token);
2273 if (BE (tree == NULL, 0))
2279 dfa->has_mb_node = 1;
2281 case OP_OPEN_DUP_NUM:
2282 if (syntax & RE_CONTEXT_INVALID_DUP)
2288 case OP_DUP_ASTERISK:
2290 case OP_DUP_QUESTION:
2291 if (syntax & RE_CONTEXT_INVALID_OPS)
2296 else if (syntax & RE_CONTEXT_INDEP_OPS)
2298 fetch_token (token, regexp, syntax);
2299 return parse_expression (regexp, preg, token, syntax, nest, err);
2301 /* else fall through */
2302 case OP_CLOSE_SUBEXP:
2303 if ((token->type == OP_CLOSE_SUBEXP) &&
2304 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2309 /* else fall through */
2310 case OP_CLOSE_DUP_NUM:
2311 /* We treat it as a normal character. */
2313 /* Then we can these characters as normal characters. */
2314 token->type = CHARACTER;
2315 /* mb_partial and word_char bits should be initialized already
2317 tree = create_token_tree (dfa, NULL, NULL, token);
2318 if (BE (tree == NULL, 0))
2325 if ((token->opr.ctx_type
2326 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2327 && dfa->word_ops_used == 0)
2328 init_word_char (dfa);
2329 if (token->opr.ctx_type == WORD_DELIM
2330 || token->opr.ctx_type == NOT_WORD_DELIM)
2332 bin_tree_t *tree_first, *tree_last;
2333 if (token->opr.ctx_type == WORD_DELIM)
2335 token->opr.ctx_type = WORD_FIRST;
2336 tree_first = create_token_tree (dfa, NULL, NULL, token);
2337 token->opr.ctx_type = WORD_LAST;
2341 token->opr.ctx_type = INSIDE_WORD;
2342 tree_first = create_token_tree (dfa, NULL, NULL, token);
2343 token->opr.ctx_type = INSIDE_NOTWORD;
2345 tree_last = create_token_tree (dfa, NULL, NULL, token);
2346 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2347 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2355 tree = create_token_tree (dfa, NULL, NULL, token);
2356 if (BE (tree == NULL, 0))
2362 /* We must return here, since ANCHORs can't be followed
2363 by repetition operators.
2364 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2365 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2366 fetch_token (token, regexp, syntax);
2369 tree = create_token_tree (dfa, NULL, NULL, token);
2370 if (BE (tree == NULL, 0))
2375 if (dfa->mb_cur_max > 1)
2376 dfa->has_mb_node = 1;
2380 tree = build_charclass_op (dfa, regexp->trans,
2381 (const unsigned char *) "alnum",
2382 (const unsigned char *) "_",
2383 token->type == OP_NOTWORD, err);
2384 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2389 tree = build_charclass_op (dfa, regexp->trans,
2390 (const unsigned char *) "space",
2391 (const unsigned char *) "",
2392 token->type == OP_NOTSPACE, err);
2393 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2403 /* Must not happen? */
2409 fetch_token (token, regexp, syntax);
2411 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2412 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2414 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2415 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2417 /* In BRE consecutive duplications are not allowed. */
2418 if ((syntax & RE_CONTEXT_INVALID_DUP)
2419 && (token->type == OP_DUP_ASTERISK
2420 || token->type == OP_OPEN_DUP_NUM))
2430 /* This function build the following tree, from regular expression
2438 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2439 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2441 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2444 cur_nsub = preg->re_nsub++;
2446 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2448 /* The subexpression may be a null string. */
2449 if (token->type == OP_CLOSE_SUBEXP)
2453 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2454 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2456 if (BE (*err != REG_NOERROR, 0))
2460 if (cur_nsub <= '9' - '1')
2461 dfa->completed_bkref_map |= 1 << cur_nsub;
2463 tree = create_tree (dfa, tree, NULL, SUBEXP);
2464 if (BE (tree == NULL, 0))
2469 tree->token.opr.idx = cur_nsub;
2473 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2476 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2477 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2479 bin_tree_t *tree = NULL, *old_tree = NULL;
2480 Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2481 re_token_t start_token = *token;
2483 if (token->type == OP_OPEN_DUP_NUM)
2486 start = fetch_number (regexp, token, syntax);
2487 if (start == REG_MISSING)
2489 if (token->type == CHARACTER && token->opr.c == ',')
2490 start = 0; /* We treat "{,m}" as "{0,m}". */
2493 *err = REG_BADBR; /* <re>{} is invalid. */
2497 if (BE (start != REG_ERROR, 1))
2499 /* We treat "{n}" as "{n,n}". */
2500 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2501 : ((token->type == CHARACTER && token->opr.c == ',')
2502 ? fetch_number (regexp, token, syntax) : REG_ERROR));
2504 if (BE (start == REG_ERROR || end == REG_ERROR, 0))
2506 /* Invalid sequence. */
2507 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2509 if (token->type == END_OF_RE)
2517 /* If the syntax bit is set, rollback. */
2518 re_string_set_index (regexp, start_idx);
2519 *token = start_token;
2520 token->type = CHARACTER;
2521 /* mb_partial and word_char bits should be already initialized by
2526 if (BE ((end != REG_MISSING && start > end)
2527 || token->type != OP_CLOSE_DUP_NUM, 0))
2529 /* First number greater than second. */
2536 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2537 end = (token->type == OP_DUP_QUESTION) ? 1 : REG_MISSING;
2540 fetch_token (token, regexp, syntax);
2542 if (BE (elem == NULL, 0))
2544 if (BE (start == 0 && end == 0, 0))
2546 postorder (elem, free_tree, NULL);
2550 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2551 if (BE (start > 0, 0))
2554 for (i = 2; i <= start; ++i)
2556 elem = duplicate_tree (elem, dfa);
2557 tree = create_tree (dfa, tree, elem, CONCAT);
2558 if (BE (elem == NULL || tree == NULL, 0))
2559 goto parse_dup_op_espace;
2565 /* Duplicate ELEM before it is marked optional. */
2566 elem = duplicate_tree (elem, dfa);
2572 if (elem->token.type == SUBEXP)
2573 postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2575 tree = create_tree (dfa, elem, NULL,
2576 (end == REG_MISSING ? OP_DUP_ASTERISK : OP_ALT));
2577 if (BE (tree == NULL, 0))
2578 goto parse_dup_op_espace;
2580 /* From gnulib's "intprops.h":
2581 True if the arithmetic type T is signed. */
2582 #define TYPE_SIGNED(t) (! ((t) 0 < (t) -1))
2584 /* This loop is actually executed only when end != REG_MISSING,
2585 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2586 already created the start+1-th copy. */
2587 if (TYPE_SIGNED (Idx) || end != REG_MISSING)
2588 for (i = start + 2; i <= end; ++i)
2590 elem = duplicate_tree (elem, dfa);
2591 tree = create_tree (dfa, tree, elem, CONCAT);
2592 if (BE (elem == NULL || tree == NULL, 0))
2593 goto parse_dup_op_espace;
2595 tree = create_tree (dfa, tree, NULL, OP_ALT);
2596 if (BE (tree == NULL, 0))
2597 goto parse_dup_op_espace;
2601 tree = create_tree (dfa, old_tree, tree, CONCAT);
2605 parse_dup_op_espace:
2610 /* Size of the names for collating symbol/equivalence_class/character_class.
2611 I'm not sure, but maybe enough. */
2612 #define BRACKET_NAME_BUF_SIZE 32
2615 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2616 Build the range expression which starts from START_ELEM, and ends
2617 at END_ELEM. The result are written to MBCSET and SBCSET.
2618 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2619 mbcset->range_ends, is a pointer argument sinse we may
2622 static reg_errcode_t
2624 # ifdef RE_ENABLE_I18N
2625 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, Idx *range_alloc,
2626 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2627 # else /* not RE_ENABLE_I18N */
2628 build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
2629 bracket_elem_t *end_elem)
2630 # endif /* not RE_ENABLE_I18N */
2632 unsigned int start_ch, end_ch;
2633 /* Equivalence Classes and Character Classes can't be a range start/end. */
2634 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2635 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2639 /* We can handle no multi character collating elements without libc
2641 if (BE ((start_elem->type == COLL_SYM
2642 && strlen ((char *) start_elem->opr.name) > 1)
2643 || (end_elem->type == COLL_SYM
2644 && strlen ((char *) end_elem->opr.name) > 1), 0))
2645 return REG_ECOLLATE;
2647 # ifdef RE_ENABLE_I18N
2652 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2654 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2655 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2657 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2658 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2660 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2661 ? __btowc (start_ch) : start_elem->opr.wch);
2662 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2663 ? __btowc (end_ch) : end_elem->opr.wch);
2664 if (start_wc == WEOF || end_wc == WEOF)
2665 return REG_ECOLLATE;
2666 cmp_buf[0] = start_wc;
2667 cmp_buf[4] = end_wc;
2668 if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2671 /* Got valid collation sequence values, add them as a new entry.
2672 However, for !_LIBC we have no collation elements: if the
2673 character set is single byte, the single byte character set
2674 that we build below suffices. parse_bracket_exp passes
2675 no MBCSET if dfa->mb_cur_max == 1. */
2678 /* Check the space of the arrays. */
2679 if (BE (*range_alloc == mbcset->nranges, 0))
2681 /* There is not enough space, need realloc. */
2682 wchar_t *new_array_start, *new_array_end;
2685 /* +1 in case of mbcset->nranges is 0. */
2686 new_nranges = 2 * mbcset->nranges + 1;
2687 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2688 are NULL if *range_alloc == 0. */
2689 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2691 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2694 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2697 mbcset->range_starts = new_array_start;
2698 mbcset->range_ends = new_array_end;
2699 *range_alloc = new_nranges;
2702 mbcset->range_starts[mbcset->nranges] = start_wc;
2703 mbcset->range_ends[mbcset->nranges++] = end_wc;
2706 /* Build the table for single byte characters. */
2707 for (wc = 0; wc < SBC_MAX; ++wc)
2710 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2711 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2712 bitset_set (sbcset, wc);
2715 # else /* not RE_ENABLE_I18N */
2718 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2719 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2721 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2722 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2724 if (start_ch > end_ch)
2726 /* Build the table for single byte characters. */
2727 for (ch = 0; ch < SBC_MAX; ++ch)
2728 if (start_ch <= ch && ch <= end_ch)
2729 bitset_set (sbcset, ch);
2731 # endif /* not RE_ENABLE_I18N */
2734 #endif /* not _LIBC */
2737 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2738 Build the collating element which is represented by NAME.
2739 The result are written to MBCSET and SBCSET.
2740 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2741 pointer argument since we may update it. */
2743 static reg_errcode_t
2745 build_collating_symbol (bitset_t sbcset,
2746 # ifdef RE_ENABLE_I18N
2747 re_charset_t *mbcset, Idx *coll_sym_alloc,
2749 const unsigned char *name)
2751 size_t name_len = strlen ((const char *) name);
2752 if (BE (name_len != 1, 0))
2753 return REG_ECOLLATE;
2756 bitset_set (sbcset, name[0]);
2760 #endif /* not _LIBC */
2762 /* This function parse bracket expression like "[abc]", "[a-c]",
2766 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2767 reg_syntax_t syntax, reg_errcode_t *err)
2770 const unsigned char *collseqmb;
2771 const char *collseqwc;
2774 const int32_t *symb_table;
2775 const unsigned char *extra;
2777 /* Local function for parse_bracket_exp used in _LIBC environement.
2778 Seek the collating symbol entry correspondings to NAME.
2779 Return the index of the symbol in the SYMB_TABLE. */
2782 __attribute ((always_inline))
2783 seek_collating_symbol_entry (name, name_len)
2784 const unsigned char *name;
2787 int32_t hash = elem_hash ((const char *) name, name_len);
2788 int32_t elem = hash % table_size;
2789 if (symb_table[2 * elem] != 0)
2791 int32_t second = hash % (table_size - 2) + 1;
2795 /* First compare the hashing value. */
2796 if (symb_table[2 * elem] == hash
2797 /* Compare the length of the name. */
2798 && name_len == extra[symb_table[2 * elem + 1]]
2799 /* Compare the name. */
2800 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2803 /* Yep, this is the entry. */
2810 while (symb_table[2 * elem] != 0);
2815 /* Local function for parse_bracket_exp used in _LIBC environment.
2816 Look up the collation sequence value of BR_ELEM.
2817 Return the value if succeeded, UINT_MAX otherwise. */
2819 auto inline unsigned int
2820 __attribute ((always_inline))
2821 lookup_collation_sequence_value (br_elem)
2822 bracket_elem_t *br_elem;
2824 if (br_elem->type == SB_CHAR)
2827 if (MB_CUR_MAX == 1)
2830 return collseqmb[br_elem->opr.ch];
2833 wint_t wc = __btowc (br_elem->opr.ch);
2834 return __collseq_table_lookup (collseqwc, wc);
2837 else if (br_elem->type == MB_CHAR)
2840 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2842 else if (br_elem->type == COLL_SYM)
2844 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2848 elem = seek_collating_symbol_entry (br_elem->opr.name,
2850 if (symb_table[2 * elem] != 0)
2852 /* We found the entry. */
2853 idx = symb_table[2 * elem + 1];
2854 /* Skip the name of collating element name. */
2855 idx += 1 + extra[idx];
2856 /* Skip the byte sequence of the collating element. */
2857 idx += 1 + extra[idx];
2858 /* Adjust for the alignment. */
2859 idx = (idx + 3) & ~3;
2860 /* Skip the multibyte collation sequence value. */
2861 idx += sizeof (unsigned int);
2862 /* Skip the wide char sequence of the collating element. */
2863 idx += sizeof (unsigned int) *
2864 (1 + *(unsigned int *) (extra + idx));
2865 /* Return the collation sequence value. */
2866 return *(unsigned int *) (extra + idx);
2868 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2870 /* No valid character. Match it as a single byte
2872 return collseqmb[br_elem->opr.name[0]];
2875 else if (sym_name_len == 1)
2876 return collseqmb[br_elem->opr.name[0]];
2881 /* Local function for parse_bracket_exp used in _LIBC environement.
2882 Build the range expression which starts from START_ELEM, and ends
2883 at END_ELEM. The result are written to MBCSET and SBCSET.
2884 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2885 mbcset->range_ends, is a pointer argument sinse we may
2888 auto inline reg_errcode_t
2889 __attribute ((always_inline))
2890 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2891 re_charset_t *mbcset;
2894 bracket_elem_t *start_elem, *end_elem;
2897 uint32_t start_collseq;
2898 uint32_t end_collseq;
2900 /* Equivalence Classes and Character Classes can't be a range
2902 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2903 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2907 start_collseq = lookup_collation_sequence_value (start_elem);
2908 end_collseq = lookup_collation_sequence_value (end_elem);
2909 /* Check start/end collation sequence values. */
2910 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2911 return REG_ECOLLATE;
2912 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2915 /* Got valid collation sequence values, add them as a new entry.
2916 However, if we have no collation elements, and the character set
2917 is single byte, the single byte character set that we
2918 build below suffices. */
2919 if (nrules > 0 || dfa->mb_cur_max > 1)
2921 /* Check the space of the arrays. */
2922 if (BE (*range_alloc == mbcset->nranges, 0))
2924 /* There is not enough space, need realloc. */
2925 uint32_t *new_array_start;
2926 uint32_t *new_array_end;
2929 /* +1 in case of mbcset->nranges is 0. */
2930 new_nranges = 2 * mbcset->nranges + 1;
2931 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2933 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2936 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2939 mbcset->range_starts = new_array_start;
2940 mbcset->range_ends = new_array_end;
2941 *range_alloc = new_nranges;
2944 mbcset->range_starts[mbcset->nranges] = start_collseq;
2945 mbcset->range_ends[mbcset->nranges++] = end_collseq;
2948 /* Build the table for single byte characters. */
2949 for (ch = 0; ch < SBC_MAX; ch++)
2951 uint32_t ch_collseq;
2953 if (MB_CUR_MAX == 1)
2956 ch_collseq = collseqmb[ch];
2958 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2959 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2960 bitset_set (sbcset, ch);
2965 /* Local function for parse_bracket_exp used in _LIBC environement.
2966 Build the collating element which is represented by NAME.
2967 The result are written to MBCSET and SBCSET.
2968 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2969 pointer argument sinse we may update it. */
2971 auto inline reg_errcode_t
2972 __attribute ((always_inline))
2973 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2974 re_charset_t *mbcset;
2975 Idx *coll_sym_alloc;
2977 const unsigned char *name;
2980 size_t name_len = strlen ((const char *) name);
2983 elem = seek_collating_symbol_entry (name, name_len);
2984 if (symb_table[2 * elem] != 0)
2986 /* We found the entry. */
2987 idx = symb_table[2 * elem + 1];
2988 /* Skip the name of collating element name. */
2989 idx += 1 + extra[idx];
2991 else if (symb_table[2 * elem] == 0 && name_len == 1)
2993 /* No valid character, treat it as a normal
2995 bitset_set (sbcset, name[0]);
2999 return REG_ECOLLATE;
3001 /* Got valid collation sequence, add it as a new entry. */
3002 /* Check the space of the arrays. */
3003 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3005 /* Not enough, realloc it. */
3006 /* +1 in case of mbcset->ncoll_syms is 0. */
3007 Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3008 /* Use realloc since mbcset->coll_syms is NULL
3010 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3011 new_coll_sym_alloc);
3012 if (BE (new_coll_syms == NULL, 0))
3014 mbcset->coll_syms = new_coll_syms;
3015 *coll_sym_alloc = new_coll_sym_alloc;
3017 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3022 if (BE (name_len != 1, 0))
3023 return REG_ECOLLATE;
3026 bitset_set (sbcset, name[0]);
3033 re_token_t br_token;
3034 re_bitset_ptr_t sbcset;
3035 #ifdef RE_ENABLE_I18N
3036 re_charset_t *mbcset;
3037 Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3038 Idx equiv_class_alloc = 0, char_class_alloc = 0;
3039 #endif /* not RE_ENABLE_I18N */
3040 bool non_match = false;
3041 bin_tree_t *work_tree;
3043 bool first_round = true;
3045 collseqmb = (const unsigned char *)
3046 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3047 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3053 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3054 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3055 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3056 _NL_COLLATE_SYMB_TABLEMB);
3057 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3058 _NL_COLLATE_SYMB_EXTRAMB);
3061 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3062 #ifdef RE_ENABLE_I18N
3063 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3064 #endif /* RE_ENABLE_I18N */
3065 #ifdef RE_ENABLE_I18N
3066 if (BE (sbcset == NULL || mbcset == NULL, 0))
3068 if (BE (sbcset == NULL, 0))
3069 #endif /* RE_ENABLE_I18N */
3075 token_len = peek_token_bracket (token, regexp, syntax);
3076 if (BE (token->type == END_OF_RE, 0))
3079 goto parse_bracket_exp_free_return;
3081 if (token->type == OP_NON_MATCH_LIST)
3083 #ifdef RE_ENABLE_I18N
3084 mbcset->non_match = 1;
3085 #endif /* not RE_ENABLE_I18N */
3087 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3088 bitset_set (sbcset, '\n');
3089 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3090 token_len = peek_token_bracket (token, regexp, syntax);
3091 if (BE (token->type == END_OF_RE, 0))
3094 goto parse_bracket_exp_free_return;
3098 /* We treat the first ']' as a normal character. */
3099 if (token->type == OP_CLOSE_BRACKET)
3100 token->type = CHARACTER;
3104 bracket_elem_t start_elem, end_elem;
3105 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3106 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3109 bool is_range_exp = false;
3112 start_elem.opr.name = start_name_buf;
3113 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3114 syntax, first_round);
3115 if (BE (ret != REG_NOERROR, 0))
3118 goto parse_bracket_exp_free_return;
3120 first_round = false;
3122 /* Get information about the next token. We need it in any case. */
3123 token_len = peek_token_bracket (token, regexp, syntax);
3125 /* Do not check for ranges if we know they are not allowed. */
3126 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3128 if (BE (token->type == END_OF_RE, 0))
3131 goto parse_bracket_exp_free_return;
3133 if (token->type == OP_CHARSET_RANGE)
3135 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3136 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3137 if (BE (token2.type == END_OF_RE, 0))
3140 goto parse_bracket_exp_free_return;
3142 if (token2.type == OP_CLOSE_BRACKET)
3144 /* We treat the last '-' as a normal character. */
3145 re_string_skip_bytes (regexp, -token_len);
3146 token->type = CHARACTER;
3149 is_range_exp = true;
3153 if (is_range_exp == true)
3155 end_elem.opr.name = end_name_buf;
3156 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3158 if (BE (ret != REG_NOERROR, 0))
3161 goto parse_bracket_exp_free_return;
3164 token_len = peek_token_bracket (token, regexp, syntax);
3167 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3168 &start_elem, &end_elem);
3170 # ifdef RE_ENABLE_I18N
3171 *err = build_range_exp (sbcset,
3172 dfa->mb_cur_max > 1 ? mbcset : NULL,
3173 &range_alloc, &start_elem, &end_elem);
3175 *err = build_range_exp (sbcset, &start_elem, &end_elem);
3177 #endif /* RE_ENABLE_I18N */
3178 if (BE (*err != REG_NOERROR, 0))
3179 goto parse_bracket_exp_free_return;
3183 switch (start_elem.type)
3186 bitset_set (sbcset, start_elem.opr.ch);
3188 #ifdef RE_ENABLE_I18N
3190 /* Check whether the array has enough space. */
3191 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3193 wchar_t *new_mbchars;
3194 /* Not enough, realloc it. */
3195 /* +1 in case of mbcset->nmbchars is 0. */
3196 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3197 /* Use realloc since array is NULL if *alloc == 0. */
3198 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3200 if (BE (new_mbchars == NULL, 0))
3201 goto parse_bracket_exp_espace;
3202 mbcset->mbchars = new_mbchars;
3204 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3206 #endif /* RE_ENABLE_I18N */
3208 *err = build_equiv_class (sbcset,
3209 #ifdef RE_ENABLE_I18N
3210 mbcset, &equiv_class_alloc,
3211 #endif /* RE_ENABLE_I18N */
3212 start_elem.opr.name);
3213 if (BE (*err != REG_NOERROR, 0))
3214 goto parse_bracket_exp_free_return;
3217 *err = build_collating_symbol (sbcset,
3218 #ifdef RE_ENABLE_I18N
3219 mbcset, &coll_sym_alloc,
3220 #endif /* RE_ENABLE_I18N */
3221 start_elem.opr.name);
3222 if (BE (*err != REG_NOERROR, 0))
3223 goto parse_bracket_exp_free_return;
3226 *err = build_charclass (regexp->trans, sbcset,
3227 #ifdef RE_ENABLE_I18N
3228 mbcset, &char_class_alloc,
3229 #endif /* RE_ENABLE_I18N */
3230 start_elem.opr.name, syntax);
3231 if (BE (*err != REG_NOERROR, 0))
3232 goto parse_bracket_exp_free_return;
3239 if (BE (token->type == END_OF_RE, 0))
3242 goto parse_bracket_exp_free_return;
3244 if (token->type == OP_CLOSE_BRACKET)
3248 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3250 /* If it is non-matching list. */
3252 bitset_not (sbcset);
3254 #ifdef RE_ENABLE_I18N
3255 /* Ensure only single byte characters are set. */
3256 if (dfa->mb_cur_max > 1)
3257 bitset_mask (sbcset, dfa->sb_char);
3259 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3260 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3261 || mbcset->non_match)))
3263 bin_tree_t *mbc_tree;
3265 /* Build a tree for complex bracket. */
3266 dfa->has_mb_node = 1;
3267 br_token.type = COMPLEX_BRACKET;
3268 br_token.opr.mbcset = mbcset;
3269 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3270 if (BE (mbc_tree == NULL, 0))
3271 goto parse_bracket_exp_espace;
3272 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3273 if (sbcset[sbc_idx])
3275 /* If there are no bits set in sbcset, there is no point
3276 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3277 if (sbc_idx < BITSET_WORDS)
3279 /* Build a tree for simple bracket. */
3280 br_token.type = SIMPLE_BRACKET;
3281 br_token.opr.sbcset = sbcset;
3282 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3283 if (BE (work_tree == NULL, 0))
3284 goto parse_bracket_exp_espace;
3286 /* Then join them by ALT node. */
3287 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3288 if (BE (work_tree == NULL, 0))
3289 goto parse_bracket_exp_espace;
3294 work_tree = mbc_tree;
3298 #endif /* not RE_ENABLE_I18N */
3300 #ifdef RE_ENABLE_I18N
3301 free_charset (mbcset);
3303 /* Build a tree for simple bracket. */
3304 br_token.type = SIMPLE_BRACKET;
3305 br_token.opr.sbcset = sbcset;
3306 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3307 if (BE (work_tree == NULL, 0))
3308 goto parse_bracket_exp_espace;
3312 parse_bracket_exp_espace:
3314 parse_bracket_exp_free_return:
3316 #ifdef RE_ENABLE_I18N
3317 free_charset (mbcset);
3318 #endif /* RE_ENABLE_I18N */
3322 /* Parse an element in the bracket expression. */
3324 static reg_errcode_t
3325 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3326 re_token_t *token, int token_len, re_dfa_t *dfa,
3327 reg_syntax_t syntax, bool accept_hyphen)
3329 #ifdef RE_ENABLE_I18N
3331 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3332 if (cur_char_size > 1)
3334 elem->type = MB_CHAR;
3335 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3336 re_string_skip_bytes (regexp, cur_char_size);
3339 #endif /* RE_ENABLE_I18N */
3340 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3341 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3342 || token->type == OP_OPEN_EQUIV_CLASS)
3343 return parse_bracket_symbol (elem, regexp, token);
3344 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3346 /* A '-' must only appear as anything but a range indicator before
3347 the closing bracket. Everything else is an error. */
3349 (void) peek_token_bracket (&token2, regexp, syntax);
3350 if (token2.type != OP_CLOSE_BRACKET)
3351 /* The actual error value is not standardized since this whole
3352 case is undefined. But ERANGE makes good sense. */
3355 elem->type = SB_CHAR;
3356 elem->opr.ch = token->opr.c;
3360 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3361 such as [:<character_class>:], [.<collating_element>.], and
3362 [=<equivalent_class>=]. */
3364 static reg_errcode_t
3365 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3368 unsigned char ch, delim = token->opr.c;
3370 if (re_string_eoi(regexp))
3374 if (i >= BRACKET_NAME_BUF_SIZE)
3376 if (token->type == OP_OPEN_CHAR_CLASS)
3377 ch = re_string_fetch_byte_case (regexp);
3379 ch = re_string_fetch_byte (regexp);
3380 if (re_string_eoi(regexp))
3382 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3384 elem->opr.name[i] = ch;
3386 re_string_skip_bytes (regexp, 1);
3387 elem->opr.name[i] = '\0';
3388 switch (token->type)
3390 case OP_OPEN_COLL_ELEM:
3391 elem->type = COLL_SYM;
3393 case OP_OPEN_EQUIV_CLASS:
3394 elem->type = EQUIV_CLASS;
3396 case OP_OPEN_CHAR_CLASS:
3397 elem->type = CHAR_CLASS;
3405 /* Helper function for parse_bracket_exp.
3406 Build the equivalence class which is represented by NAME.
3407 The result are written to MBCSET and SBCSET.
3408 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3409 is a pointer argument sinse we may update it. */
3411 static reg_errcode_t
3412 #ifdef RE_ENABLE_I18N
3413 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3414 Idx *equiv_class_alloc, const unsigned char *name)
3415 #else /* not RE_ENABLE_I18N */
3416 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3417 #endif /* not RE_ENABLE_I18N */
3420 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3423 const int32_t *table, *indirect;
3424 const unsigned char *weights, *extra, *cp;
3425 unsigned char char_buf[2];
3429 /* This #include defines a local function! */
3430 # include <locale/weight.h>
3431 /* Calculate the index for equivalence class. */
3433 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3434 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3435 _NL_COLLATE_WEIGHTMB);
3436 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3437 _NL_COLLATE_EXTRAMB);
3438 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3439 _NL_COLLATE_INDIRECTMB);
3440 idx1 = findidx (&cp);
3441 if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3442 /* This isn't a valid character. */
3443 return REG_ECOLLATE;
3445 /* Build single byte matcing table for this equivalence class. */
3446 char_buf[1] = (unsigned char) '\0';
3447 len = weights[idx1 & 0xffffff];
3448 for (ch = 0; ch < SBC_MAX; ++ch)
3452 idx2 = findidx (&cp);
3457 /* This isn't a valid character. */
3459 /* Compare only if the length matches and the collation rule
3460 index is the same. */
3461 if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3465 while (cnt <= len &&
3466 weights[(idx1 & 0xffffff) + 1 + cnt]
3467 == weights[(idx2 & 0xffffff) + 1 + cnt])
3471 bitset_set (sbcset, ch);
3474 /* Check whether the array has enough space. */
3475 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3477 /* Not enough, realloc it. */
3478 /* +1 in case of mbcset->nequiv_classes is 0. */
3479 Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3480 /* Use realloc since the array is NULL if *alloc == 0. */
3481 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3483 new_equiv_class_alloc);
3484 if (BE (new_equiv_classes == NULL, 0))
3486 mbcset->equiv_classes = new_equiv_classes;
3487 *equiv_class_alloc = new_equiv_class_alloc;
3489 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3494 if (BE (strlen ((const char *) name) != 1, 0))
3495 return REG_ECOLLATE;
3496 bitset_set (sbcset, *name);
3501 /* Helper function for parse_bracket_exp.
3502 Build the character class which is represented by NAME.
3503 The result are written to MBCSET and SBCSET.
3504 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3505 is a pointer argument sinse we may update it. */
3507 static reg_errcode_t
3508 #ifdef RE_ENABLE_I18N
3509 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3510 re_charset_t *mbcset, Idx *char_class_alloc,
3511 const unsigned char *class_name, reg_syntax_t syntax)
3512 #else /* not RE_ENABLE_I18N */
3513 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3514 const unsigned char *class_name, reg_syntax_t syntax)
3515 #endif /* not RE_ENABLE_I18N */
3518 const char *name = (const char *) class_name;
3520 /* In case of REG_ICASE "upper" and "lower" match the both of
3521 upper and lower cases. */
3522 if ((syntax & RE_ICASE)
3523 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3526 #ifdef RE_ENABLE_I18N
3527 /* Check the space of the arrays. */
3528 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3530 /* Not enough, realloc it. */
3531 /* +1 in case of mbcset->nchar_classes is 0. */
3532 Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3533 /* Use realloc since array is NULL if *alloc == 0. */
3534 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3535 new_char_class_alloc);
3536 if (BE (new_char_classes == NULL, 0))
3538 mbcset->char_classes = new_char_classes;
3539 *char_class_alloc = new_char_class_alloc;
3541 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3542 #endif /* RE_ENABLE_I18N */
3544 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3546 if (BE (trans != NULL, 0)) \
3548 for (i = 0; i < SBC_MAX; ++i) \
3549 if (ctype_func (i)) \
3550 bitset_set (sbcset, trans[i]); \
3554 for (i = 0; i < SBC_MAX; ++i) \
3555 if (ctype_func (i)) \
3556 bitset_set (sbcset, i); \
3560 if (strcmp (name, "alnum") == 0)
3561 BUILD_CHARCLASS_LOOP (isalnum);
3562 else if (strcmp (name, "cntrl") == 0)
3563 BUILD_CHARCLASS_LOOP (iscntrl);
3564 else if (strcmp (name, "lower") == 0)
3565 BUILD_CHARCLASS_LOOP (islower);
3566 else if (strcmp (name, "space") == 0)
3567 BUILD_CHARCLASS_LOOP (isspace);
3568 else if (strcmp (name, "alpha") == 0)
3569 BUILD_CHARCLASS_LOOP (isalpha);
3570 else if (strcmp (name, "digit") == 0)
3571 BUILD_CHARCLASS_LOOP (isdigit);
3572 else if (strcmp (name, "print") == 0)
3573 BUILD_CHARCLASS_LOOP (isprint);
3574 else if (strcmp (name, "upper") == 0)
3575 BUILD_CHARCLASS_LOOP (isupper);
3576 else if (strcmp (name, "blank") == 0)
3577 BUILD_CHARCLASS_LOOP (isblank);
3578 else if (strcmp (name, "graph") == 0)
3579 BUILD_CHARCLASS_LOOP (isgraph);
3580 else if (strcmp (name, "punct") == 0)
3581 BUILD_CHARCLASS_LOOP (ispunct);
3582 else if (strcmp (name, "xdigit") == 0)
3583 BUILD_CHARCLASS_LOOP (isxdigit);
3591 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3592 const unsigned char *class_name,
3593 const unsigned char *extra, bool non_match,
3596 re_bitset_ptr_t sbcset;
3597 #ifdef RE_ENABLE_I18N
3598 re_charset_t *mbcset;
3600 #endif /* not RE_ENABLE_I18N */
3602 re_token_t br_token;
3605 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3606 #ifdef RE_ENABLE_I18N
3607 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3608 #endif /* RE_ENABLE_I18N */
3610 #ifdef RE_ENABLE_I18N
3611 if (BE (sbcset == NULL || mbcset == NULL, 0))
3612 #else /* not RE_ENABLE_I18N */
3613 if (BE (sbcset == NULL, 0))
3614 #endif /* not RE_ENABLE_I18N */
3622 #ifdef RE_ENABLE_I18N
3623 mbcset->non_match = 1;
3624 #endif /* not RE_ENABLE_I18N */
3627 /* We don't care the syntax in this case. */
3628 ret = build_charclass (trans, sbcset,
3629 #ifdef RE_ENABLE_I18N
3631 #endif /* RE_ENABLE_I18N */
3634 if (BE (ret != REG_NOERROR, 0))
3637 #ifdef RE_ENABLE_I18N
3638 free_charset (mbcset);
3639 #endif /* RE_ENABLE_I18N */
3643 /* \w match '_' also. */
3644 for (; *extra; extra++)
3645 bitset_set (sbcset, *extra);
3647 /* If it is non-matching list. */
3649 bitset_not (sbcset);
3651 #ifdef RE_ENABLE_I18N
3652 /* Ensure only single byte characters are set. */
3653 if (dfa->mb_cur_max > 1)
3654 bitset_mask (sbcset, dfa->sb_char);
3657 /* Build a tree for simple bracket. */
3658 br_token.type = SIMPLE_BRACKET;
3659 br_token.opr.sbcset = sbcset;
3660 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3661 if (BE (tree == NULL, 0))
3662 goto build_word_op_espace;
3664 #ifdef RE_ENABLE_I18N
3665 if (dfa->mb_cur_max > 1)
3667 bin_tree_t *mbc_tree;
3668 /* Build a tree for complex bracket. */
3669 br_token.type = COMPLEX_BRACKET;
3670 br_token.opr.mbcset = mbcset;
3671 dfa->has_mb_node = 1;
3672 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3673 if (BE (mbc_tree == NULL, 0))
3674 goto build_word_op_espace;
3675 /* Then join them by ALT node. */
3676 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3677 if (BE (mbc_tree != NULL, 1))
3682 free_charset (mbcset);
3685 #else /* not RE_ENABLE_I18N */
3687 #endif /* not RE_ENABLE_I18N */
3689 build_word_op_espace:
3691 #ifdef RE_ENABLE_I18N
3692 free_charset (mbcset);
3693 #endif /* RE_ENABLE_I18N */
3698 /* This is intended for the expressions like "a{1,3}".
3699 Fetch a number from `input', and return the number.
3700 Return REG_MISSING if the number field is empty like "{,1}".
3701 Return REG_ERROR if an error occurred. */
3704 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3706 Idx num = REG_MISSING;
3710 fetch_token (token, input, syntax);
3712 if (BE (token->type == END_OF_RE, 0))
3714 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3716 num = ((token->type != CHARACTER || c < '0' || '9' < c
3717 || num == REG_ERROR)
3719 : ((num == REG_MISSING) ? c - '0' : num * 10 + c - '0'));
3720 num = (num > RE_DUP_MAX) ? REG_ERROR : num;
3725 #ifdef RE_ENABLE_I18N
3727 free_charset (re_charset_t *cset)
3729 re_free (cset->mbchars);
3731 re_free (cset->coll_syms);
3732 re_free (cset->equiv_classes);
3733 re_free (cset->range_starts);
3734 re_free (cset->range_ends);
3736 re_free (cset->char_classes);
3739 #endif /* RE_ENABLE_I18N */
3741 /* Functions for binary tree operation. */
3743 /* Create a tree node. */
3746 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3747 re_token_type_t type)
3751 return create_token_tree (dfa, left, right, &t);
3755 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3756 const re_token_t *token)
3759 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3761 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3763 if (storage == NULL)
3765 storage->next = dfa->str_tree_storage;
3766 dfa->str_tree_storage = storage;
3767 dfa->str_tree_storage_idx = 0;
3769 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3771 tree->parent = NULL;
3773 tree->right = right;
3774 tree->token = *token;
3775 tree->token.duplicated = 0;
3776 tree->token.opt_subexp = 0;
3779 tree->node_idx = REG_MISSING;
3782 left->parent = tree;
3784 right->parent = tree;
3788 /* Mark the tree SRC as an optional subexpression.
3789 To be called from preorder or postorder. */
3791 static reg_errcode_t
3792 mark_opt_subexp (void *extra, bin_tree_t *node)
3794 Idx idx = (Idx) (long) extra;
3795 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3796 node->token.opt_subexp = 1;
3801 /* Free the allocated memory inside NODE. */
3804 free_token (re_token_t *node)
3806 #ifdef RE_ENABLE_I18N
3807 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3808 free_charset (node->opr.mbcset);
3810 #endif /* RE_ENABLE_I18N */
3811 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3812 re_free (node->opr.sbcset);
3815 /* Worker function for tree walking. Free the allocated memory inside NODE
3816 and its children. */
3818 static reg_errcode_t
3819 free_tree (void *extra, bin_tree_t *node)
3821 free_token (&node->token);
3826 /* Duplicate the node SRC, and return new node. This is a preorder
3827 visit similar to the one implemented by the generic visitor, but
3828 we need more infrastructure to maintain two parallel trees --- so,
3829 it's easier to duplicate. */
3832 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3834 const bin_tree_t *node;
3835 bin_tree_t *dup_root;
3836 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3838 for (node = root; ; )
3840 /* Create a new tree and link it back to the current parent. */
3841 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3844 (*p_new)->parent = dup_node;
3845 (*p_new)->token.duplicated = 1;
3848 /* Go to the left node, or up and to the right. */
3852 p_new = &dup_node->left;
3856 const bin_tree_t *prev = NULL;
3857 while (node->right == prev || node->right == NULL)
3860 node = node->parent;
3861 dup_node = dup_node->parent;
3866 p_new = &dup_node->right;