1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License along
17 with this program; if not, write to the Free Software Foundation,
18 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
20 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
21 size_t length, reg_syntax_t syntax);
22 static void re_compile_fastmap_iter (regex_t *bufp,
23 const re_dfastate_t *init_state,
25 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
27 static void free_charset (re_charset_t *cset);
28 #endif /* RE_ENABLE_I18N */
29 static void free_workarea_compile (regex_t *preg);
30 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
32 static void optimize_utf8 (re_dfa_t *dfa);
34 static reg_errcode_t analyze (regex_t *preg);
35 static reg_errcode_t preorder (bin_tree_t *root,
36 reg_errcode_t (fn (void *, bin_tree_t *)),
38 static reg_errcode_t postorder (bin_tree_t *root,
39 reg_errcode_t (fn (void *, bin_tree_t *)),
41 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
42 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
43 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
45 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
46 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
47 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
48 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
49 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
50 unsigned int constraint);
51 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
52 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
54 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
55 static Idx fetch_number (re_string_t *input, re_token_t *token,
57 static int peek_token (re_token_t *token, re_string_t *input,
58 reg_syntax_t syntax) internal_function;
59 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
60 reg_syntax_t syntax, reg_errcode_t *err);
61 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
62 re_token_t *token, reg_syntax_t syntax,
63 Idx nest, reg_errcode_t *err);
64 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
65 re_token_t *token, reg_syntax_t syntax,
66 Idx nest, reg_errcode_t *err);
67 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
68 re_token_t *token, reg_syntax_t syntax,
69 Idx nest, reg_errcode_t *err);
70 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
71 re_token_t *token, reg_syntax_t syntax,
72 Idx nest, reg_errcode_t *err);
73 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
74 re_dfa_t *dfa, re_token_t *token,
75 reg_syntax_t syntax, reg_errcode_t *err);
76 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
77 re_token_t *token, reg_syntax_t syntax,
79 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
81 re_token_t *token, int token_len,
85 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
89 static reg_errcode_t build_equiv_class (bitset_t sbcset,
91 Idx *equiv_class_alloc,
92 const unsigned char *name);
93 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
96 Idx *char_class_alloc,
97 const unsigned char *class_name,
99 #else /* not RE_ENABLE_I18N */
100 static reg_errcode_t build_equiv_class (bitset_t sbcset,
101 const unsigned char *name);
102 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
104 const unsigned char *class_name,
105 reg_syntax_t syntax);
106 #endif /* not RE_ENABLE_I18N */
107 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
108 RE_TRANSLATE_TYPE trans,
109 const unsigned char *class_name,
110 const unsigned char *extra,
111 bool non_match, reg_errcode_t *err);
112 static bin_tree_t *create_tree (re_dfa_t *dfa,
113 bin_tree_t *left, bin_tree_t *right,
114 re_token_type_t type);
115 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
116 bin_tree_t *left, bin_tree_t *right,
117 const re_token_t *token);
118 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
119 static void free_token (re_token_t *node);
120 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
121 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
123 /* This table gives an error message for each of the error codes listed
124 in regex.h. Obviously the order here has to be same as there.
125 POSIX doesn't require that we do anything for REG_NOERROR,
126 but why not be nice? */
128 static const char __re_error_msgid[] =
130 #define REG_NOERROR_IDX 0
131 gettext_noop ("Success") /* REG_NOERROR */
133 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
134 gettext_noop ("No match") /* REG_NOMATCH */
136 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
137 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
139 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
140 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
142 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
143 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
145 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
146 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
148 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
149 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
151 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
152 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
154 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
155 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
157 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
158 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
160 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
161 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
163 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
164 gettext_noop ("Invalid range end") /* REG_ERANGE */
166 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
167 gettext_noop ("Memory exhausted") /* REG_ESPACE */
169 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
170 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
172 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
173 gettext_noop ("Premature end of regular expression") /* REG_EEND */
175 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
176 gettext_noop ("Regular expression too big") /* REG_ESIZE */
178 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
179 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
182 static const size_t __re_error_msgid_idx[] =
203 /* Entry points for GNU code. */
205 /* re_compile_pattern is the GNU regular expression compiler: it
206 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
207 Returns 0 if the pattern was valid, otherwise an error string.
209 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
210 are set in BUFP on entry. */
214 re_compile_pattern (pattern, length, bufp)
217 struct re_pattern_buffer *bufp;
218 #else /* size_t might promote */
220 re_compile_pattern (const char *pattern, size_t length,
221 struct re_pattern_buffer *bufp)
226 /* And GNU code determines whether or not to get register information
227 by passing null for the REGS argument to re_match, etc., not by
228 setting no_sub, unless RE_NO_SUB is set. */
229 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
231 /* Match anchors at newline. */
232 bufp->newline_anchor = 1;
234 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
238 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
241 weak_alias (__re_compile_pattern, re_compile_pattern)
244 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
245 also be assigned to arbitrarily: each pattern buffer stores its own
246 syntax, so it can be changed between regex compilations. */
247 /* This has no initializer because initialized variables in Emacs
248 become read-only after dumping. */
249 reg_syntax_t re_syntax_options;
252 /* Specify the precise syntax of regexps for compilation. This provides
253 for compatibility for various utilities which historically have
254 different, incompatible syntaxes.
256 The argument SYNTAX is a bit mask comprised of the various bits
257 defined in regex.h. We return the old syntax. */
260 re_set_syntax (syntax)
263 reg_syntax_t ret = re_syntax_options;
265 re_syntax_options = syntax;
269 weak_alias (__re_set_syntax, re_set_syntax)
273 re_compile_fastmap (bufp)
274 struct re_pattern_buffer *bufp;
276 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
277 char *fastmap = bufp->fastmap;
279 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
280 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
281 if (dfa->init_state != dfa->init_state_word)
282 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
283 if (dfa->init_state != dfa->init_state_nl)
284 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
285 if (dfa->init_state != dfa->init_state_begbuf)
286 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
287 bufp->fastmap_accurate = 1;
291 weak_alias (__re_compile_fastmap, re_compile_fastmap)
295 __attribute ((always_inline))
296 re_set_fastmap (char *fastmap, bool icase, int ch)
300 fastmap[tolower (ch)] = 1;
303 /* Helper function for re_compile_fastmap.
304 Compile fastmap for the initial_state INIT_STATE. */
307 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
310 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
312 bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
313 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
315 Idx node = init_state->nodes.elems[node_cnt];
316 re_token_type_t type = dfa->nodes[node].type;
318 if (type == CHARACTER)
320 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
321 #ifdef RE_ENABLE_I18N
322 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
324 unsigned char buf[MB_LEN_MAX];
330 *p++ = dfa->nodes[node].opr.c;
331 while (++node < dfa->nodes_len
332 && dfa->nodes[node].type == CHARACTER
333 && dfa->nodes[node].mb_partial)
334 *p++ = dfa->nodes[node].opr.c;
335 memset (&state, '\0', sizeof (state));
336 if (mbrtowc (&wc, (const char *) buf, p - buf,
338 && (__wcrtomb ((char *) buf, towlower (wc), &state)
340 re_set_fastmap (fastmap, false, buf[0]);
344 else if (type == SIMPLE_BRACKET)
347 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
350 bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
351 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
352 if (w & ((bitset_word_t) 1 << j))
353 re_set_fastmap (fastmap, icase, ch);
356 #ifdef RE_ENABLE_I18N
357 else if (type == COMPLEX_BRACKET)
360 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
361 if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
362 || cset->nranges || cset->nchar_classes)
365 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
367 /* In this case we want to catch the bytes which are
368 the first byte of any collation elements.
369 e.g. In da_DK, we want to catch 'a' since "aa"
370 is a valid collation element, and don't catch
371 'b' since 'b' is the only collation element
372 which starts from 'b'. */
373 const int32_t *table = (const int32_t *)
374 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
375 for (i = 0; i < SBC_MAX; ++i)
377 re_set_fastmap (fastmap, icase, i);
380 if (dfa->mb_cur_max > 1)
381 for (i = 0; i < SBC_MAX; ++i)
382 if (__btowc (i) == WEOF)
383 re_set_fastmap (fastmap, icase, i);
384 # endif /* not _LIBC */
386 for (i = 0; i < cset->nmbchars; ++i)
390 memset (&state, '\0', sizeof (state));
391 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
392 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
393 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
395 if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
397 re_set_fastmap (fastmap, false, *(unsigned char *) buf);
401 #endif /* RE_ENABLE_I18N */
402 else if (type == OP_PERIOD
403 #ifdef RE_ENABLE_I18N
404 || type == OP_UTF8_PERIOD
405 #endif /* RE_ENABLE_I18N */
406 || type == END_OF_RE)
408 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
409 if (type == END_OF_RE)
410 bufp->can_be_null = 1;
416 /* Entry point for POSIX code. */
417 /* regcomp takes a regular expression as a string and compiles it.
419 PREG is a regex_t *. We do not expect any fields to be initialized,
420 since POSIX says we shouldn't. Thus, we set
422 `buffer' to the compiled pattern;
423 `used' to the length of the compiled pattern;
424 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
425 REG_EXTENDED bit in CFLAGS is set; otherwise, to
426 RE_SYNTAX_POSIX_BASIC;
427 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
428 `fastmap' to an allocated space for the fastmap;
429 `fastmap_accurate' to zero;
430 `re_nsub' to the number of subexpressions in PATTERN.
432 PATTERN is the address of the pattern string.
434 CFLAGS is a series of bits which affect compilation.
436 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
437 use POSIX basic syntax.
439 If REG_NEWLINE is set, then . and [^...] don't match newline.
440 Also, regexec will try a match beginning after every newline.
442 If REG_ICASE is set, then we considers upper- and lowercase
443 versions of letters to be equivalent when matching.
445 If REG_NOSUB is set, then when PREG is passed to regexec, that
446 routine will report only success or failure, and nothing about the
449 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
450 the return codes and their meanings.) */
453 regcomp (preg, pattern, cflags)
454 regex_t *_Restrict_ preg;
455 const char *_Restrict_ pattern;
459 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
460 : RE_SYNTAX_POSIX_BASIC);
466 /* Try to allocate space for the fastmap. */
467 preg->fastmap = re_malloc (char, SBC_MAX);
468 if (BE (preg->fastmap == NULL, 0))
471 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
473 /* If REG_NEWLINE is set, newlines are treated differently. */
474 if (cflags & REG_NEWLINE)
475 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
476 syntax &= ~RE_DOT_NEWLINE;
477 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
478 /* It also changes the matching behavior. */
479 preg->newline_anchor = 1;
482 preg->newline_anchor = 0;
483 preg->no_sub = !!(cflags & REG_NOSUB);
484 preg->translate = NULL;
486 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
488 /* POSIX doesn't distinguish between an unmatched open-group and an
489 unmatched close-group: both are REG_EPAREN. */
490 if (ret == REG_ERPAREN)
493 /* We have already checked preg->fastmap != NULL. */
494 if (BE (ret == REG_NOERROR, 1))
495 /* Compute the fastmap now, since regexec cannot modify the pattern
496 buffer. This function never fails in this implementation. */
497 (void) re_compile_fastmap (preg);
500 /* Some error occurred while compiling the expression. */
501 re_free (preg->fastmap);
502 preg->fastmap = NULL;
508 weak_alias (__regcomp, regcomp)
511 /* Returns a message corresponding to an error code, ERRCODE, returned
512 from either regcomp or regexec. We don't use PREG here. */
516 regerror (errcode, preg, errbuf, errbuf_size)
518 const regex_t *_Restrict_ preg;
519 char *_Restrict_ errbuf;
521 #else /* size_t might promote */
523 regerror (int errcode, const regex_t *_Restrict_ preg,
524 char *_Restrict_ errbuf, size_t errbuf_size)
531 || errcode >= (int) (sizeof (__re_error_msgid_idx)
532 / sizeof (__re_error_msgid_idx[0])), 0))
533 /* Only error codes returned by the rest of the code should be passed
534 to this routine. If we are given anything else, or if other regex
535 code generates an invalid error code, then the program has a bug.
536 Dump core so we can fix it. */
539 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
541 msg_size = strlen (msg) + 1; /* Includes the null. */
543 if (BE (errbuf_size != 0, 1))
545 size_t cpy_size = msg_size;
546 if (BE (msg_size > errbuf_size, 0))
548 cpy_size = errbuf_size - 1;
549 errbuf[cpy_size] = '\0';
551 memcpy (errbuf, msg, cpy_size);
557 weak_alias (__regerror, regerror)
561 #ifdef RE_ENABLE_I18N
562 /* This static array is used for the map to single-byte characters when
563 UTF-8 is used. Otherwise we would allocate memory just to initialize
564 it the same all the time. UTF-8 is the preferred encoding so this is
565 a worthwhile optimization. */
566 static const bitset_t utf8_sb_map =
568 /* Set the first 128 bits. */
569 # if 4 * BITSET_WORD_BITS < ASCII_CHARS
570 # error "bitset_word_t is narrower than 32 bits"
571 # elif 3 * BITSET_WORD_BITS < ASCII_CHARS
572 BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
573 # elif 2 * BITSET_WORD_BITS < ASCII_CHARS
574 BITSET_WORD_MAX, BITSET_WORD_MAX,
575 # elif 1 * BITSET_WORD_BITS < ASCII_CHARS
579 >> (SBC_MAX % BITSET_WORD_BITS == 0
581 : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
587 free_dfa_content (re_dfa_t *dfa)
592 for (i = 0; i < dfa->nodes_len; ++i)
593 free_token (dfa->nodes + i);
594 re_free (dfa->nexts);
595 for (i = 0; i < dfa->nodes_len; ++i)
597 if (dfa->eclosures != NULL)
598 re_node_set_free (dfa->eclosures + i);
599 if (dfa->inveclosures != NULL)
600 re_node_set_free (dfa->inveclosures + i);
601 if (dfa->edests != NULL)
602 re_node_set_free (dfa->edests + i);
604 re_free (dfa->edests);
605 re_free (dfa->eclosures);
606 re_free (dfa->inveclosures);
607 re_free (dfa->nodes);
609 if (dfa->state_table)
610 for (i = 0; i <= dfa->state_hash_mask; ++i)
612 struct re_state_table_entry *entry = dfa->state_table + i;
613 for (j = 0; j < entry->num; ++j)
615 re_dfastate_t *state = entry->array[j];
618 re_free (entry->array);
620 re_free (dfa->state_table);
621 #ifdef RE_ENABLE_I18N
622 if (dfa->sb_char != utf8_sb_map)
623 re_free (dfa->sb_char);
625 re_free (dfa->subexp_map);
627 re_free (dfa->re_str);
634 /* Free dynamically allocated space used by PREG. */
640 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
641 if (BE (dfa != NULL, 1))
642 free_dfa_content (dfa);
646 re_free (preg->fastmap);
647 preg->fastmap = NULL;
649 re_free (preg->translate);
650 preg->translate = NULL;
653 weak_alias (__regfree, regfree)
656 /* Entry points compatible with 4.2 BSD regex library. We don't define
657 them unless specifically requested. */
659 #if defined _REGEX_RE_COMP || defined _LIBC
661 /* BSD has one and only one pattern buffer. */
662 static struct re_pattern_buffer re_comp_buf;
666 /* Make these definitions weak in libc, so POSIX programs can redefine
667 these names if they don't use our functions, and still use
668 regcomp/regexec above without link errors. */
679 if (!re_comp_buf.buffer)
680 return gettext ("No previous regular expression");
684 if (re_comp_buf.buffer)
686 fastmap = re_comp_buf.fastmap;
687 re_comp_buf.fastmap = NULL;
688 __regfree (&re_comp_buf);
689 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
690 re_comp_buf.fastmap = fastmap;
693 if (re_comp_buf.fastmap == NULL)
695 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
696 if (re_comp_buf.fastmap == NULL)
697 return (char *) gettext (__re_error_msgid
698 + __re_error_msgid_idx[(int) REG_ESPACE]);
701 /* Since `re_exec' always passes NULL for the `regs' argument, we
702 don't need to initialize the pattern buffer fields which affect it. */
704 /* Match anchors at newlines. */
705 re_comp_buf.newline_anchor = 1;
707 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
712 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
713 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
717 libc_freeres_fn (free_mem)
719 __regfree (&re_comp_buf);
723 #endif /* _REGEX_RE_COMP */
725 /* Internal entry point.
726 Compile the regular expression PATTERN, whose length is LENGTH.
727 SYNTAX indicate regular expression's syntax. */
730 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
733 reg_errcode_t err = REG_NOERROR;
737 /* Initialize the pattern buffer. */
738 preg->fastmap_accurate = 0;
739 preg->syntax = syntax;
740 preg->not_bol = preg->not_eol = 0;
743 preg->can_be_null = 0;
744 preg->regs_allocated = REGS_UNALLOCATED;
746 /* Initialize the dfa. */
747 dfa = (re_dfa_t *) preg->buffer;
748 if (BE (preg->allocated < sizeof (re_dfa_t), 0))
750 /* If zero allocated, but buffer is non-null, try to realloc
751 enough space. This loses if buffer's address is bogus, but
752 that is the user's responsibility. If ->buffer is NULL this
753 is a simple allocation. */
754 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
757 preg->allocated = sizeof (re_dfa_t);
758 preg->buffer = (unsigned char *) dfa;
760 preg->used = sizeof (re_dfa_t);
762 err = init_dfa (dfa, length);
763 if (BE (err != REG_NOERROR, 0))
765 free_dfa_content (dfa);
771 /* Note: length+1 will not overflow since it is checked in init_dfa. */
772 dfa->re_str = re_malloc (char, length + 1);
773 strncpy (dfa->re_str, pattern, length + 1);
776 __libc_lock_init (dfa->lock);
778 err = re_string_construct (®exp, pattern, length, preg->translate,
779 (syntax & RE_ICASE) != 0, dfa);
780 if (BE (err != REG_NOERROR, 0))
782 re_compile_internal_free_return:
783 free_workarea_compile (preg);
784 re_string_destruct (®exp);
785 free_dfa_content (dfa);
791 /* Parse the regular expression, and build a structure tree. */
793 dfa->str_tree = parse (®exp, preg, syntax, &err);
794 if (BE (dfa->str_tree == NULL, 0))
795 goto re_compile_internal_free_return;
797 /* Analyze the tree and create the nfa. */
798 err = analyze (preg);
799 if (BE (err != REG_NOERROR, 0))
800 goto re_compile_internal_free_return;
802 #ifdef RE_ENABLE_I18N
803 /* If possible, do searching in single byte encoding to speed things up. */
804 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
808 /* Then create the initial state of the dfa. */
809 err = create_initial_state (dfa);
811 /* Release work areas. */
812 free_workarea_compile (preg);
813 re_string_destruct (®exp);
815 if (BE (err != REG_NOERROR, 0))
817 free_dfa_content (dfa);
825 /* Initialize DFA. We use the length of the regular expression PAT_LEN
826 as the initial length of some arrays. */
829 init_dfa (re_dfa_t *dfa, size_t pat_len)
831 __re_size_t table_size;
832 #ifdef RE_ENABLE_I18N
833 size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
835 size_t max_i18n_object_size = 0;
837 size_t max_object_size =
838 MAX (sizeof (struct re_state_table_entry),
839 MAX (sizeof (re_token_t),
840 MAX (sizeof (re_node_set),
841 MAX (sizeof (regmatch_t),
842 max_i18n_object_size))));
844 memset (dfa, '\0', sizeof (re_dfa_t));
846 /* Force allocation of str_tree_storage the first time. */
847 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
849 /* Avoid overflows. The extra "/ 2" is for the table_size doubling
850 calculation below, and for similar doubling calculations
851 elsewhere. And it's <= rather than <, because some of the
852 doubling calculations add 1 afterwards. */
853 if (BE (SIZE_MAX / max_object_size / 2 <= pat_len, 0))
856 dfa->nodes_alloc = pat_len + 1;
857 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
859 /* table_size = 2 ^ ceil(log pat_len) */
860 for (table_size = 1; ; table_size <<= 1)
861 if (table_size > pat_len)
864 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
865 dfa->state_hash_mask = table_size - 1;
867 dfa->mb_cur_max = MB_CUR_MAX;
869 if (dfa->mb_cur_max == 6
870 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
872 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
875 if (strcmp (locale_charset (), "UTF-8") == 0)
878 /* We check exhaustively in the loop below if this charset is a
879 superset of ASCII. */
880 dfa->map_notascii = 0;
883 #ifdef RE_ENABLE_I18N
884 if (dfa->mb_cur_max > 1)
887 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
892 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
893 if (BE (dfa->sb_char == NULL, 0))
896 /* Set the bits corresponding to single byte chars. */
897 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
898 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
900 wint_t wch = __btowc (ch);
902 dfa->sb_char[i] |= (bitset_word_t) 1 << j;
904 if (isascii (ch) && wch != ch)
905 dfa->map_notascii = 1;
912 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
917 /* Initialize WORD_CHAR table, which indicate which character is
918 "word". In this case "word" means that it is the word construction
919 character used by some operators like "\<", "\>", etc. */
923 init_word_char (re_dfa_t *dfa)
926 dfa->word_ops_used = 1;
927 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
928 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
929 if (isalnum (ch) || ch == '_')
930 dfa->word_char[i] |= (bitset_word_t) 1 << j;
933 /* Free the work area which are only used while compiling. */
936 free_workarea_compile (regex_t *preg)
938 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
939 bin_tree_storage_t *storage, *next;
940 for (storage = dfa->str_tree_storage; storage; storage = next)
942 next = storage->next;
945 dfa->str_tree_storage = NULL;
946 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
947 dfa->str_tree = NULL;
948 re_free (dfa->org_indices);
949 dfa->org_indices = NULL;
952 /* Create initial states for all contexts. */
955 create_initial_state (re_dfa_t *dfa)
959 re_node_set init_nodes;
961 /* Initial states have the epsilon closure of the node which is
962 the first node of the regular expression. */
963 first = dfa->str_tree->first->node_idx;
964 dfa->init_node = first;
965 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
966 if (BE (err != REG_NOERROR, 0))
969 /* The back-references which are in initial states can epsilon transit,
970 since in this case all of the subexpressions can be null.
971 Then we add epsilon closures of the nodes which are the next nodes of
972 the back-references. */
973 if (dfa->nbackref > 0)
974 for (i = 0; i < init_nodes.nelem; ++i)
976 Idx node_idx = init_nodes.elems[i];
977 re_token_type_t type = dfa->nodes[node_idx].type;
980 if (type != OP_BACK_REF)
982 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
984 re_token_t *clexp_node;
985 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
986 if (clexp_node->type == OP_CLOSE_SUBEXP
987 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
990 if (clexp_idx == init_nodes.nelem)
993 if (type == OP_BACK_REF)
995 Idx dest_idx = dfa->edests[node_idx].elems[0];
996 if (!re_node_set_contains (&init_nodes, dest_idx))
998 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1004 /* It must be the first time to invoke acquire_state. */
1005 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1006 /* We don't check ERR here, since the initial state must not be NULL. */
1007 if (BE (dfa->init_state == NULL, 0))
1009 if (dfa->init_state->has_constraint)
1011 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1013 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1015 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1019 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1020 || dfa->init_state_begbuf == NULL, 0))
1024 dfa->init_state_word = dfa->init_state_nl
1025 = dfa->init_state_begbuf = dfa->init_state;
1027 re_node_set_free (&init_nodes);
1031 #ifdef RE_ENABLE_I18N
1032 /* If it is possible to do searching in single byte encoding instead of UTF-8
1033 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1034 DFA nodes where needed. */
1037 optimize_utf8 (re_dfa_t *dfa)
1041 bool mb_chars = false;
1042 bool has_period = false;
1044 for (node = 0; node < dfa->nodes_len; ++node)
1045 switch (dfa->nodes[node].type)
1048 if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1052 switch (dfa->nodes[node].opr.ctx_type)
1060 /* Word anchors etc. cannot be handled. It's okay to test
1061 opr.ctx_type since constraints (for all DFA nodes) are
1062 created by ORing one or more opr.ctx_type values. */
1072 case OP_DUP_ASTERISK:
1073 case OP_OPEN_SUBEXP:
1074 case OP_CLOSE_SUBEXP:
1076 case COMPLEX_BRACKET:
1078 case SIMPLE_BRACKET:
1079 /* Just double check. */
1081 int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1083 : BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1084 for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1086 if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1096 if (mb_chars || has_period)
1097 for (node = 0; node < dfa->nodes_len; ++node)
1099 if (dfa->nodes[node].type == CHARACTER
1100 && dfa->nodes[node].opr.c >= ASCII_CHARS)
1101 dfa->nodes[node].mb_partial = 0;
1102 else if (dfa->nodes[node].type == OP_PERIOD)
1103 dfa->nodes[node].type = OP_UTF8_PERIOD;
1106 /* The search can be in single byte locale. */
1107 dfa->mb_cur_max = 1;
1109 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1113 /* Analyze the structure tree, and calculate "first", "next", "edest",
1114 "eclosure", and "inveclosure". */
1116 static reg_errcode_t
1117 analyze (regex_t *preg)
1119 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1122 /* Allocate arrays. */
1123 dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1124 dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1125 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1126 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1127 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1128 || dfa->eclosures == NULL, 0))
1131 dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1132 if (dfa->subexp_map != NULL)
1135 for (i = 0; i < preg->re_nsub; i++)
1136 dfa->subexp_map[i] = i;
1137 preorder (dfa->str_tree, optimize_subexps, dfa);
1138 for (i = 0; i < preg->re_nsub; i++)
1139 if (dfa->subexp_map[i] != i)
1141 if (i == preg->re_nsub)
1143 free (dfa->subexp_map);
1144 dfa->subexp_map = NULL;
1148 ret = postorder (dfa->str_tree, lower_subexps, preg);
1149 if (BE (ret != REG_NOERROR, 0))
1151 ret = postorder (dfa->str_tree, calc_first, dfa);
1152 if (BE (ret != REG_NOERROR, 0))
1154 preorder (dfa->str_tree, calc_next, dfa);
1155 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1156 if (BE (ret != REG_NOERROR, 0))
1158 ret = calc_eclosure (dfa);
1159 if (BE (ret != REG_NOERROR, 0))
1162 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1163 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1164 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1167 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1168 if (BE (dfa->inveclosures == NULL, 0))
1170 ret = calc_inveclosure (dfa);
1176 /* Our parse trees are very unbalanced, so we cannot use a stack to
1177 implement parse tree visits. Instead, we use parent pointers and
1178 some hairy code in these two functions. */
1179 static reg_errcode_t
1180 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1183 bin_tree_t *node, *prev;
1185 for (node = root; ; )
1187 /* Descend down the tree, preferably to the left (or to the right
1188 if that's the only child). */
1189 while (node->left || node->right)
1197 reg_errcode_t err = fn (extra, node);
1198 if (BE (err != REG_NOERROR, 0))
1200 if (node->parent == NULL)
1203 node = node->parent;
1205 /* Go up while we have a node that is reached from the right. */
1206 while (node->right == prev || node->right == NULL);
1211 static reg_errcode_t
1212 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1217 for (node = root; ; )
1219 reg_errcode_t err = fn (extra, node);
1220 if (BE (err != REG_NOERROR, 0))
1223 /* Go to the left node, or up and to the right. */
1228 bin_tree_t *prev = NULL;
1229 while (node->right == prev || node->right == NULL)
1232 node = node->parent;
1241 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1242 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1243 backreferences as well. Requires a preorder visit. */
1244 static reg_errcode_t
1245 optimize_subexps (void *extra, bin_tree_t *node)
1247 re_dfa_t *dfa = (re_dfa_t *) extra;
1249 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1251 int idx = node->token.opr.idx;
1252 node->token.opr.idx = dfa->subexp_map[idx];
1253 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1256 else if (node->token.type == SUBEXP
1257 && node->left && node->left->token.type == SUBEXP)
1259 Idx other_idx = node->left->token.opr.idx;
1261 node->left = node->left->left;
1263 node->left->parent = node;
1265 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1266 if (other_idx < BITSET_WORD_BITS)
1267 dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1273 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1274 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1275 static reg_errcode_t
1276 lower_subexps (void *extra, bin_tree_t *node)
1278 regex_t *preg = (regex_t *) extra;
1279 reg_errcode_t err = REG_NOERROR;
1281 if (node->left && node->left->token.type == SUBEXP)
1283 node->left = lower_subexp (&err, preg, node->left);
1285 node->left->parent = node;
1287 if (node->right && node->right->token.type == SUBEXP)
1289 node->right = lower_subexp (&err, preg, node->right);
1291 node->right->parent = node;
1298 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1300 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1301 bin_tree_t *body = node->left;
1302 bin_tree_t *op, *cls, *tree1, *tree;
1305 /* We do not optimize empty subexpressions, because otherwise we may
1306 have bad CONCAT nodes with NULL children. This is obviously not
1307 very common, so we do not lose much. An example that triggers
1308 this case is the sed "script" /\(\)/x. */
1309 && node->left != NULL
1310 && (node->token.opr.idx >= BITSET_WORD_BITS
1311 || !(dfa->used_bkref_map
1312 & ((bitset_word_t) 1 << node->token.opr.idx))))
1315 /* Convert the SUBEXP node to the concatenation of an
1316 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1317 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1318 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1319 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1320 tree = create_tree (dfa, op, tree1, CONCAT);
1321 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1327 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1328 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1332 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1333 nodes. Requires a postorder visit. */
1334 static reg_errcode_t
1335 calc_first (void *extra, bin_tree_t *node)
1337 re_dfa_t *dfa = (re_dfa_t *) extra;
1338 if (node->token.type == CONCAT)
1340 node->first = node->left->first;
1341 node->node_idx = node->left->node_idx;
1346 node->node_idx = re_dfa_add_node (dfa, node->token);
1347 if (BE (node->node_idx == REG_MISSING, 0))
1349 if (node->token.type == ANCHOR)
1350 dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1355 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1356 static reg_errcode_t
1357 calc_next (void *extra, bin_tree_t *node)
1359 switch (node->token.type)
1361 case OP_DUP_ASTERISK:
1362 node->left->next = node;
1365 node->left->next = node->right->first;
1366 node->right->next = node->next;
1370 node->left->next = node->next;
1372 node->right->next = node->next;
1378 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1379 static reg_errcode_t
1380 link_nfa_nodes (void *extra, bin_tree_t *node)
1382 re_dfa_t *dfa = (re_dfa_t *) extra;
1383 Idx idx = node->node_idx;
1384 reg_errcode_t err = REG_NOERROR;
1386 switch (node->token.type)
1392 assert (node->next == NULL);
1395 case OP_DUP_ASTERISK:
1399 dfa->has_plural_match = 1;
1400 if (node->left != NULL)
1401 left = node->left->first->node_idx;
1403 left = node->next->node_idx;
1404 if (node->right != NULL)
1405 right = node->right->first->node_idx;
1407 right = node->next->node_idx;
1408 assert (REG_VALID_INDEX (left));
1409 assert (REG_VALID_INDEX (right));
1410 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1415 case OP_OPEN_SUBEXP:
1416 case OP_CLOSE_SUBEXP:
1417 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1421 dfa->nexts[idx] = node->next->node_idx;
1422 if (node->token.type == OP_BACK_REF)
1423 re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1427 assert (!IS_EPSILON_NODE (node->token.type));
1428 dfa->nexts[idx] = node->next->node_idx;
1435 /* Duplicate the epsilon closure of the node ROOT_NODE.
1436 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1437 to their own constraint. */
1439 static reg_errcode_t
1441 duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1442 Idx root_node, unsigned int init_constraint)
1444 Idx org_node, clone_node;
1446 unsigned int constraint = init_constraint;
1447 for (org_node = top_org_node, clone_node = top_clone_node;;)
1449 Idx org_dest, clone_dest;
1450 if (dfa->nodes[org_node].type == OP_BACK_REF)
1452 /* If the back reference epsilon-transit, its destination must
1453 also have the constraint. Then duplicate the epsilon closure
1454 of the destination of the back reference, and store it in
1455 edests of the back reference. */
1456 org_dest = dfa->nexts[org_node];
1457 re_node_set_empty (dfa->edests + clone_node);
1458 clone_dest = duplicate_node (dfa, org_dest, constraint);
1459 if (BE (clone_dest == REG_MISSING, 0))
1461 dfa->nexts[clone_node] = dfa->nexts[org_node];
1462 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1466 else if (dfa->edests[org_node].nelem == 0)
1468 /* In case of the node can't epsilon-transit, don't duplicate the
1469 destination and store the original destination as the
1470 destination of the node. */
1471 dfa->nexts[clone_node] = dfa->nexts[org_node];
1474 else if (dfa->edests[org_node].nelem == 1)
1476 /* In case of the node can epsilon-transit, and it has only one
1478 org_dest = dfa->edests[org_node].elems[0];
1479 re_node_set_empty (dfa->edests + clone_node);
1480 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1481 /* If the node is root_node itself, it means the epsilon closure
1482 has a loop. Then tie it to the destination of the root_node. */
1483 if (org_node == root_node && clone_node != org_node)
1485 ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1490 /* In case the node has another constraint, append it. */
1491 constraint |= dfa->nodes[org_node].constraint;
1492 clone_dest = duplicate_node (dfa, org_dest, constraint);
1493 if (BE (clone_dest == REG_MISSING, 0))
1495 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1499 else /* dfa->edests[org_node].nelem == 2 */
1501 /* In case of the node can epsilon-transit, and it has two
1502 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1503 org_dest = dfa->edests[org_node].elems[0];
1504 re_node_set_empty (dfa->edests + clone_node);
1505 /* Search for a duplicated node which satisfies the constraint. */
1506 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1507 if (clone_dest == REG_MISSING)
1509 /* There is no such duplicated node, create a new one. */
1511 clone_dest = duplicate_node (dfa, org_dest, constraint);
1512 if (BE (clone_dest == REG_MISSING, 0))
1514 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1517 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1518 root_node, constraint);
1519 if (BE (err != REG_NOERROR, 0))
1524 /* There is a duplicated node which satisfy the constraint,
1525 use it to avoid infinite loop. */
1526 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1531 org_dest = dfa->edests[org_node].elems[1];
1532 clone_dest = duplicate_node (dfa, org_dest, constraint);
1533 if (BE (clone_dest == REG_MISSING, 0))
1535 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1539 org_node = org_dest;
1540 clone_node = clone_dest;
1545 /* Search for a node which is duplicated from the node ORG_NODE, and
1546 satisfies the constraint CONSTRAINT. */
1549 search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1550 unsigned int constraint)
1553 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1555 if (org_node == dfa->org_indices[idx]
1556 && constraint == dfa->nodes[idx].constraint)
1557 return idx; /* Found. */
1559 return REG_MISSING; /* Not found. */
1562 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1563 Return the index of the new node, or REG_MISSING if insufficient storage is
1567 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1569 Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1570 if (BE (dup_idx != REG_MISSING, 1))
1572 dfa->nodes[dup_idx].constraint = constraint;
1573 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1574 dfa->nodes[dup_idx].duplicated = 1;
1576 /* Store the index of the original node. */
1577 dfa->org_indices[dup_idx] = org_idx;
1582 static reg_errcode_t
1583 calc_inveclosure (re_dfa_t *dfa)
1587 for (idx = 0; idx < dfa->nodes_len; ++idx)
1588 re_node_set_init_empty (dfa->inveclosures + idx);
1590 for (src = 0; src < dfa->nodes_len; ++src)
1592 Idx *elems = dfa->eclosures[src].elems;
1593 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1595 ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1604 /* Calculate "eclosure" for all the node in DFA. */
1606 static reg_errcode_t
1607 calc_eclosure (re_dfa_t *dfa)
1612 assert (dfa->nodes_len > 0);
1615 /* For each nodes, calculate epsilon closure. */
1616 for (node_idx = 0; ; ++node_idx)
1619 re_node_set eclosure_elem;
1620 if (node_idx == dfa->nodes_len)
1629 assert (dfa->eclosures[node_idx].nelem != REG_MISSING);
1632 /* If we have already calculated, skip it. */
1633 if (dfa->eclosures[node_idx].nelem != 0)
1635 /* Calculate epsilon closure of `node_idx'. */
1636 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1637 if (BE (err != REG_NOERROR, 0))
1640 if (dfa->eclosures[node_idx].nelem == 0)
1643 re_node_set_free (&eclosure_elem);
1649 /* Calculate epsilon closure of NODE. */
1651 static reg_errcode_t
1652 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1658 re_node_set eclosure;
1660 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1661 if (BE (err != REG_NOERROR, 0))
1664 /* This indicates that we are calculating this node now.
1665 We reference this value to avoid infinite loop. */
1666 dfa->eclosures[node].nelem = REG_MISSING;
1668 /* If the current node has constraints, duplicate all nodes
1669 since they must inherit the constraints. */
1670 if (dfa->nodes[node].constraint
1671 && dfa->edests[node].nelem
1672 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1674 err = duplicate_node_closure (dfa, node, node, node,
1675 dfa->nodes[node].constraint);
1676 if (BE (err != REG_NOERROR, 0))
1680 /* Expand each epsilon destination nodes. */
1681 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1682 for (i = 0; i < dfa->edests[node].nelem; ++i)
1684 re_node_set eclosure_elem;
1685 Idx edest = dfa->edests[node].elems[i];
1686 /* If calculating the epsilon closure of `edest' is in progress,
1687 return intermediate result. */
1688 if (dfa->eclosures[edest].nelem == REG_MISSING)
1693 /* If we haven't calculated the epsilon closure of `edest' yet,
1694 calculate now. Otherwise use calculated epsilon closure. */
1695 if (dfa->eclosures[edest].nelem == 0)
1697 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1698 if (BE (err != REG_NOERROR, 0))
1702 eclosure_elem = dfa->eclosures[edest];
1703 /* Merge the epsilon closure of `edest'. */
1704 re_node_set_merge (&eclosure, &eclosure_elem);
1705 /* If the epsilon closure of `edest' is incomplete,
1706 the epsilon closure of this node is also incomplete. */
1707 if (dfa->eclosures[edest].nelem == 0)
1710 re_node_set_free (&eclosure_elem);
1714 /* Epsilon closures include itself. */
1715 ok = re_node_set_insert (&eclosure, node);
1718 if (incomplete && !root)
1719 dfa->eclosures[node].nelem = 0;
1721 dfa->eclosures[node] = eclosure;
1722 *new_set = eclosure;
1726 /* Functions for token which are used in the parser. */
1728 /* Fetch a token from INPUT.
1729 We must not use this function inside bracket expressions. */
1733 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1735 re_string_skip_bytes (input, peek_token (result, input, syntax));
1738 /* Peek a token from INPUT, and return the length of the token.
1739 We must not use this function inside bracket expressions. */
1743 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1747 if (re_string_eoi (input))
1749 token->type = END_OF_RE;
1753 c = re_string_peek_byte (input, 0);
1756 token->word_char = 0;
1757 #ifdef RE_ENABLE_I18N
1758 token->mb_partial = 0;
1759 if (input->mb_cur_max > 1 &&
1760 !re_string_first_byte (input, re_string_cur_idx (input)))
1762 token->type = CHARACTER;
1763 token->mb_partial = 1;
1770 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1772 token->type = BACK_SLASH;
1776 c2 = re_string_peek_byte_case (input, 1);
1778 token->type = CHARACTER;
1779 #ifdef RE_ENABLE_I18N
1780 if (input->mb_cur_max > 1)
1782 wint_t wc = re_string_wchar_at (input,
1783 re_string_cur_idx (input) + 1);
1784 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1788 token->word_char = IS_WORD_CHAR (c2) != 0;
1793 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1794 token->type = OP_ALT;
1796 case '1': case '2': case '3': case '4': case '5':
1797 case '6': case '7': case '8': case '9':
1798 if (!(syntax & RE_NO_BK_REFS))
1800 token->type = OP_BACK_REF;
1801 token->opr.idx = c2 - '1';
1805 if (!(syntax & RE_NO_GNU_OPS))
1807 token->type = ANCHOR;
1808 token->opr.ctx_type = WORD_FIRST;
1812 if (!(syntax & RE_NO_GNU_OPS))
1814 token->type = ANCHOR;
1815 token->opr.ctx_type = WORD_LAST;
1819 if (!(syntax & RE_NO_GNU_OPS))
1821 token->type = ANCHOR;
1822 token->opr.ctx_type = WORD_DELIM;
1826 if (!(syntax & RE_NO_GNU_OPS))
1828 token->type = ANCHOR;
1829 token->opr.ctx_type = NOT_WORD_DELIM;
1833 if (!(syntax & RE_NO_GNU_OPS))
1834 token->type = OP_WORD;
1837 if (!(syntax & RE_NO_GNU_OPS))
1838 token->type = OP_NOTWORD;
1841 if (!(syntax & RE_NO_GNU_OPS))
1842 token->type = OP_SPACE;
1845 if (!(syntax & RE_NO_GNU_OPS))
1846 token->type = OP_NOTSPACE;
1849 if (!(syntax & RE_NO_GNU_OPS))
1851 token->type = ANCHOR;
1852 token->opr.ctx_type = BUF_FIRST;
1856 if (!(syntax & RE_NO_GNU_OPS))
1858 token->type = ANCHOR;
1859 token->opr.ctx_type = BUF_LAST;
1863 if (!(syntax & RE_NO_BK_PARENS))
1864 token->type = OP_OPEN_SUBEXP;
1867 if (!(syntax & RE_NO_BK_PARENS))
1868 token->type = OP_CLOSE_SUBEXP;
1871 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1872 token->type = OP_DUP_PLUS;
1875 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1876 token->type = OP_DUP_QUESTION;
1879 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1880 token->type = OP_OPEN_DUP_NUM;
1883 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1884 token->type = OP_CLOSE_DUP_NUM;
1892 token->type = CHARACTER;
1893 #ifdef RE_ENABLE_I18N
1894 if (input->mb_cur_max > 1)
1896 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1897 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1901 token->word_char = IS_WORD_CHAR (token->opr.c);
1906 if (syntax & RE_NEWLINE_ALT)
1907 token->type = OP_ALT;
1910 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1911 token->type = OP_ALT;
1914 token->type = OP_DUP_ASTERISK;
1917 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1918 token->type = OP_DUP_PLUS;
1921 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1922 token->type = OP_DUP_QUESTION;
1925 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1926 token->type = OP_OPEN_DUP_NUM;
1929 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1930 token->type = OP_CLOSE_DUP_NUM;
1933 if (syntax & RE_NO_BK_PARENS)
1934 token->type = OP_OPEN_SUBEXP;
1937 if (syntax & RE_NO_BK_PARENS)
1938 token->type = OP_CLOSE_SUBEXP;
1941 token->type = OP_OPEN_BRACKET;
1944 token->type = OP_PERIOD;
1947 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1948 re_string_cur_idx (input) != 0)
1950 char prev = re_string_peek_byte (input, -1);
1951 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1954 token->type = ANCHOR;
1955 token->opr.ctx_type = LINE_FIRST;
1958 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1959 re_string_cur_idx (input) + 1 != re_string_length (input))
1962 re_string_skip_bytes (input, 1);
1963 peek_token (&next, input, syntax);
1964 re_string_skip_bytes (input, -1);
1965 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1968 token->type = ANCHOR;
1969 token->opr.ctx_type = LINE_LAST;
1977 /* Peek a token from INPUT, and return the length of the token.
1978 We must not use this function out of bracket expressions. */
1982 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1985 if (re_string_eoi (input))
1987 token->type = END_OF_RE;
1990 c = re_string_peek_byte (input, 0);
1993 #ifdef RE_ENABLE_I18N
1994 if (input->mb_cur_max > 1 &&
1995 !re_string_first_byte (input, re_string_cur_idx (input)))
1997 token->type = CHARACTER;
2000 #endif /* RE_ENABLE_I18N */
2002 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2003 && re_string_cur_idx (input) + 1 < re_string_length (input))
2005 /* In this case, '\' escape a character. */
2007 re_string_skip_bytes (input, 1);
2008 c2 = re_string_peek_byte (input, 0);
2010 token->type = CHARACTER;
2013 if (c == '[') /* '[' is a special char in a bracket exps. */
2017 if (re_string_cur_idx (input) + 1 < re_string_length (input))
2018 c2 = re_string_peek_byte (input, 1);
2026 token->type = OP_OPEN_COLL_ELEM;
2029 token->type = OP_OPEN_EQUIV_CLASS;
2032 if (syntax & RE_CHAR_CLASSES)
2034 token->type = OP_OPEN_CHAR_CLASS;
2037 /* else fall through. */
2039 token->type = CHARACTER;
2049 token->type = OP_CHARSET_RANGE;
2052 token->type = OP_CLOSE_BRACKET;
2055 token->type = OP_NON_MATCH_LIST;
2058 token->type = CHARACTER;
2063 /* Functions for parser. */
2065 /* Entry point of the parser.
2066 Parse the regular expression REGEXP and return the structure tree.
2067 If an error is occured, ERR is set by error code, and return NULL.
2068 This function build the following tree, from regular expression <reg_exp>:
2074 CAT means concatenation.
2075 EOR means end of regular expression. */
2078 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2081 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2082 bin_tree_t *tree, *eor, *root;
2083 re_token_t current_token;
2084 dfa->syntax = syntax;
2085 fetch_token (¤t_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2086 tree = parse_reg_exp (regexp, preg, ¤t_token, syntax, 0, err);
2087 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2089 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2091 root = create_tree (dfa, tree, eor, CONCAT);
2094 if (BE (eor == NULL || root == NULL, 0))
2102 /* This function build the following tree, from regular expression
2103 <branch1>|<branch2>:
2109 ALT means alternative, which represents the operator `|'. */
2112 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2113 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2115 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2116 bin_tree_t *tree, *branch = NULL;
2117 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2118 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2121 while (token->type == OP_ALT)
2123 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2124 if (token->type != OP_ALT && token->type != END_OF_RE
2125 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2127 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2128 if (BE (*err != REG_NOERROR && branch == NULL, 0))
2133 tree = create_tree (dfa, tree, branch, OP_ALT);
2134 if (BE (tree == NULL, 0))
2143 /* This function build the following tree, from regular expression
2150 CAT means concatenation. */
2153 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2154 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2156 bin_tree_t *tree, *expr;
2157 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2158 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2159 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2162 while (token->type != OP_ALT && token->type != END_OF_RE
2163 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2165 expr = parse_expression (regexp, preg, token, syntax, nest, err);
2166 if (BE (*err != REG_NOERROR && expr == NULL, 0))
2170 if (tree != NULL && expr != NULL)
2172 tree = create_tree (dfa, tree, expr, CONCAT);
2179 else if (tree == NULL)
2181 /* Otherwise expr == NULL, we don't need to create new tree. */
2186 /* This function build the following tree, from regular expression a*:
2193 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2194 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2196 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2198 switch (token->type)
2201 tree = create_token_tree (dfa, NULL, NULL, token);
2202 if (BE (tree == NULL, 0))
2207 #ifdef RE_ENABLE_I18N
2208 if (dfa->mb_cur_max > 1)
2210 while (!re_string_eoi (regexp)
2211 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2213 bin_tree_t *mbc_remain;
2214 fetch_token (token, regexp, syntax);
2215 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2216 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2217 if (BE (mbc_remain == NULL || tree == NULL, 0))
2226 case OP_OPEN_SUBEXP:
2227 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2228 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2231 case OP_OPEN_BRACKET:
2232 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2233 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2237 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2242 dfa->used_bkref_map |= 1 << token->opr.idx;
2243 tree = create_token_tree (dfa, NULL, NULL, token);
2244 if (BE (tree == NULL, 0))
2250 dfa->has_mb_node = 1;
2252 case OP_OPEN_DUP_NUM:
2253 if (syntax & RE_CONTEXT_INVALID_DUP)
2259 case OP_DUP_ASTERISK:
2261 case OP_DUP_QUESTION:
2262 if (syntax & RE_CONTEXT_INVALID_OPS)
2267 else if (syntax & RE_CONTEXT_INDEP_OPS)
2269 fetch_token (token, regexp, syntax);
2270 return parse_expression (regexp, preg, token, syntax, nest, err);
2272 /* else fall through */
2273 case OP_CLOSE_SUBEXP:
2274 if ((token->type == OP_CLOSE_SUBEXP) &&
2275 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2280 /* else fall through */
2281 case OP_CLOSE_DUP_NUM:
2282 /* We treat it as a normal character. */
2284 /* Then we can these characters as normal characters. */
2285 token->type = CHARACTER;
2286 /* mb_partial and word_char bits should be initialized already
2288 tree = create_token_tree (dfa, NULL, NULL, token);
2289 if (BE (tree == NULL, 0))
2296 if ((token->opr.ctx_type
2297 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2298 && dfa->word_ops_used == 0)
2299 init_word_char (dfa);
2300 if (token->opr.ctx_type == WORD_DELIM
2301 || token->opr.ctx_type == NOT_WORD_DELIM)
2303 bin_tree_t *tree_first, *tree_last;
2304 if (token->opr.ctx_type == WORD_DELIM)
2306 token->opr.ctx_type = WORD_FIRST;
2307 tree_first = create_token_tree (dfa, NULL, NULL, token);
2308 token->opr.ctx_type = WORD_LAST;
2312 token->opr.ctx_type = INSIDE_WORD;
2313 tree_first = create_token_tree (dfa, NULL, NULL, token);
2314 token->opr.ctx_type = INSIDE_NOTWORD;
2316 tree_last = create_token_tree (dfa, NULL, NULL, token);
2317 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2318 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2326 tree = create_token_tree (dfa, NULL, NULL, token);
2327 if (BE (tree == NULL, 0))
2333 /* We must return here, since ANCHORs can't be followed
2334 by repetition operators.
2335 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2336 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2337 fetch_token (token, regexp, syntax);
2340 tree = create_token_tree (dfa, NULL, NULL, token);
2341 if (BE (tree == NULL, 0))
2346 if (dfa->mb_cur_max > 1)
2347 dfa->has_mb_node = 1;
2351 tree = build_charclass_op (dfa, regexp->trans,
2352 (const unsigned char *) "alnum",
2353 (const unsigned char *) "_",
2354 token->type == OP_NOTWORD, err);
2355 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2360 tree = build_charclass_op (dfa, regexp->trans,
2361 (const unsigned char *) "space",
2362 (const unsigned char *) "",
2363 token->type == OP_NOTSPACE, err);
2364 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2374 /* Must not happen? */
2380 fetch_token (token, regexp, syntax);
2382 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2383 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2385 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2386 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2388 /* In BRE consecutive duplications are not allowed. */
2389 if ((syntax & RE_CONTEXT_INVALID_DUP)
2390 && (token->type == OP_DUP_ASTERISK
2391 || token->type == OP_OPEN_DUP_NUM))
2401 /* This function build the following tree, from regular expression
2409 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2410 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2412 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2415 cur_nsub = preg->re_nsub++;
2417 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2419 /* The subexpression may be a null string. */
2420 if (token->type == OP_CLOSE_SUBEXP)
2424 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2425 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2427 if (BE (*err != REG_NOERROR, 0))
2431 if (cur_nsub <= '9' - '1')
2432 dfa->completed_bkref_map |= 1 << cur_nsub;
2434 tree = create_tree (dfa, tree, NULL, SUBEXP);
2435 if (BE (tree == NULL, 0))
2440 tree->token.opr.idx = cur_nsub;
2444 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2447 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2448 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2450 bin_tree_t *tree = NULL, *old_tree = NULL;
2451 Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2452 re_token_t start_token = *token;
2454 if (token->type == OP_OPEN_DUP_NUM)
2457 start = fetch_number (regexp, token, syntax);
2458 if (start == REG_MISSING)
2460 if (token->type == CHARACTER && token->opr.c == ',')
2461 start = 0; /* We treat "{,m}" as "{0,m}". */
2464 *err = REG_BADBR; /* <re>{} is invalid. */
2468 if (BE (start != REG_ERROR, 1))
2470 /* We treat "{n}" as "{n,n}". */
2471 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2472 : ((token->type == CHARACTER && token->opr.c == ',')
2473 ? fetch_number (regexp, token, syntax) : REG_ERROR));
2475 if (BE (start == REG_ERROR || end == REG_ERROR, 0))
2477 /* Invalid sequence. */
2478 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2480 if (token->type == END_OF_RE)
2488 /* If the syntax bit is set, rollback. */
2489 re_string_set_index (regexp, start_idx);
2490 *token = start_token;
2491 token->type = CHARACTER;
2492 /* mb_partial and word_char bits should be already initialized by
2497 if (BE (end != REG_MISSING && start > end, 0))
2499 /* First number greater than second. */
2506 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2507 end = (token->type == OP_DUP_QUESTION) ? 1 : REG_MISSING;
2510 fetch_token (token, regexp, syntax);
2512 if (BE (elem == NULL, 0))
2514 if (BE (start == 0 && end == 0, 0))
2516 postorder (elem, free_tree, NULL);
2520 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2521 if (BE (start > 0, 0))
2524 for (i = 2; i <= start; ++i)
2526 elem = duplicate_tree (elem, dfa);
2527 tree = create_tree (dfa, tree, elem, CONCAT);
2528 if (BE (elem == NULL || tree == NULL, 0))
2529 goto parse_dup_op_espace;
2535 /* Duplicate ELEM before it is marked optional. */
2536 elem = duplicate_tree (elem, dfa);
2542 if (elem->token.type == SUBEXP)
2543 postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2545 tree = create_tree (dfa, elem, NULL,
2546 (end == REG_MISSING ? OP_DUP_ASTERISK : OP_ALT));
2547 if (BE (tree == NULL, 0))
2548 goto parse_dup_op_espace;
2550 /* This loop is actually executed only when end != REG_MISSING,
2551 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2552 already created the start+1-th copy. */
2553 if ((Idx) -1 < 0 || end != REG_MISSING)
2554 for (i = start + 2; i <= end; ++i)
2556 elem = duplicate_tree (elem, dfa);
2557 tree = create_tree (dfa, tree, elem, CONCAT);
2558 if (BE (elem == NULL || tree == NULL, 0))
2559 goto parse_dup_op_espace;
2561 tree = create_tree (dfa, tree, NULL, OP_ALT);
2562 if (BE (tree == NULL, 0))
2563 goto parse_dup_op_espace;
2567 tree = create_tree (dfa, old_tree, tree, CONCAT);
2571 parse_dup_op_espace:
2576 /* Size of the names for collating symbol/equivalence_class/character_class.
2577 I'm not sure, but maybe enough. */
2578 #define BRACKET_NAME_BUF_SIZE 32
2581 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2582 Build the range expression which starts from START_ELEM, and ends
2583 at END_ELEM. The result are written to MBCSET and SBCSET.
2584 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2585 mbcset->range_ends, is a pointer argument sinse we may
2588 static reg_errcode_t
2590 # ifdef RE_ENABLE_I18N
2591 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, Idx *range_alloc,
2592 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2593 # else /* not RE_ENABLE_I18N */
2594 build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
2595 bracket_elem_t *end_elem)
2596 # endif /* not RE_ENABLE_I18N */
2598 unsigned int start_ch, end_ch;
2599 /* Equivalence Classes and Character Classes can't be a range start/end. */
2600 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2601 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2605 /* We can handle no multi character collating elements without libc
2607 if (BE ((start_elem->type == COLL_SYM
2608 && strlen ((char *) start_elem->opr.name) > 1)
2609 || (end_elem->type == COLL_SYM
2610 && strlen ((char *) end_elem->opr.name) > 1), 0))
2611 return REG_ECOLLATE;
2613 # ifdef RE_ENABLE_I18N
2618 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2620 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2621 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2623 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2624 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2626 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2627 ? __btowc (start_ch) : start_elem->opr.wch);
2628 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2629 ? __btowc (end_ch) : end_elem->opr.wch);
2630 if (start_wc == WEOF || end_wc == WEOF)
2631 return REG_ECOLLATE;
2632 cmp_buf[0] = start_wc;
2633 cmp_buf[4] = end_wc;
2634 if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2637 /* Got valid collation sequence values, add them as a new entry.
2638 However, for !_LIBC we have no collation elements: if the
2639 character set is single byte, the single byte character set
2640 that we build below suffices. parse_bracket_exp passes
2641 no MBCSET if dfa->mb_cur_max == 1. */
2644 /* Check the space of the arrays. */
2645 if (BE (*range_alloc == mbcset->nranges, 0))
2647 /* There is not enough space, need realloc. */
2648 wchar_t *new_array_start, *new_array_end;
2651 /* +1 in case of mbcset->nranges is 0. */
2652 new_nranges = 2 * mbcset->nranges + 1;
2653 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2654 are NULL if *range_alloc == 0. */
2655 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2657 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2660 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2663 mbcset->range_starts = new_array_start;
2664 mbcset->range_ends = new_array_end;
2665 *range_alloc = new_nranges;
2668 mbcset->range_starts[mbcset->nranges] = start_wc;
2669 mbcset->range_ends[mbcset->nranges++] = end_wc;
2672 /* Build the table for single byte characters. */
2673 for (wc = 0; wc < SBC_MAX; ++wc)
2676 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2677 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2678 bitset_set (sbcset, wc);
2681 # else /* not RE_ENABLE_I18N */
2684 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2685 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2687 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2688 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2690 if (start_ch > end_ch)
2692 /* Build the table for single byte characters. */
2693 for (ch = 0; ch < SBC_MAX; ++ch)
2694 if (start_ch <= ch && ch <= end_ch)
2695 bitset_set (sbcset, ch);
2697 # endif /* not RE_ENABLE_I18N */
2700 #endif /* not _LIBC */
2703 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2704 Build the collating element which is represented by NAME.
2705 The result are written to MBCSET and SBCSET.
2706 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2707 pointer argument since we may update it. */
2709 static reg_errcode_t
2711 build_collating_symbol (bitset_t sbcset,
2712 # ifdef RE_ENABLE_I18N
2713 re_charset_t *mbcset, Idx *coll_sym_alloc,
2715 const unsigned char *name)
2717 size_t name_len = strlen ((const char *) name);
2718 if (BE (name_len != 1, 0))
2719 return REG_ECOLLATE;
2722 bitset_set (sbcset, name[0]);
2726 #endif /* not _LIBC */
2728 /* This function parse bracket expression like "[abc]", "[a-c]",
2732 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2733 reg_syntax_t syntax, reg_errcode_t *err)
2736 const unsigned char *collseqmb;
2737 const char *collseqwc;
2740 const int32_t *symb_table;
2741 const unsigned char *extra;
2743 /* Local function for parse_bracket_exp used in _LIBC environement.
2744 Seek the collating symbol entry correspondings to NAME.
2745 Return the index of the symbol in the SYMB_TABLE. */
2748 __attribute ((always_inline))
2749 seek_collating_symbol_entry (name, name_len)
2750 const unsigned char *name;
2753 int32_t hash = elem_hash ((const char *) name, name_len);
2754 int32_t elem = hash % table_size;
2755 if (symb_table[2 * elem] != 0)
2757 int32_t second = hash % (table_size - 2) + 1;
2761 /* First compare the hashing value. */
2762 if (symb_table[2 * elem] == hash
2763 /* Compare the length of the name. */
2764 && name_len == extra[symb_table[2 * elem + 1]]
2765 /* Compare the name. */
2766 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2769 /* Yep, this is the entry. */
2776 while (symb_table[2 * elem] != 0);
2781 /* Local function for parse_bracket_exp used in _LIBC environement.
2782 Look up the collation sequence value of BR_ELEM.
2783 Return the value if succeeded, UINT_MAX otherwise. */
2785 auto inline unsigned int
2786 __attribute ((always_inline))
2787 lookup_collation_sequence_value (br_elem)
2788 bracket_elem_t *br_elem;
2790 if (br_elem->type == SB_CHAR)
2793 if (MB_CUR_MAX == 1)
2796 return collseqmb[br_elem->opr.ch];
2799 wint_t wc = __btowc (br_elem->opr.ch);
2800 return __collseq_table_lookup (collseqwc, wc);
2803 else if (br_elem->type == MB_CHAR)
2805 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2807 else if (br_elem->type == COLL_SYM)
2809 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2813 elem = seek_collating_symbol_entry (br_elem->opr.name,
2815 if (symb_table[2 * elem] != 0)
2817 /* We found the entry. */
2818 idx = symb_table[2 * elem + 1];
2819 /* Skip the name of collating element name. */
2820 idx += 1 + extra[idx];
2821 /* Skip the byte sequence of the collating element. */
2822 idx += 1 + extra[idx];
2823 /* Adjust for the alignment. */
2824 idx = (idx + 3) & ~3;
2825 /* Skip the multibyte collation sequence value. */
2826 idx += sizeof (unsigned int);
2827 /* Skip the wide char sequence of the collating element. */
2828 idx += sizeof (unsigned int) *
2829 (1 + *(unsigned int *) (extra + idx));
2830 /* Return the collation sequence value. */
2831 return *(unsigned int *) (extra + idx);
2833 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2835 /* No valid character. Match it as a single byte
2837 return collseqmb[br_elem->opr.name[0]];
2840 else if (sym_name_len == 1)
2841 return collseqmb[br_elem->opr.name[0]];
2846 /* Local function for parse_bracket_exp used in _LIBC environement.
2847 Build the range expression which starts from START_ELEM, and ends
2848 at END_ELEM. The result are written to MBCSET and SBCSET.
2849 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2850 mbcset->range_ends, is a pointer argument sinse we may
2853 auto inline reg_errcode_t
2854 __attribute ((always_inline))
2855 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2856 re_charset_t *mbcset;
2859 bracket_elem_t *start_elem, *end_elem;
2862 uint32_t start_collseq;
2863 uint32_t end_collseq;
2865 /* Equivalence Classes and Character Classes can't be a range
2867 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2868 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2872 start_collseq = lookup_collation_sequence_value (start_elem);
2873 end_collseq = lookup_collation_sequence_value (end_elem);
2874 /* Check start/end collation sequence values. */
2875 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2876 return REG_ECOLLATE;
2877 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2880 /* Got valid collation sequence values, add them as a new entry.
2881 However, if we have no collation elements, and the character set
2882 is single byte, the single byte character set that we
2883 build below suffices. */
2884 if (nrules > 0 || dfa->mb_cur_max > 1)
2886 /* Check the space of the arrays. */
2887 if (BE (*range_alloc == mbcset->nranges, 0))
2889 /* There is not enough space, need realloc. */
2890 uint32_t *new_array_start;
2891 uint32_t *new_array_end;
2894 /* +1 in case of mbcset->nranges is 0. */
2895 new_nranges = 2 * mbcset->nranges + 1;
2896 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2898 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2901 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2904 mbcset->range_starts = new_array_start;
2905 mbcset->range_ends = new_array_end;
2906 *range_alloc = new_nranges;
2909 mbcset->range_starts[mbcset->nranges] = start_collseq;
2910 mbcset->range_ends[mbcset->nranges++] = end_collseq;
2913 /* Build the table for single byte characters. */
2914 for (ch = 0; ch < SBC_MAX; ch++)
2916 uint32_t ch_collseq;
2918 if (MB_CUR_MAX == 1)
2921 ch_collseq = collseqmb[ch];
2923 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2924 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2925 bitset_set (sbcset, ch);
2930 /* Local function for parse_bracket_exp used in _LIBC environement.
2931 Build the collating element which is represented by NAME.
2932 The result are written to MBCSET and SBCSET.
2933 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2934 pointer argument sinse we may update it. */
2936 auto inline reg_errcode_t
2937 __attribute ((always_inline))
2938 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2939 re_charset_t *mbcset;
2940 Idx *coll_sym_alloc;
2942 const unsigned char *name;
2945 size_t name_len = strlen ((const char *) name);
2948 elem = seek_collating_symbol_entry (name, name_len);
2949 if (symb_table[2 * elem] != 0)
2951 /* We found the entry. */
2952 idx = symb_table[2 * elem + 1];
2953 /* Skip the name of collating element name. */
2954 idx += 1 + extra[idx];
2956 else if (symb_table[2 * elem] == 0 && name_len == 1)
2958 /* No valid character, treat it as a normal
2960 bitset_set (sbcset, name[0]);
2964 return REG_ECOLLATE;
2966 /* Got valid collation sequence, add it as a new entry. */
2967 /* Check the space of the arrays. */
2968 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
2970 /* Not enough, realloc it. */
2971 /* +1 in case of mbcset->ncoll_syms is 0. */
2972 Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2973 /* Use realloc since mbcset->coll_syms is NULL
2975 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2976 new_coll_sym_alloc);
2977 if (BE (new_coll_syms == NULL, 0))
2979 mbcset->coll_syms = new_coll_syms;
2980 *coll_sym_alloc = new_coll_sym_alloc;
2982 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
2987 if (BE (name_len != 1, 0))
2988 return REG_ECOLLATE;
2991 bitset_set (sbcset, name[0]);
2998 re_token_t br_token;
2999 re_bitset_ptr_t sbcset;
3000 #ifdef RE_ENABLE_I18N
3001 re_charset_t *mbcset;
3002 Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3003 Idx equiv_class_alloc = 0, char_class_alloc = 0;
3004 #endif /* not RE_ENABLE_I18N */
3005 bool non_match = false;
3006 bin_tree_t *work_tree;
3008 bool first_round = true;
3010 collseqmb = (const unsigned char *)
3011 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3012 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3018 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3019 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3020 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3021 _NL_COLLATE_SYMB_TABLEMB);
3022 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3023 _NL_COLLATE_SYMB_EXTRAMB);
3026 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3027 #ifdef RE_ENABLE_I18N
3028 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3029 #endif /* RE_ENABLE_I18N */
3030 #ifdef RE_ENABLE_I18N
3031 if (BE (sbcset == NULL || mbcset == NULL, 0))
3033 if (BE (sbcset == NULL, 0))
3034 #endif /* RE_ENABLE_I18N */
3040 token_len = peek_token_bracket (token, regexp, syntax);
3041 if (BE (token->type == END_OF_RE, 0))
3044 goto parse_bracket_exp_free_return;
3046 if (token->type == OP_NON_MATCH_LIST)
3048 #ifdef RE_ENABLE_I18N
3049 mbcset->non_match = 1;
3050 #endif /* not RE_ENABLE_I18N */
3052 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3053 bitset_set (sbcset, '\n');
3054 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3055 token_len = peek_token_bracket (token, regexp, syntax);
3056 if (BE (token->type == END_OF_RE, 0))
3059 goto parse_bracket_exp_free_return;
3063 /* We treat the first ']' as a normal character. */
3064 if (token->type == OP_CLOSE_BRACKET)
3065 token->type = CHARACTER;
3069 bracket_elem_t start_elem, end_elem;
3070 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3071 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3074 bool is_range_exp = false;
3077 start_elem.opr.name = start_name_buf;
3078 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3079 syntax, first_round);
3080 if (BE (ret != REG_NOERROR, 0))
3083 goto parse_bracket_exp_free_return;
3085 first_round = false;
3087 /* Get information about the next token. We need it in any case. */
3088 token_len = peek_token_bracket (token, regexp, syntax);
3090 /* Do not check for ranges if we know they are not allowed. */
3091 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3093 if (BE (token->type == END_OF_RE, 0))
3096 goto parse_bracket_exp_free_return;
3098 if (token->type == OP_CHARSET_RANGE)
3100 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3101 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3102 if (BE (token2.type == END_OF_RE, 0))
3105 goto parse_bracket_exp_free_return;
3107 if (token2.type == OP_CLOSE_BRACKET)
3109 /* We treat the last '-' as a normal character. */
3110 re_string_skip_bytes (regexp, -token_len);
3111 token->type = CHARACTER;
3114 is_range_exp = true;
3118 if (is_range_exp == true)
3120 end_elem.opr.name = end_name_buf;
3121 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3123 if (BE (ret != REG_NOERROR, 0))
3126 goto parse_bracket_exp_free_return;
3129 token_len = peek_token_bracket (token, regexp, syntax);
3132 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3133 &start_elem, &end_elem);
3135 # ifdef RE_ENABLE_I18N
3136 *err = build_range_exp (sbcset,
3137 dfa->mb_cur_max > 1 ? mbcset : NULL,
3138 &range_alloc, &start_elem, &end_elem);
3140 *err = build_range_exp (sbcset, &start_elem, &end_elem);
3142 #endif /* RE_ENABLE_I18N */
3143 if (BE (*err != REG_NOERROR, 0))
3144 goto parse_bracket_exp_free_return;
3148 switch (start_elem.type)
3151 bitset_set (sbcset, start_elem.opr.ch);
3153 #ifdef RE_ENABLE_I18N
3155 /* Check whether the array has enough space. */
3156 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3158 wchar_t *new_mbchars;
3159 /* Not enough, realloc it. */
3160 /* +1 in case of mbcset->nmbchars is 0. */
3161 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3162 /* Use realloc since array is NULL if *alloc == 0. */
3163 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3165 if (BE (new_mbchars == NULL, 0))
3166 goto parse_bracket_exp_espace;
3167 mbcset->mbchars = new_mbchars;
3169 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3171 #endif /* RE_ENABLE_I18N */
3173 *err = build_equiv_class (sbcset,
3174 #ifdef RE_ENABLE_I18N
3175 mbcset, &equiv_class_alloc,
3176 #endif /* RE_ENABLE_I18N */
3177 start_elem.opr.name);
3178 if (BE (*err != REG_NOERROR, 0))
3179 goto parse_bracket_exp_free_return;
3182 *err = build_collating_symbol (sbcset,
3183 #ifdef RE_ENABLE_I18N
3184 mbcset, &coll_sym_alloc,
3185 #endif /* RE_ENABLE_I18N */
3186 start_elem.opr.name);
3187 if (BE (*err != REG_NOERROR, 0))
3188 goto parse_bracket_exp_free_return;
3191 *err = build_charclass (regexp->trans, sbcset,
3192 #ifdef RE_ENABLE_I18N
3193 mbcset, &char_class_alloc,
3194 #endif /* RE_ENABLE_I18N */
3195 start_elem.opr.name, syntax);
3196 if (BE (*err != REG_NOERROR, 0))
3197 goto parse_bracket_exp_free_return;
3204 if (BE (token->type == END_OF_RE, 0))
3207 goto parse_bracket_exp_free_return;
3209 if (token->type == OP_CLOSE_BRACKET)
3213 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3215 /* If it is non-matching list. */
3217 bitset_not (sbcset);
3219 #ifdef RE_ENABLE_I18N
3220 /* Ensure only single byte characters are set. */
3221 if (dfa->mb_cur_max > 1)
3222 bitset_mask (sbcset, dfa->sb_char);
3224 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3225 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3226 || mbcset->non_match)))
3228 bin_tree_t *mbc_tree;
3230 /* Build a tree for complex bracket. */
3231 dfa->has_mb_node = 1;
3232 br_token.type = COMPLEX_BRACKET;
3233 br_token.opr.mbcset = mbcset;
3234 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3235 if (BE (mbc_tree == NULL, 0))
3236 goto parse_bracket_exp_espace;
3237 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3238 if (sbcset[sbc_idx])
3240 /* If there are no bits set in sbcset, there is no point
3241 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3242 if (sbc_idx < BITSET_WORDS)
3244 /* Build a tree for simple bracket. */
3245 br_token.type = SIMPLE_BRACKET;
3246 br_token.opr.sbcset = sbcset;
3247 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3248 if (BE (work_tree == NULL, 0))
3249 goto parse_bracket_exp_espace;
3251 /* Then join them by ALT node. */
3252 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3253 if (BE (work_tree == NULL, 0))
3254 goto parse_bracket_exp_espace;
3259 work_tree = mbc_tree;
3263 #endif /* not RE_ENABLE_I18N */
3265 #ifdef RE_ENABLE_I18N
3266 free_charset (mbcset);
3268 /* Build a tree for simple bracket. */
3269 br_token.type = SIMPLE_BRACKET;
3270 br_token.opr.sbcset = sbcset;
3271 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3272 if (BE (work_tree == NULL, 0))
3273 goto parse_bracket_exp_espace;
3277 parse_bracket_exp_espace:
3279 parse_bracket_exp_free_return:
3281 #ifdef RE_ENABLE_I18N
3282 free_charset (mbcset);
3283 #endif /* RE_ENABLE_I18N */
3287 /* Parse an element in the bracket expression. */
3289 static reg_errcode_t
3290 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3291 re_token_t *token, int token_len, re_dfa_t *dfa,
3292 reg_syntax_t syntax, bool accept_hyphen)
3294 #ifdef RE_ENABLE_I18N
3296 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3297 if (cur_char_size > 1)
3299 elem->type = MB_CHAR;
3300 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3301 re_string_skip_bytes (regexp, cur_char_size);
3304 #endif /* RE_ENABLE_I18N */
3305 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3306 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3307 || token->type == OP_OPEN_EQUIV_CLASS)
3308 return parse_bracket_symbol (elem, regexp, token);
3309 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3311 /* A '-' must only appear as anything but a range indicator before
3312 the closing bracket. Everything else is an error. */
3314 (void) peek_token_bracket (&token2, regexp, syntax);
3315 if (token2.type != OP_CLOSE_BRACKET)
3316 /* The actual error value is not standardized since this whole
3317 case is undefined. But ERANGE makes good sense. */
3320 elem->type = SB_CHAR;
3321 elem->opr.ch = token->opr.c;
3325 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3326 such as [:<character_class>:], [.<collating_element>.], and
3327 [=<equivalent_class>=]. */
3329 static reg_errcode_t
3330 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3333 unsigned char ch, delim = token->opr.c;
3335 if (re_string_eoi(regexp))
3339 if (i >= BRACKET_NAME_BUF_SIZE)
3341 if (token->type == OP_OPEN_CHAR_CLASS)
3342 ch = re_string_fetch_byte_case (regexp);
3344 ch = re_string_fetch_byte (regexp);
3345 if (re_string_eoi(regexp))
3347 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3349 elem->opr.name[i] = ch;
3351 re_string_skip_bytes (regexp, 1);
3352 elem->opr.name[i] = '\0';
3353 switch (token->type)
3355 case OP_OPEN_COLL_ELEM:
3356 elem->type = COLL_SYM;
3358 case OP_OPEN_EQUIV_CLASS:
3359 elem->type = EQUIV_CLASS;
3361 case OP_OPEN_CHAR_CLASS:
3362 elem->type = CHAR_CLASS;
3370 /* Helper function for parse_bracket_exp.
3371 Build the equivalence class which is represented by NAME.
3372 The result are written to MBCSET and SBCSET.
3373 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3374 is a pointer argument sinse we may update it. */
3376 static reg_errcode_t
3377 #ifdef RE_ENABLE_I18N
3378 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3379 Idx *equiv_class_alloc, const unsigned char *name)
3380 #else /* not RE_ENABLE_I18N */
3381 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3382 #endif /* not RE_ENABLE_I18N */
3385 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3388 const int32_t *table, *indirect;
3389 const unsigned char *weights, *extra, *cp;
3390 unsigned char char_buf[2];
3394 /* This #include defines a local function! */
3395 # include <locale/weight.h>
3396 /* Calculate the index for equivalence class. */
3398 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3399 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3400 _NL_COLLATE_WEIGHTMB);
3401 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3402 _NL_COLLATE_EXTRAMB);
3403 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3404 _NL_COLLATE_INDIRECTMB);
3405 idx1 = findidx (&cp);
3406 if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3407 /* This isn't a valid character. */
3408 return REG_ECOLLATE;
3410 /* Build single byte matcing table for this equivalence class. */
3411 char_buf[1] = (unsigned char) '\0';
3412 len = weights[idx1];
3413 for (ch = 0; ch < SBC_MAX; ++ch)
3417 idx2 = findidx (&cp);
3422 /* This isn't a valid character. */
3424 if (len == weights[idx2])
3427 while (cnt <= len &&
3428 weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3432 bitset_set (sbcset, ch);
3435 /* Check whether the array has enough space. */
3436 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3438 /* Not enough, realloc it. */
3439 /* +1 in case of mbcset->nequiv_classes is 0. */
3440 Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3441 /* Use realloc since the array is NULL if *alloc == 0. */
3442 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3444 new_equiv_class_alloc);
3445 if (BE (new_equiv_classes == NULL, 0))
3447 mbcset->equiv_classes = new_equiv_classes;
3448 *equiv_class_alloc = new_equiv_class_alloc;
3450 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3455 if (BE (strlen ((const char *) name) != 1, 0))
3456 return REG_ECOLLATE;
3457 bitset_set (sbcset, *name);
3462 /* Helper function for parse_bracket_exp.
3463 Build the character class which is represented by NAME.
3464 The result are written to MBCSET and SBCSET.
3465 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3466 is a pointer argument sinse we may update it. */
3468 static reg_errcode_t
3469 #ifdef RE_ENABLE_I18N
3470 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3471 re_charset_t *mbcset, Idx *char_class_alloc,
3472 const unsigned char *class_name, reg_syntax_t syntax)
3473 #else /* not RE_ENABLE_I18N */
3474 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3475 const unsigned char *class_name, reg_syntax_t syntax)
3476 #endif /* not RE_ENABLE_I18N */
3479 const char *name = (const char *) class_name;
3481 /* In case of REG_ICASE "upper" and "lower" match the both of
3482 upper and lower cases. */
3483 if ((syntax & RE_ICASE)
3484 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3487 #ifdef RE_ENABLE_I18N
3488 /* Check the space of the arrays. */
3489 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3491 /* Not enough, realloc it. */
3492 /* +1 in case of mbcset->nchar_classes is 0. */
3493 Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3494 /* Use realloc since array is NULL if *alloc == 0. */
3495 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3496 new_char_class_alloc);
3497 if (BE (new_char_classes == NULL, 0))
3499 mbcset->char_classes = new_char_classes;
3500 *char_class_alloc = new_char_class_alloc;
3502 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3503 #endif /* RE_ENABLE_I18N */
3505 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3507 if (BE (trans != NULL, 0)) \
3509 for (i = 0; i < SBC_MAX; ++i) \
3510 if (ctype_func (i)) \
3511 bitset_set (sbcset, trans[i]); \
3515 for (i = 0; i < SBC_MAX; ++i) \
3516 if (ctype_func (i)) \
3517 bitset_set (sbcset, i); \
3521 if (strcmp (name, "alnum") == 0)
3522 BUILD_CHARCLASS_LOOP (isalnum);
3523 else if (strcmp (name, "cntrl") == 0)
3524 BUILD_CHARCLASS_LOOP (iscntrl);
3525 else if (strcmp (name, "lower") == 0)
3526 BUILD_CHARCLASS_LOOP (islower);
3527 else if (strcmp (name, "space") == 0)
3528 BUILD_CHARCLASS_LOOP (isspace);
3529 else if (strcmp (name, "alpha") == 0)
3530 BUILD_CHARCLASS_LOOP (isalpha);
3531 else if (strcmp (name, "digit") == 0)
3532 BUILD_CHARCLASS_LOOP (isdigit);
3533 else if (strcmp (name, "print") == 0)
3534 BUILD_CHARCLASS_LOOP (isprint);
3535 else if (strcmp (name, "upper") == 0)
3536 BUILD_CHARCLASS_LOOP (isupper);
3537 else if (strcmp (name, "blank") == 0)
3538 BUILD_CHARCLASS_LOOP (isblank);
3539 else if (strcmp (name, "graph") == 0)
3540 BUILD_CHARCLASS_LOOP (isgraph);
3541 else if (strcmp (name, "punct") == 0)
3542 BUILD_CHARCLASS_LOOP (ispunct);
3543 else if (strcmp (name, "xdigit") == 0)
3544 BUILD_CHARCLASS_LOOP (isxdigit);
3552 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3553 const unsigned char *class_name,
3554 const unsigned char *extra, bool non_match,
3557 re_bitset_ptr_t sbcset;
3558 #ifdef RE_ENABLE_I18N
3559 re_charset_t *mbcset;
3561 #endif /* not RE_ENABLE_I18N */
3563 re_token_t br_token;
3566 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3567 #ifdef RE_ENABLE_I18N
3568 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3569 #endif /* RE_ENABLE_I18N */
3571 #ifdef RE_ENABLE_I18N
3572 if (BE (sbcset == NULL || mbcset == NULL, 0))
3573 #else /* not RE_ENABLE_I18N */
3574 if (BE (sbcset == NULL, 0))
3575 #endif /* not RE_ENABLE_I18N */
3583 #ifdef RE_ENABLE_I18N
3584 mbcset->non_match = 1;
3585 #endif /* not RE_ENABLE_I18N */
3588 /* We don't care the syntax in this case. */
3589 ret = build_charclass (trans, sbcset,
3590 #ifdef RE_ENABLE_I18N
3592 #endif /* RE_ENABLE_I18N */
3595 if (BE (ret != REG_NOERROR, 0))
3598 #ifdef RE_ENABLE_I18N
3599 free_charset (mbcset);
3600 #endif /* RE_ENABLE_I18N */
3604 /* \w match '_' also. */
3605 for (; *extra; extra++)
3606 bitset_set (sbcset, *extra);
3608 /* If it is non-matching list. */
3610 bitset_not (sbcset);
3612 #ifdef RE_ENABLE_I18N
3613 /* Ensure only single byte characters are set. */
3614 if (dfa->mb_cur_max > 1)
3615 bitset_mask (sbcset, dfa->sb_char);
3618 /* Build a tree for simple bracket. */
3619 br_token.type = SIMPLE_BRACKET;
3620 br_token.opr.sbcset = sbcset;
3621 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3622 if (BE (tree == NULL, 0))
3623 goto build_word_op_espace;
3625 #ifdef RE_ENABLE_I18N
3626 if (dfa->mb_cur_max > 1)
3628 bin_tree_t *mbc_tree;
3629 /* Build a tree for complex bracket. */
3630 br_token.type = COMPLEX_BRACKET;
3631 br_token.opr.mbcset = mbcset;
3632 dfa->has_mb_node = 1;
3633 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3634 if (BE (mbc_tree == NULL, 0))
3635 goto build_word_op_espace;
3636 /* Then join them by ALT node. */
3637 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3638 if (BE (mbc_tree != NULL, 1))
3643 free_charset (mbcset);
3646 #else /* not RE_ENABLE_I18N */
3648 #endif /* not RE_ENABLE_I18N */
3650 build_word_op_espace:
3652 #ifdef RE_ENABLE_I18N
3653 free_charset (mbcset);
3654 #endif /* RE_ENABLE_I18N */
3659 /* This is intended for the expressions like "a{1,3}".
3660 Fetch a number from `input', and return the number.
3661 Return REG_MISSING if the number field is empty like "{,1}".
3662 Return REG_ERROR if an error occurred. */
3665 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3667 Idx num = REG_MISSING;
3671 fetch_token (token, input, syntax);
3673 if (BE (token->type == END_OF_RE, 0))
3675 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3677 num = ((token->type != CHARACTER || c < '0' || '9' < c
3678 || num == REG_ERROR)
3680 : ((num == REG_MISSING) ? c - '0' : num * 10 + c - '0'));
3681 num = (num > RE_DUP_MAX) ? REG_ERROR : num;
3686 #ifdef RE_ENABLE_I18N
3688 free_charset (re_charset_t *cset)
3690 re_free (cset->mbchars);
3692 re_free (cset->coll_syms);
3693 re_free (cset->equiv_classes);
3694 re_free (cset->range_starts);
3695 re_free (cset->range_ends);
3697 re_free (cset->char_classes);
3700 #endif /* RE_ENABLE_I18N */
3702 /* Functions for binary tree operation. */
3704 /* Create a tree node. */
3707 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3708 re_token_type_t type)
3712 return create_token_tree (dfa, left, right, &t);
3716 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3717 const re_token_t *token)
3720 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3722 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3724 if (storage == NULL)
3726 storage->next = dfa->str_tree_storage;
3727 dfa->str_tree_storage = storage;
3728 dfa->str_tree_storage_idx = 0;
3730 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3732 tree->parent = NULL;
3734 tree->right = right;
3735 tree->token = *token;
3736 tree->token.duplicated = 0;
3737 tree->token.opt_subexp = 0;
3740 tree->node_idx = REG_MISSING;
3743 left->parent = tree;
3745 right->parent = tree;
3749 /* Mark the tree SRC as an optional subexpression.
3750 To be called from preorder or postorder. */
3752 static reg_errcode_t
3753 mark_opt_subexp (void *extra, bin_tree_t *node)
3755 Idx idx = (Idx) (long) extra;
3756 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3757 node->token.opt_subexp = 1;
3762 /* Free the allocated memory inside NODE. */
3765 free_token (re_token_t *node)
3767 #ifdef RE_ENABLE_I18N
3768 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3769 free_charset (node->opr.mbcset);
3771 #endif /* RE_ENABLE_I18N */
3772 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3773 re_free (node->opr.sbcset);
3776 /* Worker function for tree walking. Free the allocated memory inside NODE
3777 and its children. */
3779 static reg_errcode_t
3780 free_tree (void *extra, bin_tree_t *node)
3782 free_token (&node->token);
3787 /* Duplicate the node SRC, and return new node. This is a preorder
3788 visit similar to the one implemented by the generic visitor, but
3789 we need more infrastructure to maintain two parallel trees --- so,
3790 it's easier to duplicate. */
3793 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3795 const bin_tree_t *node;
3796 bin_tree_t *dup_root;
3797 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3799 for (node = root; ; )
3801 /* Create a new tree and link it back to the current parent. */
3802 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3805 (*p_new)->parent = dup_node;
3806 (*p_new)->token.duplicated = 1;
3809 /* Go to the left node, or up and to the right. */
3813 p_new = &dup_node->left;
3817 const bin_tree_t *prev = NULL;
3818 while (node->right == prev || node->right == NULL)
3821 node = node->parent;
3822 dup_node = dup_node->parent;
3827 p_new = &dup_node->right;