1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License along
17 with this program; if not, write to the Free Software Foundation,
18 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
20 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
21 int length, reg_syntax_t syntax);
22 static void re_compile_fastmap_iter (regex_t *bufp,
23 const re_dfastate_t *init_state,
25 static reg_errcode_t init_dfa (re_dfa_t *dfa, int pat_len);
26 static void init_word_char (re_dfa_t *dfa);
28 static void free_charset (re_charset_t *cset);
29 #endif /* RE_ENABLE_I18N */
30 static void free_workarea_compile (regex_t *preg);
31 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
33 static void optimize_utf8 (re_dfa_t *dfa);
35 static reg_errcode_t analyze (regex_t *preg);
36 static reg_errcode_t preorder (bin_tree_t *root,
37 reg_errcode_t (fn (void *, bin_tree_t *)),
39 static reg_errcode_t postorder (bin_tree_t *root,
40 reg_errcode_t (fn (void *, bin_tree_t *)),
42 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
43 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
44 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
46 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
47 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
48 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
49 static reg_errcode_t duplicate_node_closure (re_dfa_t *dfa, int top_org_node,
50 int top_clone_node, int root_node,
51 unsigned int constraint);
52 static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint);
53 static int search_duplicated_node (re_dfa_t *dfa, int org_node,
54 unsigned int constraint);
55 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
56 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
58 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
59 static int fetch_number (re_string_t *input, re_token_t *token,
61 static void fetch_token (re_token_t *result, re_string_t *input,
63 static int peek_token (re_token_t *token, re_string_t *input,
65 static int peek_token_bracket (re_token_t *token, re_string_t *input,
67 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
68 reg_syntax_t syntax, reg_errcode_t *err);
69 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
70 re_token_t *token, reg_syntax_t syntax,
71 int nest, reg_errcode_t *err);
72 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
73 re_token_t *token, reg_syntax_t syntax,
74 int nest, reg_errcode_t *err);
75 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
76 re_token_t *token, reg_syntax_t syntax,
77 int nest, reg_errcode_t *err);
78 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
79 re_token_t *token, reg_syntax_t syntax,
80 int nest, reg_errcode_t *err);
81 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
82 re_dfa_t *dfa, re_token_t *token,
83 reg_syntax_t syntax, reg_errcode_t *err);
84 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
85 re_token_t *token, reg_syntax_t syntax,
87 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
89 re_token_t *token, int token_len,
93 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
97 # ifdef RE_ENABLE_I18N
98 static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
99 re_charset_t *mbcset, int *range_alloc,
100 bracket_elem_t *start_elem,
101 bracket_elem_t *end_elem);
102 static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
103 re_charset_t *mbcset,
105 const unsigned char *name);
106 # else /* not RE_ENABLE_I18N */
107 static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
108 bracket_elem_t *start_elem,
109 bracket_elem_t *end_elem);
110 static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
111 const unsigned char *name);
112 # endif /* not RE_ENABLE_I18N */
113 #endif /* not _LIBC */
114 #ifdef RE_ENABLE_I18N
115 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
116 re_charset_t *mbcset,
117 int *equiv_class_alloc,
118 const unsigned char *name);
119 static reg_errcode_t build_charclass (unsigned RE_TRANSLATE_TYPE trans,
120 re_bitset_ptr_t sbcset,
121 re_charset_t *mbcset,
122 int *char_class_alloc,
123 const unsigned char *class_name,
124 reg_syntax_t syntax);
125 #else /* not RE_ENABLE_I18N */
126 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
127 const unsigned char *name);
128 static reg_errcode_t build_charclass (unsigned RE_TRANSLATE_TYPE trans,
129 re_bitset_ptr_t sbcset,
130 const unsigned char *class_name,
131 reg_syntax_t syntax);
132 #endif /* not RE_ENABLE_I18N */
133 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
134 unsigned RE_TRANSLATE_TYPE trans,
135 const unsigned char *class_name,
136 const unsigned char *extra,
137 int non_match, reg_errcode_t *err);
138 static bin_tree_t *create_tree (re_dfa_t *dfa,
139 bin_tree_t *left, bin_tree_t *right,
140 re_token_type_t type);
141 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
142 bin_tree_t *left, bin_tree_t *right,
143 const re_token_t *token);
144 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
145 static void free_token (re_token_t *node);
146 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
147 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
149 /* This table gives an error message for each of the error codes listed
150 in regex.h. Obviously the order here has to be same as there.
151 POSIX doesn't require that we do anything for REG_NOERROR,
152 but why not be nice? */
154 const char __re_error_msgid[] attribute_hidden =
156 #define REG_NOERROR_IDX 0
157 gettext_noop ("Success") /* REG_NOERROR */
159 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
160 gettext_noop ("No match") /* REG_NOMATCH */
162 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
163 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
165 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
166 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
168 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
169 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
171 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
172 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
174 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
175 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
177 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
178 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
180 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
181 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
183 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
184 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
186 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
187 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
189 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
190 gettext_noop ("Invalid range end") /* REG_ERANGE */
192 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
193 gettext_noop ("Memory exhausted") /* REG_ESPACE */
195 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
196 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
198 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
199 gettext_noop ("Premature end of regular expression") /* REG_EEND */
201 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
202 gettext_noop ("Regular expression too big") /* REG_ESIZE */
204 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
205 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
208 const size_t __re_error_msgid_idx[] attribute_hidden =
229 /* Entry points for GNU code. */
231 /* re_compile_pattern is the GNU regular expression compiler: it
232 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
233 Returns 0 if the pattern was valid, otherwise an error string.
235 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
236 are set in BUFP on entry. */
239 re_compile_pattern (const char *pattern, size_t length,
240 struct re_pattern_buffer *bufp)
244 /* And GNU code determines whether or not to get register information
245 by passing null for the REGS argument to re_match, etc., not by
246 setting no_sub, unless RE_NO_SUB is set. */
247 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
249 /* Match anchors at newline. */
250 bufp->newline_anchor = 1;
252 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
256 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
259 weak_alias (__re_compile_pattern, re_compile_pattern)
262 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
263 also be assigned to arbitrarily: each pattern buffer stores its own
264 syntax, so it can be changed between regex compilations. */
265 /* This has no initializer because initialized variables in Emacs
266 become read-only after dumping. */
267 reg_syntax_t re_syntax_options;
270 /* Specify the precise syntax of regexps for compilation. This provides
271 for compatibility for various utilities which historically have
272 different, incompatible syntaxes.
274 The argument SYNTAX is a bit mask comprised of the various bits
275 defined in regex.h. We return the old syntax. */
278 re_set_syntax (reg_syntax_t syntax)
280 reg_syntax_t ret = re_syntax_options;
282 re_syntax_options = syntax;
286 weak_alias (__re_set_syntax, re_set_syntax)
290 re_compile_fastmap (struct re_pattern_buffer *bufp)
292 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
293 char *fastmap = bufp->fastmap;
295 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
296 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
297 if (dfa->init_state != dfa->init_state_word)
298 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
299 if (dfa->init_state != dfa->init_state_nl)
300 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
301 if (dfa->init_state != dfa->init_state_begbuf)
302 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
303 bufp->fastmap_accurate = 1;
307 weak_alias (__re_compile_fastmap, re_compile_fastmap)
311 __attribute ((always_inline))
312 re_set_fastmap (char *fastmap, int icase, int ch)
316 fastmap[tolower (ch)] = 1;
319 /* Helper function for re_compile_fastmap.
320 Compile fastmap for the initial_state INIT_STATE. */
323 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
326 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
328 int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
329 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
331 int node = init_state->nodes.elems[node_cnt];
332 re_token_type_t type = dfa->nodes[node].type;
334 if (type == CHARACTER)
336 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
337 #ifdef RE_ENABLE_I18N
338 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
340 unsigned char *buf = alloca (dfa->mb_cur_max), *p;
345 *p++ = dfa->nodes[node].opr.c;
346 while (++node < dfa->nodes_len
347 && dfa->nodes[node].type == CHARACTER
348 && dfa->nodes[node].mb_partial)
349 *p++ = dfa->nodes[node].opr.c;
350 memset (&state, 0, sizeof (state));
351 if (mbrtowc (&wc, (const char *) buf, p - buf,
353 && (__wcrtomb ((char *) buf, towlower (wc), &state)
355 re_set_fastmap (fastmap, 0, buf[0]);
359 else if (type == SIMPLE_BRACKET)
362 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
363 for (j = 0; j < UINT_BITS; ++j, ++ch)
364 if (dfa->nodes[node].opr.sbcset[i] & (1 << j))
365 re_set_fastmap (fastmap, icase, ch);
367 #ifdef RE_ENABLE_I18N
368 else if (type == COMPLEX_BRACKET)
371 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
372 if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
373 || cset->nranges || cset->nchar_classes)
376 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
378 /* In this case we want to catch the bytes which are
379 the first byte of any collation elements.
380 e.g. In da_DK, we want to catch 'a' since "aa"
381 is a valid collation element, and don't catch
382 'b' since 'b' is the only collation element
383 which starts from 'b'. */
385 const int32_t *table = (const int32_t *)
386 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
387 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
388 for (j = 0; j < UINT_BITS; ++j, ++ch)
390 re_set_fastmap (fastmap, icase, ch);
393 if (dfa->mb_cur_max > 1)
394 for (i = 0; i < SBC_MAX; ++i)
395 if (__btowc (i) == WEOF)
396 re_set_fastmap (fastmap, icase, i);
397 # endif /* not _LIBC */
399 for (i = 0; i < cset->nmbchars; ++i)
403 memset (&state, '\0', sizeof (state));
404 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
405 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
406 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
408 if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
410 re_set_fastmap (fastmap, 0, *(unsigned char *) buf);
414 #endif /* RE_ENABLE_I18N */
415 else if (type == OP_PERIOD
416 #ifdef RE_ENABLE_I18N
417 || type == OP_UTF8_PERIOD
418 #endif /* RE_ENABLE_I18N */
419 || type == END_OF_RE)
421 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
422 if (type == END_OF_RE)
423 bufp->can_be_null = 1;
429 /* Entry point for POSIX code. */
430 /* regcomp takes a regular expression as a string and compiles it.
432 PREG is a regex_t *. We do not expect any fields to be initialized,
433 since POSIX says we shouldn't. Thus, we set
435 `buffer' to the compiled pattern;
436 `used' to the length of the compiled pattern;
437 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
438 REG_EXTENDED bit in CFLAGS is set; otherwise, to
439 RE_SYNTAX_POSIX_BASIC;
440 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
441 `fastmap' to an allocated space for the fastmap;
442 `fastmap_accurate' to zero;
443 `re_nsub' to the number of subexpressions in PATTERN.
445 PATTERN is the address of the pattern string.
447 CFLAGS is a series of bits which affect compilation.
449 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
450 use POSIX basic syntax.
452 If REG_NEWLINE is set, then . and [^...] don't match newline.
453 Also, regexec will try a match beginning after every newline.
455 If REG_ICASE is set, then we considers upper- and lowercase
456 versions of letters to be equivalent when matching.
458 If REG_NOSUB is set, then when PREG is passed to regexec, that
459 routine will report only success or failure, and nothing about the
462 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
463 the return codes and their meanings.) */
466 regcomp (regex_t *__restrict preg, const char *__restrict pattern, int cflags)
469 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
470 : RE_SYNTAX_POSIX_BASIC);
476 /* Try to allocate space for the fastmap. */
477 preg->fastmap = re_malloc (char, SBC_MAX);
478 if (BE (preg->fastmap == NULL, 0))
481 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
483 /* If REG_NEWLINE is set, newlines are treated differently. */
484 if (cflags & REG_NEWLINE)
485 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
486 syntax &= ~RE_DOT_NEWLINE;
487 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
488 /* It also changes the matching behavior. */
489 preg->newline_anchor = 1;
492 preg->newline_anchor = 0;
493 preg->no_sub = !!(cflags & REG_NOSUB);
494 preg->translate = NULL;
496 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
498 /* POSIX doesn't distinguish between an unmatched open-group and an
499 unmatched close-group: both are REG_EPAREN. */
500 if (ret == REG_ERPAREN)
503 /* We have already checked preg->fastmap != NULL. */
504 if (BE (ret == REG_NOERROR, 1))
505 /* Compute the fastmap now, since regexec cannot modify the pattern
506 buffer. This function never fails in this implementation. */
507 (void) re_compile_fastmap (preg);
510 /* Some error occurred while compiling the expression. */
511 re_free (preg->fastmap);
512 preg->fastmap = NULL;
518 weak_alias (__regcomp, regcomp)
521 /* Returns a message corresponding to an error code, ERRCODE, returned
522 from either regcomp or regexec. We don't use PREG here. */
525 regerror (int errcode, const regex_t *preg, char *errbuf, size_t errbuf_size)
531 || errcode >= (int) (sizeof (__re_error_msgid_idx)
532 / sizeof (__re_error_msgid_idx[0])), 0))
533 /* Only error codes returned by the rest of the code should be passed
534 to this routine. If we are given anything else, or if other regex
535 code generates an invalid error code, then the program has a bug.
536 Dump core so we can fix it. */
539 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
541 msg_size = strlen (msg) + 1; /* Includes the null. */
543 if (BE (errbuf_size != 0, 1))
545 if (BE (msg_size > errbuf_size, 0))
547 #if defined HAVE_MEMPCPY || defined _LIBC
548 *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
550 memcpy (errbuf, msg, errbuf_size - 1);
551 errbuf[errbuf_size - 1] = 0;
555 memcpy (errbuf, msg, msg_size);
561 weak_alias (__regerror, regerror)
565 #ifdef RE_ENABLE_I18N
566 /* This static array is used for the map to single-byte characters when
567 UTF-8 is used. Otherwise we would allocate memory just to initialize
568 it the same all the time. UTF-8 is the preferred encoding so this is
569 a worthwhile optimization. */
570 static const bitset utf8_sb_map =
572 /* Set the first 128 bits. */
573 # if UINT_MAX == 0xffffffff
574 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
576 # error "Add case for new unsigned int size"
583 free_dfa_content (re_dfa_t *dfa)
588 for (i = 0; i < dfa->nodes_len; ++i)
589 free_token (dfa->nodes + i);
590 re_free (dfa->nexts);
591 for (i = 0; i < dfa->nodes_len; ++i)
593 if (dfa->eclosures != NULL)
594 re_node_set_free (dfa->eclosures + i);
595 if (dfa->inveclosures != NULL)
596 re_node_set_free (dfa->inveclosures + i);
597 if (dfa->edests != NULL)
598 re_node_set_free (dfa->edests + i);
600 re_free (dfa->edests);
601 re_free (dfa->eclosures);
602 re_free (dfa->inveclosures);
603 re_free (dfa->nodes);
605 if (dfa->state_table)
606 for (i = 0; i <= dfa->state_hash_mask; ++i)
608 struct re_state_table_entry *entry = dfa->state_table + i;
609 for (j = 0; j < entry->num; ++j)
611 re_dfastate_t *state = entry->array[j];
614 re_free (entry->array);
616 re_free (dfa->state_table);
617 #ifdef RE_ENABLE_I18N
618 if (dfa->sb_char != utf8_sb_map)
619 re_free (dfa->sb_char);
621 re_free (dfa->subexp_map);
623 re_free (dfa->re_str);
630 /* Free dynamically allocated space used by PREG. */
633 regfree (regex_t *preg)
635 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
636 if (BE (dfa != NULL, 1))
637 free_dfa_content (dfa);
641 re_free (preg->fastmap);
642 preg->fastmap = NULL;
644 re_free (preg->translate);
645 preg->translate = NULL;
648 weak_alias (__regfree, regfree)
651 /* Entry points compatible with 4.2 BSD regex library. We don't define
652 them unless specifically requested. */
654 #if defined _REGEX_RE_COMP || defined _LIBC
656 /* BSD has one and only one pattern buffer. */
657 static struct re_pattern_buffer re_comp_buf;
661 /* Make these definitions weak in libc, so POSIX programs can redefine
662 these names if they don't use our functions, and still use
663 regcomp/regexec above without link errors. */
674 if (!re_comp_buf.buffer)
675 return gettext ("No previous regular expression");
679 if (re_comp_buf.buffer)
681 fastmap = re_comp_buf.fastmap;
682 re_comp_buf.fastmap = NULL;
683 __regfree (&re_comp_buf);
684 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
685 re_comp_buf.fastmap = fastmap;
688 if (re_comp_buf.fastmap == NULL)
690 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
691 if (re_comp_buf.fastmap == NULL)
692 return (char *) gettext (__re_error_msgid
693 + __re_error_msgid_idx[(int) REG_ESPACE]);
696 /* Since `re_exec' always passes NULL for the `regs' argument, we
697 don't need to initialize the pattern buffer fields which affect it. */
699 /* Match anchors at newlines. */
700 re_comp_buf.newline_anchor = 1;
702 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
707 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
708 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
712 libc_freeres_fn (free_mem)
714 __regfree (&re_comp_buf);
718 #endif /* _REGEX_RE_COMP */
720 /* Internal entry point.
721 Compile the regular expression PATTERN, whose length is LENGTH.
722 SYNTAX indicate regular expression's syntax. */
725 re_compile_internal (regex_t *preg, const char * pattern, int length,
728 reg_errcode_t err = REG_NOERROR;
732 /* Initialize the pattern buffer. */
733 preg->fastmap_accurate = 0;
734 preg->syntax = syntax;
735 preg->not_bol = preg->not_eol = 0;
738 preg->can_be_null = 0;
739 preg->regs_allocated = REGS_UNALLOCATED;
741 /* Initialize the dfa. */
742 dfa = (re_dfa_t *) preg->buffer;
743 if (BE (preg->allocated < sizeof (re_dfa_t), 0))
745 /* If zero allocated, but buffer is non-null, try to realloc
746 enough space. This loses if buffer's address is bogus, but
747 that is the user's responsibility. If ->buffer is NULL this
748 is a simple allocation. */
749 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
752 preg->allocated = sizeof (re_dfa_t);
753 preg->buffer = (unsigned char *) dfa;
755 preg->used = sizeof (re_dfa_t);
757 __libc_lock_init (dfa->lock);
759 err = init_dfa (dfa, length);
760 if (BE (err != REG_NOERROR, 0))
762 free_dfa_content (dfa);
768 dfa->re_str = re_malloc (char, length + 1);
769 strncpy (dfa->re_str, pattern, length + 1);
772 err = re_string_construct (®exp, pattern, length, preg->translate,
773 syntax & RE_ICASE, dfa);
774 if (BE (err != REG_NOERROR, 0))
776 re_compile_internal_free_return:
777 free_workarea_compile (preg);
778 re_string_destruct (®exp);
779 free_dfa_content (dfa);
785 /* Parse the regular expression, and build a structure tree. */
787 dfa->str_tree = parse (®exp, preg, syntax, &err);
788 if (BE (dfa->str_tree == NULL, 0))
789 goto re_compile_internal_free_return;
791 /* Analyze the tree and create the nfa. */
792 err = analyze (preg);
793 if (BE (err != REG_NOERROR, 0))
794 goto re_compile_internal_free_return;
796 #ifdef RE_ENABLE_I18N
797 /* If possible, do searching in single byte encoding to speed things up. */
798 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
802 /* Then create the initial state of the dfa. */
803 err = create_initial_state (dfa);
805 /* Release work areas. */
806 free_workarea_compile (preg);
807 re_string_destruct (®exp);
809 if (BE (err != REG_NOERROR, 0))
811 free_dfa_content (dfa);
819 /* Initialize DFA. We use the length of the regular expression PAT_LEN
820 as the initial length of some arrays. */
823 init_dfa (re_dfa_t *dfa, int pat_len)
830 memset (dfa, '\0', sizeof (re_dfa_t));
832 /* Force allocation of str_tree_storage the first time. */
833 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
835 dfa->nodes_alloc = pat_len + 1;
836 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
838 dfa->states_alloc = pat_len + 1;
840 /* table_size = 2 ^ ceil(log pat_len) */
841 for (table_size = 1; table_size > 0; table_size <<= 1)
842 if (table_size > pat_len)
845 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
846 dfa->state_hash_mask = table_size - 1;
848 dfa->mb_cur_max = MB_CUR_MAX;
850 if (dfa->mb_cur_max == 6
851 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
853 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
856 # ifdef HAVE_LANGINFO_CODESET
857 codeset_name = nl_langinfo (CODESET);
859 codeset_name = getenv ("LC_ALL");
860 if (codeset_name == NULL || codeset_name[0] == '\0')
861 codeset_name = getenv ("LC_CTYPE");
862 if (codeset_name == NULL || codeset_name[0] == '\0')
863 codeset_name = getenv ("LANG");
864 if (codeset_name == NULL)
866 else if (strchr (codeset_name, '.') != NULL)
867 codeset_name = strchr (codeset_name, '.') + 1;
870 if (strcasecmp (codeset_name, "UTF-8") == 0
871 || strcasecmp (codeset_name, "UTF8") == 0)
874 /* We check exhaustively in the loop below if this charset is a
875 superset of ASCII. */
876 dfa->map_notascii = 0;
879 #ifdef RE_ENABLE_I18N
880 if (dfa->mb_cur_max > 1)
883 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
888 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset), 1);
889 if (BE (dfa->sb_char == NULL, 0))
892 /* Clear all bits by, then set those corresponding to single
894 bitset_empty (dfa->sb_char);
896 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
897 for (j = 0; j < UINT_BITS; ++j, ++ch)
899 wint_t wch = __btowc (ch);
901 dfa->sb_char[i] |= 1 << j;
903 if (isascii (ch) && wch != ch)
904 dfa->map_notascii = 1;
911 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
916 /* Initialize WORD_CHAR table, which indicate which character is
917 "word". In this case "word" means that it is the word construction
918 character used by some operators like "\<", "\>", etc. */
921 init_word_char (re_dfa_t *dfa)
924 dfa->word_ops_used = 1;
925 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
926 for (j = 0; j < UINT_BITS; ++j, ++ch)
927 if (isalnum (ch) || ch == '_')
928 dfa->word_char[i] |= 1 << j;
931 /* Free the work area which are only used while compiling. */
934 free_workarea_compile (regex_t *preg)
936 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
937 bin_tree_storage_t *storage, *next;
938 for (storage = dfa->str_tree_storage; storage; storage = next)
940 next = storage->next;
943 dfa->str_tree_storage = NULL;
944 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
945 dfa->str_tree = NULL;
946 re_free (dfa->org_indices);
947 dfa->org_indices = NULL;
950 /* Create initial states for all contexts. */
953 create_initial_state (re_dfa_t *dfa)
957 re_node_set init_nodes;
959 /* Initial states have the epsilon closure of the node which is
960 the first node of the regular expression. */
961 first = dfa->str_tree->first->node_idx;
962 dfa->init_node = first;
963 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
964 if (BE (err != REG_NOERROR, 0))
967 /* The back-references which are in initial states can epsilon transit,
968 since in this case all of the subexpressions can be null.
969 Then we add epsilon closures of the nodes which are the next nodes of
970 the back-references. */
971 if (dfa->nbackref > 0)
972 for (i = 0; i < init_nodes.nelem; ++i)
974 int node_idx = init_nodes.elems[i];
975 re_token_type_t type = dfa->nodes[node_idx].type;
978 if (type != OP_BACK_REF)
980 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
982 re_token_t *clexp_node;
983 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
984 if (clexp_node->type == OP_CLOSE_SUBEXP
985 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
988 if (clexp_idx == init_nodes.nelem)
991 if (type == OP_BACK_REF)
993 int dest_idx = dfa->edests[node_idx].elems[0];
994 if (!re_node_set_contains (&init_nodes, dest_idx))
996 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1002 /* It must be the first time to invoke acquire_state. */
1003 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1004 /* We don't check ERR here, since the initial state must not be NULL. */
1005 if (BE (dfa->init_state == NULL, 0))
1007 if (dfa->init_state->has_constraint)
1009 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1011 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1013 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1017 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1018 || dfa->init_state_begbuf == NULL, 0))
1022 dfa->init_state_word = dfa->init_state_nl
1023 = dfa->init_state_begbuf = dfa->init_state;
1025 re_node_set_free (&init_nodes);
1029 #ifdef RE_ENABLE_I18N
1030 /* If it is possible to do searching in single byte encoding instead of UTF-8
1031 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1032 DFA nodes where needed. */
1035 optimize_utf8 (re_dfa_t *dfa)
1037 int node, i, mb_chars = 0, has_period = 0;
1039 for (node = 0; node < dfa->nodes_len; ++node)
1040 switch (dfa->nodes[node].type)
1043 if (dfa->nodes[node].opr.c >= 0x80)
1047 switch (dfa->nodes[node].opr.idx)
1055 /* Word anchors etc. cannot be handled. */
1065 case OP_DUP_ASTERISK:
1066 case OP_OPEN_SUBEXP:
1067 case OP_CLOSE_SUBEXP:
1069 case COMPLEX_BRACKET:
1071 case SIMPLE_BRACKET:
1072 /* Just double check. */
1073 for (i = 0x80 / UINT_BITS; i < BITSET_UINTS; ++i)
1074 if (dfa->nodes[node].opr.sbcset[i])
1081 if (mb_chars || has_period)
1082 for (node = 0; node < dfa->nodes_len; ++node)
1084 if (dfa->nodes[node].type == CHARACTER
1085 && dfa->nodes[node].opr.c >= 0x80)
1086 dfa->nodes[node].mb_partial = 0;
1087 else if (dfa->nodes[node].type == OP_PERIOD)
1088 dfa->nodes[node].type = OP_UTF8_PERIOD;
1091 /* The search can be in single byte locale. */
1092 dfa->mb_cur_max = 1;
1094 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1098 /* Analyze the structure tree, and calculate "first", "next", "edest",
1099 "eclosure", and "inveclosure". */
1101 static reg_errcode_t
1102 analyze (regex_t *preg)
1104 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1107 /* Allocate arrays. */
1108 dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1109 dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1110 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1111 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1112 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1113 || dfa->eclosures == NULL, 0))
1116 dfa->subexp_map = re_malloc (int, preg->re_nsub);
1117 if (dfa->subexp_map != NULL)
1120 for (i = 0; i < preg->re_nsub; i++)
1121 dfa->subexp_map[i] = i;
1122 preorder (dfa->str_tree, optimize_subexps, dfa);
1123 for (i = 0; i < preg->re_nsub; i++)
1124 if (dfa->subexp_map[i] != i)
1126 if (i == preg->re_nsub)
1128 free (dfa->subexp_map);
1129 dfa->subexp_map = NULL;
1133 ret = postorder (dfa->str_tree, lower_subexps, preg);
1134 if (BE (ret != REG_NOERROR, 0))
1136 ret = postorder (dfa->str_tree, calc_first, dfa);
1137 if (BE (ret != REG_NOERROR, 0))
1139 preorder (dfa->str_tree, calc_next, dfa);
1140 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1141 if (BE (ret != REG_NOERROR, 0))
1143 ret = calc_eclosure (dfa);
1144 if (BE (ret != REG_NOERROR, 0))
1147 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1148 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1149 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1152 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1153 if (BE (dfa->inveclosures == NULL, 0))
1155 ret = calc_inveclosure (dfa);
1161 /* Our parse trees are very unbalanced, so we cannot use a stack to
1162 implement parse tree visits. Instead, we use parent pointers and
1163 some hairy code in these two functions. */
1164 static reg_errcode_t
1165 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1168 bin_tree_t *node, *prev;
1170 for (node = root; ; )
1172 /* Descend down the tree, preferably to the left (or to the right
1173 if that's the only child). */
1174 while (node->left || node->right)
1182 reg_errcode_t err = fn (extra, node);
1183 if (BE (err != REG_NOERROR, 0))
1185 if (node->parent == NULL)
1188 node = node->parent;
1190 /* Go up while we have a node that is reached from the right. */
1191 while (node->right == prev || node->right == NULL);
1196 static reg_errcode_t
1197 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1202 for (node = root; ; )
1204 reg_errcode_t err = fn (extra, node);
1205 if (BE (err != REG_NOERROR, 0))
1208 /* Go to the left node, or up and to the right. */
1213 bin_tree_t *prev = NULL;
1214 while (node->right == prev || node->right == NULL)
1217 node = node->parent;
1226 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1227 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1228 backreferences as well. Requires a preorder visit. */
1229 static reg_errcode_t
1230 optimize_subexps (void *extra, bin_tree_t *node)
1232 re_dfa_t *dfa = (re_dfa_t *) extra;
1234 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1236 int idx = node->token.opr.idx;
1237 node->token.opr.idx = dfa->subexp_map[idx];
1238 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1241 else if (node->token.type == SUBEXP
1242 && node->left && node->left->token.type == SUBEXP)
1244 int other_idx = node->left->token.opr.idx;
1246 node->left = node->left->left;
1248 node->left->parent = node;
1250 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1251 if (other_idx < 8 * sizeof (dfa->used_bkref_map))
1252 dfa->used_bkref_map &= ~(1 << other_idx);
1258 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1259 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1260 static reg_errcode_t
1261 lower_subexps (void *extra, bin_tree_t *node)
1263 regex_t *preg = (regex_t *) extra;
1264 reg_errcode_t err = REG_NOERROR;
1266 if (node->left && node->left->token.type == SUBEXP)
1268 node->left = lower_subexp (&err, preg, node->left);
1270 node->left->parent = node;
1272 if (node->right && node->right->token.type == SUBEXP)
1274 node->right = lower_subexp (&err, preg, node->right);
1276 node->right->parent = node;
1283 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1285 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1286 bin_tree_t *body = node->left;
1287 bin_tree_t *op, *cls, *tree1, *tree;
1290 /* We do not optimize empty subexpressions, because otherwise we may
1291 have bad CONCAT nodes with NULL children. This is obviously not
1292 very common, so we do not lose much. An example that triggers
1293 this case is the sed "script" /\(\)/x. */
1294 && node->left != NULL
1295 && (node->token.opr.idx >= 8 * sizeof (dfa->used_bkref_map)
1296 || !(dfa->used_bkref_map & (1 << node->token.opr.idx))))
1299 /* Convert the SUBEXP node to the concatenation of an
1300 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1301 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1302 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1303 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1304 tree = create_tree (dfa, op, tree1, CONCAT);
1305 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1311 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1312 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1316 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1317 nodes. Requires a postorder visit. */
1318 static reg_errcode_t
1319 calc_first (void *extra, bin_tree_t *node)
1321 re_dfa_t *dfa = (re_dfa_t *) extra;
1322 if (node->token.type == CONCAT)
1324 node->first = node->left->first;
1325 node->node_idx = node->left->node_idx;
1330 node->node_idx = re_dfa_add_node (dfa, node->token);
1331 if (BE (node->node_idx == -1, 0))
1337 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1338 static reg_errcode_t
1339 calc_next (void *extra, bin_tree_t *node)
1341 switch (node->token.type)
1343 case OP_DUP_ASTERISK:
1344 node->left->next = node;
1347 node->left->next = node->right->first;
1348 node->right->next = node->next;
1352 node->left->next = node->next;
1354 node->right->next = node->next;
1360 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1361 static reg_errcode_t
1362 link_nfa_nodes (void *extra, bin_tree_t *node)
1364 re_dfa_t *dfa = (re_dfa_t *) extra;
1365 int idx = node->node_idx;
1366 reg_errcode_t err = REG_NOERROR;
1368 switch (node->token.type)
1374 assert (node->next == NULL);
1377 case OP_DUP_ASTERISK:
1381 dfa->has_plural_match = 1;
1382 if (node->left != NULL)
1383 left = node->left->first->node_idx;
1385 left = node->next->node_idx;
1386 if (node->right != NULL)
1387 right = node->right->first->node_idx;
1389 right = node->next->node_idx;
1391 assert (right > -1);
1392 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1397 case OP_OPEN_SUBEXP:
1398 case OP_CLOSE_SUBEXP:
1399 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1403 dfa->nexts[idx] = node->next->node_idx;
1404 if (node->token.type == OP_BACK_REF)
1405 re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1409 assert (!IS_EPSILON_NODE (node->token.type));
1410 dfa->nexts[idx] = node->next->node_idx;
1417 /* Duplicate the epsilon closure of the node ROOT_NODE.
1418 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1419 to their own constraint. */
1421 static reg_errcode_t
1422 duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node,
1423 int root_node, unsigned int init_constraint)
1425 int org_node, clone_node, ret;
1426 unsigned int constraint = init_constraint;
1427 for (org_node = top_org_node, clone_node = top_clone_node;;)
1429 int org_dest, clone_dest;
1430 if (dfa->nodes[org_node].type == OP_BACK_REF)
1432 /* If the back reference epsilon-transit, its destination must
1433 also have the constraint. Then duplicate the epsilon closure
1434 of the destination of the back reference, and store it in
1435 edests of the back reference. */
1436 org_dest = dfa->nexts[org_node];
1437 re_node_set_empty (dfa->edests + clone_node);
1438 clone_dest = duplicate_node (dfa, org_dest, constraint);
1439 if (BE (clone_dest == -1, 0))
1441 dfa->nexts[clone_node] = dfa->nexts[org_node];
1442 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1443 if (BE (ret < 0, 0))
1446 else if (dfa->edests[org_node].nelem == 0)
1448 /* In case of the node can't epsilon-transit, don't duplicate the
1449 destination and store the original destination as the
1450 destination of the node. */
1451 dfa->nexts[clone_node] = dfa->nexts[org_node];
1454 else if (dfa->edests[org_node].nelem == 1)
1456 /* In case of the node can epsilon-transit, and it has only one
1458 org_dest = dfa->edests[org_node].elems[0];
1459 re_node_set_empty (dfa->edests + clone_node);
1460 if (dfa->nodes[org_node].type == ANCHOR)
1462 /* In case of the node has another constraint, append it. */
1463 if (org_node == root_node && clone_node != org_node)
1465 /* ...but if the node is root_node itself, it means the
1466 epsilon closure have a loop, then tie it to the
1467 destination of the root_node. */
1468 ret = re_node_set_insert (dfa->edests + clone_node,
1470 if (BE (ret < 0, 0))
1474 constraint |= dfa->nodes[org_node].opr.ctx_type;
1476 clone_dest = duplicate_node (dfa, org_dest, constraint);
1477 if (BE (clone_dest == -1, 0))
1479 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1480 if (BE (ret < 0, 0))
1483 else /* dfa->edests[org_node].nelem == 2 */
1485 /* In case of the node can epsilon-transit, and it has two
1486 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1487 org_dest = dfa->edests[org_node].elems[0];
1488 re_node_set_empty (dfa->edests + clone_node);
1489 /* Search for a duplicated node which satisfies the constraint. */
1490 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1491 if (clone_dest == -1)
1493 /* There are no such a duplicated node, create a new one. */
1495 clone_dest = duplicate_node (dfa, org_dest, constraint);
1496 if (BE (clone_dest == -1, 0))
1498 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1499 if (BE (ret < 0, 0))
1501 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1502 root_node, constraint);
1503 if (BE (err != REG_NOERROR, 0))
1508 /* There are a duplicated node which satisfy the constraint,
1509 use it to avoid infinite loop. */
1510 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1511 if (BE (ret < 0, 0))
1515 org_dest = dfa->edests[org_node].elems[1];
1516 clone_dest = duplicate_node (dfa, org_dest, constraint);
1517 if (BE (clone_dest == -1, 0))
1519 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1520 if (BE (ret < 0, 0))
1523 org_node = org_dest;
1524 clone_node = clone_dest;
1529 /* Search for a node which is duplicated from the node ORG_NODE, and
1530 satisfies the constraint CONSTRAINT. */
1533 search_duplicated_node (re_dfa_t *dfa, int org_node, unsigned int constraint)
1536 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1538 if (org_node == dfa->org_indices[idx]
1539 && constraint == dfa->nodes[idx].constraint)
1540 return idx; /* Found. */
1542 return -1; /* Not found. */
1545 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1546 Return the index of the new node, or -1 if insufficient storage is
1550 duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint)
1552 int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1553 if (BE (dup_idx != -1, 1))
1555 dfa->nodes[dup_idx].constraint = constraint;
1556 if (dfa->nodes[org_idx].type == ANCHOR)
1557 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
1558 dfa->nodes[dup_idx].duplicated = 1;
1560 /* Store the index of the original node. */
1561 dfa->org_indices[dup_idx] = org_idx;
1566 static reg_errcode_t
1567 calc_inveclosure (re_dfa_t *dfa)
1570 for (idx = 0; idx < dfa->nodes_len; ++idx)
1571 re_node_set_init_empty (dfa->inveclosures + idx);
1573 for (src = 0; src < dfa->nodes_len; ++src)
1575 int *elems = dfa->eclosures[src].elems;
1576 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1578 ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1579 if (BE (ret == -1, 0))
1587 /* Calculate "eclosure" for all the node in DFA. */
1589 static reg_errcode_t
1590 calc_eclosure (re_dfa_t *dfa)
1592 int node_idx, incomplete;
1594 assert (dfa->nodes_len > 0);
1597 /* For each nodes, calculate epsilon closure. */
1598 for (node_idx = 0; ; ++node_idx)
1601 re_node_set eclosure_elem;
1602 if (node_idx == dfa->nodes_len)
1611 assert (dfa->eclosures[node_idx].nelem != -1);
1614 /* If we have already calculated, skip it. */
1615 if (dfa->eclosures[node_idx].nelem != 0)
1617 /* Calculate epsilon closure of `node_idx'. */
1618 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1619 if (BE (err != REG_NOERROR, 0))
1622 if (dfa->eclosures[node_idx].nelem == 0)
1625 re_node_set_free (&eclosure_elem);
1631 /* Calculate epsilon closure of NODE. */
1633 static reg_errcode_t
1634 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root)
1637 unsigned int constraint;
1639 re_node_set eclosure;
1641 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1642 if (BE (err != REG_NOERROR, 0))
1645 /* This indicates that we are calculating this node now.
1646 We reference this value to avoid infinite loop. */
1647 dfa->eclosures[node].nelem = -1;
1649 constraint = ((dfa->nodes[node].type == ANCHOR)
1650 ? dfa->nodes[node].opr.ctx_type : 0);
1651 /* If the current node has constraints, duplicate all nodes.
1652 Since they must inherit the constraints. */
1654 && dfa->edests[node].nelem
1655 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1657 int org_node, cur_node;
1658 org_node = cur_node = node;
1659 err = duplicate_node_closure (dfa, node, node, node, constraint);
1660 if (BE (err != REG_NOERROR, 0))
1664 /* Expand each epsilon destination nodes. */
1665 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1666 for (i = 0; i < dfa->edests[node].nelem; ++i)
1668 re_node_set eclosure_elem;
1669 int edest = dfa->edests[node].elems[i];
1670 /* If calculating the epsilon closure of `edest' is in progress,
1671 return intermediate result. */
1672 if (dfa->eclosures[edest].nelem == -1)
1677 /* If we haven't calculated the epsilon closure of `edest' yet,
1678 calculate now. Otherwise use calculated epsilon closure. */
1679 if (dfa->eclosures[edest].nelem == 0)
1681 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1682 if (BE (err != REG_NOERROR, 0))
1686 eclosure_elem = dfa->eclosures[edest];
1687 /* Merge the epsilon closure of `edest'. */
1688 re_node_set_merge (&eclosure, &eclosure_elem);
1689 /* If the epsilon closure of `edest' is incomplete,
1690 the epsilon closure of this node is also incomplete. */
1691 if (dfa->eclosures[edest].nelem == 0)
1694 re_node_set_free (&eclosure_elem);
1698 /* Epsilon closures include itself. */
1699 re_node_set_insert (&eclosure, node);
1700 if (incomplete && !root)
1701 dfa->eclosures[node].nelem = 0;
1703 dfa->eclosures[node] = eclosure;
1704 *new_set = eclosure;
1708 /* Functions for token which are used in the parser. */
1710 /* Fetch a token from INPUT.
1711 We must not use this function inside bracket expressions. */
1714 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1716 re_string_skip_bytes (input, peek_token (result, input, syntax));
1719 /* Peek a token from INPUT, and return the length of the token.
1720 We must not use this function inside bracket expressions. */
1723 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1727 if (re_string_eoi (input))
1729 token->type = END_OF_RE;
1733 c = re_string_peek_byte (input, 0);
1736 token->word_char = 0;
1737 #ifdef RE_ENABLE_I18N
1738 token->mb_partial = 0;
1739 if (input->mb_cur_max > 1 &&
1740 !re_string_first_byte (input, re_string_cur_idx (input)))
1742 token->type = CHARACTER;
1743 token->mb_partial = 1;
1750 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1752 token->type = BACK_SLASH;
1756 c2 = re_string_peek_byte_case (input, 1);
1758 token->type = CHARACTER;
1759 #ifdef RE_ENABLE_I18N
1760 if (input->mb_cur_max > 1)
1762 wint_t wc = re_string_wchar_at (input,
1763 re_string_cur_idx (input) + 1);
1764 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1768 token->word_char = IS_WORD_CHAR (c2) != 0;
1773 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1774 token->type = OP_ALT;
1776 case '1': case '2': case '3': case '4': case '5':
1777 case '6': case '7': case '8': case '9':
1778 if (!(syntax & RE_NO_BK_REFS))
1780 token->type = OP_BACK_REF;
1781 token->opr.idx = c2 - '1';
1785 if (!(syntax & RE_NO_GNU_OPS))
1787 token->type = ANCHOR;
1788 token->opr.ctx_type = WORD_FIRST;
1792 if (!(syntax & RE_NO_GNU_OPS))
1794 token->type = ANCHOR;
1795 token->opr.ctx_type = WORD_LAST;
1799 if (!(syntax & RE_NO_GNU_OPS))
1801 token->type = ANCHOR;
1802 token->opr.ctx_type = WORD_DELIM;
1806 if (!(syntax & RE_NO_GNU_OPS))
1808 token->type = ANCHOR;
1809 token->opr.ctx_type = NOT_WORD_DELIM;
1813 if (!(syntax & RE_NO_GNU_OPS))
1814 token->type = OP_WORD;
1817 if (!(syntax & RE_NO_GNU_OPS))
1818 token->type = OP_NOTWORD;
1821 if (!(syntax & RE_NO_GNU_OPS))
1822 token->type = OP_SPACE;
1825 if (!(syntax & RE_NO_GNU_OPS))
1826 token->type = OP_NOTSPACE;
1829 if (!(syntax & RE_NO_GNU_OPS))
1831 token->type = ANCHOR;
1832 token->opr.ctx_type = BUF_FIRST;
1836 if (!(syntax & RE_NO_GNU_OPS))
1838 token->type = ANCHOR;
1839 token->opr.ctx_type = BUF_LAST;
1843 if (!(syntax & RE_NO_BK_PARENS))
1844 token->type = OP_OPEN_SUBEXP;
1847 if (!(syntax & RE_NO_BK_PARENS))
1848 token->type = OP_CLOSE_SUBEXP;
1851 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1852 token->type = OP_DUP_PLUS;
1855 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1856 token->type = OP_DUP_QUESTION;
1859 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1860 token->type = OP_OPEN_DUP_NUM;
1863 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1864 token->type = OP_CLOSE_DUP_NUM;
1872 token->type = CHARACTER;
1873 #ifdef RE_ENABLE_I18N
1874 if (input->mb_cur_max > 1)
1876 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1877 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1881 token->word_char = IS_WORD_CHAR (token->opr.c);
1886 if (syntax & RE_NEWLINE_ALT)
1887 token->type = OP_ALT;
1890 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1891 token->type = OP_ALT;
1894 token->type = OP_DUP_ASTERISK;
1897 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1898 token->type = OP_DUP_PLUS;
1901 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1902 token->type = OP_DUP_QUESTION;
1905 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1906 token->type = OP_OPEN_DUP_NUM;
1909 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1910 token->type = OP_CLOSE_DUP_NUM;
1913 if (syntax & RE_NO_BK_PARENS)
1914 token->type = OP_OPEN_SUBEXP;
1917 if (syntax & RE_NO_BK_PARENS)
1918 token->type = OP_CLOSE_SUBEXP;
1921 token->type = OP_OPEN_BRACKET;
1924 token->type = OP_PERIOD;
1927 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1928 re_string_cur_idx (input) != 0)
1930 char prev = re_string_peek_byte (input, -1);
1931 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1934 token->type = ANCHOR;
1935 token->opr.ctx_type = LINE_FIRST;
1938 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1939 re_string_cur_idx (input) + 1 != re_string_length (input))
1942 re_string_skip_bytes (input, 1);
1943 peek_token (&next, input, syntax);
1944 re_string_skip_bytes (input, -1);
1945 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1948 token->type = ANCHOR;
1949 token->opr.ctx_type = LINE_LAST;
1957 /* Peek a token from INPUT, and return the length of the token.
1958 We must not use this function out of bracket expressions. */
1961 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1964 if (re_string_eoi (input))
1966 token->type = END_OF_RE;
1969 c = re_string_peek_byte (input, 0);
1972 #ifdef RE_ENABLE_I18N
1973 if (input->mb_cur_max > 1 &&
1974 !re_string_first_byte (input, re_string_cur_idx (input)))
1976 token->type = CHARACTER;
1979 #endif /* RE_ENABLE_I18N */
1981 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
1982 && re_string_cur_idx (input) + 1 < re_string_length (input))
1984 /* In this case, '\' escape a character. */
1986 re_string_skip_bytes (input, 1);
1987 c2 = re_string_peek_byte (input, 0);
1989 token->type = CHARACTER;
1992 if (c == '[') /* '[' is a special char in a bracket exps. */
1996 if (re_string_cur_idx (input) + 1 < re_string_length (input))
1997 c2 = re_string_peek_byte (input, 1);
2005 token->type = OP_OPEN_COLL_ELEM;
2008 token->type = OP_OPEN_EQUIV_CLASS;
2011 if (syntax & RE_CHAR_CLASSES)
2013 token->type = OP_OPEN_CHAR_CLASS;
2016 /* else fall through. */
2018 token->type = CHARACTER;
2028 token->type = OP_CHARSET_RANGE;
2031 token->type = OP_CLOSE_BRACKET;
2034 token->type = OP_NON_MATCH_LIST;
2037 token->type = CHARACTER;
2042 /* Functions for parser. */
2044 /* Entry point of the parser.
2045 Parse the regular expression REGEXP and return the structure tree.
2046 If an error is occured, ERR is set by error code, and return NULL.
2047 This function build the following tree, from regular expression <reg_exp>:
2053 CAT means concatenation.
2054 EOR means end of regular expression. */
2057 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2060 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2061 bin_tree_t *tree, *eor, *root;
2062 re_token_t current_token;
2063 dfa->syntax = syntax;
2064 fetch_token (¤t_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2065 tree = parse_reg_exp (regexp, preg, ¤t_token, syntax, 0, err);
2066 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2068 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2070 root = create_tree (dfa, tree, eor, CONCAT);
2073 if (BE (eor == NULL || root == NULL, 0))
2081 /* This function build the following tree, from regular expression
2082 <branch1>|<branch2>:
2088 ALT means alternative, which represents the operator `|'. */
2091 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2092 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2094 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2095 bin_tree_t *tree, *branch = NULL;
2096 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2097 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2100 while (token->type == OP_ALT)
2102 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2103 if (token->type != OP_ALT && token->type != END_OF_RE
2104 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2106 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2107 if (BE (*err != REG_NOERROR && branch == NULL, 0))
2112 tree = create_tree (dfa, tree, branch, OP_ALT);
2113 if (BE (tree == NULL, 0))
2122 /* This function build the following tree, from regular expression
2129 CAT means concatenation. */
2132 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2133 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2135 bin_tree_t *tree, *exp;
2136 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2137 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2138 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2141 while (token->type != OP_ALT && token->type != END_OF_RE
2142 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2144 exp = parse_expression (regexp, preg, token, syntax, nest, err);
2145 if (BE (*err != REG_NOERROR && exp == NULL, 0))
2149 if (tree != NULL && exp != NULL)
2151 tree = create_tree (dfa, tree, exp, CONCAT);
2158 else if (tree == NULL)
2160 /* Otherwise exp == NULL, we don't need to create new tree. */
2165 /* This function build the following tree, from regular expression a*:
2172 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2173 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2175 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2177 switch (token->type)
2180 tree = create_token_tree (dfa, NULL, NULL, token);
2181 if (BE (tree == NULL, 0))
2186 #ifdef RE_ENABLE_I18N
2187 if (dfa->mb_cur_max > 1)
2189 while (!re_string_eoi (regexp)
2190 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2192 bin_tree_t *mbc_remain;
2193 fetch_token (token, regexp, syntax);
2194 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2195 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2196 if (BE (mbc_remain == NULL || tree == NULL, 0))
2205 case OP_OPEN_SUBEXP:
2206 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2207 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2210 case OP_OPEN_BRACKET:
2211 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2212 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2216 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2221 dfa->used_bkref_map |= 1 << token->opr.idx;
2222 tree = create_token_tree (dfa, NULL, NULL, token);
2223 if (BE (tree == NULL, 0))
2229 dfa->has_mb_node = 1;
2231 case OP_OPEN_DUP_NUM:
2232 if (syntax & RE_CONTEXT_INVALID_DUP)
2238 case OP_DUP_ASTERISK:
2240 case OP_DUP_QUESTION:
2241 if (syntax & RE_CONTEXT_INVALID_OPS)
2246 else if (syntax & RE_CONTEXT_INDEP_OPS)
2248 fetch_token (token, regexp, syntax);
2249 return parse_expression (regexp, preg, token, syntax, nest, err);
2251 /* else fall through */
2252 case OP_CLOSE_SUBEXP:
2253 if ((token->type == OP_CLOSE_SUBEXP) &&
2254 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2259 /* else fall through */
2260 case OP_CLOSE_DUP_NUM:
2261 /* We treat it as a normal character. */
2263 /* Then we can these characters as normal characters. */
2264 token->type = CHARACTER;
2265 /* mb_partial and word_char bits should be initialized already
2267 tree = create_token_tree (dfa, NULL, NULL, token);
2268 if (BE (tree == NULL, 0))
2275 if ((token->opr.ctx_type
2276 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2277 && dfa->word_ops_used == 0)
2278 init_word_char (dfa);
2279 if (token->opr.ctx_type == WORD_DELIM
2280 || token->opr.ctx_type == NOT_WORD_DELIM)
2282 bin_tree_t *tree_first, *tree_last;
2283 if (token->opr.ctx_type == WORD_DELIM)
2285 token->opr.ctx_type = WORD_FIRST;
2286 tree_first = create_token_tree (dfa, NULL, NULL, token);
2287 token->opr.ctx_type = WORD_LAST;
2291 token->opr.ctx_type = INSIDE_WORD;
2292 tree_first = create_token_tree (dfa, NULL, NULL, token);
2293 token->opr.ctx_type = INSIDE_NOTWORD;
2295 tree_last = create_token_tree (dfa, NULL, NULL, token);
2296 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2297 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2305 tree = create_token_tree (dfa, NULL, NULL, token);
2306 if (BE (tree == NULL, 0))
2312 /* We must return here, since ANCHORs can't be followed
2313 by repetition operators.
2314 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2315 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2316 fetch_token (token, regexp, syntax);
2319 tree = create_token_tree (dfa, NULL, NULL, token);
2320 if (BE (tree == NULL, 0))
2325 if (dfa->mb_cur_max > 1)
2326 dfa->has_mb_node = 1;
2330 tree = build_charclass_op (dfa, regexp->trans,
2331 (const unsigned char *) "alnum",
2332 (const unsigned char *) "_",
2333 token->type == OP_NOTWORD, err);
2334 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2339 tree = build_charclass_op (dfa, regexp->trans,
2340 (const unsigned char *) "space",
2341 (const unsigned char *) "",
2342 token->type == OP_NOTSPACE, err);
2343 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2353 /* Must not happen? */
2359 fetch_token (token, regexp, syntax);
2361 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2362 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2364 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2365 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2367 /* In BRE consecutive duplications are not allowed. */
2368 if ((syntax & RE_CONTEXT_INVALID_DUP)
2369 && (token->type == OP_DUP_ASTERISK
2370 || token->type == OP_OPEN_DUP_NUM))
2380 /* This function build the following tree, from regular expression
2388 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2389 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2391 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2394 cur_nsub = preg->re_nsub++;
2396 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2398 /* The subexpression may be a null string. */
2399 if (token->type == OP_CLOSE_SUBEXP)
2403 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2404 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2406 if (BE (*err != REG_NOERROR, 0))
2409 dfa->completed_bkref_map |= 1 << cur_nsub;
2411 tree = create_tree (dfa, tree, NULL, SUBEXP);
2412 if (BE (tree == NULL, 0))
2417 tree->token.opr.idx = cur_nsub;
2421 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2424 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2425 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2427 bin_tree_t *tree = NULL, *old_tree = NULL;
2428 int i, start, end, start_idx = re_string_cur_idx (regexp);
2429 re_token_t start_token = *token;
2431 if (token->type == OP_OPEN_DUP_NUM)
2434 start = fetch_number (regexp, token, syntax);
2437 if (token->type == CHARACTER && token->opr.c == ',')
2438 start = 0; /* We treat "{,m}" as "{0,m}". */
2441 *err = REG_BADBR; /* <re>{} is invalid. */
2445 if (BE (start != -2, 1))
2447 /* We treat "{n}" as "{n,n}". */
2448 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2449 : ((token->type == CHARACTER && token->opr.c == ',')
2450 ? fetch_number (regexp, token, syntax) : -2));
2452 if (BE (start == -2 || end == -2, 0))
2454 /* Invalid sequence. */
2455 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2457 if (token->type == END_OF_RE)
2465 /* If the syntax bit is set, rollback. */
2466 re_string_set_index (regexp, start_idx);
2467 *token = start_token;
2468 token->type = CHARACTER;
2469 /* mb_partial and word_char bits should be already initialized by
2474 if (BE (end != -1 && start > end, 0))
2476 /* First number greater than second. */
2483 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2484 end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2487 fetch_token (token, regexp, syntax);
2489 if (BE (elem == NULL, 0))
2491 if (BE (start == 0 && end == 0, 0))
2493 postorder (elem, free_tree, NULL);
2497 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2498 if (BE (start > 0, 0))
2501 for (i = 2; i <= start; ++i)
2503 elem = duplicate_tree (elem, dfa);
2504 tree = create_tree (dfa, tree, elem, CONCAT);
2505 if (BE (elem == NULL || tree == NULL, 0))
2506 goto parse_dup_op_espace;
2512 /* Duplicate ELEM before it is marked optional. */
2513 elem = duplicate_tree (elem, dfa);
2519 if (elem->token.type == SUBEXP)
2520 postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2522 tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2523 if (BE (tree == NULL, 0))
2524 goto parse_dup_op_espace;
2526 /* This loop is actually executed only when end != -1,
2527 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2528 already created the start+1-th copy. */
2529 for (i = start + 2; i <= end; ++i)
2531 elem = duplicate_tree (elem, dfa);
2532 tree = create_tree (dfa, tree, elem, CONCAT);
2533 if (BE (elem == NULL || tree == NULL, 0))
2534 goto parse_dup_op_espace;
2536 tree = create_tree (dfa, tree, NULL, OP_ALT);
2537 if (BE (tree == NULL, 0))
2538 goto parse_dup_op_espace;
2542 tree = create_tree (dfa, old_tree, tree, CONCAT);
2546 parse_dup_op_espace:
2551 /* Size of the names for collating symbol/equivalence_class/character_class.
2552 I'm not sure, but maybe enough. */
2553 #define BRACKET_NAME_BUF_SIZE 32
2556 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2557 Build the range expression which starts from START_ELEM, and ends
2558 at END_ELEM. The result are written to MBCSET and SBCSET.
2559 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2560 mbcset->range_ends, is a pointer argument sinse we may
2563 static reg_errcode_t
2564 build_range_exp (re_bitset_ptr_t sbcset,
2565 # ifdef RE_ENABLE_I18N
2566 re_charset_t *mbcset, int *range_alloc,
2568 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2570 unsigned int start_ch, end_ch;
2571 /* Equivalence Classes and Character Classes can't be a range start/end. */
2572 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2573 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2577 /* We can handle no multi character collating elements without libc
2579 if (BE ((start_elem->type == COLL_SYM
2580 && strlen ((char *) start_elem->opr.name) > 1)
2581 || (end_elem->type == COLL_SYM
2582 && strlen ((char *) end_elem->opr.name) > 1), 0))
2583 return REG_ECOLLATE;
2585 # ifdef RE_ENABLE_I18N
2588 wint_t start_wc, end_wc;
2589 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2591 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2592 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2594 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2595 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2597 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2598 ? __btowc (start_ch) : start_elem->opr.wch);
2599 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2600 ? __btowc (end_ch) : end_elem->opr.wch);
2601 if (start_wc == WEOF || end_wc == WEOF)
2602 return REG_ECOLLATE;
2603 cmp_buf[0] = start_wc;
2604 cmp_buf[4] = end_wc;
2605 if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2608 /* Got valid collation sequence values, add them as a new entry.
2609 However, for !_LIBC we have no collation elements: if the
2610 character set is single byte, the single byte character set
2611 that we build below suffices. parse_bracket_exp passes
2612 no MBCSET if dfa->mb_cur_max == 1. */
2615 /* Check the space of the arrays. */
2616 if (BE (*range_alloc == mbcset->nranges, 0))
2618 /* There is not enough space, need realloc. */
2619 wchar_t *new_array_start, *new_array_end;
2622 /* +1 in case of mbcset->nranges is 0. */
2623 new_nranges = 2 * mbcset->nranges + 1;
2624 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2625 are NULL if *range_alloc == 0. */
2626 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2628 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2631 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2634 mbcset->range_starts = new_array_start;
2635 mbcset->range_ends = new_array_end;
2636 *range_alloc = new_nranges;
2639 mbcset->range_starts[mbcset->nranges] = start_wc;
2640 mbcset->range_ends[mbcset->nranges++] = end_wc;
2643 /* Build the table for single byte characters. */
2644 for (wc = 0; wc < SBC_MAX; ++wc)
2647 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2648 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2649 bitset_set (sbcset, wc);
2652 # else /* not RE_ENABLE_I18N */
2655 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2656 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2658 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2659 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2661 if (start_ch > end_ch)
2663 /* Build the table for single byte characters. */
2664 for (ch = 0; ch < SBC_MAX; ++ch)
2665 if (start_ch <= ch && ch <= end_ch)
2666 bitset_set (sbcset, ch);
2668 # endif /* not RE_ENABLE_I18N */
2671 #endif /* not _LIBC */
2674 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2675 Build the collating element which is represented by NAME.
2676 The result are written to MBCSET and SBCSET.
2677 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2678 pointer argument since we may update it. */
2680 static reg_errcode_t
2681 build_collating_symbol (re_bitset_ptr_t sbcset,
2682 # ifdef RE_ENABLE_I18N
2683 re_charset_t *mbcset, int *coll_sym_alloc,
2685 const unsigned char *name)
2687 size_t name_len = strlen ((const char *) name);
2688 if (BE (name_len != 1, 0))
2689 return REG_ECOLLATE;
2692 bitset_set (sbcset, name[0]);
2696 #endif /* not _LIBC */
2698 /* This function parse bracket expression like "[abc]", "[a-c]",
2702 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2703 reg_syntax_t syntax, reg_errcode_t *err)
2706 const unsigned char *collseqmb;
2707 const char *collseqwc;
2710 const int32_t *symb_table;
2711 const unsigned char *extra;
2713 /* Local function for parse_bracket_exp used in _LIBC environement.
2714 Seek the collating symbol entry correspondings to NAME.
2715 Return the index of the symbol in the SYMB_TABLE. */
2718 __attribute ((always_inline))
2719 seek_collating_symbol_entry (name, name_len)
2720 const unsigned char *name;
2723 int32_t hash = elem_hash ((const char *) name, name_len);
2724 int32_t elem = hash % table_size;
2725 int32_t second = hash % (table_size - 2);
2726 while (symb_table[2 * elem] != 0)
2728 /* First compare the hashing value. */
2729 if (symb_table[2 * elem] == hash
2730 /* Compare the length of the name. */
2731 && name_len == extra[symb_table[2 * elem + 1]]
2732 /* Compare the name. */
2733 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2736 /* Yep, this is the entry. */
2746 /* Local function for parse_bracket_exp used in _LIBC environement.
2747 Look up the collation sequence value of BR_ELEM.
2748 Return the value if succeeded, UINT_MAX otherwise. */
2750 auto inline unsigned int
2751 __attribute ((always_inline))
2752 lookup_collation_sequence_value (br_elem)
2753 bracket_elem_t *br_elem;
2755 if (br_elem->type == SB_CHAR)
2758 if (MB_CUR_MAX == 1)
2761 return collseqmb[br_elem->opr.ch];
2764 wint_t wc = __btowc (br_elem->opr.ch);
2765 return __collseq_table_lookup (collseqwc, wc);
2768 else if (br_elem->type == MB_CHAR)
2770 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2772 else if (br_elem->type == COLL_SYM)
2774 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2778 elem = seek_collating_symbol_entry (br_elem->opr.name,
2780 if (symb_table[2 * elem] != 0)
2782 /* We found the entry. */
2783 idx = symb_table[2 * elem + 1];
2784 /* Skip the name of collating element name. */
2785 idx += 1 + extra[idx];
2786 /* Skip the byte sequence of the collating element. */
2787 idx += 1 + extra[idx];
2788 /* Adjust for the alignment. */
2789 idx = (idx + 3) & ~3;
2790 /* Skip the multibyte collation sequence value. */
2791 idx += sizeof (unsigned int);
2792 /* Skip the wide char sequence of the collating element. */
2793 idx += sizeof (unsigned int) *
2794 (1 + *(unsigned int *) (extra + idx));
2795 /* Return the collation sequence value. */
2796 return *(unsigned int *) (extra + idx);
2798 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2800 /* No valid character. Match it as a single byte
2802 return collseqmb[br_elem->opr.name[0]];
2805 else if (sym_name_len == 1)
2806 return collseqmb[br_elem->opr.name[0]];
2811 /* Local function for parse_bracket_exp used in _LIBC environement.
2812 Build the range expression which starts from START_ELEM, and ends
2813 at END_ELEM. The result are written to MBCSET and SBCSET.
2814 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2815 mbcset->range_ends, is a pointer argument sinse we may
2818 auto inline reg_errcode_t
2819 __attribute ((always_inline))
2820 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2821 re_charset_t *mbcset;
2823 re_bitset_ptr_t sbcset;
2824 bracket_elem_t *start_elem, *end_elem;
2827 uint32_t start_collseq;
2828 uint32_t end_collseq;
2830 /* Equivalence Classes and Character Classes can't be a range
2832 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2833 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2837 start_collseq = lookup_collation_sequence_value (start_elem);
2838 end_collseq = lookup_collation_sequence_value (end_elem);
2839 /* Check start/end collation sequence values. */
2840 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2841 return REG_ECOLLATE;
2842 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2845 /* Got valid collation sequence values, add them as a new entry.
2846 However, if we have no collation elements, and the character set
2847 is single byte, the single byte character set that we
2848 build below suffices. */
2849 if (nrules > 0 || dfa->mb_cur_max > 1)
2851 /* Check the space of the arrays. */
2852 if (BE (*range_alloc == mbcset->nranges, 0))
2854 /* There is not enough space, need realloc. */
2855 uint32_t *new_array_start;
2856 uint32_t *new_array_end;
2859 /* +1 in case of mbcset->nranges is 0. */
2860 new_nranges = 2 * mbcset->nranges + 1;
2861 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2863 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2866 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2869 mbcset->range_starts = new_array_start;
2870 mbcset->range_ends = new_array_end;
2871 *range_alloc = new_nranges;
2874 mbcset->range_starts[mbcset->nranges] = start_collseq;
2875 mbcset->range_ends[mbcset->nranges++] = end_collseq;
2878 /* Build the table for single byte characters. */
2879 for (ch = 0; ch < SBC_MAX; ch++)
2881 uint32_t ch_collseq;
2883 if (MB_CUR_MAX == 1)
2886 ch_collseq = collseqmb[ch];
2888 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2889 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2890 bitset_set (sbcset, ch);
2895 /* Local function for parse_bracket_exp used in _LIBC environement.
2896 Build the collating element which is represented by NAME.
2897 The result are written to MBCSET and SBCSET.
2898 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2899 pointer argument sinse we may update it. */
2901 auto inline reg_errcode_t
2902 __attribute ((always_inline))
2903 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2904 re_charset_t *mbcset;
2905 int *coll_sym_alloc;
2906 re_bitset_ptr_t sbcset;
2907 const unsigned char *name;
2910 size_t name_len = strlen ((const char *) name);
2913 elem = seek_collating_symbol_entry (name, name_len);
2914 if (symb_table[2 * elem] != 0)
2916 /* We found the entry. */
2917 idx = symb_table[2 * elem + 1];
2918 /* Skip the name of collating element name. */
2919 idx += 1 + extra[idx];
2921 else if (symb_table[2 * elem] == 0 && name_len == 1)
2923 /* No valid character, treat it as a normal
2925 bitset_set (sbcset, name[0]);
2929 return REG_ECOLLATE;
2931 /* Got valid collation sequence, add it as a new entry. */
2932 /* Check the space of the arrays. */
2933 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
2935 /* Not enough, realloc it. */
2936 /* +1 in case of mbcset->ncoll_syms is 0. */
2937 int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2938 /* Use realloc since mbcset->coll_syms is NULL
2940 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2941 new_coll_sym_alloc);
2942 if (BE (new_coll_syms == NULL, 0))
2944 mbcset->coll_syms = new_coll_syms;
2945 *coll_sym_alloc = new_coll_sym_alloc;
2947 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
2952 if (BE (name_len != 1, 0))
2953 return REG_ECOLLATE;
2956 bitset_set (sbcset, name[0]);
2963 re_token_t br_token;
2964 re_bitset_ptr_t sbcset;
2965 #ifdef RE_ENABLE_I18N
2966 re_charset_t *mbcset;
2967 int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
2968 int equiv_class_alloc = 0, char_class_alloc = 0;
2969 #endif /* not RE_ENABLE_I18N */
2971 bin_tree_t *work_tree;
2973 int first_round = 1;
2975 collseqmb = (const unsigned char *)
2976 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
2977 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
2983 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
2984 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
2985 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
2986 _NL_COLLATE_SYMB_TABLEMB);
2987 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
2988 _NL_COLLATE_SYMB_EXTRAMB);
2991 sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
2992 #ifdef RE_ENABLE_I18N
2993 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
2994 #endif /* RE_ENABLE_I18N */
2995 #ifdef RE_ENABLE_I18N
2996 if (BE (sbcset == NULL || mbcset == NULL, 0))
2998 if (BE (sbcset == NULL, 0))
2999 #endif /* RE_ENABLE_I18N */
3005 token_len = peek_token_bracket (token, regexp, syntax);
3006 if (BE (token->type == END_OF_RE, 0))
3009 goto parse_bracket_exp_free_return;
3011 if (token->type == OP_NON_MATCH_LIST)
3013 #ifdef RE_ENABLE_I18N
3014 mbcset->non_match = 1;
3015 #endif /* not RE_ENABLE_I18N */
3017 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3018 bitset_set (sbcset, '\0');
3019 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3020 token_len = peek_token_bracket (token, regexp, syntax);
3021 if (BE (token->type == END_OF_RE, 0))
3024 goto parse_bracket_exp_free_return;
3028 /* We treat the first ']' as a normal character. */
3029 if (token->type == OP_CLOSE_BRACKET)
3030 token->type = CHARACTER;
3034 bracket_elem_t start_elem, end_elem;
3035 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3036 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3038 int token_len2 = 0, is_range_exp = 0;
3041 start_elem.opr.name = start_name_buf;
3042 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3043 syntax, first_round);
3044 if (BE (ret != REG_NOERROR, 0))
3047 goto parse_bracket_exp_free_return;
3051 /* Get information about the next token. We need it in any case. */
3052 token_len = peek_token_bracket (token, regexp, syntax);
3054 /* Do not check for ranges if we know they are not allowed. */
3055 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3057 if (BE (token->type == END_OF_RE, 0))
3060 goto parse_bracket_exp_free_return;
3062 if (token->type == OP_CHARSET_RANGE)
3064 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3065 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3066 if (BE (token2.type == END_OF_RE, 0))
3069 goto parse_bracket_exp_free_return;
3071 if (token2.type == OP_CLOSE_BRACKET)
3073 /* We treat the last '-' as a normal character. */
3074 re_string_skip_bytes (regexp, -token_len);
3075 token->type = CHARACTER;
3082 if (is_range_exp == 1)
3084 end_elem.opr.name = end_name_buf;
3085 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3087 if (BE (ret != REG_NOERROR, 0))
3090 goto parse_bracket_exp_free_return;
3093 token_len = peek_token_bracket (token, regexp, syntax);
3096 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3097 &start_elem, &end_elem);
3099 # ifdef RE_ENABLE_I18N
3100 *err = build_range_exp (sbcset,
3101 dfa->mb_cur_max > 1 ? mbcset : NULL,
3102 &range_alloc, &start_elem, &end_elem);
3104 *err = build_range_exp (sbcset, &start_elem, &end_elem);
3106 #endif /* RE_ENABLE_I18N */
3107 if (BE (*err != REG_NOERROR, 0))
3108 goto parse_bracket_exp_free_return;
3112 switch (start_elem.type)
3115 bitset_set (sbcset, start_elem.opr.ch);
3117 #ifdef RE_ENABLE_I18N
3119 /* Check whether the array has enough space. */
3120 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3122 wchar_t *new_mbchars;
3123 /* Not enough, realloc it. */
3124 /* +1 in case of mbcset->nmbchars is 0. */
3125 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3126 /* Use realloc since array is NULL if *alloc == 0. */
3127 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3129 if (BE (new_mbchars == NULL, 0))
3130 goto parse_bracket_exp_espace;
3131 mbcset->mbchars = new_mbchars;
3133 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3135 #endif /* RE_ENABLE_I18N */
3137 *err = build_equiv_class (sbcset,
3138 #ifdef RE_ENABLE_I18N
3139 mbcset, &equiv_class_alloc,
3140 #endif /* RE_ENABLE_I18N */
3141 start_elem.opr.name);
3142 if (BE (*err != REG_NOERROR, 0))
3143 goto parse_bracket_exp_free_return;
3146 *err = build_collating_symbol (sbcset,
3147 #ifdef RE_ENABLE_I18N
3148 mbcset, &coll_sym_alloc,
3149 #endif /* RE_ENABLE_I18N */
3150 start_elem.opr.name);
3151 if (BE (*err != REG_NOERROR, 0))
3152 goto parse_bracket_exp_free_return;
3155 *err = build_charclass (regexp->trans, sbcset,
3156 #ifdef RE_ENABLE_I18N
3157 mbcset, &char_class_alloc,
3158 #endif /* RE_ENABLE_I18N */
3159 start_elem.opr.name, syntax);
3160 if (BE (*err != REG_NOERROR, 0))
3161 goto parse_bracket_exp_free_return;
3168 if (BE (token->type == END_OF_RE, 0))
3171 goto parse_bracket_exp_free_return;
3173 if (token->type == OP_CLOSE_BRACKET)
3177 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3179 /* If it is non-matching list. */
3181 bitset_not (sbcset);
3183 #ifdef RE_ENABLE_I18N
3184 /* Ensure only single byte characters are set. */
3185 if (dfa->mb_cur_max > 1)
3186 bitset_mask (sbcset, dfa->sb_char);
3188 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3189 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3190 || mbcset->non_match)))
3192 bin_tree_t *mbc_tree;
3194 /* Build a tree for complex bracket. */
3195 dfa->has_mb_node = 1;
3196 br_token.type = COMPLEX_BRACKET;
3197 br_token.opr.mbcset = mbcset;
3198 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3199 if (BE (mbc_tree == NULL, 0))
3200 goto parse_bracket_exp_espace;
3201 for (sbc_idx = 0; sbc_idx < BITSET_UINTS; ++sbc_idx)
3202 if (sbcset[sbc_idx])
3204 /* If there are no bits set in sbcset, there is no point
3205 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3206 if (sbc_idx < BITSET_UINTS)
3208 /* Build a tree for simple bracket. */
3209 br_token.type = SIMPLE_BRACKET;
3210 br_token.opr.sbcset = sbcset;
3211 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3212 if (BE (work_tree == NULL, 0))
3213 goto parse_bracket_exp_espace;
3215 /* Then join them by ALT node. */
3216 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3217 if (BE (work_tree == NULL, 0))
3218 goto parse_bracket_exp_espace;
3223 work_tree = mbc_tree;
3227 #endif /* not RE_ENABLE_I18N */
3229 #ifdef RE_ENABLE_I18N
3230 free_charset (mbcset);
3232 /* Build a tree for simple bracket. */
3233 br_token.type = SIMPLE_BRACKET;
3234 br_token.opr.sbcset = sbcset;
3235 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3236 if (BE (work_tree == NULL, 0))
3237 goto parse_bracket_exp_espace;
3241 parse_bracket_exp_espace:
3243 parse_bracket_exp_free_return:
3245 #ifdef RE_ENABLE_I18N
3246 free_charset (mbcset);
3247 #endif /* RE_ENABLE_I18N */
3251 /* Parse an element in the bracket expression. */
3253 static reg_errcode_t
3254 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3255 re_token_t *token, int token_len, re_dfa_t *dfa,
3256 reg_syntax_t syntax, int accept_hyphen)
3258 #ifdef RE_ENABLE_I18N
3260 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3261 if (cur_char_size > 1)
3263 elem->type = MB_CHAR;
3264 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3265 re_string_skip_bytes (regexp, cur_char_size);
3268 #endif /* RE_ENABLE_I18N */
3269 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3270 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3271 || token->type == OP_OPEN_EQUIV_CLASS)
3272 return parse_bracket_symbol (elem, regexp, token);
3273 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3275 /* A '-' must only appear as anything but a range indicator before
3276 the closing bracket. Everything else is an error. */
3278 (void) peek_token_bracket (&token2, regexp, syntax);
3279 if (token2.type != OP_CLOSE_BRACKET)
3280 /* The actual error value is not standardized since this whole
3281 case is undefined. But ERANGE makes good sense. */
3284 elem->type = SB_CHAR;
3285 elem->opr.ch = token->opr.c;
3289 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3290 such as [:<character_class>:], [.<collating_element>.], and
3291 [=<equivalent_class>=]. */
3293 static reg_errcode_t
3294 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3297 unsigned char ch, delim = token->opr.c;
3299 if (re_string_eoi(regexp))
3303 if (i >= BRACKET_NAME_BUF_SIZE)
3305 if (token->type == OP_OPEN_CHAR_CLASS)
3306 ch = re_string_fetch_byte_case (regexp);
3308 ch = re_string_fetch_byte (regexp);
3309 if (re_string_eoi(regexp))
3311 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3313 elem->opr.name[i] = ch;
3315 re_string_skip_bytes (regexp, 1);
3316 elem->opr.name[i] = '\0';
3317 switch (token->type)
3319 case OP_OPEN_COLL_ELEM:
3320 elem->type = COLL_SYM;
3322 case OP_OPEN_EQUIV_CLASS:
3323 elem->type = EQUIV_CLASS;
3325 case OP_OPEN_CHAR_CLASS:
3326 elem->type = CHAR_CLASS;
3334 /* Helper function for parse_bracket_exp.
3335 Build the equivalence class which is represented by NAME.
3336 The result are written to MBCSET and SBCSET.
3337 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3338 is a pointer argument sinse we may update it. */
3340 static reg_errcode_t
3341 build_equiv_class (re_bitset_ptr_t sbcset,
3342 #ifdef RE_ENABLE_I18N
3343 re_charset_t *mbcset, int *equiv_class_alloc,
3345 const unsigned char *name)
3348 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3351 const int32_t *table, *indirect;
3352 const unsigned char *weights, *extra, *cp;
3353 unsigned char char_buf[2];
3357 /* This #include defines a local function! */
3358 # include <locale/weight.h>
3359 /* Calculate the index for equivalence class. */
3361 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3362 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3363 _NL_COLLATE_WEIGHTMB);
3364 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3365 _NL_COLLATE_EXTRAMB);
3366 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3367 _NL_COLLATE_INDIRECTMB);
3368 idx1 = findidx (&cp);
3369 if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3370 /* This isn't a valid character. */
3371 return REG_ECOLLATE;
3373 /* Build single byte matcing table for this equivalence class. */
3374 char_buf[1] = (unsigned char) '\0';
3375 len = weights[idx1];
3376 for (ch = 0; ch < SBC_MAX; ++ch)
3380 idx2 = findidx (&cp);
3385 /* This isn't a valid character. */
3387 if (len == weights[idx2])
3390 while (cnt <= len &&
3391 weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3395 bitset_set (sbcset, ch);
3398 /* Check whether the array has enough space. */
3399 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3401 /* Not enough, realloc it. */
3402 /* +1 in case of mbcset->nequiv_classes is 0. */
3403 int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3404 /* Use realloc since the array is NULL if *alloc == 0. */
3405 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3407 new_equiv_class_alloc);
3408 if (BE (new_equiv_classes == NULL, 0))
3410 mbcset->equiv_classes = new_equiv_classes;
3411 *equiv_class_alloc = new_equiv_class_alloc;
3413 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3418 if (BE (strlen ((const char *) name) != 1, 0))
3419 return REG_ECOLLATE;
3420 bitset_set (sbcset, *name);
3425 /* Helper function for parse_bracket_exp.
3426 Build the character class which is represented by NAME.
3427 The result are written to MBCSET and SBCSET.
3428 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3429 is a pointer argument sinse we may update it. */
3431 static reg_errcode_t
3432 build_charclass (unsigned RE_TRANSLATE_TYPE trans, re_bitset_ptr_t sbcset,
3433 #ifdef RE_ENABLE_I18N
3434 re_charset_t *mbcset, int *char_class_alloc,
3436 const unsigned char *class_name, reg_syntax_t syntax)
3439 const char *name = (const char *) class_name;
3441 /* In case of REG_ICASE "upper" and "lower" match the both of
3442 upper and lower cases. */
3443 if ((syntax & RE_ICASE)
3444 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3447 #ifdef RE_ENABLE_I18N
3448 /* Check the space of the arrays. */
3449 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3451 /* Not enough, realloc it. */
3452 /* +1 in case of mbcset->nchar_classes is 0. */
3453 int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3454 /* Use realloc since array is NULL if *alloc == 0. */
3455 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3456 new_char_class_alloc);
3457 if (BE (new_char_classes == NULL, 0))
3459 mbcset->char_classes = new_char_classes;
3460 *char_class_alloc = new_char_class_alloc;
3462 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3463 #endif /* RE_ENABLE_I18N */
3465 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3466 for (i = 0; i < SBC_MAX; ++i) \
3468 if (ctype_func (i)) \
3470 int ch = trans ? trans[i] : i; \
3471 bitset_set (sbcset, ch); \
3475 if (strcmp (name, "alnum") == 0)
3476 BUILD_CHARCLASS_LOOP (isalnum)
3477 else if (strcmp (name, "cntrl") == 0)
3478 BUILD_CHARCLASS_LOOP (iscntrl)
3479 else if (strcmp (name, "lower") == 0)
3480 BUILD_CHARCLASS_LOOP (islower)
3481 else if (strcmp (name, "space") == 0)
3482 BUILD_CHARCLASS_LOOP (isspace)
3483 else if (strcmp (name, "alpha") == 0)
3484 BUILD_CHARCLASS_LOOP (isalpha)
3485 else if (strcmp (name, "digit") == 0)
3486 BUILD_CHARCLASS_LOOP (isdigit)
3487 else if (strcmp (name, "print") == 0)
3488 BUILD_CHARCLASS_LOOP (isprint)
3489 else if (strcmp (name, "upper") == 0)
3490 BUILD_CHARCLASS_LOOP (isupper)
3491 else if (strcmp (name, "blank") == 0)
3492 BUILD_CHARCLASS_LOOP (isblank)
3493 else if (strcmp (name, "graph") == 0)
3494 BUILD_CHARCLASS_LOOP (isgraph)
3495 else if (strcmp (name, "punct") == 0)
3496 BUILD_CHARCLASS_LOOP (ispunct)
3497 else if (strcmp (name, "xdigit") == 0)
3498 BUILD_CHARCLASS_LOOP (isxdigit)
3506 build_charclass_op (re_dfa_t *dfa, unsigned RE_TRANSLATE_TYPE trans,
3507 const unsigned char *class_name,
3508 const unsigned char *extra,
3509 int non_match, reg_errcode_t *err)
3511 re_bitset_ptr_t sbcset;
3512 #ifdef RE_ENABLE_I18N
3513 re_charset_t *mbcset;
3515 #endif /* not RE_ENABLE_I18N */
3517 re_token_t br_token;
3520 sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
3521 #ifdef RE_ENABLE_I18N
3522 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3523 #endif /* RE_ENABLE_I18N */
3525 #ifdef RE_ENABLE_I18N
3526 if (BE (sbcset == NULL || mbcset == NULL, 0))
3527 #else /* not RE_ENABLE_I18N */
3528 if (BE (sbcset == NULL, 0))
3529 #endif /* not RE_ENABLE_I18N */
3537 #ifdef RE_ENABLE_I18N
3539 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3540 bitset_set(cset->sbcset, '\0');
3542 mbcset->non_match = 1;
3543 #endif /* not RE_ENABLE_I18N */
3546 /* We don't care the syntax in this case. */
3547 ret = build_charclass (trans, sbcset,
3548 #ifdef RE_ENABLE_I18N
3550 #endif /* RE_ENABLE_I18N */
3553 if (BE (ret != REG_NOERROR, 0))
3556 #ifdef RE_ENABLE_I18N
3557 free_charset (mbcset);
3558 #endif /* RE_ENABLE_I18N */
3562 /* \w match '_' also. */
3563 for (; *extra; extra++)
3564 bitset_set (sbcset, *extra);
3566 /* If it is non-matching list. */
3568 bitset_not (sbcset);
3570 #ifdef RE_ENABLE_I18N
3571 /* Ensure only single byte characters are set. */
3572 if (dfa->mb_cur_max > 1)
3573 bitset_mask (sbcset, dfa->sb_char);
3576 /* Build a tree for simple bracket. */
3577 br_token.type = SIMPLE_BRACKET;
3578 br_token.opr.sbcset = sbcset;
3579 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3580 if (BE (tree == NULL, 0))
3581 goto build_word_op_espace;
3583 #ifdef RE_ENABLE_I18N
3584 if (dfa->mb_cur_max > 1)
3586 bin_tree_t *mbc_tree;
3587 /* Build a tree for complex bracket. */
3588 br_token.type = COMPLEX_BRACKET;
3589 br_token.opr.mbcset = mbcset;
3590 dfa->has_mb_node = 1;
3591 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3592 if (BE (mbc_tree == NULL, 0))
3593 goto build_word_op_espace;
3594 /* Then join them by ALT node. */
3595 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3596 if (BE (mbc_tree != NULL, 1))
3601 free_charset (mbcset);
3604 #else /* not RE_ENABLE_I18N */
3606 #endif /* not RE_ENABLE_I18N */
3608 build_word_op_espace:
3610 #ifdef RE_ENABLE_I18N
3611 free_charset (mbcset);
3612 #endif /* RE_ENABLE_I18N */
3617 /* This is intended for the expressions like "a{1,3}".
3618 Fetch a number from `input', and return the number.
3619 Return -1, if the number field is empty like "{,1}".
3620 Return -2, If an error is occured. */
3623 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3629 fetch_token (token, input, syntax);
3631 if (BE (token->type == END_OF_RE, 0))
3633 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3635 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3636 ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3637 num = (num > RE_DUP_MAX) ? -2 : num;
3642 #ifdef RE_ENABLE_I18N
3644 free_charset (re_charset_t *cset)
3646 re_free (cset->mbchars);
3648 re_free (cset->coll_syms);
3649 re_free (cset->equiv_classes);
3650 re_free (cset->range_starts);
3651 re_free (cset->range_ends);
3653 re_free (cset->char_classes);
3656 #endif /* RE_ENABLE_I18N */
3658 /* Functions for binary tree operation. */
3660 /* Create a tree node. */
3663 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3664 re_token_type_t type)
3668 return create_token_tree (dfa, left, right, &t);
3672 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3673 const re_token_t *token)
3676 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3678 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3680 if (storage == NULL)
3682 storage->next = dfa->str_tree_storage;
3683 dfa->str_tree_storage = storage;
3684 dfa->str_tree_storage_idx = 0;
3686 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3688 tree->parent = NULL;
3690 tree->right = right;
3691 tree->token = *token;
3692 tree->token.duplicated = 0;
3693 tree->token.opt_subexp = 0;
3696 tree->node_idx = -1;
3699 left->parent = tree;
3701 right->parent = tree;
3705 /* Mark the tree SRC as an optional subexpression.
3706 To be called from preorder or postorder. */
3708 static reg_errcode_t
3709 mark_opt_subexp (void *extra, bin_tree_t *node)
3711 int idx = (int) (long) extra;
3712 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3713 node->token.opt_subexp = 1;
3718 /* Free the allocated memory inside NODE. */
3721 free_token (re_token_t *node)
3723 #ifdef RE_ENABLE_I18N
3724 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3725 free_charset (node->opr.mbcset);
3727 #endif /* RE_ENABLE_I18N */
3728 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3729 re_free (node->opr.sbcset);
3732 /* Worker function for tree walking. Free the allocated memory inside NODE
3733 and its children. */
3735 static reg_errcode_t
3736 free_tree (void *extra, bin_tree_t *node)
3738 free_token (&node->token);
3743 /* Duplicate the node SRC, and return new node. This is a preorder
3744 visit similar to the one implemented by the generic visitor, but
3745 we need more infrastructure to maintain two parallel trees --- so,
3746 it's easier to duplicate. */
3749 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3751 const bin_tree_t *node;
3752 bin_tree_t *dup_root;
3753 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3755 for (node = root; ; )
3757 /* Create a new tree and link it back to the current parent. */
3758 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3761 (*p_new)->parent = dup_node;
3762 (*p_new)->token.duplicated = 1;
3765 /* Go to the left node, or up and to the right. */
3769 p_new = &dup_node->left;
3773 const bin_tree_t *prev = NULL;
3774 while (node->right == prev || node->right == NULL)
3777 node = node->parent;
3778 dup_node = dup_node->parent;
3783 p_new = &dup_node->right;