1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License along
17 with this program; if not, write to the Free Software Foundation,
18 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
20 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
21 int length, reg_syntax_t syntax);
22 static void re_compile_fastmap_iter (regex_t *bufp,
23 const re_dfastate_t *init_state,
25 static reg_errcode_t init_dfa (re_dfa_t *dfa, int pat_len);
26 static void init_word_char (re_dfa_t *dfa);
28 static void free_charset (re_charset_t *cset);
29 #endif /* RE_ENABLE_I18N */
30 static void free_workarea_compile (regex_t *preg);
31 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
33 static void optimize_utf8 (re_dfa_t *dfa);
35 static reg_errcode_t analyze (regex_t *preg);
36 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
37 static reg_errcode_t preorder (bin_tree_t *root,
38 reg_errcode_t (fn (void *, bin_tree_t *)),
40 static reg_errcode_t postorder (bin_tree_t *root,
41 reg_errcode_t (fn (void *, bin_tree_t *)),
43 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
44 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
45 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
47 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
48 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
49 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
50 static reg_errcode_t duplicate_node_closure (re_dfa_t *dfa, int top_org_node,
51 int top_clone_node, int root_node,
52 unsigned int constraint);
53 static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint);
54 static int search_duplicated_node (re_dfa_t *dfa, int org_node,
55 unsigned int constraint);
56 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
57 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
59 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
60 static int fetch_number (re_string_t *input, re_token_t *token,
62 static void fetch_token (re_token_t *result, re_string_t *input,
64 static int peek_token (re_token_t *token, re_string_t *input,
66 static int peek_token_bracket (re_token_t *token, re_string_t *input,
68 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
69 reg_syntax_t syntax, reg_errcode_t *err);
70 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
71 re_token_t *token, reg_syntax_t syntax,
72 int nest, reg_errcode_t *err);
73 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
74 re_token_t *token, reg_syntax_t syntax,
75 int nest, reg_errcode_t *err);
76 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
77 re_token_t *token, reg_syntax_t syntax,
78 int nest, reg_errcode_t *err);
79 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
80 re_token_t *token, reg_syntax_t syntax,
81 int nest, reg_errcode_t *err);
82 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
83 re_dfa_t *dfa, re_token_t *token,
84 reg_syntax_t syntax, reg_errcode_t *err);
85 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
86 re_token_t *token, reg_syntax_t syntax,
88 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
90 re_token_t *token, int token_len,
94 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
98 # ifdef RE_ENABLE_I18N
99 static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
100 re_charset_t *mbcset, int *range_alloc,
101 bracket_elem_t *start_elem,
102 bracket_elem_t *end_elem);
103 static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
104 re_charset_t *mbcset,
106 const unsigned char *name);
107 # else /* not RE_ENABLE_I18N */
108 static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
109 bracket_elem_t *start_elem,
110 bracket_elem_t *end_elem);
111 static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
112 const unsigned char *name);
113 # endif /* not RE_ENABLE_I18N */
114 #endif /* not _LIBC */
115 #ifdef RE_ENABLE_I18N
116 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
117 re_charset_t *mbcset,
118 int *equiv_class_alloc,
119 const unsigned char *name);
120 static reg_errcode_t build_charclass (unsigned RE_TRANSLATE_TYPE trans,
121 re_bitset_ptr_t sbcset,
122 re_charset_t *mbcset,
123 int *char_class_alloc,
124 const unsigned char *class_name,
125 reg_syntax_t syntax);
126 #else /* not RE_ENABLE_I18N */
127 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
128 const unsigned char *name);
129 static reg_errcode_t build_charclass (unsigned RE_TRANSLATE_TYPE trans,
130 re_bitset_ptr_t sbcset,
131 const unsigned char *class_name,
132 reg_syntax_t syntax);
133 #endif /* not RE_ENABLE_I18N */
134 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
135 unsigned RE_TRANSLATE_TYPE trans,
136 const unsigned char *class_name,
137 const unsigned char *extra,
138 int non_match, reg_errcode_t *err);
139 static bin_tree_t *create_tree (re_dfa_t *dfa,
140 bin_tree_t *left, bin_tree_t *right,
141 re_token_type_t type);
142 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
143 bin_tree_t *left, bin_tree_t *right,
144 const re_token_t *token);
145 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
146 static void free_token (re_token_t *node);
147 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
148 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
150 /* This table gives an error message for each of the error codes listed
151 in regex.h. Obviously the order here has to be same as there.
152 POSIX doesn't require that we do anything for REG_NOERROR,
153 but why not be nice? */
155 const char __re_error_msgid[] attribute_hidden =
157 #define REG_NOERROR_IDX 0
158 gettext_noop ("Success") /* REG_NOERROR */
160 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
161 gettext_noop ("No match") /* REG_NOMATCH */
163 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
164 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
166 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
167 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
169 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
170 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
172 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
173 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
175 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
176 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
178 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
179 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
181 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
182 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
184 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
185 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
187 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
188 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
190 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
191 gettext_noop ("Invalid range end") /* REG_ERANGE */
193 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
194 gettext_noop ("Memory exhausted") /* REG_ESPACE */
196 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
197 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
199 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
200 gettext_noop ("Premature end of regular expression") /* REG_EEND */
202 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
203 gettext_noop ("Regular expression too big") /* REG_ESIZE */
205 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
206 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
209 const size_t __re_error_msgid_idx[] attribute_hidden =
230 /* Entry points for GNU code. */
232 /* re_compile_pattern is the GNU regular expression compiler: it
233 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
234 Returns 0 if the pattern was valid, otherwise an error string.
236 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
237 are set in BUFP on entry. */
240 re_compile_pattern (const char *pattern, size_t length,
241 struct re_pattern_buffer *bufp)
245 /* And GNU code determines whether or not to get register information
246 by passing null for the REGS argument to re_match, etc., not by
247 setting no_sub, unless RE_NO_SUB is set. */
248 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
250 /* Match anchors at newline. */
251 bufp->newline_anchor = 1;
253 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
257 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
260 weak_alias (__re_compile_pattern, re_compile_pattern)
263 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
264 also be assigned to arbitrarily: each pattern buffer stores its own
265 syntax, so it can be changed between regex compilations. */
266 /* This has no initializer because initialized variables in Emacs
267 become read-only after dumping. */
268 reg_syntax_t re_syntax_options;
271 /* Specify the precise syntax of regexps for compilation. This provides
272 for compatibility for various utilities which historically have
273 different, incompatible syntaxes.
275 The argument SYNTAX is a bit mask comprised of the various bits
276 defined in regex.h. We return the old syntax. */
279 re_set_syntax (reg_syntax_t syntax)
281 reg_syntax_t ret = re_syntax_options;
283 re_syntax_options = syntax;
287 weak_alias (__re_set_syntax, re_set_syntax)
291 re_compile_fastmap (struct re_pattern_buffer *bufp)
293 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
294 char *fastmap = bufp->fastmap;
296 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
297 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
298 if (dfa->init_state != dfa->init_state_word)
299 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
300 if (dfa->init_state != dfa->init_state_nl)
301 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
302 if (dfa->init_state != dfa->init_state_begbuf)
303 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
304 bufp->fastmap_accurate = 1;
308 weak_alias (__re_compile_fastmap, re_compile_fastmap)
312 __attribute ((always_inline))
313 re_set_fastmap (char *fastmap, int icase, int ch)
317 fastmap[tolower (ch)] = 1;
320 /* Helper function for re_compile_fastmap.
321 Compile fastmap for the initial_state INIT_STATE. */
324 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
327 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
329 int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
330 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
332 int node = init_state->nodes.elems[node_cnt];
333 re_token_type_t type = dfa->nodes[node].type;
335 if (type == CHARACTER)
337 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
338 #ifdef RE_ENABLE_I18N
339 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
341 unsigned char *buf = alloca (dfa->mb_cur_max), *p;
346 *p++ = dfa->nodes[node].opr.c;
347 while (++node < dfa->nodes_len
348 && dfa->nodes[node].type == CHARACTER
349 && dfa->nodes[node].mb_partial)
350 *p++ = dfa->nodes[node].opr.c;
351 memset (&state, 0, sizeof (state));
352 if (mbrtowc (&wc, (const char *) buf, p - buf,
354 && (__wcrtomb ((char *) buf, towlower (wc), &state)
356 re_set_fastmap (fastmap, 0, buf[0]);
360 else if (type == SIMPLE_BRACKET)
363 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
364 for (j = 0; j < UINT_BITS; ++j, ++ch)
365 if (dfa->nodes[node].opr.sbcset[i] & (1 << j))
366 re_set_fastmap (fastmap, icase, ch);
368 #ifdef RE_ENABLE_I18N
369 else if (type == COMPLEX_BRACKET)
372 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
373 if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
374 || cset->nranges || cset->nchar_classes)
377 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
379 /* In this case we want to catch the bytes which are
380 the first byte of any collation elements.
381 e.g. In da_DK, we want to catch 'a' since "aa"
382 is a valid collation element, and don't catch
383 'b' since 'b' is the only collation element
384 which starts from 'b'. */
386 const int32_t *table = (const int32_t *)
387 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
388 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
389 for (j = 0; j < UINT_BITS; ++j, ++ch)
391 re_set_fastmap (fastmap, icase, ch);
394 if (dfa->mb_cur_max > 1)
395 for (i = 0; i < SBC_MAX; ++i)
396 if (__btowc (i) == WEOF)
397 re_set_fastmap (fastmap, icase, i);
398 # endif /* not _LIBC */
400 for (i = 0; i < cset->nmbchars; ++i)
404 memset (&state, '\0', sizeof (state));
405 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
406 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
407 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
409 if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
411 re_set_fastmap (fastmap, 0, *(unsigned char *) buf);
415 #endif /* RE_ENABLE_I18N */
416 else if (type == OP_PERIOD
417 #ifdef RE_ENABLE_I18N
418 || type == OP_UTF8_PERIOD
419 #endif /* RE_ENABLE_I18N */
420 || type == END_OF_RE)
422 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
423 if (type == END_OF_RE)
424 bufp->can_be_null = 1;
430 /* Entry point for POSIX code. */
431 /* regcomp takes a regular expression as a string and compiles it.
433 PREG is a regex_t *. We do not expect any fields to be initialized,
434 since POSIX says we shouldn't. Thus, we set
436 `buffer' to the compiled pattern;
437 `used' to the length of the compiled pattern;
438 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
439 REG_EXTENDED bit in CFLAGS is set; otherwise, to
440 RE_SYNTAX_POSIX_BASIC;
441 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
442 `fastmap' to an allocated space for the fastmap;
443 `fastmap_accurate' to zero;
444 `re_nsub' to the number of subexpressions in PATTERN.
446 PATTERN is the address of the pattern string.
448 CFLAGS is a series of bits which affect compilation.
450 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
451 use POSIX basic syntax.
453 If REG_NEWLINE is set, then . and [^...] don't match newline.
454 Also, regexec will try a match beginning after every newline.
456 If REG_ICASE is set, then we considers upper- and lowercase
457 versions of letters to be equivalent when matching.
459 If REG_NOSUB is set, then when PREG is passed to regexec, that
460 routine will report only success or failure, and nothing about the
463 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
464 the return codes and their meanings.) */
467 regcomp (regex_t *__restrict preg, const char *__restrict pattern, int cflags)
470 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
471 : RE_SYNTAX_POSIX_BASIC);
477 /* Try to allocate space for the fastmap. */
478 preg->fastmap = re_malloc (char, SBC_MAX);
479 if (BE (preg->fastmap == NULL, 0))
482 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
484 /* If REG_NEWLINE is set, newlines are treated differently. */
485 if (cflags & REG_NEWLINE)
486 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
487 syntax &= ~RE_DOT_NEWLINE;
488 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
489 /* It also changes the matching behavior. */
490 preg->newline_anchor = 1;
493 preg->newline_anchor = 0;
494 preg->no_sub = !!(cflags & REG_NOSUB);
495 preg->translate = NULL;
497 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
499 /* POSIX doesn't distinguish between an unmatched open-group and an
500 unmatched close-group: both are REG_EPAREN. */
501 if (ret == REG_ERPAREN)
504 /* We have already checked preg->fastmap != NULL. */
505 if (BE (ret == REG_NOERROR, 1))
506 /* Compute the fastmap now, since regexec cannot modify the pattern
507 buffer. This function never fails in this implementation. */
508 (void) re_compile_fastmap (preg);
511 /* Some error occurred while compiling the expression. */
512 re_free (preg->fastmap);
513 preg->fastmap = NULL;
519 weak_alias (__regcomp, regcomp)
522 /* Returns a message corresponding to an error code, ERRCODE, returned
523 from either regcomp or regexec. We don't use PREG here. */
526 regerror (int errcode, const regex_t *preg, char *errbuf, size_t errbuf_size)
532 || errcode >= (int) (sizeof (__re_error_msgid_idx)
533 / sizeof (__re_error_msgid_idx[0])), 0))
534 /* Only error codes returned by the rest of the code should be passed
535 to this routine. If we are given anything else, or if other regex
536 code generates an invalid error code, then the program has a bug.
537 Dump core so we can fix it. */
540 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
542 msg_size = strlen (msg) + 1; /* Includes the null. */
544 if (BE (errbuf_size != 0, 1))
546 if (BE (msg_size > errbuf_size, 0))
548 #if defined HAVE_MEMPCPY || defined _LIBC
549 *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
551 memcpy (errbuf, msg, errbuf_size - 1);
552 errbuf[errbuf_size - 1] = 0;
556 memcpy (errbuf, msg, msg_size);
562 weak_alias (__regerror, regerror)
566 #ifdef RE_ENABLE_I18N
567 /* This static array is used for the map to single-byte characters when
568 UTF-8 is used. Otherwise we would allocate memory just to initialize
569 it the same all the time. UTF-8 is the preferred encoding so this is
570 a worthwhile optimization. */
571 static const bitset utf8_sb_map =
573 /* Set the first 128 bits. */
574 # if UINT_MAX == 0xffffffff
575 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
577 # error "Add case for new unsigned int size"
584 free_dfa_content (re_dfa_t *dfa)
589 for (i = 0; i < dfa->nodes_len; ++i)
590 free_token (dfa->nodes + i);
591 re_free (dfa->nexts);
592 for (i = 0; i < dfa->nodes_len; ++i)
594 if (dfa->eclosures != NULL)
595 re_node_set_free (dfa->eclosures + i);
596 if (dfa->inveclosures != NULL)
597 re_node_set_free (dfa->inveclosures + i);
598 if (dfa->edests != NULL)
599 re_node_set_free (dfa->edests + i);
601 re_free (dfa->edests);
602 re_free (dfa->eclosures);
603 re_free (dfa->inveclosures);
604 re_free (dfa->nodes);
606 if (dfa->state_table)
607 for (i = 0; i <= dfa->state_hash_mask; ++i)
609 struct re_state_table_entry *entry = dfa->state_table + i;
610 for (j = 0; j < entry->num; ++j)
612 re_dfastate_t *state = entry->array[j];
615 re_free (entry->array);
617 re_free (dfa->state_table);
618 #ifdef RE_ENABLE_I18N
619 if (dfa->sb_char != utf8_sb_map)
620 re_free (dfa->sb_char);
622 re_free (dfa->subexp_map);
624 re_free (dfa->re_str);
631 /* Free dynamically allocated space used by PREG. */
634 regfree (regex_t *preg)
636 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
637 if (BE (dfa != NULL, 1))
638 free_dfa_content (dfa);
642 re_free (preg->fastmap);
643 preg->fastmap = NULL;
645 re_free (preg->translate);
646 preg->translate = NULL;
649 weak_alias (__regfree, regfree)
652 /* Entry points compatible with 4.2 BSD regex library. We don't define
653 them unless specifically requested. */
655 #if defined _REGEX_RE_COMP || defined _LIBC
657 /* BSD has one and only one pattern buffer. */
658 static struct re_pattern_buffer re_comp_buf;
662 /* Make these definitions weak in libc, so POSIX programs can redefine
663 these names if they don't use our functions, and still use
664 regcomp/regexec above without link errors. */
675 if (!re_comp_buf.buffer)
676 return gettext ("No previous regular expression");
680 if (re_comp_buf.buffer)
682 fastmap = re_comp_buf.fastmap;
683 re_comp_buf.fastmap = NULL;
684 __regfree (&re_comp_buf);
685 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
686 re_comp_buf.fastmap = fastmap;
689 if (re_comp_buf.fastmap == NULL)
691 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
692 if (re_comp_buf.fastmap == NULL)
693 return (char *) gettext (__re_error_msgid
694 + __re_error_msgid_idx[(int) REG_ESPACE]);
697 /* Since `re_exec' always passes NULL for the `regs' argument, we
698 don't need to initialize the pattern buffer fields which affect it. */
700 /* Match anchors at newlines. */
701 re_comp_buf.newline_anchor = 1;
703 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
708 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
709 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
713 libc_freeres_fn (free_mem)
715 __regfree (&re_comp_buf);
719 #endif /* _REGEX_RE_COMP */
721 /* Internal entry point.
722 Compile the regular expression PATTERN, whose length is LENGTH.
723 SYNTAX indicate regular expression's syntax. */
726 re_compile_internal (regex_t *preg, const char * pattern, int length,
729 reg_errcode_t err = REG_NOERROR;
733 /* Initialize the pattern buffer. */
734 preg->fastmap_accurate = 0;
735 preg->syntax = syntax;
736 preg->not_bol = preg->not_eol = 0;
739 preg->can_be_null = 0;
740 preg->regs_allocated = REGS_UNALLOCATED;
742 /* Initialize the dfa. */
743 dfa = (re_dfa_t *) preg->buffer;
744 if (BE (preg->allocated < sizeof (re_dfa_t), 0))
746 /* If zero allocated, but buffer is non-null, try to realloc
747 enough space. This loses if buffer's address is bogus, but
748 that is the user's responsibility. If ->buffer is NULL this
749 is a simple allocation. */
750 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
753 preg->allocated = sizeof (re_dfa_t);
754 preg->buffer = (unsigned char *) dfa;
756 preg->used = sizeof (re_dfa_t);
758 __libc_lock_init (dfa->lock);
760 err = init_dfa (dfa, length);
761 if (BE (err != REG_NOERROR, 0))
763 free_dfa_content (dfa);
769 dfa->re_str = re_malloc (char, length + 1);
770 strncpy (dfa->re_str, pattern, length + 1);
773 err = re_string_construct (®exp, pattern, length, preg->translate,
774 syntax & RE_ICASE, dfa);
775 if (BE (err != REG_NOERROR, 0))
777 re_compile_internal_free_return:
778 free_workarea_compile (preg);
779 re_string_destruct (®exp);
780 free_dfa_content (dfa);
786 /* Parse the regular expression, and build a structure tree. */
788 dfa->str_tree = parse (®exp, preg, syntax, &err);
789 if (BE (dfa->str_tree == NULL, 0))
790 goto re_compile_internal_free_return;
792 /* Analyze the tree and create the nfa. */
793 err = analyze (preg);
794 if (BE (err != REG_NOERROR, 0))
795 goto re_compile_internal_free_return;
797 #ifdef RE_ENABLE_I18N
798 /* If possible, do searching in single byte encoding to speed things up. */
799 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
803 /* Then create the initial state of the dfa. */
804 err = create_initial_state (dfa);
806 /* Release work areas. */
807 free_workarea_compile (preg);
808 re_string_destruct (®exp);
810 if (BE (err != REG_NOERROR, 0))
812 free_dfa_content (dfa);
820 /* Initialize DFA. We use the length of the regular expression PAT_LEN
821 as the initial length of some arrays. */
824 init_dfa (re_dfa_t *dfa, int pat_len)
831 memset (dfa, '\0', sizeof (re_dfa_t));
833 /* Force allocation of str_tree_storage the first time. */
834 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
836 dfa->nodes_alloc = pat_len + 1;
837 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
839 dfa->states_alloc = pat_len + 1;
841 /* table_size = 2 ^ ceil(log pat_len) */
842 for (table_size = 1; table_size > 0; table_size <<= 1)
843 if (table_size > pat_len)
846 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
847 dfa->state_hash_mask = table_size - 1;
849 dfa->mb_cur_max = MB_CUR_MAX;
851 if (dfa->mb_cur_max == 6
852 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
854 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
857 # ifdef HAVE_LANGINFO_CODESET
858 codeset_name = nl_langinfo (CODESET);
860 codeset_name = getenv ("LC_ALL");
861 if (codeset_name == NULL || codeset_name[0] == '\0')
862 codeset_name = getenv ("LC_CTYPE");
863 if (codeset_name == NULL || codeset_name[0] == '\0')
864 codeset_name = getenv ("LANG");
865 if (codeset_name == NULL)
867 else if (strchr (codeset_name, '.') != NULL)
868 codeset_name = strchr (codeset_name, '.') + 1;
871 if (strcasecmp (codeset_name, "UTF-8") == 0
872 || strcasecmp (codeset_name, "UTF8") == 0)
875 /* We check exhaustively in the loop below if this charset is a
876 superset of ASCII. */
877 dfa->map_notascii = 0;
880 #ifdef RE_ENABLE_I18N
881 if (dfa->mb_cur_max > 1)
884 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
889 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset), 1);
890 if (BE (dfa->sb_char == NULL, 0))
893 /* Clear all bits by, then set those corresponding to single
895 bitset_empty (dfa->sb_char);
897 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
898 for (j = 0; j < UINT_BITS; ++j, ++ch)
900 wint_t wch = __btowc (ch);
902 dfa->sb_char[i] |= 1 << j;
904 if (isascii (ch) && wch != ch)
905 dfa->map_notascii = 1;
912 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
917 /* Initialize WORD_CHAR table, which indicate which character is
918 "word". In this case "word" means that it is the word construction
919 character used by some operators like "\<", "\>", etc. */
922 init_word_char (re_dfa_t *dfa)
925 dfa->word_ops_used = 1;
926 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
927 for (j = 0; j < UINT_BITS; ++j, ++ch)
928 if (isalnum (ch) || ch == '_')
929 dfa->word_char[i] |= 1 << j;
932 /* Free the work area which are only used while compiling. */
935 free_workarea_compile (regex_t *preg)
937 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
938 bin_tree_storage_t *storage, *next;
939 for (storage = dfa->str_tree_storage; storage; storage = next)
941 next = storage->next;
944 dfa->str_tree_storage = NULL;
945 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
946 dfa->str_tree = NULL;
947 re_free (dfa->org_indices);
948 dfa->org_indices = NULL;
951 /* Create initial states for all contexts. */
954 create_initial_state (re_dfa_t *dfa)
958 re_node_set init_nodes;
960 /* Initial states have the epsilon closure of the node which is
961 the first node of the regular expression. */
962 first = dfa->str_tree->first->node_idx;
963 dfa->init_node = first;
964 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
965 if (BE (err != REG_NOERROR, 0))
968 /* The back-references which are in initial states can epsilon transit,
969 since in this case all of the subexpressions can be null.
970 Then we add epsilon closures of the nodes which are the next nodes of
971 the back-references. */
972 if (dfa->nbackref > 0)
973 for (i = 0; i < init_nodes.nelem; ++i)
975 int node_idx = init_nodes.elems[i];
976 re_token_type_t type = dfa->nodes[node_idx].type;
979 if (type != OP_BACK_REF)
981 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
983 re_token_t *clexp_node;
984 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
985 if (clexp_node->type == OP_CLOSE_SUBEXP
986 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
989 if (clexp_idx == init_nodes.nelem)
992 if (type == OP_BACK_REF)
994 int dest_idx = dfa->edests[node_idx].elems[0];
995 if (!re_node_set_contains (&init_nodes, dest_idx))
997 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1003 /* It must be the first time to invoke acquire_state. */
1004 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1005 /* We don't check ERR here, since the initial state must not be NULL. */
1006 if (BE (dfa->init_state == NULL, 0))
1008 if (dfa->init_state->has_constraint)
1010 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1012 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1014 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1018 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1019 || dfa->init_state_begbuf == NULL, 0))
1023 dfa->init_state_word = dfa->init_state_nl
1024 = dfa->init_state_begbuf = dfa->init_state;
1026 re_node_set_free (&init_nodes);
1030 #ifdef RE_ENABLE_I18N
1031 /* If it is possible to do searching in single byte encoding instead of UTF-8
1032 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1033 DFA nodes where needed. */
1036 optimize_utf8 (re_dfa_t *dfa)
1038 int node, i, mb_chars = 0, has_period = 0;
1040 for (node = 0; node < dfa->nodes_len; ++node)
1041 switch (dfa->nodes[node].type)
1044 if (dfa->nodes[node].opr.c >= 0x80)
1048 switch (dfa->nodes[node].opr.idx)
1056 /* Word anchors etc. cannot be handled. */
1066 case OP_DUP_ASTERISK:
1067 case OP_OPEN_SUBEXP:
1068 case OP_CLOSE_SUBEXP:
1070 case COMPLEX_BRACKET:
1072 case SIMPLE_BRACKET:
1073 /* Just double check. */
1074 for (i = 0x80 / UINT_BITS; i < BITSET_UINTS; ++i)
1075 if (dfa->nodes[node].opr.sbcset[i])
1082 if (mb_chars || has_period)
1083 for (node = 0; node < dfa->nodes_len; ++node)
1085 if (dfa->nodes[node].type == CHARACTER
1086 && dfa->nodes[node].opr.c >= 0x80)
1087 dfa->nodes[node].mb_partial = 0;
1088 else if (dfa->nodes[node].type == OP_PERIOD)
1089 dfa->nodes[node].type = OP_UTF8_PERIOD;
1092 /* The search can be in single byte locale. */
1093 dfa->mb_cur_max = 1;
1095 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1099 /* Analyze the structure tree, and calculate "first", "next", "edest",
1100 "eclosure", and "inveclosure". */
1102 static reg_errcode_t
1103 analyze (regex_t *preg)
1105 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1108 /* Allocate arrays. */
1109 dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1110 dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1111 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1112 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1113 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1114 || dfa->eclosures == NULL, 0))
1117 dfa->subexp_map = re_malloc (int, preg->re_nsub);
1118 if (dfa->subexp_map != NULL)
1121 for (i = 0; i < preg->re_nsub; i++)
1122 dfa->subexp_map[i] = i;
1123 preorder (dfa->str_tree, optimize_subexps, dfa);
1124 for (i = 0; i < preg->re_nsub; i++)
1125 if (dfa->subexp_map[i] != i)
1127 if (i == preg->re_nsub)
1129 free (dfa->subexp_map);
1130 dfa->subexp_map = NULL;
1134 ret = postorder (dfa->str_tree, lower_subexps, preg);
1135 if (BE (ret != REG_NOERROR, 0))
1137 ret = postorder (dfa->str_tree, calc_first, dfa);
1138 if (BE (ret != REG_NOERROR, 0))
1140 preorder (dfa->str_tree, calc_next, dfa);
1141 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1142 if (BE (ret != REG_NOERROR, 0))
1144 ret = calc_eclosure (dfa);
1145 if (BE (ret != REG_NOERROR, 0))
1148 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1149 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1150 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1153 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1154 if (BE (dfa->inveclosures == NULL, 0))
1156 ret = calc_inveclosure (dfa);
1162 /* Our parse trees are very unbalanced, so we cannot use a stack to
1163 implement parse tree visits. Instead, we use parent pointers and
1164 some hairy code in these two functions. */
1165 static reg_errcode_t
1166 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1169 bin_tree_t *node, *prev;
1171 for (node = root; ; )
1173 /* Descend down the tree, preferably to the left (or to the right
1174 if that's the only child). */
1175 while (node->left || node->right)
1183 reg_errcode_t err = fn (extra, node);
1184 if (BE (err != REG_NOERROR, 0))
1186 if (node->parent == NULL)
1189 node = node->parent;
1191 /* Go up while we have a node that is reached from the right. */
1192 while (node->right == prev || node->right == NULL);
1197 static reg_errcode_t
1198 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1203 for (node = root; ; )
1205 reg_errcode_t err = fn (extra, node);
1206 if (BE (err != REG_NOERROR, 0))
1209 /* Go to the left node, or up and to the right. */
1214 bin_tree_t *prev = NULL;
1215 while (node->right == prev || node->right == NULL)
1218 node = node->parent;
1227 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1228 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1229 backreferences as well. Requires a preorder visit. */
1230 static reg_errcode_t
1231 optimize_subexps (void *extra, bin_tree_t *node)
1233 re_dfa_t *dfa = (re_dfa_t *) extra;
1235 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1237 int idx = node->token.opr.idx;
1238 node->token.opr.idx = dfa->subexp_map[idx];
1239 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1242 else if (node->token.type == SUBEXP
1243 && node->left && node->left->token.type == SUBEXP)
1245 int other_idx = node->left->token.opr.idx;
1247 node->left = node->left->left;
1249 node->left->parent = node;
1251 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1252 if (other_idx < 8 * sizeof (dfa->used_bkref_map))
1253 dfa->used_bkref_map &= ~(1 << other_idx);
1259 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1260 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1261 static reg_errcode_t
1262 lower_subexps (void *extra, bin_tree_t *node)
1264 regex_t *preg = (regex_t *) extra;
1265 reg_errcode_t err = REG_NOERROR;
1267 if (node->left && node->left->token.type == SUBEXP)
1269 node->left = lower_subexp (&err, preg, node->left);
1271 node->left->parent = node;
1273 if (node->right && node->right->token.type == SUBEXP)
1275 node->right = lower_subexp (&err, preg, node->right);
1277 node->right->parent = node;
1284 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1286 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1287 bin_tree_t *body = node->left;
1288 bin_tree_t *op, *cls, *tree1, *tree;
1291 /* We do not optimize empty subexpressions, because otherwise we may
1292 have bad CONCAT nodes with NULL children. This is obviously not
1293 very common, so we do not lose much. An example that triggers
1294 this case is the sed "script" /\(\)/x. */
1295 && node->left != NULL
1296 && (node->token.opr.idx >= 8 * sizeof (dfa->used_bkref_map)
1297 || !(dfa->used_bkref_map & (1 << node->token.opr.idx))))
1300 /* Convert the SUBEXP node to the concatenation of an
1301 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1302 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1303 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1304 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1305 tree = create_tree (dfa, op, tree1, CONCAT);
1306 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1312 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1313 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1317 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1318 nodes. Requires a postorder visit. */
1319 static reg_errcode_t
1320 calc_first (void *extra, bin_tree_t *node)
1322 re_dfa_t *dfa = (re_dfa_t *) extra;
1323 if (node->token.type == CONCAT)
1325 node->first = node->left->first;
1326 node->node_idx = node->left->node_idx;
1331 node->node_idx = re_dfa_add_node (dfa, node->token);
1332 if (BE (node->node_idx == -1, 0))
1338 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1339 static reg_errcode_t
1340 calc_next (void *extra, bin_tree_t *node)
1342 switch (node->token.type)
1344 case OP_DUP_ASTERISK:
1345 node->left->next = node;
1348 node->left->next = node->right->first;
1349 node->right->next = node->next;
1353 node->left->next = node->next;
1355 node->right->next = node->next;
1361 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1362 static reg_errcode_t
1363 link_nfa_nodes (void *extra, bin_tree_t *node)
1365 re_dfa_t *dfa = (re_dfa_t *) extra;
1366 int idx = node->node_idx;
1367 reg_errcode_t err = REG_NOERROR;
1369 switch (node->token.type)
1375 assert (node->next == NULL);
1378 case OP_DUP_ASTERISK:
1382 dfa->has_plural_match = 1;
1383 if (node->left != NULL)
1384 left = node->left->first->node_idx;
1386 left = node->next->node_idx;
1387 if (node->right != NULL)
1388 right = node->right->first->node_idx;
1390 right = node->next->node_idx;
1392 assert (right > -1);
1393 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1398 case OP_OPEN_SUBEXP:
1399 case OP_CLOSE_SUBEXP:
1400 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1404 dfa->nexts[idx] = node->next->node_idx;
1405 if (node->token.type == OP_BACK_REF)
1406 re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1410 assert (!IS_EPSILON_NODE (node->token.type));
1411 dfa->nexts[idx] = node->next->node_idx;
1418 /* Duplicate the epsilon closure of the node ROOT_NODE.
1419 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1420 to their own constraint. */
1422 static reg_errcode_t
1423 duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node,
1424 int root_node, unsigned int init_constraint)
1426 int org_node, clone_node, ret;
1427 unsigned int constraint = init_constraint;
1428 for (org_node = top_org_node, clone_node = top_clone_node;;)
1430 int org_dest, clone_dest;
1431 if (dfa->nodes[org_node].type == OP_BACK_REF)
1433 /* If the back reference epsilon-transit, its destination must
1434 also have the constraint. Then duplicate the epsilon closure
1435 of the destination of the back reference, and store it in
1436 edests of the back reference. */
1437 org_dest = dfa->nexts[org_node];
1438 re_node_set_empty (dfa->edests + clone_node);
1439 clone_dest = duplicate_node (dfa, org_dest, constraint);
1440 if (BE (clone_dest == -1, 0))
1442 dfa->nexts[clone_node] = dfa->nexts[org_node];
1443 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1444 if (BE (ret < 0, 0))
1447 else if (dfa->edests[org_node].nelem == 0)
1449 /* In case of the node can't epsilon-transit, don't duplicate the
1450 destination and store the original destination as the
1451 destination of the node. */
1452 dfa->nexts[clone_node] = dfa->nexts[org_node];
1455 else if (dfa->edests[org_node].nelem == 1)
1457 /* In case of the node can epsilon-transit, and it has only one
1459 org_dest = dfa->edests[org_node].elems[0];
1460 re_node_set_empty (dfa->edests + clone_node);
1461 if (dfa->nodes[org_node].type == ANCHOR)
1463 /* In case of the node has another constraint, append it. */
1464 if (org_node == root_node && clone_node != org_node)
1466 /* ...but if the node is root_node itself, it means the
1467 epsilon closure have a loop, then tie it to the
1468 destination of the root_node. */
1469 ret = re_node_set_insert (dfa->edests + clone_node,
1471 if (BE (ret < 0, 0))
1475 constraint |= dfa->nodes[org_node].opr.ctx_type;
1477 clone_dest = duplicate_node (dfa, org_dest, constraint);
1478 if (BE (clone_dest == -1, 0))
1480 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1481 if (BE (ret < 0, 0))
1484 else /* dfa->edests[org_node].nelem == 2 */
1486 /* In case of the node can epsilon-transit, and it has two
1487 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1488 org_dest = dfa->edests[org_node].elems[0];
1489 re_node_set_empty (dfa->edests + clone_node);
1490 /* Search for a duplicated node which satisfies the constraint. */
1491 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1492 if (clone_dest == -1)
1494 /* There are no such a duplicated node, create a new one. */
1496 clone_dest = duplicate_node (dfa, org_dest, constraint);
1497 if (BE (clone_dest == -1, 0))
1499 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1500 if (BE (ret < 0, 0))
1502 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1503 root_node, constraint);
1504 if (BE (err != REG_NOERROR, 0))
1509 /* There are a duplicated node which satisfy the constraint,
1510 use it to avoid infinite loop. */
1511 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1512 if (BE (ret < 0, 0))
1516 org_dest = dfa->edests[org_node].elems[1];
1517 clone_dest = duplicate_node (dfa, org_dest, constraint);
1518 if (BE (clone_dest == -1, 0))
1520 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1521 if (BE (ret < 0, 0))
1524 org_node = org_dest;
1525 clone_node = clone_dest;
1530 /* Search for a node which is duplicated from the node ORG_NODE, and
1531 satisfies the constraint CONSTRAINT. */
1534 search_duplicated_node (re_dfa_t *dfa, int org_node, unsigned int constraint)
1537 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1539 if (org_node == dfa->org_indices[idx]
1540 && constraint == dfa->nodes[idx].constraint)
1541 return idx; /* Found. */
1543 return -1; /* Not found. */
1546 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1547 Return the index of the new node, or -1 if insufficient storage is
1551 duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint)
1553 int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1554 if (BE (dup_idx != -1, 1))
1556 dfa->nodes[dup_idx].constraint = constraint;
1557 if (dfa->nodes[org_idx].type == ANCHOR)
1558 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
1559 dfa->nodes[dup_idx].duplicated = 1;
1561 /* Store the index of the original node. */
1562 dfa->org_indices[dup_idx] = org_idx;
1567 static reg_errcode_t
1568 calc_inveclosure (re_dfa_t *dfa)
1571 for (idx = 0; idx < dfa->nodes_len; ++idx)
1572 re_node_set_init_empty (dfa->inveclosures + idx);
1574 for (src = 0; src < dfa->nodes_len; ++src)
1576 int *elems = dfa->eclosures[src].elems;
1577 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1579 ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1580 if (BE (ret == -1, 0))
1588 /* Calculate "eclosure" for all the node in DFA. */
1590 static reg_errcode_t
1591 calc_eclosure (re_dfa_t *dfa)
1593 int node_idx, incomplete;
1595 assert (dfa->nodes_len > 0);
1598 /* For each nodes, calculate epsilon closure. */
1599 for (node_idx = 0; ; ++node_idx)
1602 re_node_set eclosure_elem;
1603 if (node_idx == dfa->nodes_len)
1612 assert (dfa->eclosures[node_idx].nelem != -1);
1615 /* If we have already calculated, skip it. */
1616 if (dfa->eclosures[node_idx].nelem != 0)
1618 /* Calculate epsilon closure of `node_idx'. */
1619 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1620 if (BE (err != REG_NOERROR, 0))
1623 if (dfa->eclosures[node_idx].nelem == 0)
1626 re_node_set_free (&eclosure_elem);
1632 /* Calculate epsilon closure of NODE. */
1634 static reg_errcode_t
1635 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root)
1638 unsigned int constraint;
1640 re_node_set eclosure;
1642 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1643 if (BE (err != REG_NOERROR, 0))
1646 /* This indicates that we are calculating this node now.
1647 We reference this value to avoid infinite loop. */
1648 dfa->eclosures[node].nelem = -1;
1650 constraint = ((dfa->nodes[node].type == ANCHOR)
1651 ? dfa->nodes[node].opr.ctx_type : 0);
1652 /* If the current node has constraints, duplicate all nodes.
1653 Since they must inherit the constraints. */
1655 && dfa->edests[node].nelem
1656 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1658 int org_node, cur_node;
1659 org_node = cur_node = node;
1660 err = duplicate_node_closure (dfa, node, node, node, constraint);
1661 if (BE (err != REG_NOERROR, 0))
1665 /* Expand each epsilon destination nodes. */
1666 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1667 for (i = 0; i < dfa->edests[node].nelem; ++i)
1669 re_node_set eclosure_elem;
1670 int edest = dfa->edests[node].elems[i];
1671 /* If calculating the epsilon closure of `edest' is in progress,
1672 return intermediate result. */
1673 if (dfa->eclosures[edest].nelem == -1)
1678 /* If we haven't calculated the epsilon closure of `edest' yet,
1679 calculate now. Otherwise use calculated epsilon closure. */
1680 if (dfa->eclosures[edest].nelem == 0)
1682 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1683 if (BE (err != REG_NOERROR, 0))
1687 eclosure_elem = dfa->eclosures[edest];
1688 /* Merge the epsilon closure of `edest'. */
1689 re_node_set_merge (&eclosure, &eclosure_elem);
1690 /* If the epsilon closure of `edest' is incomplete,
1691 the epsilon closure of this node is also incomplete. */
1692 if (dfa->eclosures[edest].nelem == 0)
1695 re_node_set_free (&eclosure_elem);
1699 /* Epsilon closures include itself. */
1700 re_node_set_insert (&eclosure, node);
1701 if (incomplete && !root)
1702 dfa->eclosures[node].nelem = 0;
1704 dfa->eclosures[node] = eclosure;
1705 *new_set = eclosure;
1709 /* Functions for token which are used in the parser. */
1711 /* Fetch a token from INPUT.
1712 We must not use this function inside bracket expressions. */
1715 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1717 re_string_skip_bytes (input, peek_token (result, input, syntax));
1720 /* Peek a token from INPUT, and return the length of the token.
1721 We must not use this function inside bracket expressions. */
1724 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1728 if (re_string_eoi (input))
1730 token->type = END_OF_RE;
1734 c = re_string_peek_byte (input, 0);
1737 token->word_char = 0;
1738 #ifdef RE_ENABLE_I18N
1739 token->mb_partial = 0;
1740 if (input->mb_cur_max > 1 &&
1741 !re_string_first_byte (input, re_string_cur_idx (input)))
1743 token->type = CHARACTER;
1744 token->mb_partial = 1;
1751 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1753 token->type = BACK_SLASH;
1757 c2 = re_string_peek_byte_case (input, 1);
1759 token->type = CHARACTER;
1760 #ifdef RE_ENABLE_I18N
1761 if (input->mb_cur_max > 1)
1763 wint_t wc = re_string_wchar_at (input,
1764 re_string_cur_idx (input) + 1);
1765 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1769 token->word_char = IS_WORD_CHAR (c2) != 0;
1774 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1775 token->type = OP_ALT;
1777 case '1': case '2': case '3': case '4': case '5':
1778 case '6': case '7': case '8': case '9':
1779 if (!(syntax & RE_NO_BK_REFS))
1781 token->type = OP_BACK_REF;
1782 token->opr.idx = c2 - '1';
1786 if (!(syntax & RE_NO_GNU_OPS))
1788 token->type = ANCHOR;
1789 token->opr.ctx_type = WORD_FIRST;
1793 if (!(syntax & RE_NO_GNU_OPS))
1795 token->type = ANCHOR;
1796 token->opr.ctx_type = WORD_LAST;
1800 if (!(syntax & RE_NO_GNU_OPS))
1802 token->type = ANCHOR;
1803 token->opr.ctx_type = WORD_DELIM;
1807 if (!(syntax & RE_NO_GNU_OPS))
1809 token->type = ANCHOR;
1810 token->opr.ctx_type = NOT_WORD_DELIM;
1814 if (!(syntax & RE_NO_GNU_OPS))
1815 token->type = OP_WORD;
1818 if (!(syntax & RE_NO_GNU_OPS))
1819 token->type = OP_NOTWORD;
1822 if (!(syntax & RE_NO_GNU_OPS))
1823 token->type = OP_SPACE;
1826 if (!(syntax & RE_NO_GNU_OPS))
1827 token->type = OP_NOTSPACE;
1830 if (!(syntax & RE_NO_GNU_OPS))
1832 token->type = ANCHOR;
1833 token->opr.ctx_type = BUF_FIRST;
1837 if (!(syntax & RE_NO_GNU_OPS))
1839 token->type = ANCHOR;
1840 token->opr.ctx_type = BUF_LAST;
1844 if (!(syntax & RE_NO_BK_PARENS))
1845 token->type = OP_OPEN_SUBEXP;
1848 if (!(syntax & RE_NO_BK_PARENS))
1849 token->type = OP_CLOSE_SUBEXP;
1852 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1853 token->type = OP_DUP_PLUS;
1856 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1857 token->type = OP_DUP_QUESTION;
1860 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1861 token->type = OP_OPEN_DUP_NUM;
1864 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1865 token->type = OP_CLOSE_DUP_NUM;
1873 token->type = CHARACTER;
1874 #ifdef RE_ENABLE_I18N
1875 if (input->mb_cur_max > 1)
1877 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1878 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1882 token->word_char = IS_WORD_CHAR (token->opr.c);
1887 if (syntax & RE_NEWLINE_ALT)
1888 token->type = OP_ALT;
1891 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1892 token->type = OP_ALT;
1895 token->type = OP_DUP_ASTERISK;
1898 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1899 token->type = OP_DUP_PLUS;
1902 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1903 token->type = OP_DUP_QUESTION;
1906 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1907 token->type = OP_OPEN_DUP_NUM;
1910 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1911 token->type = OP_CLOSE_DUP_NUM;
1914 if (syntax & RE_NO_BK_PARENS)
1915 token->type = OP_OPEN_SUBEXP;
1918 if (syntax & RE_NO_BK_PARENS)
1919 token->type = OP_CLOSE_SUBEXP;
1922 token->type = OP_OPEN_BRACKET;
1925 token->type = OP_PERIOD;
1928 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1929 re_string_cur_idx (input) != 0)
1931 char prev = re_string_peek_byte (input, -1);
1932 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1935 token->type = ANCHOR;
1936 token->opr.ctx_type = LINE_FIRST;
1939 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1940 re_string_cur_idx (input) + 1 != re_string_length (input))
1943 re_string_skip_bytes (input, 1);
1944 peek_token (&next, input, syntax);
1945 re_string_skip_bytes (input, -1);
1946 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1949 token->type = ANCHOR;
1950 token->opr.ctx_type = LINE_LAST;
1958 /* Peek a token from INPUT, and return the length of the token.
1959 We must not use this function out of bracket expressions. */
1962 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1965 if (re_string_eoi (input))
1967 token->type = END_OF_RE;
1970 c = re_string_peek_byte (input, 0);
1973 #ifdef RE_ENABLE_I18N
1974 if (input->mb_cur_max > 1 &&
1975 !re_string_first_byte (input, re_string_cur_idx (input)))
1977 token->type = CHARACTER;
1980 #endif /* RE_ENABLE_I18N */
1982 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
1983 && re_string_cur_idx (input) + 1 < re_string_length (input))
1985 /* In this case, '\' escape a character. */
1987 re_string_skip_bytes (input, 1);
1988 c2 = re_string_peek_byte (input, 0);
1990 token->type = CHARACTER;
1993 if (c == '[') /* '[' is a special char in a bracket exps. */
1997 if (re_string_cur_idx (input) + 1 < re_string_length (input))
1998 c2 = re_string_peek_byte (input, 1);
2006 token->type = OP_OPEN_COLL_ELEM;
2009 token->type = OP_OPEN_EQUIV_CLASS;
2012 if (syntax & RE_CHAR_CLASSES)
2014 token->type = OP_OPEN_CHAR_CLASS;
2017 /* else fall through. */
2019 token->type = CHARACTER;
2029 token->type = OP_CHARSET_RANGE;
2032 token->type = OP_CLOSE_BRACKET;
2035 token->type = OP_NON_MATCH_LIST;
2038 token->type = CHARACTER;
2043 /* Functions for parser. */
2045 /* Entry point of the parser.
2046 Parse the regular expression REGEXP and return the structure tree.
2047 If an error is occured, ERR is set by error code, and return NULL.
2048 This function build the following tree, from regular expression <reg_exp>:
2054 CAT means concatenation.
2055 EOR means end of regular expression. */
2058 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2061 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2062 bin_tree_t *tree, *eor, *root;
2063 re_token_t current_token;
2064 dfa->syntax = syntax;
2065 fetch_token (¤t_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2066 tree = parse_reg_exp (regexp, preg, ¤t_token, syntax, 0, err);
2067 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2069 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2071 root = create_tree (dfa, tree, eor, CONCAT);
2074 if (BE (eor == NULL || root == NULL, 0))
2082 /* This function build the following tree, from regular expression
2083 <branch1>|<branch2>:
2089 ALT means alternative, which represents the operator `|'. */
2092 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2093 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2095 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2096 bin_tree_t *tree, *branch = NULL;
2097 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2098 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2101 while (token->type == OP_ALT)
2103 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2104 if (token->type != OP_ALT && token->type != END_OF_RE
2105 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2107 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2108 if (BE (*err != REG_NOERROR && branch == NULL, 0))
2113 tree = create_tree (dfa, tree, branch, OP_ALT);
2114 if (BE (tree == NULL, 0))
2123 /* This function build the following tree, from regular expression
2130 CAT means concatenation. */
2133 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2134 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2136 bin_tree_t *tree, *exp;
2137 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2138 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2139 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2142 while (token->type != OP_ALT && token->type != END_OF_RE
2143 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2145 exp = parse_expression (regexp, preg, token, syntax, nest, err);
2146 if (BE (*err != REG_NOERROR && exp == NULL, 0))
2150 if (tree != NULL && exp != NULL)
2152 tree = create_tree (dfa, tree, exp, CONCAT);
2159 else if (tree == NULL)
2161 /* Otherwise exp == NULL, we don't need to create new tree. */
2166 /* This function build the following tree, from regular expression a*:
2173 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2174 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2176 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2178 switch (token->type)
2181 tree = create_token_tree (dfa, NULL, NULL, token);
2182 if (BE (tree == NULL, 0))
2187 #ifdef RE_ENABLE_I18N
2188 if (dfa->mb_cur_max > 1)
2190 while (!re_string_eoi (regexp)
2191 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2193 bin_tree_t *mbc_remain;
2194 fetch_token (token, regexp, syntax);
2195 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2196 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2197 if (BE (mbc_remain == NULL || tree == NULL, 0))
2206 case OP_OPEN_SUBEXP:
2207 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2208 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2211 case OP_OPEN_BRACKET:
2212 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2213 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2217 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2222 dfa->used_bkref_map |= 1 << token->opr.idx;
2223 tree = create_token_tree (dfa, NULL, NULL, token);
2224 if (BE (tree == NULL, 0))
2230 dfa->has_mb_node = 1;
2232 case OP_OPEN_DUP_NUM:
2233 if (syntax & RE_CONTEXT_INVALID_DUP)
2239 case OP_DUP_ASTERISK:
2241 case OP_DUP_QUESTION:
2242 if (syntax & RE_CONTEXT_INVALID_OPS)
2247 else if (syntax & RE_CONTEXT_INDEP_OPS)
2249 fetch_token (token, regexp, syntax);
2250 return parse_expression (regexp, preg, token, syntax, nest, err);
2252 /* else fall through */
2253 case OP_CLOSE_SUBEXP:
2254 if ((token->type == OP_CLOSE_SUBEXP) &&
2255 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2260 /* else fall through */
2261 case OP_CLOSE_DUP_NUM:
2262 /* We treat it as a normal character. */
2264 /* Then we can these characters as normal characters. */
2265 token->type = CHARACTER;
2266 /* mb_partial and word_char bits should be initialized already
2268 tree = create_token_tree (dfa, NULL, NULL, token);
2269 if (BE (tree == NULL, 0))
2276 if ((token->opr.ctx_type
2277 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2278 && dfa->word_ops_used == 0)
2279 init_word_char (dfa);
2280 if (token->opr.ctx_type == WORD_DELIM
2281 || token->opr.ctx_type == NOT_WORD_DELIM)
2283 bin_tree_t *tree_first, *tree_last;
2284 if (token->opr.ctx_type == WORD_DELIM)
2286 token->opr.ctx_type = WORD_FIRST;
2287 tree_first = create_token_tree (dfa, NULL, NULL, token);
2288 token->opr.ctx_type = WORD_LAST;
2292 token->opr.ctx_type = INSIDE_WORD;
2293 tree_first = create_token_tree (dfa, NULL, NULL, token);
2294 token->opr.ctx_type = INSIDE_NOTWORD;
2296 tree_last = create_token_tree (dfa, NULL, NULL, token);
2297 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2298 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2306 tree = create_token_tree (dfa, NULL, NULL, token);
2307 if (BE (tree == NULL, 0))
2313 /* We must return here, since ANCHORs can't be followed
2314 by repetition operators.
2315 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2316 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2317 fetch_token (token, regexp, syntax);
2320 tree = create_token_tree (dfa, NULL, NULL, token);
2321 if (BE (tree == NULL, 0))
2326 if (dfa->mb_cur_max > 1)
2327 dfa->has_mb_node = 1;
2331 tree = build_charclass_op (dfa, regexp->trans,
2332 (const unsigned char *) "alnum",
2333 (const unsigned char *) "_",
2334 token->type == OP_NOTWORD, err);
2335 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2340 tree = build_charclass_op (dfa, regexp->trans,
2341 (const unsigned char *) "space",
2342 (const unsigned char *) "",
2343 token->type == OP_NOTSPACE, err);
2344 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2354 /* Must not happen? */
2360 fetch_token (token, regexp, syntax);
2362 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2363 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2365 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2366 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2368 /* In BRE consecutive duplications are not allowed. */
2369 if ((syntax & RE_CONTEXT_INVALID_DUP)
2370 && (token->type == OP_DUP_ASTERISK
2371 || token->type == OP_OPEN_DUP_NUM))
2381 /* This function build the following tree, from regular expression
2389 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2390 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2392 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2395 cur_nsub = preg->re_nsub++;
2397 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2399 /* The subexpression may be a null string. */
2400 if (token->type == OP_CLOSE_SUBEXP)
2404 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2405 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2407 if (BE (*err != REG_NOERROR, 0))
2410 dfa->completed_bkref_map |= 1 << cur_nsub;
2412 tree = create_tree (dfa, tree, NULL, SUBEXP);
2413 if (BE (tree == NULL, 0))
2418 tree->token.opr.idx = cur_nsub;
2422 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2425 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2426 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2428 bin_tree_t *tree = NULL, *old_tree = NULL;
2429 int i, start, end, start_idx = re_string_cur_idx (regexp);
2430 re_token_t start_token = *token;
2432 if (token->type == OP_OPEN_DUP_NUM)
2435 start = fetch_number (regexp, token, syntax);
2438 if (token->type == CHARACTER && token->opr.c == ',')
2439 start = 0; /* We treat "{,m}" as "{0,m}". */
2442 *err = REG_BADBR; /* <re>{} is invalid. */
2446 if (BE (start != -2, 1))
2448 /* We treat "{n}" as "{n,n}". */
2449 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2450 : ((token->type == CHARACTER && token->opr.c == ',')
2451 ? fetch_number (regexp, token, syntax) : -2));
2453 if (BE (start == -2 || end == -2, 0))
2455 /* Invalid sequence. */
2456 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2458 if (token->type == END_OF_RE)
2466 /* If the syntax bit is set, rollback. */
2467 re_string_set_index (regexp, start_idx);
2468 *token = start_token;
2469 token->type = CHARACTER;
2470 /* mb_partial and word_char bits should be already initialized by
2475 if (BE (end != -1 && start > end, 0))
2477 /* First number greater than second. */
2484 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2485 end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2488 fetch_token (token, regexp, syntax);
2490 if (BE (elem == NULL, 0))
2492 if (BE (start == 0 && end == 0, 0))
2494 postorder (elem, free_tree, NULL);
2498 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2499 if (BE (start > 0, 0))
2502 for (i = 2; i <= start; ++i)
2504 elem = duplicate_tree (elem, dfa);
2505 tree = create_tree (dfa, tree, elem, CONCAT);
2506 if (BE (elem == NULL || tree == NULL, 0))
2507 goto parse_dup_op_espace;
2513 /* Duplicate ELEM before it is marked optional. */
2514 elem = duplicate_tree (elem, dfa);
2520 if (elem->token.type == SUBEXP)
2521 postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2523 tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2524 if (BE (tree == NULL, 0))
2525 goto parse_dup_op_espace;
2527 /* This loop is actually executed only when end != -1,
2528 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2529 already created the start+1-th copy. */
2530 for (i = start + 2; i <= end; ++i)
2532 elem = duplicate_tree (elem, dfa);
2533 tree = create_tree (dfa, tree, elem, CONCAT);
2534 if (BE (elem == NULL || tree == NULL, 0))
2535 goto parse_dup_op_espace;
2537 tree = create_tree (dfa, tree, NULL, OP_ALT);
2538 if (BE (tree == NULL, 0))
2539 goto parse_dup_op_espace;
2543 tree = create_tree (dfa, old_tree, tree, CONCAT);
2547 parse_dup_op_espace:
2552 /* Size of the names for collating symbol/equivalence_class/character_class.
2553 I'm not sure, but maybe enough. */
2554 #define BRACKET_NAME_BUF_SIZE 32
2557 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2558 Build the range expression which starts from START_ELEM, and ends
2559 at END_ELEM. The result are written to MBCSET and SBCSET.
2560 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2561 mbcset->range_ends, is a pointer argument sinse we may
2564 static reg_errcode_t
2565 build_range_exp (re_bitset_ptr_t sbcset,
2566 # ifdef RE_ENABLE_I18N
2567 re_charset_t *mbcset, int *range_alloc,
2569 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2571 unsigned int start_ch, end_ch;
2572 /* Equivalence Classes and Character Classes can't be a range start/end. */
2573 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2574 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2578 /* We can handle no multi character collating elements without libc
2580 if (BE ((start_elem->type == COLL_SYM
2581 && strlen ((char *) start_elem->opr.name) > 1)
2582 || (end_elem->type == COLL_SYM
2583 && strlen ((char *) end_elem->opr.name) > 1), 0))
2584 return REG_ECOLLATE;
2586 # ifdef RE_ENABLE_I18N
2589 wint_t start_wc, end_wc;
2590 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2592 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2593 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2595 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2596 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2598 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2599 ? __btowc (start_ch) : start_elem->opr.wch);
2600 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2601 ? __btowc (end_ch) : end_elem->opr.wch);
2602 if (start_wc == WEOF || end_wc == WEOF)
2603 return REG_ECOLLATE;
2604 cmp_buf[0] = start_wc;
2605 cmp_buf[4] = end_wc;
2606 if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2609 /* Got valid collation sequence values, add them as a new entry.
2610 However, for !_LIBC we have no collation elements: if the
2611 character set is single byte, the single byte character set
2612 that we build below suffices. parse_bracket_exp passes
2613 no MBCSET if dfa->mb_cur_max == 1. */
2616 /* Check the space of the arrays. */
2617 if (BE (*range_alloc == mbcset->nranges, 0))
2619 /* There is not enough space, need realloc. */
2620 wchar_t *new_array_start, *new_array_end;
2623 /* +1 in case of mbcset->nranges is 0. */
2624 new_nranges = 2 * mbcset->nranges + 1;
2625 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2626 are NULL if *range_alloc == 0. */
2627 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2629 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2632 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2635 mbcset->range_starts = new_array_start;
2636 mbcset->range_ends = new_array_end;
2637 *range_alloc = new_nranges;
2640 mbcset->range_starts[mbcset->nranges] = start_wc;
2641 mbcset->range_ends[mbcset->nranges++] = end_wc;
2644 /* Build the table for single byte characters. */
2645 for (wc = 0; wc < SBC_MAX; ++wc)
2648 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2649 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2650 bitset_set (sbcset, wc);
2653 # else /* not RE_ENABLE_I18N */
2656 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2657 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2659 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2660 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2662 if (start_ch > end_ch)
2664 /* Build the table for single byte characters. */
2665 for (ch = 0; ch < SBC_MAX; ++ch)
2666 if (start_ch <= ch && ch <= end_ch)
2667 bitset_set (sbcset, ch);
2669 # endif /* not RE_ENABLE_I18N */
2672 #endif /* not _LIBC */
2675 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2676 Build the collating element which is represented by NAME.
2677 The result are written to MBCSET and SBCSET.
2678 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2679 pointer argument since we may update it. */
2681 static reg_errcode_t
2682 build_collating_symbol (re_bitset_ptr_t sbcset,
2683 # ifdef RE_ENABLE_I18N
2684 re_charset_t *mbcset, int *coll_sym_alloc,
2686 const unsigned char *name)
2688 size_t name_len = strlen ((const char *) name);
2689 if (BE (name_len != 1, 0))
2690 return REG_ECOLLATE;
2693 bitset_set (sbcset, name[0]);
2697 #endif /* not _LIBC */
2699 /* This function parse bracket expression like "[abc]", "[a-c]",
2703 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2704 reg_syntax_t syntax, reg_errcode_t *err)
2707 const unsigned char *collseqmb;
2708 const char *collseqwc;
2711 const int32_t *symb_table;
2712 const unsigned char *extra;
2714 /* Local function for parse_bracket_exp used in _LIBC environement.
2715 Seek the collating symbol entry correspondings to NAME.
2716 Return the index of the symbol in the SYMB_TABLE. */
2719 __attribute ((always_inline))
2720 seek_collating_symbol_entry (name, name_len)
2721 const unsigned char *name;
2724 int32_t hash = elem_hash ((const char *) name, name_len);
2725 int32_t elem = hash % table_size;
2726 int32_t second = hash % (table_size - 2);
2727 while (symb_table[2 * elem] != 0)
2729 /* First compare the hashing value. */
2730 if (symb_table[2 * elem] == hash
2731 /* Compare the length of the name. */
2732 && name_len == extra[symb_table[2 * elem + 1]]
2733 /* Compare the name. */
2734 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2737 /* Yep, this is the entry. */
2747 /* Local function for parse_bracket_exp used in _LIBC environement.
2748 Look up the collation sequence value of BR_ELEM.
2749 Return the value if succeeded, UINT_MAX otherwise. */
2751 auto inline unsigned int
2752 __attribute ((always_inline))
2753 lookup_collation_sequence_value (br_elem)
2754 bracket_elem_t *br_elem;
2756 if (br_elem->type == SB_CHAR)
2759 if (MB_CUR_MAX == 1)
2762 return collseqmb[br_elem->opr.ch];
2765 wint_t wc = __btowc (br_elem->opr.ch);
2766 return __collseq_table_lookup (collseqwc, wc);
2769 else if (br_elem->type == MB_CHAR)
2771 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2773 else if (br_elem->type == COLL_SYM)
2775 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2779 elem = seek_collating_symbol_entry (br_elem->opr.name,
2781 if (symb_table[2 * elem] != 0)
2783 /* We found the entry. */
2784 idx = symb_table[2 * elem + 1];
2785 /* Skip the name of collating element name. */
2786 idx += 1 + extra[idx];
2787 /* Skip the byte sequence of the collating element. */
2788 idx += 1 + extra[idx];
2789 /* Adjust for the alignment. */
2790 idx = (idx + 3) & ~3;
2791 /* Skip the multibyte collation sequence value. */
2792 idx += sizeof (unsigned int);
2793 /* Skip the wide char sequence of the collating element. */
2794 idx += sizeof (unsigned int) *
2795 (1 + *(unsigned int *) (extra + idx));
2796 /* Return the collation sequence value. */
2797 return *(unsigned int *) (extra + idx);
2799 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2801 /* No valid character. Match it as a single byte
2803 return collseqmb[br_elem->opr.name[0]];
2806 else if (sym_name_len == 1)
2807 return collseqmb[br_elem->opr.name[0]];
2812 /* Local function for parse_bracket_exp used in _LIBC environement.
2813 Build the range expression which starts from START_ELEM, and ends
2814 at END_ELEM. The result are written to MBCSET and SBCSET.
2815 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2816 mbcset->range_ends, is a pointer argument sinse we may
2819 auto inline reg_errcode_t
2820 __attribute ((always_inline))
2821 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2822 re_charset_t *mbcset;
2824 re_bitset_ptr_t sbcset;
2825 bracket_elem_t *start_elem, *end_elem;
2828 uint32_t start_collseq;
2829 uint32_t end_collseq;
2831 /* Equivalence Classes and Character Classes can't be a range
2833 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2834 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2838 start_collseq = lookup_collation_sequence_value (start_elem);
2839 end_collseq = lookup_collation_sequence_value (end_elem);
2840 /* Check start/end collation sequence values. */
2841 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2842 return REG_ECOLLATE;
2843 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2846 /* Got valid collation sequence values, add them as a new entry.
2847 However, if we have no collation elements, and the character set
2848 is single byte, the single byte character set that we
2849 build below suffices. */
2850 if (nrules > 0 || dfa->mb_cur_max > 1)
2852 /* Check the space of the arrays. */
2853 if (BE (*range_alloc == mbcset->nranges, 0))
2855 /* There is not enough space, need realloc. */
2856 uint32_t *new_array_start;
2857 uint32_t *new_array_end;
2860 /* +1 in case of mbcset->nranges is 0. */
2861 new_nranges = 2 * mbcset->nranges + 1;
2862 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2864 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2867 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2870 mbcset->range_starts = new_array_start;
2871 mbcset->range_ends = new_array_end;
2872 *range_alloc = new_nranges;
2875 mbcset->range_starts[mbcset->nranges] = start_collseq;
2876 mbcset->range_ends[mbcset->nranges++] = end_collseq;
2879 /* Build the table for single byte characters. */
2880 for (ch = 0; ch < SBC_MAX; ch++)
2882 uint32_t ch_collseq;
2884 if (MB_CUR_MAX == 1)
2887 ch_collseq = collseqmb[ch];
2889 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2890 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2891 bitset_set (sbcset, ch);
2896 /* Local function for parse_bracket_exp used in _LIBC environement.
2897 Build the collating element which is represented by NAME.
2898 The result are written to MBCSET and SBCSET.
2899 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2900 pointer argument sinse we may update it. */
2902 auto inline reg_errcode_t
2903 __attribute ((always_inline))
2904 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2905 re_charset_t *mbcset;
2906 int *coll_sym_alloc;
2907 re_bitset_ptr_t sbcset;
2908 const unsigned char *name;
2911 size_t name_len = strlen ((const char *) name);
2914 elem = seek_collating_symbol_entry (name, name_len);
2915 if (symb_table[2 * elem] != 0)
2917 /* We found the entry. */
2918 idx = symb_table[2 * elem + 1];
2919 /* Skip the name of collating element name. */
2920 idx += 1 + extra[idx];
2922 else if (symb_table[2 * elem] == 0 && name_len == 1)
2924 /* No valid character, treat it as a normal
2926 bitset_set (sbcset, name[0]);
2930 return REG_ECOLLATE;
2932 /* Got valid collation sequence, add it as a new entry. */
2933 /* Check the space of the arrays. */
2934 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
2936 /* Not enough, realloc it. */
2937 /* +1 in case of mbcset->ncoll_syms is 0. */
2938 int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2939 /* Use realloc since mbcset->coll_syms is NULL
2941 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2942 new_coll_sym_alloc);
2943 if (BE (new_coll_syms == NULL, 0))
2945 mbcset->coll_syms = new_coll_syms;
2946 *coll_sym_alloc = new_coll_sym_alloc;
2948 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
2953 if (BE (name_len != 1, 0))
2954 return REG_ECOLLATE;
2957 bitset_set (sbcset, name[0]);
2964 re_token_t br_token;
2965 re_bitset_ptr_t sbcset;
2966 #ifdef RE_ENABLE_I18N
2967 re_charset_t *mbcset;
2968 int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
2969 int equiv_class_alloc = 0, char_class_alloc = 0;
2970 #endif /* not RE_ENABLE_I18N */
2972 bin_tree_t *work_tree;
2974 int first_round = 1;
2976 collseqmb = (const unsigned char *)
2977 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
2978 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
2984 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
2985 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
2986 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
2987 _NL_COLLATE_SYMB_TABLEMB);
2988 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
2989 _NL_COLLATE_SYMB_EXTRAMB);
2992 sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
2993 #ifdef RE_ENABLE_I18N
2994 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
2995 #endif /* RE_ENABLE_I18N */
2996 #ifdef RE_ENABLE_I18N
2997 if (BE (sbcset == NULL || mbcset == NULL, 0))
2999 if (BE (sbcset == NULL, 0))
3000 #endif /* RE_ENABLE_I18N */
3006 token_len = peek_token_bracket (token, regexp, syntax);
3007 if (BE (token->type == END_OF_RE, 0))
3010 goto parse_bracket_exp_free_return;
3012 if (token->type == OP_NON_MATCH_LIST)
3014 #ifdef RE_ENABLE_I18N
3015 mbcset->non_match = 1;
3016 #endif /* not RE_ENABLE_I18N */
3018 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3019 bitset_set (sbcset, '\0');
3020 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3021 token_len = peek_token_bracket (token, regexp, syntax);
3022 if (BE (token->type == END_OF_RE, 0))
3025 goto parse_bracket_exp_free_return;
3029 /* We treat the first ']' as a normal character. */
3030 if (token->type == OP_CLOSE_BRACKET)
3031 token->type = CHARACTER;
3035 bracket_elem_t start_elem, end_elem;
3036 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3037 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3039 int token_len2 = 0, is_range_exp = 0;
3042 start_elem.opr.name = start_name_buf;
3043 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3044 syntax, first_round);
3045 if (BE (ret != REG_NOERROR, 0))
3048 goto parse_bracket_exp_free_return;
3052 /* Get information about the next token. We need it in any case. */
3053 token_len = peek_token_bracket (token, regexp, syntax);
3055 /* Do not check for ranges if we know they are not allowed. */
3056 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3058 if (BE (token->type == END_OF_RE, 0))
3061 goto parse_bracket_exp_free_return;
3063 if (token->type == OP_CHARSET_RANGE)
3065 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3066 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3067 if (BE (token2.type == END_OF_RE, 0))
3070 goto parse_bracket_exp_free_return;
3072 if (token2.type == OP_CLOSE_BRACKET)
3074 /* We treat the last '-' as a normal character. */
3075 re_string_skip_bytes (regexp, -token_len);
3076 token->type = CHARACTER;
3083 if (is_range_exp == 1)
3085 end_elem.opr.name = end_name_buf;
3086 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3088 if (BE (ret != REG_NOERROR, 0))
3091 goto parse_bracket_exp_free_return;
3094 token_len = peek_token_bracket (token, regexp, syntax);
3097 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3098 &start_elem, &end_elem);
3100 # ifdef RE_ENABLE_I18N
3101 *err = build_range_exp (sbcset,
3102 dfa->mb_cur_max > 1 ? mbcset : NULL,
3103 &range_alloc, &start_elem, &end_elem);
3105 *err = build_range_exp (sbcset, &start_elem, &end_elem);
3107 #endif /* RE_ENABLE_I18N */
3108 if (BE (*err != REG_NOERROR, 0))
3109 goto parse_bracket_exp_free_return;
3113 switch (start_elem.type)
3116 bitset_set (sbcset, start_elem.opr.ch);
3118 #ifdef RE_ENABLE_I18N
3120 /* Check whether the array has enough space. */
3121 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3123 wchar_t *new_mbchars;
3124 /* Not enough, realloc it. */
3125 /* +1 in case of mbcset->nmbchars is 0. */
3126 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3127 /* Use realloc since array is NULL if *alloc == 0. */
3128 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3130 if (BE (new_mbchars == NULL, 0))
3131 goto parse_bracket_exp_espace;
3132 mbcset->mbchars = new_mbchars;
3134 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3136 #endif /* RE_ENABLE_I18N */
3138 *err = build_equiv_class (sbcset,
3139 #ifdef RE_ENABLE_I18N
3140 mbcset, &equiv_class_alloc,
3141 #endif /* RE_ENABLE_I18N */
3142 start_elem.opr.name);
3143 if (BE (*err != REG_NOERROR, 0))
3144 goto parse_bracket_exp_free_return;
3147 *err = build_collating_symbol (sbcset,
3148 #ifdef RE_ENABLE_I18N
3149 mbcset, &coll_sym_alloc,
3150 #endif /* RE_ENABLE_I18N */
3151 start_elem.opr.name);
3152 if (BE (*err != REG_NOERROR, 0))
3153 goto parse_bracket_exp_free_return;
3156 *err = build_charclass (regexp->trans, sbcset,
3157 #ifdef RE_ENABLE_I18N
3158 mbcset, &char_class_alloc,
3159 #endif /* RE_ENABLE_I18N */
3160 start_elem.opr.name, syntax);
3161 if (BE (*err != REG_NOERROR, 0))
3162 goto parse_bracket_exp_free_return;
3169 if (BE (token->type == END_OF_RE, 0))
3172 goto parse_bracket_exp_free_return;
3174 if (token->type == OP_CLOSE_BRACKET)
3178 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3180 /* If it is non-matching list. */
3182 bitset_not (sbcset);
3184 #ifdef RE_ENABLE_I18N
3185 /* Ensure only single byte characters are set. */
3186 if (dfa->mb_cur_max > 1)
3187 bitset_mask (sbcset, dfa->sb_char);
3189 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3190 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3191 || mbcset->non_match)))
3193 bin_tree_t *mbc_tree;
3195 /* Build a tree for complex bracket. */
3196 dfa->has_mb_node = 1;
3197 br_token.type = COMPLEX_BRACKET;
3198 br_token.opr.mbcset = mbcset;
3199 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3200 if (BE (mbc_tree == NULL, 0))
3201 goto parse_bracket_exp_espace;
3202 for (sbc_idx = 0; sbc_idx < BITSET_UINTS; ++sbc_idx)
3203 if (sbcset[sbc_idx])
3205 /* If there are no bits set in sbcset, there is no point
3206 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3207 if (sbc_idx < BITSET_UINTS)
3209 /* Build a tree for simple bracket. */
3210 br_token.type = SIMPLE_BRACKET;
3211 br_token.opr.sbcset = sbcset;
3212 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3213 if (BE (work_tree == NULL, 0))
3214 goto parse_bracket_exp_espace;
3216 /* Then join them by ALT node. */
3217 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3218 if (BE (work_tree == NULL, 0))
3219 goto parse_bracket_exp_espace;
3224 work_tree = mbc_tree;
3228 #endif /* not RE_ENABLE_I18N */
3230 #ifdef RE_ENABLE_I18N
3231 free_charset (mbcset);
3233 /* Build a tree for simple bracket. */
3234 br_token.type = SIMPLE_BRACKET;
3235 br_token.opr.sbcset = sbcset;
3236 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3237 if (BE (work_tree == NULL, 0))
3238 goto parse_bracket_exp_espace;
3242 parse_bracket_exp_espace:
3244 parse_bracket_exp_free_return:
3246 #ifdef RE_ENABLE_I18N
3247 free_charset (mbcset);
3248 #endif /* RE_ENABLE_I18N */
3252 /* Parse an element in the bracket expression. */
3254 static reg_errcode_t
3255 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3256 re_token_t *token, int token_len, re_dfa_t *dfa,
3257 reg_syntax_t syntax, int accept_hyphen)
3259 #ifdef RE_ENABLE_I18N
3261 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3262 if (cur_char_size > 1)
3264 elem->type = MB_CHAR;
3265 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3266 re_string_skip_bytes (regexp, cur_char_size);
3269 #endif /* RE_ENABLE_I18N */
3270 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3271 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3272 || token->type == OP_OPEN_EQUIV_CLASS)
3273 return parse_bracket_symbol (elem, regexp, token);
3274 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3276 /* A '-' must only appear as anything but a range indicator before
3277 the closing bracket. Everything else is an error. */
3279 (void) peek_token_bracket (&token2, regexp, syntax);
3280 if (token2.type != OP_CLOSE_BRACKET)
3281 /* The actual error value is not standardized since this whole
3282 case is undefined. But ERANGE makes good sense. */
3285 elem->type = SB_CHAR;
3286 elem->opr.ch = token->opr.c;
3290 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3291 such as [:<character_class>:], [.<collating_element>.], and
3292 [=<equivalent_class>=]. */
3294 static reg_errcode_t
3295 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3298 unsigned char ch, delim = token->opr.c;
3300 if (re_string_eoi(regexp))
3304 if (i >= BRACKET_NAME_BUF_SIZE)
3306 if (token->type == OP_OPEN_CHAR_CLASS)
3307 ch = re_string_fetch_byte_case (regexp);
3309 ch = re_string_fetch_byte (regexp);
3310 if (re_string_eoi(regexp))
3312 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3314 elem->opr.name[i] = ch;
3316 re_string_skip_bytes (regexp, 1);
3317 elem->opr.name[i] = '\0';
3318 switch (token->type)
3320 case OP_OPEN_COLL_ELEM:
3321 elem->type = COLL_SYM;
3323 case OP_OPEN_EQUIV_CLASS:
3324 elem->type = EQUIV_CLASS;
3326 case OP_OPEN_CHAR_CLASS:
3327 elem->type = CHAR_CLASS;
3335 /* Helper function for parse_bracket_exp.
3336 Build the equivalence class which is represented by NAME.
3337 The result are written to MBCSET and SBCSET.
3338 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3339 is a pointer argument sinse we may update it. */
3341 static reg_errcode_t
3342 build_equiv_class (re_bitset_ptr_t sbcset,
3343 #ifdef RE_ENABLE_I18N
3344 re_charset_t *mbcset, int *equiv_class_alloc,
3346 const unsigned char *name)
3349 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3352 const int32_t *table, *indirect;
3353 const unsigned char *weights, *extra, *cp;
3354 unsigned char char_buf[2];
3358 /* This #include defines a local function! */
3359 # include <locale/weight.h>
3360 /* Calculate the index for equivalence class. */
3362 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3363 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3364 _NL_COLLATE_WEIGHTMB);
3365 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3366 _NL_COLLATE_EXTRAMB);
3367 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3368 _NL_COLLATE_INDIRECTMB);
3369 idx1 = findidx (&cp);
3370 if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3371 /* This isn't a valid character. */
3372 return REG_ECOLLATE;
3374 /* Build single byte matcing table for this equivalence class. */
3375 char_buf[1] = (unsigned char) '\0';
3376 len = weights[idx1];
3377 for (ch = 0; ch < SBC_MAX; ++ch)
3381 idx2 = findidx (&cp);
3386 /* This isn't a valid character. */
3388 if (len == weights[idx2])
3391 while (cnt <= len &&
3392 weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3396 bitset_set (sbcset, ch);
3399 /* Check whether the array has enough space. */
3400 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3402 /* Not enough, realloc it. */
3403 /* +1 in case of mbcset->nequiv_classes is 0. */
3404 int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3405 /* Use realloc since the array is NULL if *alloc == 0. */
3406 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3408 new_equiv_class_alloc);
3409 if (BE (new_equiv_classes == NULL, 0))
3411 mbcset->equiv_classes = new_equiv_classes;
3412 *equiv_class_alloc = new_equiv_class_alloc;
3414 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3419 if (BE (strlen ((const char *) name) != 1, 0))
3420 return REG_ECOLLATE;
3421 bitset_set (sbcset, *name);
3426 /* Helper function for parse_bracket_exp.
3427 Build the character class which is represented by NAME.
3428 The result are written to MBCSET and SBCSET.
3429 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3430 is a pointer argument sinse we may update it. */
3432 static reg_errcode_t
3433 build_charclass (unsigned RE_TRANSLATE_TYPE trans, re_bitset_ptr_t sbcset,
3434 #ifdef RE_ENABLE_I18N
3435 re_charset_t *mbcset, int *char_class_alloc,
3437 const unsigned char *class_name, reg_syntax_t syntax)
3440 const char *name = (const char *) class_name;
3442 /* In case of REG_ICASE "upper" and "lower" match the both of
3443 upper and lower cases. */
3444 if ((syntax & RE_ICASE)
3445 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3448 #ifdef RE_ENABLE_I18N
3449 /* Check the space of the arrays. */
3450 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3452 /* Not enough, realloc it. */
3453 /* +1 in case of mbcset->nchar_classes is 0. */
3454 int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3455 /* Use realloc since array is NULL if *alloc == 0. */
3456 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3457 new_char_class_alloc);
3458 if (BE (new_char_classes == NULL, 0))
3460 mbcset->char_classes = new_char_classes;
3461 *char_class_alloc = new_char_class_alloc;
3463 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3464 #endif /* RE_ENABLE_I18N */
3466 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3467 for (i = 0; i < SBC_MAX; ++i) \
3469 if (ctype_func (i)) \
3471 int ch = trans ? trans[i] : i; \
3472 bitset_set (sbcset, ch); \
3476 if (strcmp (name, "alnum") == 0)
3477 BUILD_CHARCLASS_LOOP (isalnum)
3478 else if (strcmp (name, "cntrl") == 0)
3479 BUILD_CHARCLASS_LOOP (iscntrl)
3480 else if (strcmp (name, "lower") == 0)
3481 BUILD_CHARCLASS_LOOP (islower)
3482 else if (strcmp (name, "space") == 0)
3483 BUILD_CHARCLASS_LOOP (isspace)
3484 else if (strcmp (name, "alpha") == 0)
3485 BUILD_CHARCLASS_LOOP (isalpha)
3486 else if (strcmp (name, "digit") == 0)
3487 BUILD_CHARCLASS_LOOP (isdigit)
3488 else if (strcmp (name, "print") == 0)
3489 BUILD_CHARCLASS_LOOP (isprint)
3490 else if (strcmp (name, "upper") == 0)
3491 BUILD_CHARCLASS_LOOP (isupper)
3492 else if (strcmp (name, "blank") == 0)
3493 BUILD_CHARCLASS_LOOP (isblank)
3494 else if (strcmp (name, "graph") == 0)
3495 BUILD_CHARCLASS_LOOP (isgraph)
3496 else if (strcmp (name, "punct") == 0)
3497 BUILD_CHARCLASS_LOOP (ispunct)
3498 else if (strcmp (name, "xdigit") == 0)
3499 BUILD_CHARCLASS_LOOP (isxdigit)
3507 build_charclass_op (re_dfa_t *dfa, unsigned RE_TRANSLATE_TYPE trans,
3508 const unsigned char *class_name,
3509 const unsigned char *extra,
3510 int non_match, reg_errcode_t *err)
3512 re_bitset_ptr_t sbcset;
3513 #ifdef RE_ENABLE_I18N
3514 re_charset_t *mbcset;
3516 #endif /* not RE_ENABLE_I18N */
3518 re_token_t br_token;
3521 sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
3522 #ifdef RE_ENABLE_I18N
3523 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3524 #endif /* RE_ENABLE_I18N */
3526 #ifdef RE_ENABLE_I18N
3527 if (BE (sbcset == NULL || mbcset == NULL, 0))
3528 #else /* not RE_ENABLE_I18N */
3529 if (BE (sbcset == NULL, 0))
3530 #endif /* not RE_ENABLE_I18N */
3538 #ifdef RE_ENABLE_I18N
3540 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3541 bitset_set(cset->sbcset, '\0');
3543 mbcset->non_match = 1;
3544 #endif /* not RE_ENABLE_I18N */
3547 /* We don't care the syntax in this case. */
3548 ret = build_charclass (trans, sbcset,
3549 #ifdef RE_ENABLE_I18N
3551 #endif /* RE_ENABLE_I18N */
3554 if (BE (ret != REG_NOERROR, 0))
3557 #ifdef RE_ENABLE_I18N
3558 free_charset (mbcset);
3559 #endif /* RE_ENABLE_I18N */
3563 /* \w match '_' also. */
3564 for (; *extra; extra++)
3565 bitset_set (sbcset, *extra);
3567 /* If it is non-matching list. */
3569 bitset_not (sbcset);
3571 #ifdef RE_ENABLE_I18N
3572 /* Ensure only single byte characters are set. */
3573 if (dfa->mb_cur_max > 1)
3574 bitset_mask (sbcset, dfa->sb_char);
3577 /* Build a tree for simple bracket. */
3578 br_token.type = SIMPLE_BRACKET;
3579 br_token.opr.sbcset = sbcset;
3580 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3581 if (BE (tree == NULL, 0))
3582 goto build_word_op_espace;
3584 #ifdef RE_ENABLE_I18N
3585 if (dfa->mb_cur_max > 1)
3587 bin_tree_t *mbc_tree;
3588 /* Build a tree for complex bracket. */
3589 br_token.type = COMPLEX_BRACKET;
3590 br_token.opr.mbcset = mbcset;
3591 dfa->has_mb_node = 1;
3592 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3593 if (BE (mbc_tree == NULL, 0))
3594 goto build_word_op_espace;
3595 /* Then join them by ALT node. */
3596 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3597 if (BE (mbc_tree != NULL, 1))
3602 free_charset (mbcset);
3605 #else /* not RE_ENABLE_I18N */
3607 #endif /* not RE_ENABLE_I18N */
3609 build_word_op_espace:
3611 #ifdef RE_ENABLE_I18N
3612 free_charset (mbcset);
3613 #endif /* RE_ENABLE_I18N */
3618 /* This is intended for the expressions like "a{1,3}".
3619 Fetch a number from `input', and return the number.
3620 Return -1, if the number field is empty like "{,1}".
3621 Return -2, If an error is occured. */
3624 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3630 fetch_token (token, input, syntax);
3632 if (BE (token->type == END_OF_RE, 0))
3634 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3636 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3637 ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3638 num = (num > RE_DUP_MAX) ? -2 : num;
3643 #ifdef RE_ENABLE_I18N
3645 free_charset (re_charset_t *cset)
3647 re_free (cset->mbchars);
3649 re_free (cset->coll_syms);
3650 re_free (cset->equiv_classes);
3651 re_free (cset->range_starts);
3652 re_free (cset->range_ends);
3654 re_free (cset->char_classes);
3657 #endif /* RE_ENABLE_I18N */
3659 /* Functions for binary tree operation. */
3661 /* Create a tree node. */
3664 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3665 re_token_type_t type)
3669 return create_token_tree (dfa, left, right, &t);
3673 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3674 const re_token_t *token)
3677 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3679 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3681 if (storage == NULL)
3683 storage->next = dfa->str_tree_storage;
3684 dfa->str_tree_storage = storage;
3685 dfa->str_tree_storage_idx = 0;
3687 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3689 tree->parent = NULL;
3691 tree->right = right;
3692 tree->token = *token;
3693 tree->token.duplicated = 0;
3694 tree->token.opt_subexp = 0;
3697 tree->node_idx = -1;
3700 left->parent = tree;
3702 right->parent = tree;
3706 /* Mark the tree SRC as an optional subexpression.
3707 To be called from preorder or postorder. */
3709 static reg_errcode_t
3710 mark_opt_subexp (void *extra, bin_tree_t *node)
3712 int idx = (int) (long) extra;
3713 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3714 node->token.opt_subexp = 1;
3719 /* Free the allocated memory inside NODE. */
3722 free_token (re_token_t *node)
3724 #ifdef RE_ENABLE_I18N
3725 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3726 free_charset (node->opr.mbcset);
3728 #endif /* RE_ENABLE_I18N */
3729 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3730 re_free (node->opr.sbcset);
3733 /* Worker function for tree walking. Free the allocated memory inside NODE
3734 and its children. */
3736 static reg_errcode_t
3737 free_tree (void *extra, bin_tree_t *node)
3739 free_token (&node->token);
3744 /* Duplicate the node SRC, and return new node. This is a preorder
3745 visit similar to the one implemented by the generic visitor, but
3746 we need more infrastructure to maintain two parallel trees --- so,
3747 it's easier to duplicate. */
3750 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3752 const bin_tree_t *node;
3753 bin_tree_t *dup_root;
3754 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3756 for (node = root; ; )
3758 /* Create a new tree and link it back to the current parent. */
3759 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3762 (*p_new)->parent = dup_node;
3763 (*p_new)->token.duplicated = 1;
3766 /* Go to the left node, or up and to the right. */
3770 p_new = &dup_node->left;
3774 const bin_tree_t *prev = NULL;
3775 while (node->right == prev || node->right == NULL)
3778 node = node->parent;
3779 dup_node = dup_node->parent;
3784 p_new = &dup_node->right;