1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License along
17 with this program; if not, write to the Free Software Foundation,
18 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
20 static void re_string_construct_common (const char *str, int len,
22 REG_TRANSLATE_TYPE trans, int icase,
23 const re_dfa_t *dfa) internal_function;
24 static re_dfastate_t *create_ci_newstate (re_dfa_t *dfa,
25 const re_node_set *nodes,
26 unsigned int hash) internal_function;
27 static re_dfastate_t *create_cd_newstate (re_dfa_t *dfa,
28 const re_node_set *nodes,
30 unsigned int hash) internal_function;
32 /* Functions for string operation. */
34 /* This function allocate the buffers. It is necessary to call
35 re_string_reconstruct before using the object. */
39 re_string_allocate (re_string_t *pstr, const char *str, int len, int init_len,
40 REG_TRANSLATE_TYPE trans, int icase, const re_dfa_t *dfa)
45 /* Ensure at least one character fits into the buffers. */
46 if (init_len < dfa->mb_cur_max)
47 init_len = dfa->mb_cur_max;
48 init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
49 re_string_construct_common (str, len, pstr, trans, icase, dfa);
51 ret = re_string_realloc_buffers (pstr, init_buf_len);
52 if (BE (ret != REG_NOERROR, 0))
55 pstr->word_char = dfa->word_char;
56 pstr->word_ops_used = dfa->word_ops_used;
57 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
58 pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
59 pstr->valid_raw_len = pstr->valid_len;
63 /* This function allocate the buffers, and initialize them. */
67 re_string_construct (re_string_t *pstr, const char *str, int len,
68 REG_TRANSLATE_TYPE trans, int icase, const re_dfa_t *dfa)
71 memset (pstr, '\0', sizeof (re_string_t));
72 re_string_construct_common (str, len, pstr, trans, icase, dfa);
76 ret = re_string_realloc_buffers (pstr, len + 1);
77 if (BE (ret != REG_NOERROR, 0))
80 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
85 if (dfa->mb_cur_max > 1)
89 ret = build_wcs_upper_buffer (pstr);
90 if (BE (ret != REG_NOERROR, 0))
92 if (pstr->valid_raw_len >= len)
94 if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
96 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
97 if (BE (ret != REG_NOERROR, 0))
102 #endif /* RE_ENABLE_I18N */
103 build_upper_buffer (pstr);
107 #ifdef RE_ENABLE_I18N
108 if (dfa->mb_cur_max > 1)
109 build_wcs_buffer (pstr);
111 #endif /* RE_ENABLE_I18N */
114 re_string_translate_buffer (pstr);
117 pstr->valid_len = pstr->bufs_len;
118 pstr->valid_raw_len = pstr->bufs_len;
126 /* Helper functions for re_string_allocate, and re_string_construct. */
130 re_string_realloc_buffers (re_string_t *pstr, int new_buf_len)
132 #ifdef RE_ENABLE_I18N
133 if (pstr->mb_cur_max > 1)
135 wint_t *new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
136 if (BE (new_wcs == NULL, 0))
139 if (pstr->offsets != NULL)
141 int *new_offsets = re_realloc (pstr->offsets, int, new_buf_len);
142 if (BE (new_offsets == NULL, 0))
144 pstr->offsets = new_offsets;
147 #endif /* RE_ENABLE_I18N */
148 if (pstr->mbs_allocated)
150 unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
152 if (BE (new_mbs == NULL, 0))
156 pstr->bufs_len = new_buf_len;
163 re_string_construct_common (const char *str, int len, re_string_t *pstr,
164 REG_TRANSLATE_TYPE trans, int icase,
167 pstr->raw_mbs = (const unsigned char *) str;
170 pstr->trans = (unsigned REG_TRANSLATE_TYPE) trans;
171 pstr->icase = icase ? 1 : 0;
172 pstr->mbs_allocated = (trans != NULL || icase);
173 pstr->mb_cur_max = dfa->mb_cur_max;
174 pstr->is_utf8 = dfa->is_utf8;
175 pstr->map_notascii = dfa->map_notascii;
176 pstr->stop = pstr->len;
177 pstr->raw_stop = pstr->stop;
180 #ifdef RE_ENABLE_I18N
182 /* Build wide character buffer PSTR->WCS.
183 If the byte sequence of the string are:
184 <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
185 Then wide character buffer will be:
186 <wc1> , WEOF , <wc2> , WEOF , <wc3>
187 We use WEOF for padding, they indicate that the position isn't
188 a first byte of a multibyte character.
190 Note that this function assumes PSTR->VALID_LEN elements are already
191 built and starts from PSTR->VALID_LEN. */
195 build_wcs_buffer (re_string_t *pstr)
198 unsigned char buf[MB_LEN_MAX];
199 assert (MB_LEN_MAX >= pstr->mb_cur_max);
201 unsigned char buf[64];
204 int byte_idx, end_idx, remain_len;
207 /* Build the buffers from pstr->valid_len to either pstr->len or
209 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
210 for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
215 remain_len = end_idx - byte_idx;
216 prev_st = pstr->cur_state;
217 /* Apply the translation if we need. */
218 if (BE (pstr->trans != NULL, 0))
222 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
224 ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
225 buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
227 p = (const char *) buf;
230 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
231 mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
232 if (BE (mbclen == (size_t) -2, 0))
234 /* The buffer doesn't have enough space, finish to build. */
235 pstr->cur_state = prev_st;
238 else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0))
240 /* We treat these cases as a singlebyte character. */
242 wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
243 if (BE (pstr->trans != NULL, 0))
244 wc = pstr->trans[wc];
245 pstr->cur_state = prev_st;
248 /* Write wide character and padding. */
249 pstr->wcs[byte_idx++] = wc;
250 /* Write paddings. */
251 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
252 pstr->wcs[byte_idx++] = WEOF;
254 pstr->valid_len = byte_idx;
255 pstr->valid_raw_len = byte_idx;
258 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
259 but for REG_ICASE. */
263 build_wcs_upper_buffer (re_string_t *pstr)
266 int src_idx, byte_idx, end_idx, remain_len;
269 char buf[MB_LEN_MAX];
270 assert (MB_LEN_MAX >= pstr->mb_cur_max);
275 byte_idx = pstr->valid_len;
276 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
278 /* The following optimization assumes that ASCII characters can be
279 mapped to wide characters with a simple cast. */
280 if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
282 while (byte_idx < end_idx)
286 if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
287 && mbsinit (&pstr->cur_state))
289 /* In case of a singlebyte character. */
291 = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
292 /* The next step uses the assumption that wchar_t is encoded
293 ASCII-safe: all ASCII values can be converted like this. */
294 pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
299 remain_len = end_idx - byte_idx;
300 prev_st = pstr->cur_state;
301 mbclen = mbrtowc (&wc,
302 ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
303 + byte_idx), remain_len, &pstr->cur_state);
304 if (BE (mbclen + 2 > 2, 1))
312 mbcdlen = wcrtomb (buf, wcu, &prev_st);
313 if (BE (mbclen == mbcdlen, 1))
314 memcpy (pstr->mbs + byte_idx, buf, mbclen);
322 memcpy (pstr->mbs + byte_idx,
323 pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
324 pstr->wcs[byte_idx++] = wcu;
325 /* Write paddings. */
326 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
327 pstr->wcs[byte_idx++] = WEOF;
329 else if (mbclen == (size_t) -1 || mbclen == 0)
331 /* It is an invalid character or '\0'. Just use the byte. */
332 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
333 pstr->mbs[byte_idx] = ch;
334 /* And also cast it to wide char. */
335 pstr->wcs[byte_idx++] = (wchar_t) ch;
336 if (BE (mbclen == (size_t) -1, 0))
337 pstr->cur_state = prev_st;
341 /* The buffer doesn't have enough space, finish to build. */
342 pstr->cur_state = prev_st;
346 pstr->valid_len = byte_idx;
347 pstr->valid_raw_len = byte_idx;
351 for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
356 remain_len = end_idx - byte_idx;
357 prev_st = pstr->cur_state;
358 if (BE (pstr->trans != NULL, 0))
362 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
364 ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
365 buf[i] = pstr->trans[ch];
367 p = (const char *) buf;
370 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
371 mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
372 if (BE (mbclen + 2 > 2, 1))
380 mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
381 if (BE (mbclen == mbcdlen, 1))
382 memcpy (pstr->mbs + byte_idx, buf, mbclen);
383 else if (mbcdlen != (size_t) -1)
387 if (byte_idx + mbcdlen > pstr->bufs_len)
389 pstr->cur_state = prev_st;
393 if (pstr->offsets == NULL)
395 pstr->offsets = re_malloc (int, pstr->bufs_len);
397 if (pstr->offsets == NULL)
400 if (!pstr->offsets_needed)
402 for (i = 0; i < (size_t) byte_idx; ++i)
403 pstr->offsets[i] = i;
404 pstr->offsets_needed = 1;
407 memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
408 pstr->wcs[byte_idx] = wcu;
409 pstr->offsets[byte_idx] = src_idx;
410 for (i = 1; i < mbcdlen; ++i)
412 pstr->offsets[byte_idx + i]
413 = src_idx + (i < mbclen ? i : mbclen - 1);
414 pstr->wcs[byte_idx + i] = WEOF;
416 pstr->len += mbcdlen - mbclen;
417 if (pstr->raw_stop > src_idx)
418 pstr->stop += mbcdlen - mbclen;
419 end_idx = (pstr->bufs_len > pstr->len)
420 ? pstr->len : pstr->bufs_len;
426 memcpy (pstr->mbs + byte_idx, p, mbclen);
429 memcpy (pstr->mbs + byte_idx, p, mbclen);
431 if (BE (pstr->offsets_needed != 0, 0))
434 for (i = 0; i < mbclen; ++i)
435 pstr->offsets[byte_idx + i] = src_idx + i;
439 pstr->wcs[byte_idx++] = wcu;
440 /* Write paddings. */
441 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
442 pstr->wcs[byte_idx++] = WEOF;
444 else if (mbclen == (size_t) -1 || mbclen == 0)
446 /* It is an invalid character or '\0'. Just use the byte. */
447 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
449 if (BE (pstr->trans != NULL, 0))
450 ch = pstr->trans [ch];
451 pstr->mbs[byte_idx] = ch;
453 if (BE (pstr->offsets_needed != 0, 0))
454 pstr->offsets[byte_idx] = src_idx;
457 /* And also cast it to wide char. */
458 pstr->wcs[byte_idx++] = (wchar_t) ch;
459 if (BE (mbclen == (size_t) -1, 0))
460 pstr->cur_state = prev_st;
464 /* The buffer doesn't have enough space, finish to build. */
465 pstr->cur_state = prev_st;
469 pstr->valid_len = byte_idx;
470 pstr->valid_raw_len = src_idx;
474 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
479 re_string_skip_chars (re_string_t *pstr, int new_raw_idx, wint_t *last_wc)
486 /* Skip the characters which are not necessary to check. */
487 for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
488 rawbuf_idx < new_raw_idx;)
491 remain_len = pstr->len - rawbuf_idx;
492 prev_st = pstr->cur_state;
493 mbclen = mbrtowc (&wc, (const char *) pstr->raw_mbs + rawbuf_idx,
494 remain_len, &pstr->cur_state);
495 if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
497 /* We treat these cases as a singlebyte character. */
499 pstr->cur_state = prev_st;
501 /* Then proceed the next character. */
502 rawbuf_idx += mbclen;
504 *last_wc = (wint_t) wc;
507 #endif /* RE_ENABLE_I18N */
509 /* Build the buffer PSTR->MBS, and apply the translation if we need.
510 This function is used in case of REG_ICASE. */
514 build_upper_buffer (re_string_t *pstr)
516 int char_idx, end_idx;
517 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
519 for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
521 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
522 if (BE (pstr->trans != NULL, 0))
523 ch = pstr->trans[ch];
525 pstr->mbs[char_idx] = toupper (ch);
527 pstr->mbs[char_idx] = ch;
529 pstr->valid_len = char_idx;
530 pstr->valid_raw_len = char_idx;
533 /* Apply TRANS to the buffer in PSTR. */
537 re_string_translate_buffer (re_string_t *pstr)
539 int buf_idx, end_idx;
540 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
542 for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
544 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
545 pstr->mbs[buf_idx] = pstr->trans[ch];
548 pstr->valid_len = buf_idx;
549 pstr->valid_raw_len = buf_idx;
552 /* This function re-construct the buffers.
553 Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
554 convert to upper case in case of REG_ICASE, apply translation. */
558 re_string_reconstruct (re_string_t *pstr, int idx, int eflags)
560 int offset = idx - pstr->raw_mbs_idx;
561 if (BE (offset < 0, 0))
564 #ifdef RE_ENABLE_I18N
565 if (pstr->mb_cur_max > 1)
566 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
567 #endif /* RE_ENABLE_I18N */
568 pstr->len = pstr->raw_len;
569 pstr->stop = pstr->raw_stop;
571 pstr->raw_mbs_idx = 0;
572 pstr->valid_raw_len = 0;
573 pstr->offsets_needed = 0;
574 pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
575 : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
576 if (!pstr->mbs_allocated)
577 pstr->mbs = (unsigned char *) pstr->raw_mbs;
581 if (BE (offset != 0, 1))
583 /* Are the characters which are already checked remain? */
584 if (BE (offset < pstr->valid_raw_len, 1)
585 #ifdef RE_ENABLE_I18N
586 /* Handling this would enlarge the code too much.
587 Accept a slowdown in that case. */
588 && pstr->offsets_needed == 0
592 /* Yes, move them to the front of the buffer. */
593 pstr->tip_context = re_string_context_at (pstr, offset - 1, eflags);
594 #ifdef RE_ENABLE_I18N
595 if (pstr->mb_cur_max > 1)
596 memmove (pstr->wcs, pstr->wcs + offset,
597 (pstr->valid_len - offset) * sizeof (wint_t));
598 #endif /* RE_ENABLE_I18N */
599 if (BE (pstr->mbs_allocated, 0))
600 memmove (pstr->mbs, pstr->mbs + offset,
601 pstr->valid_len - offset);
602 pstr->valid_len -= offset;
603 pstr->valid_raw_len -= offset;
605 assert (pstr->valid_len > 0);
610 /* No, skip all characters until IDX. */
611 #ifdef RE_ENABLE_I18N
612 if (BE (pstr->offsets_needed, 0))
614 pstr->len = pstr->raw_len - idx + offset;
615 pstr->stop = pstr->raw_stop - idx + offset;
616 pstr->offsets_needed = 0;
620 pstr->valid_raw_len = 0;
621 #ifdef RE_ENABLE_I18N
622 if (pstr->mb_cur_max > 1)
629 const unsigned char *raw, *p, *q, *end;
631 /* Special case UTF-8. Multi-byte chars start with any
632 byte other than 0x80 - 0xbf. */
633 raw = pstr->raw_mbs + pstr->raw_mbs_idx;
634 end = raw + (offset - pstr->mb_cur_max);
635 for (p = raw + offset - 1; p >= end; --p)
636 if ((*p & 0xc0) != 0x80)
640 int mlen = raw + pstr->len - p;
641 unsigned char buf[6];
644 if (BE (pstr->trans != NULL, 0))
646 int i = mlen < 6 ? mlen : 6;
648 buf[i] = pstr->trans[p[i]];
651 /* XXX Don't use mbrtowc, we know which conversion
652 to use (UTF-8 -> UCS4). */
653 memset (&cur_state, 0, sizeof (cur_state));
654 mlen = (mbrtowc (&wc2, (const char *) p, mlen,
656 - (raw + offset - p));
659 memset (&pstr->cur_state, '\0',
661 pstr->valid_len = mlen;
669 pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
670 if (BE (pstr->valid_len, 0))
672 for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
673 pstr->wcs[wcs_idx] = WEOF;
674 if (pstr->mbs_allocated)
675 memset (pstr->mbs, 255, pstr->valid_len);
677 pstr->valid_raw_len = pstr->valid_len;
678 pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
679 && IS_WIDE_WORD_CHAR (wc))
681 : ((IS_WIDE_NEWLINE (wc)
682 && pstr->newline_anchor)
683 ? CONTEXT_NEWLINE : 0));
686 #endif /* RE_ENABLE_I18N */
688 int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
691 pstr->tip_context = (bitset_contain (pstr->word_char, c)
693 : ((IS_NEWLINE (c) && pstr->newline_anchor)
694 ? CONTEXT_NEWLINE : 0));
697 if (!BE (pstr->mbs_allocated, 0))
700 pstr->raw_mbs_idx = idx;
702 pstr->stop -= offset;
704 /* Then build the buffers. */
705 #ifdef RE_ENABLE_I18N
706 if (pstr->mb_cur_max > 1)
710 reg_errcode_t ret = build_wcs_upper_buffer (pstr);
711 if (BE (ret != REG_NOERROR, 0))
715 build_wcs_buffer (pstr);
718 #endif /* RE_ENABLE_I18N */
719 if (BE (pstr->mbs_allocated, 0))
722 build_upper_buffer (pstr);
723 else if (pstr->trans != NULL)
724 re_string_translate_buffer (pstr);
727 pstr->valid_len = pstr->len;
734 internal_function __attribute ((pure))
735 re_string_peek_byte_case (const re_string_t *pstr, int idx)
739 /* Handle the common (easiest) cases first. */
740 if (BE (!pstr->mbs_allocated, 1))
741 return re_string_peek_byte (pstr, idx);
743 #ifdef RE_ENABLE_I18N
744 if (pstr->mb_cur_max > 1
745 && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
746 return re_string_peek_byte (pstr, idx);
749 off = pstr->cur_idx + idx;
750 #ifdef RE_ENABLE_I18N
751 if (pstr->offsets_needed)
752 off = pstr->offsets[off];
755 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
757 #ifdef RE_ENABLE_I18N
758 /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
759 this function returns CAPITAL LETTER I instead of first byte of
760 DOTLESS SMALL LETTER I. The latter would confuse the parser,
761 since peek_byte_case doesn't advance cur_idx in any way. */
762 if (pstr->offsets_needed && !isascii (ch))
763 return re_string_peek_byte (pstr, idx);
770 internal_function __attribute ((pure))
771 re_string_fetch_byte_case (re_string_t *pstr)
773 if (BE (!pstr->mbs_allocated, 1))
774 return re_string_fetch_byte (pstr);
776 #ifdef RE_ENABLE_I18N
777 if (pstr->offsets_needed)
781 /* For tr_TR.UTF-8 [[:islower:]] there is
782 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip
783 in that case the whole multi-byte character and return
784 the original letter. On the other side, with
785 [[: DOTLESS SMALL LETTER I return [[:I, as doing
786 anything else would complicate things too much. */
788 if (!re_string_first_byte (pstr, pstr->cur_idx))
789 return re_string_fetch_byte (pstr);
791 off = pstr->offsets[pstr->cur_idx];
792 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
795 return re_string_fetch_byte (pstr);
797 re_string_skip_bytes (pstr,
798 re_string_char_size_at (pstr, pstr->cur_idx));
803 return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
808 re_string_destruct (re_string_t *pstr)
810 #ifdef RE_ENABLE_I18N
812 re_free (pstr->offsets);
813 #endif /* RE_ENABLE_I18N */
814 if (pstr->mbs_allocated)
818 /* Return the context at IDX in INPUT. */
822 re_string_context_at (const re_string_t *input, int idx, int eflags)
826 /* In this case, we use the value stored in input->tip_context,
827 since we can't know the character in input->mbs[-1] here. */
828 return input->tip_context;
829 if (BE (idx == input->len, 0))
830 return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
831 : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
832 #ifdef RE_ENABLE_I18N
833 if (input->mb_cur_max > 1)
837 while(input->wcs[wc_idx] == WEOF)
840 /* It must not happen. */
841 assert (wc_idx >= 0);
845 return input->tip_context;
847 wc = input->wcs[wc_idx];
848 if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
850 return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
851 ? CONTEXT_NEWLINE : 0);
856 c = re_string_byte_at (input, idx);
857 if (bitset_contain (input->word_char, c))
859 return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
863 /* Functions for set operation. */
867 re_node_set_alloc (re_node_set *set, int size)
871 set->elems = re_malloc (int, size);
872 if (BE (set->elems == NULL, 0))
879 re_node_set_init_1 (re_node_set *set, int elem)
883 set->elems = re_malloc (int, 1);
884 if (BE (set->elems == NULL, 0))
886 set->alloc = set->nelem = 0;
889 set->elems[0] = elem;
895 re_node_set_init_2 (re_node_set *set, int elem1, int elem2)
898 set->elems = re_malloc (int, 2);
899 if (BE (set->elems == NULL, 0))
904 set->elems[0] = elem1;
911 set->elems[0] = elem1;
912 set->elems[1] = elem2;
916 set->elems[0] = elem2;
917 set->elems[1] = elem1;
925 re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
927 dest->nelem = src->nelem;
930 dest->alloc = dest->nelem;
931 dest->elems = re_malloc (int, dest->alloc);
932 if (BE (dest->elems == NULL, 0))
934 dest->alloc = dest->nelem = 0;
937 memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
940 re_node_set_init_empty (dest);
944 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
945 DEST. Return value indicate the error code or REG_NOERROR if succeeded.
946 Note: We assume dest->elems is NULL, when dest->alloc is 0. */
950 re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
951 const re_node_set *src2)
953 int i1, i2, is, id, delta, sbase;
954 if (src1->nelem == 0 || src2->nelem == 0)
957 /* We need dest->nelem + 2 * elems_in_intersection; this is a
958 conservative estimate. */
959 if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
961 int new_alloc = src1->nelem + src2->nelem + dest->alloc;
962 int *new_elems = re_realloc (dest->elems, int, new_alloc);
963 if (BE (new_elems == NULL, 0))
965 dest->elems = new_elems;
966 dest->alloc = new_alloc;
969 /* Find the items in the intersection of SRC1 and SRC2, and copy
970 into the top of DEST those that are not already in DEST itself. */
971 sbase = dest->nelem + src1->nelem + src2->nelem;
972 i1 = src1->nelem - 1;
973 i2 = src2->nelem - 1;
974 id = dest->nelem - 1;
977 if (src1->elems[i1] == src2->elems[i2])
979 /* Try to find the item in DEST. Maybe we could binary search? */
980 while (id >= 0 && dest->elems[id] > src1->elems[i1])
983 if (id < 0 || dest->elems[id] != src1->elems[i1])
984 dest->elems[--sbase] = src1->elems[i1];
986 if (--i1 < 0 || --i2 < 0)
990 /* Lower the highest of the two items. */
991 else if (src1->elems[i1] < src2->elems[i2])
1003 id = dest->nelem - 1;
1004 is = dest->nelem + src1->nelem + src2->nelem - 1;
1005 delta = is - sbase + 1;
1007 /* Now copy. When DELTA becomes zero, the remaining
1008 DEST elements are already in place; this is more or
1009 less the same loop that is in re_node_set_merge. */
1010 dest->nelem += delta;
1011 if (delta > 0 && id >= 0)
1014 if (dest->elems[is] > dest->elems[id])
1016 /* Copy from the top. */
1017 dest->elems[id + delta--] = dest->elems[is--];
1023 /* Slide from the bottom. */
1024 dest->elems[id + delta] = dest->elems[id];
1030 /* Copy remaining SRC elements. */
1031 memcpy (dest->elems, dest->elems + sbase, delta * sizeof (int));
1036 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
1037 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1039 static reg_errcode_t
1041 re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1042 const re_node_set *src2)
1045 if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1047 dest->alloc = src1->nelem + src2->nelem;
1048 dest->elems = re_malloc (int, dest->alloc);
1049 if (BE (dest->elems == NULL, 0))
1054 if (src1 != NULL && src1->nelem > 0)
1055 return re_node_set_init_copy (dest, src1);
1056 else if (src2 != NULL && src2->nelem > 0)
1057 return re_node_set_init_copy (dest, src2);
1059 re_node_set_init_empty (dest);
1062 for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1064 if (src1->elems[i1] > src2->elems[i2])
1066 dest->elems[id++] = src2->elems[i2++];
1069 if (src1->elems[i1] == src2->elems[i2])
1071 dest->elems[id++] = src1->elems[i1++];
1073 if (i1 < src1->nelem)
1075 memcpy (dest->elems + id, src1->elems + i1,
1076 (src1->nelem - i1) * sizeof (int));
1077 id += src1->nelem - i1;
1079 else if (i2 < src2->nelem)
1081 memcpy (dest->elems + id, src2->elems + i2,
1082 (src2->nelem - i2) * sizeof (int));
1083 id += src2->nelem - i2;
1089 /* Calculate the union set of the sets DEST and SRC. And store it to
1090 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1092 static reg_errcode_t
1094 re_node_set_merge (re_node_set *dest, const re_node_set *src)
1096 int is, id, sbase, delta;
1097 if (src == NULL || src->nelem == 0)
1099 if (dest->alloc < 2 * src->nelem + dest->nelem)
1101 int new_alloc = 2 * (src->nelem + dest->alloc);
1102 int *new_buffer = re_realloc (dest->elems, int, new_alloc);
1103 if (BE (new_buffer == NULL, 0))
1105 dest->elems = new_buffer;
1106 dest->alloc = new_alloc;
1109 if (BE (dest->nelem == 0, 0))
1111 dest->nelem = src->nelem;
1112 memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
1116 /* Copy into the top of DEST the items of SRC that are not
1117 found in DEST. Maybe we could binary search in DEST? */
1118 for (sbase = dest->nelem + 2 * src->nelem,
1119 is = src->nelem - 1, id = dest->nelem - 1; is >= 0 && id >= 0; )
1121 if (dest->elems[id] == src->elems[is])
1123 else if (dest->elems[id] < src->elems[is])
1124 dest->elems[--sbase] = src->elems[is--];
1125 else /* if (dest->elems[id] > src->elems[is]) */
1131 /* If DEST is exhausted, the remaining items of SRC must be unique. */
1133 memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (int));
1136 id = dest->nelem - 1;
1137 is = dest->nelem + 2 * src->nelem - 1;
1138 delta = is - sbase + 1;
1142 /* Now copy. When DELTA becomes zero, the remaining
1143 DEST elements are already in place. */
1144 dest->nelem += delta;
1147 if (dest->elems[is] > dest->elems[id])
1149 /* Copy from the top. */
1150 dest->elems[id + delta--] = dest->elems[is--];
1156 /* Slide from the bottom. */
1157 dest->elems[id + delta] = dest->elems[id];
1160 /* Copy remaining SRC elements. */
1161 memcpy (dest->elems, dest->elems + sbase,
1162 delta * sizeof (int));
1171 /* Insert the new element ELEM to the re_node_set* SET.
1172 SET should not already have ELEM.
1173 return -1 if an error is occured, return 1 otherwise. */
1177 re_node_set_insert (re_node_set *set, int elem)
1180 /* In case the set is empty. */
1181 if (set->alloc == 0)
1183 if (BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1))
1189 if (BE (set->nelem, 0) == 0)
1191 /* We already guaranteed above that set->alloc != 0. */
1192 set->elems[0] = elem;
1197 /* Realloc if we need. */
1198 if (set->alloc == set->nelem)
1201 set->alloc = set->alloc * 2;
1202 new_elems = re_realloc (set->elems, int, set->alloc);
1203 if (BE (new_elems == NULL, 0))
1205 set->elems = new_elems;
1208 /* Move the elements which follows the new element. Test the
1209 first element separately to skip a check in the inner loop. */
1210 if (elem < set->elems[0])
1213 for (idx = set->nelem; idx > 0; idx--)
1214 set->elems[idx] = set->elems[idx - 1];
1218 for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1219 set->elems[idx] = set->elems[idx - 1];
1222 /* Insert the new element. */
1223 set->elems[idx] = elem;
1228 /* Insert the new element ELEM to the re_node_set* SET.
1229 SET should not already have any element greater than or equal to ELEM.
1230 Return -1 if an error is occured, return 1 otherwise. */
1234 re_node_set_insert_last (re_node_set *set, int elem)
1236 /* Realloc if we need. */
1237 if (set->alloc == set->nelem)
1240 set->alloc = (set->alloc + 1) * 2;
1241 new_elems = re_realloc (set->elems, int, set->alloc);
1242 if (BE (new_elems == NULL, 0))
1244 set->elems = new_elems;
1247 /* Insert the new element. */
1248 set->elems[set->nelem++] = elem;
1252 /* Compare two node sets SET1 and SET2.
1253 return 1 if SET1 and SET2 are equivalent, return 0 otherwise. */
1256 internal_function __attribute ((pure))
1257 re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
1260 if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1262 for (i = set1->nelem ; --i >= 0 ; )
1263 if (set1->elems[i] != set2->elems[i])
1268 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */
1271 internal_function __attribute ((pure))
1272 re_node_set_contains (const re_node_set *set, int elem)
1274 unsigned int idx, right, mid;
1275 if (set->nelem <= 0)
1278 /* Binary search the element. */
1280 right = set->nelem - 1;
1283 mid = (idx + right) / 2;
1284 if (set->elems[mid] < elem)
1289 return set->elems[idx] == elem ? idx + 1 : 0;
1294 re_node_set_remove_at (re_node_set *set, int idx)
1296 if (idx < 0 || idx >= set->nelem)
1299 for (; idx < set->nelem; idx++)
1300 set->elems[idx] = set->elems[idx + 1];
1304 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
1305 Or return -1, if an error will be occured. */
1309 re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1311 int type = token.type;
1312 if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
1314 int new_nodes_alloc = dfa->nodes_alloc * 2;
1315 int *new_nexts, *new_indices;
1316 re_node_set *new_edests, *new_eclosures;
1318 re_token_t *new_nodes = re_realloc (dfa->nodes, re_token_t,
1320 if (BE (new_nodes == NULL, 0))
1322 dfa->nodes = new_nodes;
1323 new_nexts = re_realloc (dfa->nexts, int, new_nodes_alloc);
1324 new_indices = re_realloc (dfa->org_indices, int, new_nodes_alloc);
1325 new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1326 new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1327 if (BE (new_nexts == NULL || new_indices == NULL
1328 || new_edests == NULL || new_eclosures == NULL, 0))
1330 dfa->nexts = new_nexts;
1331 dfa->org_indices = new_indices;
1332 dfa->edests = new_edests;
1333 dfa->eclosures = new_eclosures;
1334 dfa->nodes_alloc = new_nodes_alloc;
1336 dfa->nodes[dfa->nodes_len] = token;
1337 dfa->nodes[dfa->nodes_len].constraint = 0;
1338 #ifdef RE_ENABLE_I18N
1339 dfa->nodes[dfa->nodes_len].accept_mb =
1340 (type == OP_PERIOD && dfa->mb_cur_max > 1) || type == COMPLEX_BRACKET;
1342 dfa->nexts[dfa->nodes_len] = -1;
1343 re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1344 re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1345 return dfa->nodes_len++;
1348 static inline unsigned int
1350 calc_state_hash (const re_node_set *nodes, unsigned int context)
1352 unsigned int hash = nodes->nelem + context;
1354 for (i = 0 ; i < nodes->nelem ; i++)
1355 hash += nodes->elems[i];
1359 /* Search for the state whose node_set is equivalent to NODES.
1360 Return the pointer to the state, if we found it in the DFA.
1361 Otherwise create the new one and return it. In case of an error
1362 return NULL and set the error code in ERR.
1363 Note: - We assume NULL as the invalid state, then it is possible that
1364 return value is NULL and ERR is REG_NOERROR.
1365 - We never return non-NULL value in case of any errors, it is for
1368 static re_dfastate_t*
1370 re_acquire_state (reg_errcode_t *err, re_dfa_t *dfa, const re_node_set *nodes)
1373 re_dfastate_t *new_state;
1374 struct re_state_table_entry *spot;
1377 /* Suppress bogus uninitialized-variable warnings. */
1380 if (BE (nodes->nelem == 0, 0))
1385 hash = calc_state_hash (nodes, 0);
1386 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1388 for (i = 0 ; i < spot->num ; i++)
1390 re_dfastate_t *state = spot->array[i];
1391 if (hash != state->hash)
1393 if (re_node_set_compare (&state->nodes, nodes))
1397 /* There are no appropriate state in the dfa, create the new one. */
1398 new_state = create_ci_newstate (dfa, nodes, hash);
1399 if (BE (new_state != NULL, 1))
1408 /* Search for the state whose node_set is equivalent to NODES and
1409 whose context is equivalent to CONTEXT.
1410 Return the pointer to the state, if we found it in the DFA.
1411 Otherwise create the new one and return it. In case of an error
1412 return NULL and set the error code in ERR.
1413 Note: - We assume NULL as the invalid state, then it is possible that
1414 return value is NULL and ERR is REG_NOERROR.
1415 - We never return non-NULL value in case of any errors, it is for
1418 static re_dfastate_t*
1420 re_acquire_state_context (reg_errcode_t *err, re_dfa_t *dfa,
1421 const re_node_set *nodes, unsigned int context)
1424 re_dfastate_t *new_state;
1425 struct re_state_table_entry *spot;
1428 /* Suppress bogus uninitialized-variable warnings. */
1431 if (nodes->nelem == 0)
1436 hash = calc_state_hash (nodes, context);
1437 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1439 for (i = 0 ; i < spot->num ; i++)
1441 re_dfastate_t *state = spot->array[i];
1442 if (state->hash == hash
1443 && state->context == context
1444 && re_node_set_compare (state->entrance_nodes, nodes))
1447 /* There are no appropriate state in `dfa', create the new one. */
1448 new_state = create_cd_newstate (dfa, nodes, context, hash);
1449 if (BE (new_state != NULL, 1))
1458 /* Finish initialization of the new state NEWSTATE, and using its hash value
1459 HASH put in the appropriate bucket of DFA's state table. Return value
1460 indicates the error code if failed. */
1462 static reg_errcode_t
1464 register_state (re_dfa_t *dfa, re_dfastate_t *newstate, unsigned int hash)
1466 struct re_state_table_entry *spot;
1470 newstate->hash = hash;
1471 err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1472 if (BE (err != REG_NOERROR, 0))
1474 for (i = 0; i < newstate->nodes.nelem; i++)
1476 int elem = newstate->nodes.elems[i];
1477 if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1478 re_node_set_insert_last (&newstate->non_eps_nodes, elem);
1481 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1482 if (BE (spot->alloc <= spot->num, 0))
1484 int new_alloc = 2 * spot->num + 2;
1485 re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1487 if (BE (new_array == NULL, 0))
1489 spot->array = new_array;
1490 spot->alloc = new_alloc;
1492 spot->array[spot->num++] = newstate;
1496 /* Create the new state which is independ of contexts.
1497 Return the new state if succeeded, otherwise return NULL. */
1499 static re_dfastate_t *
1501 create_ci_newstate (re_dfa_t *dfa, const re_node_set *nodes, unsigned int hash)
1505 re_dfastate_t *newstate;
1507 newstate = re_calloc (re_dfastate_t, 1);
1508 if (BE (newstate == NULL, 0))
1510 err = re_node_set_init_copy (&newstate->nodes, nodes);
1511 if (BE (err != REG_NOERROR, 0))
1517 newstate->entrance_nodes = &newstate->nodes;
1518 for (i = 0 ; i < nodes->nelem ; i++)
1520 re_token_t *node = dfa->nodes + nodes->elems[i];
1521 re_token_type_t type = node->type;
1522 if (type == CHARACTER && !node->constraint)
1524 #ifdef RE_ENABLE_I18N
1525 newstate->accept_mb |= node->accept_mb;
1526 #endif /* RE_ENABLE_I18N */
1528 /* If the state has the halt node, the state is a halt state. */
1529 if (type == END_OF_RE)
1531 else if (type == OP_BACK_REF)
1532 newstate->has_backref = 1;
1533 else if (type == ANCHOR || node->constraint)
1534 newstate->has_constraint = 1;
1536 err = register_state (dfa, newstate, hash);
1537 if (BE (err != REG_NOERROR, 0))
1539 free_state (newstate);
1545 /* Create the new state which is depend on the context CONTEXT.
1546 Return the new state if succeeded, otherwise return NULL. */
1548 static re_dfastate_t *
1550 create_cd_newstate (re_dfa_t *dfa, const re_node_set *nodes,
1551 unsigned int context, unsigned int hash)
1553 int i, nctx_nodes = 0;
1555 re_dfastate_t *newstate;
1557 newstate = re_calloc (re_dfastate_t, 1);
1558 if (BE (newstate == NULL, 0))
1560 err = re_node_set_init_copy (&newstate->nodes, nodes);
1561 if (BE (err != REG_NOERROR, 0))
1567 newstate->context = context;
1568 newstate->entrance_nodes = &newstate->nodes;
1570 for (i = 0 ; i < nodes->nelem ; i++)
1572 unsigned int constraint = 0;
1573 re_token_t *node = dfa->nodes + nodes->elems[i];
1574 re_token_type_t type = node->type;
1575 if (node->constraint)
1576 constraint = node->constraint;
1578 if (type == CHARACTER && !constraint)
1580 #ifdef RE_ENABLE_I18N
1581 newstate->accept_mb |= node->accept_mb;
1582 #endif /* RE_ENABLE_I18N */
1584 /* If the state has the halt node, the state is a halt state. */
1585 if (type == END_OF_RE)
1587 else if (type == OP_BACK_REF)
1588 newstate->has_backref = 1;
1589 else if (type == ANCHOR)
1590 constraint = node->opr.ctx_type;
1594 if (newstate->entrance_nodes == &newstate->nodes)
1596 newstate->entrance_nodes = re_malloc (re_node_set, 1);
1597 if (BE (newstate->entrance_nodes == NULL, 0))
1599 free_state (newstate);
1602 re_node_set_init_copy (newstate->entrance_nodes, nodes);
1604 newstate->has_constraint = 1;
1607 if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1609 re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1614 err = register_state (dfa, newstate, hash);
1615 if (BE (err != REG_NOERROR, 0))
1617 free_state (newstate);
1625 free_state (re_dfastate_t *state)
1627 re_node_set_free (&state->non_eps_nodes);
1628 re_node_set_free (&state->inveclosure);
1629 if (state->entrance_nodes != &state->nodes)
1631 re_node_set_free (state->entrance_nodes);
1632 re_free (state->entrance_nodes);
1634 re_node_set_free (&state->nodes);
1635 re_free (state->word_trtable);
1636 re_free (state->trtable);