(REGEX_FREE_STACK, RESET_FAIL_STACK): Make them usable as an expression.
authorStefan Monnier <monnier@iro.umontreal.ca>
Mon, 27 Mar 2000 22:26:37 +0000 (22:26 +0000)
committerStefan Monnier <monnier@iro.umontreal.ca>
Mon, 27 Mar 2000 22:26:37 +0000 (22:26 +0000)
(enum re_opcode_t): Update description of succeed_n.
(PATFETCH): Always define.
(regex_compile): Use lookahead rather than PATUNFETCH (for repetition
operators, char classes, shy-groups and intervals).
Optimize special cases of intervals so as to only use succeed_n and
jump_n when really needed.
(re_compile_fastmap): Simplify handling of jump_n and succeed_n now
that we don't have to handle the special cases any more.
Simplify on_failure_jump handling as well.

regex.c

diff --git a/regex.c b/regex.c
index 671f6c152f32ab77961b2944015c7793d7ade57d..f7ca3a7b8b4dbfdf750f82e6c42dae46178c6ddd 100644 (file)
--- a/regex.c
+++ b/regex.c
@@ -21,7 +21,6 @@
 
 /* TODO:
    - use analyze_first to optimize non-empty loops
-   - optimize succeed_n and jump_n away when possible
    - clean up multibyte issues
    - structure the opcode space into opcode+flag.
    - merge with glic's regex.[ch]
@@ -411,7 +410,7 @@ char *alloca ();
 #define REGEX_REALLOCATE_STACK(source, osize, nsize)                   \
    REGEX_REALLOCATE (source, osize, nsize)
 /* No need to explicitly free anything.         */
-#define REGEX_FREE_STACK(arg)
+#define REGEX_FREE_STACK(arg) ((void)0)
 
 #endif /* not REGEX_MALLOC */
 #endif /* not using relocating allocator */
@@ -546,7 +545,9 @@ typedef enum
   on_failure_jump_smart,
 
        /* Followed by two-byte relative address and two-byte number n.
-          After matching N times, jump to the address upon failure.  */
+          After matching N times, jump to the address upon failure.
+          Does not work if N starts at 0: use on_failure_jump_loop
+          instead.  */
   succeed_n,
 
        /* Followed by two-byte relative address, and two-byte number n.
@@ -1289,7 +1290,7 @@ typedef struct
     fail_stack.frame = 0;                                              \
   } while (0)
 
-#define RESET_FAIL_STACK()
+#define RESET_FAIL_STACK() ((void)0)
 #endif
 
 
@@ -1546,13 +1547,11 @@ static unsigned char *skip_one_char _RE_ARGS((unsigned char *p));
    if necessary.  Also cast from a signed character in the constant
    string passed to us by the user to an unsigned char that we can use
    as an array index (in, e.g., `translate').  */
-#ifndef PATFETCH
 #define PATFETCH(c)                                                    \
   do {                                                                 \
     PATFETCH_RAW (c);                                                  \
     if (RE_TRANSLATE_P (translate)) c = RE_TRANSLATE (translate, c);   \
   } while (0)
-#endif
 
 /* Fetch the next character in the uncompiled pattern, with no
    translation.         */
@@ -2103,35 +2102,24 @@ regex_compile (pattern, size, syntax, bufp)
 
                if (p == pend)
                  break;
-
-               PATFETCH (c);
-
-               if (c == '*'
-                   || (!(syntax & RE_BK_PLUS_QM) && (c == '+' || c == '?')))
+               else if (*p == '*'
+                        || (!(syntax & RE_BK_PLUS_QM)
+                            && (*p == '+' || *p == '?')))
                  ;
-
-               else if (syntax & RE_BK_PLUS_QM  &&  c == '\\')
+               else if (syntax & RE_BK_PLUS_QM  && *p == '\\')
                  {
-                   if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
-
-                   PATFETCH (c1);
-                   if (!(c1 == '+' || c1 == '?'))
-                     {
-                       PATUNFETCH;
-                       PATUNFETCH;
-                       break;
-                     }
-
-                   c = c1;
+                   if (p+1 == pend)
+                     FREE_STACK_RETURN (REG_EESCAPE);
+                   if (p[1] == '+' || p[1] == '?')
+                     PATFETCH (c); /* Gobble up the backslash.  */
+                   else
+                     break;
                  }
                else
-                 {
-                   PATUNFETCH;
-                   break;
-                 }
-
+                 break;
                /* If we get here, we found another repeat character.  */
-             }
+               PATFETCH (c);
+              }
 
            /* Star, etc. applied to an empty pattern is equivalent
               to an empty pattern.  */
@@ -2306,9 +2294,11 @@ regex_compile (pattern, size, syntax, bufp)
                  {
                    /* Leave room for the null.  */
                    char str[CHAR_CLASS_MAX_LENGTH + 1];
+                   const unsigned char *class_beg;
 
                    PATFETCH (c);
                    c1 = 0;
+                   class_beg = p;
 
                    /* If pattern is `[[:'.  */
                    if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
@@ -2421,9 +2411,8 @@ regex_compile (pattern, size, syntax, bufp)
                      }
                    else
                      {
-                       c1++;
-                       while (c1--)
-                         PATUNFETCH;
+                       /* Go back to right after the "[:".  */
+                       p = class_beg;
                        SET_LIST_BIT ('[');
 
                        /* Because the `:' may starts the range, we
@@ -2584,9 +2573,9 @@ regex_compile (pattern, size, syntax, bufp)
                if (p+1 < pend)
                  {
                    /* Look for a special (?...) construct */
-                   PATFETCH (c);
-                   if ((syntax & RE_SHY_GROUPS) && c == '?')
+                   if ((syntax & RE_SHY_GROUPS) && *p == '?')
                      {
+                       PATFETCH (c); /* Gobble up the '?'.  */
                        PATFETCH (c);
                        switch (c)
                          {
@@ -2596,7 +2585,6 @@ regex_compile (pattern, size, syntax, bufp)
                            FREE_STACK_RETURN (REG_BADPAT);
                          }
                      }
-                   else PATUNFETCH;
                  }
 
                if (!shy)
@@ -2744,8 +2732,9 @@ regex_compile (pattern, size, syntax, bufp)
              if (!(syntax & RE_INTERVALS)
                     /* If we're at `\{' and it's not the open-interval
                        operator.  */
-                 || ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
-                 || (p - 2 == pattern  &&  p == pend))
+                 || (syntax & RE_NO_BK_BRACES)
+                 /* What is that? -sm  */
+                 /* || (p - 2 == pattern && p == pend) */)
                goto normal_backslash;
 
            handle_interval:
@@ -2755,7 +2744,7 @@ regex_compile (pattern, size, syntax, bufp)
                /* At least (most) this many matches must be made.  */
                int lower_bound = 0, upper_bound = -1;
 
-               beg_interval = p - 1;
+               beg_interval = p;
 
                if (p == pend)
                  {
@@ -2768,16 +2757,13 @@ regex_compile (pattern, size, syntax, bufp)
                GET_UNSIGNED_NUMBER (lower_bound);
 
                if (c == ',')
-                 {
-                   GET_UNSIGNED_NUMBER (upper_bound);
-                   if (upper_bound < 0) upper_bound = RE_DUP_MAX;
-                 }
+                 GET_UNSIGNED_NUMBER (upper_bound);
                else
                  /* Interval such as `{1}' => match exactly once. */
                  upper_bound = lower_bound;
 
                if (lower_bound < 0 || upper_bound > RE_DUP_MAX
-                   || lower_bound > upper_bound)
+                   || (upper_bound >= 0 && lower_bound > upper_bound))
                  {
                    if (syntax & RE_NO_BK_BRACES)
                      goto unfetch_interval;
@@ -2813,15 +2799,13 @@ regex_compile (pattern, size, syntax, bufp)
                      goto unfetch_interval;
                  }
 
-               /* If the upper bound is zero, don't want to succeed at
-                  all; jump from `laststart' to `b + 3', which will be
-                  the end of the buffer after we insert the jump.  */
                 if (upper_bound == 0)
-                  {
-                    GET_BUFFER_SPACE (3);
-                    INSERT_JUMP (jump, laststart, b + 3);
-                    b += 3;
-                  }
+                  /* If the upper bound is zero, just drop the sub pattern
+                     altogether.  */
+                  b = laststart;
+                else if (lower_bound == 1 && upper_bound == 1)
+                  /* Just match it once: nothing to do here.  */
+                  ;
 
                 /* Otherwise, we have a nontrivial interval.  When
                    we're all done, the pattern will look like:
@@ -2835,28 +2819,49 @@ regex_compile (pattern, size, syntax, bufp)
                 else
                   { /* If the upper bound is > 1, we need to insert
                        more at the end of the loop.  */
-                    unsigned nbytes = 10 + (upper_bound > 1) * 10;
-
-                    GET_BUFFER_SPACE (nbytes);
-
-                    /* Initialize lower bound of the `succeed_n', even
-                       though it will be set during matching by its
-                       attendant `set_number_at' (inserted next),
-                       because `re_compile_fastmap' needs to know.
-                       Jump to the `jump_n' we might insert below.  */
-                    INSERT_JUMP2 (succeed_n, laststart,
-                                  b + 5 + (upper_bound > 1) * 5,
-                                  lower_bound);
-                    b += 5;
-
-                    /* Code to initialize the lower bound.  Insert
-                       before the `succeed_n'.  The `5' is the last two
-                       bytes of this `set_number_at', plus 3 bytes of
-                       the following `succeed_n'.  */
-                    insert_op2 (set_number_at, laststart, 5, lower_bound, b);
-                    b += 5;
-
-                    if (upper_bound > 1)
+                    unsigned int nbytes = (upper_bound < 0 ? 3
+                                           : upper_bound > 1 ? 5 : 0);
+                    unsigned int startoffset = 0;
+
+                    GET_BUFFER_SPACE (20); /* We might use less.  */
+
+                    if (lower_bound == 0)
+                      {
+                        /* A succeed_n that starts with 0 is really a
+                           a simple on_failure_jump_loop.  */
+                        INSERT_JUMP (on_failure_jump_loop, laststart,
+                                     b + 3 + nbytes);
+                        b += 3;
+                      }
+                    else
+                      {
+                        /* Initialize lower bound of the `succeed_n', even
+                           though it will be set during matching by its
+                           attendant `set_number_at' (inserted next),
+                           because `re_compile_fastmap' needs to know.
+                           Jump to the `jump_n' we might insert below.  */
+                        INSERT_JUMP2 (succeed_n, laststart,
+                                      b + 5 + nbytes,
+                                      lower_bound);
+                        b += 5;
+
+                        /* Code to initialize the lower bound.  Insert
+                           before the `succeed_n'.      The `5' is the last two
+                           bytes of this `set_number_at', plus 3 bytes of
+                           the following `succeed_n'.  */
+                        insert_op2 (set_number_at, laststart, 5, lower_bound, b);
+                        b += 5;
+                        startoffset += 5;
+                      }
+
+                    if (upper_bound < 0)
+                      {
+                        /* A negative upper bound stands for infinity,
+                           in which case it degenerates to a plain jump.  */
+                        STORE_JUMP (jump, b, laststart + startoffset);
+                        b += 3;
+                      }
+                    else if (upper_bound > 1)
                       { /* More than one repetition is allowed, so
                            append a backward jump to the `succeed_n'
                            that starts this interval.
@@ -2864,7 +2869,7 @@ regex_compile (pattern, size, syntax, bufp)
                            When we've reached this during matching,
                            we'll have matched the interval once, so
                            jump back only `upper_bound - 1' times.  */
-                        STORE_JUMP2 (jump_n, b, laststart + 5,
+                        STORE_JUMP2 (jump_n, b, laststart + startoffset,
                                      upper_bound - 1);
                         b += 5;
 
@@ -2899,14 +2904,15 @@ regex_compile (pattern, size, syntax, bufp)
               beg_interval = NULL;
 
               /* normal_char and normal_backslash need `c'.  */
-              PATFETCH (c);
+              c = '{';
 
               if (!(syntax & RE_NO_BK_BRACES))
                 {
-                  if (p > pattern  &&  p[-1] == '\\')
-                    goto normal_backslash;
+                  assert (p > pattern && p[-1] == '\\');
+                  goto normal_backslash;
                 }
-              goto normal_char;
+              else
+                goto normal_char;
 
 #ifdef emacs
            /* There is no way to specify the before_dot and after_dot
@@ -3311,9 +3317,6 @@ re_compile_fastmap (bufp)
      match the empty string.  */
   boolean path_can_be_null = true;
 
-  /* We aren't doing a `succeed_n' to begin with.  */
-  boolean succeed_n_p = false;
-
   /* If all elements for base leading-codes in fastmap is set, this
      flag is set true. */
   boolean match_any_multibyte_characters = false;
@@ -3353,7 +3356,7 @@ re_compile_fastmap (bufp)
         as used for the *? operator.  */
       unsigned char *p1 = p;
 
-      if (p == pend || *p == succeed)
+      if (p >= pend || *p == succeed)
        {
          /* We have reached the (effective) end of pattern.  */
          if (!PATTERN_STACK_EMPTY ())
@@ -3510,7 +3513,6 @@ re_compile_fastmap (bufp)
          continue;
 
 
-       case jump_n:
        case jump:
          EXTRACT_NUMBER_AND_INCR (j, p);
          if (j < 0)
@@ -3539,51 +3541,30 @@ re_compile_fastmap (bufp)
        case on_failure_jump_nastyloop:
        case on_failure_jump_loop:
        case on_failure_jump_smart:
-       handle_on_failure_jump:
          EXTRACT_NUMBER_AND_INCR (j, p);
-
-         /* For some patterns, e.g., `(a?)?', `p+j' here points to the
-            end of the pattern.  We don't want to push such a point,
-            since when we restore it above, entering the switch will
-            increment `p' past the end of the pattern.  We don't need
-            to push such a point since we obviously won't find any more
-            fastmap entries beyond `pend'.  Such a pattern can match
-            the null string, though.  */
          if (p + j <= p1)
-           /* Backward jump to be ignored.  */
-           ;
-         else if (p + j < pend)
-           {
-             if (!PUSH_PATTERN_OP (p + j, fail_stack))
-               {
-                 RESET_FAIL_STACK ();
-                 return -2;
-               }
-           }
-         else
-           bufp->can_be_null = 1;
-
-         if (succeed_n_p)
-           {
-             EXTRACT_NUMBER_AND_INCR (k, p);   /* Skip the n.  */
-             succeed_n_p = false;
-           }
-
+           ; /* Backward jump to be ignored.  */
+         else if (!PUSH_PATTERN_OP (p + j, fail_stack))
+           return (RESET_FAIL_STACK (), -2);
          continue;
 
 
+       case jump_n:
+         /* This code simply does not properly handle forward jump_n.  */
+         DEBUG_STATEMENT (EXTRACT_NUMBER (j, p); assert (j < 0));
+         p += 4;
+         /* jump_n can either jump or fall through.  The (backward) jump
+            case has already been handled, so we only need to look at the
+            fallthrough case.  */
+         continue;
+         
        case succeed_n:
-         /* Get to the number of times to succeed.  */
-         p += 2;
-
-         /* Increment p past the n for when k != 0.  */
-         EXTRACT_NUMBER_AND_INCR (k, p);
-         if (k == 0)
-           {
-             p -= 4;
-             succeed_n_p = true;  /* Spaghetti code alert.  */
-             goto handle_on_failure_jump;
-           }
+         /* If N == 0, it should be an on_failure_jump_loop instead.  */
+         DEBUG_STATEMENT (EXTRACT_NUMBER (j, p + 2); assert (j > 0));
+         p += 4;
+         /* We only care about one iteration of the loop, so we don't
+            need to consider the case where this behaves like an
+            on_failure_jump.  */
          continue;