From: Stefan Monnier Date: Sun, 26 Mar 2000 23:05:51 +0000 (+0000) Subject: (enum re_opcode_t): New opcode on_failure_jump_nastyloop. X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=aeb359a7deabdde9d8d780525f8dcc54d4393d1d;p=pspp (enum re_opcode_t): New opcode on_failure_jump_nastyloop. (print_partial_compiled_pattern, re_compile_fastmap): Handle new opcode. (regex_compile): Use on_failure_jump_nastyloop for non-greedy loops. (re_match_2_internal): Add code for on_failure_jump_nastyloop when executing it as well as when popping it off the stack to find infinite loops in non-greedy repetition operators. --- diff --git a/regex.c b/regex.c index 227a52b46e..671f6c152f 100644 --- a/regex.c +++ b/regex.c @@ -20,7 +20,6 @@ USA. */ /* TODO: - - detect nasty infinite loops like "\\(\\)+?ab" when matching "ac". - use analyze_first to optimize non-empty loops - optimize succeed_n and jump_n away when possible - clean up multibyte issues @@ -532,6 +531,12 @@ typedef enum indefinitely). */ on_failure_jump_loop, + /* Just like `on_failure_jump_loop', except that it checks for + a different kind of loop (the kind that shows up with non-greedy + operators). This operation has to be immediately preceded + by a `no_op'. */ + on_failure_jump_nastyloop, + /* A smart `on_failure_jump' used for greedy * and + operators. It analyses the loop before which it is put and if the loop does not require backtracking, it changes itself to @@ -942,6 +947,11 @@ print_partial_compiled_pattern (start, end) printf ("/on_failure_keep_string_jump to %d", p + mcnt - start); break; + case on_failure_jump_nastyloop: + extract_number_and_incr (&mcnt, &p); + printf ("/on_failure_jump_nastyloop to %d", p + mcnt - start); + break; + case on_failure_jump_loop: extract_number_and_incr (&mcnt, &p); printf ("/on_failure_jump_loop to %d", p + mcnt - start); @@ -2179,19 +2189,19 @@ regex_compile (pattern, size, syntax, bufp) else /* not greedy */ { /* I wish the greedy and non-greedy cases could be merged. */ + GET_BUFFER_SPACE (7); /* We might use less. */ if (many_times_ok) { /* The non-greedy multiple match looks like a repeat..until: we only need a conditional jump at the end of the loop */ - GET_BUFFER_SPACE (3); - STORE_JUMP (on_failure_jump, b, laststart); + BUF_PUSH (no_op); + STORE_JUMP (on_failure_jump_nastyloop, b, laststart); b += 3; if (zero_times_ok) { /* The repeat...until naturally matches one or more. To also match zero times, we need to first jump to the end of the loop (its conditional jump). */ - GET_BUFFER_SPACE (3); INSERT_JUMP (jump, laststart, b); b += 3; } @@ -2199,7 +2209,6 @@ regex_compile (pattern, size, syntax, bufp) else { /* non-greedy a?? */ - GET_BUFFER_SPACE (6); INSERT_JUMP (jump, laststart, b + 3); b += 3; INSERT_JUMP (on_failure_jump, laststart, laststart + 6); @@ -3514,6 +3523,7 @@ re_compile_fastmap (bufp) case on_failure_jump: case on_failure_keep_string_jump: case on_failure_jump_loop: + case on_failure_jump_nastyloop: case on_failure_jump_smart: p++; break; @@ -3526,6 +3536,7 @@ re_compile_fastmap (bufp) case on_failure_jump: case on_failure_keep_string_jump: + case on_failure_jump_nastyloop: case on_failure_jump_loop: case on_failure_jump_smart: handle_on_failure_jump: @@ -3708,7 +3719,7 @@ re_search_2 (bufp, str1, size1, str2, size2, startpos, range, regs, stop) int anchored_start = 0; /* Nonzero if we have to concern multibyte character. */ - int multibyte = bufp->multibyte; + const boolean multibyte = bufp->multibyte; /* Check for out-of-range STARTPOS. */ if (startpos < 0 || startpos > total_size) @@ -5067,6 +5078,30 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) PUSH_FAILURE_POINT (p - 3, NULL); break; + /* A nasty loop is introduced by the non-greedy *? and +?. + With such loops, the stack only ever contains one failure point + at a time, so that a plain on_failure_jump_loop kind of + cycle detection cannot work. Worse yet, such a detection + can not only fail to detect a cycle, but it can also wrongly + detect a cycle (between different instantiations of the same + loop. + So the method used for those nasty loops is a little different: + We use a special cycle-detection-stack-frame which is pushed + when the on_failure_jump_nastyloop failure-point is *popped*. + This special frame thus marks the beginning of one iteration + through the loop and we can hence easily check right here + whether something matched between the beginning and the end of + the loop. */ + case on_failure_jump_nastyloop: + EXTRACT_NUMBER_AND_INCR (mcnt, p); + DEBUG_PRINT3 ("EXECUTING on_failure_jump_nastyloop %d (to %p):\n", + mcnt, p + mcnt); + + assert ((re_opcode_t)p[-4] == no_op); + CHECK_INFINITE_LOOP (p - 4, d); + PUSH_FAILURE_POINT (p - 3, d); + break; + /* Simple loop detecting on_failure_jump: just check on the failure stack if the same spot was already hit earlier. */ @@ -5428,6 +5463,11 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) assert (str == NULL); goto continue_failure_jump; + case on_failure_jump_nastyloop: + assert ((re_opcode_t)pat[-2] == no_op); + PUSH_FAILURE_POINT (pat - 2, str); + /* Fallthrough */ + case on_failure_jump_loop: case on_failure_jump: case succeed_n: @@ -5437,6 +5477,10 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) p = pat + mcnt; break; + case no_op: + /* A special frame used for nastyloops. */ + goto fail; + default: abort(); }