From 1a22a807e81d565483d7bada25c463e91f1164cc Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Sun, 6 Jan 2013 22:42:53 -0800 Subject: [PATCH] segment: Don't require the input to end in a new-line. --- Smake | 1 + src/language/lexer/segment.c | 36 ++++++++++----- tests/language/lexer/segment-test.c | 70 +++++++++++++++++++++++------ tests/language/lexer/segment.at | 4 +- 4 files changed, 84 insertions(+), 27 deletions(-) diff --git a/Smake b/Smake index 1e54533169..a404de41e8 100644 --- a/Smake +++ b/Smake @@ -48,6 +48,7 @@ GNULIB_MODULES = \ mbiter \ memcasecmp \ memchr \ + memchr2 \ mempcpy \ memrchr \ minmax \ diff --git a/src/language/lexer/segment.c b/src/language/lexer/segment.c index 7ff5e5eb73..0c4a6bd800 100644 --- a/src/language/lexer/segment.c +++ b/src/language/lexer/segment.c @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 2010, 2011 Free Software Foundation, Inc. + Copyright (C) 2010, 2011, 2013 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -28,6 +28,7 @@ #include "gl/c-ctype.h" #include "gl/c-strcase.h" +#include "gl/memchr2.h" enum segmenter_state { @@ -83,9 +84,9 @@ segmenter_parse_shbang__ (struct segmenter *s, const char *input, size_t n, int ofs; for (ofs = 2; ofs < n; ofs++) - if (input[ofs] == '\n') + if (input[ofs] == '\n' || input[ofs] == '\0') { - if (input[ofs - 1] == '\r') + if (input[ofs] == '\n' && input[ofs - 1] == '\r') ofs--; s->state = S_GENERAL; @@ -123,7 +124,7 @@ skip_comment (const char *input, size_t n, size_t ofs) { for (; ofs < n; ofs++) { - if (input[ofs] == '\n') + if (input[ofs] == '\n' || input[ofs] == '\0') return ofs; else if (input[ofs] == '*') { @@ -171,7 +172,7 @@ skip_spaces_and_comments (const char *input, size_t n, int ofs) static int is_end_of_line (const char *input, size_t n, int ofs) { - if (input[ofs] == '\n') + if (input[ofs] == '\n' || input[ofs] == '\0') return 1; else if (input[ofs] == '\r') { @@ -193,7 +194,6 @@ at_end_of_line (const char *input, size_t n, int ofs) return is_end_of_line (input, n, ofs); } - static int segmenter_parse_newline__ (const char *input, size_t n, enum segment_type *type) @@ -228,7 +228,7 @@ skip_spaces (const char *input, size_t n, size_t ofs) if (mblen < 0) return -1; - if (!lex_uc_is_space (uc) || uc == '\n') + if (!lex_uc_is_space (uc) || uc == '\n' || uc == '\0') return ofs; ofs += mblen; @@ -370,8 +370,9 @@ segmenter_parse_comment_1__ (struct segmenter *s, case '\n': if (ofs > 1 && input[ofs - 1] == '\r') ofs--; - - if (endcmd == -2) + /* Fall through. */ + case '\0': + if (endcmd == -2 || uc == '\0') { /* Blank line ends comment command. */ s->state = S_GENERAL; @@ -491,6 +492,11 @@ segmenter_parse_document_1__ (struct segmenter *s, const char *input, size_t n, s->state = end_cmd ? S_DOCUMENT_3 : S_DOCUMENT_2; return ofs; + case '\0': + *type = SEG_DOCUMENT; + s->state = S_DOCUMENT_3; + return ofs; + default: if (!lex_uc_is_space (uc)) end_cmd = false; @@ -998,7 +1004,7 @@ segmenter_detect_command_name__ (const char *input, size_t n, int ofs) if (mblen < 0) return -1; - if (uc == '\n' + if (uc == '\n' || uc == '\0' || !(lex_uc_is_space (uc) || lex_uc_is_idn (uc) || uc == '-')) break; @@ -1247,14 +1253,19 @@ static int segmenter_parse_full_line__ (const char *input, size_t n, enum segment_type *type) { - const char *newline = memchr (input, '\n', n); + const char *newline = memchr2 (input, '\n', '\0', n); if (newline == NULL) return -1; else { int ofs = newline - input; - if (ofs == 0 || (ofs == 1 && input[0] == '\r')) + if (*newline == '\0') + { + assert (ofs > 0); + return ofs; + } + else if (ofs == 0 || (ofs == 1 && input[0] == '\r')) { *type = SEG_NEWLINE; return ofs + 1; @@ -1435,6 +1446,7 @@ segmenter_parse_title_2__ (struct segmenter *s, switch (uc) { case '\n': + case '\0': s->state = S_GENERAL; s->substate = 0; *type = SEG_UNQUOTED_STRING; diff --git a/tests/language/lexer/segment-test.c b/tests/language/lexer/segment-test.c index 64243c8a71..ef5ff613fe 100644 --- a/tests/language/lexer/segment-test.c +++ b/tests/language/lexer/segment-test.c @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 2010, 2011 Free Software Foundation, Inc. + Copyright (C) 2010, 2011, 2013 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -36,6 +36,7 @@ #include "gl/progname.h" #include "gl/read-file.h" #include "gl/xalloc.h" +#include "gl/xmemdup0.h" /* -a/--auto, -b/--batch, -i/--interactive: syntax mode. */ static enum segmenter_mode mode = SEG_MODE_AUTO; @@ -46,18 +47,21 @@ static bool verbose; /* -1, --one-byte: Feed in one byte at a time? */ static bool one_byte; +/* -0, --truncations: Check that every truncation of input yields a result. */ +static bool check_truncations; + static const char *parse_options (int argc, char **argv); static void usage (void) NO_RETURN; +static void check_segmentation (const char *input, size_t length, + bool print_segments); + int main (int argc, char *argv[]) { - size_t offset, line_number, line_offset; const char *file_name; - char *input; - struct segmenter s; size_t length; - int prev_type; + char *input; set_program_name (argv[0]); file_name = parse_options (argc, argv); @@ -69,10 +73,38 @@ main (int argc, char *argv[]) : read_file (file_name, &length)); if (input == NULL) error (EXIT_FAILURE, errno, "reading %s failed", file_name); - input = xrealloc (input, length + 3); - if (length == 0 || input[length - 1] != '\n') - input[length++] = '\n'; - input[length++] = '\0'; + + if (!check_truncations) + { + input = xrealloc (input, length + 3); + if (length == 0 || input[length - 1] != '\n') + input[length++] = '\n'; + input[length++] = '\0'; + + check_segmentation (input, length, true); + } + else + { + size_t test_len; + + for (test_len = 0; test_len <= length; test_len++) + { + char *copy = xmemdup0 (input, test_len); + check_segmentation (copy, test_len + 1, false); + free (copy); + } + } + free (input); + + return 0; +} + +static void +check_segmentation (const char *input, size_t length, bool print_segments) +{ + size_t offset, line_number, line_offset; + struct segmenter s; + int prev_type; segmenter_init (&s, mode); @@ -123,6 +155,12 @@ main (int argc, char *argv[]) else assert (memchr (&input[offset], '\n', n) == NULL); + if (!print_segments) + { + offset += n; + continue; + } + if (!verbose) { if (prev_type != SEG_SPACES && prev_type != -1 @@ -228,11 +266,9 @@ main (int argc, char *argv[]) printf (" (%s)\n", prompt_style_to_string (prompt)); } } - putchar ('\n'); - - free (input); - return 0; + if (print_segments) + putchar ('\n'); } static const char * @@ -243,6 +279,7 @@ parse_options (int argc, char **argv) static const struct option options[] = { {"one-byte", no_argument, NULL, '1'}, + {"truncations", no_argument, NULL, '0'}, {"auto", no_argument, NULL, 'a'}, {"batch", no_argument, NULL, 'b'}, {"interactive", no_argument, NULL, 'i'}, @@ -251,7 +288,7 @@ parse_options (int argc, char **argv) {NULL, 0, NULL, 0}, }; - int c = getopt_long (argc, argv, "1abivh", options, NULL); + int c = getopt_long (argc, argv, "01abivh", options, NULL); if (c == -1) break; @@ -261,6 +298,10 @@ parse_options (int argc, char **argv) one_byte = true; break; + case '0': + check_truncations = true; + break; + case 'a': mode = SEG_MODE_AUTO; break; @@ -308,6 +349,7 @@ usage: %s [OPTIONS] INPUT\n\ \n\ Options:\n\ -1, --one-byte feed one byte at a time\n\ + -0, --truncations check null truncation of each prefix of input\n\ -a, --auto use \"auto\" syntax mode\n\ -b, --batch use \"batch\" syntax mode\n\ -i, --interactive use \"interactive\" syntax mode (default)\n\ diff --git a/tests/language/lexer/segment.at b/tests/language/lexer/segment.at index e1dd0b5eb4..7d0f03aa33 100644 --- a/tests/language/lexer/segment.at +++ b/tests/language/lexer/segment.at @@ -1,7 +1,9 @@ AT_BANNER([syntax segmentation]) m4_define([PSPP_CHECK_SEGMENT], [AT_CHECK([segment-test $1 input], [0], [expout]) - AT_CHECK([segment-test -1 $1 input], [0], [expout])]) + AT_CHECK([segment-test -1 $1 input], [0], [expout]) + AT_CHECK([segment-test -0 $1 input]) + AT_CHECK([segment-test -01 $1 input])]) AT_SETUP([identifiers]) AT_KEYWORDS([segment]) -- 2.30.2