From: Ben Pfaff Date: Sat, 19 Mar 2011 23:30:55 +0000 (-0700) Subject: segment: New library for low-level phase of lexical syntax analysis. X-Git-Tag: v0.7.7~18 X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=75a467ed2d32e1adb0c24cf89676cfb48845be98;p=pspp-builds.git segment: New library for low-level phase of lexical syntax analysis. This library provides for a low-level part of lexical analysis for PSPP syntax, which I call "segmentation". Segmentation accepts a stream of UTF-8 bytes as input. It outputs a label (a segment type) for each byte or contiguous sequence of bytes in the input. The following commit will implement the high-level phase of lexical analysis, called "scanning", that converts a sequence of segments into PSPP tokens. --- diff --git a/Smake b/Smake index 3a3235c0..14c2a75f 100644 --- a/Smake +++ b/Smake @@ -49,6 +49,7 @@ GNULIB_MODULES = \ printf-posix \ printf-safe \ progname \ + read-file \ regex \ relocatable-prog \ rename \ diff --git a/src/language/lexer/automake.mk b/src/language/lexer/automake.mk index 71f6b413..b3d06fec 100644 --- a/src/language/lexer/automake.mk +++ b/src/language/lexer/automake.mk @@ -10,6 +10,8 @@ language_lexer_sources = \ src/language/lexer/subcommand-list.h \ src/language/lexer/format-parser.c \ src/language/lexer/format-parser.h \ + src/language/lexer/segment.c \ + src/language/lexer/segment.h \ src/language/lexer/value-parser.c \ src/language/lexer/value-parser.h \ src/language/lexer/variable-parser.c \ diff --git a/src/language/lexer/segment.c b/src/language/lexer/segment.c new file mode 100644 index 00000000..9900cd7e --- /dev/null +++ b/src/language/lexer/segment.c @@ -0,0 +1,1631 @@ +/* PSPP - a program for statistical analysis. + Copyright (C) 2010, 2011 Free Software Foundation, Inc. 
+ + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include + +#include "language/lexer/segment.h" + +#include +#include + +#include "data/identifier.h" +#include "language/lexer/command-name.h" +#include "libpspp/assertion.h" +#include "libpspp/cast.h" + +#include "gl/c-ctype.h" +#include "gl/c-strcase.h" + +enum segmenter_state + { + S_SHBANG, + S_GENERAL, + S_COMMENT_1, + S_COMMENT_2, + S_DOCUMENT_1, + S_DOCUMENT_2, + S_DOCUMENT_3, + S_FILE_LABEL, + S_DO_REPEAT_1, + S_DO_REPEAT_2, + S_DO_REPEAT_3, + S_BEGIN_DATA_1, + S_BEGIN_DATA_2, + S_BEGIN_DATA_3, + S_BEGIN_DATA_4, + S_TITLE_1, + S_TITLE_2 + }; + +#define SS_START_OF_LINE (1u << 0) +#define SS_START_OF_COMMAND (1u << 1) + +static int segmenter_detect_command_name__ (const char *input, + size_t n, int ofs); + +static int +segmenter_u8_to_uc__ (ucs4_t *puc, const char *input_, size_t n) +{ + const uint8_t *input = CHAR_CAST (const uint8_t *, input_); + int mblen; + + assert (n > 0); + + mblen = u8_mbtoucr (puc, input, n); + return (mblen >= 0 ? mblen + : mblen == -2 ? 
-1 + : u8_mbtouc (puc, input, n)); +} + +static int +segmenter_parse_shbang__ (struct segmenter *s, const char *input, size_t n, + enum segment_type *type) +{ + if (input[0] == '#') + { + if (n < 2) + return -1; + else if (input[1] == '!') + { + int ofs; + + for (ofs = 2; ofs < n; ofs++) + if (input[ofs] == '\n') + { + if (input[ofs - 1] == '\r') + ofs--; + + s->state = S_GENERAL; + s->substate = SS_START_OF_COMMAND; + *type = SEG_SHBANG; + return ofs; + } + + return -1; + } + } + + s->state = S_GENERAL; + s->substate = SS_START_OF_LINE | SS_START_OF_COMMAND; + return segmenter_push (s, input, n, type); +} + +static int +segmenter_parse_digraph__ (const char *seconds, struct segmenter *s, + const char *input, size_t n, + enum segment_type *type) +{ + assert (s->state == S_GENERAL); + + if (n < 2) + return -1; + + *type = SEG_PUNCT; + s->substate = 0; + return input[1] != '\0' && strchr (seconds, input[1]) != NULL ? 2 : 1; +} + +static int +skip_comment (const char *input, size_t n, size_t ofs) +{ + for (; ofs < n; ofs++) + { + if (input[ofs] == '\n') + return ofs; + else if (input[ofs] == '*') + { + if (ofs + 1 >= n) + return -1; + else if (input[ofs + 1] == '/') + return ofs + 2; + } + } + return -1; +} + +static int +skip_spaces_and_comments (const char *input, size_t n, int ofs) +{ + while (ofs < n) + { + ucs4_t uc; + int mblen; + + mblen = segmenter_u8_to_uc__ (&uc, input + ofs, n - ofs); + if (mblen < 0) + return -1; + + if (uc == '/') + { + if (ofs + 1 >= n) + return -1; + else if (input[ofs + 1] != '*') + return ofs; + + ofs = skip_comment (input, n, ofs + 2); + if (ofs < 0) + return -1; + } + else if (lex_uc_is_space (uc) && uc != '\n') + ofs += mblen; + else + return ofs; + } + + return -1; +} + +static int +is_end_of_line (const char *input, size_t n, int ofs) +{ + if (input[ofs] == '\n') + return 1; + else if (input[ofs] == '\r') + { + if (ofs + 1 >= n) + return -1; + return input[ofs + 1] == '\n'; + } + else + return 0; +} + +static int +at_end_of_line 
(const char *input, size_t n, int ofs) +{ + ofs = skip_spaces_and_comments (input, n, ofs); + if (ofs < 0) + return -1; + + return is_end_of_line (input, n, ofs); +} + + +static int +segmenter_parse_newline__ (const char *input, size_t n, + enum segment_type *type) +{ + int ofs; + + if (input[0] == '\n') + ofs = 1; + else + { + if (n < 2) + return -1; + + assert (input[0] == '\r'); + assert (input[1] == '\n'); + ofs = 2; + } + + *type = SEG_NEWLINE; + return ofs; +} + +static int +skip_spaces (const char *input, size_t n, size_t ofs) +{ + while (ofs < n) + { + ucs4_t uc; + int mblen; + + mblen = segmenter_u8_to_uc__ (&uc, input + ofs, n - ofs); + if (mblen < 0) + return -1; + + if (!lex_uc_is_space (uc) || uc == '\n') + return ofs; + + ofs += mblen; + } + + return -1; +} + +static int +skip_digits (const char *input, size_t n, int ofs) +{ + for (; ofs < n; ofs++) + if (!c_isdigit (input[ofs])) + return ofs; + return -1; +} + +static int +segmenter_parse_number__ (struct segmenter *s, const char *input, size_t n, + enum segment_type *type) +{ + int ofs; + + assert (s->state == S_GENERAL); + + ofs = skip_digits (input, n, 0); + if (ofs < 0) + return -1; + + if (input[ofs] == '.') + { + ofs = skip_digits (input, n, ofs + 1); + if (ofs < 0) + return -1; + } + + if (ofs >= n) + return -1; + if (input[ofs] == 'e' || input[ofs] == 'E') + { + ofs++; + if (ofs >= n) + return -1; + + if (input[ofs] == '+' || input[ofs] == '-') + { + ofs++; + if (ofs >= n) + return -1; + } + + if (!c_isdigit (input[ofs])) + { + *type = SEG_EXPECTED_EXPONENT; + s->substate = 0; + return ofs; + } + + ofs = skip_digits (input, n, ofs); + if (ofs < 0) + return -1; + } + + if (input[ofs - 1] == '.') + { + int eol = at_end_of_line (input, n, ofs); + if (eol < 0) + return -1; + else if (eol) + ofs--; + } + + *type = SEG_NUMBER; + s->substate = 0; + return ofs; +} + +static bool +is_reserved_word (const char *s, int n) +{ + char s0, s1, s2, s3; + + s0 = c_toupper (s[0]); + switch (n) + { + case 2: + 
s1 = c_toupper (s[1]); + return ((s0 == 'B' && s1 == 'Y') + || (s0 == 'E' && s1 == 'Q') + || (s0 == 'G' && (s1 == 'E' || s1 == 'T')) + || (s0 == 'L' && (s1 == 'E' || s1 == 'T')) + || (s0 == 'N' && s1 == 'E') + || (s0 == 'O' && s1 == 'R') + || (s0 == 'T' && s1 == 'O')); + + case 3: + s1 = c_toupper (s[1]); + s2 = c_toupper (s[2]); + return ((s0 == 'A' && ((s1 == 'L' && s2 == 'L') + || (s1 == 'N' && s2 == 'D'))) + || (s0 == 'N' && s1 == 'O' && s2 == 'T')); + + case 4: + s1 = c_toupper (s[1]); + s2 = c_toupper (s[2]); + s3 = c_toupper (s[3]); + return s0 == 'W' && s1 == 'I' && s2 == 'T' && s3 == 'H'; + + default: + return false; + } +} + +static int +segmenter_parse_comment_1__ (struct segmenter *s, + const char *input, size_t n, + enum segment_type *type) +{ + int endcmd; + int ofs; + + endcmd = -2; + ofs = 0; + while (ofs < n) + { + ucs4_t uc; + int mblen; + + mblen = segmenter_u8_to_uc__ (&uc, input + ofs, n - ofs); + if (mblen < 0) + return -1; + + switch (uc) + { + case '.': + endcmd = ofs; + break; + + case '\n': + if (ofs > 1 && input[ofs - 1] == '\r') + ofs--; + + if (endcmd == -2) + { + /* Blank line ends comment command. */ + s->state = S_GENERAL; + s->substate = SS_START_OF_COMMAND; + *type = SEG_SEPARATE_COMMANDS; + return ofs; + } + else if (endcmd >= 0) + { + /* '.' at end of line ends comment command. */ + s->state = S_GENERAL; + s->substate = 0; + *type = SEG_COMMENT_COMMAND; + return endcmd; + } + else + { + /* Comment continues onto next line. 
*/ + *type = SEG_COMMENT_COMMAND; + s->state = S_COMMENT_2; + return ofs; + } + NOT_REACHED (); + + default: + if (!lex_uc_is_space (uc)) + endcmd = -1; + break; + } + + ofs += mblen; + } + return -1; +} + +static int +segmenter_parse_comment_2__ (struct segmenter *s, const char *input, size_t n, + enum segment_type *type) +{ + int new_cmd; + ucs4_t uc; + int mblen; + int ofs; + + ofs = segmenter_parse_newline__ (input, n, type); + if (ofs < 0 || ofs >= n) + return -1; + + mblen = segmenter_u8_to_uc__ (&uc, input + ofs, n - ofs); + if (mblen < 0) + return -1; + + if (uc == '+' || uc == '-' || uc == '.') + new_cmd = 1; + else if (!lex_uc_is_space (uc)) + switch (s->mode) + { + case SEG_MODE_INTERACTIVE: + new_cmd = false; + break; + + case SEG_MODE_BATCH: + new_cmd = true; + break; + + case SEG_MODE_AUTO: + new_cmd = segmenter_detect_command_name__ (input, n, ofs); + if (new_cmd < 0) + return -1; + break; + } + + if (new_cmd) + { + s->state = S_GENERAL; + s->substate = SS_START_OF_LINE | SS_START_OF_COMMAND; + } + else + s->state = S_COMMENT_1; + return ofs; +} + +static int +segmenter_parse_document_1__ (struct segmenter *s, const char *input, size_t n, + enum segment_type *type) +{ + bool end_cmd; + int ofs; + + end_cmd = false; + ofs = 0; + while (ofs < n) + { + ucs4_t uc; + int mblen; + + mblen = segmenter_u8_to_uc__ (&uc, input + ofs, n - ofs); + if (mblen < 0) + return -1; + + switch (uc) + { + case '.': + end_cmd = true; + break; + + case '\n': + if (ofs > 1 && input[ofs - 1] == '\r') + ofs--; + + *type = SEG_DOCUMENT; + s->state = end_cmd ? 
S_DOCUMENT_3 : S_DOCUMENT_2; + return ofs; + + default: + if (!lex_uc_is_space (uc)) + end_cmd = false; + break; + } + + ofs += mblen; + } + return -1; +} + +static int +segmenter_parse_document_2__ (struct segmenter *s, const char *input, size_t n, + enum segment_type *type) +{ + int ofs; + + ofs = segmenter_parse_newline__ (input, n, type); + if (ofs < 0) + return -1; + + s->state = S_DOCUMENT_1; + return ofs; +} + +static int +segmenter_parse_document_3__ (struct segmenter *s, enum segment_type *type) +{ + *type = SEG_END_COMMAND; + s->state = S_GENERAL; + s->substate = SS_START_OF_COMMAND | SS_START_OF_LINE; + return 0; +} + +static int +segmenter_unquoted (const char *input, size_t n, int ofs) + +{ + char c; + + ofs = skip_spaces_and_comments (input, n, ofs); + if (ofs < 0) + return -1; + + c = input[ofs]; + return c != '\'' && c != '"' && c != '\n' && c != '\0'; +} + +static int +next_id_in_command (const struct segmenter *s, const char *input, size_t n, + int ofs, char id[], size_t id_size) +{ + struct segmenter sub; + + assert (id_size > 0); + + sub.mode = s->mode; + sub.state = S_GENERAL; + sub.substate = 0; + for (;;) + { + enum segment_type type; + int retval; + + retval = segmenter_push (&sub, input + ofs, n - ofs, &type); + if (retval < 0) + { + id[0] = '\0'; + return -1; + } + + switch (type) + { + case SEG_SHBANG: + case SEG_SPACES: + case SEG_COMMENT: + case SEG_NEWLINE: + break; + + case SEG_IDENTIFIER: + if (retval < id_size) + { + memcpy (id, input + ofs, retval); + id[retval] = '\0'; + return ofs + retval; + } + /* fall through */ + + case SEG_NUMBER: + case SEG_QUOTED_STRING: + case SEG_HEX_STRING: + case SEG_UNICODE_STRING: + case SEG_UNQUOTED_STRING: + case SEG_RESERVED_WORD: + case SEG_PUNCT: + case SEG_COMMENT_COMMAND: + case SEG_DO_REPEAT_COMMAND: + case SEG_INLINE_DATA: + case SEG_START_DOCUMENT: + case SEG_DOCUMENT: + case SEG_START_COMMAND: + case SEG_SEPARATE_COMMANDS: + case SEG_END_COMMAND: + case SEG_END: + case SEG_EXPECTED_QUOTE: 
+ case SEG_EXPECTED_EXPONENT: + case SEG_UNEXPECTED_DOT: + case SEG_UNEXPECTED_CHAR: + id[0] = '\0'; + return ofs + retval; + + case SEG_N_TYPES: + NOT_REACHED (); + } + ofs += retval; + } +} + +static int +segmenter_parse_id__ (struct segmenter *s, const char *input, size_t n, + enum segment_type *type) +{ + ucs4_t uc; + int ofs; + + assert (s->state == S_GENERAL); + + ofs = u8_mbtouc (&uc, CHAR_CAST (const uint8_t *, input), n); + for (;;) + { + int mblen; + + if (ofs >= n) + return -1; + + mblen = segmenter_u8_to_uc__ (&uc, input + ofs, n - ofs); + if (mblen < 0) + return -1; + else if (!lex_uc_is_idn (uc)) + break; + + ofs += mblen; + } + + if (input[ofs - 1] == '.') + { + int eol = at_end_of_line (input, n, ofs); + if (eol < 0) + return -1; + else if (eol) + ofs--; + } + + if (is_reserved_word (input, ofs)) + *type = SEG_RESERVED_WORD; + else + *type = SEG_IDENTIFIER; + + if (s->substate & SS_START_OF_COMMAND) + { + struct substring word = ss_buffer (input, ofs); + + if (lex_id_match_n (ss_cstr ("COMMENT"), word, 4)) + { + s->state = S_COMMENT_1; + return segmenter_parse_comment_1__ (s, input, n, type); + } + else if (lex_id_match (ss_cstr ("DOCUMENT"), word)) + { + s->state = S_DOCUMENT_1; + *type = SEG_START_DOCUMENT; + return 0; + } + else if (lex_id_match (ss_cstr ("TITLE"), word) + || lex_id_match (ss_cstr ("SUBTITLE"), word)) + { + int result = segmenter_unquoted (input, n, ofs); + if (result < 0) + return -1; + else if (result) + { + s->state = S_TITLE_1; + return ofs; + } + } + else if (lex_id_match (ss_cstr ("FILE"), word)) + { + char id[16]; + + if (next_id_in_command (s, input, n, ofs, id, sizeof id) < 0) + return -1; + else if (lex_id_match (ss_cstr ("LABEL"), ss_cstr (id))) + { + s->state = S_FILE_LABEL; + s->substate = 0; + return ofs; + } + } + else if (lex_id_match (ss_cstr ("DO"), word)) + { + char id[16]; + + if (next_id_in_command (s, input, n, ofs, id, sizeof id) < 0) + return -1; + else if (lex_id_match (ss_cstr ("REPEAT"), ss_cstr (id))) 
+ { + s->state = S_DO_REPEAT_1; + s->substate = 0; + return ofs; + } + } + else if (lex_id_match (ss_cstr ("BEGIN"), word)) + { + char id[16]; + int ofs2; + + ofs2 = next_id_in_command (s, input, n, ofs, id, sizeof id); + if (ofs2 < 0) + return -1; + else if (lex_id_match (ss_cstr ("DATA"), ss_cstr (id))) + { + int eol; + + ofs2 = skip_spaces_and_comments (input, n, ofs2); + if (ofs2 < 0) + return -1; + + if (input[ofs2] == '.') + { + ofs2 = skip_spaces_and_comments (input, n, ofs2 + 1); + if (ofs2 < 0) + return -1; + } + + eol = is_end_of_line (input, n, ofs2); + if (eol < 0) + return -1; + else if (eol) + { + if (memchr (input, '\n', ofs2)) + s->state = S_BEGIN_DATA_1; + else + s->state = S_BEGIN_DATA_2; + s->substate = 0; + return ofs; + } + } + } + } + + s->substate = 0; + return ofs; +} + +static int +segmenter_parse_string__ (enum segment_type string_type, + int ofs, struct segmenter *s, + const char *input, size_t n, enum segment_type *type) +{ + int quote = input[ofs]; + + ofs++; + while (ofs < n) + if (input[ofs] == quote) + { + ofs++; + if (ofs >= n) + return -1; + else if (input[ofs] == quote) + ofs++; + else + { + *type = string_type; + s->substate = 0; + return ofs; + } + } + else if (input[ofs] == '\n' || input[ofs] == '\0') + { + *type = SEG_EXPECTED_QUOTE; + s->substate = 0; + return ofs; + } + else + ofs++; + + return -1; +} + +static int +segmenter_maybe_parse_string__ (enum segment_type string_type, + struct segmenter *s, + const char *input, size_t n, + enum segment_type *type) +{ + if (n < 2) + return -1; + else if (input[1] == '\'' || input[1] == '"') + return segmenter_parse_string__ (string_type, 1, s, input, n, type); + else + return segmenter_parse_id__ (s, input, n, type); +} + +static int +segmenter_parse_mid_command__ (struct segmenter *s, + const char *input, size_t n, + enum segment_type *type) +{ + ucs4_t uc; + int mblen; + int ofs; + + assert (s->state == S_GENERAL); + assert (!(s->substate & SS_START_OF_LINE)); + + mblen = 
segmenter_u8_to_uc__ (&uc, input, n); + if (mblen < 0) + return -1; + + switch (uc) + { + case '\n': + s->substate |= SS_START_OF_LINE; + *type = SEG_NEWLINE; + return 1; + + case '/': + if (n == 1) + return -1; + else if (input[1] == '*') + { + ofs = skip_comment (input, n, 2); + if (ofs < 0) + return -1; + + *type = SEG_COMMENT; + return ofs; + } + else + { + s->substate = 0; + *type = SEG_PUNCT; + return 1; + } + + case '(': case ')': case ',': case '=': case '-': + case '[': case ']': case '&': case '|': case '+': + *type = SEG_PUNCT; + s->substate = 0; + return 1; + + case '*': + if (s->substate & SS_START_OF_COMMAND) + { + /* '*' at the beginning of a command begins a comment. */ + s->state = S_COMMENT_1; + return segmenter_parse_comment_1__ (s, input, n, type); + } + else + return segmenter_parse_digraph__ ("*", s, input, n, type); + + case '<': + return segmenter_parse_digraph__ ("=>", s, input, n, type); + + case '>': + return segmenter_parse_digraph__ ("=", s, input, n, type); + + case '~': + return segmenter_parse_digraph__ ("=", s, input, n, type); + + case '.': + if (n < 2) + return -1; + else if (c_isdigit (input[1])) + return segmenter_parse_number__ (s, input, n, type); + else + { + int eol = at_end_of_line (input, n, 1); + if (eol < 0) + return -1; + + if (eol) + { + *type = SEG_END_COMMAND; + s->substate = SS_START_OF_COMMAND; + } + else + *type = SEG_UNEXPECTED_DOT; + return 1; + } + NOT_REACHED (); + + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + return segmenter_parse_number__ (s, input, n, type); + + case 'u': case 'U': + return segmenter_maybe_parse_string__ (SEG_UNICODE_STRING, + s, input, n, type); + + case 'x': case 'X': + return segmenter_maybe_parse_string__ (SEG_HEX_STRING, + s, input, n, type); + + case '\'': case '"': + return segmenter_parse_string__ (SEG_QUOTED_STRING, 0, + s, input, n, type); + + default: + if (lex_uc_is_space (uc)) + { + ofs = skip_spaces (input, n, 
mblen); + if (ofs < 0) + return -1; + + if (input[ofs - 1] == '\r' && input[ofs] == '\n') + { + if (ofs == 1) + { + s->substate |= SS_START_OF_LINE; + *type = SEG_NEWLINE; + return 2; + } + else + ofs--; + } + *type = SEG_SPACES; + return ofs; + } + else if (lex_uc_is_id1 (uc)) + return segmenter_parse_id__ (s, input, n, type); + else + { + *type = SEG_UNEXPECTED_CHAR; + s->substate = 0; + return mblen; + } + } +} + +static int +compare_commands (const void *a_, const void *b_) +{ + const char *const *ap = a_; + const char *const *bp = b_; + const char *a = *ap; + const char *b = *bp; + + return c_strcasecmp (a, b); +} + +static const char ** +segmenter_get_command_name_candidates (unsigned char first) +{ +#define DEF_CMD(STATES, FLAGS, NAME, FUNCTION) NAME, +#define UNIMPL_CMD(NAME, DESCRIPTION) NAME, + static const char *commands[] = + { +#include "language/command.def" + "" + }; + static size_t n_commands = (sizeof commands / sizeof *commands) - 1; +#undef DEF_CMD +#undef UNIMPL_CMD + + static bool inited; + + static const char **cindex[UCHAR_MAX + 1]; + + if (!inited) + { + size_t i; + + inited = true; + + qsort (commands, n_commands, sizeof *commands, compare_commands); + for (i = 0; i < n_commands; i++) + { + unsigned char c = c_toupper (commands[i][0]); + if (cindex[c] == NULL) + cindex[c] = &commands[i]; + } + for (i = 0; i <= UCHAR_MAX; i++) + if (cindex[i] == NULL) + cindex[i] = &commands[n_commands]; + } + + return cindex[c_toupper (first)]; +} + +static int +segmenter_detect_command_name__ (const char *input, size_t n, int ofs) +{ + const char **commands; + + input += ofs; + n -= ofs; + ofs = 0; + for (;;) + { + ucs4_t uc; + int mblen; + + if (ofs >= n) + return -1; + + mblen = segmenter_u8_to_uc__ (&uc, input + ofs, n - ofs); + if (mblen < 0) + return -1; + + if (uc == '\n' + || !(lex_uc_is_space (uc) || lex_uc_is_idn (uc) || uc == '-')) + break; + + ofs += mblen; + } + if (input[ofs - 1] == '.') + ofs--; + + for (commands = 
segmenter_get_command_name_candidates (input[0]); + c_toupper (input[0]) == c_toupper ((*commands)[0]); + commands++) + { + int missing_words; + bool exact; + + if (command_match (ss_cstr (*commands), ss_buffer (input, ofs), + &exact, &missing_words) + && missing_words <= 0) + return 1; + } + + return 0; +} + +static int +is_start_of_string__ (const char *input, size_t n, int ofs) +{ + int c; + + c = input[ofs]; + if (c == 'x' || c == 'X' || c == 'u' || c == 'U') + { + if (ofs + 1 >= n) + return -1; + + return input[ofs + 1] == '\'' || input[ofs + 1] == '"'; + } + else + return c == '\'' || c == '"' || c == '\n'; +} + +static int +segmenter_parse_start_of_line__ (struct segmenter *s, + const char *input, size_t n, + enum segment_type *type) +{ + ucs4_t uc; + int mblen; + int ofs; + + assert (s->state == S_GENERAL); + assert (s->substate & SS_START_OF_LINE); + + mblen = segmenter_u8_to_uc__ (&uc, input, n); + if (mblen < 0) + return -1; + + switch (uc) + { + case '+': + ofs = skip_spaces_and_comments (input, n, 1); + if (ofs < 0) + return -1; + else + { + int is_string = is_start_of_string__ (input, n, ofs); + if (is_string < 0) + return -1; + else if (is_string) + { + /* This is punctuation that may separate pieces of a string. */ + *type = SEG_PUNCT; + s->substate = 0; + return 1; + } + } + /* Fall through. 
*/ + + case '-': + case '.': + *type = SEG_START_COMMAND; + s->substate = SS_START_OF_COMMAND; + return 1; + + default: + if (lex_uc_is_space (uc)) + { + int eol = at_end_of_line (input, n, 0); + if (eol < 0) + return -1; + else if (eol) + { + s->substate = SS_START_OF_COMMAND; + *type = SEG_SEPARATE_COMMANDS; + return 0; + } + break; + } + + if (s->mode == SEG_MODE_INTERACTIVE || s->substate & SS_START_OF_COMMAND) + break; + else if (s->mode == SEG_MODE_AUTO) + { + int cmd = segmenter_detect_command_name__ (input, n, 0); + if (cmd < 0) + return -1; + else if (cmd == 0) + break; + } + else + assert (s->mode == SEG_MODE_BATCH); + + s->substate = SS_START_OF_COMMAND; + *type = SEG_START_COMMAND; + return 0; + } + + s->substate = SS_START_OF_COMMAND; + return segmenter_parse_mid_command__ (s, input, n, type); +} + +static int +segmenter_parse_file_label__ (struct segmenter *s, + const char *input, size_t n, + enum segment_type *type) +{ + struct segmenter sub; + int ofs; + + sub = *s; + sub.state = S_GENERAL; + ofs = segmenter_push (&sub, input, n, type); + + if (ofs < 0) + return -1; + else if (*type == SEG_IDENTIFIER) + { + int result; + + assert (lex_id_match (ss_cstr ("LABEL"), + ss_buffer ((char *) input, ofs))); + result = segmenter_unquoted (input, n, ofs); + if (result < 0) + return -1; + else + { + if (result) + s->state = S_TITLE_1; + else + *s = sub; + return ofs; + } + } + else + { + s->substate = sub.substate; + return ofs; + } +} + +static int +segmenter_subparse (struct segmenter *s, + const char *input, size_t n, enum segment_type *type) +{ + struct segmenter sub; + int ofs; + + sub.mode = s->mode; + sub.state = S_GENERAL; + sub.substate = s->substate; + ofs = segmenter_push (&sub, input, n, type); + s->substate = sub.substate; + return ofs; +} + +static int +segmenter_parse_do_repeat_1__ (struct segmenter *s, + const char *input, size_t n, + enum segment_type *type) +{ + int ofs = segmenter_subparse (s, input, n, type); + if (ofs < 0) + return -1; + + 
if (*type == SEG_START_COMMAND || *type == SEG_SEPARATE_COMMANDS) + s->state = S_DO_REPEAT_2; + else if (*type == SEG_END_COMMAND) + { + s->state = S_DO_REPEAT_3; + s->substate = 1; + } + + return ofs; +} + +static int +segmenter_parse_do_repeat_2__ (struct segmenter *s, + const char *input, size_t n, + enum segment_type *type) +{ + int ofs = segmenter_subparse (s, input, n, type); + if (ofs < 0) + return -1; + + if (*type == SEG_NEWLINE) + { + s->state = S_DO_REPEAT_3; + s->substate = 1; + } + + return ofs; +} + +static bool +check_repeat_command (struct segmenter *s, + const char *input, size_t n) +{ + int direction; + char id[16]; + int ofs; + + ofs = 0; + if (input[ofs] == '+' || input[ofs] == '-') + ofs++; + + ofs = next_id_in_command (s, input, n, ofs, id, sizeof id); + if (ofs < 0) + return false; + else if (lex_id_match (ss_cstr ("DO"), ss_cstr (id))) + direction = 1; + else if (lex_id_match (ss_cstr ("END"), ss_cstr (id))) + direction = -1; + else + return true; + + ofs = next_id_in_command (s, input, n, ofs, id, sizeof id); + if (ofs < 0) + return false; + + if (lex_id_match (ss_cstr ("REPEAT"), ss_cstr (id))) + s->substate += direction; + return true; +} + +static int +segmenter_parse_full_line__ (const char *input, size_t n, + enum segment_type *type) +{ + const char *newline = memchr (input, '\n', n); + + if (newline == NULL) + return -1; + else + { + int ofs = newline - input; + if (ofs == 0 || (ofs == 1 && input[0] == '\r')) + { + *type = SEG_NEWLINE; + return ofs + 1; + } + else + return ofs - (input[ofs - 1] == '\r'); + } +} + +static int +segmenter_parse_do_repeat_3__ (struct segmenter *s, + const char *input, size_t n, + enum segment_type *type) +{ + int ofs; + + ofs = segmenter_parse_full_line__ (input, n, type); + if (ofs < 0 || input[ofs - 1] == '\n') + return ofs; + else if (!check_repeat_command (s, input, n)) + return -1; + else if (s->substate == 0) + { + s->state = S_GENERAL; + s->substate = SS_START_OF_COMMAND | SS_START_OF_LINE; + 
return segmenter_push (s, input, n, type); + } + else + { + *type = SEG_DO_REPEAT_COMMAND; + return ofs; + } +} + +static int +segmenter_parse_begin_data_1__ (struct segmenter *s, + const char *input, size_t n, + enum segment_type *type) +{ + int ofs = segmenter_subparse (s, input, n, type); + if (ofs < 0) + return -1; + + if (*type == SEG_NEWLINE) + s->state = S_BEGIN_DATA_2; + + return ofs; +} + +static int +segmenter_parse_begin_data_2__ (struct segmenter *s, + const char *input, size_t n, + enum segment_type *type) +{ + int ofs = segmenter_subparse (s, input, n, type); + if (ofs < 0) + return -1; + + if (*type == SEG_NEWLINE) + s->state = S_BEGIN_DATA_3; + + return ofs; +} + +static bool +is_end_data (const char *input, size_t n) +{ + const uint8_t *u_input = CHAR_CAST (const uint8_t *, input); + bool endcmd; + ucs4_t uc; + int mblen; + int ofs; + + if (n < 3 || c_strncasecmp (input, "END", 3)) + return false; + + ofs = 3; + mblen = u8_mbtouc (&uc, u_input + ofs, n - ofs); + if (!lex_uc_is_space (uc)) + return false; + ofs += mblen; + + if (n - ofs < 4 || c_strncasecmp (input + ofs, "DATA", 4)) + return false; + ofs += 4; + + endcmd = false; + while (ofs < n) + { + mblen = u8_mbtouc (&uc, u_input + ofs, n - ofs); + if (uc == '.') + { + if (endcmd) + return false; + endcmd = true; + } + else if (!lex_uc_is_space (uc)) + return false; + ofs += mblen; + } + + return true; +} + +static int +segmenter_parse_begin_data_3__ (struct segmenter *s, + const char *input, size_t n, + enum segment_type *type) +{ + int ofs; + + ofs = segmenter_parse_full_line__ (input, n, type); + if (ofs < 0) + return -1; + else if (is_end_data (input, ofs)) + { + s->state = S_GENERAL; + s->substate = SS_START_OF_COMMAND | SS_START_OF_LINE; + return segmenter_push (s, input, n, type); + } + else + { + *type = SEG_INLINE_DATA; + s->state = S_BEGIN_DATA_4; + return input[ofs - 1] == '\n' ? 
0 : ofs; + } +} + +static int +segmenter_parse_begin_data_4__ (struct segmenter *s, + const char *input, size_t n, + enum segment_type *type) +{ + int ofs; + + ofs = segmenter_parse_newline__ (input, n, type); + if (ofs < 0) + return -1; + + s->state = S_BEGIN_DATA_3; + return ofs; +} + +static int +segmenter_parse_title_1__ (struct segmenter *s, + const char *input, size_t n, + enum segment_type *type) +{ + int ofs; + + ofs = skip_spaces (input, n, 0); + if (ofs < 0) + return -1; + s->state = S_TITLE_2; + *type = SEG_SPACES; + return ofs; +} + +static int +segmenter_parse_title_2__ (struct segmenter *s, + const char *input, size_t n, + enum segment_type *type) +{ + int endcmd; + int ofs; + + endcmd = -1; + ofs = 0; + while (ofs < n) + { + ucs4_t uc; + int mblen; + + mblen = segmenter_u8_to_uc__ (&uc, input + ofs, n - ofs); + if (mblen < 0) + return -1; + + switch (uc) + { + case '\n': + s->state = S_GENERAL; + s->substate = 0; + *type = SEG_UNQUOTED_STRING; + return endcmd >= 0 ? endcmd : ofs; + + case '.': + endcmd = ofs; + break; + + default: + if (!lex_uc_is_space (uc)) + endcmd = -1; + break; + } + + ofs += mblen; + } + + return -1; +} + +/* Returns the name of segment TYPE as a string. The caller must not modify + or free the returned string. + + This is useful only for debugging and testing. */ +const char * +segment_type_to_string (enum segment_type type) +{ + switch (type) + { +#define SEG_TYPE(NAME) case SEG_##NAME: return #NAME; + SEG_TYPES +#undef SEG_TYPE + default: + return "unknown segment type"; + } +} + +/* Initializes S as a segmenter with the given syntax MODE. + + A segmenter does not contain any external references, so nothing needs to be + done to destroy one. For the same reason, segmenters may be copied with + plain struct assignment (or memcpy). */ +void +segmenter_init (struct segmenter *s, enum segmenter_mode mode) +{ + s->state = S_SHBANG; + s->substate = 0; + s->mode = mode; +} + +/* Returns the mode passed to segmenter_init() for S. 
*/ +enum segmenter_mode +segmenter_get_mode (const struct segmenter *s) +{ + return s->mode; +} + +/* Attempts to label a prefix of S's remaining input with a segment type. The + caller supplies the first N bytes of the remaining input as INPUT, which + must be a UTF-8 encoded string. The end of the input stream must be + indicated by a null byte at the beginning of a line, that is, immediately + following a new-line (or as the first byte of the input stream). + + The input may contain '\n' or '\r\n' line ends in any combination. + + If successful, returns the number of bytes in the segment at the beginning + of INPUT (between 0 and N, inclusive) and stores the type of that segment + into *TYPE. The next call to segmenter_push() should not include those + bytes as part of INPUT, because they have (figuratively) been consumed by + the segmenter. + + Failure occurs only if the segment type of the N bytes in INPUT cannot yet + be determined. In this case segmenter_push() returns -1. The caller should + obtain more input and then call segmenter_push() again with a larger N and + repeat until the input is exhausted (which must be indicated as described + above) or until a valid segment is returned. segmenter_push() will never + return -1 when the end of input is visible within INPUT. + + The caller must not, in a sequence of calls, supply contradictory input. + That is, bytes provided as part of INPUT in one call, but not consumed, must + not be provided with *different* values on subsequent calls. This is + because segmenter_push() must often make decisions based on looking ahead + beyond the bytes that it consumes. */ +int +segmenter_push (struct segmenter *s, const char *input, size_t n, + enum segment_type *type) +{ + if (n == 0) + return -1; + + if (input[0] == '\0') + { + *type = SEG_END; + return 1; + } + + switch (s->state) + { + case S_SHBANG: + return segmenter_parse_shbang__ (s, input, n, type); + + case S_GENERAL: + return (s->substate & SS_START_OF_LINE + ? 
segmenter_parse_start_of_line__ (s, input, n, type) + : segmenter_parse_mid_command__ (s, input, n, type)); + + case S_COMMENT_1: + return segmenter_parse_comment_1__ (s, input, n, type); + case S_COMMENT_2: + return segmenter_parse_comment_2__ (s, input, n, type); + + case S_DOCUMENT_1: + return segmenter_parse_document_1__ (s, input, n, type); + case S_DOCUMENT_2: + return segmenter_parse_document_2__ (s, input, n, type); + case S_DOCUMENT_3: + return segmenter_parse_document_3__ (s, type); + + case S_FILE_LABEL: + return segmenter_parse_file_label__ (s, input, n, type); + + case S_DO_REPEAT_1: + return segmenter_parse_do_repeat_1__ (s, input, n, type); + case S_DO_REPEAT_2: + return segmenter_parse_do_repeat_2__ (s, input, n, type); + case S_DO_REPEAT_3: + return segmenter_parse_do_repeat_3__ (s, input, n, type); + + case S_BEGIN_DATA_1: + return segmenter_parse_begin_data_1__ (s, input, n, type); + case S_BEGIN_DATA_2: + return segmenter_parse_begin_data_2__ (s, input, n, type); + case S_BEGIN_DATA_3: + return segmenter_parse_begin_data_3__ (s, input, n, type); + case S_BEGIN_DATA_4: + return segmenter_parse_begin_data_4__ (s, input, n, type); + + case S_TITLE_1: + return segmenter_parse_title_1__ (s, input, n, type); + case S_TITLE_2: + return segmenter_parse_title_2__ (s, input, n, type); + } + + NOT_REACHED (); +} + +/* Returns the style of command prompt to display to an interactive user for + input in S. The return value is most accurate in mode SEG_MODE_INTERACTIVE + and at the beginning of a line (that is, if segmenter_push() consumed as + much as possible of the input up to a new-line). */ +enum prompt_style +segmenter_get_prompt (const struct segmenter *s) +{ + switch (s->state) + { + case S_SHBANG: + return PROMPT_FIRST; + + case S_GENERAL: + return s->substate & SS_START_OF_COMMAND ? 
PROMPT_FIRST : PROMPT_LATER; + + case S_COMMENT_1: + case S_COMMENT_2: + return PROMPT_COMMENT; + + case S_DOCUMENT_1: + case S_DOCUMENT_2: + return PROMPT_DOCUMENT; + case S_DOCUMENT_3: + return PROMPT_FIRST; + + case S_FILE_LABEL: + return PROMPT_LATER; + + case S_DO_REPEAT_1: + case S_DO_REPEAT_2: + return s->substate & SS_START_OF_COMMAND ? PROMPT_FIRST : PROMPT_LATER; + case S_DO_REPEAT_3: + return PROMPT_DO_REPEAT; + + case S_BEGIN_DATA_1: + return PROMPT_FIRST; + case S_BEGIN_DATA_2: + return PROMPT_LATER; + case S_BEGIN_DATA_3: + case S_BEGIN_DATA_4: + return PROMPT_DATA; + + case S_TITLE_1: + case S_TITLE_2: + return PROMPT_FIRST; + } + + NOT_REACHED (); +} diff --git a/src/language/lexer/segment.h b/src/language/lexer/segment.h new file mode 100644 index 00000000..686b471e --- /dev/null +++ b/src/language/lexer/segment.h @@ -0,0 +1,122 @@ +/* PSPP - a program for statistical analysis. + Copyright (C) 2010, 2011 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#ifndef SEGMENT_H +#define SEGMENT_H 1 + +#include +#include +#include "libpspp/prompt.h" + +/* PSPP syntax segmentation. + + PSPP divides traditional "lexical analysis" or "tokenization" into two + phases: a lower-level phase called "segmentation" and a higher-level phase + called "scanning". This header file provides declarations for the + segmentation phase. scan.h contains declarations for the scanning phase. 
+ + Segmentation accepts a stream of UTF-8 bytes as input. It outputs a label + (a segment type) for each byte or contiguous sequence of bytes in the input. + It also, in a few corner cases, outputs zero-width segments that label the + boundary between a pair of bytes in the input. + + Some segment types correspond directly to tokens; for example, an + "identifier" segment (SEG_IDENTIFIER) becomes an identifier token (T_ID) + later in lexical analysis. Other segments contribute to tokens but do not + correspond directly; for example, multiple quoted string segments + (SEG_QUOTED_STRING) separated by spaces (SEG_SPACES) and "+" punctuators + (SEG_PUNCT) may be combined to form a single string token (T_STRING). + Still other segments are ignored (e.g. SEG_SPACES) or trigger special + behavior such as error messages later in tokenization + (e.g. SEG_EXPECTED_QUOTE). +*/ + +/* Segmentation mode. + + This corresponds to the syntax mode for which a syntax file is intended. + This is the only configuration setting for a segmenter. */ +enum segmenter_mode + { + /* Try to interpret input correctly regardless of whether it is written + for interactive or batch mode. */ + SEG_MODE_AUTO, + + /* Interactive or batch syntax mode. 
*/ + SEG_MODE_INTERACTIVE, + SEG_MODE_BATCH + }; + +#define SEG_TYPES \ + SEG_TYPE(NUMBER) \ + SEG_TYPE(QUOTED_STRING) \ + SEG_TYPE(HEX_STRING) \ + SEG_TYPE(UNICODE_STRING) \ + SEG_TYPE(UNQUOTED_STRING) \ + SEG_TYPE(RESERVED_WORD) \ + SEG_TYPE(IDENTIFIER) \ + SEG_TYPE(PUNCT) \ + \ + SEG_TYPE(SHBANG) \ + SEG_TYPE(SPACES) \ + SEG_TYPE(COMMENT) \ + SEG_TYPE(NEWLINE) \ + \ + SEG_TYPE(COMMENT_COMMAND) \ + SEG_TYPE(DO_REPEAT_COMMAND) \ + SEG_TYPE(INLINE_DATA) \ + \ + SEG_TYPE(START_DOCUMENT) \ + SEG_TYPE(DOCUMENT) \ + \ + SEG_TYPE(START_COMMAND) \ + SEG_TYPE(SEPARATE_COMMANDS) \ + SEG_TYPE(END_COMMAND) \ + SEG_TYPE(END) \ + \ + SEG_TYPE(EXPECTED_QUOTE) \ + SEG_TYPE(EXPECTED_EXPONENT) \ + SEG_TYPE(UNEXPECTED_DOT) \ + SEG_TYPE(UNEXPECTED_CHAR) + +/* Types of segments. */ +enum segment_type + { +#define SEG_TYPE(NAME) SEG_##NAME, + SEG_TYPES +#undef SEG_TYPE + SEG_N_TYPES + }; + +const char *segment_type_to_string (enum segment_type); + +/* A segmenter. Opaque. */ +struct segmenter + { + unsigned char state; + unsigned char substate; + unsigned char mode; + }; + +void segmenter_init (struct segmenter *, enum segmenter_mode); + +enum segmenter_mode segmenter_get_mode (const struct segmenter *); + +int segmenter_push (struct segmenter *, const char *input, size_t n, + enum segment_type *); + +enum prompt_style segmenter_get_prompt (const struct segmenter *); + +#endif /* segment.h */ diff --git a/src/libpspp/automake.mk b/src/libpspp/automake.mk index a7c92830..fcb28140 100644 --- a/src/libpspp/automake.mk +++ b/src/libpspp/automake.mk @@ -60,6 +60,8 @@ src_libpspp_libpspp_la_SOURCES = \ src/libpspp/msg-locator.h \ src/libpspp/pool.c \ src/libpspp/pool.h \ + src/libpspp/prompt.c \ + src/libpspp/prompt.h \ src/libpspp/range-map.c \ src/libpspp/range-map.h \ src/libpspp/range-set.c \ diff --git a/src/libpspp/prompt.c b/src/libpspp/prompt.c new file mode 100644 index 00000000..0722c3b9 --- /dev/null +++ b/src/libpspp/prompt.c @@ -0,0 +1,42 @@ +/* PSPP - a program for statistical 
analysis. + Copyright (C) 2010 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include + +#include "libpspp/prompt.h" + +const char * +prompt_style_to_string (enum prompt_style style) +{ + switch (style) + { + case PROMPT_FIRST: + return "first"; + case PROMPT_LATER: + return "later"; + case PROMPT_DATA: + return "data"; + case PROMPT_COMMENT: + return "COMMENT"; + case PROMPT_DOCUMENT: + return "DOCUMENT"; + case PROMPT_DO_REPEAT: + return "DO REPEAT"; + default: + return "unknown prompt"; + } +} + diff --git a/src/libpspp/prompt.h b/src/libpspp/prompt.h new file mode 100644 index 00000000..14e820b8 --- /dev/null +++ b/src/libpspp/prompt.h @@ -0,0 +1,32 @@ +/* PSPP - a program for statistical analysis. + Copyright (C) 2010 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. 
*/ + +#ifndef PROMPT_H +#define PROMPT_H 1 + +enum prompt_style + { + PROMPT_FIRST, /* First line of command. */ + PROMPT_LATER, /* Second or later line of command. */ + PROMPT_DATA, /* Between BEGIN DATA and END DATA. */ + PROMPT_COMMENT, /* COMMENT or * command. */ + PROMPT_DOCUMENT, /* DOCUMENT command. */ + PROMPT_DO_REPEAT /* DO REPEAT command. */ + }; + +const char *prompt_style_to_string (enum prompt_style); + +#endif /* prompt.h */ diff --git a/tests/automake.mk b/tests/automake.mk index 0b4a825c..4d49e5ba 100644 --- a/tests/automake.mk +++ b/tests/automake.mk @@ -5,6 +5,7 @@ check_PROGRAMS += \ tests/data/sack \ tests/data/inexactify \ tests/language/lexer/command-name-test \ + tests/language/lexer/segment-test \ tests/libpspp/abt-test \ tests/libpspp/bt-test \ tests/libpspp/encoding-guesser-test \ @@ -210,6 +211,20 @@ tests_language_lexer_command_name_test_LDADD = \ $(LIBINTL) tests_language_lexer_command_name_test_CFLAGS = $(AM_CFLAGS) + +check_PROGRAMS += tests/language/lexer/segment-test +tests_language_lexer_segment_test_SOURCES = \ + src/data/identifier.c \ + src/language/lexer/command-name.c \ + src/language/lexer/segment.c \ + src/libpspp/pool.c \ + src/libpspp/prompt.c \ + src/libpspp/str.c \ + src/libpspp/temp-file.c \ + tests/language/lexer/segment-test.c +tests_language_lexer_segment_test_LDADD = gl/libgl.la $(LIBINTL) +tests_language_lexer_segment_test_CFLAGS = $(AM_CFLAGS) + check_PROGRAMS += tests/output/render-test tests_output_render_test_SOURCES = tests/output/render-test.c tests_output_render_test_LDADD = \ @@ -291,6 +306,7 @@ TESTSUITE_AT = \ tests/language/lexer/command-name.at \ tests/language/lexer/lexer.at \ tests/language/lexer/q2c.at \ + tests/language/lexer/segment.at \ tests/language/lexer/variable-parser.at \ tests/language/stats/aggregate.at \ tests/language/stats/autorecode.at \ diff --git a/tests/language/lexer/segment-test.c b/tests/language/lexer/segment-test.c new file mode 100644 index 00000000..64243c8a --- /dev/null 
+++ b/tests/language/lexer/segment-test.c @@ -0,0 +1,318 @@ +/* PSPP - a program for statistical analysis. + Copyright (C) 2010, 2011 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "libpspp/assertion.h" +#include "libpspp/cast.h" +#include "libpspp/compiler.h" +#include "libpspp/misc.h" +#include "language/lexer/segment.h" + +#include "gl/error.h" +#include "gl/minmax.h" +#include "gl/progname.h" +#include "gl/read-file.h" +#include "gl/xalloc.h" + +/* -a/--auto, -b/--batch, -i/--interactive: syntax mode. */ +static enum segmenter_mode mode = SEG_MODE_AUTO; + +/* -v, --verbose: Print row and column information. */ +static bool verbose; + +/* -1, --one-byte: Feed in one byte at a time? */ +static bool one_byte; + +static const char *parse_options (int argc, char **argv); +static void usage (void) NO_RETURN; + +int +main (int argc, char *argv[]) +{ + size_t offset, line_number, line_offset; + const char *file_name; + char *input; + struct segmenter s; + size_t length; + int prev_type; + + set_program_name (argv[0]); + file_name = parse_options (argc, argv); + + /* Read FILE_NAME (or stdin, if FILE_NAME is "-") into 'input'. Ensure that + 'input' ends in a new-line followed by a null byte. */ + input = (!strcmp (file_name, "-") + ? 
fread_file (stdin, &length) + : read_file (file_name, &length)); + if (input == NULL) + error (EXIT_FAILURE, errno, "reading %s failed", file_name); + input = xrealloc (input, length + 3); + if (length == 0 || input[length - 1] != '\n') + input[length++] = '\n'; + input[length++] = '\0'; + + segmenter_init (&s, mode); + + line_number = 1; + line_offset = 0; + prev_type = -1; + for (offset = 0; offset < length; ) + { + enum segment_type type; + const char *type_name, *p; + int n; + + if (one_byte) + { + int n_newlines = 0; + int i; + + for (i = 0; i <= length - offset; i++) + { + /* Make a copy to ensure that segmenter_push() isn't actually + looking ahead. */ + char *copy; + + if (i > 0 && input[offset + i - 1] == '\n') + n_newlines++; + + copy = xmemdup (input + offset, i); + n = segmenter_push (&s, copy, i, &type); + free (copy); + + if (n >= 0) + break; + } + assert (n_newlines <= 2); + } + else + n = segmenter_push (&s, input + offset, length - offset, &type); + + if (n < 0) + error (EXIT_FAILURE, 0, "segmenter_push returned -1 at offset %zu", + offset); + assert (offset + n <= length); + + if (type == SEG_NEWLINE) + assert ((n == 1 && input[offset] == '\n') + || (n == 2 + && input[offset] == '\r' && input[offset + 1] == '\n')); + else + assert (memchr (&input[offset], '\n', n) == NULL); + + if (!verbose) + { + if (prev_type != SEG_SPACES && prev_type != -1 + && type == SEG_SPACES && n == 1 && input[offset] == ' ') + { + printf (" space\n"); + offset++; + prev_type = -1; + continue; + } + } + if (prev_type != -1) + putchar ('\n'); + prev_type = type; + + if (verbose) + printf ("%2zu:%2zu: ", line_number, offset - line_offset); + + type_name = segment_type_to_string (type); + for (p = type_name; *p != '\0'; p++) + putchar (tolower ((unsigned char) *p)); + if (n > 0) + { + int i; + + for (i = MIN (15, strlen (type_name)); i < 16; i++) + putchar (' '); + for (i = 0; i < n; ) + { + const uint8_t *u_input = CHAR_CAST (const uint8_t *, input); + ucs4_t uc; + int 
mblen; + + mblen = u8_mbtoucr (&uc, u_input + (offset + i), n - i); + if (mblen < 0) + { + int j; + + mblen = u8_mbtouc (&uc, u_input + (offset + i), n - i); + putchar ('<'); + for (j = 0; j < mblen; j++) + { + if (j > 0) + putchar (' '); + printf ("%02x", input[offset + i + j]); + } + putchar ('>'); + } + else + { + switch (uc) + { + case ' ': + printf ("_"); + break; + + case '_': + printf ("\\_"); + break; + + case '\\': + printf ("\\\\"); + break; + + case '\t': + printf ("\\t"); + break; + + case '\r': + printf ("\\r"); + break; + + case '\n': + printf ("\\n"); + break; + + case '\v': + printf ("\\v"); + break; + + default: + if (uc < 0x20 || uc == 0x00a0) + printf ("", uc); + else + fwrite (input + offset + i, 1, mblen, stdout); + break; + } + } + + i += mblen; + } + } + + offset += n; + if (type == SEG_NEWLINE) + { + enum prompt_style prompt; + + line_number++; + line_offset = offset; + + prompt = segmenter_get_prompt (&s); + printf (" (%s)\n", prompt_style_to_string (prompt)); + } + } + putchar ('\n'); + + free (input); + + return 0; +} + +static const char * +parse_options (int argc, char **argv) +{ + for (;;) + { + static const struct option options[] = + { + {"one-byte", no_argument, NULL, '1'}, + {"auto", no_argument, NULL, 'a'}, + {"batch", no_argument, NULL, 'b'}, + {"interactive", no_argument, NULL, 'i'}, + {"verbose", no_argument, NULL, 'v'}, + {"help", no_argument, NULL, 'h'}, + {NULL, 0, NULL, 0}, + }; + + int c = getopt_long (argc, argv, "1abivh", options, NULL); + if (c == -1) + break; + + switch (c) + { + case '1': + one_byte = true; + break; + + case 'a': + mode = SEG_MODE_AUTO; + break; + + case 'b': + mode = SEG_MODE_BATCH; + break; + + case 'i': + mode = SEG_MODE_INTERACTIVE; + break; + + case 'v': + verbose = true; + break; + + case 'h': + usage (); + + case 0: + break; + + case '?': + exit (EXIT_FAILURE); + break; + + default: + NOT_REACHED (); + } + + } + + if (optind + 1 != argc) + error (1, 0, "exactly one non-option argument required; 
" + "use --help for help"); + return argv[optind]; +} + +static void +usage (void) +{ + printf ("\ +%s, to test breaking PSPP syntax into lexical segments\n\ +usage: %s [OPTIONS] INPUT\n\ +\n\ +Options:\n\ + -1, --one-byte feed one byte at a time\n\ + -a, --auto use \"auto\" syntax mode\n\ + -b, --batch use \"batch\" syntax mode\n\ + -i, --interactive use \"interactive\" syntax mode (default)\n\ + -v, --verbose include rows and column numbers in output\n\ + -h, --help print this help message\n", + program_name, program_name); + exit (EXIT_SUCCESS); +} diff --git a/tests/language/lexer/segment.at b/tests/language/lexer/segment.at new file mode 100644 index 00000000..e1dd0b5e --- /dev/null +++ b/tests/language/lexer/segment.at @@ -0,0 +1,1070 @@ +AT_BANNER([syntax segmentation]) +m4_define([PSPP_CHECK_SEGMENT], + [AT_CHECK([segment-test $1 input], [0], [expout]) + AT_CHECK([segment-test -1 $1 input], [0], [expout])]) + +AT_SETUP([identifiers]) +AT_KEYWORDS([segment]) +AT_DATA([input], [dnl +a ab abc abcd +A AB ABC ABCD +aB aBC aBcD +$x $y $z +grève@<00A0>@Ângstrom@<00A0>@poté +#a #b #c ## #d +@efg @ @@. @#@ @&t@ +## # #12345 #.# +f@#_.#6 +GhIjK +.x 1y _z +]) +AT_DATA([expout], [dnl +identifier a space +identifier ab space +identifier abc space +identifier abcd +newline \n (later) + +identifier A space +identifier AB space +identifier ABC space +identifier ABCD +newline \n (later) + +identifier aB space +identifier aBC space +identifier aBcD +newline \n (later) + +identifier $x space +identifier $y space +identifier $z +newline \n (later) + +identifier grève +spaces +identifier Ângstrom +spaces +identifier poté +newline \n (later) + +identifier #a space +identifier #b space +identifier #c space +identifier ## space +identifier #d +newline \n (later) + +identifier @efg space +identifier @ space +identifier @@. 
space +identifier @#@ space +newline \n (later) + +identifier ## space +identifier # space +identifier #12345 space +identifier #.# +newline \n (later) + +identifier f@#\_.#6 +newline \n (later) + +identifier GhIjK +newline \n (later) + +start_command . +identifier x space +number 1 +identifier y space +unexpected_char \_ +identifier z +newline \n (later) + +end +]) +PSPP_CHECK_SEGMENT([-i]) +AT_CLEANUP + +AT_SETUP([identifiers that end in '.']) +AT_KEYWORDS([segment]) +AT_DATA([input], [dnl +abcd. abcd. +ABCD. ABCD. +aBcD. aBcD. @&t@ +$y. $z. あいうえお. +#c. #d.. +@@. @@.... +#.#. +#abcd. +. +. @&t@ +LMNOP. @&t@ +QRSTUV./* end of line comment */ +qrstuv. /* end of line comment */ +QrStUv./* end of line comment */ @&t@ +wxyz./* unterminated end of line comment +WXYZ. /* unterminated end of line comment +WxYz./* unterminated end of line comment @&t@ +]) +AT_DATA([expout], [dnl +identifier abcd. space +identifier abcd +end_command . +newline \n (first) + +identifier ABCD. space +identifier ABCD +end_command . +newline \n (first) + +identifier aBcD. space +identifier aBcD +end_command . space +newline \n (first) + +identifier $y. space +identifier $z. space +identifier あいうえお +end_command . +newline \n (first) + +identifier #c. space +identifier #d. +end_command . +newline \n (first) + +identifier @@. space +identifier @@... +end_command . +newline \n (first) + +identifier #.# +end_command . +newline \n (first) + +identifier #abcd +end_command . +newline \n (first) + +start_command . +newline \n (first) + +start_command . space +newline \n (first) + +identifier LMNOP +end_command . space +newline \n (first) + +identifier QRSTUV +end_command . +comment /*_end_of_line_comment_*/ +newline \n (first) + +identifier qrstuv +end_command . space +comment /*_end_of_line_comment_*/ +newline \n (first) + +identifier QrStUv +end_command . +comment /*_end_of_line_comment_*/ space +newline \n (first) + +identifier wxyz +end_command . 
+comment /*_unterminated_end_of_line_comment +newline \n (first) + +identifier WXYZ +end_command . space +comment /*_unterminated_end_of_line_comment +newline \n (first) + +identifier WxYz +end_command . +comment /*_unterminated_end_of_line_comment_ +newline \n (first) + +end +]) +PSPP_CHECK_SEGMENT([-i]) +AT_CLEANUP + +AT_SETUP([reserved words]) +AT_KEYWORDS([segment]) +AT_DATA([input], [dnl +and or not eq ge gt le lt ne all by to with +AND OR NOT EQ GE GT LE LT NE ALL BY TO WITH +andx orx notx eqx gex gtx lex ltx nex allx byx tox withx +and. with. +]) +AT_DATA([expout], [dnl +reserved_word and space +reserved_word or space +reserved_word not space +reserved_word eq space +reserved_word ge space +reserved_word gt space +reserved_word le space +reserved_word lt space +reserved_word ne space +reserved_word all space +reserved_word by space +reserved_word to space +reserved_word with +newline \n (later) + +reserved_word AND space +reserved_word OR space +reserved_word NOT space +reserved_word EQ space +reserved_word GE space +reserved_word GT space +reserved_word LE space +reserved_word LT space +reserved_word NE space +reserved_word ALL space +reserved_word BY space +reserved_word TO space +reserved_word WITH +newline \n (later) + +identifier andx space +identifier orx space +identifier notx space +identifier eqx space +identifier gex space +identifier gtx space +identifier lex space +identifier ltx space +identifier nex space +identifier allx space +identifier byx space +identifier tox space +identifier withx +newline \n (later) + +identifier and. space +reserved_word with +end_command . 
+newline \n (first) + +end +]) +PSPP_CHECK_SEGMENT([-i]) +AT_CLEANUP + +AT_SETUP([punctuation]) +AT_KEYWORDS([segment]) +AT_DATA([input], [dnl +~ & | = >= > <= < ~= <> ( ) , - + * / [[ ]] ** +~&|=>=><=<~=<>(),-+*/[[]]** +]) +AT_DATA([expout], [dnl +punct ~ space +punct & space +punct | space +punct = space +punct >= space +punct > space +punct <= space +punct < space +punct ~= space +punct <> space +punct ( space +punct ) space +punct , space +punct - space +punct + space +punct * space +punct / space +punct [[ space +punct ]] space +punct ** +newline \n (later) + +punct ~ +punct & +punct | +punct = +punct >= +punct > +punct <= +punct < +punct ~= +punct <> +punct ( +punct ) +punct , +punct - +punct + +punct * +punct / +punct [[ +punct ]] +punct ** +newline \n (later) + +end +]) +PSPP_CHECK_SEGMENT([-i]) +AT_CLEANUP + +AT_SETUP([numbers]) +AT_KEYWORDS([segment]) +AT_DATA([input], [dnl +0 1 01 001. 1. +123. /* comment 1 */ /* comment 2 */ +.1 0.1 00.1 00.10 +5e1 6E-1 7e+1 6E+01 6e-03 +.3E1 .4e-1 .5E+1 .6e+01 .7E-03 +1.23e1 45.6E-1 78.9e+1 99.9E+01 11.2e-03 +. 1e e1 1e+ 1e- +]) +AT_DATA([expout], [dnl +number 0 space +number 1 space +number 01 space +number 001. space +number 1 +end_command . +newline \n (first) + +number 123 +end_command . space +comment /*_comment_1_*/ space +comment /*_comment_2_*/ +newline \n (first) + +start_command . +number 1 space +number 0.1 space +number 00.1 space +number 00.10 +newline \n (later) + +number 5e1 space +number 6E-1 space +number 7e+1 space +number 6E+01 space +number 6e-03 +newline \n (later) + +start_command . +number 3E1 space +number .4e-1 space +number .5E+1 space +number .6e+01 space +number .7E-03 +newline \n (later) + +number 1.23e1 space +number 45.6E-1 space +number 78.9e+1 space +number 99.9E+01 space +number 11.2e-03 +newline \n (later) + +start_command . 
space +expected_exponent 1e space +identifier e1 space +expected_exponent 1e+ space +expected_exponent 1e- +newline \n (later) + +end +]) +PSPP_CHECK_SEGMENT([-i]) +AT_CLEANUP + +AT_SETUP([strings]) +AT_KEYWORDS([segment]) +AT_DATA([input], [dnl +'x' "y" 'abc' +'Don''t' "Can't" 'Won''t' +"""quoted""" '"quoted"' +'' "" +'missing end quote +"missing double quote +x"4142" X'5152' +u'fffd' U"041" ++ new command ++ /* comment */ 'string continuation' ++ /* also a punctuator on blank line +- 'new command' +]) +AT_DATA([expout], [dnl +quoted_string 'x' space +quoted_string "y" space +quoted_string 'abc' +newline \n (later) + +quoted_string 'Don''t' space +quoted_string "Can't" space +quoted_string 'Won''t' +newline \n (later) + +quoted_string """quoted""" space +quoted_string '"quoted"' +newline \n (later) + +quoted_string '' space +quoted_string "" +newline \n (later) + +expected_quote 'missing_end_quote +newline \n (later) + +expected_quote "missing_double_quote +newline \n (later) + +hex_string x"4142" space +hex_string X'5152' +newline \n (later) + +unicode_string u'fffd' space +unicode_string U"041" +newline \n (later) + +start_command + space +identifier new space +identifier command +newline \n (later) + +punct + space +comment /*_comment_*/ space +quoted_string 'string_continuation' +newline \n (later) + +punct + space +comment /*_also_a_punctuator_on_blank_line +newline \n (later) + +start_command - space +quoted_string 'new_command' +newline \n (later) + +end +]) +PSPP_CHECK_SEGMENT([-i]) +AT_CLEANUP + +AT_SETUP([@%:@! construct]) +AT_KEYWORDS([segment]) +AT_DATA([input], [dnl +#! /usr/bin/pspp +title my title. +#! /usr/bin/pspp +]) +AT_DATA([expout], [dnl +shbang #!_/usr/bin/pspp +newline \n (first) + +identifier title space +unquoted_string my_title +end_command . +newline \n (first) + +identifier # +unexpected_char ! 
space +punct / +identifier usr +punct / +identifier bin +punct / +identifier pspp +newline \n (later) + +end +]) +PSPP_CHECK_SEGMENT([-i]) +AT_CLEANUP + +AT_SETUP([* and COMMENT commands]) +AT_KEYWORDS([segment]) +AT_DATA([input], [dnl +* Comment commands "don't +have to contain valid tokens. + +** Check ambiguity with ** token. +****************. + +comment keyword works too. +COMM also. +com is ambiguous with COMPUTE. + + * Comment need not start at left margin. + +* Comment ends with blank line + +next command. + +]) +AT_DATA([expout], [dnl +comment_command *_Comment_commands_"don't +newline \n (COMMENT) + +comment_command have_to_contain_valid_tokens +end_command . +newline \n (first) + +separate_commands +newline \n (first) + +comment_command **_Check_ambiguity_with_**_token +end_command . +newline \n (first) + +comment_command **************** +end_command . +newline \n (first) + +separate_commands +newline \n (first) + +comment_command comment_keyword_works_too +end_command . +newline \n (first) + +comment_command COMM_also +end_command . +newline \n (first) + +identifier com space +identifier is space +identifier ambiguous space +reserved_word with space +identifier COMPUTE +end_command . +newline \n (first) + +separate_commands +newline \n (first) + +spaces ___ +comment_command *_Comment_need_not_start_at_left_margin +end_command . +newline \n (first) + +separate_commands +newline \n (first) + +comment_command *_Comment_ends_with_blank_line +newline \n (COMMENT) + +separate_commands +newline \n (first) + +identifier next space +identifier command +end_command . +newline \n (first) + +separate_commands +newline \n (first) + +end +]) +PSPP_CHECK_SEGMENT([-i]) +AT_CLEANUP + +AT_SETUP([DOCUMENT command]) +AT_KEYWORDS([segment]) +AT_DATA([input], [dnl +DOCUMENT one line. +DOC more + than + one + line. +docu +first.paragraph +isn't parsed as tokens + +second paragraph. +]) +AT_DATA([expout], [dnl +start_document +document DOCUMENT_one_line. 
+end_command +separate_commands +newline \n (first) + +start_document +document DOC_more +newline \n (DOCUMENT) + +document ____than +newline \n (DOCUMENT) + +document ________one +newline \n (DOCUMENT) + +document ____________line. +end_command +separate_commands +newline \n (first) + +start_document +document docu +newline \n (DOCUMENT) + +document first.paragraph +newline \n (DOCUMENT) + +document isn't_parsed_as_tokens +newline \n (DOCUMENT) + +document +newline \n (DOCUMENT) + +document second_paragraph. +end_command +separate_commands +newline \n (first) + +end +]) +PSPP_CHECK_SEGMENT([-i]) +AT_CLEANUP + +AT_SETUP([TITLE, SUBTITLE, FILE LABEL commands]) +AT_KEYWORDS([segment]) +AT_DATA([input], [dnl +title/**/'Quoted string title'. +tit /* +"Quoted string on second line". +sub "Quoted string subtitle" + . + +TITL /* Not a */ quoted string title. +SUBT Not a quoted string /* subtitle + +FIL label isn't quoted. +FILE + lab 'is quoted'. +FILE /* +/**/ lab not quoted here either + +]) +AT_DATA([expout], [dnl +identifier title +comment /**/ +quoted_string 'Quoted_string_title' +end_command . +newline \n (first) + +identifier tit space +comment /* +newline \n (later) + +quoted_string "Quoted_string_on_second_line" +end_command . +newline \n (first) + +identifier sub space +quoted_string "Quoted_string_subtitle" +newline \n (later) + space +end_command . +newline \n (first) + +separate_commands +newline \n (first) + +identifier TITL space +unquoted_string /*_Not_a_*/_quoted_string_title +end_command . +newline \n (first) + +identifier SUBT space +unquoted_string Not_a_quoted_string_/*_subtitle +newline \n (later) + +separate_commands +newline \n (first) + +identifier FIL space +identifier label space +unquoted_string isn't_quoted +end_command . +newline \n (first) + +identifier FILE +newline \n (later) + +spaces __ +identifier lab space +quoted_string 'is_quoted' +end_command . 
+newline \n (first) + +identifier FILE space +comment /* +newline \n (later) + +comment /**/ +spaces __ +identifier lab space +unquoted_string not_quoted_here_either +newline \n (later) + +separate_commands +newline \n (first) + +end +]) +PSPP_CHECK_SEGMENT([-i]) +AT_CLEANUP + +AT_SETUP([BEGIN DATA command]) +AT_KEYWORDS([segment]) +AT_DATA([input], [dnl +begin data. +end data. + +begin data. /* +123 +xxx +end data. + +BEG /**/ DAT /* +5 6 7 /* x + +end data +end data +. + +begin + data. +data +end data. + +begin data "xxx". +begin data 123. +not data +]) +AT_DATA([expout], [dnl +identifier begin space +identifier data +end_command . +newline \n (data) + +identifier end space +identifier data +end_command . +newline \n (first) + +separate_commands +newline \n (first) + +identifier begin space +identifier data +end_command . space +comment /* +newline \n (data) + +inline_data 123 +newline \n (data) + +inline_data xxx +newline \n (data) + +identifier end space +identifier data +end_command . +newline \n (first) + +separate_commands +newline \n (first) + +identifier BEG space +comment /**/ space +identifier DAT space +comment /* +newline \n (data) + +inline_data 5_6_7_/*_x +newline \n (data) + +inline_data +newline \n (data) + +inline_data end__data +newline \n (data) + +identifier end space +identifier data +newline \n (later) + +start_command . +newline \n (first) + +separate_commands +newline \n (first) + +identifier begin +newline \n (later) + space +identifier data +end_command . +newline \n (data) + +inline_data data +newline \n (data) + +identifier end space +identifier data +end_command . +newline \n (first) + +separate_commands +newline \n (first) + +identifier begin space +identifier data space +quoted_string "xxx" +end_command . +newline \n (first) + +identifier begin space +identifier data space +number 123 +end_command . 
+newline \n (first) + +reserved_word not space +identifier data +newline \n (later) + +end +]) +PSPP_CHECK_SEGMENT([-i]) +AT_CLEANUP + +AT_SETUP([DO REPEAT command]) +AT_KEYWORDS([segment]) +AT_DATA([input], [dnl +do repeat x=a b c + y=d e f. + do repeat a=1 thru 5. +another command. +second command ++ third command. +end /* x */ /* y */ repeat print. +end + repeat. +do + repeat #a=1. + inner command. +end repeat. +]) +AT_DATA([expout], [dnl +identifier do space +identifier repeat space +identifier x +punct = +identifier a space +identifier b space +identifier c +newline \n (later) + +spaces __________ +identifier y +punct = +identifier d space +identifier e space +identifier f +end_command . +newline \n (DO REPEAT) + +do_repeat_command __do_repeat_a=1_thru_5. +newline \n (DO REPEAT) + +do_repeat_command another_command. +newline \n (DO REPEAT) + +do_repeat_command second_command +newline \n (DO REPEAT) + +do_repeat_command +_third_command. +newline \n (DO REPEAT) + +do_repeat_command end_/*_x_*/_/*_y_*/_repeat_print. +newline \n (DO REPEAT) + +identifier end +newline \n (later) + space +identifier repeat +end_command . +newline \n (first) + +identifier do +newline \n (later) + +spaces __ +identifier repeat space +identifier #a +punct = +number 1 +end_command . +newline \n (DO REPEAT) + +do_repeat_command __inner_command. +newline \n (DO REPEAT) + +identifier end space +identifier repeat +end_command . +newline \n (first) + +end +]) +PSPP_CHECK_SEGMENT([-i]) +AT_CLEANUP + +AT_SETUP([batch mode]) +AT_KEYWORDS([segment]) +AT_DATA([input], [dnl +first command + another line of first command ++ second command +third command + +fourth command. + fifth command. 
+]) +AT_DATA([expout], [dnl +identifier first space +identifier command +newline \n (later) + +spaces _____ +identifier another space +identifier line space +identifier of space +identifier first space +identifier command +newline \n (later) + +start_command + +spaces __ +identifier second space +identifier command +newline \n (later) + +start_command +identifier third space +identifier command +newline \n (later) + +separate_commands +newline \n (first) + +identifier fourth space +identifier command +end_command . +newline \n (first) + +spaces ___ +identifier fifth space +identifier command +end_command . +newline \n (first) + +end +]) +PSPP_CHECK_SEGMENT([-b]) +AT_CLEANUP + +AT_SETUP([auto mode]) +AT_KEYWORDS([segment]) +AT_DATA([input], [dnl +command + another line of command +2sls ++ another command +another line of second command +data list /x 1 +aggregate. +print eject. +twostep cluster + + +fourth command. + fifth command. +]) +AT_DATA([expout], [dnl +identifier command +newline \n (later) + +spaces _____ +identifier another space +identifier line space +identifier of space +identifier command +newline \n (later) + +start_command +number 2 +identifier sls +newline \n (later) + +start_command + +spaces __ +identifier another space +identifier command +newline \n (later) + +identifier another space +identifier line space +identifier of space +identifier second space +identifier command +newline \n (later) + +start_command +identifier data space +identifier list space +punct / +identifier x space +number 1 +newline \n (later) + +start_command +identifier aggregate +end_command . +newline \n (first) + +identifier print space +identifier eject +end_command . +newline \n (first) + +identifier twostep space +identifier cluster +newline \n (later) + +separate_commands +newline \n (first) + +separate_commands +newline \n (first) + +identifier fourth space +identifier command +end_command . 
+newline \n (first) + +spaces ___ +identifier fifth space +identifier command +end_command . +newline \n (first) + +end +]) +PSPP_CHECK_SEGMENT([-a]) +AT_CLEANUP