From afdf3096926b561f4e6511c10fcf73fc6796b9d2 Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Sat, 19 Mar 2011 16:32:16 -0700 Subject: [PATCH] scan: New library for high-level PSPP syntax lexical analysis. This library converts a stream of segments output by the "segment" library into PSPP tokens. --- Smake | 1 + src/language/lexer/automake.mk | 4 + src/language/lexer/scan.c | 596 ++++++++++++++++++++++ src/language/lexer/scan.h | 93 ++++ src/language/lexer/token.c | 173 +++++++ src/language/lexer/token.h | 45 ++ tests/automake.mk | 16 + tests/language/lexer/scan-test.c | 217 ++++++++ tests/language/lexer/scan.at | 818 +++++++++++++++++++++++++++++++ 9 files changed, 1963 insertions(+) create mode 100644 src/language/lexer/scan.c create mode 100644 src/language/lexer/scan.h create mode 100644 src/language/lexer/token.c create mode 100644 src/language/lexer/token.h create mode 100644 tests/language/lexer/scan-test.c create mode 100644 tests/language/lexer/scan.at diff --git a/Smake b/Smake index 14c2a75f..683a8e38 100644 --- a/Smake +++ b/Smake @@ -71,6 +71,7 @@ GNULIB_MODULES = \ sys_stat \ tempname \ trunc \ + unictype/ctype-print \ unictype/property-id-continue \ unictype/property-id-start \ unigbrk/uc-is-grapheme-break \ diff --git a/src/language/lexer/automake.mk b/src/language/lexer/automake.mk index b3d06fec..be48873e 100644 --- a/src/language/lexer/automake.mk +++ b/src/language/lexer/automake.mk @@ -10,8 +10,12 @@ language_lexer_sources = \ src/language/lexer/subcommand-list.h \ src/language/lexer/format-parser.c \ src/language/lexer/format-parser.h \ + src/language/lexer/scan.c \ + src/language/lexer/scan.h \ src/language/lexer/segment.c \ src/language/lexer/segment.h \ + src/language/lexer/token.c \ + src/language/lexer/token.h \ src/language/lexer/value-parser.c \ src/language/lexer/value-parser.h \ src/language/lexer/variable-parser.c \ diff --git a/src/language/lexer/scan.c b/src/language/lexer/scan.c new file mode 100644 index 00000000..caf294a9 --- 
/dev/null +++ b/src/language/lexer/scan.c @@ -0,0 +1,596 @@ +/* PSPP - a program for statistical analysis. + Copyright (C) 2010, 2011 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . */ + +#include + +#include "language/lexer/scan.h" + +#include +#include + +#include "data/identifier.h" +#include "language/lexer/token.h" +#include "libpspp/assertion.h" +#include "libpspp/cast.h" + +#include "gl/c-ctype.h" +#include "gl/xmemdup0.h" + +enum + { + S_START, + S_DASH, + S_STRING + }; + +#define SS_NL_BEFORE_PLUS (1u << 0) +#define SS_PLUS (1u << 1) +#define SS_NL_AFTER_PLUS (1u << 2) + +/* Returns the integer value of (hex) digit C. */ +static int +digit_value (int c) +{ + switch (c) + { + case '0': return 0; + case '1': return 1; + case '2': return 2; + case '3': return 3; + case '4': return 4; + case '5': return 5; + case '6': return 6; + case '7': return 7; + case '8': return 8; + case '9': return 9; + case 'a': case 'A': return 10; + case 'b': case 'B': return 11; + case 'c': case 'C': return 12; + case 'd': case 'D': return 13; + case 'e': case 'E': return 14; + case 'f': case 'F': return 15; + default: return INT_MAX; + } +} + +static bool +scan_quoted_string__ (struct substring s, struct token *token) +{ + int quote; + + /* Trim ' or " from front and back. 
*/ + quote = s.string[s.length - 1]; + s.string++; + s.length -= 2; + + ss_realloc (&token->string, token->string.length + s.length + 1); + + for (;;) + { + size_t pos = ss_find_byte (s, quote); + if (pos == SIZE_MAX) + break; + + memcpy (ss_end (token->string), s.string, pos + 1); + token->string.length += pos + 1; + ss_advance (&s, pos + 2); + } + + memcpy (ss_end (token->string), s.string, ss_length (s)); + token->string.length += ss_length (s); + + return true; +} + +static bool +scan_hex_string__ (struct substring s, struct token *token) +{ + uint8_t *dst; + size_t i; + + /* Trim X' from front and ' from back. */ + s.string += 2; + s.length -= 3; + + if (s.length % 2 != 0) + { + token->type = SCAN_BAD_HEX_LENGTH; + token->number = s.length; + return false; + } + + ss_realloc (&token->string, token->string.length + s.length / 2 + 1); + dst = CHAR_CAST (uint8_t *, ss_end (token->string)); + token->string.length += s.length / 2; + for (i = 0; i < s.length; i += 2) + { + int hi = digit_value (s.string[i]); + int lo = digit_value (s.string[i + 1]); + + if (hi >= 16 || lo >= 16) + { + token->type = SCAN_BAD_HEX_DIGIT; + token->number = s.string[hi >= 16 ? i : i + 1]; + return false; + } + + *dst++ = hi * 16 + lo; + } + + return true; +} + +static bool +scan_unicode_string__ (struct substring s, struct token *token) +{ + uint8_t *dst; + ucs4_t uc; + size_t i; + + /* Trim U' from front and ' from back. 
*/ + s.string += 2; + s.length -= 3; + + if (s.length < 1 || s.length > 8) + { + token->type = SCAN_BAD_UNICODE_LENGTH; + token->number = s.length; + return 0; + } + + ss_realloc (&token->string, token->string.length + 4 + 1); + + uc = 0; + for (i = 0; i < s.length; i++) + { + int digit = digit_value (s.string[i]); + if (digit >= 16) + { + token->type = SCAN_BAD_UNICODE_DIGIT; + token->number = s.string[i]; + return 0; + } + uc = uc * 16 + digit; + } + + if ((uc >= 0xd800 && uc < 0xe000) || uc > 0x10ffff) + { + token->type = SCAN_BAD_UNICODE_CODE_POINT; + token->number = uc; + return 0; + } + + dst = CHAR_CAST (uint8_t *, ss_end (token->string)); + token->string.length += u8_uctomb (dst, uc, 4); + + return true; +} + +static enum scan_result +scan_string_segment__ (struct scanner *scanner, enum segment_type type, + struct substring s, struct token *token) +{ + bool ok; + + switch (type) + { + case SEG_QUOTED_STRING: + ok = scan_quoted_string__ (s, token); + break; + + case SEG_HEX_STRING: + ok = scan_hex_string__ (s, token); + break; + + case SEG_UNICODE_STRING: + ok = scan_unicode_string__ (s, token); + break; + + default: + NOT_REACHED (); + } + + if (ok) + { + token->type = T_STRING; + token->string.string[token->string.length] = '\0'; + scanner->state = S_STRING; + scanner->substate = 0; + return SCAN_SAVE; + } + else + { + /* The function we called above should have filled in token->type and + token->number properly to describe the error. 
*/ + ss_dealloc (&token->string); + token->string = ss_empty (); + return SCAN_DONE; + } + +} + +static enum scan_result +add_bit (struct scanner *scanner, unsigned int bit) +{ + if (!(scanner->substate & bit)) + { + scanner->substate |= bit; + return SCAN_MORE; + } + else + return SCAN_BACK; +} + +static enum scan_result +scan_string__ (struct scanner *scanner, enum segment_type type, + struct substring s, struct token *token) +{ + switch (type) + { + case SEG_SPACES: + case SEG_COMMENT: + return SCAN_MORE; + + case SEG_NEWLINE: + if (scanner->substate & SS_PLUS) + return add_bit (scanner, SS_NL_AFTER_PLUS); + else + return add_bit (scanner, SS_NL_BEFORE_PLUS); + + case SEG_PUNCT: + return (s.length == 1 && s.string[0] == '+' + ? add_bit (scanner, SS_PLUS) + : SCAN_BACK); + + case SEG_QUOTED_STRING: + case SEG_HEX_STRING: + case SEG_UNICODE_STRING: + return (scanner->substate & SS_PLUS + ? scan_string_segment__ (scanner, type, s, token) + : SCAN_BACK); + + default: + return SCAN_BACK; + } +} + +static enum token_type +scan_reserved_word__ (struct substring word) +{ + switch (c_toupper (word.string[0])) + { + case 'B': + return T_BY; + + case 'E': + return T_EQ; + + case 'G': + return c_toupper (word.string[1]) == 'E' ? T_GE : T_GT; + + case 'L': + return c_toupper (word.string[1]) == 'E' ? T_LE : T_LT; + + case 'N': + return word.length == 2 ? T_NE : T_NOT; + + case 'O': + return T_OR; + + case 'T': + return T_TO; + + case 'A': + return c_toupper (word.string[1]) == 'L' ? 
T_ALL : T_AND; + + case 'W': + return T_WITH; + } + + NOT_REACHED (); +} + +static enum token_type +scan_punct1__ (char c0) +{ + switch (c0) + { + case '(': return T_LPAREN; + case ')': return T_RPAREN; + case ',': return T_COMMA; + case '=': return T_EQUALS; + case '-': return T_DASH; + case '[': return T_LBRACK; + case ']': return T_RBRACK; + case '&': return T_AND; + case '|': return T_OR; + case '+': return T_PLUS; + case '/': return T_SLASH; + case '*': return T_ASTERISK; + case '<': return T_LT; + case '>': return T_GT; + case '~': return T_NOT; + } + + NOT_REACHED (); +} + +static enum token_type +scan_punct2__ (char c0, char c1) +{ + switch (c0) + { + case '*': + return T_EXP; + + case '<': + return c1 == '=' ? T_LE : T_NE; + + case '>': + return T_GE; + + case '~': + return T_NE; + + case '&': + return T_AND; + + case '|': + return T_OR; + } + + NOT_REACHED (); +} + +static enum token_type +scan_punct__ (struct substring s) +{ + return (s.length == 1 + ? scan_punct1__ (s.string[0]) + : scan_punct2__ (s.string[0], s.string[1])); +} + +static double +scan_number__ (struct substring s) +{ + char buf[128]; + double number; + char *p; + + if (s.length < sizeof buf) + { + p = buf; + memcpy (buf, s.string, s.length); + buf[s.length] = '\0'; + } + else + p = xmemdup0 (s.string, s.length); + + number = strtod (p, NULL); + + if (p != buf) + free (p); + + return number; +} + +static enum scan_result +scan_unexpected_char (const struct substring *s, struct token *token) +{ + ucs4_t uc; + + token->type = SCAN_UNEXPECTED_CHAR; + u8_mbtouc (&uc, CHAR_CAST (const uint8_t *, s->string), s->length); + token->number = uc; + + return SCAN_DONE; +} + +const char * +scan_type_to_string (enum scan_type type) +{ + switch (type) + { +#define SCAN_TYPE(NAME) case SCAN_##NAME: return #NAME; + SCAN_TYPES +#undef SCAN_TYPE + + default: + return token_type_to_name (type); + } +} + +bool +is_scan_type (enum scan_type type) +{ + return type > SCAN_FIRST && type < SCAN_LAST; +} + +static 
enum scan_result +scan_start__ (struct scanner *scanner, enum segment_type type, + struct substring s, struct token *token) +{ + switch (type) + { + case SEG_NUMBER: + token->type = T_POS_NUM; + token->number = scan_number__ (s); + return SCAN_DONE; + + case SEG_QUOTED_STRING: + case SEG_HEX_STRING: + case SEG_UNICODE_STRING: + return scan_string_segment__ (scanner, type, s, token); + + case SEG_UNQUOTED_STRING: + case SEG_DO_REPEAT_COMMAND: + case SEG_INLINE_DATA: + case SEG_DOCUMENT: + token->type = T_STRING; + ss_alloc_substring (&token->string, s); + return SCAN_DONE; + + case SEG_RESERVED_WORD: + token->type = scan_reserved_word__ (s); + return SCAN_DONE; + + case SEG_IDENTIFIER: + token->type = T_ID; + ss_alloc_substring (&token->string, s); + return SCAN_DONE; + + case SEG_PUNCT: + if (s.length == 1 && s.string[0] == '-') + { + scanner->state = S_DASH; + return SCAN_SAVE; + } + else + { + token->type = scan_punct__ (s); + return SCAN_DONE; + } + + case SEG_SHBANG: + case SEG_SPACES: + case SEG_COMMENT: + case SEG_NEWLINE: + case SEG_COMMENT_COMMAND: + token->type = SCAN_SKIP; + return SCAN_DONE; + + case SEG_START_DOCUMENT: + token->type = T_ID; + ss_alloc_substring (&token->string, ss_cstr ("DOCUMENT")); + return SCAN_DONE; + + case SEG_START_COMMAND: + case SEG_SEPARATE_COMMANDS: + case SEG_END_COMMAND: + token->type = T_ENDCMD; + return SCAN_DONE; + + case SEG_END: + token->type = T_STOP; + return SCAN_DONE; + + case SEG_EXPECTED_QUOTE: + token->type = SCAN_EXPECTED_QUOTE; + return SCAN_DONE; + + case SEG_EXPECTED_EXPONENT: + token->type = SCAN_EXPECTED_EXPONENT; + ss_alloc_substring (&token->string, s); + return SCAN_DONE; + + case SEG_UNEXPECTED_DOT: + token->type = SCAN_UNEXPECTED_DOT; + return SCAN_DONE; + + case SEG_UNEXPECTED_CHAR: + return scan_unexpected_char (&s, token); + + case SEG_N_TYPES: + NOT_REACHED (); + } + + NOT_REACHED (); +} + +static enum scan_result +scan_dash__ (enum segment_type type, struct substring s, struct token *token) +{ + 
switch (type) + { + case SEG_SPACES: + case SEG_COMMENT: + return SCAN_MORE; + + case SEG_NUMBER: + token->type = T_NEG_NUM; + token->number = -scan_number__ (s); + return SCAN_DONE; + + default: + token->type = T_DASH; + return SCAN_BACK; + } +} + +/* Initializes SCANNER for scanning a token from a sequence of segments. + Initializes TOKEN as the output token. (The client retains ownership of + TOKEN, but it must be preserved across subsequent calls to scanner_push() + for SCANNER.) + + A scanner only produces a single token. To obtain the next token, + re-initialize it by calling this function again. + + A scanner does not contain any external references, so nothing needs to be + done to destroy one. For the same reason, scanners may be copied with plain + struct assignment (or memcpy). */ +void +scanner_init (struct scanner *scanner, struct token *token) +{ + scanner->state = S_START; + token_init (token); +} + +/* Adds the segment with type TYPE and UTF-8 text S to SCANNER. TOKEN must be + the same token passed to scanner_init() for SCANNER, or a copy of it. + scanner_push() may modify TOKEN. The client retains ownership of TOKEN. + + The possible return values are: + + - SCAN_DONE: All of the segments that have been passed to scanner_push() + form the token now stored in TOKEN. SCANNER is now "used up" and must + be reinitialized with scanner_init() if it is to be used again. + + Most tokens only consist of a single segment, so this is the most common + return value. + + - SCAN_MORE: The segments passed to scanner_push() don't yet determine a + token. The caller should call scanner_push() again with the next segment. + (This won't happen if TYPE is SEG_END indicating the end of input.) + + - SCAN_SAVE: This is similar to SCAN_MORE, with one difference: the caller + needs to "save its place" in the stream of segments for a possible + future SCAN_BACK return. 
This value can be returned more than once in a + sequence of scanner_push() calls for SCANNER, but the caller only needs + to keep track of the most recent position. + + - SCAN_BACK: This is similar to SCAN_DONE, but the token consists of only + the segments up to and including the segment for which SCAN_SAVE was + most recently returned. Segments following that one should be passed to + the next scanner to be initialized. +*/ +enum scan_result +scanner_push (struct scanner *scanner, enum segment_type type, + struct substring s, struct token *token) +{ + switch (scanner->state) + { + case S_START: + return scan_start__ (scanner, type, s, token); + + case S_DASH: + return scan_dash__ (type, s, token); + + case S_STRING: + return scan_string__ (scanner, type, s, token); + } + + NOT_REACHED (); +} diff --git a/src/language/lexer/scan.h b/src/language/lexer/scan.h new file mode 100644 index 00000000..fdb50801 --- /dev/null +++ b/src/language/lexer/scan.h @@ -0,0 +1,93 @@ +/* PSPP - a program for statistical analysis. + Copyright (C) 2010, 2011 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . */ + +#ifndef SCAN_H +#define SCAN_H 1 + +#include "language/lexer/segment.h" +#include "libpspp/str.h" + +struct token; + +/* PSPP syntax scanning. 
+ + PSPP divides traditional "lexical analysis" or "tokenization" into two + phases: a lower-level phase called "segmentation" and a higher-level phase + called "scanning". segment.h provides declarations for the segmentation + phase. This header file contains declarations for the scanning phase. + + Scanning accepts as input a stream of segments, which are UTF-8 strings each + labeled with a segment type. It outputs a stream of "scan tokens", which + are the same as the tokens used by the PSPP parser with a few additional + types. +*/ + +#define SCAN_TYPES \ + SCAN_TYPE(BAD_HEX_LENGTH) \ + SCAN_TYPE(BAD_HEX_DIGIT) \ + \ + SCAN_TYPE(BAD_UNICODE_LENGTH) \ + SCAN_TYPE(BAD_UNICODE_DIGIT) \ + SCAN_TYPE(BAD_UNICODE_CODE_POINT) \ + \ + SCAN_TYPE(EXPECTED_QUOTE) \ + SCAN_TYPE(EXPECTED_EXPONENT) \ + SCAN_TYPE(UNEXPECTED_DOT) \ + SCAN_TYPE(UNEXPECTED_CHAR) \ + \ + SCAN_TYPE(SKIP) + +/* Types of scan tokens. + + Scan token types are a superset of enum token_type. Only the additional + scan token types are defined here, so see the definition of enum token_type + for the others. */ +enum scan_type + { +#define SCAN_TYPE(TYPE) SCAN_##TYPE, + SCAN_FIRST = 255, + SCAN_TYPES + SCAN_LAST +#undef SCAN_TYPE + }; + +const char *scan_type_to_string (enum scan_type); +bool is_scan_type (enum scan_type); + +/* A scanner. Opaque. */ +struct scanner + { + unsigned char state; + unsigned char substate; + }; + +/* scanner_push() return type. */ +enum scan_result + { + /* Results that do not involve a saved position. */ + SCAN_DONE, /* Token successfully scanned. */ + SCAN_MORE, /* More segments needed to scan token. */ + + /* Results that involve a saved position. */ + SCAN_BACK, /* Done, but go back to saved position too. */ + SCAN_SAVE /* Need more segments, and save position. 
*/ + }; + +void scanner_init (struct scanner *, struct token *); +enum scan_result scanner_push (struct scanner *, enum segment_type, + struct substring, struct token *); + +#endif /* scan.h */ diff --git a/src/language/lexer/token.c b/src/language/lexer/token.c new file mode 100644 index 00000000..89a5cf01 --- /dev/null +++ b/src/language/lexer/token.c @@ -0,0 +1,173 @@ +/* PSPP - a program for statistical analysis. + Copyright (C) 2010, 2011 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . */ + +#include + +#include "language/lexer/token.h" + +#include +#include +#include + +#include "data/identifier.h" +#include "libpspp/assertion.h" +#include "libpspp/cast.h" + +#include "gl/ftoastr.h" +#include "gl/xalloc.h" + +/* Initializes TOKEN with an arbitrary type, number 0, and a null string. */ +void +token_init (struct token *token) +{ + token->type = 0; + token->number = 0.0; + token->string = ss_empty (); +} + +/* Frees the string that TOKEN contains. */ +void +token_destroy (struct token *token) +{ + if (token != NULL) + ss_dealloc (&token->string); +} + +static char * +number_token_to_string (const struct token *token) +{ + char buffer[DBL_BUFSIZE_BOUND]; + + dtoastr (buffer, sizeof buffer, 0, 0, fabs (token->number)); + return (token->type == T_POS_NUM + ? 
xstrdup (buffer) + : xasprintf ("-%s", buffer)); +} + +static char * +quoted_string_representation (struct substring ss, size_t n_quotes) +{ + char *rep; + size_t i; + char *p; + + p = rep = xmalloc (1 + ss.length + n_quotes + 1 + 1); + *p++ = '\''; + for (i = 0; i < ss.length; i++) + { + uint8_t c = ss.string[i]; + if (c == '\'') + *p++ = c; + *p++ = c; + } + *p++ = '\''; + *p = '\0'; + + return rep; +} + +static char * +hex_string_representation (struct substring ss) +{ + char *rep; + size_t i; + char *p; + + p = rep = xmalloc (2 + 2 * ss.length + 1 + 1); + *p++ = 'X'; + *p++ = '\''; + for (i = 0; i < ss.length; i++) + { + static const char hex_digits[] = "0123456789abcdef"; + uint8_t c = ss.string[i]; + *p++ = hex_digits[c >> 4]; + *p++ = hex_digits[c & 15]; + } + *p++ = '\''; + *p = '\0'; + + return rep; +} + +static char * +string_representation (struct substring ss) +{ + size_t n_quotes; + size_t ofs; + int mblen; + + n_quotes = 0; + for (ofs = 0; ofs < ss.length; ofs += mblen) + { + ucs4_t uc; + + mblen = u8_mbtoucr (&uc, + CHAR_CAST (const uint8_t *, ss.string + ofs), + ss.length - ofs); + if (mblen < 0 || !uc_is_print (uc)) + return hex_string_representation (ss); + else if (uc == '\'') + n_quotes++; + } + return quoted_string_representation (ss, n_quotes); +} + +/* Returns a UTF-8 string that would yield TOKEN if it appeared in a syntax + file. The caller should free the returned string, with free(), when it is + no longer needed. + + The T_STOP token has no representation, so this function returns NULL. */ +char * +token_to_string (const struct token *token) +{ + const char *name; + + switch (token->type) + { + case T_POS_NUM: + case T_NEG_NUM: + return number_token_to_string (token); + + case T_ID: + return ss_xstrdup (token->string); + + case T_STRING: + return string_representation (token->string); + + default: + name = token_type_to_name (token->type); + return name != NULL ? xstrdup (name) : NULL; + } +} + +/* Prints TOKEN on STREAM, for debugging. 
*/ +void +token_print (const struct token *token, FILE *stream) +{ + fputs (token_type_to_name (token->type), stream); + if (token->type == T_POS_NUM || token->type == T_NEG_NUM + || token->number != 0.0) + { + char s[DBL_BUFSIZE_BOUND]; + + dtoastr (s, sizeof s, 0, 0, token->number); + fprintf (stream, "\t%s", s); + } + if (token->type == T_ID || token->type == T_STRING || token->string.length) + fprintf (stream, "\t\"%.*s\"", + (int) token->string.length, token->string.string); + putc ('\n', stream); +} diff --git a/src/language/lexer/token.h b/src/language/lexer/token.h new file mode 100644 index 00000000..8feaf814 --- /dev/null +++ b/src/language/lexer/token.h @@ -0,0 +1,45 @@ +/* PSPP - a program for statistical analysis. + Copyright (C) 2010, 2011 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . */ + +#ifndef TOKEN_H +#define TOKEN_H 1 + +#include +#include "libpspp/str.h" +#include "data/identifier.h" + +/* A PSPP syntax token. + + The 'type' member is used by the scanner (see scan.h) for SCAN_* values as + well, which is why it is not declared as type "enum token_type". */ +struct token + { + int type; /* Usually a "enum token_type" value. 
*/ + double number; + struct substring string; + }; + +#define TOKEN_INITIALIZER(TYPE, NUMBER, STRING) \ + { TYPE, NUMBER, SS_LITERAL_INITIALIZER (STRING) } + +void token_init (struct token *); +void token_destroy (struct token *); + +char *token_to_string (const struct token *); + +void token_print (const struct token *, FILE *); + +#endif /* token.h */ diff --git a/tests/automake.mk b/tests/automake.mk index 4d49e5ba..484ef0e1 100644 --- a/tests/automake.mk +++ b/tests/automake.mk @@ -5,6 +5,7 @@ check_PROGRAMS += \ tests/data/sack \ tests/data/inexactify \ tests/language/lexer/command-name-test \ + tests/language/lexer/scan-test \ tests/language/lexer/segment-test \ tests/libpspp/abt-test \ tests/libpspp/bt-test \ @@ -211,6 +212,20 @@ tests_language_lexer_command_name_test_LDADD = \ $(LIBINTL) tests_language_lexer_command_name_test_CFLAGS = $(AM_CFLAGS) +check_PROGRAMS += tests/language/lexer/scan-test +tests_language_lexer_scan_test_SOURCES = \ + src/data/identifier.c \ + src/language/lexer/command-name.c \ + src/language/lexer/scan.c \ + src/language/lexer/segment.c \ + src/language/lexer/token.c \ + src/libpspp/pool.c \ + src/libpspp/prompt.c \ + src/libpspp/str.c \ + src/libpspp/temp-file.c \ + tests/language/lexer/scan-test.c +tests_language_lexer_scan_test_LDADD = gl/libgl.la $(LIBINTL) +tests_language_lexer_scan_test_CFLAGS = $(AM_CFLAGS) check_PROGRAMS += tests/language/lexer/segment-test tests_language_lexer_segment_test_SOURCES = \ @@ -306,6 +321,7 @@ TESTSUITE_AT = \ tests/language/lexer/command-name.at \ tests/language/lexer/lexer.at \ tests/language/lexer/q2c.at \ + tests/language/lexer/scan.at \ tests/language/lexer/segment.at \ tests/language/lexer/variable-parser.at \ tests/language/stats/aggregate.at \ diff --git a/tests/language/lexer/scan-test.c b/tests/language/lexer/scan-test.c new file mode 100644 index 00000000..a56dfd74 --- /dev/null +++ b/tests/language/lexer/scan-test.c @@ -0,0 +1,217 @@ +/* PSPP - a program for statistical analysis. 
+ Copyright (C) 2010, 2011 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . */ + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "libpspp/assertion.h" +#include "libpspp/compiler.h" +#include "libpspp/misc.h" +#include "language/lexer/scan.h" +#include "language/lexer/token.h" + +#include "gl/error.h" +#include "gl/ftoastr.h" +#include "gl/progname.h" +#include "gl/read-file.h" +#include "gl/xalloc.h" + +/* -a/--auto, -b/--batch, -i/--interactive: syntax mode. */ +static enum segmenter_mode mode = SEG_MODE_AUTO; + +static const char *parse_options (int argc, char **argv); +static void usage (void) NO_RETURN; + +int +main (int argc, char *argv[]) +{ + struct segment + { + enum segment_type type; + struct substring string; + }; + + size_t offset; + const char *file_name; + char *input; + struct segmenter s; + struct segment *segs; + size_t n_segs, allocated_segs; + size_t length; + size_t i; + int n; + + set_program_name (argv[0]); + file_name = parse_options (argc, argv); + + /* Read the file named FILE_NAME, or stdin if it is "-", into 'input'. + Ensure that 'input' ends in a new-line followed by a null byte. */ + input = (!strcmp (file_name, "-") + ? 
fread_file (stdin, &length) + : read_file (file_name, &length)); + if (input == NULL) + error (EXIT_FAILURE, errno, "reading %s failed", file_name); + input = xrealloc (input, length + 3); + if (length == 0 || input[length - 1] != '\n') + input[length++] = '\n'; + input[length++] = '\0'; + + segs = NULL; + n_segs = allocated_segs = 0; + + segmenter_init (&s, mode); + for (offset = 0; offset < length; offset += n) + { + enum segment_type type; + + n = segmenter_push (&s, input + offset, length - offset, &type); + assert (n >= 0); + assert (offset + n <= length); + + if (n_segs >= allocated_segs) + segs = x2nrealloc (segs, &allocated_segs, sizeof *segs); + + segs[n_segs].type = type; + segs[n_segs].string.string = input + offset; + segs[n_segs].string.length = n; + n_segs++; + } + + for (i = 0; i < n_segs; ) + { + enum scan_result result; + struct scanner scanner; + struct token token; + int saved = -1; + + scanner_init (&scanner, &token); + do + { + struct segment *seg; + + assert (i < n_segs); + + seg = &segs[i++]; + result = scanner_push (&scanner, seg->type, seg->string, &token); + if (result == SCAN_SAVE) + saved = i; + } + while (result == SCAN_MORE || result == SCAN_SAVE); + + if (result == SCAN_BACK) + { + assert (saved >= 0); + i = saved; + } + + printf ("%s", scan_type_to_string (token.type)); + if (token.number != 0.0) + { + char s[DBL_BUFSIZE_BOUND]; + + dtoastr (s, sizeof s, 0, 0, token.number); + printf (" %s", s); + } + if (token.string.string != NULL || token.string.length > 0) + printf (" \"%.*s\"", (int) token.string.length, token.string.string); + printf ("\n"); + + token_destroy (&token); + } + + free (input); + + return 0; +} + +static const char * +parse_options (int argc, char **argv) +{ + for (;;) + { + static const struct option options[] = + { + {"auto", no_argument, NULL, 'a'}, + {"batch", no_argument, NULL, 'b'}, + {"interactive", no_argument, NULL, 'i'}, + {"help", no_argument, NULL, 'h'}, + {NULL, 0, NULL, 0}, + }; + + int c = 
getopt_long (argc, argv, "abih", options, NULL); + if (c == -1) + break; + + switch (c) + { + case 'a': + mode = SEG_MODE_AUTO; + break; + + case 'b': + mode = SEG_MODE_BATCH; + break; + + case 'i': + mode = SEG_MODE_INTERACTIVE; + break; + + case 'h': + usage (); + + case 0: + break; + + case '?': + exit (EXIT_FAILURE); + break; + + default: + NOT_REACHED (); + } + + } + + if (optind + 1 != argc) + error (1, 0, "exactly one non-option argument required; " + "use --help for help"); + return argv[optind]; +} + +static void +usage (void) +{ + printf ("\ +%s, to test breaking PSPP syntax into tokens\n\ +usage: %s [OPTIONS] INPUT\n\ +\n\ +Options:\n\ + -1, --one-segment feed one segment at a time\n\ + -a, --auto use \"auto\" syntax mode\n\ + -b, --batch use \"batch\" syntax mode\n\ + -i, --interactive use \"interactive\" syntax mode (default)\n\ + -v, --verbose include rows and column numbers in output\n\ + -h, --help print this help message\n", + program_name, program_name); + exit (EXIT_SUCCESS); +} diff --git a/tests/language/lexer/scan.at b/tests/language/lexer/scan.at new file mode 100644 index 00000000..50ee123d --- /dev/null +++ b/tests/language/lexer/scan.at @@ -0,0 +1,818 @@ +AT_BANNER([syntax scanning]) +m4_define([PSPP_CHECK_SCAN], + [AT_CHECK([scan-test $1 input], [0], [expout])]) + +AT_SETUP([identifiers]) +AT_KEYWORDS([scan]) +AT_DATA([input], [dnl +a aB i5 $x @efg @@. #.# .x _z. +abcd. abcd. +QRSTUV./* end of line comment */ +QrStUv./* end of line comment */ @&t@ +WXYZ. /* unterminated end of line comment +�. /* U+FFFD is not valid in an identifier +]) +AT_DATA([expout], [dnl +ID "a" +SKIP +ID "aB" +SKIP +ID "i5" +SKIP +ID "$x" +SKIP +ID "@efg" +SKIP +ID "@@." +SKIP +ID "#.#" +SKIP +UNEXPECTED_DOT +ID "x" +SKIP +UNEXPECTED_CHAR 95 +ID "z" +ENDCMD +SKIP +ID "abcd." 
+SKIP +ID "abcd" +ENDCMD +SKIP +ID "QRSTUV" +ENDCMD +SKIP +SKIP +ID "QrStUv" +ENDCMD +SKIP +SKIP +SKIP +ID "WXYZ" +ENDCMD +SKIP +SKIP +SKIP +UNEXPECTED_CHAR 65533 +ENDCMD +SKIP +SKIP +SKIP +STOP +]) +PSPP_CHECK_SCAN([-i]) +AT_CLEANUP + +AT_SETUP([reserved words]) +AT_KEYWORDS([scan]) +AT_DATA([input], [dnl +and or not eq ge gt le lt ne all by to with +AND OR NOT EQ GE GT LE LT NE ALL BY TO WITH +andx orx notx eqx gex gtx lex ltx nex allx byx tox withx +and. with. +]) +AT_DATA([expout], [dnl +AND +SKIP +OR +SKIP +NOT +SKIP +EQ +SKIP +GE +SKIP +GT +SKIP +LE +SKIP +LT +SKIP +NE +SKIP +ALL +SKIP +BY +SKIP +TO +SKIP +WITH +SKIP +AND +SKIP +OR +SKIP +NOT +SKIP +EQ +SKIP +GE +SKIP +GT +SKIP +LE +SKIP +LT +SKIP +NE +SKIP +ALL +SKIP +BY +SKIP +TO +SKIP +WITH +SKIP +ID "andx" +SKIP +ID "orx" +SKIP +ID "notx" +SKIP +ID "eqx" +SKIP +ID "gex" +SKIP +ID "gtx" +SKIP +ID "lex" +SKIP +ID "ltx" +SKIP +ID "nex" +SKIP +ID "allx" +SKIP +ID "byx" +SKIP +ID "tox" +SKIP +ID "withx" +SKIP +ID "and." +SKIP +WITH +ENDCMD +SKIP +STOP +]) +PSPP_CHECK_SCAN([-i]) +AT_CLEANUP + +AT_SETUP([punctuation]) +AT_KEYWORDS([scan]) +AT_DATA([input], [dnl +~ & | = >= > <= < ~= <> ( ) , - + * / [[ ]] ** +~&|=>=><=<~=<>(),-+*/[[]]** +]) +AT_DATA([expout], [dnl +NOT +SKIP +AND +SKIP +OR +SKIP +EQUALS +SKIP +GE +SKIP +GT +SKIP +LE +SKIP +LT +SKIP +NE +SKIP +NE +SKIP +LPAREN +SKIP +RPAREN +SKIP +COMMA +SKIP +DASH +SKIP +PLUS +SKIP +ASTERISK +SKIP +SLASH +SKIP +LBRACK +SKIP +RBRACK +SKIP +EXP +SKIP +NOT +AND +OR +EQUALS +GE +GT +LE +LT +NE +NE +LPAREN +RPAREN +COMMA +DASH +PLUS +ASTERISK +SLASH +LBRACK +RBRACK +EXP +SKIP +STOP +]) +PSPP_CHECK_SCAN([-i]) +AT_CLEANUP + +AT_SETUP([numbers]) +AT_KEYWORDS([scan]) +AT_DATA([input], [dnl +0 1 01 001. 1. +123. /* comment 1 */ /* comment 2 */ +.1 0.1 00.1 00.10 +5e1 6E-1 7e+1 6E+01 6e-03 +.3E1 .4e-1 .5E+1 .6e+01 .7E-03 +1.23e1 45.6E-1 78.9e+1 99.9E+01 11.2e-03 +. 
1e e1 1e+ 1e- +]) +AT_DATA([expout], [dnl +POS_NUM +SKIP +POS_NUM 1 +SKIP +POS_NUM 1 +SKIP +POS_NUM 1 +SKIP +POS_NUM 1 +ENDCMD +SKIP +POS_NUM 123 +ENDCMD +SKIP +SKIP +SKIP +SKIP +SKIP +ENDCMD +POS_NUM 1 +SKIP +POS_NUM 0.1 +SKIP +POS_NUM 0.1 +SKIP +POS_NUM 0.1 +SKIP +POS_NUM 50 +SKIP +POS_NUM 0.6 +SKIP +POS_NUM 70 +SKIP +POS_NUM 60 +SKIP +POS_NUM 0.006 +SKIP +ENDCMD +POS_NUM 30 +SKIP +POS_NUM 0.04 +SKIP +POS_NUM 5 +SKIP +POS_NUM 6 +SKIP +POS_NUM 0.0007 +SKIP +POS_NUM 12.3 +SKIP +POS_NUM 4.56 +SKIP +POS_NUM 789 +SKIP +POS_NUM 999 +SKIP +POS_NUM 0.0112 +SKIP +ENDCMD +SKIP +EXPECTED_EXPONENT "1e" +SKIP +ID "e1" +SKIP +EXPECTED_EXPONENT "1e+" +SKIP +EXPECTED_EXPONENT "1e-" +SKIP +STOP +]) +PSPP_CHECK_SCAN([-i]) +AT_CLEANUP + +AT_SETUP([strings]) +AT_KEYWORDS([scan]) +AT_DATA([input], [dnl +'x' "y" 'abc' +'Don''t' "Can't" 'Won''t' +"""quoted""" '"quoted"' +'' "" '''' """" +'missing end quote +"missing double quote +'x' + "y" ++ 'z' + +'a' /* abc */ + "b" /* ++ 'c' +/* */"d"/* */+'e' +'foo' ++ /* special case: + in column 0 would ordinarily start a new command +'bar' +'foo' + + +'bar' +'foo' ++ + +'bar' + ++ +x"4142"+'5152' +"4142"+ +x'5152' +x"4142" ++u'304a' +"�あいうえお" +"abc"+U"FFFD"+u'3048'+"xyz" +]) +AT_DATA([expout], [dnl +STRING "x" +SKIP +STRING "y" +SKIP +STRING "abc" +SKIP +STRING "Don't" +SKIP +STRING "Can't" +SKIP +STRING "Won't" +SKIP +STRING ""quoted"" +SKIP +STRING ""quoted"" +SKIP +STRING "" +SKIP +STRING "" +SKIP +STRING "'" +SKIP +STRING """ +SKIP +EXPECTED_QUOTE +SKIP +EXPECTED_QUOTE +SKIP +STRING "xyzabcde" +SKIP +STRING "foobar" +SKIP +STRING "foobar" +SKIP +STRING "foo" +SKIP +PLUS +SKIP +ENDCMD +SKIP +STRING "bar" +SKIP +ENDCMD +SKIP +PLUS +SKIP +STRING "AB5152" +SKIP +STRING "4142QR" +SKIP +STRING "ABお" +SKIP +STRING "�あいうえお" +SKIP +STRING "abc�えxyz" +SKIP +STOP +]) +PSPP_CHECK_SCAN([-i]) +AT_CLEANUP + +AT_SETUP([@%:@! construct]) +AT_KEYWORDS([scan]) +AT_DATA([input], [dnl +#! /usr/bin/pspp +#! 
/usr/bin/pspp +]) +AT_DATA([expout], [dnl +SKIP +SKIP +ID "#" +UNEXPECTED_CHAR 33 +SKIP +SLASH +ID "usr" +SLASH +ID "bin" +SLASH +ID "pspp" +SKIP +STOP +]) +PSPP_CHECK_SCAN([-i]) +AT_CLEANUP + +AT_SETUP([* and COMMENT commands]) +AT_KEYWORDS([scan]) +AT_DATA([input], [dnl +* Comment commands "don't +have to contain valid tokens. + +** Check ambiguity with ** token. +****************. + +comment keyword works too. +COMM also. +com is ambiguous with COMPUTE. + + * Comment need not start at left margin. + +* Comment ends with blank line + +next command. + +]) +AT_DATA([expout], [dnl +SKIP +SKIP +SKIP +ENDCMD +SKIP +ENDCMD +SKIP +SKIP +ENDCMD +SKIP +SKIP +ENDCMD +SKIP +ENDCMD +SKIP +SKIP +ENDCMD +SKIP +SKIP +ENDCMD +SKIP +ID "com" +SKIP +ID "is" +SKIP +ID "ambiguous" +SKIP +WITH +SKIP +ID "COMPUTE" +ENDCMD +SKIP +ENDCMD +SKIP +SKIP +SKIP +ENDCMD +SKIP +ENDCMD +SKIP +SKIP +SKIP +ENDCMD +SKIP +ID "next" +SKIP +ID "command" +ENDCMD +SKIP +ENDCMD +SKIP +STOP +]) +PSPP_CHECK_SCAN([-i]) +AT_CLEANUP + +AT_SETUP([DOCUMENT command]) +AT_KEYWORDS([scan]) +AT_DATA([input], [dnl +DOCUMENT one line. +DOC more + than + one + line. +docu +first.paragraph +isn't parsed as tokens + +second paragraph. +]) +AT_DATA([expout], [dnl +ID "DOCUMENT" +STRING "DOCUMENT one line." +ENDCMD +ENDCMD +SKIP +ID "DOCUMENT" +STRING "DOC more" +SKIP +STRING " than" +SKIP +STRING " one" +SKIP +STRING " line." +ENDCMD +ENDCMD +SKIP +ID "DOCUMENT" +STRING "docu" +SKIP +STRING "first.paragraph" +SKIP +STRING "isn't parsed as tokens" +SKIP +STRING "" +SKIP +STRING "second paragraph." +ENDCMD +ENDCMD +SKIP +STOP +]) +PSPP_CHECK_SCAN([-i]) +AT_CLEANUP + +AT_SETUP([TITLE, SUBTITLE, FILE LABEL commands]) +AT_KEYWORDS([scan]) +AT_DATA([input], [dnl +title/**/'Quoted string title'. +tit /* +"Quoted string on second line". +sub "Quoted string subtitle" + . + +TITL /* Not a */ quoted string title. +SUBT Not a quoted string /* subtitle + +FIL label isn't quoted. +FILE + lab 'is quoted'. 
+FILE /* +/**/ lab not quoted here either + +]) +AT_DATA([expout], [dnl +ID "title" +SKIP +STRING "Quoted string title" +ENDCMD +SKIP +ID "tit" +SKIP +SKIP +SKIP +STRING "Quoted string on second line" +ENDCMD +SKIP +ID "sub" +SKIP +STRING "Quoted string subtitle" +SKIP +SKIP +ENDCMD +SKIP +ENDCMD +SKIP +ID "TITL" +SKIP +STRING "/* Not a */ quoted string title" +ENDCMD +SKIP +ID "SUBT" +SKIP +STRING "Not a quoted string /* subtitle" +SKIP +ENDCMD +SKIP +ID "FIL" +SKIP +ID "label" +SKIP +STRING "isn't quoted" +ENDCMD +SKIP +ID "FILE" +SKIP +SKIP +ID "lab" +SKIP +STRING "is quoted" +ENDCMD +SKIP +ID "FILE" +SKIP +SKIP +SKIP +SKIP +SKIP +ID "lab" +SKIP +STRING "not quoted here either" +SKIP +ENDCMD +SKIP +STOP +]) +PSPP_CHECK_SCAN([-i]) +AT_CLEANUP + +AT_SETUP([BEGIN DATA command]) +AT_KEYWORDS([scan]) +AT_DATA([input], [dnl +begin data. +123 +xxx +end data. + +BEG /**/ DAT /* +5 6 7 /* x + +end data +end data +. +]) +AT_DATA([expout], [dnl +ID "begin" +SKIP +ID "data" +ENDCMD +SKIP +STRING "123" +SKIP +STRING "xxx" +SKIP +ID "end" +SKIP +ID "data" +ENDCMD +SKIP +ENDCMD +SKIP +ID "BEG" +SKIP +SKIP +SKIP +ID "DAT" +SKIP +SKIP +SKIP +STRING "5 6 7 /* x" +SKIP +STRING "" +SKIP +STRING "end data" +SKIP +ID "end" +SKIP +ID "data" +SKIP +ENDCMD +SKIP +STOP +]) +PSPP_CHECK_SCAN([-i]) +AT_CLEANUP + +AT_SETUP([DO REPEAT command]) +AT_KEYWORDS([scan]) +AT_DATA([input], [dnl +do repeat x=a b c + y=d e f. + do repeat a=1 thru 5. +another command. +second command ++ third command. +end /* x */ /* y */ repeat print. +end + repeat. +]) +AT_DATA([expout], [dnl +ID "do" +SKIP +ID "repeat" +SKIP +ID "x" +EQUALS +ID "a" +SKIP +ID "b" +SKIP +ID "c" +SKIP +SKIP +ID "y" +EQUALS +ID "d" +SKIP +ID "e" +SKIP +ID "f" +ENDCMD +SKIP +STRING " do repeat a=1 thru 5." +SKIP +STRING "another command." +SKIP +STRING "second command" +SKIP +STRING "+ third command." +SKIP +STRING "end /* x */ /* y */ repeat print." 
+SKIP +ID "end" +SKIP +SKIP +ID "repeat" +ENDCMD +SKIP +STOP +]) +PSPP_CHECK_SCAN([-i]) +AT_CLEANUP + +AT_SETUP([batch mode]) +AT_KEYWORDS([scan]) +AT_DATA([input], [dnl +first command + another line of first command ++ second command +third command + +fourth command. + fifth command. +]) +AT_DATA([expout], [dnl +ID "first" +SKIP +ID "command" +SKIP +SKIP +ID "another" +SKIP +ID "line" +SKIP +ID "of" +SKIP +ID "first" +SKIP +ID "command" +SKIP +ENDCMD +SKIP +ID "second" +SKIP +ID "command" +SKIP +ENDCMD +ID "third" +SKIP +ID "command" +SKIP +ENDCMD +SKIP +ID "fourth" +SKIP +ID "command" +ENDCMD +SKIP +SKIP +ID "fifth" +SKIP +ID "command" +ENDCMD +SKIP +STOP +]) +PSPP_CHECK_SCAN([-b]) +AT_CLEANUP -- 2.30.2