1 /* PSPP - computes sample statistics.
2 Copyright (C) 1997-9, 2000 Free Software Foundation, Inc.
3 Written by Ben Pfaff <blp@gnu.org>.
5 This program is free software; you can redistribute it and/or
6 modify it under the terms of the GNU General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 This program is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
22 #include <libpspp/message.h>
29 #include <libpspp/alloc.h>
30 #include <libpspp/assertion.h>
31 #include <language/command.h>
32 #include <libpspp/message.h>
33 #include <language/line-buffer.h>
34 #include <libpspp/magic.h>
35 #include <data/settings.h>
36 #include <libpspp/str.h>
41 #define _(msgid) gettext (msgid)
42 #define N_(msgid) msgid
50 struct string line_buffer;
52 bool (*read_line) (struct string *, bool *);
54 int token; /* Current token. */
55 double tokval; /* T_POS_NUM, T_NEG_NUM: the token's value. */
57 char tokid [LONG_NAME_LEN + 1]; /* T_ID: the identifier. */
59 struct string tokstr; /* T_ID, T_STRING: token string value.
60 For T_ID, this is not truncated as is
63 char *prog; /* Pointer to next token in line_buffer. */
64 bool dot; /* True only if this line ends with a terminal dot. */
65 bool eof; /* True only if the last token returned was T_STOP. */
67 int put_token ; /* If nonzero, next token returned by lex_get().
68 Used only in exceptional circumstances. */
70 struct string put_tokstr;
75 static int parse_id (struct lexer *);
77 /* How a string represents its contents. */
80 CHARACTER_STRING, /* Characters. */
81 BINARY_STRING, /* Binary digits. */
82 OCTAL_STRING, /* Octal digits. */
83 HEX_STRING /* Hexadecimal digits. */
86 static int parse_string (struct lexer *, enum string_type);
89 static void dump_token (void);
94 /* Initializes the lexer. */
96 lex_create (bool (*read_line_func) (struct string *, bool *))
98 struct lexer *lexer = xzalloc (sizeof (*lexer));
100 ds_init_empty (&lexer->tokstr);
101 ds_init_empty (&lexer->put_tokstr);
102 ds_init_empty (&lexer->line_buffer);
103 lexer->read_line = read_line_func;
105 if (!lex_get_line (lexer))
112 lex_destroy (struct lexer *lexer)
114 ds_destroy (&lexer->put_tokstr);
115 ds_destroy (&lexer->tokstr);
116 ds_destroy (&lexer->line_buffer);
122 /* Common functions. */
124 /* Copies put_token, lexer->put_tokstr, put_tokval into token, tokstr,
125 tokval, respectively, and sets tokid appropriately. */
127 restore_token (struct lexer *lexer)
129 assert (lexer->put_token != 0);
130 lexer->token = lexer->put_token;
131 ds_assign_string (&lexer->tokstr, &lexer->put_tokstr);
132 str_copy_trunc (lexer->tokid, sizeof lexer->tokid, ds_cstr (&lexer->tokstr));
133 lexer->tokval = lexer->put_tokval;
134 lexer->put_token = 0;
137 /* Copies token, tokstr, lexer->tokval into lexer->put_token, put_tokstr,
138 put_lexer->tokval respectively. */
140 save_token (struct lexer *lexer)
142 lexer->put_token = lexer->token;
143 ds_assign_string (&lexer->put_tokstr, &lexer->tokstr);
144 lexer->put_tokval = lexer->tokval;
147 /* Parses a single token, setting appropriate global variables to
148 indicate the token's attributes. */
150 lex_get (struct lexer *lexer)
152 /* If a token was pushed ahead, return it. */
153 if (lexer->put_token)
155 restore_token (lexer);
165 /* Skip whitespace. */
168 lexer->token = T_STOP;
174 while (isspace ((unsigned char) *lexer->prog))
188 else if (!lex_get_line (lexer))
191 lexer->token = T_STOP;
198 if (lexer->put_token)
200 restore_token (lexer);
209 /* Actually parse the token. */
210 ds_clear (&lexer->tokstr);
212 switch (*lexer->prog)
215 case '0': case '1': case '2': case '3': case '4':
216 case '5': case '6': case '7': case '8': case '9':
220 /* `-' can introduce a negative number, or it can be a
221 token by itself. If it is not followed by a digit or a
222 decimal point, it is definitely not a number.
223 Otherwise, it might be either, but most of the time we
224 want it as a number. When the syntax calls for a `-'
225 token, lex_negative_to_dash() must be used to break
226 negative numbers into two tokens. */
227 if (*lexer->prog == '-')
229 ds_put_char (&lexer->tokstr, *lexer->prog++);
230 while (isspace ((unsigned char) *lexer->prog))
233 if (!isdigit ((unsigned char) *lexer->prog) && *lexer->prog != '.')
238 lexer->token = T_NEG_NUM;
241 lexer->token = T_POS_NUM;
243 /* Parse the number, copying it into tokstr. */
244 while (isdigit ((unsigned char) *lexer->prog))
245 ds_put_char (&lexer->tokstr, *lexer->prog++);
246 if (*lexer->prog == '.')
248 ds_put_char (&lexer->tokstr, *lexer->prog++);
249 while (isdigit ((unsigned char) *lexer->prog))
250 ds_put_char (&lexer->tokstr, *lexer->prog++);
252 if (*lexer->prog == 'e' || *lexer->prog == 'E')
254 ds_put_char (&lexer->tokstr, *lexer->prog++);
255 if (*lexer->prog == '+' || *lexer->prog == '-')
256 ds_put_char (&lexer->tokstr, *lexer->prog++);
257 while (isdigit ((unsigned char) *lexer->prog))
258 ds_put_char (&lexer->tokstr, *lexer->prog++);
261 /* Parse as floating point. */
262 lexer->tokval = strtod (ds_cstr (&lexer->tokstr), &tail);
265 msg (SE, _("%s does not form a valid number."),
266 ds_cstr (&lexer->tokstr));
269 ds_clear (&lexer->tokstr);
270 ds_put_char (&lexer->tokstr, '0');
277 lexer->token = parse_string (lexer, CHARACTER_STRING);
280 case '(': case ')': case ',': case '=': case '+': case '/':
281 lexer->token = *lexer->prog++;
285 if (*++lexer->prog == '*')
288 lexer->token = T_EXP;
295 if (*++lexer->prog == '=')
300 else if (*lexer->prog == '>')
310 if (*++lexer->prog == '=')
320 if (*++lexer->prog == '=')
326 lexer->token = T_NOT;
331 lexer->token = T_AND;
340 if (lexer->prog[1] == '\'' || lexer->prog[1] == '"')
341 lexer->token = parse_string (lexer, BINARY_STRING);
343 lexer->token = parse_id (lexer);
347 if (lexer->prog[1] == '\'' || lexer->prog[1] == '"')
348 lexer->token = parse_string (lexer, OCTAL_STRING);
350 lexer->token = parse_id (lexer);
354 if (lexer->prog[1] == '\'' || lexer->prog[1] == '"')
355 lexer->token = parse_string (lexer, HEX_STRING);
357 lexer->token = parse_id (lexer);
361 if (lex_is_id1 (*lexer->prog))
363 lexer->token = parse_id (lexer);
368 if (isgraph ((unsigned char) *lexer->prog))
369 msg (SE, _("Bad character in input: `%c'."), *lexer->prog++);
371 msg (SE, _("Bad character in input: `\\%o'."), *lexer->prog++);
383 /* Parses an identifier at the current position into tokid and
385 Returns the correct token type. */
387 parse_id (struct lexer *lexer)
389 const char *start = lexer->prog;
390 lexer->prog = lex_skip_identifier (start);
392 ds_put_substring (&lexer->tokstr, ss_buffer (start, lexer->prog - start));
393 str_copy_trunc (lexer->tokid, sizeof lexer->tokid, ds_cstr (&lexer->tokstr));
394 return lex_id_to_token (ds_cstr (&lexer->tokstr), ds_length (&lexer->tokstr));
397 /* Reports an error to the effect that subcommand SBC may only be
400 lex_sbc_only_once (const char *sbc)
402 msg (SE, _("Subcommand %s may only be specified once."), sbc);
405 /* Reports an error to the effect that subcommand SBC is
408 lex_sbc_missing (struct lexer *lexer, const char *sbc)
410 lex_error (lexer, _("missing required subcommand %s"), sbc);
413 /* Prints a syntax error message containing the current token and
414 given message MESSAGE (if non-null). */
416 lex_error (struct lexer *lexer, const char *message, ...)
421 token_rep = lex_token_representation (lexer);
422 if (lexer->token == T_STOP)
423 strcpy (where, "end of file");
424 else if (lexer->token == '.')
425 strcpy (where, "end of command");
427 snprintf (where, sizeof where, "`%s'", token_rep);
435 va_start (args, message);
436 vsnprintf (buf, 1024, message, args);
439 msg (SE, _("Syntax error %s at %s."), buf, where);
442 msg (SE, _("Syntax error at %s."), where);
445 /* Checks that we're at end of command.
446 If so, returns a successful command completion code.
447 If not, flags a syntax error and returns an error command
450 lex_end_of_command (struct lexer *lexer)
452 if (lexer->token != '.')
454 lex_error (lexer, _("expecting end of command"));
461 /* Token testing functions. */
463 /* Returns true if the current token is a number. */
465 lex_is_number (struct lexer *lexer)
467 return lexer->token == T_POS_NUM || lexer->token == T_NEG_NUM;
470 /* Returns the value of the current token, which must be a
471 floating point number. */
473 lex_number (struct lexer *lexer)
475 assert (lex_is_number (lexer));
476 return lexer->tokval;
479 /* Returns true iff the current token is an integer. */
481 lex_is_integer (struct lexer *lexer)
483 return (lex_is_number (lexer)
484 && lexer->tokval != NOT_LONG
485 && lexer->tokval >= LONG_MIN
486 && lexer->tokval <= LONG_MAX
487 && floor (lexer->tokval) == lexer->tokval);
490 /* Returns the value of the current token, which must be an
493 lex_integer (struct lexer *lexer)
495 assert (lex_is_integer (lexer));
496 return lexer->tokval;
499 /* Token matching functions. */
501 /* If TOK is the current token, skips it and returns true
502 Otherwise, returns false. */
504 lex_match (struct lexer *lexer, int t)
506 if (lexer->token == t)
515 /* If the current token is the identifier S, skips it and returns
516 true. The identifier may be abbreviated to its first three
518 Otherwise, returns false. */
520 lex_match_id (struct lexer *lexer, const char *s)
522 if (lexer->token == T_ID && lex_id_match (s, lexer->tokid))
531 /* If the current token is integer N, skips it and returns true.
532 Otherwise, returns false. */
534 lex_match_int (struct lexer *lexer, int x)
536 if (lex_is_integer (lexer) && lex_integer (lexer) == x)
545 /* Forced matches. */
547 /* If this token is identifier S, fetches the next token and returns
549 Otherwise, reports an error and returns zero. */
551 lex_force_match_id (struct lexer *lexer, const char *s)
553 if (lexer->token == T_ID && lex_id_match (s, lexer->tokid))
560 lex_error (lexer, _("expecting `%s'"), s);
565 /* If the current token is T, skips the token. Otherwise, reports an
566 error and returns from the current function with return value false. */
568 lex_force_match (struct lexer *lexer, int t)
570 if (lexer->token == t)
577 lex_error (lexer, _("expecting `%s'"), lex_token_name (t));
582 /* If this token is a string, does nothing and returns true.
583 Otherwise, reports an error and returns false. */
585 lex_force_string (struct lexer *lexer)
587 if (lexer->token == T_STRING)
591 lex_error (lexer, _("expecting string"));
596 /* If this token is an integer, does nothing and returns true.
597 Otherwise, reports an error and returns false. */
599 lex_force_int (struct lexer *lexer)
601 if (lex_is_integer (lexer))
605 lex_error (lexer, _("expecting integer"));
610 /* If this token is a number, does nothing and returns true.
611 Otherwise, reports an error and returns false. */
613 lex_force_num (struct lexer *lexer)
615 if (lex_is_number (lexer))
618 lex_error (lexer, _("expecting number"));
622 /* If this token is an identifier, does nothing and returns true.
623 Otherwise, reports an error and returns false. */
625 lex_force_id (struct lexer *lexer)
627 if (lexer->token == T_ID)
630 lex_error (lexer, _("expecting identifier"));
634 /* Weird token functions. */
636 /* Returns the first character of the next token, except that if the
637 next token is not an identifier, the character returned will not be
638 a character that can begin an identifier. Specifically, the
639 hexstring lead-in X' causes lookahead() to return '. Note that an
640 alphanumeric return value doesn't guarantee an ID token, it could
641 also be a reserved-word token. */
643 lex_look_ahead (struct lexer *lexer)
645 if (lexer->put_token)
646 return lexer->put_token;
655 while (isspace ((unsigned char) *lexer->prog))
662 else if (!lex_get_line (lexer))
665 if (lexer->put_token)
666 return lexer->put_token;
669 if ((toupper ((unsigned char) *lexer->prog) == 'X'
670 || toupper ((unsigned char) *lexer->prog) == 'B'
671 || toupper ((unsigned char) *lexer->prog) == 'O')
672 && (lexer->prog[1] == '\'' || lexer->prog[1] == '"'))
679 /* Makes the current token become the next token to be read; the
680 current token is set to T. */
682 lex_put_back (struct lexer *lexer, int t)
688 /* Makes the current token become the next token to be read; the
689 current token is set to the identifier ID. */
691 lex_put_back_id (struct lexer *lexer, const char *id)
693 assert (lex_id_to_token (id, strlen (id)) == T_ID);
696 ds_assign_cstr (&lexer->tokstr, id);
697 str_copy_trunc (lexer->tokid, sizeof lexer->tokid, ds_cstr (&lexer->tokstr));
700 /* Weird line processing functions. */
702 /* Returns the entire contents of the current line. */
704 lex_entire_line (struct lexer *lexer)
706 return ds_cstr (&lexer->line_buffer);
709 const struct string *
710 lex_entire_line_ds (struct lexer *lexer)
712 return &lexer->line_buffer;
715 /* As lex_entire_line(), but only returns the part of the current line
716 that hasn't already been tokenized.
717 If END_DOT is non-null, stores nonzero into *END_DOT if the line
718 ends with a terminal dot, or zero if it doesn't. */
720 lex_rest_of_line (struct lexer *lexer, int *end_dot)
723 *end_dot = lexer->dot;
727 /* Causes the rest of the current input line to be ignored for
728 tokenization purposes. */
730 lex_discard_line (struct lexer *lexer)
732 ds_cstr (&lexer->line_buffer); /* Ensures ds_end points to something valid */
733 lexer->prog = ds_end (&lexer->line_buffer);
735 lexer->put_token = 0;
739 /* Discards the rest of the current command.
740 When we're reading commands from a file, we skip tokens until
741 a terminal dot or EOF.
742 When we're reading commands interactively from the user,
743 that's just discarding the current line, because presumably
744 the user doesn't want to finish typing a command that will be
747 lex_discard_rest_of_command (struct lexer *lexer)
749 if (!getl_is_interactive ())
751 while (lexer->token != T_STOP && lexer->token != '.')
755 lex_discard_line (lexer);
758 /* Weird line reading functions. */
760 /* Remove C-style comments in STRING, begun by slash-star and
761 terminated by star-slash or newline. */
763 strip_comments (struct string *string)
771 for (cp = ds_cstr (string); *cp; )
773 /* If we're not in a comment, check for quote marks. */
778 else if (*cp == '\'' || *cp == '"')
782 /* If we're not inside a quotation, check for comment. */
785 if (cp[0] == '/' && cp[1] == '*')
792 else if (in_comment && cp[0] == '*' && cp[1] == '/')
801 /* Check commenting. */
808 /* Reads a line, without performing any preprocessing */
810 lex_get_line_raw (struct lexer *lexer)
813 return lexer->read_line (&lexer->line_buffer, &dummy);
816 /* Reads a line for use by the tokenizer, and preprocesses it by
817 removing comments, stripping trailing whitespace and the
818 terminal dot, and removing leading indentors. */
820 lex_get_line (struct lexer *lexer)
822 struct string *line = &lexer->line_buffer;
825 if (!lexer->read_line (line, &interactive))
828 strip_comments (line);
829 ds_rtrim (line, ss_cstr (CC_SPACES));
831 /* Check for and remove terminal dot. */
832 lexer->dot = (ds_chomp (line, get_endcmd ())
833 || (ds_is_empty (line) && get_nulline ()));
835 /* Strip leading indentors or insert a terminal dot (unless the
836 line was obtained interactively). */
839 int first = ds_first (line);
841 if (first == '+' || first == '-')
842 *ds_data (line) = ' ';
843 else if (first != EOF && !isspace (first))
844 lexer->put_token = '.';
847 lexer->prog = ds_cstr (line);
854 /* Returns the name of a token in a static buffer. */
856 lex_token_name (int token)
858 if (token >= T_FIRST_KEYWORD && token <= T_LAST_KEYWORD)
859 return keywords[token - T_FIRST_KEYWORD];
871 /* Returns an ASCII representation of the current token as a
872 malloc()'d string. */
874 lex_token_representation (struct lexer *lexer)
878 switch (lexer->token)
883 return ds_xstrdup (&lexer->tokstr);
891 for (sp = ds_cstr (&lexer->tokstr); sp < ds_end (&lexer->tokstr); sp++)
892 if (!isprint ((unsigned char) *sp))
898 token_rep = xmalloc (2 + ds_length (&lexer->tokstr) * 2 + 1 + 1);
906 for (sp = ds_cstr (&lexer->tokstr); *sp; )
910 *dp++ = (unsigned char) *sp++;
913 for (sp = ds_cstr (&lexer->tokstr); sp < ds_end (&lexer->tokstr); sp++)
915 *dp++ = (((unsigned char) *sp) >> 4)["0123456789ABCDEF"];
916 *dp++ = (((unsigned char) *sp) & 15)["0123456789ABCDEF"];
926 token_rep = xmalloc (1);
931 return xstrdup ("**");
934 if (lexer->token >= T_FIRST_KEYWORD && lexer->token <= T_LAST_KEYWORD)
935 return xstrdup (keywords [lexer->token - T_FIRST_KEYWORD]);
938 token_rep = xmalloc (2);
939 token_rep[0] = lexer->token;
948 /* Really weird functions. */
950 /* Most of the time, a `-' is a lead-in to a negative number. But
951 sometimes it's actually part of the syntax. If a dash can be part
952 of syntax then this function is called to rip it off of a
955 lex_negative_to_dash (struct lexer *lexer)
957 if (lexer->token == T_NEG_NUM)
959 lexer->token = T_POS_NUM;
960 lexer->tokval = -lexer->tokval;
961 ds_assign_substring (&lexer->tokstr, ds_substr (&lexer->tokstr, 1, SIZE_MAX));
967 /* We're not at eof any more. */
969 lex_reset_eof (struct lexer *lexer)
974 /* Skip a COMMENT command. */
976 lex_skip_comment (struct lexer *lexer)
980 if (!lex_get_line (lexer))
982 lexer->put_token = T_STOP;
987 if (lexer->put_token == '.')
990 ds_cstr (&lexer->line_buffer); /* Ensures ds_end will point to a valid char */
991 lexer->prog = ds_end (&lexer->line_buffer);
997 /* Private functions. */
999 /* When invoked, tokstr contains a string of binary, octal, or
1000 hex digits, according to TYPE. The string is converted to
1001 characters having the specified values. */
1003 convert_numeric_string_to_char_string (struct lexer *lexer,
1004 enum string_type type)
1006 const char *base_name;
1016 base_name = _("binary");
1021 base_name = _("octal");
1026 base_name = _("hex");
1034 byte_cnt = ds_length (&lexer->tokstr) / chars_per_byte;
1035 if (ds_length (&lexer->tokstr) % chars_per_byte)
1036 msg (SE, _("String of %s digits has %d characters, which is not a "
1038 base_name, ds_length (&lexer->tokstr), chars_per_byte);
1040 p = ds_cstr (&lexer->tokstr);
1041 for (i = 0; i < byte_cnt; i++)
1047 for (j = 0; j < chars_per_byte; j++, p++)
1051 if (*p >= '0' && *p <= '9')
1055 static const char alpha[] = "abcdef";
1056 const char *q = strchr (alpha, tolower ((unsigned char) *p));
1065 msg (SE, _("`%c' is not a valid %s digit."), *p, base_name);
1067 value = value * base + v;
1070 ds_cstr (&lexer->tokstr)[i] = (unsigned char) value;
1073 ds_truncate (&lexer->tokstr, byte_cnt);
1076 /* Parses a string from the input buffer into tokstr. The input
1077 buffer pointer lexer->prog must point to the initial single or double
1078 quote. TYPE indicates the type of string to be parsed.
1079 Returns token type. */
1081 parse_string (struct lexer *lexer, enum string_type type)
1083 if (type != CHARACTER_STRING)
1086 /* Accumulate the entire string, joining sections indicated by +
1090 /* Single or double quote. */
1091 int c = *lexer->prog++;
1093 /* Accumulate section. */
1096 /* Check end of line. */
1097 if (*lexer->prog == '\0')
1099 msg (SE, _("Unterminated string constant."));
1103 /* Double quote characters to embed them in strings. */
1104 if (*lexer->prog == c)
1106 if (lexer->prog[1] == c)
1112 ds_put_char (&lexer->tokstr, *lexer->prog++);
1116 /* Skip whitespace after final quote mark. */
1121 while (isspace ((unsigned char) *lexer->prog))
1129 if (!lex_get_line (lexer))
1133 /* Skip plus sign. */
1134 if (*lexer->prog != '+')
1138 /* Skip whitespace after plus sign. */
1143 while (isspace ((unsigned char) *lexer->prog))
1151 if (!lex_get_line (lexer))
1153 msg (SE, _("Unexpected end of file in string concatenation."));
1158 /* Ensure that a valid string follows. */
1159 if (*lexer->prog != '\'' && *lexer->prog != '"')
1161 msg (SE, _("String expected following `+'."));
1166 /* We come here when we've finished concatenating all the string sections
1167 into one large string. */
1169 if (type != CHARACTER_STRING)
1170 convert_numeric_string_to_char_string (lexer, type);
1172 if (ds_length (&lexer->tokstr) > 255)
1174 msg (SE, _("String exceeds 255 characters in length (%d characters)."),
1175 ds_length (&lexer->tokstr));
1176 ds_truncate (&lexer->tokstr, 255);
1183 /* Reads one token from the lexer and writes a textual representation
1184 on stdout for debugging purposes. */
1186 dump_token (struct lexer *lexer)
1192 getl_location (&curfn, &curln);
1194 fprintf (stderr, "%s:%d\t", curfn, curln);
1197 switch (lexer->token)
1200 fprintf (stderr, "ID\t%s\n", lexer->tokid);
1205 fprintf (stderr, "NUM\t%f\n", lexer->tokval);
1209 fprintf (stderr, "STRING\t\"%s\"\n", ds_cstr (&lexer->tokstr));
1213 fprintf (stderr, "STOP\n");
1217 fprintf (stderr, "MISC\tEXP\"");
1221 fprintf (stderr, "MISC\tEOF\n");
1225 if (lexer->token >= T_FIRST_KEYWORD && lexer->token <= T_LAST_KEYWORD)
1226 fprintf (stderr, "KEYWORD\t%s\n", lex_token_name (token));
1228 fprintf (stderr, "PUNCT\t%c\n", lexer->token);
1232 #endif /* DUMP_TOKENS */
1235 /* Token Accessor Functions */
1238 lex_token (const struct lexer *lexer)
1240 return lexer->token;
1244 lex_tokval (const struct lexer *lexer)
1246 return lexer->tokval;
1250 lex_tokid (const struct lexer *lexer)
1252 return lexer->tokid;
1255 const struct string *
1256 lex_tokstr (const struct lexer *lexer)
1258 return &lexer->tokstr;