1 /* PSPP - computes sample statistics.
2 Copyright (C) 1997-9, 2000 Free Software Foundation, Inc.
3 Written by Ben Pfaff <blp@gnu.org>.
5 This program is free software; you can redistribute it and/or
6 modify it under the terms of the GNU General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 This program is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
22 #include <libpspp/message.h>
29 #include <libpspp/alloc.h>
30 #include <libpspp/assertion.h>
31 #include <language/command.h>
32 #include <libpspp/message.h>
33 #include <language/line-buffer.h>
34 #include <libpspp/magic.h>
35 #include <data/settings.h>
36 #include <libpspp/str.h>
41 #define _(msgid) gettext (msgid)
42 #define N_(msgid) msgid
49 /* Global variables. */
51 extern const char *keywords[T_N_KEYWORDS + 1];
57 /* T_POS_NUM, T_NEG_NUM: the token's value. */
60 /* T_ID: the identifier. */
61 char tokid[LONG_NAME_LEN + 1];
63 /* T_ID, T_STRING: token string value.
64 For T_ID, this is not truncated as is tokid. */
67 /* Static variables. */
69 /* Pointer to next token in getl_buf. */
72 /* True only if this line ends with a terminal dot. */
75 /* True only if the last token returned was T_STOP. */
78 /* If nonzero, next token returned by lex_get().
79 Used only in exceptional circumstances. */
81 static struct string put_tokstr;
82 static double put_tokval;
84 static int parse_id (void);
86 /* How a string represents its contents. */
89 CHARACTER_STRING, /* Characters. */
90 BINARY_STRING, /* Binary digits. */
91 OCTAL_STRING, /* Octal digits. */
92 HEX_STRING /* Hexadecimal digits. */
95 static int parse_string (enum string_type);
98 static void dump_token (void);
101 /* Initialization. */
103 /* Initializes the lexer. */
107 ds_init_empty (&tokstr);
108 ds_init_empty (&put_tokstr);
109 if (!lex_get_line ())
116 ds_destroy (&put_tokstr);
117 ds_destroy (&tokstr);
121 /* Common functions. */
123 /* Copies put_token, put_tokstr, put_tokval into token, tokstr,
124 tokval, respectively, and sets tokid appropriately. */
128 assert (put_token != 0);
130 ds_assign_string (&tokstr, &put_tokstr);
131 str_copy_trunc (tokid, sizeof tokid, ds_cstr (&tokstr));
136 /* Copies token, tokstr, tokval into put_token, put_tokstr,
137 put_tokval respectively. */
142 ds_assign_string (&put_tokstr, &tokstr);
146 /* Parses a single token, setting appropriate global variables to
147 indicate the token's attributes. */
151 /* If a token was pushed ahead, return it. */
164 /* Skip whitespace. */
173 while (isspace ((unsigned char) *prog))
187 else if (!lex_get_line ())
208 /* Actually parse the token. */
214 case '0': case '1': case '2': case '3': case '4':
215 case '5': case '6': case '7': case '8': case '9':
219 /* `-' can introduce a negative number, or it can be a
220 token by itself. If it is not followed by a digit or a
221 decimal point, it is definitely not a number.
222 Otherwise, it might be either, but most of the time we
223 want it as a number. When the syntax calls for a `-'
224 token, lex_negative_to_dash() must be used to break
225 negative numbers into two tokens. */
228 ds_put_char (&tokstr, *prog++);
229 while (isspace ((unsigned char) *prog))
232 if (!isdigit ((unsigned char) *prog) && *prog != '.')
242 /* Parse the number, copying it into tokstr. */
243 while (isdigit ((unsigned char) *prog))
244 ds_put_char (&tokstr, *prog++);
247 ds_put_char (&tokstr, *prog++);
248 while (isdigit ((unsigned char) *prog))
249 ds_put_char (&tokstr, *prog++);
251 if (*prog == 'e' || *prog == 'E')
253 ds_put_char (&tokstr, *prog++);
254 if (*prog == '+' || *prog == '-')
255 ds_put_char (&tokstr, *prog++);
256 while (isdigit ((unsigned char) *prog))
257 ds_put_char (&tokstr, *prog++);
260 /* Parse as floating point. */
261 tokval = strtod (ds_cstr (&tokstr), &tail);
264 msg (SE, _("%s does not form a valid number."),
269 ds_put_char (&tokstr, '0');
276 token = parse_string (CHARACTER_STRING);
279 case '(': case ')': case ',': case '=': case '+': case '/':
299 else if (*prog == '>')
339 if (prog[1] == '\'' || prog[1] == '"')
340 token = parse_string (BINARY_STRING);
346 if (prog[1] == '\'' || prog[1] == '"')
347 token = parse_string (OCTAL_STRING);
353 if (prog[1] == '\'' || prog[1] == '"')
354 token = parse_string (HEX_STRING);
360 if (lex_is_id1 (*prog))
367 if (isgraph ((unsigned char) *prog))
368 msg (SE, _("Bad character in input: `%c'."), *prog++);
370 msg (SE, _("Bad character in input: `\\%o'."), *prog++);
382 /* Parses an identifier at the current position into tokid and
384 Returns the correct token type. */
388 const char *start = prog;
389 prog = lex_skip_identifier (start);
391 ds_put_substring (&tokstr, ss_buffer (start, prog - start));
392 str_copy_trunc (tokid, sizeof tokid, ds_cstr (&tokstr));
393 return lex_id_to_token (ds_cstr (&tokstr), ds_length (&tokstr));
396 /* Reports an error to the effect that subcommand SBC may only be
399 lex_sbc_only_once (const char *sbc)
401 msg (SE, _("Subcommand %s may only be specified once."), sbc);
404 /* Reports an error to the effect that subcommand SBC is
407 lex_sbc_missing (const char *sbc)
409 lex_error (_("missing required subcommand %s"), sbc);
412 /* Prints a syntax error message containing the current token and
413 given message MESSAGE (if non-null). */
415 lex_error (const char *message, ...)
420 token_rep = lex_token_representation ();
422 strcpy (where, "end of file");
423 else if (token == '.')
424 strcpy (where, "end of command");
426 snprintf (where, sizeof where, "`%s'", token_rep);
434 va_start (args, message);
435 vsnprintf (buf, 1024, message, args);
438 msg (SE, _("Syntax error %s at %s."), buf, where);
441 msg (SE, _("Syntax error at %s."), where);
444 /* Checks that we're at end of command.
445 If so, returns a successful command completion code.
446 If not, flags a syntax error and returns an error command
449 lex_end_of_command (void)
453 lex_error (_("expecting end of command"));
460 /* Token testing functions. */
462 /* Returns true if the current token is a number. */
466 return token == T_POS_NUM || token == T_NEG_NUM;
469 /* Returns the value of the current token, which must be a
470 floating point number. */
474 assert (lex_is_number ());
478 /* Returns true iff the current token is an integer. */
480 lex_is_integer (void)
482 return (lex_is_number ()
483 && tokval != NOT_LONG
484 && tokval >= LONG_MIN
485 && tokval <= LONG_MAX
486 && floor (tokval) == tokval);
489 /* Returns the value of the current token, which must be an
494 assert (lex_is_integer ());
498 /* Token matching functions. */
500 /* If TOK is the current token, skips it and returns true
501 Otherwise, returns false. */
514 /* If the current token is the identifier S, skips it and returns
515 true. The identifier may be abbreviated to its first three
517 Otherwise, returns false. */
519 lex_match_id (const char *s)
521 if (token == T_ID && lex_id_match (s, tokid))
530 /* If the current token is integer N, skips it and returns true.
531 Otherwise, returns false. */
533 lex_match_int (int x)
535 if (lex_is_integer () && lex_integer () == x)
544 /* Forced matches. */
546 /* If this token is identifier S, fetches the next token and returns
548 Otherwise, reports an error and returns zero. */
550 lex_force_match_id (const char *s)
552 if (token == T_ID && lex_id_match (s, tokid))
559 lex_error (_("expecting `%s'"), s);
564 /* If the current token is T, skips the token. Otherwise, reports an
565 error and returns from the current function with return value false. */
567 lex_force_match (int t)
576 lex_error (_("expecting `%s'"), lex_token_name (t));
581 /* If this token is a string, does nothing and returns true.
582 Otherwise, reports an error and returns false. */
584 lex_force_string (void)
586 if (token == T_STRING)
590 lex_error (_("expecting string"));
595 /* If this token is an integer, does nothing and returns true.
596 Otherwise, reports an error and returns false. */
600 if (lex_is_integer ())
604 lex_error (_("expecting integer"));
609 /* If this token is a number, does nothing and returns true.
610 Otherwise, reports an error and returns false. */
614 if (lex_is_number ())
618 lex_error (_("expecting number"));
623 /* If this token is an identifier, does nothing and returns true.
624 Otherwise, reports an error and returns false. */
632 lex_error (_("expecting identifier"));
636 /* Weird token functions. */
638 /* Returns the first character of the next token, except that if the
639 next token is not an identifier, the character returned will not be
640 a character that can begin an identifier. Specifically, the
641 hexstring lead-in X' causes lookahead() to return '. Note that an
642 alphanumeric return value doesn't guarantee an ID token, it could
643 also be a reserved-word token. */
645 lex_look_ahead (void)
657 while (isspace ((unsigned char) *prog))
664 else if (!lex_get_line ())
671 if ((toupper ((unsigned char) *prog) == 'X'
672 || toupper ((unsigned char) *prog) == 'B'
673 || toupper ((unsigned char) *prog) == 'O')
674 && (prog[1] == '\'' || prog[1] == '"'))
681 /* Makes the current token become the next token to be read; the
682 current token is set to T. */
690 /* Makes the current token become the next token to be read; the
691 current token is set to the identifier ID. */
693 lex_put_back_id (const char *id)
695 assert (lex_id_to_token (id, strlen (id)) == T_ID);
698 ds_assign_cstr (&tokstr, id);
699 str_copy_trunc (tokid, sizeof tokid, ds_cstr (&tokstr));
702 /* Weird line processing functions. */
704 /* Returns the entire contents of the current line. */
706 lex_entire_line (void)
708 return ds_cstr (&getl_buf);
711 /* As lex_entire_line(), but only returns the part of the current line
712 that hasn't already been tokenized.
713 If END_DOT is non-null, stores nonzero into *END_DOT if the line
714 ends with a terminal dot, or zero if it doesn't. */
716 lex_rest_of_line (int *end_dot)
723 /* Causes the rest of the current input line to be ignored for
724 tokenization purposes. */
726 lex_discard_line (void)
728 prog = ds_end (&getl_buf);
733 /* Sets the current position in the current line to P, which must be
736 lex_set_prog (char *p)
741 /* Discards the rest of the current command.
742 When we're reading commands from a file, we skip tokens until
743 a terminal dot or EOF.
744 When we're reading commands interactively from the user,
745 that's just discarding the current line, because presumably
746 the user doesn't want to finish typing a command that will be
749 lex_discard_rest_of_command (void)
751 if (!getl_is_interactive ())
753 while (token != T_STOP && token != '.')
760 /* Weird line reading functions. */
762 /* Remove C-style comments in STRING, begun by slash-star and
763 terminated by star-slash or newline. */
765 strip_comments (struct string *string)
773 for (cp = ds_cstr (string); *cp; )
775 /* If we're not in a comment, check for quote marks. */
780 else if (*cp == '\'' || *cp == '"')
784 /* If we're not inside a quotation, check for comment. */
787 if (cp[0] == '/' && cp[1] == '*')
794 else if (in_comment && cp[0] == '*' && cp[1] == '/')
803 /* Check commenting. */
810 /* Reads a line for use by the tokenizer, and preprocesses it by
811 removing comments, stripping trailing whitespace and the
812 terminal dot, and removing leading indentors. */
816 struct string *line = &getl_buf;
819 if (!getl_read_line (&interactive))
822 strip_comments (line);
823 ds_rtrim (line, ss_cstr (CC_SPACES));
825 /* Check for and remove terminal dot. */
826 dot = (ds_chomp (line, get_endcmd ())
827 || (ds_is_empty (line) && get_nulline ()));
829 /* Strip leading indentors or insert a terminal dot (unless the
830 line was obtained interactively). */
833 int first = ds_first (line);
835 if (first == '+' || first == '-')
836 *ds_data (line) = ' ';
837 else if (first != EOF && !isspace (first))
841 prog = ds_cstr (line);
848 /* Returns the name of a token in a static buffer. */
850 lex_token_name (int token)
852 if (token >= T_FIRST_KEYWORD && token <= T_LAST_KEYWORD)
853 return keywords[token - T_FIRST_KEYWORD];
865 /* Returns an ASCII representation of the current token as a
866 malloc()'d string. */
868 lex_token_representation (void)
877 return ds_xstrdup (&tokstr);
885 for (sp = ds_cstr (&tokstr); sp < ds_end (&tokstr); sp++)
886 if (!isprint ((unsigned char) *sp))
892 token_rep = xmalloc (2 + ds_length (&tokstr) * 2 + 1 + 1);
900 for (sp = ds_cstr (&tokstr); *sp; )
904 *dp++ = (unsigned char) *sp++;
907 for (sp = ds_cstr (&tokstr); sp < ds_end (&tokstr); sp++)
909 *dp++ = (((unsigned char) *sp) >> 4)["0123456789ABCDEF"];
910 *dp++ = (((unsigned char) *sp) & 15)["0123456789ABCDEF"];
920 token_rep = xmalloc (1);
925 return xstrdup ("**");
928 if (token >= T_FIRST_KEYWORD && token <= T_LAST_KEYWORD)
929 return xstrdup (keywords [token - T_FIRST_KEYWORD]);
932 token_rep = xmalloc (2);
933 token_rep[0] = token;
942 /* Really weird functions. */
944 /* Most of the time, a `-' is a lead-in to a negative number. But
945 sometimes it's actually part of the syntax. If a dash can be part
946 of syntax then this function is called to rip it off of a
949 lex_negative_to_dash (void)
951 if (token == T_NEG_NUM)
955 ds_assign_substring (&tokstr, ds_substr (&tokstr, 1, SIZE_MAX));
961 /* We're not at eof any more. */
968 /* Skip a COMMENT command. */
970 lex_skip_comment (void)
974 if (!lex_get_line ())
981 if (put_token == '.')
984 prog = ds_end (&getl_buf);
990 /* Private functions. */
992 /* When invoked, tokstr contains a string of binary, octal, or
993 hex digits, according to TYPE. The string is converted to
994 characters having the specified values. */
996 convert_numeric_string_to_char_string (enum string_type type)
998 const char *base_name;
1008 base_name = _("binary");
1013 base_name = _("octal");
1018 base_name = _("hex");
1026 byte_cnt = ds_length (&tokstr) / chars_per_byte;
1027 if (ds_length (&tokstr) % chars_per_byte)
1028 msg (SE, _("String of %s digits has %d characters, which is not a "
1030 base_name, ds_length (&tokstr), chars_per_byte);
1032 p = ds_cstr (&tokstr);
1033 for (i = 0; i < byte_cnt; i++)
1039 for (j = 0; j < chars_per_byte; j++, p++)
1043 if (*p >= '0' && *p <= '9')
1047 static const char alpha[] = "abcdef";
1048 const char *q = strchr (alpha, tolower ((unsigned char) *p));
1057 msg (SE, _("`%c' is not a valid %s digit."), *p, base_name);
1059 value = value * base + v;
1062 ds_cstr (&tokstr)[i] = (unsigned char) value;
1065 ds_truncate (&tokstr, byte_cnt);
1068 /* Parses a string from the input buffer into tokstr. The input
1069 buffer pointer prog must point to the initial single or double
1070 quote. TYPE indicates the type of string to be parsed.
1071 Returns token type. */
1073 parse_string (enum string_type type)
1075 /* Accumulate the entire string, joining sections indicated by +
1079 /* Single or double quote. */
1082 /* Accumulate section. */
1085 /* Check end of line. */
1088 msg (SE, _("Unterminated string constant."));
1092 /* Double quote characters to embed them in strings. */
1101 ds_put_char (&tokstr, *prog++);
1105 /* Skip whitespace after final quote mark. */
1110 while (isspace ((unsigned char) *prog))
1118 if (!lex_get_line ())
1122 /* Skip plus sign. */
1127 /* Skip whitespace after plus sign. */
1132 while (isspace ((unsigned char) *prog))
1140 if (!lex_get_line ())
1142 msg (SE, _("Unexpected end of file in string concatenation."));
1147 /* Ensure that a valid string follows. */
1148 if (*prog != '\'' && *prog != '"')
1150 msg (SE, _("String expected following `+'."));
1155 /* We come here when we've finished concatenating all the string sections
1156 into one large string. */
1158 if (type != CHARACTER_STRING)
1159 convert_numeric_string_to_char_string (type);
1161 if (ds_length (&tokstr) > 255)
1163 msg (SE, _("String exceeds 255 characters in length (%d characters)."),
1164 ds_length (&tokstr));
1165 ds_truncate (&tokstr, 255);
1173 for (i = 0; i < ds_length (&tokstr); i++)
1174 if (ds_cstr (&tokstr)[i] == 0)
1178 msg (SE, _("Sorry, literal strings may not contain null "
1179 "characters. Replacing with spaces."));
1182 ds_cstr (&tokstr)[i] = ' ';
1190 /* Reads one token from the lexer and writes a textual representation
1191 on stdout for debugging purposes. */
1199 getl_location (&curfn, &curln);
1201 fprintf (stderr, "%s:%d\t", curfn, curln);
1207 fprintf (stderr, "ID\t%s\n", tokid);
1212 fprintf (stderr, "NUM\t%f\n", tokval);
1216 fprintf (stderr, "STRING\t\"%s\"\n", ds_cstr (&tokstr));
1220 fprintf (stderr, "STOP\n");
1224 fprintf (stderr, "MISC\tEXP\"");
1228 fprintf (stderr, "MISC\tEOF\n");
1232 if (token >= T_FIRST_KEYWORD && token <= T_LAST_KEYWORD)
1233 fprintf (stderr, "KEYWORD\t%s\n", lex_token_name (token));
1235 fprintf (stderr, "PUNCT\t%c\n", token);
1239 #endif /* DUMP_TOKENS */