1 /* PSPP - computes sample statistics.
2 Copyright (C) 1997-9, 2000 Free Software Foundation, Inc.
3 Written by Ben Pfaff <blp@gnu.org>.
5 This program is free software; you can redistribute it and/or
6 modify it under the terms of the GNU General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 This program is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
22 #include <libpspp/message.h>
29 #include <libpspp/alloc.h>
30 #include <language/command.h>
31 #include <libpspp/message.h>
32 #include <language/line-buffer.h>
33 #include <libpspp/magic.h>
34 #include <data/settings.h>
35 #include <libpspp/str.h>
40 #define _(msgid) gettext (msgid)
41 #define N_(msgid) msgid
48 /* Global variables. */
50 extern const char *keywords[T_N_KEYWORDS + 1];
56 /* T_POS_NUM, T_NEG_NUM: the token's value. */
59 /* T_ID: the identifier. */
60 char tokid[LONG_NAME_LEN + 1];
62 /* T_ID, T_STRING: token string value.
63 For T_ID, this is not truncated as is tokid. */
66 /* Static variables. */
68 /* Pointer to next token in getl_buf. */
71 /* Nonzero only if this line ends with a terminal dot. */
74 /* Nonzero only if the last token returned was T_STOP. */
77 /* If nonzero, next token returned by lex_get().
78 Used only in exceptional circumstances. */
80 static struct string put_tokstr;
81 static double put_tokval;
83 static int parse_id (void);
85 /* How a string represents its contents. */
88 CHARACTER_STRING, /* Characters. */
89 BINARY_STRING, /* Binary digits. */
90 OCTAL_STRING, /* Octal digits. */
91 HEX_STRING /* Hexadecimal digits. */
94 static void convert_numeric_string_to_char_string (enum string_type);
95 static int parse_string (enum string_type);
98 static void dump_token (void);
101 /* Initialization. */
103 /* Initializes the lexer. */
107 ds_init (&tokstr, 64);
108 ds_init (&put_tokstr, 64);
109 if (!lex_get_line ())
116 ds_destroy (&put_tokstr);
117 ds_destroy (&tokstr);
121 /* Common functions. */
123 /* Copies put_token, put_tokstr, put_tokval into token, tokstr,
124 tokval, respectively, and sets tokid appropriately. */
128 assert (put_token != 0);
130 ds_assign_string (&tokstr, &put_tokstr);
131 str_copy_trunc (tokid, sizeof tokid, ds_c_str (&tokstr));
136 /* Copies token, tokstr, tokval into put_token, put_tokstr,
137 put_tokval respectively. */
142 ds_assign_string (&put_tokstr, &tokstr);
146 /* Parses a single token, setting appropriate global variables to
147 indicate the token's attributes. */
151 /* If a token was pushed ahead, return it. */
164 /* Skip whitespace. */
173 while (isspace ((unsigned char) *prog))
187 else if (!lex_get_line ())
208 /* Actually parse the token. */
214 case '0': case '1': case '2': case '3': case '4':
215 case '5': case '6': case '7': case '8': case '9':
219 /* `-' can introduce a negative number, or it can be a
220 token by itself. If it is not followed by a digit or a
221 decimal point, it is definitely not a number.
222 Otherwise, it might be either, but most of the time we
223 want it as a number. When the syntax calls for a `-'
224 token, lex_negative_to_dash() must be used to break
225 negative numbers into two tokens. */
228 ds_putc (&tokstr, *prog++);
229 while (isspace ((unsigned char) *prog))
232 if (!isdigit ((unsigned char) *prog) && *prog != '.')
242 /* Parse the number, copying it into tokstr. */
243 while (isdigit ((unsigned char) *prog))
244 ds_putc (&tokstr, *prog++);
247 ds_putc (&tokstr, *prog++);
248 while (isdigit ((unsigned char) *prog))
249 ds_putc (&tokstr, *prog++);
251 if (*prog == 'e' || *prog == 'E')
253 ds_putc (&tokstr, *prog++);
254 if (*prog == '+' || *prog == '-')
255 ds_putc (&tokstr, *prog++);
256 while (isdigit ((unsigned char) *prog))
257 ds_putc (&tokstr, *prog++);
260 /* Parse as floating point. */
261 tokval = strtod (ds_c_str (&tokstr), &tail);
264 msg (SE, _("%s does not form a valid number."),
269 ds_putc (&tokstr, '0');
276 token = parse_string (CHARACTER_STRING);
279 case '(': case ')': case ',': case '=': case '+': case '/':
299 else if (*prog == '>')
339 if (prog[1] == '\'' || prog[1] == '"')
340 token = parse_string (BINARY_STRING);
346 if (prog[1] == '\'' || prog[1] == '"')
347 token = parse_string (OCTAL_STRING);
353 if (prog[1] == '\'' || prog[1] == '"')
354 token = parse_string (HEX_STRING);
360 if (lex_is_id1 (*prog))
367 if (isgraph ((unsigned char) *prog))
368 msg (SE, _("Bad character in input: `%c'."), *prog++);
370 msg (SE, _("Bad character in input: `\\%o'."), *prog++);
382 /* Parses an identifier at the current position into tokid and
384 Returns the correct token type. */
388 const char *start = prog;
389 prog = lex_skip_identifier (start);
391 ds_concat (&tokstr, start, prog - start);
392 str_copy_trunc (tokid, sizeof tokid, ds_c_str (&tokstr));
393 return lex_id_to_token (ds_c_str (&tokstr), ds_length (&tokstr));
396 /* Reports an error to the effect that subcommand SBC may only be
399 lex_sbc_only_once (const char *sbc)
401 msg (SE, _("Subcommand %s may only be specified once."), sbc);
404 /* Reports an error to the effect that subcommand SBC is
407 lex_sbc_missing (const char *sbc)
409 lex_error (_("missing required subcommand %s"), sbc);
412 /* Prints a syntax error message containing the current token and
413 given message MESSAGE (if non-null). */
415 lex_error (const char *message, ...)
420 token_rep = lex_token_representation ();
422 strcpy (where, "end of file");
423 else if (token == '.')
424 strcpy (where, "end of command");
426 snprintf (where, sizeof where, "`%s'", token_rep);
434 va_start (args, message);
435 vsnprintf (buf, 1024, message, args);
438 msg (SE, _("Syntax error %s at %s."), buf, where);
441 msg (SE, _("Syntax error at %s."), where);
444 /* Checks that we're at end of command.
445 If so, returns a successful command completion code.
446 If not, flags a syntax error and returns an error command
449 lex_end_of_command (void)
453 lex_error (_("expecting end of command"));
460 /* Token testing functions. */
462 /* Returns true if the current token is a number. */
466 return token == T_POS_NUM || token == T_NEG_NUM;
469 /* Returns the value of the current token, which must be a
470 floating point number. */
474 assert (lex_is_number ());
478 /* Returns true iff the current token is an integer. */
480 lex_is_integer (void)
482 return (lex_is_number ()
483 && tokval != NOT_LONG
484 && tokval >= LONG_MIN
485 && tokval <= LONG_MAX
486 && floor (tokval) == tokval);
489 /* Returns the value of the current token, which must be an
494 assert (lex_is_integer ());
498 /* Token matching functions. */
500 /* If TOK is the current token, skips it and returns nonzero.
501 Otherwise, returns zero. */
514 /* If the current token is the identifier S, skips it and returns
515 nonzero. The identifier may be abbreviated to its first three
517 Otherwise, returns zero. */
519 lex_match_id (const char *s)
521 if (token == T_ID && lex_id_match (s, tokid))
530 /* If the current token is integer N, skips it and returns nonzero.
531 Otherwise, returns zero. */
533 lex_match_int (int x)
535 if (lex_is_integer () && lex_integer () == x)
544 /* Forced matches. */
546 /* If this token is identifier S, fetches the next token and returns
548 Otherwise, reports an error and returns zero. */
550 lex_force_match_id (const char *s)
552 if (token == T_ID && lex_id_match (s, tokid))
559 lex_error (_("expecting `%s'"), s);
564 /* If the current token is T, skips the token. Otherwise, reports an
565 error and returns from the current function with return value 0. */
567 lex_force_match (int t)
576 lex_error (_("expecting `%s'"), lex_token_name (t));
581 /* If this token is a string, does nothing and returns nonzero.
582 Otherwise, reports an error and returns zero. */
584 lex_force_string (void)
586 if (token == T_STRING)
590 lex_error (_("expecting string"));
595 /* If this token is an integer, does nothing and returns nonzero.
596 Otherwise, reports an error and returns zero. */
600 if (lex_is_integer ())
604 lex_error (_("expecting integer"));
609 /* If this token is a number, does nothing and returns nonzero.
610 Otherwise, reports an error and returns zero. */
614 if (lex_is_number ())
618 lex_error (_("expecting number"));
623 /* If this token is an identifier, does nothing and returns nonzero.
624 Otherwise, reports an error and returns zero. */
632 lex_error (_("expecting identifier"));
636 /* Weird token functions. */
638 /* Returns the first character of the next token, except that if the
639 next token is not an identifier, the character returned will not be
640 a character that can begin an identifier. Specifically, the
641 hexstring lead-in X' causes lookahead() to return '. Note that an
642 alphanumeric return value doesn't guarantee an ID token, it could
643 also be a reserved-word token. */
645 lex_look_ahead (void)
657 while (isspace ((unsigned char) *prog))
664 else if (!lex_get_line ())
671 if ((toupper ((unsigned char) *prog) == 'X'
672 || toupper ((unsigned char) *prog) == 'B'
673 || toupper ((unsigned char) *prog) == 'O')
674 && (prog[1] == '\'' || prog[1] == '"'))
681 /* Makes the current token become the next token to be read; the
682 current token is set to T. */
690 /* Makes the current token become the next token to be read; the
691 current token is set to the identifier ID. */
693 lex_put_back_id (const char *id)
695 assert (lex_id_to_token (id, strlen (id)) == T_ID);
698 ds_assign_c_str (&tokstr, id);
699 str_copy_trunc (tokid, sizeof tokid, ds_c_str (&tokstr));
702 /* Weird line processing functions. */
704 /* Returns the entire contents of the current line. */
706 lex_entire_line (void)
708 return ds_c_str (&getl_buf);
711 /* As lex_entire_line(), but only returns the part of the current line
712 that hasn't already been tokenized.
713 If END_DOT is non-null, stores nonzero into *END_DOT if the line
714 ends with a terminal dot, or zero if it doesn't. */
716 lex_rest_of_line (int *end_dot)
723 /* Causes the rest of the current input line to be ignored for
724 tokenization purposes. */
726 lex_discard_line (void)
728 prog = ds_end (&getl_buf);
732 /* Sets the current position in the current line to P, which must be
735 lex_set_prog (char *p)
740 /* Discards the rest of the current command.
741 When we're reading commands from a file, we skip tokens until
742 a terminal dot or EOF.
743 When we're reading commands interactively from the user,
744 that's just discarding the current line, because presumably
745 the user doesn't want to finish typing a command that will be
748 lex_discard_rest_of_command (void)
750 if (!getl_is_interactive ())
752 while (token != T_STOP && token != '.')
759 /* Weird line reading functions. */
761 /* Remove C-style comments in STRING, begun by slash-star and
762 terminated by star-slash or newline. */
764 strip_comments (struct string *string)
772 for (cp = ds_c_str (string); *cp; )
774 /* If we're not in a comment, check for quote marks. */
779 else if (*cp == '\'' || *cp == '"')
783 /* If we're not inside a quotation, check for comment. */
786 if (cp[0] == '/' && cp[1] == '*')
793 else if (in_comment && cp[0] == '*' && cp[1] == '/')
802 /* Check commenting. */
809 /* Reads a line for use by the tokenizer, and preprocesses it by
810 removing comments, stripping trailing whitespace and the
811 terminal dot, and removing leading indentors. */
815 struct string *line = &getl_buf;
818 if (!getl_read_line (&interactive))
821 strip_comments (line);
822 ds_rtrim_spaces (line);
824 /* Check for and remove terminal dot. */
825 dot = (ds_chomp (line, get_endcmd ())
826 || (ds_is_empty (line) && get_nulline ()));
828 /* Strip leading indentors or insert a terminal dot (unless the
829 line was obtained interactively). */
832 int first = ds_first (line);
834 if (first == '+' || first == '-')
835 *ds_data (line) = ' ';
836 else if (first != EOF && !isspace (first))
840 prog = ds_c_str (line);
847 /* Returns the name of a token in a static buffer. */
849 lex_token_name (int token)
851 if (token >= T_FIRST_KEYWORD && token <= T_LAST_KEYWORD)
852 return keywords[token - T_FIRST_KEYWORD];
864 /* Returns an ASCII representation of the current token as a
865 malloc()'d string. */
867 lex_token_representation (void)
876 return xstrdup (ds_c_str (&tokstr));
884 for (sp = ds_c_str (&tokstr); sp < ds_end (&tokstr); sp++)
885 if (!isprint ((unsigned char) *sp))
891 token_rep = xmalloc (2 + ds_length (&tokstr) * 2 + 1 + 1);
899 for (sp = ds_c_str (&tokstr); *sp; )
903 *dp++ = (unsigned char) *sp++;
906 for (sp = ds_c_str (&tokstr); sp < ds_end (&tokstr); sp++)
908 *dp++ = (((unsigned char) *sp) >> 4)["0123456789ABCDEF"];
909 *dp++ = (((unsigned char) *sp) & 15)["0123456789ABCDEF"];
919 token_rep = xmalloc (1);
924 return xstrdup ("**");
927 if (token >= T_FIRST_KEYWORD && token <= T_LAST_KEYWORD)
928 return xstrdup (keywords [token - T_FIRST_KEYWORD]);
931 token_rep = xmalloc (2);
932 token_rep[0] = token;
941 /* Really weird functions. */
943 /* Most of the time, a `-' is a lead-in to a negative number. But
944 sometimes it's actually part of the syntax. If a dash can be part
945 of syntax then this function is called to rip it off of a
948 lex_negative_to_dash (void)
950 if (token == T_NEG_NUM)
954 ds_assign_substring (&tokstr, &tokstr, 1, SIZE_MAX);
960 /* We're not at eof any more. */
967 /* Skip a COMMENT command. */
969 lex_skip_comment (void)
973 if (!lex_get_line ())
980 if (put_token == '.')
983 prog = ds_end (&getl_buf);
989 /* Private functions. */
991 /* When invoked, tokstr contains a string of binary, octal, or
992 hex digits, according to TYPE. The string is converted to
993 characters having the specified values. */
995 convert_numeric_string_to_char_string (enum string_type type)
997 const char *base_name;
1007 base_name = _("binary");
1012 base_name = _("octal");
1017 base_name = _("hex");
1025 byte_cnt = ds_length (&tokstr) / chars_per_byte;
1026 if (ds_length (&tokstr) % chars_per_byte)
1027 msg (SE, _("String of %s digits has %d characters, which is not a "
1029 base_name, ds_length (&tokstr), chars_per_byte);
1031 p = ds_c_str (&tokstr);
1032 for (i = 0; i < byte_cnt; i++)
1038 for (j = 0; j < chars_per_byte; j++, p++)
1042 if (*p >= '0' && *p <= '9')
1046 static const char alpha[] = "abcdef";
1047 const char *q = strchr (alpha, tolower ((unsigned char) *p));
1056 msg (SE, _("`%c' is not a valid %s digit."), *p, base_name);
1058 value = value * base + v;
1061 ds_c_str (&tokstr)[i] = (unsigned char) value;
1064 ds_truncate (&tokstr, byte_cnt);
1067 /* Parses a string from the input buffer into tokstr. The input
1068 buffer pointer prog must point to the initial single or double
1069 quote. TYPE indicates the type of string to be parsed.
1070 Returns token type. */
1072 parse_string (enum string_type type)
1074 /* Accumulate the entire string, joining sections indicated by +
1078 /* Single or double quote. */
1081 /* Accumulate section. */
1084 /* Check end of line. */
1087 msg (SE, _("Unterminated string constant."));
1091 /* Double quote characters to embed them in strings. */
1100 ds_putc (&tokstr, *prog++);
1104 /* Skip whitespace after final quote mark. */
1109 while (isspace ((unsigned char) *prog))
1117 if (!lex_get_line ())
1121 /* Skip plus sign. */
1126 /* Skip whitespace after plus sign. */
1131 while (isspace ((unsigned char) *prog))
1139 if (!lex_get_line ())
1141 msg (SE, _("Unexpected end of file in string concatenation."));
1146 /* Ensure that a valid string follows. */
1147 if (*prog != '\'' && *prog != '"')
1149 msg (SE, _("String expected following `+'."));
1154 /* We come here when we've finished concatenating all the string sections
1155 into one large string. */
1157 if (type != CHARACTER_STRING)
1158 convert_numeric_string_to_char_string (type);
1160 if (ds_length (&tokstr) > 255)
1162 msg (SE, _("String exceeds 255 characters in length (%d characters)."),
1163 ds_length (&tokstr));
1164 ds_truncate (&tokstr, 255);
1172 for (i = 0; i < ds_length (&tokstr); i++)
1173 if (ds_c_str (&tokstr)[i] == 0)
1177 msg (SE, _("Sorry, literal strings may not contain null "
1178 "characters. Replacing with spaces."));
1181 ds_c_str (&tokstr)[i] = ' ';
1189 /* Reads one token from the lexer and writes a textual representation
1190 on stdout for debugging purposes. */
1198 getl_location (&curfn, &curln);
1200 fprintf (stderr, "%s:%d\t", curfn, curln);
1206 fprintf (stderr, "ID\t%s\n", tokid);
1211 fprintf (stderr, "NUM\t%f\n", tokval);
1215 fprintf (stderr, "STRING\t\"%s\"\n", ds_c_str (&tokstr));
1219 fprintf (stderr, "STOP\n");
1223 fprintf (stderr, "MISC\tEXP\"");
1227 fprintf (stderr, "MISC\tEOF\n");
1231 if (token >= T_FIRST_KEYWORD && token <= T_LAST_KEYWORD)
1232 fprintf (stderr, "KEYWORD\t%s\n", lex_token_name (token));
1234 fprintf (stderr, "PUNCT\t%c\n", token);
1238 #endif /* DUMP_TOKENS */