1 /* PSPP - computes sample statistics.
2 Copyright (C) 1997-9, 2000 Free Software Foundation, Inc.
3 Written by Ben Pfaff <blp@gnu.org>.
5 This program is free software; you can redistribute it and/or
6 modify it under the terms of the GNU General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 This program is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
22 #include <libpspp/message.h>
29 #include <libpspp/alloc.h>
30 #include <language/command.h>
31 #include <libpspp/message.h>
32 #include <language/line-buffer.h>
33 #include <libpspp/magic.h>
34 #include <data/settings.h>
35 #include <libpspp/str.h>
40 #define _(msgid) gettext (msgid)
41 #define N_(msgid) msgid
48 /* Global variables. */
50 extern const char *keywords[T_N_KEYWORDS + 1];
56 /* T_POS_NUM, T_NEG_NUM: the token's value. */
59 /* T_ID: the identifier. */
60 char tokid[LONG_NAME_LEN + 1];
62 /* T_ID, T_STRING: token string value.
63 For T_ID, this is not truncated as is tokid. */
66 /* Static variables. */
68 /* Pointer to next token in getl_buf. */
71 /* Nonzero only if this line ends with a terminal dot. */
74 /* Nonzero only if the last token returned was T_STOP. */
77 /* If nonzero, next token returned by lex_get().
78 Used only in exceptional circumstances. */
80 static struct string put_tokstr;
81 static double put_tokval;
83 static int parse_id (void);
85 /* How a string represents its contents. */
88 CHARACTER_STRING, /* Characters. */
89 BINARY_STRING, /* Binary digits. */
90 OCTAL_STRING, /* Octal digits. */
91 HEX_STRING /* Hexadecimal digits. */
94 static void convert_numeric_string_to_char_string (enum string_type);
95 static int parse_string (enum string_type);
98 static void dump_token (void);
101 /* Initialization. */
103 /* Initializes the lexer. */
107 ds_init (&tokstr, 64);
108 ds_init (&put_tokstr, 64);
109 if (!lex_get_line ())
116 ds_destroy (&put_tokstr);
117 ds_destroy (&tokstr);
121 /* Common functions. */
123 /* Copies put_token, put_tokstr, put_tokval into token, tokstr,
124 tokval, respectively, and sets tokid appropriately. */
128 assert (put_token != 0);
130 ds_assign_string (&tokstr, &put_tokstr);
131 str_copy_trunc (tokid, sizeof tokid, ds_c_str (&tokstr));
136 /* Copies token, tokstr, tokval into put_token, put_tokstr,
137 put_tokval respectively. */
142 ds_assign_string (&put_tokstr, &tokstr);
146 /* Parses a single token, setting appropriate global variables to
147 indicate the token's attributes. */
151 /* If a token was pushed ahead, return it. */
164 /* Skip whitespace. */
173 while (isspace ((unsigned char) *prog))
187 else if (!lex_get_line ())
208 /* Actually parse the token. */
214 case '0': case '1': case '2': case '3': case '4':
215 case '5': case '6': case '7': case '8': case '9':
219 /* `-' can introduce a negative number, or it can be a
220 token by itself. If it is not followed by a digit or a
221 decimal point, it is definitely not a number.
222 Otherwise, it might be either, but most of the time we
223 want it as a number. When the syntax calls for a `-'
224 token, lex_negative_to_dash() must be used to break
225 negative numbers into two tokens. */
228 ds_putc (&tokstr, *prog++);
229 while (isspace ((unsigned char) *prog))
232 if (!isdigit ((unsigned char) *prog) && *prog != '.')
242 /* Parse the number, copying it into tokstr. */
243 while (isdigit ((unsigned char) *prog))
244 ds_putc (&tokstr, *prog++);
247 ds_putc (&tokstr, *prog++);
248 while (isdigit ((unsigned char) *prog))
249 ds_putc (&tokstr, *prog++);
251 if (*prog == 'e' || *prog == 'E')
253 ds_putc (&tokstr, *prog++);
254 if (*prog == '+' || *prog == '-')
255 ds_putc (&tokstr, *prog++);
256 while (isdigit ((unsigned char) *prog))
257 ds_putc (&tokstr, *prog++);
260 /* Parse as floating point. */
261 tokval = strtod (ds_c_str (&tokstr), &tail);
264 msg (SE, _("%s does not form a valid number."),
269 ds_putc (&tokstr, '0');
276 token = parse_string (CHARACTER_STRING);
279 case '(': case ')': case ',': case '=': case '+': case '/':
299 else if (*prog == '>')
339 if (prog[1] == '\'' || prog[1] == '"')
340 token = parse_string (BINARY_STRING);
346 if (prog[1] == '\'' || prog[1] == '"')
347 token = parse_string (OCTAL_STRING);
353 if (prog[1] == '\'' || prog[1] == '"')
354 token = parse_string (HEX_STRING);
360 if (lex_is_id1 (*prog))
367 if (isgraph ((unsigned char) *prog))
368 msg (SE, _("Bad character in input: `%c'."), *prog++);
370 msg (SE, _("Bad character in input: `\\%o'."), *prog++);
382 /* Parses an identifier at the current position into tokid and
384 Returns the correct token type. */
388 const char *start = prog;
389 prog = lex_skip_identifier (start);
391 ds_concat (&tokstr, start, prog - start);
392 str_copy_trunc (tokid, sizeof tokid, ds_c_str (&tokstr));
393 return lex_id_to_token (ds_c_str (&tokstr), ds_length (&tokstr));
396 /* Reports an error to the effect that subcommand SBC may only be
399 lex_sbc_only_once (const char *sbc)
401 msg (SE, _("Subcommand %s may only be specified once."), sbc);
404 /* Reports an error to the effect that subcommand SBC is
407 lex_sbc_missing (const char *sbc)
409 lex_error (_("missing required subcommand %s"), sbc);
412 /* Prints a syntax error message containing the current token and
413 given message MESSAGE (if non-null). */
415 lex_error (const char *message, ...)
420 token_rep = lex_token_representation ();
422 strcpy (where, "end of file");
423 else if (token == '.')
424 strcpy (where, "end of command");
426 snprintf (where, sizeof where, "`%s'", token_rep);
434 va_start (args, message);
435 vsnprintf (buf, 1024, message, args);
438 msg (SE, _("Syntax error %s at %s."), buf, where);
441 msg (SE, _("Syntax error at %s."), where);
444 /* Checks that we're at end of command.
445 If so, returns a successful command completion code.
446 If not, flags a syntax error and returns an error command
449 lex_end_of_command (void)
453 lex_error (_("expecting end of command"));
454 return CMD_TRAILING_GARBAGE;
460 /* Token testing functions. */
462 /* Returns true if the current token is a number. */
466 return token == T_POS_NUM || token == T_NEG_NUM;
469 /* Returns the value of the current token, which must be a
470 floating point number. */
474 assert (lex_is_number ());
478 /* Returns true iff the current token is an integer. */
480 lex_is_integer (void)
482 return (lex_is_number ()
483 && tokval != NOT_LONG
484 && tokval >= LONG_MIN
485 && tokval <= LONG_MAX
486 && floor (tokval) == tokval);
489 /* Returns the value of the current token, which must be an
494 assert (lex_is_integer ());
498 /* Token matching functions. */
500 /* If TOK is the current token, skips it and returns nonzero.
501 Otherwise, returns zero. */
514 /* If the current token is the identifier S, skips it and returns
515 nonzero. The identifier may be abbreviated to its first three
517 Otherwise, returns zero. */
519 lex_match_id (const char *s)
521 if (token == T_ID && lex_id_match (s, tokid))
530 /* If the current token is integer N, skips it and returns nonzero.
531 Otherwise, returns zero. */
533 lex_match_int (int x)
535 if (lex_is_integer () && lex_integer () == x)
544 /* Forced matches. */
546 /* If this token is identifier S, fetches the next token and returns
548 Otherwise, reports an error and returns zero. */
550 lex_force_match_id (const char *s)
552 if (token == T_ID && lex_id_match (s, tokid))
559 lex_error (_("expecting `%s'"), s);
564 /* If the current token is T, skips the token. Otherwise, reports an
565 error and returns from the current function with return value 0. */
567 lex_force_match (int t)
576 lex_error (_("expecting `%s'"), lex_token_name (t));
581 /* If this token is a string, does nothing and returns nonzero.
582 Otherwise, reports an error and returns zero. */
584 lex_force_string (void)
586 if (token == T_STRING)
590 lex_error (_("expecting string"));
595 /* If this token is an integer, does nothing and returns nonzero.
596 Otherwise, reports an error and returns zero. */
600 if (lex_is_integer ())
604 lex_error (_("expecting integer"));
609 /* If this token is a number, does nothing and returns nonzero.
610 Otherwise, reports an error and returns zero. */
614 if (lex_is_number ())
618 lex_error (_("expecting number"));
623 /* If this token is an identifier, does nothing and returns nonzero.
624 Otherwise, reports an error and returns zero. */
632 lex_error (_("expecting identifier"));
636 /* Weird token functions. */
638 /* Returns the first character of the next token, except that if the
639 next token is not an identifier, the character returned will not be
640 a character that can begin an identifier. Specifically, the
641 hexstring lead-in X' causes lookahead() to return '. Note that an
642 alphanumeric return value doesn't guarantee an ID token, it could
643 also be a reserved-word token. */
645 lex_look_ahead (void)
657 while (isspace ((unsigned char) *prog))
664 else if (!lex_get_line ())
671 if ((toupper ((unsigned char) *prog) == 'X'
672 || toupper ((unsigned char) *prog) == 'B'
673 || toupper ((unsigned char) *prog) == 'O')
674 && (prog[1] == '\'' || prog[1] == '"'))
681 /* Makes the current token become the next token to be read; the
682 current token is set to T. */
690 /* Makes the current token become the next token to be read; the
691 current token is set to the identifier ID. */
693 lex_put_back_id (const char *id)
695 assert (lex_id_to_token (id, strlen (id)) == T_ID);
698 ds_assign_c_str (&tokstr, id);
699 str_copy_trunc (tokid, sizeof tokid, ds_c_str (&tokstr));
702 /* Weird line processing functions. */
704 /* Returns the entire contents of the current line. */
706 lex_entire_line (void)
708 return ds_c_str (&getl_buf);
711 /* As lex_entire_line(), but only returns the part of the current line
712 that hasn't already been tokenized.
713 If END_DOT is non-null, stores nonzero into *END_DOT if the line
714 ends with a terminal dot, or zero if it doesn't. */
716 lex_rest_of_line (int *end_dot)
723 /* Causes the rest of the current input line to be ignored for
724 tokenization purposes. */
726 lex_discard_line (void)
728 prog = ds_end (&getl_buf);
732 /* Sets the current position in the current line to P, which must be
735 lex_set_prog (char *p)
740 /* Weird line reading functions. */
742 /* Remove C-style comments in STRING, begun by slash-star and
743 terminated by star-slash or newline. */
745 strip_comments (struct string *string)
753 for (cp = ds_c_str (string); *cp; )
755 /* If we're not in a comment, check for quote marks. */
760 else if (*cp == '\'' || *cp == '"')
764 /* If we're not inside a quotation, check for comment. */
767 if (cp[0] == '/' && cp[1] == '*')
774 else if (in_comment && cp[0] == '*' && cp[1] == '/')
783 /* Check commenting. */
790 /* Reads a line for use by the tokenizer, and preprocesses it by
791 removing comments, stripping trailing whitespace and the
792 terminal dot, and removing leading indentors. */
796 struct string *line = &getl_buf;
799 if (!getl_read_line (&interactive))
802 strip_comments (line);
803 ds_rtrim_spaces (line);
805 /* Check for and remove terminal dot. */
806 dot = (ds_chomp (line, get_endcmd ())
807 || (ds_is_empty (line) && get_nulline ()));
809 /* Strip leading indentors or insert a terminal dot (unless the
810 line was obtained interactively). */
813 int first = ds_first (line);
815 if (first == '+' || first == '-')
816 *ds_data (line) = ' ';
817 else if (first != EOF && !isspace (first))
821 prog = ds_c_str (line);
828 /* Returns the name of a token in a static buffer. */
830 lex_token_name (int token)
832 if (token >= T_FIRST_KEYWORD && token <= T_LAST_KEYWORD)
833 return keywords[token - T_FIRST_KEYWORD];
845 /* Returns an ASCII representation of the current token as a
846 malloc()'d string. */
848 lex_token_representation (void)
857 return xstrdup (ds_c_str (&tokstr));
865 for (sp = ds_c_str (&tokstr); sp < ds_end (&tokstr); sp++)
866 if (!isprint ((unsigned char) *sp))
872 token_rep = xmalloc (2 + ds_length (&tokstr) * 2 + 1 + 1);
880 for (sp = ds_c_str (&tokstr); *sp; )
884 *dp++ = (unsigned char) *sp++;
887 for (sp = ds_c_str (&tokstr); sp < ds_end (&tokstr); sp++)
889 *dp++ = (((unsigned char) *sp) >> 4)["0123456789ABCDEF"];
890 *dp++ = (((unsigned char) *sp) & 15)["0123456789ABCDEF"];
900 token_rep = xmalloc (1);
905 return xstrdup ("**");
908 if (token >= T_FIRST_KEYWORD && token <= T_LAST_KEYWORD)
909 return xstrdup (keywords [token - T_FIRST_KEYWORD]);
912 token_rep = xmalloc (2);
913 token_rep[0] = token;
922 /* Really weird functions. */
924 /* Most of the time, a `-' is a lead-in to a negative number. But
925 sometimes it's actually part of the syntax. If a dash can be part
926 of syntax then this function is called to rip it off of a
929 lex_negative_to_dash (void)
931 if (token == T_NEG_NUM)
935 ds_assign_substring (&tokstr, &tokstr, 1, SIZE_MAX);
941 /* We're not at eof any more. */
948 /* Skip a COMMENT command. */
950 lex_skip_comment (void)
954 if (!lex_get_line ())
961 if (put_token == '.')
964 prog = ds_end (&getl_buf);
970 /* Private functions. */
972 /* When invoked, tokstr contains a string of binary, octal, or
973 hex digits, according to TYPE. The string is converted to
974 characters having the specified values. */
976 convert_numeric_string_to_char_string (enum string_type type)
978 const char *base_name;
988 base_name = _("binary");
993 base_name = _("octal");
998 base_name = _("hex");
1006 byte_cnt = ds_length (&tokstr) / chars_per_byte;
1007 if (ds_length (&tokstr) % chars_per_byte)
1008 msg (SE, _("String of %s digits has %d characters, which is not a "
1010 base_name, ds_length (&tokstr), chars_per_byte);
1012 p = ds_c_str (&tokstr);
1013 for (i = 0; i < byte_cnt; i++)
1019 for (j = 0; j < chars_per_byte; j++, p++)
1023 if (*p >= '0' && *p <= '9')
1027 static const char alpha[] = "abcdef";
1028 const char *q = strchr (alpha, tolower ((unsigned char) *p));
1037 msg (SE, _("`%c' is not a valid %s digit."), *p, base_name);
1039 value = value * base + v;
1042 ds_c_str (&tokstr)[i] = (unsigned char) value;
1045 ds_truncate (&tokstr, byte_cnt);
1048 /* Parses a string from the input buffer into tokstr. The input
1049 buffer pointer prog must point to the initial single or double
1050 quote. TYPE indicates the type of string to be parsed.
1051 Returns token type. */
1053 parse_string (enum string_type type)
1055 /* Accumulate the entire string, joining sections indicated by +
1059 /* Single or double quote. */
1062 /* Accumulate section. */
1065 /* Check end of line. */
1068 msg (SE, _("Unterminated string constant."));
1072 /* Double quote characters to embed them in strings. */
1081 ds_putc (&tokstr, *prog++);
1085 /* Skip whitespace after final quote mark. */
1090 while (isspace ((unsigned char) *prog))
1098 if (!lex_get_line ())
1102 /* Skip plus sign. */
1107 /* Skip whitespace after plus sign. */
1112 while (isspace ((unsigned char) *prog))
1120 if (!lex_get_line ())
1122 msg (SE, _("Unexpected end of file in string concatenation."));
1127 /* Ensure that a valid string follows. */
1128 if (*prog != '\'' && *prog != '"')
1130 msg (SE, _("String expected following `+'."));
1135 /* We come here when we've finished concatenating all the string sections
1136 into one large string. */
1138 if (type != CHARACTER_STRING)
1139 convert_numeric_string_to_char_string (type);
1141 if (ds_length (&tokstr) > 255)
1143 msg (SE, _("String exceeds 255 characters in length (%d characters)."),
1144 ds_length (&tokstr));
1145 ds_truncate (&tokstr, 255);
1153 for (i = 0; i < ds_length (&tokstr); i++)
1154 if (ds_c_str (&tokstr)[i] == 0)
1158 msg (SE, _("Sorry, literal strings may not contain null "
1159 "characters. Replacing with spaces."));
1162 ds_c_str (&tokstr)[i] = ' ';
1170 /* Reads one token from the lexer and writes a textual representation
1171 on stdout for debugging purposes. */
1179 getl_location (&curfn, &curln);
1181 fprintf (stderr, "%s:%d\t", curfn, curln);
1187 fprintf (stderr, "ID\t%s\n", tokid);
1192 fprintf (stderr, "NUM\t%f\n", tokval);
1196 fprintf (stderr, "STRING\t\"%s\"\n", ds_c_str (&tokstr));
1200 fprintf (stderr, "STOP\n");
1204 fprintf (stderr, "MISC\tEXP\"");
1208 fprintf (stderr, "MISC\tEOF\n");
1212 if (token >= T_FIRST_KEYWORD && token <= T_LAST_KEYWORD)
1213 fprintf (stderr, "KEYWORD\t%s\n", lex_token_name (token));
1215 fprintf (stderr, "PUNCT\t%c\n", token);
1219 #endif /* DUMP_TOKENS */