1 /* PSPP - computes sample statistics.
2 Copyright (C) 1997-9, 2000 Free Software Foundation, Inc.
3 Written by Ben Pfaff <blp@gnu.org>.
5 This program is free software; you can redistribute it and/or
6 modify it under the terms of the GNU General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 This program is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
22 #include <libpspp/message.h>
29 #include <libpspp/alloc.h>
30 #include <libpspp/assertion.h>
31 #include <language/command.h>
32 #include <libpspp/message.h>
33 #include <language/line-buffer.h>
34 #include <libpspp/magic.h>
35 #include <data/settings.h>
36 #include <libpspp/str.h>
41 #define _(msgid) gettext (msgid)
42 #define N_(msgid) msgid
52 /* T_POS_NUM, T_NEG_NUM: the token's value. */
55 /* T_ID: the identifier. */
56 char tokid[LONG_NAME_LEN + 1];
58 /* T_ID, T_STRING: token string value.
59 For T_ID, this is not truncated as is tokid. */
62 /* Static variables. */
64 /* Pointer to next token in line_buffer. */
67 /* True only if this line ends with a terminal dot. */
70 /* True only if the last token returned was T_STOP. */
73 /* If nonzero, next token returned by lex_get().
74 Used only in exceptional circumstances. */
76 static struct string put_tokstr;
77 static double put_tokval;
79 static int parse_id (void);
81 /* How a string represents its contents. */
84 CHARACTER_STRING, /* Characters. */
85 BINARY_STRING, /* Binary digits. */
86 OCTAL_STRING, /* Octal digits. */
87 HEX_STRING /* Hexadecimal digits. */
90 static int parse_string (enum string_type);
93 static void dump_token (void);
98 static struct string line_buffer;
100 static bool (*lex_read_line) (struct string *, bool *);
102 /* Initializes the lexer. */
104 lex_init (bool (*read_line_func) (struct string *, bool *))
106 ds_init_empty (&tokstr);
107 ds_init_empty (&put_tokstr);
108 ds_init_empty (&line_buffer);
109 lex_read_line = read_line_func;
111 if (!lex_get_line ())
118 ds_destroy (&put_tokstr);
119 ds_destroy (&tokstr);
120 ds_destroy (&line_buffer);
124 /* Common functions. */
126 /* Copies put_token, put_tokstr, put_tokval into token, tokstr,
127 tokval, respectively, and sets tokid appropriately. */
131 assert (put_token != 0);
133 ds_assign_string (&tokstr, &put_tokstr);
134 str_copy_trunc (tokid, sizeof tokid, ds_cstr (&tokstr));
139 /* Copies token, tokstr, tokval into put_token, put_tokstr,
140 put_tokval respectively. */
145 ds_assign_string (&put_tokstr, &tokstr);
149 /* Parses a single token, setting appropriate global variables to
150 indicate the token's attributes. */
154 /* If a token was pushed ahead, return it. */
167 /* Skip whitespace. */
176 while (isspace ((unsigned char) *prog))
190 else if (!lex_get_line ())
211 /* Actually parse the token. */
217 case '0': case '1': case '2': case '3': case '4':
218 case '5': case '6': case '7': case '8': case '9':
222 /* `-' can introduce a negative number, or it can be a
223 token by itself. If it is not followed by a digit or a
224 decimal point, it is definitely not a number.
225 Otherwise, it might be either, but most of the time we
226 want it as a number. When the syntax calls for a `-'
227 token, lex_negative_to_dash() must be used to break
228 negative numbers into two tokens. */
231 ds_put_char (&tokstr, *prog++);
232 while (isspace ((unsigned char) *prog))
235 if (!isdigit ((unsigned char) *prog) && *prog != '.')
245 /* Parse the number, copying it into tokstr. */
246 while (isdigit ((unsigned char) *prog))
247 ds_put_char (&tokstr, *prog++);
250 ds_put_char (&tokstr, *prog++);
251 while (isdigit ((unsigned char) *prog))
252 ds_put_char (&tokstr, *prog++);
254 if (*prog == 'e' || *prog == 'E')
256 ds_put_char (&tokstr, *prog++);
257 if (*prog == '+' || *prog == '-')
258 ds_put_char (&tokstr, *prog++);
259 while (isdigit ((unsigned char) *prog))
260 ds_put_char (&tokstr, *prog++);
263 /* Parse as floating point. */
264 tokval = strtod (ds_cstr (&tokstr), &tail);
267 msg (SE, _("%s does not form a valid number."),
272 ds_put_char (&tokstr, '0');
279 token = parse_string (CHARACTER_STRING);
282 case '(': case ')': case ',': case '=': case '+': case '/':
302 else if (*prog == '>')
342 if (prog[1] == '\'' || prog[1] == '"')
343 token = parse_string (BINARY_STRING);
349 if (prog[1] == '\'' || prog[1] == '"')
350 token = parse_string (OCTAL_STRING);
356 if (prog[1] == '\'' || prog[1] == '"')
357 token = parse_string (HEX_STRING);
363 if (lex_is_id1 (*prog))
370 if (isgraph ((unsigned char) *prog))
371 msg (SE, _("Bad character in input: `%c'."), *prog++);
373 msg (SE, _("Bad character in input: `\\%o'."), *prog++);
385 /* Parses an identifier at the current position into tokid and
387 Returns the correct token type. */
391 const char *start = prog;
392 prog = lex_skip_identifier (start);
394 ds_put_substring (&tokstr, ss_buffer (start, prog - start));
395 str_copy_trunc (tokid, sizeof tokid, ds_cstr (&tokstr));
396 return lex_id_to_token (ds_cstr (&tokstr), ds_length (&tokstr));
399 /* Reports an error to the effect that subcommand SBC may only be
402 lex_sbc_only_once (const char *sbc)
404 msg (SE, _("Subcommand %s may only be specified once."), sbc);
407 /* Reports an error to the effect that subcommand SBC is
410 lex_sbc_missing (const char *sbc)
412 lex_error (_("missing required subcommand %s"), sbc);
415 /* Prints a syntax error message containing the current token and
416 given message MESSAGE (if non-null). */
418 lex_error (const char *message, ...)
423 token_rep = lex_token_representation ();
425 strcpy (where, "end of file");
426 else if (token == '.')
427 strcpy (where, "end of command");
429 snprintf (where, sizeof where, "`%s'", token_rep);
437 va_start (args, message);
438 vsnprintf (buf, 1024, message, args);
441 msg (SE, _("Syntax error %s at %s."), buf, where);
444 msg (SE, _("Syntax error at %s."), where);
447 /* Checks that we're at end of command.
448 If so, returns a successful command completion code.
449 If not, flags a syntax error and returns an error command
452 lex_end_of_command (void)
456 lex_error (_("expecting end of command"));
463 /* Token testing functions. */
465 /* Returns true if the current token is a number. */
469 return token == T_POS_NUM || token == T_NEG_NUM;
472 /* Returns the value of the current token, which must be a
473 floating point number. */
477 assert (lex_is_number ());
481 /* Returns true iff the current token is an integer. */
483 lex_is_integer (void)
485 return (lex_is_number ()
486 && tokval != NOT_LONG
487 && tokval >= LONG_MIN
488 && tokval <= LONG_MAX
489 && floor (tokval) == tokval);
492 /* Returns the value of the current token, which must be an
497 assert (lex_is_integer ());
501 /* Token matching functions. */
503 /* If TOK is the current token, skips it and returns true
504 Otherwise, returns false. */
517 /* If the current token is the identifier S, skips it and returns
518 true. The identifier may be abbreviated to its first three
520 Otherwise, returns false. */
522 lex_match_id (const char *s)
524 if (token == T_ID && lex_id_match (s, tokid))
533 /* If the current token is integer N, skips it and returns true.
534 Otherwise, returns false. */
536 lex_match_int (int x)
538 if (lex_is_integer () && lex_integer () == x)
547 /* Forced matches. */
549 /* If this token is identifier S, fetches the next token and returns
551 Otherwise, reports an error and returns zero. */
553 lex_force_match_id (const char *s)
555 if (token == T_ID && lex_id_match (s, tokid))
562 lex_error (_("expecting `%s'"), s);
567 /* If the current token is T, skips the token. Otherwise, reports an
568 error and returns from the current function with return value false. */
570 lex_force_match (int t)
579 lex_error (_("expecting `%s'"), lex_token_name (t));
584 /* If this token is a string, does nothing and returns true.
585 Otherwise, reports an error and returns false. */
587 lex_force_string (void)
589 if (token == T_STRING)
593 lex_error (_("expecting string"));
598 /* If this token is an integer, does nothing and returns true.
599 Otherwise, reports an error and returns false. */
603 if (lex_is_integer ())
607 lex_error (_("expecting integer"));
612 /* If this token is a number, does nothing and returns true.
613 Otherwise, reports an error and returns false. */
617 if (lex_is_number ())
621 lex_error (_("expecting number"));
626 /* If this token is an identifier, does nothing and returns true.
627 Otherwise, reports an error and returns false. */
635 lex_error (_("expecting identifier"));
639 /* Weird token functions. */
641 /* Returns the first character of the next token, except that if the
642 next token is not an identifier, the character returned will not be
643 a character that can begin an identifier. Specifically, the
644 hexstring lead-in X' causes lookahead() to return '. Note that an
645 alphanumeric return value doesn't guarantee an ID token, it could
646 also be a reserved-word token. */
648 lex_look_ahead (void)
660 while (isspace ((unsigned char) *prog))
667 else if (!lex_get_line ())
674 if ((toupper ((unsigned char) *prog) == 'X'
675 || toupper ((unsigned char) *prog) == 'B'
676 || toupper ((unsigned char) *prog) == 'O')
677 && (prog[1] == '\'' || prog[1] == '"'))
684 /* Makes the current token become the next token to be read; the
685 current token is set to T. */
693 /* Makes the current token become the next token to be read; the
694 current token is set to the identifier ID. */
696 lex_put_back_id (const char *id)
698 assert (lex_id_to_token (id, strlen (id)) == T_ID);
701 ds_assign_cstr (&tokstr, id);
702 str_copy_trunc (tokid, sizeof tokid, ds_cstr (&tokstr));
705 /* Weird line processing functions. */
707 /* Returns the entire contents of the current line. */
709 lex_entire_line (void)
711 return ds_cstr (&line_buffer);
714 const struct string *
715 lex_entire_line_ds (void)
720 /* As lex_entire_line(), but only returns the part of the current line
721 that hasn't already been tokenized.
722 If END_DOT is non-null, stores nonzero into *END_DOT if the line
723 ends with a terminal dot, or zero if it doesn't. */
725 lex_rest_of_line (int *end_dot)
732 /* Causes the rest of the current input line to be ignored for
733 tokenization purposes. */
735 lex_discard_line (void)
737 ds_cstr (&line_buffer); /* Ensures ds_end points to something valid */
738 prog = ds_end (&line_buffer);
744 /* Discards the rest of the current command.
745 When we're reading commands from a file, we skip tokens until
746 a terminal dot or EOF.
747 When we're reading commands interactively from the user,
748 that's just discarding the current line, because presumably
749 the user doesn't want to finish typing a command that will be
752 lex_discard_rest_of_command (void)
754 if (!getl_is_interactive ())
756 while (token != T_STOP && token != '.')
763 /* Weird line reading functions. */
765 /* Remove C-style comments in STRING, begun by slash-star and
766 terminated by star-slash or newline. */
768 strip_comments (struct string *string)
776 for (cp = ds_cstr (string); *cp; )
778 /* If we're not in a comment, check for quote marks. */
783 else if (*cp == '\'' || *cp == '"')
787 /* If we're not inside a quotation, check for comment. */
790 if (cp[0] == '/' && cp[1] == '*')
797 else if (in_comment && cp[0] == '*' && cp[1] == '/')
806 /* Check commenting. */
813 /* Reads a line, without performing any preprocessing */
815 lex_get_line_raw (void)
818 return lex_read_line (&line_buffer, &dummy);
821 /* Reads a line for use by the tokenizer, and preprocesses it by
822 removing comments, stripping trailing whitespace and the
823 terminal dot, and removing leading indentors. */
827 struct string *line = &line_buffer;
830 if (!lex_read_line (line, &interactive))
833 strip_comments (line);
834 ds_rtrim (line, ss_cstr (CC_SPACES));
836 /* Check for and remove terminal dot. */
837 dot = (ds_chomp (line, get_endcmd ())
838 || (ds_is_empty (line) && get_nulline ()));
840 /* Strip leading indentors or insert a terminal dot (unless the
841 line was obtained interactively). */
844 int first = ds_first (line);
846 if (first == '+' || first == '-')
847 *ds_data (line) = ' ';
848 else if (first != EOF && !isspace (first))
852 prog = ds_cstr (line);
859 /* Returns the name of a token in a static buffer. */
861 lex_token_name (int token)
863 if (token >= T_FIRST_KEYWORD && token <= T_LAST_KEYWORD)
864 return keywords[token - T_FIRST_KEYWORD];
876 /* Returns an ASCII representation of the current token as a
877 malloc()'d string. */
879 lex_token_representation (void)
888 return ds_xstrdup (&tokstr);
896 for (sp = ds_cstr (&tokstr); sp < ds_end (&tokstr); sp++)
897 if (!isprint ((unsigned char) *sp))
903 token_rep = xmalloc (2 + ds_length (&tokstr) * 2 + 1 + 1);
911 for (sp = ds_cstr (&tokstr); *sp; )
915 *dp++ = (unsigned char) *sp++;
918 for (sp = ds_cstr (&tokstr); sp < ds_end (&tokstr); sp++)
920 *dp++ = (((unsigned char) *sp) >> 4)["0123456789ABCDEF"];
921 *dp++ = (((unsigned char) *sp) & 15)["0123456789ABCDEF"];
931 token_rep = xmalloc (1);
936 return xstrdup ("**");
939 if (token >= T_FIRST_KEYWORD && token <= T_LAST_KEYWORD)
940 return xstrdup (keywords [token - T_FIRST_KEYWORD]);
943 token_rep = xmalloc (2);
944 token_rep[0] = token;
953 /* Really weird functions. */
955 /* Most of the time, a `-' is a lead-in to a negative number. But
956 sometimes it's actually part of the syntax. If a dash can be part
957 of syntax then this function is called to rip it off of a
960 lex_negative_to_dash (void)
962 if (token == T_NEG_NUM)
966 ds_assign_substring (&tokstr, ds_substr (&tokstr, 1, SIZE_MAX));
972 /* We're not at eof any more. */
979 /* Skip a COMMENT command. */
981 lex_skip_comment (void)
985 if (!lex_get_line ())
992 if (put_token == '.')
995 ds_cstr (&line_buffer); /* Ensures ds_end will point to a valid char */
996 prog = ds_end (&line_buffer);
1002 /* Private functions. */
1004 /* When invoked, tokstr contains a string of binary, octal, or
1005 hex digits, according to TYPE. The string is converted to
1006 characters having the specified values. */
1008 convert_numeric_string_to_char_string (enum string_type type)
1010 const char *base_name;
1020 base_name = _("binary");
1025 base_name = _("octal");
1030 base_name = _("hex");
1038 byte_cnt = ds_length (&tokstr) / chars_per_byte;
1039 if (ds_length (&tokstr) % chars_per_byte)
1040 msg (SE, _("String of %s digits has %d characters, which is not a "
1042 base_name, ds_length (&tokstr), chars_per_byte);
1044 p = ds_cstr (&tokstr);
1045 for (i = 0; i < byte_cnt; i++)
1051 for (j = 0; j < chars_per_byte; j++, p++)
1055 if (*p >= '0' && *p <= '9')
1059 static const char alpha[] = "abcdef";
1060 const char *q = strchr (alpha, tolower ((unsigned char) *p));
1069 msg (SE, _("`%c' is not a valid %s digit."), *p, base_name);
1071 value = value * base + v;
1074 ds_cstr (&tokstr)[i] = (unsigned char) value;
1077 ds_truncate (&tokstr, byte_cnt);
1080 /* Parses a string from the input buffer into tokstr. The input
1081 buffer pointer prog must point to the initial single or double
1082 quote. TYPE indicates the type of string to be parsed.
1083 Returns token type. */
1085 parse_string (enum string_type type)
1087 if (type != CHARACTER_STRING)
1090 /* Accumulate the entire string, joining sections indicated by +
1094 /* Single or double quote. */
1097 /* Accumulate section. */
1100 /* Check end of line. */
1103 msg (SE, _("Unterminated string constant."));
1107 /* Double quote characters to embed them in strings. */
1116 ds_put_char (&tokstr, *prog++);
1120 /* Skip whitespace after final quote mark. */
1125 while (isspace ((unsigned char) *prog))
1133 if (!lex_get_line ())
1137 /* Skip plus sign. */
1142 /* Skip whitespace after plus sign. */
1147 while (isspace ((unsigned char) *prog))
1155 if (!lex_get_line ())
1157 msg (SE, _("Unexpected end of file in string concatenation."));
1162 /* Ensure that a valid string follows. */
1163 if (*prog != '\'' && *prog != '"')
1165 msg (SE, _("String expected following `+'."));
1170 /* We come here when we've finished concatenating all the string sections
1171 into one large string. */
1173 if (type != CHARACTER_STRING)
1174 convert_numeric_string_to_char_string (type);
1176 if (ds_length (&tokstr) > 255)
1178 msg (SE, _("String exceeds 255 characters in length (%d characters)."),
1179 ds_length (&tokstr));
1180 ds_truncate (&tokstr, 255);
1187 /* Reads one token from the lexer and writes a textual representation
1188 on stdout for debugging purposes. */
1196 getl_location (&curfn, &curln);
1198 fprintf (stderr, "%s:%d\t", curfn, curln);
1204 fprintf (stderr, "ID\t%s\n", tokid);
1209 fprintf (stderr, "NUM\t%f\n", tokval);
1213 fprintf (stderr, "STRING\t\"%s\"\n", ds_cstr (&tokstr));
1217 fprintf (stderr, "STOP\n");
1221 fprintf (stderr, "MISC\tEXP\"");
1225 fprintf (stderr, "MISC\tEOF\n");
1229 if (token >= T_FIRST_KEYWORD && token <= T_LAST_KEYWORD)
1230 fprintf (stderr, "KEYWORD\t%s\n", lex_token_name (token));
1232 fprintf (stderr, "PUNCT\t%c\n", token);
1236 #endif /* DUMP_TOKENS */