1 /* PSPP - computes sample statistics.
2 Copyright (C) 1997-9, 2000 Free Software Foundation, Inc.
3 Written by Ben Pfaff <blp@gnu.org>.
5 This program is free software; you can redistribute it and/or
6 modify it under the terms of the GNU General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 This program is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
22 #include <libpspp/message.h>
29 #include <libpspp/alloc.h>
30 #include <libpspp/assertion.h>
31 #include <language/command.h>
32 #include <libpspp/message.h>
33 #include <language/line-buffer.h>
34 #include <libpspp/magic.h>
35 #include <data/settings.h>
36 #include <libpspp/str.h>
41 #define _(msgid) gettext (msgid)
42 #define N_(msgid) msgid
52 /* T_POS_NUM, T_NEG_NUM: the token's value. */
55 /* T_ID: the identifier. */
56 char tokid[LONG_NAME_LEN + 1];
58 /* T_ID, T_STRING: token string value.
59 For T_ID, this is not truncated as is tokid. */
62 /* Static variables. */
64 /* Pointer to next token in getl_buf. */
67 /* True only if this line ends with a terminal dot. */
70 /* True only if the last token returned was T_STOP. */
73 /* If nonzero, next token returned by lex_get().
74 Used only in exceptional circumstances. */
76 static struct string put_tokstr;
77 static double put_tokval;
79 static int parse_id (void);
81 /* How a string represents its contents. */
84 CHARACTER_STRING, /* Characters. */
85 BINARY_STRING, /* Binary digits. */
86 OCTAL_STRING, /* Octal digits. */
87 HEX_STRING /* Hexadecimal digits. */
90 static int parse_string (enum string_type);
93 static void dump_token (void);
98 /* Initializes the lexer. */
102 ds_init_empty (&tokstr);
103 ds_init_empty (&put_tokstr);
104 if (!lex_get_line ())
111 ds_destroy (&put_tokstr);
112 ds_destroy (&tokstr);
116 /* Common functions. */
118 /* Copies put_token, put_tokstr, put_tokval into token, tokstr,
119 tokval, respectively, and sets tokid appropriately. */
123 assert (put_token != 0);
125 ds_assign_string (&tokstr, &put_tokstr);
126 str_copy_trunc (tokid, sizeof tokid, ds_cstr (&tokstr));
131 /* Copies token, tokstr, tokval into put_token, put_tokstr,
132 put_tokval respectively. */
137 ds_assign_string (&put_tokstr, &tokstr);
141 /* Parses a single token, setting appropriate global variables to
142 indicate the token's attributes. */
146 /* If a token was pushed ahead, return it. */
159 /* Skip whitespace. */
168 while (isspace ((unsigned char) *prog))
182 else if (!lex_get_line ())
203 /* Actually parse the token. */
209 case '0': case '1': case '2': case '3': case '4':
210 case '5': case '6': case '7': case '8': case '9':
214 /* `-' can introduce a negative number, or it can be a
215 token by itself. If it is not followed by a digit or a
216 decimal point, it is definitely not a number.
217 Otherwise, it might be either, but most of the time we
218 want it as a number. When the syntax calls for a `-'
219 token, lex_negative_to_dash() must be used to break
220 negative numbers into two tokens. */
223 ds_put_char (&tokstr, *prog++);
224 while (isspace ((unsigned char) *prog))
227 if (!isdigit ((unsigned char) *prog) && *prog != '.')
237 /* Parse the number, copying it into tokstr. */
238 while (isdigit ((unsigned char) *prog))
239 ds_put_char (&tokstr, *prog++);
242 ds_put_char (&tokstr, *prog++);
243 while (isdigit ((unsigned char) *prog))
244 ds_put_char (&tokstr, *prog++);
246 if (*prog == 'e' || *prog == 'E')
248 ds_put_char (&tokstr, *prog++);
249 if (*prog == '+' || *prog == '-')
250 ds_put_char (&tokstr, *prog++);
251 while (isdigit ((unsigned char) *prog))
252 ds_put_char (&tokstr, *prog++);
255 /* Parse as floating point. */
256 tokval = strtod (ds_cstr (&tokstr), &tail);
259 msg (SE, _("%s does not form a valid number."),
264 ds_put_char (&tokstr, '0');
271 token = parse_string (CHARACTER_STRING);
274 case '(': case ')': case ',': case '=': case '+': case '/':
294 else if (*prog == '>')
334 if (prog[1] == '\'' || prog[1] == '"')
335 token = parse_string (BINARY_STRING);
341 if (prog[1] == '\'' || prog[1] == '"')
342 token = parse_string (OCTAL_STRING);
348 if (prog[1] == '\'' || prog[1] == '"')
349 token = parse_string (HEX_STRING);
355 if (lex_is_id1 (*prog))
362 if (isgraph ((unsigned char) *prog))
363 msg (SE, _("Bad character in input: `%c'."), *prog++);
365 msg (SE, _("Bad character in input: `\\%o'."), *prog++);
377 /* Parses an identifier at the current position into tokid and
379 Returns the correct token type. */
383 const char *start = prog;
384 prog = lex_skip_identifier (start);
386 ds_put_substring (&tokstr, ss_buffer (start, prog - start));
387 str_copy_trunc (tokid, sizeof tokid, ds_cstr (&tokstr));
388 return lex_id_to_token (ds_cstr (&tokstr), ds_length (&tokstr));
391 /* Reports an error to the effect that subcommand SBC may only be
394 lex_sbc_only_once (const char *sbc)
396 msg (SE, _("Subcommand %s may only be specified once."), sbc);
399 /* Reports an error to the effect that subcommand SBC is
402 lex_sbc_missing (const char *sbc)
404 lex_error (_("missing required subcommand %s"), sbc);
407 /* Prints a syntax error message containing the current token and
408 given message MESSAGE (if non-null). */
410 lex_error (const char *message, ...)
415 token_rep = lex_token_representation ();
417 strcpy (where, "end of file");
418 else if (token == '.')
419 strcpy (where, "end of command");
421 snprintf (where, sizeof where, "`%s'", token_rep);
429 va_start (args, message);
430 vsnprintf (buf, 1024, message, args);
433 msg (SE, _("Syntax error %s at %s."), buf, where);
436 msg (SE, _("Syntax error at %s."), where);
439 /* Checks that we're at end of command.
440 If so, returns a successful command completion code.
441 If not, flags a syntax error and returns an error command
444 lex_end_of_command (void)
448 lex_error (_("expecting end of command"));
455 /* Token testing functions. */
457 /* Returns true if the current token is a number. */
461 return token == T_POS_NUM || token == T_NEG_NUM;
464 /* Returns the value of the current token, which must be a
465 floating point number. */
469 assert (lex_is_number ());
473 /* Returns true iff the current token is an integer. */
475 lex_is_integer (void)
477 return (lex_is_number ()
478 && tokval != NOT_LONG
479 && tokval >= LONG_MIN
480 && tokval <= LONG_MAX
481 && floor (tokval) == tokval);
484 /* Returns the value of the current token, which must be an
489 assert (lex_is_integer ());
493 /* Token matching functions. */
495 /* If TOK is the current token, skips it and returns true
496 Otherwise, returns false. */
509 /* If the current token is the identifier S, skips it and returns
510 true. The identifier may be abbreviated to its first three
512 Otherwise, returns false. */
514 lex_match_id (const char *s)
516 if (token == T_ID && lex_id_match (s, tokid))
525 /* If the current token is integer N, skips it and returns true.
526 Otherwise, returns false. */
528 lex_match_int (int x)
530 if (lex_is_integer () && lex_integer () == x)
539 /* Forced matches. */
541 /* If this token is identifier S, fetches the next token and returns
543 Otherwise, reports an error and returns zero. */
545 lex_force_match_id (const char *s)
547 if (token == T_ID && lex_id_match (s, tokid))
554 lex_error (_("expecting `%s'"), s);
559 /* If the current token is T, skips the token. Otherwise, reports an
560 error and returns from the current function with return value false. */
562 lex_force_match (int t)
571 lex_error (_("expecting `%s'"), lex_token_name (t));
576 /* If this token is a string, does nothing and returns true.
577 Otherwise, reports an error and returns false. */
579 lex_force_string (void)
581 if (token == T_STRING)
585 lex_error (_("expecting string"));
590 /* If this token is an integer, does nothing and returns true.
591 Otherwise, reports an error and returns false. */
595 if (lex_is_integer ())
599 lex_error (_("expecting integer"));
604 /* If this token is a number, does nothing and returns true.
605 Otherwise, reports an error and returns false. */
609 if (lex_is_number ())
613 lex_error (_("expecting number"));
618 /* If this token is an identifier, does nothing and returns true.
619 Otherwise, reports an error and returns false. */
627 lex_error (_("expecting identifier"));
631 /* Weird token functions. */
633 /* Returns the first character of the next token, except that if the
634 next token is not an identifier, the character returned will not be
635 a character that can begin an identifier. Specifically, the
636 hexstring lead-in X' causes lookahead() to return '. Note that an
637 alphanumeric return value doesn't guarantee an ID token, it could
638 also be a reserved-word token. */
640 lex_look_ahead (void)
652 while (isspace ((unsigned char) *prog))
659 else if (!lex_get_line ())
666 if ((toupper ((unsigned char) *prog) == 'X'
667 || toupper ((unsigned char) *prog) == 'B'
668 || toupper ((unsigned char) *prog) == 'O')
669 && (prog[1] == '\'' || prog[1] == '"'))
676 /* Makes the current token become the next token to be read; the
677 current token is set to T. */
685 /* Makes the current token become the next token to be read; the
686 current token is set to the identifier ID. */
688 lex_put_back_id (const char *id)
690 assert (lex_id_to_token (id, strlen (id)) == T_ID);
693 ds_assign_cstr (&tokstr, id);
694 str_copy_trunc (tokid, sizeof tokid, ds_cstr (&tokstr));
697 /* Weird line processing functions. */
699 /* Returns the entire contents of the current line. */
701 lex_entire_line (void)
703 return ds_cstr (&getl_buf);
706 /* As lex_entire_line(), but only returns the part of the current line
707 that hasn't already been tokenized.
708 If END_DOT is non-null, stores nonzero into *END_DOT if the line
709 ends with a terminal dot, or zero if it doesn't. */
711 lex_rest_of_line (int *end_dot)
718 /* Causes the rest of the current input line to be ignored for
719 tokenization purposes. */
721 lex_discard_line (void)
723 prog = ds_end (&getl_buf);
728 /* Sets the current position in the current line to P, which must be
731 lex_set_prog (char *p)
736 /* Discards the rest of the current command.
737 When we're reading commands from a file, we skip tokens until
738 a terminal dot or EOF.
739 When we're reading commands interactively from the user,
740 that's just discarding the current line, because presumably
741 the user doesn't want to finish typing a command that will be
744 lex_discard_rest_of_command (void)
746 if (!getl_is_interactive ())
748 while (token != T_STOP && token != '.')
755 /* Weird line reading functions. */
757 /* Remove C-style comments in STRING, begun by slash-star and
758 terminated by star-slash or newline. */
760 strip_comments (struct string *string)
768 for (cp = ds_cstr (string); *cp; )
770 /* If we're not in a comment, check for quote marks. */
775 else if (*cp == '\'' || *cp == '"')
779 /* If we're not inside a quotation, check for comment. */
782 if (cp[0] == '/' && cp[1] == '*')
789 else if (in_comment && cp[0] == '*' && cp[1] == '/')
798 /* Check commenting. */
805 /* Reads a line for use by the tokenizer, and preprocesses it by
806 removing comments, stripping trailing whitespace and the
807 terminal dot, and removing leading indentors. */
811 struct string *line = &getl_buf;
814 if (!getl_read_line (&interactive))
817 strip_comments (line);
818 ds_rtrim (line, ss_cstr (CC_SPACES));
820 /* Check for and remove terminal dot. */
821 dot = (ds_chomp (line, get_endcmd ())
822 || (ds_is_empty (line) && get_nulline ()));
824 /* Strip leading indentors or insert a terminal dot (unless the
825 line was obtained interactively). */
828 int first = ds_first (line);
830 if (first == '+' || first == '-')
831 *ds_data (line) = ' ';
832 else if (first != EOF && !isspace (first))
836 prog = ds_cstr (line);
843 /* Returns the name of a token in a static buffer. */
845 lex_token_name (int token)
847 if (token >= T_FIRST_KEYWORD && token <= T_LAST_KEYWORD)
848 return keywords[token - T_FIRST_KEYWORD];
860 /* Returns an ASCII representation of the current token as a
861 malloc()'d string. */
863 lex_token_representation (void)
872 return ds_xstrdup (&tokstr);
880 for (sp = ds_cstr (&tokstr); sp < ds_end (&tokstr); sp++)
881 if (!isprint ((unsigned char) *sp))
887 token_rep = xmalloc (2 + ds_length (&tokstr) * 2 + 1 + 1);
895 for (sp = ds_cstr (&tokstr); *sp; )
899 *dp++ = (unsigned char) *sp++;
902 for (sp = ds_cstr (&tokstr); sp < ds_end (&tokstr); sp++)
904 *dp++ = (((unsigned char) *sp) >> 4)["0123456789ABCDEF"];
905 *dp++ = (((unsigned char) *sp) & 15)["0123456789ABCDEF"];
915 token_rep = xmalloc (1);
920 return xstrdup ("**");
923 if (token >= T_FIRST_KEYWORD && token <= T_LAST_KEYWORD)
924 return xstrdup (keywords [token - T_FIRST_KEYWORD]);
927 token_rep = xmalloc (2);
928 token_rep[0] = token;
937 /* Really weird functions. */
939 /* Most of the time, a `-' is a lead-in to a negative number. But
940 sometimes it's actually part of the syntax. If a dash can be part
941 of syntax then this function is called to rip it off of a
944 lex_negative_to_dash (void)
946 if (token == T_NEG_NUM)
950 ds_assign_substring (&tokstr, ds_substr (&tokstr, 1, SIZE_MAX));
956 /* We're not at eof any more. */
963 /* Skip a COMMENT command. */
965 lex_skip_comment (void)
969 if (!lex_get_line ())
976 if (put_token == '.')
979 prog = ds_end (&getl_buf);
985 /* Private functions. */
987 /* When invoked, tokstr contains a string of binary, octal, or
988 hex digits, according to TYPE. The string is converted to
989 characters having the specified values. */
991 convert_numeric_string_to_char_string (enum string_type type)
993 const char *base_name;
1003 base_name = _("binary");
1008 base_name = _("octal");
1013 base_name = _("hex");
1021 byte_cnt = ds_length (&tokstr) / chars_per_byte;
1022 if (ds_length (&tokstr) % chars_per_byte)
1023 msg (SE, _("String of %s digits has %d characters, which is not a "
1025 base_name, ds_length (&tokstr), chars_per_byte);
1027 p = ds_cstr (&tokstr);
1028 for (i = 0; i < byte_cnt; i++)
1034 for (j = 0; j < chars_per_byte; j++, p++)
1038 if (*p >= '0' && *p <= '9')
1042 static const char alpha[] = "abcdef";
1043 const char *q = strchr (alpha, tolower ((unsigned char) *p));
1052 msg (SE, _("`%c' is not a valid %s digit."), *p, base_name);
1054 value = value * base + v;
1057 ds_cstr (&tokstr)[i] = (unsigned char) value;
1060 ds_truncate (&tokstr, byte_cnt);
1063 /* Parses a string from the input buffer into tokstr. The input
1064 buffer pointer prog must point to the initial single or double
1065 quote. TYPE indicates the type of string to be parsed.
1066 Returns token type. */
1068 parse_string (enum string_type type)
1070 /* Accumulate the entire string, joining sections indicated by +
1074 /* Single or double quote. */
1077 /* Accumulate section. */
1080 /* Check end of line. */
1083 msg (SE, _("Unterminated string constant."));
1087 /* Double quote characters to embed them in strings. */
1096 ds_put_char (&tokstr, *prog++);
1100 /* Skip whitespace after final quote mark. */
1105 while (isspace ((unsigned char) *prog))
1113 if (!lex_get_line ())
1117 /* Skip plus sign. */
1122 /* Skip whitespace after plus sign. */
1127 while (isspace ((unsigned char) *prog))
1135 if (!lex_get_line ())
1137 msg (SE, _("Unexpected end of file in string concatenation."));
1142 /* Ensure that a valid string follows. */
1143 if (*prog != '\'' && *prog != '"')
1145 msg (SE, _("String expected following `+'."));
1150 /* We come here when we've finished concatenating all the string sections
1151 into one large string. */
1153 if (type != CHARACTER_STRING)
1154 convert_numeric_string_to_char_string (type);
1156 if (ds_length (&tokstr) > 255)
1158 msg (SE, _("String exceeds 255 characters in length (%d characters)."),
1159 ds_length (&tokstr));
1160 ds_truncate (&tokstr, 255);
1168 for (i = 0; i < ds_length (&tokstr); i++)
1169 if (ds_cstr (&tokstr)[i] == 0)
1173 msg (SE, _("Sorry, literal strings may not contain null "
1174 "characters. Replacing with spaces."));
1177 ds_cstr (&tokstr)[i] = ' ';
1185 /* Reads one token from the lexer and writes a textual representation
1186 on stdout for debugging purposes. */
1194 getl_location (&curfn, &curln);
1196 fprintf (stderr, "%s:%d\t", curfn, curln);
1202 fprintf (stderr, "ID\t%s\n", tokid);
1207 fprintf (stderr, "NUM\t%f\n", tokval);
1211 fprintf (stderr, "STRING\t\"%s\"\n", ds_cstr (&tokstr));
1215 fprintf (stderr, "STOP\n");
1219 fprintf (stderr, "MISC\tEXP\"");
1223 fprintf (stderr, "MISC\tEOF\n");
1227 if (token >= T_FIRST_KEYWORD && token <= T_LAST_KEYWORD)
1228 fprintf (stderr, "KEYWORD\t%s\n", lex_token_name (token));
1230 fprintf (stderr, "PUNCT\t%c\n", token);
1234 #endif /* DUMP_TOKENS */