1 /* PSPP - computes sample statistics.
2 Copyright (C) 1997-9, 2000 Free Software Foundation, Inc.
3 Written by Ben Pfaff <blp@gnu.org>.
5 This program is free software; you can redistribute it and/or
6 modify it under the terms of the GNU General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 This program is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
22 #include <libpspp/message.h>
29 #include <libpspp/alloc.h>
30 #include <language/command.h>
31 #include <libpspp/message.h>
32 #include <language/line-buffer.h>
33 #include <libpspp/magic.h>
34 #include <data/settings.h>
35 #include <libpspp/str.h>
40 #define _(msgid) gettext (msgid)
41 #define N_(msgid) msgid
48 /* Global variables. */
50 extern const char *keywords[T_N_KEYWORDS + 1];
56 /* T_POS_NUM, T_NEG_NUM: the token's value. */
59 /* T_ID: the identifier. */
60 char tokid[LONG_NAME_LEN + 1];
62 /* T_ID, T_STRING: token string value.
63 For T_ID, this is not truncated as is tokid. */
66 /* Static variables. */
68 /* Pointer to next token in getl_buf. */
71 /* Nonzero only if this line ends with a terminal dot. */
74 /* Nonzero only if the last token returned was T_STOP. */
77 /* If nonzero, next token returned by lex_get().
78 Used only in exceptional circumstances. */
80 static struct string put_tokstr;
81 static double put_tokval;
83 static int parse_id (void);
85 /* How a string represents its contents. */
88 CHARACTER_STRING, /* Characters. */
89 BINARY_STRING, /* Binary digits. */
90 OCTAL_STRING, /* Octal digits. */
91 HEX_STRING /* Hexadecimal digits. */
94 static int parse_string (enum string_type);
97 static void dump_token (void);
100 /* Initialization. */
102 /* Initializes the lexer. */
106 ds_init_empty (&tokstr);
107 ds_init_empty (&put_tokstr);
108 if (!lex_get_line ())
115 ds_destroy (&put_tokstr);
116 ds_destroy (&tokstr);
120 /* Common functions. */
122 /* Copies put_token, put_tokstr, put_tokval into token, tokstr,
123 tokval, respectively, and sets tokid appropriately. */
127 assert (put_token != 0);
129 ds_assign_string (&tokstr, &put_tokstr);
130 str_copy_trunc (tokid, sizeof tokid, ds_cstr (&tokstr));
135 /* Copies token, tokstr, tokval into put_token, put_tokstr,
136 put_tokval respectively. */
141 ds_assign_string (&put_tokstr, &tokstr);
145 /* Parses a single token, setting appropriate global variables to
146 indicate the token's attributes. */
150 /* If a token was pushed ahead, return it. */
163 /* Skip whitespace. */
172 while (isspace ((unsigned char) *prog))
186 else if (!lex_get_line ())
207 /* Actually parse the token. */
213 case '0': case '1': case '2': case '3': case '4':
214 case '5': case '6': case '7': case '8': case '9':
218 /* `-' can introduce a negative number, or it can be a
219 token by itself. If it is not followed by a digit or a
220 decimal point, it is definitely not a number.
221 Otherwise, it might be either, but most of the time we
222 want it as a number. When the syntax calls for a `-'
223 token, lex_negative_to_dash() must be used to break
224 negative numbers into two tokens. */
227 ds_put_char (&tokstr, *prog++);
228 while (isspace ((unsigned char) *prog))
231 if (!isdigit ((unsigned char) *prog) && *prog != '.')
241 /* Parse the number, copying it into tokstr. */
242 while (isdigit ((unsigned char) *prog))
243 ds_put_char (&tokstr, *prog++);
246 ds_put_char (&tokstr, *prog++);
247 while (isdigit ((unsigned char) *prog))
248 ds_put_char (&tokstr, *prog++);
250 if (*prog == 'e' || *prog == 'E')
252 ds_put_char (&tokstr, *prog++);
253 if (*prog == '+' || *prog == '-')
254 ds_put_char (&tokstr, *prog++);
255 while (isdigit ((unsigned char) *prog))
256 ds_put_char (&tokstr, *prog++);
259 /* Parse as floating point. */
260 tokval = strtod (ds_cstr (&tokstr), &tail);
263 msg (SE, _("%s does not form a valid number."),
268 ds_put_char (&tokstr, '0');
275 token = parse_string (CHARACTER_STRING);
278 case '(': case ')': case ',': case '=': case '+': case '/':
298 else if (*prog == '>')
338 if (prog[1] == '\'' || prog[1] == '"')
339 token = parse_string (BINARY_STRING);
345 if (prog[1] == '\'' || prog[1] == '"')
346 token = parse_string (OCTAL_STRING);
352 if (prog[1] == '\'' || prog[1] == '"')
353 token = parse_string (HEX_STRING);
359 if (lex_is_id1 (*prog))
366 if (isgraph ((unsigned char) *prog))
367 msg (SE, _("Bad character in input: `%c'."), *prog++);
369 msg (SE, _("Bad character in input: `\\%o'."), *prog++);
381 /* Parses an identifier at the current position into tokid and
383 Returns the correct token type. */
387 const char *start = prog;
388 prog = lex_skip_identifier (start);
390 ds_put_substring (&tokstr, ss_buffer (start, prog - start));
391 str_copy_trunc (tokid, sizeof tokid, ds_cstr (&tokstr));
392 return lex_id_to_token (ds_cstr (&tokstr), ds_length (&tokstr));
395 /* Reports an error to the effect that subcommand SBC may only be
398 lex_sbc_only_once (const char *sbc)
400 msg (SE, _("Subcommand %s may only be specified once."), sbc);
403 /* Reports an error to the effect that subcommand SBC is
406 lex_sbc_missing (const char *sbc)
408 lex_error (_("missing required subcommand %s"), sbc);
411 /* Prints a syntax error message containing the current token and
412 given message MESSAGE (if non-null). */
414 lex_error (const char *message, ...)
419 token_rep = lex_token_representation ();
421 strcpy (where, "end of file");
422 else if (token == '.')
423 strcpy (where, "end of command");
425 snprintf (where, sizeof where, "`%s'", token_rep);
433 va_start (args, message);
434 vsnprintf (buf, 1024, message, args);
437 msg (SE, _("Syntax error %s at %s."), buf, where);
440 msg (SE, _("Syntax error at %s."), where);
443 /* Checks that we're at end of command.
444 If so, returns a successful command completion code.
445 If not, flags a syntax error and returns an error command
448 lex_end_of_command (void)
452 lex_error (_("expecting end of command"));
459 /* Token testing functions. */
461 /* Returns true if the current token is a number. */
465 return token == T_POS_NUM || token == T_NEG_NUM;
468 /* Returns the value of the current token, which must be a
469 floating point number. */
473 assert (lex_is_number ());
477 /* Returns true iff the current token is an integer. */
479 lex_is_integer (void)
481 return (lex_is_number ()
482 && tokval != NOT_LONG
483 && tokval >= LONG_MIN
484 && tokval <= LONG_MAX
485 && floor (tokval) == tokval);
488 /* Returns the value of the current token, which must be an
493 assert (lex_is_integer ());
497 /* Token matching functions. */
499 /* If TOK is the current token, skips it and returns nonzero.
500 Otherwise, returns zero. */
513 /* If the current token is the identifier S, skips it and returns
514 nonzero. The identifier may be abbreviated to its first three
516 Otherwise, returns zero. */
518 lex_match_id (const char *s)
520 if (token == T_ID && lex_id_match (s, tokid))
529 /* If the current token is integer N, skips it and returns nonzero.
530 Otherwise, returns zero. */
532 lex_match_int (int x)
534 if (lex_is_integer () && lex_integer () == x)
543 /* Forced matches. */
545 /* If this token is identifier S, fetches the next token and returns
547 Otherwise, reports an error and returns zero. */
549 lex_force_match_id (const char *s)
551 if (token == T_ID && lex_id_match (s, tokid))
558 lex_error (_("expecting `%s'"), s);
563 /* If the current token is T, skips the token. Otherwise, reports an
564 error and returns from the current function with return value 0. */
566 lex_force_match (int t)
575 lex_error (_("expecting `%s'"), lex_token_name (t));
580 /* If this token is a string, does nothing and returns nonzero.
581 Otherwise, reports an error and returns zero. */
583 lex_force_string (void)
585 if (token == T_STRING)
589 lex_error (_("expecting string"));
594 /* If this token is an integer, does nothing and returns nonzero.
595 Otherwise, reports an error and returns zero. */
599 if (lex_is_integer ())
603 lex_error (_("expecting integer"));
608 /* If this token is a number, does nothing and returns nonzero.
609 Otherwise, reports an error and returns zero. */
613 if (lex_is_number ())
617 lex_error (_("expecting number"));
622 /* If this token is an identifier, does nothing and returns nonzero.
623 Otherwise, reports an error and returns zero. */
631 lex_error (_("expecting identifier"));
635 /* Weird token functions. */
637 /* Returns the first character of the next token, except that if the
638 next token is not an identifier, the character returned will not be
639 a character that can begin an identifier. Specifically, the
640 hexstring lead-in X' causes lookahead() to return '. Note that an
641 alphanumeric return value doesn't guarantee an ID token, it could
642 also be a reserved-word token. */
644 lex_look_ahead (void)
656 while (isspace ((unsigned char) *prog))
663 else if (!lex_get_line ())
670 if ((toupper ((unsigned char) *prog) == 'X'
671 || toupper ((unsigned char) *prog) == 'B'
672 || toupper ((unsigned char) *prog) == 'O')
673 && (prog[1] == '\'' || prog[1] == '"'))
680 /* Makes the current token become the next token to be read; the
681 current token is set to T. */
689 /* Makes the current token become the next token to be read; the
690 current token is set to the identifier ID. */
692 lex_put_back_id (const char *id)
694 assert (lex_id_to_token (id, strlen (id)) == T_ID);
697 ds_assign_cstr (&tokstr, id);
698 str_copy_trunc (tokid, sizeof tokid, ds_cstr (&tokstr));
701 /* Weird line processing functions. */
703 /* Returns the entire contents of the current line. */
705 lex_entire_line (void)
707 return ds_cstr (&getl_buf);
710 /* As lex_entire_line(), but only returns the part of the current line
711 that hasn't already been tokenized.
712 If END_DOT is non-null, stores nonzero into *END_DOT if the line
713 ends with a terminal dot, or zero if it doesn't. */
715 lex_rest_of_line (int *end_dot)
722 /* Causes the rest of the current input line to be ignored for
723 tokenization purposes. */
725 lex_discard_line (void)
727 prog = ds_end (&getl_buf);
731 /* Sets the current position in the current line to P, which must be
734 lex_set_prog (char *p)
739 /* Discards the rest of the current command.
740 When we're reading commands from a file, we skip tokens until
741 a terminal dot or EOF.
742 When we're reading commands interactively from the user,
743 that's just discarding the current line, because presumably
744 the user doesn't want to finish typing a command that will be
747 lex_discard_rest_of_command (void)
749 if (!getl_is_interactive ())
751 while (token != T_STOP && token != '.')
758 /* Weird line reading functions. */
760 /* Remove C-style comments in STRING, begun by slash-star and
761 terminated by star-slash or newline. */
763 strip_comments (struct string *string)
771 for (cp = ds_cstr (string); *cp; )
773 /* If we're not in a comment, check for quote marks. */
778 else if (*cp == '\'' || *cp == '"')
782 /* If we're not inside a quotation, check for comment. */
785 if (cp[0] == '/' && cp[1] == '*')
792 else if (in_comment && cp[0] == '*' && cp[1] == '/')
801 /* Check commenting. */
808 /* Reads a line for use by the tokenizer, and preprocesses it by
809 removing comments, stripping trailing whitespace and the
810 terminal dot, and removing leading indentors. */
814 struct string *line = &getl_buf;
817 if (!getl_read_line (&interactive))
820 strip_comments (line);
821 ds_rtrim (line, ss_cstr (CC_SPACES));
823 /* Check for and remove terminal dot. */
824 dot = (ds_chomp (line, get_endcmd ())
825 || (ds_is_empty (line) && get_nulline ()));
827 /* Strip leading indentors or insert a terminal dot (unless the
828 line was obtained interactively). */
831 int first = ds_first (line);
833 if (first == '+' || first == '-')
834 *ds_data (line) = ' ';
835 else if (first != EOF && !isspace (first))
839 prog = ds_cstr (line);
846 /* Returns the name of a token in a static buffer. */
848 lex_token_name (int token)
850 if (token >= T_FIRST_KEYWORD && token <= T_LAST_KEYWORD)
851 return keywords[token - T_FIRST_KEYWORD];
863 /* Returns an ASCII representation of the current token as a
864 malloc()'d string. */
866 lex_token_representation (void)
875 return ds_xstrdup (&tokstr);
883 for (sp = ds_cstr (&tokstr); sp < ds_end (&tokstr); sp++)
884 if (!isprint ((unsigned char) *sp))
890 token_rep = xmalloc (2 + ds_length (&tokstr) * 2 + 1 + 1);
898 for (sp = ds_cstr (&tokstr); *sp; )
902 *dp++ = (unsigned char) *sp++;
905 for (sp = ds_cstr (&tokstr); sp < ds_end (&tokstr); sp++)
907 *dp++ = (((unsigned char) *sp) >> 4)["0123456789ABCDEF"];
908 *dp++ = (((unsigned char) *sp) & 15)["0123456789ABCDEF"];
918 token_rep = xmalloc (1);
923 return xstrdup ("**");
926 if (token >= T_FIRST_KEYWORD && token <= T_LAST_KEYWORD)
927 return xstrdup (keywords [token - T_FIRST_KEYWORD]);
930 token_rep = xmalloc (2);
931 token_rep[0] = token;
940 /* Really weird functions. */
942 /* Most of the time, a `-' is a lead-in to a negative number. But
943 sometimes it's actually part of the syntax. If a dash can be part
944 of syntax then this function is called to rip it off of a
947 lex_negative_to_dash (void)
949 if (token == T_NEG_NUM)
953 ds_assign_substring (&tokstr, ds_substr (&tokstr, 1, SIZE_MAX));
959 /* We're not at eof any more. */
966 /* Skip a COMMENT command. */
968 lex_skip_comment (void)
972 if (!lex_get_line ())
979 if (put_token == '.')
982 prog = ds_end (&getl_buf);
988 /* Private functions. */
990 /* When invoked, tokstr contains a string of binary, octal, or
991 hex digits, according to TYPE. The string is converted to
992 characters having the specified values. */
994 convert_numeric_string_to_char_string (enum string_type type)
996 const char *base_name;
1006 base_name = _("binary");
1011 base_name = _("octal");
1016 base_name = _("hex");
1024 byte_cnt = ds_length (&tokstr) / chars_per_byte;
1025 if (ds_length (&tokstr) % chars_per_byte)
1026 msg (SE, _("String of %s digits has %d characters, which is not a "
1028 base_name, ds_length (&tokstr), chars_per_byte);
1030 p = ds_cstr (&tokstr);
1031 for (i = 0; i < byte_cnt; i++)
1037 for (j = 0; j < chars_per_byte; j++, p++)
1041 if (*p >= '0' && *p <= '9')
1045 static const char alpha[] = "abcdef";
1046 const char *q = strchr (alpha, tolower ((unsigned char) *p));
1055 msg (SE, _("`%c' is not a valid %s digit."), *p, base_name);
1057 value = value * base + v;
1060 ds_cstr (&tokstr)[i] = (unsigned char) value;
1063 ds_truncate (&tokstr, byte_cnt);
1066 /* Parses a string from the input buffer into tokstr. The input
1067 buffer pointer prog must point to the initial single or double
1068 quote. TYPE indicates the type of string to be parsed.
1069 Returns token type. */
1071 parse_string (enum string_type type)
1073 /* Accumulate the entire string, joining sections indicated by +
1077 /* Single or double quote. */
1080 /* Accumulate section. */
1083 /* Check end of line. */
1086 msg (SE, _("Unterminated string constant."));
1090 /* Double quote characters to embed them in strings. */
1099 ds_put_char (&tokstr, *prog++);
1103 /* Skip whitespace after final quote mark. */
1108 while (isspace ((unsigned char) *prog))
1116 if (!lex_get_line ())
1120 /* Skip plus sign. */
1125 /* Skip whitespace after plus sign. */
1130 while (isspace ((unsigned char) *prog))
1138 if (!lex_get_line ())
1140 msg (SE, _("Unexpected end of file in string concatenation."));
1145 /* Ensure that a valid string follows. */
1146 if (*prog != '\'' && *prog != '"')
1148 msg (SE, _("String expected following `+'."));
1153 /* We come here when we've finished concatenating all the string sections
1154 into one large string. */
1156 if (type != CHARACTER_STRING)
1157 convert_numeric_string_to_char_string (type);
1159 if (ds_length (&tokstr) > 255)
1161 msg (SE, _("String exceeds 255 characters in length (%d characters)."),
1162 ds_length (&tokstr));
1163 ds_truncate (&tokstr, 255);
1171 for (i = 0; i < ds_length (&tokstr); i++)
1172 if (ds_cstr (&tokstr)[i] == 0)
1176 msg (SE, _("Sorry, literal strings may not contain null "
1177 "characters. Replacing with spaces."));
1180 ds_cstr (&tokstr)[i] = ' ';
1188 /* Reads one token from the lexer and writes a textual representation
1189 on stdout for debugging purposes. */
1197 getl_location (&curfn, &curln);
1199 fprintf (stderr, "%s:%d\t", curfn, curln);
1205 fprintf (stderr, "ID\t%s\n", tokid);
1210 fprintf (stderr, "NUM\t%f\n", tokval);
1214 fprintf (stderr, "STRING\t\"%s\"\n", ds_cstr (&tokstr));
1218 fprintf (stderr, "STOP\n");
1222 fprintf (stderr, "MISC\tEXP\"");
1226 fprintf (stderr, "MISC\tEOF\n");
1230 if (token >= T_FIRST_KEYWORD && token <= T_LAST_KEYWORD)
1231 fprintf (stderr, "KEYWORD\t%s\n", lex_token_name (token));
1233 fprintf (stderr, "PUNCT\t%c\n", token);
1237 #endif /* DUMP_TOKENS */