1 /* PSPP - computes sample statistics.
2 Copyright (C) 1997-9, 2000 Free Software Foundation, Inc.
3 Written by Ben Pfaff <blp@gnu.org>.
5 This program is free software; you can redistribute it and/or
6 modify it under the terms of the GNU General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 This program is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
32 #include "line-buffer.h"
38 #define _(msgid) gettext (msgid)
39 #define N_(msgid) msgid
46 /* Global variables. */
48 extern const char *keywords[T_N_KEYWORDS + 1];
54 /* T_POS_NUM, T_NEG_NUM: the token's value. */
57 /* T_ID: the identifier. */
58 char tokid[LONG_NAME_LEN + 1];
60 /* T_ID, T_STRING: token string value.
61 For T_ID, this is not truncated as is tokid. */
64 /* Static variables. */
66 /* Pointer to next token in getl_buf. */
69 /* Nonzero only if this line ends with a terminal dot. */
72 /* Nonzero only if the last token returned was T_STOP. */
75 /* If nonzero, next token returned by lex_get().
76 Used only in exceptional circumstances. */
78 static struct string put_tokstr;
79 static double put_tokval;
81 static int parse_id (void);
83 /* How a string represents its contents. */
86 CHARACTER_STRING, /* Characters. */
87 BINARY_STRING, /* Binary digits. */
88 OCTAL_STRING, /* Octal digits. */
89 HEX_STRING /* Hexadecimal digits. */
92 static void convert_numeric_string_to_char_string (enum string_type);
93 static int parse_string (enum string_type);
96 static void dump_token (void);
101 /* Initializes the lexer. */
105 ds_init (&tokstr, 64);
106 ds_init (&put_tokstr, 64);
107 if (!lex_get_line ())
114 ds_destroy (&put_tokstr);
115 ds_destroy (&tokstr);
119 /* Common functions. */
121 /* Copies put_token, put_tokstr, put_tokval into token, tokstr,
122 tokval, respectively, and sets tokid appropriately. */
126 assert (put_token != 0);
128 ds_replace (&tokstr, ds_c_str (&put_tokstr));
129 str_copy_trunc (tokid, sizeof tokid, ds_c_str (&tokstr));
134 /* Copies token, tokstr, tokval into put_token, put_tokstr,
135 put_tokval respectively. */
140 ds_replace (&put_tokstr, ds_c_str (&tokstr));
144 /* Parses a single token, setting appropriate global variables to
145 indicate the token's attributes. */
149 /* If a token was pushed ahead, return it. */
162 /* Skip whitespace. */
171 while (isspace ((unsigned char) *prog))
185 else if (!lex_get_line ())
206 /* Actually parse the token. */
212 case '0': case '1': case '2': case '3': case '4':
213 case '5': case '6': case '7': case '8': case '9':
217 /* `-' can introduce a negative number, or it can be a
218 token by itself. If it is not followed by a digit or a
219 decimal point, it is definitely not a number.
220 Otherwise, it might be either, but most of the time we
221 want it as a number. When the syntax calls for a `-'
222 token, lex_negative_to_dash() must be used to break
223 negative numbers into two tokens. */
226 ds_putc (&tokstr, *prog++);
227 while (isspace ((unsigned char) *prog))
230 if (!isdigit ((unsigned char) *prog) && *prog != '.')
240 /* Parse the number, copying it into tokstr. */
241 while (isdigit ((unsigned char) *prog))
242 ds_putc (&tokstr, *prog++);
245 ds_putc (&tokstr, *prog++);
246 while (isdigit ((unsigned char) *prog))
247 ds_putc (&tokstr, *prog++);
249 if (*prog == 'e' || *prog == 'E')
251 ds_putc (&tokstr, *prog++);
252 if (*prog == '+' || *prog == '-')
253 ds_putc (&tokstr, *prog++);
254 while (isdigit ((unsigned char) *prog))
255 ds_putc (&tokstr, *prog++);
258 /* Parse as floating point. */
259 tokval = strtod (ds_c_str (&tokstr), &tail);
262 msg (SE, _("%s does not form a valid number."),
267 ds_putc (&tokstr, '0');
274 token = parse_string (CHARACTER_STRING);
277 case '(': case ')': case ',': case '=': case '+': case '/':
297 else if (*prog == '>')
337 if (prog[1] == '\'' || prog[1] == '"')
338 token = parse_string (BINARY_STRING);
344 if (prog[1] == '\'' || prog[1] == '"')
345 token = parse_string (OCTAL_STRING);
351 if (prog[1] == '\'' || prog[1] == '"')
352 token = parse_string (HEX_STRING);
358 if (lex_is_id1 (*prog))
365 if (isgraph ((unsigned char) *prog))
366 msg (SE, _("Bad character in input: `%c'."), *prog++);
368 msg (SE, _("Bad character in input: `\\%o'."), *prog++);
380 /* Parses an identifier at the current position into tokid and
382 Returns the correct token type. */
386 const char *start = prog;
387 prog = lex_skip_identifier (start);
389 ds_concat (&tokstr, start, prog - start);
390 str_copy_trunc (tokid, sizeof tokid, ds_c_str (&tokstr));
391 return lex_id_to_token (ds_c_str (&tokstr), ds_length (&tokstr));
394 /* Reports an error to the effect that subcommand SBC may only be
397 lex_sbc_only_once (const char *sbc)
399 msg (SE, _("Subcommand %s may only be specified once."), sbc);
402 /* Reports an error to the effect that subcommand SBC is
405 lex_sbc_missing (const char *sbc)
407 lex_error (_("missing required subcommand %s"), sbc);
410 /* Prints a syntax error message containing the current token and
411 given message MESSAGE (if non-null). */
413 lex_error (const char *message, ...)
418 token_rep = lex_token_representation ();
420 strcpy (where, "end of file");
421 else if (token == '.')
422 strcpy (where, "end of command");
424 snprintf (where, sizeof where, "`%s'", token_rep);
432 va_start (args, message);
433 vsnprintf (buf, 1024, message, args);
436 msg (SE, _("Syntax error %s at %s."), buf, where);
439 msg (SE, _("Syntax error at %s."), where);
442 /* Checks that we're at end of command.
443 If so, returns a successful command completion code.
444 If not, flags a syntax error and returns an error command
447 lex_end_of_command (void)
451 lex_error (_("expecting end of command"));
452 return CMD_TRAILING_GARBAGE;
458 /* Token testing functions. */
460 /* Returns true if the current token is a number. */
464 return token == T_POS_NUM || token == T_NEG_NUM;
467 /* Returns the value of the current token, which must be a
468 floating point number. */
472 assert (lex_is_number ());
476 /* Returns true iff the current token is an integer. */
478 lex_is_integer (void)
480 return (lex_is_number ()
481 && tokval != NOT_LONG
482 && tokval >= LONG_MIN
483 && tokval <= LONG_MAX
484 && floor (tokval) == tokval);
487 /* Returns the value of the current token, which must be an
492 assert (lex_is_integer ());
496 /* Token matching functions. */
498 /* If TOK is the current token, skips it and returns nonzero.
499 Otherwise, returns zero. */
512 /* If the current token is the identifier S, skips it and returns
513 nonzero. The identifier may be abbreviated to its first three
515 Otherwise, returns zero. */
517 lex_match_id (const char *s)
519 if (token == T_ID && lex_id_match (s, tokid))
528 /* If the current token is integer N, skips it and returns nonzero.
529 Otherwise, returns zero. */
531 lex_match_int (int x)
533 if (lex_is_integer () && lex_integer () == x)
542 /* Forced matches. */
544 /* If this token is identifier S, fetches the next token and returns
546 Otherwise, reports an error and returns zero. */
548 lex_force_match_id (const char *s)
550 if (token == T_ID && lex_id_match (s, tokid))
557 lex_error (_("expecting `%s'"), s);
562 /* If the current token is T, skips the token. Otherwise, reports an
563 error and returns from the current function with return value 0. */
565 lex_force_match (int t)
574 lex_error (_("expecting `%s'"), lex_token_name (t));
579 /* If this token is a string, does nothing and returns nonzero.
580 Otherwise, reports an error and returns zero. */
582 lex_force_string (void)
584 if (token == T_STRING)
588 lex_error (_("expecting string"));
593 /* If this token is an integer, does nothing and returns nonzero.
594 Otherwise, reports an error and returns zero. */
598 if (lex_is_integer ())
602 lex_error (_("expecting integer"));
607 /* If this token is a number, does nothing and returns nonzero.
608 Otherwise, reports an error and returns zero. */
612 if (lex_is_number ())
616 lex_error (_("expecting number"));
621 /* If this token is an identifier, does nothing and returns nonzero.
622 Otherwise, reports an error and returns zero. */
630 lex_error (_("expecting identifier"));
634 /* Weird token functions. */
636 /* Returns the first character of the next token, except that if the
637 next token is not an identifier, the character returned will not be
638 a character that can begin an identifier. Specifically, the
639 hexstring lead-in X' causes lookahead() to return '. Note that an
640 alphanumeric return value doesn't guarantee an ID token, it could
641 also be a reserved-word token. */
643 lex_look_ahead (void)
655 while (isspace ((unsigned char) *prog))
662 else if (!lex_get_line ())
669 if ((toupper ((unsigned char) *prog) == 'X'
670 || toupper ((unsigned char) *prog) == 'B'
671 || toupper ((unsigned char) *prog) == 'O')
672 && (prog[1] == '\'' || prog[1] == '"'))
679 /* Makes the current token become the next token to be read; the
680 current token is set to T. */
688 /* Makes the current token become the next token to be read; the
689 current token is set to the identifier ID. */
691 lex_put_back_id (const char *id)
693 assert (lex_id_to_token (id, strlen (id)) == T_ID);
696 ds_replace (&tokstr, id);
697 str_copy_trunc (tokid, sizeof tokid, ds_c_str (&tokstr));
700 /* Weird line processing functions. */
702 /* Returns the entire contents of the current line. */
704 lex_entire_line (void)
706 return ds_c_str (&getl_buf);
709 /* As lex_entire_line(), but only returns the part of the current line
710 that hasn't already been tokenized.
711 If END_DOT is non-null, stores nonzero into *END_DOT if the line
712 ends with a terminal dot, or zero if it doesn't. */
714 lex_rest_of_line (int *end_dot)
721 /* Causes the rest of the current input line to be ignored for
722 tokenization purposes. */
724 lex_discard_line (void)
726 prog = ds_end (&getl_buf);
730 /* Sets the current position in the current line to P, which must be
733 lex_set_prog (char *p)
738 /* Weird line reading functions. */
740 /* Remove C-style comments in STRING, begun by slash-star and
741 terminated by star-slash or newline. */
743 strip_comments (struct string *string)
751 for (cp = ds_c_str (string); *cp; )
753 /* If we're not in a comment, check for quote marks. */
758 else if (*cp == '\'' || *cp == '"')
762 /* If we're not inside a quotation, check for comment. */
765 if (cp[0] == '/' && cp[1] == '*')
772 else if (in_comment && cp[0] == '*' && cp[1] == '/')
781 /* Check commenting. */
788 /* Reads a line for use by the tokenizer, and preprocesses it by
789 removing comments, stripping trailing whitespace and the
790 terminal dot, and removing leading indentors. */
794 struct string *line = &getl_buf;
797 if (!getl_read_line (&interactive))
800 strip_comments (line);
801 ds_rtrim_spaces (line);
803 /* Check for and remove terminal dot. */
804 dot = (ds_chomp (line, get_endcmd ())
805 || (ds_is_empty (line) && get_nulline ()));
807 /* Strip leading indentors or insert a terminal dot (unless the
808 line was obtained interactively). */
811 int first = ds_first (line);
813 if (first == '+' || first == '-')
814 *ds_data (line) = ' ';
815 else if (first != EOF && !isspace (first))
819 prog = ds_c_str (line);
826 /* Returns the name of a token in a static buffer. */
828 lex_token_name (int token)
830 if (token >= T_FIRST_KEYWORD && token <= T_LAST_KEYWORD)
831 return keywords[token - T_FIRST_KEYWORD];
843 /* Returns an ASCII representation of the current token as a
844 malloc()'d string. */
846 lex_token_representation (void)
855 return xstrdup (ds_c_str (&tokstr));
863 for (sp = ds_c_str (&tokstr); sp < ds_end (&tokstr); sp++)
864 if (!isprint ((unsigned char) *sp))
870 token_rep = xmalloc (2 + ds_length (&tokstr) * 2 + 1 + 1);
878 for (sp = ds_c_str (&tokstr); *sp; )
882 *dp++ = (unsigned char) *sp++;
885 for (sp = ds_c_str (&tokstr); sp < ds_end (&tokstr); sp++)
887 *dp++ = (((unsigned char) *sp) >> 4)["0123456789ABCDEF"];
888 *dp++ = (((unsigned char) *sp) & 15)["0123456789ABCDEF"];
898 token_rep = xmalloc (1);
903 return xstrdup ("**");
906 if (token >= T_FIRST_KEYWORD && token <= T_LAST_KEYWORD)
907 return xstrdup (keywords [token - T_FIRST_KEYWORD]);
910 token_rep = xmalloc (2);
911 token_rep[0] = token;
920 /* Really weird functions. */
922 /* Most of the time, a `-' is a lead-in to a negative number. But
923 sometimes it's actually part of the syntax. If a dash can be part
924 of syntax then this function is called to rip it off of a
927 lex_negative_to_dash (void)
929 if (token == T_NEG_NUM)
933 ds_replace (&tokstr, ds_c_str (&tokstr) + 1);
939 /* We're not at eof any more. */
946 /* Skip a COMMENT command. */
948 lex_skip_comment (void)
952 if (!lex_get_line ())
959 if (put_token == '.')
962 prog = ds_end (&getl_buf);
968 /* Private functions. */
970 /* When invoked, tokstr contains a string of binary, octal, or
971 hex digits, according to TYPE. The string is converted to
972 characters having the specified values. */
974 convert_numeric_string_to_char_string (enum string_type type)
976 const char *base_name;
986 base_name = _("binary");
991 base_name = _("octal");
996 base_name = _("hex");
1004 byte_cnt = ds_length (&tokstr) / chars_per_byte;
1005 if (ds_length (&tokstr) % chars_per_byte)
1006 msg (SE, _("String of %s digits has %d characters, which is not a "
1008 base_name, ds_length (&tokstr), chars_per_byte);
1010 p = ds_c_str (&tokstr);
1011 for (i = 0; i < byte_cnt; i++)
1017 for (j = 0; j < chars_per_byte; j++, p++)
1021 if (*p >= '0' && *p <= '9')
1025 static const char alpha[] = "abcdef";
1026 const char *q = strchr (alpha, tolower ((unsigned char) *p));
1035 msg (SE, _("`%c' is not a valid %s digit."), *p, base_name);
1037 value = value * base + v;
1040 ds_c_str (&tokstr)[i] = (unsigned char) value;
1043 ds_truncate (&tokstr, byte_cnt);
1046 /* Parses a string from the input buffer into tokstr. The input
1047 buffer pointer prog must point to the initial single or double
1048 quote. TYPE indicates the type of string to be parsed.
1049 Returns token type. */
1051 parse_string (enum string_type type)
1053 /* Accumulate the entire string, joining sections indicated by +
1057 /* Single or double quote. */
1060 /* Accumulate section. */
1063 /* Check end of line. */
1066 msg (SE, _("Unterminated string constant."));
1070 /* Double quote characters to embed them in strings. */
1079 ds_putc (&tokstr, *prog++);
1083 /* Skip whitespace after final quote mark. */
1088 while (isspace ((unsigned char) *prog))
1096 if (!lex_get_line ())
1100 /* Skip plus sign. */
1105 /* Skip whitespace after plus sign. */
1110 while (isspace ((unsigned char) *prog))
1118 if (!lex_get_line ())
1120 msg (SE, _("Unexpected end of file in string concatenation."));
1125 /* Ensure that a valid string follows. */
1126 if (*prog != '\'' && *prog != '"')
1128 msg (SE, _("String expected following `+'."));
1133 /* We come here when we've finished concatenating all the string sections
1134 into one large string. */
1136 if (type != CHARACTER_STRING)
1137 convert_numeric_string_to_char_string (type);
1139 if (ds_length (&tokstr) > 255)
1141 msg (SE, _("String exceeds 255 characters in length (%d characters)."),
1142 ds_length (&tokstr));
1143 ds_truncate (&tokstr, 255);
1151 for (i = 0; i < ds_length (&tokstr); i++)
1152 if (ds_c_str (&tokstr)[i] == 0)
1156 msg (SE, _("Sorry, literal strings may not contain null "
1157 "characters. Replacing with spaces."));
1160 ds_c_str (&tokstr)[i] = ' ';
1168 /* Reads one token from the lexer and writes a textual representation
1169 on stdout for debugging purposes. */
1177 getl_location (&curfn, &curln);
1179 fprintf (stderr, "%s:%d\t", curfn, curln);
1185 fprintf (stderr, "ID\t%s\n", tokid);
1190 fprintf (stderr, "NUM\t%f\n", tokval);
1194 fprintf (stderr, "STRING\t\"%s\"\n", ds_c_str (&tokstr));
1198 fprintf (stderr, "STOP\n");
1202 fprintf (stderr, "MISC\tEXP\"");
1206 fprintf (stderr, "MISC\tEOF\n");
1210 if (token >= T_FIRST_KEYWORD && token <= T_LAST_KEYWORD)
1211 fprintf (stderr, "KEYWORD\t%s\n", lex_token_name (token));
1213 fprintf (stderr, "PUNCT\t%c\n", token);
1217 #endif /* DUMP_TOKENS */