1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2008 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "format-guesser.h"
28 #include <data/format.h>
29 #include <data/settings.h>
30 #include <libpspp/assertion.h>
31 #include <libpspp/str.h>
33 /* A token in which potential date or time fields are broken.
35 The token type is actually a bit-map. This allows a single
36 token to represent multiple roles, as often happens in parsing
37 adate or a time. For example, the number "1" can be a quarter
38 of the year, month, hour, day of the month, week of the year,
39 or a count of days. Such ambiguities are resolved on the
40 higher-level bases of multiple tokens and multiple full
44 DT_DAY = 1 << 0, /* dd: Day of the month. */
45 DT_MONTH = 1 << 1, /* mm: Month. */
46 DT_ENGLISH_MONTH = 1 << 2, /* mmm: Spelled-out month, e.g. "jan". */
47 DT_YEAR = 1 << 3, /* yy: Year. */
49 DT_HOUR = 1 << 4, /* HH: Hour. */
50 DT_MINUTE = 1 << 5, /* MM: Minute. */
51 DT_SECOND = 1 << 6, /* SS: Second. */
53 DT_WEEKDAY = 1 << 7, /* www: Day of the week. */
55 DT_DAY_COUNT = 1 << 8, /* D: Number of days. */
56 DT_WEEK = 1 << 9, /* ww: Week of the year. */
57 DT_QUARTER = 1 << 10, /* q: Quarter of the year. */
59 DT_Q = 1 << 11, /* Literal "Q". */
60 DT_WK = 1 << 12, /* Literal "WK". */
62 DT_DELIM = 1 << 13, /* One of -/., or white space. */
63 DT_SPACE = 1 << 14, /* Any white space. */
64 DT_COLON = 1 << 15, /* : */
67 /* Syntax of a date format, in terms of the date tokens that
71 enum fmt_type format; /* Format type. */
73 size_t token_cnt; /* Number of tokens. */
74 enum date_token tokens[MAX_TOKENS]; /* Tokens. */
77 /* Syntax of all the data formats that we can parse.
79 The order in the array can make a difference in the final
80 choice of formats: in the case of a tie between the number of
81 times each format is seen, the syntax earlier in the array
82 takes precedence. The most important cases are the ordering
83 of DATE before EDATE, so that spelled-out months in input
84 yield DATE format (that produces spelled-out months in output,
85 and the ordering of EDATE before ADATE, so that ambiguous
86 dates such as "1/1/99" yield the more sensible European date
87 format instead of American format.
89 When a given date format has more than one syntax, they must
90 be in adjacent array elements. */
91 static struct date_syntax syntax[] =
94 { FMT_DATE, 5, {DT_DAY, DT_DELIM, DT_ENGLISH_MONTH, DT_DELIM, DT_YEAR} },
97 { FMT_EDATE, 5, {DT_DAY, DT_DELIM, DT_MONTH, DT_DELIM, DT_YEAR} },
100 { FMT_ADATE, 5, {DT_MONTH, DT_DELIM, DT_DAY, DT_DELIM, DT_YEAR} },
103 { FMT_SDATE, 5, {DT_YEAR, DT_DELIM, DT_MONTH, DT_DELIM, DT_DAY} },
106 { FMT_MOYR, 3, {DT_MONTH, DT_DELIM, DT_YEAR} },
109 { FMT_QYR, 3, {DT_QUARTER, DT_Q, DT_YEAR} },
112 { FMT_WKYR, 3, {DT_WEEK, DT_WK, DT_YEAR} },
114 /* dd-mmm-yyyy HH:MM */
116 9, {DT_DAY, DT_DELIM, DT_MONTH, DT_DELIM, DT_YEAR, DT_SPACE, DT_HOUR,
117 DT_COLON, DT_MINUTE} },
118 /* dd-mmm-yyyy HH:MM:SS */
120 11, {DT_DAY, DT_DELIM, DT_MONTH, DT_DELIM, DT_YEAR, DT_SPACE, DT_HOUR,
121 DT_COLON, DT_MINUTE, DT_COLON, DT_SECOND} },
124 { FMT_TIME, 3, {DT_HOUR, DT_COLON, DT_MINUTE} },
126 { FMT_TIME, 5, {DT_HOUR, DT_COLON, DT_MINUTE, DT_COLON, DT_SECOND} },
129 { FMT_DTIME, 5, {DT_DAY_COUNT, DT_SPACE, DT_HOUR, DT_COLON, DT_MINUTE} },
132 7, {DT_DAY_COUNT, DT_SPACE, DT_HOUR, DT_COLON, DT_MINUTE, DT_COLON,
136 { FMT_WKDAY, 1, {DT_WEEKDAY} },
139 { FMT_MONTH, 1, {DT_MONTH} },
142 /* Number of recognized date syntax formats. */
143 #define DATE_SYNTAX_CNT (sizeof syntax / sizeof *syntax)
145 /* A format guesser. */
148 /* Maximum observed input width. */
151 /* Sum of the digits after the decimal point in each input
152 (divide by count to obtain average decimal positions). */
153 unsigned int decimals;
155 /* Number of non-empty, non-missing input values.
157 count is the sum of any_numeric, any_date, and the number
158 of inputs that were not in any recognized format (hence,
159 treated as A format). */
162 /* Numeric input formats. */
163 unsigned int any_numeric; /* Sum of following counts. */
164 unsigned int f; /* Number of inputs in F format. */
165 unsigned int comma; /* Number of inputs in COMMA format. */
166 unsigned int dot; /* Number of inputs in DOT format. */
167 unsigned int dollar; /* Number of inputs in DOLLAR format. */
168 unsigned int pct; /* Number of inputs in PCT format. */
169 unsigned int e; /* Number of inputs in E format. */
171 /* Date or time input formats.
173 The sum of the values in the date array is at least
174 any_date, often higher because many example dates match
175 more than one date format. */
176 unsigned int any_date; /* Number of inputs in any date format. */
177 unsigned int date[DATE_SYNTAX_CNT]; /* Number of inputs in each date
181 static bool add_numeric (struct fmt_guesser *, struct substring);
182 static void guess_numeric (struct fmt_guesser *, struct fmt_spec *);
183 static void add_date_time (struct fmt_guesser *, struct substring);
184 static bool match_date_syntax (const enum date_token a[], size_t a_len,
185 const enum date_token b[], size_t b_len);
186 static void guess_date_time (struct fmt_guesser *, struct fmt_spec *);
187 static enum date_token parse_date_token (struct substring *,
188 enum date_token tokens_seen,
190 static enum date_token parse_date_number (struct substring *,
191 enum date_token tokens_seen,
193 static enum date_token recognize_identifier_token (struct substring *);
194 static enum date_token recognize_id2 (int s0, int s1, bool more);
195 static enum date_token recognize_id3 (int s0, int s1, int s2, bool more);
197 /* Creates and returns a new format guesser. */
199 fmt_guesser_create (void)
201 struct fmt_guesser *g = xmalloc (sizeof *g);
202 fmt_guesser_clear (g);
206 /* Destroys format guesser G. */
208 fmt_guesser_destroy (struct fmt_guesser *g)
213 /* Clears the state of format guesser G, making it available for
214 guessing the format of a new input stream. */
216 fmt_guesser_clear (struct fmt_guesser *g)
218 memset (g, 0, sizeof *g);
221 /* Appends S to the stream of data items whose format G is
224 fmt_guesser_add (struct fmt_guesser *g, struct substring s)
226 if (ss_length (s) > g->width)
227 g->width = ss_length (s);
228 ss_trim (&s, ss_cstr (CC_SPACES));
229 if (ss_is_empty (s) || ss_equals (s, ss_cstr (".")))
231 /* Can't guess anything from an empty string or a missing value. */
236 if (!add_numeric (g, s))
237 add_date_time (g, s);
240 /* Guesses the format of the input previously added to G using
241 fmt_guesser_add, storing the guess into *F. The guessed
242 format may not actually a valid input or output format, in
243 that its width and number of decimal places may be outside the
244 valid range for the guessed format type. The caller must
245 therefore adjust the format to make it valid, e.g. by calling
248 fmt_guesser_guess (struct fmt_guesser *g, struct fmt_spec *f)
252 /* Set defaults. The guesser functions typically override
253 the width and type. */
258 if (g->any_numeric > g->count / 2)
259 guess_numeric (g, f);
260 else if (g->any_date > g->count / 2)
261 guess_date_time (g, f);
265 /* No data at all. Use fallback default. */
266 *f = fmt_default_for_width (0);
270 /* Numeric formats. */
272 /* Tries to parse S as a numeric (F, COMMA, DOT, DOLLAR, PCT, or
273 E) format. If successful, increments G's any_numeric counter
274 and the counter for the specific format S that S matches and
275 returns true. On failure, returns false without modifying G.
277 This function is intended to match exactly the same set of
278 strings that the actual numeric value parsers used by the
279 data_in function would match. */
281 add_numeric (struct fmt_guesser *g, struct substring s)
283 bool has_dollar; /* '$' appeared at start of S? */
284 bool has_percent; /* '%' appeared at end of S? */
285 int digits; /* Number of digits in S (before exponent). */
286 int dots; /* Number of '.' in S. */
287 int commas; /* Number of ',' in S. */
288 bool has_exp; /* [eEdD] appeared introducing exponent? */
289 bool has_exp_sign; /* '+' or '-' appeared in exponent? */
290 int exp_digits; /* Number of digits in exponent. */
292 int prev_delim; /* Initially 0, then ',' or '.' as delimiters seen. */
293 int delim_digits; /* Number of digits since last delimiter. */
295 int decimal; /* Decimal point character: '.', ',',
296 or 0 if unknown or no decimal point in S. */
297 int precision; /* Digits of precision after decimal point. */
301 /* Skip leading "$" and optional following white space. */
302 has_dollar = ss_match_char (&s, '$');
304 ss_ltrim (&s, ss_cstr (CC_SPACES));
306 /* Skip optional sign. */
307 ss_match_char_in (&s, ss_cstr ("+-"));
309 /* Skip digits punctuated by commas and dots. We don't know
310 whether the decimal point is a comma or a dot, so for now we
312 digits = dots = commas = 0;
315 for (; (c = ss_first (s)) != -1; ss_advance (&s, 1))
317 if (c >= '0' && c <= '9')
338 if (digits == 0 || (dots > 1 && commas > 1))
340 /* A valid number has at least one digit and can't have
341 more than one decimal point. */
345 /* Skip the optional exponent. */
346 has_exp = ss_match_char_in (&s, ss_cstr ("eEdD")) != EOF;
347 has_exp_sign = ss_match_char_in (&s, ss_cstr ("-+")) != EOF;
349 ss_match_char (&s, ' ');
350 exp_digits = ss_ltrim (&s, ss_cstr (CC_DIGITS));
351 if ((has_exp || has_exp_sign) && !exp_digits)
353 /* Can't have the E or sign that leads in the exponent
354 without actually having an exponent. */
358 /* Skip optional '%'. */
359 has_percent = ss_match_char (&s, '%');
360 if (has_dollar && has_percent)
362 /* A valid number cannot have both '$' and '%'. */
366 /* Make sure there's no trailing garbage. */
367 if (!ss_is_empty (s))
370 /* Figure out the decimal point (and therefore grouping)
371 character and the number of digits following the decimal
372 point. Sometimes the answer is ambiguous. */
373 if (dots > 1 && prev_delim == '.')
375 /* Can't have multiple decimal points, so '.' must really
376 be the grouping character, with a precision of 0. */
380 else if (commas > 1 && prev_delim == ',')
382 /* Can't have multiple decimal points, so ',' must really
383 be the grouping character, with a precision of 0. */
387 else if (delim_digits == 3 && (!dots || !commas))
389 /* The input is something like "1.234" or "1,234" where we
390 can't tell whether the ',' or '.' is a grouping or
391 decimal character. Assume that the decimal character
392 from the settings is in use. */
393 if (prev_delim == settings_get_decimal_char (FMT_F))
395 decimal = prev_delim;
396 precision = delim_digits;
400 decimal = prev_delim == '.' ? ',' : '.';
406 /* The final delimiter is a decimal point, and the digits
407 following it are decimals. */
408 decimal = prev_delim;
409 precision = delim_digits;
412 /* Decide the most likely format. */
414 g->decimals += precision;
417 else if (has_percent)
419 else if (commas && decimal == '.')
421 else if (dots && decimal == ',')
423 else if (has_exp || has_exp_sign)
431 /* Guess which numeric format is most likely represented by G,
432 and store it in F's type and d members. (f->w is already
435 guess_numeric (struct fmt_guesser *g, struct fmt_spec *f)
437 int decimal_char = settings_get_decimal_char (FMT_COMMA);
439 f->d = g->decimals / g->count;
443 f->type = FMT_DOLLAR;
444 else if (g->comma > g->dot)
445 f->type = decimal_char == '.' ? FMT_COMMA : FMT_DOT;
446 else if (g->dot > g->comma)
447 f->type = decimal_char == '.' ? FMT_DOT : FMT_COMMA;
448 else if (g->e > g->any_numeric / 2)
454 /* Tries to parse S as a date (DATE, ADATE, EDATE, SDATE, QYR,
455 MOYR, WKYR, or DATETIME), time (TIME or DTIME), or date
456 component (WKDAY or MONTH) format. If successful, increments
457 G's any_date counter and the counter or counters for the
458 specific format(s) that S matches. On failure, does not
461 Does not attempt to recognize JDATE format: it looks just like
462 F format and will thus be caught by the numeric parser.
464 This function is intended to match a set of strings close to
465 those that actual date and time parsers used by the data_in
466 function would match, but somewhat pickier. In particular,
467 minutes and seconds are only recognized when they have exactly
468 two digits: "1:02:03" is a valid time, but "1:2:3" is
471 add_date_time (struct fmt_guesser *g, struct substring s)
473 enum date_token token;
474 enum date_token tokens[MAX_TOKENS];
475 enum date_token tokens_seen;
481 /* Break S into tokens. */
485 while (!ss_is_empty (s))
487 if (token_cnt >= MAX_TOKENS)
490 token = parse_date_token (&s, tokens_seen, &decimals);
493 tokens[token_cnt++] = token;
494 tokens_seen |= token;
499 /* Find matching date formats, if any, and increment the
500 counter for each one of them. */
502 for (i = 0; i < DATE_SYNTAX_CNT; i++)
504 struct date_syntax *s = &syntax[i];
505 if (match_date_syntax (tokens, token_cnt, s->tokens, s->token_cnt))
514 g->decimals += decimals;
518 /* Returns true if the A_LEN tokens in A[] match the B_LEN tokens
519 in B[], false otherwise. */
521 match_date_syntax (const enum date_token a[], size_t a_len,
522 const enum date_token b[], size_t b_len)
529 for (i = 0; i < a_len; i++)
536 /* Guess which date or time format is most likely represented by
537 G, and store it in F's type and d members. (f->w is already
540 guess_date_time (struct fmt_guesser *g, struct fmt_spec *f)
542 unsigned int max = 0;
545 /* Choose the date format matched by the most inputs. Break
546 ties by choosing the earliest in the array. */
547 for (i = 0; i < DATE_SYNTAX_CNT; i = j)
549 unsigned int sum = g->date[i];
550 for (j = i + 1; j < DATE_SYNTAX_CNT; j++)
552 if (syntax[i].format != syntax[j].format)
558 f->type = syntax[i].format;
563 /* Formats that include a time have an optional seconds field.
564 If we saw a seconds field in any of the inputs, make sure
565 that the field width is large enough to include for them.
566 (We use the minimum input width, but an output width would
567 be equally appropriate, since all the time formats have the
568 same minimum widths for input and output.) */
569 if (f->type == FMT_DATETIME || f->type == FMT_TIME
570 || f->type == FMT_DTIME)
572 for (i = 0; i < DATE_SYNTAX_CNT; i++)
574 && syntax[i].tokens[syntax[i].token_cnt - 1] == DT_SECOND)
576 f->d = g->decimals / g->count;
577 f->w = MAX (f->w, fmt_min_input_width (f->type) + 3);
582 /* Extracts the next date token from the string represented by S,
583 which must not be an empty string, and advances *S past the
584 end of the token. Returns the parsed token, or 0 if no valid
587 TOKENS_SEEN should be a bitmap representing all the tokens
588 already seen in this input; this is used to resolve some
589 otherwise ambiguous parsing situation. If a count of seconds
590 is parsed, *DECIMALS is set to the number of digits after the
592 static enum date_token
593 parse_date_token (struct substring *s, enum date_token tokens_seen,
596 int c = ss_first (*s);
600 case '0': case '1': case '2': case '3': case '4':
601 case '5': case '6': case '7': case '8': case '9':
602 return parse_date_number (s, tokens_seen, decimals);
606 /* '+' or '-' at the start of a string, or following a
607 space, could be the sign that optionally introduces a
608 time, e.g. "-1:00" in TIME format, "-1 1:00" in DTIME
609 format, or "1/1/1978 +1:00" in DATETIME format. */
610 if ((!tokens_seen || s->string[-1] == ' ') && c_isdigit (ss_at (*s, 1)))
613 ss_ltrim (s, ss_cstr (CC_DIGITS));
614 return DT_DAY_COUNT | DT_HOUR;
619 case '/': case '.': case ',':
627 case ' ': case '\t': case '\v': case '\r': case '\n':
629 enum date_token token;
631 token = recognize_identifier_token (s);
633 ss_match_char_in (s, ss_cstr (CC_SPACES));
635 token = DT_DELIM | DT_SPACE;
640 return recognize_identifier_token (s);
647 /* Parses a digit sequence found in a date token. Advances *S
648 past the end of the token. Returns the parsed token, or 0 if
649 no valid token was found.
651 TOKENS_SEEN should be a bitmap representing all the tokens
652 already seen in this input; this is used to resolve some
653 otherwise ambiguous parsing situation. If a count of seconds
654 is parsed, *DECIMALS is set to the number of digits after the
656 static enum date_token
657 parse_date_number (struct substring *s, enum date_token tokens_seen,
661 size_t digit_cnt = ss_get_long (s, &value);
662 enum date_token token = 0;
664 if (ss_match_char (s, settings_get_decimal_char (FMT_F))
665 && tokens_seen & DT_COLON
668 /* Parse digits after the decimal point. */
670 *decimals = ss_ltrim (s, ss_cstr (CC_DIGITS));
675 token = (DT_QUARTER | DT_MONTH | DT_HOUR | DT_DAY | DT_WEEK
677 else if (value <= 12)
678 token = DT_MONTH | DT_HOUR | DT_DAY | DT_WEEK | DT_DAY_COUNT;
679 else if (value <= 23)
680 token = DT_HOUR | DT_DAY | DT_WEEK | DT_DAY_COUNT;
681 else if (value <= 31)
682 token = DT_DAY | DT_WEEK | DT_DAY_COUNT;
683 else if (value <= 52)
684 token = DT_WEEK | DT_DAY_COUNT;
686 token = DT_DAY_COUNT;
692 token |= DT_MINUTE | DT_SECOND;
694 else if (digit_cnt == 4)
701 /* Attempts to parse an identifier found in a date at the
702 beginning of S. Advances *S past the end of the token.
703 Returns the parsed token, or 0 if no valid token was
705 static enum date_token
706 recognize_identifier_token (struct substring *s)
708 size_t length = ss_span (*s, ss_cstr (CC_LETTERS));
709 enum date_token token = 0;
716 switch (c_tolower (s->string[0]))
732 int s0 = c_tolower ((unsigned char) s->string[0]);
733 int s1 = c_tolower ((unsigned char) s->string[1]);
734 token = recognize_id2 (s0, s1, false);
735 if (!token && s0 == 'w' && s1 == 'k')
742 int s0 = c_tolower ((unsigned char) s->string[0]);
743 int s1 = c_tolower ((unsigned char) s->string[1]);
744 int s2 = c_tolower ((unsigned char) s->string[2]);
745 token = recognize_id2 (s0, s1, true);
747 token = recognize_id3 (s0, s1, s2, length > 3);
748 if (!token && length == 4)
750 int s3 = c_tolower ((unsigned char) s->string[3]);
751 if (s0 == 'v' && s1 == 'i' && s2 == 'i' && s3 == 'i')
758 ss_advance (s, length);
762 static enum date_token
763 recognize_id2 (int s0, int s1, bool more)
768 case 's': weekday = s1 == 'a' || s1 == 'u'; break;
769 case 'm': weekday = s1 == 'o'; break;
770 case 't': weekday = s1 == 'u' || s1 == 'h'; break;
771 case 'w': weekday = s1 == 'e'; break;
772 case 'f': weekday = s1 == 'r'; break;
773 default: weekday = false; break;
783 case 'i': month = s1 == 'i' || s1 == 'v' || s1 == 'x'; break;
784 case 'v': month = s1 == 'i'; break;
785 case 'x': month = s1 == 'i'; break;
786 default: month = false; break;
795 static enum date_token
796 recognize_id3 (int s0, int s1, int s2, bool more)
802 month = ((s1 == 'a' && s2 == 'n')
803 || (s1 == 'u' && (s2 == 'n' || s2 == 'l')));
806 month = s1 == 'e' && s2 == 'b';
809 month = (s1 == 'a' && (s2 == 'r' || s2 == 'y'));
812 month = (s1 == 'p' && s2 == 'r') || (s1 == 'u' && s2 == 'g');
815 month = s1 == 'e' && s2 == 'p';
818 month = s1 == 'c' && s2 == 't';
821 month = s1 == 'o' && s2 == 'v';
824 month = s1 == 'e' && s2 == 'c';
830 return DT_MONTH | DT_ENGLISH_MONTH;
834 bool roman_month = false;
839 roman_month = s1 == 'i' && s2 == 'i';
842 roman_month = s1 == 'i' && s2 == 'i';