1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2008, 2010, 2011 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "data/format-guesser.h"
24 #include "data/format.h"
25 #include "data/settings.h"
26 #include "libpspp/assertion.h"
27 #include "libpspp/str.h"
29 #include "gl/c-ctype.h"
30 #include "gl/minmax.h"
31 #include "gl/xalloc.h"
33 /* A token in which potential date or time fields are broken.
35 The token type is actually a bit-map. This allows a single
36 token to represent multiple roles, as often happens in parsing
37 adate or a time. For example, the number "1" can be a quarter
38 of the year, month, hour, day of the month, week of the year,
39 or a count of days. Such ambiguities are resolved on the
40 higher-level bases of multiple tokens and multiple full
44 DT_DAY = 1 << 0, /* dd: Day of the month. */
45 DT_MONTH = 1 << 1, /* mm: Month. */
46 DT_ENGLISH_MONTH = 1 << 2, /* mmm: Spelled-out month, e.g. "jan". */
47 DT_YEAR = 1 << 3, /* yy: Year. */
49 DT_HOUR = 1 << 4, /* HH: Hour. */
50 DT_MINUTE = 1 << 5, /* MM: Minute. */
51 DT_SECOND = 1 << 6, /* SS: Second. */
53 DT_WEEKDAY = 1 << 7, /* www: Day of the week. */
55 DT_DAY_COUNT = 1 << 8, /* D: Number of days. */
56 DT_WEEK = 1 << 9, /* ww: Week of the year. */
57 DT_QUARTER = 1 << 10, /* q: Quarter of the year. */
59 DT_Q = 1 << 11, /* Literal "Q". */
60 DT_WK = 1 << 12, /* Literal "WK". */
62 DT_DELIM = 1 << 13, /* One of -/., or white space. */
63 DT_SPACE = 1 << 14, /* Any white space. */
64 DT_COLON = 1 << 15, /* : */
67 /* Syntax of a date format, in terms of the date tokens that
71 enum fmt_type format; /* Format type. */
73 size_t token_cnt; /* Number of tokens. */
74 enum date_token tokens[MAX_TOKENS]; /* Tokens. */
77 /* Syntax of all the data formats that we can parse.
79 The order in the array can make a difference in the final
80 choice of formats: in the case of a tie between the number of
81 times each format is seen, the syntax earlier in the array
82 takes precedence. The most important cases are the ordering
83 of DATE before EDATE, so that spelled-out months in input
84 yield DATE format (that produces spelled-out months in output,
85 and the ordering of EDATE before ADATE, so that ambiguous
86 dates such as "1/1/99" yield the more sensible European date
87 format instead of American format.
89 When a given date format has more than one syntax, they must
90 be in adjacent array elements. */
91 static struct date_syntax syntax[] =
94 { FMT_DATE, 5, {DT_DAY, DT_DELIM, DT_ENGLISH_MONTH, DT_DELIM, DT_YEAR} },
97 { FMT_EDATE, 5, {DT_DAY, DT_DELIM, DT_MONTH, DT_DELIM, DT_YEAR} },
100 { FMT_ADATE, 5, {DT_MONTH, DT_DELIM, DT_DAY, DT_DELIM, DT_YEAR} },
103 { FMT_SDATE, 5, {DT_YEAR, DT_DELIM, DT_MONTH, DT_DELIM, DT_DAY} },
106 { FMT_MOYR, 3, {DT_MONTH, DT_DELIM, DT_YEAR} },
109 { FMT_QYR, 3, {DT_QUARTER, DT_Q, DT_YEAR} },
112 { FMT_WKYR, 3, {DT_WEEK, DT_WK, DT_YEAR} },
114 /* dd-mmm-yyyy HH:MM */
116 9, {DT_DAY, DT_DELIM, DT_MONTH, DT_DELIM, DT_YEAR, DT_SPACE, DT_HOUR,
117 DT_COLON, DT_MINUTE} },
118 /* dd-mmm-yyyy HH:MM:SS */
120 11, {DT_DAY, DT_DELIM, DT_MONTH, DT_DELIM, DT_YEAR, DT_SPACE, DT_HOUR,
121 DT_COLON, DT_MINUTE, DT_COLON, DT_SECOND} },
124 { FMT_TIME, 3, {DT_HOUR, DT_COLON, DT_MINUTE} },
126 { FMT_TIME, 5, {DT_HOUR, DT_COLON, DT_MINUTE, DT_COLON, DT_SECOND} },
129 { FMT_DTIME, 5, {DT_DAY_COUNT, DT_SPACE, DT_HOUR, DT_COLON, DT_MINUTE} },
132 7, {DT_DAY_COUNT, DT_SPACE, DT_HOUR, DT_COLON, DT_MINUTE, DT_COLON,
136 { FMT_WKDAY, 1, {DT_WEEKDAY} },
140 We require a spelled-out English month so that
141 single-character Roman numerals like "i" and "x" don't get
142 detected as months. The latter is particularly common in
143 the password field of /etc/passwd-like files. */
144 { FMT_MONTH, 1, {DT_ENGLISH_MONTH} },
147 /* Number of recognized date syntax formats. */
148 #define DATE_SYNTAX_CNT (sizeof syntax / sizeof *syntax)
150 /* A format guesser. */
153 /* Maximum observed input width. */
156 /* Sum of the digits after the decimal point in each input
157 (divide by count to obtain average decimal positions). */
158 unsigned int decimals;
160 /* Number of non-empty, non-missing input values.
162 count is the sum of any_numeric, any_date, and the number
163 of inputs that were not in any recognized format (hence,
164 treated as A format). */
167 /* Numeric input formats. */
168 unsigned int any_numeric; /* Sum of following counts. */
169 unsigned int f; /* Number of inputs in F format. */
170 unsigned int comma; /* Number of inputs in COMMA format. */
171 unsigned int dot; /* Number of inputs in DOT format. */
172 unsigned int dollar; /* Number of inputs in DOLLAR format. */
173 unsigned int pct; /* Number of inputs in PCT format. */
174 unsigned int e; /* Number of inputs in E format. */
176 /* Date or time input formats.
178 The sum of the values in the date array is at least
179 any_date, often higher because many example dates match
180 more than one date format. */
181 unsigned int any_date; /* Number of inputs in any date format. */
182 unsigned int date[DATE_SYNTAX_CNT]; /* Number of inputs in each date
186 static bool add_numeric (struct fmt_guesser *, struct substring);
187 static void guess_numeric (struct fmt_guesser *, struct fmt_spec *);
188 static void add_date_time (struct fmt_guesser *, struct substring);
189 static bool match_date_syntax (const enum date_token a[], size_t a_len,
190 const enum date_token b[], size_t b_len);
191 static void guess_date_time (struct fmt_guesser *, struct fmt_spec *);
192 static enum date_token parse_date_token (struct substring *,
193 enum date_token tokens_seen,
195 static enum date_token parse_date_number (struct substring *,
196 enum date_token tokens_seen,
198 static enum date_token recognize_identifier_token (struct substring *);
199 static enum date_token recognize_id2 (int s0, int s1, bool more);
200 static enum date_token recognize_id3 (int s0, int s1, int s2, bool more);
202 /* Creates and returns a new format guesser. */
204 fmt_guesser_create (void)
206 struct fmt_guesser *g = xmalloc (sizeof *g);
207 fmt_guesser_clear (g);
211 /* Destroys format guesser G. */
213 fmt_guesser_destroy (struct fmt_guesser *g)
218 /* Clears the state of format guesser G, making it available for
219 guessing the format of a new input stream. */
221 fmt_guesser_clear (struct fmt_guesser *g)
223 memset (g, 0, sizeof *g);
226 /* Appends S to the stream of data items whose format G is
229 fmt_guesser_add (struct fmt_guesser *g, struct substring s)
231 if (ss_length (s) > g->width)
232 g->width = ss_length (s);
233 ss_trim (&s, ss_cstr (CC_SPACES));
234 if (ss_is_empty (s) || ss_equals (s, ss_cstr (".")))
236 /* Can't guess anything from an empty string or a missing value. */
241 if (!add_numeric (g, s))
242 add_date_time (g, s);
245 /* Guesses the format of the input previously added to G using
246 fmt_guesser_add, storing the guess into *F. The guessed
247 format may not actually a valid input or output format, in
248 that its width and number of decimal places may be outside the
249 valid range for the guessed format type. The caller must
250 therefore adjust the format to make it valid, e.g. by calling
253 fmt_guesser_guess (struct fmt_guesser *g, struct fmt_spec *f)
257 /* Set defaults. The guesser functions typically override
258 the width and type. */
263 if (g->any_numeric > g->count / 2)
264 guess_numeric (g, f);
265 else if (g->any_date > g->count / 2)
266 guess_date_time (g, f);
270 /* No data at all. Use fallback default. */
271 *f = fmt_default_for_width (0);
275 /* Numeric formats. */
277 /* Tries to parse S as a numeric (F, COMMA, DOT, DOLLAR, PCT, or
278 E) format. If successful, increments G's any_numeric counter
279 and the counter for the specific format S that S matches and
280 returns true. On failure, returns false without modifying G.
282 This function is intended to match exactly the same set of
283 strings that the actual numeric value parsers used by the
284 data_in function would match. */
286 add_numeric (struct fmt_guesser *g, struct substring s)
288 bool has_dollar; /* '$' appeared at start of S? */
289 bool has_percent; /* '%' appeared at end of S? */
290 int digits; /* Number of digits in S (before exponent). */
291 int dots; /* Number of '.' in S. */
292 int commas; /* Number of ',' in S. */
293 bool has_exp; /* [eEdD] appeared introducing exponent? */
294 bool has_exp_sign; /* '+' or '-' appeared in exponent? */
295 int exp_digits; /* Number of digits in exponent. */
297 int prev_delim; /* Initially 0, then ',' or '.' as delimiters seen. */
298 int delim_digits; /* Number of digits since last delimiter. */
300 int decimal; /* Decimal point character: '.', ',',
301 or 0 if unknown or no decimal point in S. */
302 int precision; /* Digits of precision after decimal point. */
306 /* Skip leading "$" and optional following white space. */
307 has_dollar = ss_match_byte (&s, '$');
309 ss_ltrim (&s, ss_cstr (CC_SPACES));
311 /* Skip optional sign. */
312 ss_match_byte_in (&s, ss_cstr ("+-"));
314 /* Skip digits punctuated by commas and dots. We don't know
315 whether the decimal point is a comma or a dot, so for now we
317 digits = dots = commas = 0;
320 for (; (c = ss_first (s)) != -1; ss_advance (&s, 1))
322 if (c >= '0' && c <= '9')
343 if (digits == 0 || (dots > 1 && commas > 1))
345 /* A valid number has at least one digit and can't have
346 more than one decimal point. */
350 /* Skip the optional exponent. */
351 has_exp = ss_match_byte_in (&s, ss_cstr ("eEdD")) != EOF;
352 has_exp_sign = ss_match_byte_in (&s, ss_cstr ("-+")) != EOF;
354 ss_match_byte (&s, ' ');
355 exp_digits = ss_ltrim (&s, ss_cstr (CC_DIGITS));
356 if ((has_exp || has_exp_sign) && !exp_digits)
358 /* Can't have the E or sign that leads in the exponent
359 without actually having an exponent. */
363 /* Skip optional '%'. */
364 has_percent = ss_match_byte (&s, '%');
365 if (has_dollar && has_percent)
367 /* A valid number cannot have both '$' and '%'. */
371 /* Make sure there's no trailing garbage. */
372 if (!ss_is_empty (s))
375 /* Figure out the decimal point (and therefore grouping)
376 character and the number of digits following the decimal
377 point. Sometimes the answer is ambiguous. */
378 if (dots > 1 && prev_delim == '.')
380 /* Can't have multiple decimal points, so '.' must really
381 be the grouping character, with a precision of 0. */
385 else if (commas > 1 && prev_delim == ',')
387 /* Can't have multiple decimal points, so ',' must really
388 be the grouping character, with a precision of 0. */
392 else if (delim_digits == 3 && (!dots || !commas))
394 /* The input is something like "1.234" or "1,234" where we
395 can't tell whether the ',' or '.' is a grouping or
396 decimal character. Assume that the decimal character
397 from the settings is in use. */
398 if (prev_delim == settings_get_decimal_char (FMT_F))
400 decimal = prev_delim;
401 precision = delim_digits;
405 decimal = prev_delim == '.' ? ',' : '.';
411 /* The final delimiter is a decimal point, and the digits
412 following it are decimals. */
413 decimal = prev_delim;
414 precision = delim_digits;
417 /* Decide the most likely format. */
419 g->decimals += precision;
422 else if (has_percent)
424 else if (commas && decimal == '.')
426 else if (dots && decimal == ',')
428 else if (has_exp || has_exp_sign)
436 /* Guess which numeric format is most likely represented by G,
437 and store it in F's type and d members. (f->w is already
440 guess_numeric (struct fmt_guesser *g, struct fmt_spec *f)
442 int decimal_char = settings_get_decimal_char (FMT_COMMA);
444 f->d = g->decimals / g->count;
448 f->type = FMT_DOLLAR;
449 else if (g->comma > g->dot)
450 f->type = decimal_char == '.' ? FMT_COMMA : FMT_DOT;
451 else if (g->dot > g->comma)
452 f->type = decimal_char == '.' ? FMT_DOT : FMT_COMMA;
453 else if (g->e > g->any_numeric / 2)
459 /* Tries to parse S as a date (DATE, ADATE, EDATE, SDATE, QYR,
460 MOYR, WKYR, or DATETIME), time (TIME or DTIME), or date
461 component (WKDAY or MONTH) format. If successful, increments
462 G's any_date counter and the counter or counters for the
463 specific format(s) that S matches. On failure, does not
466 Does not attempt to recognize JDATE format: it looks just like
467 F format and will thus be caught by the numeric parser.
469 This function is intended to match a set of strings close to
470 those that actual date and time parsers used by the data_in
471 function would match, but somewhat pickier. In particular,
472 minutes and seconds are only recognized when they have exactly
473 two digits: "1:02:03" is a valid time, but "1:2:3" is
476 add_date_time (struct fmt_guesser *g, struct substring s)
478 enum date_token token;
479 enum date_token tokens[MAX_TOKENS];
480 enum date_token tokens_seen;
486 /* Break S into tokens. */
490 while (!ss_is_empty (s))
492 if (token_cnt >= MAX_TOKENS)
495 token = parse_date_token (&s, tokens_seen, &decimals);
498 tokens[token_cnt++] = token;
499 tokens_seen |= token;
504 /* Find matching date formats, if any, and increment the
505 counter for each one of them. */
507 for (i = 0; i < DATE_SYNTAX_CNT; i++)
509 struct date_syntax *s = &syntax[i];
510 if (match_date_syntax (tokens, token_cnt, s->tokens, s->token_cnt))
519 g->decimals += decimals;
523 /* Returns true if the A_LEN tokens in A[] match the B_LEN tokens
524 in B[], false otherwise. */
526 match_date_syntax (const enum date_token a[], size_t a_len,
527 const enum date_token b[], size_t b_len)
534 for (i = 0; i < a_len; i++)
541 /* Guess which date or time format is most likely represented by
542 G, and store it in F's type and d members. (f->w is already
545 guess_date_time (struct fmt_guesser *g, struct fmt_spec *f)
547 unsigned int max = 0;
550 /* Choose the date format matched by the most inputs. Break
551 ties by choosing the earliest in the array. */
552 for (i = 0; i < DATE_SYNTAX_CNT; i = j)
554 unsigned int sum = g->date[i];
555 for (j = i + 1; j < DATE_SYNTAX_CNT; j++)
557 if (syntax[i].format != syntax[j].format)
563 f->type = syntax[i].format;
568 /* Formats that include a time have an optional seconds field.
569 If we saw a seconds field in any of the inputs, make sure
570 that the field width is large enough to include for them.
571 (We use the minimum input width, but an output width would
572 be equally appropriate, since all the time formats have the
573 same minimum widths for input and output.) */
574 if (f->type == FMT_DATETIME || f->type == FMT_TIME
575 || f->type == FMT_DTIME)
577 for (i = 0; i < DATE_SYNTAX_CNT; i++)
579 && syntax[i].tokens[syntax[i].token_cnt - 1] == DT_SECOND)
581 f->d = g->decimals / g->count;
582 f->w = MAX (f->w, fmt_min_input_width (f->type) + 3);
587 /* Extracts the next date token from the string represented by S,
588 which must not be an empty string, and advances *S past the
589 end of the token. Returns the parsed token, or 0 if no valid
592 TOKENS_SEEN should be a bitmap representing all the tokens
593 already seen in this input; this is used to resolve some
594 otherwise ambiguous parsing situation. If a count of seconds
595 is parsed, *DECIMALS is set to the number of digits after the
597 static enum date_token
598 parse_date_token (struct substring *s, enum date_token tokens_seen,
601 int c = ss_first (*s);
605 case '0': case '1': case '2': case '3': case '4':
606 case '5': case '6': case '7': case '8': case '9':
607 return parse_date_number (s, tokens_seen, decimals);
611 /* '+' or '-' at the start of a string, or following a
612 space, could be the sign that optionally introduces a
613 time, e.g. "-1:00" in TIME format, "-1 1:00" in DTIME
614 format, or "1/1/1978 +1:00" in DATETIME format. */
615 if ((!tokens_seen || s->string[-1] == ' ') && c_isdigit (ss_at (*s, 1)))
618 ss_ltrim (s, ss_cstr (CC_DIGITS));
619 return DT_DAY_COUNT | DT_HOUR;
624 case '/': case '.': case ',':
632 case ' ': case '\t': case '\v': case '\r': case '\n':
634 enum date_token token;
636 token = recognize_identifier_token (s);
638 ss_match_byte_in (s, ss_cstr (CC_SPACES));
640 token = DT_DELIM | DT_SPACE;
645 return recognize_identifier_token (s);
652 /* Parses a digit sequence found in a date token. Advances *S
653 past the end of the token. Returns the parsed token, or 0 if
654 no valid token was found.
656 TOKENS_SEEN should be a bitmap representing all the tokens
657 already seen in this input; this is used to resolve some
658 otherwise ambiguous parsing situation. If a count of seconds
659 is parsed, *DECIMALS is set to the number of digits after the
661 static enum date_token
662 parse_date_number (struct substring *s, enum date_token tokens_seen,
666 size_t digit_cnt = ss_get_long (s, &value);
667 enum date_token token = 0;
669 if (ss_match_byte (s, settings_get_decimal_char (FMT_F))
670 && tokens_seen & DT_COLON
673 /* Parse digits after the decimal point. */
675 *decimals = ss_ltrim (s, ss_cstr (CC_DIGITS));
680 token = (DT_QUARTER | DT_MONTH | DT_HOUR | DT_DAY | DT_WEEK
682 else if (value <= 12)
683 token = DT_MONTH | DT_HOUR | DT_DAY | DT_WEEK | DT_DAY_COUNT;
684 else if (value <= 23)
685 token = DT_HOUR | DT_DAY | DT_WEEK | DT_DAY_COUNT;
686 else if (value <= 31)
687 token = DT_DAY | DT_WEEK | DT_DAY_COUNT;
688 else if (value <= 52)
689 token = DT_WEEK | DT_DAY_COUNT;
691 token = DT_DAY_COUNT;
697 token |= DT_MINUTE | DT_SECOND;
699 else if (digit_cnt == 4)
706 /* Attempts to parse an identifier found in a date at the
707 beginning of S. Advances *S past the end of the token.
708 Returns the parsed token, or 0 if no valid token was
710 static enum date_token
711 recognize_identifier_token (struct substring *s)
713 size_t length = ss_span (*s, ss_cstr (CC_LETTERS));
714 enum date_token token = 0;
721 switch (c_tolower (s->string[0]))
737 int s0 = c_tolower ((unsigned char) s->string[0]);
738 int s1 = c_tolower ((unsigned char) s->string[1]);
739 token = recognize_id2 (s0, s1, false);
740 if (!token && s0 == 'w' && s1 == 'k')
747 int s0 = c_tolower ((unsigned char) s->string[0]);
748 int s1 = c_tolower ((unsigned char) s->string[1]);
749 int s2 = c_tolower ((unsigned char) s->string[2]);
750 token = recognize_id2 (s0, s1, true);
752 token = recognize_id3 (s0, s1, s2, length > 3);
753 if (!token && length == 4)
755 int s3 = c_tolower ((unsigned char) s->string[3]);
756 if (s0 == 'v' && s1 == 'i' && s2 == 'i' && s3 == 'i')
763 ss_advance (s, length);
767 static enum date_token
768 recognize_id2 (int s0, int s1, bool more)
773 case 's': weekday = s1 == 'a' || s1 == 'u'; break;
774 case 'm': weekday = s1 == 'o'; break;
775 case 't': weekday = s1 == 'u' || s1 == 'h'; break;
776 case 'w': weekday = s1 == 'e'; break;
777 case 'f': weekday = s1 == 'r'; break;
778 default: weekday = false; break;
788 case 'i': month = s1 == 'i' || s1 == 'v' || s1 == 'x'; break;
789 case 'v': month = s1 == 'i'; break;
790 case 'x': month = s1 == 'i'; break;
791 default: month = false; break;
800 static enum date_token
801 recognize_id3 (int s0, int s1, int s2, bool more)
807 month = ((s1 == 'a' && s2 == 'n')
808 || (s1 == 'u' && (s2 == 'n' || s2 == 'l')));
811 month = s1 == 'e' && s2 == 'b';
814 month = (s1 == 'a' && (s2 == 'r' || s2 == 'y'));
817 month = (s1 == 'p' && s2 == 'r') || (s1 == 'u' && s2 == 'g');
820 month = s1 == 'e' && s2 == 'p';
823 month = s1 == 'c' && s2 == 't';
826 month = s1 == 'o' && s2 == 'v';
829 month = s1 == 'e' && s2 == 'c';
835 return DT_MONTH | DT_ENGLISH_MONTH;
839 bool roman_month = false;
844 roman_month = s1 == 'i' && s2 == 'i';
847 roman_month = s1 == 'i' && s2 == 'i';