1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2008, 2010, 2011 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "data/format-guesser.h"
24 #include "data/format.h"
25 #include "data/settings.h"
26 #include "libpspp/assertion.h"
27 #include "libpspp/str.h"
29 #include "gl/c-ctype.h"
30 #include "gl/minmax.h"
31 #include "gl/xalloc.h"
33 /* A token in which potential date or time fields are broken.
35 The token type is actually a bit-map. This allows a single
36 token to represent multiple roles, as often happens in parsing
37 adate or a time. For example, the number "1" can be a quarter
38 of the year, month, hour, day of the month, week of the year,
39 or a count of days. Such ambiguities are resolved on the
40 higher-level bases of multiple tokens and multiple full
44 DT_DAY = 1 << 0, /* dd: Day of the month. */
45 DT_MONTH = 1 << 1, /* mm: Month. */
46 DT_ENGLISH_MONTH = 1 << 2, /* mmm: Spelled-out month, e.g. "jan". */
47 DT_YEAR = 1 << 3, /* yy: Year. */
49 DT_HOUR = 1 << 4, /* HH: Hour. */
50 DT_MINUTE = 1 << 5, /* MM: Minute. */
51 DT_SECOND = 1 << 6, /* SS: Second. */
53 DT_WEEKDAY = 1 << 7, /* www: Day of the week. */
55 DT_DAY_COUNT = 1 << 8, /* D: Number of days. */
56 DT_WEEK = 1 << 9, /* ww: Week of the year. */
57 DT_QUARTER = 1 << 10, /* q: Quarter of the year. */
59 DT_Q = 1 << 11, /* Literal "Q". */
60 DT_WK = 1 << 12, /* Literal "WK". */
62 DT_DELIM = 1 << 13, /* One of -/., or white space. */
63 DT_SPACE = 1 << 14, /* Any white space. */
64 DT_COLON = 1 << 15, /* : */
67 /* Syntax of a date format, in terms of the date tokens that
71 enum fmt_type format; /* Format type. */
73 size_t n_tokens; /* Number of tokens. */
74 enum date_token tokens[MAX_TOKENS]; /* Tokens. */
77 /* Syntax of all the data formats that we can parse.
79 The order in the array can make a difference in the final
80 choice of formats: in the case of a tie between the number of
81 times each format is seen, the syntax earlier in the array
82 takes precedence. The most important cases are the ordering
83 of DATE before EDATE, so that spelled-out months in input
84 yield DATE format (that produces spelled-out months in output,
85 and the ordering of EDATE before ADATE, so that ambiguous
86 dates such as "1/1/99" yield the more sensible European date
87 format instead of American format.
89 When a given date format has more than one syntax, they must
90 be in adjacent array elements. */
91 static struct date_syntax syntax[] =
94 { FMT_DATE, 5, {DT_DAY, DT_DELIM, DT_ENGLISH_MONTH, DT_DELIM, DT_YEAR} },
97 { FMT_EDATE, 5, {DT_DAY, DT_DELIM, DT_MONTH, DT_DELIM, DT_YEAR} },
100 { FMT_ADATE, 5, {DT_MONTH, DT_DELIM, DT_DAY, DT_DELIM, DT_YEAR} },
103 { FMT_SDATE, 5, {DT_YEAR, DT_DELIM, DT_MONTH, DT_DELIM, DT_DAY} },
106 { FMT_MOYR, 3, {DT_MONTH, DT_DELIM, DT_YEAR} },
109 { FMT_QYR, 3, {DT_QUARTER, DT_Q, DT_YEAR} },
112 { FMT_WKYR, 3, {DT_WEEK, DT_WK, DT_YEAR} },
114 /* dd-mmm-yyyy HH:MM */
116 9, {DT_DAY, DT_DELIM, DT_MONTH, DT_DELIM, DT_YEAR, DT_SPACE, DT_HOUR,
117 DT_COLON, DT_MINUTE} },
118 /* dd-mmm-yyyy HH:MM:SS */
120 11, {DT_DAY, DT_DELIM, DT_MONTH, DT_DELIM, DT_YEAR, DT_SPACE, DT_HOUR,
121 DT_COLON, DT_MINUTE, DT_COLON, DT_SECOND} },
123 /* yyyy-dd-mmm HH:MM */
125 9, {DT_YEAR, DT_DELIM, DT_MONTH, DT_DELIM, DT_DAY, DT_SPACE, DT_HOUR,
126 DT_COLON, DT_MINUTE} },
127 /* yyyy-dd-mmm HH:MM:SS */
129 11, {DT_YEAR, DT_DELIM, DT_MONTH, DT_DELIM, DT_DAY, DT_SPACE, DT_HOUR,
130 DT_COLON, DT_MINUTE, DT_COLON, DT_SECOND} },
133 { FMT_TIME, 3, {DT_HOUR, DT_COLON, DT_MINUTE} },
135 { FMT_TIME, 5, {DT_HOUR, DT_COLON, DT_MINUTE, DT_COLON, DT_SECOND} },
138 { FMT_DTIME, 5, {DT_DAY_COUNT, DT_SPACE, DT_HOUR, DT_COLON, DT_MINUTE} },
141 7, {DT_DAY_COUNT, DT_SPACE, DT_HOUR, DT_COLON, DT_MINUTE, DT_COLON,
145 { FMT_WKDAY, 1, {DT_WEEKDAY} },
149 We require a spelled-out English month so that
150 single-character Roman numerals like "i" and "x" don't get
151 detected as months. The latter is particularly common in
152 the password field of /etc/passwd-like files. */
153 { FMT_MONTH, 1, {DT_ENGLISH_MONTH} },
156 /* Number of recognized date syntax formats. */
157 #define DATE_SYNTAX_CNT (sizeof syntax / sizeof *syntax)
159 /* A format guesser. */
162 /* Maximum observed input width. */
165 /* Sum of the digits after the decimal point in each input
166 (divide by count to obtain average decimal positions). */
167 unsigned int decimals;
169 /* Number of non-empty, non-missing input values.
171 count is the sum of any_numeric, any_date, and the number
172 of inputs that were not in any recognized format (hence,
173 treated as A format). */
176 /* Numeric input formats. */
177 unsigned int any_numeric; /* Sum of following counts. */
178 unsigned int f; /* Number of inputs in F format. */
179 unsigned int comma; /* Number of inputs in COMMA format. */
180 unsigned int dot; /* Number of inputs in DOT format. */
181 unsigned int dollar; /* Number of inputs in DOLLAR format. */
182 unsigned int pct; /* Number of inputs in PCT format. */
183 unsigned int e; /* Number of inputs in E format. */
185 /* Date or time input formats.
187 The sum of the values in the date array is at least
188 any_date, often higher because many example dates match
189 more than one date format. */
190 unsigned int any_date; /* Number of inputs in any date format. */
191 unsigned int date[DATE_SYNTAX_CNT]; /* Number of inputs in each date
195 static bool add_numeric (struct fmt_guesser *, struct substring);
196 static void guess_numeric (struct fmt_guesser *, struct fmt_spec *);
197 static void add_date_time (struct fmt_guesser *, struct substring);
198 static bool match_date_syntax (const enum date_token a[], size_t a_len,
199 const enum date_token b[], size_t b_len);
200 static void guess_date_time (struct fmt_guesser *, struct fmt_spec *);
201 static enum date_token parse_date_token (struct substring *,
202 enum date_token tokens_seen,
204 static enum date_token parse_date_number (struct substring *,
205 enum date_token tokens_seen,
207 static enum date_token recognize_identifier_token (struct substring *);
208 static enum date_token recognize_id2 (int s0, int s1, bool more);
209 static enum date_token recognize_id3 (int s0, int s1, int s2, bool more);
211 /* Creates and returns a new format guesser. */
213 fmt_guesser_create (void)
215 struct fmt_guesser *g = xmalloc (sizeof *g);
216 fmt_guesser_clear (g);
220 /* Destroys format guesser G. */
222 fmt_guesser_destroy (struct fmt_guesser *g)
227 /* Clears the state of format guesser G, making it available for
228 guessing the format of a new input stream. */
230 fmt_guesser_clear (struct fmt_guesser *g)
232 memset (g, 0, sizeof *g);
235 /* Appends S to the stream of data items whose format G is
238 fmt_guesser_add (struct fmt_guesser *g, struct substring s)
240 if (ss_length (s) > g->width)
241 g->width = ss_length (s);
242 ss_trim (&s, ss_cstr (CC_SPACES));
243 if (ss_is_empty (s) || ss_equals (s, ss_cstr (".")))
245 /* Can't guess anything from an empty string or a missing value. */
250 if (!add_numeric (g, s))
251 add_date_time (g, s);
254 /* Guesses the format of the input previously added to G using
255 fmt_guesser_add, storing the guess into *F. The guessed
256 format may not actually a valid input or output format, in
257 that its width and number of decimal places may be outside the
258 valid range for the guessed format type. The caller must
259 therefore adjust the format to make it valid, e.g. by calling
262 fmt_guesser_guess (struct fmt_guesser *g, struct fmt_spec *f)
266 /* Set defaults. The guesser functions typically override
267 the width and type. */
272 if (g->any_numeric > g->count / 2)
273 guess_numeric (g, f);
274 else if (g->any_date > g->count / 2)
275 guess_date_time (g, f);
279 /* No data at all. Use fallback default. */
280 *f = fmt_default_for_width (0);
284 /* Numeric formats. */
286 /* Tries to parse S as a numeric (F, COMMA, DOT, DOLLAR, PCT, or
287 E) format. If successful, increments G's any_numeric counter
288 and the counter for the specific format S that S matches and
289 returns true. On failure, returns false without modifying G.
291 This function is intended to match exactly the same set of
292 strings that the actual numeric value parsers used by the
293 data_in function would match. */
295 add_numeric (struct fmt_guesser *g, struct substring s)
297 bool has_dollar; /* '$' appeared at start of S? */
298 bool has_percent; /* '%' appeared at end of S? */
299 int digits; /* Number of digits in S (before exponent). */
300 int dots; /* Number of '.' in S. */
301 int commas; /* Number of ',' in S. */
302 bool has_exp; /* [eEdD] appeared introducing exponent? */
303 bool has_exp_sign; /* '+' or '-' appeared in exponent? */
304 int exp_digits; /* Number of digits in exponent. */
306 int prev_delim; /* Initially 0, then ',' or '.' as delimiters seen. */
307 int delim_digits; /* Number of digits since last delimiter. */
309 int decimal; /* Decimal point character: '.', ',',
310 or 0 if unknown or no decimal point in S. */
311 int precision; /* Digits of precision after decimal point. */
315 /* Skip leading "$" and optional following white space. */
316 has_dollar = ss_match_byte (&s, '$');
318 ss_ltrim (&s, ss_cstr (CC_SPACES));
320 /* Skip optional sign. */
321 ss_match_byte_in (&s, ss_cstr ("+-"));
323 /* Skip digits punctuated by commas and dots. We don't know
324 whether the decimal point is a comma or a dot, so for now we
326 digits = dots = commas = 0;
329 for (; (c = ss_first (s)) != -1; ss_advance (&s, 1))
331 if (c >= '0' && c <= '9')
352 if (digits == 0 || (dots > 1 && commas > 1))
354 /* A valid number has at least one digit and can't have
355 more than one decimal point. */
359 /* Skip the optional exponent. */
360 has_exp = ss_match_byte_in (&s, ss_cstr ("eEdD")) != EOF;
361 has_exp_sign = ss_match_byte_in (&s, ss_cstr ("-+")) != EOF;
363 ss_match_byte (&s, ' ');
364 exp_digits = ss_ltrim (&s, ss_cstr (CC_DIGITS));
365 if ((has_exp || has_exp_sign) && !exp_digits)
367 /* Can't have the E or sign that leads in the exponent
368 without actually having an exponent. */
372 /* Skip optional '%'. */
373 has_percent = ss_match_byte (&s, '%');
374 if (has_dollar && has_percent)
376 /* A valid number cannot have both '$' and '%'. */
380 /* Make sure there's no trailing garbage. */
381 if (!ss_is_empty (s))
384 /* Figure out the decimal point (and therefore grouping)
385 character and the number of digits following the decimal
386 point. Sometimes the answer is ambiguous. */
387 if (dots > 1 && prev_delim == '.')
389 /* Can't have multiple decimal points, so '.' must really
390 be the grouping character, with a precision of 0. */
394 else if (commas > 1 && prev_delim == ',')
396 /* Can't have multiple decimal points, so ',' must really
397 be the grouping character, with a precision of 0. */
401 else if (delim_digits == 3 && (!dots || !commas))
403 /* The input is something like "1.234" or "1,234" where we
404 can't tell whether the ',' or '.' is a grouping or
405 decimal character. Assume that the decimal character
406 from the settings is in use. */
407 if (prev_delim == settings_get_fmt_settings ()->decimal)
409 decimal = prev_delim;
410 precision = delim_digits;
414 decimal = prev_delim == '.' ? ',' : '.';
420 /* The final delimiter is a decimal point, and the digits
421 following it are decimals. */
422 decimal = prev_delim;
423 precision = delim_digits;
426 /* Decide the most likely format. */
428 g->decimals += precision;
431 else if (has_percent)
433 else if (commas && decimal == '.')
435 else if (dots && decimal == ',')
437 else if (has_exp || has_exp_sign)
445 /* Guess which numeric format is most likely represented by G,
446 and store it in F's type and d members. (f->w is already
449 guess_numeric (struct fmt_guesser *g, struct fmt_spec *f)
451 int decimal_char = settings_get_fmt_settings ()->decimal;
453 f->d = g->decimals / g->count;
457 f->type = FMT_DOLLAR;
458 else if (g->comma > g->dot)
459 f->type = decimal_char == '.' ? FMT_COMMA : FMT_DOT;
460 else if (g->dot > g->comma)
461 f->type = decimal_char == '.' ? FMT_DOT : FMT_COMMA;
462 else if (g->e > g->any_numeric / 2)
468 /* Tries to parse S as a date (DATE, ADATE, EDATE, SDATE, QYR,
469 MOYR, WKYR, DATETIME, or YMDHMS), time (TIME or DTIME), or
470 date component (WKDAY or MONTH) format. If successful,
471 increments G's any_date counter and the counter or counters
472 for the specific format(s) that S matches. On failure, does
475 XXX How can we distinguish MTIME from TIME? One way might be
476 that TIME can have three parts (HH:MM:SS) but MTIME only ever
479 Does not attempt to recognize JDATE format: it looks just like
480 F format and will thus be caught by the numeric parser.
482 This function is intended to match a set of strings close to
483 those that actual date and time parsers used by the data_in
484 function would match, but somewhat pickier. In particular,
485 minutes and seconds are only recognized when they have exactly
486 two digits: "1:02:03" is a valid time, but "1:2:3" is
489 add_date_time (struct fmt_guesser *g, struct substring s)
491 enum date_token token;
492 enum date_token tokens[MAX_TOKENS];
493 enum date_token tokens_seen;
499 /* Break S into tokens. */
503 while (!ss_is_empty (s))
505 if (n_tokens >= MAX_TOKENS)
508 token = parse_date_token (&s, tokens_seen, &decimals);
511 tokens[n_tokens++] = token;
512 tokens_seen |= token;
517 /* Find matching date formats, if any, and increment the
518 counter for each one of them. */
520 for (i = 0; i < DATE_SYNTAX_CNT; i++)
522 struct date_syntax *s = &syntax[i];
523 if (match_date_syntax (tokens, n_tokens, s->tokens, s->n_tokens))
532 g->decimals += decimals;
536 /* Returns true if the A_LEN tokens in A[] match the B_LEN tokens
537 in B[], false otherwise. */
539 match_date_syntax (const enum date_token a[], size_t a_len,
540 const enum date_token b[], size_t b_len)
547 for (i = 0; i < a_len; i++)
554 /* Guess which date or time format is most likely represented by
555 G, and store it in F's type and d members. (f->w is already
558 guess_date_time (struct fmt_guesser *g, struct fmt_spec *f)
560 unsigned int max = 0;
563 /* Choose the date format matched by the most inputs. Break
564 ties by choosing the earliest in the array. */
565 for (i = 0; i < DATE_SYNTAX_CNT; i = j)
567 unsigned int sum = g->date[i];
568 for (j = i + 1; j < DATE_SYNTAX_CNT; j++)
570 if (syntax[i].format != syntax[j].format)
576 f->type = syntax[i].format;
581 /* Formats that include a time have an optional seconds field.
582 If we saw a seconds field in any of the inputs, make sure
583 that the field width is large enough to include for them.
584 (We use the minimum input width, but an output width would
585 be equally appropriate, since all the time formats have the
586 same minimum widths for input and output.) */
587 if (f->type == FMT_DATETIME || f->type == FMT_YMDHMS
588 || f->type == FMT_MTIME || f->type == FMT_TIME || f->type == FMT_DTIME)
590 for (i = 0; i < DATE_SYNTAX_CNT; i++)
592 && syntax[i].tokens[syntax[i].n_tokens - 1] == DT_SECOND)
594 f->d = g->decimals / g->count;
595 f->w = MAX (f->w, fmt_min_input_width (f->type) + 3);
600 /* Extracts the next date token from the string represented by S,
601 which must not be an empty string, and advances *S past the
602 end of the token. Returns the parsed token, or 0 if no valid
605 TOKENS_SEEN should be a bitmap representing all the tokens
606 already seen in this input; this is used to resolve some
607 otherwise ambiguous parsing situation. If a count of seconds
608 is parsed, *DECIMALS is set to the number of digits after the
610 static enum date_token
611 parse_date_token (struct substring *s, enum date_token tokens_seen,
614 int c = ss_first (*s);
618 case '0': case '1': case '2': case '3': case '4':
619 case '5': case '6': case '7': case '8': case '9':
620 return parse_date_number (s, tokens_seen, decimals);
624 /* '+' or '-' at the start of a string, or following a
625 space, could be the sign that optionally introduces a
626 time, e.g. "-1:00" in TIME format, "-1 1:00" in DTIME
627 format, or "1/1/1978 +1:00" in DATETIME format. */
628 if ((!tokens_seen || s->string[-1] == ' ') && c_isdigit (ss_at (*s, 1)))
631 ss_ltrim (s, ss_cstr (CC_DIGITS));
632 return DT_DAY_COUNT | DT_HOUR;
637 case '/': case '.': case ',':
645 case ' ': case '\t': case '\v': case '\r': case '\n':
647 enum date_token token;
649 token = recognize_identifier_token (s);
651 ss_match_byte_in (s, ss_cstr (CC_SPACES));
653 token = DT_DELIM | DT_SPACE;
658 return recognize_identifier_token (s);
665 /* Parses a digit sequence found in a date token. Advances *S
666 past the end of the token. Returns the parsed token, or 0 if
667 no valid token was found.
669 TOKENS_SEEN should be a bitmap representing all the tokens
670 already seen in this input; this is used to resolve some
671 otherwise ambiguous parsing situation. If a count of seconds
672 is parsed, *DECIMALS is set to the number of digits after the
674 static enum date_token
675 parse_date_number (struct substring *s, enum date_token tokens_seen,
679 size_t n_digits = ss_get_long (s, &value);
680 enum date_token token = 0;
682 if (ss_match_byte (s, settings_get_fmt_settings ()->decimal)
683 && tokens_seen & DT_COLON
686 /* Parse digits after the decimal point. */
688 *decimals = ss_ltrim (s, ss_cstr (CC_DIGITS));
693 token = (DT_QUARTER | DT_MONTH | DT_HOUR | DT_DAY | DT_WEEK
695 else if (value <= 12)
696 token = DT_MONTH | DT_HOUR | DT_DAY | DT_WEEK | DT_DAY_COUNT;
697 else if (value <= 23)
698 token = DT_HOUR | DT_DAY | DT_WEEK | DT_DAY_COUNT;
699 else if (value <= 31)
700 token = DT_DAY | DT_WEEK | DT_DAY_COUNT;
701 else if (value <= 52)
702 token = DT_WEEK | DT_DAY_COUNT;
704 token = DT_DAY_COUNT;
710 token |= DT_MINUTE | DT_SECOND;
712 else if (n_digits == 4)
719 /* Attempts to parse an identifier found in a date at the
720 beginning of S. Advances *S past the end of the token.
721 Returns the parsed token, or 0 if no valid token was
723 static enum date_token
724 recognize_identifier_token (struct substring *s)
726 size_t length = ss_span (*s, ss_cstr (CC_LETTERS));
727 enum date_token token = 0;
734 switch (c_tolower (s->string[0]))
750 int s0 = c_tolower ((unsigned char) s->string[0]);
751 int s1 = c_tolower ((unsigned char) s->string[1]);
752 token = recognize_id2 (s0, s1, false);
753 if (!token && s0 == 'w' && s1 == 'k')
760 int s0 = c_tolower ((unsigned char) s->string[0]);
761 int s1 = c_tolower ((unsigned char) s->string[1]);
762 int s2 = c_tolower ((unsigned char) s->string[2]);
763 token = recognize_id2 (s0, s1, true);
765 token = recognize_id3 (s0, s1, s2, length > 3);
766 if (!token && length == 4)
768 int s3 = c_tolower ((unsigned char) s->string[3]);
769 if (s0 == 'v' && s1 == 'i' && s2 == 'i' && s3 == 'i')
776 ss_advance (s, length);
780 static enum date_token
781 recognize_id2 (int s0, int s1, bool more)
786 case 's': weekday = s1 == 'a' || s1 == 'u'; break;
787 case 'm': weekday = s1 == 'o'; break;
788 case 't': weekday = s1 == 'u' || s1 == 'h'; break;
789 case 'w': weekday = s1 == 'e'; break;
790 case 'f': weekday = s1 == 'r'; break;
791 default: weekday = false; break;
801 case 'i': month = s1 == 'i' || s1 == 'v' || s1 == 'x'; break;
802 case 'v': month = s1 == 'i'; break;
803 case 'x': month = s1 == 'i'; break;
804 default: month = false; break;
813 static enum date_token
814 recognize_id3 (int s0, int s1, int s2, bool more)
820 month = ((s1 == 'a' && s2 == 'n')
821 || (s1 == 'u' && (s2 == 'n' || s2 == 'l')));
824 month = s1 == 'e' && s2 == 'b';
827 month = (s1 == 'a' && (s2 == 'r' || s2 == 'y'));
830 month = (s1 == 'p' && s2 == 'r') || (s1 == 'u' && s2 == 'g');
833 month = s1 == 'e' && s2 == 'p';
836 month = s1 == 'c' && s2 == 't';
839 month = s1 == 'o' && s2 == 'v';
842 month = s1 == 'e' && s2 == 'c';
848 return DT_MONTH | DT_ENGLISH_MONTH;
852 bool roman_month = false;
857 roman_month = s1 == 'i' && s2 == 'i';
860 roman_month = s1 == 'i' && s2 == 'i';