src/data/format-guesser.c

   1 /* PSPP - a program for statistical analysis.
   2    Copyright (C) 2008 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  16
  17 #include <config.h>
  18
  19 #include "format-guesser.h"
  20
  21 #include <stdlib.h>
  22 #include <string.h>
  23
  24 #include "c-ctype.h"
  25 #include "minmax.h"
  26 #include "xalloc.h"
  27
  28 #include <data/format.h>
  29 #include <data/settings.h>
  30 #include <libpspp/assertion.h>
  31 #include <libpspp/str.h>
  32
  33 /* A token in which potential date or time fields are broken.
  34
  35    The token type is actually a bit-map.  This allows a single
  36    token to represent multiple roles, as often happens in parsing
  37    adate or a time.  For example, the number "1" can be a quarter
  38    of the year, month, hour, day of the month, week of the year,
  39    or a count of days.  Such ambiguities are resolved on the
  40    higher-level bases of multiple tokens and multiple full
  41    dates. */
  42 enum date_token
  43   {
  44     DT_DAY = 1 << 0,            /* dd: Day of the month. */
  45     DT_MONTH = 1 << 1,          /* mm: Month. */
  46     DT_ENGLISH_MONTH = 1 << 2,  /* mmm: Spelled-out month, e.g. "jan". */
  47     DT_YEAR = 1 << 3,           /* yy: Year. */
  48
  49     DT_HOUR = 1 << 4,           /* HH: Hour. */
  50     DT_MINUTE = 1 << 5,         /* MM: Minute. */
  51     DT_SECOND = 1 << 6,         /* SS: Second. */
  52
  53     DT_WEEKDAY = 1 << 7,        /* www: Day of the week. */
  54
  55     DT_DAY_COUNT = 1 << 8,      /* D: Number of days. */
  56     DT_WEEK = 1 << 9,           /* ww: Week of the year. */
  57     DT_QUARTER = 1 << 10,       /* q: Quarter of the year. */
  58
  59     DT_Q = 1 << 11,             /* Literal "Q". */
  60     DT_WK = 1 << 12,            /* Literal "WK". */
  61
  62     DT_DELIM = 1 << 13,         /* One of -/., or white space. */
  63     DT_SPACE = 1 << 14,         /* Any white space. */
  64     DT_COLON = 1 << 15,         /* : */
  65   };
  66
  67 /* Syntax of a date format, in terms of the date tokens that
  68    compose it.*/
  69 struct date_syntax
  70   {
  71     enum fmt_type format;       /* Format type. */
  72 #define MAX_TOKENS 11
  73     size_t token_cnt;           /* Number of tokens. */
  74     enum date_token tokens[MAX_TOKENS]; /* Tokens. */
  75   };
  76
  77 /* Syntax of all the data formats that we can parse.
  78
  79    The order in the array can make a difference in the final
  80    choice of formats: in the case of a tie between the number of
  81    times each format is seen, the syntax earlier in the array
  82    takes precedence.  The most important cases are the ordering
  83    of DATE before EDATE, so that spelled-out months in input
  84    yield DATE format (that produces spelled-out months in output,
  85    and the ordering of EDATE before ADATE, so that ambiguous
  86    dates such as "1/1/99" yield the more sensible European date
  87    format instead of American format.
  88
  89    When a given date format has more than one syntax, they must
  90    be in adjacent array elements. */
  91 static struct date_syntax syntax[] =
  92   {
  93     /* dd-mmm-yy */
  94     { FMT_DATE, 5, {DT_DAY, DT_DELIM, DT_ENGLISH_MONTH, DT_DELIM, DT_YEAR} },
  95
  96     /* dd.mm.yy */
  97     { FMT_EDATE, 5, {DT_DAY, DT_DELIM, DT_MONTH, DT_DELIM, DT_YEAR} },
  98
  99     /* mm/dd/yy */
 100     { FMT_ADATE, 5, {DT_MONTH, DT_DELIM, DT_DAY, DT_DELIM, DT_YEAR} },
 101
 102     /* yy/mm/dd */
 103     { FMT_SDATE, 5, {DT_YEAR, DT_DELIM, DT_MONTH, DT_DELIM, DT_DAY} },
 104
 105     /* mmm yy */
 106     { FMT_MOYR, 3, {DT_MONTH, DT_DELIM, DT_YEAR} },
 107
 108     /* q Q yy */
 109     { FMT_QYR, 3, {DT_QUARTER, DT_Q, DT_YEAR} },
 110
 111     /* ww WK yy */
 112     { FMT_WKYR, 3, {DT_WEEK, DT_WK, DT_YEAR} },
 113
 114     /* dd-mmm-yyyy HH:MM */
 115     { FMT_DATETIME,
 116       9, {DT_DAY, DT_DELIM, DT_MONTH, DT_DELIM, DT_YEAR, DT_SPACE, DT_HOUR,
 117           DT_COLON, DT_MINUTE} },
 118     /* dd-mmm-yyyy HH:MM:SS */
 119     { FMT_DATETIME,
 120       11, {DT_DAY, DT_DELIM, DT_MONTH, DT_DELIM, DT_YEAR, DT_SPACE, DT_HOUR,
 121            DT_COLON, DT_MINUTE, DT_COLON, DT_SECOND} },
 122
 123     /* HH:MM */
 124     { FMT_TIME, 3, {DT_HOUR, DT_COLON, DT_MINUTE} },
 125     /* HH:MM:SS */
 126     { FMT_TIME, 5, {DT_HOUR, DT_COLON, DT_MINUTE, DT_COLON, DT_SECOND} },
 127
 128     /* D HH:MM */
 129     { FMT_DTIME, 5, {DT_DAY_COUNT, DT_SPACE, DT_HOUR, DT_COLON, DT_MINUTE} },
 130     /* D HH:MM:SS */
 131     { FMT_DTIME,
 132       7, {DT_DAY_COUNT, DT_SPACE, DT_HOUR, DT_COLON, DT_MINUTE, DT_COLON,
 133           DT_SECOND} },
 134
 135     /* www */
 136     { FMT_WKDAY, 1, {DT_WEEKDAY} },
 137
 138     /* mmm */
 139     { FMT_MONTH, 1, {DT_MONTH} },
 140   };
 141
 142 /* Number of recognized date syntax formats. */
 143 #define DATE_SYNTAX_CNT (sizeof syntax / sizeof *syntax)
 144
 145 /* A format guesser. */
 146 struct fmt_guesser
 147   {
 148     /* Maximum observed input width. */
 149     unsigned int width;
 150
 151     /* Sum of the digits after the decimal point in each input
 152        (divide by count to obtain average decimal positions). */
 153     unsigned int decimals;
 154
 155     /* Number of non-empty, non-missing input values.
 156
 157        count is the sum of any_numeric, any_date, and the number
 158        of inputs that were not in any recognized format (hence,
 159        treated as A format). */
 160     unsigned int count;
 161
 162     /* Numeric input formats. */
 163     unsigned int any_numeric;   /* Sum of following counts. */
 164     unsigned int f;             /* Number of inputs in F format. */
 165     unsigned int comma;         /* Number of inputs in COMMA format. */
 166     unsigned int dot;           /* Number of inputs in DOT format. */
 167     unsigned int dollar;        /* Number of inputs in DOLLAR format. */
 168     unsigned int pct;           /* Number of inputs in PCT format. */
 169     unsigned int e;             /* Number of inputs in E format. */
 170
 171     /* Date or time input formats.
 172
 173        The sum of the values in the date array is at least
 174        any_date, often higher because many example dates match
 175        more than one date format. */
 176     unsigned int any_date;      /* Number of inputs in any date format. */
 177     unsigned int date[DATE_SYNTAX_CNT]; /* Number of inputs in each date
 178                                            format. */
 179   };
 180
 181 static bool add_numeric (struct fmt_guesser *, struct substring);
 182 static void guess_numeric (struct fmt_guesser *, struct fmt_spec *);
 183 static void add_date_time (struct fmt_guesser *, struct substring);
 184 static bool match_date_syntax (const enum date_token a[], size_t a_len,
 185                                const enum date_token b[], size_t b_len);
 186 static void guess_date_time (struct fmt_guesser *, struct fmt_spec *);
 187 static enum date_token parse_date_token (struct substring *,
 188                                          enum date_token tokens_seen,
 189                                          int *decimals);
 190 static enum date_token parse_date_number (struct substring *,
 191                                           enum date_token tokens_seen,
 192                                           int *decimals);
 193 static enum date_token recognize_identifier_token (struct substring *);
 194 static enum date_token recognize_id2 (int s0, int s1, bool more);
 195 static enum date_token recognize_id3 (int s0, int s1, int s2, bool more);
 196
 197 /* Creates and returns a new format guesser. */
 198 struct fmt_guesser *
 199 fmt_guesser_create (void)
 200 {
 201   struct fmt_guesser *g = xmalloc (sizeof *g);
 202   fmt_guesser_clear (g);
 203   return g;
 204 }
 205
 206 /* Destroys format guesser G. */
 207 void
 208 fmt_guesser_destroy (struct fmt_guesser *g)
 209 {
 210   free (g);
 211 }
 212
 213 /* Clears the state of format guesser G, making it available for
 214    guessing the format of a new input stream.  */
 215 void
 216 fmt_guesser_clear (struct fmt_guesser *g)
 217 {
 218   memset (g, 0, sizeof *g);
 219 }
 220
 221 /* Appends S to the stream of data items whose format G is
 222    guessing. */
 223 void
 224 fmt_guesser_add (struct fmt_guesser *g, struct substring s)
 225 {
 226   if (ss_length (s) > g->width)
 227     g->width = ss_length (s);
 228   ss_trim (&s, ss_cstr (CC_SPACES));
 229   if (ss_is_empty (s) || ss_equals (s, ss_cstr (".")))
 230     {
 231       /* Can't guess anything from an empty string or a missing value. */
 232       return;
 233     }
 234
 235   g->count++;
 236   if (!add_numeric (g, s))
 237     add_date_time (g, s);
 238 }
 239
 240 /* Guesses the format of the input previously added to G using
 241    fmt_guesser_add, storing the guess into *F.  The guessed
 242    format may not actually a valid input or output format, in
 243    that its width and number of decimal places may be outside the
 244    valid range for the guessed format type.  The caller must
 245    therefore adjust the format to make it valid, e.g. by calling
 246    fmt_fix. */
 247 void
 248 fmt_guesser_guess (struct fmt_guesser *g, struct fmt_spec *f)
 249 {
 250   if (g->count > 0)
 251     {
 252       /* Set defaults.  The guesser functions typically override
 253          the width and type. */
 254       f->type = FMT_A;
 255       f->w = g->width;
 256       f->d = 0;
 257
 258       if (g->any_numeric > g->count / 2)
 259         guess_numeric (g, f);
 260       else if (g->any_date > g->count / 2)
 261         guess_date_time (g, f);
 262     }
 263   else
 264     {
 265       /* No data at all.  Use fallback default. */
 266       *f = fmt_default_for_width (0);
 267     }
 268 }
 269 \f
 270 /* Numeric formats. */
 271
 272 /* Tries to parse S as a numeric (F, COMMA, DOT, DOLLAR, PCT, or
 273    E) format.  If successful, increments G's any_numeric counter
 274    and the counter for the specific format S that S matches and
 275    returns true.  On failure, returns false without modifying G.
 276
 277    This function is intended to match exactly the same set of
 278    strings that the actual numeric value parsers used by the
 279    data_in function would match. */
 280 static bool
 281 add_numeric (struct fmt_guesser *g, struct substring s)
 282 {
 283   bool has_dollar;              /* '$' appeared at start of S? */
 284   bool has_percent;             /* '%' appeared at end of S? */
 285   int digits;                   /* Number of digits in S (before exponent). */
 286   int dots;                     /* Number of '.' in S. */
 287   int commas;                   /* Number of ',' in S. */
 288   bool has_exp;                 /* [eEdD] appeared introducing exponent? */
 289   bool has_exp_sign;            /* '+' or '-' appeared in exponent? */
 290   int exp_digits;               /* Number of digits in exponent. */
 291
 292   int prev_delim;       /* Initially 0, then ',' or '.' as delimiters seen. */
 293   int delim_digits;             /* Number of digits since last delimiter. */
 294
 295   int decimal;                  /* Decimal point character: '.', ',',
 296                                    or 0 if unknown or no decimal point in S. */
 297   int precision;                /* Digits of precision after decimal point. */
 298
 299   int c;
 300
 301   /* Skip leading "$" and optional following white space. */
 302   has_dollar = ss_match_char (&s, '$');
 303   if (has_dollar)
 304     ss_ltrim (&s, ss_cstr (CC_SPACES));
 305
 306   /* Skip optional sign. */
 307   ss_match_char_in (&s, ss_cstr ("+-"));
 308
 309   /* Skip digits punctuated by commas and dots.  We don't know
 310      whether the decimal point is a comma or a dot, so for now we
 311      just count them.  */
 312   digits = dots = commas = 0;
 313   delim_digits = 0;
 314   prev_delim = 0;
 315   for (; (c = ss_first (s)) != -1; ss_advance (&s, 1))
 316     {
 317       if (c >= '0' && c <= '9')
 318         {
 319           digits++;
 320           if (dots || commas)
 321             delim_digits++;
 322         }
 323       else if (c == '.' )
 324         {
 325           dots++;
 326           prev_delim = c;
 327           delim_digits = 0;
 328         }
 329       else if (c == ',')
 330         {
 331           commas++;
 332           prev_delim = c;
 333           delim_digits = 0;
 334         }
 335       else
 336         break;
 337     }
 338   if (digits == 0 || (dots > 1 && commas > 1))
 339     {
 340       /* A valid number has at least one digit and can't have
 341          more than one decimal point. */
 342       return false;
 343     }
 344
 345   /* Skip the optional exponent. */
 346   has_exp = ss_match_char_in (&s, ss_cstr ("eEdD")) != EOF;
 347   has_exp_sign = ss_match_char_in (&s, ss_cstr ("-+")) != EOF;
 348   if (has_exp_sign)
 349     ss_match_char (&s, ' ');
 350   exp_digits = ss_ltrim (&s, ss_cstr (CC_DIGITS));
 351   if ((has_exp || has_exp_sign) && !exp_digits)
 352     {
 353       /* Can't have the E or sign that leads in the exponent
 354          without actually having an exponent. */
 355       return false;
 356     }
 357
 358   /* Skip optional '%'. */
 359   has_percent = ss_match_char (&s, '%');
 360   if (has_dollar && has_percent)
 361     {
 362       /* A valid number cannot have both '$' and '%'. */
 363       return false;
 364     }
 365
 366   /* Make sure there's no trailing garbage. */
 367   if (!ss_is_empty (s))
 368     return false;
 369
 370   /* Figure out the decimal point (and therefore grouping)
 371      character and the number of digits following the decimal
 372      point.  Sometimes the answer is ambiguous. */
 373   if (dots > 1 && prev_delim == '.')
 374     {
 375       /* Can't have multiple decimal points, so '.' must really
 376          be the grouping character, with a precision of 0. */
 377       decimal = ',';
 378       precision = 0;
 379     }
 380   else if (commas > 1 && prev_delim == ',')
 381     {
 382       /* Can't have multiple decimal points, so ',' must really
 383          be the grouping character, with a precision of 0. */
 384       decimal = '.';
 385       precision = 0;
 386     }
 387   else if (delim_digits == 3 && (!dots || !commas))
 388     {
 389       /* The input is something like "1.234" or "1,234" where we
 390          can't tell whether the ',' or '.' is a grouping or
 391          decimal character.  Assume that the decimal character
 392          from the settings is in use. */
 393       if (prev_delim == settings_get_decimal_char (FMT_F))
 394         {
 395           decimal = prev_delim;
 396           precision = delim_digits;
 397         }
 398       else
 399         {
 400           decimal = prev_delim == '.' ? ',' : '.';
 401           precision = 0;
 402         }
 403     }
 404   else
 405     {
 406       /* The final delimiter is a decimal point, and the digits
 407          following it are decimals. */
 408       decimal = prev_delim;
 409       precision = delim_digits;
 410     }
 411
 412   /* Decide the most likely format. */
 413   g->any_numeric++;
 414   g->decimals += precision;
 415   if (has_dollar)
 416     g->dollar++;
 417   else if (has_percent)
 418     g->pct++;
 419   else if (commas && decimal == '.')
 420     g->comma++;
 421   else if (dots && decimal == ',')
 422     g->dot++;
 423   else if (has_exp || has_exp_sign)
 424     g->e++;
 425   else
 426     g->f++;
 427
 428   return true;
 429 }
 430
 431 /* Guess which numeric format is most likely represented by G,
 432    and store it in F's type and d members.  (f->w is already
 433    initialized.) */
 434 static void
 435 guess_numeric (struct fmt_guesser *g, struct fmt_spec *f)
 436 {
 437   int decimal_char = settings_get_decimal_char (FMT_COMMA);
 438
 439   f->d = g->decimals / g->count;
 440   if (g->pct)
 441     f->type = FMT_PCT;
 442   else if (g->dollar)
 443     f->type = FMT_DOLLAR;
 444   else if (g->comma > g->dot)
 445     f->type = decimal_char == '.' ? FMT_COMMA : FMT_DOT;
 446   else if (g->dot > g->comma)
 447     f->type = decimal_char == '.' ? FMT_DOT : FMT_COMMA;
 448   else if (g->e > g->any_numeric / 2)
 449     f->type = FMT_E;
 450   else
 451     f->type = FMT_F;
 452 }
 453 \f
 454 /* Tries to parse S as a date (DATE, ADATE, EDATE, SDATE, QYR,
 455    MOYR, WKYR, or DATETIME), time (TIME or DTIME), or date
 456    component (WKDAY or MONTH) format.  If successful, increments
 457    G's any_date counter and the counter or counters for the
 458    specific format(s) that S matches.  On failure, does not
 459    modify G.
 460
 461    Does not attempt to recognize JDATE format: it looks just like
 462    F format and will thus be caught by the numeric parser.
 463
 464    This function is intended to match a set of strings close to
 465    those that actual date and time parsers used by the data_in
 466    function would match, but somewhat pickier.  In particular,
 467    minutes and seconds are only recognized when they have exactly
 468    two digits: "1:02:03" is a valid time, but "1:2:3" is
 469    rejected.  */
 470 static void
 471 add_date_time (struct fmt_guesser *g, struct substring s)
 472 {
 473   enum date_token token;
 474   enum date_token tokens[MAX_TOKENS];
 475   enum date_token tokens_seen;
 476   size_t token_cnt;
 477   int decimals;
 478   bool is_date;
 479   int i;
 480
 481   /* Break S into tokens. */
 482   token_cnt = 0;
 483   tokens_seen = 0;
 484   decimals = 0;
 485   while (!ss_is_empty (s))
 486     {
 487       if (token_cnt >= MAX_TOKENS)
 488         return;
 489
 490       token = parse_date_token (&s, tokens_seen, &decimals);
 491       if (token == 0)
 492         return;
 493       tokens[token_cnt++] = token;
 494       tokens_seen |= token;
 495     }
 496   if (token_cnt == 0)
 497     return;
 498
 499   /* Find matching date formats, if any, and increment the
 500      counter for each one of them. */
 501   is_date = false;
 502   for (i = 0; i < DATE_SYNTAX_CNT; i++)
 503     {
 504       struct date_syntax *s = &syntax[i];
 505       if (match_date_syntax (tokens, token_cnt, s->tokens, s->token_cnt))
 506         {
 507           is_date = true;
 508           g->date[i]++;
 509         }
 510     }
 511   if (is_date)
 512     {
 513       g->any_date++;
 514       g->decimals += decimals;
 515     }
 516 }
 517
 518 /* Returns true if the A_LEN tokens in A[] match the B_LEN tokens
 519    in B[], false otherwise. */
 520 static bool
 521 match_date_syntax (const enum date_token a[], size_t a_len,
 522                    const enum date_token b[], size_t b_len)
 523 {
 524   size_t i;
 525
 526   if (a_len != b_len)
 527     return false;
 528
 529   for (i = 0; i < a_len; i++)
 530     if (!(a[i] & b[i]))
 531       return false;
 532
 533   return true;
 534 }
 535
 536 /* Guess which date or time format is most likely represented by
 537    G, and store it in F's type and d members.  (f->w is already
 538    initialized.) */
 539 static void
 540 guess_date_time (struct fmt_guesser *g, struct fmt_spec *f)
 541 {
 542   unsigned int max = 0;
 543   int i, j;
 544
 545   /* Choose the date format matched by the most inputs.  Break
 546      ties by choosing the earliest in the array. */
 547   for (i = 0; i < DATE_SYNTAX_CNT; i = j)
 548     {
 549       unsigned int sum = g->date[i];
 550       for (j = i + 1; j < DATE_SYNTAX_CNT; j++)
 551         {
 552           if (syntax[i].format != syntax[j].format)
 553             break;
 554           sum += g->date[j];
 555         }
 556       if (sum > max)
 557         {
 558           f->type = syntax[i].format;
 559           max = sum;
 560         }
 561     }
 562
 563   /* Formats that include a time have an optional seconds field.
 564      If we saw a seconds field in any of the inputs, make sure
 565      that the field width is large enough to include for them.
 566      (We use the minimum input width, but an output width would
 567      be equally appropriate, since all the time formats have the
 568      same minimum widths for input and output.)  */
 569   if (f->type == FMT_DATETIME || f->type == FMT_TIME
 570       || f->type == FMT_DTIME)
 571     {
 572       for (i = 0; i < DATE_SYNTAX_CNT; i++)
 573         if (g->date[i]
 574             && syntax[i].tokens[syntax[i].token_cnt - 1] == DT_SECOND)
 575           {
 576             f->d = g->decimals / g->count;
 577             f->w = MAX (f->w, fmt_min_input_width (f->type) + 3);
 578           }
 579     }
 580 }
 581
 582 /* Extracts the next date token from the string represented by S,
 583    which must not be an empty string, and advances *S past the
 584    end of the token.  Returns the parsed token, or 0 if no valid
 585    token was found.
 586
 587    TOKENS_SEEN should be a bitmap representing all the tokens
 588    already seen in this input; this is used to resolve some
 589    otherwise ambiguous parsing situation.  If a count of seconds
 590    is parsed, *DECIMALS is set to the number of digits after the
 591    decimal point.  */
 592 static enum date_token
 593 parse_date_token (struct substring *s, enum date_token tokens_seen,
 594                   int *decimals)
 595 {
 596   int c = ss_first (*s);
 597
 598   switch (c)
 599     {
 600     case '0': case '1': case '2': case '3': case '4':
 601     case '5': case '6': case '7': case '8': case '9':
 602       return parse_date_number (s, tokens_seen, decimals);
 603
 604     case '+':
 605     case '-':
 606       /* '+' or '-' at the start of a string, or following a
 607          space, could be the sign that optionally introduces a
 608          time, e.g. "-1:00" in TIME format, "-1 1:00" in DTIME
 609          format, or "1/1/1978 +1:00" in DATETIME format. */
 610       if ((!tokens_seen || s->string[-1] == ' ') && c_isdigit (ss_at (*s, 1)))
 611         {
 612           ss_advance (s, 1);
 613           ss_ltrim (s, ss_cstr (CC_DIGITS));
 614           return DT_DAY_COUNT | DT_HOUR;
 615         }
 616       else if (c == '+')
 617         return 0;
 618       /* Fall through. */
 619     case '/': case '.': case ',':
 620       ss_advance (s, 1);
 621       return DT_DELIM;
 622
 623     case ':':
 624       ss_advance (s, 1);
 625       return DT_COLON;
 626
 627     case ' ': case '\t': case '\v': case '\r': case '\n':
 628       {
 629         enum date_token token;
 630         ss_advance (s, 1);
 631         token = recognize_identifier_token (s);
 632         if (token)
 633           ss_match_char_in (s, ss_cstr (CC_SPACES));
 634         else
 635           token = DT_DELIM | DT_SPACE;
 636         return token;
 637       }
 638
 639     default:
 640       return recognize_identifier_token (s);
 641
 642     case EOF:
 643       NOT_REACHED ();
 644     }
 645 }
 646
 647 /* Parses a digit sequence found in a date token.  Advances *S
 648    past the end of the token.  Returns the parsed token, or 0 if
 649    no valid token was found.
 650
 651    TOKENS_SEEN should be a bitmap representing all the tokens
 652    already seen in this input; this is used to resolve some
 653    otherwise ambiguous parsing situation.  If a count of seconds
 654    is parsed, *DECIMALS is set to the number of digits after the
 655    decimal point.*/
 656 static enum date_token
 657 parse_date_number (struct substring *s, enum date_token tokens_seen,
 658                    int *decimals)
 659 {
 660   long int value;
 661   size_t digit_cnt = ss_get_long (s, &value);
 662   enum date_token token = 0;
 663
 664   if (ss_match_char (s, settings_get_decimal_char (FMT_F))
 665       && tokens_seen & DT_COLON
 666       && value <= 59)
 667     {
 668       /* Parse digits after the decimal point. */
 669       token = DT_SECOND;
 670       *decimals = ss_ltrim (s, ss_cstr (CC_DIGITS));
 671     }
 672   else
 673     {
 674       if (value <= 4)
 675         token = (DT_QUARTER | DT_MONTH | DT_HOUR | DT_DAY | DT_WEEK
 676                  | DT_DAY_COUNT);
 677       else if (value <= 12)
 678         token = DT_MONTH | DT_HOUR | DT_DAY | DT_WEEK | DT_DAY_COUNT;
 679       else if (value <= 23)
 680         token = DT_HOUR | DT_DAY | DT_WEEK | DT_DAY_COUNT;
 681       else if (value <= 31)
 682         token = DT_DAY | DT_WEEK | DT_DAY_COUNT;
 683       else if (value <= 52)
 684         token = DT_WEEK | DT_DAY_COUNT;
 685       else
 686         token = DT_DAY_COUNT;
 687
 688       if (digit_cnt == 2)
 689         {
 690           token |= DT_YEAR;
 691           if (value <= 59)
 692             token |= DT_MINUTE | DT_SECOND;
 693         }
 694       else if (digit_cnt == 4)
 695         token |= DT_YEAR;
 696     }
 697
 698   return token;
 699 }
 700
 701 /* Attempts to parse an identifier found in a date at the
 702    beginning of S.  Advances *S past the end of the token.
 703    Returns the parsed token, or 0 if no valid token was
 704    found.  */
 705 static enum date_token
 706 recognize_identifier_token (struct substring *s)
 707 {
 708   size_t length = ss_span (*s, ss_cstr (CC_LETTERS));
 709   enum date_token token = 0;
 710   switch (length)
 711     {
 712     case 0:
 713       break;
 714
 715     case 1:
 716       switch (c_tolower (s->string[0]))
 717         {
 718         case 'i':
 719         case 'v':
 720         case 'x':
 721           token = DT_MONTH;
 722           break;
 723
 724         case 'q':
 725           token = DT_Q;
 726           break;
 727         }
 728       break;
 729
 730     case 2:
 731       {
 732         int s0 = c_tolower ((unsigned char) s->string[0]);
 733         int s1 = c_tolower ((unsigned char) s->string[1]);
 734         token = recognize_id2 (s0, s1, false);
 735         if (!token && s0 == 'w' && s1 == 'k')
 736           token = DT_WK;
 737       }
 738       break;
 739
 740     default:
 741       {
 742         int s0 = c_tolower ((unsigned char) s->string[0]);
 743         int s1 = c_tolower ((unsigned char) s->string[1]);
 744         int s2 = c_tolower ((unsigned char) s->string[2]);
 745         token = recognize_id2 (s0, s1, true);
 746         if (!token)
 747           token = recognize_id3 (s0, s1, s2, length > 3);
 748         if (!token && length == 4)
 749           {
 750             int s3 = c_tolower ((unsigned char) s->string[3]);
 751             if (s0 == 'v' && s1 == 'i' && s2 == 'i' && s3 == 'i')
 752               token = DT_MONTH;
 753           }
 754       }
 755       break;
 756     }
 757   if (token)
 758     ss_advance (s, length);
 759   return token;
 760 }
 761
 762 static enum date_token
 763 recognize_id2 (int s0, int s1, bool more)
 764 {
 765   bool weekday;
 766   switch (s0)
 767     {
 768     case 's': weekday = s1 == 'a' || s1 == 'u'; break;
 769     case 'm': weekday = s1 == 'o'; break;
 770     case 't': weekday = s1 == 'u' || s1 == 'h'; break;
 771     case 'w': weekday = s1 == 'e'; break;
 772     case 'f': weekday = s1 == 'r'; break;
 773     default: weekday = false; break;
 774     }
 775   if (weekday)
 776     return DT_WEEKDAY;
 777
 778   if (!more)
 779     {
 780       bool month;
 781       switch (s0)
 782         {
 783         case 'i': month = s1 == 'i' || s1 == 'v' || s1 == 'x'; break;
 784         case 'v': month = s1 == 'i'; break;
 785         case 'x': month = s1 == 'i'; break;
 786         default: month = false; break;
 787         }
 788       if (month)
 789         return DT_MONTH;
 790     }
 791
 792   return 0;
 793 }
 794
 795 static enum date_token
 796 recognize_id3 (int s0, int s1, int s2, bool more)
 797 {
 798   bool month;
 799   switch (s0)
 800     {
 801     case 'j':
 802       month = ((s1 == 'a' && s2 == 'n')
 803                || (s1 == 'u' && (s2 == 'n' || s2 == 'l')));
 804       break;
 805     case 'f':
 806       month = s1 == 'e' && s2 == 'b';
 807       break;
 808     case 'm':
 809       month = (s1 == 'a' && (s2 == 'r' || s2 == 'y'));
 810       break;
 811     case 'a':
 812       month = (s1 == 'p' && s2 == 'r') || (s1 == 'u' && s2 == 'g');
 813       break;
 814     case 's':
 815       month = s1 == 'e' && s2 == 'p';
 816       break;
 817     case 'o':
 818       month = s1 == 'c' && s2 == 't';
 819       break;
 820     case 'n':
 821       month = s1 == 'o' && s2 == 'v';
 822       break;
 823     case 'd':
 824       month = s1 == 'e' && s2 == 'c';
 825       break;
 826     default:
 827       month = false;
 828     }
 829   if (month)
 830     return DT_MONTH | DT_ENGLISH_MONTH;
 831
 832   if (!more)
 833     {
 834       bool roman_month = false;
 835       switch (s0)
 836         {
 837         case 'i':
 838         case 'x':
 839           roman_month = s1 == 'i' && s2 == 'i';
 840           break;
 841         case 'v':
 842           roman_month = s1 == 'i' && s2 == 'i';
 843           break;
 844         }
 845       if (roman_month)
 846         return DT_MONTH;
 847     }
 848
 849   return 0;
 850 }
 851
 852
 853
 854