pintos-os.org Git - pspp/blob - src/data/format-guesser.c

   1 /* PSPP - a program for statistical analysis.
   2    Copyright (C) 2008, 2010, 2011 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  16
  17 #include <config.h>
  18
  19 #include "data/format-guesser.h"
  20
  21 #include <stdlib.h>
  22 #include <string.h>
  23
  24 #include "data/format.h"
  25 #include "data/settings.h"
  26 #include "libpspp/assertion.h"
  27 #include "libpspp/str.h"
  28
  29 #include "gl/c-ctype.h"
  30 #include "gl/minmax.h"
  31 #include "gl/xalloc.h"
  32
  33 /* A token in which potential date or time fields are broken.
  34
  35    The token type is actually a bit-map.  This allows a single
  36    token to represent multiple roles, as often happens in parsing
  37    adate or a time.  For example, the number "1" can be a quarter
  38    of the year, month, hour, day of the month, week of the year,
  39    or a count of days.  Such ambiguities are resolved on the
  40    higher-level bases of multiple tokens and multiple full
  41    dates. */
  42 enum date_token
  43   {
  44     DT_DAY = 1 << 0,            /* dd: Day of the month. */
  45     DT_MONTH = 1 << 1,          /* mm: Month. */
  46     DT_ENGLISH_MONTH = 1 << 2,  /* mmm: Spelled-out month, e.g. "jan". */
  47     DT_YEAR = 1 << 3,           /* yy: Year. */
  48
  49     DT_HOUR = 1 << 4,           /* HH: Hour. */
  50     DT_MINUTE = 1 << 5,         /* MM: Minute. */
  51     DT_SECOND = 1 << 6,         /* SS: Second. */
  52
  53     DT_WEEKDAY = 1 << 7,        /* www: Day of the week. */
  54
  55     DT_DAY_COUNT = 1 << 8,      /* D: Number of days. */
  56     DT_WEEK = 1 << 9,           /* ww: Week of the year. */
  57     DT_QUARTER = 1 << 10,       /* q: Quarter of the year. */
  58
  59     DT_Q = 1 << 11,             /* Literal "Q". */
  60     DT_WK = 1 << 12,            /* Literal "WK". */
  61
  62     DT_DELIM = 1 << 13,         /* One of -/., or white space. */
  63     DT_SPACE = 1 << 14,         /* Any white space. */
  64     DT_COLON = 1 << 15,         /* : */
  65   };
  66
  67 /* Syntax of a date format, in terms of the date tokens that
  68    compose it.*/
  69 struct date_syntax
  70   {
  71     enum fmt_type format;       /* Format type. */
  72 #define MAX_TOKENS 11
  73     size_t n_tokens;           /* Number of tokens. */
  74     enum date_token tokens[MAX_TOKENS]; /* Tokens. */
  75   };
  76
  77 /* Syntax of all the data formats that we can parse.
  78
  79    The order in the array can make a difference in the final
  80    choice of formats: in the case of a tie between the number of
  81    times each format is seen, the syntax earlier in the array
  82    takes precedence.  The most important cases are the ordering
  83    of DATE before EDATE, so that spelled-out months in input
  84    yield DATE format (that produces spelled-out months in output,
  85    and the ordering of EDATE before ADATE, so that ambiguous
  86    dates such as "1/1/99" yield the more sensible European date
  87    format instead of American format.
  88
  89    When a given date format has more than one syntax, they must
  90    be in adjacent array elements. */
  91 static struct date_syntax syntax[] =
  92   {
  93     /* dd-mmm-yy */
  94     { FMT_DATE, 5, {DT_DAY, DT_DELIM, DT_ENGLISH_MONTH, DT_DELIM, DT_YEAR} },
  95
  96     /* dd.mm.yy */
  97     { FMT_EDATE, 5, {DT_DAY, DT_DELIM, DT_MONTH, DT_DELIM, DT_YEAR} },
  98
  99     /* mm/dd/yy */
 100     { FMT_ADATE, 5, {DT_MONTH, DT_DELIM, DT_DAY, DT_DELIM, DT_YEAR} },
 101
 102     /* yy/mm/dd */
 103     { FMT_SDATE, 5, {DT_YEAR, DT_DELIM, DT_MONTH, DT_DELIM, DT_DAY} },
 104
 105     /* mmm yy */
 106     { FMT_MOYR, 3, {DT_MONTH, DT_DELIM, DT_YEAR} },
 107
 108     /* q Q yy */
 109     { FMT_QYR, 3, {DT_QUARTER, DT_Q, DT_YEAR} },
 110
 111     /* ww WK yy */
 112     { FMT_WKYR, 3, {DT_WEEK, DT_WK, DT_YEAR} },
 113
 114     /* dd-mmm-yyyy HH:MM */
 115     { FMT_DATETIME,
 116       9, {DT_DAY, DT_DELIM, DT_MONTH, DT_DELIM, DT_YEAR, DT_SPACE, DT_HOUR,
 117           DT_COLON, DT_MINUTE} },
 118     /* dd-mmm-yyyy HH:MM:SS */
 119     { FMT_DATETIME,
 120       11, {DT_DAY, DT_DELIM, DT_MONTH, DT_DELIM, DT_YEAR, DT_SPACE, DT_HOUR,
 121            DT_COLON, DT_MINUTE, DT_COLON, DT_SECOND} },
 122
 123     /* yyyy-dd-mmm HH:MM */
 124     { FMT_YMDHMS,
 125       9, {DT_YEAR, DT_DELIM, DT_MONTH, DT_DELIM, DT_DAY, DT_SPACE, DT_HOUR,
 126           DT_COLON, DT_MINUTE} },
 127     /* yyyy-dd-mmm HH:MM:SS */
 128     { FMT_YMDHMS,
 129       11, {DT_YEAR, DT_DELIM, DT_MONTH, DT_DELIM, DT_DAY, DT_SPACE, DT_HOUR,
 130            DT_COLON, DT_MINUTE, DT_COLON, DT_SECOND} },
 131
 132     /* HH:MM */
 133     { FMT_TIME, 3, {DT_HOUR, DT_COLON, DT_MINUTE} },
 134     /* HH:MM:SS */
 135     { FMT_TIME, 5, {DT_HOUR, DT_COLON, DT_MINUTE, DT_COLON, DT_SECOND} },
 136
 137     /* D HH:MM */
 138     { FMT_DTIME, 5, {DT_DAY_COUNT, DT_SPACE, DT_HOUR, DT_COLON, DT_MINUTE} },
 139     /* D HH:MM:SS */
 140     { FMT_DTIME,
 141       7, {DT_DAY_COUNT, DT_SPACE, DT_HOUR, DT_COLON, DT_MINUTE, DT_COLON,
 142           DT_SECOND} },
 143
 144     /* www */
 145     { FMT_WKDAY, 1, {DT_WEEKDAY} },
 146
 147     /* mmm
 148
 149        We require a spelled-out English month so that
 150        single-character Roman numerals like "i" and "x" don't get
 151        detected as months.  The latter is particularly common in
 152        the password field of /etc/passwd-like files. */
 153     { FMT_MONTH, 1, {DT_ENGLISH_MONTH} },
 154   };
 155
 156 /* Number of recognized date syntax formats. */
 157 #define DATE_SYNTAX_CNT (sizeof syntax / sizeof *syntax)
 158
 159 /* A format guesser. */
 160 struct fmt_guesser
 161   {
 162     /* Maximum observed input width. */
 163     unsigned int width;
 164
 165     /* Sum of the digits after the decimal point in each input
 166        (divide by count to obtain average decimal positions). */
 167     unsigned int decimals;
 168
 169     /* Number of non-empty, non-missing input values.
 170
 171        count is the sum of any_numeric, any_date, and the number
 172        of inputs that were not in any recognized format (hence,
 173        treated as A format). */
 174     unsigned int count;
 175
 176     /* Numeric input formats. */
 177     unsigned int any_numeric;   /* Sum of following counts. */
 178     unsigned int f;             /* Number of inputs in F format. */
 179     unsigned int comma;         /* Number of inputs in COMMA format. */
 180     unsigned int dot;           /* Number of inputs in DOT format. */
 181     unsigned int dollar;        /* Number of inputs in DOLLAR format. */
 182     unsigned int pct;           /* Number of inputs in PCT format. */
 183     unsigned int e;             /* Number of inputs in E format. */
 184
 185     /* Date or time input formats.
 186
 187        The sum of the values in the date array is at least
 188        any_date, often higher because many example dates match
 189        more than one date format. */
 190     unsigned int any_date;      /* Number of inputs in any date format. */
 191     unsigned int date[DATE_SYNTAX_CNT]; /* Number of inputs in each date
 192                                            format. */
 193   };
 194
 195 static bool add_numeric (struct fmt_guesser *, struct substring);
 196 static void guess_numeric (struct fmt_guesser *, struct fmt_spec *);
 197 static void add_date_time (struct fmt_guesser *, struct substring);
 198 static bool match_date_syntax (const enum date_token a[], size_t a_len,
 199                                const enum date_token b[], size_t b_len);
 200 static void guess_date_time (struct fmt_guesser *, struct fmt_spec *);
 201 static enum date_token parse_date_token (struct substring *,
 202                                          enum date_token tokens_seen,
 203                                          int *decimals);
 204 static enum date_token parse_date_number (struct substring *,
 205                                           enum date_token tokens_seen,
 206                                           int *decimals);
 207 static enum date_token recognize_identifier_token (struct substring *);
 208 static enum date_token recognize_id2 (int s0, int s1, bool more);
 209 static enum date_token recognize_id3 (int s0, int s1, int s2, bool more);
 210
 211 /* Creates and returns a new format guesser. */
 212 struct fmt_guesser *
 213 fmt_guesser_create (void)
 214 {
 215   struct fmt_guesser *g = xmalloc (sizeof *g);
 216   fmt_guesser_clear (g);
 217   return g;
 218 }
 219
 220 /* Destroys format guesser G. */
 221 void
 222 fmt_guesser_destroy (struct fmt_guesser *g)
 223 {
 224   free (g);
 225 }
 226
 227 /* Clears the state of format guesser G, making it available for
 228    guessing the format of a new input stream.  */
 229 void
 230 fmt_guesser_clear (struct fmt_guesser *g)
 231 {
 232   memset (g, 0, sizeof *g);
 233 }
 234
 235 /* Appends S to the stream of data items whose format G is
 236    guessing. */
 237 void
 238 fmt_guesser_add (struct fmt_guesser *g, struct substring s)
 239 {
 240   if (ss_length (s) > g->width)
 241     g->width = ss_length (s);
 242   ss_trim (&s, ss_cstr (CC_SPACES));
 243   if (ss_is_empty (s) || ss_equals (s, ss_cstr (".")))
 244     {
 245       /* Can't guess anything from an empty string or a missing value. */
 246       return;
 247     }
 248
 249   g->count++;
 250   if (!add_numeric (g, s))
 251     add_date_time (g, s);
 252 }
 253
 254 /* Guesses the format of the input previously added to G using
 255    fmt_guesser_add, storing the guess into *F.  The guessed
 256    format may not actually a valid input or output format, in
 257    that its width and number of decimal places may be outside the
 258    valid range for the guessed format type.  The caller must
 259    therefore adjust the format to make it valid, e.g. by calling
 260    fmt_fix. */
 261 void
 262 fmt_guesser_guess (struct fmt_guesser *g, struct fmt_spec *f)
 263 {
 264   if (g->count > 0)
 265     {
 266       /* Set defaults.  The guesser functions typically override
 267          the width and type. */
 268       f->type = FMT_A;
 269       f->w = g->width;
 270       f->d = 0;
 271
 272       if (g->any_numeric > g->count / 2)
 273         guess_numeric (g, f);
 274       else if (g->any_date > g->count / 2)
 275         guess_date_time (g, f);
 276     }
 277   else
 278     {
 279       /* No data at all.  Use fallback default. */
 280       *f = fmt_default_for_width (0);
 281     }
 282 }
 283 \f
 284 /* Numeric formats. */
 285
 286 /* Tries to parse S as a numeric (F, COMMA, DOT, DOLLAR, PCT, or
 287    E) format.  If successful, increments G's any_numeric counter
 288    and the counter for the specific format S that S matches and
 289    returns true.  On failure, returns false without modifying G.
 290
 291    This function is intended to match exactly the same set of
 292    strings that the actual numeric value parsers used by the
 293    data_in function would match. */
 294 static bool
 295 add_numeric (struct fmt_guesser *g, struct substring s)
 296 {
 297   bool has_dollar;              /* '$' appeared at start of S? */
 298   bool has_percent;             /* '%' appeared at end of S? */
 299   int digits;                   /* Number of digits in S (before exponent). */
 300   int dots;                     /* Number of '.' in S. */
 301   int commas;                   /* Number of ',' in S. */
 302   bool has_exp;                 /* [eEdD] appeared introducing exponent? */
 303   bool has_exp_sign;            /* '+' or '-' appeared in exponent? */
 304   int exp_digits;               /* Number of digits in exponent. */
 305
 306   int prev_delim;       /* Initially 0, then ',' or '.' as delimiters seen. */
 307   int delim_digits;             /* Number of digits since last delimiter. */
 308
 309   int decimal;                  /* Decimal point character: '.', ',',
 310                                    or 0 if unknown or no decimal point in S. */
 311   int precision;                /* Digits of precision after decimal point. */
 312
 313   int c;
 314
 315   /* Skip leading "$" and optional following white space. */
 316   has_dollar = ss_match_byte (&s, '$');
 317   if (has_dollar)
 318     ss_ltrim (&s, ss_cstr (CC_SPACES));
 319
 320   /* Skip optional sign. */
 321   ss_match_byte_in (&s, ss_cstr ("+-"));
 322
 323   /* Skip digits punctuated by commas and dots.  We don't know
 324      whether the decimal point is a comma or a dot, so for now we
 325      just count them.  */
 326   digits = dots = commas = 0;
 327   delim_digits = 0;
 328   prev_delim = 0;
 329   for (; (c = ss_first (s)) != -1; ss_advance (&s, 1))
 330     {
 331       if (c >= '0' && c <= '9')
 332         {
 333           digits++;
 334           if (dots || commas)
 335             delim_digits++;
 336         }
 337       else if (c == '.')
 338         {
 339           dots++;
 340           prev_delim = c;
 341           delim_digits = 0;
 342         }
 343       else if (c == ',')
 344         {
 345           commas++;
 346           prev_delim = c;
 347           delim_digits = 0;
 348         }
 349       else
 350         break;
 351     }
 352   if (digits == 0 || (dots > 1 && commas > 1))
 353     {
 354       /* A valid number has at least one digit and can't have
 355          more than one decimal point. */
 356       return false;
 357     }
 358
 359   /* Skip the optional exponent. */
 360   has_exp = ss_match_byte_in (&s, ss_cstr ("eEdD")) != EOF;
 361   has_exp_sign = ss_match_byte_in (&s, ss_cstr ("-+")) != EOF;
 362   if (has_exp_sign)
 363     ss_match_byte (&s, ' ');
 364   exp_digits = ss_ltrim (&s, ss_cstr (CC_DIGITS));
 365   if ((has_exp || has_exp_sign) && !exp_digits)
 366     {
 367       /* Can't have the E or sign that leads in the exponent
 368          without actually having an exponent. */
 369       return false;
 370     }
 371
 372   /* Skip optional '%'. */
 373   has_percent = ss_match_byte (&s, '%');
 374   if (has_dollar && has_percent)
 375     {
 376       /* A valid number cannot have both '$' and '%'. */
 377       return false;
 378     }
 379
 380   /* Make sure there's no trailing garbage. */
 381   if (!ss_is_empty (s))
 382     return false;
 383
 384   /* Figure out the decimal point (and therefore grouping)
 385      character and the number of digits following the decimal
 386      point.  Sometimes the answer is ambiguous. */
 387   if (dots > 1 && prev_delim == '.')
 388     {
 389       /* Can't have multiple decimal points, so '.' must really
 390          be the grouping character, with a precision of 0. */
 391       decimal = ',';
 392       precision = 0;
 393     }
 394   else if (commas > 1 && prev_delim == ',')
 395     {
 396       /* Can't have multiple decimal points, so ',' must really
 397          be the grouping character, with a precision of 0. */
 398       decimal = '.';
 399       precision = 0;
 400     }
 401   else if (delim_digits == 3 && (!dots || !commas))
 402     {
 403       /* The input is something like "1.234" or "1,234" where we
 404          can't tell whether the ',' or '.' is a grouping or
 405          decimal character.  Assume that the decimal character
 406          from the settings is in use. */
 407       if (prev_delim == settings_get_fmt_settings ()->decimal)
 408         {
 409           decimal = prev_delim;
 410           precision = delim_digits;
 411         }
 412       else
 413         {
 414           decimal = prev_delim == '.' ? ',' : '.';
 415           precision = 0;
 416         }
 417     }
 418   else
 419     {
 420       /* The final delimiter is a decimal point, and the digits
 421          following it are decimals. */
 422       decimal = prev_delim;
 423       precision = delim_digits;
 424     }
 425
 426   /* Decide the most likely format. */
 427   g->any_numeric++;
 428   g->decimals += precision;
 429   if (has_dollar)
 430     g->dollar++;
 431   else if (has_percent)
 432     g->pct++;
 433   else if (commas && decimal == '.')
 434     g->comma++;
 435   else if (dots && decimal == ',')
 436     g->dot++;
 437   else if (has_exp || has_exp_sign)
 438     g->e++;
 439   else
 440     g->f++;
 441
 442   return true;
 443 }
 444
 445 /* Guess which numeric format is most likely represented by G,
 446    and store it in F's type and d members.  (f->w is already
 447    initialized.) */
 448 static void
 449 guess_numeric (struct fmt_guesser *g, struct fmt_spec *f)
 450 {
 451   int decimal_char = settings_get_fmt_settings ()->decimal;
 452
 453   f->d = g->decimals / g->count;
 454   if (g->pct)
 455     f->type = FMT_PCT;
 456   else if (g->dollar)
 457     f->type = FMT_DOLLAR;
 458   else if (g->comma > g->dot)
 459     f->type = decimal_char == '.' ? FMT_COMMA : FMT_DOT;
 460   else if (g->dot > g->comma)
 461     f->type = decimal_char == '.' ? FMT_DOT : FMT_COMMA;
 462   else if (g->e > g->any_numeric / 2)
 463     f->type = FMT_E;
 464   else
 465     f->type = FMT_F;
 466 }
 467 \f
 468 /* Tries to parse S as a date (DATE, ADATE, EDATE, SDATE, QYR,
 469    MOYR, WKYR, DATETIME, or YMDHMS), time (TIME or DTIME), or
 470    date component (WKDAY or MONTH) format.  If successful,
 471    increments G's any_date counter and the counter or counters
 472    for the specific format(s) that S matches.  On failure, does
 473    not modify G.
 474
 475    XXX How can we distinguish MTIME from TIME?  One way might be
 476    that TIME can have three parts (HH:MM:SS) but MTIME only ever
 477    has two (MM:SS).
 478
 479    Does not attempt to recognize JDATE format: it looks just like
 480    F format and will thus be caught by the numeric parser.
 481
 482    This function is intended to match a set of strings close to
 483    those that actual date and time parsers used by the data_in
 484    function would match, but somewhat pickier.  In particular,
 485    minutes and seconds are only recognized when they have exactly
 486    two digits: "1:02:03" is a valid time, but "1:2:3" is
 487    rejected.  */
 488 static void
 489 add_date_time (struct fmt_guesser *g, struct substring s)
 490 {
 491   enum date_token token;
 492   enum date_token tokens[MAX_TOKENS];
 493   enum date_token tokens_seen;
 494   size_t n_tokens;
 495   int decimals;
 496   bool is_date;
 497   int i;
 498
 499   /* Break S into tokens. */
 500   n_tokens = 0;
 501   tokens_seen = 0;
 502   decimals = 0;
 503   while (!ss_is_empty (s))
 504     {
 505       if (n_tokens >= MAX_TOKENS)
 506         return;
 507
 508       token = parse_date_token (&s, tokens_seen, &decimals);
 509       if (token == 0)
 510         return;
 511       tokens[n_tokens++] = token;
 512       tokens_seen |= token;
 513     }
 514   if (n_tokens == 0)
 515     return;
 516
 517   /* Find matching date formats, if any, and increment the
 518      counter for each one of them. */
 519   is_date = false;
 520   for (i = 0; i < DATE_SYNTAX_CNT; i++)
 521     {
 522       struct date_syntax *s = &syntax[i];
 523       if (match_date_syntax (tokens, n_tokens, s->tokens, s->n_tokens))
 524         {
 525           is_date = true;
 526           g->date[i]++;
 527         }
 528     }
 529   if (is_date)
 530     {
 531       g->any_date++;
 532       g->decimals += decimals;
 533     }
 534 }
 535
 536 /* Returns true if the A_LEN tokens in A[] match the B_LEN tokens
 537    in B[], false otherwise. */
 538 static bool
 539 match_date_syntax (const enum date_token a[], size_t a_len,
 540                    const enum date_token b[], size_t b_len)
 541 {
 542   size_t i;
 543
 544   if (a_len != b_len)
 545     return false;
 546
 547   for (i = 0; i < a_len; i++)
 548     if (!(a[i] & b[i]))
 549       return false;
 550
 551   return true;
 552 }
 553
 554 /* Guess which date or time format is most likely represented by
 555    G, and store it in F's type and d members.  (f->w is already
 556    initialized.) */
 557 static void
 558 guess_date_time (struct fmt_guesser *g, struct fmt_spec *f)
 559 {
 560   unsigned int max = 0;
 561   int i, j;
 562
 563   /* Choose the date format matched by the most inputs.  Break
 564      ties by choosing the earliest in the array. */
 565   for (i = 0; i < DATE_SYNTAX_CNT; i = j)
 566     {
 567       unsigned int sum = g->date[i];
 568       for (j = i + 1; j < DATE_SYNTAX_CNT; j++)
 569         {
 570           if (syntax[i].format != syntax[j].format)
 571             break;
 572           sum += g->date[j];
 573         }
 574       if (sum > max)
 575         {
 576           f->type = syntax[i].format;
 577           max = sum;
 578         }
 579     }
 580
 581   /* Formats that include a time have an optional seconds field.
 582      If we saw a seconds field in any of the inputs, make sure
 583      that the field width is large enough to include for them.
 584      (We use the minimum input width, but an output width would
 585      be equally appropriate, since all the time formats have the
 586      same minimum widths for input and output.)  */
 587   if (f->type == FMT_DATETIME || f->type == FMT_YMDHMS
 588       || f->type == FMT_MTIME || f->type == FMT_TIME || f->type == FMT_DTIME)
 589     {
 590       for (i = 0; i < DATE_SYNTAX_CNT; i++)
 591         if (g->date[i]
 592             && syntax[i].tokens[syntax[i].n_tokens - 1] == DT_SECOND)
 593           {
 594             f->d = g->decimals / g->count;
 595             f->w = MAX (f->w, fmt_min_input_width (f->type) + 3);
 596           }
 597     }
 598 }
 599
 600 /* Extracts the next date token from the string represented by S,
 601    which must not be an empty string, and advances *S past the
 602    end of the token.  Returns the parsed token, or 0 if no valid
 603    token was found.
 604
 605    TOKENS_SEEN should be a bitmap representing all the tokens
 606    already seen in this input; this is used to resolve some
 607    otherwise ambiguous parsing situation.  If a count of seconds
 608    is parsed, *DECIMALS is set to the number of digits after the
 609    decimal point.  */
 610 static enum date_token
 611 parse_date_token (struct substring *s, enum date_token tokens_seen,
 612                   int *decimals)
 613 {
 614   int c = ss_first (*s);
 615
 616   switch (c)
 617     {
 618     case '0': case '1': case '2': case '3': case '4':
 619     case '5': case '6': case '7': case '8': case '9':
 620       return parse_date_number (s, tokens_seen, decimals);
 621
 622     case '+':
 623     case '-':
 624       /* '+' or '-' at the start of a string, or following a
 625          space, could be the sign that optionally introduces a
 626          time, e.g. "-1:00" in TIME format, "-1 1:00" in DTIME
 627          format, or "1/1/1978 +1:00" in DATETIME format. */
 628       if ((!tokens_seen || s->string[-1] == ' ') && c_isdigit (ss_at (*s, 1)))
 629         {
 630           ss_advance (s, 1);
 631           ss_ltrim (s, ss_cstr (CC_DIGITS));
 632           return DT_DAY_COUNT | DT_HOUR;
 633         }
 634       else if (c == '+')
 635         return 0;
 636       /* Fall through. */
 637     case '/': case '.': case ',':
 638       ss_advance (s, 1);
 639       return DT_DELIM;
 640
 641     case ':':
 642       ss_advance (s, 1);
 643       return DT_COLON;
 644
 645     case ' ': case '\t': case '\v': case '\r': case '\n':
 646       {
 647         enum date_token token;
 648         ss_advance (s, 1);
 649         token = recognize_identifier_token (s);
 650         if (token)
 651           ss_match_byte_in (s, ss_cstr (CC_SPACES));
 652         else
 653           token = DT_DELIM | DT_SPACE;
 654         return token;
 655       }
 656
 657     default:
 658       return recognize_identifier_token (s);
 659
 660     case EOF:
 661       NOT_REACHED ();
 662     }
 663 }
 664
 665 /* Parses a digit sequence found in a date token.  Advances *S
 666    past the end of the token.  Returns the parsed token, or 0 if
 667    no valid token was found.
 668
 669    TOKENS_SEEN should be a bitmap representing all the tokens
 670    already seen in this input; this is used to resolve some
 671    otherwise ambiguous parsing situation.  If a count of seconds
 672    is parsed, *DECIMALS is set to the number of digits after the
 673    decimal point.*/
 674 static enum date_token
 675 parse_date_number (struct substring *s, enum date_token tokens_seen,
 676                    int *decimals)
 677 {
 678   long int value;
 679   size_t n_digits = ss_get_long (s, &value);
 680   enum date_token token = 0;
 681
 682   if (ss_match_byte (s, settings_get_fmt_settings ()->decimal)
 683       && tokens_seen & DT_COLON
 684       && value <= 59)
 685     {
 686       /* Parse digits after the decimal point. */
 687       token = DT_SECOND;
 688       *decimals = ss_ltrim (s, ss_cstr (CC_DIGITS));
 689     }
 690   else
 691     {
 692       if (value <= 4)
 693         token = (DT_QUARTER | DT_MONTH | DT_HOUR | DT_DAY | DT_WEEK
 694                  | DT_DAY_COUNT);
 695       else if (value <= 12)
 696         token = DT_MONTH | DT_HOUR | DT_DAY | DT_WEEK | DT_DAY_COUNT;
 697       else if (value <= 23)
 698         token = DT_HOUR | DT_DAY | DT_WEEK | DT_DAY_COUNT;
 699       else if (value <= 31)
 700         token = DT_DAY | DT_WEEK | DT_DAY_COUNT;
 701       else if (value <= 52)
 702         token = DT_WEEK | DT_DAY_COUNT;
 703       else
 704         token = DT_DAY_COUNT;
 705
 706       if (n_digits == 2)
 707         {
 708           token |= DT_YEAR;
 709           if (value <= 59)
 710             token |= DT_MINUTE | DT_SECOND;
 711         }
 712       else if (n_digits == 4)
 713         token |= DT_YEAR;
 714     }
 715
 716   return token;
 717 }
 718
 719 /* Attempts to parse an identifier found in a date at the
 720    beginning of S.  Advances *S past the end of the token.
 721    Returns the parsed token, or 0 if no valid token was
 722    found.  */
 723 static enum date_token
 724 recognize_identifier_token (struct substring *s)
 725 {
 726   size_t length = ss_span (*s, ss_cstr (CC_LETTERS));
 727   enum date_token token = 0;
 728   switch (length)
 729     {
 730     case 0:
 731       break;
 732
 733     case 1:
 734       switch (c_tolower (s->string[0]))
 735         {
 736         case 'i':
 737         case 'v':
 738         case 'x':
 739           token = DT_MONTH;
 740           break;
 741
 742         case 'q':
 743           token = DT_Q;
 744           break;
 745         }
 746       break;
 747
 748     case 2:
 749       {
 750         int s0 = c_tolower ((unsigned char) s->string[0]);
 751         int s1 = c_tolower ((unsigned char) s->string[1]);
 752         token = recognize_id2 (s0, s1, false);
 753         if (!token && s0 == 'w' && s1 == 'k')
 754           token = DT_WK;
 755       }
 756       break;
 757
 758     default:
 759       {
 760         int s0 = c_tolower ((unsigned char) s->string[0]);
 761         int s1 = c_tolower ((unsigned char) s->string[1]);
 762         int s2 = c_tolower ((unsigned char) s->string[2]);
 763         token = recognize_id2 (s0, s1, true);
 764         if (!token)
 765           token = recognize_id3 (s0, s1, s2, length > 3);
 766         if (!token && length == 4)
 767           {
 768             int s3 = c_tolower ((unsigned char) s->string[3]);
 769             if (s0 == 'v' && s1 == 'i' && s2 == 'i' && s3 == 'i')
 770               token = DT_MONTH;
 771           }
 772       }
 773       break;
 774     }
 775   if (token)
 776     ss_advance (s, length);
 777   return token;
 778 }
 779
 780 static enum date_token
 781 recognize_id2 (int s0, int s1, bool more)
 782 {
 783   bool weekday;
 784   switch (s0)
 785     {
 786     case 's': weekday = s1 == 'a' || s1 == 'u'; break;
 787     case 'm': weekday = s1 == 'o'; break;
 788     case 't': weekday = s1 == 'u' || s1 == 'h'; break;
 789     case 'w': weekday = s1 == 'e'; break;
 790     case 'f': weekday = s1 == 'r'; break;
 791     default: weekday = false; break;
 792     }
 793   if (weekday)
 794     return DT_WEEKDAY;
 795
 796   if (!more)
 797     {
 798       bool month;
 799       switch (s0)
 800         {
 801         case 'i': month = s1 == 'i' || s1 == 'v' || s1 == 'x'; break;
 802         case 'v': month = s1 == 'i'; break;
 803         case 'x': month = s1 == 'i'; break;
 804         default: month = false; break;
 805         }
 806       if (month)
 807         return DT_MONTH;
 808     }
 809
 810   return 0;
 811 }
 812
 813 static enum date_token
 814 recognize_id3 (int s0, int s1, int s2, bool more)
 815 {
 816   bool month;
 817   switch (s0)
 818     {
 819     case 'j':
 820       month = ((s1 == 'a' && s2 == 'n')
 821                || (s1 == 'u' && (s2 == 'n' || s2 == 'l')));
 822       break;
 823     case 'f':
 824       month = s1 == 'e' && s2 == 'b';
 825       break;
 826     case 'm':
 827       month = (s1 == 'a' && (s2 == 'r' || s2 == 'y'));
 828       break;
 829     case 'a':
 830       month = (s1 == 'p' && s2 == 'r') || (s1 == 'u' && s2 == 'g');
 831       break;
 832     case 's':
 833       month = s1 == 'e' && s2 == 'p';
 834       break;
 835     case 'o':
 836       month = s1 == 'c' && s2 == 't';
 837       break;
 838     case 'n':
 839       month = s1 == 'o' && s2 == 'v';
 840       break;
 841     case 'd':
 842       month = s1 == 'e' && s2 == 'c';
 843       break;
 844     default:
 845       month = false;
 846     }
 847   if (month)
 848     return DT_MONTH | DT_ENGLISH_MONTH;
 849
 850   if (!more)
 851     {
 852       bool roman_month = false;
 853       switch (s0)
 854         {
 855         case 'i':
 856         case 'x':
 857           roman_month = s1 == 'i' && s2 == 'i';
 858           break;
 859         case 'v':
 860           roman_month = s1 == 'i' && s2 == 'i';
 861           break;
 862         }
 863       if (roman_month)
 864         return DT_MONTH;
 865     }
 866
 867   return 0;
 868 }
 869
 870
 871
 872