Patch #6441. Reviewed by John Darrington.

[pspp-builds.git] / src / data / format-guesser.c
diff --git a/src/data/format-guesser.c b/src/data/format-guesser.c

new file mode 100644 (file)

index 0000000..78af8df
--- /dev/null
+++ b/src/data/format-guesser.c
@@ -0,0 +1,854 @@
+/* PSPP - a program for statistical analysis.
+   Copyright (C) 2008 Free Software Foundation, Inc.
+
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>. */
+
+#include <config.h>
+
+#include "format-guesser.h"
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "c-ctype.h"
+#include "minmax.h"
+#include "xalloc.h"
+
+#include <data/format.h>
+#include <data/settings.h>
+#include <libpspp/assertion.h>
+#include <libpspp/str.h>
+
+/* A token into which a potential date or time field is broken.
+
+   The token type is actually a bit-map.  This allows a single
+   token to represent multiple roles, as often happens in parsing
+   a date or a time.  For example, the number "1" can be a quarter
+   of the year, month, hour, day of the month, week of the year,
+   or a count of days.  Such ambiguities are resolved on the
+   higher-level bases of multiple tokens and multiple full
+   dates.
+
+   Because each value is a distinct bit, values may be OR'd
+   together, and two tokens are compatible when their bit-wise AND
+   is nonzero. */
+enum date_token
+  {
+    DT_DAY = 1 << 0,            /* dd: Day of the month. */
+    DT_MONTH = 1 << 1,          /* mm: Month. */
+    DT_ENGLISH_MONTH = 1 << 2,  /* mmm: Spelled-out month, e.g. "jan". */
+    DT_YEAR = 1 << 3,           /* yy: Year. */
+
+    DT_HOUR = 1 << 4,           /* HH: Hour. */
+    DT_MINUTE = 1 << 5,         /* MM: Minute. */
+    DT_SECOND = 1 << 6,         /* SS: Second. */
+
+    DT_WEEKDAY = 1 << 7,        /* www: Day of the week. */
+
+    DT_DAY_COUNT = 1 << 8,      /* D: Number of days. */
+    DT_WEEK = 1 << 9,           /* ww: Week of the year. */
+    DT_QUARTER = 1 << 10,       /* q: Quarter of the year. */
+
+    DT_Q = 1 << 11,             /* Literal "Q". */
+    DT_WK = 1 << 12,            /* Literal "WK". */
+
+    DT_DELIM = 1 << 13,         /* One of -/., or white space. */
+    DT_SPACE = 1 << 14,         /* Any white space. */
+    DT_COLON = 1 << 15,         /* : */
+  };
+
+/* Syntax of a date format, in terms of the date tokens that
+   compose it.
+
+   Only the first TOKEN_CNT elements of TOKENS are meaningful;
+   MAX_TOKENS is the capacity of the array. */
+struct date_syntax
+  {
+    enum fmt_type format;       /* Format type. */
+#define MAX_TOKENS 11
+    size_t token_cnt;           /* Number of tokens in use in tokens[]. */
+    enum date_token tokens[MAX_TOKENS]; /* Tokens, in input order. */
+  };
+
+/* Syntax of all the date and time formats that we can parse.
+
+   The order in the array can make a difference in the final
+   choice of formats: in the case of a tie between the number of
+   times each format is seen, the syntax earlier in the array
+   takes precedence.  The most important cases are the ordering
+   of DATE before EDATE, so that spelled-out months in input
+   yield DATE format (which produces spelled-out months in
+   output), and the ordering of EDATE before ADATE, so that
+   ambiguous dates such as "1/1/99" yield the more sensible
+   European date format instead of American format.
+
+   When a given date format has more than one syntax, they must
+   be in adjacent array elements, because guess_date_time sums
+   the counts of consecutive entries that share a format. */
+static struct date_syntax syntax[] =
+  {
+    /* dd-mmm-yy */
+    { FMT_DATE, 5, {DT_DAY, DT_DELIM, DT_ENGLISH_MONTH, DT_DELIM, DT_YEAR} },
+
+    /* dd.mm.yy */
+    { FMT_EDATE, 5, {DT_DAY, DT_DELIM, DT_MONTH, DT_DELIM, DT_YEAR} },
+
+    /* mm/dd/yy */
+    { FMT_ADATE, 5, {DT_MONTH, DT_DELIM, DT_DAY, DT_DELIM, DT_YEAR} },
+
+    /* yy/mm/dd */
+    { FMT_SDATE, 5, {DT_YEAR, DT_DELIM, DT_MONTH, DT_DELIM, DT_DAY} },
+
+    /* mmm yy */
+    { FMT_MOYR, 3, {DT_MONTH, DT_DELIM, DT_YEAR} },
+
+    /* q Q yy */
+    { FMT_QYR, 3, {DT_QUARTER, DT_Q, DT_YEAR} },
+
+    /* ww WK yy */
+    { FMT_WKYR, 3, {DT_WEEK, DT_WK, DT_YEAR} },
+
+    /* dd-mmm-yyyy HH:MM */
+    { FMT_DATETIME,
+      9, {DT_DAY, DT_DELIM, DT_MONTH, DT_DELIM, DT_YEAR, DT_SPACE, DT_HOUR,
+          DT_COLON, DT_MINUTE} },
+    /* dd-mmm-yyyy HH:MM:SS */
+    { FMT_DATETIME,
+      11, {DT_DAY, DT_DELIM, DT_MONTH, DT_DELIM, DT_YEAR, DT_SPACE, DT_HOUR,
+           DT_COLON, DT_MINUTE, DT_COLON, DT_SECOND} },
+
+    /* HH:MM */
+    { FMT_TIME, 3, {DT_HOUR, DT_COLON, DT_MINUTE} },
+    /* HH:MM:SS */
+    { FMT_TIME, 5, {DT_HOUR, DT_COLON, DT_MINUTE, DT_COLON, DT_SECOND} },
+
+    /* D HH:MM */
+    { FMT_DTIME, 5, {DT_DAY_COUNT, DT_SPACE, DT_HOUR, DT_COLON, DT_MINUTE} },
+    /* D HH:MM:SS */
+    { FMT_DTIME,
+      7, {DT_DAY_COUNT, DT_SPACE, DT_HOUR, DT_COLON, DT_MINUTE, DT_COLON,
+          DT_SECOND} },
+
+    /* www */
+    { FMT_WKDAY, 1, {DT_WEEKDAY} },
+
+    /* mmm */
+    { FMT_MONTH, 1, {DT_MONTH} },
+  };
+
+/* Number of elements in syntax[].  This counts syntaxes, not
+   format types: a format with several syntaxes (e.g. with and
+   without seconds) contributes one element per syntax. */
+#define DATE_SYNTAX_CNT (sizeof syntax / sizeof *syntax)
+
+/* A format guesser.
+
+   Accumulates statistics about the inputs passed to
+   fmt_guesser_add so that fmt_guesser_guess can later pick the
+   most plausible format.  An all-zero structure is the initial
+   (empty) state. */
+struct fmt_guesser
+  {
+    /* Maximum observed input width (measured before white space
+       is trimmed from the input). */
+    unsigned int width;
+
+    /* Sum of the digits after the decimal point in each input
+       (divide by count to obtain average decimal positions).
+       For date and time inputs, this is the number of digits
+       after the decimal point in the seconds field. */
+    unsigned int decimals;
+
+    /* Number of non-empty, non-missing input values.
+
+       count is the sum of any_numeric, any_date, and the number
+       of inputs that were not in any recognized format (hence,
+       treated as A format). */
+    unsigned int count;
+
+    /* Numeric input formats. */
+    unsigned int any_numeric;   /* Sum of following counts. */
+    unsigned int f;             /* Number of inputs in F format. */
+    unsigned int comma;         /* Number of inputs in COMMA format. */
+    unsigned int dot;           /* Number of inputs in DOT format. */
+    unsigned int dollar;        /* Number of inputs in DOLLAR format. */
+    unsigned int pct;           /* Number of inputs in PCT format. */
+    unsigned int e;             /* Number of inputs in E format. */
+
+    /* Date or time input formats.
+
+       The sum of the values in the date array is at least
+       any_date, often higher because many example dates match
+       more than one date format. */
+    unsigned int any_date;      /* Number of inputs in any date format. */
+    unsigned int date[DATE_SYNTAX_CNT]; /* Number of inputs in each date
+                                           syntax, indexed like syntax[]. */
+  };
+
+/* Forward declarations of the file-local helpers defined below. */
+static bool add_numeric (struct fmt_guesser *, struct substring);
+static void guess_numeric (struct fmt_guesser *, struct fmt_spec *);
+static void add_date_time (struct fmt_guesser *, struct substring);
+static bool match_date_syntax (const enum date_token a[], size_t a_len,
+                               const enum date_token b[], size_t b_len);
+static void guess_date_time (struct fmt_guesser *, struct fmt_spec *);
+static enum date_token parse_date_token (struct substring *,
+                                         enum date_token tokens_seen,
+                                         int *decimals);
+static enum date_token parse_date_number (struct substring *,
+                                          enum date_token tokens_seen,
+                                          int *decimals);
+static enum date_token recognize_identifier_token (struct substring *);
+static enum date_token recognize_id2 (int s0, int s1, bool more);
+static enum date_token recognize_id3 (int s0, int s1, int s2, bool more);
+
+/* Allocates a new format guesser, resets it to its initial
+   (empty) state, and returns it.  The caller releases it with
+   fmt_guesser_destroy. */
+struct fmt_guesser *
+fmt_guesser_create (void)
+{
+  struct fmt_guesser *guesser;
+
+  guesser = xmalloc (sizeof *guesser);
+  fmt_guesser_clear (guesser);
+  return guesser;
+}
+
+/* Destroys format guesser G by releasing its memory with free.
+   G holds no other resources.  Passing a null pointer is
+   harmless, because free ignores it. */
+void
+fmt_guesser_destroy (struct fmt_guesser *g)
+{
+  free (g);
+}
+
+/* Clears the state of format guesser G, making it available for
+   guessing the format of a new input stream.
+
+   Every counter, including the maximum width, is reset by
+   zero-filling the whole structure. */
+void
+fmt_guesser_clear (struct fmt_guesser *g)
+{
+  memset (g, 0, sizeof *g);
+}
+
+/* Feeds one more data item, S, to format guesser G.
+
+   The untrimmed length of S always contributes to G's maximum
+   width.  After surrounding white space is trimmed, an empty
+   item or a lone "." (a missing value) is otherwise ignored;
+   anything else is counted and classified as numeric or, failing
+   that, offered to the date/time recognizer. */
+void
+fmt_guesser_add (struct fmt_guesser *g, struct substring s)
+{
+  size_t raw_length = ss_length (s);
+
+  if (raw_length > g->width)
+    g->width = raw_length;
+
+  ss_trim (&s, ss_cstr (CC_SPACES));
+
+  /* Nothing can be learned from an empty string or a missing
+     value, so only other inputs are counted. */
+  if (!ss_is_empty (s) && !ss_equals (s, ss_cstr (".")))
+    {
+      g->count++;
+      if (!add_numeric (g, s))
+        add_date_time (g, s);
+    }
+}
+
+/* Guesses the format of the input previously added to G using
+   fmt_guesser_add, storing the guess into *F.
+
+   The guessed format may not actually be a valid input or output
+   format, in that its width and number of decimal places may be
+   outside the valid range for the guessed format type.  The
+   caller must therefore adjust the format to make it valid, e.g.
+   by calling fmt_fix.
+
+   A numeric guess is made when more than half of the counted
+   inputs were numeric; otherwise a date/time guess is made when
+   more than half were dates or times; otherwise the result is A
+   format with the maximum observed width. */
+void
+fmt_guesser_guess (struct fmt_guesser *g, struct fmt_spec *f)
+{
+  unsigned int half;
+
+  if (g->count == 0)
+    {
+      /* No data at all.  Use fallback default. */
+      *f = fmt_default_for_width (0);
+      return;
+    }
+
+  /* Start from a string format as wide as the widest input.  The
+     guesser functions typically override the type and sometimes
+     the width. */
+  f->type = FMT_A;
+  f->w = g->width;
+  f->d = 0;
+
+  half = g->count / 2;
+  if (g->any_numeric > half)
+    guess_numeric (g, f);
+  else if (g->any_date > half)
+    guess_date_time (g, f);
+}
+\f
+/* Numeric formats. */
+
+/* Tries to parse S as a numeric (F, COMMA, DOT, DOLLAR, PCT, or
+   E) format.  If successful, increments G's any_numeric counter
+   and the counter for the specific format that S matches, adds
+   the number of decimal places found to G's decimals sum, and
+   returns true.  On failure, returns false without modifying G.
+
+   S is passed by value, so the caller's substring is not
+   advanced.
+
+   This function is intended to match exactly the same set of
+   strings that the actual numeric value parsers used by the
+   data_in function would match. */
+static bool
+add_numeric (struct fmt_guesser *g, struct substring s)
+{
+  bool has_dollar;              /* '$' appeared at start of S? */
+  bool has_percent;             /* '%' appeared at end of S? */
+  int digits;                   /* Number of digits in S (before exponent). */
+  int dots;                     /* Number of '.' in S. */
+  int commas;                   /* Number of ',' in S. */
+  bool has_exp;                 /* [eEdD] appeared introducing exponent? */
+  bool has_exp_sign;            /* '+' or '-' appeared in exponent? */
+  int exp_digits;               /* Number of digits in exponent. */
+
+  int prev_delim;       /* Initially 0, then ',' or '.' as delimiters seen. */
+  int delim_digits;             /* Number of digits since last delimiter. */
+
+  int decimal;                  /* Decimal point character: '.', ',',
+                                   or 0 if unknown or no decimal point in S. */
+  int precision;                /* Digits of precision after decimal point. */
+
+  int c;
+
+  /* Skip leading "$" and optional following white space. */
+  has_dollar = ss_match_char (&s, '$');
+  if (has_dollar)
+    ss_ltrim (&s, ss_cstr (CC_SPACES));
+
+  /* Skip optional sign. */
+  ss_match_char_in (&s, ss_cstr ("+-"));
+
+  /* Skip digits punctuated by commas and dots.  We don't know
+     whether the decimal point is a comma or a dot, so for now we
+     just count them.  The loop stops at the first character that
+     is not a digit, '.', or ','. */
+  digits = dots = commas = 0;
+  delim_digits = 0;
+  prev_delim = 0;
+  for (; (c = ss_first (s)) != -1; ss_advance (&s, 1))
+    {
+      if (c >= '0' && c <= '9')
+        {
+          digits++;
+          if (dots || commas)
+            delim_digits++;
+        }
+      else if (c == '.' )
+        {
+          dots++;
+          prev_delim = c;
+          delim_digits = 0;
+        }
+      else if (c == ',')
+        {
+          commas++;
+          prev_delim = c;
+          delim_digits = 0;
+        }
+      else
+        break;
+    }
+  if (digits == 0 || (dots > 1 && commas > 1))
+    {
+      /* A valid number has at least one digit.  It also has at
+         most one decimal point, so if '.' and ',' both appear
+         more than once then neither of them can be the decimal
+         point. */
+      return false;
+    }
+
+  /* Skip the optional exponent.  A sign by itself is enough to
+     introduce an exponent, and a single space may follow the
+     sign. */
+  has_exp = ss_match_char_in (&s, ss_cstr ("eEdD")) != EOF;
+  has_exp_sign = ss_match_char_in (&s, ss_cstr ("-+")) != EOF;
+  if (has_exp_sign)
+    ss_match_char (&s, ' ');
+  exp_digits = ss_ltrim (&s, ss_cstr (CC_DIGITS));
+  if ((has_exp || has_exp_sign) && !exp_digits)
+    {
+      /* Can't have the E or sign that leads in the exponent
+         without actually having an exponent. */
+      return false;
+    }
+
+  /* Skip optional '%'. */
+  has_percent = ss_match_char (&s, '%');
+  if (has_dollar && has_percent)
+    {
+      /* A valid number cannot have both '$' and '%'. */
+      return false;
+    }
+
+  /* Make sure there's no trailing garbage. */
+  if (!ss_is_empty (s))
+    return false;
+
+  /* Figure out the decimal point (and therefore grouping)
+     character and the number of digits following the decimal
+     point.  Sometimes the answer is ambiguous. */
+  if (dots > 1 && prev_delim == '.')
+    {
+      /* Can't have multiple decimal points, so '.' must really
+         be the grouping character, with a precision of 0. */
+      decimal = ',';
+      precision = 0;
+    }
+  else if (commas > 1 && prev_delim == ',')
+    {
+      /* Can't have multiple decimal points, so ',' must really
+         be the grouping character, with a precision of 0. */
+      decimal = '.';
+      precision = 0;
+    }
+  else if (delim_digits == 3 && (!dots || !commas))
+    {
+      /* The input is something like "1.234" or "1,234" where we
+         can't tell whether the ',' or '.' is a grouping or
+         decimal character.  Assume that the decimal character
+         from the settings is in use. */
+      if (prev_delim == settings_get_decimal_char (FMT_F))
+        {
+          decimal = prev_delim;
+          precision = delim_digits;
+        }
+      else
+        {
+          decimal = prev_delim == '.' ? ',' : '.';
+          precision = 0;
+        }
+    }
+  else
+    {
+      /* The final delimiter is a decimal point, and the digits
+         following it are decimals.  If there was no delimiter at
+         all, prev_delim and delim_digits are both still 0. */
+      decimal = prev_delim;
+      precision = delim_digits;
+    }
+
+  /* Decide the most likely format.  The tests are ordered by
+     priority, so exactly one specific counter is incremented. */
+  g->any_numeric++;
+  g->decimals += precision;
+  if (has_dollar)
+    g->dollar++;
+  else if (has_percent)
+    g->pct++;
+  else if (commas && decimal == '.')
+    g->comma++;
+  else if (dots && decimal == ',')
+    g->dot++;
+  else if (has_exp || has_exp_sign)
+    g->e++;
+  else
+    g->f++;
+
+  return true;
+}
+
+/* Picks the numeric format that G's statistics make most
+   likely, storing it in F's type member, and stores the average
+   number of decimal places (the decimals sum divided by the
+   input count) in F's d member.  (f->w is already initialized.)
+
+   Any PCT input selects PCT and, failing that, any DOLLAR input
+   selects DOLLAR.  Otherwise, whichever of the comma and dot
+   counters is larger decides between COMMA and DOT, taking the
+   decimal character from the settings into account.  E is
+   chosen only if more than half of the numeric inputs used an
+   exponent, and F is the fallback.
+
+   The caller must ensure that G's count is nonzero. */
+static void
+guess_numeric (struct fmt_guesser *g, struct fmt_spec *f)
+{
+  bool dot_is_decimal = settings_get_decimal_char (FMT_COMMA) == '.';
+  enum fmt_type type;
+
+  if (g->pct)
+    type = FMT_PCT;
+  else if (g->dollar)
+    type = FMT_DOLLAR;
+  else if (g->comma > g->dot)
+    type = dot_is_decimal ? FMT_COMMA : FMT_DOT;
+  else if (g->dot > g->comma)
+    type = dot_is_decimal ? FMT_DOT : FMT_COMMA;
+  else if (g->e > g->any_numeric / 2)
+    type = FMT_E;
+  else
+    type = FMT_F;
+
+  f->d = g->decimals / g->count;
+  f->type = type;
+}
+\f
+/* Tries to parse S as a date (DATE, ADATE, EDATE, SDATE, QYR,
+   MOYR, WKYR, or DATETIME), time (TIME or DTIME), or date
+   component (WKDAY or MONTH) format.  If successful, increments
+   G's any_date counter and the counter or counters for the
+   specific syntax(es) that S matches, and adds the number of
+   decimal places seen in a seconds field to G's decimals sum.
+   On failure, does not modify G.
+
+   S fails to parse if any part of it is not a valid token, if it
+   has more than MAX_TOKENS tokens, or if its token sequence
+   matches none of the entries in syntax[].
+
+   Does not attempt to recognize JDATE format: it looks just like
+   F format and will thus be caught by the numeric parser.
+
+   This function is intended to match a set of strings close to
+   those that actual date and time parsers used by the data_in
+   function would match, but somewhat pickier.  In particular,
+   minutes and seconds are only recognized when they have exactly
+   two digits: "1:02:03" is a valid time, but "1:2:3" is
+   rejected.  */
+static void
+add_date_time (struct fmt_guesser *g, struct substring s)
+{
+  enum date_token parsed[MAX_TOKENS];   /* Tokens found in S. */
+  enum date_token seen = 0;             /* Union of all of parsed[]. */
+  size_t n_parsed = 0;                  /* Number of tokens in parsed[]. */
+  int n_decimals = 0;                   /* Decimals in seconds field. */
+  bool matched_any = false;             /* Did any syntax match? */
+  size_t idx;
+
+  /* Break S into tokens, giving up on the first invalid token or
+     when there are too many tokens to be any known syntax. */
+  for (; !ss_is_empty (s); n_parsed++)
+    {
+      enum date_token tok;
+
+      if (n_parsed >= MAX_TOKENS)
+        return;
+
+      tok = parse_date_token (&s, seen, &n_decimals);
+      if (tok == 0)
+        return;
+
+      parsed[n_parsed] = tok;
+      seen |= tok;
+    }
+  if (n_parsed == 0)
+    return;
+
+  /* Count this input toward every syntax that it matches. */
+  for (idx = 0; idx < DATE_SYNTAX_CNT; idx++)
+    {
+      const struct date_syntax *candidate = &syntax[idx];
+
+      if (match_date_syntax (parsed, n_parsed,
+                             candidate->tokens, candidate->token_cnt))
+        {
+          g->date[idx]++;
+          matched_any = true;
+        }
+    }
+  if (!matched_any)
+    return;
+
+  g->any_date++;
+  g->decimals += n_decimals;
+}
+
+/* Returns true if the A_LEN tokens in A[] match the B_LEN tokens
+   in B[], false otherwise.
+
+   The sequences match when they have the same length and, at
+   every position, the two token bit-maps have at least one bit
+   in common. */
+static bool
+match_date_syntax (const enum date_token a[], size_t a_len,
+                   const enum date_token b[], size_t b_len)
+{
+  size_t pos = 0;
+
+  if (a_len != b_len)
+    return false;
+
+  /* Advance past every compatible pair; a full match reaches the
+     end of the sequences. */
+  while (pos < a_len && (a[pos] & b[pos]) != 0)
+    pos++;
+
+  return pos == a_len;
+}
+
+/* Picks the date or time format that G's statistics make most
+   likely, storing it in F's type member.  For a format that
+   includes a time, F's d and w members may also be adjusted, as
+   described below.  (f->w is already initialized.) */
+static void
+guess_date_time (struct fmt_guesser *g, struct fmt_spec *f)
+{
+  unsigned int best = 0;
+  size_t start, end, k;
+  bool has_time;
+
+  /* Choose the date format matched by the most inputs.  Entries
+     of syntax[] for the same format are adjacent, so each run of
+     equal formats is totalled as a unit.  Ties go to the format
+     that appears earliest in the array, because a later total
+     must be strictly greater to replace the current choice. */
+  start = 0;
+  while (start < DATE_SYNTAX_CNT)
+    {
+      unsigned int total = 0;
+
+      for (end = start;
+           end < DATE_SYNTAX_CNT
+             && syntax[end].format == syntax[start].format;
+           end++)
+        total += g->date[end];
+
+      if (total > best)
+        {
+          f->type = syntax[start].format;
+          best = total;
+        }
+      start = end;
+    }
+
+  /* Formats that include a time have an optional seconds field.
+     If any input matched a syntax that ends in a seconds field,
+     use the average number of decimal places and make sure that
+     the field is wide enough to include the seconds.  (We use
+     the minimum input width, but an output width would be
+     equally appropriate, since all the time formats have the
+     same minimum widths for input and output.)  */
+  has_time = (f->type == FMT_DATETIME
+              || f->type == FMT_TIME
+              || f->type == FMT_DTIME);
+  if (!has_time)
+    return;
+
+  for (k = 0; k < DATE_SYNTAX_CNT; k++)
+    {
+      const struct date_syntax *entry = &syntax[k];
+
+      if (g->date[k] != 0
+          && entry->tokens[entry->token_cnt - 1] == DT_SECOND)
+        {
+          f->d = g->decimals / g->count;
+          f->w = MAX (f->w, fmt_min_input_width (f->type) + 3);
+        }
+    }
+}
+
+/* Extracts the next date token from the string represented by S,
+   which must not be an empty string, and advances *S past the
+   end of the token.  Returns the parsed token, or 0 if no valid
+   token was found.
+
+   TOKENS_SEEN should be a bitmap representing all the tokens
+   already seen in this input; this is used to resolve some
+   otherwise ambiguous parsing situations.  If a count of seconds
+   is parsed, *DECIMALS is set to the number of digits after the
+   decimal point.
+
+   The first character of S selects the kind of token:
+
+   - A digit starts a number, which parse_date_number handles.
+
+   - '+' or '-' is either the sign of a number or, for '-' only,
+     a delimiter.
+
+   - '/', '.', ',' are delimiters and ':' is a colon.
+
+   - White space is a delimiter, unless an identifier such as "Q"
+     or a month name follows it, in which case the white space is
+     absorbed into the identifier token.
+
+   - Anything else must be an identifier.  */
+static enum date_token
+parse_date_token (struct substring *s, enum date_token tokens_seen,
+                  int *decimals)
+{
+  int c = ss_first (*s);
+
+  switch (c)
+    {
+    case '0': case '1': case '2': case '3': case '4':
+    case '5': case '6': case '7': case '8': case '9':
+      return parse_date_number (s, tokens_seen, decimals);
+
+    case '+':
+    case '-':
+      /* '+' or '-' at the start of a string, or following a
+         space, could be the sign that optionally introduces a
+         time, e.g. "-1:00" in TIME format, "-1 1:00" in DTIME
+         format, or "1/1/1978 +1:00" in DATETIME format.
+
+         s->string[-1] is the character just before the sign.  It
+         is examined only when TOKENS_SEEN is nonzero, that is,
+         when an earlier token has already been consumed from the
+         same string, so that a preceding character exists. */
+      if ((!tokens_seen || s->string[-1] == ' ') && c_isdigit (ss_at (*s, 1)))
+        {
+          /* Skip the sign and the digits that follow it. */
+          ss_advance (s, 1);
+          ss_ltrim (s, ss_cstr (CC_DIGITS));
+          return DT_DAY_COUNT | DT_HOUR;
+        }
+      else if (c == '+')
+        return 0;
+      /* A '-' that is not a sign is a delimiter.  Fall through. */
+    case '/': case '.': case ',':
+      ss_advance (s, 1);
+      return DT_DELIM;
+
+    case ':':
+      ss_advance (s, 1);
+      return DT_COLON;
+
+    case ' ': case '\t': case '\v': case '\r': case '\n':
+      {
+        enum date_token token;
+        ss_advance (s, 1);
+        /* White space may surround an identifier, as in "1 Q 99".
+           If an identifier follows, return it as the token and
+           skip white space after it (presumably at most one
+           character -- confirm against ss_match_char_in).
+           Otherwise the white space is itself a delimiter. */
+        token = recognize_identifier_token (s);
+        if (token)
+          ss_match_char_in (s, ss_cstr (CC_SPACES));
+        else
+          token = DT_DELIM | DT_SPACE;
+        return token;
+      }
+
+    default:
+      return recognize_identifier_token (s);
+
+    case EOF:
+      /* The caller must not pass an empty string. */
+      NOT_REACHED ();
+    }
+}
+
+/* Parses a digit sequence found in a date token.  Advances *S
+   past the end of the token.  Returns the parsed token, or 0 if
+   no valid token was found.
+
+   TOKENS_SEEN should be a bitmap representing all the tokens
+   already seen in this input; this is used to resolve some
+   otherwise ambiguous parsing situations.  If a count of seconds
+   is parsed, *DECIMALS is set to the number of digits after the
+   decimal point.
+
+   A number is a count of seconds with a fraction only if a colon
+   has already been seen, its value is at most 59, and it is
+   immediately followed by the decimal point character from the
+   settings.  In that case the decimal point and the digits after
+   it are part of the token.  In every other case the character
+   after the digits is left in *S, so that a '.' or ',' following
+   a day or month number (as in "1.jan.99") is still seen as a
+   delimiter by the caller.
+
+   Otherwise the result is the set of roles that the number's
+   value and digit count allow it to play. */
+static enum date_token
+parse_date_number (struct substring *s, enum date_token tokens_seen,
+                   int *decimals)
+{
+  long int value;
+  size_t digit_cnt = ss_get_long (s, &value);
+  enum date_token token = 0;
+
+  /* If no number could be extracted then there is no valid token
+     here, and VALUE must not be relied upon. */
+  if (digit_cnt == 0)
+    return 0;
+
+  /* ss_match_char consumes the decimal point when it matches, so
+     it must be the last condition tested: the decimal point may
+     be consumed only when the number really is a seconds
+     field. */
+  if ((tokens_seen & DT_COLON)
+      && value <= 59
+      && ss_match_char (s, settings_get_decimal_char (FMT_F)))
+    {
+      /* Parse digits after the decimal point. */
+      token = DT_SECOND;
+      *decimals = ss_ltrim (s, ss_cstr (CC_DIGITS));
+    }
+  else
+    {
+      /* Roles allowed by the number's value. */
+      if (value <= 4)
+        token = (DT_QUARTER | DT_MONTH | DT_HOUR | DT_DAY | DT_WEEK
+                 | DT_DAY_COUNT);
+      else if (value <= 12)
+        token = DT_MONTH | DT_HOUR | DT_DAY | DT_WEEK | DT_DAY_COUNT;
+      else if (value <= 23)
+        token = DT_HOUR | DT_DAY | DT_WEEK | DT_DAY_COUNT;
+      else if (value <= 31)
+        token = DT_DAY | DT_WEEK | DT_DAY_COUNT;
+      else if (value <= 52)
+        token = DT_WEEK | DT_DAY_COUNT;
+      else
+        token = DT_DAY_COUNT;
+
+      /* Roles allowed by the number of digits: years have two or
+         four digits, and minutes and seconds have exactly two. */
+      if (digit_cnt == 2)
+        {
+          token |= DT_YEAR;
+          if (value <= 59)
+            token |= DT_MINUTE | DT_SECOND;
+        }
+      else if (digit_cnt == 4)
+        token |= DT_YEAR;
+    }
+
+  return token;
+}
+
+/* Attempts to parse an identifier found in a date at the
+   beginning of S.  Advances *S past the end of the token.
+   Returns the parsed token, or 0 if no valid token was
+   found, in which case *S is not advanced.
+
+   The identifier is the longest run of letters at the start of
+   S, compared without regard to case.  Recognized identifiers
+   are the literals "Q" and "WK", weekday names (identified by
+   their first two letters), English month names (identified by
+   their first three letters), and months written as Roman
+   numerals from I to XII. */
+static enum date_token
+recognize_identifier_token (struct substring *s)
+{
+  size_t n_letters = ss_span (*s, ss_cstr (CC_LETTERS));
+  const char *p = s->string;
+  enum date_token result = 0;
+
+  if (n_letters == 1)
+    {
+      int c0 = c_tolower ((unsigned char) p[0]);
+
+      /* Roman numerals I, V, and X, or the quarter marker. */
+      if (c0 == 'i' || c0 == 'v' || c0 == 'x')
+        result = DT_MONTH;
+      else if (c0 == 'q')
+        result = DT_Q;
+    }
+  else if (n_letters == 2)
+    {
+      int c0 = c_tolower ((unsigned char) p[0]);
+      int c1 = c_tolower ((unsigned char) p[1]);
+
+      result = recognize_id2 (c0, c1, false);
+      if (!result && c0 == 'w' && c1 == 'k')
+        result = DT_WK;
+    }
+  else if (n_letters >= 3)
+    {
+      int c0 = c_tolower ((unsigned char) p[0]);
+      int c1 = c_tolower ((unsigned char) p[1]);
+      int c2 = c_tolower ((unsigned char) p[2]);
+
+      /* Weekday names take precedence over month names. */
+      result = recognize_id2 (c0, c1, true);
+      if (!result)
+        result = recognize_id3 (c0, c1, c2, n_letters > 3);
+
+      /* VIII is the only four-letter Roman numeral month. */
+      if (!result
+          && n_letters == 4
+          && c0 == 'v' && c1 == 'i' && c2 == 'i'
+          && c_tolower ((unsigned char) p[3]) == 'i')
+        result = DT_MONTH;
+    }
+
+  if (result)
+    ss_advance (s, n_letters);
+  return result;
+}
+
+/* Classifies an identifier by its first two letters, S0 and S1,
+   which must already be in lowercase.  MORE is true if the
+   identifier has additional letters after these two.
+
+   Returns DT_WEEKDAY if the letters begin a weekday name (sa,
+   su, mo, tu, th, we, fr), regardless of MORE.  Otherwise, if
+   the identifier is exactly two letters long, returns DT_MONTH
+   if it is a Roman numeral month (ii, iv, ix, vi, xi).  Returns
+   0 if nothing matches. */
+static enum date_token
+recognize_id2 (int s0, int s1, bool more)
+{
+  if ((s0 == 's' && (s1 == 'a' || s1 == 'u'))
+      || (s0 == 'm' && s1 == 'o')
+      || (s0 == 't' && (s1 == 'u' || s1 == 'h'))
+      || (s0 == 'w' && s1 == 'e')
+      || (s0 == 'f' && s1 == 'r'))
+    return DT_WEEKDAY;
+
+  /* A Roman numeral must be the whole identifier. */
+  if (more)
+    return 0;
+
+  if ((s0 == 'i' && (s1 == 'i' || s1 == 'v' || s1 == 'x'))
+      || ((s0 == 'v' || s0 == 'x') && s1 == 'i'))
+    return DT_MONTH;
+
+  return 0;
+}
+
+/* Classifies an identifier by its first three letters, S0, S1,
+   and S2, which must already be in lowercase.  MORE is true if
+   the identifier has additional letters after these three.
+
+   Returns DT_MONTH | DT_ENGLISH_MONTH if the letters begin an
+   English month name (jan, feb, ..., dec), regardless of MORE.
+   Otherwise, if the identifier is exactly three letters long,
+   returns DT_MONTH if it is a Roman numeral month (iii, vii,
+   xii).  Returns 0 if nothing matches. */
+static enum date_token
+recognize_id3 (int s0, int s1, int s2, bool more)
+{
+  static const char english_months[][4] =
+    {
+      "jan", "feb", "mar", "apr", "may", "jun",
+      "jul", "aug", "sep", "oct", "nov", "dec",
+    };
+  size_t m;
+
+  for (m = 0; m < sizeof english_months / sizeof *english_months; m++)
+    {
+      const char *name = english_months[m];
+
+      if (s0 == name[0] && s1 == name[1] && s2 == name[2])
+        return DT_MONTH | DT_ENGLISH_MONTH;
+    }
+
+  /* A Roman numeral must be the whole identifier.  The only
+     three-letter ones are I, V, or X followed by "ii". */
+  if (!more
+      && (s0 == 'i' || s0 == 'v' || s0 == 'x')
+      && s1 == 'i' && s2 == 'i')
+    return DT_MONTH;
+
+  return 0;
+}
+
+
+
+