lexer: Reimplement for better testability and internationalization.

[pspp-builds.git] / src / data / dictionary.c
diff --git a/src/data/dictionary.c b/src/data/dictionary.c

index 03548c44eb628aef66341b1c3bd53da05ae9aca2..79d36374fc42767660234911ed533c5732270b92 100644 (file)
--- a/src/data/dictionary.c
+++ b/src/data/dictionary.c
@@ -1,5 +1,5 @@
  /* PSPP - a program for statistical analysis.
-   Copyright (C) 1997-9, 2000, 2006, 2007, 2009, 2010 Free Software Foundation, Inc.
+   Copyright (C) 1997-9, 2000, 2006, 2007, 2009, 2010, 2011 Free Software Foundation, Inc.
  
     This program is free software: you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
@@ -18,13 +18,15 @@
  
  #include "data/dictionary.h"
  
+#include <stdint.h>
  #include <stdlib.h>
  #include <ctype.h>
+#include <unistr.h>
  
  #include "data/attributes.h"
  #include "data/case.h"
-#include "data/category.h"
  #include "data/identifier.h"
+#include "data/mrset.h"
  #include "data/settings.h"
  #include "data/value-labels.h"
  #include "data/vardict.h"
@@ -35,14 +37,17 @@
  #include "libpspp/compiler.h"
  #include "libpspp/hash-functions.h"
  #include "libpspp/hmap.h"
+#include "libpspp/i18n.h"
  #include "libpspp/message.h"
  #include "libpspp/misc.h"
  #include "libpspp/pool.h"
  #include "libpspp/str.h"
+#include "libpspp/string-array.h"
  
  #include "gl/intprops.h"
  #include "gl/minmax.h"
  #include "gl/xalloc.h"
+#include "gl/xmemdup0.h"
  
  #include "gettext.h"
  #define _(msgid) gettext (msgid)
@@ -62,10 +67,12 @@ struct dictionary
      struct variable *filter;    /* FILTER variable. */
      casenumber case_limit;      /* Current case limit (N command). */
      char *label;               /* File label. */
-    struct string documents;    /* Documents, as a string. */
+    struct string_array documents; /* Documents. */
      struct vector **vector;     /* Vectors of variables. */
      size_t vector_cnt;          /* Number of vectors. */
      struct attrset attributes;  /* Custom attributes. */
+    struct mrset **mrsets;      /* Multiple response sets. */
+    size_t n_mrsets;            /* Number of multiple response sets. */
  
      char *encoding;             /* Character encoding of string data */
  
@@ -77,6 +84,8 @@ struct dictionary
      void *changed_data;
    };
  
+static void dict_unset_split_var (struct dictionary *, struct variable *);
+static void dict_unset_mrset_var (struct dictionary *, struct variable *);
  
  void
  dict_set_encoding (struct dictionary *d, const char *enc)
@@ -94,6 +103,15 @@ dict_get_encoding (const struct dictionary *d)
    return d->encoding ;
  }
  
+/* Returns true if UTF-8 string ID is an acceptable identifier in DICT's
+   encoding, false otherwise.  If ISSUE_ERROR is true, issues an explanatory
+   error message on failure. */
+bool
+dict_id_is_valid (const struct dictionary *dict, const char *id,
+                  bool issue_error)
+{
+  return id_is_valid (id, dict->encoding, issue_error);
+}
  
  void
  dict_set_change_callback (struct dictionary *d,
@@ -222,6 +240,20 @@ dict_clone (const struct dictionary *s)
  
    dict_set_attributes (d, dict_get_attributes (s));
  
+  for (i = 0; i < s->n_mrsets; i++)
+    {
+      const struct mrset *old = s->mrsets[i];
+      struct mrset *new;
+      size_t j;
+
+      /* Clone old mrset, then replace vars from S by vars from D. */
+      new = mrset_clone (old);
+      for (j = 0; j < new->n_vars; j++)
+        new->vars[j] = dict_lookup_var_assert (d, var_get_name (new->vars[j]));
+
+      dict_add_mrset (d, new);
+    }
+
    return d;
  }
  
@@ -249,7 +281,7 @@ dict_clear (struct dictionary *d)
    d->case_limit = 0;
    free (d->label);
    d->label = NULL;
-  ds_destroy (&d->documents);
+  string_array_clear (&d->documents);
    dict_clear_vectors (d);
    attrset_clear (&d->attributes);
  }
@@ -278,6 +310,8 @@ dict_destroy (struct dictionary *d)
        dict_clear (d);
        hmap_destroy (&d->name_map);
        attrset_destroy (&d->attributes);
+      dict_clear_mrsets (d);
+      free (d->encoding);
        free (d);
      }
  }
@@ -577,6 +611,7 @@ dict_delete_var (struct dictionary *d, struct variable *v)
    var_clear_aux (v);
  
    dict_unset_split_var (d, v);
+  dict_unset_mrset_var (d, v);
  
    if (d->weight == v)
      dict_set_weight (d, NULL);
@@ -820,63 +855,75 @@ var_name_is_insertable (const struct dictionary *dict, const char *name)
            && lex_id_to_token (ss_cstr (name)) == T_ID);
  }
  
-static bool
-make_hinted_name (const struct dictionary *dict, const char *hint,
-                  char name[VAR_NAME_LEN + 1])
+static char *
+make_hinted_name (const struct dictionary *dict, const char *hint)
  {
+  size_t hint_len = strlen (hint);
    bool dropped = false;
-  char *cp;
-
-  for (cp = name; *hint && cp < name + VAR_NAME_LEN; hint++)
+  char *root, *rp;
+  size_t ofs;
+  int mblen;
+
+  /* The allocation size here is OK: characters that are copied directly fit
+     OK, and characters that are not copied directly are replaced by a single
+     '_' byte.  If u8_mbtouc() replaces bad input by 0xfffd, then that will get
+     replaced by '_' too.  */
+  root = rp = xmalloc (hint_len + 1);
+  for (ofs = 0; ofs < hint_len; ofs += mblen)
      {
-      if (cp == name
-          ? lex_is_id1 (*hint) && *hint != '$'
-          : lex_is_idn (*hint))
+      ucs4_t uc;
+
+      mblen = u8_mbtouc (&uc, CHAR_CAST (const uint8_t *, hint + ofs),
+                         hint_len - ofs);
+      if (rp == root
+          ? lex_uc_is_id1 (uc) && uc != '$'
+          : lex_uc_is_idn (uc))
          {
            if (dropped)
              {
-              *cp++ = '_';
+              *rp++ = '_';
                dropped = false;
              }
-          if (cp < name + VAR_NAME_LEN)
-            *cp++ = *hint;
+          rp += u8_uctomb (CHAR_CAST (uint8_t *, rp), uc, 6);
          }
-      else if (cp > name)
+      else if (rp != root)
          dropped = true;
      }
-  *cp = '\0';
+  *rp = '\0';
  
-  if (name[0] != '\0')
+  if (root[0] != '\0')
      {
-      size_t len = strlen (name);
        unsigned long int i;
  
-      if (var_name_is_insertable (dict, name))
-        return true;
+      if (var_name_is_insertable (dict, root))
+        return root;
  
        for (i = 0; i < ULONG_MAX; i++)
          {
            char suffix[INT_BUFSIZE_BOUND (i) + 1];
-          int ofs;
+          char *name;
  
            suffix[0] = '_';
            if (!str_format_26adic (i + 1, &suffix[1], sizeof suffix - 1))
              NOT_REACHED ();
  
-          ofs = MIN (VAR_NAME_LEN - strlen (suffix), len);
-          strcpy (&name[ofs], suffix);
-
+          name = utf8_encoding_concat (root, suffix, dict->encoding, 64);
            if (var_name_is_insertable (dict, name))
-            return true;
+            {
+              free (root);
+              return name;
+            }
+          free (name);
          }
      }
  
-  return false;
+  free (root);
+
+  return NULL;
  }
  
-static bool
-make_numeric_name (const struct dictionary *dict, unsigned long int *num_start,
-                   char name[VAR_NAME_LEN + 1])
+static char *
+make_numeric_name (const struct dictionary *dict, unsigned long int *num_start)
  {
    unsigned long int number;
  
@@ -884,27 +931,24 @@ make_numeric_name (const struct dictionary *dict, unsigned long int *num_start,
         number < ULONG_MAX;
         number++)
      {
+      char name[3 + INT_STRLEN_BOUND (number) + 1];
+
        sprintf (name, "VAR%03lu", number);
        if (dict_lookup_var (dict, name) == NULL)
          {
            if (num_start != NULL)
              *num_start = number + 1;
-          return true;
+          return xstrdup (name);
          }
      }
  
-  if (num_start != NULL)
-    *num_start = ULONG_MAX;
-  return false;
+  NOT_REACHED ();
  }
  
  
-/* Attempts to devise a variable name unique within DICT.
-   Returns true if successful, in which case the new variable
-   name is stored into NAME.  Returns false if all names that can
-   be generated have already been taken.  (Returning false is
-   quite unlikely: at least ULONG_MAX unique names can be
-   generated.)
+/* Devises and returns a variable name unique within DICT.  The variable name
+   is owned by the caller, which must free it with free() when it is no longer
+   needed.
  
     HINT, if it is non-null, is used as a suggestion that will be
     modified for suitability as a variable name and for
@@ -915,14 +959,18 @@ make_numeric_name (const struct dictionary *dict, unsigned long int *num_start,
     value is used.  If NUM_START is non-null, then its value is
     used as the minimum numeric value to check, and it is updated
     to the next value to be checked.
-   */
-bool
+*/
+char *
  dict_make_unique_var_name (const struct dictionary *dict, const char *hint,
-                           unsigned long int *num_start,
-                           char name[VAR_NAME_LEN + 1])
+                           unsigned long int *num_start)
  {
-  return ((hint != NULL && make_hinted_name (dict, hint, name))
-          || make_numeric_name (dict, num_start, name));
+  if (hint != NULL)
+    {
+      char *hinted_name = make_hinted_name (dict, hint);
+      if (hinted_name != NULL)
+        return hinted_name;
+    }
+  return make_numeric_name (dict, num_start);
  }
  
  /* Returns the weighting variable in dictionary D, or a null
@@ -1153,7 +1201,7 @@ dict_get_split_cnt (const struct dictionary *d)
  
  /* Removes variable V, which must be in D, from D's set of split
     variables. */
-void
+static void
  dict_unset_split_var (struct dictionary *d, struct variable *v)
  {
    int orig_count;
@@ -1206,82 +1254,104 @@ dict_get_label (const struct dictionary *d)
  }
  
  /* Sets D's file label to LABEL, truncating it to a maximum of 60
-   characters. */
+   characters.
+
+   Removes D's label if LABEL is null or the empty string. */
  void
  dict_set_label (struct dictionary *d, const char *label)
  {
    free (d->label);
-  d->label = label != NULL ? xstrndup (label, 60) : NULL;
+  d->label = label != NULL && label[0] != '\0' ? xstrndup (label, 60) : NULL;
  }
  
-/* Returns the documents for D, or a null pointer if D has no
-   documents.  If the return value is nonnull, then the string
-   will be an exact multiple of DOC_LINE_LENGTH bytes in length,
-   with each segment corresponding to one line. */
-const char *
+/* Returns the documents for D, as a UTF-8 encoded string_array.  The
+   return value is always nonnull; if there are no documents then the
+   string_array is empty. */
+const struct string_array *
  dict_get_documents (const struct dictionary *d)
  {
-  return ds_is_empty (&d->documents) ? NULL : ds_cstr (&d->documents);
+  return &d->documents;
  }
  
-/* Sets the documents for D to DOCUMENTS, or removes D's
-   documents if DOCUMENT is a null pointer.  If DOCUMENTS is
-   nonnull, then it should be an exact multiple of
-   DOC_LINE_LENGTH bytes in length, with each segment
-   corresponding to one line. */
+/* Replaces the documents for D by NEW_DOCS, a UTF-8 encoded string_array. */
  void
-dict_set_documents (struct dictionary *d, const char *documents)
+dict_set_documents (struct dictionary *d, const struct string_array *new_docs)
  {
-  size_t remainder;
+  size_t i;
  
-  ds_assign_cstr (&d->documents, documents != NULL ? documents : "");
+  dict_clear_documents (d);
  
-  /* In case the caller didn't get it quite right, pad out the
-     final line with spaces. */
-  remainder = ds_length (&d->documents) % DOC_LINE_LENGTH;
-  if (remainder != 0)
-    ds_put_char_multiple (&d->documents, ' ', DOC_LINE_LENGTH - remainder);
+  for (i = 0; i < new_docs->n; i++)
+    dict_add_document_line (d, new_docs->strings[i], false);
+}
+
+/* Replaces the documents for D by UTF-8 encoded string NEW_DOCS, dividing it
+   into individual lines at new-line characters.  Each line is truncated to at
+   most DOC_LINE_LENGTH bytes in D's encoding. */
+void
+dict_set_documents_string (struct dictionary *d, const char *new_docs)
+{
+  const char *s;
+
+  dict_clear_documents (d);
+  for (s = new_docs; *s != '\0'; )
+    {
+      size_t len = strcspn (s, "\n");
+      char *line = xmemdup0 (s, len);
+      dict_add_document_line (d, line, false);
+      free (line);
+
+      s += len;
+      if (*s == '\n')
+        s++;
+    }
  }
  
  /* Drops the documents from dictionary D. */
  void
  dict_clear_documents (struct dictionary *d)
  {
-  ds_clear (&d->documents);
+  string_array_clear (&d->documents);
  }
  
-/* Appends LINE to the documents in D.  LINE will be truncated or
-   padded on the right with spaces to make it exactly
-   DOC_LINE_LENGTH bytes long. */
-void
-dict_add_document_line (struct dictionary *d, const char *line)
+/* Appends the UTF-8 encoded LINE to the documents in D, truncating it to at
+   most DOC_LINE_LENGTH bytes in the dictionary's encoding.  If text is lost
+   and ISSUE_WARNING is true, issues a warning.  Returns true if LINE was
+   stored whole, false if it was truncated. */
+bool
+dict_add_document_line (struct dictionary *d, const char *line,
+                        bool issue_warning)
  {
-  if (strlen (line) > DOC_LINE_LENGTH)
+  size_t trunc_len;
+  bool truncated;
+
+  trunc_len = utf8_encoding_trunc_len (line, d->encoding, DOC_LINE_LENGTH);
+  truncated = line[trunc_len] != '\0';
+  if (truncated && issue_warning)
      {
        /* Note to translators: "bytes" is correct, not characters */
        msg (SW, _("Truncating document line to %d bytes."), DOC_LINE_LENGTH);
      }
-  buf_copy_str_rpad (ds_put_uninit (&d->documents, DOC_LINE_LENGTH),
-                     DOC_LINE_LENGTH, line, ' ');
+
+  string_array_append_nocopy (&d->documents, xmemdup0 (line, trunc_len));
+
+  return !truncated;
  }
  
  /* Returns the number of document lines in dictionary D. */
  size_t
  dict_get_document_line_cnt (const struct dictionary *d)
  {
-  return ds_length (&d->documents) / DOC_LINE_LENGTH;
+  return d->documents.n;
  }
  
-/* Copies document line number IDX from dictionary D into
-   LINE, trimming off any trailing white space. */
-void
-dict_get_document_line (const struct dictionary *d,
-                        size_t idx, struct string *line)
+/* Returns document line number IDX in dictionary D.  The caller must not
+   modify or free the returned string. */
+const char *
+dict_get_document_line (const struct dictionary *d, size_t idx)
  {
-  assert (idx < dict_get_document_line_cnt (d));
-  ds_assign_substring (line, ds_substr (&d->documents, idx * DOC_LINE_LENGTH,
-                                        DOC_LINE_LENGTH));
-  ds_rtrim (line, ss_cstr (CC_SPACES));
+  assert (idx < d->documents.n);
+  return d->documents.strings[idx];
  }
  
  /* Creates in D a vector named NAME that contains the CNT
@@ -1362,7 +1432,138 @@ dict_clear_vectors (struct dictionary *d)
    d->vector = NULL;
    d->vector_cnt = 0;
  }
+\f
+/* Multiple response sets. */
+
+/* Returns the multiple response set in DICT with index IDX, which must be
+   between 0 and the count returned by dict_get_n_mrsets(), exclusive. */
+const struct mrset *
+dict_get_mrset (const struct dictionary *dict, size_t idx)
+{
+  assert (idx < dict->n_mrsets);
+  return dict->mrsets[idx];
+}
+
+/* Returns the number of multiple response sets in DICT. */
+size_t
+dict_get_n_mrsets (const struct dictionary *dict)
+{
+  return dict->n_mrsets;
+}
  
+/* Looks for a multiple response set named NAME in DICT.  If it finds one,
+   returns its index; otherwise, returns SIZE_MAX. */
+static size_t
+dict_lookup_mrset_idx (const struct dictionary *dict, const char *name)
+{
+  size_t i;
+
+  for (i = 0; i < dict->n_mrsets; i++)
+    if (!strcasecmp (name, dict->mrsets[i]->name))
+      return i;
+
+  return SIZE_MAX;
+}
+
+/* Looks for a multiple response set named NAME in DICT.  If it finds one,
+   returns it; otherwise, returns NULL. */
+const struct mrset *
+dict_lookup_mrset (const struct dictionary *dict, const char *name)
+{
+  size_t idx = dict_lookup_mrset_idx (dict, name);
+  return idx != SIZE_MAX ? dict->mrsets[idx] : NULL;
+}
+
+/* Adds MRSET to DICT, replacing any existing set with the same name.  Returns
+   true if no set had that name before, false if an existing set was replaced.
+
+   Ownership of MRSET is transferred to DICT. */
+bool
+dict_add_mrset (struct dictionary *dict, struct mrset *mrset)
+{
+  size_t idx;
+
+  assert (mrset_ok (mrset, dict));
+
+  idx = dict_lookup_mrset_idx (dict, mrset->name);
+  if (idx == SIZE_MAX)
+    {
+      dict->mrsets = xrealloc (dict->mrsets,
+                               (dict->n_mrsets + 1) * sizeof *dict->mrsets);
+      dict->mrsets[dict->n_mrsets++] = mrset;
+      return true;
+    }
+  else
+    {
+      mrset_destroy (dict->mrsets[idx]);
+      dict->mrsets[idx] = mrset;
+      return false;
+    }
+}
+
+/* Looks for a multiple response set in DICT named NAME.  If found, removes it
+   from DICT and returns true.  If none is found, returns false without
+   modifying DICT.
+
+   Deleting one multiple response set causes the indexes of other sets within
+   DICT to change. */
+bool
+dict_delete_mrset (struct dictionary *dict, const char *name)
+{
+  size_t idx = dict_lookup_mrset_idx (dict, name);
+  if (idx != SIZE_MAX)
+    {
+      mrset_destroy (dict->mrsets[idx]);
+      dict->mrsets[idx] = dict->mrsets[--dict->n_mrsets];
+      return true;
+    }
+  else
+    return false;
+}
+
+/* Deletes all multiple response sets from DICT. */
+void
+dict_clear_mrsets (struct dictionary *dict)
+{
+  size_t i;
+
+  for (i = 0; i < dict->n_mrsets; i++)
+    mrset_destroy (dict->mrsets[i]);
+  free (dict->mrsets);
+  dict->mrsets = NULL;
+  dict->n_mrsets = 0;
+}
+
+/* Removes VAR, which must be in DICT, from DICT's multiple response sets. */
+static void
+dict_unset_mrset_var (struct dictionary *dict, struct variable *var)
+{
+  size_t i;
+
+  assert (dict_contains_var (dict, var));
+
+  for (i = 0; i < dict->n_mrsets; )
+    {
+      struct mrset *mrset = dict->mrsets[i];
+      size_t j;
+
+      for (j = 0; j < mrset->n_vars; )
+        if (mrset->vars[j] == var)
+          remove_element (mrset->vars, mrset->n_vars--,
+                          sizeof *mrset->vars, j);
+        else
+          j++;
+
+      if (mrset->n_vars < 2)
+        {
+          mrset_destroy (mrset);
+          dict->mrsets[i] = dict->mrsets[--dict->n_mrsets];
+        }
+      else
+        i++;
+    }
+}
+\f
  /* Returns D's attribute set.  The caller may examine or modify
     the attribute set, but must not destroy it.  Destroying D or
     calling dict_set_attributes for D will also destroy D's