From: Ben Pfaff <blp@cs.stanford.edu>
Date: Sun, 24 Apr 2011 04:40:48 +0000 (-0700)
Subject: dictionary: Make dict_create() take the new dictionary's encoding.
X-Git-Tag: v0.7.8~40
X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?p=pspp-builds.git;a=commitdiff_plain;h=d9fc15ceb74cdc66487ba9aaed04246170729640

dictionary: Make dict_create() take the new dictionary's encoding.

There are several places in the PSPP tree that create dictionaries,
but few of them actually set an encoding.  This causes most
dictionaries to be in the default encoding, which is often not
correct.

By making dict_create() take the encoding as a parameter we force
the caller to think about the encoding issue up-front.
---

diff --git a/perl-module/PSPP.xs b/perl-module/PSPP.xs
index 58eac5b7..896d7d88 100644
--- a/perl-module/PSPP.xs
+++ b/perl-module/PSPP.xs
@@ -223,7 +223,7 @@ MODULE = PSPP		PACKAGE = PSPP::Dict
 struct dictionary *
 pxs_dict_new()
 CODE:
- RETVAL = dict_create ();
+ RETVAL = dict_create ("UTF-8");
 OUTPUT:
  RETVAL
 
@@ -592,7 +592,6 @@ CODE:
  struct file_handle *fh =
   fh_create_file (NULL, name, fh_default_properties () );
  struct sysfile_info *sfi = xmalloc (sizeof (*sfi));
- dict_set_encoding (dict, UTF8);
  sfi->writer = sfm_open_writer (fh, dict, opts);
  sfi->dict = dict;
  sfi->opened = true;
diff --git a/src/data/dataset.c b/src/data/dataset.c
index 5d0598e3..466696c7 100644
--- a/src/data/dataset.c
+++ b/src/data/dataset.c
@@ -124,9 +124,8 @@ dataset_create (void)
   struct dataset *ds;
 
   ds = xzalloc (sizeof *ds);
-  ds->dict = dict_create ();
+  ds->dict = dict_create (get_default_encoding ());
   dict_set_change_callback (ds->dict, dict_callback, ds);
-  dict_set_encoding (ds->dict, get_default_encoding ());
 
   ds->caseinit = caseinit_create ();
   proc_cancel_all_transformations (ds);
diff --git a/src/data/dictionary.c b/src/data/dictionary.c
index 7d67defb..c8f58516 100644
--- a/src/data/dictionary.c
+++ b/src/data/dictionary.c
@@ -87,16 +87,6 @@ struct dictionary
 static void dict_unset_split_var (struct dictionary *, struct variable *);
 static void dict_unset_mrset_var (struct dictionary *, struct variable *);
 
-void
-dict_set_encoding (struct dictionary *d, const char *enc)
-{
-  if (enc)
-    {
-      free (d->encoding);
-      d->encoding = xstrdup (enc);
-    }
-}
-
 const char *
 dict_get_encoding (const struct dictionary *d)
 {
@@ -171,14 +161,16 @@ dict_copy_callbacks (struct dictionary *dest,
   dest->cb_data = src->cb_data;
 }
 
-/* Creates and returns a new dictionary. */
+/* Creates and returns a new dictionary with the specified ENCODING. */
 struct dictionary *
-dict_create (void)
+dict_create (const char *encoding)
 {
   struct dictionary *d = xzalloc (sizeof *d);
 
+  d->encoding = xstrdup (encoding);
   hmap_init (&d->name_map);
   attrset_init (&d->attributes);
+
   return d;
 }
 
@@ -196,7 +188,7 @@ dict_clone (const struct dictionary *s)
   struct dictionary *d;
   size_t i;
 
-  d = dict_create ();
+  d = dict_create (s->encoding);
 
   /* Set the new dictionary's encoding early so that string length limitations
      are interpreted correctly. */
@@ -1660,7 +1652,7 @@ struct variable *
 dict_create_internal_var (int case_idx, int width)
 {
   if (internal_dict == NULL)
-    internal_dict = dict_create ();
+    internal_dict = dict_create ("UTF-8");
 
   for (;;)
     {
diff --git a/src/data/dictionary.h b/src/data/dictionary.h
index fa5d0dde..2a196950 100644
--- a/src/data/dictionary.h
+++ b/src/data/dictionary.h
@@ -26,7 +26,7 @@ struct string;
 struct ccase;
 
 /* Creating dictionaries. */
-struct dictionary *dict_create (void);
+struct dictionary *dict_create (const char *encoding);
 struct dictionary *dict_clone (const struct dictionary *);
 
 
@@ -164,7 +164,6 @@ void dict_set_attributes (struct dictionary *, const struct attrset *);
 bool dict_has_attributes (const struct dictionary *);
 
 /* Data encoding. */
-void dict_set_encoding (struct dictionary *d, const char *enc);
 const char *dict_get_encoding (const struct dictionary *d);
 
 bool dict_id_is_valid (const struct dictionary *, const char *id,
diff --git a/src/data/gnumeric-reader.c b/src/data/gnumeric-reader.c
index 6392f9b8..61fbab89 100644
--- a/src/data/gnumeric-reader.c
+++ b/src/data/gnumeric-reader.c
@@ -496,9 +496,8 @@ gnumeric_open_reader (struct gnumeric_read_info *gri, struct dictionary **dict)
 
 
   /* Create the dictionary and populate it */
-  *dict = r->dict = dict_create ();
-
-  dict_set_encoding (r->dict, CHAR_CAST (const char *, xmlTextReaderConstEncoding (r->xtr)));
+  *dict = r->dict = dict_create (
+    CHAR_CAST (const char *, xmlTextReaderConstEncoding (r->xtr)));
 
   for (i = 0 ; i < n_var_specs ; ++i )
     {
diff --git a/src/data/por-file-reader.c b/src/data/por-file-reader.c
index 372d7682..a05f6b3d 100644
--- a/src/data/por-file-reader.c
+++ b/src/data/por-file-reader.c
@@ -38,6 +38,7 @@
 #include "data/value-labels.h"
 #include "data/variable.h"
 #include "libpspp/compiler.h"
+#include "libpspp/i18n.h"
 #include "libpspp/message.h"
 #include "libpspp/misc.h"
 #include "libpspp/pool.h"
@@ -250,7 +251,7 @@ pfm_open_reader (struct file_handle *fh, struct dictionary **dict,
   struct pool *volatile pool = NULL;
   struct pfm_reader *volatile r = NULL;
 
-  *dict = dict_create ();
+  *dict = dict_create (get_default_encoding ());
 
   /* Create and initialize reader. */
   pool = pool_create ();
diff --git a/src/data/psql-reader.c b/src/data/psql-reader.c
index 346d214a..2d9a2678 100644
--- a/src/data/psql-reader.c
+++ b/src/data/psql-reader.c
@@ -27,6 +27,7 @@
 #include "data/dictionary.h"
 #include "data/format.h"
 #include "data/variable.h"
+#include "libpspp/i18n.h"
 #include "libpspp/message.h"
 #include "libpspp/misc.h"
 #include "libpspp/str.h"
@@ -229,6 +230,7 @@ psql_open_reader (struct psql_read_info *info, struct dictionary **dict)
   int n_fields, n_tuples;
   PGresult *qres = NULL;
   casenumber n_cases = CASENUMBER_MAX;
+  const char *encoding;
 
   struct psql_reader *r = xzalloc (sizeof *r);
   struct string query ;
@@ -285,23 +287,21 @@ psql_open_reader (struct psql_read_info *info, struct dictionary **dict)
 
   r->postgres_epoch = calendar_gregorian_to_offset (2000, 1, 1, NULL);
 
-
-  /* Create the dictionary and populate it */
-  *dict = r->dict = dict_create ();
-
   {
     const int enc = PQclientEncoding (r->conn);
 
     /* According to section 22.2 of the Postgresql manual
        a value of zero (SQL_ASCII) indicates
        "a declaration of ignorance about the encoding".
-       Accordingly, we don't set the dictionary's encoding
+       Accordingly, we use the default encoding
        if we find this value.
     */
-    if ( enc != 0 )
-      dict_set_encoding (r->dict, pg_encoding_to_char (enc));
+    encoding = enc ? pg_encoding_to_char (enc) : get_default_encoding ();
   }
 
+  /* Create the dictionary, in the encoding chosen above, and populate it */
+  *dict = r->dict = dict_create (encoding);
+
   /*
     select count (*) from (select * from medium) stupid_sql_standard;
   */
diff --git a/src/data/sys-file-reader.c b/src/data/sys-file-reader.c
index 6643b85d..7e5a9e0f 100644
--- a/src/data/sys-file-reader.c
+++ b/src/data/sys-file-reader.c
@@ -428,10 +428,8 @@ sfm_open_reader (struct file_handle *fh, struct dictionary **dictp,
 
      First, figure out the correct character encoding, because this determines
      how the rest of the header data is to be interpreted. */
-  dict = dict_create ();
-  r->encoding = choose_encoding (r, extensions[EXT_INTEGER],
-                                 extensions[EXT_ENCODING]);
-  dict_set_encoding (dict, r->encoding);
+  dict = dict_create (choose_encoding (r, extensions[EXT_INTEGER],
+                                       extensions[EXT_ENCODING]));
 
   /* These records don't use variables at all. */
   if (document != NULL)
diff --git a/src/language/data-io/combine-files.c b/src/language/data-io/combine-files.c
index 0f09e735..b4f06655 100644
--- a/src/language/data-io/combine-files.c
+++ b/src/language/data-io/combine-files.c
@@ -35,6 +35,7 @@
 #include "language/lexer/variable-parser.h"
 #include "language/stats/sort-criteria.h"
 #include "libpspp/assertion.h"
+#include "libpspp/i18n.h"
 #include "libpspp/message.h"
 #include "libpspp/string-array.h"
 #include "libpspp/taint.h"
@@ -160,7 +161,7 @@ combine_files (enum comb_command_type command,
 
   proc.files = NULL;
   proc.n_files = 0;
-  proc.dict = dict_create ();
+  proc.dict = dict_create (get_default_encoding ());
   proc.output = NULL;
   proc.matcher = NULL;
   subcase_init_empty (&proc.by_vars);
@@ -496,7 +497,6 @@ merge_dictionary (struct dictionary *const m, struct comb_file *f)
   struct dictionary *d = f->dict;
   const struct string_array *d_docs, *m_docs;
   int i;
-  const char *file_encoding;
 
   if (dict_get_label (m) == NULL)
     dict_set_label (m, dict_get_label (d));
@@ -510,17 +510,9 @@ merge_dictionary (struct dictionary *const m, struct comb_file *f)
      The correct thing to do would be to convert to an encoding
      which can cope with all the input files (eg UTF-8).
    */
-  file_encoding = dict_get_encoding (f->dict);
-  if ( file_encoding != NULL)
-    {
-      if ( dict_get_encoding (m) == NULL)
-	dict_set_encoding (m, file_encoding);
-      else if ( 0 != strcmp (file_encoding, dict_get_encoding (m)))
-	{
-	  msg (MW,
-	       _("Combining files with incompatible encodings. String data may not be represented correctly."));
-	}
-    }
+  if ( 0 != strcmp (dict_get_encoding (f->dict), dict_get_encoding (m)))
+    msg (MW, _("Combining files with incompatible encodings. String data may "
+               "not be represented correctly."));
 
   if (d_docs != NULL)
     {
diff --git a/src/language/data-io/data-list.c b/src/language/data-io/data-list.c
index a171986d..8ab75884 100644
--- a/src/language/data-io/data-list.c
+++ b/src/language/data-io/data-list.c
@@ -42,6 +42,7 @@
 #include "language/lexer/variable-parser.h"
 #include "libpspp/assertion.h"
 #include "libpspp/compiler.h"
+#include "libpspp/i18n.h"
 #include "libpspp/message.h"
 #include "libpspp/misc.h"
 #include "libpspp/pool.h"
@@ -85,7 +86,9 @@ cmd_data_list (struct lexer *lexer, struct dataset *ds)
   struct pool *tmp_pool;
   bool ok;
 
-  dict = in_input_program () ? dataset_dict (ds) : dict_create ();
+  dict = (in_input_program ()
+          ? dataset_dict (ds)
+          : dict_create (get_default_encoding ()));
   parser = data_parser_create (dict);
   reader = NULL;
 
@@ -238,13 +241,9 @@ cmd_data_list (struct lexer *lexer, struct dataset *ds)
     }
   type = data_parser_get_type (parser);
 
-  if (! ds_is_empty (&encoding))
-    {
-      if ( NULL == fh)
-	msg (MW, _("Encoding should not be specified for inline data. It will be ignored."));
-      else
-	dict_set_encoding (dict, ds_cstr (&encoding));
-    }
+  if (! ds_is_empty (&encoding) && NULL == fh)
+    msg (MW, _("Encoding should not be specified for inline data. It will be "
+               "ignored."));
 
   if (fh == NULL)
     fh = fh_inline_file ();
diff --git a/src/language/data-io/get-data.c b/src/language/data-io/get-data.c
index dd55752c..d7927527 100644
--- a/src/language/data-io/get-data.c
+++ b/src/language/data-io/get-data.c
@@ -279,7 +279,7 @@ static int
 parse_get_txt (struct lexer *lexer, struct dataset *ds)
 {
   struct data_parser *parser = NULL;
-  struct dictionary *dict = dict_create ();
+  struct dictionary *dict = dict_create (get_default_encoding ());
   struct file_handle *fh = NULL;
   struct dfm_reader *reader = NULL;
   char *name = NULL;
diff --git a/src/language/stats/aggregate.c b/src/language/stats/aggregate.c
index d7339d86..fe6f5eed 100644
--- a/src/language/stats/aggregate.c
+++ b/src/language/stats/aggregate.c
@@ -208,7 +208,7 @@ cmd_aggregate (struct lexer *lexer, struct dataset *ds)
   if ( agr.add_variables )
     agr.dict = dict_clone (dict);
   else
-    agr.dict = dict_create ();    
+    agr.dict = dict_create (dict_get_encoding (dict));
 
   dict_set_label (agr.dict, dict_get_label (dict));
   dict_set_documents (agr.dict, dict_get_documents (dict));
diff --git a/src/ui/gui/psppire-data-editor.c b/src/ui/gui/psppire-data-editor.c
index 094a4f69..2aba780e 100644
--- a/src/ui/gui/psppire-data-editor.c
+++ b/src/ui/gui/psppire-data-editor.c
@@ -1,5 +1,5 @@
 /* PSPPIRE - a graphical user interface for PSPP.
-   Copyright (C) 2008, 2009, 2010 Free Software Foundation, Inc.
+   Copyright (C) 2008, 2009, 2010, 2011 Free Software Foundation, Inc.
 
    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -1612,8 +1612,7 @@ data_sheet_set_clip (PsppireSheet *sheet)
     }
 
   /* Construct clip dictionary. */
-  clip_dict = dict_create ();
-  dict_set_encoding (clip_dict, dict_get_encoding (ds->dict->dict));
+  clip_dict = dict_create (dict_get_encoding (ds->dict->dict));
   for (i = col0; i <= coli; i++)
     dict_clone_var_assert (clip_dict, dict_get_var (ds->dict->dict, i));
 
diff --git a/src/ui/gui/text-data-import-dialog.c b/src/ui/gui/text-data-import-dialog.c
index 14a23a79..8579ccc9 100644
--- a/src/ui/gui/text-data-import-dialog.c
+++ b/src/ui/gui/text-data-import-dialog.c
@@ -1252,7 +1252,7 @@ choose_column_names (struct import_assistant *ia)
   struct column *col;
   size_t name_row;
 
-  dict = dict_create ();
+  dict = dict_create (get_default_encoding ());
   name_row = f->variable_names && f->skip_lines ? f->skip_lines : 0;
   for (col = s->columns; col < &s->columns[s->column_cnt]; col++)
     {
@@ -1595,7 +1595,7 @@ prepare_formats_page (struct import_assistant *ia)
 
   push_watch_cursor (ia);
 
-  dict = dict_create ();
+  dict = dict_create (get_default_encoding ());
   fg = fmt_guesser_create ();
   for (column_idx = 0; column_idx < s->column_cnt; column_idx++)
     {