Merge branch 'refs/heads/charset' of ssh://jmd@git.sv.gnu.org/srv/git/pspp into charset

author John Darrington <john@darrington.wattle.id.au>

Sun, 29 Mar 2009 23:59:42 +0000 (07:59 +0800)

committer John Darrington <john@darrington.wattle.id.au>

Sun, 29 Mar 2009 23:59:42 +0000 (07:59 +0800)
author John Darrington <john@darrington.wattle.id.au>
Sun, 29 Mar 2009 23:59:42 +0000 (07:59 +0800)
committer John Darrington <john@darrington.wattle.id.au>
Sun, 29 Mar 2009 23:59:42 +0000 (07:59 +0800)
diff --git a/doc/dev/system-file-format.texi b/doc/dev/system-file-format.texi

index 3e764c8ce714099f1becf877c927681911b23a54..164807b80115e4796bb394c0336522b0d84344ba 100644 (file)
--- a/doc/dev/system-file-format.texi
+++ b/doc/dev/system-file-format.texi
@@ -96,6 +96,7 @@ Each type of record is described separately below.
  * Variable Display Parameter Record::
  * Long Variable Names Record::
  * Very Long String Record::
+* Character Encoding Record::
  * Data File and Variable Attributes Records::
  * Miscellaneous Informational Records::
  * Dictionary Termination Record::
@@ -546,9 +547,14 @@ Compression code.  Always set to 1.
  Machine endianness.  1 indicates big-endian, 2 indicates little-endian.
  
  @item int32 character_code;
+@anchor{character-code}
  Character code.  1 indicates EBCDIC, 2 indicates 7-bit ASCII, 3
  indicates 8-bit ASCII, 4 indicates DEC Kanji.
  Windows code page numbers are also valid.
+
+Experience has shown that in many files, this field is ignored or incorrect.
+For a more reliable indication of the file's character encoding,
+see @ref{Character Encoding Record}.
  @end table
  
  @node Machine Floating-Point Info Record
@@ -792,6 +798,46 @@ After the last tuple, there may be a single byte 00, or @{00, 09@}.
  The total length is @code{count} bytes.
  @end table
  
+@node Character Encoding Record
+@section Character Encoding Record
+
+This record, if present, indicates the character encoding for string data,
+long variable names, variable labels, value labels and other strings in the
+file.
+
+@example
+/* @r{Header.} */
+int32               rec_type;
+int32               subtype;
+int32               size;
+int32               count;
+
+/* @r{Exactly @code{count} bytes of data.} */
+char                encoding[];
+@end example
+
+@table @code
+@item int32 rec_type;
+Record type.  Always set to 7.
+
+@item int32 subtype;
+Record subtype.  Always set to 20.
+
+@item int32 size;
+The size of each element in the @code{encoding} member. Always set to 1.
+
+@item int32 count;
+The total number of bytes in @code{encoding}.
+
+@item char encoding[];
+The name of the character encoding.  Normally this will be an official IANA character set name or alias.
+See @url{http://www.iana.org/assignments/character-sets}.
+@end table
+
+This record is not present in files generated by older software.
+See also @ref{character-code}.
+
+
  @node Data File and Variable Attributes Records
  @section Data File and Variable Attributes Records
  
diff --git a/src/data/psql-reader.c b/src/data/psql-reader.c

index a54b9f8b3888c58320623ea1d53086bc8650a155..f2d5ff282272f48e821c4826d9d60f5158f6bfca 100644 (file)
--- a/src/data/psql-reader.c
+++ b/src/data/psql-reader.c
@@ -288,10 +288,22 @@ psql_open_reader (struct psql_read_info *info, struct dictionary **dict)
    /* Create the dictionary and populate it */
    *dict = r->dict = dict_create ();
  
+  {
+    const int enc = PQclientEncoding (r->conn);
+
+    /* According to section 22.2 of the PostgreSQL manual
+       a value of zero (SQL_ASCII) indicates
+       "a declaration of ignorance about the encoding".
+       Accordingly, we don't set the dictionary's encoding
+       if we find this value.
+    */
+    if ( enc != 0 )
+      dict_set_encoding (r->dict, pg_encoding_to_char (enc));
+  }
+
    /*
      select count (*) from (select * from medium) stupid_sql_standard;
    */
-
    ds_init_cstr (&query,
                 "BEGIN READ ONLY ISOLATION LEVEL SERIALIZABLE; "
                 "DECLARE  pspp BINARY CURSOR FOR ");
diff --git a/tests/dissect-sysfile.c b/tests/dissect-sysfile.c

index 25d01158ee77b1c80062f8c6689f7a5f0fb608fa..1c81c72c98b089c3cb5dd6639c695acd64bedd01 100644 (file)
--- a/tests/dissect-sysfile.c
+++ b/tests/dissect-sysfile.c
@@ -66,6 +66,9 @@ static void read_datafile_attributes (struct sfm_reader *r,
                                        size_t size, size_t count);
  static void read_variable_attributes (struct sfm_reader *r,
                                        size_t size, size_t count);
+static void read_character_encoding (struct sfm_reader *r,
+                                      size_t size, size_t count);
+
  
  static struct text_record *open_text_record (
    struct sfm_reader *, size_t size);
@@ -510,6 +513,10 @@ read_extension_record (struct sfm_reader *r)
        read_variable_attributes (r, size, count);
        return;
  
+    case 20:
+      read_character_encoding (r, size, count);
+      return;
+
      default:
        sys_warn (r, _("Unrecognized record type 7, subtype %d."), subtype);
        break;
@@ -712,6 +719,17 @@ read_datafile_attributes (struct sfm_reader *r, size_t size, size_t count)
    close_text_record (text);
  }
  
+/* Reads and prints a character encoding record (record type 7,
+   subtype 20).  SIZE and COUNT are the element size and element
+   count handed over by read_extension_record; the file format
+   documentation says SIZE is always 1, so COUNT is the length of
+   the encoding name in bytes.
+
+   The output line is prefixed with the file offset at which the
+   encoding name starts. */
+static void
+read_character_encoding (struct sfm_reader *r, size_t size, size_t count)
+{
+  const unsigned long int posn =  ftell (r->file);
+
+  /* read_string is handed a COUNT + 1 byte buffer below, so allocate
+     at least that much even if a malformed file claims SIZE is 0. */
+  char *encoding = calloc (size > 0 ? size : 1, count + 1);
+  if (encoding == NULL)
+    {
+      /* A corrupt COUNT can request an absurd amount of memory; stop
+         cleanly rather than passing a null buffer to read_string. */
+      fprintf (stderr, "out of memory reading character encoding record\n");
+      exit (EXIT_FAILURE);
+    }
+  read_string (r, encoding, count + 1);
+
+  printf ("%08lx: Character Encoding: %s\n", posn, encoding);
+
+  /* The name is only needed for the line printed above. */
+  free (encoding);
+}
+
+
  static void
  read_variable_attributes (struct sfm_reader *r, size_t size, size_t count) 
  {
author	John Darrington <john@darrington.wattle.id.au>
	Sun, 29 Mar 2009 23:59:42 +0000 (07:59 +0800)
committer	John Darrington <john@darrington.wattle.id.au>
	Sun, 29 Mar 2009 23:59:42 +0000 (07:59 +0800)
doc/dev/system-file-format.texi		patch \| blob \| history
src/data/psql-reader.c		patch \| blob \| history
tests/dissect-sysfile.c		patch \| blob \| history