From: Ben Pfaff Date: Wed, 8 Feb 2012 06:20:28 +0000 (-0800) Subject: sys-file: Support EBCDIC magic number in system files. X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=dff37440177a355bfc0cf9ff56428114e29f5106;p=pspp sys-file: Support EBCDIC magic number in system files. This makes PSPP able to read the 1980s EBCDIC-encoded system file provided by Chris Muller. --- diff --git a/doc/dev/system-file-format.texi b/doc/dev/system-file-format.texi index 7b0eff9f79..8315762cef 100644 --- a/doc/dev/system-file-format.texi +++ b/doc/dev/system-file-format.texi @@ -115,7 +115,9 @@ char padding[3]; @table @code @item char rec_type[4]; -Record type code, set to @samp{$FL2}. +Record type code, set to @samp{$FL2}, that is, either @code{24 46 4c +32} if the file uses an ASCII-based character encoding, or @code{5b c6 +d3 f2} if the file uses an EBCDIC-based character encoding. @item char prod_name[60]; Product identification string. This always begins with the characters @@ -559,6 +561,9 @@ Machine endianness. 1 indicates big-endian, 2 indicates little-endian. been actually observed in system files: @table @asis +@item 1 +EBCDIC. + @item 2 7-bit ASCII. @@ -579,9 +584,6 @@ UTF-8. The following additional values are known to be defined: @table @asis -@item 1 -EBCDIC. - @item 3 8-bit ``ASCII''. @@ -591,9 +593,10 @@ DEC Kanji. Other Windows code page numbers are known to be generally valid. -Old versions of SPSS always wrote value 2 in this field, regardless of -the encoding in use. Newer versions also write the character encoding -as a string (see @ref{Character Encoding Record}). +Old versions of SPSS for Unix and Windows always wrote value 2 in this +field, regardless of the encoding in use. Newer versions also write +the character encoding as a string (see @ref{Character Encoding +Record}). 
@end table @node Machine Floating-Point Info Record diff --git a/src/data/sys-file-private.h b/src/data/sys-file-private.h index 1eee77950f..21ff8ade3a 100644 --- a/src/data/sys-file-private.h +++ b/src/data/sys-file-private.h @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 2006, 2007, 2009, 2010 Free Software Foundation, Inc. + Copyright (C) 2006-2007, 2009-2012 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -35,6 +35,14 @@ struct dictionary; +/* Magic numbers. + + Both of these are actually $FL2 in the respective character set. The "FL2" + part is invariant among national variants of each character set, but "$" has + different encodings, so it is safer to write them as hexadecimal. */ +#define ASCII_MAGIC "\x24\x46\x4c\x32" +#define EBCDIC_MAGIC "\x5b\xc6\xd3\xf2" + /* A variable in a system file. */ struct sfm_var { diff --git a/src/data/sys-file-reader.c b/src/data/sys-file-reader.c index 336549b761..024b4ae182 100644 --- a/src/data/sys-file-reader.c +++ b/src/data/sys-file-reader.c @@ -88,6 +88,7 @@ enum /* Fields from the top-level header record. */ struct sfm_header_record { + char magic[5]; /* First 4 bytes of file, then null. */ int weight_idx; /* 0 if unweighted, otherwise a var index. */ int nominal_case_size; /* Number of var positions. */ @@ -213,6 +214,7 @@ static void skip_extension_record (struct sfm_reader *, int subtype); static const char *choose_encoding ( struct sfm_reader *, + const struct sfm_header_record *, const struct sfm_extension_record *ext_integer, const struct sfm_extension_record *ext_encoding); @@ -452,7 +454,7 @@ sfm_open_reader (struct file_handle *fh, struct dictionary **dictp, First, figure out the correct character encoding, because this determines how the rest of the header data is to be interpreted. 
*/ - dict = dict_create (choose_encoding (r, extensions[EXT_INTEGER], + dict = dict_create (choose_encoding (r, &header, extensions[EXT_INTEGER], extensions[EXT_ENCODING])); r->encoding = dict_get_encoding (dict); @@ -598,13 +600,13 @@ sys_file_casereader_destroy (struct casereader *reader UNUSED, void *r_) bool sfm_detect (FILE *file) { - char rec_type[5]; + char magic[5]; - if (fread (rec_type, 4, 1, file) != 1) + if (fread (magic, 4, 1, file) != 1) return false; - rec_type[4] = '\0'; + magic[4] = '\0'; - return !strcmp ("$FL2", rec_type); + return !strcmp (ASCII_MAGIC, magic) || !strcmp (EBCDIC_MAGIC, magic); } /* Reads the global header of the system file. Initializes *HEADER and *INFO, @@ -614,14 +616,14 @@ static void read_header (struct sfm_reader *r, struct sfm_read_info *info, struct sfm_header_record *header) { - char rec_type[5]; uint8_t raw_layout_code[4]; uint8_t raw_bias[8]; - read_string (r, rec_type, sizeof rec_type); + read_string (r, header->magic, sizeof header->magic); read_string (r, header->eye_catcher, sizeof header->eye_catcher); - if (strcmp ("$FL2", rec_type) != 0) + if (strcmp (ASCII_MAGIC, header->magic) + && strcmp (EBCDIC_MAGIC, header->magic)) sys_error (r, 0, _("This is not an SPSS system file.")); /* Identify integer format. */ @@ -1185,6 +1187,7 @@ parse_machine_integer_info (struct sfm_reader *r, static const char * choose_encoding (struct sfm_reader *r, + const struct sfm_header_record *header, const struct sfm_extension_record *ext_integer, const struct sfm_extension_record *ext_encoding) { @@ -1223,6 +1226,10 @@ choose_encoding (struct sfm_reader *r, } } + /* If the file magic number is EBCDIC then its character data is too. 
*/ + if (!strcmp (header->magic, EBCDIC_MAGIC)) + return "EBCDIC-US"; + return locale_charset (); } diff --git a/src/data/sys-file-writer.c b/src/data/sys-file-writer.c index fe96d3cbba..cf7bc70b22 100644 --- a/src/data/sys-file-writer.c +++ b/src/data/sys-file-writer.c @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 1997-9, 2000, 2006, 2007, 2009, 2010, 2011 Free Software Foundation, Inc. + Copyright (C) 1997-2000, 2006-2012 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -311,6 +311,7 @@ calc_oct_idx (const struct dictionary *d, struct variable *target_var) static void write_header (struct sfm_writer *w, const struct dictionary *d) { + const char *dict_encoding = dict_get_encoding (d); char prod_name[61]; char creation_date[10]; char creation_time[9]; @@ -320,7 +321,10 @@ write_header (struct sfm_writer *w, const struct dictionary *d) time_t t; /* Record-type code. */ - write_string (w, "$FL2", 4); + if (is_encoding_ebcdic_compatible (dict_encoding)) + write_string (w, EBCDIC_MAGIC, 4); + else + write_string (w, ASCII_MAGIC, 4); /* Product identification. */ snprintf (prod_name, sizeof prod_name, "@(#) SPSS DATA FILE %s - %s", @@ -952,6 +956,7 @@ static void write_integer_info_record (struct sfm_writer *w, const struct dictionary *d) { + const char *dict_encoding = dict_get_encoding (d); int version_component[3]; int float_format; int codepage; @@ -973,13 +978,21 @@ write_integer_info_record (struct sfm_writer *w, abort (); /* Choose codepage. */ - codepage = sys_get_codepage_from_encoding (dict_get_encoding (d)); + codepage = sys_get_codepage_from_encoding (dict_encoding); if (codepage == 0) { - /* Default to "7-bit ASCII" if the codepage number is unknown, because + /* The codepage is unknown. Choose a default. + + For an EBCDIC-compatible encoding, use the value for EBCDIC. 
+ + For an ASCII-compatible encoding, default to "7-bit ASCII", because many files use this codepage number regardless of their actual - encoding. */ - codepage = 2; + encoding. + */ + if (is_encoding_ascii_compatible (dict_encoding)) + codepage = 2; + else if (is_encoding_ebcdic_compatible (dict_encoding)) + codepage = 1; } /* Write record. */ diff --git a/src/libpspp/i18n.c b/src/libpspp/i18n.c index a3dc08a1ff..149ad6fb2f 100644 --- a/src/libpspp/i18n.c +++ b/src/libpspp/i18n.c @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 2006, 2009, 2010, 2011 Free Software Foundation, Inc. + Copyright (C) 2006, 2009, 2010, 2011, 2012 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -722,6 +722,16 @@ get_encoding_info (struct encoding_info *e, const char *name) e->is_ascii_compatible = ss_equals (in, out); ss_dealloc (&out); + if (!e->is_ascii_compatible && e->unit == 1) + { + out = recode_substring_pool ("UTF-8", name, ss_cstr ("A"), NULL); + e->is_ebcdic_compatible = (out.length == 1 + && (uint8_t) out.string[0] == 0xc1); + ss_dealloc (&out); + } + else + e->is_ebcdic_compatible = false; + return ok; } @@ -734,6 +744,15 @@ is_encoding_ascii_compatible (const char *encoding) return e.is_ascii_compatible; } +bool +is_encoding_ebcdic_compatible (const char *encoding) +{ + struct encoding_info e; + + get_encoding_info (&e, encoding); + return e.is_ebcdic_compatible; +} + /* Returns true if iconv can convert ENCODING to and from UTF-8, otherwise false. */ bool diff --git a/src/libpspp/i18n.h b/src/libpspp/i18n.h index fc3a64935e..27ccce361e 100644 --- a/src/libpspp/i18n.h +++ b/src/libpspp/i18n.h @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 2006, 2010, 2011 Free Software Foundation, Inc. + Copyright (C) 2006, 2010, 2011, 2012 Free Software Foundation, Inc. 
This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -126,6 +126,10 @@ struct encoding_info used in ASCII text files has the same value in this encoding. */ bool is_ascii_compatible; + /* True if this encoding has a unit width of 1 byte and appears to be + EBCDIC-based. */ + bool is_ebcdic_compatible; + /* Character information. */ int unit; /* Unit width, in bytes. */ char cr[MAX_UNIT]; /* \r in encoding, 'unit' bytes long. */ @@ -134,6 +138,7 @@ struct encoding_info bool get_encoding_info (struct encoding_info *, const char *name); bool is_encoding_ascii_compatible (const char *encoding); +bool is_encoding_ebcdic_compatible (const char *encoding); bool is_encoding_supported (const char *encoding); #endif /* i18n.h */