From 51318369476380d034d03d95952ed76efbd4761f Mon Sep 17 00:00:00 2001 From: John Darrington Date: Mon, 4 Mar 2013 20:09:41 +0100 Subject: [PATCH] Added a feature to read the meta data from spreadsheet files. This is in preparation for upcoming features. --- src/data/gnumeric-reader.c | 379 +++++++++++++++--- src/data/gnumeric-reader.h | 11 +- src/data/ods-reader.c | 671 +++++++++++++++++++++++--------- src/data/ods-reader.h | 12 +- src/data/spreadsheet-reader.c | 143 ++++++- src/data/spreadsheet-reader.h | 59 ++- src/language/data-io/get-data.c | 88 +++-- 7 files changed, 1052 insertions(+), 311 deletions(-) diff --git a/src/data/gnumeric-reader.c b/src/data/gnumeric-reader.c index 459eeb7e8f..578ff8ab4b 100644 --- a/src/data/gnumeric-reader.c +++ b/src/data/gnumeric-reader.c @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 2007, 2009, 2010, 2011, 2012 Free Software Foundation, Inc. + Copyright (C) 2007, 2009, 2010, 2011, 2012, 2013 Free Software Foundation, Inc. 
This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -31,7 +31,7 @@ #if !GNM_SUPPORT struct casereader * -gnumeric_open_reader (struct spreadsheet_read_info *gri, struct dictionary **dict) +gnumeric_open_reader (const struct spreadsheet_read_options *opts, struct dictionary **dict) { msg (ME, _("Support for %s files was not compiled into this installation of PSPP"), "Gnumeric"); @@ -73,60 +73,130 @@ static const struct casereader_class gnm_file_casereader_class = enum reader_state { - STATE_INIT = 0, /* Initial state */ + STATE_PRE_INIT = 0, /* Initial state */ + STATE_SHEET_COUNT, /* Found the sheet index */ + STATE_INIT , /* Other Initial state */ STATE_SHEET_START, /* Found the start of a sheet */ STATE_SHEET_NAME, /* Found the sheet name */ STATE_MAXROW, + STATE_MAXCOL, STATE_SHEET_FOUND, /* Found the sheet that we actually want */ STATE_CELLS_START, /* Found the start of the cell array */ STATE_CELL /* Found a cell */ }; +struct sheet_detail +{ + /* The name of the sheet (utf8 encoding) */ + char *name; + + int start_col; + int stop_col; + int start_row; + int stop_row; + + int maxcol; + int maxrow; +}; + struct gnumeric_reader { + struct spreadsheet spreadsheet; + + /* The libxml reader for this instance */ xmlTextReaderPtr xtr; + /* An internal state variable */ enum reader_state state; + int row; int col; int min_col; int node_type; - int sheet_index; + int current_sheet; + int start_col; + int stop_col; + int start_row; + int stop_row; + + struct sheet_detail *sheets; const xmlChar *target_sheet; int target_sheet_index; - int start_row; - int start_col; - int stop_row; - int stop_col; - struct caseproto *proto; struct dictionary *dict; struct ccase *first_case; bool used_first_case; }; + +const char * +gnumeric_get_sheet_name (struct spreadsheet *s, int n) +{ + struct gnumeric_reader *gr = (struct gnumeric_reader *) s; + assert (n < s->n_sheets); + + return 
gr->sheets[n].name; +} + + static void process_node (struct gnumeric_reader *r); + +char * +gnumeric_get_sheet_range (struct spreadsheet *s, int n) +{ + int ret; + struct gnumeric_reader *gr = (struct gnumeric_reader *) s; + + assert (n < s->n_sheets); + + while ( + (gr->sheets[n].stop_col == -1) + && + (1 == (ret = xmlTextReaderRead (gr->xtr))) + ) + { + process_node (gr); + } + + return create_cell_ref ( + gr->sheets[n].start_col, + gr->sheets[n].start_row, + gr->sheets[n].stop_col, + gr->sheets[n].stop_row); +} + + static void gnm_file_casereader_destroy (struct casereader *reader UNUSED, void *r_) { + int i; struct gnumeric_reader *r = r_; if ( r == NULL) return ; if ( r->xtr) xmlFreeTextReader (r->xtr); + r->xtr = NULL; if ( ! r->used_first_case ) case_unref (r->first_case); caseproto_unref (r->proto); + for (i = 0; i < r->spreadsheet.n_sheets; ++i) + { + xmlFree (r->sheets[i].name); + } + + free (r->sheets); + + free (r); } @@ -137,16 +207,50 @@ process_node (struct gnumeric_reader *r) if (name == NULL) name = xmlStrdup (_xml ("--")); - r->node_type = xmlTextReaderNodeType (r->xtr); - switch ( r->state) + switch (r->state) { + case STATE_PRE_INIT: + r->current_sheet = -1; + if (0 == xmlStrcasecmp (name, _xml("gnm:SheetNameIndex")) && + XML_READER_TYPE_ELEMENT == r->node_type) + { + r->state = STATE_SHEET_COUNT; + } + break; + + case STATE_SHEET_COUNT: + if (0 == xmlStrcasecmp (name, _xml("gnm:SheetName")) && + XML_READER_TYPE_ELEMENT == r->node_type) + { + ++r->current_sheet; + if (r->current_sheet + 1 > r->spreadsheet.n_sheets) + { + struct sheet_detail *sd ; + r->sheets = xrealloc (r->sheets, (r->current_sheet + 1) * sizeof *r->sheets); + sd = &r->sheets[r->current_sheet]; + sd->start_col = sd->stop_col = sd->start_row = sd->stop_row = -1; + r->spreadsheet.n_sheets = r->current_sheet + 1; + } + } + else if (0 == xmlStrcasecmp (name, _xml("gnm:SheetNameIndex")) && + XML_READER_TYPE_END_ELEMENT == r->node_type) + { + r->state = STATE_INIT; + r->current_sheet 
= -1; + } + else if (XML_READER_TYPE_TEXT == r->node_type) + { + r->sheets [r->spreadsheet.n_sheets - 1].name = CHAR_CAST (char *, xmlTextReaderValue (r->xtr)); + } + break; + case STATE_INIT: if (0 == xmlStrcasecmp (name, _xml("gnm:Sheet")) && XML_READER_TYPE_ELEMENT == r->node_type) { - ++r->sheet_index; + ++r->current_sheet; r->state = STATE_SHEET_START; } break; @@ -163,16 +267,25 @@ process_node (struct gnumeric_reader *r) { r->state = STATE_INIT; } + else if (0 == xmlStrcasecmp (name, _xml("gnm:Sheet")) && + XML_READER_TYPE_END_ELEMENT == r->node_type) + { + r->state = STATE_INIT; + } else if (XML_READER_TYPE_TEXT == r->node_type) { - if ( r->target_sheet != NULL) + if ( r->target_sheet != NULL) { xmlChar *value = xmlTextReaderValue (r->xtr); if ( 0 == xmlStrcmp (value, r->target_sheet)) r->state = STATE_SHEET_FOUND; free (value); } - else if (r->target_sheet_index == r->sheet_index) + else if (r->target_sheet_index == r->current_sheet + 1) + { + r->state = STATE_SHEET_FOUND; + } + else if (r->target_sheet_index == -1) { r->state = STATE_SHEET_FOUND; } @@ -191,10 +304,15 @@ process_node (struct gnumeric_reader *r) { r->state = STATE_MAXROW; } + else if (0 == xmlStrcasecmp (name, _xml("gnm:MaxCol")) && + XML_READER_TYPE_ELEMENT == r->node_type) + { + r->state = STATE_MAXCOL; + } else if (0 == xmlStrcasecmp (name, _xml("gnm:Sheet")) && XML_READER_TYPE_END_ELEMENT == r->node_type) { - r->state = STATE_INIT; + r->state = STATE_INIT; } break; case STATE_MAXROW: @@ -203,12 +321,31 @@ process_node (struct gnumeric_reader *r) { r->state = STATE_SHEET_FOUND; } + else if (r->node_type == XML_READER_TYPE_TEXT) + { + xmlChar *value = xmlTextReaderValue (r->xtr); + r->sheets[r->current_sheet].maxrow = _xmlchar_to_int (value); + xmlFree (value); + } + break; + case STATE_MAXCOL: + if (0 == xmlStrcasecmp (name, _xml("gnm:MaxCol")) && + XML_READER_TYPE_END_ELEMENT == r->node_type) + { + r->state = STATE_SHEET_FOUND; + } + else if (r->node_type == XML_READER_TYPE_TEXT) + { + 
xmlChar *value = xmlTextReaderValue (r->xtr); + r->sheets[r->current_sheet].maxcol = _xmlchar_to_int (value); + xmlFree (value); + } + break; case STATE_CELLS_START: if (0 == xmlStrcasecmp (name, _xml ("gnm:Cell")) && XML_READER_TYPE_ELEMENT == r->node_type) { xmlChar *attr = NULL; - r->state = STATE_CELL; attr = xmlTextReaderGetAttribute (r->xtr, _xml ("Col")); r->col = _xmlchar_to_int (attr); @@ -220,15 +357,28 @@ process_node (struct gnumeric_reader *r) attr = xmlTextReaderGetAttribute (r->xtr, _xml ("Row")); r->row = _xmlchar_to_int (attr); free (attr); - } - else if (0 == xmlStrcasecmp (name, _xml("gnm:Cells")) && - XML_READER_TYPE_END_ELEMENT == r->node_type) - r->state = STATE_SHEET_NAME; + if (r->sheets[r->current_sheet].start_row == -1) + { + r->sheets[r->current_sheet].start_row = r->row; + } + + if (r->sheets[r->current_sheet].start_col == -1) + { + r->sheets[r->current_sheet].start_col = r->col; + } + if (! xmlTextReaderIsEmptyElement (r->xtr)) + r->state = STATE_CELL; + } + else if ( (0 == xmlStrcasecmp (name, _xml("gnm:Cells"))) && (XML_READER_TYPE_END_ELEMENT == r->node_type) ) + { + r->sheets[r->current_sheet].stop_col = r->col; + r->sheets[r->current_sheet].stop_row = r->row; + r->state = STATE_SHEET_NAME; + } break; case STATE_CELL: - if (0 == xmlStrcasecmp (name, _xml("gnm:Cell")) && - XML_READER_TYPE_END_ELEMENT == r->node_type) + if (0 == xmlStrcasecmp (name, _xml("gnm:Cell")) && XML_READER_TYPE_END_ELEMENT == r->node_type) { r->state = STATE_CELLS_START; } @@ -273,45 +423,153 @@ struct var_spec xmlChar *first_value; }; -struct casereader * -gnumeric_open_reader (struct spreadsheet_read_info *gri, struct dictionary **dict) + +void +gnumeric_destroy (struct spreadsheet *s) { - unsigned long int vstart = 0; + gnm_file_casereader_destroy (NULL, s); +} + + +static void +gnumeric_error_handler (void *ctx, const char *mesg, + UNUSED xmlParserSeverities sev, xmlTextReaderLocatorPtr loc) +{ + struct gnumeric_reader *r = ctx; + + msg (MW, _("There was a 
problem whilst reading the %s file `%s' (near line %d): `%s'"), + "Gnumeric", + r->spreadsheet.file_name, + xmlTextReaderLocatorLineNumber (loc), + mesg); +} + +static struct gnumeric_reader * +gnumeric_reopen (struct gnumeric_reader *r, const char *filename, bool show_errors) +{ int ret; - casenumber n_cases = CASENUMBER_MAX; - int i; - struct var_spec *var_spec = NULL; - int n_var_specs = 0; - struct gnumeric_reader *r = NULL; + xmlTextReaderPtr xtr; + gzFile gz; + + assert (r == NULL || filename == NULL); + + if (r && r->xtr) + xmlFreeTextReader (r->xtr); + + if (filename) + gz = gzopen (filename, "r"); + else + gz = gzopen ( r->spreadsheet.file_name, "r"); + + if (NULL == gz) + return NULL; - gzFile gz = gzopen (gri->file_name, "r"); - if ( NULL == gz) + xtr = xmlReaderForIO ((xmlInputReadCallback) gzread, + (xmlInputCloseCallback) gzclose, gz, + NULL, NULL, + show_errors ? 0 : (XML_PARSE_NOERROR | XML_PARSE_NOWARNING) ); + + if (xtr == NULL) { - msg (ME, _("Error opening `%s' for reading as a Gnumeric file: %s."), - gri->file_name, strerror (errno)); + gzclose (gz); + return NULL; + } - goto error; + if (r == NULL) + { + r = xzalloc (sizeof *r); + r->spreadsheet.n_sheets = -1; + r->spreadsheet.file_name = filename; } + + if (show_errors) + xmlTextReaderSetErrorHandler (xtr, gnumeric_error_handler, r); - r = xzalloc (sizeof *r); + r->target_sheet = NULL; + r->target_sheet_index = -1; - r->xtr = xmlReaderForIO ((xmlInputReadCallback) gzread, - (xmlInputCloseCallback) gzclose, gz, - NULL, NULL, 0); + r->row = r->col = -1; + r->state = STATE_PRE_INIT; + r->xtr = xtr; + + /* Advance to the start of the workbook. + This gives us some confidence that we are actually dealing with a gnumeric + spreadsheet. + */ + while ( (r->state != STATE_INIT ) + && 1 == (ret = xmlTextReaderRead (r->xtr))) + { + process_node (r); + } - if ( r->xtr == NULL ) - goto error; - if ( gri->cell_range ) + if ( ret != 1) { - if ( ! 
convert_cell_ref (gri->cell_range, + /* Does not seem to be a gnumeric file */ + xmlFreeTextReader (r->xtr); + free (r); + return NULL; + } + + r->spreadsheet.type = SPREADSHEET_GNUMERIC; + + if (show_errors) + { + const xmlChar *enc = xmlTextReaderConstEncoding (r->xtr); + xmlCharEncoding xce = xmlParseCharEncoding (CHAR_CAST (const char *, enc)); + + if ( XML_CHAR_ENCODING_UTF8 != xce) + { + /* I have been told that ALL gnumeric files are UTF8 encoded. If that is correct, this + can never happen. */ + msg (MW, _("The gnumeric file `%s' is encoded as %s instead of the usual UTF-8 encoding. " + "Any non-ascii characters will be incorrectly imported."), + r->spreadsheet.file_name, + enc); + } + } + + return r; +} + + +struct spreadsheet * +gnumeric_probe (const char *filename, bool report_errors) +{ + struct gnumeric_reader *r = gnumeric_reopen (NULL, filename, report_errors); + + return &r->spreadsheet; +} + + +struct casereader * +gnumeric_make_reader (struct spreadsheet *spreadsheet, + const struct spreadsheet_read_options *opts) +{ + int x = 0; + struct gnumeric_reader *r = NULL; + unsigned long int vstart = 0; + int ret; + casenumber n_cases = CASENUMBER_MAX; + int i; + struct var_spec *var_spec = NULL; + int n_var_specs = 0; + + r = (struct gnumeric_reader *) (spreadsheet); + + if (r->row != -1) + r = gnumeric_reopen (r, NULL, true); + + if ( opts->cell_range ) + { + if ( ! 
convert_cell_ref (opts->cell_range, &r->start_col, &r->start_row, &r->stop_col, &r->stop_row)) { msg (SE, _("Invalid cell range `%s'"), - gri->cell_range); + opts->cell_range); goto error; } } @@ -323,11 +581,10 @@ gnumeric_open_reader (struct spreadsheet_read_info *gri, struct dictionary **dic r->stop_row = -1; } - r->state = STATE_INIT; - r->target_sheet = BAD_CAST gri->sheet_name; - r->target_sheet_index = gri->sheet_index; + r->target_sheet = BAD_CAST opts->sheet_name; + r->target_sheet_index = opts->sheet_index; r->row = r->col = -1; - r->sheet_index = 0; + r->current_sheet = -1; /* Advance to the start of the cells for the target sheet */ while ( (r->state != STATE_CELL || r->row < r->start_row ) @@ -344,15 +601,14 @@ gnumeric_open_reader (struct spreadsheet_read_info *gri, struct dictionary **dic free (value); } - /* If a range has been given, then use that to calculate the number of cases */ - if ( gri->cell_range) + if ( opts->cell_range) { n_cases = MIN (n_cases, r->stop_row - r->start_row + 1); } - if ( gri->read_names ) + if ( opts->read_names ) { r->start_row++; n_cases --; @@ -396,7 +652,7 @@ gnumeric_open_reader (struct spreadsheet_read_info *gri, struct dictionary **dic if ( r->row < r->start_row) { - if ( gri->read_names ) + if ( opts->read_names ) { var_spec [idx].name = xstrdup (text); } @@ -406,8 +662,8 @@ gnumeric_open_reader (struct spreadsheet_read_info *gri, struct dictionary **dic var_spec [idx].first_value = xmlStrdup (value); if (-1 == var_spec [idx].width ) - var_spec [idx].width = (gri->asw == -1) ? - ROUND_UP (strlen(text), SPREADSHEET_DEFAULT_WIDTH) : gri->asw; + var_spec [idx].width = (opts->asw == -1) ? 
+ ROUND_UP (strlen(text), SPREADSHEET_DEFAULT_WIDTH) : opts->asw; } free (value); @@ -433,7 +689,7 @@ gnumeric_open_reader (struct spreadsheet_read_info *gri, struct dictionary **dic if ( enc == NULL) goto error; /* Create the dictionary and populate it */ - *dict = r->dict = dict_create (CHAR_CAST (const char *, enc)); + spreadsheet->dict = r->dict = dict_create (CHAR_CAST (const char *, enc)); } for (i = 0 ; i < n_var_specs ; ++i ) @@ -459,7 +715,7 @@ gnumeric_open_reader (struct spreadsheet_read_info *gri, struct dictionary **dic if ( n_var_specs == 0 ) { msg (MW, _("Selected sheet or range of spreadsheet `%s' is empty."), - gri->file_name); + spreadsheet->file_name); goto error; } @@ -467,13 +723,15 @@ gnumeric_open_reader (struct spreadsheet_read_info *gri, struct dictionary **dic r->first_case = case_create (r->proto); case_set_missing (r->first_case); - int x = 0; + for ( i = 0 ; i < n_var_specs ; ++i ) { + const struct variable *var; + if ( (var_spec[i].name == NULL) && (var_spec[i].first_value == NULL)) continue; - const struct variable *var = dict_get_var (r->dict, x++); + var = dict_get_var (r->dict, x++); convert_xml_string_to_value (r->first_case, var, var_spec[i].first_value); @@ -486,6 +744,7 @@ gnumeric_open_reader (struct spreadsheet_read_info *gri, struct dictionary **dic } free (var_spec); + return casereader_create_sequential (NULL, @@ -502,8 +761,8 @@ gnumeric_open_reader (struct spreadsheet_read_info *gri, struct dictionary **dic } free (var_spec); - dict_destroy (*dict); - *dict = NULL; + dict_destroy (spreadsheet->dict); + spreadsheet->dict = NULL; gnm_file_casereader_destroy (NULL, r); diff --git a/src/data/gnumeric-reader.h b/src/data/gnumeric-reader.h index fcd3385675..e0a5f5cad0 100644 --- a/src/data/gnumeric-reader.h +++ b/src/data/gnumeric-reader.h @@ -22,8 +22,17 @@ struct casereader; struct dictionary; struct spreadsheet_read_info; +struct spreadsheet_read_options; -struct casereader * gnumeric_open_reader (struct 
spreadsheet_read_info *, struct dictionary **); +struct spreadsheet *gnumeric_probe (const char *filename, bool report_errors); + +const char * gnumeric_get_sheet_name (struct spreadsheet *s, int n); +char * gnumeric_get_sheet_range (struct spreadsheet *s, int n); + +void gnumeric_destroy (struct spreadsheet *); + +struct casereader * gnumeric_make_reader (struct spreadsheet *spreadsheet, + const struct spreadsheet_read_options *opts); #endif diff --git a/src/data/ods-reader.c b/src/data/ods-reader.c index 170c005739..97f9e33441 100644 --- a/src/data/ods-reader.c +++ b/src/data/ods-reader.c @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 2011, 2012 Free Software Foundation, Inc. + Copyright (C) 2011, 2012, 2013 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -18,9 +18,11 @@ #include "libpspp/message.h" #include "libpspp/misc.h" +#include "libpspp/assertion.h" #include "data/data-in.h" +#include "gl/c-strtod.h" #include "gl/minmax.h" #include "gettext.h" @@ -33,7 +35,8 @@ #if !ODF_READ_SUPPORT struct casereader * -ods_open_reader (struct spreadsheet_read_info *gri, struct dictionary **dict) +ods_open_reader (const struct spreadsheet_read_options *opts, + struct dictionary **dict) { msg (ME, _("Support for %s files was not compiled into this installation of PSPP"), "OpenDocument"); @@ -75,6 +78,18 @@ static const struct casereader_class ods_file_casereader_class = NULL, }; +struct sheet_detail +{ + /* The name of the sheet (utf8 encoding) */ + char *name; + + int start_col; + int stop_col; + int start_row; + int stop_row; +}; + + enum reader_state { STATE_INIT = 0, /* Initial state */ @@ -87,23 +102,31 @@ enum reader_state struct ods_reader { + struct spreadsheet spreadsheet; + struct zip_reader *zreader; xmlTextReaderPtr xtr; enum reader_state state; - bool sheet_found; int row; int col; int node_type; - int 
sheet_index; + int current_sheet; + xmlChar *current_sheet_name; - const xmlChar *target_sheet; + const xmlChar *target_sheet_name; int target_sheet_index; + int start_row; int start_col; int stop_row; int stop_col; + int col_span; + + struct sheet_detail *sheets; + int n_allocated_sheets; + struct caseproto *proto; struct dictionary *dict; struct ccase *first_case; @@ -111,20 +134,89 @@ struct ods_reader bool read_names; struct string ods_errs; - int span; }; + +static bool +reading_target_sheet (const struct ods_reader *r) +{ + if (r->target_sheet_name != NULL) + { + if ( 0 == xmlStrcmp (r->target_sheet_name, r->current_sheet_name)) + return true; + } + + if (r->target_sheet_index == r->current_sheet + 1) + return true; + + return false; +} + + static void process_node (struct ods_reader *r); + +const char * +ods_get_sheet_name (struct spreadsheet *s, int n) +{ + struct ods_reader *or = (struct ods_reader *) s; + + assert (n < s->n_sheets); + + while ( + (or->n_allocated_sheets <= n) + || or->state != STATE_SPREADSHEET + ) + { + int ret = xmlTextReaderRead (or->xtr); + if ( ret != 1) + break; + + process_node (or); + } + + return or->sheets[n].name; +} + +char * +ods_get_sheet_range (struct spreadsheet *s, int n) +{ + struct ods_reader *or = (struct ods_reader *) s; + + assert (n < s->n_sheets); + + while ( + (or->n_allocated_sheets <= n) + || (or->sheets[n].stop_row == -1) + || or->state != STATE_SPREADSHEET + ) + { + int ret = xmlTextReaderRead (or->xtr); + if ( ret != 1) + break; + + process_node (or); + } + + return create_cell_ref ( + or->sheets[n].start_col, + or->sheets[n].start_row, + or->sheets[n].stop_col, + or->sheets[n].stop_row); +} + + static void ods_file_casereader_destroy (struct casereader *reader UNUSED, void *r_) { + int i; struct ods_reader *r = r_; if ( r == NULL) return ; if (r->xtr) xmlFreeTextReader (r->xtr); + r->xtr = NULL; if ( ! 
ds_is_empty (&r->ods_errs)) msg (ME, "%s", ds_cstr (&r->ods_errs)); @@ -136,6 +228,15 @@ ods_file_casereader_destroy (struct casereader *reader UNUSED, void *r_) caseproto_unref (r->proto); + xmlFree (r->current_sheet_name); + + for (i = 0; i < r->n_allocated_sheets; ++i) + { + xmlFree (r->sheets[i].name); + } + + free (r->sheets); + free (r); } @@ -146,111 +247,141 @@ process_node (struct ods_reader *r) if (name == NULL) name = xmlStrdup (_xml ("--")); + r->node_type = xmlTextReaderNodeType (r->xtr); - switch ( r->state) + switch (r->state) { case STATE_INIT: if (0 == xmlStrcasecmp (name, _xml("office:spreadsheet")) && XML_READER_TYPE_ELEMENT == r->node_type) { r->state = STATE_SPREADSHEET; + r->current_sheet = -1; + r->current_sheet_name = NULL; } break; case STATE_SPREADSHEET: - if (0 == xmlStrcasecmp (name, _xml("table:table"))) + if (0 == xmlStrcasecmp (name, _xml("table:table")) + && + (XML_READER_TYPE_ELEMENT == r->node_type)) { - if (XML_READER_TYPE_ELEMENT == r->node_type) + xmlFree (r->current_sheet_name); + r->current_sheet_name = xmlTextReaderGetAttribute (r->xtr, _xml ("table:name")); + + ++r->current_sheet; + + if (r->current_sheet >= r->n_allocated_sheets) { - r->col = -1; - r->row = -1; - ++r->sheet_index; - if ( r->target_sheet != NULL) - { - xmlChar *value = xmlTextReaderGetAttribute (r->xtr, _xml ("table:name")); - if ( 0 == xmlStrcmp (value, r->target_sheet)) - { - r->sheet_found = true; - r->state = STATE_TABLE; - } - free (value); - } - else if (r->target_sheet_index == r->sheet_index) - { - r->sheet_found = true; - r->state = STATE_TABLE; - } - else if ( r->target_sheet_index == -1) - r->state = STATE_TABLE; + assert (r->current_sheet == r->n_allocated_sheets); + r->sheets = xrealloc (r->sheets, sizeof (*r->sheets) * ++r->n_allocated_sheets); + r->sheets[r->n_allocated_sheets - 1].start_col = -1; + r->sheets[r->n_allocated_sheets - 1].stop_col = -1; + r->sheets[r->n_allocated_sheets - 1].start_row = -1; + r->sheets[r->n_allocated_sheets - 
1].stop_row = -1; + r->sheets[r->n_allocated_sheets - 1].name = CHAR_CAST (char *, xmlStrdup (r->current_sheet_name)); } + + r->col = 0; + r->row = 0; + + r->state = STATE_TABLE; } - else if (XML_READER_TYPE_END_ELEMENT == r->node_type - && r->sheet_found) + else if (0 == xmlStrcasecmp (name, _xml("office:spreadsheet")) && + XML_READER_TYPE_ELEMENT == r->node_type) { r->state = STATE_INIT; } - break; + break; case STATE_TABLE: - if (0 == xmlStrcasecmp (name, _xml("table:table-row")) ) + if (0 == xmlStrcasecmp (name, _xml("table:table-row")) && + (XML_READER_TYPE_ELEMENT == r->node_type)) { - if ( XML_READER_TYPE_ELEMENT == r->node_type) - { - if (! xmlTextReaderIsEmptyElement (r->xtr)) - { - r->state = STATE_ROW; - } - r->row++; - r->span = 1; - } + xmlChar *value = + xmlTextReaderGetAttribute (r->xtr, + _xml ("table:number-rows-repeated")); + + int row_span = value ? _xmlchar_to_int (value) : 1; + + r->row += row_span; + r->col = 0; + + if (! xmlTextReaderIsEmptyElement (r->xtr)) + r->state = STATE_ROW; + + xmlFree (value); } - else if (XML_READER_TYPE_END_ELEMENT == r->node_type) + else if (0 == xmlStrcasecmp (name, _xml("table:table")) && + (XML_READER_TYPE_END_ELEMENT == r->node_type)) { r->state = STATE_SPREADSHEET; } break; case STATE_ROW: - if (0 == xmlStrcasecmp (name, _xml ("table:table-cell"))) + if ( (0 == xmlStrcasecmp (name, _xml ("table:table-cell"))) + && + (XML_READER_TYPE_ELEMENT == r->node_type)) { - if ( XML_READER_TYPE_ELEMENT == r->node_type) - { - xmlChar *value = - xmlTextReaderGetAttribute (r->xtr, - _xml ("table:number-columns-repeated")); - r->col += r->span; - r->span = value ? _xmlchar_to_int (value) : 1; - free (value); - if (! xmlTextReaderIsEmptyElement (r->xtr)) - { - r->state = STATE_CELL; - } - } + xmlChar *value = + xmlTextReaderGetAttribute (r->xtr, + _xml ("table:number-columns-repeated")); + + r->col_span = value ? _xmlchar_to_int (value) : 1; + r->col += r->col_span; + + if (! 
xmlTextReaderIsEmptyElement (r->xtr)) + r->state = STATE_CELL; + + xmlFree (value); } - else if (XML_READER_TYPE_END_ELEMENT == r->node_type) + else if ( (0 == xmlStrcasecmp (name, _xml ("table:table-row"))) + && + (XML_READER_TYPE_END_ELEMENT == r->node_type)) { r->state = STATE_TABLE; - r->col = -1; - /* Set the span back to the default */ - r->span = 1; } break; case STATE_CELL: - if (0 == xmlStrcasecmp (name, _xml("text:p"))) + if ( (0 == xmlStrcasecmp (name, _xml("text:p"))) + && + ( XML_READER_TYPE_ELEMENT == r->node_type)) { - if ( XML_READER_TYPE_ELEMENT == r->node_type) - { - r->state = STATE_CELL_CONTENT; - } + if (! xmlTextReaderIsEmptyElement (r->xtr)) + r->state = STATE_CELL_CONTENT; } - else if (XML_READER_TYPE_END_ELEMENT == r->node_type) + else if + ( (0 == xmlStrcasecmp (name, _xml("table:table-cell"))) + && + (XML_READER_TYPE_END_ELEMENT == r->node_type) + ) { r->state = STATE_ROW; } break; case STATE_CELL_CONTENT: - if (XML_READER_TYPE_TEXT != r->node_type) + assert (r->current_sheet >= 0); + assert (r->current_sheet < r->n_allocated_sheets); + + if (r->sheets[r->current_sheet].start_row == -1) + r->sheets[r->current_sheet].start_row = r->row - 1; + + if ( + (r->sheets[r->current_sheet].start_col == -1) + || + (r->sheets[r->current_sheet].start_col >= r->col - 1) + ) + r->sheets[r->current_sheet].start_col = r->col - 1; + + r->sheets[r->current_sheet].stop_row = r->row - 1; + + if ( r->sheets[r->current_sheet].stop_col < r->col - 1) + r->sheets[r->current_sheet].stop_col = r->col - 1; + + if (XML_READER_TYPE_END_ELEMENT == r->node_type) r->state = STATE_CELL; break; default: + NOT_REACHED (); break; }; @@ -315,81 +446,195 @@ convert_xml_to_value (struct ccase *c, const struct variable *var, value_copy_str_rpad (v, var_get_width (var), xmv->text, ' '); else { - const char *text ; const struct fmt_spec *fmt = var_get_write_format (var); enum fmt_category fc = fmt_get_category (fmt->type); assert ( fc != FMT_CAT_STRING); - text = - xmv->value ? 
CHAR_CAST (const char *, xmv->value) : CHAR_CAST (const char *, xmv->text); + if ( 0 == xmlStrcmp (xmv->type, _xml("float"))) + { + v->f = c_strtod (CHAR_CAST (const char *, xmv->value), NULL); + } + else + { + const char *text = xmv->value ? + CHAR_CAST (const char *, xmv->value) : CHAR_CAST (const char *, xmv->text); + - free (data_in (ss_cstr (text), "UTF-8", - fmt->type, - v, - var_get_width (var), - "UTF-8")); + free (data_in (ss_cstr (text), "UTF-8", + fmt->type, + v, + var_get_width (var), + "UTF-8")); + } } } -struct casereader * -ods_open_reader (struct spreadsheet_read_info *gri, struct dictionary **dict) +/* Try to find out how many sheets there are in the "workbook" */ +static int +get_sheet_count (struct zip_reader *zreader) { - int ret = 0; - xmlChar *type = NULL; - unsigned long int vstart = 0; - casenumber n_cases = CASENUMBER_MAX; - int i; - struct var_spec *var_spec = NULL; - int n_var_specs = 0; + xmlTextReaderPtr mxtr; + struct zip_member *meta = NULL; + meta = zip_member_open (zreader, "meta.xml"); - struct ods_reader *r = xzalloc (sizeof *r); - struct zip_member *content = NULL; - struct zip_reader *zreader ; - xmlChar *val_string = NULL; + if ( meta == NULL) + return -1; - r->read_names = gri->read_names; - ds_init_empty (&r->ods_errs); + mxtr = xmlReaderForIO ((xmlInputReadCallback) zip_member_read, + (xmlInputCloseCallback) zip_member_finish, + meta, NULL, NULL, 0); - zreader = zip_reader_create (gri->file_name, &r->ods_errs); - - if ( NULL == zreader) + while (1 == xmlTextReaderRead (mxtr)) { - msg (ME, _("Error opening `%s' for reading as a OpenDocument spreadsheet file: %s."), - gri->file_name, ds_cstr (&r->ods_errs)); + xmlChar *name = xmlTextReaderName (mxtr); + if ( 0 == xmlStrcmp (name, _xml("meta:document-statistic"))) + { + xmlChar *attr = xmlTextReaderGetAttribute (mxtr, _xml ("meta:table-count")); - goto error; + if ( attr != NULL) + { + int s = _xmlchar_to_int (attr); + xmlFreeTextReader (mxtr); + xmlFree (name); + xmlFree 
(attr); + return s; + } + xmlFree (attr); + } + xmlFree (name); } - content = zip_member_open (zreader, "content.xml"); - if ( NULL == content) - { - msg (ME, _("Could not extract OpenDocument spreadsheet from file `%s': %s."), - gri->file_name, ds_cstr (&r->ods_errs)); + xmlFreeTextReader (mxtr); + return -1; +} - goto error; - } +static void +ods_error_handler (void *ctx, const char *mesg, + UNUSED xmlParserSeverities sev, xmlTextReaderLocatorPtr loc) +{ + struct ods_reader *r = ctx; + + msg (MW, _("There was a problem whilst reading the %s file `%s' (near line %d): `%s'"), + "ODF", + r->spreadsheet.file_name, + xmlTextReaderLocatorLineNumber (loc), + mesg); +} + + +static bool +init_reader (struct ods_reader *r, bool report_errors) +{ + struct zip_member *content = zip_member_open (r->zreader, "content.xml"); + xmlTextReaderPtr xtr; + + if ( content == NULL) + return false; zip_member_ref (content); - r->xtr = xmlReaderForIO ((xmlInputReadCallback) zip_member_read, - (xmlInputCloseCallback) zip_member_finish, - content, NULL, NULL, XML_PARSE_RECOVER); + if (r->xtr) + xmlFreeTextReader (r->xtr); + + xtr = xmlReaderForIO ((xmlInputReadCallback) zip_member_read, + (xmlInputCloseCallback) zip_member_finish, + content, NULL, NULL, + report_errors ? 
0 : (XML_PARSE_NOERROR | XML_PARSE_NOWARNING) ); + + if ( xtr == NULL) + return false; + + r->xtr = xtr; + r->spreadsheet.type = SPREADSHEET_ODS; + r->row = 0; + r->col = 0; + r->current_sheet = 0; + r->state = STATE_INIT; + + if (report_errors) + xmlTextReaderSetErrorHandler (xtr, ods_error_handler, r); + + return true; +} + + +struct spreadsheet * +ods_probe (const char *filename, bool report_errors) +{ + struct ods_reader *r; + struct string errs = DS_EMPTY_INITIALIZER; + int sheet_count; + struct zip_reader *zr = zip_reader_create (filename, &errs); + + if (zr == NULL) + { + if (report_errors) + { + msg (ME, _("Cannot open %s as a OpenDocument file: %s"), + filename, ds_cstr (&errs)); + } + return NULL; + } + + sheet_count = get_sheet_count (zr); - if ( r->xtr == NULL) + r = xzalloc (sizeof *r); + r->zreader = zr; + + if (! init_reader (r, report_errors)) { goto error; } - if ( gri->cell_range ) + r->spreadsheet.n_sheets = sheet_count; + r->n_allocated_sheets = 0; + r->sheets = NULL; + + ds_destroy (&errs); + + r->spreadsheet.file_name = filename; + return &r->spreadsheet; + + error: + zip_reader_destroy (r->zreader); + ds_destroy (&errs); + free (r); + return NULL; +} + +struct casereader * +ods_make_reader (struct spreadsheet *spreadsheet, + const struct spreadsheet_read_options *opts) +{ + int ret = 0; + xmlChar *type = NULL; + unsigned long int vstart = 0; + casenumber n_cases = CASENUMBER_MAX; + int i; + struct var_spec *var_spec = NULL; + int n_var_specs = 0; + + struct ods_reader *r = (struct ods_reader *) spreadsheet; + xmlChar *val_string = NULL; + + assert (r); + r->read_names = opts->read_names; + ds_init_empty (&r->ods_errs); + + + if ( !init_reader (r, true)) + goto error; + + if ( opts->cell_range ) { - if ( ! convert_cell_ref (gri->cell_range, + if ( ! 
convert_cell_ref (opts->cell_range, &r->start_col, &r->start_row, &r->stop_col, &r->stop_row)) { msg (SE, _("Invalid cell range `%s'"), - gri->cell_range); + opts->cell_range); goto error; } } @@ -402,24 +647,23 @@ ods_open_reader (struct spreadsheet_read_info *gri, struct dictionary **dict) } r->state = STATE_INIT; - r->target_sheet = BAD_CAST gri->sheet_name; - r->target_sheet_index = gri->sheet_index; - r->row = r->col = -1; - r->sheet_index = 0; + r->target_sheet_name = BAD_CAST opts->sheet_name; + r->target_sheet_index = opts->sheet_index; + r->row = r->col = 0; - /* If CELLRANGE was given, then we know how many variables should be read */ - if ( r->stop_col != -1 ) - { - assert (var_spec == NULL); - n_var_specs = r->stop_col - r->start_col + 1; - var_spec = xrealloc (var_spec, sizeof (*var_spec) * n_var_specs); - memset (var_spec, '\0', sizeof (*var_spec) * n_var_specs); - } +#if 0 + printf ("%s:%d %d,%d %d,%d\n", __FILE__, __LINE__, + r->start_col, + r->start_row, + r->stop_col, + r->stop_row); +#endif /* Advance to the start of the cells for the target sheet */ - while ( (r->row < r->start_row )) + while ( ! 
reading_target_sheet (r) + || r->state != STATE_ROW || r->row <= r->start_row ) { if (1 != (ret = xmlTextReaderRead (r->xtr))) break; @@ -430,41 +674,44 @@ ods_open_reader (struct spreadsheet_read_info *gri, struct dictionary **dict) if (ret < 1) { msg (MW, _("Selected sheet or range of spreadsheet `%s' is empty."), - gri->file_name); + spreadsheet->file_name); goto error; } - if ( gri->read_names) + if ( opts->read_names) { while (1 == (ret = xmlTextReaderRead (r->xtr))) { int idx; + process_node (r); - if ( r->row > r->start_row) - break; - if (r->col == -1 && r->row == r->start_row) + /* If the row is finished then stop for now */ + if (r->state == STATE_TABLE && r->row > r->start_row) break; - if ( r->col < r->start_col) + idx = r->col - r->start_col -1 ; + + if ( idx < 0) continue; - idx = r->col - r->start_col; + if (r->stop_col != -1 && idx > r->stop_col - r->start_col) + continue; if (r->state == STATE_CELL_CONTENT && XML_READER_TYPE_TEXT == r->node_type) { xmlChar *value = xmlTextReaderValue (r->xtr); + if ( idx >= n_var_specs) { - var_spec = xrealloc (var_spec, sizeof (*var_spec) * (idx + 1)); /* xrealloc (unlike realloc) doesn't initialise its memory to 0 */ memset (var_spec + n_var_specs, 0, - (n_var_specs - idx + 1) * sizeof (*var_spec)); + (idx - n_var_specs + 1) * sizeof (*var_spec)); n_var_specs = idx + 1; } var_spec[idx].firstval.text = 0; @@ -472,8 +719,8 @@ ods_open_reader (struct spreadsheet_read_info *gri, struct dictionary **dict) var_spec[idx].firstval.type = 0; var_spec [idx].name = strdup (CHAR_CAST (const char *, value)); - free (value); - value = NULL; + + xmlFree (value); } } } @@ -483,16 +730,21 @@ ods_open_reader (struct spreadsheet_read_info *gri, struct dictionary **dict) { int idx; process_node (r); - if ( r->row >= r->start_row + 1 + gri->read_names) + + if ( ! 
reading_target_sheet (r) ) break; - if ( r->col < r->start_col) - continue; + /* If the row is finished then stop for now */ + if (r->state == STATE_TABLE && + r->row > r->start_row + (opts->read_names ? 1 : 0)) + break; - if ( r->col - r->start_col + 1 > n_var_specs) + idx = r->col - r->start_col - 1; + if (idx < 0) continue; - idx = r->col - r->start_col; + if (r->stop_col != -1 && idx > r->stop_col - r->start_col) + continue; if ( r->state == STATE_CELL && XML_READER_TYPE_ELEMENT == r->node_type) @@ -504,24 +756,45 @@ ods_open_reader (struct spreadsheet_read_info *gri, struct dictionary **dict) if ( r->state == STATE_CELL_CONTENT && XML_READER_TYPE_TEXT == r->node_type) { +#if 0 + printf ("%s:%d Idx %d n_var_specs %d\n", __FILE__, __LINE__, + idx, n_var_specs); + + printf ("%s:%d Idx %d r_col %d\n", __FILE__, __LINE__, + idx, r->col); +#endif + + if (idx >= n_var_specs) + { + var_spec = xrealloc (var_spec, sizeof (*var_spec) * (idx + 1)); + memset (var_spec + n_var_specs, + 0, + (idx - n_var_specs + 1) * sizeof (*var_spec)); + + var_spec [idx].name = NULL; + n_var_specs = idx + 1; + } + var_spec [idx].firstval.type = type; var_spec [idx].firstval.text = xmlTextReaderValue (r->xtr); var_spec [idx].firstval.value = val_string; + val_string = NULL; type = NULL; } } + /* Create the dictionary and populate it */ - *dict = r->dict = dict_create ( + r->spreadsheet.dict = r->dict = dict_create ( CHAR_CAST (const char *, xmlTextReaderConstEncoding (r->xtr))); - for (i = 0 ; i < n_var_specs ; ++i ) + for (i = 0; i < n_var_specs ; ++i ) { struct fmt_spec fmt; struct variable *var = NULL; char *name = dict_make_unique_var_name (r->dict, var_spec[i].name, &vstart); - int width = xmv_to_width (&var_spec[i].firstval, gri->asw); + int width = xmv_to_width (&var_spec[i].firstval, opts->asw); dict_create_var (r->dict, name, width); free (name); @@ -545,7 +818,7 @@ ods_open_reader (struct spreadsheet_read_info *gri, struct dictionary **dict) if ( n_var_specs == 0 ) { msg (MW, 
_("Selected sheet or range of spreadsheet `%s' is empty."), - gri->file_name); + spreadsheet->file_name); goto error; } @@ -553,14 +826,23 @@ ods_open_reader (struct spreadsheet_read_info *gri, struct dictionary **dict) r->first_case = case_create (r->proto); case_set_missing (r->first_case); - for ( i = 0 ; i < n_var_specs ; ++i ) + for (i = 0 ; i < n_var_specs; ++i) { const struct variable *var = dict_get_var (r->dict, i); convert_xml_to_value (r->first_case, var, &var_spec[i].firstval); } - zip_reader_destroy (zreader); + /* Read in the first row of data */ + while (1 == xmlTextReaderRead (r->xtr)) + { + process_node (r); + + if (r->state == STATE_ROW) + break; + } + + // zip_reader_destroy (zreader); for ( i = 0 ; i < n_var_specs ; ++i ) { @@ -572,6 +854,7 @@ ods_open_reader (struct spreadsheet_read_info *gri, struct dictionary **dict) free (var_spec); + return casereader_create_sequential (NULL, r->proto, @@ -580,7 +863,7 @@ ods_open_reader (struct spreadsheet_read_info *gri, struct dictionary **dict) error: - zip_reader_destroy (zreader); + //zip_reader_destroy (zreader); for ( i = 0 ; i < n_var_specs ; ++i ) { @@ -592,7 +875,8 @@ ods_open_reader (struct spreadsheet_read_info *gri, struct dictionary **dict) free (var_spec); - dict_destroy (r->dict); + dict_destroy (r->spreadsheet.dict); + r->spreadsheet.dict = NULL; ods_file_casereader_destroy (NULL, r); @@ -607,83 +891,82 @@ ods_file_casereader_read (struct casereader *reader UNUSED, void *r_) { struct ccase *c = NULL; xmlChar *val_string = NULL; + xmlChar *type = NULL; struct ods_reader *r = r_; - int current_row = r->row; - - if ( r->row == -1) - return NULL; - if ( !r->used_first_case ) + if (!r->used_first_case) { r->used_first_case = true; return r->first_case; } - if ( r->state > STATE_INIT) + /* Advance to the start of a row. 
(If there is one) */ + while (r->state != STATE_ROW + && 1 == xmlTextReaderRead (r->xtr) + ) { - c = case_create (r->proto); - case_set_missing (c); + process_node (r); } + + if ( ! reading_target_sheet (r) + || r->state < STATE_TABLE + || (r->stop_row != -1 && r->row > r->stop_row + 1) + ) + { + return NULL; + } + + c = case_create (r->proto); + case_set_missing (c); + while (1 == xmlTextReaderRead (r->xtr)) { process_node (r); - if ( r->row > current_row) - { - break; - } - if ( r->col < r->start_col || (r->stop_col != -1 && r->col > r->stop_col)) - { - continue; - } - if ( r->col - r->start_col >= caseproto_get_n_widths (r->proto)) - { - continue; - } - if ( r->stop_row != -1 && r->row > r->stop_row) - { - continue; - } - if ( r->state == STATE_CELL && - r->node_type == XML_READER_TYPE_ELEMENT ) + + if ( r->stop_row != -1 && r->row > r->stop_row + 1) + break; + + if (r->state == STATE_CELL && + r->node_type == XML_READER_TYPE_ELEMENT) { + type = xmlTextReaderGetAttribute (r->xtr, _xml ("office:value-type")); val_string = xmlTextReaderGetAttribute (r->xtr, _xml ("office:value")); } - if ( r->state == STATE_CELL_CONTENT && r->node_type == XML_READER_TYPE_TEXT ) + if (r->state == STATE_CELL_CONTENT && + r->node_type == XML_READER_TYPE_TEXT) { int col; struct xml_value *xmv = xzalloc (sizeof *xmv); xmv->text = xmlTextReaderValue (r->xtr); - xmv->value = val_string; + xmv->value = val_string; + xmv->type = type; val_string = NULL; - for (col = 0; col < r->span ; ++col) + for (col = 0; col < r->col_span; ++col) { - const int idx = r->col + col - r->start_col; - - const struct variable *var = dict_get_var (r->dict, idx); - + const struct variable *var; + const int idx = r->col - col - r->start_col - 1; + if (idx < 0) + continue; + if (r->stop_col != -1 && idx > r->stop_col - r->start_col ) + break; + + var = dict_get_var (r->dict, idx); convert_xml_to_value (c, var, xmv); } - free (xmv->text); - free (xmv->value); + + xmlFree (xmv->text); + xmlFree (xmv->value); + 
xmlFree (xmv->type); free (xmv); } - - if ( r->state < STATE_TABLE) + if ( r->state <= STATE_TABLE) break; } - if (NULL == c || (r->stop_row != -1 && r->row > r->stop_row + 1)) - { - case_unref (c); - return NULL; - } - else - { - return c; - } + return c; } #endif diff --git a/src/data/ods-reader.h b/src/data/ods-reader.h index 79b7169833..3d939a8048 100644 --- a/src/data/ods-reader.h +++ b/src/data/ods-reader.h @@ -19,9 +19,17 @@ struct casereader; struct dictionary; -struct spreadsheet_read_info; -struct casereader * ods_open_reader (struct spreadsheet_read_info *, struct dictionary **); +struct spreadsheet_read_options; +struct spreadsheet; + +const char * ods_get_sheet_name (struct spreadsheet *s, int n); +char * ods_get_sheet_range (struct spreadsheet *s, int n); + +struct spreadsheet *ods_probe (const char *filename, bool report_errors); + +struct casereader * ods_make_reader (struct spreadsheet *spreadsheet, + const struct spreadsheet_read_options *opts); #endif diff --git a/src/data/spreadsheet-reader.c b/src/data/spreadsheet-reader.c index 11e8cf593a..4a85cb22dd 100644 --- a/src/data/spreadsheet-reader.c +++ b/src/data/spreadsheet-reader.c @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 2007, 2009, 2010, 2011 Free Software Foundation, Inc. + Copyright (C) 2007, 2009, 2010, 2011, 2013 Free Software Foundation, Inc. 
This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -18,9 +18,72 @@ #include "spreadsheet-reader.h" +#include "gnumeric-reader.h" +#include "ods-reader.h" + #include #include #include +#include +#include +#include + +void +spreadsheet_close (UNUSED struct spreadsheet *spreadsheet) +{ +} + + +struct casereader * +spreadsheet_make_reader (struct spreadsheet *s, const struct spreadsheet_read_options *opts) +{ + if ( s->type == SPREADSHEET_ODS) + return ods_make_reader (s, opts); + if ( s->type == SPREADSHEET_GNUMERIC) + return gnumeric_make_reader (s, opts); + + return NULL; +} + +const char * +spreadsheet_get_sheet_name (struct spreadsheet *s, int n) +{ + if ( s->type == SPREADSHEET_ODS) + return ods_get_sheet_name (s, n); + + if ( s->type == SPREADSHEET_GNUMERIC) + return gnumeric_get_sheet_name (s, n); + + return NULL; +} + +char * +spreadsheet_get_sheet_range (struct spreadsheet *s, int n) +{ + if ( s->type == SPREADSHEET_ODS) + return ods_get_sheet_range (s, n); + + if ( s->type == SPREADSHEET_GNUMERIC) + return gnumeric_get_sheet_range (s, n); + + return NULL; +} + + +#define RADIX 26 + +static void +reverse (char *s, int len) +{ + int i; + for (i = 0; i < len / 2; ++i) + { + char tmp = s[len - i - 1]; + s[len - i -1] = s[i]; + s[i] = tmp; + } +} + /* Convert a string, which is an integer encoded in base26 IE, A=0, B=1, ... Z=25 to the integer it represents. @@ -30,31 +93,91 @@ ABC = 2 + 2*26 + 1*26^2 .... 
*/ int -pseudo_base26 (const char *str) +ps26_to_int (const char *str) { int i; int multiplier = 1; int result = 0; int len = strlen (str); - for ( i = len - 1 ; i >= 0; --i) + for (i = len - 1 ; i >= 0; --i) { int mantissa = (str[i] - 'A'); - if ( mantissa < 0 || mantissa > 25 ) - return -1; + assert (mantissa >= 0); + assert (mantissa < RADIX); - if ( i != len - 1) + if (i != len - 1) mantissa++; result += mantissa * multiplier; - - multiplier *= 26; + multiplier *= RADIX; } return result; } +char * +int_to_ps26 (int i) +{ + char *ret = NULL; + + int lower = 0; + long long int base = RADIX; + int exp = 1; + + assert (i >= 0); + + while (i > lower + base - 1) + { + lower += base; + base *= RADIX; + assert (base > 0); + exp++; + } + + i -= lower; + i += base; + + ret = xmalloc (exp + 1); + + exp = 0; + do + { + ret[exp++] = (i % RADIX) + 'A'; + i /= RADIX; + } + while (i > 1); + + ret[exp]='\0'; + + reverse (ret, exp); + return ret; +} + +char * +create_cell_ref (int col0, int row0, int coli, int rowi) +{ + char *cs0 ; + char *csi ; + char *s ; + + if ( col0 < 0) return NULL; + if ( rowi < 0) return NULL; + if ( coli < 0) return NULL; + if ( row0 < 0) return NULL; + + cs0 = int_to_ps26 (col0); + csi = int_to_ps26 (coli); + s = c_xasprintf ("%s%d:%s%d", + cs0, row0 + 1, + csi, rowi + 1); + free (cs0); + free (csi); + + return s; +} + /* Convert a cell reference in the form "A1:B2", to integers. A1 means column zero, row zero. 
@@ -78,9 +201,9 @@ convert_cell_ref (const char *ref, return false; str_uppercase (startcol); - *col0 = pseudo_base26 (startcol); + *col0 = ps26_to_int (startcol); str_uppercase (stopcol); - *coli = pseudo_base26 (stopcol); + *coli = ps26_to_int (stopcol); *row0 = startrow - 1; *rowi = stoprow - 1 ; diff --git a/src/data/spreadsheet-reader.h b/src/data/spreadsheet-reader.h index 6edd705067..5cfd81d7e1 100644 --- a/src/data/spreadsheet-reader.h +++ b/src/data/spreadsheet-reader.h @@ -19,20 +19,26 @@ #include +struct casereeader; + /* Default width of string variables. */ #define SPREADSHEET_DEFAULT_WIDTH 8 -struct spreadsheet_read_info +/* These elements are read/write. + They may be passed in NULL (for pointers) or negative for integers, in which + case they will be filled in by the function. +*/ +struct spreadsheet_read_options { - char *sheet_name ; /* In UTF-8. */ - char *file_name ; /* In filename encoding. */ - char *cell_range ; /* In UTF-8. */ - int sheet_index ; - bool read_names ; - int asw ; + const char *sheet_name ; /* The name of the sheet to open (in UTF-8) */ + int sheet_index ; /* The index of the sheet to open (only used if sheet_name is NULL) */ + const char *cell_range ; /* The cell range (in UTF-8) */ + bool read_names ; /* True if the first row is to be used as the names of the variables */ + int asw ; /* The width of string variables in the created dictionary */ }; -int pseudo_base26 (const char *str); +int ps26_to_int (const char *str); +char * int_to_ps26 (int); bool convert_cell_ref (const char *ref, int *col0, int *row0, @@ -43,5 +49,42 @@ bool convert_cell_ref (const char *ref, #define _xmlchar_to_int(X) (atoi(CHAR_CAST (const char *, X))) +enum spreadsheet_type + { + SPREADSHEET_NONE, + SPREADSHEET_GNUMERIC, + SPREADSHEET_ODS + }; + + +struct spreadsheet +{ + const char *file_name; + + enum spreadsheet_type type; + + /* The total number of sheets in the "workbook" */ + int n_sheets; + + /* The dictionary */ + struct dictionary *dict; +};
+ + +struct casereader * spreadsheet_make_reader (struct spreadsheet *, const struct spreadsheet_read_options *); + +const char * spreadsheet_get_sheet_name (struct spreadsheet *s, int n); +char * spreadsheet_get_sheet_range (struct spreadsheet *s, int n); + + +char *create_cell_ref (int col0, int row0, int coli, int rowi); + +void spreadsheet_close (struct spreadsheet *); + + + + + +#define SPREADSHEET_CAST(X) ((struct spreadsheet *)(X)) #endif diff --git a/src/language/data-io/get-data.c b/src/language/data-io/get-data.c index ac2944caee..2879e34a4b 100644 --- a/src/language/data-io/get-data.c +++ b/src/language/data-io/get-data.c @@ -1,5 +1,6 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 2007, 2008, 2009, 2010, 2011, 2012, 2013 Free Software Foundation, Inc. + Copyright (C) 2007, 2008, 2009, 2010, 2011, 2012, + 2013 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -45,8 +46,10 @@ #define _(msgid) gettext (msgid) #define N_(msgid) (msgid) -static struct spreadsheet_read_info *parse_spreadsheet (struct lexer *lexer); -static void destroy_spreadsheet_read_info (struct spreadsheet_read_info *); +static bool parse_spreadsheet (struct lexer *lexer, char **filename, + struct spreadsheet_read_options *opts); + +static void destroy_spreadsheet_read_info (struct spreadsheet_read_options *); static int parse_get_txt (struct lexer *lexer, struct dataset *); static int parse_get_psql (struct lexer *lexer, struct dataset *); @@ -76,30 +79,48 @@ cmd_get_data (struct lexer *lexer, struct dataset *ds) else if (lex_match_id (lexer, "GNM") || lex_match_id (lexer, "ODS")) { + char *filename = NULL; struct casereader *reader = NULL; struct dictionary *dict = NULL; - struct spreadsheet_read_info *sri = parse_spreadsheet (lexer); - if (NULL == sri) + struct spreadsheet_read_options opts; + if (!parse_spreadsheet (lexer, &filename, &opts)) goto 
error; if ( 0 == strncasecmp (tok, "GNM", 3)) - reader = gnumeric_open_reader (sri, &dict); + { + struct spreadsheet *spreadsheet = gnumeric_probe (filename, true); + if (spreadsheet == NULL) + goto error; + reader = gnumeric_make_reader (spreadsheet, &opts); + dict = spreadsheet->dict; + } else if (0 == strncasecmp (tok, "ODS", 3)) - reader = ods_open_reader (sri, &dict); + { + struct spreadsheet *spreadsheet = ods_probe (filename, true); + if (spreadsheet == NULL) + goto error; + reader = ods_make_reader (spreadsheet, &opts); + dict = spreadsheet->dict; + } + + free (filename); if (reader) { dataset_set_dict (ds, dict); dataset_set_source (ds, reader); - destroy_spreadsheet_read_info (sri); free (tok); + destroy_spreadsheet_read_info (&opts); return CMD_SUCCESS; } - destroy_spreadsheet_read_info (sri); + destroy_spreadsheet_read_info (&opts); } else msg (SE, _("Unsupported TYPE %s."), tok); + + + error: free (tok); return CMD_FAILURE; @@ -181,13 +202,15 @@ parse_get_psql (struct lexer *lexer, struct dataset *ds) return CMD_FAILURE; } -static struct spreadsheet_read_info * -parse_spreadsheet (struct lexer *lexer) +static bool +parse_spreadsheet (struct lexer *lexer, char **filename, + struct spreadsheet_read_options *opts) { - struct spreadsheet_read_info *sri = xzalloc (sizeof *sri); - sri->sheet_index = 1; - sri->read_names = true; - sri->asw = -1; + opts->sheet_index = 1; + opts->sheet_name = NULL; + opts->cell_range = NULL; + opts->read_names = true; + opts->asw = -1; lex_force_match (lexer, T_SLASH); @@ -199,7 +222,7 @@ parse_spreadsheet (struct lexer *lexer) if (!lex_force_string (lexer)) goto error; - sri->file_name = utf8_to_filename (lex_tokcstr (lexer)); + *filename = utf8_to_filename (lex_tokcstr (lexer)); lex_get (lexer); @@ -208,7 +231,7 @@ parse_spreadsheet (struct lexer *lexer) if ( lex_match_id (lexer, "ASSUMEDSTRWIDTH")) { lex_match (lexer, T_EQUALS); - sri->asw = lex_integer (lexer); + opts->asw = lex_integer (lexer); lex_get (lexer); } else if 
(lex_match_id (lexer, "SHEET")) @@ -219,15 +242,15 @@ parse_spreadsheet (struct lexer *lexer) if ( ! lex_force_string (lexer) ) goto error; - sri->sheet_name = ss_xstrdup (lex_tokss (lexer)); - sri->sheet_index = -1; + opts->sheet_name = ss_xstrdup (lex_tokss (lexer)); + opts->sheet_index = -1; lex_get (lexer); } else if (lex_match_id (lexer, "INDEX")) { - sri->sheet_index = lex_integer (lexer); - if (sri->sheet_index <= 0) + opts->sheet_index = lex_integer (lexer); + if (opts->sheet_index <= 0) { msg (SE, _("The sheet index must be greater than or equal to 1")); goto error; @@ -247,14 +270,14 @@ parse_spreadsheet (struct lexer *lexer) if (lex_match_id (lexer, "FULL")) { - sri->cell_range = NULL; + opts->cell_range = NULL; } else if (lex_match_id (lexer, "RANGE")) { if ( ! lex_force_string (lexer) ) goto error; - sri->cell_range = ss_xstrdup (lex_tokss (lexer)); + opts->cell_range = ss_xstrdup (lex_tokss (lexer)); lex_get (lexer); } else @@ -270,11 +293,11 @@ parse_spreadsheet (struct lexer *lexer) if ( lex_match_id (lexer, "ON")) { - sri->read_names = true; + opts->read_names = true; } else if (lex_match_id (lexer, "OFF")) { - sri->read_names = false; + opts->read_names = false; } else { @@ -290,11 +313,10 @@ parse_spreadsheet (struct lexer *lexer) } } - return sri; + return true; error: - destroy_spreadsheet_read_info (sri); - return NULL; + return false; } @@ -657,13 +679,7 @@ parse_get_txt (struct lexer *lexer, struct dataset *ds) static void -destroy_spreadsheet_read_info (struct spreadsheet_read_info *sri) +destroy_spreadsheet_read_info (struct spreadsheet_read_options *opts) { - if ( NULL == sri) - return; - - free (sri->sheet_name); - free (sri->cell_range); - free (sri->file_name); - free (sri); + free (opts->cell_range); } -- 2.30.2