1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2007, 2009, 2010, 2011, 2012, 2013, 2016,
3 2020 Free Software Foundation, Inc.
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation, either version 3 of the License, or
8 (at your option) any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program. If not, see <http://www.gnu.org/licenses/>. */
20 #include "data/gnumeric-reader.h"
21 #include "spreadsheet-reader.h"
26 #include <libxml/xmlreader.h>
29 #include "data/case.h"
30 #include "data/casereader-provider.h"
31 #include "data/data-in.h"
32 #include "data/dictionary.h"
33 #include "data/format.h"
34 #include "data/identifier.h"
35 #include "data/value.h"
36 #include "data/variable.h"
37 #include "libpspp/i18n.h"
38 #include "libpspp/message.h"
39 #include "libpspp/misc.h"
40 #include "libpspp/hmap.h"
41 #include "libpspp/hash-functions.h"
43 #include "libpspp/str.h"
45 #include "gl/c-strtod.h"
46 #include "gl/minmax.h"
47 #include "gl/xalloc.h"
50 #define _(msgid) gettext (msgid)
51 #define N_(msgid) (msgid)
53 /* Setting this to false can help with debugging and development.
54 Don't forget to set it back to true, or users will complain that
55 all but the smallest spreadsheets display VERY slowly. */
56 static const bool use_cache = true;
58 /* Shamelessly lifted from the Gnumeric sources:
59 https://git.gnome.org/browse/gnumeric/tree/src/value.h
65 VALUE_INTEGER = 30, /* Note, this was removed from gnumeric in 2006 - old versions may of
66 course still be around. New ones are supposed to use float.*/
/* Forward declarations of the two casereader callbacks defined later in
   this file, followed by the casereader class table that refers to them. */
75 static void gnm_file_casereader_destroy (struct casereader *, void *);
77 static struct ccase *gnm_file_casereader_read (struct casereader *, void *);
80 static const struct casereader_class gnm_file_casereader_class =
/* The read callback is listed first, then the destroy callback. */
82 gnm_file_casereader_read,
83 gnm_file_casereader_destroy,
/* States assigned to the state member of struct state_data by
   process_node while it scans a document.  NOTE(review): the enum header
   and some enumerators (STATE_MAXROW and STATE_MAXCOL are used elsewhere
   in this file) are not visible in this listing. */
90 STATE_PRE_INIT = 0, /* Initial state */
91 STATE_SHEET_COUNT, /* Found the sheet index */
92 STATE_INIT , /* Other Initial state */
93 STATE_SHEET_START, /* Found the start of a sheet */
94 STATE_SHEET_NAME, /* Found the sheet name */
97 STATE_SHEET_FOUND, /* Found the sheet that we actually want */
98 STATE_CELLS_START, /* Found the start of the cell array */
99 STATE_CELL /* Found a cell */
/* Members of the per-reader parsing state (accessed through
   struct state_data pointers elsewhere in this file).  Other members used
   by the code, such as row, col, current_sheet, min_col, node_type and gz,
   are not visible in this listing. */
106 /* The libxml reader for this instance */
107 xmlTextReaderPtr xtr;
109 /* An internal state variable */
110 enum reader_state state;
/* Releases the libxml text reader owned by SD. */
123 state_data_destroy (struct state_data *sd)
125 xmlFreeTextReader (sd->xtr);
/* A gnumeric spreadsheet together with the state needed to parse it. */
129 struct gnumeric_reader
/* Generic spreadsheet part.  Functions in this file cast between
   struct spreadsheet * and struct gnumeric_reader *. */
131 struct spreadsheet spreadsheet;
/* rsd is the parsing state used while reading cases; msd is the parsing
   state used by the sheet name, count and extent queries. */
133 struct state_data rsd;
134 struct state_data msd;
/* Which sheet process_node should select: by name when the name is not
   NULL, otherwise by index. */
136 const xmlChar *target_sheet_name;
137 int target_sheet_index;
/* Value type read from the ValueType attribute of the most recent cell
   element. */
139 enum gnm_value_type vtype;
141 /* The total number of sheets in the "workbook" */
147 /* A value to be kept in the hash table for cache purposes. */
/* Hash table linkage.  Entries are hashed on the row and then the column
   by gnumeric_get_sheet_cell.  The row, col and value member declarations
   themselves are not visible in this listing. */
150 struct hmap_node node;
152 /* The cell's row. */
155 /* The cell's column. */
158 /* The value of the cell. */
/* Destroy callback for the spreadsheet S (installed as s->destroy by
   gnumeric_reopen).  Releases the sheet names, the dictionary reference,
   the sheet array, the metadata reader state and the cell cache. */
163 gnumeric_destroy (struct spreadsheet *s)
165 struct gnumeric_reader *r = (struct gnumeric_reader *) s;
/* Sheet names are obtained from xmlTextReaderValue in process_node, so
   they are released with xmlFree. */
169 for (i = 0; i < r->n_sheets; ++i)
171 xmlFree (r->spreadsheet.sheets[i].name);
175 dict_unref (s->dict);
177 free (r->spreadsheet.sheets);
178 state_data_destroy (&r->msd);
/* The cache is walked with the SAFE iterator, which keeps a pointer to
   the next entry.  NOTE(review): the loop body is not visible in this
   listing. */
182 struct cache_datum *cell;
183 struct cache_datum *next;
184 HMAP_FOR_EACH_SAFE (cell, next, struct cache_datum, node, &r->cache)
190 hmap_destroy (&r->cache);
/* Returns the name recorded for sheet number N of S.  The pointer that is
   returned is the stored string itself, not a copy.  N is checked against
   the sheet count only by an assert. */
197 gnumeric_get_sheet_name (struct spreadsheet *s, int n)
199 struct gnumeric_reader *gr = (struct gnumeric_reader *) s;
200 assert (n < gr->n_sheets);
202 return gr->spreadsheet.sheets[n].name;
/* Forward declaration; process_node is defined later in this file. */
206 static void process_node (struct gnumeric_reader *r, struct state_data *sd);
/* Runs the metadata reader (msd) of S over every remaining node, passing
   each one to process_node.  NOTE(review): the return statement is not
   visible in this listing. */
210 gnumeric_get_sheet_n_sheets (struct spreadsheet *s)
212 struct gnumeric_reader *gr = (struct gnumeric_reader *) s;
/* xmlTextReaderRead yields 1 for as long as another node was read. */
215 while (1 == (ret = xmlTextReaderRead (gr->msd.xtr)))
217 process_node (gr, &gr->msd);
/* Returns the cell range of sheet N of S, built by create_cell_range from
   the first and last column and row recorded for that sheet. */
225 gnumeric_get_sheet_range (struct spreadsheet *s, int n)
228 struct gnumeric_reader *gr = (struct gnumeric_reader *) s;
/* last_col stays at -1 until process_node has seen the end of the cell
   array of the sheet, so keep reading with the metadata reader until it
   has been filled in or no further node can be read. */
230 while ((gr->spreadsheet.sheets[n].last_col == -1)
232 (1 == (ret = xmlTextReaderRead (gr->msd.xtr))))
234 process_node (gr, &gr->msd);
/* NOTE(review): this bounds check runs only after sheets[n] has already
   been indexed by the loop condition above. */
237 assert (n < gr->n_sheets);
238 return create_cell_range (
239 gr->spreadsheet.sheets[n].first_col,
240 gr->spreadsheet.sheets[n].first_row,
241 gr->spreadsheet.sheets[n].last_col,
242 gr->spreadsheet.sheets[n].last_row);
/* Returns the number of rows of sheet N of S, computed as the recorded
   last row plus one. */
247 gnumeric_get_sheet_n_rows (struct spreadsheet *s, int n)
249 struct gnumeric_reader *gr = (struct gnumeric_reader *) s;
/* Keep reading with the metadata reader until the extent of the sheet has
   been recorded (last_col is no longer -1) or no further node can be
   read. */
251 while ((gr->spreadsheet.sheets[n].last_col == -1)
253 (1 == xmlTextReaderRead (gr->msd.xtr)))
255 process_node (gr, &gr->msd);
/* NOTE(review): this bounds check runs only after sheets[n] has already
   been indexed by the loop condition above. */
258 assert (n < gr->n_sheets);
259 return gr->spreadsheet.sheets[n].last_row + 1;
/* Returns the number of columns of sheet N of S, computed as the recorded
   last column plus one. */
263 gnumeric_get_sheet_n_columns (struct spreadsheet *s, int n)
265 struct gnumeric_reader *gr = (struct gnumeric_reader *) s;
/* Keep reading with the metadata reader until the extent of the sheet has
   been recorded (last_col is no longer -1) or no further node can be
   read. */
267 while ((gr->spreadsheet.sheets[n].last_col == -1)
269 (1 == xmlTextReaderRead (gr->msd.xtr)))
271 process_node (gr, &gr->msd);
/* NOTE(review): this bounds check runs only after sheets[n] has already
   been indexed by the loop condition above. */
274 assert (n < gr->n_sheets);
275 return gr->spreadsheet.sheets[n].last_col + 1;
/* Forward declaration; gnumeric_reopen is defined near the end of this
   file. */
278 static struct gnumeric_reader *
279 gnumeric_reopen (struct gnumeric_reader *r, const char *filename, bool show_errors);
/* Fetches the text of the cell at ROW, COLUMN on sheet N of S.  A value
   found in the cache is returned as a strdup copy.  Otherwise the file
   named by s->file_name is opened again and scanned with a parsing state
   local to this call, and cell text met on the way is added to the cache.
   NOTE(review): the braces and several conditions of this function are
   not visible in this listing, so the exact nesting of the steps below
   should be confirmed against the full source. */
283 gnumeric_get_sheet_cell (struct spreadsheet *s, int n, int row, int column)
285 struct gnumeric_reader *gr = (struct gnumeric_reader *) s;
287 /* See if this cell is in the cache. If it is, then use it. */
/* The cache hash is built from the row first and then the column. */
290 struct cache_datum *lookup = NULL;
291 unsigned int hash = hash_int (row, 0);
292 hash = hash_int (column, hash);
294 HMAP_FOR_EACH_WITH_HASH (lookup, struct cache_datum, node, hash,
297 if (lookup->row == row && lookup->col == column)
/* A cache hit is handed back as a newly allocated copy. */
304 return strdup (lookup->value);
/* No cached value was returned: scan the file again using a reader that
   is local to this call (neither msd nor rsd is used below). */
308 struct state_data sd;
310 sd.state = STATE_PRE_INIT;
311 sd.current_sheet = -1;
/* The file is read through zlib; gzread and gzclose are handed to libxml
   as its read and close callbacks. */
315 sd.gz = gzopen (s->file_name, "r");
317 sd.xtr = xmlReaderForIO ((xmlInputReadCallback) gzread,
318 (xmlInputCloseCallback) gzclose,
324 gr->target_sheet_name = NULL;
/* Position of the most recent cell seen on sheet N; -1 means none yet. */
326 int current_row = -1;
327 int current_col = -1;
329 /* Spool to the target cell, caching values of cells as they are encountered. */
330 for (int ret = 1; ret; )
332 while ((ret = xmlTextReaderRead (sd.xtr)))
334 process_node (gr, &sd);
335 if (sd.state == STATE_CELL)
337 if (sd.current_sheet == n)
339 current_row = sd.row;
340 current_col = sd.col;
/* This test compares the row against ROW and the column against
   COLUMN - 1. */
345 if (current_row >= row && current_col >= column - 1)
/* Read on until the next text node is reached. */
348 while ((ret = xmlTextReaderRead (sd.xtr)))
350 process_node (gr, &sd);
351 if (sd.node_type == XML_READER_TYPE_TEXT)
357 /* See if this cell has already been cached ... */
358 unsigned int hash = hash_int (current_row, 0);
359 hash = hash_int (current_col, hash);
360 struct cache_datum *probe = NULL;
361 HMAP_FOR_EACH_WITH_HASH (probe, struct cache_datum, node, hash,
364 if (probe->row == current_row && probe->col == current_col)
367 /* If not, then cache it. */
/* The string returned by xmlTextReaderValue is stored in the new cache
   entry as it is, without making a further copy. */
370 char *str = CHAR_CAST (char *, xmlTextReaderValue (sd.xtr));
371 struct cache_datum *cell_data = XMALLOC (struct cache_datum);
372 cell_data->row = current_row;
373 cell_data->col = current_col;
374 cell_data->value = str;
375 hmap_insert (&gr->cache, &cell_data->node, hash);
/* Continue reading until a text node inside a cell of sheet N is found
   whose position is exactly ROW, COLUMN. */
380 while (xmlTextReaderRead (sd.xtr))
382 process_node (gr, &sd);
383 if (sd.state == STATE_CELL && sd.node_type == XML_READER_TYPE_TEXT)
385 if (sd.current_sheet == n)
387 if (row == sd.row && column == sd.col)
/* The cell text is fetched before the local reader is released. */
393 char *cell_content = CHAR_CAST (char *, xmlTextReaderValue (sd.xtr));
394 xmlFreeTextReader (sd.xtr);
/* Destroy callback listed in gnm_file_casereader_class; it is also called
   directly by gnumeric_make_reader.  R_ is the struct gnumeric_reader.
   READER is not used. */
400 gnm_file_casereader_destroy (struct casereader *reader UNUSED, void *r_)
402 struct gnumeric_reader *r = r_;
407 state_data_destroy (&r->rsd);
/* The cached first case is released here only if it was never handed out
   by gnm_file_casereader_read, which sets used_first_case when it
   returns that case. */
409 if (r->spreadsheet.first_case && ! r->spreadsheet.used_first_case)
410 case_unref (r->spreadsheet.first_case);
412 if (r->spreadsheet.proto)
413 caseproto_unref (r->spreadsheet.proto);
/* Drops one reference to the spreadsheet; gnumeric_reopen takes one with
   spreadsheet_ref. */
415 spreadsheet_unref (&r->spreadsheet);
/* Advances the state machine in SD according to the node on which the XML
   reader of SD is currently positioned, recording sheet names and cell
   extents in R as they are met.
   NOTE(review): the switch statement, several case labels and all braces
   are not visible in this listing; the comments below describe only the
   lines that are shown. */
420 process_node (struct gnumeric_reader *r, struct state_data *sd)
422 xmlChar *name = xmlTextReaderName (sd->xtr);
/* Fallback node name; the test guarding this assignment is not visible. */
424 name = xmlStrdup (_xml ("--"));
426 sd->node_type = xmlTextReaderNodeType (sd->xtr);
/* Wait for the opening gnm:SheetNameIndex element, then start counting
   sheets. */
431 sd->current_sheet = -1;
432 if (0 == xmlStrcasecmp (name, _xml("gnm:SheetNameIndex")) &&
433 XML_READER_TYPE_ELEMENT == sd->node_type)
435 sd->state = STATE_SHEET_COUNT;
/* Inside the sheet index: gnm:SheetName elements announce the sheets. */
439 case STATE_SHEET_COUNT:
440 if (0 == xmlStrcasecmp (name, _xml("gnm:SheetName")) &&
441 XML_READER_TYPE_ELEMENT == sd->node_type)
/* Grow the sheet array when the current sheet lies beyond the sheets
   recorded so far.  The extents of the new entry start out as -1. */
444 if (sd->current_sheet + 1 > r->n_sheets)
446 struct sheet_detail *detail ;
447 r->spreadsheet.sheets = xrealloc (r->spreadsheet.sheets, (sd->current_sheet + 1) * sizeof *r->spreadsheet.sheets);
448 detail = &r->spreadsheet.sheets[sd->current_sheet];
449 detail->first_col = detail->last_col = detail->first_row = detail->last_row = -1;
451 r->n_sheets = sd->current_sheet + 1;
/* The end of the sheet index resets the sheet counter. */
454 else if (0 == xmlStrcasecmp (name, _xml("gnm:SheetNameIndex")) &&
455 XML_READER_TYPE_END_ELEMENT == sd->node_type)
457 sd->state = STATE_INIT;
458 sd->current_sheet = -1;
/* A text node here is stored as the name of the most recently added
   sheet, unless that sheet already has a name. */
460 else if (XML_READER_TYPE_TEXT == sd->node_type)
462 if (r->spreadsheet.sheets [r->n_sheets - 1].name == NULL)
463 r->spreadsheet.sheets [r->n_sheets - 1].name =
464 CHAR_CAST (char *, xmlTextReaderValue (sd->xtr));
/* An opening gnm:Sheet element marks the start of a sheet. */
469 if (0 == xmlStrcasecmp (name, _xml("gnm:Sheet")) &&
470 XML_READER_TYPE_ELEMENT == sd->node_type)
473 sd->state = STATE_SHEET_START;
476 case STATE_SHEET_START:
477 if (0 == xmlStrcasecmp (name, _xml("gnm:Name")) &&
478 XML_READER_TYPE_ELEMENT == sd->node_type)
480 sd->state = STATE_SHEET_NAME;
483 case STATE_SHEET_NAME:
484 if (0 == xmlStrcasecmp (name, _xml("gnm:Name")) &&
485 XML_READER_TYPE_END_ELEMENT == sd->node_type)
487 sd->state = STATE_INIT;
489 else if (0 == xmlStrcasecmp (name, _xml("gnm:Sheet")) &&
490 XML_READER_TYPE_END_ELEMENT == sd->node_type)
492 sd->state = STATE_INIT;
/* Sheet selection on the text of the sheet name.  A target name, when
   set, is compared with xmlStrcmp.  Otherwise the target index is
   compared with current_sheet + 1, and a target index of -1 selects the
   sheet as well. */
494 else if (XML_READER_TYPE_TEXT == sd->node_type)
496 if (r->target_sheet_name != NULL)
498 xmlChar *value = xmlTextReaderValue (sd->xtr);
499 if (0 == xmlStrcmp (value, r->target_sheet_name))
500 sd->state = STATE_SHEET_FOUND;
503 else if (r->target_sheet_index == sd->current_sheet + 1)
505 sd->state = STATE_SHEET_FOUND;
507 else if (r->target_sheet_index == -1)
509 sd->state = STATE_SHEET_FOUND;
513 case STATE_SHEET_FOUND:
514 if (0 == xmlStrcasecmp (name, _xml("gnm:Cells")) &&
515 XML_READER_TYPE_ELEMENT == sd->node_type)
/* min_col is reset at the start of each cell array.  The cell state is
   entered only when the gnm:Cells element is not an empty element. */
517 sd->min_col = INT_MAX;
518 if (! xmlTextReaderIsEmptyElement (sd->xtr))
519 sd->state = STATE_CELLS_START;
521 else if (0 == xmlStrcasecmp (name, _xml("gnm:MaxRow")) &&
522 XML_READER_TYPE_ELEMENT == sd->node_type)
524 sd->state = STATE_MAXROW;
526 else if (0 == xmlStrcasecmp (name, _xml("gnm:MaxCol")) &&
527 XML_READER_TYPE_ELEMENT == sd->node_type)
529 sd->state = STATE_MAXCOL;
531 else if (0 == xmlStrcasecmp (name, _xml("gnm:Sheet")) &&
532 XML_READER_TYPE_END_ELEMENT == sd->node_type)
534 sd->state = STATE_INIT;
/* Closing gnm:MaxRow returns to STATE_SHEET_FOUND.  A text node has its
   value fetched; what is done with it is not visible in this listing. */
538 if (0 == xmlStrcasecmp (name, _xml("gnm:MaxRow")) &&
539 XML_READER_TYPE_END_ELEMENT == sd->node_type)
541 sd->state = STATE_SHEET_FOUND;
543 else if (sd->node_type == XML_READER_TYPE_TEXT)
545 xmlChar *value = xmlTextReaderValue (sd->xtr);
/* The same handling for gnm:MaxCol. */
550 if (0 == xmlStrcasecmp (name, _xml("gnm:MaxCol")) &&
551 XML_READER_TYPE_END_ELEMENT == sd->node_type)
553 sd->state = STATE_SHEET_FOUND;
555 else if (sd->node_type == XML_READER_TYPE_TEXT)
557 xmlChar *value = xmlTextReaderValue (sd->xtr);
561 case STATE_CELLS_START:
562 if (0 == xmlStrcasecmp (name, _xml ("gnm:Cell")) &&
563 XML_READER_TYPE_ELEMENT == sd->node_type)
/* The position of the cell comes from its Col and Row attributes;
   min_col tracks the smallest column seen in this cell array. */
565 xmlChar *attr = xmlTextReaderGetAttribute (sd->xtr, _xml ("Col"));
566 sd->col = _xmlchar_to_int (attr);
569 if (sd->col < sd->min_col)
570 sd->min_col = sd->col;
572 attr = xmlTextReaderGetAttribute (sd->xtr, _xml ("Row"));
573 sd->row = _xmlchar_to_int (attr);
/* first_row and first_col are taken from the first cell met on the
   sheet, while they still hold -1. */
576 if (r->spreadsheet.sheets[sd->current_sheet].first_row == -1)
578 r->spreadsheet.sheets[sd->current_sheet].first_row = sd->row;
581 if (r->spreadsheet.sheets[sd->current_sheet].first_col == -1)
583 r->spreadsheet.sheets[sd->current_sheet].first_col = sd->col;
585 if (! xmlTextReaderIsEmptyElement (sd->xtr))
586 sd->state = STATE_CELL;
/* At the end of the cell array the position of the most recent cell is
   recorded as the last column and row of the sheet. */
588 else if ((0 == xmlStrcasecmp (name, _xml("gnm:Cells")))
589 && (XML_READER_TYPE_END_ELEMENT == sd->node_type))
591 r->spreadsheet.sheets[sd->current_sheet].last_col = sd->col;
592 r->spreadsheet.sheets[sd->current_sheet].last_row = sd->row;
593 sd->state = STATE_SHEET_NAME;
/* The closing gnm:Cell element returns to STATE_CELLS_START. */
597 if (0 == xmlStrcasecmp (name, _xml("gnm:Cell"))
598 && XML_READER_TYPE_END_ELEMENT == sd->node_type)
600 sd->state = STATE_CELLS_START;
612 Sets the VAR of case C, to the value corresponding to the xml string XV
615 convert_xml_string_to_value (struct ccase *c, const struct variable *var,
616 const xmlChar *xv, enum gnm_value_type type, int col, int row)
/* Writes the value for VAR into case C from the XML string XV of gnumeric
   value type TYPE.  COL and ROW are used only to name the cell in the
   warning message.  The visible branches are: set the value to missing
   (the condition is not visible in this listing); copy XV padded with
   spaces for a string variable; parse with c_strtod for VALUE_FLOAT and
   VALUE_INTEGER; otherwise convert with data_in using the write format of
   the variable. */
618 union value *v = case_data_rw (c, var);
621 value_set_missing (v, var_get_width (var));
622 else if (var_is_alpha (var))
623 value_copy_str_rpad (v, var_get_width (var), xv, ' ');
624 else if (type == VALUE_FLOAT || type == VALUE_INTEGER)
626 const char *text = CHAR_CAST (const char *, xv);
/* NOTE(review): the errno test below is meaningful only if errno is
   cleared before c_strtod is called; that reset is not visible in this
   listing - confirm. */
630 v->f = c_strtod (text, &endptr);
631 if (errno != 0 || endptr == text)
636 const char *text = CHAR_CAST (const char *, xv);
638 const struct fmt_spec *fmt = var_get_write_format (var);
/* data_in is given the text as UTF-8.  The string M that it returns is
   reported in the warning below, together with the cell reference and the
   format name. */
640 char *m = data_in (ss_cstr (text), "UTF-8", fmt->type,
641 settings_get_fmt_settings (), v, var_get_width (var),
646 char buf [FMT_STRING_LEN_MAX + 1];
647 char *cell = create_cell_ref (col, row);
649 msg (MW, _("Cannot convert the value in the spreadsheet cell %s to format (%s): %s"),
650 cell, fmt_to_string (fmt, buf), m);
661 xmlChar *first_value;
/* libxml error callback, registered by gnumeric_reopen with the reader
   structure as its context.  CTX is therefore the struct gnumeric_reader.
   Emits a warning that includes the file name and the line number taken
   from LOC.  SEV is not used. */
667 gnumeric_error_handler (void *ctx, const char *mesg,
668 xmlParserSeverities sev UNUSED,
669 xmlTextReaderLocatorPtr loc)
671 struct gnumeric_reader *r = ctx;
673 msg (MW, _("There was a problem whilst reading the %s file `%s' (near line %d): `%s'"),
675 r->spreadsheet.file_name,
676 xmlTextReaderLocatorLineNumber (loc),
/* make_reader callback (installed as s->make_reader by gnumeric_reopen).
   Using the sheet and cell range selected by OPTS, it collects one
   var_spec per column from the first rows, creates the dictionary of
   SPREADSHEET and a cached first case, and returns a casereader built
   with gnm_file_casereader_class.
   NOTE(review): braces, several declarations and some conditions of this
   function are not visible in this listing. */
680 static struct casereader *
681 gnumeric_make_reader (struct spreadsheet *spreadsheet,
682 const struct spreadsheet_read_options *opts)
686 struct gnumeric_reader *r = NULL;
687 unsigned long int vstart = 0;
689 casenumber n_cases = CASENUMBER_MAX;
691 struct var_spec *var_spec = NULL;
/* The file is opened again for reading cases; the filename argument is
   NULL because R already exists. */
694 r = (struct gnumeric_reader *) (spreadsheet);
696 r = gnumeric_reopen (r, NULL, true);
/* Either take the bounds from the requested cell range, or fall back to
   the defaults below.  The later column and row tests treat -1 as meaning
   that no bound was given. */
698 if (opts->cell_range)
700 if (! convert_cell_ref (opts->cell_range,
701 &r->spreadsheet.start_col, &r->spreadsheet.start_row,
702 &r->spreadsheet.stop_col, &r->spreadsheet.stop_row))
704 msg (SE, _("Invalid cell range `%s'"),
711 r->spreadsheet.start_col = -1;
712 r->spreadsheet.start_row = 0;
713 r->spreadsheet.stop_col = -1;
714 r->spreadsheet.stop_row = -1;
/* Tell process_node which sheet to select and reset the case-reading
   state. */
717 r->target_sheet_name = BAD_CAST opts->sheet_name;
718 r->target_sheet_index = opts->sheet_index;
719 r->rsd.row = r->rsd.col = -1;
720 r->rsd.current_sheet = -1;
721 r->spreadsheet.first_case = NULL;
722 r->spreadsheet.proto = NULL;
724 /* Advance to the start of the cells for the target sheet */
725 while ((r->rsd.state != STATE_CELL || r->rsd.row < r->spreadsheet.start_row)
726 && 1 == (ret = xmlTextReaderRead (r->rsd.xtr)))
729 process_node (r, &r->rsd);
730 value = xmlTextReaderValue (r->rsd.xtr);
/* The text of the MaxRow element gives the largest row index, so the
   number of cases is that value plus one. */
732 if (r->rsd.state == STATE_MAXROW && r->rsd.node_type == XML_READER_TYPE_TEXT)
734 n_cases = 1 + _xmlchar_to_int (value) ;
739 /* If a range has been given, then use that to calculate the number
741 if (opts->cell_range)
743 n_cases = MIN (n_cases, r->spreadsheet.stop_row - r->spreadsheet.start_row + 1);
746 if (opts->read_names)
748 r->spreadsheet.start_row++;
753 /* Read in the first row of cells,
754 including the headers if read_names was set */
756 ((r->rsd.state == STATE_CELLS_START && r->rsd.row <= r->spreadsheet.start_row) || r->rsd.state == STATE_CELL)
757 && (ret = xmlTextReaderRead (r->rsd.xtr))
/* For a text node inside a cell the value type is read from the ValueType
   attribute. */
762 if (r->rsd.state == STATE_CELL && r->rsd.node_type == XML_READER_TYPE_TEXT)
765 xmlTextReaderGetAttribute (r->rsd.xtr, _xml ("ValueType"));
767 type = _xmlchar_to_int (attr);
772 process_node (r, &r->rsd);
774 if (r->rsd.row > r->spreadsheet.start_row)
777 xmlTextReaderGetAttribute (r->rsd.xtr, _xml ("ValueType"));
779 r->vtype = _xmlchar_to_int (attr);
/* Test for cells that lie outside the requested column range, then map
   the column to an index into var_spec. */
785 if (r->rsd.col < r->spreadsheet.start_col ||
786 (r->spreadsheet.stop_col != -1 && r->rsd.col > r->spreadsheet.stop_col))
789 idx = r->rsd.col - r->spreadsheet.start_col;
/* Grow var_spec so that it covers idx, marking every new entry as not
   yet filled in. */
791 if (idx >= n_var_specs)
794 var_spec = xrealloc (var_spec, sizeof (*var_spec) * (idx + 1));
795 for (i = n_var_specs; i <= idx; ++i)
797 var_spec [i].name = NULL;
798 var_spec [i].width = -1;
799 var_spec [i].first_value = NULL;
800 var_spec [i].first_type = -1;
802 n_var_specs = idx + 1 ;
805 var_spec [idx].first_type = type;
/* Cell text from a row before start_row supplies the variable name when
   read_names is set.  The remaining assignments record the first value
   and, if the width is still unset, derive it from the text length or
   from opts->asw. */
807 if (r->rsd.node_type == XML_READER_TYPE_TEXT)
809 xmlChar *value = xmlTextReaderValue (r->rsd.xtr);
810 const char *text = CHAR_CAST (const char *, value);
812 if (r->rsd.row < r->spreadsheet.start_row)
814 if (opts->read_names)
816 var_spec [idx].name = xstrdup (text);
821 var_spec [idx].first_value = xmlStrdup (value);
823 if (-1 == var_spec [idx].width)
824 var_spec [idx].width = (opts->asw == -1) ?
825 ROUND_UP (strlen(text), SPREADSHEET_DEFAULT_WIDTH) : opts->asw;
/* A cell element on the first data row whose ValueType is absent or is
   not VALUE_STRING gets width 0. */
830 else if (r->rsd.node_type == XML_READER_TYPE_ELEMENT
831 && r->rsd.state == STATE_CELL)
833 if (r->rsd.row == r->spreadsheet.start_row)
836 xmlTextReaderGetAttribute (r->rsd.xtr, _xml ("ValueType"));
838 if (NULL == attr || VALUE_STRING != _xmlchar_to_int (attr))
839 var_spec [idx].width = 0;
/* The dictionary is created with the encoding reported by the reader. */
847 const xmlChar *enc = xmlTextReaderConstEncoding (r->rsd.xtr);
850 /* Create the dictionary and populate it */
851 spreadsheet->dict = dict_create (CHAR_CAST (const char *, enc));
/* One pass over the var_spec entries to create the variables.  What
   happens to entries that have neither a name nor a first value is not
   visible in this listing. */
854 for (i = 0 ; i < n_var_specs ; ++i)
858 if ((var_spec[i].name == NULL) && (var_spec[i].first_value == NULL))
861 /* Probably no data exists for this variable, so allocate a
863 if (var_spec[i].width == -1)
864 var_spec[i].width = SPREADSHEET_DEFAULT_WIDTH;
866 name = dict_make_unique_var_name (r->spreadsheet.dict, var_spec[i].name, &vstart);
867 dict_create_var (r->spreadsheet.dict, name, var_spec[i].width);
871 /* Create the first case, and cache it */
872 r->spreadsheet.used_first_case = false;
874 if (n_var_specs == 0)
876 msg (MW, _("Selected sheet or range of spreadsheet `%s' is empty."),
877 spreadsheet->file_name);
881 r->spreadsheet.proto = caseproto_ref (dict_get_proto (r->spreadsheet.dict));
882 r->spreadsheet.first_case = case_create (r->spreadsheet.proto);
883 case_set_missing (r->spreadsheet.first_case);
/* Fill the first case from the recorded first values; x counts the
   dictionary variables separately from the var_spec index. */
886 for (i = 0 ; i < n_var_specs ; ++i)
888 const struct variable *var;
890 if ((var_spec[i].name == NULL) && (var_spec[i].first_value == NULL))
893 var = dict_get_var (r->spreadsheet.dict, x++);
895 convert_xml_string_to_value (r->spreadsheet.first_case, var,
896 var_spec[i].first_value,
897 var_spec[i].first_type,
/* NOTE(review): first_value is allocated with xmlStrdup above but is
   released here with free rather than xmlFree. */
902 for (i = 0 ; i < n_var_specs ; ++i)
904 free (var_spec[i].first_value);
905 free (var_spec[i].name);
911 return casereader_create_sequential
913 r->spreadsheet.proto,
915 &gnm_file_casereader_class, r);
/* The var_spec contents are released again here and the reader state is
   torn down through the destroy callback. */
919 for (i = 0 ; i < n_var_specs ; ++i)
921 free (var_spec[i].first_value);
922 free (var_spec[i].name);
927 gnm_file_casereader_destroy (NULL, r);
933 /* Reads and returns one case from READER's file. Returns a null
934 pointer on failure. */
935 static struct ccase *
936 gnm_file_casereader_read (struct casereader *reader UNUSED, void *r_)
941 struct gnumeric_reader *r = r_;
/* Remember the row at which the reader stands on entry; the loop below
   keeps reading only while the row is unchanged. */
942 int current_row = r->rsd.row;
/* The first call hands out the case that gnumeric_make_reader cached. */
944 if (!r->spreadsheet.used_first_case)
946 r->spreadsheet.used_first_case = true;
947 return r->spreadsheet.first_case;
950 c = case_create (r->spreadsheet.proto);
951 case_set_missing (c);
/* Without an explicit start column, use the smallest column that
   process_node has recorded. */
953 if (r->spreadsheet.start_col == -1)
954 r->spreadsheet.start_col = r->rsd.min_col;
957 while ((r->rsd.state == STATE_CELL || r->rsd.state == STATE_CELLS_START)
958 && r->rsd.row == current_row && (ret = xmlTextReaderRead (r->rsd.xtr)))
960 process_node (r, &r->rsd);
/* A cell element supplies the value type for the text that follows. */
962 if (r->rsd.state == STATE_CELL && r->rsd.node_type == XML_READER_TYPE_ELEMENT)
965 xmlTextReaderGetAttribute (r->rsd.xtr, _xml ("ValueType"));
967 r->vtype = _xmlchar_to_int (attr);
/* Range tests: column outside the requested range, column beyond the
   widths of the case prototype, and row beyond stop_row.  The action
   taken for each test is not visible in this listing. */
972 if (r->rsd.col < r->spreadsheet.start_col || (r->spreadsheet.stop_col != -1 &&
973 r->rsd.col > r->spreadsheet.stop_col))
976 if (r->rsd.col - r->spreadsheet.start_col >= caseproto_get_n_widths (r->spreadsheet.proto))
979 if (r->spreadsheet.stop_row != -1 && r->rsd.row > r->spreadsheet.stop_row)
/* Cell text is converted into the variable whose dictionary index is the
   column offset from start_col. */
983 if (r->rsd.node_type == XML_READER_TYPE_TEXT)
985 xmlChar *value = xmlTextReaderValue (r->rsd.xtr);
986 const int idx = r->rsd.col - r->spreadsheet.start_col;
987 const struct variable *var = dict_get_var (r->spreadsheet.dict, idx);
989 convert_xml_string_to_value (c, var, value, r->vtype,
990 r->rsd.col, r->rsd.row);
/* Opens an XML reader on a gnumeric file.  At most one of R and FILENAME
   may be non-NULL (asserted below).  The visible lines open either
   FILENAME or the file name already stored in R, allocate and set up a
   new reader structure, take a reference on the spreadsheet with
   spreadsheet_ref, and advance the reader until process_node reaches
   STATE_INIT.  SHOW_ERRORS selects whether libxml parse errors and
   warnings are suppressed.
   NOTE(review): the conditions that choose between these paths, and the
   return statements, are not visible in this listing. */
1005 static struct gnumeric_reader *
1006 gnumeric_reopen (struct gnumeric_reader *r, const char *filename, bool show_errors)
1009 struct state_data *sd;
1011 xmlTextReaderPtr xtr;
1014 assert (r == NULL || filename == NULL);
1018 gz = gzopen (filename, "r");
1022 gz = gzopen (r->spreadsheet.file_name, "r");
/* A new reader structure starts out zero-filled. */
1030 r = xzalloc (sizeof *r);
/* NOTE(review): this uses plain strdup and its result is not checked in
   the visible lines, whereas xstrdup is used elsewhere in this file. */
1032 r->spreadsheet.file_name = strdup (filename);
/* Install the type tag and the callback table of the generic
   spreadsheet interface. */
1033 struct spreadsheet *s = SPREADSHEET_CAST (r);
1034 strcpy (s->type, "GNM");
1035 s->destroy = gnumeric_destroy;
1036 s->make_reader = gnumeric_make_reader;
1037 s->get_sheet_name = gnumeric_get_sheet_name;
1038 s->get_sheet_range = gnumeric_get_sheet_range;
1039 s->get_sheet_n_sheets = gnumeric_get_sheet_n_sheets;
1040 s->get_sheet_n_rows = gnumeric_get_sheet_n_rows;
1041 s->get_sheet_n_columns = gnumeric_get_sheet_n_columns;
1042 s->get_sheet_cell = gnumeric_get_sheet_cell;
1045 hmap_init (&r->cache);
/* Takes a reference on the spreadsheet. */
1053 r = (struct gnumeric_reader *) spreadsheet_ref (SPREADSHEET_CAST (r));
/* libxml reads through zlib: gzread and gzclose are its read and close
   callbacks and gz is the I/O context. */
1056 xtr = xmlReaderForIO ((xmlInputReadCallback) gzread,
1057 (xmlInputCloseCallback) gzclose, gz,
1059 show_errors ? 0 : (XML_PARSE_NOERROR | XML_PARSE_NOWARNING));
1069 xmlTextReaderSetErrorHandler (xtr, gnumeric_error_handler, r);
/* Reset the parsing state and the sheet selection before scanning. */
1071 sd->row = sd->col = -1;
1072 sd->state = STATE_PRE_INIT;
1076 r->target_sheet_name = NULL;
1077 r->target_sheet_index = -1;
1080 /* Advance to the start of the workbook.
1081 This gives us some confidence that we are actually dealing with a gnumeric
1084 while ((sd->state != STATE_INIT)
1085 && 1 == (ret = xmlTextReaderRead (sd->xtr)))
1087 process_node (r, sd);
1092 /* Does not seem to be a gnumeric file */
1093 spreadsheet_unref (&r->spreadsheet);
/* Warn when the encoding reported by the reader is not UTF-8. */
1099 const xmlChar *enc = xmlTextReaderConstEncoding (sd->xtr);
1100 xmlCharEncoding xce = xmlParseCharEncoding (CHAR_CAST (const char *, enc));
1102 if (XML_CHAR_ENCODING_UTF8 != xce)
1104 /* I have been told that ALL gnumeric files are UTF8 encoded. If that is correct, this
1105 can never happen. */
1106 msg (MW, _("The gnumeric file `%s' is encoded as %s instead of the usual UTF-8 encoding. "
1107 "Any non-ascii characters will be incorrectly imported."),
1108 r->spreadsheet.file_name,
/* Tries to open FILENAME as a gnumeric file by calling gnumeric_reopen
   with no existing reader, and returns the address of the embedded
   spreadsheet member.  REPORT_ERRORS is passed on as the show_errors
   argument. */
1117 struct spreadsheet *
1118 gnumeric_probe (const char *filename, bool report_errors)
1120 struct gnumeric_reader *r = gnumeric_reopen (NULL, filename, report_errors);
/* NOTE(review): if gnumeric_reopen can return NULL, this expression is
   formed from a null pointer; whether that case is handled is not
   visible in this listing - confirm. */
1122 return &r->spreadsheet;