From: Ben Pfaff Date: Thu, 7 Jun 2007 05:27:32 +0000 (+0000) Subject: sparse_cases data structure that augments a sparse_array of cases with X-Git-Tag: v0.6.0~447 X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=1b2982dce3a196a0c9011c6c836d57a9763fcfc3;p=pspp-builds.git sparse_cases data structure that augments a sparse_array of cases with the ability to dump cases to disk if we get too many cases in memory. --- diff --git a/src/data/ChangeLog b/src/data/ChangeLog index 1e6e4d87..554148ff 100644 --- a/src/data/ChangeLog +++ b/src/data/ChangeLog @@ -1,3 +1,15 @@ +2007-06-06 Ben Pfaff + + sparse_cases data structure that augments a sparse_array of cases + with the ability to dump cases to disk if we get too many cases in + memory. + + * automake.mk: Add new files. + + * sparse-cases.c: New file. + + * sparse-cases.h: New file. + 2007-06-06 Ben Pfaff Adds a low-level on-disk case array data structure. diff --git a/src/data/automake.mk b/src/data/automake.mk index 921f812f..1fa2741e 100644 --- a/src/data/automake.mk +++ b/src/data/automake.mk @@ -61,6 +61,8 @@ src_data_libdata_a_SOURCES = \ src/data/scratch-writer.h \ src/data/settings.c \ src/data/settings.h \ + src/data/sparse-cases.c \ + src/data/sparse-cases.h \ src/data/storage-stream.c \ src/data/storage-stream.h \ src/data/sys-file-private.c \ diff --git a/src/data/sparse-cases.c b/src/data/sparse-cases.c new file mode 100644 index 00000000..8a4a263f --- /dev/null +++ b/src/data/sparse-cases.c @@ -0,0 +1,334 @@ +/* PSPP - computes sample statistics. + Copyright (C) 2007 Free Software Foundation, Inc. + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 2 of the + License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + 02110-1301, USA. */ + +#include + +#include + +#include +#include + +#include +#include +#include +#include +#include + +#include "xalloc.h" + +/* A sparse array of cases. */ +struct sparse_cases + { + size_t column_cnt; /* Number of values per case. */ + union value *default_columns; /* Defaults for unwritten cases. */ + casenumber max_memory_cases; /* Max cases before dumping to disk. */ + struct sparse_array *memory; /* Backing, if stored in memory. */ + struct case_tmpfile *disk; /* Backing, if stored on disk. */ + struct range_set *disk_cases; /* Allocated cases, if on disk. */ + }; + +/* Creates and returns a new sparse array of cases with + COLUMN_CNT values per case. */ +struct sparse_cases * +sparse_cases_create (size_t column_cnt) +{ + struct sparse_cases *sc = xmalloc (sizeof *sc); + sc->column_cnt = column_cnt; + sc->default_columns = NULL; + sc->max_memory_cases = get_workspace_cases (column_cnt); + sc->memory = sparse_array_create (sizeof (struct ccase)); + sc->disk = NULL; + sc->disk_cases = NULL; + return sc; +} + +/* Creates and returns a new sparse array of cases that contains + the same data as OLD. */ +struct sparse_cases * +sparse_cases_clone (const struct sparse_cases *old) +{ + struct sparse_cases *new = xmalloc (sizeof *new); + + new->column_cnt = old->column_cnt; + + if (old->default_columns != NULL) + new->default_columns + = xmemdup (old->default_columns, + old->column_cnt * sizeof *old->default_columns); + else + new->default_columns = NULL; + + new->max_memory_cases = old->max_memory_cases; + + if (old->memory != NULL) + { + unsigned long int idx; + struct ccase *c; + + new->memory = sparse_array_create (sizeof (struct ccase)); + for (c = sparse_array_scan (old->memory, NULL, &idx); c != NULL; + c = sparse_array_scan (old->memory, &idx, &idx)) + case_clone (sparse_array_insert (new->memory, idx), c); + } + else + new->memory = NULL; + + if (old->disk != NULL) + { + const struct range_set_node *node; + + new->disk = case_tmpfile_create (old->column_cnt); + new->disk_cases = range_set_create (); + for (node = range_set_first (old->disk_cases); node != NULL; + node = range_set_next (old->disk_cases, node)) + { + unsigned long int start = range_set_node_get_start (node); + unsigned long int end = range_set_node_get_end (node); + unsigned long int idx; + struct ccase c; + + for (idx = start; idx < end; idx++) + if (!case_tmpfile_get_case (old->disk, idx, &c) + || !case_tmpfile_put_case (new->disk, idx, &c)) + { + sparse_cases_destroy (new); + return NULL; + } + } + } + else + { + new->disk = NULL; + new->disk_cases = NULL; + } + + return new; +} + +/* Destroys sparse array of cases SC. */ +void +sparse_cases_destroy (struct sparse_cases *sc) +{ + if (sc != NULL) + { + if (sc->memory != NULL) + { + unsigned long int idx; + struct ccase *c; + for (c = sparse_array_scan (sc->memory, NULL, &idx); c != NULL; + c = sparse_array_scan (sc->memory, &idx, &idx)) + case_destroy (c); + sparse_array_destroy (sc->memory); + } + free (sc->default_columns); + case_tmpfile_destroy (sc->disk); + range_set_destroy (sc->disk_cases); + free (sc); + } +} + +/* Returns the number of `union value's in each case in SC. */ +size_t +sparse_cases_get_value_cnt (const struct sparse_cases *sc) +{ + return sc->column_cnt; +} + +/* Dumps the cases in SC, which must currently be stored in + memory, to disk. Returns true if successful, false on I/O + error. */ +static bool +dump_sparse_cases_to_disk (struct sparse_cases *sc) +{ + unsigned long int idx; + struct ccase *c; + + assert (sc->memory != NULL); + assert (sc->disk == NULL); + + sc->disk = case_tmpfile_create (sc->column_cnt); + sc->disk_cases = range_set_create (); + + for (c = sparse_array_scan (sc->memory, NULL, &idx); c != NULL; + c = sparse_array_scan (sc->memory, &idx, &idx)) + { + if (!case_tmpfile_put_case (sc->disk, idx, c)) + { + case_tmpfile_destroy (sc->disk); + sc->disk = NULL; + range_set_destroy (sc->disk_cases); + sc->disk_cases = NULL; + return false; + } + range_set_insert (sc->disk_cases, idx, 1); + } + sparse_array_destroy (sc->memory); + sc->memory = NULL; + return true; +} + +/* Returns true if any data has ever been written to ROW in SC, + false otherwise. */ +bool +sparse_cases_contains_row (const struct sparse_cases *sc, casenumber row) +{ + return (sc->memory != NULL + ? sparse_array_get (sc->memory, row) != NULL + : range_set_contains (sc->disk_cases, row)); +} + +/* Reads columns COLUMNS...(COLUMNS + VALUE_CNT), exclusive, in + the given ROW in SC, into the VALUE_CNT values in VALUES. + Returns true if successful, false on I/O error. */ +bool +sparse_cases_read (struct sparse_cases *sc, casenumber row, size_t column, + union value values[], size_t value_cnt) +{ + assert (value_cnt <= sc->column_cnt); + assert (column + value_cnt <= sc->column_cnt); + + if (sparse_cases_contains_row (sc, row)) + { + struct ccase c; + if (sc->memory != NULL) + case_clone (&c, sparse_array_get (sc->memory, row)); + else if (!case_tmpfile_get_case (sc->disk, row, &c)) + return false; + case_copy_out (&c, column, values, value_cnt); + case_destroy (&c); + } + else + { + assert (sc->default_columns != NULL); + memcpy (values, sc->default_columns + column, + sizeof *values * value_cnt); + } + + return true; +} + +/* Implements sparse_cases_write for an on-disk sparse_cases. */ +static bool +write_disk_case (struct sparse_cases *sc, casenumber row, size_t column, + const union value values[], size_t value_cnt) +{ + struct ccase c; + bool ok; + + /* Get current case data. */ + if (column == 0 && value_cnt == sc->column_cnt) + case_create (&c, sc->column_cnt); + else if (!case_tmpfile_get_case (sc->disk, row, &c)) + return false; + + /* Copy in new data. */ + case_copy_in (&c, column, values, value_cnt); + + /* Write new case. */ + ok = case_tmpfile_put_case (sc->disk, row, &c); + if (ok) + range_set_insert (sc->disk_cases, row, 1); + + return ok; +} + +/* Writes the VALUE_CNT values in VALUES into columns + COLUMNS...(COLUMNS + VALUE_CNT), exclusive, in the given ROW + in SC. + Returns true if successful, false on I/O error. */ +bool +sparse_cases_write (struct sparse_cases *sc, casenumber row, size_t column, + const union value values[], size_t value_cnt) +{ + if (sc->memory != NULL) + { + struct ccase *c = sparse_array_get (sc->memory, row); + if (c == NULL) + { + if (sparse_array_count (sc->memory) >= sc->max_memory_cases) + { + if (!dump_sparse_cases_to_disk (sc)) + return false; + return write_disk_case (sc, row, column, values, value_cnt); + } + + c = sparse_array_insert (sc->memory, row); + case_create (c, sc->column_cnt); + if (sc->default_columns != NULL + && (column != 0 || value_cnt != sc->column_cnt)) + case_copy_in (c, 0, sc->default_columns, sc->column_cnt); + } + case_copy_in (c, column, values, value_cnt); + return true; + } + else + return write_disk_case (sc, row, column, values, value_cnt); +} + +/* Writes the VALUE_CNT values in VALUES to columns + START_COLUMN...(START_COLUMN + VALUE_CNT), exclusive, in every + row in SC, even those rows that have not yet been written. + Returns true if successful, false on I/O error. + + The runtime of this function is linear in the number of rows + in SC that have already been written. */ +bool +sparse_cases_write_columns (struct sparse_cases *sc, size_t start_column, + const union value values[], size_t value_cnt) +{ + assert (value_cnt <= sc->column_cnt); + assert (start_column + value_cnt <= sc->column_cnt); + + /* Set defaults. */ + if (sc->default_columns == NULL) + sc->default_columns = xnmalloc (sc->column_cnt, + sizeof *sc->default_columns); + memcpy (sc->default_columns + start_column, values, + value_cnt * sizeof *sc->default_columns); + + /* Set individual rows. */ + if (sc->memory != NULL) + { + struct ccase *c; + unsigned long int idx; + + for (c = sparse_array_scan (sc->memory, NULL, &idx); c != NULL; + c = sparse_array_scan (sc->memory, &idx, &idx)) + case_copy_in (c, start_column, values, value_cnt); + } + else + { + const struct range_set_node *node; + + for (node = range_set_first (sc->disk_cases); node != NULL; + node = range_set_next (sc->disk_cases, node)) + { + unsigned long int start = range_set_node_get_start (node); + unsigned long int end = range_set_node_get_end (node); + unsigned long int row; + + for (row = start; row < end; row++) + case_tmpfile_put_values (sc->disk, row, + start_column, values, value_cnt); + } + + if (case_tmpfile_error (sc->disk)) + return false; + } + return true; +} diff --git a/src/data/sparse-cases.h b/src/data/sparse-cases.h new file mode 100644 index 00000000..fce0377f --- /dev/null +++ b/src/data/sparse-cases.h @@ -0,0 +1,68 @@ +/* PSPP - computes sample statistics. + Copyright (C) 2007 Free Software Foundation, Inc. + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 2 of the + License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + 02110-1301, USA. */ + +/* Sparse array of cases. + + Implements a 2-d sparse array in which each row represents a + case, each column represents a variable, and each intersection + contains a `union value'. Data in the array may be accessed + randomly by column and row. When the number of cases stored + in the array is small, the data is stored in memory in memory; + when it is large, the data is stored in a temporary file. + + The sparse_cases_write_columns function provides a somewhat + unusual ability: to write a given value to every row in a + column or set of columns. This overwrites any values + previously written into those columns. For rows that have + never been written, this function sets "default" values that + later writes can override. + + The array keeps track of which row have been written. If + sparse_cases_write_columns has been used, reading from a row + that has never been written yields the default values; + otherwise, reading from such a row in an error. It is + permissible to write to only some columns in a row and leave + the rest of the row's data undefined (or, if + sparse_cases_write_columns has been used, at the default + values). The array does not keep track of which columns in a + row have never been written, but reading values that have + never been written or set as defaults yields undefined + behavior. */ + +#ifndef DATA_SPARSE_CASES_H +#define DATA_SPARSE_CASES_H 1 + +#include +#include +#include + +struct sparse_cases *sparse_cases_create (size_t value_cnt); +struct sparse_cases *sparse_cases_clone (const struct sparse_cases *); +void sparse_cases_destroy (struct sparse_cases *); + +size_t sparse_cases_get_value_cnt (const struct sparse_cases *); + +bool sparse_cases_contains_row (const struct sparse_cases *, casenumber row); +bool sparse_cases_read (struct sparse_cases *, casenumber row, size_t column, + union value[], size_t value_cnt); +bool sparse_cases_write (struct sparse_cases *, casenumber row, size_t column, + const union value[], size_t value_cnt); +bool sparse_cases_write_columns (struct sparse_cases *, size_t start_column, + const union value[], size_t value_cnt); + +#endif /* data/sparse-cases.h */