1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2007, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "libpspp/sparse-xarray.h"
26 #include "libpspp/assertion.h"
27 #include "libpspp/ext-array.h"
28 #include "libpspp/misc.h"
29 #include "libpspp/range-set.h"
30 #include "libpspp/sparse-array.h"
33 #include "gl/minmax.h"
34 #include "gl/xalloc.h"
36 /* A sparse array of arrays of bytes. */
39 size_t n_bytes; /* Number of bytes per row. */
40 uint8_t *default_row; /* Defaults for unwritten rows. */
41 unsigned long int max_memory_rows; /* Max rows before dumping to disk. */
42 struct sparse_array *memory; /* Backing, if stored in memory. */
43 struct ext_array *disk; /* Backing, if stored on disk. */
44 struct range_set *disk_rows; /* Allocated rows, if on disk. */
48 range_is_valid (const struct sparse_xarray *sx, size_t ofs, size_t n)
50 return n <= sx->n_bytes && ofs <= sx->n_bytes && ofs + n <= sx->n_bytes;
53 /* Creates and returns a new sparse array of arrays of bytes.
54 Each row in the sparse array will consist of N_BYTES bytes.
55 If fewer than MAX_MEMORY_ROWS rows are added to the array,
56 then it will be kept in memory; if more than that are added,
57 then it will be stored on disk. */
58 struct sparse_xarray *
59 sparse_xarray_create (size_t n_bytes, size_t max_memory_rows)
61 struct sparse_xarray *sx = xmalloc (sizeof *sx);
62 sx->n_bytes = n_bytes;
63 sx->default_row = xzalloc (n_bytes);
64 sx->max_memory_rows = max_memory_rows;
65 sx->memory = sparse_array_create (sizeof (uint8_t *));
71 /* Creates and returns a new sparse array of rows that contains
72 the same data as OLD. Returns a null pointer if cloning
74 struct sparse_xarray *
75 sparse_xarray_clone (const struct sparse_xarray *old)
77 struct sparse_xarray *new = xmalloc (sizeof *new);
79 new->n_bytes = old->n_bytes;
81 new->default_row = xmemdup (old->default_row, old->n_bytes);
83 new->max_memory_rows = old->max_memory_rows;
85 if (old->memory != NULL)
87 unsigned long int idx;
90 new->memory = sparse_array_create (sizeof (uint8_t *));
91 for (old_row = sparse_array_first (old->memory, &idx); old_row != NULL;
92 old_row = sparse_array_next (old->memory, idx, &idx))
94 uint8_t **new_row = sparse_array_insert (new->memory, idx);
95 *new_row = xmemdup (*old_row, new->n_bytes);
101 if (old->disk != NULL)
103 const struct range_set_node *node;
104 void *tmp = xmalloc (old->n_bytes);
106 new->disk = ext_array_create ();
107 new->disk_rows = range_set_clone (old->disk_rows, NULL);
108 RANGE_SET_FOR_EACH (node, old->disk_rows)
110 unsigned long int start = range_set_node_get_start (node);
111 unsigned long int end = range_set_node_get_end (node);
112 unsigned long int idx;
114 for (idx = start; idx < end; idx++)
116 off_t offset = (off_t) idx * old->n_bytes;
117 if (!ext_array_read (old->disk, offset, old->n_bytes, tmp)
118 || !ext_array_write (new->disk, offset, old->n_bytes, tmp))
121 sparse_xarray_destroy (new);
131 new->disk_rows = NULL;
137 /* Destroys sparse array of rows SX. */
139 sparse_xarray_destroy (struct sparse_xarray *sx)
143 free (sx->default_row);
144 if (sx->memory != NULL)
146 unsigned long int idx;
148 for (row = sparse_array_first (sx->memory, &idx); row != NULL;
149 row = sparse_array_next (sx->memory, idx, &idx))
151 sparse_array_destroy (sx->memory);
153 ext_array_destroy (sx->disk);
154 range_set_destroy (sx->disk_rows);
159 /* Returns the number of bytes in each row in SX. */
161 sparse_xarray_get_n_columns (const struct sparse_xarray *sx)
166 /* Returns the number of rows in SX. */
168 sparse_xarray_get_n_rows (const struct sparse_xarray *sx)
172 unsigned long int idx;
173 return sparse_array_last (sx->memory, &idx) != NULL ? idx + 1 : 0;
177 const struct range_set_node *last = range_set_last (sx->disk_rows);
178 return last != NULL ? range_set_node_get_end (last) : 0;
182 /* Dumps the rows in SX, which must currently be stored in
183 memory, to disk. Returns true if successful, false on I/O
186 dump_sparse_xarray_to_disk (struct sparse_xarray *sx)
188 unsigned long int idx;
191 assert (sx->memory != NULL);
192 assert (sx->disk == NULL);
194 sx->disk = ext_array_create ();
195 sx->disk_rows = range_set_create ();
197 for (row = sparse_array_first (sx->memory, &idx); row != NULL;
198 row = sparse_array_next (sx->memory, idx, &idx))
200 if (!ext_array_write (sx->disk, (off_t) idx * sx->n_bytes, sx->n_bytes,
203 ext_array_destroy (sx->disk);
205 range_set_destroy (sx->disk_rows);
206 sx->disk_rows = NULL;
209 range_set_set1 (sx->disk_rows, idx, 1);
211 sparse_array_destroy (sx->memory);
216 /* Returns true if any data has ever been written to ROW in SX,
219 sparse_xarray_contains_row (const struct sparse_xarray *sx,
220 unsigned long int row)
222 return (sx->memory != NULL
223 ? sparse_array_get (sx->memory, row) != NULL
224 : range_set_contains (sx->disk_rows, row));
227 /* Reads columns COLUMNS...(COLUMNS + VALUE_CNT), exclusive, in
228 the given ROW in SX, into the VALUE_CNT values in VALUES.
229 Returns true if successful, false on I/O error. */
231 sparse_xarray_read (const struct sparse_xarray *sx, unsigned long int row,
232 size_t start, size_t n, void *data)
234 assert (range_is_valid (sx, start, n));
236 if (sx->memory != NULL)
238 uint8_t **p = sparse_array_get (sx->memory, row);
241 memcpy (data, *p + start, n);
247 if (range_set_contains (sx->disk_rows, row))
248 return ext_array_read (sx->disk, (off_t) row * sx->n_bytes + start,
252 memcpy (data, sx->default_row + start, n);
256 /* Implements sparse_xarray_write for an on-disk sparse_xarray. */
258 write_disk_row (struct sparse_xarray *sx, unsigned long int row,
259 size_t start, size_t n, const void *data)
261 off_t ofs = (off_t) row * sx->n_bytes;
262 if (range_set_contains (sx->disk_rows, row))
263 return ext_array_write (sx->disk, ofs + start, n, data);
266 range_set_set1 (sx->disk_rows, row, 1);
267 return (ext_array_write (sx->disk, ofs, start, sx->default_row)
268 && ext_array_write (sx->disk, ofs + start, n, data)
269 && ext_array_write (sx->disk, ofs + start + n,
270 sx->n_bytes - start - n,
271 sx->default_row + start + n));
275 /* Writes the VALUE_CNT values in VALUES into columns
276 COLUMNS...(COLUMNS + VALUE_CNT), exclusive, in the given ROW
278 Returns true if successful, false on I/O error. */
280 sparse_xarray_write (struct sparse_xarray *sx, unsigned long int row,
281 size_t start, size_t n, const void *data)
283 assert (range_is_valid (sx, start, n));
284 if (sx->memory != NULL)
286 uint8_t **p = sparse_array_get (sx->memory, row);
289 if (sparse_array_count (sx->memory) < sx->max_memory_rows)
291 p = sparse_array_insert (sx->memory, row);
292 *p = xmemdup (sx->default_row, sx->n_bytes);
296 if (!dump_sparse_xarray_to_disk (sx))
298 return write_disk_row (sx, row, start, n, data);
301 memcpy (*p + start, data, n);
305 return write_disk_row (sx, row, start, n, data);
308 /* Writes the VALUE_CNT values in VALUES to columns
309 START_COLUMN...(START_COLUMN + VALUE_CNT), exclusive, in every
310 row in SX, even those rows that have not yet been written.
311 Returns true if successful, false on I/O error.
313 The runtime of this function is linear in the number of rows
314 in SX that have already been written. */
316 sparse_xarray_write_columns (struct sparse_xarray *sx, size_t start,
317 size_t n, const void *data)
319 assert (range_is_valid (sx, start, n));
322 memcpy (sx->default_row + start, data, n);
324 /* Set individual rows. */
325 if (sx->memory != NULL)
327 unsigned long int idx;
330 for (p = sparse_array_first (sx->memory, &idx); p != NULL;
331 p = sparse_array_next (sx->memory, idx, &idx))
332 memcpy (*p + start, data, n);
336 const struct range_set_node *node;
338 RANGE_SET_FOR_EACH (node, sx->disk_rows)
340 unsigned long int start_row = range_set_node_get_start (node);
341 unsigned long int end_row = range_set_node_get_end (node);
342 unsigned long int row;
344 for (row = start_row; row < end_row; row++)
346 off_t offset = (off_t) row * sx->n_bytes;
347 if (!ext_array_write (sx->disk, offset + start, n, data))
352 if (ext_array_error (sx->disk))
358 static unsigned long int
359 scan_first (const struct sparse_xarray *sx)
363 unsigned long int idx;
364 return sparse_array_first (sx->memory, &idx) ? idx : ULONG_MAX;
367 return range_set_scan (sx->disk_rows, 0);
370 static unsigned long int
371 scan_next (const struct sparse_xarray *sx, unsigned long int start)
375 unsigned long int idx;
376 return sparse_array_next (sx->memory, start, &idx) ? idx : ULONG_MAX;
379 return range_set_scan (sx->disk_rows, start + 1);
382 /* Only works for rows for which sparse_xarray_contains_row()
383 would return true. */
385 get_row (const struct sparse_xarray *sx, unsigned long int idx,
390 uint8_t **p = sparse_array_get (sx->memory, idx);
393 else if (ext_array_read (sx->disk, (off_t) idx * sx->n_bytes,
394 sx->n_bytes, buffer))
400 /* Iterates over all the rows in SX and DX, passing each pair of
401 rows with equal indexes to CB. CB's modifications, if any, to
402 destination rows are written back to DX.
404 All rows that are actually in use in SX or in DX or both are
405 passed to CB. If a row is in use in SX but not in DX, or vice
406 versa, then the "default" row (as set by
407 sparse_xarray_write_columns) is passed as the contents of the
410 CB is also called once with the default row from SX and the
411 default row from DX. Modifying the data passed as the default
412 row from DX will change DX's default row.
414 Returns true if successful, false if I/O on SX or DX fails or
415 if CB returns false. On failure, the contents of DX are
418 sparse_xarray_copy (const struct sparse_xarray *sx, struct sparse_xarray *dx,
419 bool (*cb) (const void *src, void *dst, void *aux),
424 if (!cb (sx->default_row, dx->default_row, aux))
431 unsigned long int idx;
434 for (row = sparse_array_first (sx->memory, &idx); row != NULL;
435 row = sparse_array_next (sx->memory, idx, &idx))
437 success = cb (*row, *row, aux);
444 const struct range_set_node *node;
445 void *tmp = xmalloc (sx->n_bytes);
447 RANGE_SET_FOR_EACH (node, sx->disk_rows)
449 unsigned long int start = range_set_node_get_start (node);
450 unsigned long int end = range_set_node_get_end (node);
451 unsigned long int row;
453 for (row = start; row < end; row++)
455 off_t offset = (off_t) row * sx->n_bytes;
456 success = (ext_array_read (sx->disk, offset, sx->n_bytes,
458 && cb (tmp, tmp, aux)
459 && ext_array_write (dx->disk, offset,
471 unsigned long int src_idx = scan_first (sx);
472 unsigned long int dst_idx = scan_first (dx);
474 uint8_t *tmp_src_row = xmalloc (sx->n_bytes);
475 uint8_t *tmp_dst_row = xmalloc (dx->n_bytes);
479 unsigned long int idx;
480 const uint8_t *src_row;
483 /* Determine the index of the row to process. If
484 src_idx == dst_idx, then the row has been written in
485 both SX and DX. Otherwise, it has been written in
486 only the sparse_xarray corresponding to the smaller
487 index, and has the default contents in the other. */
488 idx = MIN (src_idx, dst_idx);
489 if (idx == ULONG_MAX)
492 /* Obtain a copy of the source row as src_row. */
494 src_row = get_row (sx, idx, tmp_src_row);
496 src_row = sx->default_row;
498 /* Obtain the destination row as dst_row. */
500 dst_row = get_row (dx, idx, tmp_dst_row);
502 && sparse_array_count (dx->memory) < dx->max_memory_rows)
504 uint8_t **p = sparse_array_insert (dx->memory, idx);
505 dst_row = *p = xmemdup (dx->default_row, dx->n_bytes);
509 memcpy (tmp_dst_row, dx->default_row, dx->n_bytes);
510 dst_row = tmp_dst_row;
513 /* Run the callback. */
514 success = cb (src_row, dst_row, aux);
518 /* Write back the destination row, if necessary. */
519 if (dst_row == tmp_dst_row)
521 success = sparse_xarray_write (dx, idx, 0, dx->n_bytes, dst_row);
527 /* Nothing to do: we modified the destination row in-place. */
530 /* Advance to the next row. */
532 src_idx = scan_next (sx, src_idx);
534 dst_idx = scan_next (dx, dst_idx);
544 /* Returns a hash value for SX suitable for use with the model
545 checker. The value in BASIS is folded into the hash.
547 The returned hash value is *not* suitable for storage and
548 retrieval of sparse_xarrays that have identical contents,
549 because it will return different hash values for
550 sparse_xarrays that have the same contents (and it's slow).
552 We use MD4 because it is much faster than MD5 or SHA-1 but its
553 collision resistance is just as good. */
555 sparse_xarray_model_checker_hash (const struct sparse_xarray *sx,
558 unsigned int hash[DIV_RND_UP (20, sizeof (unsigned int))];
562 md4_process_bytes (&basis, sizeof basis, &ctx);
563 md4_process_bytes (&sx->n_bytes, sizeof sx->n_bytes, &ctx);
564 md4_process_bytes (sx->default_row, sx->n_bytes, &ctx);
568 unsigned long int idx;
571 md4_process_bytes ("m", 1, &ctx);
572 md4_process_bytes (&sx->max_memory_rows, sizeof sx->max_memory_rows,
574 for (row = sparse_array_first (sx->memory, &idx); row != NULL;
575 row = sparse_array_next (sx->memory, idx, &idx))
577 md4_process_bytes (&idx, sizeof idx, &ctx);
578 md4_process_bytes (*row, sx->n_bytes, &ctx);
583 const struct range_set_node *node;
584 void *tmp = xmalloc (sx->n_bytes);
586 md4_process_bytes ("d", 1, &ctx);
587 RANGE_SET_FOR_EACH (node, sx->disk_rows)
589 unsigned long int start = range_set_node_get_start (node);
590 unsigned long int end = range_set_node_get_end (node);
591 unsigned long int idx;
593 for (idx = start; idx < end; idx++)
595 off_t offset = (off_t) idx * sx->n_bytes;
596 if (!ext_array_read (sx->disk, offset, sx->n_bytes, tmp))
598 md4_process_bytes (&idx, sizeof idx, &ctx);
599 md4_process_bytes (tmp, sx->n_bytes, &ctx);
604 md4_finish_ctx (&ctx, hash);