1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2007, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "libpspp/sparse-xarray.h"
26 #include "libpspp/assertion.h"
27 #include "libpspp/ext-array.h"
28 #include "libpspp/misc.h"
29 #include "libpspp/range-set.h"
30 #include "libpspp/sparse-array.h"
33 #include "gl/minmax.h"
34 #include "gl/xalloc.h"
36 /* A sparse array of arrays of bytes. */
39 size_t n_bytes; /* Number of bytes per row. */
40 uint8_t *default_row; /* Defaults for unwritten rows. */
41 unsigned long int max_memory_rows; /* Max rows before dumping to disk. */
42 struct sparse_array *memory; /* Backing, if stored in memory. */
43 struct ext_array *disk; /* Backing, if stored on disk. */
44 struct range_set *disk_rows; /* Allocated rows, if on disk. */
48 range_is_valid (const struct sparse_xarray *sx, size_t ofs, size_t n)
50 return n <= sx->n_bytes && ofs <= sx->n_bytes && ofs + n <= sx->n_bytes;
53 /* Creates and returns a new sparse array of arrays of bytes.
54 Each row in the sparse array will consist of N_BYTES bytes.
55 If fewer than MAX_MEMORY_ROWS rows are added to the array,
56 then it will be kept in memory; if more than that are added,
57 then it will be stored on disk. */
58 struct sparse_xarray *
59 sparse_xarray_create (size_t n_bytes, size_t max_memory_rows)
61 struct sparse_xarray *sx = xmalloc (sizeof *sx);
62 sx->n_bytes = n_bytes;
63 sx->default_row = xzalloc (n_bytes);
64 sx->max_memory_rows = max_memory_rows;
65 sx->memory = sparse_array_create (sizeof (uint8_t *));
71 /* Creates and returns a new sparse array of rows that contains
72 the same data as OLD. Returns a null pointer if cloning
74 struct sparse_xarray *
75 sparse_xarray_clone (const struct sparse_xarray *old)
77 struct sparse_xarray *new = xmalloc (sizeof *new);
79 new->n_bytes = old->n_bytes;
81 new->default_row = xmemdup (old->default_row, old->n_bytes);
83 new->max_memory_rows = old->max_memory_rows;
85 if (old->memory != NULL)
87 unsigned long int idx;
90 new->memory = sparse_array_create (sizeof (uint8_t *));
91 for (old_row = sparse_array_first (old->memory, &idx); old_row != NULL;
92 old_row = sparse_array_next (old->memory, idx, &idx))
94 uint8_t **new_row = sparse_array_insert (new->memory, idx);
95 *new_row = xmemdup (*old_row, new->n_bytes);
101 if (old->disk != NULL)
103 const struct range_set_node *node;
104 void *tmp = xmalloc (old->n_bytes);
106 new->disk = ext_array_create ();
107 new->disk_rows = range_set_clone (old->disk_rows, NULL);
108 RANGE_SET_FOR_EACH (node, old->disk_rows)
110 unsigned long int start = range_set_node_get_start (node);
111 unsigned long int end = range_set_node_get_end (node);
112 unsigned long int idx;
114 for (idx = start; idx < end; idx++)
116 off_t offset = (off_t) idx * old->n_bytes;
117 if (!ext_array_read (old->disk, offset, old->n_bytes, tmp)
118 || !ext_array_write (new->disk, offset, old->n_bytes, tmp))
121 sparse_xarray_destroy (new);
131 new->disk_rows = NULL;
138 free_memory_rows (struct sparse_xarray *sx)
140 if (sx->memory != NULL)
142 unsigned long int idx;
144 for (row = sparse_array_first (sx->memory, &idx); row != NULL;
145 row = sparse_array_next (sx->memory, idx, &idx))
147 sparse_array_destroy (sx->memory);
152 /* Destroys sparse array of rows SX. */
154 sparse_xarray_destroy (struct sparse_xarray *sx)
158 free (sx->default_row);
159 free_memory_rows (sx);
160 ext_array_destroy (sx->disk);
161 range_set_destroy (sx->disk_rows);
166 /* Returns the number of bytes in each row in SX. */
168 sparse_xarray_get_n_columns (const struct sparse_xarray *sx)
173 /* Returns the number of rows in SX. */
175 sparse_xarray_get_n_rows (const struct sparse_xarray *sx)
179 unsigned long int idx;
180 return sparse_array_last (sx->memory, &idx) != NULL ? idx + 1 : 0;
184 const struct range_set_node *last = range_set_last (sx->disk_rows);
185 return last != NULL ? range_set_node_get_end (last) : 0;
189 /* Dumps the rows in SX, which must currently be stored in
190 memory, to disk. Returns true if successful, false on I/O
193 dump_sparse_xarray_to_disk (struct sparse_xarray *sx)
195 unsigned long int idx;
198 assert (sx->memory != NULL);
199 assert (sx->disk == NULL);
201 sx->disk = ext_array_create ();
202 sx->disk_rows = range_set_create ();
204 for (row = sparse_array_first (sx->memory, &idx); row != NULL;
205 row = sparse_array_next (sx->memory, idx, &idx))
207 if (!ext_array_write (sx->disk, (off_t) idx * sx->n_bytes, sx->n_bytes,
210 ext_array_destroy (sx->disk);
212 range_set_destroy (sx->disk_rows);
213 sx->disk_rows = NULL;
216 range_set_set1 (sx->disk_rows, idx, 1);
218 free_memory_rows (sx);
222 /* Returns true if any data has ever been written to ROW in SX,
225 sparse_xarray_contains_row (const struct sparse_xarray *sx,
226 unsigned long int row)
228 return (sx->memory != NULL
229 ? sparse_array_get (sx->memory, row) != NULL
230 : range_set_contains (sx->disk_rows, row));
233 /* Reads columns COLUMNS...(COLUMNS + VALUE_CNT), exclusive, in
234 the given ROW in SX, into the VALUE_CNT values in VALUES.
235 Returns true if successful, false on I/O error. */
237 sparse_xarray_read (const struct sparse_xarray *sx, unsigned long int row,
238 size_t start, size_t n, void *data)
240 assert (range_is_valid (sx, start, n));
242 if (sx->memory != NULL)
244 uint8_t **p = sparse_array_get (sx->memory, row);
247 memcpy (data, *p + start, n);
253 if (range_set_contains (sx->disk_rows, row))
254 return ext_array_read (sx->disk, (off_t) row * sx->n_bytes + start,
258 memcpy (data, sx->default_row + start, n);
262 /* Implements sparse_xarray_write for an on-disk sparse_xarray. */
264 write_disk_row (struct sparse_xarray *sx, unsigned long int row,
265 size_t start, size_t n, const void *data)
267 off_t ofs = (off_t) row * sx->n_bytes;
268 if (range_set_contains (sx->disk_rows, row))
269 return ext_array_write (sx->disk, ofs + start, n, data);
272 range_set_set1 (sx->disk_rows, row, 1);
273 return (ext_array_write (sx->disk, ofs, start, sx->default_row)
274 && ext_array_write (sx->disk, ofs + start, n, data)
275 && ext_array_write (sx->disk, ofs + start + n,
276 sx->n_bytes - start - n,
277 sx->default_row + start + n));
281 /* Writes the VALUE_CNT values in VALUES into columns
282 COLUMNS...(COLUMNS + VALUE_CNT), exclusive, in the given ROW
284 Returns true if successful, false on I/O error. */
286 sparse_xarray_write (struct sparse_xarray *sx, unsigned long int row,
287 size_t start, size_t n, const void *data)
289 assert (range_is_valid (sx, start, n));
290 if (sx->memory != NULL)
292 uint8_t **p = sparse_array_get (sx->memory, row);
295 if (sparse_array_count (sx->memory) < sx->max_memory_rows)
297 p = sparse_array_insert (sx->memory, row);
298 *p = xmemdup (sx->default_row, sx->n_bytes);
302 if (!dump_sparse_xarray_to_disk (sx))
304 return write_disk_row (sx, row, start, n, data);
307 memcpy (*p + start, data, n);
311 return write_disk_row (sx, row, start, n, data);
314 /* Writes the VALUE_CNT values in VALUES to columns
315 START_COLUMN...(START_COLUMN + VALUE_CNT), exclusive, in every
316 row in SX, even those rows that have not yet been written.
317 Returns true if successful, false on I/O error.
319 The runtime of this function is linear in the number of rows
320 in SX that have already been written. */
322 sparse_xarray_write_columns (struct sparse_xarray *sx, size_t start,
323 size_t n, const void *data)
325 assert (range_is_valid (sx, start, n));
328 memcpy (sx->default_row + start, data, n);
330 /* Set individual rows. */
331 if (sx->memory != NULL)
333 unsigned long int idx;
336 for (p = sparse_array_first (sx->memory, &idx); p != NULL;
337 p = sparse_array_next (sx->memory, idx, &idx))
338 memcpy (*p + start, data, n);
342 const struct range_set_node *node;
344 RANGE_SET_FOR_EACH (node, sx->disk_rows)
346 unsigned long int start_row = range_set_node_get_start (node);
347 unsigned long int end_row = range_set_node_get_end (node);
348 unsigned long int row;
350 for (row = start_row; row < end_row; row++)
352 off_t offset = (off_t) row * sx->n_bytes;
353 if (!ext_array_write (sx->disk, offset + start, n, data))
358 if (ext_array_error (sx->disk))
364 static unsigned long int
365 scan_first (const struct sparse_xarray *sx)
369 unsigned long int idx;
370 return sparse_array_first (sx->memory, &idx) ? idx : ULONG_MAX;
373 return range_set_scan (sx->disk_rows, 0);
376 static unsigned long int
377 scan_next (const struct sparse_xarray *sx, unsigned long int start)
381 unsigned long int idx;
382 return sparse_array_next (sx->memory, start, &idx) ? idx : ULONG_MAX;
385 return range_set_scan (sx->disk_rows, start + 1);
388 /* Only works for rows for which sparse_xarray_contains_row()
389 would return true. */
391 get_row (const struct sparse_xarray *sx, unsigned long int idx,
396 uint8_t **p = sparse_array_get (sx->memory, idx);
399 else if (ext_array_read (sx->disk, (off_t) idx * sx->n_bytes,
400 sx->n_bytes, buffer))
406 /* Iterates over all the rows in SX and DX, passing each pair of
407 rows with equal indexes to CB. CB's modifications, if any, to
408 destination rows are written back to DX.
410 All rows that are actually in use in SX or in DX or both are
411 passed to CB. If a row is in use in SX but not in DX, or vice
412 versa, then the "default" row (as set by
413 sparse_xarray_write_columns) is passed as the contents of the
416 CB is also called once with the default row from SX and the
417 default row from DX. Modifying the data passed as the default
418 row from DX will change DX's default row.
420 Returns true if successful, false if I/O on SX or DX fails or
421 if CB returns false. On failure, the contents of DX are
424 sparse_xarray_copy (const struct sparse_xarray *sx, struct sparse_xarray *dx,
425 bool (*cb) (const void *src, void *dst, void *aux),
430 if (!cb (sx->default_row, dx->default_row, aux))
437 unsigned long int idx;
440 for (row = sparse_array_first (sx->memory, &idx); row != NULL;
441 row = sparse_array_next (sx->memory, idx, &idx))
443 success = cb (*row, *row, aux);
450 const struct range_set_node *node;
451 void *tmp = xmalloc (sx->n_bytes);
453 RANGE_SET_FOR_EACH (node, sx->disk_rows)
455 unsigned long int start = range_set_node_get_start (node);
456 unsigned long int end = range_set_node_get_end (node);
457 unsigned long int row;
459 for (row = start; row < end; row++)
461 off_t offset = (off_t) row * sx->n_bytes;
462 success = (ext_array_read (sx->disk, offset, sx->n_bytes,
464 && cb (tmp, tmp, aux)
465 && ext_array_write (dx->disk, offset,
477 unsigned long int src_idx = scan_first (sx);
478 unsigned long int dst_idx = scan_first (dx);
480 uint8_t *tmp_src_row = xmalloc (sx->n_bytes);
481 uint8_t *tmp_dst_row = xmalloc (dx->n_bytes);
485 unsigned long int idx;
486 const uint8_t *src_row;
489 /* Determine the index of the row to process. If
490 src_idx == dst_idx, then the row has been written in
491 both SX and DX. Otherwise, it has been written in
492 only the sparse_xarray corresponding to the smaller
493 index, and has the default contents in the other. */
494 idx = MIN (src_idx, dst_idx);
495 if (idx == ULONG_MAX)
498 /* Obtain a copy of the source row as src_row. */
500 src_row = get_row (sx, idx, tmp_src_row);
502 src_row = sx->default_row;
504 /* Obtain the destination row as dst_row. */
506 dst_row = get_row (dx, idx, tmp_dst_row);
508 && sparse_array_count (dx->memory) < dx->max_memory_rows)
510 uint8_t **p = sparse_array_insert (dx->memory, idx);
511 dst_row = *p = xmemdup (dx->default_row, dx->n_bytes);
515 memcpy (tmp_dst_row, dx->default_row, dx->n_bytes);
516 dst_row = tmp_dst_row;
519 /* Run the callback. */
520 success = cb (src_row, dst_row, aux);
524 /* Write back the destination row, if necessary. */
525 if (dst_row == tmp_dst_row)
527 success = sparse_xarray_write (dx, idx, 0, dx->n_bytes, dst_row);
533 /* Nothing to do: we modified the destination row in-place. */
536 /* Advance to the next row. */
538 src_idx = scan_next (sx, src_idx);
540 dst_idx = scan_next (dx, dst_idx);
550 /* Returns a hash value for SX suitable for use with the model
551 checker. The value in BASIS is folded into the hash.
553 The returned hash value is *not* suitable for storage and
554 retrieval of sparse_xarrays that have identical contents,
555 because it will return different hash values for
556 sparse_xarrays that have the same contents (and it's slow).
558 We use MD4 because it is much faster than MD5 or SHA-1 but its
559 collision resistance is just as good. */
561 sparse_xarray_model_checker_hash (const struct sparse_xarray *sx,
564 unsigned int hash[DIV_RND_UP (20, sizeof (unsigned int))];
568 md4_process_bytes (&basis, sizeof basis, &ctx);
569 md4_process_bytes (&sx->n_bytes, sizeof sx->n_bytes, &ctx);
570 md4_process_bytes (sx->default_row, sx->n_bytes, &ctx);
574 unsigned long int idx;
577 md4_process_bytes ("m", 1, &ctx);
578 md4_process_bytes (&sx->max_memory_rows, sizeof sx->max_memory_rows,
580 for (row = sparse_array_first (sx->memory, &idx); row != NULL;
581 row = sparse_array_next (sx->memory, idx, &idx))
583 md4_process_bytes (&idx, sizeof idx, &ctx);
584 md4_process_bytes (*row, sx->n_bytes, &ctx);
589 const struct range_set_node *node;
590 void *tmp = xmalloc (sx->n_bytes);
592 md4_process_bytes ("d", 1, &ctx);
593 RANGE_SET_FOR_EACH (node, sx->disk_rows)
595 unsigned long int start = range_set_node_get_start (node);
596 unsigned long int end = range_set_node_get_end (node);
597 unsigned long int idx;
599 for (idx = start; idx < end; idx++)
601 off_t offset = (off_t) idx * sx->n_bytes;
602 if (!ext_array_read (sx->disk, offset, sx->n_bytes, tmp))
604 md4_process_bytes (&idx, sizeof idx, &ctx);
605 md4_process_bytes (tmp, sx->n_bytes, &ctx);
610 md4_finish_ctx (&ctx, hash);