work on docs

[pspp] / src / data / sys-file-private.c
diff --git a/src/data/sys-file-private.c b/src/data/sys-file-private.c

index f544a810580e2c07cd454c1b47d4df2556856283..74b0208b9387e9804f6ae725c1cf771ce3de9672 100644 (file)
--- a/src/data/sys-file-private.c
+++ b/src/data/sys-file-private.c
@@ -1,5 +1,5 @@
  /* PSPP - a program for statistical analysis.
-   Copyright (C) 2006 Free Software Foundation, Inc.
+   Copyright (C) 2006, 2009, 2010 Free Software Foundation, Inc.
  
     This program is free software: you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
@@ -15,29 +15,268 @@
     along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  
  #include <config.h>
-#include "sys-file-private.h"
  
-#include <data/value.h>
-#include <libpspp/assertion.h>
+#include "data/sys-file-private.h"
  
-/* Return the number of bytes used when writing case_data for a variable
-   of WIDTH */
-int
+#include "data/dictionary.h"
+#include "data/value.h"
+#include "data/variable.h"
+#include "libpspp/assertion.h"
+#include "libpspp/misc.h"
+
+#include "gl/c-strcase.h"
+#include "gl/minmax.h"
+#include "gl/xalloc.h"
+
+/* Number of bytes really stored in each segment of a very long
+   string variable. */
+#define REAL_VLS_CHUNK 255
+
+/* Number of bytes per segment by which the amount of space for
+   very long string variables is allocated. */
+#define EFFECTIVE_VLS_CHUNK 252
+
+/* Returns true if WIDTH is a very long string width,
+   false otherwise. */
+static bool
+is_very_long (int width)
+{
+  return width >= 256;
+}
+
+/* Returns the smaller of A or B.
+   (Defined as a function to avoid evaluating A or B more than
+   once.) */
+static int
+min_int (int a, int b)
+{
+  return MIN (a, b);
+}
+
+/* Returns the larger of A or B.
+   (Defined as a function to avoid evaluating A or B more than
+   once.) */
+static int
+max_int (int a, int b)
+{
+  return MAX (a, b);
+}
+
+/* Returns the number of bytes of uncompressed case data used for
+   writing a variable of the given WIDTH to a system file.  All
+   required space is included, including trailing padding and
+   internal padding. */
+static int
  sfm_width_to_bytes (int width)
  {
+  int bytes;
+
    assert (width >= 0);
  
    if (width == 0)
-    return MAX_SHORT_STRING;
-  else if (width < MIN_VERY_LONG_STRING)
-    return ROUND_UP (width, MAX_SHORT_STRING);
+    bytes = 8;
+  else if (!is_very_long (width))
+    bytes = width;
    else
      {
-      int chunks = width / EFFECTIVE_LONG_STRING_LENGTH ;
-      int remainder = width % EFFECTIVE_LONG_STRING_LENGTH ;
-      int bytes = remainder + (chunks * MIN_VERY_LONG_STRING);
-      return ROUND_UP (bytes, MAX_SHORT_STRING);
+      int chunks = width / EFFECTIVE_VLS_CHUNK;
+      int remainder = width % EFFECTIVE_VLS_CHUNK;
+      bytes = remainder + (chunks * ROUND_UP (REAL_VLS_CHUNK, 8));
      }
+  return ROUND_UP (bytes, 8);
+}
+
+/* Returns the number of 8-byte units (octs) used to write data
+   for a variable of the given WIDTH. */
+int
+sfm_width_to_octs (int width)
+{
+  return sfm_width_to_bytes (width) / 8;
  }
  
+/* Returns the number of "segments" used for writing case data
+   for a variable of the given WIDTH.  A segment is a physical
+   variable in the system file that represents some piece of a
+   logical variable as seen by a PSPP user.  Only very long
+   string variables have more than one segment. */
+int
+sfm_width_to_segments (int width)
+{
+  assert (width >= 0);
  
+  return !is_very_long (width) ? 1 : DIV_RND_UP (width, EFFECTIVE_VLS_CHUNK);
+}
+
+/* Returns the width to allocate to the given SEGMENT within a
+   variable of the given WIDTH.  A segment is a physical variable
+   in the system file that represents some piece of a logical
+   variable as seen by a PSPP user. */
+int
+sfm_segment_alloc_width (int width, int segment)
+{
+  assert (segment < sfm_width_to_segments (width));
+
+  return (!is_very_long (width) ? width
+          : segment < sfm_width_to_segments (width) - 1 ? 255
+          : width - segment * EFFECTIVE_VLS_CHUNK);
+}
+
+/* Returns the number of bytes to allocate to the given SEGMENT
+   within a variable of the given width.  This is the same as
+   sfm_segment_alloc_width, except that a numeric value takes up
+   8 bytes despite having a width of 0. */
+static int
+sfm_segment_alloc_bytes (int width, int segment)
+{
+  assert (segment < sfm_width_to_segments (width));
+  return (width == 0 ? 8
+          : ROUND_UP (sfm_segment_alloc_width (width, segment), 8));
+}
+
+/* Returns the number of bytes in the given SEGMENT within a
+   variable of the given WIDTH that are actually used to store
+   data.  For a numeric value (WIDTH of 0), this is 8 bytes; for
+   a string value less than 256 bytes wide, it is WIDTH bytes.
+   For very long string values, the calculation is more
+   complicated and ranges between 255 bytes for the first segment
+   to as little as 0 bytes for final segments. */
+static int
+sfm_segment_used_bytes (int width, int segment)
+{
+  assert (segment < sfm_width_to_segments (width));
+  return (width == 0 ? 8
+          : !is_very_long (width) ? width
+          : max_int (0, min_int (width - REAL_VLS_CHUNK * segment,
+                                 REAL_VLS_CHUNK)));
+}
+
+/* Returns the number of bytes at the end of the given SEGMENT
+   within a variable of the given WIDTH that are not used for
+   data; that is, the number of bytes that must be padded with
+   data that a reader ignores. */
+static int
+sfm_segment_padding (int width, int segment)
+{
+  return (sfm_segment_alloc_bytes (width, segment)
+          - sfm_segment_used_bytes (width, segment));
+}
+
+/* Returns the byte offset of the start of the given SEGMENT
+   within a variable of the given WIDTH.  The first segment
+   starts at offset 0; only very long string variables have any
+   other segments. */
+static int
+sfm_segment_offset (int width, int segment)
+{
+  assert (segment < sfm_width_to_segments (width));
+  return min_int (REAL_VLS_CHUNK * segment, width);
+}
+
+/* Returns the byte offset of the start of the given SEGMENT
+   within a variable of the given WIDTH, given the (incorrect)
+   assumption that there are EFFECTIVE_VLS_CHUNK bytes per
+   segment.  (Use of this function is questionable at best.) */
+int
+sfm_segment_effective_offset (int width, int segment)
+{
+  assert (segment < sfm_width_to_segments (width));
+  return EFFECTIVE_VLS_CHUNK * segment;
+}
+
+/* Creates and initializes an array of struct sfm_vars that
+   describe how a case drawn from dictionary DICT is laid out in
+   a system file.  Returns the number of segments in a case.  A
+   segment is a physical variable in the system file that
+   represents some piece of a logical variable as seen by a PSPP
+   user.
+
+   The array is allocated with malloc and stored in *SFM_VARS,
+   and its number of elements is stored in *SFM_N_VARS.  The
+   caller is responsible for freeing it when it is no longer
+   needed. */
+int
+sfm_dictionary_to_sfm_vars (const struct dictionary *dict,
+                            struct sfm_var **sfm_vars, size_t *sfm_n_vars)
+{
+  size_t n_vars = dict_get_n_vars (dict);
+  size_t n_segments;
+  size_t i;
+
+  /* Estimate the number of sfm_vars that will be needed.
+     We might not need all of these, because very long string
+     variables can have segments that are all padding, which do
+     not need sfm_vars of their own. */
+  n_segments = 0;
+  for (i = 0; i < n_vars; i++)
+    {
+      const struct variable *v = dict_get_var (dict, i);
+      n_segments += sfm_width_to_segments (var_get_width (v));
+    }
+
+  /* Compose the sfm_vars. */
+  *sfm_vars = xnmalloc (n_segments, sizeof **sfm_vars);
+  *sfm_n_vars = 0;
+  for (i = 0; i < n_vars; i++)
+    {
+      const struct variable *dv = dict_get_var (dict, i);
+      int width = var_get_width (dv);
+      int j;
+
+      for (j = 0; j < sfm_width_to_segments (width); j++)
+        {
+          int used_bytes = sfm_segment_used_bytes (width, j);
+          int padding = sfm_segment_padding (width, j);
+          struct sfm_var *sv;
+          if (used_bytes != 0)
+            {
+              sv = &(*sfm_vars)[(*sfm_n_vars)++];
+              sv->var_width = width;
+              sv->segment_width = width == 0 ? 0 : used_bytes;
+              sv->case_index = var_get_case_index (dv);
+              sv->offset = sfm_segment_offset (width, j);
+              sv->padding = padding;
+            }
+          else
+            {
+              /* Segment is all padding.  Just add it to the
+                 previous segment. */
+              sv = &(*sfm_vars)[*sfm_n_vars - 1];
+              sv->padding += padding;
+            }
+          assert ((sv->segment_width + sv->padding) % 8 == 0);
+        }
+    }
+
+  return n_segments;
+}
+\f
+/* Given the name of an encoding, returns the codepage number to use in the
+   'character_code' member of the machine integer info record for writing a
+   system file. */
+int
+sys_get_codepage_from_encoding (const char *name)
+{
+  const struct sys_encoding *e;
+
+  for (e = sys_codepage_name_to_number; e->name != NULL; e++)
+    if (!c_strcasecmp (name, e->name))
+      return e->number;
+
+  return 0;
+}
+
+/* Given a codepage number from the 'character_code' member of the machine
+   integer info record in a system file, returns a corresponding encoding name.
+   Most encodings have multiple aliases; the one returned is the one that would
+   be used in the character encoding record. */
+const char *
+sys_get_encoding_from_codepage (int codepage)
+{
+  const struct sys_encoding *e;
+
+  for (e = sys_codepage_number_to_name; e->name != NULL; e++)
+    if (codepage == e->number)
+      return e->name;
+
+  return NULL;
+}