From 3e30fb40d64fcf006b327a5f81934c14ef842111 Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Mon, 18 Jan 2010 15:13:02 -0800 Subject: [PATCH] New library for interned strings. An "interned" string is stored in a global hash table. Only one copy of any given string is kept in the hash table, which reduces memory usage in cases where there might otherwise be many duplicates of a given string. Interned strings can be compared for equality by comparing pointers, which can also be a significant advantage in some cases. Interned strings are immutable. This commit adds a general-purpose implementation of interned strings and adapts the implementation of value labels, which already had a special-purpose and less convenient implementation of interned strings, to use them. --- src/data/value-labels.c | 113 +++---------------------------------- src/data/value-labels.h | 10 +++- src/libpspp/automake.mk | 2 + src/libpspp/intern.c | 121 ++++++++++++++++++++++++++++++++++++++++ src/libpspp/intern.h | 41 ++++++++++++++ 5 files changed, 180 insertions(+), 107 deletions(-) create mode 100644 src/libpspp/intern.c create mode 100644 src/libpspp/intern.h diff --git a/src/data/value-labels.c b/src/data/value-labels.c index 0fe829ce..6d6b57a6 100644 --- a/src/data/value-labels.c +++ b/src/data/value-labels.c @@ -28,23 +28,12 @@ #include #include #include +#include #include #include #include "xalloc.h" -static struct atom *atom_create (const char *string); -static void atom_destroy (struct atom *); -static const char *atom_to_string (const struct atom *); - -/* Returns the label in VL. The caller must not modify or free - the returned value. */ -const char * -val_lab_get_label (const struct val_lab *vl) -{ - return atom_to_string (vl->label); -} - /* Creates and returns a new, empty set of value labels with the given WIDTH. */ struct val_labs * @@ -69,7 +58,7 @@ val_labs_clone (const struct val_labs *vls) copy = val_labs_create (vls->width); HMAP_FOR_EACH (label, struct val_lab, node, &vls->labels) - val_labs_add (copy, &label->value, atom_to_string (label->label)); + val_labs_add (copy, &label->value, label->label); return copy; } @@ -124,7 +113,7 @@ val_labs_clear (struct val_labs *vls) { hmap_delete (&vls->labels, &label->node); value_destroy (&label->value, vls->width); - atom_destroy (label->label); + intern_unref (label->label); free (label); } } @@ -144,7 +133,7 @@ do_add_val_lab (struct val_labs *vls, const union value *value, struct val_lab *lab = xmalloc (sizeof *lab); value_init (&lab->value, vls->width); value_copy (&lab->value, value, vls->width); - lab->label = atom_create (label); + lab->label = intern_new (label); hmap_insert (&vls->labels, &lab->node, value_hash (value, vls->width, 0)); } @@ -173,8 +162,8 @@ val_labs_replace (struct val_labs *vls, const union value *value, struct val_lab *vl = val_labs_lookup (vls, value); if (vl != NULL) { - atom_destroy (vl->label); - vl->label = atom_create (label); + intern_unref (vl->label); + vl->label = intern_new (label); } else do_add_val_lab (vls, value, label); @@ -186,7 +175,7 @@ val_labs_remove (struct val_labs *vls, struct val_lab *label) { hmap_delete (&vls->labels, &label->node); value_destroy (&label->value, vls->width); - atom_destroy (label->label); + intern_unref (label->label); free (label); } @@ -197,7 +186,7 @@ const char * val_labs_find (const struct val_labs *vls, const union value *value) { const struct val_lab *label = val_labs_lookup (vls, value); - return label ? atom_to_string (label->label) : NULL; + return label ? label->label : NULL; } /* Searches VLS for a value label for VALUE. If successful, @@ -270,89 +259,3 @@ val_labs_sorted (const struct val_labs *vls) else return NULL; } - -/* Atoms: reference-counted constant strings. */ - -/* An atom. */ -struct atom - { - struct hmap_node node; /* Hash map node. */ - char *string; /* String value. */ - unsigned ref_count; /* Number of references. */ - }; - -/* Hash table of atoms. */ -static struct hmap atoms = HMAP_INITIALIZER (atoms); - -static void free_atom (struct atom *atom); -static void free_all_atoms (void); - -/* Creates and returns an atom for STRING. */ -static struct atom * -atom_create (const char *string) -{ - static bool initialized; - struct atom *atom; - size_t hash; - - assert (string != NULL); - - if (!initialized) - { - initialized = true; - atexit (free_all_atoms); - } - - hash = hash_string (string, 0); - HMAP_FOR_EACH_WITH_HASH (atom, struct atom, node, hash, &atoms) - if (!strcmp (atom->string, string)) - { - atom->ref_count++; - return atom; - } - - atom = xmalloc (sizeof *atom); - atom->string = xstrdup (string); - atom->ref_count = 1; - hmap_insert (&atoms, &atom->node, hash); - return atom; -} - -/* Destroys ATOM. */ -static void -atom_destroy (struct atom *atom) -{ - if (atom != NULL) - { - assert (atom->ref_count > 0); - atom->ref_count--; - if (atom->ref_count == 0) - { - hmap_delete (&atoms, &atom->node); - free_atom (atom); - } - } -} - -/* Returns the string associated with ATOM. */ -static const char * -atom_to_string (const struct atom *atom) -{ - return atom->string; -} - -static void -free_atom (struct atom *atom) -{ - free (atom->string); - free (atom); -} - -static void -free_all_atoms (void) -{ - struct atom *atom, *next; - - HMAP_FOR_EACH_SAFE (atom, next, struct atom, node, &atoms) - free_atom (atom); -} diff --git a/src/data/value-labels.h b/src/data/value-labels.h index 460ab84c..d6f65d43 100644 --- a/src/data/value-labels.h +++ b/src/data/value-labels.h @@ -38,7 +38,7 @@ struct val_lab { struct hmap_node node; /* Node in hash map. */ union value value; /* The value being labeled. */ - struct atom *label; /* A ref-counted string. */ + const char *label; /* An interned string. */ }; /* Returns the value in VL. The caller must not modify or free @@ -52,7 +52,13 @@ static inline const union value *val_lab_get_value (const struct val_lab *vl) return &vl->value; } -const char *val_lab_get_label (const struct val_lab *); +/* Returns the label in VL. The caller must not modify or free the returned + value. */ +static inline const char * +val_lab_get_label (const struct val_lab *vl) +{ + return vl->label; +} /* A set of value labels. */ struct val_labs diff --git a/src/libpspp/automake.mk b/src/libpspp/automake.mk index 90602b08..59ac613d 100644 --- a/src/libpspp/automake.mk +++ b/src/libpspp/automake.mk @@ -40,6 +40,8 @@ src_libpspp_libpspp_la_SOURCES = \ src/libpspp/i18n.h \ src/libpspp/integer-format.c \ src/libpspp/integer-format.h \ + src/libpspp/intern.c \ + src/libpspp/intern.h \ src/libpspp/legacy-encoding.c \ src/libpspp/legacy-encoding.h \ src/libpspp/ll.c \ diff --git a/src/libpspp/intern.c b/src/libpspp/intern.c new file mode 100644 index 00000000..0bb9277d --- /dev/null +++ b/src/libpspp/intern.c @@ -0,0 +1,121 @@ +/* PSPP - a program for statistical analysis. + Copyright (C) 2010 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . */ + +#include + +#include "libpspp/intern.h" + +#include +#include + +#include "libpspp/assertion.h" +#include "libpspp/hash-functions.h" +#include "libpspp/hmap.h" + +#include "gl/xalloc.h" + +/* A single interned string. */ +struct interned_string + { + struct hmap_node node; /* Node in hash table. */ + size_t ref_cnt; /* Reference count. */ + char string[1]; /* Null-terminated string. */ + }; + +/* All interned strings. */ +static struct hmap interns = HMAP_INITIALIZER (interns); + +/* Searches the table of interned string for */ +static struct interned_string * +intern_lookup__ (const char *s, size_t length, unsigned int hash) +{ + struct interned_string *is; + + HMAP_FOR_EACH_WITH_HASH (is, struct interned_string, node, hash, &interns) + if (!memcmp (s, is->string, length + 1)) + return is; + + return NULL; +} + +/* Returns an interned version of string S. Pass the returned string to + intern_unref() to release it. */ +const char * +intern_new (const char *s) +{ + size_t length = strlen (s); + unsigned int hash = hash_bytes (s, length, 0); + struct interned_string *is; + + is = intern_lookup__ (s, length, hash); + if (is != NULL) + is->ref_cnt++; + else + { + is = xmalloc (length + sizeof *is); + hmap_insert (&interns, &is->node, hash); + is->ref_cnt = 1; + memcpy (is->string, s, length + 1); + } + return is->string; +} + +static struct interned_string * +interned_string_from_string (const char *s) +{ + const size_t ofs = offsetof (struct interned_string, string); + struct interned_string *is = (struct interned_string *) (s - ofs); + assert (is->ref_cnt > 0); + return is; +} + +/* Increases the reference count on S, which must be an interned string + returned by intern_new(). */ +const char * +intern_ref (const char *s) +{ + struct interned_string *is = interned_string_from_string (s); + is->ref_cnt++; + return s; +} + +/* Decreases the reference count on S, which must be an interned string + returned by intern_new(). If the reference count reaches 0, frees the + interned string. */ +void +intern_unref (const char *s) +{ + struct interned_string *is = interned_string_from_string (s); + if (--is->ref_cnt == 0) + { + hmap_delete (&interns, &is->node); + free (is); + } +} + +/* Given null-terminated string S, returns true if S is an interned string + returned by intern_string_new(), false otherwise. + + This is appropriate for use in debug assertions, e.g.: + assert (is_interned_string (s)); +*/ +bool +is_interned_string (const char *s) +{ + size_t length = strlen (s); + unsigned int hash = hash_bytes (s, length, 0); + return intern_lookup__ (s, length, hash) != NULL; +} diff --git a/src/libpspp/intern.h b/src/libpspp/intern.h new file mode 100644 index 00000000..147a69a8 --- /dev/null +++ b/src/libpspp/intern.h @@ -0,0 +1,41 @@ +/* PSPP - a program for statistical analysis. + Copyright (C) 2010 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . */ + +#ifndef LIBPSPP_INTERN_H +#define LIBPSPP_INTERN_H 1 + +/* Interned strings. + + An "interned" string is stored in a global hash table. Only one copy of any + given string is kept in the hash table, which reduces memory usage in cases + where there might otherwise be many duplicates of a given string. + + Interned strings can be compared for equality by comparing pointers, which + can also be a significant advantage in some cases. + + Interned strings are immutable. + + See http://en.wikipedia.org/wiki/String_interning for more information. */ + +#include + +const char *intern_new (const char *); +const char *intern_ref (const char *); +void intern_unref (const char *); + +bool is_interned_string (const char *); + +#endif /* libpspp/intern.h */ -- 2.30.2