From a124375065d0768546f6e7670d9c6d6a0b2b5379 Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Sat, 19 Feb 2011 12:55:54 -0800 Subject: [PATCH] data-out: Optimize and fix some bad assumptions. Until now, data_out_pool() and its wrapper function data_out() have always done at least two memory allocations: one to fill in the initial version of the result and another to recode it to UTF-8. However, recoding to UTF-8 is usually unnecessary, because most output formats always produce output in UTF-8 anyway. Only binary formats and the string A format ever produce data in other encodings, so this commit drops recoding entirely except for those cases. Binary formats are a particularly special case: usually it doesn't make any sense to use these formats for text output, but this commit does its best to translate the binary output bytes into valid UTF-8, at least up to the first null byte. This commit also finishes fixing up display widths. The closely related data_out_legacy() function, which only has one user in three also needed some work. It was badly named, so I renamed it to data_out_recode(). It made the bad assumption that the data passed in was encoded in ASCII (written C_ENCODING). It also made the bad assumption that the number of bytes output would be exactly the format's width. This rewrite fixes these problems. --- src/data/data-out.c | 125 ++++++++++++++++++++++++----------- src/data/data-out.h | 6 +- src/language/data-io/print.c | 6 +- 3 files changed, 93 insertions(+), 44 deletions(-) diff --git a/src/data/data-out.c b/src/data/data-out.c index 1ed83bcb..96bf58bd 100644 --- a/src/data/data-out.c +++ b/src/data/data-out.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -31,6 +32,7 @@ #include #include +#include #include #include #include @@ -92,57 +94,103 @@ static data_out_converter_func *const converters[FMT_NUMBER_OF_FORMATS] = #include "format.def" }; -/* Similar to data_out. Additionally recodes the output from - native form into the given legacy character ENCODING. - OUTPUT must be provided by the caller and must be at least - FORMAT->w long. No null terminator is appended to OUTPUT. -*/ +/* Converts the INPUT value, encoded in INPUT_ENCODING, according to format + specification FORMAT, appending the output to OUTPUT in OUTPUT_ENCODING. + However, binary formats (FMT_P, FMT_PK, FMT_IB, FMT_PIB, FMT_RB) yield the + binary results, which may not be properly encoded for OUTPUT_ENCODING. + + VALUE must be the correct width for FORMAT, that is, its width must be + fmt_var_width(FORMAT). + + INPUT_ENCODING can normally be obtained by calling dict_get_encoding() on + the dictionary with which INPUT is associated. ENCODING is only important + when FORMAT's type is FMT_A. */ void -data_out_legacy (const union value *input, const char *encoding, - const struct fmt_spec *format, char *output) +data_out_recode (const union value *input, const char *input_encoding, + const struct fmt_spec *format, + struct string *output, const char *output_encoding) { assert (fmt_check_output (format)); + if (format->type == FMT_A) + { + char *in = CHAR_CAST (char *, value_str (input, format->w)); + char *out = recode_string (output_encoding, input_encoding, + in, format->w); + ds_put_cstr (output, out); + free (out); + } + else if (fmt_get_category (format->type) == FMT_CAT_BINARY) + converters[format->type] (input, format, + ds_put_uninit (output, format->w)); + else + { + char *utf8_encoded = data_out (input, input_encoding, format); + char *output_encoded = recode_string (output_encoding, UTF8, + utf8_encoded, -1); + ds_put_cstr (output, output_encoded); + free (output_encoded); + free (utf8_encoded); + } +} - converters[format->type] (input, format, output); - if (0 != strcmp (encoding, C_ENCODING) - && fmt_get_category (format->type) != FMT_CAT_BINARY) +static char * +binary_to_utf8 (const char *in, struct pool *pool) +{ + uint8_t *out = pool_alloc_unaligned (pool, strlen (in) * 2 + 1); + uint8_t *p = out; + + while (*in != '\0') { - char *s = recode_string (encoding, C_ENCODING, output, format->w ); - memcpy (output, s, format->w); - free (s); + uint8_t byte = *in++; + int mblen = u8_uctomb (p, byte, 2); + assert (mblen > 0); + p += mblen; } + *p = '\0'; + + return CHAR_CAST (char *, out); } -/* Converts the INPUT value into a UTF8 encoded string, according - to format specification FORMAT. +/* Converts the INPUT value into a UTF-8 encoded string, according to format + specification FORMAT. - VALUE must be the correct width for FORMAT, that is, its - width must be fmt_var_width(FORMAT). + VALUE must be the correct width for FORMAT, that is, its width must be + fmt_var_width(FORMAT). - ENCODING must be the encoding of INPUT. Normally this can - be obtained by calling dict_get_encoding on the dictionary - with which INPUT is associated. + ENCODING must be the encoding of INPUT. Normally this can be obtained by + calling dict_get_encoding() on the dictionary with which INPUT is + associated. ENCODING is only important when FORMAT's type is FMT_A. - The return value is dynamically allocated, and must be freed - by the caller. If POOL is non-null, then the return value is - allocated on that pool. -*/ + The return value is dynamically allocated, and must be freed by the caller. + If POOL is non-null, then the return value is allocated on that pool. */ char * data_out_pool (const union value *input, const char *encoding, const struct fmt_spec *format, struct pool *pool) { - const struct fmt_number_style *style = settings_get_style (format->type); - char *output; - char *t ; assert (fmt_check_output (format)); + if (format->type == FMT_A) + { + char *in = CHAR_CAST (char *, value_str (input, format->w)); + return recode_string_pool (UTF8, encoding, in, format->w, pool); + } + else if (fmt_get_category (format->type) == FMT_CAT_BINARY) + { + char tmp[16]; - output = xmalloc (format->w + style->extra_bytes + 1); - - converters[format->type] (input, format, output); + assert (format->w + 1 <= sizeof tmp); + converters[format->type] (input, format, tmp); + return binary_to_utf8 (tmp, pool); + } + else + { + const struct fmt_number_style *style = settings_get_style (format->type); + size_t size = format->w + style->extra_bytes + 1; + char *output; - t = recode_string_pool (UTF8, encoding, output, format->w, pool); - free (output); - return t; + output = pool_alloc_unaligned (pool, size); + converters[format->type] (input, format, output); + return output; + } } char * @@ -529,11 +577,10 @@ output_MONTH (const union value *input, const struct fmt_spec *format, /* Outputs A format. */ static void -output_A (const union value *input, const struct fmt_spec *format, - char *output) +output_A (const union value *input UNUSED, + const struct fmt_spec *format UNUSED, char *output UNUSED) { - memcpy (output, value_str (input, format->w), format->w); - output[format->w] = '\0'; + NOT_REACHED (); } /* Outputs AHEX format. */ @@ -682,7 +729,7 @@ output_scientific (double number, const struct fmt_spec *format, int width; int fraction_width; bool add_affixes; - char buf[64], *p; + char *p; /* Allocate minimum required space. */ width = 6 + style->neg_suffix.width; @@ -706,7 +753,7 @@ output_scientific (double number, const struct fmt_spec *format, width += fraction_width; /* Format (except suffix). */ - p = buf; + p = output; if (width < format->w) p = mempset (p, ' ', format->w - width); if (number < 0) diff --git a/src/data/data-out.h b/src/data/data-out.h index 39217739..00affb0a 100644 --- a/src/data/data-out.h +++ b/src/data/data-out.h @@ -22,13 +22,15 @@ #include struct fmt_spec; +struct string; union value; char * data_out (const union value *, const char *encoding, const struct fmt_spec *); char * data_out_pool (const union value *, const char *encoding, const struct fmt_spec *, struct pool *pool); -void data_out_legacy (const union value *input, const char *encoding, - const struct fmt_spec *format, char *output); +void data_out_recode (const union value *input, const char *input_encoding, + const struct fmt_spec *, + struct string *output, const char *output_encoding); #endif /* data-out.h */ diff --git a/src/language/data-io/print.c b/src/language/data-io/print.c index a07ca2d8..62798272 100644 --- a/src/language/data-io/print.c +++ b/src/language/data-io/print.c @@ -468,11 +468,11 @@ print_trns_proc (void *trns_, struct ccase **c, casenumber case_num UNUSED) if (spec->type == PRT_VAR) { const union value *input = case_data (*c, spec->var); - char *output = ds_put_uninit (&trns->line, spec->format.w); if (!spec->sysmis_as_spaces || input->f != SYSMIS) - data_out_legacy (input, trns->encoding, &spec->format, output); + data_out_recode (input, var_get_encoding (spec->var), + &spec->format, &trns->line, trns->encoding); else - memset (output, encoded_space, spec->format.w); + ds_put_byte_multiple (&trns->line, encoded_space, spec->format.w); if (spec->add_space) ds_put_byte (&trns->line, encoded_space); } -- 2.30.2