/* PSPP - a program for statistical analysis.
- Copyright (C) 2006, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016 Free Software Foundation, Inc.
+ Copyright (C) 2006, 2009, 2010, 2011, 2012, 2013, 2014, 2015,
+ 2016, 2021 Free Software Foundation, Inc.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
#include <string.h>
#include <unicase.h>
#include <unigbrk.h>
+#include <uniwidth.h>
#include "libpspp/assertion.h"
#include "libpspp/compiler.h"
#include "libpspp/hmapx.h"
#include "libpspp/hash-functions.h"
+#include "libpspp/misc.h"
#include "libpspp/pool.h"
#include "libpspp/str.h"
#include "libpspp/version.h"
#include "gl/c-ctype.h"
#include "gl/c-strcase.h"
#include "gl/localcharset.h"
+#include <gl/localename.h>
#include "gl/minmax.h"
#include "gl/xalloc.h"
#include "gl/relocatable.h"
/* A wrapper around iconv_open */
static struct converter *
-create_iconv (const char* tocode, const char* fromcode)
+create_iconv (const char* tocode, const char* fromcode, bool warn)
{
size_t hash;
struct hmapx_node *node;
as the converters have not yet been set up */
if (error && strcmp (tocode, fromcode))
{
- fprintf (stderr,
- "Warning: "
- "cannot create a converter for `%s' to `%s': %s\n",
- fromcode, tocode, strerror (error));
+ if (warn)
+ fprintf (stderr,
+ "Warning: "
+ "cannot create a converter for `%s' to `%s': %s\n",
+ fromcode, tocode, strerror (error));
free (converter->tocode);
free (converter->fromcode);
iconv_t bconv = iconv_open (tocode, "ASCII");
if (bconv != (iconv_t) -1)
{
- ICONV_CONST char *nullstr = strdup ("");
- ICONV_CONST char *outbuf = strdup ("XXXXXXXX");
- ICONV_CONST char *snullstr = nullstr;
- ICONV_CONST char *soutbuf = outbuf;
-
- size_t inbytes = 1;
- const size_t bytes = 8;
- size_t outbytes = bytes;
- if (-1 != iconv (bconv, &nullstr, &inbytes, &outbuf, &outbytes))
- converter->null_char_width = bytes - outbytes;
- free (snullstr);
- free (soutbuf);
+ ICONV_CONST char inbuf[1] = "";
+ ICONV_CONST char *inptr = inbuf;
+ size_t inbytes = sizeof inbuf;
+
+ char outbuf[8];
+ char *outptr = outbuf;
+ size_t outbytes = sizeof outbuf;
+ if (-1 != iconv (bconv, &inptr, &inbytes, &outptr, &outbytes))
+ converter->null_char_width = outptr - outbuf;
iconv_close (bconv);
}
{
struct substring out;
- if ( text == NULL )
+ if (text == NULL)
return NULL;
- if ( length == -1 )
+ if (length == -1)
length = strlen (text);
out = recode_substring_pool (to, from, ss_buffer (text, length), pool);
return prefix_len + tail_len;
}
+/* Returns the number of display columns that would be occupied by the LENGTH
+   bytes of UTF-8 starting at S.
+
+   A tab moves the column count to ROUND_UP (columns + 1, 8), that is, to the
+   next tab stop with stops every 8 columns.  Any other character adds its
+   uc_width(); characters whose reported width is zero or negative add
+   nothing. */
+size_t
+utf8_count_columns (const char *s_, size_t length)
+{
+  const uint8_t *s = CHAR_CAST (const uint8_t *, s_);
+
+  size_t columns = 0;
+  /* OFS has the same type as LENGTH so that the comparison and the
+     subtraction below never mix signed and unsigned operands, and so that
+     the offset cannot overflow on buffers larger than INT_MAX bytes. */
+  for (size_t ofs = 0; ofs < length; )
+    {
+      ucs4_t uc;
+      ofs += u8_mbtouc (&uc, s + ofs, length - ofs);
+      if (uc != '\t')
+        {
+          int width = uc_width (uc, "UTF-8");
+          if (width > 0)
+            columns += width;
+        }
+      else
+        columns = ROUND_UP (columns + 1, 8);
+    }
+  return columns;
+}
+
+/* Returns the byte offset in LENGTH-byte UTF-8 string S that is N_COLUMNS
+   display columns into the string.
+
+   Columns are counted as in utf8_count_columns(): a tab advances to
+   ROUND_UP (columns + 1, 8) and any other character adds its uc_width() when
+   that is positive.  Characters are always consumed whole, so if a wide
+   character or a tab carries the count past N_COLUMNS, the returned offset is
+   just past that character.  If S is narrower than N_COLUMNS, scanning stops
+   when the bytes of S are exhausted. */
+size_t
+utf8_columns_to_bytes (const char *s_, size_t length, size_t n_columns)
+{
+  const uint8_t *s = CHAR_CAST (const uint8_t *, s_);
+
+  size_t columns = 0;
+  /* OFS is a size_t because it is compared with LENGTH, subtracted from it,
+     and finally returned as a size_t. */
+  size_t ofs = 0;
+  while (ofs < length && columns < n_columns)
+    {
+      ucs4_t uc;
+      ofs += u8_mbtouc (&uc, s + ofs, length - ofs);
+      if (uc != '\t')
+        {
+          int width = uc_width (uc, "UTF-8");
+          if (width > 0)
+            columns += width;
+        }
+      else
+        columns = ROUND_UP (columns + 1, 8);
+    }
+  return ofs;
+}
+
/* Returns an allocated, null-terminated string, owned by the caller,
containing as many characters[*] from the beginning of S that would fit
within MAX_LEN bytes if the returned string were to be re-encoded in
if (from == NULL)
from = default_encoding;
- conv = create_iconv (to, from);
+ conv = create_iconv (to, from, true);
- if ( NULL == conv )
+ if (NULL == conv)
{
if (fallbackchar)
{
i18n_init (void)
{
setlocale (LC_ALL, "");
- bindtextdomain (PACKAGE, relocate(locale_dir));
+ char *allocated;
+ bindtextdomain (PACKAGE, relocate2 (locale_dir, &allocated));
+ free (allocated);
textdomain (PACKAGE);
assert (default_encoding == NULL);
default_encoding = xstrdup (enc);
}
+/* Return the ISO two letter code for the current LC_MESSAGES
+   locale category, as a string obtained from xstrdup() that the caller
+   should free, or NULL if that locale is "C".
+
+   The language is whatever precedes the first '_' (territory), '.'
+   (codeset) or '@' (modifier) in the locale name, so that names without a
+   territory, such as "de.UTF-8" or "sr@latin", are reduced to the bare
+   language as well. */
+char *
+get_language (void)
+{
+  const char *localename = gl_locale_name (LC_MESSAGES, "LC_MESSAGES");
+  if (0 == strcmp (localename, "C"))
+    return NULL;
+  char *ln = xstrdup (localename);
+  /* strcspn() yields the length of LN when no separator is present, in which
+     case this just rewrites the existing terminator. */
+  ln[strcspn (ln, "_.@")] = '\0';
+  return ln;
+}
+
/* Attempts to set the encoding from a locale name
returns true if successful.
loc_encoding = xstrdup (locale_charset ());
- if ( 0 == strcmp (loc_encoding, c_encoding))
+ if (0 == strcmp (loc_encoding, c_encoding))
{
ok = false;
}
{
iconv_t conv = iconv_open (UTF8, enc);
- if ( conv == (iconv_t) -1)
+ if (conv == (iconv_t) -1)
return false;
iconv_close (conv);
unsigned int
utf8_hash_case_string (const char *s, unsigned int basis)
{
- return utf8_hash_case_bytes (s, strlen (s), basis);
+ return utf8_hash_case_substring (ss_cstr (s), basis);
+}
+
+/* Returns a hash value for UTF-8 string S, with lowercase and uppercase
+ letters treated as equal, starting from BASIS.  Only the S.length bytes
+ at S.string are hashed; the work is done by utf8_hash_case_bytes(). */
+unsigned int
+utf8_hash_case_substring (struct substring s, unsigned int basis)
+{
+ return utf8_hash_case_bytes (s.string, s.length, basis);
}
/* Compares UTF-8 strings A and B case-insensitively.
int
utf8_strcasecmp (const char *a, const char *b)
{
- return utf8_strncasecmp (a, strlen (a), b, strlen (b));
+ return utf8_sscasecmp (ss_cstr (a), ss_cstr (b));
+}
+
+/* Compares UTF-8 substrings A and B case-insensitively by passing each one's
+   bytes and length to utf8_strncasecmp(), whose result is returned
+   unchanged. */
+int
+utf8_sscasecmp (struct substring a, struct substring b)
+{
+ return utf8_strncasecmp (a.string, a.length, b.string, b.length);
}
/* Compares UTF-8 strings A (with length AN) and B (with length BN)
{
return utf8_casemap (s, u8_tolower);
}
+
+/* Returns the result of utf8_casemap() applied to S with u8_totitle as the
+   case-mapping function, i.e. S converted to title case.
+   NOTE(review): presumably the result is newly allocated and owned by the
+   caller, as the char * return type suggests -- confirm against
+   utf8_casemap(). */
+char *
+utf8_to_title (const char *s)
+{
+ return utf8_casemap (s, u8_totitle);
+}
\f
bool
get_encoding_info (struct encoding_info *e, const char *name)
bool
is_encoding_supported (const char *encoding)
{
- return (create_iconv ("UTF-8", encoding)
- && create_iconv (encoding, "UTF-8"));
+ /* Both conversion directions must be available.  WARN is false so that
+    merely probing an unsupported encoding does not make create_iconv()
+    print its "cannot create a converter" warning on stderr. */
+ return (create_iconv ("UTF-8", encoding, false)
+ && create_iconv (encoding, "UTF-8", false));
}
/* Returns true if E is the name of a UTF-8 encoding.