/* PSPP - a program for statistical analysis.
- Copyright (C) 1997-9, 2000, 2006, 2009, 2010 Free Software Foundation, Inc.
+ Copyright (C) 1997-9, 2000, 2006, 2009, 2010, 2011 Free Software Foundation, Inc.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
#include <errno.h>
#include <stdint.h>
#include <stdlib.h>
+#include <unistr.h>
#include "libpspp/cast.h"
#include "libpspp/message.h"
new->length = cnt;
}
+/* Resizes the memory block that holds SS's string to SIZE bytes by
+   way of xrealloc().  Only SS's string pointer is updated; its
+   length member is not touched. */
+void
+ss_realloc (struct substring *ss, size_t size)
+{
+  void *resized = xrealloc (ss->string, size);
+
+  ss->string = resized;
+}
+
/* Makes a pool_alloc_unaligned()'d copy of the contents of OLD
in POOL, and stores it in NEW. */
void
/* If the last byte in SS is C, removes it and returns true.
Otherwise, returns false without changing the string. */
bool
-ss_chomp (struct substring *ss, char c)
+ss_chomp_byte (struct substring *ss, char c)
{
if (ss_last (*ss) == c)
{
s[ss.length] = '\0';
return s;
}
+/* UTF-8. */
+
+/* Decodes the UTF-8 sequence that begins S and returns its code point.
+   Yields a Unicode code point between 0 and 0x10ffff, or UINT32_MAX
+   when S is empty. */
+ucs4_t
+ss_first_mb (struct substring s)
+{
+  /* The first character is the one found at byte offset 0. */
+  ucs4_t first = ss_at_mb (s, 0);
+
+  return first;
+}
+
+/* Reports how many bytes the UTF-8 character that begins S occupies.
+
+   Yields 0 when S is empty; otherwise a count from 1 through 4. */
+int
+ss_first_mblen (struct substring s)
+{
+  /* The leading character is the one found at byte offset 0. */
+  int n_bytes = ss_at_mblen (s, 0);
+
+  return n_bytes;
+}
+
+/* Consumes the UTF-8 character at the front of S: S's string pointer
+   moves forward and its length shrinks by the number of bytes that
+   u8_mbtouc() reports for that character.  Returns the code point that
+   was consumed (0 to 0x10ffff), or UINT32_MAX, leaving S as it was,
+   when S is empty on entry. */
+ucs4_t
+ss_get_mb (struct substring *s)
+{
+  /* Start from the "empty" sentinel; u8_mbtouc() overwrites it whenever
+     there is at least one byte to decode. */
+  ucs4_t code_point = UINT32_MAX;
+
+  if (s->length > 0)
+    {
+      int n_bytes = u8_mbtouc (&code_point,
+                               CHAR_CAST (const uint8_t *, s->string),
+                               s->length);
+
+      s->string += n_bytes;
+      s->length -= n_bytes;
+    }
+
+  return code_point;
+}
+
+/* Decodes the UTF-8 sequence located OFS bytes from the start of S and
+   returns its code point: a Unicode code point between 0 and 0x10ffff,
+   or UINT32_MAX when OFS lies beyond the last byte of S.
+
+   (When OFS lands inside a UTF-8 sequence instead of at its first
+   byte, the result is 0xfffd.) */
+ucs4_t
+ss_at_mb (struct substring s, size_t ofs)
+{
+  /* Start from the "out of range" sentinel; u8_mbtouc() overwrites it
+     whenever OFS addresses a byte inside S. */
+  ucs4_t code_point = UINT32_MAX;
+
+  if (ofs < s.length)
+    u8_mbtouc (&code_point, CHAR_CAST (const uint8_t *, s.string + ofs),
+               s.length - ofs);
+
+  return code_point;
+}
+
+/* Reports the byte length of the UTF-8 sequence located OFS bytes from
+   the start of S.  Yields 0 when OFS lies beyond the last byte of S;
+   otherwise a count from 1 through 4. */
+int
+ss_at_mblen (struct substring s, size_t ofs)
+{
+  /* u8_mbtouc() requires somewhere to store the decoded code point,
+     but only its byte count matters here. */
+  ucs4_t discarded;
+
+  if (ofs >= s.length)
+    return 0;
+
+  return u8_mbtouc (&discarded, CHAR_CAST (const uint8_t *, s.string + ofs),
+                    s.length - ofs);
+}
\f
/* Initializes ST as an empty string. */
void
/* If the last byte in ST is C, removes it and returns true.
Otherwise, returns false without modifying ST. */
bool
-ds_chomp (struct string *st, char c)
+ds_chomp_byte (struct string *st, char c)
{
- return ss_chomp (&st->ss, c);
+ /* Thin wrapper: forwards to ss_chomp_byte() on ST's `ss' member. */
+ return ss_chomp_byte (&st->ss, c);
}
/* Divides ST into tokens separated by any of the DELIMITERS.
(*line_number)++;
ds_rtrim (st, ss_cstr (CC_SPACES));
}
- while (ds_chomp (st, '\\'));
+ while (ds_chomp_byte (st, '\\'));
remove_comment (st);
return true;