From 61cb03a73ff9f5d38e9728d4bf5a449212d3acdc Mon Sep 17 00:00:00 2001
From: Ben Pfaff
Date: Sun, 5 Dec 2021 20:02:37 -0800
Subject: [PATCH] lexer: Factor out functions for counting columns.

These will have additional upcoming users.
---
 src/language/lexer/lexer.c | 34 +++----------------------
 src/libpspp/i18n.c         | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++
 src/libpspp/i18n.h         |  3 +++
 src/libpspp/str.c          | 20 +++++++++++++++++
 src/libpspp/str.h          |  2 ++
 5 files changed, 86 insertions(+), 31 deletions(-)

diff --git a/src/language/lexer/lexer.c b/src/language/lexer/lexer.c
index 27d5dedb54..bad24d3f9b 100644
--- a/src/language/lexer/lexer.c
+++ b/src/language/lexer/lexer.c
@@ -28,7 +28,6 @@
 #include
 #include
 #include
-#include
 
 #include "language/command.h"
 #include "language/lexer/macro.h"
@@ -1219,39 +1218,12 @@ lex_token_get_last_line_number (const struct lex_source *src,
     }
 }
 
-static int
-count_columns (const char *s_, size_t length)
-{
-  const uint8_t *s = CHAR_CAST (const uint8_t *, s_);
-  int columns;
-  size_t ofs;
-  int mblen;
-
-  columns = 0;
-  for (ofs = 0; ofs < length; ofs += mblen)
-    {
-      ucs4_t uc;
-
-      mblen = u8_mbtouc (&uc, s + ofs, length - ofs);
-      if (uc != '\t')
-        {
-          int width = uc_width (uc, "UTF-8");
-          if (width > 0)
-            columns += width;
-        }
-      else
-        columns = ROUND_UP (columns + 1, 8);
-    }
-
-  return columns + 1;
-}
-
 static int
 lex_token_get_first_column (const struct lex_source *src,
                             const struct lex_token *token)
 {
-  return count_columns (&src->buffer[token->line_pos - src->tail],
-                        token->token_pos - token->line_pos);
+  return utf8_count_columns (&src->buffer[token->line_pos - src->tail],
+                             token->token_pos - token->line_pos) + 1;
 }
 
 static int
@@ -1265,7 +1237,7 @@ lex_token_get_last_column (const struct lex_source *src,
   newline = memrchr (start, '\n', end - start);
   if (newline != NULL)
     start = newline + 1;
-  return count_columns (start, end - start);
+  return utf8_count_columns (start, end - start) + 1;
 }
 
 static struct msg_location
diff --git a/src/libpspp/i18n.c b/src/libpspp/i18n.c
index 69162f14f0..4e04d32c25 100644
--- a/src/libpspp/i18n.c
+++ b/src/libpspp/i18n.c
@@ -29,11 +29,13 @@
 #include
 #include
 #include
+#include
 
 #include "libpspp/assertion.h"
 #include "libpspp/compiler.h"
 #include "libpspp/hmapx.h"
 #include "libpspp/hash-functions.h"
+#include "libpspp/misc.h"
 #include "libpspp/pool.h"
 #include "libpspp/str.h"
 #include "libpspp/version.h"
@@ -501,6 +503,62 @@ utf8_encoding_concat_len (const char *head, const char *tail,
   return prefix_len + tail_len;
 }
 
+/* Returns the number of display columns that would be occupied by the LENGTH
+   bytes of UTF-8 starting at S.
+
+   A tab moves the count to ROUND_UP (columns + 1, 8); any other character adds
+   its uc_width(), unless that is nonpositive, in which case it adds nothing. */
+size_t
+utf8_count_columns (const char *s_, size_t length)
+{
+  const uint8_t *s = CHAR_CAST (const uint8_t *, s_);
+
+  size_t columns = 0;
+  for (size_t ofs = 0; ofs < length; )
+    {
+      ucs4_t uc;
+      ofs += u8_mbtouc (&uc, s + ofs, length - ofs);
+      if (uc != '\t')
+        {
+          int width = uc_width (uc, "UTF-8");
+          if (width > 0)
+            columns += width;
+        }
+      else
+        columns = ROUND_UP (columns + 1, 8);
+    }
+  return columns;
+}
+
+/* Returns the byte offset in LENGTH-byte UTF-8 string S that is N_COLUMNS
+   display columns into the string.
+
+   Columns are counted the same way as in utf8_count_columns().  The scan stops
+   at the end of S or just after the first character that brings the count to
+   N_COLUMNS or more, whichever comes first. */
+size_t
+utf8_columns_to_bytes (const char *s_, size_t length, size_t n_columns)
+{
+  const uint8_t *s = CHAR_CAST (const uint8_t *, s_);
+
+  size_t columns = 0;
+  size_t ofs;
+  for (ofs = 0; ofs < length && columns < n_columns; )
+    {
+      ucs4_t uc;
+      ofs += u8_mbtouc (&uc, s + ofs, length - ofs);
+      if (uc != '\t')
+        {
+          int width = uc_width (uc, "UTF-8");
+          if (width > 0)
+            columns += width;
+        }
+      else
+        columns = ROUND_UP (columns + 1, 8);
+    }
+  return ofs;
+}
+
 /* Returns an allocated, null-terminated string, owned by the caller,
    containing as many characters[*] from the beginning of S that would fit
    within MAX_LEN bytes if the returned string were to be re-encoded in
diff --git a/src/libpspp/i18n.h b/src/libpspp/i18n.h
index 232c5dc166..3ae1e9e0b1 100644
--- a/src/libpspp/i18n.h
+++ b/src/libpspp/i18n.h
@@ -58,6 +58,9 @@ char *utf8_encoding_concat (const char *head, const char *tail,
 size_t utf8_encoding_concat_len (const char *head, const char *tail,
                                  const char *encoding, size_t max_len);
 
+size_t utf8_count_columns (const char *, size_t);
+size_t utf8_columns_to_bytes (const char *, size_t, size_t n_columns);
+
 char *utf8_to_filename (const char *filename);
 char *filename_to_utf8 (const char *filename);
 
diff --git a/src/libpspp/str.c b/src/libpspp/str.c
index 86e7fd9199..b40cb4b4ab 100644
--- a/src/libpspp/str.c
+++ b/src/libpspp/str.c
@@ -26,6 +26,7 @@
 #include
 
 #include "libpspp/cast.h"
+#include "libpspp/i18n.h"
 #include "libpspp/message.h"
 #include "libpspp/pool.h"
 
@@ -928,6 +929,25 @@ ss_at_mblen (struct substring s, size_t ofs)
   else
     return 0;
 }
+
+/* Returns the number of display columns occupied by the UTF-8 text in S, as
+   computed by utf8_count_columns(). */
+size_t
+ss_utf8_count_columns (struct substring s)
+{
+  return utf8_count_columns (s.string, s.length);
+}
+
+/* Returns a substring of S starting at 0-based display column START and
+   running for N display columns.  The N columns are counted from the start
+   of the result, by a second call to utf8_columns_to_bytes(). */
+struct substring
+ss_utf8_columns (struct substring s, size_t start, size_t n)
+{
+  ss_advance (&s, utf8_columns_to_bytes (s.string, s.length, start));
+  s.length = utf8_columns_to_bytes (s.string, s.length, n);
+  return s;
+}
 
 /* Initializes ST as an empty string. */
 void
diff --git a/src/libpspp/str.h b/src/libpspp/str.h
index 8cde577914..aaf83d71a4 100644
--- a/src/libpspp/str.h
+++ b/src/libpspp/str.h
@@ -150,6 +150,8 @@ int ss_first_mblen (struct substring);
 ucs4_t ss_get_mb (struct substring *);
 ucs4_t ss_at_mb (struct substring, size_t ofs);
 int ss_at_mblen (struct substring, size_t ofs);
+size_t ss_utf8_count_columns (struct substring);
+struct substring ss_utf8_columns (struct substring, size_t start, size_t n);
 
 /* Variable length strings. */
 
-- 
2.30.2