// Determine a canonical name for the current locale's character encoding.
//
// Copyright (C) 2000-2006, 2008-2023 Free Software Foundation, Inc.
//
// This file is free software: you can redistribute it and/or modify it under
// the terms of the GNU Lesser General Public License as published by the Free
// Software Foundation; either version 2.1 of the License, or (at your option)
// any later version.
//
// This file is distributed in the hope that it will be useful, but WITHOUT ANY
// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
// A PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
// details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with this program.  If not, see <https://www.gnu.org/licenses/>.
//
// Written by Bruno Haible <bruno@clisp.org>.  Translated to Rust by Ben Pfaff
// <blp@cs.stanford.edu>.
21 use lazy_static::lazy_static;
23 fn map_aliases(s: &str) -> &'static str {
24 #[cfg(target_os = "freebsd")]
26 "ARMSCII-8" => return "ARMSCII-8",
27 "Big5" => return "BIG5",
28 "C" => return "ASCII",
29 "CP1131" => return "CP1131",
30 "CP1251" => return "CP1251",
31 "CP866" => return "CP866",
32 "GB18030" => return "GB18030",
33 "GB2312" => return "GB2312",
34 "GBK" => return "GBK",
35 "ISCII-DEV" => return "?",
36 "ISO8859-1" => return "ISO-8859-1",
37 "ISO8859-13" => return "ISO-8859-13",
38 "ISO8859-15" => return "ISO-8859-15",
39 "ISO8859-2" => return "ISO-8859-2",
40 "ISO8859-5" => return "ISO-8859-5",
41 "ISO8859-7" => return "ISO-8859-7",
42 "ISO8859-9" => return "ISO-8859-9",
43 "KOI8-R" => return "KOI8-R",
44 "KOI8-U" => return "KOI8-U",
45 "SJIS" => return "SHIFT_JIS",
46 "US-ASCII" => return "ASCII",
47 "eucCN" => return "GB2312",
48 "eucJP" => return "EUC-JP",
49 "eucKR" => return "EUC-KR",
53 #[cfg(target_os = "netbsd")]
55 "646" => return "ASCII",
56 "ARMSCII-8" => return "ARMSCII-8",
57 "BIG5" => return "BIG5",
58 "Big5-HKSCS" => return "BIG5-HKSCS",
59 "CP1251" => return "CP1251",
60 "CP866" => return "CP866",
61 "GB18030" => return "GB18030",
62 "GB2312" => return "GB2312",
63 "ISO8859-1" => return "ISO-8859-1",
64 "ISO8859-13" => return "ISO-8859-13",
65 "ISO8859-15" => return "ISO-8859-15",
66 "ISO8859-2" => return "ISO-8859-2",
67 "ISO8859-4" => return "ISO-8859-4",
68 "ISO8859-5" => return "ISO-8859-5",
69 "ISO8859-7" => return "ISO-8859-7",
70 "KOI8-R" => return "KOI8-R",
71 "KOI8-U" => return "KOI8-U",
72 "PT154" => return "PT154",
73 "SJIS" => return "SHIFT_JIS",
74 "eucCN" => return "GB2312",
75 "eucJP" => return "EUC-JP",
76 "eucKR" => return "EUC-KR",
77 "eucTW" => return "EUC-TW",
81 #[cfg(target_os = "openbsd")]
83 "646" => return "ASCII",
84 "ISO8859-1" => return "ISO-8859-1",
85 "ISO8859-13" => return "ISO-8859-13",
86 "ISO8859-15" => return "ISO-8859-15",
87 "ISO8859-2" => return "ISO-8859-2",
88 "ISO8859-4" => return "ISO-8859-4",
89 "ISO8859-5" => return "ISO-8859-5",
90 "ISO8859-7" => return "ISO-8859-7",
91 "US-ASCII" => return "ASCII",
95 /* Darwin 7.5 has nl_langinfo(CODESET), but sometimes its value is
97 - It returns the empty string when LANG is set to a locale of the
98 form ll_CC, although ll_CC/LC_CTYPE is a symlink to an UTF-8
100 - The environment variables LANG, LC_CTYPE, LC_ALL are not set by
101 the system; nl_langinfo(CODESET) returns "US-ASCII" in this case.
102 - The documentation says:
103 "... all code that calls BSD system routines should ensure
104 that the const *char parameters of these routines are in UTF-8
105 encoding. All BSD system functions expect their string
106 parameters to be in UTF-8 encoding and nothing else."
108 "An additional caveat is that string parameters for files,
109 paths, and other file-system entities must be in canonical
110 UTF-8. In a canonical UTF-8 Unicode string, all decomposable
111 characters are decomposed ..."
112 but this is not true: You can pass non-decomposed UTF-8 strings
113 to file system functions, and it is the OS which will convert
114 them to decomposed UTF-8 before accessing the file system.
115 - The Apple Terminal application displays UTF-8 by default.
116 - However, other applications are free to use different encodings:
117 - xterm uses ISO-8859-1 by default.
118 - TextEdit uses MacRoman by default.
119 We prefer UTF-8 over decomposed UTF-8-MAC because one should
120 minimize the use of decomposed Unicode. Unfortunately, through the
121 Darwin file system, decomposed UTF-8 strings are leaked into user
123 Then there are also the locales with encodings other than US-ASCII
124 and UTF-8. These locales can be occasionally useful to users (e.g.
125 when grepping through ISO-8859-1 encoded text files), when all their
126 file names are in US-ASCII.
129 #[cfg(target_os = "macos")]
131 "ARMSCII-8" => return "ARMSCII-8",
132 "Big5" => return "BIG5",
133 "Big5HKSCS" => return "BIG5-HKSCS",
134 "CP1131" => return "CP1131",
135 "CP1251" => return "CP1251",
136 "CP866" => return "CP866",
137 "CP949" => return "CP949",
138 "GB18030" => return "GB18030",
139 "GB2312" => return "GB2312",
140 "GBK" => return "GBK",
141 "ISO8859-1" => return "ISO-8859-1",
142 "ISO8859-13" => return "ISO-8859-13",
143 "ISO8859-15" => return "ISO-8859-15",
144 "ISO8859-2" => return "ISO-8859-2",
145 "ISO8859-4" => return "ISO-8859-4",
146 "ISO8859-5" => return "ISO-8859-5",
147 "ISO8859-7" => return "ISO-8859-7",
148 "ISO8859-9" => return "ISO-8859-9",
149 "KOI8-R" => return "KOI8-R",
150 "KOI8-U" => return "KOI8-U",
151 "PT154" => return "PT154",
152 "SJIS" => return "SHIFT_JIS",
153 "eucCN" => return "GB2312",
154 "eucJP" => return "EUC-JP",
155 "eucKR" => return "EUC-KR",
159 #[cfg(target_os = "aix")]
161 "GBK" => return "GBK",
162 "IBM-1046" => return "CP1046",
163 "IBM-1124" => return "CP1124",
164 "IBM-1129" => return "CP1129",
165 "IBM-1252" => return "CP1252",
166 "IBM-850" => return "CP850",
167 "IBM-856" => return "CP856",
168 "IBM-921" => return "ISO-8859-13",
169 "IBM-922" => return "CP922",
170 "IBM-932" => return "CP932",
171 "IBM-943" => return "CP943",
172 "IBM-eucCN" => return "GB2312",
173 "IBM-eucJP" => return "EUC-JP",
174 "IBM-eucKR" => return "EUC-KR",
175 "IBM-eucTW" => return "EUC-TW",
176 "ISO8859-1" => return "ISO-8859-1",
177 "ISO8859-15" => return "ISO-8859-15",
178 "ISO8859-2" => return "ISO-8859-2",
179 "ISO8859-5" => return "ISO-8859-5",
180 "ISO8859-6" => return "ISO-8859-6",
181 "ISO8859-7" => return "ISO-8859-7",
182 "ISO8859-8" => return "ISO-8859-8",
183 "ISO8859-9" => return "ISO-8859-9",
184 "TIS-620" => return "TIS-620",
185 "UTF-8" => return "UTF-8",
186 "big5" => return "BIG5",
192 "CP1361" => return "JOHAB",
193 "CP20127" => return "ASCII",
194 "CP20866" => return "KOI8-R",
195 "CP20936" => return "GB2312",
196 "CP21866" => return "KOI8-RU",
197 "CP28591" => return "ISO-8859-1",
198 "CP28592" => return "ISO-8859-2",
199 "CP28593" => return "ISO-8859-3",
200 "CP28594" => return "ISO-8859-4",
201 "CP28595" => return "ISO-8859-5",
202 "CP28596" => return "ISO-8859-6",
203 "CP28597" => return "ISO-8859-7",
204 "CP28598" => return "ISO-8859-8",
205 "CP28599" => return "ISO-8859-9",
206 "CP28605" => return "ISO-8859-15",
207 "CP38598" => return "ISO-8859-8",
208 "CP51932" => return "EUC-JP",
209 "CP51936" => return "GB2312",
210 "CP51949" => return "EUC-KR",
211 "CP51950" => return "EUC-TW",
212 "CP54936" => return "GB18030",
213 "CP65001" => return "UTF-8",
214 "CP936" => return "GBK",
218 String::from(s).leak()
224 ffi::{c_int, CStr, CString},
228 use libc::{self, nl_langinfo, setlocale, CODESET, LC_CTYPE};
230 unsafe fn string_from_pointer(s: *const i8) -> Option<String> {
234 Some(CStr::from_ptr(s).to_string_lossy().into())
238 fn set_locale(category: c_int, locale: Option<&str>) -> Option<String> {
240 let locale = locale.map(|s| CString::new(s).unwrap());
241 let locale_ptr = locale.as_ref().map_or(null(), |s| s.as_ptr());
242 string_from_pointer(setlocale(category, locale_ptr))
246 pub fn locale_charset() -> Option<String> {
248 let saved_locale = set_locale(LC_CTYPE, None);
249 set_locale(LC_CTYPE, Some(""));
250 let codeset = string_from_pointer(nl_langinfo(CODESET));
251 set_locale(LC_CTYPE, saved_locale.as_deref());
259 use libc::{setlocale, LC_CTYPE};
260 use std::ffi::{CStr, CString};
261 use windows_sys::Win32::Globalization::GetACP;
263 fn current_locale() -> Option<String> {
265 let empty_cstr = CString::new("").unwrap();
266 let locale = setlocale(LC_CTYPE, empty_cstr.as_ptr());
267 if locale.is_null() {
270 Some(CStr::from_ptr(locale).to_string_lossy().into())
275 pub fn locale_charset() -> Option<String> {
276 let Some(current_locale) = current_locale() else {
279 let codepage = if let Some((_, pdot)) = current_locale.rsplit_once('.') {
282 format!("CP{}", unsafe { GetACP() })
284 Some(match codepage.as_str() {
285 "CP65001" | "CPutf8" => String::from("UTF-8"),
291 #[cfg(not(any(unix, windows)))]
293 pub fn locale_charse() -> String {
294 String::from("UTF-8")
298 /// Returns the character set used by the locale configured in the operating
300 pub fn locale_charset() -> &'static str {
302 static ref LOCALE_CHARSET: &'static str =
303 map_aliases(&inner::locale_charset().unwrap_or(String::from("UTF-8")));