# It is not intended for manual editing.
version = 3
+[[package]]
+name = "addr2line"
+version = "0.22.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6e4503c46a5c0c7844e948c9a4d6acd9f50cccb4de1c48eb9e291ea17470c678"
+dependencies = [
+ "gimli",
+]
+
[[package]]
name = "adler"
version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe"
+[[package]]
+name = "aho-corasick"
+version = "1.1.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916"
+dependencies = [
+ "memchr",
+]
+
[[package]]
name = "android-tzdata"
version = "0.1.1"
"libc",
]
+[[package]]
+name = "anstream"
+version = "0.6.15"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "64e15c1ab1f89faffbf04a634d5e1962e9074f2741eef6d97f3c4e322426d526"
+dependencies = [
+ "anstyle",
+ "anstyle-parse",
+ "anstyle-query",
+ "anstyle-wincon",
+ "colorchoice",
+ "is_terminal_polyfill",
+ "utf8parse",
+]
+
+[[package]]
+name = "anstyle"
+version = "1.0.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1bec1de6f59aedf83baf9ff929c98f2ad654b97c9510f4e70cf6f661d49fd5b1"
+
+[[package]]
+name = "anstyle-parse"
+version = "0.2.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "eb47de1e80c2b463c735db5b217a0ddc39d612e7ac9e2e96a5aed1f57616c1cb"
+dependencies = [
+ "utf8parse",
+]
+
+[[package]]
+name = "anstyle-query"
+version = "1.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6d36fc52c7f6c869915e99412912f22093507da8d9e942ceaf66fe4b7c14422a"
+dependencies = [
+ "windows-sys 0.52.0",
+]
+
+[[package]]
+name = "anstyle-wincon"
+version = "3.0.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5bf74e1b6e971609db8ca7a9ce79fd5768ab6ae46441c572e46cf596f59e57f8"
+dependencies = [
+ "anstyle",
+ "windows-sys 0.52.0",
+]
+
[[package]]
name = "anyhow"
-version = "1.0.69"
+version = "1.0.86"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "224afbd727c3d6e4b90103ece64b8d1b67fbb1973b1046c2281eed3f3803f800"
+checksum = "b3d1d046238990b9cf5bcde22a3fb3584ee5cf65fb2765f454ed428c7a0063da"
+
+[[package]]
+name = "async-trait"
+version = "0.1.81"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6e0c28dcc82d7c8ead5cb13beb15405b57b8546e93215673ff8ca0349a028107"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
[[package]]
name = "atty"
"winapi",
]
+[[package]]
+name = "auto_impl"
+version = "1.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3c87f3f15e7794432337fc718554eaa4dc8f04c9677a950ffe366f20a162ae42"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
[[package]]
name = "autocfg"
-version = "1.1.0"
+version = "1.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa"
+checksum = "0c4b4d0bd25bd0b74681c0ad21497610ce1b7c91b1022cd21c80c6fbdd9476b0"
+
+[[package]]
+name = "backtrace"
+version = "0.3.73"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5cc23269a4f8976d0a4d2e7109211a419fe30e8d88d677cd60b6bc79c5732e0a"
+dependencies = [
+ "addr2line",
+ "cc",
+ "cfg-if",
+ "libc",
+ "miniz_oxide",
+ "object",
+ "rustc-demangle",
+]
[[package]]
name = "bitflags"
[[package]]
name = "bitflags"
-version = "2.5.0"
+version = "2.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cf4b9d6a944f767f8e5e0db018570623c85f3d925ac718db4e06d0187adb21c1"
+checksum = "b048fb63fd8b5923fc5aa7b340d8e156aec7ec02f0c78fa8a6ddc2613f6f71de"
[[package]]
name = "bumpalo"
-version = "3.13.0"
+version = "3.16.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c"
+
+[[package]]
+name = "bytes"
+version = "1.7.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a3e2c3daef883ecc1b5d58c15adae93470a91d425f3532ba1695849656af3fc1"
+checksum = "8318a53db07bb3f8dca91a600466bdb3f2eaadeedfdbcf02e1accbad9271ba50"
[[package]]
name = "cc"
-version = "1.0.79"
+version = "1.1.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "50d30906286121d95be3d479533b458f87493b30a4b5f79a607db8f5d11aa91f"
+checksum = "72db2f7947ecee9b03b510377e8bb9077afa27176fdbff55c51027e976fdcc48"
+dependencies = [
+ "shlex",
+]
[[package]]
name = "cfg-if"
[[package]]
name = "chrono"
-version = "0.4.26"
+version = "0.4.38"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ec837a71355b28f6556dbd569b37b3f363091c0bd4b2e735674521b4c5fd9bc5"
+checksum = "a21f936df1771bf62b77f047b726c4625ff2e8aa607c01ec06e5a05bd8463401"
dependencies = [
"android-tzdata",
"iana-time-zone",
"js-sys",
"num-traits",
- "time",
"wasm-bindgen",
- "winapi",
+ "windows-targets 0.52.6",
]
[[package]]
name = "clap"
-version = "4.1.7"
+version = "4.5.16"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2f3061d6db6d8fcbbd4b05e057f2acace52e64e96b498c08c2d7a4e65addd340"
+checksum = "ed6719fffa43d0d87e5fd8caeab59be1554fb028cd30edc88fc4369b17971019"
dependencies = [
- "bitflags 1.3.2",
+ "clap_builder",
"clap_derive",
+]
+
+[[package]]
+name = "clap_builder"
+version = "4.5.15"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "216aec2b177652e3846684cbfe25c9964d18ec45234f0f5da5157b207ed1aab6"
+dependencies = [
+ "anstream",
+ "anstyle",
"clap_lex",
- "is-terminal",
- "once_cell",
"strsim",
- "termcolor 1.2.0",
"terminal_size",
]
[[package]]
name = "clap_derive"
-version = "4.1.7"
+version = "4.5.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "34d122164198950ba84a918270a3bb3f7ededd25e15f7451673d986f55bd2667"
+checksum = "501d359d5f3dcaf6ecdeee48833ae73ec6e42723a1e52419c79abf9507eec0a0"
dependencies = [
"heck",
- "proc-macro-error",
"proc-macro2",
"quote",
- "syn 1.0.109",
+ "syn",
]
[[package]]
name = "clap_lex"
-version = "0.3.2"
+version = "0.7.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "350b9cf31731f9957399229e9b2adc51eeabdfbe9d71d9a0552275fd12710d09"
-dependencies = [
- "os_str_bytes",
-]
+checksum = "1462739cb27611015575c0c11df5df7601141071f07518d56fcc1be504cbec97"
+
+[[package]]
+name = "colorchoice"
+version = "1.0.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d3fd119d74b830634cea2a0f58bbd0d54540518a14397557951e79340abc28c0"
[[package]]
name = "core-foundation-sys"
-version = "0.8.4"
+version = "0.8.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e496a50fda8aacccc86d7529e2c1e0892dbd0f898a6b5645b5561b89c3210efa"
+checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b"
[[package]]
name = "crc32fast"
-version = "1.3.2"
+version = "1.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b540bd8bc810d3885c6ea91e2018302f68baba2129ab3e88f32389ee9370880d"
+checksum = "a97769d94ddab943e4510d138150169a2758b5ef3eb191a9ee688de3e23ef7b3"
dependencies = [
"cfg-if",
]
+[[package]]
+name = "dashmap"
+version = "5.5.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "978747c1d849a7d2ee5e8adc0159961c48fb7e5db2f06af6723b80123bb53856"
+dependencies = [
+ "cfg-if",
+ "hashbrown",
+ "lock_api",
+ "once_cell",
+ "parking_lot_core",
+]
+
[[package]]
name = "diff"
version = "0.1.13"
[[package]]
name = "encoding_rs"
-version = "0.8.32"
+version = "0.8.34"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "071a31f4ee85403370b58aca746f01041ede6f0da2730960ad001edc2b71b394"
+checksum = "b45de904aa0b010bce2ab45264d0631681847fa7b6f2eaa7dab7619943bc4f59"
dependencies = [
"cfg-if",
]
dependencies = [
"proc-macro2",
"quote",
- "syn 2.0.27",
+ "syn",
]
[[package]]
-name = "equivalent"
-version = "1.0.1"
+name = "env_filter"
+version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5"
+checksum = "4f2c92ceda6ceec50f43169f9ee8424fe2db276791afde7b2cd8bc084cb376ab"
+dependencies = [
+ "log",
+ "regex",
+]
[[package]]
-name = "errno"
-version = "0.2.8"
+name = "env_logger"
+version = "0.11.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f639046355ee4f37944e44f60642c6f3a7efa3cf6b78c78a0d989a8ce6c396a1"
+checksum = "e13fa619b91fb2381732789fc5de83b45675e882f66623b7d8cb4f643017018d"
dependencies = [
- "errno-dragonfly",
- "libc",
- "winapi",
+ "anstream",
+ "anstyle",
+ "env_filter",
+ "humantime",
+ "log",
]
[[package]]
-name = "errno"
-version = "0.3.1"
+name = "equivalent"
+version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4bcfec3a70f97c962c307b2d2c56e358cf1d00b558d74262b5f929ee8cc7e73a"
-dependencies = [
- "errno-dragonfly",
- "libc",
- "windows-sys 0.48.0",
-]
+checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5"
[[package]]
-name = "errno-dragonfly"
-version = "0.1.2"
+name = "errno"
+version = "0.3.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf"
+checksum = "534c5cf6194dfab3db3242765c03bbe257cf92f22b38f6bc0c58d59108a820ba"
dependencies = [
- "cc",
"libc",
+ "windows-sys 0.52.0",
]
[[package]]
[[package]]
name = "flate2"
-version = "1.0.26"
+version = "1.0.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3b9429470923de8e8cbd4d2dc513535400b4b3fef0319fb5c4e1f520a7bef743"
+checksum = "7f211bbe8e69bbd0cfdea405084f128ae8b4aaa6b0b522fc8f2b009084797920"
dependencies = [
"crc32fast",
"miniz_oxide",
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8bf7cc16383c4b8d58b9905a8509f02926ce3058053c056376248d958c9df1e8"
+[[package]]
+name = "form_urlencoded"
+version = "1.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e13624c2627564efccf4934284bdd98cbaa14e79b0b5a141218e507b3a823456"
+dependencies = [
+ "percent-encoding",
+]
+
+[[package]]
+name = "futures"
+version = "0.3.30"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "645c6916888f6cb6350d2550b80fb63e734897a8498abe35cfb732b6487804b0"
+dependencies = [
+ "futures-channel",
+ "futures-core",
+ "futures-io",
+ "futures-sink",
+ "futures-task",
+ "futures-util",
+]
+
+[[package]]
+name = "futures-channel"
+version = "0.3.30"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "eac8f7d7865dcb88bd4373ab671c8cf4508703796caa2b1985a9ca867b3fcb78"
+dependencies = [
+ "futures-core",
+ "futures-sink",
+]
+
+[[package]]
+name = "futures-core"
+version = "0.3.30"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dfc6580bb841c5a68e9ef15c77ccc837b40a7504914d52e47b8b0e9bbda25a1d"
+
+[[package]]
+name = "futures-io"
+version = "0.3.30"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a44623e20b9681a318efdd71c299b6b222ed6f231972bfe2f224ebad6311f0c1"
+
+[[package]]
+name = "futures-macro"
+version = "0.3.30"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "futures-sink"
+version = "0.3.30"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9fb8e00e87438d937621c1c6269e53f536c14d3fbd6a042bb24879e57d474fb5"
+
+[[package]]
+name = "futures-task"
+version = "0.3.30"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "38d84fa142264698cdce1a9f9172cf383a0c82de1bddcf3092901442c4097004"
+
+[[package]]
+name = "futures-util"
+version = "0.3.30"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3d6401deb83407ab3da39eba7e33987a73c3df0c82b4bb5813ee871c19c41d48"
+dependencies = [
+ "futures-channel",
+ "futures-core",
+ "futures-io",
+ "futures-macro",
+ "futures-sink",
+ "futures-task",
+ "memchr",
+ "pin-project-lite",
+ "pin-utils",
+ "slab",
+]
+
+[[package]]
+name = "gimli"
+version = "0.29.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "40ecd4077b5ae9fd2e9e169b102c6c330d0605168eb0e8bf79952b256dbefffd"
+
[[package]]
name = "hashbrown"
-version = "0.14.3"
+version = "0.14.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "290f1a1d9242c78d09ce40a5e87e7554ee637af1351968159f4952f028f75604"
+checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1"
[[package]]
name = "heck"
-version = "0.4.1"
+version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8"
+checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
[[package]]
name = "hermit-abi"
[[package]]
name = "hermit-abi"
-version = "0.3.1"
+version = "0.3.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fed44880c466736ef9a5c5b5facefb5ed0785676d0c02d612db14e54f0d84286"
+checksum = "d231dfb89cfffdbc30e7fc41579ed6066ad03abda9e567ccafae602b97ec5024"
[[package]]
name = "hexplay"
checksum = "0962bea6731e28b5a443ba4aa00fe3e4fe7555dadf12012435efb738eeac5898"
dependencies = [
"atty",
- "termcolor 0.3.6",
+ "termcolor",
]
+[[package]]
+name = "httparse"
+version = "1.9.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0fcc0b4a115bf80b728eb8ea024ad5bd707b615bfed49e0665b6e0f86fd082d9"
+
+[[package]]
+name = "humantime"
+version = "2.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4"
+
[[package]]
name = "iana-time-zone"
-version = "0.1.57"
+version = "0.1.60"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2fad5b825842d2b38bd206f3e81d6957625fd7f0a361e345c30e01a0ae2dd613"
+checksum = "e7ffbb5a1b541ea2561f8c41c087286cc091e21e556a4f09a8f6cbf17b69b141"
dependencies = [
"android_system_properties",
"core-foundation-sys",
"iana-time-zone-haiku",
"js-sys",
"wasm-bindgen",
- "windows",
+ "windows-core",
]
[[package]]
"cc",
]
+[[package]]
+name = "idna"
+version = "0.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "634d9b1461af396cad843f47fdba5597a4f9e6ddd4bfb6ff5d85028c25cb12f6"
+dependencies = [
+ "unicode-bidi",
+ "unicode-normalization",
+]
+
[[package]]
name = "indexmap"
-version = "2.1.0"
+version = "2.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d530e1a18b1cb4c484e6e34556a0d948706958449fca0cab753d649f2bce3d1f"
+checksum = "93ead53efc7ea8ed3cfb0c79fc8023fbb782a5432b52830b6518941cebe6505c"
dependencies = [
"equivalent",
"hashbrown",
]
[[package]]
-name = "io-lifetimes"
-version = "1.0.5"
+name = "is_terminal_polyfill"
+version = "1.70.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1abeb7a0dd0f8181267ff8adc397075586500b81b28a73e8a0208b00fc170fb3"
-dependencies = [
- "libc",
- "windows-sys 0.45.0",
-]
+checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf"
[[package]]
-name = "is-terminal"
-version = "0.4.4"
+name = "itoa"
+version = "1.0.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "21b6b32576413a8e69b90e952e4a026476040d81017b80445deda5f2d3921857"
-dependencies = [
- "hermit-abi 0.3.1",
- "io-lifetimes",
- "rustix 0.36.8",
- "windows-sys 0.45.0",
-]
+checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b"
[[package]]
name = "js-sys"
-version = "0.3.64"
+version = "0.3.70"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c5f195fe497f702db0f318b07fdd68edb16955aed830df8363d837542f8f935a"
+checksum = "1868808506b929d7b0cfa8f75951347aa71bb21144b7791bae35d9bccfcfe37a"
dependencies = [
"wasm-bindgen",
]
[[package]]
name = "lazy_static"
-version = "1.4.0"
+version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
+checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe"
[[package]]
name = "libc"
-version = "0.2.147"
+version = "0.2.158"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b4668fb0ea861c1df094127ac5f1da3409a82116a4ba74fca2e58ef927159bb3"
+checksum = "d8adc4bb1803a324070e64a98ae98f38934d91957a99cfb3a43dcbc01bc56439"
[[package]]
name = "linux-raw-sys"
-version = "0.1.4"
+version = "0.4.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f051f77a7c8e6957c0696eac88f26b0117e54f52d3fc682ab19397a8812846a4"
+checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89"
[[package]]
-name = "linux-raw-sys"
-version = "0.3.8"
+name = "lock_api"
+version = "0.4.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ef53942eb7bf7ff43a617b3e2c1c4a5ecf5944a7c1bc12d7ee39bbb15e5c1519"
+checksum = "07af8b9cdd281b7915f413fa73f29ebd5d55d0d3f0155584dade1ff18cea1b17"
+dependencies = [
+ "autocfg",
+ "scopeguard",
+]
[[package]]
name = "log"
-version = "0.4.19"
+version = "0.4.22"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b06a4cde4c0f271a446782e3eff8de789548ce57dbc8eca9292c27f4a42004b4"
+checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24"
+
+[[package]]
+name = "lsp-types"
+version = "0.94.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c66bfd44a06ae10647fe3f8214762e9369fd4248df1350924b4ef9e770a85ea1"
+dependencies = [
+ "bitflags 1.3.2",
+ "serde",
+ "serde_json",
+ "serde_repr",
+ "url",
+]
[[package]]
name = "memchr"
[[package]]
name = "miniz_oxide"
-version = "0.7.1"
+version = "0.7.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e7810e0be55b428ada41041c41f32c9f1a42817901b4ccf45fa3d4b6561e74c7"
+checksum = "b8a240ddb74feaf34a79a7add65a741f3167852fba007066dcac1ca548d89c08"
dependencies = [
"adler",
]
+[[package]]
+name = "mio"
+version = "1.0.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "80e04d1dcff3aae0704555fe5fee3bcfaf3d1fdf8a7e521d5b9d2b42acb52cec"
+dependencies = [
+ "hermit-abi 0.3.9",
+ "libc",
+ "wasi",
+ "windows-sys 0.52.0",
+]
+
[[package]]
name = "num"
-version = "0.4.0"
+version = "0.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "43db66d1170d347f9a065114077f7dccb00c1b9478c89384490a3425279a4606"
+checksum = "35bd024e8b2ff75562e5f34e7f4905839deb4b22955ef5e73d2fea1b9813cb23"
dependencies = [
"num-bigint",
"num-complex",
[[package]]
name = "num-bigint"
-version = "0.4.3"
+version = "0.4.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f93ab6289c7b344a8a9f60f88d80aa20032336fe78da341afc91c8a2341fc75f"
+checksum = "a5e44f723f1133c9deac646763579fdb3ac745e418f2a7af9cd0c431da1f20b9"
dependencies = [
- "autocfg",
"num-integer",
"num-traits",
]
[[package]]
name = "num-complex"
-version = "0.4.3"
+version = "0.4.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "02e0d21255c828d6f128a1e41534206671e8c3ea0c62f32291e808dc82cff17d"
+checksum = "73f88a1307638156682bada9d7604135552957b7818057dcef22705b4d509495"
dependencies = [
"num-traits",
]
[[package]]
name = "num-derive"
-version = "0.4.0"
+version = "0.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9e6a0fd4f737c707bd9086cc16c925f294943eb62eb71499e9fd4cf71f8b9f4e"
+checksum = "ed3955f1a9c7c0c15e092f9c887db08b1fc683305fdf6eb6684f22555355e202"
dependencies = [
"proc-macro2",
"quote",
- "syn 2.0.27",
+ "syn",
]
[[package]]
name = "num-integer"
-version = "0.1.45"
+version = "0.1.46"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "225d3389fb3509a24c93f5c29eb6bde2586b98d9f016636dff58d7c6f7569cd9"
+checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f"
dependencies = [
- "autocfg",
"num-traits",
]
[[package]]
name = "num-iter"
-version = "0.1.43"
+version = "0.1.45"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7d03e6c028c5dc5cac6e2dec0efda81fc887605bb3d884578bb6d6bf7514e252"
+checksum = "1429034a0490724d0075ebb2bc9e875d6503c3cf69e235a8941aa757d83ef5bf"
dependencies = [
"autocfg",
"num-integer",
[[package]]
name = "num-rational"
-version = "0.4.1"
+version = "0.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0638a1c9d0a3c0914158145bc76cff373a75a627e6ecbfb71cbe6f453a5a19b0"
+checksum = "f83d14da390562dca69fc84082e73e548e1ad308d24accdedd2720017cb37824"
dependencies = [
- "autocfg",
"num-bigint",
"num-integer",
"num-traits",
[[package]]
name = "num-traits"
-version = "0.2.16"
+version = "0.2.19"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f30b0abd723be7e2ffca1272140fac1a2f084c77ec3e123c192b66af1ee9e6c2"
+checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841"
dependencies = [
"autocfg",
]
+[[package]]
+name = "object"
+version = "0.36.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "27b64972346851a39438c60b341ebc01bba47464ae329e55cf343eb93964efd9"
+dependencies = [
+ "memchr",
+]
+
[[package]]
name = "once_cell"
-version = "1.17.1"
+version = "1.19.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b7e5500299e16ebb147ae15a00a942af264cf3688f47923b8fc2cd5858f23ad3"
+checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92"
[[package]]
name = "ordered-float"
-version = "3.7.0"
+version = "3.9.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2fc2dbde8f8a79f2102cc474ceb0ad68e3b80b85289ea62389b60e66777e4213"
+checksum = "f1e1c390732d15f1d48471625cd92d154e66db2c56645e29a9cd26f4699f72dc"
dependencies = [
"num-traits",
]
[[package]]
-name = "os_str_bytes"
-version = "6.4.1"
+name = "parking_lot"
+version = "0.12.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9b7820b9daea5457c9f21c69448905d723fbd21136ccf521748f23fd49e723ee"
+checksum = "f1bf18183cf54e8d6059647fc3063646a1801cf30896933ec2311622cc4b9a27"
+dependencies = [
+ "lock_api",
+ "parking_lot_core",
+]
[[package]]
-name = "proc-macro-error"
-version = "1.0.4"
+name = "parking_lot_core"
+version = "0.9.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c"
+checksum = "1e401f977ab385c9e4e3ab30627d6f26d00e2c73eef317493c4ec6d468726cf8"
dependencies = [
- "proc-macro-error-attr",
- "proc-macro2",
- "quote",
- "syn 1.0.109",
- "version_check",
+ "cfg-if",
+ "libc",
+ "redox_syscall",
+ "smallvec",
+ "windows-targets 0.52.6",
+]
+
+[[package]]
+name = "percent-encoding"
+version = "2.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e"
+
+[[package]]
+name = "pin-project"
+version = "1.1.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b6bf43b791c5b9e34c3d182969b4abb522f9343702850a2e57f460d00d09b4b3"
+dependencies = [
+ "pin-project-internal",
]
[[package]]
-name = "proc-macro-error-attr"
-version = "1.0.4"
+name = "pin-project-internal"
+version = "1.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869"
+checksum = "2f38a4412a78282e09a2cf38d195ea5420d15ba0602cb375210efbc877243965"
dependencies = [
"proc-macro2",
"quote",
- "version_check",
+ "syn",
]
+[[package]]
+name = "pin-project-lite"
+version = "0.2.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bda66fc9667c18cb2758a2ac84d1167245054bcf85d5d1aaa6923f45801bdd02"
+
+[[package]]
+name = "pin-utils"
+version = "0.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184"
+
[[package]]
name = "proc-macro2"
-version = "1.0.66"
+version = "1.0.86"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "18fb31db3f9bddb2ea821cde30a9f70117e3f119938b5ee630b7403aa6e2ead9"
+checksum = "5e719e8df665df0d1c8fbfd238015744736151d4445ec0836b8e628aae103b77"
dependencies = [
"unicode-ident",
]
version = "1.0.0"
dependencies = [
"anyhow",
- "bitflags 2.5.0",
+ "bitflags 2.6.0",
"chardetng",
"chrono",
"clap",
"windows-sys 0.48.0",
]
+[[package]]
+name = "pspp-lsp"
+version = "0.1.0"
+dependencies = [
+ "env_logger",
+ "log",
+ "pspp",
+ "tokio",
+ "tower-lsp",
+]
+
[[package]]
name = "quote"
-version = "1.0.32"
+version = "1.0.36"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "50f3b39ccfb720540debaa0164757101c08ecb8d326b15358ce76a62c7e85965"
+checksum = "0fa76aaf39101c457836aec0ce2316dbdc3ab723cdda1c6bd4e6ad4208acaca7"
dependencies = [
"proc-macro2",
]
[[package]]
-name = "rustix"
-version = "0.36.8"
+name = "redox_syscall"
+version = "0.5.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f43abb88211988493c1abb44a70efa56ff0ce98f233b7b276146f1f3f7ba9644"
+checksum = "2a908a6e00f1fdd0dfd9c0eb08ce85126f6d8bbda50017e74bc4a4b7d4a926a4"
dependencies = [
- "bitflags 1.3.2",
- "errno 0.2.8",
- "io-lifetimes",
- "libc",
- "linux-raw-sys 0.1.4",
- "windows-sys 0.45.0",
+ "bitflags 2.6.0",
+]
+
+[[package]]
+name = "regex"
+version = "1.10.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4219d74c6b67a3654a9fbebc4b419e22126d13d2f3c4a07ee0cb61ff79a79619"
+dependencies = [
+ "aho-corasick",
+ "memchr",
+ "regex-automata",
+ "regex-syntax",
+]
+
+[[package]]
+name = "regex-automata"
+version = "0.4.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "38caf58cc5ef2fed281f89292ef23f6365465ed9a41b7a7754eb4e26496c92df"
+dependencies = [
+ "aho-corasick",
+ "memchr",
+ "regex-syntax",
]
+[[package]]
+name = "regex-syntax"
+version = "0.8.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7a66a03ae7c801facd77a29370b4faec201768915ac14a721ba36f20bc9c209b"
+
+[[package]]
+name = "rustc-demangle"
+version = "0.1.24"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f"
+
[[package]]
name = "rustix"
-version = "0.37.3"
+version = "0.38.34"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "62b24138615de35e32031d041a09032ef3487a616d901ca4db224e7d557efae2"
+checksum = "70dc5ec042f7a43c4a73241207cecc9873a06d45debb38b329f8541d85c2730f"
dependencies = [
- "bitflags 1.3.2",
- "errno 0.3.1",
- "io-lifetimes",
+ "bitflags 2.6.0",
+ "errno",
"libc",
- "linux-raw-sys 0.3.8",
- "windows-sys 0.45.0",
+ "linux-raw-sys",
+ "windows-sys 0.52.0",
]
[[package]]
-name = "strsim"
-version = "0.10.0"
+name = "ryu"
+version = "1.0.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623"
+checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f"
[[package]]
-name = "syn"
-version = "1.0.109"
+name = "scopeguard"
+version = "1.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
+
+[[package]]
+name = "serde"
+version = "1.0.208"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237"
+checksum = "cff085d2cb684faa248efb494c39b68e522822ac0de72ccf08109abde717cfb2"
+dependencies = [
+ "serde_derive",
+]
+
+[[package]]
+name = "serde_derive"
+version = "1.0.208"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "24008e81ff7613ed8e5ba0cfaf24e2c2f1e5b8a0495711e44fcd4882fca62bcf"
dependencies = [
"proc-macro2",
"quote",
- "unicode-ident",
+ "syn",
]
[[package]]
-name = "syn"
-version = "2.0.27"
+name = "serde_json"
+version = "1.0.125"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "83c8e735a073ccf5be70aa8066aa984eaf2fa000db6c8d0100ae605b366d31ed"
+dependencies = [
+ "itoa",
+ "memchr",
+ "ryu",
+ "serde",
+]
+
+[[package]]
+name = "serde_repr"
+version = "0.1.19"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b60f673f44a8255b9c8c657daf66a596d435f2da81a555b06dc644d080ba45e0"
+checksum = "6c64451ba24fc7a6a2d60fc75dd9c83c90903b19028d4eff35e88fc1e86564e9"
dependencies = [
"proc-macro2",
"quote",
- "unicode-ident",
+ "syn",
]
[[package]]
-name = "termcolor"
-version = "0.3.6"
+name = "shlex"
+version = "1.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "adc4587ead41bf016f11af03e55a624c06568b5a19db4e90fde573d805074f83"
+checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64"
+
+[[package]]
+name = "signal-hook-registry"
+version = "1.4.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a9e9e0b4211b72e7b8b6e85c807d36c212bdb33ea8587f7569562a84df5465b1"
dependencies = [
- "wincolor",
+ "libc",
+]
+
+[[package]]
+name = "slab"
+version = "0.4.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8f92a496fb766b417c996b9c5e57daf2f7ad3b0bebe1ccfca4856390e3d3bb67"
+dependencies = [
+ "autocfg",
+]
+
+[[package]]
+name = "smallvec"
+version = "1.13.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67"
+
+[[package]]
+name = "socket2"
+version = "0.5.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ce305eb0b4296696835b71df73eb912e0f1ffd2556a501fcede6e0c50349191c"
+dependencies = [
+ "libc",
+ "windows-sys 0.52.0",
+]
+
+[[package]]
+name = "strsim"
+version = "0.11.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
+
+[[package]]
+name = "syn"
+version = "2.0.75"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f6af063034fc1935ede7be0122941bafa9bacb949334d090b77ca98b5817c7d9"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "unicode-ident",
]
[[package]]
name = "termcolor"
-version = "1.2.0"
+version = "0.3.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "be55cf8942feac5c765c2c993422806843c9a9a45d4d5c407ad6dd2ea95eb9b6"
+checksum = "adc4587ead41bf016f11af03e55a624c06568b5a19db4e90fde573d805074f83"
dependencies = [
- "winapi-util",
+ "wincolor",
]
[[package]]
name = "terminal_size"
-version = "0.2.6"
+version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8e6bf6f19e9f8ed8d4048dc22981458ebcf406d67e94cd422e5ecd73d63b3237"
+checksum = "21bebf2b7c9e0a515f6e0f8c51dc0f8e4696391e6f1ff30379559f8365fb0df7"
dependencies = [
- "rustix 0.37.3",
+ "rustix",
"windows-sys 0.48.0",
]
[[package]]
name = "thiserror"
-version = "1.0.39"
+version = "1.0.63"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a5ab016db510546d856297882807df8da66a16fb8c4101cb8b30054b0d5b2d9c"
+checksum = "c0342370b38b6a11b6cc11d6a805569958d54cfa061a29969c3b5ce2ea405724"
dependencies = [
"thiserror-impl",
]
[[package]]
name = "thiserror-impl"
-version = "1.0.39"
+version = "1.0.63"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5420d42e90af0c38c3290abcca25b9b3bdf379fc9f55c528f53a269d9c9a267e"
+checksum = "a4558b58466b9ad7ca0f102865eccc95938dca1a74a856f2b57b6629050da261"
dependencies = [
"proc-macro2",
"quote",
- "syn 1.0.109",
+ "syn",
]
[[package]]
-name = "time"
-version = "0.1.45"
+name = "tinyvec"
+version = "1.8.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "445e881f4f6d382d5f27c034e25eb92edd7c784ceab92a0937db7f2e9471b938"
+dependencies = [
+ "tinyvec_macros",
+]
+
+[[package]]
+name = "tinyvec_macros"
+version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1b797afad3f312d1c66a56d11d0316f916356d11bd158fbc6ca6389ff6bf805a"
+checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20"
+
+[[package]]
+name = "tokio"
+version = "1.39.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9babc99b9923bfa4804bd74722ff02c0381021eafa4db9949217e3be8e84fff5"
dependencies = [
+ "backtrace",
+ "bytes",
"libc",
- "wasi",
- "winapi",
+ "mio",
+ "parking_lot",
+ "pin-project-lite",
+ "signal-hook-registry",
+ "socket2",
+ "tokio-macros",
+ "windows-sys 0.52.0",
+]
+
+[[package]]
+name = "tokio-macros"
+version = "2.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "693d596312e88961bc67d7f1f97af8a70227d9f90c31bba5806eec004978d752"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "tokio-util"
+version = "0.7.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9cf6b47b3771c49ac75ad09a6162f53ad4b8088b76ac60e8ec1455b31a189fe1"
+dependencies = [
+ "bytes",
+ "futures-core",
+ "futures-sink",
+ "pin-project-lite",
+ "tokio",
+]
+
+[[package]]
+name = "tower"
+version = "0.4.13"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b8fa9be0de6cf49e536ce1851f987bd21a43b771b09473c3549a6c853db37c1c"
+dependencies = [
+ "futures-core",
+ "futures-util",
+ "pin-project",
+ "pin-project-lite",
+ "tower-layer",
+ "tower-service",
+]
+
+[[package]]
+name = "tower-layer"
+version = "0.3.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "121c2a6cda46980bb0fcd1647ffaf6cd3fc79a013de288782836f6df9c48780e"
+
+[[package]]
+name = "tower-lsp"
+version = "0.20.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d4ba052b54a6627628d9b3c34c176e7eda8359b7da9acd497b9f20998d118508"
+dependencies = [
+ "async-trait",
+ "auto_impl",
+ "bytes",
+ "dashmap",
+ "futures",
+ "httparse",
+ "lsp-types",
+ "memchr",
+ "serde",
+ "serde_json",
+ "tokio",
+ "tokio-util",
+ "tower",
+ "tower-lsp-macros",
+ "tracing",
+]
+
+[[package]]
+name = "tower-lsp-macros"
+version = "0.9.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "84fd902d4e0b9a4b27f2f440108dc034e1758628a9b702f8ec61ad66355422fa"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "tower-service"
+version = "0.3.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8df9b6e13f2d32c91b9bd719c00d1958837bc7dec474d94952798cc8e69eeec3"
+
+[[package]]
+name = "tracing"
+version = "0.1.40"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c3523ab5a71916ccf420eebdf5521fcef02141234bbc0b8a49f2fdc4544364ef"
+dependencies = [
+ "pin-project-lite",
+ "tracing-attributes",
+ "tracing-core",
+]
+
+[[package]]
+name = "tracing-attributes"
+version = "0.1.27"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "tracing-core"
+version = "0.1.32"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c06d3da6113f116aaee68e4d601191614c9053067f9ab7f6edbcb161237daa54"
+dependencies = [
+ "once_cell",
]
[[package]]
name = "unicase"
-version = "2.6.0"
+version = "2.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "50f37be617794602aabbeee0be4f259dc1778fabe05e2d67ee8f79326d5cb4f6"
+checksum = "f7d2d4dafb69621809a81864c9c1b864479e1235c0dd4e199924b9742439ed89"
dependencies = [
"version_check",
]
+[[package]]
+name = "unicode-bidi"
+version = "0.3.15"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "08f95100a766bf4f8f28f90d77e0a5461bbdb219042e7679bebe79004fed8d75"
+
[[package]]
name = "unicode-ident"
-version = "1.0.6"
+version = "1.0.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "84a22b9f218b40614adcb3f4ff08b703773ad44fa9423e4e0d346d5db86e4ebc"
+checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b"
+
+[[package]]
+name = "unicode-normalization"
+version = "0.1.23"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a56d1686db2308d901306f92a263857ef59ea39678a5458e7cb17f01415101f5"
+dependencies = [
+ "tinyvec",
+]
[[package]]
name = "unicode-width"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0336d538f7abc86d282a4189614dfaa90810dfc2c6f6427eaf88e16311dd225d"
+[[package]]
+name = "url"
+version = "2.5.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "22784dbdf76fdde8af1aeda5622b546b422b6fc585325248a2bf9f5e41e94d6c"
+dependencies = [
+ "form_urlencoded",
+ "idna",
+ "percent-encoding",
+ "serde",
+]
+
[[package]]
name = "utf8-decode"
version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ca61eb27fa339aa08826a29f03e87b99b4d8f0fc2255306fd266bb1b6a9de498"
+[[package]]
+name = "utf8parse"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821"
+
[[package]]
name = "version_check"
-version = "0.9.4"
+version = "0.9.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f"
+checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a"
[[package]]
name = "wasi"
-version = "0.10.0+wasi-snapshot-preview1"
+version = "0.11.0+wasi-snapshot-preview1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1a143597ca7c7793eff794def352d41792a93c481eb1042423ff7ff72ba2c31f"
+checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"
[[package]]
name = "wasm-bindgen"
-version = "0.2.87"
+version = "0.2.93"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7706a72ab36d8cb1f80ffbf0e071533974a60d0a308d01a5d0375bf60499a342"
+checksum = "a82edfc16a6c469f5f44dc7b571814045d60404b55a0ee849f9bcfa2e63dd9b5"
dependencies = [
"cfg-if",
+ "once_cell",
"wasm-bindgen-macro",
]
[[package]]
name = "wasm-bindgen-backend"
-version = "0.2.87"
+version = "0.2.93"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5ef2b6d3c510e9625e5fe6f509ab07d66a760f0885d858736483c32ed7809abd"
+checksum = "9de396da306523044d3302746f1208fa71d7532227f15e347e2d93e4145dd77b"
dependencies = [
"bumpalo",
"log",
"once_cell",
"proc-macro2",
"quote",
- "syn 2.0.27",
+ "syn",
"wasm-bindgen-shared",
]
[[package]]
name = "wasm-bindgen-macro"
-version = "0.2.87"
+version = "0.2.93"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "dee495e55982a3bd48105a7b947fd2a9b4a8ae3010041b9e0faab3f9cd028f1d"
+checksum = "585c4c91a46b072c92e908d99cb1dcdf95c5218eeb6f3bf1efa991ee7a68cccf"
dependencies = [
"quote",
"wasm-bindgen-macro-support",
[[package]]
name = "wasm-bindgen-macro-support"
-version = "0.2.87"
+version = "0.2.93"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "54681b18a46765f095758388f2d0cf16eb8d4169b639ab575a8f5693af210c7b"
+checksum = "afc340c74d9005395cf9dd098506f7f44e38f2b4a21c6aaacf9a105ea5e1e836"
dependencies = [
"proc-macro2",
"quote",
- "syn 2.0.27",
+ "syn",
"wasm-bindgen-backend",
"wasm-bindgen-shared",
]
[[package]]
name = "wasm-bindgen-shared"
-version = "0.2.87"
+version = "0.2.93"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ca6ad05a4870b2bf5fe995117d3728437bd27d7cd5f06f13c17443ef369775a1"
+checksum = "c62a0a307cb4a311d3a07867860911ca130c3494e8c2719593806c08bc5d0484"
[[package]]
name = "winapi"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
-[[package]]
-name = "winapi-util"
-version = "0.1.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178"
-dependencies = [
- "winapi",
-]
-
[[package]]
name = "winapi-x86_64-pc-windows-gnu"
version = "0.4.0"
]
[[package]]
-name = "windows"
-version = "0.48.0"
+name = "windows-core"
+version = "0.52.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e686886bc078bc1b0b600cac0147aadb815089b6e4da64016cbd754b6342700f"
+checksum = "33ab640c8d7e35bf8ba19b884ba838ceb4fba93a4e8c65a9059d08afcfc683d9"
dependencies = [
- "windows-targets 0.48.1",
+ "windows-targets 0.52.6",
]
[[package]]
name = "windows-sys"
-version = "0.45.0"
+version = "0.48.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "75283be5efb2831d37ea142365f009c02ec203cd29a3ebecbc093d52315b66d0"
+checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9"
dependencies = [
- "windows-targets 0.42.1",
+ "windows-targets 0.48.5",
]
[[package]]
name = "windows-sys"
-version = "0.48.0"
+version = "0.52.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9"
+checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d"
dependencies = [
- "windows-targets 0.48.1",
+ "windows-targets 0.52.6",
]
[[package]]
name = "windows-targets"
-version = "0.42.1"
+version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8e2522491fbfcd58cc84d47aeb2958948c4b8982e9a2d8a2a35bbaed431390e7"
+checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c"
dependencies = [
- "windows_aarch64_gnullvm 0.42.1",
- "windows_aarch64_msvc 0.42.1",
- "windows_i686_gnu 0.42.1",
- "windows_i686_msvc 0.42.1",
- "windows_x86_64_gnu 0.42.1",
- "windows_x86_64_gnullvm 0.42.1",
- "windows_x86_64_msvc 0.42.1",
+ "windows_aarch64_gnullvm 0.48.5",
+ "windows_aarch64_msvc 0.48.5",
+ "windows_i686_gnu 0.48.5",
+ "windows_i686_msvc 0.48.5",
+ "windows_x86_64_gnu 0.48.5",
+ "windows_x86_64_gnullvm 0.48.5",
+ "windows_x86_64_msvc 0.48.5",
]
[[package]]
name = "windows-targets"
-version = "0.48.1"
+version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "05d4b17490f70499f20b9e791dcf6a299785ce8af4d709018206dc5b4953e95f"
+checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973"
dependencies = [
- "windows_aarch64_gnullvm 0.48.0",
- "windows_aarch64_msvc 0.48.0",
- "windows_i686_gnu 0.48.0",
- "windows_i686_msvc 0.48.0",
- "windows_x86_64_gnu 0.48.0",
- "windows_x86_64_gnullvm 0.48.0",
- "windows_x86_64_msvc 0.48.0",
+ "windows_aarch64_gnullvm 0.52.6",
+ "windows_aarch64_msvc 0.52.6",
+ "windows_i686_gnu 0.52.6",
+ "windows_i686_gnullvm",
+ "windows_i686_msvc 0.52.6",
+ "windows_x86_64_gnu 0.52.6",
+ "windows_x86_64_gnullvm 0.52.6",
+ "windows_x86_64_msvc 0.52.6",
]
[[package]]
name = "windows_aarch64_gnullvm"
-version = "0.42.1"
+version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8c9864e83243fdec7fc9c5444389dcbbfd258f745e7853198f365e3c4968a608"
+checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8"
[[package]]
name = "windows_aarch64_gnullvm"
-version = "0.48.0"
+version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "91ae572e1b79dba883e0d315474df7305d12f569b400fcf90581b06062f7e1bc"
+checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3"
[[package]]
name = "windows_aarch64_msvc"
-version = "0.42.1"
+version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4c8b1b673ffc16c47a9ff48570a9d85e25d265735c503681332589af6253c6c7"
+checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc"
[[package]]
name = "windows_aarch64_msvc"
-version = "0.48.0"
+version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b2ef27e0d7bdfcfc7b868b317c1d32c641a6fe4629c171b8928c7b08d98d7cf3"
+checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469"
[[package]]
name = "windows_i686_gnu"
-version = "0.42.1"
+version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "de3887528ad530ba7bdbb1faa8275ec7a1155a45ffa57c37993960277145d640"
+checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e"
[[package]]
name = "windows_i686_gnu"
-version = "0.48.0"
+version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "622a1962a7db830d6fd0a69683c80a18fda201879f0f447f065a3b7467daa241"
+checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b"
+
+[[package]]
+name = "windows_i686_gnullvm"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66"
[[package]]
name = "windows_i686_msvc"
-version = "0.42.1"
+version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bf4d1122317eddd6ff351aa852118a2418ad4214e6613a50e0191f7004372605"
+checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406"
[[package]]
name = "windows_i686_msvc"
-version = "0.48.0"
+version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4542c6e364ce21bf45d69fdd2a8e455fa38d316158cfd43b3ac1c5b1b19f8e00"
+checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66"
[[package]]
name = "windows_x86_64_gnu"
-version = "0.42.1"
+version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c1040f221285e17ebccbc2591ffdc2d44ee1f9186324dd3e84e99ac68d699c45"
+checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e"
[[package]]
name = "windows_x86_64_gnu"
-version = "0.48.0"
+version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ca2b8a661f7628cbd23440e50b05d705db3686f894fc9580820623656af974b1"
+checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78"
[[package]]
name = "windows_x86_64_gnullvm"
-version = "0.42.1"
+version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "628bfdf232daa22b0d64fdb62b09fcc36bb01f05a3939e20ab73aaf9470d0463"
+checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc"
[[package]]
name = "windows_x86_64_gnullvm"
-version = "0.48.0"
+version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7896dbc1f41e08872e9d5e8f8baa8fdd2677f29468c4e156210174edc7f7b953"
+checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d"
[[package]]
name = "windows_x86_64_msvc"
-version = "0.42.1"
+version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "447660ad36a13288b1db4d4248e857b510e8c3a225c822ba4fb748c0aafecffd"
+checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538"
[[package]]
name = "windows_x86_64_msvc"
-version = "0.48.0"
+version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1a515f5799fe4961cb532f983ce2b23082366b898e52ffbce459c86f67c8378a"
+checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
-[package]
-name = "pspp"
-version = "1.0.0"
-edition = "2021"
-authors = [ "Ben Pfaff", "John Darrington" ]
-
-[dependencies]
-anyhow = "1.0.69"
-clap = { version = "4.1.7", features = ["derive", "wrap_help"] }
-encoding_rs = "0.8.32"
-flate2 = "1.0.26"
-float_next_after = "1.0.0"
-hexplay = "0.2.1"
-lazy_static = "1.4.0"
-num = "0.4.0"
-num-derive = "0.4.0"
-num-traits = "0.2.16"
-ordered-float = "3.7.0"
-thiserror = "1.0"
-chrono = "0.4.26"
-finl_unicode = "1.2.0"
-unicase = "2.6.0"
-libc = "0.2.147"
-indexmap = "2.1.0"
-utf8-decode = "1.0.1"
-bitflags = "2.5.0"
-unicode-width = "0.1.13"
-chardetng = "0.1.17"
-enum-map = "2.7.3"
-flagset = "0.4.6"
-
-[target.'cfg(windows)'.dependencies]
-windows-sys = { version = "0.48.0", features = ["Win32_Globalization"] }
-
-[build-dependencies]
-anyhow = "1.0.69"
-
-[[bin]]
-name = "pspp-dump-sav"
-path = "src/main.rs"
-
-[lib]
-path = "src/lib.rs"
-
-[[test]]
-name = "sack"
-path = "tests/sack.rs"
-harness = false
-
-[dev-dependencies]
-diff = "0.1.13"
+[workspace]
+members = [
+ "pspp",
+ "pspp-lsp",
+]
+resolver = "2"
+++ /dev/null
-use anyhow::{anyhow, Result as AnyResult};
-use std::{
- collections::{BTreeMap, HashSet, VecDeque},
- env::var_os,
- fs::{read_to_string, File},
- io::{Error as IoError, Write},
- path::{Path, PathBuf},
-};
-
-#[derive(Copy, Clone, PartialEq, Eq, Ord, PartialOrd)]
-enum Source {
- Codepage,
- Ibm,
- Windows,
-}
-
-// Code page number.
-type CodepageNumber = usize;
-
-fn process_converter<'a>(
- fields: &Vec<&'a str>,
- codepages: &mut BTreeMap<CodepageNumber, BTreeMap<Source, Vec<&'a str>>>,
-) {
- if fields.is_empty() || fields[0] == "{" {
- return;
- }
-
- let mut cps: BTreeMap<Source, CodepageNumber> = BTreeMap::new();
- let mut iana = VecDeque::new();
- let mut other = VecDeque::new();
-
- let mut iter = fields.iter().peekable();
- while let Some(&name) = iter.next() {
- if iter.next_if(|&&s| s == "{").is_some() {
- let mut standards = HashSet::new();
- loop {
- let &standard = iter.next().expect("missing `}` in list of standards");
- if standard == "}" {
- break;
- }
- standards.insert(standard);
- }
-
- if standards.contains("IANA*") {
- iana.push_front(name);
- } else if standards.contains("IANA") {
- iana.push_back(name);
- } else if standards.iter().any(|&s| s.ends_with('*')) {
- other.push_front(name);
- } else {
- other.push_back(name);
- }
- } else {
- // Untagged names are completely nonstandard.
- continue;
- }
-
- if let Some(number) = name.strip_prefix("cp") {
- if let Ok(number) = number.parse::<CodepageNumber>() {
- cps.insert(Source::Codepage, number);
- }
- }
-
- if let Some(number) = name.strip_prefix("windows-") {
- if let Ok(number) = number.parse::<CodepageNumber>() {
- cps.insert(Source::Windows, number);
- }
- }
-
- if let Some(number) = name.strip_prefix("ibm-") {
- if let Ok(number) = number.parse::<CodepageNumber>() {
- cps.insert(Source::Ibm, number);
- }
- }
- }
-
- // If there are no tagged names then this is completely nonstandard.
- if iana.is_empty() && other.is_empty() {
- return;
- }
-
- let all: Vec<&str> = iana.into_iter().chain(other).collect();
- for (source, number) in cps {
- codepages
- .entry(number)
- .or_default()
- .insert(source, all.clone());
- }
-}
-
-fn write_output(
- codepages: &BTreeMap<CodepageNumber, BTreeMap<Source, Vec<&str>>>,
- file_name: &PathBuf,
-) -> Result<(), IoError> {
- let mut file = File::create(file_name)?;
-
- file.write_all(
- "\
-use lazy_static::lazy_static;
-use std::collections::HashMap;
-
-lazy_static! {
- static ref CODEPAGE_NUMBER_TO_NAME: HashMap<i32, &'static str> = {
- let mut map = HashMap::new();
-"
- .as_bytes(),
- )?;
-
- for (&cpnumber, value) in codepages.iter() {
- let source = value.keys().max().unwrap();
- let name = value[source][0];
- writeln!(file, " map.insert({cpnumber}, \"{name}\");")?;
- }
- file.write_all(
- " map
- };
-
- static ref CODEPAGE_NAME_TO_NUMBER: HashMap<&'static str, u32> = {
- let mut map = HashMap::new();
-"
- .as_bytes(),
- )?;
-
- let mut names: BTreeMap<String, BTreeMap<Source, Vec<CodepageNumber>>> = BTreeMap::new();
- for (&cpnumber, value) in codepages.iter() {
- for (&source, value2) in value.iter() {
- for name in value2.iter().map(|name| name.to_ascii_lowercase()) {
- names
- .entry(name)
- .or_default()
- .entry(source)
- .or_default()
- .push(cpnumber);
- }
- }
- }
-
- for (name, value) in names.iter() {
- for (_source, numbers) in value.iter().rev().take(1) {
- writeln!(file, " map.insert(\"{name}\", {});", numbers[0])?;
- }
- }
- file.write_all(
- " map
- };
-}
-"
- .as_bytes(),
- )?;
-
- Ok(())
-}
-
-fn main() -> AnyResult<()> {
- println!("cargo:rerun-if-changed=build.rs");
-
- let input_file = Path::new(env!("CARGO_MANIFEST_DIR")).join("convrtrs.txt");
- println!("cargo:rerun-if-changed={}", input_file.to_string_lossy());
- let input = read_to_string(&input_file)
- .map_err(|e| anyhow!("{}: read failed ({e})", input_file.to_string_lossy()))?;
-
- let mut codepages: BTreeMap<CodepageNumber, BTreeMap<Source, Vec<&str>>> = BTreeMap::new();
- let mut converter: Vec<&str> = Vec::new();
- for line in input.lines() {
- let line = line
- .find('#')
- .map(|position| &line[..position])
- .unwrap_or(line)
- .trim_end();
- if !line.starts_with([' ', '\t']) {
- process_converter(&converter, &mut codepages);
- converter.clear();
- }
- converter.extend(line.split_whitespace());
- }
- process_converter(&converter, &mut codepages);
-
- let output_file_name = Path::new(&var_os("OUT_DIR").unwrap()).join("encodings.rs");
-
- write_output(&codepages, &output_file_name)
- .map_err(|e| anyhow!("{}: write failed ({e})", output_file_name.to_string_lossy()))?;
-
- Ok(())
-}
+++ /dev/null
-# Copyright (C) 2016 and later: Unicode, Inc. and others.
-# License & terms of use: http://www.unicode.org/copyright.html
-# ******************************************************************************
-# *
-# * Copyright (C) 1995-2014, International Business Machines
-# * Corporation and others. All Rights Reserved.
-# *
-# ******************************************************************************
-
-# If this converter alias table looks very confusing, a much easier to
-# understand view can be found at this demo:
-# http://demo.icu-project.org/icu-bin/convexp
-
-# IMPORTANT NOTE
-#
-# This file is not read directly by ICU. If you change it, you need to
-# run gencnval, and eventually run pkgdata to update the representation that
-# ICU uses for aliases. The gencnval tool will normally compile this file into
-# cnvalias.icu. The gencnval -v verbose option will help you when you edit
-# this file.
-
-# Please be friendly to the rest of us that edit this table by
-# keeping this table free of tabs.
-
-# This is an alias file used by the character set converter.
-# A lot of converter information can be found in unicode/ucnv.h, but here
-# is more information about this file.
-#
-# If you are adding a new converter to this list and want to include it in the
-# icu data library, please be sure to add an entry to the appropriate ucm*.mk file
-# (see ucmfiles.mk for more information).
-#
-# Here is the file format using BNF-like syntax:
-#
-# converterTable ::= tags { converterLine* }
-# converterLine ::= converterName [ tags ] { taggedAlias* }'\n'
-# taggedAlias ::= alias [ tags ]
-# tags ::= '{' { tag+ } '}'
-# tag ::= standard['*']
-# converterName ::= [0-9a-zA-Z:_'-']+
-# alias ::= converterName
-#
-# Except for the converter name, aliases are case insensitive.
-# Names are separated by whitespace.
-# Line continuation and comment sytax are similar to the GNU make syntax.
-# Any lines beginning with whitespace (e.g. U+0020 SPACE or U+0009 HORIZONTAL
-# TABULATION) are presumed to be a continuation of the previous line.
-# The # symbol starts a comment and the comment continues till the end of
-# the line.
-#
-# The converter
-#
-# All names can be tagged by including a space-separated list of tags in
-# curly braces, as in ISO_8859-1:1987{IANA*} iso-8859-1 { MIME* } or
-# some-charset{MIME* IANA*}. The order of tags does not matter, and
-# whitespace is allowed between the tagged name and the tags list.
-#
-# The tags can be used to get standard names using ucnv_getStandardName().
-#
-# The complete list of recognized tags used in this file is defined in
-# the affinity list near the beginning of the file.
-#
-# The * after the standard tag denotes that the previous alias is the
-# preferred (default) charset name for that standard. There can only
-# be one of these default charset names per converter.
-
-
-
-# The world is getting more complicated...
-# Supporting XML parsers, HTML, MIME, and similar applications
-# that mark encodings with a charset name can be difficult.
-# Many of these applications and operating systems will update
-# their codepages over time.
-
-# It means that a new codepage, one that differs from an
-# old one by changing a code point, e.g., to the Euro sign,
-# must not get an old alias, because it would mean that
-# old files with this alias would be interpreted differently.
-
-# If an codepage gets updated by assigning characters to previously
-# unassigned code points, then a new name is not necessary.
-# Also, some codepages map unassigned codepage byte values
-# to the same numbers in Unicode for roundtripping. It may be
-# industry practice to keep the encoding name in such a case, too
-# (example: Windows codepages).
-
-# The aliases listed in the list of character sets
-# that is maintained by the IANA (http://www.iana.org/) must
-# not be changed to mean encodings different from what this
-# list shows. Currently, the IANA list is at
-# http://www.iana.org/assignments/character-sets
-# It should also be mentioned that the exact mapping table used for each
-# IANA names usually isn't specified. This means that some other applications
-# and operating systems are left to interpret the exact mappings for the
-# underspecified aliases. For instance, Shift-JIS on a Solaris platform
-# may be different from Shift-JIS on a Windows platform. This is why
-# some of the aliases can be tagged to differentiate different mapping
-# tables with the same alias. If an alias is given to more than one converter,
-# it is considered to be an ambiguous alias, and the affinity list will
-# choose the converter to use when a standard isn't specified with the alias.
-
-# Name matching is case-insensitive. Also, dashes '-', underscores '_'
-# and spaces ' ' are ignored in names (thus cs-iso_latin-1, csisolatin1
-# and "cs iso latin 1" are the same).
-# However, the names in the left column are directly file names
-# or names of algorithmic converters, and their case must not
-# be changed - or else code and/or file names must also be changed.
-# For example, the converter ibm-921 is expected to be the file ibm-921.cnv.
-
-
-
-# The immediately following list is the affinity list of supported standard tags.
-# When multiple converters have the same alias under different standards,
-# the standard nearest to the top of this list with that alias will
-# be the first converter that will be opened. The ordering of the aliases
-# after this affinity list does not affect the preferred alias, but it may
-# affect the order of the returned list of aliases for a given converter.
-#
-# The general ordering is from specific and frequently used to more general
-# or rarely used at the bottom.
-{ UTR22 # Name format specified by https://www.unicode.org/reports/tr22/
- # ICU # Can also use ICU_FEATURE
- IBM # The IBM CCSID number is specified by ibm-*
- WINDOWS # The Microsoft code page identifier number is specified by windows-*. The rest are recognized IE names.
- JAVA # Source: Sun JDK. Alias name case is ignored, but dashes are not ignored.
- # GLIBC
- # AIX
- # DB2
- # SOLARIS
- # APPLE
- # HPUX
- IANA # Source: http://www.iana.org/assignments/character-sets
- MIME # Source: http://www.iana.org/assignments/character-sets
- # MSIE # MSIE is Internet Explorer, which can be different from Windows (From the IMultiLanguage COM interface)
- # ZOS_USS # z/OS (os/390) Unix System Services (USS), which has NL<->LF swapping. They have the same format as the IBM tag.
- }
-
-
-
-# Fully algorithmic converters
-
-UTF-8 { IANA* MIME* JAVA* WINDOWS }
- ibm-1208 { IBM* } # UTF-8 with IBM PUA
- ibm-1209 { IBM } # UTF-8
- ibm-5304 { IBM } # Unicode 2.0, UTF-8 with IBM PUA
- ibm-5305 { IBM } # Unicode 2.0, UTF-8
- ibm-13496 { IBM } # Unicode 3.0, UTF-8 with IBM PUA
- ibm-13497 { IBM } # Unicode 3.0, UTF-8
- ibm-17592 { IBM } # Unicode 4.0, UTF-8 with IBM PUA
- ibm-17593 { IBM } # Unicode 4.0, UTF-8
- windows-65001 { WINDOWS* }
- cp1208
- x-UTF_8J
- unicode-1-1-utf-8
- unicode-2-0-utf-8
-
-# The ICU 2.2 UTF-16/32 converters detect and write a BOM.
-UTF-16 { IANA* MIME* JAVA* } ISO-10646-UCS-2 { IANA }
- ibm-1204 { IBM* } # UTF-16 with IBM PUA and BOM sensitive
- ibm-1205 { IBM } # UTF-16 BOM sensitive
- unicode
- csUnicode
- ucs-2
-# The following Unicode CCSIDs (IBM) are not valid in ICU because they are
-# considered pure DBCS (exactly 2 bytes) of Unicode,
-# and they are a subset of Unicode. ICU does not support their encoding structures.
-# 1400 1401 1402 1410 1414 1415 1446 1447 1448 1449 64770 64771 65520 5496 5497 5498 9592 13688
-UTF-16BE { IANA* MIME* JAVA* } x-utf-16be { JAVA }
- UnicodeBigUnmarked { JAVA } # java.io name
- ibm-1200 { IBM* } # UTF-16 BE with IBM PUA
- ibm-1201 { IBM } # UTF-16 BE
- ibm-13488 { IBM } # Unicode 2.0, UTF-16 BE with IBM PUA
- ibm-13489 { IBM } # Unicode 2.0, UTF-16 BE
- ibm-17584 { IBM } # Unicode 3.0, UTF-16 BE with IBM PUA
- ibm-17585 { IBM } # Unicode 3.0, UTF-16 BE
- ibm-21680 { IBM } # Unicode 4.0, UTF-16 BE with IBM PUA
- ibm-21681 { IBM } # Unicode 4.0, UTF-16 BE
- ibm-25776 { IBM } # Unicode 4.1, UTF-16 BE with IBM PUA
- ibm-25777 { IBM } # Unicode 4.1, UTF-16 BE
- ibm-29872 { IBM } # Unicode 5.0, UTF-16 BE with IBM PUA
- ibm-29873 { IBM } # Unicode 5.0, UTF-16 BE
- ibm-61955 { IBM } # UTF-16BE with Gaidai University (Japan) PUA
- ibm-61956 { IBM } # UTF-16BE with Microsoft HKSCS-Big 5 PUA
- windows-1201 { WINDOWS* }
- cp1200
- cp1201
- UTF16_BigEndian
- # ibm-5297 { IBM } # Unicode 2.0, UTF-16 (BE) (reserved, never used)
- # iso-10646-ucs-2 { JAVA } # This is ambiguous
- # ibm-61952 is not a valid CCSID because it's Unicode 1.1
- # ibm-61953 is not a valid CCSID because it's Unicode 1.0
-UTF-16LE { IANA* MIME* JAVA* } x-utf-16le { JAVA }
- UnicodeLittleUnmarked { JAVA } # java.io name
- ibm-1202 { IBM* } # UTF-16 LE with IBM PUA
- ibm-1203 { IBM } # UTF-16 LE
- ibm-13490 { IBM } # Unicode 2.0, UTF-16 LE with IBM PUA
- ibm-13491 { IBM } # Unicode 2.0, UTF-16 LE
- ibm-17586 { IBM } # Unicode 3.0, UTF-16 LE with IBM PUA
- ibm-17587 { IBM } # Unicode 3.0, UTF-16 LE
- ibm-21682 { IBM } # Unicode 4.0, UTF-16 LE with IBM PUA
- ibm-21683 { IBM } # Unicode 4.0, UTF-16 LE
- ibm-25778 { IBM } # Unicode 4.1, UTF-16 LE with IBM PUA
- ibm-25779 { IBM } # Unicode 4.1, UTF-16 LE
- ibm-29874 { IBM } # Unicode 5.0, UTF-16 LE with IBM PUA
- ibm-29875 { IBM } # Unicode 5.0, UTF-16 LE
- UTF16_LittleEndian
- windows-1200 { WINDOWS* }
-
-UTF-32 { IANA* MIME* } ISO-10646-UCS-4 { IANA }
- ibm-1236 { IBM* } # UTF-32 with IBM PUA and BOM sensitive
- ibm-1237 { IBM } # UTF-32 BOM sensitive
- csUCS4
- ucs-4
-UTF-32BE { IANA* } UTF32_BigEndian
- ibm-1232 { IBM* } # UTF-32 BE with IBM PUA
- ibm-1233 { IBM } # UTF-32 BE
- ibm-9424 { IBM } # Unicode 4.1, UTF-32 BE with IBM PUA
-UTF-32LE { IANA* } UTF32_LittleEndian
- ibm-1234 { IBM* } # UTF-32 LE, with IBM PUA
- ibm-1235 { IBM } # UTF-32 LE
-
-# ICU-specific names for special uses
-UTF16_PlatformEndian
-UTF16_OppositeEndian
-
-UTF32_PlatformEndian
-UTF32_OppositeEndian
-
-
-# Java-specific, non-Unicode-standard UTF-16 variants.
-# These are in the Java "Basic Encoding Set (contained in lib/rt.jar)".
-# See the "Supported Encodings" at
-# http://java.sun.com/javase/6/docs/technotes/guides/intl/encoding.doc.html
-# or a newer version of this document.
-#
-# Aliases marked with { JAVA* } are canonical names for java.io and java.lang APIs.
-# Aliases marked with { JAVA } are canonical names for the java.nio API.
-#
-# "BOM" means the Unicode Byte Order Mark, which is the encoding-scheme-specific
-# byte sequence for U+FEFF.
-# "Reverse BOM" means the BOM for the sibling encoding scheme with the
-# opposite endianness. (LE<->BE)
-
-# "Sixteen-bit Unicode (or UCS) Transformation Format, big-endian byte order,
-# with byte-order mark"
-#
-# From Unicode: Writes BOM.
-# To Unicode: Detects and consumes BOM.
-# If there is a "reverse BOM", Java throws
-# MalformedInputException: Incorrect byte-order mark.
-# In this case, ICU4C sets a U_ILLEGAL_ESCAPE_SEQUENCE UErrorCode value
-# and a UCNV_ILLEGAL UConverterCallbackReason.
-UTF-16BE,version=1 UnicodeBig { JAVA* }
-
-# "Sixteen-bit Unicode (or UCS) Transformation Format, little-endian byte order,
-# with byte-order mark"
-#
-# From Unicode: Writes BOM.
-# To Unicode: Detects and consumes BOM.
-# If there is a "reverse BOM", Java throws
-# MalformedInputException: Incorrect byte-order mark.
-# In this case, ICU4C sets a U_ILLEGAL_ESCAPE_SEQUENCE UErrorCode value
-# and a UCNV_ILLEGAL UConverterCallbackReason.
-UTF-16LE,version=1 UnicodeLittle { JAVA* } x-UTF-16LE-BOM { JAVA }
-
-# This one is not mentioned on the "Supported Encodings" page
-# but is available in Java.
-# In Java, this is called "Unicode" but we cannot give it that alias
-# because the standard UTF-16 converter already has a "unicode" alias.
-#
-# From Unicode: Writes BOM.
-# To Unicode: Detects and consumes BOM.
-# If there is no BOM, rather than defaulting to BE, Java throws
-# MalformedInputException: Missing byte-order mark.
-# In this case, ICU4C sets a U_ILLEGAL_ESCAPE_SEQUENCE UErrorCode value
-# and a UCNV_ILLEGAL UConverterCallbackReason.
-UTF-16,version=1
-
-# This is the same as standard UTF-16 but always writes a big-endian byte stream,
-# regardless of the platform endianness, as expected by the Java compatibility tests.
-# See the java.nio.charset.Charset API documentation at
-# http://java.sun.com/javase/6/docs/api/java/nio/charset/Charset.html
-# or a newer version of this document.
-#
-# From Unicode: Write BE BOM and BE bytes
-# To Unicode: Detects and consumes BOM. Defaults to BE.
-UTF-16,version=2
-
-# Note: ICU does not currently support Java-specific, non-Unicode-standard UTF-32 variants.
-# Presumably, these behave analogously to the UTF-16 variants with similar names.
-# UTF_32BE_BOM x-UTF-32BE-BOM
-# UTF_32LE_BOM x-UTF-32LE-BOM
-
-# End of Java-specific, non-Unicode-standard UTF variants.
-
-
-# On UTF-7:
-# RFC 2152 (http://www.imc.org/rfc2152) allows to encode some US-ASCII
-# characters directly or in base64. Especially, the characters in set O
-# as defined in the RFC (!"#$%&*;<=>@[]^_`{|}) may be encoded directly
-# but are not allowed in, e.g., email headers.
-# By default, the ICU UTF-7 converter encodes set O directly.
-# By choosing the option "version=1", set O will be escaped instead.
-# For example:
-# utf7Converter=ucnv_open("UTF-7,version=1");
-#
-# For details about email headers see RFC 2047.
-UTF-7 { IANA* MIME* WINDOWS } windows-65000 { WINDOWS* }
- unicode-1-1-utf-7
- unicode-2-0-utf-7
-
-# UTF-EBCDIC doesn't exist in ICU, but the aliases are here for reference.
-#UTF-EBCDIC ibm-1210 { IBM* } ibm-1211 { IBM }
-
-# IMAP-mailbox-name is an ICU-specific name for the encoding of IMAP mailbox names.
-# It is a substantially modified UTF-7 encoding. See the specification in:
-#
-# RFC 2060: INTERNET MESSAGE ACCESS PROTOCOL - VERSION 4rev1
-# (http://www.ietf.org/rfc/rfc2060.txt)
-# Section 5.1.3. Mailbox International Naming Convention
-IMAP-mailbox-name
-
-SCSU { IANA* }
- ibm-1212 { IBM } # SCSU with IBM PUA
- ibm-1213 { IBM* } # SCSU
-BOCU-1 { IANA* }
- csBOCU-1 { IANA }
- ibm-1214 { IBM } # BOCU-1 with IBM PUA
- ibm-1215 { IBM* } # BOCU-1
-
-# See https://www.unicode.org/reports/tr26 for this Compatibility Encoding Scheme for UTF-16
-# The Unicode Consortium does not encourage the use of CESU-8
-CESU-8 { IANA* } ibm-9400 { IBM* }
-
-# Standard iso-8859-1, which does not have the Euro update.
-# See iso-8859-15 (latin9) for the Euro update
-ISO-8859-1 { MIME* IANA JAVA* }
- ibm-819 { IBM* JAVA } # This is not truely ibm-819 because it's missing the fallbacks.
- IBM819 { IANA }
- cp819 { IANA JAVA }
- latin1 { IANA JAVA }
- 8859_1 { JAVA }
- csISOLatin1 { IANA JAVA }
- iso-ir-100 { IANA JAVA }
- ISO_8859-1:1987 { IANA* JAVA }
- l1 { IANA JAVA }
- 819 { JAVA }
- # windows-28591 { WINDOWS* } # This has odd behavior because it has the Euro update, which isn't correct.
- # LATIN_1 # Old ICU name
- # ANSI_X3.110-1983 # This is for a different IANA alias. This isn't iso-8859-1.
-
-US-ASCII { MIME* IANA JAVA WINDOWS }
- ASCII { JAVA* IANA WINDOWS }
- ANSI_X3.4-1968 { IANA* WINDOWS }
- ANSI_X3.4-1986 { IANA WINDOWS }
- ISO_646.irv:1991 { IANA WINDOWS }
- iso_646.irv:1983 { JAVA }
- ISO646-US { JAVA IANA WINDOWS }
- us { IANA }
- csASCII { IANA WINDOWS }
- iso-ir-6 { IANA }
- cp367 { IANA WINDOWS }
- ascii7 { JAVA }
- 646 { JAVA }
- windows-20127 { WINDOWS* }
- ibm-367 { IBM* } IBM367 { IANA WINDOWS } # This is not truely ibm-367 because it's missing the fallbacks.
-
-# GB 18030 is partly algorithmic, using the MBCS converter
-gb18030 { IANA* } ibm-1392 { IBM* } windows-54936 { WINDOWS* } GB18030 { MIME* }
-
-# Table-based interchange codepages
-
-# Central Europe
-ibm-912_P100-1995 { UTR22* }
- ibm-912 { IBM* JAVA }
- ISO-8859-2 { MIME* IANA JAVA* WINDOWS }
- ISO_8859-2:1987 { IANA* WINDOWS JAVA }
- latin2 { IANA WINDOWS JAVA }
- csISOLatin2 { IANA WINDOWS JAVA }
- iso-ir-101 { IANA WINDOWS JAVA }
- l2 { IANA WINDOWS JAVA }
- 8859_2 { JAVA }
- cp912 { JAVA }
- 912 { JAVA }
- windows-28592 { WINDOWS* }
-
-# Maltese Esperanto
-ibm-913_P100-2000 { UTR22* }
- ibm-913 { IBM* JAVA }
- ISO-8859-3 { MIME* IANA WINDOWS JAVA* }
- ISO_8859-3:1988 { IANA* WINDOWS JAVA }
- latin3 { IANA JAVA WINDOWS }
- csISOLatin3 { IANA WINDOWS }
- iso-ir-109 { IANA WINDOWS JAVA }
- l3 { IANA WINDOWS JAVA }
- 8859_3 { JAVA }
- cp913 { JAVA }
- 913 { JAVA }
- windows-28593 { WINDOWS* }
-
-# Baltic
-ibm-914_P100-1995 { UTR22* }
- ibm-914 { IBM* JAVA }
- ISO-8859-4 { MIME* IANA WINDOWS JAVA* }
- latin4 { IANA WINDOWS JAVA }
- csISOLatin4 { IANA WINDOWS JAVA }
- iso-ir-110 { IANA WINDOWS JAVA }
- ISO_8859-4:1988 { IANA* WINDOWS JAVA }
- l4 { IANA WINDOWS JAVA }
- 8859_4 { JAVA }
- cp914 { JAVA }
- 914 { JAVA }
- windows-28594 { WINDOWS* }
-
-# Cyrillic
-ibm-915_P100-1995 { UTR22* }
- ibm-915 { IBM* JAVA }
- ISO-8859-5 { MIME* IANA WINDOWS JAVA* }
- cyrillic { IANA WINDOWS JAVA }
- csISOLatinCyrillic { IANA WINDOWS JAVA }
- iso-ir-144 { IANA WINDOWS JAVA }
- ISO_8859-5:1988 { IANA* WINDOWS JAVA }
- 8859_5 { JAVA }
- cp915 { JAVA }
- 915 { JAVA }
- windows-28595 { WINDOWS* }
-
-glibc-PT154-2.3.3 { UTR22* }
- PTCP154 { IANA* }
- csPTCP154
- PT154
- CP154
- Cyrillic-Asian
-
-# Arabic
-# ISO_8859-6-E and ISO_8859-6-I are similar to this charset, but BiDi is done differently
-# From a narrow mapping point of view, there is no difference.
-# -E means explicit. -I means implicit.
-# -E requires the client to handle the ISO 6429 bidirectional controls
-ibm-1089_P100-1995 { UTR22* }
- ibm-1089 { IBM* JAVA }
- ISO-8859-6 { MIME* IANA WINDOWS JAVA* }
- arabic { IANA WINDOWS JAVA }
- csISOLatinArabic { IANA WINDOWS JAVA }
- iso-ir-127 { IANA WINDOWS JAVA }
- ISO_8859-6:1987 { IANA* WINDOWS JAVA }
- ECMA-114 { IANA JAVA }
- ASMO-708 { IANA JAVA }
- 8859_6 { JAVA }
- cp1089 { JAVA }
- 1089 { JAVA }
- windows-28596 { WINDOWS* }
- ISO-8859-6-I { IANA MIME } # IANA considers this alias different and BiDi needs to be applied.
- ISO-8859-6-E { IANA MIME } # IANA considers this alias different and BiDi needs to be applied.
- x-ISO-8859-6S { JAVA }
-
-# ISO Greek (with euro update). This is really ISO_8859-7:2003
-ibm-9005_X110-2007 { UTR22* }
- ibm-9005 { IBM* }
- ISO-8859-7 { MIME* IANA JAVA* WINDOWS }
- 8859_7 { JAVA }
- greek { IANA JAVA WINDOWS }
- greek8 { IANA JAVA WINDOWS }
- ELOT_928 { IANA JAVA WINDOWS }
- ECMA-118 { IANA JAVA WINDOWS }
- csISOLatinGreek { IANA JAVA WINDOWS }
- iso-ir-126 { IANA JAVA WINDOWS }
- ISO_8859-7:1987 { IANA* JAVA WINDOWS }
- windows-28597 { WINDOWS* }
- sun_eu_greek # For Solaris
-
-# ISO Greek (w/o euro update)
-# JDK 1.5 has these aliases.
-ibm-813_P100-1995 { UTR22* }
- ibm-813 { IBM* JAVA* }
- cp813 { JAVA }
- 813 { JAVA }
-
-# hebrew
-# ISO_8859-8-E and ISO_8859-8-I are similar to this charset, but BiDi is done differently
-# From a narrow mapping point of view, there is no difference.
-# -E means explicit. -I means implicit.
-# -E requires the client to handle the ISO 6429 bidirectional controls
-# This matches the official mapping on unicode.org
-ibm-5012_P100-1999 { UTR22* }
- ibm-5012 { IBM* }
- ISO-8859-8 { MIME* IANA WINDOWS JAVA* }
- hebrew { IANA WINDOWS JAVA }
- csISOLatinHebrew { IANA WINDOWS JAVA }
- iso-ir-138 { IANA WINDOWS JAVA }
- ISO_8859-8:1988 { IANA* WINDOWS JAVA }
- ISO-8859-8-I { IANA MIME } # IANA and Windows considers this alias different and BiDi needs to be applied.
- ISO-8859-8-E { IANA MIME } # IANA and Windows considers this alias different and BiDi needs to be applied.
- 8859_8 { JAVA }
- windows-28598 { WINDOWS* } # Hebrew (ISO-Visual). A hybrid between ibm-5012 and ibm-916 with extra PUA mappings.
- hebrew8 # Reflect HP-UX code page update
-
-# Unfortunately, the Java aliases are split across ibm-916 and ibm-5012
-# Also many platforms are a combination between ibm-916 and ibm-5012 behaviors
-ibm-916_P100-1995 { UTR22* }
- ibm-916 { IBM* JAVA* }
- cp916 { JAVA }
- 916 { JAVA }
-
-# Turkish
-ibm-920_P100-1995 { UTR22* }
- ibm-920 { IBM* JAVA }
- ISO-8859-9 { MIME* IANA WINDOWS JAVA* }
- latin5 { IANA WINDOWS JAVA }
- csISOLatin5 { IANA JAVA }
- iso-ir-148 { IANA WINDOWS JAVA }
- ISO_8859-9:1989 { IANA* WINDOWS }
- l5 { IANA WINDOWS JAVA }
- 8859_9 { JAVA }
- cp920 { JAVA }
- 920 { JAVA }
- windows-28599 { WINDOWS* }
- ECMA-128 # IANA doesn't have this alias 6/24/2002
- turkish8 # Reflect HP-UX codepage update 8/1/2008
- turkish # Reflect HP-UX codepage update 8/1/2008
-
-# Nordic languages
-iso-8859_10-1998 { UTR22* } ISO-8859-10 { MIME* IANA* }
- iso-ir-157 { IANA }
- l6 { IANA }
- ISO_8859-10:1992 { IANA }
- csISOLatin6 { IANA }
- latin6 { IANA }
-
-# Thai
-# Be warned. There several iso-8859-11 codepage variants, and they are all incompatible.
-# ISO-8859-11 is a superset of TIS-620. The difference is that ISO-8859-11 contains the C1 control codes.
-iso-8859_11-2001 { UTR22* } ISO-8859-11
- thai8 # HP-UX alias. HP-UX says TIS-620, but it's closer to ISO-8859-11.
- x-iso-8859-11 { JAVA* }
-
-# iso-8859-13, PC Baltic (w/o euro update)
-ibm-921_P100-1995 { UTR22* }
- ibm-921 { IBM* }
- ISO-8859-13 { IANA* MIME* JAVA* }
- 8859_13 { JAVA }
- windows-28603 { WINDOWS* }
- cp921
- 921
- x-IBM921 { JAVA }
-
-# Celtic
-iso-8859_14-1998 { UTR22* } ISO-8859-14 { IANA* }
- iso-ir-199 { IANA }
- ISO_8859-14:1998 { IANA }
- latin8 { IANA }
- iso-celtic { IANA }
- l8 { IANA }
-
-# Latin 9
-ibm-923_P100-1998 { UTR22* }
- ibm-923 { IBM* JAVA }
- ISO-8859-15 { IANA* MIME* WINDOWS JAVA* }
- Latin-9 { IANA WINDOWS }
- l9 { WINDOWS }
- 8859_15 { JAVA }
- latin0 { JAVA }
- csisolatin0 { JAVA }
- csisolatin9 { JAVA }
- iso8859_15_fdis { JAVA }
- cp923 { JAVA }
- 923 { JAVA }
- windows-28605 { WINDOWS* }
-
-# CJK encodings
-
-ibm-942_P12A-1999 { UTR22* } # ibm-942_P120 is a rarely used alternate mapping (sjis78 is already old)
- ibm-942 { IBM* }
- ibm-932 { IBM }
- cp932
- shift_jis78
- sjis78
- ibm-942_VSUB_VPUA
- ibm-932_VSUB_VPUA
- x-IBM942 { JAVA* }
- x-IBM942C { JAVA }
- # Is this "JIS_C6226-1978"?
-
-# ibm-943_P15A-2003 differs from windows-932-2000 only in a few roundtrip mappings:
-# - the usual IBM PC control code rotation (1A-1C-7F)
-# - the Windows table has roundtrips for bytes 80, A0, and FD-FF to U+0080 and PUA
-ibm-943_P15A-2003 { UTR22* }
- ibm-943 # Leave untagged because this isn't the default
- Shift_JIS { IANA* MIME* WINDOWS JAVA }
- MS_Kanji { IANA WINDOWS JAVA }
- csShiftJIS { IANA WINDOWS JAVA }
- windows-31j { IANA JAVA } # A further extension of Shift_JIS to include NEC special characters (Row 13)
- csWindows31J { IANA WINDOWS JAVA } # A further extension of Shift_JIS to include NEC special characters (Row 13)
- x-sjis { WINDOWS JAVA }
- x-ms-cp932 { WINDOWS }
- cp932 { WINDOWS }
- windows-932 { WINDOWS* }
- cp943c { JAVA* } # This is slightly different, but the backslash mapping is the same.
- IBM-943C #{ AIX* } # Add this tag once AIX aliases becomes available
- ms932
- pck # Probably SOLARIS
- sjis # This might be for ibm-1351
- ibm-943_VSUB_VPUA
- x-MS932_0213 { JAVA }
- x-JISAutoDetect { JAVA }
- # cp943 # This isn't Windows, and no one else uses it.
- # IANA says that Windows-31J is an extension to csshiftjis ibm-932
-ibm-943_P130-1999 { UTR22* }
- ibm-943 { IBM* JAVA }
- Shift_JIS # Leave untagged because this isn't the default
- cp943 { JAVA* } # This is slightly different, but the backslash mapping is the same.
- 943 { JAVA }
- ibm-943_VASCII_VSUB_VPUA
- x-IBM943 { JAVA }
- # japanese. Unicode name is \u30b7\u30d5\u30c8\u7b26\u53f7\u5316\u8868\u73fe
-ibm-33722_P12A_P12A-2009_U2 { UTR22* }
- ibm-33722 # Leave untagged because this isn't the default
- ibm-5050 # Leave untagged because this isn't the default, and yes this alias is correct
- ibm-33722_VPUA
- IBM-eucJP
-windows-51932-2006 { UTR22* }
- windows-51932 { WINDOWS* }
- CP51932 { IANA* }
- csCP51932
-ibm-33722_P120-1999 { UTR22* } # Japan EUC with \ <-> Yen mapping
- ibm-33722 { IBM* JAVA }
- ibm-5050 { IBM } # Yes this is correct
- cp33722 { JAVA* }
- 33722 { JAVA }
- ibm-33722_VASCII_VPUA
- x-IBM33722 { JAVA }
- x-IBM33722A { JAVA }
- x-IBM33722C { JAVA }
-# ibm-954 seems to be almost a superset of ibm-33722 and ibm-1350
-# ibm-1350 seems to be almost a superset of ibm-33722
-# ibm-954 contains more PUA characters than the others.
-ibm-954_P101-2007 { UTR22* }
- ibm-954 { IBM* }
- x-IBM954 { JAVA* }
- x-IBM954C { JAVA }
- # eucJP # This is closest to Solaris EUC-JP.
-euc-jp-2007 { UTR22* }
- EUC-JP { MIME* IANA JAVA* WINDOWS* }
- Extended_UNIX_Code_Packed_Format_for_Japanese { IANA* JAVA WINDOWS }
- csEUCPkdFmtJapanese { IANA JAVA WINDOWS }
- X-EUC-JP { MIME JAVA WINDOWS } # Japan EUC. x-euc-jp is a MIME name
- eucjis {JAVA}
- ujis # Linux sometimes uses this name. This is an unfortunate generic and rarely used name. Its use is discouraged.
-
-aix-IBM_udcJP-4.3.6 { UTR22* }
- x-IBM-udcJP { JAVA* }
-
-java-euc_jp_linux-1.6_P { UTR22* }
- euc-jp-linux
- x-EUC_JP_LINUX { JAVA* }
-
-java-sjis_0213-1.6_P { UTR22* }
- x-SJIS_0213 { JAVA* }
-
-# Here are various interpretations and extensions of Big5
-ibm-1373_P100-2002 { UTR22* } # IBM's interpretation of Windows' Taiwan Big-5 without HKSCS extensions
- ibm-1373 { IBM* }
- windows-950 # Alternate mapping. Leave untagged. This is the IBM interpretation of a Windows codepage.
-windows-950-2000 { UTR22* }
- Big5 { IANA* MIME* JAVA* WINDOWS }
- csBig5 { IANA WINDOWS }
- windows-950 { WINDOWS* }
- x-windows-950 { JAVA }
- x-big5
- ms950
-ibm-950_P110-1999 { UTR22* } # Taiwan Big-5 (w/o euro update)
- ibm-950 { IBM* JAVA }
- cp950 { JAVA* }
- 950 { JAVA }
- x-IBM950 { JAVA }
-ibm-1375_P100-2008 { UTR22* } # Big5-HKSCS-2004 with Unicode 3.1 mappings. This uses supplementary characters.
- ibm-1375 { IBM* }
- Big5-HKSCS { IANA* JAVA* }
- big5hk { JAVA }
- HKSCS-BIG5 # From http://www.openi18n.org/localenameguide/
-ibm-5471_P100-2006 { UTR22* } # Big5-HKSCS-2001 with Unicode 3.0 mappings. This uses many PUA characters.
- ibm-5471 { IBM* }
- Big5-HKSCS
- MS950_HKSCS { JAVA* }
- hkbig5 # from HP-UX 11i, which can't handle supplementary characters.
- big5-hkscs:unicode3.0
- x-MS950-HKSCS { JAVA }
- # windows-950 # Windows-950 can be w/ or w/o HKSCS extensions. By default it's not.
- # windows-950_hkscs
-solaris-zh_TW_big5-2.7 { UTR22* }
- Big5_Solaris { JAVA* }
- x-Big5-Solaris { JAVA }
-# GBK
-ibm-1386_P100-2001 { UTR22* }
- ibm-1386 { IBM* }
- cp1386
- windows-936 # Alternate mapping. Leave untagged. This is the IBM interpretation of a Windows codepage.
- ibm-1386_VSUB_VPUA
-windows-936-2000 { UTR22* }
- GBK { IANA* WINDOWS JAVA* }
- CP936 { IANA JAVA }
- MS936 { IANA } # In JDK 1.5, this goes to x-mswin-936. This is an IANA name split.
- windows-936 { IANA WINDOWS* JAVA }
-
-# Java has two different tables for ibm-1383 and gb2312. We pick closest set for tagging.
-ibm-1383_P110-1999 { UTR22* } # China EUC.
- ibm-1383 { IBM* JAVA }
- GB2312 { IANA* MIME* }
- csGB2312 { IANA }
- cp1383 { JAVA* }
- 1383 { JAVA }
- EUC-CN # According to other platforms, windows-20936 looks more like euc-cn. x-euc-cn is also a MIME name
- ibm-eucCN
- hp15CN # From HP-UX?
- ibm-1383_VPUA
- # gb # This is not an IANA name. gb in IANA means Great Britain.
-
-ibm-5478_P100-1995 { UTR22* } ibm-5478 { IBM* } # This gb_2312_80 DBCS mapping is needed by iso-2022.
- GB_2312-80 { IANA* } # Windows maps this alias incorrectly
- chinese { IANA }
- iso-ir-58 { IANA }
- csISO58GB231280 { IANA }
- gb2312-1980
- GB2312.1980-0 # From X11R6
-
-euc-tw-2014 { UTR22* } # Updated EUC-TW converter based on ibm-964
- EUC-TW
-
-ibm-964_P110-1999 { UTR22* } # Taiwan EUC. x-euc-tw is a MIME name
- ibm-964 { IBM* JAVA }
- ibm-eucTW
- cns11643
- cp964 { JAVA* }
- 964 { JAVA }
- ibm-964_VPUA
- x-IBM964 { JAVA }
-
-# ISO-2022 needs one, and other people may need others.
-ibm-949_P110-1999 { UTR22* }
- ibm-949 { IBM* JAVA }
- cp949 { JAVA* }
- 949 { JAVA }
- ibm-949_VASCII_VSUB_VPUA
- x-IBM949 { JAVA }
-ibm-949_P11A-1999 { UTR22* }
- ibm-949 # Leave untagged because this isn't the default
- cp949c { JAVA* }
- ibm-949_VSUB_VPUA
- x-IBM949C { JAVA }
- IBM-949C { JAVA }
-
-# Korean EUC.
-#
-# <quote from="Jungshik Shin">
-# EUC-KR = KS X 1003/ISO 646-KR or ISO 646-IRV/US-ASCII in GL and KS X 1001:1998 (formerly KS C 5601-1987) in GR.
-#
-# Although widely spread on MS Windows, using
-# KS C 5601 or related names to denote EUC-KR or
-# windows-949 is very much misleading. KS C 5601-1987
-# is NOT suitable as a designation for MIME charset
-# and MBCS. It's just the name of a 94 x 94 Korean
-# coded character set standard which can be invoked
-# on either GL (with MSB reset) or GR (with MSB set).
-# Note that JOHAB (windows-1361) specified in
-# KS X 1001:1998 annex 3 (KS C 5601-1992 annex 3)
-# is a _seprate_ MBCS with a _completely different_
-# mapping.
-# </quote>
-#
-# The following aliases tries to mirror the poor state of alias recognition
-# on these platforms.
-#
-# ibm-970 is almost a subset of ibm-1363.
-# Java, Solaris and AIX use euc-kr to also mean ksc5601.
-# Java has both ibm-970 and EUC-KR as separate converters.
-ibm-970_P110_P110-2006_U2 { UTR22* }
- ibm-970 { IBM* JAVA }
- EUC-KR { IANA* MIME* WINDOWS JAVA }
- KS_C_5601-1987 { JAVA }
- windows-51949 { WINDOWS* }
- csEUCKR { IANA WINDOWS } # x-euc-kr is also a MIME name
- ibm-eucKR { JAVA }
- KSC_5601 { JAVA } # Needed by iso-2022
- 5601 { JAVA }
- cp970 { JAVA* }
- 970 { JAVA }
- ibm-970_VPUA
- x-IBM970 { JAVA }
-
-# ibm-971 is almost the set of DBCS mappings of ibm-970
-ibm-971_P100-1995 ibm-971 { IBM* } ibm-971_VPUA x-IBM971 { JAVA* }
-
-# Java, Solaris and AIX use euc-kr to also mean ksc5601, and _sometimes_ for Windows too.
-# ibm-1363 is almost a superset of ibm-970.
-ibm-1363_P11B-1998 { UTR22* }
- ibm-1363 # Leave untagged because this isn't the default
- KS_C_5601-1987 { IANA* }
- KS_C_5601-1989 { IANA }
- KSC_5601 { IANA }
- csKSC56011987 { IANA }
- korean { IANA }
- iso-ir-149 { IANA }
- cp1363 { MIME* }
- 5601
- ksc
- windows-949 # Alternate mapping. Leave untagged. This is the IBM interpretation of a Windows codepage.
- ibm-1363_VSUB_VPUA
- x-IBM1363C { JAVA* }
- # ks_x_1001:1992
- # ksc5601-1992
-
-ibm-1363_P110-1997 { UTR22* } # Korean KSC MBCS with \ <-> Won mapping
- ibm-1363 { IBM* }
- ibm-1363_VASCII_VSUB_VPUA
- x-IBM1363 { JAVA* }
-
-windows-949-2000 { UTR22* }
- windows-949 { JAVA* WINDOWS* }
- KS_C_5601-1987 { WINDOWS }
- KS_C_5601-1989 { WINDOWS }
- KSC_5601 { MIME* WINDOWS } # Needed by iso-2022
- csKSC56011987 { WINDOWS }
- korean { WINDOWS }
- iso-ir-149 { WINDOWS }
- ms949 { JAVA }
- x-KSC5601 { JAVA }
-
-windows-1361-2000 { UTR22* }
- ksc5601_1992
- ms1361
- johab
- x-Johab { JAVA* }
-
-windows-874-2000 { UTR22* } # Thai (w/ euro update)
- TIS-620 { WINDOWS }
- windows-874 { JAVA* WINDOWS* }
- MS874 { JAVA }
- x-windows-874 { JAVA }
- # iso-8859-11 { WINDOWS } # iso-8859-11 is similar to TIS-620. ibm-13162 is a closer match.
-
-ibm-874_P100-1995 { UTR22* } # Thai PC (w/o euro update).
- ibm-874 { IBM* JAVA }
- ibm-9066 { IBM } # Yes ibm-874 == ibm-9066. ibm-1161 has the euro update.
- cp874 { JAVA* }
- TIS-620 { IANA* JAVA } # This is actually separate from ibm-874, which is similar to this table
- tis620.2533 { JAVA } # This is actually separate from ibm-874, which is similar to this table
- eucTH # eucTH is an unusual alias from Solaris. eucTH has fewer mappings than TIS620
- x-IBM874 { JAVA }
-
-ibm-1162_P100-1999 { UTR22* } # Thai (w/ euro update)
- ibm-1162 { IBM* }
-
-windows-864-2000 { UTR22* }
- ibm-864s
- cp864s
- x-IBM864S { JAVA* }
-
-# Platform codepages
-# If Java supports the IBM prefix, it should also support the ibm- prefix too.
-ibm-437_P100-1995 { UTR22* } ibm-437 { IBM* } IBM437 { IANA* WINDOWS JAVA } cp437 { IANA WINDOWS JAVA* } 437 { IANA WINDOWS JAVA } csPC8CodePage437 { IANA JAVA } windows-437 { WINDOWS* } # PC US
-ibm-720_P100-1997 { UTR22* } ibm-720 { IBM* } windows-720 { WINDOWS* } DOS-720 { WINDOWS } x-IBM720 { JAVA* } # PC Arabic
-ibm-737_P100-1997 { UTR22* } ibm-737 { IBM* } IBM737 { WINDOWS JAVA } cp737 { JAVA* } windows-737 { WINDOWS* } 737 { JAVA } x-IBM737 { JAVA } # PC Greek
-ibm-775_P100-1996 { UTR22* } ibm-775 { IBM* } IBM775 { IANA* WINDOWS JAVA } cp775 { IANA WINDOWS JAVA* } csPC775Baltic { IANA } windows-775 { WINDOWS* } 775 { JAVA } # PC Baltic
-ibm-850_P100-1995 { UTR22* } ibm-850 { IBM* } IBM850 { IANA* MIME* WINDOWS JAVA } cp850 { IANA MIME WINDOWS JAVA* } 850 { IANA JAVA } csPC850Multilingual { IANA JAVA } windows-850 { WINDOWS* } # PC latin1
-ibm-851_P100-1995 { UTR22* } ibm-851 { IBM* } IBM851 { IANA* } cp851 { IANA MIME* } 851 { IANA } csPC851 { IANA } # PC DOS Greek (w/o euro)
-ibm-852_P100-1995 { UTR22* } ibm-852 { IBM* } IBM852 { IANA* WINDOWS JAVA } cp852 { IANA WINDOWS JAVA* } 852 { IANA WINDOWS JAVA } csPCp852 { IANA JAVA } windows-852 { WINDOWS* } # PC latin2 (w/o euro update)
-ibm-855_P100-1995 { UTR22* } ibm-855 { IBM* } IBM855 { IANA* JAVA } cp855 { IANA JAVA* } 855 { IANA } csIBM855 { IANA } csPCp855 { JAVA } windows-855 { WINDOWS* } # PC cyrillic (w/o euro update)
-ibm-856_P100-1995 { UTR22* } ibm-856 { IBM* } IBM856 { JAVA } cp856 { JAVA* } 856 { JAVA } x-IBM856 { JAVA } # PC Hebrew implicit order
-ibm-857_P100-1995 { UTR22* } ibm-857 { IBM* } IBM857 { IANA* MIME* WINDOWS JAVA } cp857 { IANA MIME JAVA* } 857 { IANA JAVA } csIBM857 { IANA JAVA } windows-857 { WINDOWS* } # PC Latin 5 (w/o euro update)
-ibm-858_P100-1997 { UTR22* } ibm-858 { IBM* } IBM00858 { IANA* MIME* JAVA } CCSID00858 { IANA JAVA } CP00858 { IANA JAVA } PC-Multilingual-850+euro { IANA } cp858 { MIME JAVA* } windows-858 { WINDOWS* } # PC latin1 with Euro
-ibm-860_P100-1995 { UTR22* } ibm-860 { IBM* } IBM860 { IANA* MIME* JAVA } cp860 { IANA MIME JAVA* } 860 { IANA JAVA } csIBM860 { IANA JAVA } # PC Portugal
-ibm-861_P100-1995 { UTR22* } ibm-861 { IBM* } IBM861 { IANA* MIME* WINDOWS JAVA } cp861 { IANA MIME JAVA* } 861 { IANA JAVA } cp-is { IANA JAVA } csIBM861 { IANA JAVA } windows-861 { WINDOWS* } # PC Iceland
-ibm-862_P100-1995 { UTR22* } ibm-862 { IBM* } IBM862 { IANA* MIME* JAVA } cp862 { IANA MIME JAVA* } 862 { IANA JAVA } csPC862LatinHebrew { IANA JAVA } DOS-862 { WINDOWS } windows-862 { WINDOWS* } # PC Hebrew visual order (w/o euro update)
-ibm-863_P100-1995 { UTR22* } ibm-863 { IBM* } IBM863 { IANA* MIME* JAVA } cp863 { IANA MIME JAVA* } 863 { IANA JAVA } csIBM863 { IANA JAVA } # PC Canadian French
-ibm-864_X110-1999 { UTR22* } ibm-864 { IBM* } IBM864 { IANA* MIME* JAVA } cp864 { IANA MIME JAVA* } csIBM864 { IANA JAVA } # PC Arabic (w/o euro update)
-ibm-865_P100-1995 { UTR22* } ibm-865 { IBM* } IBM865 { IANA* MIME* JAVA } cp865 { IANA MIME JAVA* } 865 { IANA JAVA } csIBM865 { IANA JAVA } # PC Nordic
-ibm-866_P100-1995 { UTR22* } ibm-866 { IBM* } IBM866 { IANA* MIME* JAVA } cp866 { IANA MIME WINDOWS JAVA* } 866 { IANA JAVA } csIBM866 { IANA JAVA } windows-866 { WINDOWS* } # PC Russian (w/o euro update)
-ibm-867_P100-1998 { UTR22* } ibm-867 { IBM* } x-IBM867 { JAVA* } # PC Hebrew (w/ euro update) Updated version of ibm-862
-ibm-868_P100-1995 { UTR22* } ibm-868 { IBM* } IBM868 { IANA* MIME* JAVA } CP868 { IANA MIME JAVA* } 868 { JAVA } csIBM868 { IANA } cp-ar { IANA } # PC Urdu
-ibm-869_P100-1995 { UTR22* } ibm-869 { IBM* } IBM869 { IANA* MIME* WINDOWS JAVA } cp869 { IANA MIME JAVA* } 869 { IANA JAVA } cp-gr { IANA JAVA } csIBM869 { IANA JAVA } windows-869 { WINDOWS* } # PC Greek (w/o euro update)
-ibm-878_P100-1996 { UTR22* } ibm-878 { IBM* } KOI8-R { IANA* MIME* WINDOWS JAVA* } koi8 { WINDOWS JAVA } csKOI8R { IANA WINDOWS JAVA } windows-20866 { WINDOWS* } cp878 # Russian internet
-ibm-901_P100-1999 { UTR22* } ibm-901 { IBM* } # PC Baltic (w/ euro update), update of ibm-921
-ibm-902_P100-1999 { UTR22* } ibm-902 { IBM* } # PC Estonian (w/ euro update), update of ibm-922
-ibm-922_P100-1999 { UTR22* } ibm-922 { IBM* } IBM922 { JAVA } cp922 { JAVA* } 922 { JAVA } x-IBM922 { JAVA } # PC Estonian (w/o euro update)
-ibm-1168_P100-2002 { UTR22* } ibm-1168 { IBM* } KOI8-U { IANA* WINDOWS } windows-21866 { WINDOWS* } # Ukrainian KOI8. koi8-ru != KOI8-U and Microsoft is wrong for aliasing them as the same.
-ibm-4909_P100-1999 { UTR22* } ibm-4909 { IBM* } # ISO Greek (w/ euro update), update of ibm-813
-
-# The cp aliases in this section aren't really windows aliases, but it was used by ICU for Windows.
-# cp is usually used to denote IBM in Java, and that is why we don't do that anymore.
-# The windows-* aliases mean windows codepages.
-ibm-5346_P100-1998 { UTR22* } ibm-5346 { IBM* } windows-1250 { IANA* JAVA* WINDOWS* } cp1250 { WINDOWS JAVA } # Windows Latin2 (w/ euro update)
-ibm-5347_P100-1998 { UTR22* } ibm-5347 { IBM* } windows-1251 { IANA* JAVA* WINDOWS* } cp1251 { WINDOWS JAVA } ANSI1251 # Windows Cyrillic (w/ euro update). ANSI1251 is from Solaris
-ibm-5348_P100-1997 { UTR22* } ibm-5348 { IBM* } windows-1252 { IANA* JAVA* WINDOWS* } cp1252 { JAVA } # Windows Latin1 (w/ euro update)
-ibm-5349_P100-1998 { UTR22* } ibm-5349 { IBM* } windows-1253 { IANA* JAVA* WINDOWS* } cp1253 { JAVA } # Windows Greek (w/ euro update)
-ibm-5350_P100-1998 { UTR22* } ibm-5350 { IBM* } windows-1254 { IANA* JAVA* WINDOWS* } cp1254 { JAVA } # Windows Turkish (w/ euro update)
-ibm-9447_P100-2002 { UTR22* } ibm-9447 { IBM* } windows-1255 { IANA* JAVA* WINDOWS* } cp1255 { JAVA } # Windows Hebrew (w/ euro update)
-ibm-9448_X100-2005 { UTR22* } ibm-9448 { IBM* } windows-1256 { IANA* JAVA* WINDOWS* } cp1256 { WINDOWS JAVA } x-windows-1256S { JAVA } # Windows Arabic (w/ euro update)
-ibm-9449_P100-2002 { UTR22* } ibm-9449 { IBM* } windows-1257 { IANA* JAVA* WINDOWS* } cp1257 { JAVA } # Windows Baltic (w/ euro update)
-ibm-5354_P100-1998 { UTR22* } ibm-5354 { IBM* } windows-1258 { IANA* JAVA* WINDOWS* } cp1258 { JAVA } # Windows Vietnamese (w/ euro update)
-
-# These tables are out of date, and most don't have the Euro
-# Leave the windows- variants untagged. They are alternate tables of the newer ones above.
-ibm-1250_P100-1995 { UTR22* } ibm-1250 { IBM* } windows-1250 # Old Windows Latin2 (w/o euro update)
-ibm-1251_P100-1995 { UTR22* } ibm-1251 { IBM* } windows-1251 # Old Windows Cyrillic (w/o euro update)
-ibm-1252_P100-2000 { UTR22* } ibm-1252 { IBM* } windows-1252 # Old Windows Latin 1 without Euro
-ibm-1253_P100-1995 { UTR22* } ibm-1253 { IBM* } windows-1253 # Old Windows Greek (w/o euro update)
-ibm-1254_P100-1995 { UTR22* } ibm-1254 { IBM* } windows-1254 # Old Windows Turkish (w/o euro update)
-ibm-1255_P100-1995 { UTR22* } ibm-1255 { IBM* } # Very old Windows Hebrew (w/o euro update)
-ibm-5351_P100-1998 { UTR22* } ibm-5351 { IBM* } windows-1255 # Old Windows Hebrew (w/ euro update)
-ibm-1256_P110-1997 { UTR22* } ibm-1256 { IBM* } # Old Windows Arabic (w/o euro update)
-ibm-5352_P100-1998 { UTR22* } ibm-5352 { IBM* } windows-1256 # Somewhat old Windows Arabic (w/ euro update)
-ibm-1257_P100-1995 { UTR22* } ibm-1257 { IBM* } # Old Windows Baltic (w/o euro update)
-ibm-5353_P100-1998 { UTR22* } ibm-5353 { IBM* } windows-1257 # Somewhat old Windows Baltic (w/ euro update)
-ibm-1258_P100-1997 { UTR22* } ibm-1258 { IBM* } windows-1258 # Old Windows Vietnamese (w/o euro update)
-
-macos-0_2-10.2 { UTR22* } macintosh { IANA* MIME* WINDOWS } mac { IANA } csMacintosh { IANA } windows-10000 { WINDOWS* } macroman { JAVA } x-macroman { JAVA* } # Apple latin 1
-macos-6_2-10.4 { UTR22* } x-mac-greek { MIME* WINDOWS } windows-10006 { WINDOWS* } macgr x-MacGreek { JAVA* } # Apple Greek
-macos-7_3-10.2 { UTR22* } x-mac-cyrillic { MIME* WINDOWS } windows-10007 { WINDOWS* } mac-cyrillic maccy x-MacCyrillic { JAVA } x-MacUkraine { JAVA* } # Apple Cyrillic
-macos-21-10.5 { UTR22* } x-mac-thai { MIME* } x-MacThai { JAVA* } MacThai { JAVA }
-macos-29-10.2 { UTR22* } x-mac-centraleurroman { MIME* } windows-10029 { WINDOWS* } x-mac-ce { WINDOWS } macce maccentraleurope x-MacCentralEurope { JAVA* } # Apple Central Europe
-macos-33-10.5 { UTR22* } x-mac-symbol { MIME* } x-MacSymbol { JAVA* } MacSymbol { JAVA }
-macos-34-10.2 { UTR22* } x-mac-dingbat { MIME* } x-MacDingbat { JAVA* } MacDingbat { JAVA }
-macos-35-10.2 { UTR22* } x-mac-turkish { MIME* WINDOWS } windows-10081 { WINDOWS* } mactr x-MacTurkish { JAVA* } # Apple Turkish
-macos-36_2-10.2 { UTR22* } x-mac-croatian { MIME* } x-MacCroatian { JAVA* } MacCroatian { JAVA }
-macos-37_5-10.2 { UTR22* } x-mac-iceland { MIME* } x-MacIceland { JAVA* } MacIceland { JAVA }
-macos-38_2-10.2 { UTR22* } x-mac-romania { MIME* } x-MacRomania { JAVA* } MacRomania { JAVA }
-macos-518-10.2 { UTR22* } x-mac-arabic { MIME* } x-MacArabic { JAVA* } MacArabic { JAVA }
-macos-1285-10.2 { UTR22* } x-mac-hebrew { MIME* } x-MacHebrew { JAVA* } MacHebrew { JAVA }
-
-ibm-1051_P100-1995 { UTR22* } ibm-1051 { IBM* } hp-roman8 { IANA* } roman8 { IANA } r8 { IANA } csHPRoman8 { IANA } x-roman8 { JAVA* } # HP Latin1
-ibm-1276_P100-1995 { UTR22* } ibm-1276 { IBM* } Adobe-Standard-Encoding { IANA* } csAdobeStandardEncoding { IANA } # Different from ISO-Unicode-IBM-1276 (GCSGID: 1276)
-
-ibm-1006_P100-1995 { UTR22* } ibm-1006 { IBM* } IBM1006 { JAVA } cp1006 { JAVA* } 1006 { JAVA } x-IBM1006 { JAVA } # Urdu
-ibm-1098_P100-1995 { UTR22* } ibm-1098 { IBM* } IBM1098 { JAVA } cp1098 { JAVA* } 1098 { JAVA } x-IBM1098 { JAVA } # PC Farsi
-ibm-1124_P100-1996 { UTR22* } ibm-1124 { IBM* JAVA } cp1124 { JAVA* } 1124 { JAVA } x-IBM1124 { JAVA } # ISO Cyrillic Ukraine
-ibm-1125_P100-1997 { UTR22* } ibm-1125 { IBM* } cp1125 # Cyrillic Ukraine PC
-ibm-1129_P100-1997 { UTR22* } ibm-1129 { IBM* } # ISO Vietnamese
-ibm-1131_P100-1997 { UTR22* } ibm-1131 { IBM* } cp1131 # Cyrillic Belarus PC
-ibm-1133_P100-1997 { UTR22* } ibm-1133 { IBM* } # ISO Lao
-
-# GSM 03.38
-gsm-03.38-2009 { UTR22* } GSM0338 # GSM0338 alias is from Perl
-
-# Partially algorithmic converters
-
-# [U_ENABLE_GENERIC_ISO_2022]
-# The _generic_ ISO-2022 converter is disabled starting 2003-dec-03 (ICU 2.8).
-# For details see the icu mailing list from 2003-dec-01 and the ucnv2022.c file.
-# Language-specific variants of ISO-2022 continue to be available as listed below.
-# ISO_2022 ISO-2022
-
-ISO_2022,locale=ja,version=0 ISO-2022-JP { IANA* MIME* JAVA* } csISO2022JP { IANA JAVA } x-windows-iso2022jp { JAVA } x-windows-50220 { JAVA }
-ISO_2022,locale=ja,version=1 ISO-2022-JP-1 { MIME* } JIS_Encoding { IANA* } csJISEncoding { IANA } ibm-5054 { IBM* } JIS x-windows-50221 { JAVA* }
-ISO_2022,locale=ja,version=2 ISO-2022-JP-2 { IANA* MIME* } csISO2022JP2 { IANA }
-ISO_2022,locale=ja,version=3 JIS7
-ISO_2022,locale=ja,version=4 JIS8
-ISO_2022,locale=ko,version=0 ISO-2022-KR { IANA* MIME* JAVA* } csISO2022KR { IANA JAVA } # This uses ibm-949
-ISO_2022,locale=ko,version=1 ibm-25546 { IBM* }
-ISO_2022,locale=zh,version=0 ISO-2022-CN { IANA* JAVA* } csISO2022CN { JAVA } x-ISO-2022-CN-GB { JAVA }
-ISO_2022,locale=zh,version=1 ISO-2022-CN-EXT { IANA* }
-ISO_2022,locale=zh,version=2 ISO-2022-CN-CNS x-ISO-2022-CN-CNS { JAVA* }
-HZ HZ-GB-2312 { IANA* }
-x11-compound-text COMPOUND_TEXT x-compound-text { JAVA* }
-
-ISCII,version=0 x-ISCII91 { JAVA* } x-iscii-de { WINDOWS } windows-57002 { WINDOWS* } iscii-dev ibm-4902 { IBM* } # ibm-806 contains non-standard box drawing symbols.
-ISCII,version=1 x-iscii-be { WINDOWS } windows-57003 { WINDOWS* } iscii-bng windows-57006 { WINDOWS } x-iscii-as { WINDOWS } # be is different from as on Windows.
-ISCII,version=2 x-iscii-pa { WINDOWS } windows-57011 { WINDOWS* } iscii-gur
-ISCII,version=3 x-iscii-gu { WINDOWS } windows-57010 { WINDOWS* } iscii-guj
-ISCII,version=4 x-iscii-or { WINDOWS } windows-57007 { WINDOWS* } iscii-ori
-ISCII,version=5 x-iscii-ta { WINDOWS } windows-57004 { WINDOWS* } iscii-tml
-ISCII,version=6 x-iscii-te { WINDOWS } windows-57005 { WINDOWS* } iscii-tlg
-ISCII,version=7 x-iscii-ka { WINDOWS } windows-57008 { WINDOWS* } iscii-knd
-ISCII,version=8 x-iscii-ma { WINDOWS } windows-57009 { WINDOWS* } iscii-mlm
-
-# Lotus specific
-LMBCS-1 lmbcs ibm-65025 { IBM* }
-
-# These Lotus specific converters still work, but they aren't advertised in this alias table.
-# These are almost never used outside of Lotus software,
-# and they take a lot of time when creating the available converter list.
-# Also Lotus doesn't really use them anyway. It was a mistake to create these LMBCS variant converters in ICU.
-#LMBCS-2
-#LMBCS-3
-#LMBCS-4
-#LMBCS-5
-#LMBCS-6
-#LMBCS-8
-#LMBCS-11
-#LMBCS-16
-#LMBCS-17
-#LMBCS-18
-#LMBCS-19
-
-# EBCDIC codepages according to the CDRA
-
-# without Euro
-ibm-37_P100-1995 { UTR22* } # EBCDIC US
- ibm-37 { IBM* }
- IBM037 { IANA* JAVA }
- ibm-037 # { JAVA }
- ebcdic-cp-us { IANA JAVA }
- ebcdic-cp-ca { IANA JAVA }
- ebcdic-cp-wt { IANA JAVA }
- ebcdic-cp-nl { IANA JAVA }
- csIBM037 { IANA JAVA }
- cp037 { JAVA* }
- 037 { JAVA }
- cpibm37 { JAVA }
- cp37
-
-ibm-273_P100-1995 { UTR22* } ibm-273 { IBM* } IBM273 { IANA* JAVA } CP273 { IANA JAVA* } csIBM273 { IANA } ebcdic-de 273 { JAVA } # EBCDIC Germanay, Austria
-ibm-277_P100-1995 { UTR22* } ibm-277 { IBM* } IBM277 { IANA* JAVA } cp277 { JAVA* } EBCDIC-CP-DK { IANA } EBCDIC-CP-NO { IANA } csIBM277 { IANA } ebcdic-dk 277 { JAVA } # EBCDIC Denmark
-ibm-278_P100-1995 { UTR22* } ibm-278 { IBM* } IBM278 { IANA* JAVA } cp278 { JAVA* } ebcdic-cp-fi { IANA } ebcdic-cp-se { IANA } csIBM278 { IANA } ebcdic-sv { JAVA } 278 { JAVA } # EBCDIC Sweden
-ibm-280_P100-1995 { UTR22* } ibm-280 { IBM* } IBM280 { IANA* JAVA } CP280 { IANA JAVA* } ebcdic-cp-it { IANA } csIBM280 { IANA } 280 { JAVA } # EBCDIC Italy
-ibm-284_P100-1995 { UTR22* } ibm-284 { IBM* } IBM284 { IANA* JAVA } CP284 { IANA JAVA* } ebcdic-cp-es { IANA } csIBM284 { IANA } cpibm284 { JAVA } 284 { JAVA } # EBCDIC Spain
-ibm-285_P100-1995 { UTR22* } ibm-285 { IBM* } IBM285 { IANA* JAVA } CP285 { IANA JAVA* } ebcdic-cp-gb { IANA } csIBM285 { IANA } cpibm285 { JAVA } ebcdic-gb { JAVA } 285 { JAVA } # EBCDIC UK Ireland
-ibm-290_P100-1995 { UTR22* } ibm-290 { IBM* } IBM290 { IANA* } cp290 { IANA } EBCDIC-JP-kana { IANA } csIBM290 { IANA } # host SBCS (Katakana)
-ibm-297_P100-1995 { UTR22* } ibm-297 { IBM* } IBM297 { IANA* JAVA } cp297 { IANA JAVA* } ebcdic-cp-fr { IANA } csIBM297 { IANA } cpibm297 { JAVA } 297 { JAVA } # EBCDIC France
-ibm-420_X120-1999 { UTR22* } ibm-420 { IBM* } IBM420 { IANA* JAVA } cp420 { IANA JAVA* } ebcdic-cp-ar1 { IANA } csIBM420 { IANA } 420 { JAVA } # EBCDIC Arabic (all presentation shapes)
-ibm-424_P100-1995 { UTR22* } ibm-424 { IBM* } IBM424 { IANA* JAVA } cp424 { IANA JAVA* } ebcdic-cp-he { IANA } csIBM424 { IANA } 424 { JAVA } # EBCDIC Hebrew
-ibm-500_P100-1995 { UTR22* } ibm-500 { IBM* } IBM500 { IANA* JAVA } CP500 { IANA JAVA* } ebcdic-cp-be { IANA } csIBM500 { IANA } ebcdic-cp-ch { IANA } 500 # EBCDIC International Latin1
-ibm-803_P100-1999 { UTR22* } ibm-803 { IBM* } cp803 # Old EBCDIC Hebrew
-ibm-838_P100-1995 { UTR22* } ibm-838 { IBM* } IBM838 { JAVA } IBM-Thai { IANA* JAVA } csIBMThai { IANA } cp838 { JAVA* } 838 { JAVA } ibm-9030 { IBM } # EBCDIC Thai. Yes ibm-9030 is an alias.
-ibm-870_P100-1995 { UTR22* } ibm-870 { IBM* } IBM870 { IANA* JAVA } CP870 { IANA JAVA* } ebcdic-cp-roece { IANA } ebcdic-cp-yu { IANA } csIBM870 { IANA } # EBCDIC Latin 2
-ibm-871_P100-1995 { UTR22* } ibm-871 { IBM* } IBM871 { IANA* JAVA } ebcdic-cp-is { IANA JAVA } csIBM871 { IANA JAVA } CP871 { IANA JAVA* } ebcdic-is { JAVA } 871 { JAVA } # EBCDIC Iceland
-ibm-875_P100-1995 { UTR22* } ibm-875 { IBM* } IBM875 { JAVA } cp875 { JAVA* } 875 { JAVA } x-IBM875 { JAVA } # EBCDIC Greek
-ibm-918_P100-1995 { UTR22* } ibm-918 { IBM* } IBM918 { IANA* JAVA } CP918 { IANA JAVA* } ebcdic-cp-ar2 { IANA } csIBM918 { IANA } # EBCDIC Urdu
-ibm-930_P120-1999 { UTR22* } # EBCDIC_STATEFUL Katakana-Kanji Host Mixed.
- ibm-930 { IBM* }
- ibm-5026 { IBM } # Yes this is correct
- IBM930 { JAVA }
- cp930 { JAVA* }
- 930 { JAVA }
- x-IBM930 { JAVA }
- x-IBM930A { JAVA }
-ibm-933_P110-1995 { UTR22* } ibm-933 { IBM* JAVA } cp933 { JAVA* } 933 { JAVA } x-IBM933 { JAVA } # Korea EBCDIC MIXED
-ibm-935_P110-1999 { UTR22* } ibm-935 { IBM* JAVA } cp935 { JAVA* } 935 { JAVA } x-IBM935 { JAVA } # China EBCDIC MIXED. Need to use Unicode, ibm-1388 or gb18030 instead because it is required by the government of China.
-ibm-937_P110-1999 { UTR22* } ibm-937 { IBM* JAVA } cp937 { JAVA* } 937 { JAVA } x-IBM937 { JAVA } # Taiwan EBCDIC MIXED
-ibm-939_P120-1999 { UTR22* } # EBCDIC_STATEFUL Latin-Kanji Host Mixed.
- ibm-939 { IBM* }
- ibm-931 { IBM } # Yes this is correct
- ibm-5035 { IBM } # Yes this is also correct
- IBM939 { JAVA }
- cp939 { JAVA* }
- 939 { JAVA }
- x-IBM939 { JAVA }
- x-IBM939A { JAVA }
-ibm-1025_P100-1995 { UTR22* } ibm-1025 { IBM* JAVA } cp1025 { JAVA* } 1025 { JAVA } x-IBM1025 { JAVA } # EBCDIC Cyrillic
-ibm-1026_P100-1995 { UTR22* } ibm-1026 { IBM* } IBM1026 { IANA* JAVA } CP1026 { IANA JAVA* } csIBM1026 { IANA } 1026 { JAVA } # EBCDIC Turkey
-ibm-1047_P100-1995 { UTR22* } ibm-1047 { IBM* } IBM1047 { IANA* JAVA } cp1047 { JAVA* } 1047 { JAVA } # EBCDIC Open systems Latin1
-ibm-1097_P100-1995 { UTR22* } ibm-1097 { IBM* JAVA } cp1097 { JAVA* } 1097 { JAVA } x-IBM1097 { JAVA } # EBCDIC Farsi
-ibm-1112_P100-1995 { UTR22* } ibm-1112 { IBM* JAVA } cp1112 { JAVA* } 1112 { JAVA } x-IBM1112 { JAVA } # EBCDIC Baltic
-ibm-1114_P100-2001 { UTR22* } ibm-1114 { IBM* } x-IBM1114 { JAVA* }
-ibm-1115_P100-1995 { UTR22* } ibm-1115 { IBM* } x-IBM1115 { JAVA* }
-ibm-1122_P100-1999 { UTR22* } ibm-1122 { IBM* JAVA } cp1122 { JAVA* } 1122 { JAVA } x-IBM1122 { JAVA } # EBCDIC Estonia
-ibm-1123_P100-1995 { UTR22* } ibm-1123 { IBM* JAVA } cp1123 { JAVA* } 1123 { JAVA } x-IBM1123 { JAVA } # EBCDIC Cyrillic Ukraine
-ibm-1130_P100-1997 { UTR22* } ibm-1130 { IBM* } # EBCDIC Vietnamese
-ibm-1132_P100-1998 { UTR22* } ibm-1132 { IBM* } # EBCDIC Lao
-ibm-1137_P100-1999 { UTR22* } ibm-1137 { IBM* } # Devanagari EBCDIC (based on Unicode character set)
-ibm-4517_P100-2005 { UTR22* } ibm-4517 { IBM* } # EBCDIC Arabic. Update of ibm-421
-
-# with Euro
-ibm-1140_P100-1997 { UTR22* } ibm-1140 { IBM* } IBM01140 { IANA* JAVA } CCSID01140 { IANA JAVA } CP01140 { IANA JAVA } cp1140 { JAVA* } ebcdic-us-37+euro { IANA } # EBCDIC US
-ibm-1141_P100-1997 { UTR22* } ibm-1141 { IBM* } IBM01141 { IANA* JAVA } CCSID01141 { IANA JAVA } CP01141 { IANA JAVA } cp1141 { JAVA* } ebcdic-de-273+euro { IANA } # EBCDIC Germanay, Austria
-ibm-1142_P100-1997 { UTR22* } ibm-1142 { IBM* } IBM01142 { IANA* JAVA } CCSID01142 { IANA JAVA } CP01142 { IANA JAVA } cp1142 { JAVA* } ebcdic-dk-277+euro { IANA } ebcdic-no-277+euro { IANA } # EBCDIC Denmark
-ibm-1143_P100-1997 { UTR22* } ibm-1143 { IBM* } IBM01143 { IANA* JAVA } CCSID01143 { IANA JAVA } CP01143 { IANA JAVA } cp1143 { JAVA* } ebcdic-fi-278+euro { IANA } ebcdic-se-278+euro { IANA } # EBCDIC Sweden
-ibm-1144_P100-1997 { UTR22* } ibm-1144 { IBM* } IBM01144 { IANA* JAVA } CCSID01144 { IANA JAVA } CP01144 { IANA JAVA } cp1144 { JAVA* } ebcdic-it-280+euro { IANA } # EBCDIC Italy
-ibm-1145_P100-1997 { UTR22* } ibm-1145 { IBM* } IBM01145 { IANA* JAVA } CCSID01145 { IANA JAVA } CP01145 { IANA JAVA } cp1145 { JAVA* } ebcdic-es-284+euro { IANA } # EBCDIC Spain
-ibm-1146_P100-1997 { UTR22* } ibm-1146 { IBM* } IBM01146 { IANA* JAVA } CCSID01146 { IANA JAVA } CP01146 { IANA JAVA } cp1146 { JAVA* } ebcdic-gb-285+euro { IANA } # EBCDIC UK Ireland
-ibm-1147_P100-1997 { UTR22* } ibm-1147 { IBM* } IBM01147 { IANA* JAVA } CCSID01147 { IANA JAVA } CP01147 { IANA JAVA } cp1147 { JAVA* } ebcdic-fr-297+euro { IANA } # EBCDIC France
-ibm-1148_P100-1997 { UTR22* } ibm-1148 { IBM* } IBM01148 { IANA* JAVA } CCSID01148 { IANA JAVA } CP01148 { IANA JAVA } cp1148 { JAVA* } ebcdic-international-500+euro { IANA } # EBCDIC International Latin1
-ibm-1149_P100-1997 { UTR22* } ibm-1149 { IBM* } IBM01149 { IANA* JAVA } CCSID01149 { IANA JAVA } CP01149 { IANA JAVA } cp1149 { JAVA* } ebcdic-is-871+euro { IANA } # EBCDIC Iceland
-ibm-1153_P100-1999 { UTR22* } ibm-1153 { IBM* } IBM1153 { JAVA } x-IBM1153 { JAVA* } # EBCDIC latin 2
-ibm-1154_P100-1999 { UTR22* } ibm-1154 { IBM* } # EBCDIC Cyrillic Multilingual
-ibm-1155_P100-1999 { UTR22* } ibm-1155 { IBM* } # EBCDIC Turkey
-ibm-1156_P100-1999 { UTR22* } ibm-1156 { IBM* } # EBCDIC Baltic Multilingual
-ibm-1157_P100-1999 { UTR22* } ibm-1157 { IBM* } # EBCDIC Estonia
-ibm-1158_P100-1999 { UTR22* } ibm-1158 { IBM* } # EBCDIC Cyrillic Ukraine
-ibm-1160_P100-1999 { UTR22* } ibm-1160 { IBM* } # EBCDIC Thailand
-ibm-1164_P100-1999 { UTR22* } ibm-1164 { IBM* } # EBCDIC Viet Nam
-ibm-1364_P110-2007 { UTR22* } ibm-1364 { IBM* } x-IBM1364 { JAVA* } # Korean Host Mixed
-ibm-1370_P100-1999 { UTR22* } ibm-1370 { IBM* } x-IBM1370 { JAVA* }
-ibm-1371_P100-1999 { UTR22* } ibm-1371 { IBM* } x-IBM1371 { JAVA* } # Taiwan EBCDIC MIXED (Euro update of ibm-937)
-ibm-1388_P103-2001 { UTR22* } ibm-1388 { IBM* } ibm-9580 { IBM } x-IBM1388 { JAVA* } # S-Ch DBCS-Host Data GBK EBCDIC_STATEFUL. Yes ibm-9580 is an alias.
-ibm-1390_P110-2003 { UTR22* } ibm-1390 { IBM* } x-IBM1390 { JAVA* } # Japan EBCDIC MIXED (JIS X 0213)
-ibm-1399_P110-2003 { UTR22* } ibm-1399 { IBM* } x-IBM1399 { JAVA* } # Host MBCS (Latin-Kanji) (JIS X 0213)
-ibm-5123_P100-1999 { UTR22* } ibm-5123 { IBM* } # Host Roman Jis. Euro update of ibm-1027. SBCS portion of ibm-1390.
-ibm-8482_P100-1999 { UTR22* } ibm-8482 { IBM* } # host SBCS (Katakana). Euro update of ibm-290. SBCS portion of ibm-1399.
-# Yes ibm-20780 is the same as ibm-16684
-ibm-16684_P110-2003 { UTR22* } ibm-16684 { IBM* } ibm-20780 { IBM } # DBCS Jis + Roman Jis Host. This is the DBCS portion of ibm-1390 and ibm-1399 (JIS X 0213).
-ibm-4899_P100-1998 { UTR22* } ibm-4899 { IBM* } # Old EBCDIC Hebrew. Update of ibm-803
-ibm-4971_P100-1999 { UTR22* } ibm-4971 { IBM* } # EBCDIC Greek. Update of ibm-875 and superceded by ibm-9067
-ibm-9067_X100-2005 { UTR22* } ibm-9067 { IBM* } # EBCDIC Greek. Update of ibm-875 and ibm-4971
-ibm-12712_P100-1998 { UTR22* } ibm-12712 { IBM* } ebcdic-he # EBCDIC Hebrew (new sheqel, control characters update). Update of ibm-424
-ibm-16804_X110-1999 { UTR22* } ibm-16804 { IBM* } ebcdic-ar # EBCDIC Arabic. Update of ibm-420
-
-java-Cp1399A-1.6_P { UTR22* } x-IBM1399A { JAVA* }
-java-Cp420s-1.6_P { UTR22* } x-IBM420S { JAVA* }
-java-Cp1390A-1.6_P { UTR22* } x-IBM1390A { JAVA* }
-
-# EBCDIC codepages for S/390, with LF and NL codes swapped
-# Starting with ICU 2.4, the swapping is done by modifying the
-# normal tables at runtime instead of at build time.
-# Append UCNV_SWAP_LFNL_OPTION_STRING to the "ibm-CCSID" name to select this.
-#
-# Example: "ibm-1047,swaplfnl" or "ibm-1047" UCNV_SWAP_LFNL_OPTION_STRING
-#
-# This avoids the duplication of all EBCDIC SBCS and mixed-SBCS/DBCS
-# mapping files.
-
-# Some examples below for declaring old-style, obsolete aliases with the "-s390"
-# suffix to map to the new-style, recommended names with the option added.
-# These are listed here for backward compatibility.
-# Do not use these; instead use the normal converter name with the option
-# added as recommended above.
-
-# Note: It is not possible to define an alias (non-initial name in a line here)
-# that itself contains a converter option like this one for swapping LF<->NL.
-# Such names would never be found because ucnv_open() will first parse and strip
-# options before looking up a name in this table.
-# ucnv_open() then parses the lookup result (the canonical name on the left
-# in lines here) as well.
-
-# This also means that it is not necessary to add anything to convrtrs.txt
-# for converter names like "ibm-1026,swaplfnl" to work -
-# they are already covered by the normal option parsing together with the
-# regular, option-less alias elsewhere in this file.
-
-ibm-37_P100-1995,swaplfnl ibm-37-s390 # ibm037-s390 also matches ibm-37-s390
-ibm-924_P100-1998,swaplfnl ibm-924-s390 IBM924_LF { JAVA* }
-ibm-1047_P100-1995,swaplfnl ibm-1047-s390 IBM1047_LF { JAVA* }
-ibm-1140_P100-1997,swaplfnl ibm-1140-s390
-ibm-1141_P100-1997,swaplfnl ibm-1141-s390 IBM1141_LF { JAVA* }
-ibm-1142_P100-1997,swaplfnl ibm-1142-s390
-ibm-1143_P100-1997,swaplfnl ibm-1143-s390
-ibm-1144_P100-1997,swaplfnl ibm-1144-s390
-ibm-1145_P100-1997,swaplfnl ibm-1145-s390
-ibm-1146_P100-1997,swaplfnl ibm-1146-s390
-ibm-1147_P100-1997,swaplfnl ibm-1147-s390
-ibm-1148_P100-1997,swaplfnl ibm-1148-s390
-ibm-1149_P100-1997,swaplfnl ibm-1149-s390
-ibm-1153_P100-1999,swaplfnl ibm-1153-s390
-ibm-12712_P100-1998,swaplfnl ibm-12712-s390
-ibm-16804_X110-1999,swaplfnl ibm-16804-s390
-
-# This is a special version of ibm-1140 that the XML4C (Xerces) parser team
-# requested in 2000.
-# It maps both EBCDIC LF and NL controls to Unicode LF U+000A.
-
-ebcdic-xml-us
-
-# These are not installed by default. They are rarely used.
-# Many of them can be added through the online ICU Data Library Customization tool
-
-ibm-1004_P100-1995 { UTR22* } ibm-1004 { IBM* }
-ibm-1008_P100-1995 { UTR22* } ibm-1008 { IBM* } # cp1008, 8-bit Arabic (w/o euro update)
-ibm-1009_P100-1995 { UTR22* } ibm-1009 { IBM* }
-ibm-1010_P100-1995 { UTR22* } ibm-1010 { IBM* } NF_Z_62-010 { IANA* } iso-ir-69 { IANA } ISO646-FR { IANA } fr { IANA } csISO69French { IANA }
-ibm-1011_P100-1995 { UTR22* } ibm-1011 { IBM* } DIN_66003 { IANA* } iso-ir-21 { IANA } de { IANA } ISO646-DE { IANA } csISO21German { IANA }
-ibm-1012_P100-1995 { UTR22* } ibm-1012 { IBM* } IT { IANA* } iso-ir-15 { IANA } ISO646-IT { IANA } csISO15Italian { IANA }
-ibm-1013_P100-1995 { UTR22* } ibm-1013 { IBM* } BS_4730 { IANA* } iso-ir-4 { IANA } ISO646-GB { IANA } gb { IANA } uk { IANA } csISO4UnitedKingdom { IANA }
-ibm-1014_P100-1995 { UTR22* } ibm-1014 { IBM* } ES2 { IANA* } iso-ir-85 { IANA } ISO646-ES2 { IANA } csISO85Spanish2 { IANA }
-ibm-1015_P100-1995 { UTR22* } ibm-1015 { IBM* } PT2 { IANA* } iso-ir-84 { IANA } ISO646-PT2 { IANA } csISO84Portuguese2 { IANA }
-ibm-1016_P100-1995 { UTR22* } ibm-1016 { IBM* } NS_4551-1 { IANA* } iso-ir-60 { IANA } ISO646-NO { IANA } no { IANA } csISO60DanishNorwegian { IANA } csISO60Norwegian1 { IANA }
-ibm-1017_P100-1995 { UTR22* } ibm-1017 { IBM* }
-ibm-1018_P100-1995 { UTR22* } ibm-1018 { IBM* } SEN_850200_B { IANA* } iso-ir-10 { IANA } FI { IANA } ISO646-FI { IANA } ISO646-SE { IANA } se { IANA } csISO10Swedish { IANA }
-ibm-1019_P100-1995 { UTR22* } ibm-1019 { IBM* }
-ibm-1020_P100-2003 { UTR22* } ibm-1020 { IBM* } CSA_Z243.4-1985-1 { IANA* } iso-ir-121 { IANA } ISO646-CA { IANA } csa7-1 { IANA } ca { IANA } csISO121Canadian1 { IANA }
-ibm-1021_P100-2003 { UTR22* } ibm-1021 { IBM* }
-ibm-1023_P100-2003 { UTR22* } ibm-1023 { IBM* } ES { IANA* } iso-ir-17 { IANA } ISO646-ES { IANA } csISO17Spanish { IANA }
-ibm-1027_P100-1995 { UTR22* } ibm-1027 { IBM* } x-IBM1027 { JAVA* }
-ibm-1041_P100-1995 { UTR22* } ibm-1041 { IBM* } x-IBM1041 { JAVA* }
-ibm-1043_P100-1995 { UTR22* } ibm-1043 { IBM* } x-IBM1043 { JAVA* }
-ibm-1046_X110-1999 { UTR22* } ibm-1046 { IBM* } x-IBM1046 { JAVA* } x-IBM1046S { JAVA } # Arabic
-ibm-1088_P100-1995 { UTR22* } ibm-1088 { IBM* } x-IBM1088 { JAVA* }
-ibm-1100_P100-2003 { UTR22* } ibm-1100 { IBM* } DEC-MCS { IANA* } dec { IANA } csDECMCS { IANA }
-ibm-1101_P100-2003 { UTR22* } ibm-1101 { IBM* }
-ibm-1102_P100-2003 { UTR22* } ibm-1102 { IBM* }
-ibm-1103_P100-2003 { UTR22* } ibm-1103 { IBM* }
-ibm-1104_P100-2003 { UTR22* } ibm-1104 { IBM* } NF_Z_62-010_1973 iso-ir-25 { IANA* } ISO646-FR1 { IANA } csISO25French { IANA } # NF_Z_62-010_(1973) is the real IANA alias, but () aren't invariant characters.
-ibm-1105_P100-2003 { UTR22* } ibm-1105 { IBM* }
-ibm-1106_P100-2003 { UTR22* } ibm-1106 { IBM* }
-ibm-1107_P100-2003 { UTR22* } ibm-1107 { IBM* } DS_2089 { IANA* } ISO646-DK { IANA } dk { IANA } csISO646Danish { IANA }
-ibm-1127_P100-2004 { UTR22* } ibm-1127 { IBM* }
-ibm-1161_P100-1999 { UTR22* } ibm-1161 { IBM* } # Thai (Euro update of ibm-1129)
-ibm-1163_P100-1999 { UTR22* } ibm-1163 { IBM* } # Vietnamese
-ibm-1165_P101-2000 { UTR22* } ibm-1165 { IBM* } # Vietnamese (EBCDIC)
-ibm-1166_P100-2002 { UTR22* } ibm-1166 { IBM* } # Cyrillic for Kazakhstan
-ibm-1167_P100-2002 { UTR22* } ibm-1167 { IBM* } KOI8-RU x-KOI8_RU { JAVA* }
-ibm-1174_X100-2007 { UTR22* } ibm-1174 { IBM* } KZ-1048 { IANA* } STRK1048-2002 { IANA } RK1048 { IANA } csKZ1048 { IANA }
-ibm-1277_P100-1995 { UTR22* } ibm-1277 { IBM* } # Adobe (Postscript) Latin-1
-ibm-13125_P100-1997 { UTR22* } ibm-13125 { IBM* } # S-Ch (DBCS subset of ibm-4933, ibm-1388)
-ibm-13140_P101-2000 { UTR22* } ibm-13140 { IBM* }
-ibm-13218_P100-1996 { UTR22* } ibm-13218 { IBM* } # Japanese (EBCDIC update of ibm-930)
-ibm-1350_P110-1997 { UTR22* } ibm-1350 { IBM* } x-eucJP-Open { JAVA* } eucJP-Open { JAVA } # Japanese (EUC-JP variant)
-ibm-1351_P110-1997 { UTR22* } ibm-1351 { IBM* } x-IBM1351 { JAVA* } # Japanese (DBCS subset of ibm-5039)
-ibm-1362_P110-1999 { UTR22* } ibm-1362 { IBM* } x-IBM1362 { JAVA* } # Korean (DBCS subset of ibm-1363)
-ibm-13676_P102-2001 { UTR22* } ibm-13676 { IBM* } # Simplified Chinese (EBCDIC)
-ibm-1380_P100-1995 { UTR22* } ibm-1380 { IBM* } x-IBM1380 { JAVA* } # Simplified Chinese (DBCS subset of ibm-1381)
-ibm-1381_P110-1999 { UTR22* } ibm-1381 { IBM* JAVA } cp1381 { JAVA* } 1381 { JAVA } x-IBM1381 { JAVA } # Simplified Chinese PC Data mixed (IBM GB)
-ibm-1382_P100-1995 { UTR22* } ibm-1382 { IBM* } x-IBM1382 { JAVA* } # Simplified Chinese (DBCS subset of ibm-1383)
-ibm-17221_P100-2001 { UTR22* } ibm-17221 { IBM* } # Simplified Chinese (EBCDIC)
-ibm-17248_X110-1999 { UTR22* } ibm-17248 { IBM* } # PC Arabic (w/ euro update) Updated version of ibm-864
-ibm-21344_P101-2000 { UTR22* } ibm-21344 { IBM* } # PC Arabic. Updated version of ibm-864
-ibm-21427_P100-1999 { UTR22* } ibm-21427 { IBM* } # Traditional Chinese (DBCS subset of ibm-1370)
-ibm-256_P100-1995 { UTR22* } ibm-256 { IBM* } # Latin 1 EBCDIC
-ibm-259_P100-1995 { UTR22* } ibm-259 { IBM* } IBM-Symbols { IANA* } csIBMSymbols { IANA }
-ibm-274_P100-2000 { UTR22* } ibm-274 { IBM* } IBM274 { IANA* } EBCDIC-BE { IANA } CP274 { IANA } csIBM274 { IANA }
-ibm-275_P100-1995 { UTR22* } ibm-275 { IBM* } IBM275 { IANA* } EBCDIC-BR { IANA } cp275 { IANA } csIBM275 { IANA }
-ibm-286_P100-2003 { UTR22* } ibm-286 { IBM* } EBCDIC-AT-DE-A { IANA* } csEBCDICATDEA { IANA }
-ibm-293_P100-1995 { UTR22* } ibm-293 { IBM* } # APL EBCDIC (APL: A Programming Language)
-ibm-300_P120-2006 { UTR22* } ibm-300 { IBM* } x-IBM300 { JAVA* } # Japanese (DBCS subset of ibm-930 and ibm-939)
-ibm-301_P110-1997 { UTR22* } ibm-301 { IBM* } x-IBM301 { JAVA* } # Japanese (DBCS subset of ibm-943)
-ibm-33058_P100-2000 { UTR22* } ibm-33058 { IBM* } # SBCS (Katakana)
-ibm-425_P101-2000 { UTR22* } ibm-425 { IBM* } # Arabic (EBCDIC)
-ibm-4930_P110-1999 { UTR22* } ibm-4930 { IBM* } # Korean (DBCS subset of ibm-1364)
-ibm-4933_P100-2002 { UTR22* } ibm-4933 { IBM* } # S-Ch (DBCS subset of ibm-1388)
-ibm-4948_P100-1995 { UTR22* } ibm-4948 { IBM* }
-ibm-4951_P100-1995 { UTR22* } ibm-4951 { IBM* }
-ibm-4952_P100-1995 { UTR22* } ibm-4952 { IBM* }
-ibm-4960_P100-1995 { UTR22* } ibm-4960 { IBM* }
-ibm-5039_P11A-1998 { UTR22* } ibm-5039 { IBM* } # Japanese (HP Shift-JIS variant)
-ibm-5048_P100-1995 { UTR22* } ibm-5048 { IBM* } # Japanese (DBCS subset of ibm-1350, JIS X208-1990)
-ibm-5049_P100-1995 { UTR22* } ibm-5049 { IBM* } # Japanese (DBCS subset of ibm-1350, JIS X212)
-ibm-5067_P100-1995 { UTR22* } ibm-5067 { IBM* } # Korean (DBCS subset of ibm-21450)
-ibm-5104_X110-1999 { UTR22* } ibm-5104 { IBM* } # cp1008, 8-bit Arabic (w/ euro update)
-ibm-5233_P100-2011 { UTR22* } ibm-5233 { IBM* } # Devanagari EBCDIC, including Indian Rupee
-ibm-806_P100-1998 { UTR22* } ibm-806 { IBM* } # Hindi (ISCII variant)
-ibm-808_P100-1999 { UTR22* } ibm-808 { IBM* } x-IBM808 { JAVA* } # Cyrillic
-ibm-833_P100-1995 { UTR22* } ibm-833 { IBM* } x-IBM833 { JAVA* }
-ibm-834_P100-1995 { UTR22* } ibm-834 { IBM* } x-IBM834 { JAVA* } # Korean (DBCS subset of ibm-933)
-ibm-835_P100-1995 { UTR22* } ibm-835 { IBM* } x-IBM835 { JAVA* } # Traditional Chinese (DBCS subset of ibm-5033)
-ibm-836_P100-1995 { UTR22* } ibm-836 { IBM* } x-IBM836 { JAVA* }
-ibm-837_P100-2011 { UTR22* } ibm-837 { IBM* } x-IBM837 { JAVA* } # Simplified Chinese (DBCS subset of ibm-5031)
-ibm-848_P100-1999 { UTR22* } ibm-848 { IBM* } # Cyrillic (euro update of ibm-1125)
-ibm-849_P100-1999 { UTR22* } ibm-849 { IBM* } # Cyrillic Belarus (euro update of ibm-1131)
-ibm-859_P100-1999 { UTR22* } ibm-859 { IBM* } x-IBM859 { JAVA* } # PC Latin 9 (w/ euro update)
-ibm-8612_P100-1995 { UTR22* } ibm-8612 { IBM* } # Arabic (EBCDIC update of ibm-420)
-ibm-872_P100-1999 { UTR22* } ibm-872 { IBM* } # Cyrillic (Euro update of ibm-855)
-ibm-880_P100-1995 { UTR22* } ibm-880 { IBM* } IBM880 { IANA* } cp880 { IANA } EBCDIC-Cyrillic { IANA } csIBM880 { IANA } windows-20880 { WINDOWS* }
-ibm-896_P100-1995 { UTR22* } ibm-896 { IBM* } # SBCS Katakana
-ibm-897_P100-1995 { UTR22* } ibm-897 { IBM* } JIS_X0201 { IANA* } X0201 { IANA } csHalfWidthKatakana { IANA } x-IBM897 { JAVA* }
-ibm-9027_P100-1999 { UTR22* } ibm-9027 { IBM* } # DBCS T-Ch Host. Euro update of ibm-835. DBCS portion of ibm-1371.
-ibm-9048_P100-1998 { UTR22* } ibm-9048 { IBM* } # Hebrew (Euro and Sequel update of ibm-856)
-ibm-905_P100-1995 { UTR22* } ibm-905 { IBM* } IBM905 { IANA* } CP905 { IANA } ebcdic-cp-tr { IANA } csIBM905 { IANA } windows-20905 { WINDOWS* }
-ibm-9056_P100-1995 { UTR22* } ibm-9056 { IBM* } # Arabic
-ibm-9061_P100-1999 { UTR22* } ibm-9061 { IBM* } # Greek (w/ euro update)
-ibm-9145_P110-1997 { UTR22* } ibm-9145 { IBM* } # Japanese (DBCS subset of ibm-5050)
-ibm-9238_X110-1999 { UTR22* } ibm-9238 { IBM* } # cp1046, PC Arabic Extended (w/ euro update)
-ibm-924_P100-1998 { UTR22* } ibm-924 { IBM* } IBM00924 { IANA* } CCSID00924 { IANA } CP00924 { IANA } ebcdic-Latin9--euro { IANA }
-ibm-926_P100-2000 { UTR22* } ibm-926 { IBM* } # Korean (DBCS subset of ibm-944)
-ibm-927_P100-1995 { UTR22* } ibm-927 { IBM* } x-IBM927 { JAVA* } # Traditional Chinese (DBCS subset of ibm-948)
-ibm-928_P100-1995 { UTR22* } ibm-928 { IBM* } # Simplified Chinese (DBCS subset of ibm-936)
-ibm-941_P13A-2001 { UTR22* } ibm-941 { IBM* } # DBCS portion of ibm-943
-ibm-944_P100-1995 { UTR22* } ibm-944 { IBM* } # Korean
-ibm-946_P100-1995 { UTR22* } ibm-946 { IBM* } # Simplified Chinese
-ibm-947_P100-1995 { UTR22* } ibm-947 { IBM* } x-IBM947 { JAVA* } # Traditional Chinese (DBCS subset of ibm-950)
-ibm-948_P110-1999 { UTR22* } ibm-948 { IBM* } x-IBM948 { JAVA* } # Traditional Chinese
-ibm-951_P100-1995 { UTR22* } ibm-951 { IBM* } x-IBM951 { JAVA* } # Korean (DBCS subset of ibm-949)
-ibm-952_P110-1997 { UTR22* } ibm-952 { IBM* } x-JIS0208 # Pure DBCS, Japanese EUC, G1 - JIS X208-1990
-ibm-953_P100-2000 { UTR22* } ibm-953 { IBM* } JIS_X0212-1990 { IANA* } # Pure DBCS, Japanese EUC, G3 - JIS X 0212-1990
-ibm-955_P110-1997 { UTR22* } ibm-955 { IBM* } # Pure DBCS, Japanese EUC, G0 - JIS X208-1978
-ibm-9577_P100-2001 { UTR22* } ibm-9577 { IBM* } ibm-1385 { IBM } x-IBM1385 { JAVA* } # ibm-9577 and ibm-1385 are identical DBCS tables.
-iso-8859_16-2001 { UTR22* } ISO-8859-16 { IANA* } iso-ir-226 { IANA } ISO_8859-16:2001 { IANA } latin10 { IANA } l10 { IANA }
-
-# To be considered for listing at a later date for the data library customization tool
-#ibm-1159_P100-1999 { UTR22* } ibm-1159 { IBM* } # SBCS T-Ch Host. Euro update of ibm-28709. This is used in combination with another CCSID mapping.
-#ibm-960_P100-2000 { UTR22* } ibm-960 { IBM* } # Pure DBCS, CNS11643 plane 1
-#ibm-963_P100-1995 { UTR22* } ibm-963 { IBM* } # Pure DBCS, CNS11643 plane 2 Traditional Chinese (DBCS subset of ibm-965)
+++ /dev/null
-target
-corpus
-artifacts
-coverage
+++ /dev/null
-# This file is automatically @generated by Cargo.
-# It is not intended for manual editing.
-version = 3
-
-[[package]]
-name = "adler"
-version = "1.0.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe"
-
-[[package]]
-name = "android-tzdata"
-version = "0.1.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e999941b234f3131b00bc13c22d06e8c5ff726d1b6318ac7eb276997bbb4fef0"
-
-[[package]]
-name = "android_system_properties"
-version = "0.1.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311"
-dependencies = [
- "libc",
-]
-
-[[package]]
-name = "anstream"
-version = "0.6.14"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "418c75fa768af9c03be99d17643f93f79bbba589895012a80e3452a19ddda15b"
-dependencies = [
- "anstyle",
- "anstyle-parse",
- "anstyle-query",
- "anstyle-wincon",
- "colorchoice",
- "is_terminal_polyfill",
- "utf8parse",
-]
-
-[[package]]
-name = "anstyle"
-version = "1.0.7"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "038dfcf04a5feb68e9c60b21c9625a54c2c0616e79b72b0fd87075a056ae1d1b"
-
-[[package]]
-name = "anstyle-parse"
-version = "0.2.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c03a11a9034d92058ceb6ee011ce58af4a9bf61491aa7e1e59ecd24bd40d22d4"
-dependencies = [
- "utf8parse",
-]
-
-[[package]]
-name = "anstyle-query"
-version = "1.1.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ad186efb764318d35165f1758e7dcef3b10628e26d41a44bc5550652e6804391"
-dependencies = [
- "windows-sys 0.52.0",
-]
-
-[[package]]
-name = "anstyle-wincon"
-version = "3.0.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "61a38449feb7068f52bb06c12759005cf459ee52bb4adc1d5a7c4322d716fb19"
-dependencies = [
- "anstyle",
- "windows-sys 0.52.0",
-]
-
-[[package]]
-name = "anyhow"
-version = "1.0.86"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b3d1d046238990b9cf5bcde22a3fb3584ee5cf65fb2765f454ed428c7a0063da"
-
-[[package]]
-name = "arbitrary"
-version = "1.3.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7d5a26814d8dcb93b0e5a0ff3c6d80a8843bafb21b39e8e18a6f05471870e110"
-
-[[package]]
-name = "atty"
-version = "0.2.14"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8"
-dependencies = [
- "hermit-abi",
- "libc",
- "winapi",
-]
-
-[[package]]
-name = "autocfg"
-version = "1.3.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0c4b4d0bd25bd0b74681c0ad21497610ce1b7c91b1022cd21c80c6fbdd9476b0"
-
-[[package]]
-name = "bitflags"
-version = "2.6.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b048fb63fd8b5923fc5aa7b340d8e156aec7ec02f0c78fa8a6ddc2613f6f71de"
-
-[[package]]
-name = "bumpalo"
-version = "3.16.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c"
-
-[[package]]
-name = "cc"
-version = "1.0.106"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "066fce287b1d4eafef758e89e09d724a24808a9196fe9756b8ca90e86d0719a2"
-dependencies = [
- "jobserver",
- "libc",
- "once_cell",
-]
-
-[[package]]
-name = "cfg-if"
-version = "1.0.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
-
-[[package]]
-name = "chrono"
-version = "0.4.38"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a21f936df1771bf62b77f047b726c4625ff2e8aa607c01ec06e5a05bd8463401"
-dependencies = [
- "android-tzdata",
- "iana-time-zone",
- "js-sys",
- "num-traits",
- "wasm-bindgen",
- "windows-targets 0.52.6",
-]
-
-[[package]]
-name = "clap"
-version = "4.5.8"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "84b3edb18336f4df585bc9aa31dd99c036dfa5dc5e9a2939a722a188f3a8970d"
-dependencies = [
- "clap_builder",
- "clap_derive",
-]
-
-[[package]]
-name = "clap_builder"
-version = "4.5.8"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c1c09dd5ada6c6c78075d6fd0da3f90d8080651e2d6cc8eb2f1aaa4034ced708"
-dependencies = [
- "anstream",
- "anstyle",
- "clap_lex",
- "strsim",
- "terminal_size",
-]
-
-[[package]]
-name = "clap_derive"
-version = "4.5.8"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2bac35c6dafb060fd4d275d9a4ffae97917c13a6327903a8be2153cd964f7085"
-dependencies = [
- "heck",
- "proc-macro2",
- "quote",
- "syn",
-]
-
-[[package]]
-name = "clap_lex"
-version = "0.7.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4b82cf0babdbd58558212896d1a4272303a57bdb245c2bf1147185fb45640e70"
-
-[[package]]
-name = "colorchoice"
-version = "1.0.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0b6a852b24ab71dffc585bcb46eaf7959d175cb865a7152e35b348d1b2960422"
-
-[[package]]
-name = "core-foundation-sys"
-version = "0.8.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "06ea2b9bc92be3c2baa9334a323ebca2d6f074ff852cd1d7b11064035cd3868f"
-
-[[package]]
-name = "crc32fast"
-version = "1.4.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a97769d94ddab943e4510d138150169a2758b5ef3eb191a9ee688de3e23ef7b3"
-dependencies = [
- "cfg-if",
-]
-
-[[package]]
-name = "encoding_rs"
-version = "0.8.34"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b45de904aa0b010bce2ab45264d0631681847fa7b6f2eaa7dab7619943bc4f59"
-dependencies = [
- "cfg-if",
-]
-
-[[package]]
-name = "equivalent"
-version = "1.0.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5"
-
-[[package]]
-name = "errno"
-version = "0.3.9"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "534c5cf6194dfab3db3242765c03bbe257cf92f22b38f6bc0c58d59108a820ba"
-dependencies = [
- "libc",
- "windows-sys 0.52.0",
-]
-
-[[package]]
-name = "finl_unicode"
-version = "1.2.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8fcfdc7a0362c9f4444381a9e697c79d435fe65b52a37466fc2c1184cee9edc6"
-
-[[package]]
-name = "flate2"
-version = "1.0.30"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5f54427cfd1c7829e2a139fcefea601bf088ebca651d2bf53ebc600eac295dae"
-dependencies = [
- "crc32fast",
- "miniz_oxide",
-]
-
-[[package]]
-name = "float_next_after"
-version = "1.0.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8bf7cc16383c4b8d58b9905a8509f02926ce3058053c056376248d958c9df1e8"
-
-[[package]]
-name = "hashbrown"
-version = "0.14.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1"
-
-[[package]]
-name = "heck"
-version = "0.5.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
-
-[[package]]
-name = "hermit-abi"
-version = "0.1.19"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33"
-dependencies = [
- "libc",
-]
-
-[[package]]
-name = "hexplay"
-version = "0.2.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0962bea6731e28b5a443ba4aa00fe3e4fe7555dadf12012435efb738eeac5898"
-dependencies = [
- "atty",
- "termcolor",
-]
-
-[[package]]
-name = "iana-time-zone"
-version = "0.1.60"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e7ffbb5a1b541ea2561f8c41c087286cc091e21e556a4f09a8f6cbf17b69b141"
-dependencies = [
- "android_system_properties",
- "core-foundation-sys",
- "iana-time-zone-haiku",
- "js-sys",
- "wasm-bindgen",
- "windows-core",
-]
-
-[[package]]
-name = "iana-time-zone-haiku"
-version = "0.1.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f"
-dependencies = [
- "cc",
-]
-
-[[package]]
-name = "indexmap"
-version = "2.2.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "168fb715dda47215e360912c096649d23d58bf392ac62f73919e831745e40f26"
-dependencies = [
- "equivalent",
- "hashbrown",
-]
-
-[[package]]
-name = "is_terminal_polyfill"
-version = "1.70.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f8478577c03552c21db0e2724ffb8986a5ce7af88107e6be5d2ee6e158c12800"
-
-[[package]]
-name = "jobserver"
-version = "0.1.31"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d2b099aaa34a9751c5bf0878add70444e1ed2dd73f347be99003d4577277de6e"
-dependencies = [
- "libc",
-]
-
-[[package]]
-name = "js-sys"
-version = "0.3.69"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "29c15563dc2726973df627357ce0c9ddddbea194836909d655df6a75d2cf296d"
-dependencies = [
- "wasm-bindgen",
-]
-
-[[package]]
-name = "lazy_static"
-version = "1.5.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe"
-
-[[package]]
-name = "libc"
-version = "0.2.155"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "97b3888a4aecf77e811145cadf6eef5901f4782c53886191b2f693f24761847c"
-
-[[package]]
-name = "libfuzzer-sys"
-version = "0.4.7"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a96cfd5557eb82f2b83fed4955246c988d331975a002961b07c81584d107e7f7"
-dependencies = [
- "arbitrary",
- "cc",
- "once_cell",
-]
-
-[[package]]
-name = "linux-raw-sys"
-version = "0.4.14"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89"
-
-[[package]]
-name = "log"
-version = "0.4.22"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24"
-
-[[package]]
-name = "miniz_oxide"
-version = "0.7.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b8a240ddb74feaf34a79a7add65a741f3167852fba007066dcac1ca548d89c08"
-dependencies = [
- "adler",
-]
-
-[[package]]
-name = "num"
-version = "0.4.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "35bd024e8b2ff75562e5f34e7f4905839deb4b22955ef5e73d2fea1b9813cb23"
-dependencies = [
- "num-bigint",
- "num-complex",
- "num-integer",
- "num-iter",
- "num-rational",
- "num-traits",
-]
-
-[[package]]
-name = "num-bigint"
-version = "0.4.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a5e44f723f1133c9deac646763579fdb3ac745e418f2a7af9cd0c431da1f20b9"
-dependencies = [
- "num-integer",
- "num-traits",
-]
-
-[[package]]
-name = "num-complex"
-version = "0.4.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "73f88a1307638156682bada9d7604135552957b7818057dcef22705b4d509495"
-dependencies = [
- "num-traits",
-]
-
-[[package]]
-name = "num-derive"
-version = "0.4.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ed3955f1a9c7c0c15e092f9c887db08b1fc683305fdf6eb6684f22555355e202"
-dependencies = [
- "proc-macro2",
- "quote",
- "syn",
-]
-
-[[package]]
-name = "num-integer"
-version = "0.1.46"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f"
-dependencies = [
- "num-traits",
-]
-
-[[package]]
-name = "num-iter"
-version = "0.1.45"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1429034a0490724d0075ebb2bc9e875d6503c3cf69e235a8941aa757d83ef5bf"
-dependencies = [
- "autocfg",
- "num-integer",
- "num-traits",
-]
-
-[[package]]
-name = "num-rational"
-version = "0.4.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f83d14da390562dca69fc84082e73e548e1ad308d24accdedd2720017cb37824"
-dependencies = [
- "num-bigint",
- "num-integer",
- "num-traits",
-]
-
-[[package]]
-name = "num-traits"
-version = "0.2.19"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841"
-dependencies = [
- "autocfg",
-]
-
-[[package]]
-name = "once_cell"
-version = "1.19.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92"
-
-[[package]]
-name = "ordered-float"
-version = "3.9.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f1e1c390732d15f1d48471625cd92d154e66db2c56645e29a9cd26f4699f72dc"
-dependencies = [
- "num-traits",
-]
-
-[[package]]
-name = "proc-macro2"
-version = "1.0.86"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5e719e8df665df0d1c8fbfd238015744736151d4445ec0836b8e628aae103b77"
-dependencies = [
- "unicode-ident",
-]
-
-[[package]]
-name = "pspp"
-version = "1.0.0"
-dependencies = [
- "anyhow",
- "bitflags",
- "chrono",
- "clap",
- "encoding_rs",
- "finl_unicode",
- "flate2",
- "float_next_after",
- "hexplay",
- "indexmap",
- "lazy_static",
- "libc",
- "num",
- "num-derive",
- "num-traits",
- "ordered-float",
- "thiserror",
- "unicase",
- "utf8-decode",
- "windows-sys 0.48.0",
-]
-
-[[package]]
-name = "pspp-fuzz"
-version = "0.0.0"
-dependencies = [
- "libfuzzer-sys",
- "pspp",
-]
-
-[[package]]
-name = "quote"
-version = "1.0.36"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0fa76aaf39101c457836aec0ce2316dbdc3ab723cdda1c6bd4e6ad4208acaca7"
-dependencies = [
- "proc-macro2",
-]
-
-[[package]]
-name = "rustix"
-version = "0.38.34"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "70dc5ec042f7a43c4a73241207cecc9873a06d45debb38b329f8541d85c2730f"
-dependencies = [
- "bitflags",
- "errno",
- "libc",
- "linux-raw-sys",
- "windows-sys 0.52.0",
-]
-
-[[package]]
-name = "strsim"
-version = "0.11.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
-
-[[package]]
-name = "syn"
-version = "2.0.69"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "201fcda3845c23e8212cd466bfebf0bd20694490fc0356ae8e428e0824a915a6"
-dependencies = [
- "proc-macro2",
- "quote",
- "unicode-ident",
-]
-
-[[package]]
-name = "termcolor"
-version = "0.3.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "adc4587ead41bf016f11af03e55a624c06568b5a19db4e90fde573d805074f83"
-dependencies = [
- "wincolor",
-]
-
-[[package]]
-name = "terminal_size"
-version = "0.3.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "21bebf2b7c9e0a515f6e0f8c51dc0f8e4696391e6f1ff30379559f8365fb0df7"
-dependencies = [
- "rustix",
- "windows-sys 0.48.0",
-]
-
-[[package]]
-name = "thiserror"
-version = "1.0.61"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c546c80d6be4bc6a00c0f01730c08df82eaa7a7a61f11d656526506112cc1709"
-dependencies = [
- "thiserror-impl",
-]
-
-[[package]]
-name = "thiserror-impl"
-version = "1.0.61"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "46c3384250002a6d5af4d114f2845d37b57521033f30d5c3f46c4d70e1197533"
-dependencies = [
- "proc-macro2",
- "quote",
- "syn",
-]
-
-[[package]]
-name = "unicase"
-version = "2.7.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f7d2d4dafb69621809a81864c9c1b864479e1235c0dd4e199924b9742439ed89"
-dependencies = [
- "version_check",
-]
-
-[[package]]
-name = "unicode-ident"
-version = "1.0.12"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b"
-
-[[package]]
-name = "utf8-decode"
-version = "1.0.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ca61eb27fa339aa08826a29f03e87b99b4d8f0fc2255306fd266bb1b6a9de498"
-
-[[package]]
-name = "utf8parse"
-version = "0.2.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821"
-
-[[package]]
-name = "version_check"
-version = "0.9.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f"
-
-[[package]]
-name = "wasm-bindgen"
-version = "0.2.92"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4be2531df63900aeb2bca0daaaddec08491ee64ceecbee5076636a3b026795a8"
-dependencies = [
- "cfg-if",
- "wasm-bindgen-macro",
-]
-
-[[package]]
-name = "wasm-bindgen-backend"
-version = "0.2.92"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "614d787b966d3989fa7bb98a654e369c762374fd3213d212cfc0251257e747da"
-dependencies = [
- "bumpalo",
- "log",
- "once_cell",
- "proc-macro2",
- "quote",
- "syn",
- "wasm-bindgen-shared",
-]
-
-[[package]]
-name = "wasm-bindgen-macro"
-version = "0.2.92"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a1f8823de937b71b9460c0c34e25f3da88250760bec0ebac694b49997550d726"
-dependencies = [
- "quote",
- "wasm-bindgen-macro-support",
-]
-
-[[package]]
-name = "wasm-bindgen-macro-support"
-version = "0.2.92"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e94f17b526d0a461a191c78ea52bbce64071ed5c04c9ffe424dcb38f74171bb7"
-dependencies = [
- "proc-macro2",
- "quote",
- "syn",
- "wasm-bindgen-backend",
- "wasm-bindgen-shared",
-]
-
-[[package]]
-name = "wasm-bindgen-shared"
-version = "0.2.92"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "af190c94f2773fdb3729c55b007a722abb5384da03bc0986df4c289bf5567e96"
-
-[[package]]
-name = "winapi"
-version = "0.3.9"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419"
-dependencies = [
- "winapi-i686-pc-windows-gnu",
- "winapi-x86_64-pc-windows-gnu",
-]
-
-[[package]]
-name = "winapi-i686-pc-windows-gnu"
-version = "0.4.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
-
-[[package]]
-name = "winapi-x86_64-pc-windows-gnu"
-version = "0.4.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
-
-[[package]]
-name = "wincolor"
-version = "0.1.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "eeb06499a3a4d44302791052df005d5232b927ed1a9658146d842165c4de7767"
-dependencies = [
- "winapi",
-]
-
-[[package]]
-name = "windows-core"
-version = "0.52.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "33ab640c8d7e35bf8ba19b884ba838ceb4fba93a4e8c65a9059d08afcfc683d9"
-dependencies = [
- "windows-targets 0.52.6",
-]
-
-[[package]]
-name = "windows-sys"
-version = "0.48.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9"
-dependencies = [
- "windows-targets 0.48.5",
-]
-
-[[package]]
-name = "windows-sys"
-version = "0.52.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d"
-dependencies = [
- "windows-targets 0.52.6",
-]
-
-[[package]]
-name = "windows-targets"
-version = "0.48.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c"
-dependencies = [
- "windows_aarch64_gnullvm 0.48.5",
- "windows_aarch64_msvc 0.48.5",
- "windows_i686_gnu 0.48.5",
- "windows_i686_msvc 0.48.5",
- "windows_x86_64_gnu 0.48.5",
- "windows_x86_64_gnullvm 0.48.5",
- "windows_x86_64_msvc 0.48.5",
-]
-
-[[package]]
-name = "windows-targets"
-version = "0.52.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973"
-dependencies = [
- "windows_aarch64_gnullvm 0.52.6",
- "windows_aarch64_msvc 0.52.6",
- "windows_i686_gnu 0.52.6",
- "windows_i686_gnullvm",
- "windows_i686_msvc 0.52.6",
- "windows_x86_64_gnu 0.52.6",
- "windows_x86_64_gnullvm 0.52.6",
- "windows_x86_64_msvc 0.52.6",
-]
-
-[[package]]
-name = "windows_aarch64_gnullvm"
-version = "0.48.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8"
-
-[[package]]
-name = "windows_aarch64_gnullvm"
-version = "0.52.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3"
-
-[[package]]
-name = "windows_aarch64_msvc"
-version = "0.48.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc"
-
-[[package]]
-name = "windows_aarch64_msvc"
-version = "0.52.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469"
-
-[[package]]
-name = "windows_i686_gnu"
-version = "0.48.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e"
-
-[[package]]
-name = "windows_i686_gnu"
-version = "0.52.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b"
-
-[[package]]
-name = "windows_i686_gnullvm"
-version = "0.52.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66"
-
-[[package]]
-name = "windows_i686_msvc"
-version = "0.48.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406"
-
-[[package]]
-name = "windows_i686_msvc"
-version = "0.52.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66"
-
-[[package]]
-name = "windows_x86_64_gnu"
-version = "0.48.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e"
-
-[[package]]
-name = "windows_x86_64_gnu"
-version = "0.52.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78"
-
-[[package]]
-name = "windows_x86_64_gnullvm"
-version = "0.48.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc"
-
-[[package]]
-name = "windows_x86_64_gnullvm"
-version = "0.52.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d"
-
-[[package]]
-name = "windows_x86_64_msvc"
-version = "0.48.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538"
-
-[[package]]
-name = "windows_x86_64_msvc"
-version = "0.52.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
+++ /dev/null
-[package]
-name = "pspp-fuzz"
-version = "0.0.0"
-publish = false
-edition = "2021"
-
-[package.metadata]
-cargo-fuzz = true
-
-[dependencies]
-libfuzzer-sys = "0.4"
-
-[dependencies.pspp]
-path = ".."
-
-[[bin]]
-name = "fuzz_target_1"
-path = "fuzz_targets/fuzz_target_1.rs"
-test = false
-doc = false
-bench = false
-
-[[bin]]
-name = "segment"
-path = "fuzz_targets/segment.rs"
-test = false
-doc = false
-bench = false
+++ /dev/null
-#![no_main]
-
-use libfuzzer_sys::fuzz_target;
-
-fuzz_target!(|data: &[u8]| {
- // fuzzed code goes here
-});
+++ /dev/null
-#![no_main]
-
-use libfuzzer_sys::fuzz_target;
-use pspp::lex::segment::{Segmenter, Mode, Type};
-
-fuzz_target!(|data: &[u8]| {
- if let Ok(mut input) = std::str::from_utf8(data) {
- let mut segmenter = Segmenter::new(Mode::Auto, false);
- loop {
- let (rest, type_) = segmenter.push(input, true).unwrap();
- match type_ {
- Type::End => break,
- _ => (),
- }
- input = rest;
- }
- }
-});
--- /dev/null
+[package]
+name = "pspp-lsp"
+version = "0.1.0"
+edition = "2021"
+
+[dependencies]
+env_logger = "0.11.5"
+log = "0.4.22"
+pspp = { version = "1.0.0", path = "../pspp" }
+tokio = { version = "1.39.3", features = ["full"] }
+tower-lsp = "0.20.0"
--- /dev/null
+use std::collections::HashMap;
+
+use tokio::sync::Mutex;
+use tower_lsp::{
+ jsonrpc::Result,
+ lsp_types::*,
+ Client, LanguageServer, LspService, Server,
+};
+
+#[tokio::main]
+async fn main() {
+ env_logger::init();
+
+ let stdin = tokio::io::stdin();
+ let stdout = tokio::io::stdout();
+
+ let (service, socket) = LspService::build(|client| Backend {
+ client,
+ document_map: Mutex::new(HashMap::new()),
+ })
+ .finish();
+
+ Server::new(stdin, stdout, socket).serve(service).await;
+}
+
+#[derive(Debug)]
+struct Backend {
+ client: Client,
+ document_map: Mutex<HashMap<String, String>>,
+}
+
+#[tower_lsp::async_trait]
+impl LanguageServer for Backend {
+ async fn initialize(&self, params: InitializeParams) -> Result<InitializeResult> {
+ Ok(InitializeResult {
+ server_info: None,
+ capabilities: ServerCapabilities {
+ text_document_sync: Some(TextDocumentSyncCapability::Kind(
+ TextDocumentSyncKind::FULL,
+ )),
+ workspace: Some(WorkspaceServerCapabilities {
+ workspace_folders: Some(WorkspaceFoldersServerCapabilities {
+ supported: Some(true),
+ change_notifications: Some(OneOf::Left(true)),
+ }),
+ file_operations: None,
+ }),
+/*
+ semantic_tokens_provider: Some(
+ SemanticTokensServerCapabilities::SemanticTokensRegistrationOptions(
+ SemanticTokensRegistrationOptions {
+ text_document_registration_options: {
+ TextDocumentRegistrationOptions {
+ document_selector: Some(vec![DocumentFilter {
+ language: Some("pspp".to_string()),
+ scheme: Some("file".to_string()),
+ pattern: None,
+ }]),
+ }
+ },
+ semantic_tokens_options: SemanticTokensOptions {
+ work_done_progress_options: WorkDoneProgressOptions::default(),
+ legend: SemanticTokensLegend {
+ token_types: LEGEND_TYPE.into(),
+ token_modifiers: vec![],
+ },
+ range: Some(true),
+ full: Some(SemanticTokensFullOptions::Bool(true)),
+ },
+ static_registration_options: StaticRegistrationOptions::default(),
+ },
+ ),
+ ),
+*/
+ definition_provider: Some(OneOf::Left(true)),
+ references_provider: Some(OneOf::Left(true)),
+ rename_provider: Some(OneOf::Left(true)),
+ ..ServerCapabilities::default()
+ },
+ })
+ }
+
+ async fn shutdown(&self) -> Result<()> {
+ Ok(())
+ }
+}
--- /dev/null
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+version = 3
+
+[[package]]
+name = "adler"
+version = "1.0.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe"
+
+[[package]]
+name = "android-tzdata"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e999941b234f3131b00bc13c22d06e8c5ff726d1b6318ac7eb276997bbb4fef0"
+
+[[package]]
+name = "android_system_properties"
+version = "0.1.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311"
+dependencies = [
+ "libc",
+]
+
+[[package]]
+name = "anyhow"
+version = "1.0.69"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "224afbd727c3d6e4b90103ece64b8d1b67fbb1973b1046c2281eed3f3803f800"
+
+[[package]]
+name = "atty"
+version = "0.2.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8"
+dependencies = [
+ "hermit-abi 0.1.19",
+ "libc",
+ "winapi",
+]
+
+[[package]]
+name = "autocfg"
+version = "1.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa"
+
+[[package]]
+name = "bitflags"
+version = "1.3.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
+
+[[package]]
+name = "bitflags"
+version = "2.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cf4b9d6a944f767f8e5e0db018570623c85f3d925ac718db4e06d0187adb21c1"
+
+[[package]]
+name = "bumpalo"
+version = "3.13.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a3e2c3daef883ecc1b5d58c15adae93470a91d425f3532ba1695849656af3fc1"
+
+[[package]]
+name = "cc"
+version = "1.0.79"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "50d30906286121d95be3d479533b458f87493b30a4b5f79a607db8f5d11aa91f"
+
+[[package]]
+name = "cfg-if"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
+
+[[package]]
+name = "chardetng"
+version = "0.1.17"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "14b8f0b65b7b08ae3c8187e8d77174de20cb6777864c6b832d8ad365999cf1ea"
+dependencies = [
+ "cfg-if",
+ "encoding_rs",
+ "memchr",
+]
+
+[[package]]
+name = "chrono"
+version = "0.4.26"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ec837a71355b28f6556dbd569b37b3f363091c0bd4b2e735674521b4c5fd9bc5"
+dependencies = [
+ "android-tzdata",
+ "iana-time-zone",
+ "js-sys",
+ "num-traits",
+ "time",
+ "wasm-bindgen",
+ "winapi",
+]
+
+[[package]]
+name = "clap"
+version = "4.1.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2f3061d6db6d8fcbbd4b05e057f2acace52e64e96b498c08c2d7a4e65addd340"
+dependencies = [
+ "bitflags 1.3.2",
+ "clap_derive",
+ "clap_lex",
+ "is-terminal",
+ "once_cell",
+ "strsim",
+ "termcolor 1.2.0",
+ "terminal_size",
+]
+
+[[package]]
+name = "clap_derive"
+version = "4.1.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "34d122164198950ba84a918270a3bb3f7ededd25e15f7451673d986f55bd2667"
+dependencies = [
+ "heck",
+ "proc-macro-error",
+ "proc-macro2",
+ "quote",
+ "syn 1.0.109",
+]
+
+[[package]]
+name = "clap_lex"
+version = "0.3.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "350b9cf31731f9957399229e9b2adc51eeabdfbe9d71d9a0552275fd12710d09"
+dependencies = [
+ "os_str_bytes",
+]
+
+[[package]]
+name = "core-foundation-sys"
+version = "0.8.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e496a50fda8aacccc86d7529e2c1e0892dbd0f898a6b5645b5561b89c3210efa"
+
+[[package]]
+name = "crc32fast"
+version = "1.3.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b540bd8bc810d3885c6ea91e2018302f68baba2129ab3e88f32389ee9370880d"
+dependencies = [
+ "cfg-if",
+]
+
+[[package]]
+name = "diff"
+version = "0.1.13"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "56254986775e3233ffa9c4d7d3faaf6d36a2c09d30b20687e9f88bc8bafc16c8"
+
+[[package]]
+name = "encoding_rs"
+version = "0.8.32"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "071a31f4ee85403370b58aca746f01041ede6f0da2730960ad001edc2b71b394"
+dependencies = [
+ "cfg-if",
+]
+
+[[package]]
+name = "enum-map"
+version = "2.7.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6866f3bfdf8207509a033af1a75a7b08abda06bbaaeae6669323fd5a097df2e9"
+dependencies = [
+ "enum-map-derive",
+]
+
+[[package]]
+name = "enum-map-derive"
+version = "0.17.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f282cfdfe92516eb26c2af8589c274c7c17681f5ecc03c18255fe741c6aa64eb"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.27",
+]
+
+[[package]]
+name = "equivalent"
+version = "1.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5"
+
+[[package]]
+name = "errno"
+version = "0.2.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f639046355ee4f37944e44f60642c6f3a7efa3cf6b78c78a0d989a8ce6c396a1"
+dependencies = [
+ "errno-dragonfly",
+ "libc",
+ "winapi",
+]
+
+[[package]]
+name = "errno"
+version = "0.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4bcfec3a70f97c962c307b2d2c56e358cf1d00b558d74262b5f929ee8cc7e73a"
+dependencies = [
+ "errno-dragonfly",
+ "libc",
+ "windows-sys 0.48.0",
+]
+
+[[package]]
+name = "errno-dragonfly"
+version = "0.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf"
+dependencies = [
+ "cc",
+ "libc",
+]
+
+[[package]]
+name = "finl_unicode"
+version = "1.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8fcfdc7a0362c9f4444381a9e697c79d435fe65b52a37466fc2c1184cee9edc6"
+
+[[package]]
+name = "flagset"
+version = "0.4.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b3ea1ec5f8307826a5b71094dd91fc04d4ae75d5709b20ad351c7fb4815c86ec"
+
+[[package]]
+name = "flate2"
+version = "1.0.26"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3b9429470923de8e8cbd4d2dc513535400b4b3fef0319fb5c4e1f520a7bef743"
+dependencies = [
+ "crc32fast",
+ "miniz_oxide",
+]
+
+[[package]]
+name = "float_next_after"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8bf7cc16383c4b8d58b9905a8509f02926ce3058053c056376248d958c9df1e8"
+
+[[package]]
+name = "hashbrown"
+version = "0.14.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "290f1a1d9242c78d09ce40a5e87e7554ee637af1351968159f4952f028f75604"
+
+[[package]]
+name = "heck"
+version = "0.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8"
+
+[[package]]
+name = "hermit-abi"
+version = "0.1.19"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33"
+dependencies = [
+ "libc",
+]
+
+[[package]]
+name = "hermit-abi"
+version = "0.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fed44880c466736ef9a5c5b5facefb5ed0785676d0c02d612db14e54f0d84286"
+
+[[package]]
+name = "hexplay"
+version = "0.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0962bea6731e28b5a443ba4aa00fe3e4fe7555dadf12012435efb738eeac5898"
+dependencies = [
+ "atty",
+ "termcolor 0.3.6",
+]
+
+[[package]]
+name = "iana-time-zone"
+version = "0.1.57"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2fad5b825842d2b38bd206f3e81d6957625fd7f0a361e345c30e01a0ae2dd613"
+dependencies = [
+ "android_system_properties",
+ "core-foundation-sys",
+ "iana-time-zone-haiku",
+ "js-sys",
+ "wasm-bindgen",
+ "windows",
+]
+
+[[package]]
+name = "iana-time-zone-haiku"
+version = "0.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f"
+dependencies = [
+ "cc",
+]
+
+[[package]]
+name = "indexmap"
+version = "2.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d530e1a18b1cb4c484e6e34556a0d948706958449fca0cab753d649f2bce3d1f"
+dependencies = [
+ "equivalent",
+ "hashbrown",
+]
+
+[[package]]
+name = "io-lifetimes"
+version = "1.0.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1abeb7a0dd0f8181267ff8adc397075586500b81b28a73e8a0208b00fc170fb3"
+dependencies = [
+ "libc",
+ "windows-sys 0.45.0",
+]
+
+[[package]]
+name = "is-terminal"
+version = "0.4.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "21b6b32576413a8e69b90e952e4a026476040d81017b80445deda5f2d3921857"
+dependencies = [
+ "hermit-abi 0.3.1",
+ "io-lifetimes",
+ "rustix 0.36.8",
+ "windows-sys 0.45.0",
+]
+
+[[package]]
+name = "js-sys"
+version = "0.3.64"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c5f195fe497f702db0f318b07fdd68edb16955aed830df8363d837542f8f935a"
+dependencies = [
+ "wasm-bindgen",
+]
+
+[[package]]
+name = "lazy_static"
+version = "1.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
+
+[[package]]
+name = "libc"
+version = "0.2.147"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b4668fb0ea861c1df094127ac5f1da3409a82116a4ba74fca2e58ef927159bb3"
+
+[[package]]
+name = "linux-raw-sys"
+version = "0.1.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f051f77a7c8e6957c0696eac88f26b0117e54f52d3fc682ab19397a8812846a4"
+
+[[package]]
+name = "linux-raw-sys"
+version = "0.3.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ef53942eb7bf7ff43a617b3e2c1c4a5ecf5944a7c1bc12d7ee39bbb15e5c1519"
+
+[[package]]
+name = "log"
+version = "0.4.19"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b06a4cde4c0f271a446782e3eff8de789548ce57dbc8eca9292c27f4a42004b4"
+
+[[package]]
+name = "memchr"
+version = "2.7.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3"
+
+[[package]]
+name = "miniz_oxide"
+version = "0.7.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e7810e0be55b428ada41041c41f32c9f1a42817901b4ccf45fa3d4b6561e74c7"
+dependencies = [
+ "adler",
+]
+
+[[package]]
+name = "num"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "43db66d1170d347f9a065114077f7dccb00c1b9478c89384490a3425279a4606"
+dependencies = [
+ "num-bigint",
+ "num-complex",
+ "num-integer",
+ "num-iter",
+ "num-rational",
+ "num-traits",
+]
+
+[[package]]
+name = "num-bigint"
+version = "0.4.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f93ab6289c7b344a8a9f60f88d80aa20032336fe78da341afc91c8a2341fc75f"
+dependencies = [
+ "autocfg",
+ "num-integer",
+ "num-traits",
+]
+
+[[package]]
+name = "num-complex"
+version = "0.4.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "02e0d21255c828d6f128a1e41534206671e8c3ea0c62f32291e808dc82cff17d"
+dependencies = [
+ "num-traits",
+]
+
+[[package]]
+name = "num-derive"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9e6a0fd4f737c707bd9086cc16c925f294943eb62eb71499e9fd4cf71f8b9f4e"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.27",
+]
+
+[[package]]
+name = "num-integer"
+version = "0.1.45"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "225d3389fb3509a24c93f5c29eb6bde2586b98d9f016636dff58d7c6f7569cd9"
+dependencies = [
+ "autocfg",
+ "num-traits",
+]
+
+[[package]]
+name = "num-iter"
+version = "0.1.43"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7d03e6c028c5dc5cac6e2dec0efda81fc887605bb3d884578bb6d6bf7514e252"
+dependencies = [
+ "autocfg",
+ "num-integer",
+ "num-traits",
+]
+
+[[package]]
+name = "num-rational"
+version = "0.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0638a1c9d0a3c0914158145bc76cff373a75a627e6ecbfb71cbe6f453a5a19b0"
+dependencies = [
+ "autocfg",
+ "num-bigint",
+ "num-integer",
+ "num-traits",
+]
+
+[[package]]
+name = "num-traits"
+version = "0.2.16"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f30b0abd723be7e2ffca1272140fac1a2f084c77ec3e123c192b66af1ee9e6c2"
+dependencies = [
+ "autocfg",
+]
+
+[[package]]
+name = "once_cell"
+version = "1.17.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b7e5500299e16ebb147ae15a00a942af264cf3688f47923b8fc2cd5858f23ad3"
+
+[[package]]
+name = "ordered-float"
+version = "3.7.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2fc2dbde8f8a79f2102cc474ceb0ad68e3b80b85289ea62389b60e66777e4213"
+dependencies = [
+ "num-traits",
+]
+
+[[package]]
+name = "os_str_bytes"
+version = "6.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9b7820b9daea5457c9f21c69448905d723fbd21136ccf521748f23fd49e723ee"
+
+[[package]]
+name = "proc-macro-error"
+version = "1.0.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c"
+dependencies = [
+ "proc-macro-error-attr",
+ "proc-macro2",
+ "quote",
+ "syn 1.0.109",
+ "version_check",
+]
+
+[[package]]
+name = "proc-macro-error-attr"
+version = "1.0.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "version_check",
+]
+
+[[package]]
+name = "proc-macro2"
+version = "1.0.66"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "18fb31db3f9bddb2ea821cde30a9f70117e3f119938b5ee630b7403aa6e2ead9"
+dependencies = [
+ "unicode-ident",
+]
+
+[[package]]
+name = "pspp"
+version = "1.0.0"
+dependencies = [
+ "anyhow",
+ "bitflags 2.5.0",
+ "chardetng",
+ "chrono",
+ "clap",
+ "diff",
+ "encoding_rs",
+ "enum-map",
+ "finl_unicode",
+ "flagset",
+ "flate2",
+ "float_next_after",
+ "hexplay",
+ "indexmap",
+ "lazy_static",
+ "libc",
+ "num",
+ "num-derive",
+ "num-traits",
+ "ordered-float",
+ "thiserror",
+ "unicase",
+ "unicode-width",
+ "utf8-decode",
+ "windows-sys 0.48.0",
+]
+
+[[package]]
+name = "quote"
+version = "1.0.32"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "50f3b39ccfb720540debaa0164757101c08ecb8d326b15358ce76a62c7e85965"
+dependencies = [
+ "proc-macro2",
+]
+
+[[package]]
+name = "rustix"
+version = "0.36.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f43abb88211988493c1abb44a70efa56ff0ce98f233b7b276146f1f3f7ba9644"
+dependencies = [
+ "bitflags 1.3.2",
+ "errno 0.2.8",
+ "io-lifetimes",
+ "libc",
+ "linux-raw-sys 0.1.4",
+ "windows-sys 0.45.0",
+]
+
+[[package]]
+name = "rustix"
+version = "0.37.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "62b24138615de35e32031d041a09032ef3487a616d901ca4db224e7d557efae2"
+dependencies = [
+ "bitflags 1.3.2",
+ "errno 0.3.1",
+ "io-lifetimes",
+ "libc",
+ "linux-raw-sys 0.3.8",
+ "windows-sys 0.45.0",
+]
+
+[[package]]
+name = "strsim"
+version = "0.10.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623"
+
+[[package]]
+name = "syn"
+version = "1.0.109"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "unicode-ident",
+]
+
+[[package]]
+name = "syn"
+version = "2.0.27"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b60f673f44a8255b9c8c657daf66a596d435f2da81a555b06dc644d080ba45e0"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "unicode-ident",
+]
+
+[[package]]
+name = "termcolor"
+version = "0.3.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "adc4587ead41bf016f11af03e55a624c06568b5a19db4e90fde573d805074f83"
+dependencies = [
+ "wincolor",
+]
+
+[[package]]
+name = "termcolor"
+version = "1.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "be55cf8942feac5c765c2c993422806843c9a9a45d4d5c407ad6dd2ea95eb9b6"
+dependencies = [
+ "winapi-util",
+]
+
+[[package]]
+name = "terminal_size"
+version = "0.2.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8e6bf6f19e9f8ed8d4048dc22981458ebcf406d67e94cd422e5ecd73d63b3237"
+dependencies = [
+ "rustix 0.37.3",
+ "windows-sys 0.48.0",
+]
+
+[[package]]
+name = "thiserror"
+version = "1.0.39"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a5ab016db510546d856297882807df8da66a16fb8c4101cb8b30054b0d5b2d9c"
+dependencies = [
+ "thiserror-impl",
+]
+
+[[package]]
+name = "thiserror-impl"
+version = "1.0.39"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5420d42e90af0c38c3290abcca25b9b3bdf379fc9f55c528f53a269d9c9a267e"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 1.0.109",
+]
+
+[[package]]
+name = "time"
+version = "0.1.45"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1b797afad3f312d1c66a56d11d0316f916356d11bd158fbc6ca6389ff6bf805a"
+dependencies = [
+ "libc",
+ "wasi",
+ "winapi",
+]
+
+[[package]]
+name = "unicase"
+version = "2.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "50f37be617794602aabbeee0be4f259dc1778fabe05e2d67ee8f79326d5cb4f6"
+dependencies = [
+ "version_check",
+]
+
+[[package]]
+name = "unicode-ident"
+version = "1.0.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "84a22b9f218b40614adcb3f4ff08b703773ad44fa9423e4e0d346d5db86e4ebc"
+
+[[package]]
+name = "unicode-width"
+version = "0.1.13"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0336d538f7abc86d282a4189614dfaa90810dfc2c6f6427eaf88e16311dd225d"
+
+[[package]]
+name = "utf8-decode"
+version = "1.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ca61eb27fa339aa08826a29f03e87b99b4d8f0fc2255306fd266bb1b6a9de498"
+
+[[package]]
+name = "version_check"
+version = "0.9.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f"
+
+[[package]]
+name = "wasi"
+version = "0.10.0+wasi-snapshot-preview1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1a143597ca7c7793eff794def352d41792a93c481eb1042423ff7ff72ba2c31f"
+
+[[package]]
+name = "wasm-bindgen"
+version = "0.2.87"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7706a72ab36d8cb1f80ffbf0e071533974a60d0a308d01a5d0375bf60499a342"
+dependencies = [
+ "cfg-if",
+ "wasm-bindgen-macro",
+]
+
+[[package]]
+name = "wasm-bindgen-backend"
+version = "0.2.87"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5ef2b6d3c510e9625e5fe6f509ab07d66a760f0885d858736483c32ed7809abd"
+dependencies = [
+ "bumpalo",
+ "log",
+ "once_cell",
+ "proc-macro2",
+ "quote",
+ "syn 2.0.27",
+ "wasm-bindgen-shared",
+]
+
+[[package]]
+name = "wasm-bindgen-macro"
+version = "0.2.87"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dee495e55982a3bd48105a7b947fd2a9b4a8ae3010041b9e0faab3f9cd028f1d"
+dependencies = [
+ "quote",
+ "wasm-bindgen-macro-support",
+]
+
+[[package]]
+name = "wasm-bindgen-macro-support"
+version = "0.2.87"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "54681b18a46765f095758388f2d0cf16eb8d4169b639ab575a8f5693af210c7b"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.27",
+ "wasm-bindgen-backend",
+ "wasm-bindgen-shared",
+]
+
+[[package]]
+name = "wasm-bindgen-shared"
+version = "0.2.87"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ca6ad05a4870b2bf5fe995117d3728437bd27d7cd5f06f13c17443ef369775a1"
+
+[[package]]
+name = "winapi"
+version = "0.3.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419"
+dependencies = [
+ "winapi-i686-pc-windows-gnu",
+ "winapi-x86_64-pc-windows-gnu",
+]
+
+[[package]]
+name = "winapi-i686-pc-windows-gnu"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
+
+[[package]]
+name = "winapi-util"
+version = "0.1.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178"
+dependencies = [
+ "winapi",
+]
+
+[[package]]
+name = "winapi-x86_64-pc-windows-gnu"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
+
+[[package]]
+name = "wincolor"
+version = "0.1.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "eeb06499a3a4d44302791052df005d5232b927ed1a9658146d842165c4de7767"
+dependencies = [
+ "winapi",
+]
+
+[[package]]
+name = "windows"
+version = "0.48.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e686886bc078bc1b0b600cac0147aadb815089b6e4da64016cbd754b6342700f"
+dependencies = [
+ "windows-targets 0.48.1",
+]
+
+[[package]]
+name = "windows-sys"
+version = "0.45.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "75283be5efb2831d37ea142365f009c02ec203cd29a3ebecbc093d52315b66d0"
+dependencies = [
+ "windows-targets 0.42.1",
+]
+
+[[package]]
+name = "windows-sys"
+version = "0.48.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9"
+dependencies = [
+ "windows-targets 0.48.1",
+]
+
+[[package]]
+name = "windows-targets"
+version = "0.42.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8e2522491fbfcd58cc84d47aeb2958948c4b8982e9a2d8a2a35bbaed431390e7"
+dependencies = [
+ "windows_aarch64_gnullvm 0.42.1",
+ "windows_aarch64_msvc 0.42.1",
+ "windows_i686_gnu 0.42.1",
+ "windows_i686_msvc 0.42.1",
+ "windows_x86_64_gnu 0.42.1",
+ "windows_x86_64_gnullvm 0.42.1",
+ "windows_x86_64_msvc 0.42.1",
+]
+
+[[package]]
+name = "windows-targets"
+version = "0.48.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "05d4b17490f70499f20b9e791dcf6a299785ce8af4d709018206dc5b4953e95f"
+dependencies = [
+ "windows_aarch64_gnullvm 0.48.0",
+ "windows_aarch64_msvc 0.48.0",
+ "windows_i686_gnu 0.48.0",
+ "windows_i686_msvc 0.48.0",
+ "windows_x86_64_gnu 0.48.0",
+ "windows_x86_64_gnullvm 0.48.0",
+ "windows_x86_64_msvc 0.48.0",
+]
+
+[[package]]
+name = "windows_aarch64_gnullvm"
+version = "0.42.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8c9864e83243fdec7fc9c5444389dcbbfd258f745e7853198f365e3c4968a608"
+
+[[package]]
+name = "windows_aarch64_gnullvm"
+version = "0.48.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "91ae572e1b79dba883e0d315474df7305d12f569b400fcf90581b06062f7e1bc"
+
+[[package]]
+name = "windows_aarch64_msvc"
+version = "0.42.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4c8b1b673ffc16c47a9ff48570a9d85e25d265735c503681332589af6253c6c7"
+
+[[package]]
+name = "windows_aarch64_msvc"
+version = "0.48.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b2ef27e0d7bdfcfc7b868b317c1d32c641a6fe4629c171b8928c7b08d98d7cf3"
+
+[[package]]
+name = "windows_i686_gnu"
+version = "0.42.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "de3887528ad530ba7bdbb1faa8275ec7a1155a45ffa57c37993960277145d640"
+
+[[package]]
+name = "windows_i686_gnu"
+version = "0.48.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "622a1962a7db830d6fd0a69683c80a18fda201879f0f447f065a3b7467daa241"
+
+[[package]]
+name = "windows_i686_msvc"
+version = "0.42.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bf4d1122317eddd6ff351aa852118a2418ad4214e6613a50e0191f7004372605"
+
+[[package]]
+name = "windows_i686_msvc"
+version = "0.48.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4542c6e364ce21bf45d69fdd2a8e455fa38d316158cfd43b3ac1c5b1b19f8e00"
+
+[[package]]
+name = "windows_x86_64_gnu"
+version = "0.42.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c1040f221285e17ebccbc2591ffdc2d44ee1f9186324dd3e84e99ac68d699c45"
+
+[[package]]
+name = "windows_x86_64_gnu"
+version = "0.48.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ca2b8a661f7628cbd23440e50b05d705db3686f894fc9580820623656af974b1"
+
+[[package]]
+name = "windows_x86_64_gnullvm"
+version = "0.42.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "628bfdf232daa22b0d64fdb62b09fcc36bb01f05a3939e20ab73aaf9470d0463"
+
+[[package]]
+name = "windows_x86_64_gnullvm"
+version = "0.48.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7896dbc1f41e08872e9d5e8f8baa8fdd2677f29468c4e156210174edc7f7b953"
+
+[[package]]
+name = "windows_x86_64_msvc"
+version = "0.42.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "447660ad36a13288b1db4d4248e857b510e8c3a225c822ba4fb748c0aafecffd"
+
+[[package]]
+name = "windows_x86_64_msvc"
+version = "0.48.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1a515f5799fe4961cb532f983ce2b23082366b898e52ffbce459c86f67c8378a"
--- /dev/null
+[package]
+name = "pspp"
+version = "1.0.0"
+edition = "2021"
+authors = [ "Ben Pfaff", "John Darrington" ]
+
+[dependencies]
+anyhow = "1.0.69"
+clap = { version = "4.1.7", features = ["derive", "wrap_help"] }
+encoding_rs = "0.8.32"
+flate2 = "1.0.26"
+float_next_after = "1.0.0"
+hexplay = "0.2.1"
+lazy_static = "1.4.0"
+num = "0.4.0"
+num-derive = "0.4.0"
+num-traits = "0.2.16"
+ordered-float = "3.7.0"
+thiserror = "1.0"
+chrono = "0.4.26"
+finl_unicode = "1.2.0"
+unicase = "2.6.0"
+libc = "0.2.147"
+indexmap = "2.1.0"
+utf8-decode = "1.0.1"
+bitflags = "2.5.0"
+unicode-width = "0.1.13"
+chardetng = "0.1.17"
+enum-map = "2.7.3"
+flagset = "0.4.6"
+
+[target.'cfg(windows)'.dependencies]
+windows-sys = { version = "0.48.0", features = ["Win32_Globalization"] }
+
+[build-dependencies]
+anyhow = "1.0.69"
+
+[[bin]]
+name = "pspp-dump-sav"
+path = "src/main.rs"
+
+[lib]
+path = "src/lib.rs"
+
+[[test]]
+name = "sack"
+path = "tests/sack.rs"
+harness = false
+
+[dev-dependencies]
+diff = "0.1.13"
--- /dev/null
+use anyhow::{anyhow, Result as AnyResult};
+use std::{
+ collections::{BTreeMap, HashSet, VecDeque},
+ env::var_os,
+ fs::{read_to_string, File},
+ io::{Error as IoError, Write},
+ path::{Path, PathBuf},
+};
+
+// Which naming convention a code-page number was extracted from (a "cp*",
+// "ibm-*", or "windows-*" alias). The derived `Ord` is significant: the
+// variant order below is a priority ranking (Codepage < Ibm < Windows), and
+// the output stage selects the preferred source with `keys().max()`.
+#[derive(Copy, Clone, PartialEq, Eq, Ord, PartialOrd)]
+enum Source {
+ Codepage,
+ Ibm,
+ Windows,
+}
+
+// Code page number.
+type CodepageNumber = usize;
+
+/// Processes one converter entry from ICU's `convrtrs.txt`, already split
+/// into whitespace-separated `fields`, and records its alias names in
+/// `codepages`, keyed first by code-page number and then by naming `Source`.
+///
+/// Entries that are empty or that begin with `{` (the affinity-list header
+/// at the top of the file) are ignored.
+fn process_converter<'a>(
+ fields: &Vec<&'a str>,
+ codepages: &mut BTreeMap<CodepageNumber, BTreeMap<Source, Vec<&'a str>>>,
+) {
+ if fields.is_empty() || fields[0] == "{" {
+ return;
+ }
+
+ // Code-page numbers derived from this entry's names, at most one per
+ // `Source`. NOTE(review): if two names share a prefix (e.g. two "cp*"
+ // aliases), the later one overwrites the earlier (`BTreeMap::insert`
+ // replaces) — confirm that "last wins" is intended.
+ let mut cps: BTreeMap<Source, CodepageNumber> = BTreeMap::new();
+ // Names tagged with the IANA standard; the preferred name (`IANA*`) is
+ // pushed to the front so it sorts first.
+ let mut iana = VecDeque::new();
+ // Names tagged with any other standard; a `*`-marked (preferred) name is
+ // pushed to the front.
+ let mut other = VecDeque::new();
+
+ let mut iter = fields.iter().peekable();
+ while let Some(&name) = iter.next() {
+ if iter.next_if(|&&s| s == "{").is_some() {
+ // Collect the standards tags listed between `{` and `}` after the name.
+ let mut standards = HashSet::new();
+ loop {
+ let &standard = iter.next().expect("missing `}` in list of standards");
+ if standard == "}" {
+ break;
+ }
+ standards.insert(standard);
+ }
+
+ if standards.contains("IANA*") {
+ iana.push_front(name);
+ } else if standards.contains("IANA") {
+ iana.push_back(name);
+ } else if standards.iter().any(|&s| s.ends_with('*')) {
+ other.push_front(name);
+ } else {
+ other.push_back(name);
+ }
+ } else {
+ // Untagged names are completely nonstandard.
+ continue;
+ }
+
+ // Derive candidate code-page numbers from conventional name prefixes.
+ if let Some(number) = name.strip_prefix("cp") {
+ if let Ok(number) = number.parse::<CodepageNumber>() {
+ cps.insert(Source::Codepage, number);
+ }
+ }
+
+ if let Some(number) = name.strip_prefix("windows-") {
+ if let Ok(number) = number.parse::<CodepageNumber>() {
+ cps.insert(Source::Windows, number);
+ }
+ }
+
+ if let Some(number) = name.strip_prefix("ibm-") {
+ if let Ok(number) = number.parse::<CodepageNumber>() {
+ cps.insert(Source::Ibm, number);
+ }
+ }
+ }
+
+ // If there are no tagged names then this is completely nonstandard.
+ if iana.is_empty() && other.is_empty() {
+ return;
+ }
+
+ // Final alias list: IANA names first (most authoritative), then the rest,
+ // each group led by its preferred (`*`) name.
+ let all: Vec<&str> = iana.into_iter().chain(other).collect();
+ for (source, number) in cps {
+ codepages
+ .entry(number)
+ .or_default()
+ .insert(source, all.clone());
+ }
+}
+
+/// Writes the generated Rust source file (`encodings.rs`) defining two
+/// `lazy_static` lookup tables built from `codepages`:
+/// `CODEPAGE_NUMBER_TO_NAME` (number -> preferred encoding name) and
+/// `CODEPAGE_NAME_TO_NUMBER` (lowercased name -> number).
+///
+/// Returns any I/O error from creating or writing `file_name`.
+///
+/// NOTE(review): the first generated map is keyed by `i32` but the second
+/// maps to `u32` — confirm with the consuming code whether this asymmetry
+/// is intentional.
+fn write_output(
+ codepages: &BTreeMap<CodepageNumber, BTreeMap<Source, Vec<&str>>>,
+ file_name: &PathBuf,
+) -> Result<(), IoError> {
+ let mut file = File::create(file_name)?;
+
+ // Preamble of the generated file, up through the opening of the
+ // number-to-name map.
+ file.write_all(
+ "\
+use lazy_static::lazy_static;
+use std::collections::HashMap;
+
+lazy_static! {
+ static ref CODEPAGE_NUMBER_TO_NAME: HashMap<i32, &'static str> = {
+ let mut map = HashMap::new();
+"
+ .as_bytes(),
+ )?;
+
+ // For each code-page number, emit the first alias of the highest-priority
+ // source (`keys().max()` picks Windows over Ibm over Codepage, per the
+ // `Source` ordering; the first alias is the preferred name).
+ for (&cpnumber, value) in codepages.iter() {
+ let source = value.keys().max().unwrap();
+ let name = value[source][0];
+ writeln!(file, " map.insert({cpnumber}, \"{name}\");")?;
+ }
+ // Close the first map and open the name-to-number map.
+ file.write_all(
+ " map
+ };
+
+ static ref CODEPAGE_NAME_TO_NUMBER: HashMap<&'static str, u32> = {
+ let mut map = HashMap::new();
+"
+ .as_bytes(),
+ )?;
+
+ // Invert the mapping: lowercased alias name -> (source -> numbers).
+ let mut names: BTreeMap<String, BTreeMap<Source, Vec<CodepageNumber>>> = BTreeMap::new();
+ for (&cpnumber, value) in codepages.iter() {
+ for (&source, value2) in value.iter() {
+ for name in value2.iter().map(|name| name.to_ascii_lowercase()) {
+ names
+ .entry(name)
+ .or_default()
+ .entry(source)
+ .or_default()
+ .push(cpnumber);
+ }
+ }
+ }
+
+ // For each name, `.rev().take(1)` visits only the entry with the greatest
+ // (highest-priority) `Source`; its first recorded number wins.
+ for (name, value) in names.iter() {
+ for (_source, numbers) in value.iter().rev().take(1) {
+ writeln!(file, " map.insert(\"{name}\", {});", numbers[0])?;
+ }
+ }
+ // Close the second map and the lazy_static! block.
+ file.write_all(
+ " map
+ };
+}
+"
+ .as_bytes(),
+ )?;
+
+ Ok(())
+}
+
+/// Build-script entry point: parses `convrtrs.txt` (ICU's converter alias
+/// table, expected next to `Cargo.toml`) and generates `encodings.rs` in
+/// `OUT_DIR` via `write_output`.
+fn main() -> AnyResult<()> {
+ // Cargo build-script protocol: rebuild when this script or the input
+ // data file changes.
+ println!("cargo:rerun-if-changed=build.rs");
+
+ let input_file = Path::new(env!("CARGO_MANIFEST_DIR")).join("convrtrs.txt");
+ println!("cargo:rerun-if-changed={}", input_file.to_string_lossy());
+ let input = read_to_string(&input_file)
+ .map_err(|e| anyhow!("{}: read failed ({e})", input_file.to_string_lossy()))?;
+
+ let mut codepages: BTreeMap<CodepageNumber, BTreeMap<Source, Vec<&str>>> = BTreeMap::new();
+ // Fields accumulated for the converter entry currently being read.
+ let mut converter: Vec<&str> = Vec::new();
+ for line in input.lines() {
+ // Strip a `#` comment (if any) and trailing whitespace.
+ let line = line
+ .find('#')
+ .map(|position| &line[..position])
+ .unwrap_or(line)
+ .trim_end();
+ // In convrtrs.txt, lines starting with whitespace continue the
+ // previous entry; an unindented line starts a new one, so flush the
+ // accumulated entry first.
+ if !line.starts_with([' ', '\t']) {
+ process_converter(&converter, &mut codepages);
+ converter.clear();
+ }
+ converter.extend(line.split_whitespace());
+ }
+ // Flush the final entry, which no following unindented line terminates.
+ process_converter(&converter, &mut codepages);
+
+ let output_file_name = Path::new(&var_os("OUT_DIR").unwrap()).join("encodings.rs");
+
+ write_output(&codepages, &output_file_name)
+ .map_err(|e| anyhow!("{}: write failed ({e})", output_file_name.to_string_lossy()))?;
+
+ Ok(())
+}
--- /dev/null
+# Copyright (C) 2016 and later: Unicode, Inc. and others.
+# License & terms of use: http://www.unicode.org/copyright.html
+# ******************************************************************************
+# *
+# * Copyright (C) 1995-2014, International Business Machines
+# * Corporation and others. All Rights Reserved.
+# *
+# ******************************************************************************
+
+# If this converter alias table looks very confusing, a much easier to
+# understand view can be found at this demo:
+# http://demo.icu-project.org/icu-bin/convexp
+
+# IMPORTANT NOTE
+#
+# This file is not read directly by ICU. If you change it, you need to
+# run gencnval, and eventually run pkgdata to update the representation that
+# ICU uses for aliases. The gencnval tool will normally compile this file into
+# cnvalias.icu. The gencnval -v verbose option will help you when you edit
+# this file.
+
+# Please be friendly to the rest of us that edit this table by
+# keeping this table free of tabs.
+
+# This is an alias file used by the character set converter.
+# A lot of converter information can be found in unicode/ucnv.h, but here
+# is more information about this file.
+#
+# If you are adding a new converter to this list and want to include it in the
+# icu data library, please be sure to add an entry to the appropriate ucm*.mk file
+# (see ucmfiles.mk for more information).
+#
+# Here is the file format using BNF-like syntax:
+#
+# converterTable ::= tags { converterLine* }
+# converterLine ::= converterName [ tags ] { taggedAlias* }'\n'
+# taggedAlias ::= alias [ tags ]
+# tags ::= '{' { tag+ } '}'
+# tag ::= standard['*']
+# converterName ::= [0-9a-zA-Z:_'-']+
+# alias ::= converterName
+#
+# Except for the converter name, aliases are case insensitive.
+# Names are separated by whitespace.
+# Line continuation and comment syntax are similar to the GNU make syntax.
+# Any lines beginning with whitespace (e.g. U+0020 SPACE or U+0009 HORIZONTAL
+# TABULATION) are presumed to be a continuation of the previous line.
+# The # symbol starts a comment and the comment continues till the end of
+# the line.
+#
+# The converter
+#
+# All names can be tagged by including a space-separated list of tags in
+# curly braces, as in ISO_8859-1:1987{IANA*} iso-8859-1 { MIME* } or
+# some-charset{MIME* IANA*}. The order of tags does not matter, and
+# whitespace is allowed between the tagged name and the tags list.
+#
+# The tags can be used to get standard names using ucnv_getStandardName().
+#
+# The complete list of recognized tags used in this file is defined in
+# the affinity list near the beginning of the file.
+#
+# The * after the standard tag denotes that the previous alias is the
+# preferred (default) charset name for that standard. There can only
+# be one of these default charset names per converter.
+
+
+
+# The world is getting more complicated...
+# Supporting XML parsers, HTML, MIME, and similar applications
+# that mark encodings with a charset name can be difficult.
+# Many of these applications and operating systems will update
+# their codepages over time.
+
+# It means that a new codepage, one that differs from an
+# old one by changing a code point, e.g., to the Euro sign,
+# must not get an old alias, because it would mean that
+# old files with this alias would be interpreted differently.
+
+# If a codepage gets updated by assigning characters to previously
+# unassigned code points, then a new name is not necessary.
+# Also, some codepages map unassigned codepage byte values
+# to the same numbers in Unicode for roundtripping. It may be
+# industry practice to keep the encoding name in such a case, too
+# (example: Windows codepages).
+
+# The aliases listed in the list of character sets
+# that is maintained by the IANA (http://www.iana.org/) must
+# not be changed to mean encodings different from what this
+# list shows. Currently, the IANA list is at
+# http://www.iana.org/assignments/character-sets
+# It should also be mentioned that the exact mapping table used for each
+# IANA name usually isn't specified. This means that some other applications
+# and operating systems are left to interpret the exact mappings for the
+# underspecified aliases. For instance, Shift-JIS on a Solaris platform
+# may be different from Shift-JIS on a Windows platform. This is why
+# some of the aliases can be tagged to differentiate different mapping
+# tables with the same alias. If an alias is given to more than one converter,
+# it is considered to be an ambiguous alias, and the affinity list will
+# choose the converter to use when a standard isn't specified with the alias.
+
+# Name matching is case-insensitive. Also, dashes '-', underscores '_'
+# and spaces ' ' are ignored in names (thus cs-iso_latin-1, csisolatin1
+# and "cs iso latin 1" are the same).
+# However, the names in the left column are directly file names
+# or names of algorithmic converters, and their case must not
+# be changed - or else code and/or file names must also be changed.
+# For example, the converter ibm-921 is expected to be the file ibm-921.cnv.
+
+
+
+# The immediately following list is the affinity list of supported standard tags.
+# When multiple converters have the same alias under different standards,
+# the standard nearest to the top of this list with that alias will
+# be the first converter that will be opened. The ordering of the aliases
+# after this affinity list does not affect the preferred alias, but it may
+# affect the order of the returned list of aliases for a given converter.
+#
+# The general ordering is from specific and frequently used to more general
+# or rarely used at the bottom.
+{ UTR22 # Name format specified by https://www.unicode.org/reports/tr22/
+ # ICU # Can also use ICU_FEATURE
+ IBM # The IBM CCSID number is specified by ibm-*
+ WINDOWS # The Microsoft code page identifier number is specified by windows-*. The rest are recognized IE names.
+ JAVA # Source: Sun JDK. Alias name case is ignored, but dashes are not ignored.
+ # GLIBC
+ # AIX
+ # DB2
+ # SOLARIS
+ # APPLE
+ # HPUX
+ IANA # Source: http://www.iana.org/assignments/character-sets
+ MIME # Source: http://www.iana.org/assignments/character-sets
+ # MSIE # MSIE is Internet Explorer, which can be different from Windows (From the IMultiLanguage COM interface)
+ # ZOS_USS # z/OS (os/390) Unix System Services (USS), which has NL<->LF swapping. They have the same format as the IBM tag.
+ }
+
+
+
+# Fully algorithmic converters
+
+UTF-8 { IANA* MIME* JAVA* WINDOWS }
+ ibm-1208 { IBM* } # UTF-8 with IBM PUA
+ ibm-1209 { IBM } # UTF-8
+ ibm-5304 { IBM } # Unicode 2.0, UTF-8 with IBM PUA
+ ibm-5305 { IBM } # Unicode 2.0, UTF-8
+ ibm-13496 { IBM } # Unicode 3.0, UTF-8 with IBM PUA
+ ibm-13497 { IBM } # Unicode 3.0, UTF-8
+ ibm-17592 { IBM } # Unicode 4.0, UTF-8 with IBM PUA
+ ibm-17593 { IBM } # Unicode 4.0, UTF-8
+ windows-65001 { WINDOWS* }
+ cp1208
+ x-UTF_8J
+ unicode-1-1-utf-8
+ unicode-2-0-utf-8
+
+# The ICU 2.2 UTF-16/32 converters detect and write a BOM.
+UTF-16 { IANA* MIME* JAVA* } ISO-10646-UCS-2 { IANA }
+ ibm-1204 { IBM* } # UTF-16 with IBM PUA and BOM sensitive
+ ibm-1205 { IBM } # UTF-16 BOM sensitive
+ unicode
+ csUnicode
+ ucs-2
+# The following Unicode CCSIDs (IBM) are not valid in ICU because they are
+# considered pure DBCS (exactly 2 bytes) of Unicode,
+# and they are a subset of Unicode. ICU does not support their encoding structures.
+# 1400 1401 1402 1410 1414 1415 1446 1447 1448 1449 64770 64771 65520 5496 5497 5498 9592 13688
+UTF-16BE { IANA* MIME* JAVA* } x-utf-16be { JAVA }
+ UnicodeBigUnmarked { JAVA } # java.io name
+ ibm-1200 { IBM* } # UTF-16 BE with IBM PUA
+ ibm-1201 { IBM } # UTF-16 BE
+ ibm-13488 { IBM } # Unicode 2.0, UTF-16 BE with IBM PUA
+ ibm-13489 { IBM } # Unicode 2.0, UTF-16 BE
+ ibm-17584 { IBM } # Unicode 3.0, UTF-16 BE with IBM PUA
+ ibm-17585 { IBM } # Unicode 3.0, UTF-16 BE
+ ibm-21680 { IBM } # Unicode 4.0, UTF-16 BE with IBM PUA
+ ibm-21681 { IBM } # Unicode 4.0, UTF-16 BE
+ ibm-25776 { IBM } # Unicode 4.1, UTF-16 BE with IBM PUA
+ ibm-25777 { IBM } # Unicode 4.1, UTF-16 BE
+ ibm-29872 { IBM } # Unicode 5.0, UTF-16 BE with IBM PUA
+ ibm-29873 { IBM } # Unicode 5.0, UTF-16 BE
+ ibm-61955 { IBM } # UTF-16BE with Gaidai University (Japan) PUA
+ ibm-61956 { IBM } # UTF-16BE with Microsoft HKSCS-Big 5 PUA
+ windows-1201 { WINDOWS* }
+ cp1200
+ cp1201
+ UTF16_BigEndian
+ # ibm-5297 { IBM } # Unicode 2.0, UTF-16 (BE) (reserved, never used)
+ # iso-10646-ucs-2 { JAVA } # This is ambiguous
+ # ibm-61952 is not a valid CCSID because it's Unicode 1.1
+ # ibm-61953 is not a valid CCSID because it's Unicode 1.0
+UTF-16LE { IANA* MIME* JAVA* } x-utf-16le { JAVA }
+ UnicodeLittleUnmarked { JAVA } # java.io name
+ ibm-1202 { IBM* } # UTF-16 LE with IBM PUA
+ ibm-1203 { IBM } # UTF-16 LE
+ ibm-13490 { IBM } # Unicode 2.0, UTF-16 LE with IBM PUA
+ ibm-13491 { IBM } # Unicode 2.0, UTF-16 LE
+ ibm-17586 { IBM } # Unicode 3.0, UTF-16 LE with IBM PUA
+ ibm-17587 { IBM } # Unicode 3.0, UTF-16 LE
+ ibm-21682 { IBM } # Unicode 4.0, UTF-16 LE with IBM PUA
+ ibm-21683 { IBM } # Unicode 4.0, UTF-16 LE
+ ibm-25778 { IBM } # Unicode 4.1, UTF-16 LE with IBM PUA
+ ibm-25779 { IBM } # Unicode 4.1, UTF-16 LE
+ ibm-29874 { IBM } # Unicode 5.0, UTF-16 LE with IBM PUA
+ ibm-29875 { IBM } # Unicode 5.0, UTF-16 LE
+ UTF16_LittleEndian
+ windows-1200 { WINDOWS* }
+
+UTF-32 { IANA* MIME* } ISO-10646-UCS-4 { IANA }
+ ibm-1236 { IBM* } # UTF-32 with IBM PUA and BOM sensitive
+ ibm-1237 { IBM } # UTF-32 BOM sensitive
+ csUCS4
+ ucs-4
+UTF-32BE { IANA* } UTF32_BigEndian
+ ibm-1232 { IBM* } # UTF-32 BE with IBM PUA
+ ibm-1233 { IBM } # UTF-32 BE
+ ibm-9424 { IBM } # Unicode 4.1, UTF-32 BE with IBM PUA
+UTF-32LE { IANA* } UTF32_LittleEndian
+ ibm-1234 { IBM* } # UTF-32 LE, with IBM PUA
+ ibm-1235 { IBM } # UTF-32 LE
+
+# ICU-specific names for special uses
+UTF16_PlatformEndian
+UTF16_OppositeEndian
+
+UTF32_PlatformEndian
+UTF32_OppositeEndian
+
+
+# Java-specific, non-Unicode-standard UTF-16 variants.
+# These are in the Java "Basic Encoding Set (contained in lib/rt.jar)".
+# See the "Supported Encodings" at
+# http://java.sun.com/javase/6/docs/technotes/guides/intl/encoding.doc.html
+# or a newer version of this document.
+#
+# Aliases marked with { JAVA* } are canonical names for java.io and java.lang APIs.
+# Aliases marked with { JAVA } are canonical names for the java.nio API.
+#
+# "BOM" means the Unicode Byte Order Mark, which is the encoding-scheme-specific
+# byte sequence for U+FEFF.
+# "Reverse BOM" means the BOM for the sibling encoding scheme with the
+# opposite endianness. (LE<->BE)
+
+# "Sixteen-bit Unicode (or UCS) Transformation Format, big-endian byte order,
+# with byte-order mark"
+#
+# From Unicode: Writes BOM.
+# To Unicode: Detects and consumes BOM.
+# If there is a "reverse BOM", Java throws
+# MalformedInputException: Incorrect byte-order mark.
+# In this case, ICU4C sets a U_ILLEGAL_ESCAPE_SEQUENCE UErrorCode value
+# and a UCNV_ILLEGAL UConverterCallbackReason.
+UTF-16BE,version=1 UnicodeBig { JAVA* }
+
+# "Sixteen-bit Unicode (or UCS) Transformation Format, little-endian byte order,
+# with byte-order mark"
+#
+# From Unicode: Writes BOM.
+# To Unicode: Detects and consumes BOM.
+# If there is a "reverse BOM", Java throws
+# MalformedInputException: Incorrect byte-order mark.
+# In this case, ICU4C sets a U_ILLEGAL_ESCAPE_SEQUENCE UErrorCode value
+# and a UCNV_ILLEGAL UConverterCallbackReason.
+UTF-16LE,version=1 UnicodeLittle { JAVA* } x-UTF-16LE-BOM { JAVA }
+
+# This one is not mentioned on the "Supported Encodings" page
+# but is available in Java.
+# In Java, this is called "Unicode" but we cannot give it that alias
+# because the standard UTF-16 converter already has a "unicode" alias.
+#
+# From Unicode: Writes BOM.
+# To Unicode: Detects and consumes BOM.
+# If there is no BOM, rather than defaulting to BE, Java throws
+# MalformedInputException: Missing byte-order mark.
+# In this case, ICU4C sets a U_ILLEGAL_ESCAPE_SEQUENCE UErrorCode value
+# and a UCNV_ILLEGAL UConverterCallbackReason.
+UTF-16,version=1
+
+# This is the same as standard UTF-16 but always writes a big-endian byte stream,
+# regardless of the platform endianness, as expected by the Java compatibility tests.
+# See the java.nio.charset.Charset API documentation at
+# http://java.sun.com/javase/6/docs/api/java/nio/charset/Charset.html
+# or a newer version of this document.
+#
+# From Unicode: Write BE BOM and BE bytes
+# To Unicode: Detects and consumes BOM. Defaults to BE.
+UTF-16,version=2
+
+# Note: ICU does not currently support Java-specific, non-Unicode-standard UTF-32 variants.
+# Presumably, these behave analogously to the UTF-16 variants with similar names.
+# UTF_32BE_BOM x-UTF-32BE-BOM
+# UTF_32LE_BOM x-UTF-32LE-BOM
+
+# End of Java-specific, non-Unicode-standard UTF variants.
+
+
+# On UTF-7:
+# RFC 2152 (http://www.imc.org/rfc2152) allows to encode some US-ASCII
+# characters directly or in base64. Especially, the characters in set O
+# as defined in the RFC (!"#$%&*;<=>@[]^_`{|}) may be encoded directly
+# but are not allowed in, e.g., email headers.
+# By default, the ICU UTF-7 converter encodes set O directly.
+# By choosing the option "version=1", set O will be escaped instead.
+# For example:
+# utf7Converter=ucnv_open("UTF-7,version=1");
+#
+# For details about email headers see RFC 2047.
+UTF-7 { IANA* MIME* WINDOWS } windows-65000 { WINDOWS* }
+ unicode-1-1-utf-7
+ unicode-2-0-utf-7
+
+# UTF-EBCDIC doesn't exist in ICU, but the aliases are here for reference.
+#UTF-EBCDIC ibm-1210 { IBM* } ibm-1211 { IBM }
+
+# IMAP-mailbox-name is an ICU-specific name for the encoding of IMAP mailbox names.
+# It is a substantially modified UTF-7 encoding. See the specification in:
+#
+# RFC 2060: INTERNET MESSAGE ACCESS PROTOCOL - VERSION 4rev1
+# (http://www.ietf.org/rfc/rfc2060.txt)
+# Section 5.1.3. Mailbox International Naming Convention
+IMAP-mailbox-name
+
+SCSU { IANA* }
+ ibm-1212 { IBM } # SCSU with IBM PUA
+ ibm-1213 { IBM* } # SCSU
+BOCU-1 { IANA* }
+ csBOCU-1 { IANA }
+ ibm-1214 { IBM } # BOCU-1 with IBM PUA
+ ibm-1215 { IBM* } # BOCU-1
+
+# See https://www.unicode.org/reports/tr26 for this Compatibility Encoding Scheme for UTF-16
+# The Unicode Consortium does not encourage the use of CESU-8
+CESU-8 { IANA* } ibm-9400 { IBM* }
+
+# Standard iso-8859-1, which does not have the Euro update.
+# See iso-8859-15 (latin9) for the Euro update
+ISO-8859-1 { MIME* IANA JAVA* }
+ ibm-819 { IBM* JAVA } # This is not truly ibm-819 because it's missing the fallbacks.
+ IBM819 { IANA }
+ cp819 { IANA JAVA }
+ latin1 { IANA JAVA }
+ 8859_1 { JAVA }
+ csISOLatin1 { IANA JAVA }
+ iso-ir-100 { IANA JAVA }
+ ISO_8859-1:1987 { IANA* JAVA }
+ l1 { IANA JAVA }
+ 819 { JAVA }
+ # windows-28591 { WINDOWS* } # This has odd behavior because it has the Euro update, which isn't correct.
+ # LATIN_1 # Old ICU name
+ # ANSI_X3.110-1983 # This is for a different IANA alias. This isn't iso-8859-1.
+
+US-ASCII { MIME* IANA JAVA WINDOWS }
+ ASCII { JAVA* IANA WINDOWS }
+ ANSI_X3.4-1968 { IANA* WINDOWS }
+ ANSI_X3.4-1986 { IANA WINDOWS }
+ ISO_646.irv:1991 { IANA WINDOWS }
+ iso_646.irv:1983 { JAVA }
+ ISO646-US { JAVA IANA WINDOWS }
+ us { IANA }
+ csASCII { IANA WINDOWS }
+ iso-ir-6 { IANA }
+ cp367 { IANA WINDOWS }
+ ascii7 { JAVA }
+ 646 { JAVA }
+ windows-20127 { WINDOWS* }
+ ibm-367 { IBM* } IBM367 { IANA WINDOWS } # This is not truly ibm-367 because it's missing the fallbacks.
+
+# GB 18030 is partly algorithmic, using the MBCS converter
+gb18030 { IANA* } ibm-1392 { IBM* } windows-54936 { WINDOWS* } GB18030 { MIME* }
+
+# Table-based interchange codepages
+
+# Central Europe
+ibm-912_P100-1995 { UTR22* }
+ ibm-912 { IBM* JAVA }
+ ISO-8859-2 { MIME* IANA JAVA* WINDOWS }
+ ISO_8859-2:1987 { IANA* WINDOWS JAVA }
+ latin2 { IANA WINDOWS JAVA }
+ csISOLatin2 { IANA WINDOWS JAVA }
+ iso-ir-101 { IANA WINDOWS JAVA }
+ l2 { IANA WINDOWS JAVA }
+ 8859_2 { JAVA }
+ cp912 { JAVA }
+ 912 { JAVA }
+ windows-28592 { WINDOWS* }
+
+# Maltese Esperanto
+ibm-913_P100-2000 { UTR22* }
+ ibm-913 { IBM* JAVA }
+ ISO-8859-3 { MIME* IANA WINDOWS JAVA* }
+ ISO_8859-3:1988 { IANA* WINDOWS JAVA }
+ latin3 { IANA JAVA WINDOWS }
+ csISOLatin3 { IANA WINDOWS }
+ iso-ir-109 { IANA WINDOWS JAVA }
+ l3 { IANA WINDOWS JAVA }
+ 8859_3 { JAVA }
+ cp913 { JAVA }
+ 913 { JAVA }
+ windows-28593 { WINDOWS* }
+
+# Baltic
+ibm-914_P100-1995 { UTR22* }
+ ibm-914 { IBM* JAVA }
+ ISO-8859-4 { MIME* IANA WINDOWS JAVA* }
+ latin4 { IANA WINDOWS JAVA }
+ csISOLatin4 { IANA WINDOWS JAVA }
+ iso-ir-110 { IANA WINDOWS JAVA }
+ ISO_8859-4:1988 { IANA* WINDOWS JAVA }
+ l4 { IANA WINDOWS JAVA }
+ 8859_4 { JAVA }
+ cp914 { JAVA }
+ 914 { JAVA }
+ windows-28594 { WINDOWS* }
+
+# Cyrillic
+ibm-915_P100-1995 { UTR22* }
+ ibm-915 { IBM* JAVA }
+ ISO-8859-5 { MIME* IANA WINDOWS JAVA* }
+ cyrillic { IANA WINDOWS JAVA }
+ csISOLatinCyrillic { IANA WINDOWS JAVA }
+ iso-ir-144 { IANA WINDOWS JAVA }
+ ISO_8859-5:1988 { IANA* WINDOWS JAVA }
+ 8859_5 { JAVA }
+ cp915 { JAVA }
+ 915 { JAVA }
+ windows-28595 { WINDOWS* }
+
+glibc-PT154-2.3.3 { UTR22* }
+ PTCP154 { IANA* }
+ csPTCP154
+ PT154
+ CP154
+ Cyrillic-Asian
+
+# Arabic
+# ISO_8859-6-E and ISO_8859-6-I are similar to this charset, but BiDi is done differently
+# From a narrow mapping point of view, there is no difference.
+# -E means explicit. -I means implicit.
+# -E requires the client to handle the ISO 6429 bidirectional controls
+ibm-1089_P100-1995 { UTR22* }
+ ibm-1089 { IBM* JAVA }
+ ISO-8859-6 { MIME* IANA WINDOWS JAVA* }
+ arabic { IANA WINDOWS JAVA }
+ csISOLatinArabic { IANA WINDOWS JAVA }
+ iso-ir-127 { IANA WINDOWS JAVA }
+ ISO_8859-6:1987 { IANA* WINDOWS JAVA }
+ ECMA-114 { IANA JAVA }
+ ASMO-708 { IANA JAVA }
+ 8859_6 { JAVA }
+ cp1089 { JAVA }
+ 1089 { JAVA }
+ windows-28596 { WINDOWS* }
+ ISO-8859-6-I { IANA MIME } # IANA considers this alias different and BiDi needs to be applied.
+ ISO-8859-6-E { IANA MIME } # IANA considers this alias different and BiDi needs to be applied.
+ x-ISO-8859-6S { JAVA }
+
+# ISO Greek (with euro update). This is really ISO_8859-7:2003
+ibm-9005_X110-2007 { UTR22* }
+ ibm-9005 { IBM* }
+ ISO-8859-7 { MIME* IANA JAVA* WINDOWS }
+ 8859_7 { JAVA }
+ greek { IANA JAVA WINDOWS }
+ greek8 { IANA JAVA WINDOWS }
+ ELOT_928 { IANA JAVA WINDOWS }
+ ECMA-118 { IANA JAVA WINDOWS }
+ csISOLatinGreek { IANA JAVA WINDOWS }
+ iso-ir-126 { IANA JAVA WINDOWS }
+ ISO_8859-7:1987 { IANA* JAVA WINDOWS }
+ windows-28597 { WINDOWS* }
+ sun_eu_greek # For Solaris
+
+# ISO Greek (w/o euro update)
+# JDK 1.5 has these aliases.
+ibm-813_P100-1995 { UTR22* }
+ ibm-813 { IBM* JAVA* }
+ cp813 { JAVA }
+ 813 { JAVA }
+
+# hebrew
+# ISO_8859-8-E and ISO_8859-8-I are similar to this charset, but BiDi is done differently
+# From a narrow mapping point of view, there is no difference.
+# -E means explicit. -I means implicit.
+# -E requires the client to handle the ISO 6429 bidirectional controls
+# This matches the official mapping on unicode.org
+ibm-5012_P100-1999 { UTR22* }
+ ibm-5012 { IBM* }
+ ISO-8859-8 { MIME* IANA WINDOWS JAVA* }
+ hebrew { IANA WINDOWS JAVA }
+ csISOLatinHebrew { IANA WINDOWS JAVA }
+ iso-ir-138 { IANA WINDOWS JAVA }
+ ISO_8859-8:1988 { IANA* WINDOWS JAVA }
+ ISO-8859-8-I { IANA MIME } # IANA and Windows consider this alias different, and BiDi needs to be applied.
+ ISO-8859-8-E { IANA MIME } # IANA and Windows consider this alias different, and BiDi needs to be applied.
+ 8859_8 { JAVA }
+ windows-28598 { WINDOWS* } # Hebrew (ISO-Visual). A hybrid between ibm-5012 and ibm-916 with extra PUA mappings.
+ hebrew8 # Reflect HP-UX code page update
+
+# Unfortunately, the Java aliases are split across ibm-916 and ibm-5012
+# Also many platforms are a combination between ibm-916 and ibm-5012 behaviors
+ibm-916_P100-1995 { UTR22* }
+ ibm-916 { IBM* JAVA* }
+ cp916 { JAVA }
+ 916 { JAVA }
+
+# Turkish
+ibm-920_P100-1995 { UTR22* }
+ ibm-920 { IBM* JAVA }
+ ISO-8859-9 { MIME* IANA WINDOWS JAVA* }
+ latin5 { IANA WINDOWS JAVA }
+ csISOLatin5 { IANA JAVA }
+ iso-ir-148 { IANA WINDOWS JAVA }
+ ISO_8859-9:1989 { IANA* WINDOWS }
+ l5 { IANA WINDOWS JAVA }
+ 8859_9 { JAVA }
+ cp920 { JAVA }
+ 920 { JAVA }
+ windows-28599 { WINDOWS* }
+ ECMA-128 # IANA doesn't have this alias 6/24/2002
+ turkish8 # Reflect HP-UX codepage update 8/1/2008
+ turkish # Reflect HP-UX codepage update 8/1/2008
+
+# Nordic languages
+iso-8859_10-1998 { UTR22* } ISO-8859-10 { MIME* IANA* }
+ iso-ir-157 { IANA }
+ l6 { IANA }
+ ISO_8859-10:1992 { IANA }
+ csISOLatin6 { IANA }
+ latin6 { IANA }
+
+# Thai
+# Be warned. There are several iso-8859-11 codepage variants, and they are all incompatible.
+# ISO-8859-11 is a superset of TIS-620. The difference is that ISO-8859-11 contains the C1 control codes.
+iso-8859_11-2001 { UTR22* } ISO-8859-11
+ thai8 # HP-UX alias. HP-UX says TIS-620, but it's closer to ISO-8859-11.
+ x-iso-8859-11 { JAVA* }
+
+# iso-8859-13, PC Baltic (w/o euro update)
+ibm-921_P100-1995 { UTR22* }
+ ibm-921 { IBM* }
+ ISO-8859-13 { IANA* MIME* JAVA* }
+ 8859_13 { JAVA }
+ windows-28603 { WINDOWS* }
+ cp921
+ 921
+ x-IBM921 { JAVA }
+
+# Celtic
+iso-8859_14-1998 { UTR22* } ISO-8859-14 { IANA* }
+ iso-ir-199 { IANA }
+ ISO_8859-14:1998 { IANA }
+ latin8 { IANA }
+ iso-celtic { IANA }
+ l8 { IANA }
+
+# Latin 9
+ibm-923_P100-1998 { UTR22* }
+ ibm-923 { IBM* JAVA }
+ ISO-8859-15 { IANA* MIME* WINDOWS JAVA* }
+ Latin-9 { IANA WINDOWS }
+ l9 { WINDOWS }
+ 8859_15 { JAVA }
+ latin0 { JAVA }
+ csisolatin0 { JAVA }
+ csisolatin9 { JAVA }
+ iso8859_15_fdis { JAVA }
+ cp923 { JAVA }
+ 923 { JAVA }
+ windows-28605 { WINDOWS* }
+
+# CJK encodings
+
+ibm-942_P12A-1999 { UTR22* } # ibm-942_P120 is a rarely used alternate mapping (sjis78 is already old)
+ ibm-942 { IBM* }
+ ibm-932 { IBM }
+ cp932
+ shift_jis78
+ sjis78
+ ibm-942_VSUB_VPUA
+ ibm-932_VSUB_VPUA
+ x-IBM942 { JAVA* }
+ x-IBM942C { JAVA }
+ # Is this "JIS_C6226-1978"?
+
+# ibm-943_P15A-2003 differs from windows-932-2000 only in a few roundtrip mappings:
+# - the usual IBM PC control code rotation (1A-1C-7F)
+# - the Windows table has roundtrips for bytes 80, A0, and FD-FF to U+0080 and PUA
+ibm-943_P15A-2003 { UTR22* }
+ ibm-943 # Leave untagged because this isn't the default
+ Shift_JIS { IANA* MIME* WINDOWS JAVA }
+ MS_Kanji { IANA WINDOWS JAVA }
+ csShiftJIS { IANA WINDOWS JAVA }
+ windows-31j { IANA JAVA } # A further extension of Shift_JIS to include NEC special characters (Row 13)
+ csWindows31J { IANA WINDOWS JAVA } # A further extension of Shift_JIS to include NEC special characters (Row 13)
+ x-sjis { WINDOWS JAVA }
+ x-ms-cp932 { WINDOWS }
+ cp932 { WINDOWS }
+ windows-932 { WINDOWS* }
+ cp943c { JAVA* } # This is slightly different, but the backslash mapping is the same.
+ IBM-943C #{ AIX* } # Add this tag once AIX aliases become available
+ ms932
+ pck # Probably SOLARIS
+ sjis # This might be for ibm-1351
+ ibm-943_VSUB_VPUA
+ x-MS932_0213 { JAVA }
+ x-JISAutoDetect { JAVA }
+ # cp943 # This isn't Windows, and no one else uses it.
+ # IANA says that Windows-31J is an extension to csshiftjis ibm-932
+ibm-943_P130-1999 { UTR22* }
+ ibm-943 { IBM* JAVA }
+ Shift_JIS # Leave untagged because this isn't the default
+ cp943 { JAVA* } # This is slightly different, but the backslash mapping is the same.
+ 943 { JAVA }
+ ibm-943_VASCII_VSUB_VPUA
+ x-IBM943 { JAVA }
+ # japanese. Unicode name is \u30b7\u30d5\u30c8\u7b26\u53f7\u5316\u8868\u73fe
+ibm-33722_P12A_P12A-2009_U2 { UTR22* }
+ ibm-33722 # Leave untagged because this isn't the default
+ ibm-5050 # Leave untagged because this isn't the default, and yes this alias is correct
+ ibm-33722_VPUA
+ IBM-eucJP
+windows-51932-2006 { UTR22* }
+ windows-51932 { WINDOWS* }
+ CP51932 { IANA* }
+ csCP51932
+ibm-33722_P120-1999 { UTR22* } # Japan EUC with \ <-> Yen mapping
+ ibm-33722 { IBM* JAVA }
+ ibm-5050 { IBM } # Yes this is correct
+ cp33722 { JAVA* }
+ 33722 { JAVA }
+ ibm-33722_VASCII_VPUA
+ x-IBM33722 { JAVA }
+ x-IBM33722A { JAVA }
+ x-IBM33722C { JAVA }
+# ibm-954 seems to be almost a superset of ibm-33722 and ibm-1350
+# ibm-1350 seems to be almost a superset of ibm-33722
+# ibm-954 contains more PUA characters than the others.
+ibm-954_P101-2007 { UTR22* }
+ ibm-954 { IBM* }
+ x-IBM954 { JAVA* }
+ x-IBM954C { JAVA }
+ # eucJP # This is closest to Solaris EUC-JP.
+euc-jp-2007 { UTR22* }
+ EUC-JP { MIME* IANA JAVA* WINDOWS* }
+ Extended_UNIX_Code_Packed_Format_for_Japanese { IANA* JAVA WINDOWS }
+ csEUCPkdFmtJapanese { IANA JAVA WINDOWS }
+ X-EUC-JP { MIME JAVA WINDOWS } # Japan EUC. x-euc-jp is a MIME name
+ eucjis {JAVA}
+ ujis # Linux sometimes uses this name. This is an unfortunate generic and rarely used name. Its use is discouraged.
+
+aix-IBM_udcJP-4.3.6 { UTR22* }
+ x-IBM-udcJP { JAVA* }
+
+java-euc_jp_linux-1.6_P { UTR22* }
+ euc-jp-linux
+ x-EUC_JP_LINUX { JAVA* }
+
+java-sjis_0213-1.6_P { UTR22* }
+ x-SJIS_0213 { JAVA* }
+
+# Here are various interpretations and extensions of Big5
+ibm-1373_P100-2002 { UTR22* } # IBM's interpretation of Windows' Taiwan Big-5 without HKSCS extensions
+ ibm-1373 { IBM* }
+ windows-950 # Alternate mapping. Leave untagged. This is the IBM interpretation of a Windows codepage.
+windows-950-2000 { UTR22* }
+ Big5 { IANA* MIME* JAVA* WINDOWS }
+ csBig5 { IANA WINDOWS }
+ windows-950 { WINDOWS* }
+ x-windows-950 { JAVA }
+ x-big5
+ ms950
+ibm-950_P110-1999 { UTR22* } # Taiwan Big-5 (w/o euro update)
+ ibm-950 { IBM* JAVA }
+ cp950 { JAVA* }
+ 950 { JAVA }
+ x-IBM950 { JAVA }
+ibm-1375_P100-2008 { UTR22* } # Big5-HKSCS-2004 with Unicode 3.1 mappings. This uses supplementary characters.
+ ibm-1375 { IBM* }
+ Big5-HKSCS { IANA* JAVA* }
+ big5hk { JAVA }
+ HKSCS-BIG5 # From http://www.openi18n.org/localenameguide/
+ibm-5471_P100-2006 { UTR22* } # Big5-HKSCS-2001 with Unicode 3.0 mappings. This uses many PUA characters.
+ ibm-5471 { IBM* }
+ Big5-HKSCS
+ MS950_HKSCS { JAVA* }
+ hkbig5 # from HP-UX 11i, which can't handle supplementary characters.
+ big5-hkscs:unicode3.0
+ x-MS950-HKSCS { JAVA }
+ # windows-950 # Windows-950 can be w/ or w/o HKSCS extensions. By default it's not.
+ # windows-950_hkscs
+solaris-zh_TW_big5-2.7 { UTR22* }
+ Big5_Solaris { JAVA* }
+ x-Big5-Solaris { JAVA }
+# GBK
+ibm-1386_P100-2001 { UTR22* }
+ ibm-1386 { IBM* }
+ cp1386
+ windows-936 # Alternate mapping. Leave untagged. This is the IBM interpretation of a Windows codepage.
+ ibm-1386_VSUB_VPUA
+windows-936-2000 { UTR22* }
+ GBK { IANA* WINDOWS JAVA* }
+ CP936 { IANA JAVA }
+ MS936 { IANA } # In JDK 1.5, this goes to x-mswin-936. This is an IANA name split.
+ windows-936 { IANA WINDOWS* JAVA }
+
+# Java has two different tables for ibm-1383 and gb2312. We pick closest set for tagging.
+ibm-1383_P110-1999 { UTR22* } # China EUC.
+ ibm-1383 { IBM* JAVA }
+ GB2312 { IANA* MIME* }
+ csGB2312 { IANA }
+ cp1383 { JAVA* }
+ 1383 { JAVA }
+ EUC-CN # According to other platforms, windows-20936 looks more like euc-cn. x-euc-cn is also a MIME name
+ ibm-eucCN
+ hp15CN # From HP-UX?
+ ibm-1383_VPUA
+ # gb # This is not an IANA name. gb in IANA means Great Britain.
+
+ibm-5478_P100-1995 { UTR22* } ibm-5478 { IBM* } # This gb_2312_80 DBCS mapping is needed by iso-2022.
+ GB_2312-80 { IANA* } # Windows maps this alias incorrectly
+ chinese { IANA }
+ iso-ir-58 { IANA }
+ csISO58GB231280 { IANA }
+ gb2312-1980
+ GB2312.1980-0 # From X11R6
+
+euc-tw-2014 { UTR22* } # Updated EUC-TW converter based on ibm-964
+ EUC-TW
+
+ibm-964_P110-1999 { UTR22* } # Taiwan EUC. x-euc-tw is a MIME name
+ ibm-964 { IBM* JAVA }
+ ibm-eucTW
+ cns11643
+ cp964 { JAVA* }
+ 964 { JAVA }
+ ibm-964_VPUA
+ x-IBM964 { JAVA }
+
+# ISO-2022 needs one, and other people may need others.
+ibm-949_P110-1999 { UTR22* }
+ ibm-949 { IBM* JAVA }
+ cp949 { JAVA* }
+ 949 { JAVA }
+ ibm-949_VASCII_VSUB_VPUA
+ x-IBM949 { JAVA }
+ibm-949_P11A-1999 { UTR22* }
+ ibm-949 # Leave untagged because this isn't the default
+ cp949c { JAVA* }
+ ibm-949_VSUB_VPUA
+ x-IBM949C { JAVA }
+ IBM-949C { JAVA }
+
+# Korean EUC.
+#
+# <quote from="Jungshik Shin">
+# EUC-KR = KS X 1003/ISO 646-KR or ISO 646-IRV/US-ASCII in GL and KS X 1001:1998 (formerly KS C 5601-1987) in GR.
+#
+# Although widely spread on MS Windows, using
+# KS C 5601 or related names to denote EUC-KR or
+# windows-949 is very much misleading. KS C 5601-1987
+# is NOT suitable as a designation for MIME charset
+# and MBCS. It's just the name of a 94 x 94 Korean
+# coded character set standard which can be invoked
+# on either GL (with MSB reset) or GR (with MSB set).
+# Note that JOHAB (windows-1361) specified in
+# KS X 1001:1998 annex 3 (KS C 5601-1992 annex 3)
+# is a _separate_ MBCS with a _completely different_
+# mapping.
+# </quote>
+#
+# The following aliases try to mirror the poor state of alias recognition
+# on these platforms.
+#
+# ibm-970 is almost a subset of ibm-1363.
+# Java, Solaris and AIX use euc-kr to also mean ksc5601.
+# Java has both ibm-970 and EUC-KR as separate converters.
+ibm-970_P110_P110-2006_U2 { UTR22* }
+ ibm-970 { IBM* JAVA }
+ EUC-KR { IANA* MIME* WINDOWS JAVA }
+ KS_C_5601-1987 { JAVA }
+ windows-51949 { WINDOWS* }
+ csEUCKR { IANA WINDOWS } # x-euc-kr is also a MIME name
+ ibm-eucKR { JAVA }
+ KSC_5601 { JAVA } # Needed by iso-2022
+ 5601 { JAVA }
+ cp970 { JAVA* }
+ 970 { JAVA }
+ ibm-970_VPUA
+ x-IBM970 { JAVA }
+
+# ibm-971 is almost the set of DBCS mappings of ibm-970
+ibm-971_P100-1995 ibm-971 { IBM* } ibm-971_VPUA x-IBM971 { JAVA* }
+
+# Java, Solaris and AIX use euc-kr to also mean ksc5601, and _sometimes_ for Windows too.
+# ibm-1363 is almost a superset of ibm-970.
+ibm-1363_P11B-1998 { UTR22* }
+ ibm-1363 # Leave untagged because this isn't the default
+ KS_C_5601-1987 { IANA* }
+ KS_C_5601-1989 { IANA }
+ KSC_5601 { IANA }
+ csKSC56011987 { IANA }
+ korean { IANA }
+ iso-ir-149 { IANA }
+ cp1363 { MIME* }
+ 5601
+ ksc
+ windows-949 # Alternate mapping. Leave untagged. This is the IBM interpretation of a Windows codepage.
+ ibm-1363_VSUB_VPUA
+ x-IBM1363C { JAVA* }
+ # ks_x_1001:1992
+ # ksc5601-1992
+
+ibm-1363_P110-1997 { UTR22* } # Korean KSC MBCS with \ <-> Won mapping
+ ibm-1363 { IBM* }
+ ibm-1363_VASCII_VSUB_VPUA
+ x-IBM1363 { JAVA* }
+
+windows-949-2000 { UTR22* }
+ windows-949 { JAVA* WINDOWS* }
+ KS_C_5601-1987 { WINDOWS }
+ KS_C_5601-1989 { WINDOWS }
+ KSC_5601 { MIME* WINDOWS } # Needed by iso-2022
+ csKSC56011987 { WINDOWS }
+ korean { WINDOWS }
+ iso-ir-149 { WINDOWS }
+ ms949 { JAVA }
+ x-KSC5601 { JAVA }
+
+windows-1361-2000 { UTR22* }
+ ksc5601_1992
+ ms1361
+ johab
+ x-Johab { JAVA* }
+
+windows-874-2000 { UTR22* } # Thai (w/ euro update)
+ TIS-620 { WINDOWS }
+ windows-874 { JAVA* WINDOWS* }
+ MS874 { JAVA }
+ x-windows-874 { JAVA }
+ # iso-8859-11 { WINDOWS } # iso-8859-11 is similar to TIS-620. ibm-13162 is a closer match.
+
+ibm-874_P100-1995 { UTR22* } # Thai PC (w/o euro update).
+ ibm-874 { IBM* JAVA }
+ ibm-9066 { IBM } # Yes ibm-874 == ibm-9066. ibm-1161 has the euro update.
+ cp874 { JAVA* }
+ TIS-620 { IANA* JAVA } # This is actually separate from ibm-874, which is similar to this table
+ tis620.2533 { JAVA } # This is actually separate from ibm-874, which is similar to this table
+ eucTH # eucTH is an unusual alias from Solaris. eucTH has fewer mappings than TIS620
+ x-IBM874 { JAVA }
+
+ibm-1162_P100-1999 { UTR22* } # Thai (w/ euro update)
+ ibm-1162 { IBM* }
+
+windows-864-2000 { UTR22* }
+ ibm-864s
+ cp864s
+ x-IBM864S { JAVA* }
+
+# Platform codepages
+# If Java supports the IBM prefix, it should also support the ibm- prefix too.
+ibm-437_P100-1995 { UTR22* } ibm-437 { IBM* } IBM437 { IANA* WINDOWS JAVA } cp437 { IANA WINDOWS JAVA* } 437 { IANA WINDOWS JAVA } csPC8CodePage437 { IANA JAVA } windows-437 { WINDOWS* } # PC US
+ibm-720_P100-1997 { UTR22* } ibm-720 { IBM* } windows-720 { WINDOWS* } DOS-720 { WINDOWS } x-IBM720 { JAVA* } # PC Arabic
+ibm-737_P100-1997 { UTR22* } ibm-737 { IBM* } IBM737 { WINDOWS JAVA } cp737 { JAVA* } windows-737 { WINDOWS* } 737 { JAVA } x-IBM737 { JAVA } # PC Greek
+ibm-775_P100-1996 { UTR22* } ibm-775 { IBM* } IBM775 { IANA* WINDOWS JAVA } cp775 { IANA WINDOWS JAVA* } csPC775Baltic { IANA } windows-775 { WINDOWS* } 775 { JAVA } # PC Baltic
+ibm-850_P100-1995 { UTR22* } ibm-850 { IBM* } IBM850 { IANA* MIME* WINDOWS JAVA } cp850 { IANA MIME WINDOWS JAVA* } 850 { IANA JAVA } csPC850Multilingual { IANA JAVA } windows-850 { WINDOWS* } # PC latin1
+ibm-851_P100-1995 { UTR22* } ibm-851 { IBM* } IBM851 { IANA* } cp851 { IANA MIME* } 851 { IANA } csPC851 { IANA } # PC DOS Greek (w/o euro)
+ibm-852_P100-1995 { UTR22* } ibm-852 { IBM* } IBM852 { IANA* WINDOWS JAVA } cp852 { IANA WINDOWS JAVA* } 852 { IANA WINDOWS JAVA } csPCp852 { IANA JAVA } windows-852 { WINDOWS* } # PC latin2 (w/o euro update)
+ibm-855_P100-1995 { UTR22* } ibm-855 { IBM* } IBM855 { IANA* JAVA } cp855 { IANA JAVA* } 855 { IANA } csIBM855 { IANA } csPCp855 { JAVA } windows-855 { WINDOWS* } # PC cyrillic (w/o euro update)
+ibm-856_P100-1995 { UTR22* } ibm-856 { IBM* } IBM856 { JAVA } cp856 { JAVA* } 856 { JAVA } x-IBM856 { JAVA } # PC Hebrew implicit order
+ibm-857_P100-1995 { UTR22* } ibm-857 { IBM* } IBM857 { IANA* MIME* WINDOWS JAVA } cp857 { IANA MIME JAVA* } 857 { IANA JAVA } csIBM857 { IANA JAVA } windows-857 { WINDOWS* } # PC Latin 5 (w/o euro update)
+ibm-858_P100-1997 { UTR22* } ibm-858 { IBM* } IBM00858 { IANA* MIME* JAVA } CCSID00858 { IANA JAVA } CP00858 { IANA JAVA } PC-Multilingual-850+euro { IANA } cp858 { MIME JAVA* } windows-858 { WINDOWS* } # PC latin1 with Euro
+ibm-860_P100-1995 { UTR22* } ibm-860 { IBM* } IBM860 { IANA* MIME* JAVA } cp860 { IANA MIME JAVA* } 860 { IANA JAVA } csIBM860 { IANA JAVA } # PC Portugal
+ibm-861_P100-1995 { UTR22* } ibm-861 { IBM* } IBM861 { IANA* MIME* WINDOWS JAVA } cp861 { IANA MIME JAVA* } 861 { IANA JAVA } cp-is { IANA JAVA } csIBM861 { IANA JAVA } windows-861 { WINDOWS* } # PC Iceland
+ibm-862_P100-1995 { UTR22* } ibm-862 { IBM* } IBM862 { IANA* MIME* JAVA } cp862 { IANA MIME JAVA* } 862 { IANA JAVA } csPC862LatinHebrew { IANA JAVA } DOS-862 { WINDOWS } windows-862 { WINDOWS* } # PC Hebrew visual order (w/o euro update)
+ibm-863_P100-1995 { UTR22* } ibm-863 { IBM* } IBM863 { IANA* MIME* JAVA } cp863 { IANA MIME JAVA* } 863 { IANA JAVA } csIBM863 { IANA JAVA } # PC Canadian French
+ibm-864_X110-1999 { UTR22* } ibm-864 { IBM* } IBM864 { IANA* MIME* JAVA } cp864 { IANA MIME JAVA* } csIBM864 { IANA JAVA } # PC Arabic (w/o euro update)
+ibm-865_P100-1995 { UTR22* } ibm-865 { IBM* } IBM865 { IANA* MIME* JAVA } cp865 { IANA MIME JAVA* } 865 { IANA JAVA } csIBM865 { IANA JAVA } # PC Nordic
+ibm-866_P100-1995 { UTR22* } ibm-866 { IBM* } IBM866 { IANA* MIME* JAVA } cp866 { IANA MIME WINDOWS JAVA* } 866 { IANA JAVA } csIBM866 { IANA JAVA } windows-866 { WINDOWS* } # PC Russian (w/o euro update)
+ibm-867_P100-1998 { UTR22* } ibm-867 { IBM* } x-IBM867 { JAVA* } # PC Hebrew (w/ euro update) Updated version of ibm-862
+ibm-868_P100-1995 { UTR22* } ibm-868 { IBM* } IBM868 { IANA* MIME* JAVA } CP868 { IANA MIME JAVA* } 868 { JAVA } csIBM868 { IANA } cp-ar { IANA } # PC Urdu
+ibm-869_P100-1995 { UTR22* } ibm-869 { IBM* } IBM869 { IANA* MIME* WINDOWS JAVA } cp869 { IANA MIME JAVA* } 869 { IANA JAVA } cp-gr { IANA JAVA } csIBM869 { IANA JAVA } windows-869 { WINDOWS* } # PC Greek (w/o euro update)
+ibm-878_P100-1996 { UTR22* } ibm-878 { IBM* } KOI8-R { IANA* MIME* WINDOWS JAVA* } koi8 { WINDOWS JAVA } csKOI8R { IANA WINDOWS JAVA } windows-20866 { WINDOWS* } cp878 # Russian internet
+ibm-901_P100-1999 { UTR22* } ibm-901 { IBM* } # PC Baltic (w/ euro update), update of ibm-921
+ibm-902_P100-1999 { UTR22* } ibm-902 { IBM* } # PC Estonian (w/ euro update), update of ibm-922
+ibm-922_P100-1999 { UTR22* } ibm-922 { IBM* } IBM922 { JAVA } cp922 { JAVA* } 922 { JAVA } x-IBM922 { JAVA } # PC Estonian (w/o euro update)
+ibm-1168_P100-2002 { UTR22* } ibm-1168 { IBM* } KOI8-U { IANA* WINDOWS } windows-21866 { WINDOWS* } # Ukrainian KOI8. koi8-ru != KOI8-U and Microsoft is wrong for aliasing them as the same.
+ibm-4909_P100-1999 { UTR22* } ibm-4909 { IBM* } # ISO Greek (w/ euro update), update of ibm-813
+
+# The cp aliases in this section aren't really windows aliases, but it was used by ICU for Windows.
+# cp is usually used to denote IBM in Java, and that is why we don't do that anymore.
+# The windows-* aliases mean windows codepages.
+ibm-5346_P100-1998 { UTR22* } ibm-5346 { IBM* } windows-1250 { IANA* JAVA* WINDOWS* } cp1250 { WINDOWS JAVA } # Windows Latin2 (w/ euro update)
+ibm-5347_P100-1998 { UTR22* } ibm-5347 { IBM* } windows-1251 { IANA* JAVA* WINDOWS* } cp1251 { WINDOWS JAVA } ANSI1251 # Windows Cyrillic (w/ euro update). ANSI1251 is from Solaris
+ibm-5348_P100-1997 { UTR22* } ibm-5348 { IBM* } windows-1252 { IANA* JAVA* WINDOWS* } cp1252 { JAVA } # Windows Latin1 (w/ euro update)
+ibm-5349_P100-1998 { UTR22* } ibm-5349 { IBM* } windows-1253 { IANA* JAVA* WINDOWS* } cp1253 { JAVA } # Windows Greek (w/ euro update)
+ibm-5350_P100-1998 { UTR22* } ibm-5350 { IBM* } windows-1254 { IANA* JAVA* WINDOWS* } cp1254 { JAVA } # Windows Turkish (w/ euro update)
+ibm-9447_P100-2002 { UTR22* } ibm-9447 { IBM* } windows-1255 { IANA* JAVA* WINDOWS* } cp1255 { JAVA } # Windows Hebrew (w/ euro update)
+ibm-9448_X100-2005 { UTR22* } ibm-9448 { IBM* } windows-1256 { IANA* JAVA* WINDOWS* } cp1256 { WINDOWS JAVA } x-windows-1256S { JAVA } # Windows Arabic (w/ euro update)
+ibm-9449_P100-2002 { UTR22* } ibm-9449 { IBM* } windows-1257 { IANA* JAVA* WINDOWS* } cp1257 { JAVA } # Windows Baltic (w/ euro update)
+ibm-5354_P100-1998 { UTR22* } ibm-5354 { IBM* } windows-1258 { IANA* JAVA* WINDOWS* } cp1258 { JAVA } # Windows Vietnamese (w/ euro update)
+
+# These tables are out of date, and most don't have the Euro
+# Leave the windows- variants untagged. They are alternate tables of the newer ones above.
+ibm-1250_P100-1995 { UTR22* } ibm-1250 { IBM* } windows-1250 # Old Windows Latin2 (w/o euro update)
+ibm-1251_P100-1995 { UTR22* } ibm-1251 { IBM* } windows-1251 # Old Windows Cyrillic (w/o euro update)
+ibm-1252_P100-2000 { UTR22* } ibm-1252 { IBM* } windows-1252 # Old Windows Latin 1 without Euro
+ibm-1253_P100-1995 { UTR22* } ibm-1253 { IBM* } windows-1253 # Old Windows Greek (w/o euro update)
+ibm-1254_P100-1995 { UTR22* } ibm-1254 { IBM* } windows-1254 # Old Windows Turkish (w/o euro update)
+ibm-1255_P100-1995 { UTR22* } ibm-1255 { IBM* } # Very old Windows Hebrew (w/o euro update)
+ibm-5351_P100-1998 { UTR22* } ibm-5351 { IBM* } windows-1255 # Old Windows Hebrew (w/ euro update)
+ibm-1256_P110-1997 { UTR22* } ibm-1256 { IBM* } # Old Windows Arabic (w/o euro update)
+ibm-5352_P100-1998 { UTR22* } ibm-5352 { IBM* } windows-1256 # Somewhat old Windows Arabic (w/ euro update)
+ibm-1257_P100-1995 { UTR22* } ibm-1257 { IBM* } # Old Windows Baltic (w/o euro update)
+ibm-5353_P100-1998 { UTR22* } ibm-5353 { IBM* } windows-1257 # Somewhat old Windows Baltic (w/ euro update)
+ibm-1258_P100-1997 { UTR22* } ibm-1258 { IBM* } windows-1258 # Old Windows Vietnamese (w/o euro update)
+
+macos-0_2-10.2 { UTR22* } macintosh { IANA* MIME* WINDOWS } mac { IANA } csMacintosh { IANA } windows-10000 { WINDOWS* } macroman { JAVA } x-macroman { JAVA* } # Apple latin 1
+macos-6_2-10.4 { UTR22* } x-mac-greek { MIME* WINDOWS } windows-10006 { WINDOWS* } macgr x-MacGreek { JAVA* } # Apple Greek
+macos-7_3-10.2 { UTR22* } x-mac-cyrillic { MIME* WINDOWS } windows-10007 { WINDOWS* } mac-cyrillic maccy x-MacCyrillic { JAVA } x-MacUkraine { JAVA* } # Apple Cyrillic
+macos-21-10.5 { UTR22* } x-mac-thai { MIME* } x-MacThai { JAVA* } MacThai { JAVA }
+macos-29-10.2 { UTR22* } x-mac-centraleurroman { MIME* } windows-10029 { WINDOWS* } x-mac-ce { WINDOWS } macce maccentraleurope x-MacCentralEurope { JAVA* } # Apple Central Europe
+macos-33-10.5 { UTR22* } x-mac-symbol { MIME* } x-MacSymbol { JAVA* } MacSymbol { JAVA }
+macos-34-10.2 { UTR22* } x-mac-dingbat { MIME* } x-MacDingbat { JAVA* } MacDingbat { JAVA }
+macos-35-10.2 { UTR22* } x-mac-turkish { MIME* WINDOWS } windows-10081 { WINDOWS* } mactr x-MacTurkish { JAVA* } # Apple Turkish
+macos-36_2-10.2 { UTR22* } x-mac-croatian { MIME* } x-MacCroatian { JAVA* } MacCroatian { JAVA }
+macos-37_5-10.2 { UTR22* } x-mac-iceland { MIME* } x-MacIceland { JAVA* } MacIceland { JAVA }
+macos-38_2-10.2 { UTR22* } x-mac-romania { MIME* } x-MacRomania { JAVA* } MacRomania { JAVA }
+macos-518-10.2 { UTR22* } x-mac-arabic { MIME* } x-MacArabic { JAVA* } MacArabic { JAVA }
+macos-1285-10.2 { UTR22* } x-mac-hebrew { MIME* } x-MacHebrew { JAVA* } MacHebrew { JAVA }
+
+ibm-1051_P100-1995 { UTR22* } ibm-1051 { IBM* } hp-roman8 { IANA* } roman8 { IANA } r8 { IANA } csHPRoman8 { IANA } x-roman8 { JAVA* } # HP Latin1
+ibm-1276_P100-1995 { UTR22* } ibm-1276 { IBM* } Adobe-Standard-Encoding { IANA* } csAdobeStandardEncoding { IANA } # Different from ISO-Unicode-IBM-1276 (GCSGID: 1276)
+
+ibm-1006_P100-1995 { UTR22* } ibm-1006 { IBM* } IBM1006 { JAVA } cp1006 { JAVA* } 1006 { JAVA } x-IBM1006 { JAVA } # Urdu
+ibm-1098_P100-1995 { UTR22* } ibm-1098 { IBM* } IBM1098 { JAVA } cp1098 { JAVA* } 1098 { JAVA } x-IBM1098 { JAVA } # PC Farsi
+ibm-1124_P100-1996 { UTR22* } ibm-1124 { IBM* JAVA } cp1124 { JAVA* } 1124 { JAVA } x-IBM1124 { JAVA } # ISO Cyrillic Ukraine
+ibm-1125_P100-1997 { UTR22* } ibm-1125 { IBM* } cp1125 # Cyrillic Ukraine PC
+ibm-1129_P100-1997 { UTR22* } ibm-1129 { IBM* } # ISO Vietnamese
+ibm-1131_P100-1997 { UTR22* } ibm-1131 { IBM* } cp1131 # Cyrillic Belarus PC
+ibm-1133_P100-1997 { UTR22* } ibm-1133 { IBM* } # ISO Lao
+
+# GSM 03.38
+gsm-03.38-2009 { UTR22* } GSM0338 # GSM0338 alias is from Perl
+
+# Partially algorithmic converters
+
+# [U_ENABLE_GENERIC_ISO_2022]
+# The _generic_ ISO-2022 converter is disabled starting 2003-dec-03 (ICU 2.8).
+# For details see the icu mailing list from 2003-dec-01 and the ucnv2022.c file.
+# Language-specific variants of ISO-2022 continue to be available as listed below.
+# ISO_2022 ISO-2022
+
+ISO_2022,locale=ja,version=0 ISO-2022-JP { IANA* MIME* JAVA* } csISO2022JP { IANA JAVA } x-windows-iso2022jp { JAVA } x-windows-50220 { JAVA }
+ISO_2022,locale=ja,version=1 ISO-2022-JP-1 { MIME* } JIS_Encoding { IANA* } csJISEncoding { IANA } ibm-5054 { IBM* } JIS x-windows-50221 { JAVA* }
+ISO_2022,locale=ja,version=2 ISO-2022-JP-2 { IANA* MIME* } csISO2022JP2 { IANA }
+ISO_2022,locale=ja,version=3 JIS7
+ISO_2022,locale=ja,version=4 JIS8
+ISO_2022,locale=ko,version=0 ISO-2022-KR { IANA* MIME* JAVA* } csISO2022KR { IANA JAVA } # This uses ibm-949
+ISO_2022,locale=ko,version=1 ibm-25546 { IBM* }
+ISO_2022,locale=zh,version=0 ISO-2022-CN { IANA* JAVA* } csISO2022CN { JAVA } x-ISO-2022-CN-GB { JAVA }
+ISO_2022,locale=zh,version=1 ISO-2022-CN-EXT { IANA* }
+ISO_2022,locale=zh,version=2 ISO-2022-CN-CNS x-ISO-2022-CN-CNS { JAVA* }
+HZ HZ-GB-2312 { IANA* }
+x11-compound-text COMPOUND_TEXT x-compound-text { JAVA* }
+
+ISCII,version=0 x-ISCII91 { JAVA* } x-iscii-de { WINDOWS } windows-57002 { WINDOWS* } iscii-dev ibm-4902 { IBM* } # ibm-806 contains non-standard box drawing symbols.
+ISCII,version=1 x-iscii-be { WINDOWS } windows-57003 { WINDOWS* } iscii-bng windows-57006 { WINDOWS } x-iscii-as { WINDOWS } # be is different from as on Windows.
+ISCII,version=2 x-iscii-pa { WINDOWS } windows-57011 { WINDOWS* } iscii-gur
+ISCII,version=3 x-iscii-gu { WINDOWS } windows-57010 { WINDOWS* } iscii-guj
+ISCII,version=4 x-iscii-or { WINDOWS } windows-57007 { WINDOWS* } iscii-ori
+ISCII,version=5 x-iscii-ta { WINDOWS } windows-57004 { WINDOWS* } iscii-tml
+ISCII,version=6 x-iscii-te { WINDOWS } windows-57005 { WINDOWS* } iscii-tlg
+ISCII,version=7 x-iscii-ka { WINDOWS } windows-57008 { WINDOWS* } iscii-knd
+ISCII,version=8 x-iscii-ma { WINDOWS } windows-57009 { WINDOWS* } iscii-mlm
+
+# Lotus specific
+LMBCS-1 lmbcs ibm-65025 { IBM* }
+
+# These Lotus specific converters still work, but they aren't advertised in this alias table.
+# These are almost never used outside of Lotus software,
+# and they take a lot of time when creating the available converter list.
+# Also Lotus doesn't really use them anyway. It was a mistake to create these LMBCS variant converters in ICU.
+#LMBCS-2
+#LMBCS-3
+#LMBCS-4
+#LMBCS-5
+#LMBCS-6
+#LMBCS-8
+#LMBCS-11
+#LMBCS-16
+#LMBCS-17
+#LMBCS-18
+#LMBCS-19
+
+# EBCDIC codepages according to the CDRA
+
+# without Euro
+ibm-37_P100-1995 { UTR22* } # EBCDIC US
+ ibm-37 { IBM* }
+ IBM037 { IANA* JAVA }
+ ibm-037 # { JAVA }
+ ebcdic-cp-us { IANA JAVA }
+ ebcdic-cp-ca { IANA JAVA }
+ ebcdic-cp-wt { IANA JAVA }
+ ebcdic-cp-nl { IANA JAVA }
+ csIBM037 { IANA JAVA }
+ cp037 { JAVA* }
+ 037 { JAVA }
+ cpibm37 { JAVA }
+ cp37
+
+ibm-273_P100-1995 { UTR22* } ibm-273 { IBM* } IBM273 { IANA* JAVA } CP273 { IANA JAVA* } csIBM273 { IANA } ebcdic-de 273 { JAVA } # EBCDIC Germany, Austria
+ibm-277_P100-1995 { UTR22* } ibm-277 { IBM* } IBM277 { IANA* JAVA } cp277 { JAVA* } EBCDIC-CP-DK { IANA } EBCDIC-CP-NO { IANA } csIBM277 { IANA } ebcdic-dk 277 { JAVA } # EBCDIC Denmark
+ibm-278_P100-1995 { UTR22* } ibm-278 { IBM* } IBM278 { IANA* JAVA } cp278 { JAVA* } ebcdic-cp-fi { IANA } ebcdic-cp-se { IANA } csIBM278 { IANA } ebcdic-sv { JAVA } 278 { JAVA } # EBCDIC Sweden
+ibm-280_P100-1995 { UTR22* } ibm-280 { IBM* } IBM280 { IANA* JAVA } CP280 { IANA JAVA* } ebcdic-cp-it { IANA } csIBM280 { IANA } 280 { JAVA } # EBCDIC Italy
+ibm-284_P100-1995 { UTR22* } ibm-284 { IBM* } IBM284 { IANA* JAVA } CP284 { IANA JAVA* } ebcdic-cp-es { IANA } csIBM284 { IANA } cpibm284 { JAVA } 284 { JAVA } # EBCDIC Spain
+ibm-285_P100-1995 { UTR22* } ibm-285 { IBM* } IBM285 { IANA* JAVA } CP285 { IANA JAVA* } ebcdic-cp-gb { IANA } csIBM285 { IANA } cpibm285 { JAVA } ebcdic-gb { JAVA } 285 { JAVA } # EBCDIC UK Ireland
+ibm-290_P100-1995 { UTR22* } ibm-290 { IBM* } IBM290 { IANA* } cp290 { IANA } EBCDIC-JP-kana { IANA } csIBM290 { IANA } # host SBCS (Katakana)
+ibm-297_P100-1995 { UTR22* } ibm-297 { IBM* } IBM297 { IANA* JAVA } cp297 { IANA JAVA* } ebcdic-cp-fr { IANA } csIBM297 { IANA } cpibm297 { JAVA } 297 { JAVA } # EBCDIC France
+ibm-420_X120-1999 { UTR22* } ibm-420 { IBM* } IBM420 { IANA* JAVA } cp420 { IANA JAVA* } ebcdic-cp-ar1 { IANA } csIBM420 { IANA } 420 { JAVA } # EBCDIC Arabic (all presentation shapes)
+ibm-424_P100-1995 { UTR22* } ibm-424 { IBM* } IBM424 { IANA* JAVA } cp424 { IANA JAVA* } ebcdic-cp-he { IANA } csIBM424 { IANA } 424 { JAVA } # EBCDIC Hebrew
+ibm-500_P100-1995 { UTR22* } ibm-500 { IBM* } IBM500 { IANA* JAVA } CP500 { IANA JAVA* } ebcdic-cp-be { IANA } csIBM500 { IANA } ebcdic-cp-ch { IANA } 500 # EBCDIC International Latin1
+ibm-803_P100-1999 { UTR22* } ibm-803 { IBM* } cp803 # Old EBCDIC Hebrew
+ibm-838_P100-1995 { UTR22* } ibm-838 { IBM* } IBM838 { JAVA } IBM-Thai { IANA* JAVA } csIBMThai { IANA } cp838 { JAVA* } 838 { JAVA } ibm-9030 { IBM } # EBCDIC Thai. Yes ibm-9030 is an alias.
+ibm-870_P100-1995 { UTR22* } ibm-870 { IBM* } IBM870 { IANA* JAVA } CP870 { IANA JAVA* } ebcdic-cp-roece { IANA } ebcdic-cp-yu { IANA } csIBM870 { IANA } # EBCDIC Latin 2
+ibm-871_P100-1995 { UTR22* } ibm-871 { IBM* } IBM871 { IANA* JAVA } ebcdic-cp-is { IANA JAVA } csIBM871 { IANA JAVA } CP871 { IANA JAVA* } ebcdic-is { JAVA } 871 { JAVA } # EBCDIC Iceland
+ibm-875_P100-1995 { UTR22* } ibm-875 { IBM* } IBM875 { JAVA } cp875 { JAVA* } 875 { JAVA } x-IBM875 { JAVA } # EBCDIC Greek
+ibm-918_P100-1995 { UTR22* } ibm-918 { IBM* } IBM918 { IANA* JAVA } CP918 { IANA JAVA* } ebcdic-cp-ar2 { IANA } csIBM918 { IANA } # EBCDIC Urdu
+ibm-930_P120-1999 { UTR22* } # EBCDIC_STATEFUL Katakana-Kanji Host Mixed.
+ ibm-930 { IBM* }
+ ibm-5026 { IBM } # Yes this is correct
+ IBM930 { JAVA }
+ cp930 { JAVA* }
+ 930 { JAVA }
+ x-IBM930 { JAVA }
+ x-IBM930A { JAVA }
+ibm-933_P110-1995 { UTR22* } ibm-933 { IBM* JAVA } cp933 { JAVA* } 933 { JAVA } x-IBM933 { JAVA } # Korea EBCDIC MIXED
+ibm-935_P110-1999 { UTR22* } ibm-935 { IBM* JAVA } cp935 { JAVA* } 935 { JAVA } x-IBM935 { JAVA } # China EBCDIC MIXED. Need to use Unicode, ibm-1388 or gb18030 instead because it is required by the government of China.
+ibm-937_P110-1999 { UTR22* } ibm-937 { IBM* JAVA } cp937 { JAVA* } 937 { JAVA } x-IBM937 { JAVA } # Taiwan EBCDIC MIXED
+ibm-939_P120-1999 { UTR22* } # EBCDIC_STATEFUL Latin-Kanji Host Mixed.
+ ibm-939 { IBM* }
+ ibm-931 { IBM } # Yes this is correct
+ ibm-5035 { IBM } # Yes this is also correct
+ IBM939 { JAVA }
+ cp939 { JAVA* }
+ 939 { JAVA }
+ x-IBM939 { JAVA }
+ x-IBM939A { JAVA }
+ibm-1025_P100-1995 { UTR22* } ibm-1025 { IBM* JAVA } cp1025 { JAVA* } 1025 { JAVA } x-IBM1025 { JAVA } # EBCDIC Cyrillic
+ibm-1026_P100-1995 { UTR22* } ibm-1026 { IBM* } IBM1026 { IANA* JAVA } CP1026 { IANA JAVA* } csIBM1026 { IANA } 1026 { JAVA } # EBCDIC Turkey
+ibm-1047_P100-1995 { UTR22* } ibm-1047 { IBM* } IBM1047 { IANA* JAVA } cp1047 { JAVA* } 1047 { JAVA } # EBCDIC Open systems Latin1
+ibm-1097_P100-1995 { UTR22* } ibm-1097 { IBM* JAVA } cp1097 { JAVA* } 1097 { JAVA } x-IBM1097 { JAVA } # EBCDIC Farsi
+ibm-1112_P100-1995 { UTR22* } ibm-1112 { IBM* JAVA } cp1112 { JAVA* } 1112 { JAVA } x-IBM1112 { JAVA } # EBCDIC Baltic
+ibm-1114_P100-2001 { UTR22* } ibm-1114 { IBM* } x-IBM1114 { JAVA* }
+ibm-1115_P100-1995 { UTR22* } ibm-1115 { IBM* } x-IBM1115 { JAVA* }
+ibm-1122_P100-1999 { UTR22* } ibm-1122 { IBM* JAVA } cp1122 { JAVA* } 1122 { JAVA } x-IBM1122 { JAVA } # EBCDIC Estonia
+ibm-1123_P100-1995 { UTR22* } ibm-1123 { IBM* JAVA } cp1123 { JAVA* } 1123 { JAVA } x-IBM1123 { JAVA } # EBCDIC Cyrillic Ukraine
+ibm-1130_P100-1997 { UTR22* } ibm-1130 { IBM* } # EBCDIC Vietnamese
+ibm-1132_P100-1998 { UTR22* } ibm-1132 { IBM* } # EBCDIC Lao
+ibm-1137_P100-1999 { UTR22* } ibm-1137 { IBM* } # Devanagari EBCDIC (based on Unicode character set)
+ibm-4517_P100-2005 { UTR22* } ibm-4517 { IBM* } # EBCDIC Arabic. Update of ibm-421
+
+# with Euro
+ibm-1140_P100-1997 { UTR22* } ibm-1140 { IBM* } IBM01140 { IANA* JAVA } CCSID01140 { IANA JAVA } CP01140 { IANA JAVA } cp1140 { JAVA* } ebcdic-us-37+euro { IANA } # EBCDIC US
+ibm-1141_P100-1997 { UTR22* } ibm-1141 { IBM* } IBM01141 { IANA* JAVA } CCSID01141 { IANA JAVA } CP01141 { IANA JAVA } cp1141 { JAVA* } ebcdic-de-273+euro { IANA } # EBCDIC Germany, Austria
+ibm-1142_P100-1997 { UTR22* } ibm-1142 { IBM* } IBM01142 { IANA* JAVA } CCSID01142 { IANA JAVA } CP01142 { IANA JAVA } cp1142 { JAVA* } ebcdic-dk-277+euro { IANA } ebcdic-no-277+euro { IANA } # EBCDIC Denmark
+ibm-1143_P100-1997 { UTR22* } ibm-1143 { IBM* } IBM01143 { IANA* JAVA } CCSID01143 { IANA JAVA } CP01143 { IANA JAVA } cp1143 { JAVA* } ebcdic-fi-278+euro { IANA } ebcdic-se-278+euro { IANA } # EBCDIC Sweden
+ibm-1144_P100-1997 { UTR22* } ibm-1144 { IBM* } IBM01144 { IANA* JAVA } CCSID01144 { IANA JAVA } CP01144 { IANA JAVA } cp1144 { JAVA* } ebcdic-it-280+euro { IANA } # EBCDIC Italy
+ibm-1145_P100-1997 { UTR22* } ibm-1145 { IBM* } IBM01145 { IANA* JAVA } CCSID01145 { IANA JAVA } CP01145 { IANA JAVA } cp1145 { JAVA* } ebcdic-es-284+euro { IANA } # EBCDIC Spain
+ibm-1146_P100-1997 { UTR22* } ibm-1146 { IBM* } IBM01146 { IANA* JAVA } CCSID01146 { IANA JAVA } CP01146 { IANA JAVA } cp1146 { JAVA* } ebcdic-gb-285+euro { IANA } # EBCDIC UK Ireland
+ibm-1147_P100-1997 { UTR22* } ibm-1147 { IBM* } IBM01147 { IANA* JAVA } CCSID01147 { IANA JAVA } CP01147 { IANA JAVA } cp1147 { JAVA* } ebcdic-fr-297+euro { IANA } # EBCDIC France
+ibm-1148_P100-1997 { UTR22* } ibm-1148 { IBM* } IBM01148 { IANA* JAVA } CCSID01148 { IANA JAVA } CP01148 { IANA JAVA } cp1148 { JAVA* } ebcdic-international-500+euro { IANA } # EBCDIC International Latin1
+ibm-1149_P100-1997 { UTR22* } ibm-1149 { IBM* } IBM01149 { IANA* JAVA } CCSID01149 { IANA JAVA } CP01149 { IANA JAVA } cp1149 { JAVA* } ebcdic-is-871+euro { IANA } # EBCDIC Iceland
+ibm-1153_P100-1999 { UTR22* } ibm-1153 { IBM* } IBM1153 { JAVA } x-IBM1153 { JAVA* } # EBCDIC latin 2
+ibm-1154_P100-1999 { UTR22* } ibm-1154 { IBM* } # EBCDIC Cyrillic Multilingual
+ibm-1155_P100-1999 { UTR22* } ibm-1155 { IBM* } # EBCDIC Turkey
+ibm-1156_P100-1999 { UTR22* } ibm-1156 { IBM* } # EBCDIC Baltic Multilingual
+ibm-1157_P100-1999 { UTR22* } ibm-1157 { IBM* } # EBCDIC Estonia
+ibm-1158_P100-1999 { UTR22* } ibm-1158 { IBM* } # EBCDIC Cyrillic Ukraine
+ibm-1160_P100-1999 { UTR22* } ibm-1160 { IBM* } # EBCDIC Thailand
+ibm-1164_P100-1999 { UTR22* } ibm-1164 { IBM* } # EBCDIC Viet Nam
+ibm-1364_P110-2007 { UTR22* } ibm-1364 { IBM* } x-IBM1364 { JAVA* } # Korean Host Mixed
+ibm-1370_P100-1999 { UTR22* } ibm-1370 { IBM* } x-IBM1370 { JAVA* }
+ibm-1371_P100-1999 { UTR22* } ibm-1371 { IBM* } x-IBM1371 { JAVA* } # Taiwan EBCDIC MIXED (Euro update of ibm-937)
+ibm-1388_P103-2001 { UTR22* } ibm-1388 { IBM* } ibm-9580 { IBM } x-IBM1388 { JAVA* } # S-Ch DBCS-Host Data GBK EBCDIC_STATEFUL. Yes ibm-9580 is an alias.
+ibm-1390_P110-2003 { UTR22* } ibm-1390 { IBM* } x-IBM1390 { JAVA* } # Japan EBCDIC MIXED (JIS X 0213)
+ibm-1399_P110-2003 { UTR22* } ibm-1399 { IBM* } x-IBM1399 { JAVA* } # Host MBCS (Latin-Kanji) (JIS X 0213)
+ibm-5123_P100-1999 { UTR22* } ibm-5123 { IBM* } # Host Roman Jis. Euro update of ibm-1027. SBCS portion of ibm-1390.
+ibm-8482_P100-1999 { UTR22* } ibm-8482 { IBM* } # host SBCS (Katakana). Euro update of ibm-290. SBCS portion of ibm-1399.
+# Yes ibm-20780 is the same as ibm-16684
+ibm-16684_P110-2003 { UTR22* } ibm-16684 { IBM* } ibm-20780 { IBM } # DBCS Jis + Roman Jis Host. This is the DBCS portion of ibm-1390 and ibm-1399 (JIS X 0213).
+ibm-4899_P100-1998 { UTR22* } ibm-4899 { IBM* } # Old EBCDIC Hebrew. Update of ibm-803
+ibm-4971_P100-1999 { UTR22* } ibm-4971 { IBM* } # EBCDIC Greek. Update of ibm-875 and superseded by ibm-9067
+ibm-9067_X100-2005 { UTR22* } ibm-9067 { IBM* } # EBCDIC Greek. Update of ibm-875 and ibm-4971
+ibm-12712_P100-1998 { UTR22* } ibm-12712 { IBM* } ebcdic-he # EBCDIC Hebrew (new sheqel, control characters update). Update of ibm-424
+ibm-16804_X110-1999 { UTR22* } ibm-16804 { IBM* } ebcdic-ar # EBCDIC Arabic. Update of ibm-420
+
+java-Cp1399A-1.6_P { UTR22* } x-IBM1399A { JAVA* }
+java-Cp420s-1.6_P { UTR22* } x-IBM420S { JAVA* }
+java-Cp1390A-1.6_P { UTR22* } x-IBM1390A { JAVA* }
+
+# EBCDIC codepages for S/390, with LF and NL codes swapped
+# Starting with ICU 2.4, the swapping is done by modifying the
+# normal tables at runtime instead of at build time.
+# Append UCNV_SWAP_LFNL_OPTION_STRING to the "ibm-CCSID" name to select this.
+#
+# Example: "ibm-1047,swaplfnl" or "ibm-1047" UCNV_SWAP_LFNL_OPTION_STRING
+#
+# This avoids the duplication of all EBCDIC SBCS and mixed-SBCS/DBCS
+# mapping files.
+
+# Some examples below for declaring old-style, obsolete aliases with the "-s390"
+# suffix to map to the new-style, recommended names with the option added.
+# These are listed here for backward compatibility.
+# Do not use these; instead use the normal converter name with the option
+# added as recommended above.
+
+# Note: It is not possible to define an alias (non-initial name in a line here)
+# that itself contains a converter option like this one for swapping LF<->NL.
+# Such names would never be found because ucnv_open() will first parse and strip
+# options before looking up a name in this table.
+# ucnv_open() then parses the lookup result (the canonical name on the left
+# in lines here) as well.
+
+# This also means that it is not necessary to add anything to convrtrs.txt
+# for converter names like "ibm-1026,swaplfnl" to work -
+# they are already covered by the normal option parsing together with the
+# regular, option-less alias elsewhere in this file.
+
+ibm-37_P100-1995,swaplfnl ibm-37-s390 # ibm037-s390 also matches ibm-37-s390
+ibm-924_P100-1998,swaplfnl ibm-924-s390 IBM924_LF { JAVA* }
+ibm-1047_P100-1995,swaplfnl ibm-1047-s390 IBM1047_LF { JAVA* }
+ibm-1140_P100-1997,swaplfnl ibm-1140-s390
+ibm-1141_P100-1997,swaplfnl ibm-1141-s390 IBM1141_LF { JAVA* }
+ibm-1142_P100-1997,swaplfnl ibm-1142-s390
+ibm-1143_P100-1997,swaplfnl ibm-1143-s390
+ibm-1144_P100-1997,swaplfnl ibm-1144-s390
+ibm-1145_P100-1997,swaplfnl ibm-1145-s390
+ibm-1146_P100-1997,swaplfnl ibm-1146-s390
+ibm-1147_P100-1997,swaplfnl ibm-1147-s390
+ibm-1148_P100-1997,swaplfnl ibm-1148-s390
+ibm-1149_P100-1997,swaplfnl ibm-1149-s390
+ibm-1153_P100-1999,swaplfnl ibm-1153-s390
+ibm-12712_P100-1998,swaplfnl ibm-12712-s390
+ibm-16804_X110-1999,swaplfnl ibm-16804-s390
+
+# This is a special version of ibm-1140 that the XML4C (Xerces) parser team
+# requested in 2000.
+# It maps both EBCDIC LF and NL controls to Unicode LF U+000A.
+
+ebcdic-xml-us
+
+# These are not installed by default. They are rarely used.
+# Many of them can be added through the online ICU Data Library Customization tool
+
+ibm-1004_P100-1995 { UTR22* } ibm-1004 { IBM* }
+ibm-1008_P100-1995 { UTR22* } ibm-1008 { IBM* } # cp1008, 8-bit Arabic (w/o euro update)
+ibm-1009_P100-1995 { UTR22* } ibm-1009 { IBM* }
+ibm-1010_P100-1995 { UTR22* } ibm-1010 { IBM* } NF_Z_62-010 { IANA* } iso-ir-69 { IANA } ISO646-FR { IANA } fr { IANA } csISO69French { IANA }
+ibm-1011_P100-1995 { UTR22* } ibm-1011 { IBM* } DIN_66003 { IANA* } iso-ir-21 { IANA } de { IANA } ISO646-DE { IANA } csISO21German { IANA }
+ibm-1012_P100-1995 { UTR22* } ibm-1012 { IBM* } IT { IANA* } iso-ir-15 { IANA } ISO646-IT { IANA } csISO15Italian { IANA }
+ibm-1013_P100-1995 { UTR22* } ibm-1013 { IBM* } BS_4730 { IANA* } iso-ir-4 { IANA } ISO646-GB { IANA } gb { IANA } uk { IANA } csISO4UnitedKingdom { IANA }
+ibm-1014_P100-1995 { UTR22* } ibm-1014 { IBM* } ES2 { IANA* } iso-ir-85 { IANA } ISO646-ES2 { IANA } csISO85Spanish2 { IANA }
+ibm-1015_P100-1995 { UTR22* } ibm-1015 { IBM* } PT2 { IANA* } iso-ir-84 { IANA } ISO646-PT2 { IANA } csISO84Portuguese2 { IANA }
+ibm-1016_P100-1995 { UTR22* } ibm-1016 { IBM* } NS_4551-1 { IANA* } iso-ir-60 { IANA } ISO646-NO { IANA } no { IANA } csISO60DanishNorwegian { IANA } csISO60Norwegian1 { IANA }
+ibm-1017_P100-1995 { UTR22* } ibm-1017 { IBM* }
+ibm-1018_P100-1995 { UTR22* } ibm-1018 { IBM* } SEN_850200_B { IANA* } iso-ir-10 { IANA } FI { IANA } ISO646-FI { IANA } ISO646-SE { IANA } se { IANA } csISO10Swedish { IANA }
+ibm-1019_P100-1995 { UTR22* } ibm-1019 { IBM* }
+ibm-1020_P100-2003 { UTR22* } ibm-1020 { IBM* } CSA_Z243.4-1985-1 { IANA* } iso-ir-121 { IANA } ISO646-CA { IANA } csa7-1 { IANA } ca { IANA } csISO121Canadian1 { IANA }
+ibm-1021_P100-2003 { UTR22* } ibm-1021 { IBM* }
+ibm-1023_P100-2003 { UTR22* } ibm-1023 { IBM* } ES { IANA* } iso-ir-17 { IANA } ISO646-ES { IANA } csISO17Spanish { IANA }
+ibm-1027_P100-1995 { UTR22* } ibm-1027 { IBM* } x-IBM1027 { JAVA* }
+ibm-1041_P100-1995 { UTR22* } ibm-1041 { IBM* } x-IBM1041 { JAVA* }
+ibm-1043_P100-1995 { UTR22* } ibm-1043 { IBM* } x-IBM1043 { JAVA* }
+ibm-1046_X110-1999 { UTR22* } ibm-1046 { IBM* } x-IBM1046 { JAVA* } x-IBM1046S { JAVA } # Arabic
+ibm-1088_P100-1995 { UTR22* } ibm-1088 { IBM* } x-IBM1088 { JAVA* }
+ibm-1100_P100-2003 { UTR22* } ibm-1100 { IBM* } DEC-MCS { IANA* } dec { IANA } csDECMCS { IANA }
+ibm-1101_P100-2003 { UTR22* } ibm-1101 { IBM* }
+ibm-1102_P100-2003 { UTR22* } ibm-1102 { IBM* }
+ibm-1103_P100-2003 { UTR22* } ibm-1103 { IBM* }
+ibm-1104_P100-2003 { UTR22* } ibm-1104 { IBM* } NF_Z_62-010_1973 iso-ir-25 { IANA* } ISO646-FR1 { IANA } csISO25French { IANA } # NF_Z_62-010_(1973) is the real IANA alias, but () aren't invariant characters.
+ibm-1105_P100-2003 { UTR22* } ibm-1105 { IBM* }
+ibm-1106_P100-2003 { UTR22* } ibm-1106 { IBM* }
+ibm-1107_P100-2003 { UTR22* } ibm-1107 { IBM* } DS_2089 { IANA* } ISO646-DK { IANA } dk { IANA } csISO646Danish { IANA }
+ibm-1127_P100-2004 { UTR22* } ibm-1127 { IBM* }
+ibm-1161_P100-1999 { UTR22* } ibm-1161 { IBM* } # Thai (Euro update of ibm-1129)
+ibm-1163_P100-1999 { UTR22* } ibm-1163 { IBM* } # Vietnamese
+ibm-1165_P101-2000 { UTR22* } ibm-1165 { IBM* } # Vietnamese (EBCDIC)
+ibm-1166_P100-2002 { UTR22* } ibm-1166 { IBM* } # Cyrillic for Kazakhstan
+ibm-1167_P100-2002 { UTR22* } ibm-1167 { IBM* } KOI8-RU x-KOI8_RU { JAVA* }
+ibm-1174_X100-2007 { UTR22* } ibm-1174 { IBM* } KZ-1048 { IANA* } STRK1048-2002 { IANA } RK1048 { IANA } csKZ1048 { IANA }
+ibm-1277_P100-1995 { UTR22* } ibm-1277 { IBM* } # Adobe (Postscript) Latin-1
+ibm-13125_P100-1997 { UTR22* } ibm-13125 { IBM* } # S-Ch (DBCS subset of ibm-4933, ibm-1388)
+ibm-13140_P101-2000 { UTR22* } ibm-13140 { IBM* }
+ibm-13218_P100-1996 { UTR22* } ibm-13218 { IBM* } # Japanese (EBCDIC update of ibm-930)
+ibm-1350_P110-1997 { UTR22* } ibm-1350 { IBM* } x-eucJP-Open { JAVA* } eucJP-Open { JAVA } # Japanese (EUC-JP variant)
+ibm-1351_P110-1997 { UTR22* } ibm-1351 { IBM* } x-IBM1351 { JAVA* } # Japanese (DBCS subset of ibm-5039)
+ibm-1362_P110-1999 { UTR22* } ibm-1362 { IBM* } x-IBM1362 { JAVA* } # Korean (DBCS subset of ibm-1363)
+ibm-13676_P102-2001 { UTR22* } ibm-13676 { IBM* } # Simplified Chinese (EBCDIC)
+ibm-1380_P100-1995 { UTR22* } ibm-1380 { IBM* } x-IBM1380 { JAVA* } # Simplified Chinese (DBCS subset of ibm-1381)
+ibm-1381_P110-1999 { UTR22* } ibm-1381 { IBM* JAVA } cp1381 { JAVA* } 1381 { JAVA } x-IBM1381 { JAVA } # Simplified Chinese PC Data mixed (IBM GB)
+ibm-1382_P100-1995 { UTR22* } ibm-1382 { IBM* } x-IBM1382 { JAVA* } # Simplified Chinese (DBCS subset of ibm-1383)
+ibm-17221_P100-2001 { UTR22* } ibm-17221 { IBM* } # Simplified Chinese (EBCDIC)
+ibm-17248_X110-1999 { UTR22* } ibm-17248 { IBM* } # PC Arabic (w/ euro update) Updated version of ibm-864
+ibm-21344_P101-2000 { UTR22* } ibm-21344 { IBM* } # PC Arabic. Updated version of ibm-864
+ibm-21427_P100-1999 { UTR22* } ibm-21427 { IBM* } # Traditional Chinese (DBCS subset of ibm-1370)
+ibm-256_P100-1995 { UTR22* } ibm-256 { IBM* } # Latin 1 EBCDIC
+ibm-259_P100-1995 { UTR22* } ibm-259 { IBM* } IBM-Symbols { IANA* } csIBMSymbols { IANA }
+ibm-274_P100-2000 { UTR22* } ibm-274 { IBM* } IBM274 { IANA* } EBCDIC-BE { IANA } CP274 { IANA } csIBM274 { IANA }
+ibm-275_P100-1995 { UTR22* } ibm-275 { IBM* } IBM275 { IANA* } EBCDIC-BR { IANA } cp275 { IANA } csIBM275 { IANA }
+ibm-286_P100-2003 { UTR22* } ibm-286 { IBM* } EBCDIC-AT-DE-A { IANA* } csEBCDICATDEA { IANA }
+ibm-293_P100-1995 { UTR22* } ibm-293 { IBM* } # APL EBCDIC (APL: A Programming Language)
+ibm-300_P120-2006 { UTR22* } ibm-300 { IBM* } x-IBM300 { JAVA* } # Japanese (DBCS subset of ibm-930 and ibm-939)
+ibm-301_P110-1997 { UTR22* } ibm-301 { IBM* } x-IBM301 { JAVA* } # Japanese (DBCS subset of ibm-943)
+ibm-33058_P100-2000 { UTR22* } ibm-33058 { IBM* } # SBCS (Katakana)
+ibm-425_P101-2000 { UTR22* } ibm-425 { IBM* } # Arabic (EBCDIC)
+ibm-4930_P110-1999 { UTR22* } ibm-4930 { IBM* } # Korean (DBCS subset of ibm-1364)
+ibm-4933_P100-2002 { UTR22* } ibm-4933 { IBM* } # S-Ch (DBCS subset of ibm-1388)
+ibm-4948_P100-1995 { UTR22* } ibm-4948 { IBM* }
+ibm-4951_P100-1995 { UTR22* } ibm-4951 { IBM* }
+ibm-4952_P100-1995 { UTR22* } ibm-4952 { IBM* }
+ibm-4960_P100-1995 { UTR22* } ibm-4960 { IBM* }
+ibm-5039_P11A-1998 { UTR22* } ibm-5039 { IBM* } # Japanese (HP Shift-JIS variant)
+ibm-5048_P100-1995 { UTR22* } ibm-5048 { IBM* } # Japanese (DBCS subset of ibm-1350, JIS X208-1990)
+ibm-5049_P100-1995 { UTR22* } ibm-5049 { IBM* } # Japanese (DBCS subset of ibm-1350, JIS X212)
+ibm-5067_P100-1995 { UTR22* } ibm-5067 { IBM* } # Korean (DBCS subset of ibm-21450)
+ibm-5104_X110-1999 { UTR22* } ibm-5104 { IBM* } # cp1008, 8-bit Arabic (w/ euro update)
+ibm-5233_P100-2011 { UTR22* } ibm-5233 { IBM* } # Devanagari EBCDIC, including Indian Rupee
+ibm-806_P100-1998 { UTR22* } ibm-806 { IBM* } # Hindi (ISCII variant)
+ibm-808_P100-1999 { UTR22* } ibm-808 { IBM* } x-IBM808 { JAVA* } # Cyrillic
+ibm-833_P100-1995 { UTR22* } ibm-833 { IBM* } x-IBM833 { JAVA* }
+ibm-834_P100-1995 { UTR22* } ibm-834 { IBM* } x-IBM834 { JAVA* } # Korean (DBCS subset of ibm-933)
+ibm-835_P100-1995 { UTR22* } ibm-835 { IBM* } x-IBM835 { JAVA* } # Traditional Chinese (DBCS subset of ibm-5033)
+ibm-836_P100-1995 { UTR22* } ibm-836 { IBM* } x-IBM836 { JAVA* }
+ibm-837_P100-2011 { UTR22* } ibm-837 { IBM* } x-IBM837 { JAVA* } # Simplified Chinese (DBCS subset of ibm-5031)
+ibm-848_P100-1999 { UTR22* } ibm-848 { IBM* } # Cyrillic (euro update of ibm-1125)
+ibm-849_P100-1999 { UTR22* } ibm-849 { IBM* } # Cyrillic Belarus (euro update of ibm-1131)
+ibm-859_P100-1999 { UTR22* } ibm-859 { IBM* } x-IBM859 { JAVA* } # PC Latin 9 (w/ euro update)
+ibm-8612_P100-1995 { UTR22* } ibm-8612 { IBM* } # Arabic (EBCDIC update of ibm-420)
+ibm-872_P100-1999 { UTR22* } ibm-872 { IBM* } # Cyrillic (Euro update of ibm-855)
+ibm-880_P100-1995 { UTR22* } ibm-880 { IBM* } IBM880 { IANA* } cp880 { IANA } EBCDIC-Cyrillic { IANA } csIBM880 { IANA } windows-20880 { WINDOWS* }
+ibm-896_P100-1995 { UTR22* } ibm-896 { IBM* } # SBCS Katakana
+ibm-897_P100-1995 { UTR22* } ibm-897 { IBM* } JIS_X0201 { IANA* } X0201 { IANA } csHalfWidthKatakana { IANA } x-IBM897 { JAVA* }
+ibm-9027_P100-1999 { UTR22* } ibm-9027 { IBM* } # DBCS T-Ch Host. Euro update of ibm-835. DBCS portion of ibm-1371.
+ibm-9048_P100-1998 { UTR22* } ibm-9048 { IBM* } # Hebrew (Euro and Sequel update of ibm-856)
+ibm-905_P100-1995 { UTR22* } ibm-905 { IBM* } IBM905 { IANA* } CP905 { IANA } ebcdic-cp-tr { IANA } csIBM905 { IANA } windows-20905 { WINDOWS* }
+ibm-9056_P100-1995 { UTR22* } ibm-9056 { IBM* } # Arabic
+ibm-9061_P100-1999 { UTR22* } ibm-9061 { IBM* } # Greek (w/ euro update)
+ibm-9145_P110-1997 { UTR22* } ibm-9145 { IBM* } # Japanese (DBCS subset of ibm-5050)
+ibm-9238_X110-1999 { UTR22* } ibm-9238 { IBM* } # cp1046, PC Arabic Extended (w/ euro update)
+ibm-924_P100-1998 { UTR22* } ibm-924 { IBM* } IBM00924 { IANA* } CCSID00924 { IANA } CP00924 { IANA } ebcdic-Latin9--euro { IANA }
+ibm-926_P100-2000 { UTR22* } ibm-926 { IBM* } # Korean (DBCS subset of ibm-944)
+ibm-927_P100-1995 { UTR22* } ibm-927 { IBM* } x-IBM927 { JAVA* } # Traditional Chinese (DBCS subset of ibm-948)
+ibm-928_P100-1995 { UTR22* } ibm-928 { IBM* } # Simplified Chinese (DBCS subset of ibm-936)
+ibm-941_P13A-2001 { UTR22* } ibm-941 { IBM* } # DBCS portion of ibm-943
+ibm-944_P100-1995 { UTR22* } ibm-944 { IBM* } # Korean
+ibm-946_P100-1995 { UTR22* } ibm-946 { IBM* } # Simplified Chinese
+ibm-947_P100-1995 { UTR22* } ibm-947 { IBM* } x-IBM947 { JAVA* } # Traditional Chinese (DBCS subset of ibm-950)
+ibm-948_P110-1999 { UTR22* } ibm-948 { IBM* } x-IBM948 { JAVA* } # Traditional Chinese
+ibm-951_P100-1995 { UTR22* } ibm-951 { IBM* } x-IBM951 { JAVA* } # Korean (DBCS subset of ibm-949)
+ibm-952_P110-1997 { UTR22* } ibm-952 { IBM* } x-JIS0208 # Pure DBCS, Japanese EUC, G1 - JIS X208-1990
+ibm-953_P100-2000 { UTR22* } ibm-953 { IBM* } JIS_X0212-1990 { IANA* } # Pure DBCS, Japanese EUC, G3 - JIS X 0212-1990
+ibm-955_P110-1997 { UTR22* } ibm-955 { IBM* } # Pure DBCS, Japanese EUC, G0 - JIS X208-1978
+ibm-9577_P100-2001 { UTR22* } ibm-9577 { IBM* } ibm-1385 { IBM } x-IBM1385 { JAVA* } # ibm-9577 and ibm-1385 are identical DBCS tables.
+iso-8859_16-2001 { UTR22* } ISO-8859-16 { IANA* } iso-ir-226 { IANA } ISO_8859-16:2001 { IANA } latin10 { IANA } l10 { IANA }
+
+# To be considered for listing at a later date for the data library customization tool
+#ibm-1159_P100-1999 { UTR22* } ibm-1159 { IBM* } # SBCS T-Ch Host. Euro update of ibm-28709. This is used in combination with another CCSID mapping.
+#ibm-960_P100-2000 { UTR22* } ibm-960 { IBM* } # Pure DBCS, CNS11643 plane 1
+#ibm-963_P100-1995 { UTR22* } ibm-963 { IBM* } # Pure DBCS, CNS11643 plane 2 Traditional Chinese (DBCS subset of ibm-965)
--- /dev/null
+target
+corpus
+artifacts
+coverage
--- /dev/null
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+version = 3
+
+[[package]]
+name = "adler"
+version = "1.0.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe"
+
+[[package]]
+name = "android-tzdata"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e999941b234f3131b00bc13c22d06e8c5ff726d1b6318ac7eb276997bbb4fef0"
+
+[[package]]
+name = "android_system_properties"
+version = "0.1.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311"
+dependencies = [
+ "libc",
+]
+
+[[package]]
+name = "anstream"
+version = "0.6.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "418c75fa768af9c03be99d17643f93f79bbba589895012a80e3452a19ddda15b"
+dependencies = [
+ "anstyle",
+ "anstyle-parse",
+ "anstyle-query",
+ "anstyle-wincon",
+ "colorchoice",
+ "is_terminal_polyfill",
+ "utf8parse",
+]
+
+[[package]]
+name = "anstyle"
+version = "1.0.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "038dfcf04a5feb68e9c60b21c9625a54c2c0616e79b72b0fd87075a056ae1d1b"
+
+[[package]]
+name = "anstyle-parse"
+version = "0.2.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c03a11a9034d92058ceb6ee011ce58af4a9bf61491aa7e1e59ecd24bd40d22d4"
+dependencies = [
+ "utf8parse",
+]
+
+[[package]]
+name = "anstyle-query"
+version = "1.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ad186efb764318d35165f1758e7dcef3b10628e26d41a44bc5550652e6804391"
+dependencies = [
+ "windows-sys 0.52.0",
+]
+
+[[package]]
+name = "anstyle-wincon"
+version = "3.0.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "61a38449feb7068f52bb06c12759005cf459ee52bb4adc1d5a7c4322d716fb19"
+dependencies = [
+ "anstyle",
+ "windows-sys 0.52.0",
+]
+
+[[package]]
+name = "anyhow"
+version = "1.0.86"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b3d1d046238990b9cf5bcde22a3fb3584ee5cf65fb2765f454ed428c7a0063da"
+
+[[package]]
+name = "arbitrary"
+version = "1.3.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7d5a26814d8dcb93b0e5a0ff3c6d80a8843bafb21b39e8e18a6f05471870e110"
+
+[[package]]
+name = "atty"
+version = "0.2.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8"
+dependencies = [
+ "hermit-abi",
+ "libc",
+ "winapi",
+]
+
+[[package]]
+name = "autocfg"
+version = "1.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0c4b4d0bd25bd0b74681c0ad21497610ce1b7c91b1022cd21c80c6fbdd9476b0"
+
+[[package]]
+name = "bitflags"
+version = "2.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b048fb63fd8b5923fc5aa7b340d8e156aec7ec02f0c78fa8a6ddc2613f6f71de"
+
+[[package]]
+name = "bumpalo"
+version = "3.16.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c"
+
+[[package]]
+name = "cc"
+version = "1.0.106"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "066fce287b1d4eafef758e89e09d724a24808a9196fe9756b8ca90e86d0719a2"
+dependencies = [
+ "jobserver",
+ "libc",
+ "once_cell",
+]
+
+[[package]]
+name = "cfg-if"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
+
+[[package]]
+name = "chrono"
+version = "0.4.38"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a21f936df1771bf62b77f047b726c4625ff2e8aa607c01ec06e5a05bd8463401"
+dependencies = [
+ "android-tzdata",
+ "iana-time-zone",
+ "js-sys",
+ "num-traits",
+ "wasm-bindgen",
+ "windows-targets 0.52.6",
+]
+
+[[package]]
+name = "clap"
+version = "4.5.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "84b3edb18336f4df585bc9aa31dd99c036dfa5dc5e9a2939a722a188f3a8970d"
+dependencies = [
+ "clap_builder",
+ "clap_derive",
+]
+
+[[package]]
+name = "clap_builder"
+version = "4.5.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c1c09dd5ada6c6c78075d6fd0da3f90d8080651e2d6cc8eb2f1aaa4034ced708"
+dependencies = [
+ "anstream",
+ "anstyle",
+ "clap_lex",
+ "strsim",
+ "terminal_size",
+]
+
+[[package]]
+name = "clap_derive"
+version = "4.5.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2bac35c6dafb060fd4d275d9a4ffae97917c13a6327903a8be2153cd964f7085"
+dependencies = [
+ "heck",
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "clap_lex"
+version = "0.7.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4b82cf0babdbd58558212896d1a4272303a57bdb245c2bf1147185fb45640e70"
+
+[[package]]
+name = "colorchoice"
+version = "1.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0b6a852b24ab71dffc585bcb46eaf7959d175cb865a7152e35b348d1b2960422"
+
+[[package]]
+name = "core-foundation-sys"
+version = "0.8.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "06ea2b9bc92be3c2baa9334a323ebca2d6f074ff852cd1d7b11064035cd3868f"
+
+[[package]]
+name = "crc32fast"
+version = "1.4.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a97769d94ddab943e4510d138150169a2758b5ef3eb191a9ee688de3e23ef7b3"
+dependencies = [
+ "cfg-if",
+]
+
+[[package]]
+name = "encoding_rs"
+version = "0.8.34"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b45de904aa0b010bce2ab45264d0631681847fa7b6f2eaa7dab7619943bc4f59"
+dependencies = [
+ "cfg-if",
+]
+
+[[package]]
+name = "equivalent"
+version = "1.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5"
+
+[[package]]
+name = "errno"
+version = "0.3.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "534c5cf6194dfab3db3242765c03bbe257cf92f22b38f6bc0c58d59108a820ba"
+dependencies = [
+ "libc",
+ "windows-sys 0.52.0",
+]
+
+[[package]]
+name = "finl_unicode"
+version = "1.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8fcfdc7a0362c9f4444381a9e697c79d435fe65b52a37466fc2c1184cee9edc6"
+
+[[package]]
+name = "flate2"
+version = "1.0.30"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5f54427cfd1c7829e2a139fcefea601bf088ebca651d2bf53ebc600eac295dae"
+dependencies = [
+ "crc32fast",
+ "miniz_oxide",
+]
+
+[[package]]
+name = "float_next_after"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8bf7cc16383c4b8d58b9905a8509f02926ce3058053c056376248d958c9df1e8"
+
+[[package]]
+name = "hashbrown"
+version = "0.14.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1"
+
+[[package]]
+name = "heck"
+version = "0.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
+
+[[package]]
+name = "hermit-abi"
+version = "0.1.19"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33"
+dependencies = [
+ "libc",
+]
+
+[[package]]
+name = "hexplay"
+version = "0.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0962bea6731e28b5a443ba4aa00fe3e4fe7555dadf12012435efb738eeac5898"
+dependencies = [
+ "atty",
+ "termcolor",
+]
+
+[[package]]
+name = "iana-time-zone"
+version = "0.1.60"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e7ffbb5a1b541ea2561f8c41c087286cc091e21e556a4f09a8f6cbf17b69b141"
+dependencies = [
+ "android_system_properties",
+ "core-foundation-sys",
+ "iana-time-zone-haiku",
+ "js-sys",
+ "wasm-bindgen",
+ "windows-core",
+]
+
+[[package]]
+name = "iana-time-zone-haiku"
+version = "0.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f"
+dependencies = [
+ "cc",
+]
+
+[[package]]
+name = "indexmap"
+version = "2.2.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "168fb715dda47215e360912c096649d23d58bf392ac62f73919e831745e40f26"
+dependencies = [
+ "equivalent",
+ "hashbrown",
+]
+
+[[package]]
+name = "is_terminal_polyfill"
+version = "1.70.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f8478577c03552c21db0e2724ffb8986a5ce7af88107e6be5d2ee6e158c12800"
+
+[[package]]
+name = "jobserver"
+version = "0.1.31"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d2b099aaa34a9751c5bf0878add70444e1ed2dd73f347be99003d4577277de6e"
+dependencies = [
+ "libc",
+]
+
+[[package]]
+name = "js-sys"
+version = "0.3.69"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "29c15563dc2726973df627357ce0c9ddddbea194836909d655df6a75d2cf296d"
+dependencies = [
+ "wasm-bindgen",
+]
+
+[[package]]
+name = "lazy_static"
+version = "1.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe"
+
+[[package]]
+name = "libc"
+version = "0.2.155"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "97b3888a4aecf77e811145cadf6eef5901f4782c53886191b2f693f24761847c"
+
+[[package]]
+name = "libfuzzer-sys"
+version = "0.4.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a96cfd5557eb82f2b83fed4955246c988d331975a002961b07c81584d107e7f7"
+dependencies = [
+ "arbitrary",
+ "cc",
+ "once_cell",
+]
+
+[[package]]
+name = "linux-raw-sys"
+version = "0.4.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89"
+
+[[package]]
+name = "log"
+version = "0.4.22"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24"
+
+[[package]]
+name = "miniz_oxide"
+version = "0.7.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b8a240ddb74feaf34a79a7add65a741f3167852fba007066dcac1ca548d89c08"
+dependencies = [
+ "adler",
+]
+
+[[package]]
+name = "num"
+version = "0.4.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "35bd024e8b2ff75562e5f34e7f4905839deb4b22955ef5e73d2fea1b9813cb23"
+dependencies = [
+ "num-bigint",
+ "num-complex",
+ "num-integer",
+ "num-iter",
+ "num-rational",
+ "num-traits",
+]
+
+[[package]]
+name = "num-bigint"
+version = "0.4.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a5e44f723f1133c9deac646763579fdb3ac745e418f2a7af9cd0c431da1f20b9"
+dependencies = [
+ "num-integer",
+ "num-traits",
+]
+
+[[package]]
+name = "num-complex"
+version = "0.4.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "73f88a1307638156682bada9d7604135552957b7818057dcef22705b4d509495"
+dependencies = [
+ "num-traits",
+]
+
+[[package]]
+name = "num-derive"
+version = "0.4.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ed3955f1a9c7c0c15e092f9c887db08b1fc683305fdf6eb6684f22555355e202"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "num-integer"
+version = "0.1.46"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f"
+dependencies = [
+ "num-traits",
+]
+
+[[package]]
+name = "num-iter"
+version = "0.1.45"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1429034a0490724d0075ebb2bc9e875d6503c3cf69e235a8941aa757d83ef5bf"
+dependencies = [
+ "autocfg",
+ "num-integer",
+ "num-traits",
+]
+
+[[package]]
+name = "num-rational"
+version = "0.4.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f83d14da390562dca69fc84082e73e548e1ad308d24accdedd2720017cb37824"
+dependencies = [
+ "num-bigint",
+ "num-integer",
+ "num-traits",
+]
+
+[[package]]
+name = "num-traits"
+version = "0.2.19"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841"
+dependencies = [
+ "autocfg",
+]
+
+[[package]]
+name = "once_cell"
+version = "1.19.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92"
+
+[[package]]
+name = "ordered-float"
+version = "3.9.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f1e1c390732d15f1d48471625cd92d154e66db2c56645e29a9cd26f4699f72dc"
+dependencies = [
+ "num-traits",
+]
+
+[[package]]
+name = "proc-macro2"
+version = "1.0.86"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5e719e8df665df0d1c8fbfd238015744736151d4445ec0836b8e628aae103b77"
+dependencies = [
+ "unicode-ident",
+]
+
+[[package]]
+name = "pspp"
+version = "1.0.0"
+dependencies = [
+ "anyhow",
+ "bitflags",
+ "chrono",
+ "clap",
+ "encoding_rs",
+ "finl_unicode",
+ "flate2",
+ "float_next_after",
+ "hexplay",
+ "indexmap",
+ "lazy_static",
+ "libc",
+ "num",
+ "num-derive",
+ "num-traits",
+ "ordered-float",
+ "thiserror",
+ "unicase",
+ "utf8-decode",
+ "windows-sys 0.48.0",
+]
+
+[[package]]
+name = "pspp-fuzz"
+version = "0.0.0"
+dependencies = [
+ "libfuzzer-sys",
+ "pspp",
+]
+
+[[package]]
+name = "quote"
+version = "1.0.36"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0fa76aaf39101c457836aec0ce2316dbdc3ab723cdda1c6bd4e6ad4208acaca7"
+dependencies = [
+ "proc-macro2",
+]
+
+[[package]]
+name = "rustix"
+version = "0.38.34"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "70dc5ec042f7a43c4a73241207cecc9873a06d45debb38b329f8541d85c2730f"
+dependencies = [
+ "bitflags",
+ "errno",
+ "libc",
+ "linux-raw-sys",
+ "windows-sys 0.52.0",
+]
+
+[[package]]
+name = "strsim"
+version = "0.11.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
+
+[[package]]
+name = "syn"
+version = "2.0.69"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "201fcda3845c23e8212cd466bfebf0bd20694490fc0356ae8e428e0824a915a6"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "unicode-ident",
+]
+
+[[package]]
+name = "termcolor"
+version = "0.3.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "adc4587ead41bf016f11af03e55a624c06568b5a19db4e90fde573d805074f83"
+dependencies = [
+ "wincolor",
+]
+
+[[package]]
+name = "terminal_size"
+version = "0.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "21bebf2b7c9e0a515f6e0f8c51dc0f8e4696391e6f1ff30379559f8365fb0df7"
+dependencies = [
+ "rustix",
+ "windows-sys 0.48.0",
+]
+
+[[package]]
+name = "thiserror"
+version = "1.0.61"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c546c80d6be4bc6a00c0f01730c08df82eaa7a7a61f11d656526506112cc1709"
+dependencies = [
+ "thiserror-impl",
+]
+
+[[package]]
+name = "thiserror-impl"
+version = "1.0.61"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "46c3384250002a6d5af4d114f2845d37b57521033f30d5c3f46c4d70e1197533"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "unicase"
+version = "2.7.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f7d2d4dafb69621809a81864c9c1b864479e1235c0dd4e199924b9742439ed89"
+dependencies = [
+ "version_check",
+]
+
+[[package]]
+name = "unicode-ident"
+version = "1.0.12"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b"
+
+[[package]]
+name = "utf8-decode"
+version = "1.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ca61eb27fa339aa08826a29f03e87b99b4d8f0fc2255306fd266bb1b6a9de498"
+
+[[package]]
+name = "utf8parse"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821"
+
+[[package]]
+name = "version_check"
+version = "0.9.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f"
+
+[[package]]
+name = "wasm-bindgen"
+version = "0.2.92"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4be2531df63900aeb2bca0daaaddec08491ee64ceecbee5076636a3b026795a8"
+dependencies = [
+ "cfg-if",
+ "wasm-bindgen-macro",
+]
+
+[[package]]
+name = "wasm-bindgen-backend"
+version = "0.2.92"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "614d787b966d3989fa7bb98a654e369c762374fd3213d212cfc0251257e747da"
+dependencies = [
+ "bumpalo",
+ "log",
+ "once_cell",
+ "proc-macro2",
+ "quote",
+ "syn",
+ "wasm-bindgen-shared",
+]
+
+[[package]]
+name = "wasm-bindgen-macro"
+version = "0.2.92"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a1f8823de937b71b9460c0c34e25f3da88250760bec0ebac694b49997550d726"
+dependencies = [
+ "quote",
+ "wasm-bindgen-macro-support",
+]
+
+[[package]]
+name = "wasm-bindgen-macro-support"
+version = "0.2.92"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e94f17b526d0a461a191c78ea52bbce64071ed5c04c9ffe424dcb38f74171bb7"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+ "wasm-bindgen-backend",
+ "wasm-bindgen-shared",
+]
+
+[[package]]
+name = "wasm-bindgen-shared"
+version = "0.2.92"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "af190c94f2773fdb3729c55b007a722abb5384da03bc0986df4c289bf5567e96"
+
+[[package]]
+name = "winapi"
+version = "0.3.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419"
+dependencies = [
+ "winapi-i686-pc-windows-gnu",
+ "winapi-x86_64-pc-windows-gnu",
+]
+
+[[package]]
+name = "winapi-i686-pc-windows-gnu"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
+
+[[package]]
+name = "winapi-x86_64-pc-windows-gnu"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
+
+[[package]]
+name = "wincolor"
+version = "0.1.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "eeb06499a3a4d44302791052df005d5232b927ed1a9658146d842165c4de7767"
+dependencies = [
+ "winapi",
+]
+
+[[package]]
+name = "windows-core"
+version = "0.52.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "33ab640c8d7e35bf8ba19b884ba838ceb4fba93a4e8c65a9059d08afcfc683d9"
+dependencies = [
+ "windows-targets 0.52.6",
+]
+
+[[package]]
+name = "windows-sys"
+version = "0.48.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9"
+dependencies = [
+ "windows-targets 0.48.5",
+]
+
+[[package]]
+name = "windows-sys"
+version = "0.52.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d"
+dependencies = [
+ "windows-targets 0.52.6",
+]
+
+[[package]]
+name = "windows-targets"
+version = "0.48.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c"
+dependencies = [
+ "windows_aarch64_gnullvm 0.48.5",
+ "windows_aarch64_msvc 0.48.5",
+ "windows_i686_gnu 0.48.5",
+ "windows_i686_msvc 0.48.5",
+ "windows_x86_64_gnu 0.48.5",
+ "windows_x86_64_gnullvm 0.48.5",
+ "windows_x86_64_msvc 0.48.5",
+]
+
+[[package]]
+name = "windows-targets"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973"
+dependencies = [
+ "windows_aarch64_gnullvm 0.52.6",
+ "windows_aarch64_msvc 0.52.6",
+ "windows_i686_gnu 0.52.6",
+ "windows_i686_gnullvm",
+ "windows_i686_msvc 0.52.6",
+ "windows_x86_64_gnu 0.52.6",
+ "windows_x86_64_gnullvm 0.52.6",
+ "windows_x86_64_msvc 0.52.6",
+]
+
+[[package]]
+name = "windows_aarch64_gnullvm"
+version = "0.48.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8"
+
+[[package]]
+name = "windows_aarch64_gnullvm"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3"
+
+[[package]]
+name = "windows_aarch64_msvc"
+version = "0.48.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc"
+
+[[package]]
+name = "windows_aarch64_msvc"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469"
+
+[[package]]
+name = "windows_i686_gnu"
+version = "0.48.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e"
+
+[[package]]
+name = "windows_i686_gnu"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b"
+
+[[package]]
+name = "windows_i686_gnullvm"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66"
+
+[[package]]
+name = "windows_i686_msvc"
+version = "0.48.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406"
+
+[[package]]
+name = "windows_i686_msvc"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66"
+
+[[package]]
+name = "windows_x86_64_gnu"
+version = "0.48.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e"
+
+[[package]]
+name = "windows_x86_64_gnu"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78"
+
+[[package]]
+name = "windows_x86_64_gnullvm"
+version = "0.48.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc"
+
+[[package]]
+name = "windows_x86_64_gnullvm"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d"
+
+[[package]]
+name = "windows_x86_64_msvc"
+version = "0.48.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538"
+
+[[package]]
+name = "windows_x86_64_msvc"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
--- /dev/null
+# Fuzzing harness crate for `pspp`, managed by `cargo fuzz`.
+[package]
+name = "pspp-fuzz"
+version = "0.0.0"
+publish = false
+edition = "2021"
+
+# Marks this crate as a cargo-fuzz target directory.
+[package.metadata]
+cargo-fuzz = true
+
+[dependencies]
+libfuzzer-sys = "0.4"
+
+# The crate under test lives one directory up.
+[dependencies.pspp]
+path = ".."
+
+# Each fuzz target is its own binary; test/doc/bench are disabled because
+# these binaries are only meant to be run by the fuzzer.
+[[bin]]
+name = "fuzz_target_1"
+path = "fuzz_targets/fuzz_target_1.rs"
+test = false
+doc = false
+bench = false
+
+[[bin]]
+name = "segment"
+path = "fuzz_targets/segment.rs"
+test = false
+doc = false
+bench = false
--- /dev/null
+// Placeholder libFuzzer target: accepts arbitrary bytes and does nothing yet.
+#![no_main]
+
+use libfuzzer_sys::fuzz_target;
+
+fuzz_target!(|data: &[u8]| {
+    // fuzzed code goes here
+});
--- /dev/null
+// Fuzz target that drives the lexer's segmenter over arbitrary UTF-8 input.
+#![no_main]
+
+use libfuzzer_sys::fuzz_target;
+use pspp::lex::segment::{Segmenter, Mode, Type};
+
+fuzz_target!(|data: &[u8]| {
+    // Only valid UTF-8 is interesting to the segmenter; skip other inputs.
+    if let Ok(mut input) = std::str::from_utf8(data) {
+        let mut segmenter = Segmenter::new(Mode::Auto, false);
+        loop {
+            // `true` marks end-of-input. NOTE(review): the `unwrap` assumes
+            // `push` cannot fail when given complete input — confirm against
+            // the `Segmenter::push` API before fuzzing long-term.
+            let (rest, type_) = segmenter.push(input, true).unwrap();
+            match type_ {
+                Type::End => break,
+                _ => (),
+            }
+            // Continue segmenting the unconsumed remainder.
+            input = rest;
+        }
+    }
+});
--- /dev/null
+use std::{fmt::Write, sync::OnceLock};
+
+use flagset::{flags, FlagSet};
+
+use crate::{
+ integer::ToInteger,
+ lex::{
+ command_name::CommandMatcher,
+ lexer::Lexer,
+ token::{Punct, Token},
+ },
+ message::Diagnostic,
+};
+
+// Parser states, used as `FlagSet` members to declare in which states a
+// command is allowed to run (see `Command::allowed_states`).
+flags! {
+    enum State: u8 {
+        /// No active dataset yet defined.
+        Initial,
+
+        /// Active dataset has been defined.
+        Data,
+
+        /// Inside `INPUT PROGRAM`.
+        InputProgram,
+
+        /// Inside `FILE TYPE`.
+        FileType,
+
+        /// State nested inside `LOOP` or `DO IF`, inside [State::Data].
+        NestedData,
+
+        /// State nested inside `LOOP` or `DO IF`, inside [State::InputProgram].
+        NestedInputProgram,
+    }
+}
+
+/// A single command implementation plus the conditions under which it is
+/// available.
+struct Command {
+    // Parser states in which this command is legal.
+    allowed_states: FlagSet<State>,
+    // NOTE(review): the next three flags are not read anywhere in this file;
+    // presumably they gate availability elsewhere — confirm.
+    enhanced_only: bool,
+    testing_only: bool,
+    no_abbrev: bool,
+    name: &'static str,
+    // Action executed once the command name has been consumed.
+    run: Box<dyn Fn(&Context) + Send + Sync>,
+}
+
+/// Returns the global command table, built once on first use.
+///
+/// `OnceLock` makes the lazy initialization thread-safe; the table lives
+/// for the rest of the program.
+fn commands() -> &'static [Command] {
+    fn new_commands() -> Vec<Command> {
+        vec![Command {
+            allowed_states: State::Initial | State::Data,
+            enhanced_only: false,
+            testing_only: false,
+            no_abbrev: false,
+            name: "ECHO",
+            run: Box::new(|_context| {
+                println!("hi");
+            }),
+        }]
+    }
+
+    static COMMANDS: OnceLock<Vec<Command>> = OnceLock::new();
+    // Pass the function item directly rather than wrapping it in a closure
+    // (clippy::redundant_closure).
+    COMMANDS.get_or_init(new_commands).as_slice()
+}
+
+/// Appends the token at lookahead offset `n` to the accumulated command
+/// name `s`, returning true if the token can be part of a command name.
+///
+/// Words are joined with a space unless the previous character is `-`,
+/// in which case they join directly.
+fn parse_command_word(lexer: &mut Lexer, s: &mut String, n: isize) -> bool {
+    let separator = match s.chars().next_back() {
+        Some(c) if c != '-' => " ",
+        _ => "",
+    };
+
+    // NOTE(review): `lexer.next(n)` is assumed to be a non-consuming
+    // lookahead by token offset — confirm against `Lexer`'s API.
+    match lexer.next(n) {
+        Token::Punct(Punct::Dash) => {
+            s.push('-');
+            true
+        }
+        Token::Id(id) => {
+            write!(s, "{separator}{id}").unwrap();
+            true
+        }
+        // Non-negative integers may appear in command names; anything that
+        // is negative or not an exact integer ends the name.
+        Token::Number(number) if number.is_sign_positive() => {
+            if let Some(integer) = number.to_exact_usize() {
+                write!(s, "{separator}{integer}").unwrap();
+                true
+            } else {
+                false
+            }
+        }
+        _ => false,
+    }
+}
+
+/// Matches `s` against every known command name, returning the best match
+/// (if any) together with an `isize` that callers interpret as the number
+/// of words still missing for a complete match.
+fn find_best_match(s: &str) -> (Option<&'static Command>, isize) {
+    let mut cm = CommandMatcher::new(s);
+    for command in commands() {
+        cm.add(command.name, command);
+    }
+    cm.get_match()
+}
+
+/// Parses a (possibly multi-word) command name from the token stream.
+///
+/// On success, returns the matched command and the number of tokens that
+/// make up its name. On failure, reports a diagnostic through `error` and
+/// returns `Err(())`.
+fn parse_command_name(
+    lexer: &mut Lexer,
+    error: &Box<dyn Fn(Diagnostic)>,
+) -> Result<(&'static Command, isize), ()> {
+    let mut s = String::new();
+    let mut word = 0;
+    let mut missing_words = 0;
+    let mut command = None;
+    while parse_command_word(lexer, &mut s, word) {
+        (command, missing_words) = find_best_match(&s);
+        if missing_words <= 0 {
+            break;
+        }
+        word += 1;
+    }
+    if command.is_none() && missing_words > 0 {
+        // Retry once with a terminator appended, then restore `s`.
+        s.push_str(" .");
+        (command, missing_words) = find_best_match(&s);
+        s.truncate(s.len() - 2);
+    }
+
+    match command {
+        Some(command) => Ok((command, (word + 1) + missing_words)),
+        None => {
+            if s.is_empty() {
+                error(lexer.error("Syntax error expecting command name"))
+            } else {
+                // Bug fix: `{s}` was inside a plain string literal, so the
+                // message printed "{s}" verbatim; interpolate via `format!`.
+                error(lexer.error(&format!("Unknown command `{s}`.")))
+            };
+            Err(())
+        }
+    }
+}
+
+/// Outcome reported after a command is parsed.
+pub enum Success {
+    // Command parsed and handled normally.
+    Success,
+    // NOTE(review): `Eof` and `Finish` are never constructed in this file;
+    // their precise semantics must come from callers — confirm.
+    Eof,
+    Finish,
+}
+
+/// Checks that the current token ends the command.
+///
+/// Emits a diagnostic through the context and returns `Err(())` if further
+/// tokens remain before the command terminator.
+pub fn end_of_command(context: &Context) -> Result<Success, ()> {
+    match context.lexer.token() {
+        Token::EndCommand | Token::End => Ok(Success::Success),
+        _ => {
+            context.error(
+                context
+                    .lexer
+                    .error("Syntax error expecting end of command."),
+            );
+            Err(())
+        }
+    }
+}
+
+/// Parses and runs a single command while in parser state `_state`.
+///
+/// Unknown commands are reported through `error`; in every case the lexer
+/// is resynchronized past the remainder of the current command.
+fn parse_in_state(lexer: &mut Lexer, error: &Box<dyn Fn(Diagnostic)>, _state: State) {
+    match lexer.token() {
+        Token::End | Token::EndCommand => (),
+        _ => {
+            if let Ok((command, n_tokens)) = parse_command_name(lexer, error) {
+                // Consume the tokens that formed the command name.
+                for _ in 0..n_tokens {
+                    lexer.get();
+                }
+                let context = Context {
+                    error,
+                    lexer,
+                    command_name: Some(command.name),
+                };
+                (command.run)(&context);
+                // Bug fix: `end_of_command` returns a `Result` that was
+                // silently ignored. It already reports its own diagnostic,
+                // so discarding the value is intentional; `let _ =` records
+                // that decision and silences the unused-`Result` warning.
+                let _ = end_of_command(&context);
+            }
+            lexer.interactive_reset();
+            lexer.discard_rest_of_command();
+        }
+    }
+    // Skip any trailing command terminators.
+    while let Token::EndCommand = lexer.token() {
+        lexer.get();
+    }
+}
+
+/// Entry point: parses commands starting from the initial (no active
+/// dataset) state.
+pub fn parse(lexer: &mut Lexer, error: &Box<dyn Fn(Diagnostic)>) {
+    parse_in_state(lexer, error, State::Initial)
+}
+
+/// Everything a command implementation needs while it runs.
+pub struct Context<'a> {
+    // Diagnostic sink shared with the parser.
+    error: &'a Box<dyn Fn(Diagnostic)>,
+    lexer: &'a mut Lexer,
+    // Name of the command currently executing, if any.
+    command_name: Option<&'static str>,
+}
+
+impl<'a> Context<'a> {
+    /// Forwards `diagnostic` to the registered error callback.
+    pub fn error(&self, diagnostic: Diagnostic) {
+        (self.error)(diagnostic);
+    }
+}
--- /dev/null
+use std::{cell::RefCell, collections::HashMap, ops::Range, rc::Rc};
+
+use crate::{
+ dictionary::{Dictionary, VarWidth, Variable},
+ encoding::Error as EncodingError,
+ endian::Endian,
+ format::{Error as FormatError, Format, UncheckedFormat},
+ identifier::{Error as IdError, Identifier},
+ raw::{
+ self, Cases, DecodedRecord, DocumentRecord, EncodingRecord, Extension, FileAttributeRecord,
+ FloatInfoRecord, HeaderRecord, IntegerInfoRecord, LongNamesRecord,
+ LongStringMissingValueRecord, LongStringValueLabelRecord, MultipleResponseRecord,
+ NumberOfCasesRecord, ProductInfoRecord, RawStr, ValueLabel, ValueLabelRecord,
+ VarDisplayRecord, VariableAttributeRecord, VariableRecord, VariableSetRecord,
+ VeryLongStringsRecord, ZHeader, ZTrailer,
+ },
+};
+use chrono::{NaiveDate, NaiveDateTime, NaiveTime};
+use encoding_rs::Encoding;
+use num::Integer;
+use thiserror::Error as ThisError;
+
+pub use crate::raw::{CategoryLabels, Compression};
+
+/// Errors and warnings that can arise while decoding system-file headers.
+#[derive(ThisError, Debug)]
+pub enum Error {
+    #[error("Missing header record")]
+    MissingHeaderRecord,
+
+    // XXX this is an internal error
+    #[error("More than one file header record")]
+    DuplicateHeaderRecord,
+
+    #[error("{0}")]
+    EncodingError(EncodingError),
+
+    #[error("Using default encoding {0}.")]
+    UsingDefaultEncoding(String),
+
+    #[error("Variable record from offset {:x} to {:x} specifies width {width} not in valid range [-1,255).", offsets.start, offsets.end)]
+    InvalidVariableWidth { offsets: Range<u64>, width: i32 },
+
+    #[error("This file has corrupted metadata written by a buggy version of PSPP. To ensure that other software can read it correctly, save a new copy of the file.")]
+    InvalidLongMissingValueFormat,
+
+    // Bug fix: the message previously read "...in the expected format
+    // \"DD MMM YY\" format." -- the word "format" was duplicated.
+    #[error("File creation date {creation_date} is not in the expected \"DD MMM YY\" format. Using 01 Jan 1970.")]
+    InvalidCreationDate { creation_date: String },
+
+    // Bug fix: same duplicated "format" as above.
+    #[error("File creation time {creation_time} is not in the expected \"HH:MM:SS\" format. Using midnight.")]
+    InvalidCreationTime { creation_time: String },
+
+    #[error("{id_error} Renaming variable to {new_name}.")]
+    InvalidVariableName {
+        id_error: IdError,
+        new_name: Identifier,
+    },
+
+    #[error(
+        "Substituting {new_spec} for invalid print format on variable {variable}. {format_error}"
+    )]
+    InvalidPrintFormat {
+        new_spec: Format,
+        variable: Identifier,
+        format_error: FormatError,
+    },
+
+    #[error(
+        "Substituting {new_spec} for invalid write format on variable {variable}. {format_error}"
+    )]
+    InvalidWriteFormat {
+        new_spec: Format,
+        variable: Identifier,
+        format_error: FormatError,
+    },
+
+    #[error("Renaming variable with duplicate name {duplicate_name} to {new_name}.")]
+    DuplicateVariableName {
+        duplicate_name: Identifier,
+        new_name: Identifier,
+    },
+
+    #[error("Dictionary index {dict_index} is outside valid range [1,{max_index}].")]
+    InvalidDictIndex { dict_index: usize, max_index: usize },
+
+    #[error("Dictionary index {0} refers to a long string continuation.")]
+    DictIndexIsContinuation(usize),
+
+    #[error("At offset {offset:#x}, one or more variable indexes for value labels referred to long string continuation records: {indexes:?}")]
+    LongStringContinuationIndexes { offset: u64, indexes: Vec<u32> },
+
+    #[error(
+        "At offsets {:#x}...{:#x}, record types 3 and 4 may not add value labels to one or more long string variables: {variables:?}", .offsets.start, .offsets.end
+    )]
+    InvalidLongStringValueLabels {
+        offsets: Range<u64>,
+        variables: Vec<Identifier>,
+    },
+
+    #[error("Variables associated with value label are not all of identical type. Variable {numeric_var} is numeric, but variable {string_var} is string.")]
+    ValueLabelsDifferentTypes {
+        numeric_var: Identifier,
+        string_var: Identifier,
+    },
+
+    #[error("Invalid multiple response set name. {0}")]
+    InvalidMrSetName(IdError),
+
+    #[error("Multiple response set {mr_set} includes unknown variable {short_name}.")]
+    UnknownMrSetVariable {
+        mr_set: Identifier,
+        short_name: Identifier,
+    },
+
+    #[error("Multiple response set {0} has no variables.")]
+    EmptyMrSet(Identifier),
+
+    #[error("Multiple response set {0} has only one variable.")]
+    OneVarMrSet(Identifier),
+
+    #[error("Multiple response set {0} contains both string and numeric variables.")]
+    MixedMrSet(Identifier),
+
+    #[error(
+        "Invalid numeric format for counted value {number} in multiple response set {mr_set}."
+    )]
+    InvalidMDGroupCountedValue { mr_set: Identifier, number: String },
+
+    #[error("Counted value {value} has width {width}, but it must be no wider than {max_width}, the width of the narrowest variable in multiple response set {mr_set}.")]
+    TooWideMDGroupCountedValue {
+        mr_set: Identifier,
+        value: String,
+        width: usize,
+        max_width: u16,
+    },
+
+    #[error("Long string value label for variable {name} has width {width}, which is not in the valid range [{min_width},{max_width}].")]
+    InvalidLongValueLabelWidth {
+        name: Identifier,
+        width: u32,
+        min_width: u16,
+        max_width: u16,
+    },
+
+    #[error("Invalid attribute name. {0}")]
+    InvalidAttributeName(IdError),
+
+    #[error("Invalid short name in long variable name record. {0}")]
+    InvalidShortName(IdError),
+
+    #[error("Invalid name in long variable name record. {0}")]
+    InvalidLongName(IdError),
+
+    #[error("Invalid variable name in very long string record. {0}")]
+    InvalidLongStringName(IdError),
+
+    #[error("Invalid variable name in long string value label record. {0}")]
+    InvalidLongStringValueLabelName(IdError),
+
+    #[error("Invalid variable name in attribute record. {0}")]
+    InvalidAttributeVariableName(IdError),
+
+    // XXX This is risky because `text` might be arbitrarily long.
+    #[error("Text string contains invalid bytes for {encoding} encoding: {text}")]
+    MalformedString { encoding: String, text: String },
+
+    #[error("Details TBD")]
+    TBD,
+}
+
+type DictIndex = usize;
+
+/// All records decoded from a system file, grouped by record type.
+///
+/// Record types that may repeat are `Vec`s; types expected at most once
+/// are `Option`s (see `Headers::new` for how duplicates are reduced).
+#[derive(Clone, Debug)]
+pub struct Headers {
+    pub header: HeaderRecord<String>,
+    pub variable: Vec<VariableRecord<String, String>>,
+    pub value_label: Vec<ValueLabelRecord<RawStr<8>, String>>,
+    pub document: Vec<DocumentRecord<String>>,
+    pub integer_info: Option<IntegerInfoRecord>,
+    pub float_info: Option<FloatInfoRecord>,
+    pub var_display: Option<VarDisplayRecord>,
+    pub multiple_response: Vec<MultipleResponseRecord<Identifier, String>>,
+    pub long_string_value_labels: Vec<LongStringValueLabelRecord<Identifier, String>>,
+    pub long_string_missing_values: Vec<LongStringMissingValueRecord<Identifier, String>>,
+    pub encoding: Option<EncodingRecord>,
+    pub number_of_cases: Option<NumberOfCasesRecord>,
+    pub variable_sets: Vec<VariableSetRecord>,
+    pub product_info: Option<ProductInfoRecord>,
+    pub long_names: Vec<LongNamesRecord>,
+    pub very_long_strings: Vec<VeryLongStringsRecord>,
+    pub file_attributes: Vec<FileAttributeRecord>,
+    pub variable_attributes: Vec<VariableAttributeRecord>,
+    pub other_extension: Vec<Extension>,
+    pub end_of_headers: Option<u32>,
+    pub z_header: Option<ZHeader>,
+    pub z_trailer: Option<ZTrailer>,
+    // Shared, mutable handle to the case data, if any was read.
+    pub cases: Option<Rc<RefCell<Cases>>>,
+}
+
+/// Returns the first element of `vec`, if any, invoking `more_than_one`
+/// first when the vector unexpectedly holds more than one element.
+fn take_first<T, F>(mut vec: Vec<T>, more_than_one: F) -> Option<T>
+where
+    F: FnOnce(),
+{
+    if vec.len() > 1 {
+        more_than_one();
+    }
+    vec.drain(..).next()
+}
+
+impl Headers {
+    /// Sorts raw decoded records into a `Headers` value.
+    ///
+    /// Exactly one file header record is required (`MissingHeaderRecord`
+    /// otherwise). Records expected at most once are reduced with
+    /// `take_first`, reporting duplicates through `warn` without failing.
+    pub fn new(headers: Vec<raw::DecodedRecord>, warn: &impl Fn(Error)) -> Result<Headers, Error> {
+        let mut file_header = Vec::new();
+        let mut variable = Vec::new();
+        let mut value_label = Vec::new();
+        let mut document = Vec::new();
+        let mut integer_info = Vec::new();
+        let mut float_info = Vec::new();
+        let mut var_display = Vec::new();
+        let mut multiple_response = Vec::new();
+        let mut long_string_value_labels = Vec::new();
+        let mut long_string_missing_values = Vec::new();
+        let mut encoding = Vec::new();
+        let mut number_of_cases = Vec::new();
+        let mut variable_sets = Vec::new();
+        let mut product_info = Vec::new();
+        let mut long_names = Vec::new();
+        let mut very_long_strings = Vec::new();
+        let mut file_attributes = Vec::new();
+        let mut variable_attributes = Vec::new();
+        let mut other_extension = Vec::new();
+        let mut end_of_headers = Vec::new();
+        let mut z_header = Vec::new();
+        let mut z_trailer = Vec::new();
+        let mut cases = Vec::new();
+
+        // Bucket every record by its type.
+        for header in headers {
+            match header {
+                DecodedRecord::Header(record) => {
+                    file_header.push(record);
+                }
+                DecodedRecord::Variable(record) => {
+                    variable.push(record);
+                }
+                DecodedRecord::ValueLabel(record) => {
+                    value_label.push(record);
+                }
+                DecodedRecord::Document(record) => {
+                    document.push(record);
+                }
+                DecodedRecord::IntegerInfo(record) => {
+                    integer_info.push(record);
+                }
+                DecodedRecord::FloatInfo(record) => {
+                    float_info.push(record);
+                }
+                DecodedRecord::VariableSets(record) => {
+                    variable_sets.push(record);
+                }
+                DecodedRecord::VarDisplay(record) => {
+                    var_display.push(record);
+                }
+                DecodedRecord::MultipleResponse(record) => {
+                    multiple_response.push(record);
+                }
+                DecodedRecord::LongStringValueLabels(record) => {
+                    long_string_value_labels.push(record)
+                }
+                DecodedRecord::LongStringMissingValues(record) => {
+                    long_string_missing_values.push(record);
+                }
+                DecodedRecord::Encoding(record) => {
+                    encoding.push(record);
+                }
+                DecodedRecord::NumberOfCases(record) => {
+                    number_of_cases.push(record);
+                }
+                DecodedRecord::ProductInfo(record) => {
+                    product_info.push(record);
+                }
+                DecodedRecord::LongNames(record) => {
+                    long_names.push(record);
+                }
+                DecodedRecord::VeryLongStrings(record) => {
+                    very_long_strings.push(record);
+                }
+                DecodedRecord::FileAttributes(record) => {
+                    file_attributes.push(record);
+                }
+                DecodedRecord::VariableAttributes(record) => {
+                    variable_attributes.push(record);
+                }
+                DecodedRecord::OtherExtension(record) => {
+                    other_extension.push(record);
+                }
+                DecodedRecord::EndOfHeaders(record) => {
+                    end_of_headers.push(record);
+                }
+                DecodedRecord::ZHeader(record) => {
+                    z_header.push(record);
+                }
+                DecodedRecord::ZTrailer(record) => {
+                    z_trailer.push(record);
+                }
+                DecodedRecord::Cases(record) => {
+                    cases.push(record);
+                }
+            }
+        }
+
+        // The file header is mandatory; a duplicate is a warning only.
+        let Some(file_header) = take_first(file_header, || warn(Error::DuplicateHeaderRecord))
+        else {
+            return Err(Error::MissingHeaderRecord);
+        };
+
+        // NOTE(review): duplicate singleton records currently warn with the
+        // placeholder `Error::TBD` — replace with specific variants later.
+        Ok(Headers {
+            header: file_header,
+            variable,
+            value_label,
+            document,
+            integer_info: take_first(integer_info, || warn(Error::TBD)),
+            float_info: take_first(float_info, || warn(Error::TBD)),
+            var_display: take_first(var_display, || warn(Error::TBD)),
+            multiple_response,
+            long_string_value_labels,
+            long_string_missing_values,
+            encoding: take_first(encoding, || warn(Error::TBD)),
+            number_of_cases: take_first(number_of_cases, || warn(Error::TBD)),
+            variable_sets,
+            product_info: take_first(product_info, || warn(Error::TBD)),
+            long_names,
+            very_long_strings,
+            file_attributes,
+            variable_attributes,
+            other_extension,
+            end_of_headers: take_first(end_of_headers, || warn(Error::TBD)),
+            z_header: take_first(z_header, || warn(Error::TBD)),
+            z_trailer: take_first(z_trailer, || warn(Error::TBD)),
+            cases: take_first(cases, || warn(Error::TBD)),
+        })
+    }
+}
+
/// Data file info that doesn't fit in [Dictionary].
pub struct Metadata {
    /// File creation date and time, parsed from the header record.
    creation: NaiveDateTime,
    /// Byte order of the source file.
    endian: Endian,
    /// Compression scheme in use, if any.
    compression: Option<Compression>,
    /// Number of cases claimed by the header, if it recorded one.
    n_cases: Option<u64>,
    /// Product name taken from the header's eye-catcher string.
    product: String,
    /// Extra product info from the product-info record, if present,
    /// with line endings normalized.
    product_ext: Option<String>,
    /// Version triple from the integer-info record, if present.
    version: Option<(i32, i32, i32)>,
}
+
+impl Metadata {
+ fn decode(headers: &Headers, warn: impl Fn(Error)) -> Self {
+ let header = &headers.header;
+ let creation_date = NaiveDate::parse_from_str(&header.creation_date, "%e %b %Y")
+ .unwrap_or_else(|_| {
+ warn(Error::InvalidCreationDate {
+ creation_date: header.creation_date.to_string(),
+ });
+ Default::default()
+ });
+ let creation_time = NaiveTime::parse_from_str(&header.creation_time, "%H:%M:%S")
+ .unwrap_or_else(|_| {
+ warn(Error::InvalidCreationTime {
+ creation_time: header.creation_time.to_string(),
+ });
+ Default::default()
+ });
+ let creation = NaiveDateTime::new(creation_date, creation_time);
+
+ let product = header
+ .eye_catcher
+ .trim_start_matches("@(#) SPSS DATA FILE")
+ .trim_end()
+ .to_string();
+
+ Self {
+ creation,
+ endian: header.endian,
+ compression: header.compression,
+ n_cases: header.n_cases.map(|n| n as u64),
+ product,
+ product_ext: headers.product_info.as_ref().map(|pe| fix_line_ends(&pe.0)),
+ version: headers.integer_info.as_ref().map(|ii| ii.version),
+ }
+ }
+}
+
/// State needed while decoding raw records into a [Dictionary].
struct Decoder {
    //pub raw: raw::Decoder,
    /// Character encoding used for all identifiers and strings in the file.
    pub encoding: &'static Encoding,
    //pub variables: HashMap<DictIndex, Variable>,
    //pub var_names: HashMap<Identifier, DictIndex>,
    //pub dictionary: Dictionary,
    //n_dict_indexes: usize,
    /// Counter used by `generate_name` to synthesize `VARnnn` names.
    n_generated_names: usize,
}
+
+impl Decoder {
+ fn generate_name(&mut self, dictionary: &Dictionary) -> Identifier {
+ loop {
+ self.n_generated_names += 1;
+ let name = Identifier::from_encoding(&format!("VAR{:03}", self.n_generated_names), self.encoding)
+ .unwrap();
+ if !dictionary.variables.contains(&name.0) {
+ return name;
+ }
+ assert!(self.n_generated_names < usize::MAX);
+ }
+ }
+}
+
/// Decodes raw `headers` into a [Dictionary] plus the file-level [Metadata]
/// that doesn't fit in a dictionary, reporting recoverable problems through
/// `warn`.
pub fn decode(
    mut headers: Headers,
    encoding: &'static Encoding,
    warn: impl Fn(Error),
) -> Result<(Dictionary, Metadata), Error> {
    let mut dictionary = Dictionary::new(encoding);

    // A file label padded entirely with spaces means "no label".
    let file_label = fix_line_ends(headers.header.file_label.trim_end_matches(' '));
    if !file_label.is_empty() {
        dictionary.file_label = Some(file_label);
    }

    // Merge every file-attributes record into the dictionary's attribute map.
    for attributes in headers.file_attributes.drain(..) {
        dictionary.attributes.extend(attributes.0 .0.into_iter())
    }

    // Concatenate all the document records (really there should only be one)
    // and trim off the trailing spaces that pad them to 80 bytes.
    dictionary.documents = headers
        .document
        .drain(..)
        .flat_map(|record| record.lines)
        .map(trim_end_spaces)
        .collect();

    // XXX warn for weird integer format
    // XXX warn for weird floating-point format, etc.

    let mut decoder = Decoder {
        encoding,
        n_generated_names: 0,
    };

    // Build the dictionary's variables.  `var_index_map` maps the 0-based
    // position of each variable record in the file (long strings occupy one
    // record per 8-byte segment) to the variable's index in `dictionary`.
    // Note: the same iterator is advanced both here and inside the loop body
    // (to skip continuation records), so the two must stay in sync.
    let mut header_vars = headers.variable.iter().enumerate();
    let mut var_index_map = HashMap::new();
    while let Some((value_index, input)) = header_vars.next() {
        let name = trim_end_spaces(input.name.to_string());
        // Use the recorded name unless it is invalid or collides with a
        // variable already added; in either case fall back to a generated
        // name and warn.
        let name = match Identifier::from_encoding(&name, encoding) {
            Ok(name) => {
                if !dictionary.variables.contains(&name.0) {
                    name
                } else {
                    let new_name = decoder.generate_name(&dictionary);
                    warn(Error::DuplicateVariableName {
                        duplicate_name: name.clone(),
                        new_name: new_name.clone(),
                    });
                    new_name
                }
            }
            Err(id_error) => {
                let new_name = decoder.generate_name(&dictionary);
                warn(Error::InvalidVariableName {
                    id_error,
                    new_name: new_name.clone(),
                });
                new_name
            }
        };
        let mut variable = Variable::new(name.clone(), VarWidth::from_raw(input.width).unwrap());

        // Set the short name the same as the long name (even if we renamed it).
        variable.short_names = vec![name];

        variable.label = input.label.clone();

        variable.missing_values = input.missing_values.clone();

        variable.print_format = decode_format(
            input.print_format,
            variable.width,
            |new_spec, format_error| {
                warn(Error::InvalidPrintFormat {
                    new_spec,
                    variable: variable.name.clone(),
                    format_error,
                })
            },
        );
        variable.write_format = decode_format(
            input.write_format,
            variable.width,
            |new_spec, format_error| {
                warn(Error::InvalidWriteFormat {
                    new_spec,
                    variable: variable.name.clone(),
                    format_error,
                })
            },
        );

        // Skip long string continuation records.  A string of width w takes
        // ceil(w / 8) records; every record after the first must report
        // width -1.
        if input.width > 0 {
            #[allow(unstable_name_collisions)]
            for _ in 1..input.width.div_ceil(&8) {
                if let Some((_, continuation)) = header_vars.next() {
                    if continuation.width == -1 {
                        continue;
                    }
                }
                // Continuation record missing or not marked as one.
                return Err(Error::TBD);
            }
        }

        let dict_index = dictionary.add_var(variable).unwrap();
        // Each record position must map to exactly one dictionary index.
        assert_eq!(var_index_map.insert(value_index, dict_index), None);
    }

    // Validate each value-label record against the variables it names.
    for record in headers.value_label.drain(..) {
        let mut dict_indexes = Vec::with_capacity(record.dict_indexes.len());
        let mut continuation_indexes = Vec::new();
        let mut long_string_variables = Vec::new();
        for value_index in record.dict_indexes.iter() {
            // Record indexes in the file are 1-based; an index that isn't in
            // `var_index_map` points into a long-string continuation record.
            if let Some(dict_index) = var_index_map.get(&(*value_index as usize - 1)) {
                let variable = &dictionary.variables[*dict_index];
                if variable.width.is_long_string() {
                    // Long strings get their labels from a separate record
                    // type, so labels here are invalid.
                    long_string_variables.push(variable.name.clone());
                } else {
                    dict_indexes.push(*dict_index);
                }
            } else {
                continuation_indexes.push(*value_index);
            }
        }
        if !continuation_indexes.is_empty() {
            warn(Error::LongStringContinuationIndexes {
                offset: record.offsets.start,
                indexes: continuation_indexes,
            });
        }
        if !long_string_variables.is_empty() {
            warn(Error::InvalidLongStringValueLabels {
                offsets: record.offsets.clone(),
                variables: long_string_variables,
            });
        }

        for dict_index in dict_indexes {
            let mut variable = &dictionary.variables[dict_index];
            for ValueLabel { value, label } in record.labels.iter().cloned() {
                // TODO(review): unfinished — the labels are iterated but never
                // attached to `variable` (note the unused bindings and unused
                // `mut`); presumably each (value, label) pair should be stored
                // on the variable.  Confirm intended behavior.
            }
        }
    }

    let metadata = Metadata::decode(&headers, warn);
    Ok((dictionary, metadata))
}
+
/// Removes trailing ASCII spaces (only spaces, not other whitespace) from `s`
/// in place, reusing its allocation, and returns it.
fn trim_end_spaces(mut s: String) -> String {
    while s.ends_with(' ') {
        s.pop();
    }
    s
}
+
/// Returns a copy of `s` in which all lone CR and CR LF pairs have been
/// replaced by LF.
///
/// (A product that identifies itself as VOXCO INTERVIEWER 4.3 produces system
/// files that use CR-only line ends in the file label and extra product info.)
fn fix_line_ends(s: &str) -> String {
    let mut out = String::with_capacity(s.len());
    let mut rest = s;
    while let Some(pos) = rest.find('\r') {
        // Copy everything up to the CR, then emit a single LF for the CR (and
        // for the LF that may immediately follow it).
        out.push_str(&rest[..pos]);
        out.push('\n');
        rest = &rest[pos + 1..];
        rest = rest.strip_prefix('\n').unwrap_or(rest);
    }
    out.push_str(rest);
    out
}
+
+fn decode_format(raw: raw::Spec, width: VarWidth, warn: impl Fn(Format, FormatError)) -> Format {
+ UncheckedFormat::try_from(raw)
+ .and_then(Format::try_from)
+ .and_then(|x| x.check_width_compatibility(width))
+ .unwrap_or_else(|error| {
+ let new_format = Format::default_for_width(width);
+ warn(new_format, error);
+ new_format
+ })
+}
+
+/*
+impl Decoder {
+ fn generate_name(&mut self) -> Identifier {
+ loop {
+ self.n_generated_names += 1;
+ let name = Identifier::new(&format!("VAR{:03}", self.n_generated_names), self.encoding)
+ .unwrap();
+ if !self.var_names.contains_key(&name) {
+ return name;
+ }
+ assert!(self.n_generated_names < usize::MAX);
+ }
+ }
+ fn decode_string_cow<'a>(&self, input: &'a [u8], warn: &impl Fn(Error)) -> Cow<'a, str> {
+ let (output, malformed) = self.encoding.decode_without_bom_handling(input);
+ if malformed {
+ warn(Error::MalformedString {
+ encoding: self.encoding.name().into(),
+ text: output.clone().into(),
+ });
+ }
+ output
+ }
+ fn decode_string(&self, input: &[u8], warn: &impl Fn(Error)) -> String {
+ self.decode_string_cow(input, warn).into()
+ }
+ pub fn decode_identifier(
+ &self,
+ input: &[u8],
+ warn: &impl Fn(Error),
+ ) -> Result<Identifier, IdError> {
+ let s = self.decode_string_cow(input, warn);
+ Identifier::new(&s, self.encoding)
+ }
+ fn get_var_by_index(&self, dict_index: usize) -> Result<&Variable, Error> {
+ let max_index = self.n_dict_indexes;
+ if dict_index == 0 || dict_index > max_index {
+ return Err(Error::InvalidDictIndex {
+ dict_index,
+ max_index,
+ });
+ }
+ let Some(variable) = self.variables.get(&(dict_index - 1)) else {
+ return Err(Error::DictIndexIsContinuation(dict_index));
+ };
+ Ok(variable)
+ }
+
+ /// Returns `input` decoded from `self.encoding` into UTF-8 such that
+ /// re-encoding the result back into `self.encoding` will have exactly the
+ /// same length in bytes.
+ ///
+ /// XXX warn about errors?
+ fn decode_exact_length<'a>(&self, input: &'a [u8]) -> Cow<'a, str> {
+ if let (s, false) = self.encoding.decode_without_bom_handling(input) {
+ // This is the common case. Usually there will be no errors.
+ s
+ } else {
+ // Unusual case. Don't bother to optimize it much.
+ let mut decoder = self.encoding.new_decoder_without_bom_handling();
+ let mut output = String::with_capacity(
+ decoder
+ .max_utf8_buffer_length_without_replacement(input.len())
+ .unwrap(),
+ );
+ let mut rest = input;
+ while !rest.is_empty() {
+ match decoder.decode_to_string_without_replacement(rest, &mut output, true) {
+ (DecoderResult::InputEmpty, _) => break,
+ (DecoderResult::OutputFull, _) => unreachable!(),
+ (DecoderResult::Malformed(a, b), consumed) => {
+ let skipped = a as usize + b as usize;
+ output.extend(repeat('?').take(skipped));
+ rest = &rest[consumed..];
+ }
+ }
+ }
+ assert_eq!(self.encoding.encode(&output).0.len(), input.len());
+ output.into()
+ }
+ }
+}
+
+pub trait TryDecode: Sized {
+ type Input<'a>;
+ fn try_decode(
+ decoder: &mut Decoder,
+ input: &Self::Input<'_>,
+ warn: impl Fn(Error),
+ ) -> Result<Option<Self>, Error>;
+}
+
+pub trait Decode<Input>: Sized {
+ fn decode(decoder: &Decoder, input: &Input, warn: impl Fn(Error)) -> Self;
+}
+
+impl<const N: usize> Decode<RawStr<N>> for String {
+ fn decode(decoder: &Decoder, input: &RawStr<N>, warn: impl Fn(Error)) -> Self {
+ decoder.decode_string(&input.0, &warn)
+ }
+}
+*/
+/*
+#[derive(Clone, Debug)]
+pub struct HeaderRecord {
+ pub eye_catcher: String,
+ pub weight_index: Option<usize>,
+ pub n_cases: Option<u64>,
+ pub creation: NaiveDateTime,
+ pub file_label: String,
+}
+
+fn trim_end_spaces(mut s: String) -> String {
+ s.truncate(s.trim_end_matches(' ').len());
+ s
+}
+
+/// Data file info that doesn't fit in [Dictionary].
+pub struct Metadata {
+ creation: NaiveDateTime,
+ endian: Endian,
+ compression: Option<Compression>,
+ n_cases: Option<u64>,
+ product: String,
+ product_ext: Option<String>,
+ version: Option<(i32, i32, i32)>,
+}
+
+impl Metadata {
+ fn decode(
+ header: &crate::raw::HeaderRecord<Cow<str>>,
+ integer_info: Option<&IntegerInfoRecord>,
+ product_ext: Option<&ProductInfoRecord>,
+ warn: impl Fn(Error),
+ ) -> Self {
+ let creation_date = NaiveDate::parse_from_str(&header.creation_date, "%e %b %Y")
+ .unwrap_or_else(|_| {
+ warn(Error::InvalidCreationDate {
+ creation_date: header.creation_date.to_string(),
+ });
+ Default::default()
+ });
+ let creation_time = NaiveTime::parse_from_str(&header.creation_time, "%H:%M:%S")
+ .unwrap_or_else(|_| {
+ warn(Error::InvalidCreationTime {
+ creation_time: header.creation_time.to_string(),
+ });
+ Default::default()
+ });
+ let creation = NaiveDateTime::new(creation_date, creation_time);
+
+ let product = header
+ .eye_catcher
+ .trim_start_matches("@(#) SPSS DATA FILE")
+ .trim_end()
+ .to_string();
+
+ Self {
+ creation,
+ endian: header.endian,
+ compression: header.compression,
+ n_cases: header.n_cases.map(|n| n as u64),
+ product,
+ product_ext: product_ext.map(|pe| pe.0.clone()),
+ version: integer_info.map(|ii| ii.version),
+ }
+ }
+}
+
+impl TryDecode for HeaderRecord {
+ type Input<'a> = crate::raw::HeaderRecord<Cow<'a, str>>;
+
+ fn try_decode(
+ _decoder: &mut Decoder,
+ input: &Self::Input<'_>,
+ warn: impl Fn(Error),
+ ) -> Result<Option<Self>, Error> {
+ let eye_catcher = trim_end_spaces(input.eye_catcher.to_string());
+ let file_label = trim_end_spaces(input.file_label.to_string());
+ let creation_date = NaiveDate::parse_from_str(&input.creation_date, "%e %b %Y")
+ .unwrap_or_else(|_| {
+ warn(Error::InvalidCreationDate {
+ creation_date: input.creation_date.to_string(),
+ });
+ Default::default()
+ });
+ let creation_time = NaiveTime::parse_from_str(&input.creation_time, "%H:%M:%S")
+ .unwrap_or_else(|_| {
+ warn(Error::InvalidCreationTime {
+ creation_time: input.creation_time.to_string(),
+ });
+ Default::default()
+ });
+ Ok(Some(HeaderRecord {
+ eye_catcher,
+ weight_index: input.weight_index.map(|n| n as usize),
+ n_cases: input.n_cases.map(|n| n as u64),
+ creation: NaiveDateTime::new(creation_date, creation_time),
+ file_label,
+ }))
+ }
+}
+
+#[derive(Clone, Debug)]
+pub struct VariableRecord {
+ pub width: VarWidth,
+ pub name: Identifier,
+ pub print_format: Spec,
+ pub write_format: Spec,
+ pub missing_values: MissingValues<String>,
+ pub label: Option<String>,
+}
+
+
+fn parse_variable_record(
+ decoder: &mut Decoder,
+ input: &raw::VariableRecord<Cow<str>, String>,
+ warn: impl Fn(Error),
+) -> Result<(), Error> {
+ let width = match input.width {
+ 0 => VarWidth::Numeric,
+ w @ 1..=255 => VarWidth::String(w as u16),
+ -1 => return Ok(()),
+ _ => {
+ return Err(Error::InvalidVariableWidth {
+ offsets: input.offsets.clone(),
+ width: input.width,
+ })
+ }
+ };
+ let name = trim_end_spaces(input.name.to_string());
+ let name = match Identifier::new(&name, decoder.encoding) {
+ Ok(name) => {
+ if !decoder.var_names.contains_key(&name) {
+ name
+ } else {
+ let new_name = decoder.generate_name();
+ warn(Error::DuplicateVariableName {
+ duplicate_name: name.clone(),
+ new_name: new_name.clone(),
+ });
+ new_name
+ }
+ }
+ Err(id_error) => {
+ let new_name = decoder.generate_name();
+ warn(Error::InvalidVariableName {
+ id_error,
+ new_name: new_name.clone(),
+ });
+ new_name
+ }
+ };
+ let variable = Variable {
+ dict_index: decoder.n_dict_indexes,
+ short_name: name.clone(),
+ long_name: None,
+ width,
+ };
+ decoder.n_dict_indexes += width.n_dict_indexes();
+ assert!(decoder
+ .var_names
+ .insert(name.clone(), variable.dict_index)
+ .is_none());
+ assert!(decoder
+ .variables
+ .insert(variable.dict_index, variable)
+ .is_none());
+
+ let print_format = decode_format(input.print_format, width, |new_spec, format_error| {
+ warn(Error::InvalidPrintFormat {
+ new_spec,
+ variable: name.clone(),
+ format_error,
+ })
+ });
+ let write_format = decode_format(input.write_format, width, |new_spec, format_error| {
+ warn(Error::InvalidWriteFormat {
+ new_spec,
+ variable: name.clone(),
+ format_error,
+ })
+ });
+ let mut variable = dictionary::Variable::new(name, width);
+ variable.print_format = print_format;
+ variable.write_format = write_format;
+ variable.missing_values = input.missing_values.clone();
+ if let Some(ref label) = input.label {
+ variable.label = Some(label.to_string());
+ }
+ decoder.dictionary.add_var(variable).unwrap();
+ Ok(())
+}
+
+#[derive(Clone, Debug)]
+pub struct DocumentRecord(Vec<String>);
+
+impl TryDecode for DocumentRecord {
+ type Input<'a> = crate::raw::DocumentRecord<RawDocumentLine>;
+
+ fn try_decode(
+ decoder: &mut Decoder,
+ input: &Self::Input<'_>,
+ warn: impl Fn(Error),
+ ) -> Result<Option<Self>, Error> {
+ Ok(Some(DocumentRecord(
+ input
+ .lines
+ .iter()
+ .map(|s| trim_end_spaces(decoder.decode_string(&s.0, &warn)))
+ .collect(),
+ )))
+ }
+}
+
+trait TextRecord
+where
+ Self: Sized,
+{
+ const NAME: &'static str;
+ fn parse(input: &str, warn: impl Fn(Error)) -> Result<Self, Error>;
+}
+
+#[derive(Clone, Debug)]
+pub struct VariableSet {
+ pub name: String,
+ pub vars: Vec<String>,
+}
+
+impl VariableSet {
+ fn parse(input: &str) -> Result<Self, Error> {
+ let (name, input) = input.split_once('=').ok_or(Error::TBD)?;
+ let vars = input.split_ascii_whitespace().map(String::from).collect();
+ Ok(VariableSet {
+ name: name.into(),
+ vars,
+ })
+ }
+}
+
+trait WarnOnError<T> {
+ fn warn_on_error<F: Fn(Error)>(self, warn: &F) -> Option<T>;
+}
+impl<T> WarnOnError<T> for Result<T, Error> {
+ fn warn_on_error<F: Fn(Error)>(self, warn: &F) -> Option<T> {
+ match self {
+ Ok(result) => Some(result),
+ Err(error) => {
+ warn(error);
+ None
+ }
+ }
+ }
+}
+
+#[derive(Clone, Debug)]
+pub struct ValueLabel {
+ pub value: Value,
+ pub label: String,
+}
+
+#[derive(Clone, Debug)]
+pub struct ValueLabelRecord {
+ pub var_type: VarType,
+ pub labels: Vec<ValueLabel>,
+ pub variables: Vec<Identifier>,
+}
+
+impl TryDecode for ValueLabelRecord {
+ type Input<'a> = crate::raw::ValueLabelRecord<RawStr<8>, RawString>;
+ fn try_decode(
+ decoder: &mut Decoder,
+ input: &Self::Input<'_>,
+ warn: impl Fn(Error),
+ ) -> Result<Option<ValueLabelRecord>, Error> {
+ let variables: Vec<&Variable> = input
+ .dict_indexes
+ .iter()
+ .filter_map(|&dict_index| {
+ decoder
+ .get_var_by_index(dict_index as usize)
+ .warn_on_error(&warn)
+ })
+ .filter(|&variable| match variable.width {
+ VarWidth::String(width) if width > 8 => {
+ warn(Error::InvalidLongStringValueLabel(
+ variable.short_name.clone(),
+ ));
+ false
+ }
+ _ => true,
+ })
+ .collect();
+ let mut i = variables.iter();
+ let Some(&first_var) = i.next() else {
+ return Ok(None);
+ };
+ let var_type: VarType = first_var.width.into();
+ for &variable in i {
+ let this_type: VarType = variable.width.into();
+ if var_type != this_type {
+ let (numeric_var, string_var) = match var_type {
+ VarType::Numeric => (first_var, variable),
+ VarType::String => (variable, first_var),
+ };
+ warn(Error::ValueLabelsDifferentTypes {
+ numeric_var: numeric_var.short_name.clone(),
+ string_var: string_var.short_name.clone(),
+ });
+ return Ok(None);
+ }
+ }
+ let labels = input
+ .labels
+ .iter()
+ .map(|raw::ValueLabel { value, label }| {
+ let label = decoder.decode_string(&label.0, &warn);
+ let value = Value::decode(value, decoder);
+ ValueLabel { value, label }
+ })
+ .collect();
+ let variables = variables
+ .iter()
+ .map(|&variable| variable.short_name.clone())
+ .collect();
+ Ok(Some(ValueLabelRecord {
+ var_type,
+ labels,
+ variables,
+ }))
+ }
+}
+
+#[derive(Clone, Debug)]
+pub struct VariableSetRecord(Vec<VariableSet>);
+
+impl TextRecord for VariableSetRecord {
+ const NAME: &'static str = "variable set";
+ fn parse(input: &str, warn: impl Fn(Error)) -> Result<Self, Error> {
+ let mut sets = Vec::new();
+ for line in input.lines() {
+ if let Some(set) = VariableSet::parse(line).warn_on_error(&warn) {
+ sets.push(set)
+ }
+ }
+ Ok(VariableSetRecord(sets))
+ }
+}
+
+#[derive(Clone, Debug)]
+pub struct LongName {
+ pub short_name: Identifier,
+ pub long_name: Identifier,
+}
+
+impl LongName {
+ fn new(decoder: &mut Decoder, short_name: &str, long_name: &str) -> Result<LongName, Error> {
+ let short_name =
+ Identifier::new(short_name, decoder.encoding).map_err(Error::InvalidShortName)?;
+ let long_name =
+ Identifier::new(long_name, decoder.encoding).map_err(Error::InvalidLongName)?;
+ Ok(LongName {
+ short_name,
+ long_name,
+ })
+ }
+}
+
+#[derive(Clone, Debug)]
+pub struct LongNameRecord(Vec<LongName>);
+
+impl LongNameRecord {
+ pub fn parse(decoder: &mut Decoder, input: &str, warn: impl Fn(Error)) -> Result<Self, Error> {
+ let mut names = Vec::new();
+ for pair in input.split('\t').filter(|s| !s.is_empty()) {
+ if let Some((short_name, long_name)) = pair.split_once('=') {
+ if let Some(long_name) =
+ LongName::new(decoder, short_name, long_name).warn_on_error(&warn)
+ {
+ names.push(long_name);
+ }
+ } else {
+ warn(Error::TBD)
+ }
+ }
+ Ok(LongNameRecord(names))
+ }
+}
+
+#[derive(Clone, Debug)]
+pub struct VeryLongString {
+ pub short_name: Identifier,
+ pub length: u16,
+}
+
+impl VeryLongString {
+ fn parse(decoder: &Decoder, input: &str) -> Result<VeryLongString, Error> {
+ let Some((short_name, length)) = input.split_once('=') else {
+ return Err(Error::TBD);
+ };
+ let short_name =
+ Identifier::new(short_name, decoder.encoding).map_err(Error::InvalidLongStringName)?;
+ let length: u16 = length.parse().map_err(|_| Error::TBD)?;
+ if length > VarWidth::MAX_STRING {
+ return Err(Error::TBD);
+ }
+ Ok(VeryLongString { short_name, length })
+ }
+}
+
+#[derive(Clone, Debug)]
+pub struct VeryLongStringRecord(Vec<VeryLongString>);
+
+impl VeryLongStringRecord {
+ pub fn parse(decoder: &Decoder, input: &str, warn: impl Fn(Error)) -> Result<Self, Error> {
+ let mut very_long_strings = Vec::new();
+ for tuple in input
+ .split('\0')
+ .map(|s| s.trim_end_matches('\t'))
+ .filter(|s| !s.is_empty())
+ {
+ if let Some(vls) = VeryLongString::parse(decoder, tuple).warn_on_error(&warn) {
+ very_long_strings.push(vls)
+ }
+ }
+ Ok(VeryLongStringRecord(very_long_strings))
+ }
+}
+
+#[derive(Clone, Debug)]
+pub struct Attribute {
+ pub name: Identifier,
+ pub values: Vec<String>,
+}
+
+impl Attribute {
+ fn parse<'a>(
+ decoder: &Decoder,
+ input: &'a str,
+ warn: &impl Fn(Error),
+ ) -> Result<(Option<Attribute>, &'a str), Error> {
+ let Some((name, mut input)) = input.split_once('(') else {
+ return Err(Error::TBD);
+ };
+ let mut values = Vec::new();
+ loop {
+ let Some((value, rest)) = input.split_once('\n') else {
+ return Err(Error::TBD);
+ };
+ if let Some(stripped) = value
+ .strip_prefix('\'')
+ .and_then(|value| value.strip_suffix('\''))
+ {
+ values.push(stripped.into());
+ } else {
+ warn(Error::TBD);
+ values.push(value.into());
+ }
+ if let Some(rest) = rest.strip_prefix(')') {
+ let attribute = Identifier::new(name, decoder.encoding)
+ .map_err(Error::InvalidAttributeName)
+ .warn_on_error(warn)
+ .map(|name| Attribute { name, values });
+ return Ok((attribute, rest));
+ };
+ input = rest;
+ }
+ }
+}
+
+#[derive(Clone, Debug)]
+pub struct AttributeSet(pub Vec<Attribute>);
+
+impl AttributeSet {
+ fn parse<'a>(
+ decoder: &Decoder,
+ mut input: &'a str,
+ sentinel: Option<char>,
+ warn: &impl Fn(Error),
+ ) -> Result<(AttributeSet, &'a str), Error> {
+ let mut attributes = Vec::new();
+ let rest = loop {
+ match input.chars().next() {
+ None => break input,
+ c if c == sentinel => break &input[1..],
+ _ => {
+ let (attribute, rest) = Attribute::parse(decoder, input, &warn)?;
+ if let Some(attribute) = attribute {
+ attributes.push(attribute);
+ }
+ input = rest;
+ }
+ }
+ };
+ Ok((AttributeSet(attributes), rest))
+ }
+}
+
+#[derive(Clone, Debug)]
+pub struct FileAttributeRecord(AttributeSet);
+
+impl FileAttributeRecord {
+ pub fn parse(decoder: &Decoder, input: &str, warn: impl Fn(Error)) -> Result<Self, Error> {
+ let (set, rest) = AttributeSet::parse(decoder, input, None, &warn)?;
+ if !rest.is_empty() {
+ warn(Error::TBD);
+ }
+ Ok(FileAttributeRecord(set))
+ }
+}
+
+#[derive(Clone, Debug)]
+pub struct VarAttributeSet {
+ pub long_var_name: Identifier,
+ pub attributes: AttributeSet,
+}
+
+impl VarAttributeSet {
+ fn parse<'a>(
+ decoder: &Decoder,
+ input: &'a str,
+ warn: &impl Fn(Error),
+ ) -> Result<(Option<VarAttributeSet>, &'a str), Error> {
+ let Some((long_var_name, rest)) = input.split_once(':') else {
+ return Err(Error::TBD);
+ };
+ let (attributes, rest) = AttributeSet::parse(decoder, rest, Some('/'), warn)?;
+ let var_attribute = Identifier::new(long_var_name, decoder.encoding)
+ .map_err(Error::InvalidAttributeVariableName)
+ .warn_on_error(warn)
+ .map(|name| VarAttributeSet {
+ long_var_name: name,
+ attributes,
+ });
+ Ok((var_attribute, rest))
+ }
+}
+
+#[derive(Clone, Debug)]
+pub struct VariableAttributeRecord(Vec<VarAttributeSet>);
+
+impl VariableAttributeRecord {
+ pub fn parse(decoder: &Decoder, mut input: &str, warn: impl Fn(Error)) -> Result<Self, Error> {
+ let mut var_attribute_sets = Vec::new();
+ while !input.is_empty() {
+ let Some((var_attribute, rest)) =
+ VarAttributeSet::parse(decoder, input, &warn).warn_on_error(&warn)
+ else {
+ break;
+ };
+ if let Some(var_attribute) = var_attribute {
+ var_attribute_sets.push(var_attribute);
+ }
+ input = rest;
+ }
+ Ok(VariableAttributeRecord(var_attribute_sets))
+ }
+}
+
+#[derive(Clone, Debug)]
+pub enum MultipleResponseType {
+ MultipleDichotomy {
+ value: Value,
+ labels: CategoryLabels,
+ },
+ MultipleCategory,
+}
+
+impl MultipleResponseType {
+ fn decode(
+ decoder: &Decoder,
+ mr_set: &Identifier,
+ input: &raw::MultipleResponseType,
+ min_width: VarWidth,
+ warn: &impl Fn(Error),
+ ) -> Result<Self, Error> {
+ let mr_type = match input {
+ raw::MultipleResponseType::MultipleDichotomy { value, labels } => {
+ let value = decoder.decode_string_cow(&value.0, warn);
+ let value = match min_width {
+ VarWidth::Numeric => {
+ let number: f64 = value.trim().parse().map_err(|_| {
+ Error::InvalidMDGroupCountedValue {
+ mr_set: mr_set.clone(),
+ number: value.into(),
+ }
+ })?;
+ Value::Number(Some(number.into()))
+ }
+ VarWidth::String(max_width) => {
+ let value = value.trim_end_matches(' ');
+ let width = value.len();
+ if width > max_width as usize {
+ return Err(Error::TooWideMDGroupCountedValue {
+ mr_set: mr_set.clone(),
+ value: value.into(),
+ width,
+ max_width,
+ });
+ };
+ Value::String(value.into())
+ }
+ };
+ MultipleResponseType::MultipleDichotomy {
+ value,
+ labels: *labels,
+ }
+ }
+ raw::MultipleResponseType::MultipleCategory => MultipleResponseType::MultipleCategory,
+ };
+ Ok(mr_type)
+ }
+}
+
+#[derive(Clone, Debug)]
+pub struct MultipleResponseSet {
+ pub name: Identifier,
+ pub min_width: VarWidth,
+ pub max_width: VarWidth,
+ pub label: String,
+ pub mr_type: MultipleResponseType,
+ pub dict_indexes: Vec<DictIndex>,
+}
+
+impl MultipleResponseSet {
+ fn decode(
+ decoder: &Decoder,
+ input: &raw::MultipleResponseSet<Identifier, Cow<str>>,
+ warn: &impl Fn(Error),
+ ) -> Result<Self, Error> {
+ let mr_set_name = input.name.clone();
+ let mut dict_indexes = Vec::with_capacity(input.short_names.len());
+ for short_name in input.short_names.iter() {
+ let Some(&dict_index) = decoder.var_names.get(&short_name) else {
+ warn(Error::UnknownMrSetVariable {
+ mr_set: mr_set_name.clone(),
+ short_name: short_name.clone(),
+ });
+ continue;
+ };
+ dict_indexes.push(dict_index);
+ }
+
+ match dict_indexes.len() {
+ 0 => return Err(Error::EmptyMrSet(mr_set_name)),
+ 1 => return Err(Error::OneVarMrSet(mr_set_name)),
+ _ => (),
+ }
+
+ let Some((Some(min_width), Some(max_width))) = dict_indexes
+ .iter()
+ .map(|dict_index| decoder.variables[dict_index].width)
+ .map(|w| (Some(w), Some(w)))
+ .reduce(|(na, wa), (nb, wb)| (VarWidth::narrower(na, nb), VarWidth::wider(wa, wb)))
+ else {
+ return Err(Error::MixedMrSet(mr_set_name));
+ };
+
+ let mr_type =
+ MultipleResponseType::decode(decoder, &mr_set_name, &input.mr_type, min_width, warn)?;
+
+ Ok(MultipleResponseSet {
+ name: mr_set_name,
+ min_width,
+ max_width,
+ label: input.label.to_string(),
+ mr_type,
+ dict_indexes,
+ })
+ }
+}
+
+#[derive(Clone, Debug)]
+pub struct MultipleResponseRecord(pub Vec<MultipleResponseSet>);
+
+impl TryDecode for MultipleResponseRecord {
+ type Input<'a> = raw::MultipleResponseRecord<Identifier, Cow<'a, str>>;
+
+ fn try_decode(
+ decoder: &mut Decoder,
+ input: &Self::Input<'_>,
+ warn: impl Fn(Error),
+ ) -> Result<Option<Self>, Error> {
+ let mut sets = Vec::with_capacity(input.0.len());
+ for set in &input.0 {
+ match MultipleResponseSet::decode(decoder, set, &warn) {
+ Ok(set) => sets.push(set),
+ Err(error) => warn(error),
+ }
+ }
+ Ok(Some(MultipleResponseRecord(sets)))
+ }
+}
+
+#[derive(Clone, Debug)]
+pub struct LongStringValueLabels {
+ pub var_name: Identifier,
+ pub width: VarWidth,
+ pub labels: Vec<ValueLabel>,
+}
+
+impl LongStringValueLabels {
+ fn decode(
+ decoder: &Decoder,
+ input: &raw::LongStringValueLabels<RawString>,
+ warn: &impl Fn(Error),
+ ) -> Result<Self, Error> {
+ let var_name = decoder.decode_string(&input.var_name.0, warn);
+ let var_name = Identifier::new(var_name.trim_end(), decoder.encoding)
+ .map_err(Error::InvalidLongStringValueLabelName)?;
+
+ let min_width = 9;
+ let max_width = VarWidth::MAX_STRING;
+ if input.width < 9 || input.width > max_width as u32 {
+ return Err(Error::InvalidLongValueLabelWidth {
+ name: var_name,
+ width: input.width,
+ min_width,
+ max_width,
+ });
+ }
+ let width = input.width as u16;
+
+ let mut labels = Vec::with_capacity(input.labels.len());
+ for (value, label) in input.labels.iter() {
+ let value = Value::String(decoder.decode_exact_length(&value.0).into());
+ let label = decoder.decode_string(&label.0, warn);
+ labels.push(ValueLabel { value, label });
+ }
+
+ Ok(LongStringValueLabels {
+ var_name,
+ width: VarWidth::String(width),
+ labels,
+ })
+ }
+}
+
+#[derive(Clone, Debug)]
+pub struct LongStringValueLabelRecord(pub Vec<LongStringValueLabels>);
+
+impl TryDecode for LongStringValueLabelRecord {
+ type Input<'a> = raw::LongStringValueLabelRecord<RawString>;
+
+ fn try_decode(
+ decoder: &mut Decoder,
+ input: &Self::Input<'_>,
+ warn: impl Fn(Error),
+ ) -> Result<Option<Self>, Error> {
+ let mut labels = Vec::with_capacity(input.0.len());
+ for label in &input.0 {
+ match LongStringValueLabels::decode(decoder, label, &warn) {
+ Ok(set) => labels.push(set),
+ Err(error) => warn(error),
+ }
+ }
+ Ok(Some(LongStringValueLabelRecord(labels)))
+ }
+}
+
+#[cfg(test)]
+mod test {
+ use encoding_rs::WINDOWS_1252;
+
+ #[test]
+ fn test() {
+ let mut s = String::new();
+ s.push(char::REPLACEMENT_CHARACTER);
+ let encoded = WINDOWS_1252.encode(&s).0;
+ let decoded = WINDOWS_1252.decode(&encoded[..]).0;
+ println!("{:?}", decoded);
+ }
+
+ #[test]
+ fn test2() {
+ let charset: Vec<u8> = (0..=255).collect();
+ println!("{}", charset.len());
+ let decoded = WINDOWS_1252.decode(&charset[..]).0;
+ println!("{}", decoded.len());
+ let encoded = WINDOWS_1252.encode(&decoded[..]).0;
+ println!("{}", encoded.len());
+ assert_eq!(&charset[..], &encoded[..]);
+ }
+}
+*/
--- /dev/null
+use std::{
+ cmp::Ordering,
+ collections::{HashMap, HashSet},
+ fmt::Debug,
+ ops::{Bound, RangeBounds},
+};
+
+use encoding_rs::Encoding;
+use indexmap::IndexSet;
+use num::integer::div_ceil;
+use ordered_float::OrderedFloat;
+use unicase::UniCase;
+
+use crate::{
+ format::Format,
+ identifier::{ByIdentifier, HasIdentifier, Identifier},
+ raw::{self, Alignment, CategoryLabels, Decoder, Measure, MissingValues, RawStr, VarType},
+};
+
/// Position of a variable within a dictionary's variable list.
pub type DictIndex = usize;
+
/// The width of a variable: numeric, or a string of a particular length.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum VarWidth {
    /// A numeric variable.
    Numeric,
    /// A string variable with the given width.
    String(u16),
}
+
+impl PartialOrd for VarWidth {
+ fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
+ match (self, other) {
+ (VarWidth::Numeric, VarWidth::Numeric) => Some(Ordering::Equal),
+ (VarWidth::String(a), VarWidth::String(b)) => Some(a.cmp(b)),
+ _ => None,
+ }
+ }
+}
+
+impl VarWidth {
+    /// Maximum width of a string variable, in bytes.
+    pub const MAX_STRING: u16 = 32767;
+
+    /// Returns the number of dictionary indexes occupied by a variable of
+    /// this width: 1 for a numeric variable, one per 8 bytes (rounded up)
+    /// for a string.
+    pub fn n_dict_indexes(self) -> usize {
+        match self {
+            VarWidth::Numeric => 1,
+            VarWidth::String(w) => div_ceil(w as usize, 8),
+        }
+    }
+
+    /// Combines two optional widths with `f` when both are strings; a pair
+    /// of numeric widths stays numeric; any other combination yields `None`.
+    fn width_predicate(
+        a: Option<VarWidth>,
+        b: Option<VarWidth>,
+        f: impl Fn(u16, u16) -> u16,
+    ) -> Option<VarWidth> {
+        match (a, b) {
+            (Some(VarWidth::Numeric), Some(VarWidth::Numeric)) => Some(VarWidth::Numeric),
+            (Some(VarWidth::String(a)), Some(VarWidth::String(b))) => {
+                Some(VarWidth::String(f(a, b)))
+            }
+            _ => None,
+        }
+    }
+
+    /// Returns the wider of `self` and `other`:
+    /// - Numerical variable widths are equally wide.
+    /// - Longer strings are wider than shorter strings.
+    /// - Numerical and string types are incomparable, so result in `None`.
+    /// - Any `None` in the input yields `None` in the output.
+    pub fn wider(a: Option<VarWidth>, b: Option<VarWidth>) -> Option<VarWidth> {
+        Self::width_predicate(a, b, |a, b| a.max(b))
+    }
+
+    /// Returns the narrower of `self` and `other` (see [`Self::wider`]).
+    pub fn narrower(a: Option<VarWidth>, b: Option<VarWidth>) -> Option<VarWidth> {
+        Self::width_predicate(a, b, |a, b| a.min(b))
+    }
+
+    /// Returns the default display width in columns: 8 for a numeric
+    /// variable, and the string width capped at 32 for a string variable.
+    pub fn default_display_width(&self) -> u32 {
+        match self {
+            VarWidth::Numeric => 8,
+            VarWidth::String(width) => *width.min(&32) as u32,
+        }
+    }
+
+    /// Converts a raw width value (0 for numeric, 1..=255 for a string of
+    /// that many bytes) into a `VarWidth`.  Any other value yields `Err(())`.
+    pub fn from_raw(raw: impl Into<i32>) -> Result<Self, ()> {
+        let raw: i32 = raw.into();
+        match raw {
+            0 => Ok(Self::Numeric),
+            1..=255 => Ok(Self::String(raw as u16)),
+            _ => Err(()),
+        }
+    }
+
+    /// Returns true if this is a string wider than 8 bytes (a "long string").
+    pub fn is_long_string(&self) -> bool {
+        if let Self::String(width) = self {
+            *width > 8
+        } else {
+            false
+        }
+    }
+}
+
+impl From<VarWidth> for VarType {
+    /// Collapses a width to its coarse type: any string width becomes
+    /// `VarType::String`, and a numeric width becomes `VarType::Numeric`.
+    fn from(source: VarWidth) -> Self {
+        if let VarWidth::String(_) = source {
+            VarType::String
+        } else {
+            VarType::Numeric
+        }
+    }
+}
+
+/// A data value: a number (possibly absent) or a string.
+#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
+pub enum Value {
+    /// A numeric value, `None` if absent.
+    Number(Option<OrderedFloat<f64>>),
+    /// A string value.
+    String(String),
+}
+
+impl Value {
+    /// Decodes a raw 8-byte value into a `Value`, converting a raw string
+    /// with `decoder`'s character encoding.
+    pub fn decode(raw: &raw::Value<RawStr<8>>, decoder: &Decoder) -> Self {
+        match raw {
+            raw::Value::Number(x) => Value::Number(x.map(|x| x.into())),
+            raw::Value::String(s) => Value::String(decoder.decode_exact_length(&s.0).into()),
+        }
+    }
+}
+
+/// A data dictionary: the set of variables plus file-level metadata.
+#[derive(Clone, Debug)]
+pub struct Dictionary {
+    /// The variables, addressed by `DictIndex`, with unique
+    /// (case-insensitive) names.
+    pub variables: IndexSet<ByIdentifier<Variable>>,
+    /// Dictionary indexes of the split-file variables, if any.
+    pub split_file: Vec<DictIndex>,
+    /// Dictionary index of the weighting variable, if any.
+    pub weight: Option<DictIndex>,
+    /// Dictionary index of the filter variable, if any.
+    pub filter: Option<DictIndex>,
+    /// Maximum number of cases to read, if limited.
+    pub case_limit: Option<u64>,
+    /// File label, if any.
+    pub file_label: Option<String>,
+    /// Document lines attached to the file.
+    pub documents: Vec<String>,
+    /// Named vectors of variables.
+    pub vectors: HashSet<ByIdentifier<Vector>>,
+    /// File-level attributes, keyed by attribute name.
+    pub attributes: HashMap<Identifier, Vec<String>>,
+    /// Multiple-response sets.
+    pub mrsets: HashSet<ByIdentifier<MultipleResponseSet>>,
+    /// Named variable sets.
+    pub variable_sets: HashSet<ByIdentifier<VariableSet>>,
+    /// Character encoding of the dictionary's text.
+    pub encoding: &'static Encoding,
+}
+
+/// Error returned by [`Dictionary::add_var`] when the new variable's name
+/// collides (case-insensitively) with an existing variable.
+#[derive(Debug)]
+pub struct DuplicateVariableName;
+
+impl Dictionary {
+    /// Creates an empty dictionary whose text uses `encoding`.
+    pub fn new(encoding: &'static Encoding) -> Self {
+        Self {
+            variables: IndexSet::new(),
+            split_file: Vec::new(),
+            weight: None,
+            filter: None,
+            case_limit: None,
+            file_label: None,
+            documents: Vec::new(),
+            vectors: HashSet::new(),
+            attributes: HashMap::new(),
+            mrsets: HashSet::new(),
+            variable_sets: HashSet::new(),
+            encoding,
+        }
+    }
+
+    /// Appends `variable` to the dictionary and returns its dictionary
+    /// index, or `Err(DuplicateVariableName)` if a variable with the same
+    /// (case-insensitive) name already exists.
+    pub fn add_var(&mut self, variable: Variable) -> Result<usize, DuplicateVariableName> {
+        let (index, inserted) = self.variables.insert_full(ByIdentifier::new(variable));
+        if inserted {
+            Ok(index)
+        } else {
+            Err(DuplicateVariableName)
+        }
+    }
+
+    /// Moves the variable at `from_index` so that it ends up at `to_index`,
+    /// shifting the variables in between, and rewrites every stored
+    /// dictionary index (weight, filter, split file, vectors, MR sets,
+    /// variable sets) to match.
+    pub fn reorder_var(&mut self, from_index: DictIndex, to_index: DictIndex) {
+        if from_index != to_index {
+            self.variables.move_index(from_index, to_index);
+            self.update_dict_indexes(&|index| {
+                #[allow(clippy::collapsible_else_if)]
+                if index == from_index {
+                    Some(to_index)
+                } else if from_index < to_index {
+                    // Moving forward: indexes in `(from_index, to_index]`
+                    // shift down by one.
+                    if index > from_index && index <= to_index {
+                        Some(index - 1)
+                    } else {
+                        Some(index)
+                    }
+                } else {
+                    // Moving backward: indexes in `[to_index, from_index)`
+                    // shift up by one.
+                    if index >= to_index && index < from_index {
+                        Some(index + 1)
+                    } else {
+                        Some(index)
+                    }
+                }
+            })
+        }
+    }
+
+    /// Deletes every variable for which `keep` returns false, rewriting all
+    /// stored dictionary indexes to account for the deletions.
+    pub fn retain_vars<F>(&mut self, keep: F)
+    where
+        F: Fn(&Variable) -> bool,
+    {
+        let mut deleted = Vec::new();
+        let mut index = 0;
+        self.variables.retain(|var_by_id| {
+            let keep = keep(&var_by_id.0);
+            if !keep {
+                deleted.push(index);
+            }
+            index += 1;
+            keep
+        });
+        if !deleted.is_empty() {
+            // `deleted` is sorted, so for a surviving variable the `Err`
+            // insertion point is the number of deleted variables that
+            // preceded it, which is exactly how far its index shifts down.
+            // (Returning `position` itself would map every survivor to its
+            // deletion count instead of its new index.)
+            self.update_dict_indexes(&|index| match deleted.binary_search(&index) {
+                Ok(_) => None,
+                Err(position) => Some(index - position),
+            })
+        }
+    }
+
+    /// Deletes the variables whose dictionary indexes fall within `range`,
+    /// rewriting all stored dictionary indexes to account for the deletions.
+    pub fn delete_vars<R>(&mut self, range: R)
+    where
+        R: RangeBounds<DictIndex>,
+    {
+        let start = match range.start_bound() {
+            Bound::Included(&start) => start,
+            Bound::Excluded(&start) => start + 1,
+            Bound::Unbounded => 0,
+        };
+        let end = match range.end_bound() {
+            Bound::Included(&end) => end + 1,
+            Bound::Excluded(&end) => end,
+            Bound::Unbounded => self.variables.len(),
+        };
+        if end > start {
+            self.variables.drain(start..end);
+            self.update_dict_indexes(&|index| {
+                if index < start {
+                    Some(index)
+                } else if index < end {
+                    None
+                } else {
+                    // Survivors past the deleted range shift down by the
+                    // number of deleted variables, `end - start`.  (The
+                    // previous `index - end - start` subtracted `start`
+                    // instead of adding it back.)
+                    Some(index - (end - start))
+                }
+            })
+        }
+    }
+
+    /// Applies `f` to every dictionary index stored outside `variables`
+    /// itself, dropping any vector, MR set, or variable set left empty (or,
+    /// for MR sets, with fewer than two members) after the update.
+    fn update_dict_indexes<F>(&mut self, f: &F)
+    where
+        F: Fn(DictIndex) -> Option<DictIndex>,
+    {
+        update_dict_index_vec(&mut self.split_file, f);
+        self.weight = self.weight.and_then(f);
+        self.filter = self.filter.and_then(f);
+        self.vectors = self
+            .vectors
+            .drain()
+            .filter_map(|vector_by_id| {
+                vector_by_id
+                    .0
+                    .with_updated_dict_indexes(f)
+                    .map(ByIdentifier::new)
+            })
+            .collect();
+        self.mrsets = self
+            .mrsets
+            .drain()
+            .filter_map(|mrset_by_id| {
+                mrset_by_id
+                    .0
+                    .with_updated_dict_indexes(f)
+                    .map(ByIdentifier::new)
+            })
+            .collect();
+        self.variable_sets = self
+            .variable_sets
+            .drain()
+            .filter_map(|var_set_by_id| {
+                var_set_by_id
+                    .0
+                    .with_updated_dict_indexes(f)
+                    .map(ByIdentifier::new)
+            })
+            .collect();
+    }
+}
+
+/// Applies `f` to every dictionary index in `dict_indexes`, replacing each
+/// index with the value `f` returns and removing any index for which `f`
+/// returns `None`.
+fn update_dict_index_vec<F>(dict_indexes: &mut Vec<DictIndex>, f: F)
+where
+    F: Fn(DictIndex) -> Option<DictIndex>,
+{
+    dict_indexes.retain_mut(|index| match f(*index) {
+        Some(new) => {
+            *index = new;
+            true
+        }
+        None => false,
+    });
+}
+
+/// The role a variable plays in analysis.
+#[derive(Clone, Copy, Debug, Eq, PartialEq, Hash)]
+pub enum Role {
+    Input,
+    Target,
+    Both,
+    None,
+    Partition,
+    Split,
+}
+
+impl Default for Role {
+    /// The default role is `Input`.
+    fn default() -> Self {
+        Self::Input
+    }
+}
+
+/// The class of a variable, determined by the first character of its name.
+pub enum DictClass {
+    /// An ordinary user variable.
+    Ordinary,
+    /// A system variable, whose name begins with `$`.
+    System,
+    /// A scratch variable, whose name begins with `#`.
+    Scratch,
+}
+
+impl DictClass {
+    /// Classifies `id` by its first character: `$` for system, `#` for
+    /// scratch, anything else for ordinary.
+    pub fn from_identifier(id: &Identifier) -> Self {
+        if id.0.starts_with('$') {
+            Self::System
+        } else if id.0.starts_with('#') {
+            Self::Scratch
+        } else {
+            Self::Ordinary
+        }
+    }
+
+    /// Returns true if variables of this class default to being left
+    /// unchanged between cases (only scratch variables do).
+    pub fn must_leave(self) -> bool {
+        match self {
+            DictClass::Ordinary => false,
+            DictClass::System => false,
+            DictClass::Scratch => true,
+        }
+    }
+}
+
+/// A variable in a [`Dictionary`].
+#[derive(Clone, Debug)]
+pub struct Variable {
+    /// Variable name, unique (case-insensitively) within its dictionary.
+    pub name: Identifier,
+    /// Numeric or string width.
+    pub width: VarWidth,
+    /// User-missing value definitions.
+    pub missing_values: MissingValues,
+    /// Format used to display values.
+    pub print_format: Format,
+    /// Format used to write values.
+    pub write_format: Format,
+    /// Labels for individual values.
+    pub value_labels: HashMap<Value, String>,
+    /// Descriptive label, if any.
+    pub label: Option<String>,
+    /// Measurement level, if known.
+    pub measure: Option<Measure>,
+    /// Role in analysis.
+    pub role: Role,
+    /// Display width in columns.
+    pub display_width: u32,
+    /// Display alignment.
+    pub alignment: Alignment,
+    /// Whether the value is left unchanged between cases.
+    pub leave: bool,
+    /// Short (8-byte era) names used in system files, if any.
+    pub short_names: Vec<Identifier>,
+    /// Variable-level attributes.
+    pub attributes: HashSet<ByIdentifier<Attribute>>,
+}
+
+impl Variable {
+    /// Creates a variable named `name` with width `width`, with all other
+    /// properties set to the defaults for that width (default print/write
+    /// formats, display width, alignment, and a `leave` flag derived from
+    /// the name's dictionary class).
+    pub fn new(name: Identifier, width: VarWidth) -> Self {
+        let var_type = VarType::from_width(width);
+        let leave = DictClass::from_identifier(&name).must_leave();
+        Self {
+            name,
+            width,
+            missing_values: MissingValues::default(),
+            print_format: Format::default_for_width(width),
+            write_format: Format::default_for_width(width),
+            value_labels: HashMap::new(),
+            label: None,
+            measure: Measure::default_for_type(var_type),
+            role: Role::default(),
+            display_width: width.default_display_width(),
+            alignment: Alignment::default_for_type(var_type),
+            leave,
+            short_names: Vec::new(),
+            attributes: HashSet::new(),
+        }
+    }
+}
+
+impl HasIdentifier for Variable {
+    fn identifier(&self) -> &UniCase<String> {
+        &self.name.0
+    }
+}
+
+/// A named, ordered collection of variables, referenced by dictionary index.
+#[derive(Clone, Debug)]
+pub struct Vector {
+    pub name: Identifier,
+    pub variables: Vec<DictIndex>,
+}
+
+impl Vector {
+    /// Rewrites this vector's dictionary indexes with `f`, returning `None`
+    /// if no members remain.
+    fn with_updated_dict_indexes(
+        mut self,
+        f: impl Fn(DictIndex) -> Option<DictIndex>,
+    ) -> Option<Self> {
+        update_dict_index_vec(&mut self.variables, f);
+        (!self.variables.is_empty()).then_some(self)
+    }
+}
+
+impl HasIdentifier for Vector {
+    fn identifier(&self) -> &UniCase<String> {
+        &self.name.0
+    }
+}
+
+/// A named attribute with zero or more string values.
+#[derive(Clone, Debug)]
+pub struct Attribute {
+    pub name: Identifier,
+    pub values: Vec<String>,
+}
+
+impl HasIdentifier for Attribute {
+    fn identifier(&self) -> &UniCase<String> {
+        &self.name.0
+    }
+}
+
+/// A multiple-response set: a named group of variables analyzed together.
+#[derive(Clone, Debug)]
+pub struct MultipleResponseSet {
+    pub name: Identifier,
+    pub label: String,
+    pub mr_type: MultipleResponseType,
+    /// Dictionary indexes of the member variables.
+    pub variables: Vec<DictIndex>,
+}
+
+impl MultipleResponseSet {
+    /// Rewrites this set's dictionary indexes with `f`, returning `None` if
+    /// fewer than two members remain (an MR set needs at least two).
+    fn with_updated_dict_indexes(
+        mut self,
+        f: impl Fn(DictIndex) -> Option<DictIndex>,
+    ) -> Option<Self> {
+        update_dict_index_vec(&mut self.variables, f);
+        (self.variables.len() > 1).then_some(self)
+    }
+}
+
+impl HasIdentifier for MultipleResponseSet {
+    fn identifier(&self) -> &UniCase<String> {
+        &self.name.0
+    }
+}
+
+/// The kind of multiple-response set.
+#[derive(Clone, Debug)]
+pub enum MultipleResponseType {
+    /// A multiple-dichotomy set, counting `value` in each member variable.
+    MultipleDichotomy {
+        value: Value,
+        labels: CategoryLabels,
+    },
+    /// A multiple-category set.
+    MultipleCategory,
+}
+
+/// A named set of variables, referenced by dictionary index.
+#[derive(Clone, Debug)]
+pub struct VariableSet {
+    pub name: Identifier,
+    pub variables: Vec<DictIndex>,
+}
+
+impl VariableSet {
+    /// Rewrites this set's dictionary indexes with `f`, returning `None` if
+    /// no members remain.
+    fn with_updated_dict_indexes(
+        mut self,
+        f: impl Fn(DictIndex) -> Option<DictIndex>,
+    ) -> Option<Self> {
+        update_dict_index_vec(&mut self.variables, f);
+        (!self.variables.is_empty()).then_some(self)
+    }
+}
+
+impl HasIdentifier for VariableSet {
+    fn identifier(&self) -> &UniCase<String> {
+        &self.name.0
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use std::collections::HashSet;
+
+    use unicase::UniCase;
+
+    use crate::identifier::Identifier;
+
+    use super::{ByIdentifier, HasIdentifier};
+
+    /// A minimal stand-in for the real `Variable`, carrying just a name and
+    /// a distinguishing value.
+    #[derive(PartialEq, Eq, Debug, Clone)]
+    struct Variable {
+        name: Identifier,
+        value: i32,
+    }
+
+    impl HasIdentifier for Variable {
+        fn identifier(&self) -> &UniCase<String> {
+            &self.name.0
+        }
+    }
+
+    /// Checks that `ByIdentifier` compares and hashes by name only, so a
+    /// `HashSet` keyed by it deduplicates same-named variables and keeps the
+    /// first insertion.
+    #[test]
+    fn test() {
+        // Variables should not be the same if their values differ.
+        let abcd = Identifier::new("abcd").unwrap();
+        let abcd1 = Variable {
+            name: abcd.clone(),
+            value: 1,
+        };
+        let abcd2 = Variable {
+            name: abcd,
+            value: 2,
+        };
+        assert_ne!(abcd1, abcd2);
+
+        // But `ByName` should treat them the same.
+        let abcd1_by_name = ByIdentifier::new(abcd1);
+        let abcd2_by_name = ByIdentifier::new(abcd2);
+        assert_eq!(abcd1_by_name, abcd2_by_name);
+
+        // And a `HashSet` of `ByName` should also treat them the same.
+        let mut vars: HashSet<ByIdentifier<Variable>> = HashSet::new();
+        assert!(vars.insert(ByIdentifier::new(abcd1_by_name.0.clone())));
+        assert!(!vars.insert(ByIdentifier::new(abcd2_by_name.0.clone())));
+        assert_eq!(
+            vars.get(&UniCase::new(String::from("abcd")))
+                .unwrap()
+                .0
+                .value,
+            1
+        );
+    }
+}
--- /dev/null
+use crate::locale_charset::locale_charset;
+use encoding_rs::{Encoding, UTF_8};
+
+include!(concat!(env!("OUT_DIR"), "/encodings.rs"));
+
+/// Looks up the Windows code-page number for `encoding`, matched
+/// case-insensitively against the generated `CODEPAGE_NAME_TO_NUMBER` table.
+pub fn codepage_from_encoding(encoding: &str) -> Option<u32> {
+    CODEPAGE_NAME_TO_NUMBER
+        .get(encoding.to_ascii_lowercase().as_str())
+        .copied()
+}
+
+use thiserror::Error as ThisError;
+
+/// Errors that can occur while determining a system file's encoding.
+#[derive(ThisError, Debug)]
+pub enum Error {
+    #[error("This system file does not indicate its own character encoding. For best results, specify an encoding explicitly. Use SYSFILE INFO with ENCODING=\"DETECT\" to analyze the possible encodings.")]
+    NoEncoding,
+
+    #[error("This system file encodes text strings with unknown code page {0}.")]
+    UnknownCodepage(i32),
+
+    #[error("This system file encodes text strings with unknown encoding {0}.")]
+    UnknownEncoding(String),
+
+    #[error("This system file is encoded in EBCDIC, which is not supported.")]
+    Ebcdic,
+}
+
+/// Returns the encoding named by the system locale's character set, falling
+/// back to UTF-8 when that name is not a recognized encoding label.  The
+/// result is computed once and cached for the lifetime of the process.
+///
+/// Uses `std::sync::OnceLock` instead of the `lazy_static!` macro, which was
+/// not imported in this module.
+pub fn default_encoding() -> &'static Encoding {
+    static DEFAULT_ENCODING: std::sync::OnceLock<&'static Encoding> = std::sync::OnceLock::new();
+    DEFAULT_ENCODING
+        .get_or_init(|| Encoding::for_label(locale_charset().as_bytes()).unwrap_or(UTF_8))
+}
+
+/// Resolves a system file's character encoding from an explicit encoding
+/// name (preferred) or a numeric character code.  Codepage 1 (EBCDIC) is
+/// rejected outright; 2 and 3 are treated as "no encoding declared"; 4 maps
+/// to Shift-JIS (`MS_KANJI`); anything else is looked up in the generated
+/// codepage table.  Fails with [`Error::NoEncoding`] when neither input is
+/// given.
+pub fn get_encoding(
+    encoding: Option<&str>,
+    character_code: Option<i32>,
+) -> Result<&'static Encoding, Error> {
+    let label = if let Some(encoding) = encoding {
+        encoding
+    } else if let Some(codepage) = character_code {
+        match codepage {
+            1 => return Err(Error::Ebcdic),
+            2 | 3 => {
+                // These ostensibly mean "7-bit ASCII" and "8-bit ASCII"[sic]
+                // respectively. However, many files have character code 2 but
+                // data which are clearly not ASCII. Therefore, ignore these
+                // values.
+                return Err(Error::NoEncoding);
+            }
+            4 => "MS_KANJI",
+            _ => CODEPAGE_NUMBER_TO_NAME
+                .get(&codepage)
+                .copied()
+                .ok_or(Error::UnknownCodepage(codepage))?,
+        }
+    } else {
+        return Err(Error::NoEncoding);
+    };
+
+    Encoding::for_label(label.as_bytes()).ok_or(Error::UnknownEncoding(label.into()))
+}
--- /dev/null
+/// The endianness for integer and floating-point numbers in SPSS system files.
+///
+/// SPSS system files can declare IBM 370 and DEC VAX floating-point
+/// representations, but no file that uses either of these has ever been found
+/// in the wild, so this code does not handle them.
+#[derive(Copy, Clone, Debug, PartialEq, Eq)]
+pub enum Endian {
+    /// Big-endian: MSB at lowest address.
+    Big,
+
+    /// Little-endian: LSB at lowest address.
+    Little,
+}
+
+impl Endian {
+    /// The byte order of the machine this code runs on.
+    #[cfg(target_endian = "big")]
+    pub const NATIVE: Endian = Endian::Big;
+    #[cfg(target_endian = "little")]
+    pub const NATIVE: Endian = Endian::Little;
+
+    /// Infers a byte order by checking which interpretation of `bytes` as a
+    /// `u32` equals `expected_value`.  Returns `None` when neither, or both,
+    /// interpretations match.
+    pub fn identify_u32(expected_value: u32, bytes: [u8; 4]) -> Option<Self> {
+        let big_matches = u32::from_be_bytes(bytes) == expected_value;
+        let little_matches = u32::from_le_bytes(bytes) == expected_value;
+        match (big_matches, little_matches) {
+            (true, false) => Some(Endian::Big),
+            (false, true) => Some(Endian::Little),
+            _ => None,
+        }
+    }
+
+    /// Infers a byte order by checking which interpretation of `bytes` as an
+    /// `f64` equals `expected_value`.  Returns `None` when neither, or both,
+    /// interpretations match.
+    pub fn identify_f64(expected_value: f64, bytes: [u8; 8]) -> Option<Self> {
+        let big_matches = f64::from_be_bytes(bytes) == expected_value;
+        let little_matches = f64::from_le_bytes(bytes) == expected_value;
+        match (big_matches, little_matches) {
+            (true, false) => Some(Endian::Big),
+            (false, true) => Some(Endian::Little),
+            _ => None,
+        }
+    }
+}
+
+/// Converts a native value of type `T` into its `N`-byte representation in
+/// the chosen byte order.
+pub trait ToBytes<T, const N: usize> {
+    /// Returns `value` serialized in `self`'s byte order.
+    fn to_bytes(self, value: T) -> [u8; N];
+}
+impl ToBytes<i64, 8> for Endian {
+    fn to_bytes(self, value: i64) -> [u8; 8] {
+        match self {
+            Endian::Big => i64::to_be_bytes(value),
+            Endian::Little => i64::to_le_bytes(value),
+        }
+    }
+}
+impl ToBytes<u32, 4> for Endian {
+    fn to_bytes(self, value: u32) -> [u8; 4] {
+        match self {
+            Endian::Big => u32::to_be_bytes(value),
+            Endian::Little => u32::to_le_bytes(value),
+        }
+    }
+}
+impl ToBytes<i32, 4> for Endian {
+    fn to_bytes(self, value: i32) -> [u8; 4] {
+        match self {
+            Endian::Big => i32::to_be_bytes(value),
+            Endian::Little => i32::to_le_bytes(value),
+        }
+    }
+}
+impl ToBytes<u16, 2> for Endian {
+    fn to_bytes(self, value: u16) -> [u8; 2] {
+        match self {
+            Endian::Big => u16::to_be_bytes(value),
+            Endian::Little => u16::to_le_bytes(value),
+        }
+    }
+}
+impl ToBytes<u8, 1> for Endian {
+    // A single byte has the same representation in either byte order.
+    fn to_bytes(self, value: u8) -> [u8; 1] {
+        [value]
+    }
+}
+impl ToBytes<f64, 8> for Endian {
+    fn to_bytes(self, value: f64) -> [u8; 8] {
+        match self {
+            Endian::Big => f64::to_be_bytes(value),
+            Endian::Little => f64::to_le_bytes(value),
+        }
+    }
+}
+
+/// Parses an `N`-byte slice in one of the supported formats into native format
+/// as type `T`.
+pub trait Parse<T, const N: usize> {
+    /// Given 'bytes', returns `T`.
+    fn parse(self, bytes: [u8; N]) -> T;
+}
+impl Parse<u64, 8> for Endian {
+    fn parse(self, bytes: [u8; 8]) -> u64 {
+        match self {
+            Endian::Big => u64::from_be_bytes(bytes),
+            Endian::Little => u64::from_le_bytes(bytes),
+        }
+    }
+}
+impl Parse<u32, 4> for Endian {
+    fn parse(self, bytes: [u8; 4]) -> u32 {
+        match self {
+            Endian::Big => u32::from_be_bytes(bytes),
+            Endian::Little => u32::from_le_bytes(bytes),
+        }
+    }
+}
+impl Parse<u16, 2> for Endian {
+    fn parse(self, bytes: [u8; 2]) -> u16 {
+        match self {
+            Endian::Big => u16::from_be_bytes(bytes),
+            Endian::Little => u16::from_le_bytes(bytes),
+        }
+    }
+}
+impl Parse<u8, 1> for Endian {
+    fn parse(self, bytes: [u8; 1]) -> u8 {
+        match self {
+            Endian::Big => u8::from_be_bytes(bytes),
+            Endian::Little => u8::from_le_bytes(bytes),
+        }
+    }
+}
+impl Parse<i64, 8> for Endian {
+    fn parse(self, bytes: [u8; 8]) -> i64 {
+        match self {
+            Endian::Big => i64::from_be_bytes(bytes),
+            Endian::Little => i64::from_le_bytes(bytes),
+        }
+    }
+}
+impl Parse<i32, 4> for Endian {
+    fn parse(self, bytes: [u8; 4]) -> i32 {
+        match self {
+            Endian::Big => i32::from_be_bytes(bytes),
+            Endian::Little => i32::from_le_bytes(bytes),
+        }
+    }
+}
+impl Parse<i16, 2> for Endian {
+    fn parse(self, bytes: [u8; 2]) -> i16 {
+        match self {
+            Endian::Big => i16::from_be_bytes(bytes),
+            Endian::Little => i16::from_le_bytes(bytes),
+        }
+    }
+}
+impl Parse<i8, 1> for Endian {
+    fn parse(self, bytes: [u8; 1]) -> i8 {
+        match self {
+            Endian::Big => i8::from_be_bytes(bytes),
+            Endian::Little => i8::from_le_bytes(bytes),
+        }
+    }
+}
+impl Parse<f64, 8> for Endian {
+    fn parse(self, bytes: [u8; 8]) -> f64 {
+        match self {
+            Endian::Big => f64::from_be_bytes(bytes),
+            Endian::Little => f64::from_le_bytes(bytes),
+        }
+    }
+}
--- /dev/null
+use crate::{
+ command::parse,
+ lex::{lexer::{Lexer, Source}, token::Token},
+ message::Diagnostic,
+};
+
+/// A minimal driver that feeds syntax sources through a [`Lexer`] and parses
+/// the resulting token stream one command at a time.
+pub struct Engine {
+    lexer: Lexer,
+}
+
+impl Engine {
+    /// Creates an engine whose lexer reports errors by printing them to
+    /// stdout.
+    fn new() -> Self {
+        Self {
+            lexer: Lexer::new(Box::new(|location, error| println!("{location}: {error}"))),
+        }
+    }
+    /// Appends `source` to the lexer and parses commands until the token
+    /// stream is exhausted, printing each diagnostic to stdout.
+    fn run(&mut self, source: Source) {
+        self.lexer.append(source);
+        // NOTE(review): `get()` appears to advance to the first token so
+        // `parse` sees a valid current token — confirm against `Lexer::get`.
+        self.lexer.get();
+        while self.lexer.token() != &Token::End {
+            let error: Box<dyn Fn(Diagnostic)> = Box::new(|diagnostic| {
+                println!("{diagnostic}");
+            });
+            parse(&mut self.lexer, &error);
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use encoding_rs::UTF_8;
+
+    use crate::lex::{
+        lexer::{ErrorHandling, Source},
+        segment::Mode,
+    };
+
+    use super::Engine;
+
+    /// Smoke test: runs two ECHO commands through the engine end to end.
+    /// (Asserts nothing; passes as long as lexing and parsing do not panic.)
+    #[test]
+    fn test_echo() {
+        let mut engine = Engine::new();
+        engine.run(Source::for_file_contents(
+            "ECHO 'hi there'.\nECHO 'bye there'.\n".to_string(),
+            Some("test.sps".to_string()),
+            UTF_8,
+            Mode::default(),
+            ErrorHandling::default(),
+        ));
+    }
+}
--- /dev/null
+use std::{
+ fmt::{Display, Formatter, Result as FmtResult},
+ ops::RangeInclusive,
+};
+
+use enum_map::{Enum, EnumMap};
+use thiserror::Error as ThisError;
+
+use crate::{
+ dictionary::VarWidth,
+ raw::{self, VarType},
+};
+
+/// Errors arising from invalid or incompatible output formats.
+#[derive(ThisError, Debug)]
+pub enum Error {
+    #[error("Unknown format type {value}.")]
+    UnknownFormat { value: u16 },
+
+    #[error("Output format {0} specifies width {}, but {} requires an even width.", .0.w, .0.type_)]
+    OddWidthNotAllowed(UncheckedFormat),
+
+    #[error("Output format {0} specifies width {}, but {} requires a width between {} and {}.", .0.w, .0.type_, .0.type_.min_width(), .0.type_.max_width())]
+    BadWidth(UncheckedFormat),
+
+    #[error("Output format {0} specifies decimal places, but {} format does not allow any decimals.", .0.type_)]
+    DecimalsNotAllowedForFormat(UncheckedFormat),
+
+    #[error("Output format {0} specifies {} decimal places, but with a width of {}, {} does not allow any decimal places.", .0.d, .0.w, .0.type_)]
+    DecimalsNotAllowedForWidth(UncheckedFormat),
+
+    #[error("Output format {spec} specifies {} decimal places but, with a width of {}, {} allows at most {max_d} decimal places.", .spec.d, .spec.w, .spec.type_)]
+    TooManyDecimalsForWidth {
+        spec: UncheckedFormat,
+        max_d: Decimals,
+    },
+
+    #[error("String variable is not compatible with numeric format {0}.")]
+    UnnamedVariableNotCompatibleWithNumericFormat(Type),
+
+    #[error("Numeric variable is not compatible with string format {0}.")]
+    UnnamedVariableNotCompatibleWithStringFormat(Type),
+
+    #[error("String variable {variable} with width {width} is not compatible with format {bad_spec}. Use format {good_spec} instead.")]
+    NamedStringVariableBadSpecWidth {
+        variable: String,
+        width: Width,
+        bad_spec: Format,
+        good_spec: Format,
+    },
+
+    #[error("String variable with width {width} is not compatible with format {bad_spec}. Use format {good_spec} instead.")]
+    UnnamedStringVariableBadSpecWidth {
+        width: Width,
+        bad_spec: Format,
+        good_spec: Format,
+    },
+}
+
+/// Broad families of format types.
+#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
+pub enum Category {
+    // Numeric formats.
+    Basic,
+    Custom,
+    Legacy,
+    Binary,
+    Hex,
+    Date,
+    Time,
+    DateComponent,
+
+    // String formats.
+    String,
+}
+
+impl From<Type> for Category {
+    /// Maps each format type to its [`Category`].
+    fn from(source: Type) -> Self {
+        match source {
+            Type::F | Type::Comma | Type::Dot | Type::Dollar | Type::Pct | Type::E => Self::Basic,
+            Type::CC(_) => Self::Custom,
+            Type::N | Type::Z => Self::Legacy,
+            Type::P | Type::PK | Type::IB | Type::PIB | Type::RB => Self::Binary,
+            Type::PIBHex | Type::RBHex => Self::Hex,
+            Type::Date
+            | Type::ADate
+            | Type::EDate
+            | Type::JDate
+            | Type::SDate
+            | Type::QYr
+            | Type::MoYr
+            | Type::WkYr
+            | Type::DateTime
+            | Type::YMDHMS => Self::Date,
+            Type::MTime | Type::Time | Type::DTime => Self::Time,
+            Type::WkDay | Type::Month => Self::DateComponent,
+            Type::A | Type::AHex => Self::String,
+        }
+    }
+}
+
+/// One of the five custom currency formats, `CCA` through `CCE`.
+#[derive(Copy, Clone, Debug, Enum, PartialEq, Eq, Hash)]
+pub enum CC {
+    A,
+    B,
+    C,
+    D,
+    E,
+}
+
+impl Display for CC {
+    /// Writes the custom currency letter, `A` through `E`.
+    fn fmt(&self, f: &mut Formatter) -> FmtResult {
+        f.write_str(match self {
+            CC::A => "A",
+            CC::B => "B",
+            CC::C => "C",
+            CC::D => "D",
+            CC::E => "E",
+        })
+    }
+}
+
+/// An output format type, e.g. `F` for standard decimal numbers or `A` for
+/// strings.
+#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
+pub enum Type {
+    // Basic numeric formats.
+    F,
+    Comma,
+    Dot,
+    Dollar,
+    Pct,
+    E,
+
+    // Custom currency formats.
+    CC(CC),
+
+    // Legacy numeric formats.
+    N,
+    Z,
+
+    // Binary and hexadecimal formats.
+    P,
+    PK,
+    IB,
+    PIB,
+    PIBHex,
+    RB,
+    RBHex,
+
+    // Time and date formats.
+    Date,
+    ADate,
+    EDate,
+    JDate,
+    SDate,
+    QYr,
+    MoYr,
+    WkYr,
+    DateTime,
+    YMDHMS,
+    MTime,
+    Time,
+    DTime,
+
+    // Date component formats.
+    WkDay,
+    Month,
+
+    // String formats.
+    A,
+    AHex,
+}
+
+/// A format width, in columns (or bytes, for binary formats).
+pub type Width = u16;
+/// A signed counterpart of [`Width`], used for intermediate arithmetic that
+/// may go negative.
+pub type SignedWidth = i16;
+
+/// A count of decimal places in a format.
+pub type Decimals = u8;
+
+impl Type {
+    /// Returns the maximum valid width for this format type.
+    pub fn max_width(self) -> Width {
+        match self {
+            Self::P | Self::PK | Self::PIBHex | Self::RBHex => 16,
+            Self::IB | Self::PIB | Self::RB => 8,
+            Self::A => 32767,
+            Self::AHex => 32767 * 2,
+            _ => 40,
+        }
+    }
+
+    /// Returns the minimum valid width for this format type.
+    pub fn min_width(self) -> Width {
+        match self {
+            // Basic numeric formats.
+            Self::F => 1,
+            Self::Comma => 1,
+            Self::Dot => 1,
+            Self::Dollar => 2,
+            Self::Pct => 2,
+            Self::E => 6,
+
+            // Custom currency formats.
+            Self::CC(_) => 2,
+
+            // Legacy numeric formats.
+            Self::N => 1,
+            Self::Z => 1,
+
+            // Binary and hexadecimal formats.
+            Self::P => 1,
+            Self::PK => 1,
+            Self::IB => 1,
+            Self::PIB => 1,
+            Self::PIBHex => 2,
+            Self::RB => 2,
+            Self::RBHex => 4,
+
+            // Time and date formats.
+            Self::Date => 9,
+            Self::ADate => 8,
+            Self::EDate => 8,
+            Self::JDate => 5,
+            Self::SDate => 8,
+            Self::QYr => 6,
+            Self::MoYr => 6,
+            Self::WkYr => 8,
+            Self::DateTime => 17,
+            Self::YMDHMS => 16,
+            Self::MTime => 5,
+            Self::Time => 5,
+            Self::DTime => 8,
+
+            // Date component formats.
+            Self::WkDay => 2,
+            Self::Month => 3,
+
+            // String formats.
+            Self::A => 1,
+            Self::AHex => 2,
+        }
+    }
+
+    /// Returns the full range of valid widths for this format type.
+    pub fn width_range(self) -> RangeInclusive<Width> {
+        self.min_width()..=self.max_width()
+    }
+
+    /// Returns the maximum number of decimal places this format type allows
+    /// at width `width` (0..=16; 0 for types that take no decimals).
+    pub fn max_decimals(self, width: Width) -> Decimals {
+        // Signed arithmetic below may go negative before the final clamp.
+        let width = width.clamp(1, 40) as SignedWidth;
+        let max = match self {
+            Self::F | Self::Comma | Self::Dot | Self::CC(_) => width - 1,
+            Self::Dollar | Self::Pct => width - 2,
+            Self::E => width - 7,
+            Self::N | Self::Z => width,
+            Self::P => width * 2 - 1,
+            Self::PK => width * 2,
+            Self::IB | Self::PIB => max_digits_for_bytes(width as usize) as SignedWidth,
+            Self::PIBHex => 0,
+            Self::RB | Self::RBHex => 16,
+            Self::Date
+            | Self::ADate
+            | Self::EDate
+            | Self::JDate
+            | Self::SDate
+            | Self::QYr
+            | Self::MoYr
+            | Self::WkYr => 0,
+            Self::DateTime => width - 21,
+            Self::YMDHMS => width - 20,
+            Self::MTime => width - 6,
+            Self::Time => width - 9,
+            Self::DTime => width - 12,
+            Self::WkDay | Self::Month | Self::A | Self::AHex => 0,
+        };
+        max.clamp(0, 16) as Decimals
+    }
+
+    /// Returns true if this format type can ever take decimal places (at any
+    /// width).
+    pub fn takes_decimals(self) -> bool {
+        self.max_decimals(Width::MAX) > 0
+    }
+
+    /// Returns this type's [`Category`].
+    pub fn category(self) -> Category {
+        self.into()
+    }
+
+    /// Returns the granularity of valid widths: 2 for hex formats and
+    /// `AHEX` (which encode one byte per two columns), otherwise 1.
+    pub fn width_step(self) -> Width {
+        if self.category() == Category::Hex || self == Self::AHex {
+            2
+        } else {
+            1
+        }
+    }
+
+    /// Clamps `width` into this type's valid range, rounding down to the
+    /// type's width step.
+    pub fn clamp_width(self, width: Width) -> Width {
+        let (min, max) = self.width_range().into_inner();
+        let width = width.clamp(min, max);
+        if self.width_step() == 2 {
+            width / 2 * 2
+        } else {
+            width
+        }
+    }
+
+    /// Returns whether this format applies to string or numeric variables.
+    pub fn var_type(self) -> VarType {
+        match self {
+            Self::A | Self::AHex => VarType::String,
+            _ => VarType::Numeric,
+        }
+    }
+
+    /// Checks whether this format is valid for a variable with the given
+    /// `var_type`.
+    pub fn check_type_compatibility(self, var_type: VarType) -> Result<(), Error> {
+        let my_type = self.var_type();
+        match (my_type, var_type) {
+            (VarType::Numeric, VarType::String) => {
+                Err(Error::UnnamedVariableNotCompatibleWithNumericFormat(self))
+            }
+            (VarType::String, VarType::Numeric) => {
+                Err(Error::UnnamedVariableNotCompatibleWithStringFormat(self))
+            }
+            _ => Ok(()),
+        }
+    }
+}
+
+impl Display for Type {
+    /// Writes the format type's syntax keyword, e.g. `COMMA` or `DATETIME`.
+    fn fmt(&self, f: &mut Formatter) -> FmtResult {
+        let s = match self {
+            Self::F => "F",
+            Self::Comma => "COMMA",
+            Self::Dot => "DOT",
+            Self::Dollar => "DOLLAR",
+            Self::Pct => "PCT",
+            Self::E => "E",
+            Self::CC(cc) => return write!(f, "{}", cc),
+            Self::N => "N",
+            Self::Z => "Z",
+            Self::P => "P",
+            Self::PK => "PK",
+            Self::IB => "IB",
+            Self::PIB => "PIB",
+            Self::PIBHex => "PIBHEX",
+            Self::RB => "RB",
+            Self::RBHex => "RBHEX",
+            Self::Date => "DATE",
+            Self::ADate => "ADATE",
+            Self::EDate => "EDATE",
+            Self::JDate => "JDATE",
+            Self::SDate => "SDATE",
+            Self::QYr => "QYR",
+            Self::MoYr => "MOYR",
+            Self::WkYr => "WKYR",
+            Self::DateTime => "DATETIME",
+            Self::YMDHMS => "YMDHMS",
+            Self::MTime => "MTIME",
+            Self::Time => "TIME",
+            Self::DTime => "DTIME",
+            Self::WkDay => "WKDAY",
+            Self::Month => "MONTH",
+            Self::A => "A",
+            Self::AHex => "AHEX",
+        };
+        write!(f, "{}", s)
+    }
+}
+
+/// Returns the number of decimal digits that a binary integer of `bytes`
+/// bytes can always represent (capped at 20 for anything wider than 7 bytes).
+fn max_digits_for_bytes(bytes: usize) -> usize {
+    const DIGITS: [usize; 8] = [0, 3, 5, 8, 10, 13, 15, 17];
+    DIGITS.get(bytes).copied().unwrap_or(20)
+}
+
+/// A validated output format: a type, width, and decimal count known to be
+/// mutually consistent (construct via `TryFrom<UncheckedFormat>` or
+/// [`Format::fixed_from`]).
+#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
+pub struct Format {
+    type_: Type,
+    w: Width,
+    d: Decimals,
+}
+
+impl Format {
+    /// `F40.0`, the widest basic numeric format.
+    pub const F40: Format = Format {
+        type_: Type::F,
+        w: 40,
+        d: 0,
+    };
+
+    /// `F8.2`, the default numeric format.
+    pub const F8_2: Format = Format {
+        type_: Type::F,
+        w: 8,
+        d: 2,
+    };
+
+    /// Returns the format type.
+    pub fn format(self) -> Type {
+        self.type_
+    }
+    /// Returns the width.
+    pub fn w(self) -> Width {
+        self.w
+    }
+    /// Returns the number of decimal places.
+    pub fn d(self) -> Decimals {
+        self.d
+    }
+
+    /// Returns the default format for a variable of width `var_width`:
+    /// `F8.2` for numeric, `Aw` for a string of width `w`.
+    pub fn default_for_width(var_width: VarWidth) -> Self {
+        match var_width {
+            VarWidth::Numeric => Format {
+                type_: Type::F,
+                w: 8,
+                d: 2,
+            },
+            VarWidth::String(w) => Format {
+                type_: Type::A,
+                w,
+                d: 0,
+            },
+        }
+    }
+
+    /// Builds a valid `Format` from `source` by adjusting it rather than
+    /// failing: the width is clamped into range, then widened if necessary
+    /// to accommodate the requested decimals, and the decimal count is
+    /// clamped to what the final width allows.
+    pub fn fixed_from(source: &UncheckedFormat) -> Self {
+        let UncheckedFormat {
+            type_: format,
+            w,
+            d,
+        } = *source;
+        let (min, max) = format.width_range().into_inner();
+        let mut w = w.clamp(min, max);
+        if d <= format.max_decimals(Width::MAX) {
+            // The decimals are attainable at some width; widen until they fit.
+            while d > format.max_decimals(w) {
+                w += 1;
+                assert!(w <= 40);
+            }
+        }
+        let d = d.clamp(0, format.max_decimals(w));
+        Self {
+            type_: format,
+            w,
+            d,
+        }
+    }
+
+    /// Returns the variable width implied by this format: `A` formats are
+    /// one byte per column, `AHEX` one byte per two columns, and all other
+    /// formats are numeric.
+    pub fn var_width(self) -> VarWidth {
+        match self.type_ {
+            Type::A => VarWidth::String(self.w),
+            Type::AHex => VarWidth::String(self.w / 2),
+            _ => VarWidth::Numeric,
+        }
+    }
+
+    /// Returns whether this format applies to string or numeric variables.
+    pub fn var_type(self) -> VarType {
+        self.type_.var_type()
+    }
+
+    /// Checks whether this format specification is valid for a variable with
+    /// width `var_width`.
+    pub fn check_width_compatibility(self, var_width: VarWidth) -> Result<Self, Error> {
+        // Verify that the format is right for the variable's type.
+        self.type_.check_type_compatibility(var_width.into())?;
+
+        if let VarWidth::String(w) = var_width {
+            if var_width != self.var_width() {
+                let bad_spec = self;
+                // Suggest the same format type resized to fit the variable.
+                let good_spec = if self.type_ == Type::A {
+                    Format { w, ..self }
+                } else {
+                    Format { w: w * 2, ..self }
+                };
+                return Err(Error::UnnamedStringVariableBadSpecWidth {
+                    width: w,
+                    bad_spec,
+                    good_spec,
+                });
+            }
+        }
+
+        Ok(self)
+    }
+}
+
+impl Display for Format {
+    /// Writes the format as syntax, e.g. `F8.2` or `A10` (the decimal count
+    /// is omitted for types that never take decimals when it is zero).
+    fn fmt(&self, f: &mut Formatter) -> FmtResult {
+        write!(f, "{}{}", self.type_, self.w)?;
+        if self.type_.takes_decimals() || self.d > 0 {
+            write!(f, ".{}", self.d)?;
+        }
+        Ok(())
+    }
+}
+
+impl TryFrom<UncheckedFormat> for Format {
+    type Error = Error;
+
+    /// Validates `source`: its width must be a multiple of the type's width
+    /// step and within the type's range, and its decimal count must not
+    /// exceed the maximum for that type and width.
+    fn try_from(source: UncheckedFormat) -> Result<Self, Self::Error> {
+        let UncheckedFormat {
+            type_: format,
+            w,
+            d,
+        } = source;
+        let max_d = format.max_decimals(w);
+        if w % format.width_step() != 0 {
+            Err(Error::OddWidthNotAllowed(source))
+        } else if !format.width_range().contains(&w) {
+            Err(Error::BadWidth(source))
+        } else if d > max_d {
+            // Pick the most specific decimals error.  (This condition was
+            // previously inverted: the "never takes decimals" error applies
+            // exactly when the type does NOT take decimals.)
+            if !format.takes_decimals() {
+                // This type never allows decimals at any width.
+                Err(Error::DecimalsNotAllowedForFormat(source))
+            } else if max_d > 0 {
+                // Decimals are allowed, just not this many at this width.
+                Err(Error::TooManyDecimalsForWidth {
+                    spec: source,
+                    max_d,
+                })
+            } else {
+                // Decimals are allowed in general, but not at width `w`.
+                Err(Error::DecimalsNotAllowedForWidth(source))
+            }
+        } else {
+            Ok(Format {
+                type_: format,
+                w,
+                d,
+            })
+        }
+    }
+}
+
+impl TryFrom<u16> for Type {
+    type Error = Error;
+
+    /// Converts a system-file numeric format code into a format [`Type`];
+    /// unassigned codes yield [`Error::UnknownFormat`].
+    fn try_from(source: u16) -> Result<Self, Self::Error> {
+        match source {
+            1 => Ok(Self::A),
+            2 => Ok(Self::AHex),
+            3 => Ok(Self::Comma),
+            4 => Ok(Self::Dollar),
+            5 => Ok(Self::F),
+            6 => Ok(Self::IB),
+            7 => Ok(Self::PIBHex),
+            8 => Ok(Self::P),
+            9 => Ok(Self::PIB),
+            10 => Ok(Self::PK),
+            11 => Ok(Self::RB),
+            12 => Ok(Self::RBHex),
+            15 => Ok(Self::Z),
+            16 => Ok(Self::N),
+            17 => Ok(Self::E),
+            20 => Ok(Self::Date),
+            21 => Ok(Self::Time),
+            22 => Ok(Self::DateTime),
+            23 => Ok(Self::ADate),
+            24 => Ok(Self::JDate),
+            25 => Ok(Self::DTime),
+            26 => Ok(Self::WkDay),
+            27 => Ok(Self::Month),
+            28 => Ok(Self::MoYr),
+            29 => Ok(Self::QYr),
+            30 => Ok(Self::WkYr),
+            31 => Ok(Self::Pct),
+            32 => Ok(Self::Dot),
+            33 => Ok(Self::CC(CC::A)),
+            34 => Ok(Self::CC(CC::B)),
+            35 => Ok(Self::CC(CC::C)),
+            36 => Ok(Self::CC(CC::D)),
+            37 => Ok(Self::CC(CC::E)),
+            38 => Ok(Self::EDate),
+            39 => Ok(Self::SDate),
+            40 => Ok(Self::MTime),
+            41 => Ok(Self::YMDHMS),
+            _ => Err(Error::UnknownFormat { value: source }),
+        }
+    }
+}
+
+/// A format as read from a file, not yet validated: the type, width, and
+/// decimal count may be mutually inconsistent.
+#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
+pub struct UncheckedFormat {
+    pub type_: Type,
+
+    pub w: Width,
+
+    pub d: Decimals,
+}
+
+impl TryFrom<raw::Spec> for UncheckedFormat {
+    type Error = Error;
+
+    /// Unpacks a raw format spec word: bits 16.. hold the format type code,
+    /// bits 8..16 the width, and bits 0..8 the decimal count.
+    fn try_from(raw: raw::Spec) -> Result<Self, Self::Error> {
+        let raw = raw.0;
+        let raw_format = (raw >> 16) as u16;
+        let format = raw_format.try_into()?;
+        let w = ((raw >> 8) & 0xff) as Width;
+        let d = (raw & 0xff) as Decimals;
+        Ok(Self {
+            type_: format,
+            w,
+            d,
+        })
+    }
+}
+
+impl Display for UncheckedFormat {
+    /// Writes the format as syntax, in the same form as [`Format`].
+    fn fmt(&self, f: &mut Formatter) -> FmtResult {
+        write!(f, "{}{}", self.type_, self.w)?;
+        if self.type_.takes_decimals() || self.d > 0 {
+            write!(f, ".{}", self.d)?;
+        }
+        Ok(())
+    }
+}
+
+/// Settings that influence how values are formatted.
+pub struct Settings {
+    // NOTE(review): presumably the base year for interpreting 2-digit years
+    // — confirm where `epoch` is consumed.
+    epoch: Option<i32>,
+
+    /// Either `'.'` or `','`.
+    decimal: char,
+
+    /// Format `F`, `E`, `COMMA`, and `DOT` with leading zero (e.g. `0.5`
+    /// instead of `.5`)?
+    include_leading_zero: bool,
+
+    /// Custom currency styles.
+    ccs: EnumMap<CC, Option<NumberStyle>>,
+}
+
+impl Default for Settings {
+    /// No epoch, `.` as the decimal point, no leading zero, and no custom
+    /// currency styles.
+    fn default() -> Self {
+        Self {
+            epoch: None,
+            decimal: '.',
+            include_leading_zero: false,
+            ccs: Default::default(),
+        }
+    }
+}
+
+/// A numeric output style. This can express numeric formats in
+/// [Category::Basic] and [Category::Custom].
+pub struct NumberStyle {
+    neg_prefix: Affix,
+    prefix: Affix,
+    suffix: Affix,
+    neg_suffix: Affix,
+
+    /// Decimal point: `'.'` or `','`.
+    decimal: char,
+
+    /// Grouping character: `'.'` or `','` or `None`.
+    grouping: Option<char>,
+
+    /// Format as `.5` or `0.5`?
+    include_leading_zero: bool,
+
+    /// An `Affix` may require more bytes than its display width; for example,
+    /// U+00A5 (¥) is 2 bytes in UTF-8 but occupies only one display column.
+    /// This member is the sum of the number of bytes required by all of the
+    /// `Affix` members in this struct, minus their display widths. Thus, it
+    /// can be used to size memory allocations: for example, the formatted
+    /// result of `CCA20.5` requires no more than `(20 + extra_bytes)` bytes in
+    /// UTF-8.
+    extra_bytes: usize,
+}
+
+/// A prefix or suffix attached to a formatted number.
+pub struct Affix {
+    /// String contents of affix.
+    s: String,
+
+    /// Display width in columns (see [unicode_width])
+    width: usize,
+}
--- /dev/null
+use num::Float;
+use std::{num::FpCategory, fmt::{Display, Formatter, Result}};
+
+/// Wrapper whose `Display` implementation writes a float in C99-style
+/// hexadecimal notation, e.g. `0x1.ecp6`.
+pub struct HexFloat<T: Float>(pub T);
+
+impl<T: Float> Display for HexFloat<T> {
+    fn fmt(&self, f: &mut Formatter<'_>) -> Result {
+        let sign = if self.0.is_sign_negative() { "-" } else { "" };
+        // Special values have no hex representation.
+        match self.0.classify() {
+            FpCategory::Nan => return write!(f, "NaN"),
+            FpCategory::Infinite => return write!(f, "{sign}Infinity"),
+            FpCategory::Zero => return write!(f, "{sign}0.0"),
+            _ => (),
+        };
+        let (significand, mut exponent, _) = self.0.integer_decode();
+        let mut hex_sig = format!("{:x}", significand);
+        // Strip trailing zero hex digits; each one dropped scales the
+        // significand down by 16, so the exponent grows by 4 to compensate.
+        while hex_sig.ends_with('0') {
+            hex_sig.pop();
+            exponent += 4;
+        }
+        match hex_sig.len() {
+            0 => write!(f, "{sign}0.0"),
+            1 => write!(f, "{sign}0x{hex_sig}.0p{exponent}"),
+            // Place the hex point after the first digit; the digits moved to
+            // the right of the point add 4 to the exponent apiece.
+            len => write!(
+                f,
+                "{sign}0x{}.{}p{}",
+                hex_sig.chars().next().unwrap(),
+                &hex_sig[1..],
+                exponent + 4 * (len as i16 - 1)
+            ),
+        }
+    }
+}
+
+#[cfg(test)]
+mod hex_float_tests {
+ use crate::HexFloat;
+ use num::Float;
+
+ // Spot-checks normal values, powers of two, infinities, NaN, and
+ // signed zeroes.
+ #[test]
+ fn test() {
+ assert_eq!(format!("{}", HexFloat(1.0)), "0x1.0p0");
+ assert_eq!(format!("{}", HexFloat(123.0)), "0x1.ecp6");
+ assert_eq!(format!("{}", HexFloat(1.0 / 16.0)), "0x1.0p-4");
+ assert_eq!(format!("{}", HexFloat(f64::infinity())), "Infinity");
+ assert_eq!(format!("{}", HexFloat(f64::neg_infinity())), "-Infinity");
+ assert_eq!(format!("{}", HexFloat(f64::nan())), "NaN");
+ assert_eq!(format!("{}", HexFloat(0.0)), "0.0");
+ assert_eq!(format!("{}", HexFloat(f64::neg_zero())), "-0.0");
+ }
+}
+
--- /dev/null
+use std::{
+ borrow::Borrow,
+ cmp::Ordering,
+ fmt::{Debug, Display, Formatter, Result as FmtResult},
+ hash::{Hash, Hasher},
+ ops::Deref,
+};
+
+use encoding_rs::{EncoderResult, Encoding, UTF_8};
+use finl_unicode::categories::{CharacterCategories, MajorCategory};
+use thiserror::Error as ThisError;
+use unicase::UniCase;
+
+/// Classification of characters with respect to identifier syntax.
+pub trait IdentifierChar {
+ /// Returns true if `self` is an ASCII character that may be the first
+ /// character in an identifier.
+ fn ascii_may_start_id(self) -> bool;
+
+ /// Returns true if `self` may be the first character in an identifier.
+ fn may_start_id(self) -> bool;
+
+ /// Returns true if `self` is an ASCII character that may be a second or
+ /// subsequent character in an identifier.
+ fn ascii_may_continue_id(self) -> bool;
+
+ /// Returns true if `self` may be a second or subsequent character in an
+ /// identifier.
+ fn may_continue_id(self) -> bool;
+}
+
+impl IdentifierChar for char {
+ fn ascii_may_start_id(self) -> bool {
+ matches!(self, 'a'..='z' | 'A'..='Z' | '@' | '#' | '$' | '!')
+ }
+
+ fn may_start_id(self) -> bool {
+ if self < '\u{0080}' {
+ self.ascii_may_start_id()
+ } else {
+ use MajorCategory::*;
+
+ // Non-ASCII: letters, marks, and symbols may start an
+ // identifier, except U+FFFD (which marks decoding errors).
+ [L, M, S].contains(&self.get_major_category()) && self != char::REPLACEMENT_CHARACTER
+ }
+ }
+
+ fn ascii_may_continue_id(self) -> bool {
+ matches!(self, 'a'..='z' | 'A'..='Z' | '0'..='9' | '@' | '#' | '$' | '.' | '_')
+ }
+
+ fn may_continue_id(self) -> bool {
+ if self < '\u{0080}' {
+ self.ascii_may_continue_id()
+ } else {
+ use MajorCategory::*;
+
+ // Continuation characters additionally allow numbers (N).
+ [L, M, S, N].contains(&self.get_major_category()) && self != char::REPLACEMENT_CHARACTER
+ }
+ }
+}
+
+/// An error constructing or validating an [Identifier].
+#[derive(Clone, Debug, ThisError)]
+pub enum Error {
+ #[error("Identifier cannot be empty string.")]
+ Empty,
+
+ #[error("\"{0}\" may not be used as an identifier because it is a reserved word.")]
+ Reserved(String),
+
+ #[error("\"!\" is not a valid identifier.")]
+ Bang,
+
+ #[error("\"{0}\" may not be used as an identifier because it begins with disallowed character \"{1}\".")]
+ BadFirstCharacter(String, char),
+
+ #[error("\"{0}\" may not be used as an identifier because it contains disallowed character \"{1}\".")]
+ BadLaterCharacter(String, char),
+
+ #[error("Identifier \"{id}\" is {length} bytes in the encoding in use ({encoding}), which exceeds the {max}-byte limit.")]
+ TooLong {
+ id: String,
+ length: usize,
+ encoding: &'static str,
+ max: usize,
+ },
+
+ #[error("\"{id}\" may not be used as an identifier because the encoding in use ({encoding}) cannot represent \"{c}\".")]
+ NotEncodable {
+ id: String,
+ encoding: &'static str,
+ c: char,
+ },
+}
+
+/// A word that may not be used as an identifier.
+pub enum ReservedWord {
+ And,
+ Or,
+ Not,
+ Eq,
+ Ge,
+ Gt,
+ Le,
+ Lt,
+ Ne,
+ All,
+ By,
+ To,
+ With,
+}
+
+impl TryFrom<&str> for ReservedWord {
+ type Error = ();
+
+ /// Parses `source` as a reserved word, ignoring ASCII case.
+ fn try_from(source: &str) -> Result<Self, Self::Error> {
+ match source.to_ascii_uppercase().as_str() {
+ "ALL" => Ok(Self::All),
+ "AND" => Ok(Self::And),
+ "BY" => Ok(Self::By),
+ "EQ" => Ok(Self::Eq),
+ "GE" => Ok(Self::Ge),
+ "GT" => Ok(Self::Gt),
+ "LE" => Ok(Self::Le),
+ "LT" => Ok(Self::Lt),
+ "NE" => Ok(Self::Ne),
+ "NOT" => Ok(Self::Not),
+ "OR" => Ok(Self::Or),
+ "TO" => Ok(Self::To),
+ "WITH" => Ok(Self::With),
+ _ => Err(()),
+ }
+ }
+}
+
+/// Returns true if `s` is a reserved word, without regard to case.
+pub fn is_reserved_word(s: &str) -> bool {
+ ReservedWord::try_from(s).is_ok()
+}
+
+/// An identifier, compared, ordered, and hashed case-insensitively.
+#[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
+pub struct Identifier(pub UniCase<String>);
+
+impl Identifier {
+ /// Maximum length of an identifier, in bytes. The limit applies in the
+ /// encoding used by the dictionary, not in UTF-8.
+ pub const MAX_LEN: usize = 64;
+
+ /// Constructs an identifier from UTF-8 string `s`.
+ pub fn new(s: &str) -> Result<Self, Error> {
+ Self::from_encoding(s, UTF_8)
+ }
+
+ /// Constructs an identifier from `s`, validating both its syntax and its
+ /// representability in `encoding`.
+ pub fn from_encoding(s: &str, encoding: &'static Encoding) -> Result<Identifier, Error> {
+ Self::is_plausible(s)?;
+ let identifier = Identifier(s.into());
+ identifier.check_encoding(encoding)?;
+ Ok(identifier)
+ }
+
+ /// Checks whether this is a valid identifier in the given `encoding`. An
+ /// identifier that is valid in one encoding might be invalid in another
+ /// because some characters are unencodable or because it is too long.
+ pub fn check_encoding(&self, encoding: &'static Encoding) -> Result<(), Error> {
+ let s = self.0.as_str();
+ let (_encoded, _, unencodable) = encoding.encode(s);
+ if unencodable {
+ // Encode again without replacement to find the first character
+ // that `encoding` cannot represent, for the error message.
+ let mut encoder = encoding.new_encoder();
+ let mut buf = Vec::with_capacity(
+ encoder
+ .max_buffer_length_from_utf8_without_replacement(s.len())
+ .unwrap(),
+ );
+ let EncoderResult::Unmappable(c) = encoder
+ .encode_from_utf8_to_vec_without_replacement(s, &mut buf, true)
+ .0
+ else {
+ unreachable!();
+ };
+ return Err(Error::NotEncodable {
+ id: s.into(),
+ encoding: encoding.name(),
+ c,
+ });
+ }
+ // NOTE(review): the `MAX_LEN` check below is disabled — confirm
+ // whether length enforcement should be restored.
+ /*
+ if encoded.len() > Self::MAX_LEN {
+ return Err(Error::TooLong {
+ id: s.into(),
+ length: encoded.len(),
+ encoding: encoding.name(),
+ max: Self::MAX_LEN,
+ });
+ }*/
+ Ok(())
+ }
+
+ /// Checks whether `s` is syntactically plausible as an identifier,
+ /// without regard to any particular encoding.
+ pub fn is_plausible(s: &str) -> Result<(), Error> {
+ if s.is_empty() {
+ return Err(Error::Empty);
+ }
+ if is_reserved_word(s) {
+ return Err(Error::Reserved(s.into()));
+ }
+ if s == "!" {
+ return Err(Error::Bang);
+ }
+
+ let mut i = s.chars();
+ let first = i.next().unwrap();
+ if !first.may_start_id() {
+ return Err(Error::BadFirstCharacter(s.into(), first));
+ }
+ for c in i {
+ if !c.may_continue_id() {
+ return Err(Error::BadLaterCharacter(s.into(), c));
+ }
+ }
+ Ok(())
+ }
+
+ /// Returns true if this identifier is a case-insensitive match for
+ /// `keyword`.
+ ///
+ /// They match if they are identical, or if this identifier is at
+ /// least 3 characters long and those characters are identical to `keyword`
+ /// or differ only in case.
+ ///
+ /// `keyword` must be ASCII.
+ pub fn matches_keyword(&self, keyword: &str) -> bool {
+ id_match_n_nonstatic(keyword, self.0.as_str(), 3)
+ }
+
+ /// Returns true if this identifier is a case-insensitive match for at
+ /// least the first `n` characters of `keyword`.
+ ///
+ /// `keyword` must be ASCII.
+ pub fn matches_keyword_n(&self, keyword: &str, n: usize) -> bool {
+ id_match_n_nonstatic(keyword, self.0.as_str(), n)
+ }
+}
+
+impl PartialEq<str> for Identifier {
+ fn eq(&self, other: &str) -> bool {
+ // Wrap `other` in `UniCase` so the comparison is case-insensitive.
+ self.0.eq(&UniCase::new(other))
+ }
+}
+
+/// Returns true if `token` is a case-insensitive match for `keyword`.
+///
+/// `keyword` and `token` match if they are identical, or if `token` is at
+/// least 3 characters long and those characters are identical to `keyword` or
+/// differ only in case.
+///
+/// `keyword` must be ASCII. It's normally a constant string, so it's declared
+/// as `&'static str` to make it harder to reverse the argument order. But
+/// there's no reason that a non-static string won't work, so use
+/// [`id_match_n_nonstatic`] instead if you need it.
+pub fn id_match(keyword: &'static str, token: &str) -> bool {
+ id_match_n(keyword, token, 3)
+}
+
+/// Returns true if `token` is a case-insensitive match for at least the first
+/// `n` characters of `keyword`.
+///
+/// `keyword` must be ASCII. It's normally a constant string, so it's declared
+/// as `&'static str` to make it harder to reverse the argument order. But
+/// there's no reason that a non-static string won't work, so use
+/// [`id_match_n_nonstatic`] instead if you need it.
+pub fn id_match_n(keyword: &'static str, token: &str, n: usize) -> bool {
+ id_match_n_nonstatic(keyword, token, n)
+}
+
+/// Returns true if `token` is a case-insensitive match for at least the first
+/// `n` characters of `keyword`.
+///
+/// `keyword` must be ASCII.
+pub fn id_match_n_nonstatic(keyword: &str, token: &str, n: usize) -> bool {
+ debug_assert!(keyword.is_ascii());
+ // If `token` is a valid abbreviation (at least `n` bytes long but shorter
+ // than `keyword`), compare it to the same-length prefix of `keyword`;
+ // otherwise require a full-length comparison.
+ let keyword_prefix = if (n..keyword.len()).contains(&token.len()) {
+ &keyword[..token.len()]
+ } else {
+ keyword
+ };
+ keyword_prefix.eq_ignore_ascii_case(token)
+}
+
+impl Display for Identifier {
+ fn fmt(&self, f: &mut Formatter) -> FmtResult {
+ // Displays with the original (not case-folded) spelling.
+ write!(f, "{}", self.0)
+ }
+}
+
+impl Debug for Identifier {
+ fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult {
+ // Debug output is the same as Display: the plain spelling, unquoted.
+ write!(f, "{}", self.0)
+ }
+}
+
+/// A type that has a case-insensitive identifier.
+pub trait HasIdentifier {
+ fn identifier(&self) -> &UniCase<String>;
+}
+
+/// Wrapper that compares, orders, and hashes the inner value solely by its
+/// identifier (see [HasIdentifier]).
+pub struct ByIdentifier<T>(pub T)
+where
+ T: HasIdentifier;
+
+impl<T> ByIdentifier<T>
+where
+ T: HasIdentifier,
+{
+ pub fn new(inner: T) -> Self {
+ Self(inner)
+ }
+}
+
+// All of the following impls delegate to `T`'s identifier, so that a
+// `ByIdentifier<T>` in a sorted or hashed container is keyed purely by
+// identifier.  The `Borrow<UniCase<String>>` impl lets such a container be
+// probed with a bare identifier.
+impl<T> PartialEq for ByIdentifier<T>
+where
+ T: HasIdentifier,
+{
+ fn eq(&self, other: &Self) -> bool {
+ self.0.identifier().eq(other.0.identifier())
+ }
+}
+
+impl<T> Eq for ByIdentifier<T> where T: HasIdentifier {}
+
+impl<T> PartialOrd for ByIdentifier<T>
+where
+ T: HasIdentifier,
+{
+ fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
+ Some(self.cmp(other))
+ }
+}
+
+impl<T> Ord for ByIdentifier<T>
+where
+ T: HasIdentifier,
+{
+ fn cmp(&self, other: &Self) -> Ordering {
+ self.0.identifier().cmp(other.0.identifier())
+ }
+}
+
+impl<T> Hash for ByIdentifier<T>
+where
+ T: HasIdentifier,
+{
+ fn hash<H: Hasher>(&self, state: &mut H) {
+ self.0.identifier().hash(state)
+ }
+}
+
+impl<T> Borrow<UniCase<String>> for ByIdentifier<T>
+where
+ T: HasIdentifier,
+{
+ fn borrow(&self) -> &UniCase<String> {
+ self.0.identifier()
+ }
+}
+
+// `Debug` and `Clone`, by contrast, delegate to the inner value itself.
+impl<T> Debug for ByIdentifier<T>
+where
+ T: HasIdentifier + Debug,
+{
+ fn fmt(&self, f: &mut Formatter) -> FmtResult {
+ self.0.fmt(f)
+ }
+}
+
+impl<T> Clone for ByIdentifier<T>
+where
+ T: HasIdentifier + Clone,
+{
+ fn clone(&self) -> Self {
+ Self(self.0.clone())
+ }
+}
+
+// Dereferencing does not require `T: Clone`; the extra bound in the original
+// needlessly excluded non-`Clone` inner types.
+impl<T> Deref for ByIdentifier<T>
+where
+ T: HasIdentifier,
+{
+ type Target = T;
+
+ fn deref(&self) -> &Self::Target {
+ &self.0
+ }
+}
--- /dev/null
+/// Conversion of a floating-point value into an integer type, succeeding only
+/// when the conversion is exact.
+pub trait ToInteger {
+ /// Returns this value as a `T` if it is representable exactly.
+ fn to_exact_integer<T>(&self) -> Option<T>
+ where
+ T: FromFloat;
+ fn to_exact_usize(&self) -> Option<usize> {
+ self.to_exact_integer()
+ }
+ fn to_exact_u8(&self) -> Option<u8> {
+ self.to_exact_integer()
+ }
+ fn to_exact_u16(&self) -> Option<u16> {
+ self.to_exact_integer()
+ }
+ fn to_exact_u32(&self) -> Option<u32> {
+ self.to_exact_integer()
+ }
+ fn to_exact_u64(&self) -> Option<u64> {
+ self.to_exact_integer()
+ }
+ fn to_exact_u128(&self) -> Option<u128> {
+ self.to_exact_integer()
+ }
+ // Fixed: this previously returned `Option<usize>`, which contradicted its
+ // name and made it a duplicate of `to_exact_usize`.
+ fn to_exact_isize(&self) -> Option<isize> {
+ self.to_exact_integer()
+ }
+ fn to_exact_i8(&self) -> Option<i8> {
+ self.to_exact_integer()
+ }
+ fn to_exact_i16(&self) -> Option<i16> {
+ self.to_exact_integer()
+ }
+ fn to_exact_i32(&self) -> Option<i32> {
+ self.to_exact_integer()
+ }
+ fn to_exact_i64(&self) -> Option<i64> {
+ self.to_exact_integer()
+ }
+ fn to_exact_i128(&self) -> Option<i128> {
+ self.to_exact_integer()
+ }
+}
+
+impl ToInteger for f64 {
+ fn to_exact_integer<T>(&self) -> Option<T>
+ where
+ T: FromFloat,
+ {
+ T::from_float(*self)
+ }
+}
+
+/// Conversion from `f64`, succeeding only when the conversion is exact.
+pub trait FromFloat {
+ fn from_float(x: f64) -> Option<Self>
+ where
+ Self: Sized;
+}
+
+/// Implements [FromFloat] for the given integer type.
+macro_rules! impl_from_float {
+ ($T:ident) => {
+ impl FromFloat for $T {
+ fn from_float(x: f64) -> Option<Self>
+ where
+ Self: Sized,
+ {
+ // `$T::MAX as f64` rounds *up* for integer types wider than
+ // f64's 53-bit mantissa (e.g. `i64::MAX as f64 == 2^63`), so
+ // an inclusive `x <= $T::MAX as f64` would accept 2^63 and the
+ // saturating `as` cast would silently return `i64::MAX`.
+ // `$T::MIN as f64` is zero or a power of two, hence exact, and
+ // `$T::MAX as f64 + 1.0` is an exact power of two for every
+ // integer type, so this half-open range is exact everywhere.
+ if x.trunc() == x && x >= $T::MIN as f64 && x < $T::MAX as f64 + 1.0 {
+ Some(x as Self)
+ } else {
+ None
+ }
+ }
+ }
+ };
+}
+
+impl_from_float!(usize);
+impl_from_float!(u8);
+impl_from_float!(u16);
+impl_from_float!(u32);
+impl_from_float!(u64);
+impl_from_float!(u128);
+impl_from_float!(isize);
+impl_from_float!(i8);
+impl_from_float!(i16);
+impl_from_float!(i32);
+impl_from_float!(i64);
+impl_from_float!(i128);
--- /dev/null
+use crate::identifier::id_match_n_nonstatic;
+
+/// The result of matching a user string against a command name.
+pub struct Match {
+ /// True if every word the string supplied was an exact (unabbreviated)
+ /// match, modulo case.
+ pub exact: bool,
+
+ /// Number of words in the command name that the string did not supply
+ /// (negative if the string supplied extra words).
+ pub missing_words: isize,
+}
+
+/// Returns the number of whitespace-separated words in `s`, as `isize` for
+/// signed word-count arithmetic.
+fn count_words(s: &str) -> isize {
+ s.split_whitespace().count() as isize
+}
+
+/// Compares `string` obtained from the user against the full name of a `command`,
+/// using this algorithm:
+///
+/// 1. Divide `command` into words `c[0]` through `c[n - 1]`.
+///
+/// 2. Divide `string` into words `s[0]` through `s[m - 1]`.
+///
+/// 3. Compare word `c[i]` against `s[i]` for `0 <= i < min(n, m)`, using the
+/// keyword matching algorithm implemented by [id_match]. If any of them fail
+/// to match, then `string` does not match `command` and the function returns
+/// `None`.
+///
+/// 4. Otherwise, `string` and `command` match. Set [Match::missing_words] to
+/// `n - m`. Set [Match::exact] to false if any of the `s[i]` were found to
+/// be abbreviated in the comparisons done in step 3, or to true if they were
+/// all exactly equal (modulo case). Return `Some(match)`.
+pub fn command_match(command: &str, string: &str) -> Option<Match> {
+ let mut command_words = command.split_whitespace();
+ let mut string_words = string.split_whitespace();
+ let mut exact = true;
+ loop {
+ let Some(cw) = command_words.next() else {
+ // Command name exhausted; any remaining user words count as
+ // negative missing words.
+ return Some(Match {
+ exact,
+ missing_words: -(string_words.count() as isize),
+ });
+ };
+ let Some(sw) = string_words.next() else {
+ // User string exhausted; `cw` plus the rest of the command name
+ // are missing.
+ return Some(Match {
+ exact,
+ missing_words: 1 + command_words.count() as isize,
+ });
+ };
+ if !id_match_n_nonstatic(cw, sw, 3) {
+ return None;
+ }
+ // A shorter user word that still matched is an abbreviation.
+ if sw.len() < cw.len() {
+ exact = false;
+ }
+ }
+}
+
+/// Matches a string against a collection of command names.
+pub struct CommandMatcher<'a, T> {
+ string: &'a str,
+ extensible: bool,
+ exact_match: Option<T>,
+ n_matches: usize,
+ match_: Option<T>,
+ match_missing_words: isize,
+}
+
+impl<'a, T> CommandMatcher<'a, T> {
+ /// Creates a matcher for `string` with no candidates yet.
+ pub fn new(string: &'a str) -> Self {
+ Self {
+ string,
+ extensible: false,
+ exact_match: None,
+ n_matches: 0,
+ match_: None,
+ match_missing_words: 0,
+ }
+ }
+
+ /// Consider `command` as a candidate for the command name being parsed. If
+ /// `command` is the correct command name, then [Self::get_match] will
+ /// return `aux` later.
+ pub fn add(&mut self, command: &str, aux: T) {
+ if let Some(Match {
+ missing_words,
+ exact,
+ }) = command_match(command, self.string)
+ {
+ if missing_words > 0 {
+ // The string is a prefix of `command`: more words could
+ // still be read to disambiguate.
+ self.extensible = true;
+ } else if exact && missing_words == 0 {
+ self.exact_match = Some(aux);
+ } else {
+ // Partial (abbreviated or over-long) match: keep the
+ // candidate closest to a complete match, counting ties.
+ if missing_words > self.match_missing_words {
+ self.n_matches = 0;
+ }
+ if missing_words >= self.match_missing_words || self.n_matches == 0 {
+ self.n_matches += 1;
+ self.match_ = Some(aux);
+ self.match_missing_words = missing_words;
+ }
+ }
+ }
+ }
+
+ /// Returns `(aux, missing_words)` for the best match, or `(None, _)` when
+ /// the match is ambiguous, extensible, or absent.
+ pub fn get_match(self) -> (Option<T>, isize) {
+ if self.extensible {
+ (None, 1)
+ } else if let Some(exact_match) = self.exact_match {
+ (Some(exact_match), 0)
+ } else if self.n_matches == 1 {
+ (self.match_, self.match_missing_words)
+ } else {
+ (None, self.match_missing_words)
+ }
+ }
+}
+
+/// Names of all the known commands.
+// `&[&str]` in a `const` is already `'static`; the explicit lifetimes were
+// redundant (clippy: `redundant_static_lifetimes`).
+pub const COMMAND_NAMES: &[&str] = &[
+ "2SLS",
+ "ACF",
+ "ADD DOCUMENT",
+ "ADD FILES",
+ "ADD VALUE LABELS",
+ "AGGREGATE",
+ "ALSCAL",
+ "ANACOR",
+ "ANOVA",
+ "APPLY DICTIONARY",
+ "AUTORECODE",
+ "BEGIN DATA",
+ "BREAK",
+ "CACHE",
+ "CASEPLOT",
+ "CASESTOVARS",
+ "CATPCA",
+ "CATREG",
+ "CCF",
+ "CD",
+ "CLEAR TRANSFORMATIONS",
+ "CLOSE FILE HANDLE",
+ "CLUSTER",
+ "COMPUTE",
+ "CONJOINT",
+ "CORRELATIONS",
+ "CORRESPONDENCE",
+ "COUNT",
+ "COXREG",
+ "CREATE",
+ "CROSSTABS",
+ "CSDESCRIPTIVES",
+ "CSGLM",
+ "CSLOGISTIC",
+ "CSPLAN",
+ "CSSELECT",
+ "CSTABULATE",
+ "CTABLES",
+ "CURVEFIT",
+ "DATA LIST",
+ "DATAFILE ATTRIBUTE",
+ "DATASET ACTIVATE",
+ "DATASET CLOSE",
+ "DATASET COPY",
+ "DATASET DECLARE",
+ "DATASET DISPLAY",
+ "DATASET NAME",
+ "DATE",
+ "DEBUG EVALUATE",
+ "DEBUG EXPAND",
+ "DEBUG FLOAT FORMAT",
+ "DEBUG FORMAT GUESSER",
+ "DEBUG MATRIX READ",
+ "DEBUG MOMENTS",
+ "DEBUG PAPER SIZE",
+ "DEBUG POOL",
+ "DEBUG XFORM FAIL",
+ "DEFINE",
+ "DELETE VARIABLES",
+ "DESCRIPTIVES",
+ "DETECTANOMALY",
+ "DISCRIMINANT",
+ "DISPLAY MACROS",
+ "DISPLAY VARIABLE SETS",
+ "DISPLAY",
+ "DO IF",
+ "DO REPEAT",
+ "DOCUMENT",
+ "DROP DOCUMENTS",
+ "ECHO",
+ "EDIT",
+ "ELSE IF",
+ "ELSE",
+ "END CASE",
+ "END FILE TYPE",
+ "END FILE",
+ "END IF",
+ "END LOOP",
+ "END REPEAT",
+ "ERASE",
+ "EXAMINE",
+ "EXECUTE",
+ "EXIT",
+ "EXPORT",
+ "FACTOR",
+ "FILE HANDLE",
+ "FILE LABEL",
+ "FILE TYPE",
+ "FILTER",
+ "FINISH",
+ "FIT",
+ "FLIP",
+ "FORMATS",
+ "FREQUENCIES",
+ "GENLOG",
+ "GET DATA",
+ "GET TRANSLATE",
+ "GET",
+ "GGRAPH",
+ "GLM",
+ "GRAPH",
+ "HILOGLINEAR",
+ "HOMALS",
+ "HOST",
+ "IF",
+ "IGRAPH",
+ "IMPORT",
+ "INCLUDE",
+ "INFO",
+ "INPUT PROGRAM",
+ "INSERT",
+ "KEYED DATA LIST",
+ "KM",
+ "LEAVE",
+ "LIST",
+ "LOGISTIC REGRESSION",
+ "LOGLINEAR",
+ "LOOP",
+ "MANOVA",
+ "MAPS",
+ "MATCH FILES",
+ "MATRIX DATA",
+ "MATRIX",
+ "MCONVERT",
+ "MEANS",
+ "MISSING VALUES",
+ "MIXED",
+ "MODEL CLOSE",
+ "MODEL HANDLE",
+ "MODEL LIST",
+ "MODEL NAME",
+ "MRSETS",
+ "MULT RESPONSE",
+ "MULTIPLE CORRESPONDENCE",
+ "MVA",
+ "N OF CASES",
+ "N",
+ "NAIVEBAYES",
+ "NEW FILE",
+ "NLR",
+ "NOMREG",
+ "NONPAR CORR",
+ "NPAR TESTS",
+ "NUMBERED",
+ "NUMERIC",
+ "OLAP CUBES",
+ "OMS",
+ "ONEWAY",
+ "ORTHOPLAN",
+ "OUTPUT MODIFY",
+ "OVERALS",
+ "PACF",
+ "PARTIAL CORR",
+ "PEARSON CORRELATIONS",
+ "PERMISSIONS",
+ "PLANCARDS",
+ "PLUM",
+ "POINT",
+ "PPLOT",
+ "PREDICT",
+ "PREFSCAL",
+ "PRESERVE",
+ "PRINCALS",
+ "PRINT EJECT",
+ "PRINT FORMATS",
+ "PRINT SPACE",
+ "PRINT",
+ "PROBIT",
+ "PROCEDURE OUTPUT",
+ "PROXIMITIES",
+ "PROXSCAL",
+ "Q",
+ "QUICK CLUSTER",
+ "QUIT",
+ "RANK",
+ "RATIO STATISTICS",
+ "READ MODEL",
+ "RECODE",
+ "RECORD TYPE",
+ "REFORMAT",
+ "REGRESSION",
+ "RELIABILITY",
+ "RENAME VARIABLES",
+ "REPEATING DATA",
+ "REPORT",
+ "REREAD",
+ "RESTORE",
+ "RMV",
+ "ROC",
+ "SAMPLE",
+ "SAVE DATA COLLECTION",
+ "SAVE TRANSLATE",
+ "SAVE",
+ "SCRIPT",
+ "SEASON",
+ "SELECT IF",
+ "SELECTPRED",
+ "SET",
+ "SHOW",
+ "SORT CASES",
+ "SORT VARIABLES",
+ "SPCHART",
+ "SPECTRA",
+ "SPLIT FILE",
+ "STEMLEAF",
+ "STRING",
+ "SUBTITLE",
+ "SUMMARIZE",
+ "SURVIVAL",
+ "SYSFILE INFO",
+ "T-TEST",
+ "TDISPLAY",
+ "TEMPORARY",
+ "TITLE",
+ "TREE",
+ "TSAPPLY",
+ "TSET",
+ "TSHOW",
+ "TSMODEL",
+ "TSPLOT",
+ "TWOSTEP CLUSTER",
+ "UNIANOVA",
+ "UNNUMBERED",
+ "UPDATE",
+ "USE",
+ "VALIDATEDATA",
+ "VALUE LABELS",
+ "VARCOMP",
+ "VARIABLE ALIGNMENT",
+ "VARIABLE ATTRIBUTE",
+ "VARIABLE LABELS",
+ "VARIABLE LEVEL",
+ "VARIABLE ROLE",
+ "VARIABLE WIDTH",
+ "VARSTOCASES",
+ "VECTOR",
+ "VERIFY",
+ "WEIGHT",
+ "WLS",
+ "WRITE FORMATS",
+ "WRITE",
+ "XEXPORT",
+ "XGRAPH",
+ "XSAVE",
+];
--- /dev/null
+use std::{
+ borrow::{Borrow, Cow},
+ collections::{HashMap, VecDeque},
+ fmt::Write,
+ fs,
+ io::Result as IoResult,
+ mem,
+ ops::{Range, RangeInclusive},
+ path::Path,
+ sync::Arc,
+};
+
+use chardetng::EncodingDetector;
+use encoding_rs::{Encoding, UTF_8};
+use thiserror::Error as ThisError;
+use unicode_width::{UnicodeWidthChar, UnicodeWidthStr};
+
+use crate::{
+ macros::{macro_tokens_to_syntax, MacroSet, ParseStatus, Parser},
+ message::{Category, Diagnostic, Location, Point, Severity},
+ prompt::PromptStyle,
+ settings::Settings,
+};
+
+use super::{
+ scan::{MergeResult, ScanError, ScanToken},
+ segment::{Mode, Segment, Segmenter},
+ token::Token,
+};
+
+/// Error handling for a [`Reader`].
+#[derive(Copy, Clone, Debug, Default, PartialEq, Eq)]
+pub enum ErrorHandling {
+ /// Discard input line and continue reading.
+ Terminal,
+
+ /// Continue to next command, except for cascading failures.
+ #[default]
+ Continue,
+
+ /// Continue, even for cascading failures.
+ Ignore,
+
+ /// Stop processing.
+ Stop,
+}
+
+/// # Token pipeline
+///
+/// Tokens pass through a pipeline with the following stages. Each token
+/// eventually made available to the parser passes through each of these
+/// stages. The stages are named after the processing that happens in each
+/// one.
+///
+/// Initially, tokens come from the segmenter and scanner to `pp`:
+///
+/// - `pp`: Tokens that need to pass through the macro preprocessor to end up
+/// in `merge`.
+///
+/// - `merge`: Tokens that need to pass through
+/// [`super::scan::ScanToken::merge`] to end up in `parse`.
+///
+/// - `parse`: Tokens available to the client for parsing.
+///
+/// `pp` and `merge` store tokens only temporarily until they pass into `parse`.
+/// Tokens then live in `parse` until the command is fully consumed, at which
+/// time they are freed together.
+pub struct Source {
+ /// Error-handling mode.
+ error_handling: ErrorHandling,
+
+ /// Encoding.
+ encoding: &'static Encoding,
+
+ /// `None` if this reader is not associated with a file.
+ file_name: Option<Arc<String>>,
+
+ /// True if we've reached EOF already.
+ eof: bool,
+
+ /// Read some input from the source. If successful, returns the input that
+ /// was read. At end of file or on error, returns an empty string.
+ ///
+ /// `prompt` provides a hint to interactive readers as to what kind of
+ /// syntax is being read right now.
+ read: Box<dyn Fn(PromptStyle) -> String>,
+
+ /// Source file contents.
+ buffer: String,
+
+ /// 0-based line number of the first line not yet written to the journal.
+ journal_line: usize,
+
+ /// Byte offset of first character not yet scanned as token.
+ seg_pos: usize,
+
+ /// Byte offsets into `buffer` of starts of lines. The first element is 0.
+ lines: Vec<usize>,
+
+ /// Tokens that need to pass through the macro preprocessor to end up in
+ /// `merge`.
+ pp: VecDeque<LexToken>,
+
+ /// Tokens that need to pass through [`super::scan::ScanToken::merge`] to
+ /// end up in `parse`.
+ merge: VecDeque<LexToken>,
+
+ /// Tokens available to the client for parsing.
+ parse: Vec<LexToken>,
+
+ /// Offset in `parse` of the current token.
+ parse_ofs: usize,
+
+ /// Segmenter for dividing `buffer` into lexical segments.
+ segmenter: Segmenter,
+
+ /// True if the newline after an `EndCommand` segment should not count as
+ /// a journal line of its own.
+ suppress_next_newline: bool,
+}
+
+impl Default for Source {
+ // The default source has no input: `read` always returns "", so EOF is
+ // reached immediately.
+ fn default() -> Self {
+ Self {
+ error_handling: ErrorHandling::default(),
+ encoding: UTF_8,
+ file_name: None,
+ eof: false,
+ read: Box::new(|_| String::new()),
+ buffer: String::new(),
+ journal_line: 0,
+ seg_pos: 0,
+ lines: vec![0],
+ pp: VecDeque::new(),
+ merge: VecDeque::new(),
+ parse: Vec::new(),
+ parse_ofs: 0,
+ segmenter: Segmenter::new(Mode::default(), false),
+ suppress_next_newline: false,
+ }
+ }
+}
+
+impl Source {
+ /// Creates a `Source` that reads syntax from the file at `path`.
+ ///
+ /// If `encoding` is `None`, the encoding is guessed from the file
+ /// contents; a byte-order mark, if present, is removed.
+ pub fn for_file<P>(
+ path: P,
+ encoding: Option<&'static Encoding>,
+ syntax: Mode,
+ error_handling: ErrorHandling,
+ ) -> IoResult<Self>
+ where
+ P: AsRef<Path>,
+ {
+ let bytes = fs::read(path.as_ref())?;
+ let encoding = encoding.unwrap_or_else(|| {
+ let mut encoding_detector = EncodingDetector::new();
+ encoding_detector.feed(&bytes, true);
+ encoding_detector.guess(None, true)
+ });
+ let (contents, _malformed) = encoding.decode_with_bom_removal(&bytes);
+ Ok(Self::for_file_contents(
+ contents.to_string(),
+ Some(path.as_ref().to_string_lossy().to_string()),
+ encoding,
+ syntax,
+ error_handling,
+ ))
+ }
+
+ /// Creates a `Source` for `contents`, recording `file_name` as the file
+ /// it came from (for diagnostics).
+ pub fn for_file_contents(
+ contents: String,
+ file_name: Option<String>,
+ encoding: &'static Encoding,
+ syntax: Mode,
+ error_handling: ErrorHandling,
+ ) -> Self {
+ Self {
+ buffer: contents,
+ file_name: file_name.map(Arc::new),
+ encoding,
+ error_handling,
+ segmenter: Segmenter::new(syntax, false),
+ ..Self::default()
+ }
+ }
+
+ /// Creates a `Source` for a string of syntax not associated with a file,
+ /// using default syntax mode and error handling.
+ pub fn for_string(contents: String, encoding: &'static Encoding) -> Self {
+ Self {
+ buffer: contents,
+ encoding,
+ ..Self::default()
+ }
+ }
+
+ /// Creates a `Source` that obtains input on demand from `read`, e.g. for
+ /// interactive use.
+ pub fn for_function(
+ read: Box<dyn Fn(PromptStyle) -> String>,
+ file_name: Option<String>,
+ encoding: &'static Encoding,
+ syntax: Mode,
+ error_handling: ErrorHandling,
+ ) -> Self {
+ Self {
+ read,
+ file_name: file_name.map(Arc::new),
+ encoding,
+ segmenter: Segmenter::new(syntax, false),
+ error_handling,
+ ..Self::default()
+ }
+ }
+
+ /// Appends input from the `read` callback to `buffer` until the unscanned
+ /// part of the buffer contains a newline, or until the callback reports
+ /// end of input (an empty string), which sets `eof`.
+ fn read(&mut self) {
+ loop {
+ let prompt = self.segmenter.prompt();
+ let s = (self.read)(prompt);
+ if s.is_empty() {
+ self.eof = true;
+ return;
+ }
+ self.buffer.push_str(&s);
+ if self.buffer[self.seg_pos..].contains('\n') {
+ return;
+ }
+ }
+ }
+ /// Scans one segment from `buffer` and, if it yields a token, pushes it
+ /// onto `pp`. Returns true if a token was produced. Scan errors are
+ /// reported through `context.error` and produce no token.
+ fn try_get_pp(&mut self, context: &Context) -> bool {
+ // The segmenter returns `Err` when it needs more input to make
+ // progress; keep reading until it succeeds (EOF guarantees progress).
+ let (seg_len, seg_type) = loop {
+ if let Ok(result) = self.segmenter.push(&self.buffer[self.seg_pos..], self.eof) {
+ break result;
+ }
+
+ debug_assert!(!self.eof);
+ self.read();
+ };
+
+ let pos = self.seg_pos..self.seg_pos + seg_len;
+ self.seg_pos += seg_len;
+ if seg_type == Segment::Newline {
+ self.lines.push(self.seg_pos);
+ }
+
+ let scan_token = ScanToken::from_segment(&self.buffer[pos.clone()], seg_type);
+
+ // Count how many whole lines just became available for the journal.
+ // The newline following an `EndCommand` is folded into the command's
+ // own line via `suppress_next_newline`.
+ let n_lines = match (seg_type, self.suppress_next_newline) {
+ (Segment::EndCommand, false) => {
+ self.suppress_next_newline = true;
+ 1
+ }
+ (Segment::Newline, true) => {
+ self.suppress_next_newline = false;
+ 0
+ }
+ (Segment::Newline, false) => 1,
+ _ => 0,
+ };
+ for line_num in self.journal_line..self.journal_line + n_lines {
+ let start_ofs = self.lines[line_num];
+ let end_ofs = self
+ .lines
+ .get(line_num + 1)
+ .copied()
+ .unwrap_or(self.buffer.len());
+ let line = &self.buffer[start_ofs..end_ofs];
+ let _line = line
+ .strip_suffix("\r\n")
+ .unwrap_or(line.strip_suffix('\n').unwrap_or(line));
+ // XXX submit the line as syntax
+ }
+ self.journal_line += n_lines;
+
+ let pos = pos.start..pos.end;
+ match scan_token {
+ None => false,
+ // `Token::End` marks the end of all input: queue a final
+ // `EndCommand` and set `eof`.
+ Some(ScanToken::Token(Token::End)) => {
+ self.pp.push_back(LexToken {
+ token: Token::EndCommand,
+ pos,
+ macro_rep: None,
+ });
+ self.eof = true;
+ true
+ }
+ Some(ScanToken::Token(token)) => {
+ self.pp.push_back(LexToken {
+ token,
+ pos,
+ macro_rep: None,
+ });
+ true
+ }
+ Some(ScanToken::Error(error)) => {
+ (context.error)(
+ Location {
+ file_name: self.file_name.clone(),
+ span: Some(self.offset_to_point(pos.start)..self.offset_to_point(pos.end)),
+ omit_underlines: false,
+ },
+ error.into(),
+ );
+ false
+ }
+ }
+ }
+
+ /// Attempts to add at least one token to `pp`. Returns true on success,
+ /// false if input was exhausted first.
+ fn get_pp(&mut self, context: &Context) -> bool {
+ while !self.eof {
+ if self.try_get_pp(context) {
+ return true;
+ }
+ }
+ false
+ }
+
+ /// Attempts to move at least one token from `pp` to `merge`, expanding
+ /// macros along the way when macro expansion is enabled. Returns true if
+ /// any tokens were moved.
+ fn try_get_merge(&mut self, context: &Context) -> bool {
+ if self.pp.is_empty() && !self.get_pp(context) {
+ return false;
+ }
+
+ if !Settings::global().macros.expand {
+ self.merge.append(&mut self.pp);
+ return true;
+ }
+
+ // Now pass tokens one-by-one to the macro expander.
+ let Some(mut parser) = Parser::new(context.macros, &self.pp[0].token) else {
+ // Common case where there is no macro to expand.
+ self.merge.push_back(self.pp.pop_front().unwrap());
+ return true;
+ };
+ for ofs in 1.. {
+ if self.pp.len() <= ofs && !self.get_pp(context) {
+ // This should not be reachable because we always get a
+ // `Token::EndCommand` at the end of an input file, which should
+ // always terminate macro expansion.
+ unreachable!();
+ }
+ let token = &self.pp[ofs];
+ if parser.push(&token.token, &self.buffer[token.pos.clone()], &|e| {
+ println!("{e:?}")
+ }) == ParseStatus::Complete
+ {
+ break;
+ }
+ }
+ let call = parser.finish();
+ if call.len() == 0 {
+ // False alarm: no macro to expand after all.
+ self.merge.push_back(self.pp.pop_front().unwrap());
+ return true;
+ }
+
+ // Expand the tokens.
+ let c0 = &self.pp[0];
+ let c1 = &self.pp[call.len() - 1];
+ let mut expansion = Vec::new();
+ call.expand(
+ self.segmenter.mode(),
+ self.token_location(c0..=c1),
+ &mut expansion,
+ |e| println!("{e:?}"),
+ );
+ let retval = !expansion.is_empty();
+
+ if Settings::global().macros.print_expansions {
+ // XXX
+ }
+
+ // Append the macro expansion tokens to the lookahead. Each expanded
+ // token records the whole macro call's position in `buffer` plus its
+ // own span within the expansion's textual representation.
+ let mut macro_rep = String::new();
+ let mut pos = Vec::with_capacity(expansion.len());
+ for [prefix, token] in macro_tokens_to_syntax(expansion.as_slice()) {
+ macro_rep.push_str(prefix);
+ let len = macro_rep.len();
+ // NOTE(review): `token.len() - 1` underflows if an expanded
+ // token's syntax can be empty — confirm that it cannot.
+ pos.push(len..=len + token.len() - 1);
+ }
+ let macro_rep = Arc::new(macro_rep);
+ for (index, token) in expansion.into_iter().enumerate() {
+ let lt = LexToken {
+ token: token.token,
+ pos: c0.pos.start..c1.pos.end,
+ macro_rep: Some(MacroRepresentation {
+ expansion: Arc::clone(&macro_rep),
+ pos: pos[index].clone(),
+ }),
+ };
+ self.merge.push_back(lt);
+ }
+ self.pp.drain(..call.len());
+ retval
+ }
+
+ /// Attempts to obtain at least one new token into `self.merge`.
+ ///
+ /// Returns true if successful, false on failure. In the latter case, this
+ /// source is exhausted and `self.eof` is now true.
+ fn get_merge(&mut self, context: &Context) -> bool {
+ while !self.eof {
+ if self.try_get_merge(context) {
+ return true;
+ }
+ }
+ false
+ }
+
+ /// Moves tokens from `merge` into `parse` until one complete parse token
+ /// is produced (possibly merging several scan tokens into one). Returns
+ /// true on success, false when input is exhausted.
+ fn get_parse__(&mut self, context: &Context) -> bool {
+ for i in 0.. {
+ if self.merge.len() <= i && !self.get_merge(context) {
+ // We always get a `Token::EndCommand` at the end of an input
+ // file and the merger should return `Some(...)` for that token.
+ debug_assert_eq!(self.merge.len(), 0);
+ return false;
+ }
+
+ match ScanToken::merge(&self.merge) {
+ // Needs more lookahead: loop to fetch another token.
+ None => (),
+ Some(MergeResult::Copy) => {
+ self.parse.push(self.merge.pop_front().unwrap());
+ return true;
+ }
+ Some(MergeResult::Expand { n, token }) => {
+ // `n` scan tokens merge into one parse token spanning them
+ // all. Keep the macro representation only if both ends
+ // come from the same expansion.
+ let first = &self.merge[0];
+ let last = &self.merge[n - 1];
+ self.parse.push(LexToken {
+ token,
+ pos: first.pos.start..last.pos.end,
+ macro_rep: match (&first.macro_rep, &last.macro_rep) {
+ (Some(a), Some(b)) if Arc::ptr_eq(&a.expansion, &b.expansion) => {
+ Some(MacroRepresentation {
+ expansion: a.expansion.clone(),
+ pos: *a.pos.start()..=*b.pos.end(),
+ })
+ }
+ _ => None,
+ },
+ });
+ self.merge.drain(..n);
+ return true;
+ }
+ }
+ }
+ unreachable!();
+ }
+
+ /// Public-facing wrapper around [Self::get_parse__].
+ fn get_parse(&mut self, context: &Context) -> bool {
+ // XXX deal with accumulated messages
+ self.get_parse__(context)
+ }
+
+ /// Converts byte offset `offset` in `buffer` into a 1-based line/column
+ /// [Point]. The column is a display-width column, not a byte offset.
+ fn offset_to_point(&self, offset: usize) -> Point {
+ // `partition_point` yields the number of line starts at or before
+ // `offset`, which is the 1-based line number.
+ let line = self
+ .lines
+ .partition_point(|&line_start| line_start <= offset);
+ Point {
+ line: line as i32,
+ column: Some(
+ self.buffer
+ .get(self.lines[line - 1]..offset)
+ .unwrap_or_default()
+ .width() as i32
+ + 1,
+ ),
+ }
+ }
+
+ /// Returns the syntax for 1-based line-number `line_number`, without its
+ /// line-ending; an empty string if the line number is out of range.
+ fn get_line(&self, line_number: i32) -> &str {
+ if (1..=self.lines.len() as i32).contains(&line_number) {
+ let line_number = line_number as usize;
+ let start = self.lines[line_number - 1];
+ // For the last (still unterminated) line, fall back to searching
+ // for a newline or the end of the buffer.
+ let end = self.lines.get(line_number).copied().unwrap_or(
+ self.buffer[start..]
+ .find('\n')
+ .map(|ofs| ofs + start)
+ .unwrap_or(self.buffer.len()),
+ );
+ let line = &self.buffer[start..end];
+ line.strip_suffix("\r\n")
+ .unwrap_or(line.strip_suffix('\n').unwrap_or(line))
+ } else {
+ ""
+ }
+ }
+
+ /// Returns the [Location] spanning the given inclusive range of tokens.
+ fn token_location(&self, range: RangeInclusive<&LexToken>) -> Location {
+ Location {
+ file_name: self.file_name.clone(),
+ span: Some(
+ self.offset_to_point(range.start().pos.start)
+ ..self.offset_to_point(range.end().pos.end),
+ ),
+ omit_underlines: false,
+ }
+ }
+
+ /// Returns the [Location] spanning the tokens at the given inclusive
+ /// offsets in `parse`, or a span-less location if the range is invalid.
+ fn ofs_location(&self, range: RangeInclusive<usize>) -> Location {
+ if *range.start() <= *range.end() && *range.end() < self.parse.len() {
+ self.token_location(&self.parse[*range.start()]..=&self.parse[*range.end()])
+ } else {
+ Location {
+ file_name: self.file_name.clone(),
+ span: None,
+ omit_underlines: false,
+ }
+ }
+ }
+
+ fn token(&self) -> &Token {
+ &self.parse[self.parse_ofs].token
+ }
+
+ fn next(&mut self, offset: isize, context: &Context) -> &Token {
+ let Some(index) = offset.checked_add(self.parse_ofs as isize) else {
+ return &Token::EndCommand;
+ };
+ let Ok(index) = usize::try_from(index) else {
+ return &Token::EndCommand;
+ };
+
+ while index >= self.parse.len() {
+ if let Some(token) = self.parse.last() {
+ match token.token {
+ Token::End => return &Token::End,
+ Token::EndCommand => return &Token::EndCommand,
+ _ => (),
+ }
+ }
+ self.get_parse(context);
+ }
+ &self.parse[index].token
+ }
+
+ /// If the token range `ofs` contains a macro call, this returns the raw
+ /// syntax for the macro call (not for the expansion) and for any other
+ /// tokens included in that range. The syntax is encoded in UTF-8 and in
+ /// the original form supplied to the lexer so that, for example, it may
+ /// include comments, spaces, and new-lines if it spans multiple tokens.
+ ///
+ /// Returns `None` if the token range doesn't include a macro call.
+ fn get_macro_call(&self, ofs: RangeInclusive<usize>) -> Option<&str> {
+ if self
+ .parse
+ .get(ofs.clone())
+ .unwrap_or_default()
+ .iter()
+ .all(|token| token.macro_rep.is_none())
+ {
+ return None;
+ }
+
+ let token0 = &self.parse[*ofs.start()];
+ let token1 = &self.parse[*ofs.end()];
+ Some(&self.buffer[token0.pos.start..token1.pos.end])
+ }
+
+ fn is_empty(&self) -> bool {
+ self.buffer.is_empty() && self.eof
+ }
+
+ fn diagnostic(
+ &self,
+ severity: Severity,
+ ofs: RangeInclusive<usize>,
+ text: String,
+ ) -> Diagnostic {
+ let mut s = String::with_capacity(text.len() + 16);
+ if self.is_empty() {
+ s.push_str("At end of input: ");
+ } else if let Some(call) = self.get_macro_call(ofs.clone()) {
+ write!(&mut s, "In syntax expanded from `{}`: ", ellipsize(call)).unwrap();
+ }
+
+ if !text.is_empty() {
+ s.push_str(&text);
+ } else {
+ s.push_str("Syntax error.");
+ }
+
+ if !s.ends_with('.') {
+ s.push('.');
+ }
+
+ let location = self.ofs_location(ofs);
+ let mut source = Vec::new();
+ if let Some(Range {
+ start: Point { line: l0, .. },
+ end: Point { line: l1, .. },
+ }) = location.span
+ {
+ let lines = if l1 - l0 > 3 {
+ vec![l0, l0 + 1, l1]
+ } else {
+ (l0..=l1).collect()
+ };
+ for line_number in lines {
+ source.push((line_number, self.get_line(line_number).to_string()));
+ }
+ }
+
+ Diagnostic {
+ category: Category::Syntax,
+ severity,
+ location,
+ source,
+ stack: Vec::new(),
+ command_name: None, // XXX
+ text: s,
+ }
+ }
+
+ fn interactive_reset(&mut self) {
+ if self.error_handling == ErrorHandling::Terminal {
+ let Source {
+ error_handling,
+ encoding,
+ read,
+ ..
+ } = mem::take(self);
+ *self = Self {
+ error_handling,
+ encoding,
+ read,
+ ..Source::default()
+ };
+ }
+ }
+}
+
+fn ellipsize(s: &str) -> Cow<str> {
+ if s.width() > 64 {
+ let mut out = String::new();
+ let mut width = 0;
+ for c in s.chars() {
+ out.push(c);
+ width += c.width().unwrap_or(0);
+ if width > 64 {
+ break;
+ }
+ }
+ out.push_str("...");
+ Cow::from(out)
+ } else {
+ Cow::from(s)
+ }
+}
+
+/// A token in a [`Source`].
+struct LexToken {
+ /// The regular token.
+ token: Token,
+
+ /// For a token obtained through the lexer in an ordinary way, this is the
+ /// location of the token in the [`Source`]'s buffer.
+ ///
+ /// For a token produced through macro expansion, this is the entire macro
+ /// call.
+ pos: Range<usize>,
+
+ /// For a token obtained through macro expansion, the part of the macro
+ /// expansion that represents this token.
+ ///
+ /// For a token obtained through the lexer in an ordinary way, this is
+ /// `None`.
+ macro_rep: Option<MacroRepresentation>,
+}
+
+impl Borrow<Token> for LexToken {
+ fn borrow(&self) -> &Token {
+ &self.token
+ }
+}
+
+struct MacroRepresentation {
+ /// An entire macro expansion.
+ expansion: Arc<String>,
+
+ /// The substring of `expansion` that represents a single token.
+ pos: RangeInclusive<usize>,
+}
+
+pub struct Lexer {
+ source: Source,
+ stack: Vec<Source>,
+ macros: MacroSet,
+ error: Box<dyn Fn(Location, Error)>,
+}
+
+struct Context<'a> {
+ macros: &'a MacroSet,
+ error: &'a Box<dyn Fn(Location, Error)>,
+}
+
+impl Lexer {
+ pub fn new(error: Box<dyn Fn(Location, Error)>) -> Self {
+ Self {
+ source: Source::default(),
+ stack: Vec::new(),
+ macros: HashMap::new(),
+ error,
+ }
+ }
+
+ pub fn get(&mut self) -> &Token {
+ if self.source.parse_ofs < self.source.parse.len() {
+ if let Token::EndCommand = self.source.token() {
+ self.source.parse.clear();
+ self.source.parse_ofs = 0;
+ } else {
+ self.source.parse_ofs += 1;
+ }
+ }
+
+ while self.source.parse_ofs == self.source.parse.len() {
+ let context = Context {
+ macros: &self.macros,
+ error: &self.error,
+ };
+ if !self.source.get_parse(&context) && !self.pop_stack() {
+ return &Token::End;
+ }
+ }
+ self.source.token()
+ }
+
+ fn pop_stack(&mut self) -> bool {
+ if let Some(new_source) = self.stack.pop() {
+ self.source = new_source;
+ true
+ } else {
+ self.source = Source::default();
+ self.source.parse.push(LexToken {
+ token: Token::End,
+ pos: 0..0,
+ macro_rep: None,
+ });
+ false
+ }
+ }
+
+ /// Inserts `source` so that the next token comes from it. This is only
+ /// permitted when the lexer is either empty or at `Token::EndCommand`.
+ pub fn include(&mut self, mut source: Source) {
+ // XXX what's the right assertion?
+ let context = Context {
+ macros: &self.macros,
+ error: &self.error,
+ };
+ source.get_parse(&context);
+ let old_source = mem::replace(&mut self.source, source);
+ self.stack.push(old_source);
+ }
+
+ /// Inserts `source` so that it will be read after all the other sources.
+ pub fn append(&mut self, mut source: Source) {
+ let context = Context {
+ macros: &self.macros,
+ error: &self.error,
+ };
+ source.get_parse(&context);
+ self.stack.insert(0, source);
+ }
+
+ pub fn token(&self) -> &Token {
+ self.source.token()
+ }
+
+ pub fn next(&mut self, offset: isize) -> &Token {
+ let context = Context {
+ macros: &self.macros,
+ error: &self.error,
+ };
+ self.source.next(offset, &context)
+ }
+
+ pub fn error<S>(&self, text: S) -> Diagnostic
+ where
+ S: ToString,
+ {
+ self.diagnostic(
+ Severity::Error,
+ self.source.parse_ofs..=self.source.parse_ofs,
+ text,
+ )
+ }
+
+ pub fn diagnostic<S>(
+ &self,
+ severity: Severity,
+ ofs: RangeInclusive<usize>,
+ text: S,
+ ) -> Diagnostic
+ where
+ S: ToString,
+ {
+ self.source.diagnostic(severity, ofs, text.to_string())
+ }
+
+ pub fn error_handling(&self) -> ErrorHandling {
+ self.source.error_handling
+ }
+
+ /// Discards all lookahead tokens, then discards all input sources
+ /// until it encounters one with error mode [ErrorHandling::Terminal] or until it
+ /// runs out of input sources.
+ pub fn discard_noninteractive(&mut self) {
+ while self.source.error_handling != ErrorHandling::Ignore {
+ self.source.pp.clear();
+ self.source.merge.clear();
+ self.source.parse.clear();
+ self.source.parse_ofs = 0;
+
+ if self.source.error_handling == ErrorHandling::Terminal || !self.pop_stack() {
+ return;
+ }
+ }
+ }
+
+ /// If the source that the lexer is currently reading has error mode
+ /// [ErrorHandling::Terminal], discards all buffered input and tokens, so
+ /// that the next token to be read comes directly from whatever is next read
+ /// from the stream.
+ ///
+ /// It makes sense to call this function after encountering an error in a
+ /// command entered on the console, because usually the user would prefer
+ /// not to have cascading errors.
+ pub fn interactive_reset(&mut self) {
+ self.source.interactive_reset()
+ }
+
+ /// Advances past any tokens up to [Token::EndCommand] or [Token::End].
+ pub fn discard_rest_of_command(&mut self) {
+ while !matches!(self.token(), Token::EndCommand | Token::End) {
+ self.get();
+ }
+ }
+}
+
+#[derive(ThisError, Clone, Debug, PartialEq, Eq)]
+pub enum Error {
+ /// Error forming tokens from the input.
+ #[error("{0}")]
+ TokenError(#[from] ScanError),
+}
+
+#[cfg(test)]
+mod tests {
+ use encoding_rs::UTF_8;
+
+ use crate::lex::{segment::Mode, token::Token};
+
+ use super::{ErrorHandling, Lexer, Source};
+
+ #[test]
+ fn test() {
+ let mut lexer = Lexer::new(Box::new(|location, error| println!("{location}: {error}")));
+ lexer.include(Source::for_string(
+ String::from(
+ r#"#! /usr/local/bin/pspp
+DATA LIST LIST NOTABLE /a.
+BEGIN DATA.
+1
+2
+END DATA.
+LIST.
+"#,
+ ),
+ UTF_8,
+ ));
+ loop {
+ lexer.get();
+ let token = lexer.token();
+ println!("{token:?}");
+ if let Token::End = token {
+ break;
+ }
+ }
+ }
+
+ #[test]
+ fn test_scan_errors() {
+ let mut lexer = Lexer::new(Box::new(|location, error| println!("{location}: {error}")));
+ lexer.include(Source::for_file_contents(
+ String::from(
+ r#"x'123'
+x'1x'
+u''
+u'012345678'
+u'd800'
+u'110000'
+'foo
+'very long unterminated string that be ellipsized in its error message
+1e .x
+^
+�
+"#,
+ ),
+ Some(String::from("syntax.sps")),
+ UTF_8,
+ Mode::default(),
+ ErrorHandling::default(),
+ ));
+ loop {
+ lexer.get();
+ let token = lexer.token();
+ println!("{token:?}");
+ if let Token::End = token {
+ break;
+ }
+ }
+ }
+
+ #[test]
+ fn test_null_byte() {
+ let mut lexer = Lexer::new(Box::new(|location, error| println!("{location}: {error}")));
+ lexer.include(Source::for_file_contents(
+ String::from(
+ "datA dist list notable file='input.txt'/a b c.
+lis|.\0",
+ ),
+ Some(String::from("syntax.sps")),
+ UTF_8,
+ Mode::default(),
+ ErrorHandling::default(),
+ ));
+ loop {
+ lexer.get();
+ let token = lexer.token();
+ println!("{token:?}");
+ if let Token::End = token {
+ break;
+ }
+ }
+ }
+}
--- /dev/null
+//! PSPP lexical analysis.
+//!
+//! PSPP divides traditional "lexical analysis" or "tokenization" into two
+//! phases: a lower-level phase called "segmentation" and a higher-level phase
+//! called "scanning". [segment] implements the segmentation phase and [scan]
+//! the scanning phase.
+//!
+//! Scanning accepts as input a stream of segments, which are UTF-8 strings each
+//! labeled with a segment type. It outputs a stream of "scan tokens", which
+//! are the same as the tokens used by the PSPP parser with a few additional
+//! types.
+
+pub mod segment;
+pub mod scan;
+pub mod command_name;
+pub mod token;
+pub mod lexer;
--- /dev/null
+//! PSPP syntax scanning.
+//!
+//! PSPP divides traditional "lexical analysis" or "tokenization" into two
+//! phases: a lower-level phase called "segmentation" and a higher-level phase
+//! called "scanning". [super::segment] implements the segmentation phase and
+//! this module the scanning phase.
+//!
+//! Scanning accepts as input a stream of segments, which are UTF-8 strings each
+//! labeled with a segment type. It outputs a stream of "scan tokens", which
+//! are the same as the tokens used by the PSPP parser with a few additional
+//! types.
+
+use crate::identifier::{Identifier, ReservedWord};
+
+use super::{
+ segment::{Mode, Segment, Segmenter},
+ token::{Punct, Token},
+};
+use std::{borrow::Borrow, collections::VecDeque};
+use thiserror::Error as ThisError;
+
+#[derive(ThisError, Clone, Debug, PartialEq, Eq)]
+pub enum ScanError {
+ /// Unterminated string constant.
+ #[error("Unterminated string constant.")]
+ ExpectedQuote,
+
+ /// Missing exponent.
+ #[error("Missing exponent following `{0}`")]
+ ExpectedExponent(String),
+
+ /// Odd length hex string.
+ #[error("String of hex digits has {0} characters, which is not a multiple of 2.")]
+ OddLengthHexString(usize),
+
+ /// Invalid hex digit.
+ #[error("Invalid hex digit {0:?}.")]
+ BadHexDigit(char),
+
+ /// Incomplete UTF-8 sequence.
+ #[error("Incomplete UTF-8 sequence `{substring}` starting {offset} digits into hex string.")]
+ IncompleteUtf8 { substring: String, offset: usize },
+
+ /// Bad UTF-8 sequence.
+ #[error("Invalid UTF-8 sequence `{substring}` starting {offset} digits into hex string.")]
+ BadUtf8 { substring: String, offset: usize },
+
+ /// Invalid length Unicode string.
+ #[error("Unicode string contains {0} bytes, which is not in the valid range of 1 to 8 bytes.")]
+ BadLengthUnicodeString(usize),
+
+ /// Invalid code point.
+ #[error("U+{0:04X} is not a valid Unicode code point.")]
+ BadCodePoint(u32),
+
+ /// Expected hexadecimal Unicode code point
+ #[error("Expected hexadecimal Unicode code point.")]
+ ExpectedCodePoint,
+
+ /// `DO REPEAT` nested too deeply.
+ #[error("`DO REPEAT` nested too deeply.")]
+ DoRepeatOverflow,
+
+ /// Unexpected character.
+ #[error("Unexpected character {0:?} in input.")]
+ UnexpectedChar(char),
+}
+
+/// The input or output to token merging.
+#[derive(Clone, Debug, PartialEq)]
+pub enum ScanToken {
+ Token(Token),
+ Error(ScanError),
+}
+
+/// The result of merging tokens.
+#[derive(Clone, Debug)]
+pub enum MergeResult {
+ /// Copy one token literally from input to output.
+ Copy,
+
+ /// Expand `n` tokens from the input into `token` in the output.
+ Expand {
+ /// Number of tokens to expand.
+ n: usize,
+
+ /// Replacement token.
+ token: Token,
+ },
+}
+
+impl ScanToken {
+ pub fn from_segment(s: &str, segment: Segment) -> Option<Self> {
+ match segment {
+ Segment::Number => Some(Self::Token(Token::Number(s.parse().unwrap()))),
+ Segment::QuotedString => {
+ // Trim quote mark from front and back.
+ let mut chars = s.chars();
+ let quote = chars.next().unwrap();
+ let s = chars.as_str().strip_suffix(quote).unwrap();
+
+ // Replace doubled quotes by single ones.
+ let (single_quote, double_quote) = match quote {
+ '\'' => ("'", "''"),
+ '"' => ("\"", "\"\""),
+ _ => unreachable!(),
+ };
+ Some(Self::Token(Token::String(
+ s.replace(double_quote, single_quote),
+ )))
+ }
+ Segment::HexString => {
+ // Strip `X"` prefix and `"` suffix (or variations).
+ let s = &s[2..s.len() - 1];
+ for c in s.chars() {
+ if !c.is_ascii_hexdigit() {
+ return Some(Self::Error(ScanError::BadHexDigit(c)));
+ }
+ }
+ if s.len() % 2 != 0 {
+ return Some(Self::Error(ScanError::OddLengthHexString(s.len())));
+ }
+ let bytes = s
+ .as_bytes()
+ .chunks_exact(2)
+ .map(|pair| {
+ let hi = char::from(pair[0]).to_digit(16).unwrap() as u8;
+ let lo = char::from(pair[1]).to_digit(16).unwrap() as u8;
+ hi * 16 + lo
+ })
+ .collect::<Vec<_>>();
+ match String::from_utf8(bytes) {
+ Ok(string) => Some(Self::Token(Token::String(string))),
+ Err(error) => {
+ let details = error.utf8_error();
+ let offset = details.valid_up_to() * 2;
+ let end = details
+ .error_len()
+ .map(|len| offset + len * 2)
+ .unwrap_or(s.len());
+ let substring = String::from(&s[offset..end]);
+ Some(Self::Error(if details.error_len().is_some() {
+ ScanError::BadUtf8 { substring, offset }
+ } else {
+ ScanError::IncompleteUtf8 { substring, offset }
+ }))
+ }
+ }
+ }
+ Segment::UnicodeString => {
+ // Strip `U"` prefix and `"` suffix (or variations).
+ let s = &s[2..s.len() - 1];
+ if !(1..=8).contains(&s.len()) {
+ return Some(Self::Error(ScanError::BadLengthUnicodeString(s.len())));
+ }
+ let Ok(code_point) = u32::from_str_radix(s, 16) else {
+ return Some(Self::Error(ScanError::ExpectedCodePoint));
+ };
+ let Some(c) = char::from_u32(code_point) else {
+ return Some(Self::Error(ScanError::BadCodePoint(code_point)));
+ };
+ Some(Self::Token(Token::String(String::from(c))))
+ }
+
+ Segment::UnquotedString
+ | Segment::DoRepeatCommand
+ | Segment::InlineData
+ | Segment::Document
+ | Segment::MacroBody
+ | Segment::MacroName => Some(Self::Token(Token::String(String::from(s)))),
+
+ Segment::Identifier => {
+ if let Ok(reserved_word) = ReservedWord::try_from(s) {
+ match reserved_word {
+ ReservedWord::And => Some(Self::Token(Token::Punct(Punct::And))),
+ ReservedWord::Or => Some(Self::Token(Token::Punct(Punct::Or))),
+ ReservedWord::Not => Some(Self::Token(Token::Punct(Punct::Not))),
+ ReservedWord::Eq => Some(Self::Token(Token::Punct(Punct::Eq))),
+ ReservedWord::Ge => Some(Self::Token(Token::Punct(Punct::Ge))),
+ ReservedWord::Gt => Some(Self::Token(Token::Punct(Punct::Gt))),
+ ReservedWord::Le => Some(Self::Token(Token::Punct(Punct::Le))),
+ ReservedWord::Lt => Some(Self::Token(Token::Punct(Punct::Lt))),
+ ReservedWord::Ne => Some(Self::Token(Token::Punct(Punct::Ne))),
+ ReservedWord::All => Some(Self::Token(Token::Punct(Punct::All))),
+ ReservedWord::By => Some(Self::Token(Token::Punct(Punct::By))),
+ ReservedWord::To => Some(Self::Token(Token::Punct(Punct::To))),
+ ReservedWord::With => Some(Self::Token(Token::Punct(Punct::With))),
+ }
+ } else {
+ Some(Self::Token(Token::Id(Identifier::new(s).unwrap())))
+ }
+ }
+ Segment::Punct => match s {
+ "(" => Some(Self::Token(Token::Punct(Punct::LParen))),
+ ")" => Some(Self::Token(Token::Punct(Punct::RParen))),
+ "[" => Some(Self::Token(Token::Punct(Punct::LSquare))),
+ "]" => Some(Self::Token(Token::Punct(Punct::RSquare))),
+ "{" => Some(Self::Token(Token::Punct(Punct::LCurly))),
+ "}" => Some(Self::Token(Token::Punct(Punct::RCurly))),
+ "," => Some(Self::Token(Token::Punct(Punct::Comma))),
+ "=" => Some(Self::Token(Token::Punct(Punct::Equals))),
+ "-" => Some(Self::Token(Token::Punct(Punct::Dash))),
+ "&" => Some(Self::Token(Token::Punct(Punct::And))),
+ "|" => Some(Self::Token(Token::Punct(Punct::Or))),
+ "+" => Some(Self::Token(Token::Punct(Punct::Plus))),
+ "/" => Some(Self::Token(Token::Punct(Punct::Slash))),
+ "*" => Some(Self::Token(Token::Punct(Punct::Asterisk))),
+ "<" => Some(Self::Token(Token::Punct(Punct::Lt))),
+ ">" => Some(Self::Token(Token::Punct(Punct::Gt))),
+ "~" => Some(Self::Token(Token::Punct(Punct::Not))),
+ ":" => Some(Self::Token(Token::Punct(Punct::Colon))),
+ ";" => Some(Self::Token(Token::Punct(Punct::Semicolon))),
+ "**" => Some(Self::Token(Token::Punct(Punct::Exp))),
+ "<=" => Some(Self::Token(Token::Punct(Punct::Le))),
+ "<>" => Some(Self::Token(Token::Punct(Punct::Ne))),
+ "~=" => Some(Self::Token(Token::Punct(Punct::Ne))),
+ ">=" => Some(Self::Token(Token::Punct(Punct::Ge))),
+ "!" => Some(Self::Token(Token::Punct(Punct::Bang))),
+ "%" => Some(Self::Token(Token::Punct(Punct::Percent))),
+ "?" => Some(Self::Token(Token::Punct(Punct::Question))),
+ "`" => Some(Self::Token(Token::Punct(Punct::Backtick))),
+ "_" => Some(Self::Token(Token::Punct(Punct::Underscore))),
+ "." => Some(Self::Token(Token::Punct(Punct::Dot))),
+ "!*" => Some(Self::Token(Token::Punct(Punct::BangAsterisk))),
+ _ => unreachable!("bad punctuator {s:?}"),
+ },
+ Segment::Shbang
+ | Segment::Spaces
+ | Segment::Comment
+ | Segment::Newline
+ | Segment::CommentCommand => None,
+ Segment::DoRepeatOverflow => Some(Self::Error(ScanError::DoRepeatOverflow)),
+ Segment::StartDocument => {
+ Some(Self::Token(Token::Id(Identifier::new("DOCUMENT").unwrap())))
+ }
+ Segment::StartCommand | Segment::SeparateCommands | Segment::EndCommand => {
+ Some(Self::Token(Token::EndCommand))
+ }
+ Segment::End => Some(Self::Token(Token::End)),
+ Segment::ExpectedQuote => Some(Self::Error(ScanError::ExpectedQuote)),
+ Segment::ExpectedExponent => {
+ Some(Self::Error(ScanError::ExpectedExponent(String::from(s))))
+ }
+ Segment::UnexpectedChar => Some(Self::Error(ScanError::UnexpectedChar(
+ s.chars().next().unwrap(),
+ ))),
+ }
+ }
+
+ /// Attempts to merge a sequence of tokens together into a single token. The
+ /// tokens are taken from the beginning of `input`. If successful, removes one
+ /// or more tokens from the beginning of `input` and returns the merged
+ /// token. More input tokens might be needed; if so, leaves `input` alone and
+ /// returns `None`. In the latter case, the caller should add more tokens to the
+ /// input ([Token::End] or [Token::EndCommand] is always sufficient).
+ ///
+ /// This performs two different kinds of token merging:
+ ///
+ /// - String concatenation, where syntax like `"a" + "b"` is converted into a
+ /// single string token. This is definitely needed because the parser relies
+ /// on it.
+ ///
+ /// - Negative number merging, where syntax like `-5` is converted from a pair
+ /// of tokens (a dash and a positive number) into a single token (a negative
+ /// number). This might not be needed anymore because the segmenter
+ /// directly treats a dash followed by a number, with optional intervening
+ /// white space, as a negative number. It's only needed if we want
+ /// intervening comments to be allowed or for part of the negative number
+ /// token to be produced by macro expansion.
+ pub fn merge<T>(tokens: &T) -> Option<MergeResult>
+ where
+ T: Tokens,
+ {
+ match tokens.get(0)? {
+ Token::Punct(Punct::Dash) => match tokens.get(1)? {
+ Token::Number(number) if number.is_sign_positive() => {
+ let number = *number;
+ return Some(MergeResult::Expand {
+ n: 2,
+ token: Token::Number(-number),
+ });
+ }
+ _ => Some(MergeResult::Copy),
+ },
+ Token::String(_) => {
+ let mut i = 0;
+ while matches!(tokens.get(i * 2 + 1)?, Token::Punct(Punct::Plus))
+ && matches!(tokens.get(i * 2 + 2)?, Token::String(_))
+ {
+ i += 1;
+ }
+ if i == 0 {
+ Some(MergeResult::Copy)
+ } else {
+ let mut output = String::new();
+ for i in 0..=i {
+ let Token::String(s) = tokens.get(i * 2).unwrap() else {
+ unreachable!()
+ };
+ output.push_str(&s);
+ }
+ Some(MergeResult::Expand {
+ n: i * 2 + 1,
+ token: Token::String(output),
+ })
+ }
+ }
+ _ => Some(MergeResult::Copy),
+ }
+ }
+}
+
+pub trait Tokens {
+ fn get(&self, index: usize) -> Option<&Token>;
+}
+
+impl<T> Tokens for VecDeque<T>
+where
+ T: Borrow<Token>,
+{
+ fn get(&self, index: usize) -> Option<&Token> {
+ self.get(index).map(|token| token.borrow())
+ }
+}
+
+pub struct StringSegmenter<'a> {
+ input: &'a str,
+ segmenter: Segmenter,
+}
+
+impl<'a> StringSegmenter<'a> {
+ pub fn new(input: &'a str, mode: Mode, is_snippet: bool) -> Self {
+ Self {
+ input,
+ segmenter: Segmenter::new(mode, is_snippet),
+ }
+ }
+}
+
+impl<'a> Iterator for StringSegmenter<'a> {
+ type Item = (&'a str, ScanToken);
+
+ fn next(&mut self) -> Option<Self::Item> {
+ loop {
+ let (seg_len, seg_type) = self.segmenter.push(self.input, true).unwrap();
+ if seg_type == Segment::End {
+ return None;
+ }
+ let (s, rest) = self.input.split_at(seg_len);
+ self.input = rest;
+
+ if let Some(token) = ScanToken::from_segment(s, seg_type) {
+ return Some((s, token));
+ }
+ }
+ }
+}
+
+pub struct StringScanner<'a> {
+ input: &'a str,
+ segmenter: Segmenter,
+ tokens: VecDeque<Token>,
+}
+
+impl<'a> StringScanner<'a> {
+ pub fn new(input: &'a str, mode: Mode, is_snippet: bool) -> Self {
+ Self {
+ input,
+ segmenter: Segmenter::new(mode, is_snippet),
+ tokens: VecDeque::with_capacity(1),
+ }
+ }
+
+ fn merge(&mut self) -> Option<ScanToken> {
+ let result = ScanToken::merge(&self.tokens)?;
+ match result {
+ MergeResult::Copy => Some(ScanToken::Token(self.tokens.pop_front().unwrap())),
+ MergeResult::Expand { n, token } => {
+ self.tokens.drain(..n);
+ Some(ScanToken::Token(token))
+ }
+ }
+ }
+}
+
+impl<'a> Iterator for StringScanner<'a> {
+ type Item = ScanToken;
+
+ fn next(&mut self) -> Option<Self::Item> {
+ if let Some(token) = self.merge() {
+ return Some(token);
+ }
+ loop {
+ let (seg_len, seg_type) = self.segmenter.push(self.input, true).unwrap();
+ if seg_type == Segment::End && self.tokens.is_empty() {
+ return None;
+ }
+ let (s, rest) = self.input.split_at(seg_len);
+ self.input = rest;
+
+ match ScanToken::from_segment(s, seg_type) {
+ Some(ScanToken::Error(error)) => return Some(ScanToken::Error(error)),
+ Some(ScanToken::Token(token)) => {
+ self.tokens.push_back(token);
+ if let Some(token) = self.merge() {
+ return Some(token);
+ }
+ }
+ None => (),
+ }
+ }
+ }
+}
+
+#[cfg(test)]
+mod test;
--- /dev/null
+use crate::{identifier::Identifier, lex::{
+ segment::Mode,
+ token::{Punct, Token},
+}};
+
+use super::{ScanError, ScanToken, StringScanner};
+
+fn print_token(token: &Token) {
+ match token {
+ Token::End => print!("Token::End"),
+ Token::Id(s) => print!("Token::Id(String::from({s:?}))"),
+ Token::Number(number) => print!("Token::Number({number:?})"),
+ Token::String(s) => print!("Token::String(String::from({s:?}))"),
+ Token::EndCommand => print!("Token::EndCommand"),
+ Token::Punct(punct) => print!("Token::Punct(Punct::{punct:?})"),
+ }
+}
+
+fn check_scan(input: &str, mode: Mode, expected: &[ScanToken]) {
+ let tokens = StringScanner::new(input, mode, false).collect::<Vec<_>>();
+
+ if &tokens != expected {
+ for token in &tokens {
+ match token {
+ ScanToken::Token(token) => {
+ print!("ScanToken::Token(");
+ print_token(token);
+ print!(")");
+ }
+ ScanToken::Error(error) => print!("ScanToken::Error(ScanError::{error:?})"),
+ }
+ println!(",");
+ }
+
+ eprintln!("tokens differ from expected:");
+ let difference = diff::slice(expected, &tokens);
+ for result in difference {
+ match result {
+ diff::Result::Left(left) => eprintln!("-{left:?}"),
+ diff::Result::Both(left, _right) => eprintln!(" {left:?}"),
+ diff::Result::Right(right) => eprintln!("+{right:?}"),
+ }
+ }
+ panic!();
+ }
+}
+
+#[test]
+fn test_identifiers() {
+ check_scan(
+ r#"a aB i5 $x @efg @@. !abcd !* !*a #.# .x _z.
+abcd. abcd.
+QRSTUV./* end of line comment */
+QrStUv./* end of line comment */
+WXYZ. /* unterminated end of line comment
+�. /* U+FFFD is not valid in an identifier
+"#,
+ Mode::Auto,
+ &[
+ ScanToken::Token(Token::Id(Identifier::new("a").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("aB").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("i5").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("$x").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("@efg").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("@@.").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("!abcd").unwrap())),
+ ScanToken::Token(Token::Punct(Punct::BangAsterisk)),
+ ScanToken::Token(Token::Punct(Punct::BangAsterisk)),
+ ScanToken::Token(Token::Id(Identifier::new("a").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("#.#").unwrap())),
+ ScanToken::Token(Token::Punct(Punct::Dot)),
+ ScanToken::Token(Token::Id(Identifier::new("x").unwrap())),
+ ScanToken::Token(Token::Punct(Punct::Underscore)),
+ ScanToken::Token(Token::Id(Identifier::new("z").unwrap())),
+ ScanToken::Token(Token::EndCommand),
+ ScanToken::Token(Token::Id(Identifier::new("abcd.").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("abcd").unwrap())),
+ ScanToken::Token(Token::EndCommand),
+ ScanToken::Token(Token::Id(Identifier::new("QRSTUV").unwrap())),
+ ScanToken::Token(Token::EndCommand),
+ ScanToken::Token(Token::Id(Identifier::new("QrStUv").unwrap())),
+ ScanToken::Token(Token::EndCommand),
+ ScanToken::Token(Token::Id(Identifier::new("WXYZ").unwrap())),
+ ScanToken::Token(Token::EndCommand),
+ ScanToken::Error(ScanError::UnexpectedChar('�')),
+ ScanToken::Token(Token::EndCommand),
+ ],
+ );
+}
+
+#[test]
+fn test_reserved_words() {
+ check_scan(
+ r#"and or not eq ge gt le lt ne all by to with
+AND OR NOT EQ GE GT LE LT NE ALL BY TO WITH
+andx orx notx eqx gex gtx lex ltx nex allx byx tox withx
+and. with.
+"#,
+ Mode::Auto,
+ &[
+ ScanToken::Token(Token::Punct(Punct::And)),
+ ScanToken::Token(Token::Punct(Punct::Or)),
+ ScanToken::Token(Token::Punct(Punct::Not)),
+ ScanToken::Token(Token::Punct(Punct::Eq)),
+ ScanToken::Token(Token::Punct(Punct::Ge)),
+ ScanToken::Token(Token::Punct(Punct::Gt)),
+ ScanToken::Token(Token::Punct(Punct::Le)),
+ ScanToken::Token(Token::Punct(Punct::Lt)),
+ ScanToken::Token(Token::Punct(Punct::Ne)),
+ ScanToken::Token(Token::Punct(Punct::All)),
+ ScanToken::Token(Token::Punct(Punct::By)),
+ ScanToken::Token(Token::Punct(Punct::To)),
+ ScanToken::Token(Token::Punct(Punct::With)),
+ ScanToken::Token(Token::Punct(Punct::And)),
+ ScanToken::Token(Token::Punct(Punct::Or)),
+ ScanToken::Token(Token::Punct(Punct::Not)),
+ ScanToken::Token(Token::Punct(Punct::Eq)),
+ ScanToken::Token(Token::Punct(Punct::Ge)),
+ ScanToken::Token(Token::Punct(Punct::Gt)),
+ ScanToken::Token(Token::Punct(Punct::Le)),
+ ScanToken::Token(Token::Punct(Punct::Lt)),
+ ScanToken::Token(Token::Punct(Punct::Ne)),
+ ScanToken::Token(Token::Punct(Punct::All)),
+ ScanToken::Token(Token::Punct(Punct::By)),
+ ScanToken::Token(Token::Punct(Punct::To)),
+ ScanToken::Token(Token::Punct(Punct::With)),
+ ScanToken::Token(Token::Id(Identifier::new("andx").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("orx").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("notx").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("eqx").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("gex").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("gtx").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("lex").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("ltx").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("nex").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("allx").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("byx").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("tox").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("withx").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("and.").unwrap())),
+ ScanToken::Token(Token::Punct(Punct::With)),
+ ScanToken::Token(Token::EndCommand),
+ ],
+ );
+}
+
+#[test]
+fn test_punctuation() {
+ check_scan(
+ r#"~ & | = >= > <= < ~= <> ( ) , - + * / [ ] **
+~&|=>=><=<~=<>(),-+*/[]**
+% : ; ? _ ` { } ~
+"#,
+ Mode::Auto,
+ &[
+ ScanToken::Token(Token::Punct(Punct::Not)),
+ ScanToken::Token(Token::Punct(Punct::And)),
+ ScanToken::Token(Token::Punct(Punct::Or)),
+ ScanToken::Token(Token::Punct(Punct::Equals)),
+ ScanToken::Token(Token::Punct(Punct::Ge)),
+ ScanToken::Token(Token::Punct(Punct::Gt)),
+ ScanToken::Token(Token::Punct(Punct::Le)),
+ ScanToken::Token(Token::Punct(Punct::Lt)),
+ ScanToken::Token(Token::Punct(Punct::Ne)),
+ ScanToken::Token(Token::Punct(Punct::Ne)),
+ ScanToken::Token(Token::Punct(Punct::LParen)),
+ ScanToken::Token(Token::Punct(Punct::RParen)),
+ ScanToken::Token(Token::Punct(Punct::Comma)),
+ ScanToken::Token(Token::Punct(Punct::Dash)),
+ ScanToken::Token(Token::Punct(Punct::Plus)),
+ ScanToken::Token(Token::Punct(Punct::Asterisk)),
+ ScanToken::Token(Token::Punct(Punct::Slash)),
+ ScanToken::Token(Token::Punct(Punct::LSquare)),
+ ScanToken::Token(Token::Punct(Punct::RSquare)),
+ ScanToken::Token(Token::Punct(Punct::Exp)),
+ ScanToken::Token(Token::Punct(Punct::Not)),
+ ScanToken::Token(Token::Punct(Punct::And)),
+ ScanToken::Token(Token::Punct(Punct::Or)),
+ ScanToken::Token(Token::Punct(Punct::Equals)),
+ ScanToken::Token(Token::Punct(Punct::Ge)),
+ ScanToken::Token(Token::Punct(Punct::Gt)),
+ ScanToken::Token(Token::Punct(Punct::Le)),
+ ScanToken::Token(Token::Punct(Punct::Lt)),
+ ScanToken::Token(Token::Punct(Punct::Ne)),
+ ScanToken::Token(Token::Punct(Punct::Ne)),
+ ScanToken::Token(Token::Punct(Punct::LParen)),
+ ScanToken::Token(Token::Punct(Punct::RParen)),
+ ScanToken::Token(Token::Punct(Punct::Comma)),
+ ScanToken::Token(Token::Punct(Punct::Dash)),
+ ScanToken::Token(Token::Punct(Punct::Plus)),
+ ScanToken::Token(Token::Punct(Punct::Asterisk)),
+ ScanToken::Token(Token::Punct(Punct::Slash)),
+ ScanToken::Token(Token::Punct(Punct::LSquare)),
+ ScanToken::Token(Token::Punct(Punct::RSquare)),
+ ScanToken::Token(Token::Punct(Punct::Exp)),
+ ScanToken::Token(Token::Punct(Punct::Percent)),
+ ScanToken::Token(Token::Punct(Punct::Colon)),
+ ScanToken::Token(Token::Punct(Punct::Semicolon)),
+ ScanToken::Token(Token::Punct(Punct::Question)),
+ ScanToken::Token(Token::Punct(Punct::Underscore)),
+ ScanToken::Token(Token::Punct(Punct::Backtick)),
+ ScanToken::Token(Token::Punct(Punct::LCurly)),
+ ScanToken::Token(Token::Punct(Punct::RCurly)),
+ ScanToken::Token(Token::Punct(Punct::Not)),
+ ],
+ );
+}
+
+#[test]
+fn test_positive_numbers() {
+ check_scan(
+ r#"0 1 01 001. 1.
+123. /* comment 1 */ /* comment 2 */
+.1 0.1 00.1 00.10
+5e1 6E-1 7e+1 6E+01 6e-03
+.3E1 .4e-1 .5E+1 .6e+01 .7E-03
+1.23e1 45.6E-1 78.9e+1 99.9E+01 11.2e-03
+. 1e e1 1e+ 1e-
+"#,
+ Mode::Auto,
+ &[
+ ScanToken::Token(Token::Number(0.0)),
+ ScanToken::Token(Token::Number(1.0)),
+ ScanToken::Token(Token::Number(1.0)),
+ ScanToken::Token(Token::Number(1.0)),
+ ScanToken::Token(Token::Number(1.0)),
+ ScanToken::Token(Token::EndCommand),
+ ScanToken::Token(Token::Number(123.0)),
+ ScanToken::Token(Token::EndCommand),
+ ScanToken::Token(Token::EndCommand),
+ ScanToken::Token(Token::Number(1.0)),
+ ScanToken::Token(Token::Number(0.1)),
+ ScanToken::Token(Token::Number(0.1)),
+ ScanToken::Token(Token::Number(0.1)),
+ ScanToken::Token(Token::Number(50.0)),
+ ScanToken::Token(Token::Number(0.6)),
+ ScanToken::Token(Token::Number(70.0)),
+ ScanToken::Token(Token::Number(60.0)),
+ ScanToken::Token(Token::Number(0.006)),
+ ScanToken::Token(Token::EndCommand),
+ ScanToken::Token(Token::Number(30.0)),
+ ScanToken::Token(Token::Number(0.04)),
+ ScanToken::Token(Token::Number(5.0)),
+ ScanToken::Token(Token::Number(6.0)),
+ ScanToken::Token(Token::Number(0.0007)),
+ ScanToken::Token(Token::Number(12.3)),
+ ScanToken::Token(Token::Number(4.56)),
+ ScanToken::Token(Token::Number(789.0)),
+ ScanToken::Token(Token::Number(999.0)),
+ ScanToken::Token(Token::Number(0.0112)),
+ ScanToken::Token(Token::EndCommand),
+ ScanToken::Error(ScanError::ExpectedExponent(String::from("1e"))),
+ ScanToken::Token(Token::Id(Identifier::new("e1").unwrap())),
+ ScanToken::Error(ScanError::ExpectedExponent(String::from("1e+"))),
+ ScanToken::Error(ScanError::ExpectedExponent(String::from("1e-"))),
+ ],
+ );
+}
+
+#[test]
+fn test_negative_numbers() {
+ check_scan(
+ r#" -0 -1 -01 -001. -1.
+ -123. /* comment 1 */ /* comment 2 */
+ -.1 -0.1 -00.1 -00.10
+ -5e1 -6E-1 -7e+1 -6E+01 -6e-03
+ -.3E1 -.4e-1 -.5E+1 -.6e+01 -.7E-03
+ -1.23e1 -45.6E-1 -78.9e+1 -99.9E+01 -11.2e-03
+ -/**/1
+ -. -1e -e1 -1e+ -1e- -1.
+"#,
+ Mode::Auto,
+ &[
+ ScanToken::Token(Token::Number(-0.0)),
+ ScanToken::Token(Token::Number(-1.0)),
+ ScanToken::Token(Token::Number(-1.0)),
+ ScanToken::Token(Token::Number(-1.0)),
+ ScanToken::Token(Token::Number(-1.0)),
+ ScanToken::Token(Token::EndCommand),
+ ScanToken::Token(Token::Number(-123.0)),
+ ScanToken::Token(Token::EndCommand),
+ ScanToken::Token(Token::Number(-0.1)),
+ ScanToken::Token(Token::Number(-0.1)),
+ ScanToken::Token(Token::Number(-0.1)),
+ ScanToken::Token(Token::Number(-0.1)),
+ ScanToken::Token(Token::Number(-50.0)),
+ ScanToken::Token(Token::Number(-0.6)),
+ ScanToken::Token(Token::Number(-70.0)),
+ ScanToken::Token(Token::Number(-60.0)),
+ ScanToken::Token(Token::Number(-0.006)),
+ ScanToken::Token(Token::Number(-3.0)),
+ ScanToken::Token(Token::Number(-0.04)),
+ ScanToken::Token(Token::Number(-5.0)),
+ ScanToken::Token(Token::Number(-6.0)),
+ ScanToken::Token(Token::Number(-0.0007)),
+ ScanToken::Token(Token::Number(-12.3)),
+ ScanToken::Token(Token::Number(-4.56)),
+ ScanToken::Token(Token::Number(-789.0)),
+ ScanToken::Token(Token::Number(-999.0)),
+ ScanToken::Token(Token::Number(-0.0112)),
+ ScanToken::Token(Token::Number(-1.0)),
+ ScanToken::Token(Token::Punct(Punct::Dash)),
+ ScanToken::Token(Token::Punct(Punct::Dot)),
+ ScanToken::Error(ScanError::ExpectedExponent(String::from("-1e"))),
+ ScanToken::Token(Token::Punct(Punct::Dash)),
+ ScanToken::Token(Token::Id(Identifier::new("e1").unwrap())),
+ ScanToken::Error(ScanError::ExpectedExponent(String::from("-1e+"))),
+ ScanToken::Error(ScanError::ExpectedExponent(String::from("-1e-"))),
+ ScanToken::Token(Token::Number(-1.0)),
+ ScanToken::Token(Token::EndCommand),
+ ],
+ );
+}
+
+#[test]
+fn test_strings() {
+ check_scan(
+ r#"'x' "y" 'abc'
+'Don''t' "Can't" 'Won''t'
+"""quoted""" '"quoted"'
+'' "" '''' """"
+'missing end quote
+"missing double quote
+'x' + "y"
++ 'z' +
+'a' /* abc */ + "b" /*
++ 'c' +/* */"d"/* */+'e'
+'foo'
++ /* special case: + in column 0 would ordinarily start a new command
+'bar'
+'foo'
+ +
+'bar'
+'foo'
++
+
+'bar'
+
++
+x"4142"+'5152'
+"4142"+
+x'5152'
+x"4142"
++u'304a'
+"�あいうえお"
+"abc"+U"FFFD"+u'3048'+"xyz"
+"#,
+ Mode::Auto,
+ &[
+ ScanToken::Token(Token::String(String::from("x"))),
+ ScanToken::Token(Token::String(String::from("y"))),
+ ScanToken::Token(Token::String(String::from("abc"))),
+ ScanToken::Token(Token::String(String::from("Don't"))),
+ ScanToken::Token(Token::String(String::from("Can't"))),
+ ScanToken::Token(Token::String(String::from("Won't"))),
+ ScanToken::Token(Token::String(String::from("\"quoted\""))),
+ ScanToken::Token(Token::String(String::from("\"quoted\""))),
+ ScanToken::Token(Token::String(String::from(""))),
+ ScanToken::Token(Token::String(String::from(""))),
+ ScanToken::Token(Token::String(String::from("'"))),
+ ScanToken::Token(Token::String(String::from("\""))),
+ ScanToken::Error(ScanError::ExpectedQuote),
+ ScanToken::Error(ScanError::ExpectedQuote),
+ ScanToken::Token(Token::String(String::from("xyzabcde"))),
+ ScanToken::Token(Token::String(String::from("foobar"))),
+ ScanToken::Token(Token::String(String::from("foobar"))),
+ ScanToken::Token(Token::String(String::from("foo"))),
+ ScanToken::Token(Token::Punct(Punct::Plus)),
+ ScanToken::Token(Token::EndCommand),
+ ScanToken::Token(Token::String(String::from("bar"))),
+ ScanToken::Token(Token::EndCommand),
+ ScanToken::Token(Token::Punct(Punct::Plus)),
+ ScanToken::Token(Token::String(String::from("AB5152"))),
+ ScanToken::Token(Token::String(String::from("4142QR"))),
+ ScanToken::Token(Token::String(String::from("ABお"))),
+ ScanToken::Token(Token::String(String::from("�あいうえお"))),
+ ScanToken::Token(Token::String(String::from("abc�えxyz"))),
+ ScanToken::Token(Token::End),
+ ],
+ );
+}
+
+#[test]
+fn test_shbang() {
+ check_scan(
+ r#"#! /usr/bin/pspp
+#! /usr/bin/pspp
+"#,
+ Mode::Auto,
+ &[
+ ScanToken::Token(Token::Id(Identifier::new("#").unwrap())),
+ ScanToken::Token(Token::Punct(Punct::Bang)),
+ ScanToken::Token(Token::Punct(Punct::Slash)),
+ ScanToken::Token(Token::Id(Identifier::new("usr").unwrap())),
+ ScanToken::Token(Token::Punct(Punct::Slash)),
+ ScanToken::Token(Token::Id(Identifier::new("bin").unwrap())),
+ ScanToken::Token(Token::Punct(Punct::Slash)),
+ ScanToken::Token(Token::Id(Identifier::new("pspp").unwrap())),
+ ],
+ );
+}
+
+#[test]
+fn test_comments() {
+ check_scan(
+ r#"* Comment commands "don't
+have to contain valid tokens.
+
+** Check ambiguity with ** token.
+****************.
+
+comment keyword works too.
+COMM also.
+com is ambiguous with COMPUTE.
+
+ * Comment need not start at left margin.
+
+* Comment ends with blank line
+
+next command.
+
+"#,
+ Mode::Auto,
+ &[
+ ScanToken::Token(Token::EndCommand),
+ ScanToken::Token(Token::EndCommand),
+ ScanToken::Token(Token::EndCommand),
+ ScanToken::Token(Token::EndCommand),
+ ScanToken::Token(Token::EndCommand),
+ ScanToken::Token(Token::EndCommand),
+ ScanToken::Token(Token::EndCommand),
+ ScanToken::Token(Token::Id(Identifier::new("com").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("is").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("ambiguous").unwrap())),
+ ScanToken::Token(Token::Punct(Punct::With)),
+ ScanToken::Token(Token::Id(Identifier::new("COMPUTE").unwrap())),
+ ScanToken::Token(Token::EndCommand),
+ ScanToken::Token(Token::EndCommand),
+ ScanToken::Token(Token::EndCommand),
+ ScanToken::Token(Token::EndCommand),
+ ScanToken::Token(Token::EndCommand),
+ ScanToken::Token(Token::Id(Identifier::new("next").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("command").unwrap())),
+ ScanToken::Token(Token::EndCommand),
+ ScanToken::Token(Token::EndCommand),
+ ],
+ );
+}
+
+#[test]
+fn test_document() {
+ check_scan(
+ r#"DOCUMENT one line.
+DOC more
+ than
+ one
+ line.
+docu
+first.paragraph
+isn't parsed as tokens
+
+second paragraph.
+"#,
+ Mode::Auto,
+ &[
+ ScanToken::Token(Token::Id(Identifier::new("DOCUMENT").unwrap())),
+ ScanToken::Token(Token::String(String::from("DOCUMENT one line."))),
+ ScanToken::Token(Token::EndCommand),
+ ScanToken::Token(Token::EndCommand),
+ ScanToken::Token(Token::Id(Identifier::new("DOCUMENT").unwrap())),
+ ScanToken::Token(Token::String(String::from("DOC more"))),
+ ScanToken::Token(Token::String(String::from(" than"))),
+ ScanToken::Token(Token::String(String::from(" one"))),
+ ScanToken::Token(Token::String(String::from(" line."))),
+ ScanToken::Token(Token::EndCommand),
+ ScanToken::Token(Token::EndCommand),
+ ScanToken::Token(Token::Id(Identifier::new("DOCUMENT").unwrap())),
+ ScanToken::Token(Token::String(String::from("docu"))),
+ ScanToken::Token(Token::String(String::from("first.paragraph"))),
+ ScanToken::Token(Token::String(String::from("isn't parsed as tokens"))),
+ ScanToken::Token(Token::String(String::from(""))),
+ ScanToken::Token(Token::String(String::from("second paragraph."))),
+ ScanToken::Token(Token::EndCommand),
+ ScanToken::Token(Token::EndCommand),
+ ],
+ );
+}
+
+#[test]
+fn test_file_label() {
+ check_scan(
+ r#"FIL label isn't quoted.
+FILE
+ lab 'is quoted'.
+FILE /*
+/**/ lab not quoted here either
+
+"#,
+ Mode::Auto,
+ &[
+ ScanToken::Token(Token::Id(Identifier::new("FIL").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("label").unwrap())),
+ ScanToken::Token(Token::String(String::from("isn't quoted"))),
+ ScanToken::Token(Token::EndCommand),
+ ScanToken::Token(Token::Id(Identifier::new("FILE").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("lab").unwrap())),
+ ScanToken::Token(Token::String(String::from("is quoted"))),
+ ScanToken::Token(Token::EndCommand),
+ ScanToken::Token(Token::Id(Identifier::new("FILE").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("lab").unwrap())),
+ ScanToken::Token(Token::String(String::from("not quoted here either"))),
+ ScanToken::Token(Token::EndCommand),
+ ],
+ );
+}
+
+#[test]
+fn test_begin_data() {
+ check_scan(
+ r#"begin data.
+123
+xxx
+end data.
+
+BEG /**/ DAT /*
+5 6 7 /* x
+
+end data
+end data
+.
+"#,
+ Mode::Auto,
+ &[
+ ScanToken::Token(Token::Id(Identifier::new("begin").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("data").unwrap())),
+ ScanToken::Token(Token::EndCommand),
+ ScanToken::Token(Token::String(String::from("123"))),
+ ScanToken::Token(Token::String(String::from("xxx"))),
+ ScanToken::Token(Token::Id(Identifier::new("end").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("data").unwrap())),
+ ScanToken::Token(Token::EndCommand),
+ ScanToken::Token(Token::EndCommand),
+ ScanToken::Token(Token::Id(Identifier::new("BEG").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("DAT").unwrap())),
+ ScanToken::Token(Token::String(String::from("5 6 7 /* x"))),
+ ScanToken::Token(Token::String(String::from(""))),
+ ScanToken::Token(Token::String(String::from("end data"))),
+ ScanToken::Token(Token::Id(Identifier::new("end").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("data").unwrap())),
+ ScanToken::Token(Token::EndCommand),
+ ],
+ );
+}
+
+#[test]
+fn test_do_repeat() {
+ check_scan(
+ r#"do repeat x=a b c
+ y=d e f.
+ do repeat a=1 thru 5.
+another command.
+second command
++ third command.
+end /* x */ /* y */ repeat print.
+end
+ repeat.
+"#,
+ Mode::Auto,
+ &[
+ ScanToken::Token(Token::Id(Identifier::new("do").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("repeat").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("x").unwrap())),
+ ScanToken::Token(Token::Punct(Punct::Equals)),
+ ScanToken::Token(Token::Id(Identifier::new("a").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("b").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("c").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("y").unwrap())),
+ ScanToken::Token(Token::Punct(Punct::Equals)),
+ ScanToken::Token(Token::Id(Identifier::new("d").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("e").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("f").unwrap())),
+ ScanToken::Token(Token::EndCommand),
+ ScanToken::Token(Token::String(String::from(" do repeat a=1 thru 5."))),
+ ScanToken::Token(Token::String(String::from("another command."))),
+ ScanToken::Token(Token::String(String::from("second command"))),
+ ScanToken::Token(Token::String(String::from("+ third command."))),
+ ScanToken::Token(Token::String(String::from(
+ "end /* x */ /* y */ repeat print.",
+ ))),
+ ScanToken::Token(Token::Id(Identifier::new("end").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("repeat").unwrap())),
+ ScanToken::Token(Token::EndCommand),
+ ],
+ );
+}
+
+#[test]
+fn test_do_repeat_batch() {
+ check_scan(
+ r#"do repeat x=a b c
+ y=d e f
+do repeat a=1 thru 5
+another command
+second command
++ third command
+end /* x */ /* y */ repeat print
+end
+ repeat
+do
+ repeat #a=1
+
+ inner command
+end repeat
+"#,
+ Mode::Batch,
+ &[
+ ScanToken::Token(Token::Id(Identifier::new("do").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("repeat").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("x").unwrap())),
+ ScanToken::Token(Token::Punct(Punct::Equals)),
+ ScanToken::Token(Token::Id(Identifier::new("a").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("b").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("c").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("y").unwrap())),
+ ScanToken::Token(Token::Punct(Punct::Equals)),
+ ScanToken::Token(Token::Id(Identifier::new("d").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("e").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("f").unwrap())),
+ ScanToken::Token(Token::EndCommand),
+ ScanToken::Token(Token::String(String::from("do repeat a=1 thru 5"))),
+ ScanToken::Token(Token::String(String::from("another command"))),
+ ScanToken::Token(Token::String(String::from("second command"))),
+ ScanToken::Token(Token::String(String::from("+ third command"))),
+ ScanToken::Token(Token::String(String::from(
+ "end /* x */ /* y */ repeat print",
+ ))),
+ ScanToken::Token(Token::Id(Identifier::new("end").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("repeat").unwrap())),
+ ScanToken::Token(Token::EndCommand),
+ ScanToken::Token(Token::Id(Identifier::new("do").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("repeat").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("#a").unwrap())),
+ ScanToken::Token(Token::Punct(Punct::Equals)),
+ ScanToken::Token(Token::Number(1.0)),
+ ScanToken::Token(Token::EndCommand),
+ ScanToken::Token(Token::String(String::from(" inner command"))),
+ ScanToken::Token(Token::Id(Identifier::new("end").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("repeat").unwrap())),
+ ],
+ );
+}
+
+#[test]
+fn test_batch_mode() {
+ check_scan(
+ r#"first command
+ another line of first command
++ second command
+third command
+
+fourth command.
+ fifth command.
+"#,
+ Mode::Batch,
+ &[
+ ScanToken::Token(Token::Id(Identifier::new("first").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("command").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("another").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("line").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("of").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("first").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("command").unwrap())),
+ ScanToken::Token(Token::EndCommand),
+ ScanToken::Token(Token::Id(Identifier::new("second").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("command").unwrap())),
+ ScanToken::Token(Token::EndCommand),
+ ScanToken::Token(Token::Id(Identifier::new("third").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("command").unwrap())),
+ ScanToken::Token(Token::EndCommand),
+ ScanToken::Token(Token::Id(Identifier::new("fourth").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("command").unwrap())),
+ ScanToken::Token(Token::EndCommand),
+ ScanToken::Token(Token::Id(Identifier::new("fifth").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("command").unwrap())),
+ ScanToken::Token(Token::EndCommand),
+ ],
+ );
+}
+
+mod define {
+ use crate::{identifier::Identifier, lex::{
+ scan::ScanToken,
+ segment::Mode,
+ token::{Punct, Token},
+ }};
+
+ use super::check_scan;
+
+ #[test]
+ fn test_simple() {
+ check_scan(
+ r#"define !macro1()
+var1 var2 var3
+!enddefine.
+"#,
+ Mode::Auto,
+ &[
+ ScanToken::Token(Token::Id(Identifier::new("define").unwrap())),
+ ScanToken::Token(Token::String(String::from("!macro1"))),
+ ScanToken::Token(Token::Punct(Punct::LParen)),
+ ScanToken::Token(Token::Punct(Punct::RParen)),
+ ScanToken::Token(Token::String(String::from("var1 var2 var3"))),
+ ScanToken::Token(Token::Id(Identifier::new("!enddefine").unwrap())),
+ ScanToken::Token(Token::EndCommand),
+ ],
+ );
+ }
+
+ #[test]
+ fn test_no_newline_after_parentheses() {
+ check_scan(
+ r#"define !macro1() var1 var2 var3
+!enddefine.
+"#,
+ Mode::Auto,
+ &[
+ ScanToken::Token(Token::Id(Identifier::new("define").unwrap())),
+ ScanToken::Token(Token::String(String::from("!macro1"))),
+ ScanToken::Token(Token::Punct(Punct::LParen)),
+ ScanToken::Token(Token::Punct(Punct::RParen)),
+ ScanToken::Token(Token::String(String::from(" var1 var2 var3"))),
+ ScanToken::Token(Token::Id(Identifier::new("!enddefine").unwrap())),
+ ScanToken::Token(Token::EndCommand),
+ ],
+ );
+ }
+
+ #[test]
+ fn test_no_newline_before_enddefine() {
+ check_scan(
+ r#"define !macro1()
+var1 var2 var3!enddefine.
+"#,
+ Mode::Auto,
+ &[
+ ScanToken::Token(Token::Id(Identifier::new("define").unwrap())),
+ ScanToken::Token(Token::String(String::from("!macro1"))),
+ ScanToken::Token(Token::Punct(Punct::LParen)),
+ ScanToken::Token(Token::Punct(Punct::RParen)),
+ ScanToken::Token(Token::String(String::from("var1 var2 var3"))),
+ ScanToken::Token(Token::Id(Identifier::new("!enddefine").unwrap())),
+ ScanToken::Token(Token::EndCommand),
+ ],
+ );
+ }
+
+ #[test]
+ fn test_all_on_one_line() {
+ check_scan(
+ r#"define !macro1()var1 var2 var3!enddefine.
+"#,
+ Mode::Auto,
+ &[
+ ScanToken::Token(Token::Id(Identifier::new("define").unwrap())),
+ ScanToken::Token(Token::String(String::from("!macro1"))),
+ ScanToken::Token(Token::Punct(Punct::LParen)),
+ ScanToken::Token(Token::Punct(Punct::RParen)),
+ ScanToken::Token(Token::String(String::from("var1 var2 var3"))),
+ ScanToken::Token(Token::Id(Identifier::new("!enddefine").unwrap())),
+ ScanToken::Token(Token::EndCommand),
+ ],
+ );
+ }
+
+ #[test]
+ fn test_empty() {
+ check_scan(
+ r#"define !macro1()
+!enddefine.
+"#,
+ Mode::Auto,
+ &[
+ ScanToken::Token(Token::Id(Identifier::new("define").unwrap())),
+ ScanToken::Token(Token::String(String::from("!macro1"))),
+ ScanToken::Token(Token::Punct(Punct::LParen)),
+ ScanToken::Token(Token::Punct(Punct::RParen)),
+ ScanToken::Token(Token::Id(Identifier::new("!enddefine").unwrap())),
+ ScanToken::Token(Token::EndCommand),
+ ],
+ );
+ }
+
+ #[test]
+ fn test_blank_lines() {
+ check_scan(
+ r#"define !macro1()
+
+
+!enddefine.
+"#,
+ Mode::Auto,
+ &[
+ ScanToken::Token(Token::Id(Identifier::new("define").unwrap())),
+ ScanToken::Token(Token::String(String::from("!macro1"))),
+ ScanToken::Token(Token::Punct(Punct::LParen)),
+ ScanToken::Token(Token::Punct(Punct::RParen)),
+ ScanToken::Token(Token::String(String::from(""))),
+ ScanToken::Token(Token::String(String::from(""))),
+ ScanToken::Token(Token::Id(Identifier::new("!enddefine").unwrap())),
+ ScanToken::Token(Token::EndCommand),
+ ],
+ );
+ }
+
+ #[test]
+ fn test_arguments() {
+ check_scan(
+ r#"define !macro1(a(), b(), c())
+!enddefine.
+"#,
+ Mode::Auto,
+ &[
+ ScanToken::Token(Token::Id(Identifier::new("define").unwrap())),
+ ScanToken::Token(Token::String(String::from("!macro1"))),
+ ScanToken::Token(Token::Punct(Punct::LParen)),
+ ScanToken::Token(Token::Id(Identifier::new("a").unwrap())),
+ ScanToken::Token(Token::Punct(Punct::LParen)),
+ ScanToken::Token(Token::Punct(Punct::RParen)),
+ ScanToken::Token(Token::Punct(Punct::Comma)),
+ ScanToken::Token(Token::Id(Identifier::new("b").unwrap())),
+ ScanToken::Token(Token::Punct(Punct::LParen)),
+ ScanToken::Token(Token::Punct(Punct::RParen)),
+ ScanToken::Token(Token::Punct(Punct::Comma)),
+ ScanToken::Token(Token::Id(Identifier::new("c").unwrap())),
+ ScanToken::Token(Token::Punct(Punct::LParen)),
+ ScanToken::Token(Token::Punct(Punct::RParen)),
+ ScanToken::Token(Token::Punct(Punct::RParen)),
+ ScanToken::Token(Token::Id(Identifier::new("!enddefine").unwrap())),
+ ScanToken::Token(Token::EndCommand),
+ ],
+ );
+ }
+
+ #[test]
+ fn test_multiline_arguments() {
+ check_scan(
+ r#"define !macro1(
+ a(), b(
+ ),
+ c()
+)
+!enddefine.
+"#,
+ Mode::Auto,
+ &[
+ ScanToken::Token(Token::Id(Identifier::new("define").unwrap())),
+ ScanToken::Token(Token::String(String::from("!macro1"))),
+ ScanToken::Token(Token::Punct(Punct::LParen)),
+ ScanToken::Token(Token::Id(Identifier::new("a").unwrap())),
+ ScanToken::Token(Token::Punct(Punct::LParen)),
+ ScanToken::Token(Token::Punct(Punct::RParen)),
+ ScanToken::Token(Token::Punct(Punct::Comma)),
+ ScanToken::Token(Token::Id(Identifier::new("b").unwrap())),
+ ScanToken::Token(Token::Punct(Punct::LParen)),
+ ScanToken::Token(Token::Punct(Punct::RParen)),
+ ScanToken::Token(Token::Punct(Punct::Comma)),
+ ScanToken::Token(Token::Id(Identifier::new("c").unwrap())),
+ ScanToken::Token(Token::Punct(Punct::LParen)),
+ ScanToken::Token(Token::Punct(Punct::RParen)),
+ ScanToken::Token(Token::Punct(Punct::RParen)),
+ ScanToken::Token(Token::Id(Identifier::new("!enddefine").unwrap())),
+ ScanToken::Token(Token::EndCommand),
+ ],
+ );
+ }
+
+ #[test]
+ fn test_arguments_start_on_second_line() {
+ check_scan(
+ r#"define !macro1
+(x,y,z
+)
+content 1
+content 2
+!enddefine.
+"#,
+ Mode::Auto,
+ &[
+ ScanToken::Token(Token::Id(Identifier::new("define").unwrap())),
+ ScanToken::Token(Token::String(String::from("!macro1"))),
+ ScanToken::Token(Token::Punct(Punct::LParen)),
+ ScanToken::Token(Token::Id(Identifier::new("x").unwrap())),
+ ScanToken::Token(Token::Punct(Punct::Comma)),
+ ScanToken::Token(Token::Id(Identifier::new("y").unwrap())),
+ ScanToken::Token(Token::Punct(Punct::Comma)),
+ ScanToken::Token(Token::Id(Identifier::new("z").unwrap())),
+ ScanToken::Token(Token::Punct(Punct::RParen)),
+ ScanToken::Token(Token::String(String::from("content 1"))),
+ ScanToken::Token(Token::String(String::from("content 2"))),
+ ScanToken::Token(Token::Id(Identifier::new("!enddefine").unwrap())),
+ ScanToken::Token(Token::EndCommand),
+ ],
+ );
+ }
+
+ #[test]
+ fn test_early_end_of_command_1() {
+ check_scan(
+ r#"define !macro1.
+data list /x 1.
+"#,
+ Mode::Auto,
+ &[
+ ScanToken::Token(Token::Id(Identifier::new("define").unwrap())),
+ ScanToken::Token(Token::String(String::from("!macro1"))),
+ ScanToken::Token(Token::EndCommand),
+ ScanToken::Token(Token::Id(Identifier::new("data").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("list").unwrap())),
+ ScanToken::Token(Token::Punct(Punct::Slash)),
+ ScanToken::Token(Token::Id(Identifier::new("x").unwrap())),
+ ScanToken::Token(Token::Number(1.0)),
+ ScanToken::Token(Token::EndCommand),
+ ],
+ );
+ }
+
+ #[test]
+ fn test_early_end_of_command_2() {
+ check_scan(
+ r#"define !macro1
+x.
+data list /x 1.
+"#,
+ Mode::Auto,
+ &[
+ ScanToken::Token(Token::Id(Identifier::new("define").unwrap())),
+ ScanToken::Token(Token::String(String::from("!macro1"))),
+ ScanToken::Token(Token::Id(Identifier::new("x").unwrap())),
+ ScanToken::Token(Token::EndCommand),
+ ScanToken::Token(Token::Id(Identifier::new("data").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("list").unwrap())),
+ ScanToken::Token(Token::Punct(Punct::Slash)),
+ ScanToken::Token(Token::Id(Identifier::new("x").unwrap())),
+ ScanToken::Token(Token::Number(1.0)),
+ ScanToken::Token(Token::EndCommand),
+ ],
+ );
+ }
+
+ #[test]
+ fn test_early_end_of_command_3() {
+ check_scan(
+ r#"define !macro1(.
+x.
+data list /x 1.
+"#,
+ Mode::Auto,
+ &[
+ ScanToken::Token(Token::Id(Identifier::new("define").unwrap())),
+ ScanToken::Token(Token::String(String::from("!macro1"))),
+ ScanToken::Token(Token::Punct(Punct::LParen)),
+ ScanToken::Token(Token::EndCommand),
+ ScanToken::Token(Token::Id(Identifier::new("x").unwrap())),
+ ScanToken::Token(Token::EndCommand),
+ ScanToken::Token(Token::Id(Identifier::new("data").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("list").unwrap())),
+ ScanToken::Token(Token::Punct(Punct::Slash)),
+ ScanToken::Token(Token::Id(Identifier::new("x").unwrap())),
+ ScanToken::Token(Token::Number(1.0)),
+ ScanToken::Token(Token::EndCommand),
+ ],
+ );
+ }
+
+ #[test]
+ fn test_early_end_of_command_4() {
+ // Notice the command terminator at the end of the DEFINE command,
+ // which should not be there and ends it early.
+ check_scan(
+ r#"define !macro1.
+data list /x 1.
+"#,
+ Mode::Auto,
+ &[
+ ScanToken::Token(Token::Id(Identifier::new("define").unwrap())),
+ ScanToken::Token(Token::String(String::from("!macro1"))),
+ ScanToken::Token(Token::EndCommand),
+ ScanToken::Token(Token::Id(Identifier::new("data").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("list").unwrap())),
+ ScanToken::Token(Token::Punct(Punct::Slash)),
+ ScanToken::Token(Token::Id(Identifier::new("x").unwrap())),
+ ScanToken::Token(Token::Number(1.0)),
+ ScanToken::Token(Token::EndCommand),
+ ],
+ );
+ }
+
+ #[test]
+ fn test_missing_enddefine() {
+ check_scan(
+ r#"define !macro1()
+content line 1
+content line 2
+"#,
+ Mode::Auto,
+ &[
+ ScanToken::Token(Token::Id(Identifier::new("define").unwrap())),
+ ScanToken::Token(Token::String(String::from("!macro1"))),
+ ScanToken::Token(Token::Punct(Punct::LParen)),
+ ScanToken::Token(Token::Punct(Punct::RParen)),
+ ScanToken::Token(Token::String(String::from("content line 1"))),
+ ScanToken::Token(Token::String(String::from("content line 2"))),
+ ScanToken::Token(Token::End),
+ ],
+ );
+ }
+}
--- /dev/null
+//! Syntax segmentation.
+//!
+//! PSPP divides traditional "lexical analysis" or "tokenization" into two
+//! phases: a lower-level phase called "segmentation" and a higher-level phase
+//! called "scanning". This module implements the segmentation phase.
+//! [`super::scan`] contains declarations for the scanning phase.
+//!
+//! Segmentation accepts a stream of UTF-8 bytes as input. It outputs a label
+//! (a segment type) for each byte or contiguous sequence of bytes in the input.
+//! It also, in a few corner cases, outputs zero-width segments that label the
+//! boundary between a pair of bytes in the input.
+//!
+//! Some segment types correspond directly to tokens; for example, an
+//! "identifier" segment ([`Segment::Identifier`]) becomes an identifier
+//! token later in lexical analysis.  Other segments contribute to tokens but
+//! do not correspond directly; for example, multiple quoted string segments
+//! ([`Segment::QuotedString`]) separated by spaces ([`Segment::Spaces`]) and
+//! "+" punctuators ([`Segment::Punct`]) may combine into one string token.
+//! Still other segments are ignored (e.g. [`Segment::Spaces`]) or trigger
+//! special behavior such as error messages (e.g. [`Segment::ExpectedQuote`]).
+
+use crate::{
+ identifier::{id_match, id_match_n, IdentifierChar},
+ prompt::PromptStyle,
+};
+use bitflags::bitflags;
+
+use super::command_name::{command_match, COMMAND_NAMES};
+
+/// Segmentation mode.
+///
+/// PSPP syntax is written in one of two modes which are broadly defined as
+/// follows:
+///
+/// - In interactive mode, commands end with a period at the end of the line
+/// or with a blank line.
+///
+/// - In batch mode, the second and subsequent lines of a command are indented
+/// from the left margin.
+///
+/// The segmenter can also try to automatically detect the mode in use, using a
+/// heuristic that is usually correct.
+#[derive(Copy, Clone, Debug, PartialEq, Eq, Default)]
+pub enum Mode {
+ /// Try to interpret input correctly regardless of whether it is written
+ /// for interactive or batch mode.
+ #[default]
+ Auto,
+
+ /// Interactive syntax mode.
+ Interactive,
+
+ /// Batch syntax mode.
+ Batch,
+}
+
+/// The type of a segment.
+#[derive(Copy, Clone, Debug, PartialEq, Eq)]
+pub enum Segment {
+ Number,
+ QuotedString,
+ HexString,
+ UnicodeString,
+ UnquotedString,
+ Identifier,
+ Punct,
+ Shbang,
+ Spaces,
+ Comment,
+ Newline,
+ CommentCommand,
+ DoRepeatCommand,
+ DoRepeatOverflow,
+ InlineData,
+ MacroName,
+ MacroBody,
+ StartDocument,
+ Document,
+ StartCommand,
+ SeparateCommands,
+ EndCommand,
+ End,
+ ExpectedQuote,
+ ExpectedExponent,
+ UnexpectedChar,
+}
+
+bitflags! {
+ #[derive(Copy, Clone, Debug)]
+ pub struct Substate: u8 {
+ const START_OF_LINE = 1;
+ const START_OF_COMMAND = 2;
+ }
+}
+
+#[derive(Copy, Clone)]
+pub struct Segmenter {
+ state: (State, Substate),
+ nest: u8,
+ mode: Mode,
+}
+
+#[derive(Copy, Clone, Debug)]
+pub struct Incomplete;
+
+impl Segmenter {
+ /// Returns a segmenter with the given syntax `mode`.
+ ///
+ /// If `is_snippet` is false, then the segmenter will parse as if it's being
+ /// given a whole file. This means, for example, that it will interpret `-`
+ /// or `+` at the beginning of the syntax as a separator between commands
+ /// (since `-` or `+` at the beginning of a line has this meaning).
+ ///
+ /// If `is_snippet` is true, then the segmenter will parse as if it's being
+ /// given an isolated piece of syntax. This means that, for example, that
+ /// it will interpret `-` or `+` at the beginning of the syntax as an
+ /// operator token or (if followed by a digit) as part of a number.
+ pub fn new(mode: Mode, is_snippet: bool) -> Self {
+ Self {
+ state: if is_snippet {
+ (State::General, Substate::empty())
+ } else {
+ (State::Shbang, Substate::empty())
+ },
+ mode,
+ nest: 0,
+ }
+ }
+
+ pub fn mode(&self) -> Mode {
+ self.mode
+ }
+
+ fn start_of_line(&self) -> bool {
+ self.state.1.contains(Substate::START_OF_LINE)
+ }
+
+ fn start_of_command(&self) -> bool {
+ self.state.1.contains(Substate::START_OF_COMMAND)
+ }
+
+ /// Returns the style of command prompt to display to an interactive user
+ /// for input in the current state.. The return value is most accurate in
+ /// mode `Mode::Interactive` and at the beginning of a line (that is, if
+ /// [`Segmenter::push`] consumed as much as possible of the input up to a
+ /// new-line).
+ pub fn prompt(&self) -> PromptStyle {
+ match self.state.0 {
+ State::Shbang => PromptStyle::First,
+ State::General => {
+ if self.start_of_command() {
+ PromptStyle::First
+ } else {
+ PromptStyle::Later
+ }
+ }
+ State::Comment1 | State::Comment2 => PromptStyle::Comment,
+ State::Document1 | State::Document2 => PromptStyle::Document,
+ State::Document3 => PromptStyle::First,
+ State::FileLabel1 => PromptStyle::Later,
+ State::FileLabel2 | State::FileLabel3 => PromptStyle::First,
+ State::DoRepeat1 | State::DoRepeat2 => {
+ if self.start_of_command() {
+ PromptStyle::First
+ } else {
+ PromptStyle::Later
+ }
+ }
+ State::DoRepeat3 => PromptStyle::DoRepeat,
+ State::DoRepeat4 => PromptStyle::DoRepeat,
+ State::Define1 | State::Define2 | State::Define3 => {
+ if self.start_of_command() {
+ PromptStyle::First
+ } else {
+ PromptStyle::Later
+ }
+ }
+ State::Define4 | State::Define5 | State::Define6 => PromptStyle::Define,
+ State::BeginData1 => PromptStyle::First,
+ State::BeginData2 => PromptStyle::Later,
+ State::BeginData3 | State::BeginData4 => PromptStyle::Data,
+ }
+ }
+
+ /// Attempts to label a prefix of the remaining input with a segment type.
+ /// The caller supplies a prefix of the remaining input as `input`. If
+ /// `eof` is true, then `input` is the entire (remainder) of the input; if
+ /// `eof` is false, then further input is potentially available.
+ ///
+ /// The input may contain '\n' or '\r\n' line ends in any combination.
+ ///
+ /// If successful, returns `Ok((n, type))`, where `n` is the number of bytes
+ /// in the segment at the beginning of `input` (a number in
+ /// `0..=input.len()`) and the type of that segment. The next call should
+ /// not include those bytes in `input`, because they have (figuratively)
+ /// been consumed by the segmenter.
+ ///
+ /// Segments can have zero length, including segment types `Segment::End`,
+ /// `Segment::SeparateCommands`, `Segment::StartDocument`, `Segment::InlineData`, and
+ /// `Segment::Spaces`.
+ ///
+ /// Failure occurs only if the segment type of the bytes in `input` cannot
+ /// yet be determined. In this case, this function returns `Err(Incomplete)`. If
+ /// more input is available, the caller should obtain some more, then call
+ /// again with a longer `input`. If this is not enough, the process might
+ /// need to repeat again and again. If input is exhausted, then the caller
+ /// may call again setting `eof` to true. This function will never return
+ /// `Err(Incomplete)` when `eof` is true.
+ ///
+ /// The caller must not, in a sequence of calls, supply contradictory input.
+ /// That is, bytes provided as part of `input` in one call, but not
+ /// consumed, must not be provided with *different* values on subsequent
+ /// calls. This is because the function must often make decisions based on
+ /// looking ahead beyond the bytes that it consumes.
+ fn push_rest<'a>(
+ &mut self,
+ input: &'a str,
+ eof: bool,
+ ) -> Result<(&'a str, Segment), Incomplete> {
+ if input.is_empty() {
+ if eof {
+ return Ok((input, Segment::End));
+ } else {
+ return Err(Incomplete);
+ };
+ }
+
+ match self.state.0 {
+ State::Shbang => return self.parse_shbang(input, eof),
+ State::General => {
+ if self.start_of_line() {
+ self.parse_start_of_line(input, eof)
+ } else {
+ self.parse_mid_line(input, eof)
+ }
+ }
+ State::Comment1 => self.parse_comment_1(input, eof),
+ State::Comment2 => self.parse_comment_2(input, eof),
+ State::Document1 => self.parse_document_1(input, eof),
+ State::Document2 => self.parse_document_2(input, eof),
+ State::Document3 => self.parse_document_3(input, eof),
+ State::FileLabel1 => self.parse_file_label_1(input, eof),
+ State::FileLabel2 => self.parse_file_label_2(input, eof),
+ State::FileLabel3 => self.parse_file_label_3(input, eof),
+ State::DoRepeat1 => self.parse_do_repeat_1(input, eof),
+ State::DoRepeat2 => self.parse_do_repeat_2(input, eof),
+ State::DoRepeat3 => self.parse_do_repeat_3(input, eof),
+ State::DoRepeat4 => self.parse_do_repeat_4(input),
+ State::Define1 => self.parse_define_1_2(input, eof),
+ State::Define2 => self.parse_define_1_2(input, eof),
+ State::Define3 => self.parse_define_3(input, eof),
+ State::Define4 => self.parse_define_4_5(input, eof),
+ State::Define5 => self.parse_define_4_5(input, eof),
+ State::Define6 => self.parse_define_6(input, eof),
+ State::BeginData1 => self.parse_begin_data_1(input, eof),
+ State::BeginData2 => self.parse_begin_data_2(input, eof),
+ State::BeginData3 => self.parse_begin_data_3(input, eof),
+ State::BeginData4 => self.parse_begin_data_4(input, eof),
+ }
+ }
+
+ pub fn push(&mut self, input: &str, eof: bool) -> Result<(usize, Segment), Incomplete> {
+ let (rest, seg_type) = self.push_rest(input, eof)?;
+ Ok((input.len() - rest.len(), seg_type))
+ }
+}
+
+#[derive(Copy, Clone, Debug, PartialEq, Eq)]
+enum State {
+ Shbang,
+ General,
+ Comment1,
+ Comment2,
+ Document1,
+ Document2,
+ Document3,
+ FileLabel1,
+ FileLabel2,
+ FileLabel3,
+ DoRepeat1,
+ DoRepeat2,
+ DoRepeat3,
+ DoRepeat4,
+ Define1,
+ Define2,
+ Define3,
+ Define4,
+ Define5,
+ Define6,
+ BeginData1,
+ BeginData2,
+ BeginData3,
+ BeginData4,
+}
+
+fn take(input: &str, eof: bool) -> Result<(Option<char>, &str), Incomplete> {
+ let mut iter = input.chars();
+ match iter.next() {
+ None if !eof => Err(Incomplete),
+ c => Ok((c, iter.as_str())),
+ }
+}
+
+fn skip_comment(mut input: &str, eof: bool) -> Result<&str, Incomplete> {
+ loop {
+ let (Some(c), rest) = take(input, eof)? else {
+ return Ok(input);
+ };
+ match c {
+ '\n' | '\r' if is_end_of_line(input, eof)? => return Ok(input),
+ '*' => {
+ if let (Some('/'), rest) = take(rest, eof)? {
+ return Ok(rest);
+ }
+ }
+ _ => (),
+ };
+ input = rest;
+ }
+}
+
+fn skip_matching<F>(f: F, input: &str, eof: bool) -> Result<&str, Incomplete>
+where
+ F: Fn(char) -> bool,
+{
+ let input = input.trim_start_matches(f);
+ if input.is_empty() && !eof {
+ Err(Incomplete)
+ } else {
+ Ok(input)
+ }
+}
+
+fn match_char<F>(f: F, input: &str, eof: bool) -> Result<Option<&str>, Incomplete>
+where
+ F: Fn(char) -> bool,
+{
+ if let (Some(c), rest) = take(input, eof)? {
+ if f(c) {
+ return Ok(Some(rest));
+ }
+ }
+ Ok(None)
+}
+
+fn skip_spaces(mut input: &str, eof: bool) -> Result<&str, Incomplete> {
+ loop {
+ let (Some(c), rest) = take(input, eof)? else {
+ return Ok(input);
+ };
+ match c {
+ '\r' | '\n' if is_end_of_line(input, eof)? => return Ok(input),
+ c if c.is_whitespace() => (),
+ _ => return Ok(input),
+ }
+ input = rest;
+ }
+}
+
+fn skip_digits(input: &str, eof: bool) -> Result<&str, Incomplete> {
+ skip_matching(|c| c.is_ascii_digit(), input, eof)
+}
+
+fn skip_spaces_and_comments(mut input: &str, eof: bool) -> Result<&str, Incomplete> {
+ loop {
+ let (Some(c), rest) = take(input, eof)? else {
+ return Ok(input);
+ };
+ match c {
+ '/' => {
+ let (c, rest2) = take(rest, eof)?;
+ match c {
+ Some('*') => input = skip_comment(rest2, eof)?,
+ Some(_) | None => return Ok(rest),
+ }
+ }
+ '\r' | '\n' if is_end_of_line(input, eof)? => return Ok(input),
+ c if c.is_whitespace() => input = rest,
+ _ => return Ok(input),
+ };
+ }
+}
+
+fn is_start_of_string(input: &str, eof: bool) -> Result<bool, Incomplete> {
+ let (Some(c), rest) = take(input, eof)? else {
+ return Ok(false);
+ };
+ match c {
+ 'x' | 'X' | 'u' | 'U' => {
+ let (c, _rest) = take(rest, eof)?;
+ Ok(c == Some('\'') || c == Some('"'))
+ }
+ '\'' | '"' => Ok(true),
+ '\n' | '\r' if is_end_of_line(input, eof)? => Ok(true),
+ _ => Ok(false),
+ }
+}
+
+fn is_end_of_line(input: &str, eof: bool) -> Result<bool, Incomplete> {
+ let (Some(c), rest) = take(input, eof)? else {
+ return Ok(true);
+ };
+ Ok(match c {
+ '\n' => true,
+ '\r' => take(rest, eof)?.0 == Some('\n'),
+ _ => false,
+ })
+}
+
+fn at_end_of_line(input: &str, eof: bool) -> Result<bool, Incomplete> {
+ is_end_of_line(skip_spaces_and_comments(input, eof)?, eof)
+}
+
+fn first(s: &str) -> char {
+ s.chars().next().unwrap()
+}
+fn get_command_name_candidates(target: &str) -> &[&'static str] {
+ if target.is_empty() {
+ return &[];
+ }
+ let target_first = first(target).to_ascii_uppercase();
+ let low = COMMAND_NAMES.partition_point(|s| first(s) < target_first);
+ let high = COMMAND_NAMES.partition_point(|s| first(s) <= target_first);
+ &COMMAND_NAMES[low..high]
+}
+
+fn detect_command_name(input: &str, eof: bool) -> Result<bool, Incomplete> {
+ let command_name = input
+ .split(|c: char| {
+ !((c.is_whitespace() && c != '\n') || (c.may_continue_id() && c != '.') || c == '-')
+ })
+ .next()
+ .unwrap();
+ if !eof && command_name.len() == input.len() {
+ return Err(Incomplete);
+ }
+ let command_name = command_name.trim_end_matches(|c: char| c.is_whitespace() || c == '.');
+ for command in get_command_name_candidates(command_name) {
+ if let Some(m) = command_match(command, command_name) {
+ if m.missing_words <= 0 {
+ return Ok(true);
+ }
+ }
+ }
+ Ok(false)
+}
+
+impl Segmenter {
+ fn parse_shbang<'a>(
+ &mut self,
+ input: &'a str,
+ eof: bool,
+ ) -> Result<(&'a str, Segment), Incomplete> {
+ if let (Some('#'), rest) = take(input, eof)? {
+ if let (Some('!'), rest) = take(rest, eof)? {
+ let rest = self.parse_full_line(rest, eof)?;
+ self.state = (State::General, Substate::START_OF_COMMAND);
+ return Ok((rest, Segment::Shbang));
+ }
+ }
+
+ self.state = (
+ State::General,
+ Substate::START_OF_COMMAND | Substate::START_OF_LINE,
+ );
+ self.push_rest(input, eof)
+ }
+ fn at_command_start(&self, input: &str, eof: bool) -> Result<bool, Incomplete> {
+ match self.mode {
+ Mode::Auto => detect_command_name(input, eof),
+ Mode::Interactive => Ok(false),
+ Mode::Batch => Ok(true),
+ }
+ }
+ fn parse_start_of_line<'a>(
+ &mut self,
+ input: &'a str,
+ eof: bool,
+ ) -> Result<(&'a str, Segment), Incomplete> {
+ debug_assert_eq!(self.state.0, State::General);
+ debug_assert!(self.start_of_line());
+ debug_assert!(!input.is_empty());
+
+ let (Some(c), rest) = take(input, eof).unwrap() else {
+ unreachable!()
+ };
+ match c {
+ '+' if is_start_of_string(skip_spaces_and_comments(rest, eof)?, eof)? => {
+ // This `+` is punctuation that may separate pieces of a string.
+ self.state = (State::General, Substate::empty());
+ return Ok((rest, Segment::Punct));
+ }
+ '+' | '-' | '.' => {
+ self.state = (State::General, Substate::START_OF_COMMAND);
+ return Ok((rest, Segment::StartCommand));
+ }
+ _ if c.is_whitespace() => {
+ if at_end_of_line(input, eof)? {
+ self.state = (State::General, Substate::START_OF_COMMAND);
+ return Ok((input, Segment::SeparateCommands));
+ }
+ }
+ _ => {
+ if self.at_command_start(input, eof)?
+ && !self.state.1.contains(Substate::START_OF_COMMAND)
+ {
+ self.state = (State::General, Substate::START_OF_COMMAND);
+ return Ok((input, Segment::StartCommand));
+ }
+ }
+ }
+ self.state.1 = Substate::START_OF_COMMAND;
+ self.parse_mid_line(input, eof)
+ }
+ fn parse_mid_line<'a>(
+ &mut self,
+ input: &'a str,
+ eof: bool,
+ ) -> Result<(&'a str, Segment), Incomplete> {
+ debug_assert!(self.state.0 == State::General);
+ debug_assert!(!self.state.1.contains(Substate::START_OF_LINE));
+ let (Some(c), rest) = take(input, eof)? else {
+ unreachable!()
+ };
+ match c {
+ '\r' | '\n' if is_end_of_line(input, eof)? => {
+ self.state.1 |= Substate::START_OF_LINE;
+ Ok((
+ self.parse_newline(input, eof).unwrap().unwrap(),
+ Segment::Newline,
+ ))
+ }
+ '/' => {
+ if let (Some('*'), rest) = take(rest, eof)? {
+ let rest = skip_comment(rest, eof)?;
+ return Ok((rest, Segment::Comment));
+ } else {
+ self.state.1 = Substate::empty();
+ return Ok((rest, Segment::Punct));
+ }
+ }
+ '-' => {
+ let (c, rest2) = take(skip_spaces(rest, eof)?, eof)?;
+ match c {
+ Some(c) if c.is_ascii_digit() => {
+ return self.parse_number(rest, eof);
+ }
+ Some('.') => {
+ if let (Some(c), _rest) = take(rest2, eof)? {
+ if c.is_ascii_digit() {
+ return self.parse_number(rest, eof);
+ }
+ }
+ }
+ None | Some(_) => (),
+ }
+ self.state.1 = Substate::empty();
+ return Ok((rest, Segment::Punct));
+ }
+ '(' | ')' | '[' | ']' | '{' | '}' | ',' | '=' | ';' | ':' | '&' | '|' | '+' => {
+ self.state.1 = Substate::empty();
+ return Ok((rest, Segment::Punct));
+ }
+ '*' => {
+ if self.state.1.contains(Substate::START_OF_COMMAND) {
+ self.state = (State::Comment1, Substate::empty());
+ self.parse_comment_1(input, eof)
+ } else {
+ self.parse_digraph(&['*'], rest, eof)
+ }
+ }
+ '<' => self.parse_digraph(&['=', '>'], rest, eof),
+ '>' => self.parse_digraph(&['='], rest, eof),
+ '~' => self.parse_digraph(&['='], rest, eof),
+ '.' if at_end_of_line(rest, eof)? => {
+ self.state.1 = Substate::START_OF_COMMAND;
+ Ok((rest, Segment::EndCommand))
+ }
+ '.' => match take(rest, eof)? {
+ (Some(c), _) if c.is_ascii_digit() => self.parse_number(input, eof),
+ _ => Ok((rest, Segment::Punct)),
+ },
+ '0'..='9' => self.parse_number(input, eof),
+ 'u' | 'U' => self.maybe_parse_string(Segment::UnicodeString, (input, rest), eof),
+ 'x' | 'X' => self.maybe_parse_string(Segment::HexString, (input, rest), eof),
+ '\'' | '"' => self.parse_string(Segment::QuotedString, c, rest, eof),
+ '!' => {
+ let (c, rest2) = take(rest, eof)?;
+ match c {
+ Some('*') => Ok((rest2, Segment::Punct)),
+ Some(_) => self.parse_id(input, eof),
+ None => Ok((rest, Segment::Punct)),
+ }
+ }
+ c if c.is_whitespace() => Ok((skip_spaces(rest, eof)?, Segment::Spaces)),
+ c if c.may_start_id() => self.parse_id(input, eof),
+ '#'..='~' if c != '\\' && c != '^' => {
+ self.state.1 = Substate::empty();
+ Ok((rest, Segment::Punct))
+ }
+ _ => {
+ self.state.1 = Substate::empty();
+ Ok((rest, Segment::UnexpectedChar))
+ }
+ }
+ }
+ fn parse_string<'a>(
+ &mut self,
+ segment: Segment,
+ quote: char,
+ mut input: &'a str,
+ eof: bool,
+ ) -> Result<(&'a str, Segment), Incomplete> {
+ while let (Some(c), rest) = take(input, eof)? {
+ match c {
+ _ if c == quote => {
+ let (c, rest2) = take(rest, eof)?;
+ if c != Some(quote) {
+ self.state.1 = Substate::empty();
+ return Ok((rest, segment));
+ }
+ input = rest2;
+ }
+ '\r' | '\n' if is_end_of_line(input, eof)? => break,
+ _ => input = rest,
+ }
+ }
+ self.state.1 = Substate::empty();
+ Ok((input, Segment::ExpectedQuote))
+ }
+ fn maybe_parse_string<'a>(
+ &mut self,
+ segment: Segment,
+ input: (&'a str, &'a str),
+ eof: bool,
+ ) -> Result<(&'a str, Segment), Incomplete> {
+ match take(input.1, eof)? {
+ (Some(c), rest) if c == '\'' || c == '"' => self.parse_string(segment, c, rest, eof),
+ _ => self.parse_id(input.0, eof),
+ }
+ }
+ fn next_id_in_command<'a>(
+ &self,
+ mut input: &'a str,
+ eof: bool,
+ ) -> Result<(&'a str, &'a str), Incomplete> {
+ let mut sub = Segmenter::new(self.mode, true);
+ loop {
+ let (seg_len, seg_type) = sub.push(input, eof)?;
+ let (segment, rest) = input.split_at(seg_len);
+ match seg_type {
+ Segment::Shbang | Segment::Spaces | Segment::Comment | Segment::Newline => (),
+
+ Segment::Identifier => return Ok((segment, rest)),
+
+ Segment::Number
+ | Segment::QuotedString
+ | Segment::HexString
+ | Segment::UnicodeString
+ | Segment::UnquotedString
+ | Segment::Punct
+ | Segment::CommentCommand
+ | Segment::DoRepeatCommand
+ | Segment::DoRepeatOverflow
+ | Segment::InlineData
+ | Segment::MacroName
+ | Segment::MacroBody
+ | Segment::StartDocument
+ | Segment::Document
+ | Segment::StartCommand
+ | Segment::SeparateCommands
+ | Segment::EndCommand
+ | Segment::End
+ | Segment::ExpectedQuote
+ | Segment::ExpectedExponent
+ | Segment::UnexpectedChar => return Ok(("", rest)),
+ }
+ input = rest;
+ }
+ }
+ fn parse_id<'a>(
+ &mut self,
+ input: &'a str,
+ eof: bool,
+ ) -> Result<(&'a str, Segment), Incomplete> {
+ let (Some(_), mut end) = take(input, eof).unwrap() else {
+ unreachable!()
+ };
+ while let (Some(c), rest) = take(end, eof)? {
+ if !c.may_continue_id() {
+ break;
+ };
+ end = rest;
+ }
+ let identifier = &input[..input.len() - end.len()];
+ let identifier = match identifier.strip_suffix('.') {
+ Some(without_dot) if at_end_of_line(end, eof)? => without_dot,
+ _ => identifier,
+ };
+ let rest = &input[identifier.len()..];
+
+ if self.state.1.contains(Substate::START_OF_COMMAND) {
+ if id_match_n("COMMENT", identifier, 4) {
+ self.state = (State::Comment1, Substate::empty());
+ return self.parse_comment_1(input, eof);
+ } else if id_match("DOCUMENT", identifier) {
+ self.state = (State::Document1, Substate::empty());
+ return Ok((input, Segment::StartDocument));
+ } else if id_match_n("DEFINE", identifier, 6) {
+ self.state = (State::Define1, Substate::empty());
+ } else if id_match("FILE", identifier) {
+ if id_match("LABEL", self.next_id_in_command(rest, eof)?.0) {
+ self.state = (State::FileLabel1, Substate::empty());
+ return Ok((rest, Segment::Identifier));
+ }
+ } else if id_match("DO", identifier) {
+ if id_match("REPEAT", self.next_id_in_command(rest, eof)?.0) {
+ self.state = (State::DoRepeat1, Substate::empty());
+ return Ok((rest, Segment::Identifier));
+ }
+ } else if id_match("BEGIN", identifier) {
+ let (next_id, rest2) = self.next_id_in_command(rest, eof)?;
+ if id_match("DATA", next_id) {
+ let rest2 = skip_spaces_and_comments(rest2, eof)?;
+ let rest2 = if let Some(s) = rest2.strip_prefix('.') {
+ skip_spaces_and_comments(s, eof)?
+ } else {
+ rest2
+ };
+ if is_end_of_line(rest2, eof)? {
+ let s = &input[..input.len() - rest2.len()];
+ self.state = (
+ if s.contains('\n') {
+ State::BeginData1
+ } else {
+ State::BeginData2
+ },
+ Substate::empty(),
+ );
+ return Ok((rest, Segment::Identifier));
+ }
+ }
+ }
+ }
+
+ self.state.1 = Substate::empty();
+ Ok((
+ rest,
+ if identifier != "!" {
+ Segment::Identifier
+ } else {
+ Segment::Punct
+ },
+ ))
+ }
+ fn parse_digraph<'a>(
+ &mut self,
+ seconds: &[char],
+ input: &'a str,
+ eof: bool,
+ ) -> Result<(&'a str, Segment), Incomplete> {
+ let (c, rest) = take(input, eof)?;
+ self.state.1 = Substate::empty();
+ Ok((
+ match c {
+ Some(c) if seconds.contains(&c) => rest,
+ _ => input,
+ },
+ Segment::Punct,
+ ))
+ }
+ fn parse_number<'a>(
+ &mut self,
+ input: &'a str,
+ eof: bool,
+ ) -> Result<(&'a str, Segment), Incomplete> {
+ let mut input = skip_digits(input, eof)?;
+ if let Some(rest) = match_char(|c| c == '.', input, eof)? {
+ let rest2 = skip_digits(rest, eof)?;
+ if rest2.len() < rest.len() || !at_end_of_line(rest2, eof)? {
+ input = rest2;
+ }
+ };
+ if let Some(rest) = match_char(|c| c == 'e' || c == 'E', input, eof)? {
+ let rest = match_char(|c| c == '+' || c == '-', rest, eof)?.unwrap_or(rest);
+ let rest2 = skip_digits(rest, eof)?;
+ if rest2.len() == rest.len() {
+ self.state.1 = Substate::empty();
+ return Ok((rest, Segment::ExpectedExponent));
+ }
+ input = rest2;
+ }
+ self.state.1 = Substate::empty();
+ Ok((input, Segment::Number))
+ }
+ fn parse_comment_1<'a>(
+ &mut self,
+ mut input: &'a str,
+ eof: bool,
+ ) -> Result<(&'a str, Segment), Incomplete> {
+ enum CommentState<'a> {
+ Blank,
+ NotBlank,
+ Period(&'a str),
+ }
+ let mut state = CommentState::Blank;
+ loop {
+ let (Some(c), rest) = take(input, eof)? else {
+ // End of file.
+ self.state = (State::General, Substate::START_OF_COMMAND);
+ return Ok((input, Segment::SeparateCommands));
+ };
+ match c {
+ '.' => state = CommentState::Period(input),
+ '\n' | '\r' if is_end_of_line(input, eof)? => {
+ match state {
+ CommentState::Blank => {
+ // Blank line ends comment command.
+ self.state = (State::General, Substate::START_OF_COMMAND);
+ return Ok((input, Segment::SeparateCommands));
+ }
+ CommentState::Period(period) => {
+ // '.' at end of line ends comment command.
+ self.state = (State::General, Substate::empty());
+ return Ok((period, Segment::CommentCommand));
+ }
+ CommentState::NotBlank => {
+ // Comment continues onto next line.
+ self.state = (State::Comment2, Substate::empty());
+ return Ok((input, Segment::CommentCommand));
+ }
+ }
+ }
+ c if c.is_whitespace() => (),
+ _ => state = CommentState::NotBlank,
+ }
+ input = rest;
+ }
+ }
+ fn parse_comment_2<'a>(
+ &mut self,
+ input: &'a str,
+ eof: bool,
+ ) -> Result<(&'a str, Segment), Incomplete> {
+ let rest = self.parse_newline(input, eof)?.unwrap();
+
+ let new_command = match take(rest, eof)?.0 {
+ Some('+') | Some('-') | Some('.') => true,
+ Some(c) if !c.is_whitespace() => self.at_command_start(rest, eof)?,
+ None | Some(_) => false,
+ };
+ if new_command {
+ self.state = (
+ State::General,
+ Substate::START_OF_LINE | Substate::START_OF_COMMAND,
+ );
+ } else {
+ self.state = (State::Comment1, Substate::empty());
+ }
+ Ok((rest, Segment::Newline))
+ }
+ fn parse_document_1<'a>(
+ &mut self,
+ mut input: &'a str,
+ eof: bool,
+ ) -> Result<(&'a str, Segment), Incomplete> {
+ let mut end_cmd = false;
+ loop {
+ let (Some(c), rest) = take(input, eof)? else {
+ self.state = (State::Document3, Substate::empty());
+ return Ok((input, Segment::Document));
+ };
+ match c {
+ '.' => end_cmd = true,
+ '\n' | '\r' if is_end_of_line(input, eof)? => {
+ self.state.0 = if end_cmd {
+ State::Document3
+ } else {
+ State::Document2
+ };
+ return Ok((input, Segment::Document));
+ }
+ c if !c.is_whitespace() => end_cmd = false,
+ _ => (),
+ }
+ input = rest;
+ }
+ }
+ fn parse_document_2<'a>(
+ &mut self,
+ input: &'a str,
+ eof: bool,
+ ) -> Result<(&'a str, Segment), Incomplete> {
+ let rest = self.parse_newline(input, eof)?.unwrap();
+ self.state = (State::Document1, Substate::empty());
+ Ok((rest, Segment::Newline))
+ }
+ fn parse_document_3<'a>(
+ &mut self,
+ input: &'a str,
+ _eof: bool,
+ ) -> Result<(&'a str, Segment), Incomplete> {
+ self.state = (
+ State::General,
+ Substate::START_OF_COMMAND | Substate::START_OF_LINE,
+ );
+ Ok((input, Segment::EndCommand))
+ }
+ fn quoted_file_label(input: &str, eof: bool) -> Result<bool, Incomplete> {
+ let input = skip_spaces_and_comments(input, eof)?;
+ match take(input, eof)?.0 {
+ Some('\'') | Some('"') | Some('\n') => Ok(true),
+ _ => Ok(false),
+ }
+ }
+ fn parse_file_label_1<'a>(
+ &mut self,
+ input: &'a str,
+ eof: bool,
+ ) -> Result<(&'a str, Segment), Incomplete> {
+ let mut sub = Segmenter {
+ state: (State::General, self.state.1),
+ ..*self
+ };
+ let (rest, segment) = sub.push_rest(input, eof)?;
+ if segment == Segment::Identifier {
+ let id = &input[..input.len() - rest.len()];
+ debug_assert!(id_match("LABEL", id), "{id} should be LABEL");
+ if Self::quoted_file_label(rest, eof)? {
+ *self = sub;
+ } else {
+ self.state.0 = State::FileLabel2;
+ }
+ } else {
+ self.state.1 = sub.state.1;
+ }
+ Ok((rest, segment))
+ }
+ fn parse_file_label_2<'a>(
+ &mut self,
+ input: &'a str,
+ eof: bool,
+ ) -> Result<(&'a str, Segment), Incomplete> {
+ let input = skip_spaces(input, eof)?;
+ self.state = (State::FileLabel3, Substate::empty());
+ Ok((input, Segment::Spaces))
+ }
+ fn parse_file_label_3<'a>(
+ &mut self,
+ mut input: &'a str,
+ eof: bool,
+ ) -> Result<(&'a str, Segment), Incomplete> {
+ let mut end_cmd = None;
+ loop {
+ let (c, rest) = take(input, eof)?;
+ match c {
+ None | Some('\n') | Some('\r') if is_end_of_line(input, eof)? => {
+ self.state = (State::General, Substate::empty());
+ return Ok((end_cmd.unwrap_or(input), Segment::UnquotedString));
+ }
+ None => unreachable!(),
+ Some('.') => end_cmd = Some(input),
+ Some(c) if !c.is_whitespace() => end_cmd = None,
+ Some(_) => (),
+ }
+ input = rest;
+ }
+ }
+ fn subparse<'a>(
+ &mut self,
+ input: &'a str,
+ eof: bool,
+ ) -> Result<(&'a str, Segment), Incomplete> {
+ let mut sub = Segmenter {
+ mode: self.mode,
+ state: (State::General, self.state.1),
+ nest: 0,
+ };
+ let result = sub.push_rest(input, eof)?;
+ self.state.1 = sub.state.1;
+ Ok(result)
+ }
+ /// We are segmenting a `DO REPEAT` command, currently reading the syntax
+ /// that defines the stand-in variables (the head) before the lines of
+ /// syntax to be repeated (the body).
+ fn parse_do_repeat_1<'a>(
+ &mut self,
+ input: &'a str,
+ eof: bool,
+ ) -> Result<(&'a str, Segment), Incomplete> {
+ let (rest, segment) = self.subparse(input, eof)?;
+ if segment == Segment::SeparateCommands {
+ // We reached a blank line that separates the head from the body.
+ self.state.0 = State::DoRepeat2;
+ } else if segment == Segment::EndCommand || segment == Segment::StartCommand {
+ // We reached the body.
+ self.state.0 = State::DoRepeat3;
+ self.nest = 1;
+ }
+ Ok((rest, segment))
+ }
+ /// We are segmenting a `DO REPEAT` command, currently reading a blank line
+ /// that separates the head from the body.
+ fn parse_do_repeat_2<'a>(
+ &mut self,
+ input: &'a str,
+ eof: bool,
+ ) -> Result<(&'a str, Segment), Incomplete> {
+ let (rest, segment) = self.subparse(input, eof)?;
+ if segment == Segment::Newline {
+ // We reached the body.
+ self.state.0 = State::DoRepeat3;
+ self.nest = 1;
+ }
+ Ok((rest, segment))
+ }
+ fn parse_newline<'a>(
+ &mut self,
+ input: &'a str,
+ eof: bool,
+ ) -> Result<Option<&'a str>, Incomplete> {
+ let (Some(c), rest) = take(input, eof)? else {
+ return Ok(None);
+ };
+ match c {
+ '\n' => Ok(Some(rest)),
+ '\r' => {
+ if let (Some('\n'), rest) = take(rest, eof)? {
+ Ok(Some(rest))
+ } else {
+ Ok(None)
+ }
+ }
+ _ => Ok(None),
+ }
+ }
+
+ fn parse_full_line<'a>(
+ &mut self,
+ mut input: &'a str,
+ eof: bool,
+ ) -> Result<&'a str, Incomplete> {
+ loop {
+ if is_end_of_line(input, eof)? {
+ return Ok(input);
+ }
+ input = take(input, eof).unwrap().1;
+ }
+ }
+ fn check_repeat_command<'a>(&mut self, input: &'a str, eof: bool) -> Result<isize, Incomplete> {
+ let input = input.strip_prefix(&['-', '+']).unwrap_or(input);
+ let (id1, input) = self.next_id_in_command(input, eof)?;
+ if id_match("DO", id1) && id_match("REPEAT", self.next_id_in_command(input, eof)?.0) {
+ Ok(1)
+ } else if id_match("END", id1) && id_match("REPEAT", self.next_id_in_command(input, eof)?.0)
+ {
+ Ok(-1)
+ } else {
+ Ok(0)
+ }
+ }
+ /// We are in the body of `DO REPEAT`, segmenting the lines of syntax that
+ /// are to be repeated. Report each line of syntax as a single
+ /// [`Segment::DoRepeatCommand`].
+ ///
+ /// `DO REPEAT` can be nested, so we look for `DO REPEAT...END REPEAT`
+ /// blocks inside the lines we're segmenting. `self.nest` counts the
+ /// nesting level, starting at 1.
+ fn parse_do_repeat_3<'a>(
+ &mut self,
+ input: &'a str,
+ eof: bool,
+ ) -> Result<(&'a str, Segment), Incomplete> {
+ if let Some(rest) = self.parse_newline(input, eof)? {
+ return Ok((rest, Segment::Newline));
+ }
+ let rest = self.parse_full_line(input, eof)?;
+ let direction = self.check_repeat_command(input, eof)?;
+ if direction > 0 {
+ if let Some(nest) = self.nest.checked_add(1) {
+ self.nest = nest;
+ } else {
+ self.state.0 = State::DoRepeat4;
+ }
+ } else if direction < 0 {
+ self.nest -= 1;
+ if self.nest == 0 {
+ // Nesting level dropped to 0, so we've finished reading the `DO
+ // REPEAT` body.
+ self.state = (
+ State::General,
+ Substate::START_OF_COMMAND | Substate::START_OF_LINE,
+ );
+ return self.push_rest(input, eof);
+ }
+ }
+ return Ok((rest, Segment::DoRepeatCommand));
+ }
+ fn parse_do_repeat_4<'a>(&mut self, input: &'a str) -> Result<(&'a str, Segment), Incomplete> {
+ self.state.0 = State::DoRepeat3;
+ Ok((input, Segment::DoRepeatOverflow))
+ }
+ /// We are segmenting a `DEFINE` command, which consists of:
+ ///
+ /// - The `DEFINE` keyword.
+ ///
+ /// - An identifier. We transform this into `Segment::MacroName` instead of
+ /// `Segment::Identifier` because this identifier must never be macro-expanded.
+ ///
+ /// - Anything but `(`.
+ ///
+ /// - `(` followed by a sequence of tokens possibly including balanced
+ /// parentheses up to a final `)`.
+ ///
+ /// - A sequence of any number of lines, one string per line, ending with
+ /// `!ENDDEFINE`. The first line is usually blank (that is, a newline
+ /// follows the `(`). The last line usually just has `!ENDDEFINE.` on
+ /// it, but it can start with other tokens. The whole
+ /// DEFINE...!ENDDEFINE can be on a single line, even.
+ fn parse_define_1_2<'a>(
+ &mut self,
+ input: &'a str,
+ eof: bool,
+ ) -> Result<(&'a str, Segment), Incomplete> {
+ let (rest, segment) = self.subparse(input, eof)?;
+ match segment {
+ Segment::Identifier if self.state.0 == State::Define1 => {
+ self.state.0 = State::Define2;
+ return Ok((rest, Segment::MacroName));
+ }
+ Segment::SeparateCommands | Segment::EndCommand | Segment::StartCommand => {
+ // The DEFINE command is malformed because we reached its end
+ // without ever hitting a `(` token. Transition back to general
+ // parsing.
+ self.state.0 = State::General;
+ }
+ Segment::Punct if input.starts_with('(') => {
+ self.state.0 = State::Define3;
+ self.nest = 1;
+ }
+ _ => (),
+ }
+ Ok((rest, segment))
+ }
+ fn parse_define_3<'a>(
+ &mut self,
+ input: &'a str,
+ eof: bool,
+ ) -> Result<(&'a str, Segment), Incomplete> {
+ let (rest, segment) = self.subparse(input, eof)?;
+ match segment {
+ Segment::SeparateCommands | Segment::EndCommand | Segment::StartCommand => {
+ // The DEFINE command is malformed because we reached its end
+ // before the parenthesized argument list was closed by `)`.
+ // Transition back to general parsing.
+ self.state.0 = State::General;
+ }
+ Segment::Punct if input.starts_with('(') => {
+ self.nest += 1;
+ }
+ Segment::Punct if input.starts_with(')') => {
+ self.nest -= 1;
+ if self.nest == 0 {
+ self.state = (State::Define4, Substate::empty());
+ }
+ }
+ _ => (),
+ }
+ Ok((rest, segment))
+ }
+ fn find_enddefine<'a>(mut input: &'a str) -> Option<&'a str> {
+ loop {
+ input = skip_spaces_and_comments(input, true).unwrap();
+ let (Some(c), rest) = take(input, true).unwrap() else {
+ return None;
+ };
+ match c {
+ '!' if strip_prefix_ignore_ascii_case(input, "!ENDDEFINE").is_some() => {
+ return Some(input)
+ }
+ '\'' | '"' => {
+ let index = rest.find(c)?;
+ input = &rest[index + 1..];
+ }
+ _ => input = rest,
+ }
+ }
+ }
+
+ /// We are in the body of a macro definition, looking for additional lines
+ /// of the body or `!ENDDEFINE`.
+ ///
+ /// In `State::Define4`, we're parsing the first line of the macro body (the
+ /// same line as the closing parenthesis in the argument definition). In
+ /// `State::Define5`, we're on a later line.
+ fn parse_define_4_5<'a>(
+ &mut self,
+ input: &'a str,
+ eof: bool,
+ ) -> Result<(&'a str, Segment), Incomplete> {
+ let rest = self.parse_full_line(input, eof)?;
+ let line = &input[..input.len() - rest.len()];
+ if let Some(end) = Self::find_enddefine(line) {
+ // Macro ends at the !ENDDEFINE on this line.
+ self.state = (State::General, Substate::empty());
+ let (prefix, rest) = input.split_at(line.len() - end.len());
+ if prefix.is_empty() {
+ // Line starts with `!ENDDEFINE`.
+ self.push_rest(input, eof)
+ } else if prefix.trim_start().is_empty() {
+ // Line starts with spaces followed by `!ENDDEFINE`.
+ Ok((rest, Segment::Spaces))
+ } else {
+ // Line starts with some content followed by `!ENDDEFINE`.
+ Ok((rest, Segment::MacroBody))
+ }
+ } else {
+ // No `!ENDDEFINE`. We have a full line of macro body.
+ //
+ // If the first line of the macro body is blank, we just report it
+ // as spaces, or not at all if there are no spaces, because it's not
+ // significant.
+ //
+ // However, if it's a later line, we need to report it because blank
+ // lines can have significance.
+ let segment = if self.state.0 == State::Define4 && line.trim_start().is_empty() {
+ if line.is_empty() {
+ return self.parse_define_6(input, eof);
+ }
+ Segment::Spaces
+ } else {
+ Segment::MacroBody
+ };
+ self.state.0 = State::Define6;
+ Ok((rest, segment))
+ }
+ }
+ fn parse_define_6<'a>(
+ &mut self,
+ input: &'a str,
+ eof: bool,
+ ) -> Result<(&'a str, Segment), Incomplete> {
+ let rest = self.parse_newline(input, eof)?.unwrap();
+ self.state.0 = State::Define5;
+ Ok((rest, Segment::Newline))
+ }
+ fn parse_begin_data_1<'a>(
+ &mut self,
+ input: &'a str,
+ eof: bool,
+ ) -> Result<(&'a str, Segment), Incomplete> {
+ let (rest, segment) = self.subparse(input, eof)?;
+ if segment == Segment::Newline {
+ self.state.0 = State::BeginData2;
+ }
+ Ok((rest, segment))
+ }
+ fn parse_begin_data_2<'a>(
+ &mut self,
+ input: &'a str,
+ eof: bool,
+ ) -> Result<(&'a str, Segment), Incomplete> {
+ let (rest, segment) = self.subparse(input, eof)?;
+ if segment == Segment::Newline {
+ self.state.0 = State::BeginData3;
+ }
+ Ok((rest, segment))
+ }
+ fn is_end_data(line: &str) -> bool {
+ let Some(rest) = strip_prefix_ignore_ascii_case(line, "END") else {
+ return false;
+ };
+ let (Some(c), rest) = take(rest, true).unwrap() else {
+ return false;
+ };
+ if !c.is_whitespace() {
+ return false;
+ };
+ let Some(rest) = strip_prefix_ignore_ascii_case(rest, "DATA") else {
+ return false;
+ };
+
+ let mut endcmd = false;
+ for c in rest.chars() {
+ match c {
+ '.' if endcmd => return false,
+ '.' => endcmd = true,
+ c if c.is_whitespace() => (),
+ _ => return false,
+ }
+ }
+ true
+ }
+ fn parse_begin_data_3<'a>(
+ &mut self,
+ input: &'a str,
+ eof: bool,
+ ) -> Result<(&'a str, Segment), Incomplete> {
+ let rest = self.parse_full_line(input, eof)?;
+ let line = &input[..input.len() - rest.len()];
+ if Self::is_end_data(line) {
+ self.state = (
+ State::General,
+ Substate::START_OF_COMMAND | Substate::START_OF_LINE,
+ );
+ self.push_rest(input, eof)
+ } else {
+ self.state.0 = State::BeginData4;
+ Ok((rest, Segment::InlineData))
+ }
+ }
+ fn parse_begin_data_4<'a>(
+ &mut self,
+ input: &'a str,
+ eof: bool,
+ ) -> Result<(&'a str, Segment), Incomplete> {
+ let rest = self.parse_newline(input, eof)?.unwrap();
+ self.state.0 = State::BeginData3;
+ Ok((rest, Segment::Newline))
+ }
+}
+
+fn strip_prefix_ignore_ascii_case<'a>(line: &'a str, pattern: &str) -> Option<&'a str> {
+ line.get(..pattern.len())
+ .map(|prefix| {
+ prefix
+ .eq_ignore_ascii_case(pattern)
+ .then(|| &line[pattern.len()..])
+ })
+ .flatten()
+}
+
+#[cfg(test)]
+mod test;
--- /dev/null
+use crate::prompt::PromptStyle;
+
+use super::{Mode, Segment, Segmenter};
+
+fn push_segment<'a>(
+ segmenter: &mut Segmenter,
+ input: &'a str,
+ one_byte: bool,
+) -> (usize, Segment) {
+ if one_byte {
+ for len in input.char_indices().map(|(pos, _c)| pos) {
+ if let Ok(result) = segmenter.push(&input[..len], false) {
+ return result;
+ }
+ }
+ }
+ segmenter.push(input, true).unwrap()
+}
+
+fn _check_segmentation(
+ mut input: &str,
+ mode: Mode,
+ expect_segments: &[(Segment, &str)],
+ expect_prompts: &[PromptStyle],
+ one_byte: bool,
+) {
+ let mut segments = Vec::with_capacity(expect_segments.len());
+ let mut prompts = Vec::new();
+ let mut segmenter = Segmenter::new(mode, false);
+ loop {
+ let (seg_len, seg_type) = push_segment(&mut segmenter, input, one_byte);
+ let (token, rest) = input.split_at(seg_len);
+ segments.push((seg_type, token));
+ match seg_type {
+ Segment::End => break,
+ Segment::Newline => prompts.push(segmenter.prompt()),
+ _ => (),
+ }
+ input = rest;
+ }
+
+ if &segments != expect_segments {
+ eprintln!("segments differ from expected:");
+ let difference = diff::slice(expect_segments, &segments);
+ for result in difference {
+ match result {
+ diff::Result::Left(left) => eprintln!("-{left:?}"),
+ diff::Result::Both(left, _right) => eprintln!(" {left:?}"),
+ diff::Result::Right(right) => eprintln!("+{right:?}"),
+ }
+ }
+ panic!();
+ }
+
+ if &prompts != expect_prompts {
+ eprintln!("prompts differ from expected:");
+ let difference = diff::slice(expect_prompts, &prompts);
+ for result in difference {
+ match result {
+ diff::Result::Left(left) => eprintln!("-{left:?}"),
+ diff::Result::Both(left, _right) => eprintln!(" {left:?}"),
+ diff::Result::Right(right) => eprintln!("+{right:?}"),
+ }
+ }
+ panic!();
+ }
+}
+
+fn check_segmentation(
+ input: &str,
+ mode: Mode,
+ expect_segments: &[(Segment, &str)],
+ expect_prompts: &[PromptStyle],
+) {
+ for (one_byte, one_byte_name) in [(false, "full-string"), (true, "byte-by-byte")] {
+ println!("running {one_byte_name} segmentation test with LF newlines...");
+ _check_segmentation(input, mode, expect_segments, expect_prompts, one_byte);
+
+ println!("running {one_byte_name} segmentation test with CRLF newlines...");
+ _check_segmentation(
+ &input.replace('\n', "\r\n"),
+ mode,
+ &expect_segments
+ .iter()
+ .map(|(segment, s)| match *segment {
+ Segment::Newline => (Segment::Newline, "\r\n"),
+ _ => (*segment, *s),
+ })
+ .collect::<Vec<_>>(),
+ expect_prompts,
+ one_byte,
+ );
+
+ if let Some(input) = input.strip_suffix('\n') {
+ println!("running {one_byte_name} segmentation test without final newline...");
+ let mut expect_segments: Vec<_> = expect_segments.iter().copied().collect();
+ assert_eq!(expect_segments.pop(), Some((Segment::End, "")));
+ assert_eq!(expect_segments.pop(), Some((Segment::Newline, "\n")));
+ while let Some((Segment::SeparateCommands | Segment::EndCommand, "")) =
+ expect_segments.last()
+ {
+ expect_segments.pop();
+ }
+ expect_segments.push((Segment::End, ""));
+ _check_segmentation(
+ input,
+ mode,
+ &expect_segments,
+ &expect_prompts[..expect_prompts.len() - 1],
+ one_byte,
+ );
+ }
+ }
+}
+
+#[allow(dead_code)]
+fn print_segmentation(mut input: &str) {
+ let mut segmenter = Segmenter::new(Mode::Interactive, false);
+ loop {
+ let (seg_len, seg_type) = segmenter.push(input, true).unwrap();
+ let (token, rest) = input.split_at(seg_len);
+ print!("{seg_type:?} {token:?}");
+ match seg_type {
+ Segment::Newline => print!(" ({:?})", segmenter.prompt()),
+ Segment::End => break,
+ _ => (),
+ }
+ println!();
+ input = rest;
+ }
+}
+
+#[test]
+fn test_identifiers() {
+ check_segmentation(
+ r#"a ab abc abcd !abcd
+A AB ABC ABCD !ABCD
+aB aBC aBcD !aBcD
+$x $y $z !$z
+grève Ângstrom poté
+#a #b #c ## #d !#d
+@efg @ @@. @#@ !@
+## # #12345 #.#
+f@#_.#6
+GhIjK
+.x 1y _z
+!abc abc!
+"#,
+ Mode::Auto,
+ &[
+ (Segment::Identifier, "a"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "ab"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "abc"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "abcd"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "!abcd"),
+ (Segment::Newline, "\n"),
+ (Segment::Identifier, "A"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "AB"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "ABC"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "ABCD"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "!ABCD"),
+ (Segment::Newline, "\n"),
+ (Segment::Identifier, "aB"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "aBC"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "aBcD"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "!aBcD"),
+ (Segment::Newline, "\n"),
+ (Segment::Identifier, "$x"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "$y"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "$z"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "!$z"),
+ (Segment::Newline, "\n"),
+ (Segment::Identifier, "grève"),
+ (Segment::Spaces, "\u{00a0}"),
+ (Segment::Identifier, "Ângstrom"),
+ (Segment::Spaces, "\u{00a0}"),
+ (Segment::Identifier, "poté"),
+ (Segment::Newline, "\n"),
+ (Segment::Identifier, "#a"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "#b"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "#c"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "##"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "#d"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "!#d"),
+ (Segment::Newline, "\n"),
+ (Segment::Identifier, "@efg"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "@"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "@@."),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "@#@"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "!@"),
+ (Segment::Spaces, " "),
+ (Segment::Newline, "\n"),
+ (Segment::Identifier, "##"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "#"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "#12345"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "#.#"),
+ (Segment::Newline, "\n"),
+ (Segment::Identifier, "f@#_.#6"),
+ (Segment::Newline, "\n"),
+ (Segment::Identifier, "GhIjK"),
+ (Segment::Newline, "\n"),
+ (Segment::StartCommand, "."),
+ (Segment::Identifier, "x"),
+ (Segment::Spaces, " "),
+ (Segment::Number, "1"),
+ (Segment::Identifier, "y"),
+ (Segment::Spaces, " "),
+ (Segment::Punct, "_"),
+ (Segment::Identifier, "z"),
+ (Segment::Newline, "\n"),
+ (Segment::Identifier, "!abc"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "abc"),
+ (Segment::Punct, "!"),
+ (Segment::Newline, "\n"),
+ (Segment::End, ""),
+ ],
+ &[
+ PromptStyle::Later,
+ PromptStyle::Later,
+ PromptStyle::Later,
+ PromptStyle::Later,
+ PromptStyle::Later,
+ PromptStyle::Later,
+ PromptStyle::Later,
+ PromptStyle::Later,
+ PromptStyle::Later,
+ PromptStyle::Later,
+ PromptStyle::Later,
+ PromptStyle::Later,
+ ],
+ );
+}
+
+#[test]
+fn test_identifiers_ending_in_dot() {
+ check_segmentation(
+ r#"abcd. abcd.
+ABCD. ABCD.
+aBcD. aBcD.
+$y. $z. あいうえお.
+#c. #d..
+@@. @@....
+#.#.
+#abcd.
+.
+.
+LMNOP.
+QRSTUV./* end of line comment */
+qrstuv. /* end of line comment */
+QrStUv./* end of line comment */
+wxyz./* unterminated end of line comment
+WXYZ. /* unterminated end of line comment
+WxYz./* unterminated end of line comment
+"#,
+ Mode::Auto,
+ &[
+ (Segment::Identifier, "abcd."),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "abcd"),
+ (Segment::EndCommand, "."),
+ (Segment::Newline, "\n"),
+ (Segment::Identifier, "ABCD."),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "ABCD"),
+ (Segment::EndCommand, "."),
+ (Segment::Newline, "\n"),
+ (Segment::Identifier, "aBcD."),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "aBcD"),
+ (Segment::EndCommand, "."),
+ (Segment::Spaces, " "),
+ (Segment::Newline, "\n"),
+ (Segment::Identifier, "$y."),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "$z."),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "あいうえお"),
+ (Segment::EndCommand, "."),
+ (Segment::Newline, "\n"),
+ (Segment::Identifier, "#c."),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "#d."),
+ (Segment::EndCommand, "."),
+ (Segment::Newline, "\n"),
+ (Segment::Identifier, "@@."),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "@@..."),
+ (Segment::EndCommand, "."),
+ (Segment::Newline, "\n"),
+ (Segment::Identifier, "#.#"),
+ (Segment::EndCommand, "."),
+ (Segment::Newline, "\n"),
+ (Segment::Identifier, "#abcd"),
+ (Segment::EndCommand, "."),
+ (Segment::Newline, "\n"),
+ (Segment::StartCommand, "."),
+ (Segment::Newline, "\n"),
+ (Segment::StartCommand, "."),
+ (Segment::Spaces, " "),
+ (Segment::Newline, "\n"),
+ (Segment::Identifier, "LMNOP"),
+ (Segment::EndCommand, "."),
+ (Segment::Spaces, " "),
+ (Segment::Newline, "\n"),
+ (Segment::Identifier, "QRSTUV"),
+ (Segment::EndCommand, "."),
+ (Segment::Comment, "/* end of line comment */"),
+ (Segment::Newline, "\n"),
+ (Segment::Identifier, "qrstuv"),
+ (Segment::EndCommand, "."),
+ (Segment::Spaces, " "),
+ (Segment::Comment, "/* end of line comment */"),
+ (Segment::Newline, "\n"),
+ (Segment::Identifier, "QrStUv"),
+ (Segment::EndCommand, "."),
+ (Segment::Comment, "/* end of line comment */"),
+ (Segment::Spaces, " "),
+ (Segment::Newline, "\n"),
+ (Segment::Identifier, "wxyz"),
+ (Segment::EndCommand, "."),
+ (Segment::Comment, "/* unterminated end of line comment"),
+ (Segment::Newline, "\n"),
+ (Segment::Identifier, "WXYZ"),
+ (Segment::EndCommand, "."),
+ (Segment::Spaces, " "),
+ (Segment::Comment, "/* unterminated end of line comment"),
+ (Segment::Newline, "\n"),
+ (Segment::Identifier, "WxYz"),
+ (Segment::EndCommand, "."),
+ (Segment::Comment, "/* unterminated end of line comment "),
+ (Segment::Newline, "\n"),
+ (Segment::End, ""),
+ ],
+ &[
+ PromptStyle::First,
+ PromptStyle::First,
+ PromptStyle::First,
+ PromptStyle::First,
+ PromptStyle::First,
+ PromptStyle::First,
+ PromptStyle::First,
+ PromptStyle::First,
+ PromptStyle::First,
+ PromptStyle::First,
+ PromptStyle::First,
+ PromptStyle::First,
+ PromptStyle::First,
+ PromptStyle::First,
+ PromptStyle::First,
+ PromptStyle::First,
+ PromptStyle::First,
+ ],
+ );
+}
+
+#[test]
+fn test_reserved_words() {
+ check_segmentation(
+ r#"and or not eq ge gt le lt ne all by to with
+AND OR NOT EQ GE GT LE LT NE ALL BY TO WITH
+andx orx notx eqx gex gtx lex ltx nex allx byx tox withx
+and. with.
+"#,
+ Mode::Auto,
+ &[
+ (Segment::Identifier, "and"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "or"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "not"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "eq"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "ge"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "gt"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "le"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "lt"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "ne"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "all"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "by"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "to"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "with"),
+ (Segment::Newline, "\n"),
+ (Segment::Identifier, "AND"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "OR"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "NOT"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "EQ"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "GE"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "GT"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "LE"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "LT"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "NE"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "ALL"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "BY"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "TO"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "WITH"),
+ (Segment::Newline, "\n"),
+ (Segment::Identifier, "andx"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "orx"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "notx"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "eqx"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "gex"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "gtx"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "lex"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "ltx"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "nex"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "allx"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "byx"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "tox"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "withx"),
+ (Segment::Newline, "\n"),
+ (Segment::Identifier, "and."),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "with"),
+ (Segment::EndCommand, "."),
+ (Segment::Newline, "\n"),
+ (Segment::End, ""),
+ ],
+ &[
+ PromptStyle::Later,
+ PromptStyle::Later,
+ PromptStyle::Later,
+ PromptStyle::First,
+ ],
+ );
+}
+
+#[test]
+fn test_punctuation() {
+ check_segmentation(
+ r#"~ & | = >= > <= < ~= <> ( ) , - + * / [ ] **
+~&|=>=><=<~=<>(),-+*/[]**!*
+% : ; ? _ ` { } ~ !*
+"#,
+ Mode::Auto,
+ &[
+ (Segment::Punct, "~"),
+ (Segment::Spaces, " "),
+ (Segment::Punct, "&"),
+ (Segment::Spaces, " "),
+ (Segment::Punct, "|"),
+ (Segment::Spaces, " "),
+ (Segment::Punct, "="),
+ (Segment::Spaces, " "),
+ (Segment::Punct, ">="),
+ (Segment::Spaces, " "),
+ (Segment::Punct, ">"),
+ (Segment::Spaces, " "),
+ (Segment::Punct, "<="),
+ (Segment::Spaces, " "),
+ (Segment::Punct, "<"),
+ (Segment::Spaces, " "),
+ (Segment::Punct, "~="),
+ (Segment::Spaces, " "),
+ (Segment::Punct, "<>"),
+ (Segment::Spaces, " "),
+ (Segment::Punct, "("),
+ (Segment::Spaces, " "),
+ (Segment::Punct, ")"),
+ (Segment::Spaces, " "),
+ (Segment::Punct, ","),
+ (Segment::Spaces, " "),
+ (Segment::Punct, "-"),
+ (Segment::Spaces, " "),
+ (Segment::Punct, "+"),
+ (Segment::Spaces, " "),
+ (Segment::Punct, "*"),
+ (Segment::Spaces, " "),
+ (Segment::Punct, "/"),
+ (Segment::Spaces, " "),
+ (Segment::Punct, "["),
+ (Segment::Spaces, " "),
+ (Segment::Punct, "]"),
+ (Segment::Spaces, " "),
+ (Segment::Punct, "**"),
+ (Segment::Newline, "\n"),
+ (Segment::Punct, "~"),
+ (Segment::Punct, "&"),
+ (Segment::Punct, "|"),
+ (Segment::Punct, "="),
+ (Segment::Punct, ">="),
+ (Segment::Punct, ">"),
+ (Segment::Punct, "<="),
+ (Segment::Punct, "<"),
+ (Segment::Punct, "~="),
+ (Segment::Punct, "<>"),
+ (Segment::Punct, "("),
+ (Segment::Punct, ")"),
+ (Segment::Punct, ","),
+ (Segment::Punct, "-"),
+ (Segment::Punct, "+"),
+ (Segment::Punct, "*"),
+ (Segment::Punct, "/"),
+ (Segment::Punct, "["),
+ (Segment::Punct, "]"),
+ (Segment::Punct, "**"),
+ (Segment::Punct, "!*"),
+ (Segment::Newline, "\n"),
+ (Segment::Punct, "%"),
+ (Segment::Spaces, " "),
+ (Segment::Punct, ":"),
+ (Segment::Spaces, " "),
+ (Segment::Punct, ";"),
+ (Segment::Spaces, " "),
+ (Segment::Punct, "?"),
+ (Segment::Spaces, " "),
+ (Segment::Punct, "_"),
+ (Segment::Spaces, " "),
+ (Segment::Punct, "`"),
+ (Segment::Spaces, " "),
+ (Segment::Punct, "{"),
+ (Segment::Spaces, " "),
+ (Segment::Punct, "}"),
+ (Segment::Spaces, " "),
+ (Segment::Punct, "~"),
+ (Segment::Spaces, " "),
+ (Segment::Punct, "!*"),
+ (Segment::Newline, "\n"),
+ (Segment::End, ""),
+ ],
+ &[PromptStyle::Later, PromptStyle::Later, PromptStyle::Later],
+ );
+}
+
+#[test]
+fn test_positive_numbers() {
+ check_segmentation(
+ r#"0 1 01 001. 1.
+123. /* comment 1 */ /* comment 2 */
+.1 0.1 00.1 00.10
+5e1 6E-1 7e+1 6E+01 6e-03
+.3E1 .4e-1 .5E+1 .6e+01 .7E-03
+1.23e1 45.6E-1 78.9e+1 99.9E+01 11.2e-03
+. 1e e1 1e+ 1e- 1.
+"#,
+ Mode::Auto,
+ &[
+ (Segment::Number, "0"),
+ (Segment::Spaces, " "),
+ (Segment::Number, "1"),
+ (Segment::Spaces, " "),
+ (Segment::Number, "01"),
+ (Segment::Spaces, " "),
+ (Segment::Number, "001."),
+ (Segment::Spaces, " "),
+ (Segment::Number, "1"),
+ (Segment::EndCommand, "."),
+ (Segment::Newline, "\n"),
+ (Segment::Number, "123"),
+ (Segment::EndCommand, "."),
+ (Segment::Spaces, " "),
+ (Segment::Comment, "/* comment 1 */"),
+ (Segment::Spaces, " "),
+ (Segment::Comment, "/* comment 2 */"),
+ (Segment::Newline, "\n"),
+ (Segment::StartCommand, "."),
+ (Segment::Number, "1"),
+ (Segment::Spaces, " "),
+ (Segment::Number, "0.1"),
+ (Segment::Spaces, " "),
+ (Segment::Number, "00.1"),
+ (Segment::Spaces, " "),
+ (Segment::Number, "00.10"),
+ (Segment::Newline, "\n"),
+ (Segment::Number, "5e1"),
+ (Segment::Spaces, " "),
+ (Segment::Number, "6E-1"),
+ (Segment::Spaces, " "),
+ (Segment::Number, "7e+1"),
+ (Segment::Spaces, " "),
+ (Segment::Number, "6E+01"),
+ (Segment::Spaces, " "),
+ (Segment::Number, "6e-03"),
+ (Segment::Newline, "\n"),
+ (Segment::StartCommand, "."),
+ (Segment::Number, "3E1"),
+ (Segment::Spaces, " "),
+ (Segment::Number, ".4e-1"),
+ (Segment::Spaces, " "),
+ (Segment::Number, ".5E+1"),
+ (Segment::Spaces, " "),
+ (Segment::Number, ".6e+01"),
+ (Segment::Spaces, " "),
+ (Segment::Number, ".7E-03"),
+ (Segment::Newline, "\n"),
+ (Segment::Number, "1.23e1"),
+ (Segment::Spaces, " "),
+ (Segment::Number, "45.6E-1"),
+ (Segment::Spaces, " "),
+ (Segment::Number, "78.9e+1"),
+ (Segment::Spaces, " "),
+ (Segment::Number, "99.9E+01"),
+ (Segment::Spaces, " "),
+ (Segment::Number, "11.2e-03"),
+ (Segment::Newline, "\n"),
+ (Segment::StartCommand, "."),
+ (Segment::Spaces, " "),
+ (Segment::ExpectedExponent, "1e"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "e1"),
+ (Segment::Spaces, " "),
+ (Segment::ExpectedExponent, "1e+"),
+ (Segment::Spaces, " "),
+ (Segment::ExpectedExponent, "1e-"),
+ (Segment::Spaces, " "),
+ (Segment::Number, "1"),
+ (Segment::EndCommand, "."),
+ (Segment::Newline, "\n"),
+ (Segment::End, ""),
+ ],
+ &[
+ PromptStyle::First,
+ PromptStyle::First,
+ PromptStyle::Later,
+ PromptStyle::Later,
+ PromptStyle::Later,
+ PromptStyle::Later,
+ PromptStyle::First,
+ ],
+ );
+}
+
+#[test]
+fn test_negative_numbers() {
+ check_segmentation(
+ r#" -0 -1 -01 -001. -1.
+ -123. /* comment 1 */ /* comment 2 */
+ -.1 -0.1 -00.1 -00.10
+ -5e1 -6E-1 -7e+1 -6E+01 -6e-03
+ -.3E1 -.4e-1 -.5E+1 -.6e+01 -.7E-03
+ -1.23e1 -45.6E-1 -78.9e+1 -99.9E+01 -11.2e-03
+ -/**/1
+ -. -1e -e1 -1e+ -1e- -1.
+"#,
+ Mode::Auto,
+ &[
+ (Segment::Spaces, " "),
+ (Segment::Number, "-0"),
+ (Segment::Spaces, " "),
+ (Segment::Number, "-1"),
+ (Segment::Spaces, " "),
+ (Segment::Number, "-01"),
+ (Segment::Spaces, " "),
+ (Segment::Number, "-001."),
+ (Segment::Spaces, " "),
+ (Segment::Number, "-1"),
+ (Segment::EndCommand, "."),
+ (Segment::Newline, "\n"),
+ (Segment::Spaces, " "),
+ (Segment::Number, "-123"),
+ (Segment::EndCommand, "."),
+ (Segment::Spaces, " "),
+ (Segment::Comment, "/* comment 1 */"),
+ (Segment::Spaces, " "),
+ (Segment::Comment, "/* comment 2 */"),
+ (Segment::Newline, "\n"),
+ (Segment::Spaces, " "),
+ (Segment::Number, "-.1"),
+ (Segment::Spaces, " "),
+ (Segment::Number, "-0.1"),
+ (Segment::Spaces, " "),
+ (Segment::Number, "-00.1"),
+ (Segment::Spaces, " "),
+ (Segment::Number, "-00.10"),
+ (Segment::Newline, "\n"),
+ (Segment::Spaces, " "),
+ (Segment::Number, "-5e1"),
+ (Segment::Spaces, " "),
+ (Segment::Number, "-6E-1"),
+ (Segment::Spaces, " "),
+ (Segment::Number, "-7e+1"),
+ (Segment::Spaces, " "),
+ (Segment::Number, "-6E+01"),
+ (Segment::Spaces, " "),
+ (Segment::Number, "-6e-03"),
+ (Segment::Newline, "\n"),
+ (Segment::Spaces, " "),
+ (Segment::Number, "-.3E1"),
+ (Segment::Spaces, " "),
+ (Segment::Number, "-.4e-1"),
+ (Segment::Spaces, " "),
+ (Segment::Number, "-.5E+1"),
+ (Segment::Spaces, " "),
+ (Segment::Number, "-.6e+01"),
+ (Segment::Spaces, " "),
+ (Segment::Number, "-.7E-03"),
+ (Segment::Newline, "\n"),
+ (Segment::Spaces, " "),
+ (Segment::Number, "-1.23e1"),
+ (Segment::Spaces, " "),
+ (Segment::Number, "-45.6E-1"),
+ (Segment::Spaces, " "),
+ (Segment::Number, "-78.9e+1"),
+ (Segment::Spaces, " "),
+ (Segment::Number, "-99.9E+01"),
+ (Segment::Spaces, " "),
+ (Segment::Number, "-11.2e-03"),
+ (Segment::Newline, "\n"),
+ (Segment::Spaces, " "),
+ (Segment::Punct, "-"),
+ (Segment::Comment, "/**/"),
+ (Segment::Number, "1"),
+ (Segment::Newline, "\n"),
+ (Segment::Spaces, " "),
+ (Segment::Punct, "-"),
+ (Segment::Punct, "."),
+ (Segment::Spaces, " "),
+ (Segment::ExpectedExponent, "-1e"),
+ (Segment::Spaces, " "),
+ (Segment::Punct, "-"),
+ (Segment::Identifier, "e1"),
+ (Segment::Spaces, " "),
+ (Segment::ExpectedExponent, "-1e+"),
+ (Segment::Spaces, " "),
+ (Segment::ExpectedExponent, "-1e-"),
+ (Segment::Spaces, " "),
+ (Segment::Number, "-1"),
+ (Segment::EndCommand, "."),
+ (Segment::Newline, "\n"),
+ (Segment::End, ""),
+ ],
+ &[
+ PromptStyle::First,
+ PromptStyle::First,
+ PromptStyle::Later,
+ PromptStyle::Later,
+ PromptStyle::Later,
+ PromptStyle::Later,
+ PromptStyle::Later,
+ PromptStyle::First,
+ ],
+ );
+}
+
+#[test]
+fn test_strings() {
+ check_segmentation(
+ r#"'x' "y" 'abc'
+'Don''t' "Can't" 'Won''t'
+"""quoted""" '"quoted"'
+'' ""
+'missing end quote
+"missing double quote
+x"4142" X'5152'
+u'fffd' U"041"
++ new command
++ /* comment */ 'string continuation'
++ /* also a punctuator on blank line
+- 'new command'
+"#,
+ Mode::Auto,
+ &[
+ (Segment::QuotedString, "'x'"),
+ (Segment::Spaces, " "),
+ (Segment::QuotedString, "\"y\""),
+ (Segment::Spaces, " "),
+ (Segment::QuotedString, "'abc'"),
+ (Segment::Newline, "\n"),
+ (Segment::QuotedString, "'Don''t'"),
+ (Segment::Spaces, " "),
+ (Segment::QuotedString, "\"Can't\""),
+ (Segment::Spaces, " "),
+ (Segment::QuotedString, "'Won''t'"),
+ (Segment::Newline, "\n"),
+ (Segment::QuotedString, "\"\"\"quoted\"\"\""),
+ (Segment::Spaces, " "),
+ (Segment::QuotedString, "'\"quoted\"'"),
+ (Segment::Newline, "\n"),
+ (Segment::QuotedString, "''"),
+ (Segment::Spaces, " "),
+ (Segment::QuotedString, "\"\""),
+ (Segment::Newline, "\n"),
+ (Segment::ExpectedQuote, "'missing end quote"),
+ (Segment::Newline, "\n"),
+ (Segment::ExpectedQuote, "\"missing double quote"),
+ (Segment::Newline, "\n"),
+ (Segment::HexString, "x\"4142\""),
+ (Segment::Spaces, " "),
+ (Segment::HexString, "X'5152'"),
+ (Segment::Newline, "\n"),
+ (Segment::UnicodeString, "u'fffd'"),
+ (Segment::Spaces, " "),
+ (Segment::UnicodeString, "U\"041\""),
+ (Segment::Newline, "\n"),
+ (Segment::StartCommand, "+"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "new"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "command"),
+ (Segment::Newline, "\n"),
+ (Segment::Punct, "+"),
+ (Segment::Spaces, " "),
+ (Segment::Comment, "/* comment */"),
+ (Segment::Spaces, " "),
+ (Segment::QuotedString, "'string continuation'"),
+ (Segment::Newline, "\n"),
+ (Segment::Punct, "+"),
+ (Segment::Spaces, " "),
+ (Segment::Comment, "/* also a punctuator on blank line"),
+ (Segment::Newline, "\n"),
+ (Segment::StartCommand, "-"),
+ (Segment::Spaces, " "),
+ (Segment::QuotedString, "'new command'"),
+ (Segment::Newline, "\n"),
+ (Segment::End, ""),
+ ],
+ &[
+ PromptStyle::Later,
+ PromptStyle::Later,
+ PromptStyle::Later,
+ PromptStyle::Later,
+ PromptStyle::Later,
+ PromptStyle::Later,
+ PromptStyle::Later,
+ PromptStyle::Later,
+ PromptStyle::Later,
+ PromptStyle::Later,
+ PromptStyle::Later,
+ PromptStyle::Later,
+ ],
+ );
+}
+
+#[test]
+fn test_shbang() {
+ check_segmentation(
+ r#"#! /usr/bin/pspp
+title my title.
+#! /usr/bin/pspp
+"#,
+ Mode::Interactive,
+ &[
+ (Segment::Shbang, "#! /usr/bin/pspp"),
+ (Segment::Newline, "\n"),
+ (Segment::Identifier, "title"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "my"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "title"),
+ (Segment::EndCommand, "."),
+ (Segment::Newline, "\n"),
+ (Segment::Identifier, "#"),
+ (Segment::Punct, "!"),
+ (Segment::Spaces, " "),
+ (Segment::Punct, "/"),
+ (Segment::Identifier, "usr"),
+ (Segment::Punct, "/"),
+ (Segment::Identifier, "bin"),
+ (Segment::Punct, "/"),
+ (Segment::Identifier, "pspp"),
+ (Segment::Newline, "\n"),
+ (Segment::End, ""),
+ ],
+ &[PromptStyle::First, PromptStyle::First, PromptStyle::Later],
+ );
+}
+
+#[test]
+fn test_comment_command() {
+ check_segmentation(
+ r#"* Comment commands "don't
+have to contain valid tokens.
+
+** Check ambiguity with ** token.
+****************.
+
+comment keyword works too.
+COMM also.
+com is ambiguous with COMPUTE.
+
+ * Comment need not start at left margin.
+
+* Comment ends with blank line
+
+next command.
+
+"#,
+ Mode::Interactive,
+ &[
+ (Segment::CommentCommand, "* Comment commands \"don't"),
+ (Segment::Newline, "\n"),
+ (Segment::CommentCommand, "have to contain valid tokens"),
+ (Segment::EndCommand, "."),
+ (Segment::Newline, "\n"),
+ (Segment::SeparateCommands, ""),
+ (Segment::Newline, "\n"),
+ (Segment::CommentCommand, "** Check ambiguity with ** token"),
+ (Segment::EndCommand, "."),
+ (Segment::Newline, "\n"),
+ (Segment::CommentCommand, "****************"),
+ (Segment::EndCommand, "."),
+ (Segment::Newline, "\n"),
+ (Segment::SeparateCommands, ""),
+ (Segment::Newline, "\n"),
+ (Segment::CommentCommand, "comment keyword works too"),
+ (Segment::EndCommand, "."),
+ (Segment::Newline, "\n"),
+ (Segment::CommentCommand, "COMM also"),
+ (Segment::EndCommand, "."),
+ (Segment::Newline, "\n"),
+ (Segment::Identifier, "com"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "is"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "ambiguous"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "with"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "COMPUTE"),
+ (Segment::EndCommand, "."),
+ (Segment::Newline, "\n"),
+ (Segment::SeparateCommands, ""),
+ (Segment::Newline, "\n"),
+ (Segment::Spaces, " "),
+ (
+ Segment::CommentCommand,
+ "* Comment need not start at left margin",
+ ),
+ (Segment::EndCommand, "."),
+ (Segment::Newline, "\n"),
+ (Segment::SeparateCommands, ""),
+ (Segment::Newline, "\n"),
+ (Segment::CommentCommand, "* Comment ends with blank line"),
+ (Segment::Newline, "\n"),
+ (Segment::SeparateCommands, ""),
+ (Segment::Newline, "\n"),
+ (Segment::Identifier, "next"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "command"),
+ (Segment::EndCommand, "."),
+ (Segment::Newline, "\n"),
+ (Segment::SeparateCommands, ""),
+ (Segment::Newline, "\n"),
+ (Segment::End, ""),
+ ],
+ &[
+ PromptStyle::Comment,
+ PromptStyle::First,
+ PromptStyle::First,
+ PromptStyle::First,
+ PromptStyle::First,
+ PromptStyle::First,
+ PromptStyle::First,
+ PromptStyle::First,
+ PromptStyle::First,
+ PromptStyle::First,
+ PromptStyle::First,
+ PromptStyle::First,
+ PromptStyle::Comment,
+ PromptStyle::First,
+ PromptStyle::First,
+ PromptStyle::First,
+ ],
+ );
+}
+
+#[test]
+fn test_document_command() {
+ check_segmentation(
+ r#"DOCUMENT one line.
+DOC more
+ than
+ one
+ line.
+docu
+first.paragraph
+isn't parsed as tokens
+
+second paragraph.
+"#,
+ Mode::Interactive,
+ &[
+ (Segment::StartDocument, ""),
+ (Segment::Document, "DOCUMENT one line."),
+ (Segment::EndCommand, ""),
+ (Segment::SeparateCommands, ""),
+ (Segment::Newline, "\n"),
+ (Segment::StartDocument, ""),
+ (Segment::Document, "DOC more"),
+ (Segment::Newline, "\n"),
+ (Segment::Document, " than"),
+ (Segment::Newline, "\n"),
+ (Segment::Document, " one"),
+ (Segment::Newline, "\n"),
+ (Segment::Document, " line."),
+ (Segment::EndCommand, ""),
+ (Segment::SeparateCommands, ""),
+ (Segment::Newline, "\n"),
+ (Segment::StartDocument, ""),
+ (Segment::Document, "docu"),
+ (Segment::Newline, "\n"),
+ (Segment::Document, "first.paragraph"),
+ (Segment::Newline, "\n"),
+ (Segment::Document, "isn't parsed as tokens"),
+ (Segment::Newline, "\n"),
+ (Segment::Document, ""),
+ (Segment::Newline, "\n"),
+ (Segment::Document, "second paragraph."),
+ (Segment::EndCommand, ""),
+ (Segment::SeparateCommands, ""),
+ (Segment::Newline, "\n"),
+ (Segment::End, ""),
+ ],
+ &[
+ PromptStyle::First,
+ PromptStyle::Document,
+ PromptStyle::Document,
+ PromptStyle::Document,
+ PromptStyle::First,
+ PromptStyle::Document,
+ PromptStyle::Document,
+ PromptStyle::Document,
+ PromptStyle::Document,
+ PromptStyle::First,
+ ],
+ );
+}
+
+#[test]
+fn test_file_label_command() {
+ check_segmentation(
+ r#"FIL label isn't quoted.
+FILE
+ lab 'is quoted'.
+FILE /*
+/**/ lab not quoted here either
+
+"#,
+ Mode::Interactive,
+ &[
+ (Segment::Identifier, "FIL"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "label"),
+ (Segment::Spaces, " "),
+ (Segment::UnquotedString, "isn't quoted"),
+ (Segment::EndCommand, "."),
+ (Segment::Newline, "\n"),
+ (Segment::Identifier, "FILE"),
+ (Segment::Newline, "\n"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "lab"),
+ (Segment::Spaces, " "),
+ (Segment::QuotedString, "'is quoted'"),
+ (Segment::EndCommand, "."),
+ (Segment::Newline, "\n"),
+ (Segment::Identifier, "FILE"),
+ (Segment::Spaces, " "),
+ (Segment::Comment, "/*"),
+ (Segment::Newline, "\n"),
+ (Segment::Comment, "/**/"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "lab"),
+ (Segment::Spaces, " "),
+ (Segment::UnquotedString, "not quoted here either"),
+ (Segment::Newline, "\n"),
+ (Segment::SeparateCommands, ""),
+ (Segment::Newline, "\n"),
+ (Segment::End, ""),
+ ],
+ &[
+ PromptStyle::First,
+ PromptStyle::Later,
+ PromptStyle::First,
+ PromptStyle::Later,
+ PromptStyle::Later,
+ PromptStyle::First,
+ ],
+ );
+}
+
+#[test]
+fn test_begin_data() {
+ check_segmentation(
+ r#"begin data.
+end data.
+
+begin data. /*
+123
+xxx
+end data.
+
+BEG /**/ DAT /*
+5 6 7 /* x
+
+end data
+end data
+.
+
+begin
+ data.
+data
+end data.
+
+begin data "xxx".
+begin data 123.
+not data
+"#,
+ Mode::Interactive,
+ &[
+ (Segment::Identifier, "begin"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "data"),
+ (Segment::EndCommand, "."),
+ (Segment::Newline, "\n"),
+ (Segment::Identifier, "end"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "data"),
+ (Segment::EndCommand, "."),
+ (Segment::Newline, "\n"),
+ (Segment::SeparateCommands, ""),
+ (Segment::Newline, "\n"),
+ (Segment::Identifier, "begin"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "data"),
+ (Segment::EndCommand, "."),
+ (Segment::Spaces, " "),
+ (Segment::Comment, "/*"),
+ (Segment::Newline, "\n"),
+ (Segment::InlineData, "123"),
+ (Segment::Newline, "\n"),
+ (Segment::InlineData, "xxx"),
+ (Segment::Newline, "\n"),
+ (Segment::Identifier, "end"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "data"),
+ (Segment::EndCommand, "."),
+ (Segment::Newline, "\n"),
+ (Segment::SeparateCommands, ""),
+ (Segment::Newline, "\n"),
+ (Segment::Identifier, "BEG"),
+ (Segment::Spaces, " "),
+ (Segment::Comment, "/**/"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "DAT"),
+ (Segment::Spaces, " "),
+ (Segment::Comment, "/*"),
+ (Segment::Newline, "\n"),
+ (Segment::InlineData, "5 6 7 /* x"),
+ (Segment::Newline, "\n"),
+ (Segment::InlineData, ""),
+ (Segment::Newline, "\n"),
+ (Segment::InlineData, "end data"),
+ (Segment::Newline, "\n"),
+ (Segment::Identifier, "end"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "data"),
+ (Segment::Newline, "\n"),
+ (Segment::StartCommand, "."),
+ (Segment::Newline, "\n"),
+ (Segment::SeparateCommands, ""),
+ (Segment::Newline, "\n"),
+ (Segment::Identifier, "begin"),
+ (Segment::Newline, "\n"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "data"),
+ (Segment::EndCommand, "."),
+ (Segment::Newline, "\n"),
+ (Segment::InlineData, "data"),
+ (Segment::Newline, "\n"),
+ (Segment::Identifier, "end"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "data"),
+ (Segment::EndCommand, "."),
+ (Segment::Newline, "\n"),
+ (Segment::SeparateCommands, ""),
+ (Segment::Newline, "\n"),
+ (Segment::Identifier, "begin"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "data"),
+ (Segment::Spaces, " "),
+ (Segment::QuotedString, "\"xxx\""),
+ (Segment::EndCommand, "."),
+ (Segment::Newline, "\n"),
+ (Segment::Identifier, "begin"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "data"),
+ (Segment::Spaces, " "),
+ (Segment::Number, "123"),
+ (Segment::EndCommand, "."),
+ (Segment::Newline, "\n"),
+ (Segment::Identifier, "not"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "data"),
+ (Segment::Newline, "\n"),
+ (Segment::End, ""),
+ ],
+ &[
+ PromptStyle::Data,
+ PromptStyle::First,
+ PromptStyle::First,
+ PromptStyle::Data,
+ PromptStyle::Data,
+ PromptStyle::Data,
+ PromptStyle::First,
+ PromptStyle::First,
+ PromptStyle::Data,
+ PromptStyle::Data,
+ PromptStyle::Data,
+ PromptStyle::Data,
+ PromptStyle::Later,
+ PromptStyle::First,
+ PromptStyle::First,
+ PromptStyle::Later,
+ PromptStyle::Data,
+ PromptStyle::Data,
+ PromptStyle::First,
+ PromptStyle::First,
+ PromptStyle::First,
+ PromptStyle::First,
+ PromptStyle::Later,
+ ],
+ );
+}
+
/// Interactive-mode segmentation of `DO REPEAT`: the body of a
/// `DO REPEAT`...`END REPEAT` range is passed through as whole
/// `DoRepeatCommand` lines (nested ranges included), and `END REPEAT`
/// only closes the range at the outermost nesting level.
#[test]
fn test_do_repeat() {
    check_segmentation(
        r#"do repeat x=a b c
 y=d e f.
 do repeat a=1 thru 5.
another command.
second command
+ third command.
end /* x */ /* y */ repeat print.
end
 repeat.
do
 repeat #a=1.
 inner command.
end repeat.
"#,
        Mode::Interactive,
        &[
            // The outer `do repeat` is tokenized normally...
            (Segment::Identifier, "do"),
            (Segment::Spaces, " "),
            (Segment::Identifier, "repeat"),
            (Segment::Spaces, " "),
            (Segment::Identifier, "x"),
            (Segment::Punct, "="),
            (Segment::Identifier, "a"),
            (Segment::Spaces, " "),
            (Segment::Identifier, "b"),
            (Segment::Spaces, " "),
            (Segment::Identifier, "c"),
            (Segment::Newline, "\n"),
            (Segment::Spaces, " "),
            (Segment::Identifier, "y"),
            (Segment::Punct, "="),
            (Segment::Identifier, "d"),
            (Segment::Spaces, " "),
            (Segment::Identifier, "e"),
            (Segment::Spaces, " "),
            (Segment::Identifier, "f"),
            (Segment::EndCommand, "."),
            (Segment::Newline, "\n"),
            // ...but everything inside the range, including the nested
            // `do repeat`, comes back as opaque whole lines.
            (Segment::DoRepeatCommand, " do repeat a=1 thru 5."),
            (Segment::Newline, "\n"),
            (Segment::DoRepeatCommand, "another command."),
            (Segment::Newline, "\n"),
            (Segment::DoRepeatCommand, "second command"),
            (Segment::Newline, "\n"),
            (Segment::DoRepeatCommand, "+ third command."),
            (Segment::Newline, "\n"),
            (
                Segment::DoRepeatCommand,
                "end /* x */ /* y */ repeat print.",
            ),
            (Segment::Newline, "\n"),
            // Outermost `end repeat` is tokenized normally again.
            (Segment::Identifier, "end"),
            (Segment::Newline, "\n"),
            (Segment::Spaces, " "),
            (Segment::Identifier, "repeat"),
            (Segment::EndCommand, "."),
            (Segment::Newline, "\n"),
            (Segment::Identifier, "do"),
            (Segment::Newline, "\n"),
            (Segment::Spaces, " "),
            (Segment::Identifier, "repeat"),
            (Segment::Spaces, " "),
            (Segment::Identifier, "#a"),
            (Segment::Punct, "="),
            (Segment::Number, "1"),
            (Segment::EndCommand, "."),
            (Segment::Newline, "\n"),
            (Segment::DoRepeatCommand, " inner command."),
            (Segment::Newline, "\n"),
            (Segment::Identifier, "end"),
            (Segment::Spaces, " "),
            (Segment::Identifier, "repeat"),
            (Segment::EndCommand, "."),
            (Segment::Newline, "\n"),
            (Segment::End, ""),
        ],
        &[
            PromptStyle::Later,
            PromptStyle::DoRepeat,
            PromptStyle::DoRepeat,
            PromptStyle::DoRepeat,
            PromptStyle::DoRepeat,
            PromptStyle::DoRepeat,
            PromptStyle::DoRepeat,
            PromptStyle::Later,
            PromptStyle::First,
            PromptStyle::Later,
            PromptStyle::DoRepeat,
            PromptStyle::DoRepeat,
            PromptStyle::First,
        ],
    );
}
+
/// Deeply nested `DO REPEAT`: levels past 255 are flagged with a
/// `DoRepeatOverflow` segment (NOTE(review): 255 matches the `i >= 255`
/// threshold below — confirm against the segmenter's documented limit),
/// and after unwinding, the surplus `end repeat` lines are tokenized as
/// ordinary commands.
#[test]
fn test_do_repeat_overflow() {
    const N: usize = 257;
    let do_repeat: Vec<String> = (0..N)
        .map(|i| format!("do repeat v{i}={i} thru {}.\n", i + 5))
        .collect();
    let end_repeat: Vec<String> = (0..N)
        .rev()
        .map(|i| format!("end repeat. /* {i}\n"))
        .collect();

    let s: String = do_repeat
        .iter()
        .chain(end_repeat.iter())
        .map(String::as_str)
        .collect();

    // The first `do repeat` is tokenized normally; the nested ones come
    // back as whole `DoRepeatCommand` lines.
    let mut expect_output = vec![
        (Segment::Identifier, "do"),
        (Segment::Spaces, " "),
        (Segment::Identifier, "repeat"),
        (Segment::Spaces, " "),
        (Segment::Identifier, "v0"),
        (Segment::Punct, "="),
        (Segment::Number, "0"),
        (Segment::Spaces, " "),
        (Segment::Identifier, "thru"),
        (Segment::Spaces, " "),
        (Segment::Number, "5"),
        (Segment::EndCommand, "."),
        (Segment::Newline, "\n"),
    ];
    for (i, line) in do_repeat.iter().enumerate().skip(1) {
        // `trim_end()` already yields `&str`; the original code took an
        // extra `&` here, relying on `&&str -> &str` deref coercion.
        expect_output.push((Segment::DoRepeatCommand, line.trim_end()));
        if i >= 255 {
            // Past the nesting limit the segmenter reports overflow.
            expect_output.push((Segment::DoRepeatOverflow, ""));
        }
        expect_output.push((Segment::Newline, "\n"));
    }
    // The first 254 `end repeat` lines just unwind nesting levels...
    for line in &end_repeat[..254] {
        expect_output.push((Segment::DoRepeatCommand, line.trim_end()));
        expect_output.push((Segment::Newline, "\n"));
    }
    // ...and the remaining ones are tokenized as ordinary commands with a
    // trailing comment.
    let comments: Vec<String> = (0..(N - 254)).rev().map(|i| format!("/* {i}")).collect();
    for comment in &comments {
        expect_output.extend([
            (Segment::Identifier, "end"),
            (Segment::Spaces, " "),
            (Segment::Identifier, "repeat"),
            (Segment::EndCommand, "."),
            (Segment::Spaces, " "),
            (Segment::Comment, comment),
            (Segment::Newline, "\n"),
        ]);
    }
    expect_output.push((Segment::End, ""));

    let expect_prompts: Vec<_> = (0..N * 2 - 3)
        .map(|_| PromptStyle::DoRepeat)
        .chain([PromptStyle::First, PromptStyle::First, PromptStyle::First])
        .collect();
    check_segmentation(&s, Mode::Interactive, &expect_output, &expect_prompts);
}
+
/// Batch-mode segmentation of `DO REPEAT`: commands end at the next
/// non-indented line instead of requiring `.` terminators, and the body
/// of the range is still passed through as whole `DoRepeatCommand` lines.
#[test]
fn test_do_repeat_batch() {
    check_segmentation(
        r#"do repeat x=a b c
 y=d e f
do repeat a=1 thru 5
another command
second command
+ third command
end /* x */ /* y */ repeat print
end
 repeat
do
 repeat #a=1

 inner command
end repeat
"#,
        Mode::Batch,
        &[
            (Segment::Identifier, "do"),
            (Segment::Spaces, " "),
            (Segment::Identifier, "repeat"),
            (Segment::Spaces, " "),
            (Segment::Identifier, "x"),
            (Segment::Punct, "="),
            (Segment::Identifier, "a"),
            (Segment::Spaces, " "),
            (Segment::Identifier, "b"),
            (Segment::Spaces, " "),
            (Segment::Identifier, "c"),
            (Segment::Newline, "\n"),
            (Segment::Spaces, " "),
            (Segment::Identifier, "y"),
            (Segment::Punct, "="),
            (Segment::Identifier, "d"),
            (Segment::Spaces, " "),
            (Segment::Identifier, "e"),
            (Segment::Spaces, " "),
            (Segment::Identifier, "f"),
            (Segment::Newline, "\n"),
            // In batch mode a non-indented line starts a new command.
            (Segment::StartCommand, ""),
            (Segment::DoRepeatCommand, "do repeat a=1 thru 5"),
            (Segment::Newline, "\n"),
            (Segment::DoRepeatCommand, "another command"),
            (Segment::Newline, "\n"),
            (Segment::DoRepeatCommand, "second command"),
            (Segment::Newline, "\n"),
            (Segment::DoRepeatCommand, "+ third command"),
            (Segment::Newline, "\n"),
            (Segment::DoRepeatCommand, "end /* x */ /* y */ repeat print"),
            (Segment::Newline, "\n"),
            (Segment::Identifier, "end"),
            (Segment::Newline, "\n"),
            (Segment::Spaces, " "),
            (Segment::Identifier, "repeat"),
            (Segment::Newline, "\n"),
            (Segment::StartCommand, ""),
            (Segment::Identifier, "do"),
            (Segment::Newline, "\n"),
            (Segment::Spaces, " "),
            (Segment::Identifier, "repeat"),
            (Segment::Spaces, " "),
            (Segment::Identifier, "#a"),
            (Segment::Punct, "="),
            (Segment::Number, "1"),
            (Segment::Newline, "\n"),
            // A blank line also separates commands in batch mode.
            (Segment::SeparateCommands, ""),
            (Segment::Newline, "\n"),
            (Segment::DoRepeatCommand, " inner command"),
            (Segment::Newline, "\n"),
            (Segment::Identifier, "end"),
            (Segment::Spaces, " "),
            (Segment::Identifier, "repeat"),
            (Segment::Newline, "\n"),
            (Segment::End, ""),
        ],
        &[
            PromptStyle::Later,
            PromptStyle::Later,
            PromptStyle::DoRepeat,
            PromptStyle::DoRepeat,
            PromptStyle::DoRepeat,
            PromptStyle::DoRepeat,
            PromptStyle::DoRepeat,
            PromptStyle::Later,
            PromptStyle::Later,
            PromptStyle::Later,
            PromptStyle::Later,
            PromptStyle::DoRepeat,
            PromptStyle::DoRepeat,
            PromptStyle::Later,
        ],
    );
}
+
/// Tests for segmentation of `DEFINE`...`!ENDDEFINE` macro definitions.
mod define {
    use crate::{
        lex::segment::{Mode, Segment},
        prompt::PromptStyle,
    };

    use super::check_segmentation;

    /// Body lines pass through verbatim, even text that only looks like
    /// `!enddefine` inside a quoted string.
    #[test]
    fn test_simple() {
        check_segmentation(
            r#"define !macro1()
var1 var2 var3 "!enddefine"
!enddefine.
"#,
            Mode::Interactive,
            &[
                (Segment::Identifier, "define"),
                (Segment::Spaces, " "),
                (Segment::MacroName, "!macro1"),
                (Segment::Punct, "("),
                (Segment::Punct, ")"),
                (Segment::Newline, "\n"),
                (Segment::MacroBody, "var1 var2 var3 \"!enddefine\""),
                (Segment::Newline, "\n"),
                (Segment::Identifier, "!enddefine"),
                (Segment::EndCommand, "."),
                (Segment::Newline, "\n"),
                (Segment::End, ""),
            ],
            &[PromptStyle::Define, PromptStyle::Define, PromptStyle::First],
        );
    }

    /// The body may start on the same line as the parameter list; a
    /// comment-opener in the body is taken literally.
    #[test]
    fn test_no_newline_after_parentheses() {
        check_segmentation(
            r#"define !macro1() var1 var2 var3 /* !enddefine
!enddefine.
"#,
            Mode::Interactive,
            &[
                (Segment::Identifier, "define"),
                (Segment::Spaces, " "),
                (Segment::MacroName, "!macro1"),
                (Segment::Punct, "("),
                (Segment::Punct, ")"),
                (Segment::MacroBody, " var1 var2 var3 /* !enddefine"),
                (Segment::Newline, "\n"),
                (Segment::Identifier, "!enddefine"),
                (Segment::EndCommand, "."),
                (Segment::Newline, "\n"),
                (Segment::End, ""),
            ],
            &[PromptStyle::Define, PromptStyle::First],
        );
    }

    /// `!enddefine` is recognized even with no whitespace before it.
    #[test]
    fn test_no_newline_before_enddefine() {
        check_segmentation(
            r#"define !macro1()
var1 var2 var3!enddefine.
"#,
            Mode::Interactive,
            &[
                (Segment::Identifier, "define"),
                (Segment::Spaces, " "),
                (Segment::MacroName, "!macro1"),
                (Segment::Punct, "("),
                (Segment::Punct, ")"),
                (Segment::Newline, "\n"),
                (Segment::MacroBody, "var1 var2 var3"),
                (Segment::Identifier, "!enddefine"),
                (Segment::EndCommand, "."),
                (Segment::Newline, "\n"),
                (Segment::End, ""),
            ],
            &[PromptStyle::Define, PromptStyle::First],
        );
    }

    /// A complete definition squeezed onto a single line.
    #[test]
    fn test_all_on_one_line() {
        check_segmentation(
            r#"define !macro1()var1 var2 var3!enddefine.
"#,
            Mode::Interactive,
            &[
                (Segment::Identifier, "define"),
                (Segment::Spaces, " "),
                (Segment::MacroName, "!macro1"),
                (Segment::Punct, "("),
                (Segment::Punct, ")"),
                (Segment::MacroBody, "var1 var2 var3"),
                (Segment::Identifier, "!enddefine"),
                (Segment::EndCommand, "."),
                (Segment::Newline, "\n"),
                (Segment::End, ""),
            ],
            &[PromptStyle::First],
        );
    }

    /// A macro with an empty body produces no `MacroBody` segment.
    #[test]
    fn test_empty() {
        check_segmentation(
            r#"define !macro1()
!enddefine.
"#,
            Mode::Interactive,
            &[
                (Segment::Identifier, "define"),
                (Segment::Spaces, " "),
                (Segment::MacroName, "!macro1"),
                (Segment::Punct, "("),
                (Segment::Punct, ")"),
                (Segment::Newline, "\n"),
                (Segment::Identifier, "!enddefine"),
                (Segment::EndCommand, "."),
                (Segment::Newline, "\n"),
                (Segment::End, ""),
            ],
            &[PromptStyle::Define, PromptStyle::First],
        );
    }

    /// Blank lines inside the body become empty `MacroBody` segments,
    /// not command separators.
    #[test]
    fn test_blank_lines() {
        check_segmentation(
            r#"define !macro1()


!enddefine.
"#,
            Mode::Interactive,
            &[
                (Segment::Identifier, "define"),
                (Segment::Spaces, " "),
                (Segment::MacroName, "!macro1"),
                (Segment::Punct, "("),
                (Segment::Punct, ")"),
                (Segment::Newline, "\n"),
                (Segment::MacroBody, ""),
                (Segment::Newline, "\n"),
                (Segment::MacroBody, ""),
                (Segment::Newline, "\n"),
                (Segment::Identifier, "!enddefine"),
                (Segment::EndCommand, "."),
                (Segment::Newline, "\n"),
                (Segment::End, ""),
            ],
            &[
                PromptStyle::Define,
                PromptStyle::Define,
                PromptStyle::Define,
                PromptStyle::First,
            ],
        );
    }

    /// Macro parameters are tokenized individually inside the outer
    /// parentheses.
    #[test]
    fn test_arguments() {
        check_segmentation(
            r#"define !macro1(a(), b(), c())
!enddefine.
"#,
            Mode::Interactive,
            &[
                (Segment::Identifier, "define"),
                (Segment::Spaces, " "),
                (Segment::MacroName, "!macro1"),
                (Segment::Punct, "("),
                (Segment::Identifier, "a"),
                (Segment::Punct, "("),
                (Segment::Punct, ")"),
                (Segment::Punct, ","),
                (Segment::Spaces, " "),
                (Segment::Identifier, "b"),
                (Segment::Punct, "("),
                (Segment::Punct, ")"),
                (Segment::Punct, ","),
                (Segment::Spaces, " "),
                (Segment::Identifier, "c"),
                (Segment::Punct, "("),
                (Segment::Punct, ")"),
                (Segment::Punct, ")"),
                (Segment::Newline, "\n"),
                (Segment::Identifier, "!enddefine"),
                (Segment::EndCommand, "."),
                (Segment::Newline, "\n"),
                (Segment::End, ""),
            ],
            &[PromptStyle::Define, PromptStyle::First],
        );
    }

    /// The parameter list may span several lines; prompts stay `Later`
    /// until the closing parenthesis.
    #[test]
    fn test_multiline_arguments() {
        check_segmentation(
            r#"define !macro1(
 a(), b(
 ),
 c()
)
!enddefine.
"#,
            Mode::Interactive,
            &[
                (Segment::Identifier, "define"),
                (Segment::Spaces, " "),
                (Segment::MacroName, "!macro1"),
                (Segment::Punct, "("),
                (Segment::Newline, "\n"),
                (Segment::Spaces, " "),
                (Segment::Identifier, "a"),
                (Segment::Punct, "("),
                (Segment::Punct, ")"),
                (Segment::Punct, ","),
                (Segment::Spaces, " "),
                (Segment::Identifier, "b"),
                (Segment::Punct, "("),
                (Segment::Newline, "\n"),
                (Segment::Spaces, " "),
                (Segment::Punct, ")"),
                (Segment::Punct, ","),
                (Segment::Newline, "\n"),
                (Segment::Spaces, " "),
                (Segment::Identifier, "c"),
                (Segment::Punct, "("),
                (Segment::Punct, ")"),
                (Segment::Newline, "\n"),
                (Segment::Punct, ")"),
                (Segment::Newline, "\n"),
                (Segment::Identifier, "!enddefine"),
                (Segment::EndCommand, "."),
                (Segment::Newline, "\n"),
                (Segment::End, ""),
            ],
            &[
                PromptStyle::Later,
                PromptStyle::Later,
                PromptStyle::Later,
                PromptStyle::Later,
                PromptStyle::Define,
                PromptStyle::First,
            ],
        );
    }

    /// The parameter list may start on the line after the macro name.
    #[test]
    fn test_arguments_start_on_second_line() {
        check_segmentation(
            r#"define !macro1
(x,y,z
)
content 1
content 2
!enddefine.
"#,
            Mode::Interactive,
            &[
                (Segment::Identifier, "define"),
                (Segment::Spaces, " "),
                (Segment::MacroName, "!macro1"),
                (Segment::Newline, "\n"),
                (Segment::Punct, "("),
                (Segment::Identifier, "x"),
                (Segment::Punct, ","),
                (Segment::Identifier, "y"),
                (Segment::Punct, ","),
                (Segment::Identifier, "z"),
                (Segment::Newline, "\n"),
                (Segment::Punct, ")"),
                (Segment::Newline, "\n"),
                (Segment::MacroBody, "content 1"),
                (Segment::Newline, "\n"),
                (Segment::MacroBody, "content 2"),
                (Segment::Newline, "\n"),
                (Segment::Identifier, "!enddefine"),
                (Segment::EndCommand, "."),
                (Segment::Newline, "\n"),
                (Segment::End, ""),
            ],
            &[
                PromptStyle::Later,
                PromptStyle::Later,
                PromptStyle::Define,
                PromptStyle::Define,
                PromptStyle::Define,
                PromptStyle::First,
            ],
        );
    }

    /// A `.` right after the macro name ends the `DEFINE` command early;
    /// the next line is an ordinary command.
    #[test]
    fn test_early_end_of_command_1() {
        check_segmentation(
            r#"define !macro1.
data list /x 1.
"#,
            Mode::Interactive,
            &[
                (Segment::Identifier, "define"),
                (Segment::Spaces, " "),
                (Segment::MacroName, "!macro1"),
                (Segment::EndCommand, "."),
                (Segment::Newline, "\n"),
                (Segment::Identifier, "data"),
                (Segment::Spaces, " "),
                (Segment::Identifier, "list"),
                (Segment::Spaces, " "),
                (Segment::Punct, "/"),
                (Segment::Identifier, "x"),
                (Segment::Spaces, " "),
                (Segment::Number, "1"),
                (Segment::EndCommand, "."),
                (Segment::Newline, "\n"),
                (Segment::End, ""),
            ],
            &[PromptStyle::First, PromptStyle::First],
        );
    }

    /// A `.` on the (would-be) parameter-list line also ends `DEFINE`
    /// early.
    #[test]
    fn test_early_end_of_command_2() {
        check_segmentation(
            r#"define !macro1
x.
data list /x 1.
"#,
            Mode::Interactive,
            &[
                (Segment::Identifier, "define"),
                (Segment::Spaces, " "),
                (Segment::MacroName, "!macro1"),
                (Segment::Newline, "\n"),
                (Segment::Identifier, "x"),
                (Segment::EndCommand, "."),
                (Segment::Newline, "\n"),
                (Segment::Identifier, "data"),
                (Segment::Spaces, " "),
                (Segment::Identifier, "list"),
                (Segment::Spaces, " "),
                (Segment::Punct, "/"),
                (Segment::Identifier, "x"),
                (Segment::Spaces, " "),
                (Segment::Number, "1"),
                (Segment::EndCommand, "."),
                (Segment::Newline, "\n"),
                (Segment::End, ""),
            ],
            &[PromptStyle::Later, PromptStyle::First, PromptStyle::First],
        );
    }

    /// A `.` inside an unclosed parameter list ends `DEFINE` early too.
    #[test]
    fn test_early_end_of_command_3() {
        check_segmentation(
            r#"define !macro1(.
x.
data list /x 1.
"#,
            Mode::Interactive,
            &[
                (Segment::Identifier, "define"),
                (Segment::Spaces, " "),
                (Segment::MacroName, "!macro1"),
                (Segment::Punct, "("),
                (Segment::EndCommand, "."),
                (Segment::Newline, "\n"),
                (Segment::Identifier, "x"),
                (Segment::EndCommand, "."),
                (Segment::Newline, "\n"),
                (Segment::Identifier, "data"),
                (Segment::Spaces, " "),
                (Segment::Identifier, "list"),
                (Segment::Spaces, " "),
                (Segment::Punct, "/"),
                (Segment::Identifier, "x"),
                (Segment::Spaces, " "),
                (Segment::Number, "1"),
                (Segment::EndCommand, "."),
                (Segment::Newline, "\n"),
                (Segment::End, ""),
            ],
            &[PromptStyle::First, PromptStyle::First, PromptStyle::First],
        );
    }

    // NOTE(review): this case uses the same input as
    // test_early_end_of_command_1 — presumably kept for parity with the
    // upstream C test suite; confirm whether it can be merged.
    #[test]
    fn test_early_end_of_command_4() {
        // Notice the command terminator at the end of the `DEFINE` command,
        // which should not be there and ends it early.
        check_segmentation(
            r#"define !macro1.
data list /x 1.
"#,
            Mode::Interactive,
            &[
                (Segment::Identifier, "define"),
                (Segment::Spaces, " "),
                (Segment::MacroName, "!macro1"),
                (Segment::EndCommand, "."),
                (Segment::Newline, "\n"),
                (Segment::Identifier, "data"),
                (Segment::Spaces, " "),
                (Segment::Identifier, "list"),
                (Segment::Spaces, " "),
                (Segment::Punct, "/"),
                (Segment::Identifier, "x"),
                (Segment::Spaces, " "),
                (Segment::Number, "1"),
                (Segment::EndCommand, "."),
                (Segment::Newline, "\n"),
                (Segment::End, ""),
            ],
            &[PromptStyle::First, PromptStyle::First],
        );
    }

    /// Input ending inside a macro body: prompts stay `Define` to the end.
    #[test]
    fn test_missing_enddefine() {
        check_segmentation(
            r#"define !macro1()
content line 1
content line 2
"#,
            Mode::Interactive,
            &[
                (Segment::Identifier, "define"),
                (Segment::Spaces, " "),
                (Segment::MacroName, "!macro1"),
                (Segment::Punct, "("),
                (Segment::Punct, ")"),
                (Segment::Newline, "\n"),
                (Segment::MacroBody, "content line 1"),
                (Segment::Newline, "\n"),
                (Segment::MacroBody, "content line 2"),
                (Segment::Newline, "\n"),
                (Segment::End, ""),
            ],
            &[
                PromptStyle::Define,
                PromptStyle::Define,
                PromptStyle::Define,
            ],
        );
    }

    /// Input ending right after the parameter list.
    #[test]
    fn test_missing_enddefine_2() {
        check_segmentation(
            r#"define !macro1()
"#,
            Mode::Interactive,
            &[
                (Segment::Identifier, "define"),
                (Segment::Spaces, " "),
                (Segment::MacroName, "!macro1"),
                (Segment::Punct, "("),
                (Segment::Punct, ")"),
                (Segment::Newline, "\n"),
                (Segment::End, ""),
            ],
            &[PromptStyle::Define],
        );
    }
}
+
/// Batch-mode command separation: a command continues across indented
/// lines and ends at the next non-indented line, a leading `+`, a blank
/// line, or a `.` terminator.
#[test]
fn test_batch_mode() {
    check_segmentation(
        r#"first command
 another line of first command
+ second command
third command

fourth command.
 fifth command.
"#,
        Mode::Batch,
        &[
            (Segment::Identifier, "first"),
            (Segment::Spaces, " "),
            (Segment::Identifier, "command"),
            (Segment::Newline, "\n"),
            (Segment::Spaces, " "),
            (Segment::Identifier, "another"),
            (Segment::Spaces, " "),
            (Segment::Identifier, "line"),
            (Segment::Spaces, " "),
            (Segment::Identifier, "of"),
            (Segment::Spaces, " "),
            (Segment::Identifier, "first"),
            (Segment::Spaces, " "),
            (Segment::Identifier, "command"),
            (Segment::Newline, "\n"),
            // `+` in the first column explicitly starts a new command.
            (Segment::StartCommand, "+"),
            (Segment::Spaces, " "),
            (Segment::Identifier, "second"),
            (Segment::Spaces, " "),
            (Segment::Identifier, "command"),
            (Segment::Newline, "\n"),
            // A non-indented line implicitly starts a new command.
            (Segment::StartCommand, ""),
            (Segment::Identifier, "third"),
            (Segment::Spaces, " "),
            (Segment::Identifier, "command"),
            (Segment::Newline, "\n"),
            // A blank line separates commands.
            (Segment::SeparateCommands, ""),
            (Segment::Newline, "\n"),
            (Segment::Identifier, "fourth"),
            (Segment::Spaces, " "),
            (Segment::Identifier, "command"),
            (Segment::EndCommand, "."),
            (Segment::Newline, "\n"),
            (Segment::Spaces, " "),
            (Segment::Identifier, "fifth"),
            (Segment::Spaces, " "),
            (Segment::Identifier, "command"),
            (Segment::EndCommand, "."),
            (Segment::Newline, "\n"),
            (Segment::End, ""),
        ],
        &[
            PromptStyle::Later,
            PromptStyle::Later,
            PromptStyle::Later,
            PromptStyle::Later,
            PromptStyle::First,
            PromptStyle::First,
            PromptStyle::First,
        ],
    );
}
+
/// Auto mode: behaves like interactive mode until a line gives batch-mode
/// evidence (a line starting with a digit or `+`, or a known
/// command-opening keyword), after which batch-style command starts are
/// recognized — NOTE(review): exact trigger rules inferred from the
/// expectations below; confirm against the segmenter's docs.
#[test]
fn test_auto_mode() {
    check_segmentation(
        r#"command
 another line of command
2sls
+ another command
another line of second command
data list /x 1
aggregate.
print eject.
twostep cluster


fourth command.
 fifth command.
"#,
        Mode::Auto,
        &[
            (Segment::Identifier, "command"),
            (Segment::Newline, "\n"),
            (Segment::Spaces, " "),
            (Segment::Identifier, "another"),
            (Segment::Spaces, " "),
            (Segment::Identifier, "line"),
            (Segment::Spaces, " "),
            (Segment::Identifier, "of"),
            (Segment::Spaces, " "),
            (Segment::Identifier, "command"),
            (Segment::Newline, "\n"),
            (Segment::StartCommand, ""),
            (Segment::Number, "2"),
            (Segment::Identifier, "sls"),
            (Segment::Newline, "\n"),
            (Segment::StartCommand, "+"),
            (Segment::Spaces, " "),
            (Segment::Identifier, "another"),
            (Segment::Spaces, " "),
            (Segment::Identifier, "command"),
            (Segment::Newline, "\n"),
            (Segment::Identifier, "another"),
            (Segment::Spaces, " "),
            (Segment::Identifier, "line"),
            (Segment::Spaces, " "),
            (Segment::Identifier, "of"),
            (Segment::Spaces, " "),
            (Segment::Identifier, "second"),
            (Segment::Spaces, " "),
            (Segment::Identifier, "command"),
            (Segment::Newline, "\n"),
            (Segment::StartCommand, ""),
            (Segment::Identifier, "data"),
            (Segment::Spaces, " "),
            (Segment::Identifier, "list"),
            (Segment::Spaces, " "),
            (Segment::Punct, "/"),
            (Segment::Identifier, "x"),
            (Segment::Spaces, " "),
            (Segment::Number, "1"),
            (Segment::Newline, "\n"),
            (Segment::StartCommand, ""),
            (Segment::Identifier, "aggregate"),
            (Segment::EndCommand, "."),
            (Segment::Newline, "\n"),
            (Segment::Identifier, "print"),
            (Segment::Spaces, " "),
            (Segment::Identifier, "eject"),
            (Segment::EndCommand, "."),
            (Segment::Newline, "\n"),
            (Segment::Identifier, "twostep"),
            (Segment::Spaces, " "),
            (Segment::Identifier, "cluster"),
            (Segment::Newline, "\n"),
            (Segment::SeparateCommands, ""),
            (Segment::Newline, "\n"),
            (Segment::SeparateCommands, ""),
            (Segment::Newline, "\n"),
            (Segment::Identifier, "fourth"),
            (Segment::Spaces, " "),
            (Segment::Identifier, "command"),
            (Segment::EndCommand, "."),
            (Segment::Newline, "\n"),
            (Segment::Spaces, " "),
            (Segment::Identifier, "fifth"),
            (Segment::Spaces, " "),
            (Segment::Identifier, "command"),
            (Segment::EndCommand, "."),
            (Segment::Newline, "\n"),
            (Segment::End, ""),
        ],
        &[
            PromptStyle::Later,
            PromptStyle::Later,
            PromptStyle::Later,
            PromptStyle::Later,
            PromptStyle::Later,
            PromptStyle::Later,
            PromptStyle::First,
            PromptStyle::First,
            PromptStyle::Later,
            PromptStyle::First,
            PromptStyle::First,
            PromptStyle::First,
            PromptStyle::First,
        ],
    );
}
--- /dev/null
+use std::fmt::{Display, Formatter, Result as FmtResult};
+
+use crate::identifier::Identifier;
+
/// A lexical token produced by the lexer.
#[derive(Clone, Debug, PartialEq)]
pub enum Token {
    /// End of input.
    End,

    /// Identifier.
    Id(Identifier),

    /// Number.
    ///
    /// All numeric values are carried as `f64`.
    Number(f64),

    /// Quoted string.
    String(String),

    /// Command terminator or separator.
    ///
    /// Usually this is `.`, but a blank line also separates commands, and in
    /// batch mode any line that begins with a non-blank starts a new command.
    EndCommand,

    /// Operators, punctuators, and reserved words.
    Punct(Punct),
}
+
+impl Token {
+ pub fn id(&self) -> Option<&Identifier> {
+ match self {
+ Self::Id(identifier) => Some(identifier),
+ _ => None,
+ }
+ }
+}
+
/// Reports whether `c` may appear literally in quoted-string output:
/// any non-control character, plus the whitespace controls tab,
/// carriage return, and newline.
fn is_printable(c: char) -> bool {
    matches!(c, '\t' | '\r' | '\n') || !c.is_control()
}
+
/// Writes `s` to `f` wrapped in `quote` characters, doubling every
/// occurrence of `quote` inside `s` (SQL-style quote escaping).
fn string_representation(s: &str, quote: char, f: &mut Formatter<'_>) -> FmtResult {
    write!(f, "{quote}")?;
    for c in s.chars() {
        if c == quote {
            // An embedded quote is escaped by writing it twice.
            write!(f, "{quote}{quote}")?;
        } else {
            write!(f, "{c}")?;
        }
    }
    write!(f, "{quote}")
}
+
+impl Display for Token {
+ fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult {
+ match self {
+ Token::End => Ok(()),
+ Token::Id(s) => write!(f, "{s}"),
+ Token::Number(number) => {
+ if number.is_sign_negative() {
+ write!(f, "-{}", number.abs())
+ } else {
+ write!(f, "{number}")
+ }
+ }
+ Token::String(s) => {
+ if s.chars().all(|c| is_printable(c)) {
+ if s.contains('"') {
+ string_representation(s, '\'', f)
+ } else {
+ string_representation(s, '"', f)
+ }
+ } else {
+ write!(f, "X\"")?;
+ for byte in s.bytes() {
+ let c1 = char::from_digit((byte >> 4) as u32, 16)
+ .unwrap()
+ .to_ascii_uppercase();
+ let c2 = char::from_digit((byte & 0xf) as u32, 16)
+ .unwrap()
+ .to_ascii_uppercase()
+ .to_ascii_lowercase();
+ write!(f, "{c1}{c2}")?;
+ }
+ write!(f, "\"")
+ }
+ }
+ Token::EndCommand => write!(f, "."),
+ Token::Punct(punct) => punct.fmt(f),
+ }
+ }
+}
+
/// Tests for [Token]'s `Display` implementation.
#[cfg(test)]
mod test {
    use crate::lex::token::Token;

    /// Printable strings use the quoted form; strings with unprintable
    /// characters use the hex `X"..."` form (uppercase digits).
    #[test]
    fn test_string() {
        assert_eq!(Token::String(String::from("abc")).to_string(), "\"abc\"");
        assert_eq!(
            Token::String(String::from("\u{0080}")).to_string(),
            "X\"C280\""
        );
    }

    /// Check that all negative numbers, even -0, get formatted with a
    /// leading `-`.
    #[test]
    fn test_neg0() {
        assert_eq!(Token::Number(-0.0).to_string(), "-0");
    }
}
+
/// Operators, punctuators, and reserved words.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum Punct {
    /// `+`.
    Plus,

    /// `-`.
    Dash,

    /// `*`.
    Asterisk,

    /// `/`.
    Slash,

    /// `=`.
    Equals,

    /// `(`.
    LParen,

    /// `)`.
    RParen,

    /// `[`.
    LSquare,

    /// `]`.
    RSquare,

    /// `{`.
    LCurly,

    /// `}`.
    RCurly,

    /// `,`.
    Comma,

    /// `;`.
    Semicolon,

    /// `:`.
    Colon,

    /// `AND` or `&`.
    And,

    /// `OR` or `|`.
    Or,

    /// `NOT` or `~`.
    Not,

    /// `EQ` or `=`.
    Eq,

    /// `GE` or `>=`.
    Ge,

    /// `GT` or `>`.
    Gt,

    /// `LE` or `<=`.
    Le,

    /// `LT` or `<`.
    Lt,

    /// `NE` or `~=` or `<>`.
    Ne,

    /// `ALL`.
    All,

    /// `BY`.
    By,

    /// `TO`.
    To,

    /// `WITH`.
    With,

    /// `**`.
    Exp,

    /// `!` (only appears in macros).
    Bang,

    /// `%` (only appears in macros).
    Percent,

    /// `?` (only appears in macros).
    Question,

    /// `` ` `` (only appears in macros).
    Backtick,

    /// `.`.
    ///
    /// This represents a dot in the middle of a line by itself, where it does not end a command.
    Dot,

    /// `_` (only appears in macros).
    ///
    /// Although underscores may appear within identifiers, they can't be the
    /// first character, so this represents an underscore found on its own.
    Underscore,

    /// `!*` (only appears in macros).
    BangAsterisk,
}
+
impl Punct {
    /// Returns the canonical textual form of this punctuator.
    ///
    /// Tokens that are reserved words (`AND`, `OR`, `NOT`, `EQ`, `ALL`,
    /// `BY`, `TO`, `WITH`) render as the uppercase word even when a
    /// symbolic spelling also exists, while the relational operators
    /// `Ge`..`Ne` render symbolically — NOTE(review): this asymmetry looks
    /// deliberate but is worth confirming against the lexer's output
    /// conventions.
    pub fn as_str(&self) -> &'static str {
        match self {
            Self::Plus => "+",
            Self::Dash => "-",
            Self::Asterisk => "*",
            Self::Slash => "/",
            Self::Equals => "=",
            Self::LParen => "(",
            Self::RParen => ")",
            Self::LSquare => "[",
            Self::RSquare => "]",
            Self::LCurly => "{",
            Self::RCurly => "}",
            Self::Comma => ",",
            Self::Semicolon => ";",
            Self::Colon => ":",
            Self::And => "AND",
            Self::Or => "OR",
            Self::Not => "NOT",
            Self::Eq => "EQ",
            Self::Ge => ">=",
            Self::Gt => ">",
            Self::Le => "<=",
            Self::Lt => "<",
            Self::Ne => "~=",
            Self::All => "ALL",
            Self::By => "BY",
            Self::To => "TO",
            Self::With => "WITH",
            Self::Exp => "**",
            Self::Bang => "!",
            Self::Percent => "%",
            Self::Question => "?",
            Self::Backtick => "`",
            Self::Dot => ".",
            Self::Underscore => "_",
            Self::BangAsterisk => "!*",
        }
    }
}
+impl Display for Punct {
+ fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult {
+ write!(f, "{}", self.as_str())
+ }
+}
--- /dev/null
+#[allow(unused_variables, unused_mut, dead_code)]
+pub mod cooked;
+pub mod dictionary;
+pub mod encoding;
+pub mod endian;
+pub mod format;
+pub mod identifier;
+pub mod locale_charset;
+pub mod output;
+#[allow(unused_variables, unused_mut, dead_code)]
+pub mod raw;
+pub mod sack;
+pub mod lex;
+pub mod prompt;
+pub mod message;
+pub mod macros;
+pub mod settings;
+pub mod command;
+pub mod integer;
+pub mod engine;
--- /dev/null
+// Determine a canonical name for the current locale's character encoding.
+//
+// Copyright (C) 2000-2006, 2008-2023 Free Software Foundation, Inc.
+//
+// This file is free software: you can redistribute it and/or modify it under
+// the terms of the GNU Lesser General Public License as published by the Free
+// Software Foundation; either version 2.1 of the License, or (at your option)
+// any later version.
+//
+// This file is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
+// A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
+// details.
+//
+// You should have received a copy of the GNU Lesser General Public License
+// along with this program. If not, see <https://www.gnu.org/licenses/>.
+//
+// Written by Bruno Haible <bruno@clisp.org>. Translated to Rust by Ben Pfaff
+// <blp@cs.stanford.edu>.
+
+use lazy_static::lazy_static;
+
/// Maps a platform-specific character-encoding name `s` to a canonical
/// name, following the alias tables from gnulib's `localcharset` (see the
/// file header).  Only the table for the compile-time target OS is built;
/// names with no alias entry are returned unchanged.
fn map_aliases(s: &str) -> &'static str {
    #[cfg(target_os = "freebsd")]
    match s {
        "ARMSCII-8" => return "ARMSCII-8",
        "Big5" => return "BIG5",
        "C" => return "ASCII",
        "CP1131" => return "CP1131",
        "CP1251" => return "CP1251",
        "CP866" => return "CP866",
        "GB18030" => return "GB18030",
        "GB2312" => return "GB2312",
        "GBK" => return "GBK",
        // "?" — no canonical name exists for this encoding in the table.
        "ISCII-DEV" => return "?",
        "ISO8859-1" => return "ISO-8859-1",
        "ISO8859-13" => return "ISO-8859-13",
        "ISO8859-15" => return "ISO-8859-15",
        "ISO8859-2" => return "ISO-8859-2",
        "ISO8859-5" => return "ISO-8859-5",
        "ISO8859-7" => return "ISO-8859-7",
        "ISO8859-9" => return "ISO-8859-9",
        "KOI8-R" => return "KOI8-R",
        "KOI8-U" => return "KOI8-U",
        "SJIS" => return "SHIFT_JIS",
        "US-ASCII" => return "ASCII",
        "eucCN" => return "GB2312",
        "eucJP" => return "EUC-JP",
        "eucKR" => return "EUC-KR",
        _ => (),
    };

    #[cfg(target_os = "netbsd")]
    match s {
        "646" => return "ASCII",
        "ARMSCII-8" => return "ARMSCII-8",
        "BIG5" => return "BIG5",
        "Big5-HKSCS" => return "BIG5-HKSCS",
        "CP1251" => return "CP1251",
        "CP866" => return "CP866",
        "GB18030" => return "GB18030",
        "GB2312" => return "GB2312",
        "ISO8859-1" => return "ISO-8859-1",
        "ISO8859-13" => return "ISO-8859-13",
        "ISO8859-15" => return "ISO-8859-15",
        "ISO8859-2" => return "ISO-8859-2",
        "ISO8859-4" => return "ISO-8859-4",
        "ISO8859-5" => return "ISO-8859-5",
        "ISO8859-7" => return "ISO-8859-7",
        "KOI8-R" => return "KOI8-R",
        "KOI8-U" => return "KOI8-U",
        "PT154" => return "PT154",
        "SJIS" => return "SHIFT_JIS",
        "eucCN" => return "GB2312",
        "eucJP" => return "EUC-JP",
        "eucKR" => return "EUC-KR",
        "eucTW" => return "EUC-TW",
        _ => (),
    };

    #[cfg(target_os = "openbsd")]
    match s {
        "646" => return "ASCII",
        "ISO8859-1" => return "ISO-8859-1",
        "ISO8859-13" => return "ISO-8859-13",
        "ISO8859-15" => return "ISO-8859-15",
        "ISO8859-2" => return "ISO-8859-2",
        "ISO8859-4" => return "ISO-8859-4",
        "ISO8859-5" => return "ISO-8859-5",
        "ISO8859-7" => return "ISO-8859-7",
        "US-ASCII" => return "ASCII",
        _ => (),
    };

    /* Darwin 7.5 has nl_langinfo(CODESET), but sometimes its value is
    useless:
    - It returns the empty string when LANG is set to a locale of the
      form ll_CC, although ll_CC/LC_CTYPE is a symlink to an UTF-8
      LC_CTYPE file.
    - The environment variables LANG, LC_CTYPE, LC_ALL are not set by
      the system; nl_langinfo(CODESET) returns "US-ASCII" in this case.
    - The documentation says:
        "... all code that calls BSD system routines should ensure
         that the const *char parameters of these routines are in UTF-8
         encoding. All BSD system functions expect their string
         parameters to be in UTF-8 encoding and nothing else."
      It also says
        "An additional caveat is that string parameters for files,
         paths, and other file-system entities must be in canonical
         UTF-8. In a canonical UTF-8 Unicode string, all decomposable
         characters are decomposed ..."
      but this is not true: You can pass non-decomposed UTF-8 strings
      to file system functions, and it is the OS which will convert
      them to decomposed UTF-8 before accessing the file system.
    - The Apple Terminal application displays UTF-8 by default.
    - However, other applications are free to use different encodings:
      - xterm uses ISO-8859-1 by default.
      - TextEdit uses MacRoman by default.
    We prefer UTF-8 over decomposed UTF-8-MAC because one should
    minimize the use of decomposed Unicode. Unfortunately, through the
    Darwin file system, decomposed UTF-8 strings are leaked into user
    space nevertheless.
    Then there are also the locales with encodings other than US-ASCII
    and UTF-8. These locales can be occasionally useful to users (e.g.
    when grepping through ISO-8859-1 encoded text files), when all their
    file names are in US-ASCII.
    */

    #[cfg(target_os = "macos")]
    match s {
        "ARMSCII-8" => return "ARMSCII-8",
        "Big5" => return "BIG5",
        "Big5HKSCS" => return "BIG5-HKSCS",
        "CP1131" => return "CP1131",
        "CP1251" => return "CP1251",
        "CP866" => return "CP866",
        "CP949" => return "CP949",
        "GB18030" => return "GB18030",
        "GB2312" => return "GB2312",
        "GBK" => return "GBK",
        "ISO8859-1" => return "ISO-8859-1",
        "ISO8859-13" => return "ISO-8859-13",
        "ISO8859-15" => return "ISO-8859-15",
        "ISO8859-2" => return "ISO-8859-2",
        "ISO8859-4" => return "ISO-8859-4",
        "ISO8859-5" => return "ISO-8859-5",
        "ISO8859-7" => return "ISO-8859-7",
        "ISO8859-9" => return "ISO-8859-9",
        "KOI8-R" => return "KOI8-R",
        "KOI8-U" => return "KOI8-U",
        "PT154" => return "PT154",
        "SJIS" => return "SHIFT_JIS",
        "eucCN" => return "GB2312",
        "eucJP" => return "EUC-JP",
        "eucKR" => return "EUC-KR",
        _ => (),
    };

    #[cfg(target_os = "aix")]
    match s {
        "GBK" => return "GBK",
        "IBM-1046" => return "CP1046",
        "IBM-1124" => return "CP1124",
        "IBM-1129" => return "CP1129",
        "IBM-1252" => return "CP1252",
        "IBM-850" => return "CP850",
        "IBM-856" => return "CP856",
        "IBM-921" => return "ISO-8859-13",
        "IBM-922" => return "CP922",
        "IBM-932" => return "CP932",
        "IBM-943" => return "CP943",
        "IBM-eucCN" => return "GB2312",
        "IBM-eucJP" => return "EUC-JP",
        "IBM-eucKR" => return "EUC-KR",
        "IBM-eucTW" => return "EUC-TW",
        "ISO8859-1" => return "ISO-8859-1",
        "ISO8859-15" => return "ISO-8859-15",
        "ISO8859-2" => return "ISO-8859-2",
        "ISO8859-5" => return "ISO-8859-5",
        "ISO8859-6" => return "ISO-8859-6",
        "ISO8859-7" => return "ISO-8859-7",
        "ISO8859-8" => return "ISO-8859-8",
        "ISO8859-9" => return "ISO-8859-9",
        "TIS-620" => return "TIS-620",
        "UTF-8" => return "UTF-8",
        "big5" => return "BIG5",
        _ => (),
    };

    // On Windows the input is a "CPnnnn" code-page name.
    #[cfg(windows)]
    match s {
        "CP1361" => return "JOHAB",
        "CP20127" => return "ASCII",
        "CP20866" => return "KOI8-R",
        "CP20936" => return "GB2312",
        "CP21866" => return "KOI8-RU",
        "CP28591" => return "ISO-8859-1",
        "CP28592" => return "ISO-8859-2",
        "CP28593" => return "ISO-8859-3",
        "CP28594" => return "ISO-8859-4",
        "CP28595" => return "ISO-8859-5",
        "CP28596" => return "ISO-8859-6",
        "CP28597" => return "ISO-8859-7",
        "CP28598" => return "ISO-8859-8",
        "CP28599" => return "ISO-8859-9",
        "CP28605" => return "ISO-8859-15",
        "CP38598" => return "ISO-8859-8",
        "CP51932" => return "EUC-JP",
        "CP51936" => return "GB2312",
        "CP51949" => return "EUC-KR",
        "CP51950" => return "EUC-TW",
        "CP54936" => return "GB18030",
        "CP65001" => return "UTF-8",
        "CP936" => return "GBK",
        _ => (),
    };

    // No alias matched (or none of the per-OS tables above is compiled in):
    // return the name unchanged.  Leaking the copy yields the required
    // `&'static str`.  NOTE(review): this leaks one small allocation per
    // call with an unknown name — presumably this is only invoked once per
    // process to determine the locale charset; confirm callers don't loop.
    String::from(s).leak()
}
+
+#[cfg(unix)]
+mod inner {
+ use std::{
+ ffi::{c_int, CStr, CString},
+ ptr::null,
+ };
+
+ use libc::{self, nl_langinfo, setlocale, CODESET, LC_CTYPE};
+
+ unsafe fn string_from_pointer(s: *const i8) -> Option<String> {
+ if s.is_null() {
+ None
+ } else {
+ Some(CStr::from_ptr(s).to_string_lossy().into())
+ }
+ }
+
+ fn set_locale(category: c_int, locale: Option<&str>) -> Option<String> {
+ unsafe {
+ let locale = locale.map(|s| CString::new(s).unwrap());
+ let locale_ptr = locale.as_ref().map_or(null(), |s| s.as_ptr());
+ string_from_pointer(setlocale(category, locale_ptr))
+ }
+ }
+
+ pub fn locale_charset() -> Option<String> {
+ unsafe {
+ let saved_locale = set_locale(LC_CTYPE, None);
+ set_locale(LC_CTYPE, Some(""));
+ let codeset = string_from_pointer(nl_langinfo(CODESET));
+ set_locale(LC_CTYPE, saved_locale.as_deref());
+ codeset
+ }
+ }
+}
+
#[cfg(windows)]
mod inner {
    use libc::{setlocale, LC_CTYPE};
    use std::ffi::{CStr, CString};
    use windows_sys::Win32::Globalization::GetACP;

    /// Queries the name of the current locale (after selecting the locale
    /// configured in the environment), or returns `None` on failure.
    fn current_locale() -> Option<String> {
        let empty = CString::new("").unwrap();
        unsafe {
            let locale = setlocale(LC_CTYPE, empty.as_ptr());
            (!locale.is_null()).then(|| CStr::from_ptr(locale).to_string_lossy().into())
        }
    }

    /// Returns the code-page-based character set name for the configured
    /// locale, e.g. `CP1252`, normalizing the UTF-8 code page to `UTF-8`.
    pub fn locale_charset() -> Option<String> {
        let locale = current_locale()?;
        // A locale name like `English_United States.1252` carries the code
        // page after the dot; otherwise fall back to the ANSI code page.
        let codepage = match locale.rsplit_once('.') {
            Some((_, suffix)) => format!("CP{suffix}"),
            None => format!("CP{}", unsafe { GetACP() }),
        };
        match codepage.as_str() {
            "CP65001" | "CPutf8" => Some(String::from("UTF-8")),
            _ => Some(codepage),
        }
    }
}
+
#[cfg(not(any(unix, windows)))]
mod inner {
    /// Fallback for platforms without a queryable locale: assume UTF-8.
    ///
    /// The function was previously misspelled `locale_charse` and returned a
    /// bare `String`, so this module did not satisfy the
    /// `inner::locale_charset()` call (which expects `Option<String>`) and
    /// failed to compile on these platforms.
    pub fn locale_charset() -> Option<String> {
        Some(String::from("UTF-8"))
    }
}
+
/// Returns the character set used by the locale configured in the operating
/// system.
///
/// The platform-specific lookup runs only once; the result is cached for the
/// lifetime of the process.  Platform spellings are normalized through
/// `map_aliases`, and `UTF-8` is used when the character set cannot be
/// determined.
pub fn locale_charset() -> &'static str {
    lazy_static! {
        static ref LOCALE_CHARSET: &'static str =
            map_aliases(&inner::locale_charset().unwrap_or(String::from("UTF-8")));
    }
    &LOCALE_CHARSET
}
--- /dev/null
+use lazy_static::lazy_static;
+use num::Integer;
+use std::{
+ cell::RefCell,
+ cmp::Ordering,
+ collections::{BTreeMap, HashMap, HashSet},
+ mem::take,
+ num::NonZeroUsize,
+ ops::RangeInclusive,
+};
+use thiserror::Error as ThisError;
+use unicase::UniCase;
+
+use crate::{
+ identifier::Identifier,
+ lex::{
+ scan::{ScanError, ScanToken, StringScanner, StringSegmenter},
+ segment::Mode,
+ token::{Punct, Token},
+ },
+ message::Location,
+ settings::Settings,
+};
+
+#[derive(Clone, Debug, ThisError)]
+pub enum MacroError {
+ /// Expected more tokens.
+ #[error(
+ "Reached end of command expecting {n} more tokens in argument {arg} to macro {macro_}."
+ )]
+ ExpectedMoreTokens {
+ n: usize,
+ arg: Identifier,
+ macro_: Identifier,
+ },
+
+ /// Expected a particular token at end of command.
+ #[error("Reached end of command expecting {token:?} in argument {arg} to macro {macro_}.")]
+ ExpectedToken {
+ token: String,
+ arg: Identifier,
+ macro_: Identifier,
+ },
+
+ /// Expected a particular token, got a different one.
+ #[error(
+ "Found `{actual}` while expecting `{expected}` reading argument {arg} to macro {macro_}."
+ )]
+ UnexpectedToken {
+ actual: String,
+ expected: String,
+ arg: Identifier,
+ macro_: Identifier,
+ },
+
+ /// Argument specified multiple times,
+ #[error("Argument {arg} specified multiple times in call to macro {macro_}.")]
+ DuplicateArg { arg: Identifier, macro_: Identifier },
+
+ /// Maximum nesting limit exceeded.
+ #[error("Maximum nesting level {limit} exceeded. (Use `SET MNEST` to change the limit.)")]
+ TooDeep { limit: usize },
+
+ /// Invalid `!*`.
+ #[error("`!*` may only be used within the expansion of a macro.")]
+ InvalidBangAsterisk,
+
+ /// Error tokenizing during expansion.
+ #[error(transparent)]
+ ScanError(ScanError),
+
+ /// Expecting `)` in macro expression.
+ #[error("Expecting `)` in macro expression.")]
+ ExpectingRParen,
+
+ /// Expecting literal.
+ #[error("Expecting literal or function invocation in macro expression.")]
+ ExpectingLiteral,
+
+ /// Expecting `!THEN`.
+ #[error("`!THEN` expected in macro `!IF` construct.")]
+ ExpectingThen,
+
+ /// Expecting `!ELSE` or `!THEN`.
+ #[error("`!ELSE` or `!THEN` expected in macro `!IF` construct.")]
+ ExpectingElseOrIfEnd,
+
+ /// Expecting `!IFEND`.
+ #[error("`!IFEND` expected in macro `!IF` construct.")]
+ ExpectingIfEnd,
+
+ /// Expecting macro variable name.
+ #[error("Expecting macro variable name following `{0}`.")]
+ ExpectingMacroVarName(&'static str),
+
+ /// Invalid macro variable name.
+ #[error("Cannot use argument name or macro keyword {name} as `{construct}` variable name.")]
+ BadMacroVarName {
+ name: Identifier,
+ construct: &'static str,
+ },
+
+ /// Expecting `=` following `!LET`.
+ #[error("Expecting `=` following `!LET`.")]
+ ExpectingEquals,
+
+ /// Expecting `=` or `!IN` in `!DO` loop.
+ #[error("Expecting `=` or `!IN` in `!DO` loop.")]
+ ExpectingEqualsOrIn,
+
+ /// Missing `!DOEND`.
+ #[error("Missing `!DOEND`.")]
+ MissingDoEnd,
+
+ /// Bad numberic macro expression.
+ #[error("Macro expression must evaluate to a number (not {0:?})")]
+ BadNumericMacroExpression(String),
+
+ /// Too many iteration for list-based loop.
+ #[error("`!DO` loop over list exceeded maximum number of iterations {0}. (Use `SET MITERATE` to change the limit.)")]
+ MiterateList(usize),
+
+ /// Too many iteration for numerical loop.
+ #[error("Numerical `!DO` loop exceeded maximum number of iterations {0}. (Use `SET MITERATE` to change the limit.)")]
+ MiterateNumeric(usize),
+
+ /// Expecting `!TO` in numerical `!DO` loop.
+ #[error("Expecting `!TO` in numerical `!DO` loop.")]
+ ExpectingTo,
+
+ /// `!BY` value cannot be zero.
+ #[error("`!BY` value cannot be zero.")]
+ ZeroBy,
+
+ /// `!BREAK` outside `!DO`.
+ #[error("`!BREAK` outside `!DO`.")]
+ BreakOutsideDo,
+
+ /// `,` or `)` expected in call to macro function.
+ #[error("`,` or `)` expected in call to macro function `{0}`.")]
+ ExpectingCommaOrRParen(Identifier),
+
+ /// Macro function takes one argument.
+ #[error("Macro function `{name}` takes one argument (not {n_args}).")]
+ ExpectingOneArg { name: Identifier, n_args: usize },
+
+ /// Macro function takes two arguments.
+ #[error("Macro function `{name}` takes two arguments (not {n_args}).")]
+ ExpectingTwoArgs { name: Identifier, n_args: usize },
+
+ /// Macro function takes two or three arguments.
+ #[error("Macro function `{name}` takes two or three arguments (not {n_args}).")]
+ ExpectingTwoOrThreeArgs { name: Identifier, n_args: usize },
+
+ /// Macro function needs at least one argument).
+ #[error("Macro function `{name}` needs at least one argument).")]
+ ExpectingOneOrMoreArgs { name: Identifier },
+
+ /// Argument to `!BLANKS` must be non-negative integer (not `{0}`).
+ #[error("Argument to `!BLANKS` must be non-negative integer (not `{0}`).")]
+ InvalidBlanks(String),
+
+ /// Second argument of `!SUBSTR` must be positive integer (not `{0}`).
+ #[error("Second argument of `!SUBSTR` must be positive integer (not `{0}`).")]
+ InvalidSubstr2(String),
+
+ /// Third argument of `!SUBSTR` must be non-negative integer (not `{0}`).
+ #[error("Third argument of `!SUBSTR` must be non-negative integer (not `{0}`).")]
+ InvalidSubstr3(String),
+}
+
/// A PSPP macro as defined with `!DEFINE`.
pub struct Macro {
    /// The macro's name. This is an ordinary identifier except that it is
    /// allowed (but not required) to begin with `!`.
    pub name: Identifier,

    /// Source code location of macro definition, for error reporting.
    pub location: Location,

    /// Parameters, in the order they were declared.  Empty for a macro that
    /// takes no arguments.
    parameters: Vec<Parameter>,

    /// Body, as a sequence of pre-tokenized macro tokens.
    body: Vec<MacroToken>,
}
+
+impl Macro {
+ fn initial_state(&self) -> ParserState {
+ if self.parameters.is_empty() {
+ ParserState::Finished
+ } else if self.parameters[0].is_positional() {
+ ParserState::Keyword
+ } else if let ValueType::Enclose(_, _) = self.parameters[0].arg {
+ ParserState::Enclose
+ } else {
+ ParserState::Arg
+ }
+ }
+
+ fn find_parameter(&self, name: &Identifier) -> Option<usize> {
+ self.parameters.iter().position(|param| ¶m.name == name)
+ }
+}
+
/// One parameter of a [Macro].
struct Parameter {
    /// `!name` or `!1`.
    name: Identifier,

    /// Default value.
    ///
    /// The tokens don't include white space, etc. between them.
    default: Vec<MacroToken>,

    /// Macro-expand the value?
    expand_value: bool,

    /// How the argument is specified.
    arg: ValueType,
}
+
+impl Parameter {
+ /// Returns true if this is a positional parameter. Positional parameters
+ /// are expanded by index (position) rather than by name.
+ fn is_positional(&self) -> bool {
+ self.name.0.as_bytes()[1].is_ascii_digit()
+ }
+}
+
/// How a macro argument is delimited in a macro call.
enum ValueType {
    /// Argument consists of `.0` tokens.
    NTokens(usize),

    /// Argument runs until token `.0`.
    CharEnd(Token),

    /// Argument starts with token `.0` and ends with token `.1`.
    Enclose(Token, Token),

    /// Argument runs until the end of the command.
    CmdEnd,
}

/// A token and the syntax that was tokenized to produce it. The syntax allows
/// the token to be turned back into syntax accurately.
#[derive(Clone)]
pub struct MacroToken {
    /// The token.
    pub token: Token,

    /// The syntax that produces `token`.
    pub syntax: String,
}
+
/// Tokenizes `s` in tokenization mode `mode`, appending the resulting macro
/// tokens to `output`.  Scan errors are reported through `error` and the
/// offending input is skipped rather than aborting the whole tokenization.
fn tokenize_string_into(
    s: &str,
    mode: Mode,
    error: &impl Fn(MacroError),
    output: &mut Vec<MacroToken>,
) {
    for (syntax, token) in StringSegmenter::new(s, mode, true) {
        match token {
            ScanToken::Token(token) => output.push(MacroToken {
                token,
                syntax: String::from(syntax),
            }),
            ScanToken::Error(scan_error) => error(MacroError::ScanError(scan_error)),
        }
    }
}

/// Tokenizes `s` in tokenization mode `mode` and returns the resulting macro
/// tokens, reporting any scan errors through `error`.
fn tokenize_string(s: &str, mode: Mode, error: &impl Fn(MacroError)) -> Vec<MacroToken> {
    let mut tokens = Vec::new();
    tokenize_string_into(s, mode, error, &mut tokens);
    tokens
}
+
+fn try_unquote_string(input: &String, mode: Mode) -> Option<String> {
+ let mut scanner = StringScanner::new(input, mode, true);
+ let Some(ScanToken::Token(Token::String(unquoted))) = scanner.next() else {
+ return None;
+ };
+ let None = scanner.next() else { return None };
+ return Some(unquoted);
+}
+
/// Returns the unquoted content of `input` if it is a single quoted string;
/// otherwise returns `input` unchanged.
fn unquote_string(input: String, mode: Mode) -> String {
    try_unquote_string(&input, mode).unwrap_or(input)
}

/// A slice of macro tokens, consumed from the front as parsing proceeds.
#[derive(Clone)]
struct MacroTokens<'a>(&'a [MacroToken]);
+
+impl<'a> MacroTokens<'a> {
+ fn is_empty(&self) -> bool {
+ self.0.is_empty()
+ }
+ fn match_(&mut self, s: &str) -> bool {
+ if let Some((first, rest)) = self.0.split_first() {
+ if first.syntax.eq_ignore_ascii_case(s) {
+ self.0 = rest;
+ return true;
+ }
+ }
+ false
+ }
+ fn take_relop(&mut self) -> Option<RelOp> {
+ if let Some((first, rest)) = self.0.split_first() {
+ if let Ok(relop) = first.syntax.as_str().try_into() {
+ self.0 = rest;
+ return Some(relop);
+ }
+ }
+ None
+ }
+ fn macro_id(&self) -> Option<&Identifier> {
+ self.0.get(0).map(|mt| mt.token.macro_id()).flatten()
+ }
+ fn take_macro_id(&mut self) -> Option<&Identifier> {
+ let result = self.0.get(0).map(|mt| mt.token.macro_id()).flatten();
+ if result.is_some() {
+ self.advance();
+ }
+ result
+ }
+ fn take(&mut self) -> Option<&MacroToken> {
+ match self.0.split_first() {
+ Some((first, rest)) => {
+ self.0 = rest;
+ Some(first)
+ }
+ None => None,
+ }
+ }
+ fn advance(&mut self) -> &MacroToken {
+ let (first, rest) = self.0.split_first().unwrap();
+ self.0 = rest;
+ first
+ }
+}
+
/// Spacing class of a token, used to decide what separator belongs between
/// two adjacent tokens when turning macro tokens back into syntax.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
enum TokenClass {
    /// No space before or after (new-line after).
    EndCommand,

    /// Space on both sides.
    BinaryOperator,

    /// Space afterward.
    Comma,

    /// Don't need spaces except sequentially.
    Id,

    /// Don't need spaces except sequentially.
    Punct,
}
+
+impl TokenClass {
+ fn separator(prev: Self, next: Self) -> &'static str {
+ match (prev, next) {
+ // Don't need a separator before the end of a command, but we
+ // need a new-line afterward.
+ (_, Self::EndCommand) => "",
+ (Self::EndCommand, _) => "\n",
+
+ // Binary operators always have a space on both sides, and a comma always has a space afterward.
+ (Self::BinaryOperator, _) | (_, Self::BinaryOperator) | (Self::Comma, _) => " ",
+
+ // Otherwise, `prev` is `Self::Punct`, which only need a space if
+ // there are two or them in a row.
+ (Self::Punct, Self::Punct) => " ",
+ _ => "",
+ }
+ }
+}
+
impl From<&Token> for TokenClass {
    /// Classifies `source` for separator purposes.
    fn from(source: &Token) -> Self {
        match source {
            Token::End => Self::Punct,
            // Numbers and strings behave like identifiers for spacing.
            Token::Id(_) | Token::Number(_) | Token::String(_) => Self::Id,
            Token::EndCommand => Self::EndCommand,
            Token::Punct(punct) => match punct {
                // Brackets need no surrounding space.
                Punct::LParen
                | Punct::RParen
                | Punct::LSquare
                | Punct::RSquare
                | Punct::LCurly
                | Punct::RCurly => Self::Punct,

                // NOTE(review): `Dot`, `Underscore`, `Bang`, etc. are grouped
                // with the binary operators here; confirm this matches the
                // upstream classification.
                Punct::Plus
                | Punct::Dash
                | Punct::Asterisk
                | Punct::Slash
                | Punct::Equals
                | Punct::Colon
                | Punct::And
                | Punct::Or
                | Punct::Not
                | Punct::Eq
                | Punct::Ge
                | Punct::Gt
                | Punct::Le
                | Punct::Lt
                | Punct::Ne
                | Punct::All
                | Punct::By
                | Punct::To
                | Punct::With
                | Punct::Exp
                | Punct::Bang
                | Punct::Percent
                | Punct::Question
                | Punct::Backtick
                | Punct::Dot
                | Punct::Underscore
                | Punct::BangAsterisk => Self::BinaryOperator,

                Punct::Comma | Punct::Semicolon => Self::Comma,
            },
        }
    }
}

/// Converts `input` back into syntax, yielding for each token a two-element
/// array: the separator that must precede the token, then the token's own
/// syntax.  Concatenating every element of every array reproduces syntax that
/// retokenizes to the same tokens.
pub fn macro_tokens_to_syntax(input: &[MacroToken]) -> impl Iterator<Item = [&str; 2]> {
    input
        .iter()
        .take(1)
        .map(|token| ["", token.syntax.as_str()])
        .chain(input.windows(2).map(|w| {
            let c0 = (&w[0].token).into();
            let c1 = (&w[1].token).into();
            [TokenClass::separator(c0, c1), w[1].syntax.as_str()]
        }))
}
+
/// Extension trait for recognizing macro identifiers.
trait MacroId {
    /// Returns the identifier if this token is a macro identifier (an
    /// identifier beginning with `!`), otherwise `None`.
    fn macro_id(&self) -> Option<&Identifier>;
}

impl MacroId for Token {
    fn macro_id(&self) -> Option<&Identifier> {
        let id = self.id()?;
        id.0.starts_with('!').then_some(id)
    }
}

/// A relational operator in a macro expression (`=`, `!EQ`, `<`, `!LT`, ...).
enum RelOp {
    Eq,
    Ne,
    Lt,
    Gt,
    Le,
    Ge,
}
+
+impl TryFrom<&str> for RelOp {
+ type Error = ();
+
+ fn try_from(source: &str) -> Result<Self, Self::Error> {
+ match source {
+ "=" => Ok(Self::Eq),
+ "~=" | "<>" => Ok(Self::Ne),
+ "<" => Ok(Self::Lt),
+ ">" => Ok(Self::Gt),
+ "<=" => Ok(Self::Le),
+ ">=" => Ok(Self::Ge),
+ _ if source.len() == 3 && source.as_bytes()[0] == b'!' => match (
+ source.as_bytes()[0].to_ascii_uppercase(),
+ source.as_bytes()[1].to_ascii_uppercase(),
+ ) {
+ (b'E', b'Q') => Ok(Self::Eq),
+ (b'N', b'E') => Ok(Self::Ne),
+ (b'L', b'T') => Ok(Self::Lt),
+ (b'L', b'E') => Ok(Self::Le),
+ (b'G', b'T') => Ok(Self::Gt),
+ (b'G', b'E') => Ok(Self::Ge),
+ _ => Err(()),
+ },
+ _ => Err(()),
+ }
+ }
+}
+
+impl RelOp {
+ fn evaluate(&self, cmp: Ordering) -> bool {
+ match self {
+ RelOp::Eq => cmp == Ordering::Equal,
+ RelOp::Ne => cmp != Ordering::Equal,
+ RelOp::Lt => cmp == Ordering::Less,
+ RelOp::Gt => cmp == Ordering::Greater,
+ RelOp::Le => cmp != Ordering::Greater,
+ RelOp::Ge => cmp != Ordering::Less,
+ }
+ }
+}
+
/// All defined macros, indexed by case-insensitive name.
pub type MacroSet = HashMap<UniCase<String>, Macro>;

/// State of the macro call parser FSM.
enum ParserState {
    /// Accumulating tokens toward the end of any type of argument.
    Arg,

    /// Expecting the opening delimiter of an ARG_ENCLOSE argument.
    Enclose,

    /// Expecting a keyword for a keyword argument.
    Keyword,

    /// Expecting an equal sign for a keyword argument.
    Equals,

    /// Macro fully parsed and ready for expansion.
    Finished,
}

/// Macro call parser FSM.
pub struct Parser<'a> {
    macros: &'a MacroSet,
    macro_: &'a Macro,
    state: ParserState,
    /// One slot per parameter; `None` until the argument is specified.
    args: Box<[Option<Vec<MacroToken>>]>,
    /// Index of the parameter currently being parsed.
    arg_index: usize,

    /// Length of macro call so far.
    n_tokens: usize,
}

/// Result of feeding one token to the [Parser].
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum ParseStatus {
    Complete,
    Incomplete,
}
+
+impl<'a> Parser<'a> {
+ pub fn new(macros: &'a MacroSet, token: &Token) -> Option<Self> {
+ let macro_ = macros.get(&token.id()?.0)?;
+ Some(Self {
+ macros,
+ macro_,
+ state: macro_.initial_state(),
+ args: (0..macro_.parameters.len()).map(|_| None).collect(),
+ arg_index: 0,
+ n_tokens: 1,
+ })
+ }
+
+ fn finished(&mut self) {
+ self.state = ParserState::Finished;
+ for (i, arg) in self.args.iter_mut().enumerate() {
+ if arg.is_none() {
+ *arg = Some(self.macro_.parameters[i].default.clone());
+ }
+ }
+ self.state = ParserState::Finished;
+ }
+
+ fn next_arg(&mut self) {
+ if self.macro_.parameters.is_empty() {
+ self.finished()
+ } else {
+ let param = &self.macro_.parameters[self.arg_index];
+ if param.is_positional() {
+ self.arg_index += 1;
+ if self.arg_index >= self.args.len() {
+ self.finished()
+ } else {
+ let param = &self.macro_.parameters[self.arg_index];
+ self.state = if !param.is_positional() {
+ ParserState::Keyword
+ } else if let ValueType::Enclose(_, _) = param.arg {
+ ParserState::Enclose
+ } else {
+ ParserState::Arg
+ };
+ }
+ } else {
+ if self.args.iter().any(|arg| arg.is_none()) {
+ self.state = ParserState::Keyword;
+ } else {
+ self.finished();
+ }
+ }
+ }
+ }
+
+ fn push_arg(&mut self, token: &Token, syntax: &str, error: &impl Fn(MacroError)) {
+ let param = &self.macro_.parameters[self.args.len() - 1];
+ if let Token::EndCommand | Token::End = token {
+ if let Some(arg) = &self.args[self.arg_index] {
+ let param = &self.macro_.parameters[self.args.len() - 1];
+
+ match ¶m.arg {
+ ValueType::NTokens(n) => error(MacroError::ExpectedMoreTokens {
+ n: n - arg.len(),
+ arg: param.name.clone(),
+ macro_: self.macro_.name.clone(),
+ }),
+ ValueType::CharEnd(end) | ValueType::Enclose(_, end) => {
+ error(MacroError::ExpectedToken {
+ token: end.to_string(),
+ arg: param.name.clone(),
+ macro_: self.macro_.name.clone(),
+ })
+ }
+ ValueType::CmdEnd => {
+ // This is OK, it's the expected way to end the argument.
+ }
+ }
+ }
+ self.finished();
+ }
+
+ self.n_tokens += 1;
+ let arg = self.args[self.arg_index].get_or_insert(Vec::new());
+ let (
+ add_token, // Should we add `mt` to the current arg?
+ next_arg, // Should we advance to the next arg?
+ ) = match ¶m.arg {
+ ValueType::NTokens(n) => (arg.len() + 1 >= *n, true),
+ ValueType::CharEnd(end) | ValueType::Enclose(_, end) => {
+ let at_end = token == end;
+ (at_end, !at_end)
+ }
+ ValueType::CmdEnd => (false, true),
+ };
+ if add_token {
+ if true
+ // !macro_expand_arg (&mt->token, mc->me, *argp)
+ {
+ arg.push(MacroToken {
+ token: token.clone(),
+ syntax: String::from(syntax),
+ });
+ }
+ }
+ if next_arg {
+ self.next_arg()
+ }
+ }
+
+ fn push_enclose(&mut self, token: &Token, syntax: &str, error: &impl Fn(MacroError)) {
+ let param = &self.macro_.parameters[self.arg_index];
+ let ValueType::Enclose(start, _) = ¶m.arg else {
+ unreachable!()
+ };
+ if token == start {
+ self.n_tokens += 1;
+ self.args[self.arg_index].get_or_insert(Vec::new());
+ self.state = ParserState::Arg;
+ } else if param.is_positional() && matches!(token, Token::End | Token::EndCommand) {
+ self.finished();
+ } else {
+ error(MacroError::UnexpectedToken {
+ actual: String::from(syntax),
+ expected: start.to_string(),
+ arg: param.name.clone(),
+ macro_: self.macro_.name.clone(),
+ });
+ self.finished();
+ }
+ }
+
+ fn push_keyword(&mut self, token: &Token, _syntax: &str, error: &impl Fn(MacroError)) {
+ let Some(id) = token.id() else {
+ return self.finished();
+ };
+ let Some(arg_index) = self.macro_.find_parameter(id) else {
+ return self.finished();
+ };
+ self.arg_index = arg_index;
+ if self.args[arg_index].is_some() {
+ error(MacroError::DuplicateArg {
+ arg: id.clone(),
+ macro_: self.macro_.name.clone(),
+ });
+ }
+ self.args[arg_index] = Some(Vec::new());
+ }
+
+ fn push_equals(&mut self, token: &Token, syntax: &str, error: &impl Fn(MacroError)) {
+ let param = &self.macro_.parameters[self.arg_index];
+ if let Token::Punct(Punct::Eq) = token {
+ self.n_tokens += 1;
+ self.state = if let ValueType::Enclose(_, _) = param.arg {
+ ParserState::Enclose
+ } else {
+ ParserState::Arg
+ };
+ } else {
+ error(MacroError::UnexpectedToken {
+ actual: syntax.into(),
+ expected: String::from("="),
+ arg: param.name.clone(),
+ macro_: self.macro_.name.clone(),
+ });
+ self.finished()
+ }
+ }
+
+ /// Adds `token`, which has the given `syntax`, to the collection of tokens
+ /// in `self` that potentially need to be macro expanded.
+ ///
+ /// Returns [ParseStatus::Incomplete] if the macro expander needs more
+ /// tokens, for macro arguments or to decide whether this is actually a
+ /// macro invocation. The caller should call `push` again with the next
+ /// token.
+ ///
+ /// Returns [ParseStatus::Complete] if the macro invocation is now complete.
+ /// The caller should call [`Self::finish()`] to obtain the expansion.
+ pub fn push(
+ &mut self,
+ token: &Token,
+ syntax: &str,
+ error: &impl Fn(MacroError),
+ ) -> ParseStatus {
+ match self.state {
+ ParserState::Arg => self.push_arg(token, syntax, error),
+ ParserState::Enclose => self.push_enclose(token, syntax, error),
+ ParserState::Keyword => self.push_keyword(token, syntax, error),
+ ParserState::Equals => self.push_equals(token, syntax, error),
+ ParserState::Finished => (),
+ }
+ if let ParserState::Finished = self.state {
+ ParseStatus::Complete
+ } else {
+ ParseStatus::Incomplete
+ }
+ }
+
+ pub fn finish(self) -> Call<'a> {
+ let ParserState::Finished = self.state else {
+ panic!()
+ };
+ Call(self)
+ }
+}
+
/// Expansion stack entry.
struct Frame {
    /// A macro name or `!IF`, `!DO`, etc.
    name: Option<Identifier>,

    /// Source location, if available.
    location: Option<Location>,
}

/// Context for one level of macro expansion.
struct Expander<'a> {
    /// Macros to expand recursively.
    macros: &'a MacroSet,

    /// Error reporting callback.
    ///
    /// NOTE(review): `&Box<dyn Fn…>` could be thinned to `&dyn Fn…`; kept
    /// as-is since construction sites may be outside this view.
    error: &'a Box<dyn Fn(MacroError) + 'a>,

    /// Tokenization mode.
    mode: Mode,

    /// Remaining nesting levels.
    nesting_countdown: usize,

    /// Stack for error reporting.
    stack: Vec<Frame>,

    /// May macro calls be expanded?
    expand: &'a RefCell<bool>,

    /// Variables from `!DO` and `!LET`.
    vars: &'a RefCell<BTreeMap<Identifier, String>>,

    /// Only set if inside a `!DO` loop. If true, break out of the loop.
    break_: Option<&'a mut bool>,

    /// Only set if expanding a macro (and not, say, a macro argument).
    macro_: Option<&'a Macro>,

    /// Only set if expanding a macro (and not, say, a macro argument).
    args: Option<&'a [Option<Vec<MacroToken>>]>,
}
+
/// Renders `b` the way macro expressions represent truth: `"1"` for true and
/// `"0"` for false.
fn bool_to_string(b: bool) -> String {
    String::from(if b { "1" } else { "0" })
}
+
/// Which clause terminated the `!THEN` part of a `!IF` construct.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
enum IfEndClause {
    Else,
    IfEnd,
}
+
+fn macro_keywords() -> HashSet<Identifier> {
+ let mut keywords = HashSet::new();
+ for kw in [
+ "!BREAK",
+ "!CHAREND",
+ "!CMDEND",
+ "!DEFAULT",
+ "!DO",
+ "!DOEND",
+ "!ELSE",
+ "!ENCLOSE",
+ "!ENDDEFINE",
+ "!IF",
+ "!IFEND",
+ "!IN",
+ "!LET",
+ "!NOEXPAND",
+ "!OFFEXPAND",
+ "!ONEXPAND",
+ "!POSITIONAL",
+ "!THEN",
+ "!TOKENS",
+ ] {
+ keywords.insert(Identifier::new(kw).unwrap());
+ }
+ keywords
+}
+
/// Returns true if `s` is a reserved macro keyword (see [macro_keywords]).
/// The keyword set is built once and cached.
fn is_macro_keyword(s: &Identifier) -> bool {
    lazy_static! {
        static ref KEYWORDS: HashSet<Identifier> = macro_keywords();
    }
    KEYWORDS.contains(s)
}
+
/// The source of values for a `!DO` loop: either an explicit list of items or
/// a numeric range climbing up or counting down.
enum DoInput {
    /// Items stored in reverse so iteration can pop from the back.
    List(Vec<String>),
    Up { first: f64, last: f64, by: f64 },
    Down { first: f64, last: f64, by: f64 },
    /// Loop that yields no iterations at all.
    Empty,
}
+
+impl DoInput {
+ fn from_list(items: Vec<MacroToken>) -> Self {
+ Self::List(
+ items
+ .into_iter()
+ .rev()
+ .take(Settings::global().macros.max_iterations + 1)
+ .map(|mt| mt.syntax)
+ .collect(),
+ )
+ }
+
+ fn from_by(first: f64, last: f64, by: f64) -> Self {
+ if by > 0.0 && first <= last {
+ Self::Up { first, last, by }
+ } else if by > 0.0 && first <= last {
+ Self::Down { first, last, by }
+ } else {
+ Self::Empty
+ }
+ }
+}
+
impl Iterator for DoInput {
    type Item = String;

    fn next(&mut self) -> Option<Self::Item> {
        match self {
            // The list is stored reversed, so popping from the back yields
            // the items in their original order.
            DoInput::List(vec) => vec.pop(),
            DoInput::Up { first, last, by } => {
                if first <= last {
                    let value = *first;
                    *first += *by;
                    Some(format!("{value}"))
                } else {
                    None
                }
            }
            DoInput::Down { first, last, by } => {
                if first >= last {
                    let value = *first;
                    // `by` is negative for a descending loop, so `+=` steps
                    // downward.
                    *first += *by;
                    Some(format!("{value}"))
                } else {
                    None
                }
            }
            DoInput::Empty => None,
        }
    }
}
+
+impl<'a> Expander<'a> {
    /// Returns true if macro calls may currently be expanded (i.e.
    /// `!ONEXPAND` is in effect).
    fn may_expand(&self) -> bool {
        *self.expand.borrow()
    }

    /// Returns true if a `!BREAK` has been executed in the enclosing `!DO`.
    fn should_break(&self) -> bool {
        self.break_.as_ref().map(|b| **b).unwrap_or(false)
    }

    /// Expands all of `input` into `output`, stopping early on `!BREAK` or
    /// when the nesting limit is exhausted (in which case the remaining
    /// input is copied through unexpanded).
    fn expand(&mut self, input: &mut MacroTokens, output: &mut Vec<MacroToken>) {
        if self.nesting_countdown == 0 {
            (self.error)(MacroError::TooDeep {
                limit: Settings::global().macros.max_nest,
            });
            output.extend(take(&mut input.0).iter().cloned());
        } else {
            while !input.is_empty() && !self.should_break() {
                self.expand__(input, output);
            }
        }
    }
+
    /// Appends the expansion of macro argument `param_idx` to `output`.
    ///
    /// The argument is re-expanded in a child context (with its own `!DO`/
    /// `!LET` variables) when expansion is enabled and the parameter was
    /// declared with value expansion; otherwise its tokens are copied
    /// through verbatim.
    fn expand_arg(&mut self, param_idx: usize, output: &mut Vec<MacroToken>) {
        let param = &self.macro_.unwrap().parameters[param_idx];
        let arg = &self.args.unwrap()[param_idx].as_ref().unwrap();
        if self.may_expand() && param.expand_value {
            let vars = RefCell::new(BTreeMap::new());
            // Temporarily move the stack into the subexpander; it is
            // restored (minus the pushed frame) afterward.
            let mut stack = take(&mut self.stack);
            stack.push(Frame {
                name: Some(param.name.clone()),
                location: None,
            });
            let mut subexpander = Expander {
                stack,
                vars: &vars,
                break_: None,
                macro_: None,
                args: None,
                ..*self
            };
            let mut arg_tokens = MacroTokens(&arg);
            subexpander.expand(&mut arg_tokens, output);
            self.stack = subexpander.stack;
            self.stack.pop();
        } else {
            output.extend(arg.iter().cloned());
        }
    }
    /// Parses the parenthesized argument list of macro function `function`
    /// from `input`, which must be positioned at the function name with `(`
    /// following (the caller checks this before calling).  Returns the
    /// argument strings, or `None` (after reporting an error) on a malformed
    /// list.
    fn parse_function_args(
        &mut self,
        function: &Identifier,
        input: &mut MacroTokens,
    ) -> Option<Vec<String>> {
        // Skip the function name and the `(`.
        input.advance();
        input.advance();
        let mut args = Vec::new();
        if input.match_(")") {
            return Some(args);
        }
        loop {
            args.push(self.parse_function_arg(input)?);
            match input.take() {
                Some(MacroToken {
                    token: Token::Punct(Punct::Comma),
                    ..
                }) => (),
                Some(MacroToken {
                    token: Token::Punct(Punct::RParen),
                    ..
                }) => return Some(args),
                _ => {
                    (self.error)(MacroError::ExpectingCommaOrRParen(function.clone()));
                    return None;
                }
            }
        }
    }
+
+ fn expand_blanks(e: &mut Expander, args: Vec<String>) -> Option<String> {
+ let Ok(n) = args[0].trim().parse::<usize>() else {
+ (e.error)(MacroError::InvalidBlanks(args[0].clone()));
+ return None;
+ };
+ Some(std::iter::repeat(' ').take(n).collect())
+ }
+
    /// Implements `!CONCAT`: concatenates all of the arguments, unquoting
    /// each one that is a quoted string.
    fn expand_concat(e: &mut Expander, args: Vec<String>) -> Option<String> {
        Some(
            args.into_iter()
                .map(|arg| unquote_string(arg, e.mode))
                .collect(),
        )
    }

    /// Implements `!EVAL`: tokenizes `args[0]`, macro-expands the result in
    /// a child context, and returns the expansion as syntax.
    fn expand_eval(e: &mut Expander, args: Vec<String>) -> Option<String> {
        let tokens = tokenize_string(&args[0], e.mode, e.error);
        // Move the stack into the subexpander for the nested expansion and
        // restore it (minus the `!EVAL` frame) afterward.
        let mut stack = take(&mut e.stack);
        stack.push(Frame {
            name: Some(Identifier::new("!EVAL").unwrap()),
            location: None,
        });
        let mut break_ = false;
        let mut subexpander = Expander {
            break_: Some(&mut break_),
            stack,
            vars: e.vars,
            ..*e
        };
        let mut output = Vec::new();
        subexpander.expand(&mut MacroTokens(tokens.as_slice()), &mut output);
        subexpander.stack.pop();
        e.stack = subexpander.stack;
        Some(macro_tokens_to_syntax(&output).flatten().collect())
    }

    /// Implements `!HEAD`: returns the syntax of the first token of the
    /// (unquoted) argument, or the empty string if it has no tokens.
    fn expand_head(e: &mut Expander, mut args: Vec<String>) -> Option<String> {
        let arg = unquote_string(args.remove(0), e.mode);
        let mut output = tokenize_string(&arg, e.mode, e.error);
        if output.is_empty() {
            Some(String::new())
        } else {
            // `swap_remove(0)` cheaply extracts the first token (order of
            // the rest doesn't matter; it is discarded).
            Some(output.swap_remove(0).syntax)
        }
    }
+
+ fn expand_index(_e: &mut Expander, args: Vec<String>) -> Option<String> {
+ let haystack = &args[0];
+ let needle = &args[1];
+ let position = haystack.find(needle);
+ Some(format!(
+ "{}",
+ position.map_or(0, |position| &haystack[0..position].chars().count() + 1)
+ ))
+ }
+
+ fn expand_length(_e: &mut Expander, args: Vec<String>) -> Option<String> {
+ Some(format!("{}", args[0].chars().count()))
+ }
+
+ fn expand_quote(e: &mut Expander, mut args: Vec<String>) -> Option<String> {
+ let arg = args.remove(0);
+ if try_unquote_string(&arg, e.mode).is_some() {
+ Some(arg)
+ } else {
+ let mut output = String::with_capacity(arg.len() + 2);
+ output.push('\'');
+ for c in arg.chars() {
+ if c == '"' {
+ output.push('\'');
+ }
+ output.push(c);
+ }
+ output.push('\'');
+ Some(output)
+ }
+ }
+
+ fn expand_substr(e: &mut Expander, args: Vec<String>) -> Option<String> {
+ let Ok(start) = args[1].trim().parse::<NonZeroUsize>() else {
+ (e.error)(MacroError::InvalidSubstr3(args[0].clone()));
+ return None;
+ };
+ let start = start.get();
+ let Ok(count) = args[2].trim().parse::<usize>() else {
+ (e.error)(MacroError::InvalidSubstr2(args[0].clone()));
+ return None;
+ };
+
+ Some(args[0].chars().skip(start - 1).take(count).collect())
+ }
+
+ fn expand_tail(e: &mut Expander, mut args: Vec<String>) -> Option<String> {
+ let arg = unquote_string(args.remove(0), e.mode);
+ let mut output = tokenize_string(&arg, e.mode, e.error);
+ Some(
+ output
+ .pop()
+ .map_or_else(|| String::new(), |tail| tail.syntax),
+ )
+ }
+
    /// Implements `!UNQUOTE`: removes quoting from `args[0]` if it is a
    /// quoted string, otherwise returns it unchanged.
    fn expand_unquote(e: &mut Expander, mut args: Vec<String>) -> Option<String> {
        Some(unquote_string(args.remove(0), e.mode))
    }

    /// Implements `!UPCASE`: the (unquoted) argument converted to uppercase.
    fn expand_upcase(e: &mut Expander, mut args: Vec<String>) -> Option<String> {
        Some(unquote_string(args.remove(0), e.mode).to_uppercase())
    }
+
    /// If `orig_input` begins with a macro function invocation, expands it,
    /// consumes it from `orig_input`, and returns the expansion.  Returns
    /// `None` (leaving `orig_input` unchanged) if no macro function is
    /// present, or if an invocation is malformed (after reporting an error).
    fn expand_macro_function(&mut self, orig_input: &mut MacroTokens) -> Option<String> {
        // Work on a copy so `orig_input` is only advanced on success.
        let mut input = orig_input.clone();
        let name = input.macro_id()?;
        if name == "!NULL" {
            return Some(String::new());
        }
        // Every other macro function requires a parenthesized argument list.
        if input.0.len() < 2 || !matches!(input.0[1].token, Token::Punct(Punct::LParen)) {
            return None;
        }

        /// Table entry describing one macro function: its name, allowed
        /// argument-count range, and implementation.
        struct MacroFunction {
            name: Identifier,
            args: RangeInclusive<usize>,
            parser: fn(&mut Expander, Vec<String>) -> Option<String>,
        }
        impl MacroFunction {
            fn new(
                name: &str,
                args: RangeInclusive<usize>,
                parser: fn(&mut Expander, Vec<String>) -> Option<String>,
            ) -> Self {
                Self {
                    name: Identifier::new(name).unwrap(),
                    args,
                    parser,
                }
            }
        }
        lazy_static! {
            static ref MACRO_FUNCTIONS: [MacroFunction; 11] = [
                MacroFunction::new("!BLANKS", 1..=1, Expander::expand_blanks),
                MacroFunction::new("!CONCAT", 1..=usize::MAX, Expander::expand_concat),
                MacroFunction::new("!HEAD", 1..=1, Expander::expand_head),
                MacroFunction::new("!INDEX", 2..=2, Expander::expand_index),
                MacroFunction::new("!LENGTH", 1..=1, Expander::expand_length),
                MacroFunction::new("!QUOTE", 1..=1, Expander::expand_quote),
                MacroFunction::new("!SUBSTR", 2..=3, Expander::expand_substr),
                MacroFunction::new("!TAIL", 1..=1, Expander::expand_tail),
                MacroFunction::new("!UNQUOTE", 1..=1, Expander::expand_unquote),
                MacroFunction::new("!UPCASE", 1..=1, Expander::expand_upcase),
                MacroFunction::new("!EVAL", 1..=1, Expander::expand_eval),
            ];
        }

        let function = MACRO_FUNCTIONS.iter().find(|mf| &mf.name == name)?;

        let args = self.parse_function_args(&function.name, &mut input)?;

        // Enforce each function's arity, mapping the declared range to the
        // matching wrong-argument-count error.
        let n_args = args.len();
        if !function.args.contains(&n_args) {
            let name = function.name.clone();
            let error = match &function.args {
                x if x == &(1..=1) => MacroError::ExpectingOneArg { name, n_args },
                x if x == &(2..=2) => MacroError::ExpectingTwoArgs { name, n_args },
                x if x == &(2..=3) => MacroError::ExpectingTwoOrThreeArgs { name, n_args },
                x if x == &(1..=usize::MAX) => MacroError::ExpectingOneOrMoreArgs { name },
                _ => unreachable!(),
            };
            (self.error)(error);
            return None;
        }

        // Success: commit the consumed tokens and run the function.
        *orig_input = input;
        (function.parser)(self, args)
    }
+
    /// Parses one function argument from `input`. Each argument to a macro
    /// function is one of:
    ///
    /// - A quoted string or other single literal token.
    ///
    /// - An argument to the macro being expanded, e.g. `!1` or a named
    ///   argument.
    ///
    /// - `!*`.
    ///
    /// - A function invocation.
    ///
    /// Each function invocation yields a character sequence to be turned into a
    /// sequence of tokens. The case where that character sequence is a single
    /// quoted string is an important special case.
    fn parse_function_arg(&mut self, input: &mut MacroTokens) -> Option<String> {
        if let Some(macro_) = self.macro_ {
            match &input.0.get(0)?.token {
                Token::Id(id) if id.0.starts_with('!') => {
                    // A parameter of the macro being expanded: expand to the
                    // corresponding argument's syntax.
                    if let Some(param_idx) = macro_.find_parameter(id) {
                        input.advance();
                        return Some(
                            macro_tokens_to_syntax(self.args.unwrap()[param_idx].as_ref().unwrap())
                                .flatten()
                                .collect(),
                        );
                    }
                    // A variable created by `!DO` or `!LET`.
                    if let Some(value) = self.vars.borrow().get(id) {
                        return Some(value.clone());
                    }

                    // Otherwise, try it as a nested macro function call.
                    if let Some(output) = self.expand_macro_function(input) {
                        return Some(output);
                    }
                }
                Token::Punct(Punct::BangAsterisk) => {
                    // `!*`: all the positional arguments, space-separated.
                    let mut arg = String::new();
                    for i in 0..macro_.parameters.len() {
                        // Positional parameters precede keyword parameters,
                        // so stop at the first non-positional one.
                        if !macro_.parameters[i].is_positional() {
                            break;
                        }
                        if i > 0 {
                            arg.push(' ')
                        }
                        arg.extend(
                            macro_tokens_to_syntax(self.args.unwrap()[i].as_ref().unwrap())
                                .flatten(),
                        );
                    }
                    input.advance();
                    return Some(arg);
                }
                _ => (),
            }
        }
        // Fallback: a single literal token, taken as its syntax.
        Some(input.advance().syntax.clone())
    }
+
+ fn evaluate_literal(&mut self, input: &mut MacroTokens) -> Option<String> {
+ if input.match_("(") {
+ let value = self.evaluate_or(input)?;
+ if input.match_(")") {
+ Some(value)
+ } else {
+ (self.error)(MacroError::ExpectingRParen);
+ None
+ }
+ } else if input.match_(")") {
+ (self.error)(MacroError::ExpectingLiteral);
+ None
+ } else {
+ Some(unquote_string(self.parse_function_arg(input)?, self.mode))
+ }
+ }
+
+ fn evaluate_relational(&mut self, input: &mut MacroTokens) -> Option<String> {
+ let lhs = self.evaluate_literal(input)?;
+ let Some(relop) = input.take_relop() else {
+ return Some(lhs);
+ };
+ let rhs = self.evaluate_literal(input)?;
+ let cmp = unquote_string(lhs, self.mode).cmp(&unquote_string(rhs, self.mode));
+ Some(bool_to_string(relop.evaluate(cmp)))
+ }
+
+ fn evaluate_not(&mut self, input: &mut MacroTokens) -> Option<String> {
+ let mut negations = 0;
+ while input.match_("!AND") || input.match_("&") {
+ negations += 1;
+ }
+
+ let operand = self.evaluate_relational(input)?;
+ if negations == 0 {
+ return Some(operand);
+ }
+
+ let mut b = operand != "0";
+ if negations.is_odd() {
+ b = !b;
+ }
+ Some(bool_to_string(b))
+ }
+
+ fn evaluate_and(&mut self, input: &mut MacroTokens) -> Option<String> {
+ let mut lhs = self.evaluate_not(input)?;
+ while input.match_("!AND") || input.match_("&") {
+ let rhs = self.evaluate_not(input)?;
+ lhs = bool_to_string(lhs != "0" && rhs != "0");
+ }
+ Some(lhs)
+ }
+ fn evaluate_or(&mut self, input: &mut MacroTokens) -> Option<String> {
+ let mut lhs = self.evaluate_and(input)?;
+ while input.match_("!OR") || input.match_("|") {
+ let rhs = self.evaluate_and(input)?;
+ lhs = bool_to_string(lhs != "0" || rhs != "0");
+ }
+ Some(lhs)
+ }
+
    /// Evaluates a complete macro expression.  `!OR` binds loosest, so an
    /// expression is a disjunction.
    fn evaluate_expression(&mut self, input: &mut MacroTokens) -> Option<String> {
        self.evaluate_or(input)
    }
+
+ fn evaluate_number(&mut self, input: &mut MacroTokens) -> Option<f64> {
+ let s = self.evaluate_expression(input)?;
+ let tokens = tokenize_string(&s, self.mode, self.error);
+ let (
+ Some(MacroToken {
+ token: Token::Number(number),
+ ..
+ }),
+ 1,
+ ) = (tokens.get(0), tokens.len())
+ else {
+ (self.error)(MacroError::BadNumericMacroExpression(s));
+ return None;
+ };
+
+ Some(*number)
+ }
+
+ fn find_ifend_clause<'b>(
+ input: &mut MacroTokens<'b>,
+ ) -> Option<(MacroTokens<'b>, IfEndClause)> {
+ let input_copy = input.clone();
+ let mut nesting = 0;
+ while !input.is_empty() {
+ if input.match_("!IF") {
+ nesting += 1;
+ } else if input.match_("!IFEND") {
+ if nesting == 0 {
+ return Some((
+ MacroTokens(&input_copy.0[..input_copy.0.len() - input.0.len() - 1]),
+ IfEndClause::IfEnd,
+ ));
+ }
+ nesting -= 1;
+ } else if input.match_("!ELSE") && nesting == 0 {
+ return Some((
+ MacroTokens(&input_copy.0[..input_copy.0.len() - input.0.len() - 1]),
+ IfEndClause::Else,
+ ));
+ } else {
+ input.advance();
+ }
+ }
+ return None;
+ }
    /// Expands `!IF expr !THEN ... [!ELSE ...] !IFEND`, appending the chosen
    /// branch's expansion to `output`.  Returns true if the construct was
    /// recognized and consumed; on a malformed construct, reports an error
    /// and returns false, leaving `orig_input` unchanged.
    fn expand_if(&mut self, orig_input: &mut MacroTokens, output: &mut Vec<MacroToken>) -> bool {
        let mut input = orig_input.clone();
        if !input.match_("!IF") {
            return false;
        }
        let Some(result) = self.evaluate_expression(&mut input) else {
            return false;
        };
        if !input.match_("!THEN") {
            (self.error)(MacroError::ExpectingThen);
            return false;
        }

        // Collect the !THEN branch, up to !ELSE or !IFEND.
        let Some((if_tokens, clause)) = Self::find_ifend_clause(&mut input) else {
            (self.error)(MacroError::ExpectingElseOrIfEnd);
            return false;
        };

        // If there was an !ELSE, collect its branch, which must end in
        // !IFEND (the refutable let-else pattern enforces that).
        let else_tokens = match clause {
            IfEndClause::Else => {
                let Some((else_tokens, IfEndClause::IfEnd)) = Self::find_ifend_clause(&mut input)
                else {
                    (self.error)(MacroError::ExpectingIfEnd);
                    return false;
                };
                Some(else_tokens)
            }
            IfEndClause::IfEnd => None,
        };

        // "0" is false, anything else is true.
        let subinput = match result.as_str() {
            "0" => else_tokens,
            _ => Some(if_tokens),
        };
        if let Some(mut subinput) = subinput {
            // Expand the chosen branch inside an `!IF` stack frame for
            // error reporting.
            self.stack.push(Frame {
                name: Some(Identifier::new("!IF").unwrap()),
                location: None,
            });
            self.expand(&mut subinput, output);
            self.stack.pop();
        }
        *orig_input = input;
        true
    }
+
+ fn take_macro_var_name(
+ &mut self,
+ input: &mut MacroTokens,
+ construct: &'static str,
+ ) -> Option<Identifier> {
+ let Some(var_name) = input.take_macro_id() else {
+ (self.error)(MacroError::ExpectingMacroVarName(construct));
+ return None;
+ };
+ if is_macro_keyword(var_name)
+ || self
+ .macro_
+ .map(|m| m.find_parameter(var_name))
+ .flatten()
+ .is_some()
+ {
+ (self.error)(MacroError::BadMacroVarName {
+ name: var_name.clone(),
+ construct,
+ });
+ None
+ } else {
+ Some(var_name.clone())
+ }
+ }
+
    /// Expands `!LET !var = expression`, storing the value in `vars`.
    /// Returns true if `orig_input` began with `!LET`; on a malformed
    /// construct, reports an error and returns false without consuming
    /// anything from `orig_input`.
    fn expand_let(&mut self, orig_input: &mut MacroTokens) -> bool {
        let mut input = orig_input.clone();
        if !input.match_("!LET") {
            return false;
        }

        let Some(var_name) = self.take_macro_var_name(&mut input, "!LET") else {
            return false;
        };
        // NOTE(review): `expand_do` calls `take_macro_var_name` with no
        // following `advance()`; one of the two call sites looks wrong.
        // Confirm whether `take_macro_id` consumes the name token.
        input.advance();

        if !input.match_("=") {
            (self.error)(MacroError::ExpectingEquals);
            return false;
        }

        // The right-hand side is a full macro expression.
        let Some(value) = self.evaluate_expression(&mut input) else {
            return false;
        };
        self.vars.borrow_mut().insert(var_name.clone(), value);
        // Commit the consumed tokens only on success.
        *orig_input = input;
        true
    }
+
+ fn find_doend<'b>(&mut self, input: &mut MacroTokens<'b>) -> Option<MacroTokens<'b>> {
+ let input_copy = input.clone();
+ let mut nesting = 0;
+ while !input.is_empty() {
+ if input.match_("!DO") {
+ nesting += 1;
+ } else if input.match_("!DOEND") {
+ if nesting == 0 {
+ return Some(MacroTokens(
+ &input_copy.0[..input_copy.0.len() - input.0.len() - 1],
+ ));
+ }
+ nesting -= 1;
+ } else {
+ input.advance();
+ }
+ }
+ (self.error)(MacroError::MissingDoEnd);
+ return None;
+ }
+
+ fn expand_do(&mut self, orig_input: &mut MacroTokens, output: &mut Vec<MacroToken>) -> bool {
+ let mut input = orig_input.clone();
+ if !input.match_("!DO") {
+ return false;
+ }
+
+ let Some(var_name) = self.take_macro_var_name(&mut input, "!DO") else {
+ return false;
+ };
+
+ let (items, miterate_error) = if input.match_("!IN") {
+ let Some(list) = self.evaluate_expression(&mut input) else {
+ return false;
+ };
+ let items = tokenize_string(list.as_str(), self.mode, &self.error);
+ (
+ DoInput::from_list(items),
+ MacroError::MiterateList(Settings::global().macros.max_iterations),
+ )
+ } else if input.match_("=") {
+ let Some(first) = self.evaluate_number(&mut input) else {
+ return false;
+ };
+ if !input.match_("!TO") {
+ (self.error)(MacroError::ExpectingTo);
+ return false;
+ }
+ let Some(last) = self.evaluate_number(&mut input) else {
+ return false;
+ };
+ let by = if input.match_("!BY") {
+ let Some(by) = self.evaluate_number(&mut input) else {
+ return false;
+ };
+ if by == 0.0 {
+ (self.error)(MacroError::ZeroBy);
+ return false;
+ }
+ by
+ } else {
+ 1.0
+ };
+ (
+ DoInput::from_by(first, last, by),
+ MacroError::MiterateNumeric(Settings::global().macros.max_iterations),
+ )
+ } else {
+ (self.error)(MacroError::ExpectingEqualsOrIn);
+ return false;
+ };
+
+ let Some(body) = self.find_doend(&mut input) else {
+ return false;
+ };
+
+ let mut stack = take(&mut self.stack);
+ stack.push(Frame {
+ name: Some(Identifier::new("!DO").unwrap()),
+ location: None,
+ });
+ let mut break_ = false;
+ let mut subexpander = Expander {
+ break_: Some(&mut break_),
+ stack,
+ vars: self.vars,
+ ..*self
+ };
+
+ for (i, item) in items.enumerate() {
+ if subexpander.should_break() {
+ break;
+ }
+ if i >= Settings::global().macros.max_iterations {
+ (self.error)(miterate_error);
+ break;
+ }
+ let mut vars = self.vars.borrow_mut();
+ if let Some(value) = vars.get_mut(&var_name) {
+ *value = item;
+ } else {
+ vars.insert(var_name.clone(), item);
+ }
+ subexpander.expand(&mut body.clone(), output);
+ }
+ *orig_input = input;
+ true
+ }
+
    /// Expands the leading token or construct of `input`, appending the
    /// expansion to `output` and advancing `input` past whatever was
    /// consumed.  `input` must not be empty (token 0 is indexed directly).
    fn expand__(&mut self, input: &mut MacroTokens, output: &mut Vec<MacroToken>) {
        // Recursive macro calls.
        if self.may_expand() {
            if let Some(call) = Call::for_tokens(self.macros, &input.0, &self.error) {
                // Expand the called macro in a child expander with one less
                // level of allowed nesting and a fresh variable scope.
                let vars = RefCell::new(BTreeMap::new());
                let mut stack = take(&mut self.stack);
                stack.push(Frame {
                    name: Some(call.0.macro_.name.clone()),
                    location: Some(call.0.macro_.location.clone()),
                });
                let mut subexpander = Expander {
                    break_: None,
                    vars: &vars,
                    nesting_countdown: self.nesting_countdown.saturating_sub(1),
                    stack,
                    ..*self
                };
                let mut body = MacroTokens(call.0.macro_.body.as_slice());
                subexpander.expand(&mut body, output);
                // Take the stack back and drop the frame pushed above.
                self.stack = subexpander.stack;
                self.stack.pop();
                input.0 = &input.0[call.len()..];
                return;
            }
        }

        // Only identifiers beginning with `!` receive further processing.
        let id = match &input.0[0].token {
            Token::Id(id) if id.0.starts_with('!') => id,
            Token::Punct(Punct::BangAsterisk) => {
                // `!*` expands the macro's arguments in order.
                // NOTE(review): unlike the `!*` handling in
                // `parse_function_arg`, this does not stop at the first
                // non-positional parameter — confirm which is intended.
                if let Some(macro_) = self.macro_ {
                    for i in 0..macro_.parameters.len() {
                        self.expand_arg(i, output);
                    }
                } else {
                    (self.error)(MacroError::InvalidBangAsterisk);
                }
                input.advance();
                return;
            }
            _ => {
                // Ordinary token: pass it through unchanged.
                output.push(input.advance().clone());
                return;
            }
        };

        // Macro arguments.
        if let Some(macro_) = self.macro_ {
            if let Some(param_idx) = macro_.find_parameter(id) {
                self.expand_arg(param_idx, output);
                input.advance();
                return;
            }
        }

        // Variables set by `!DO` or `!LET`.
        if let Some(value) = self.vars.borrow().get(id) {
            tokenize_string_into(value.as_str(), self.mode, &self.error, output);
            input.advance();
            return;
        }

        // Macro constructs: `!IF`, `!LET`, `!DO`.
        if self.expand_if(input, output) {
            return;
        }
        if self.expand_let(input) {
            return;
        }
        if self.expand_do(input, output) {
            return;
        }

        // `!BREAK` terminates the innermost `!DO` loop.
        if input.match_("!BREAK") {
            if let Some(ref mut break_) = self.break_ {
                **break_ = true;
            } else {
                (self.error)(MacroError::BreakOutsideDo);
            }
            return;
        }

        // `!ONEXPAND`/`!OFFEXPAND` toggle macro expansion; anything else
        // passes through unchanged.
        if input.match_("!ONEXPAND") {
            *self.expand.borrow_mut() = true;
        } else if input.match_("!OFFEXPAND") {
            *self.expand.borrow_mut() = false;
        } else {
            output.push(input.advance().clone());
        }
    }
+}
+
/// A parsed macro call, ready to be expanded with [Call::expand].
pub struct Call<'a>(Parser<'a>);
+
+impl<'a> Call<'a> {
+ pub fn for_tokens<F>(macros: &'a MacroSet, tokens: &[MacroToken], error: &F) -> Option<Self>
+ where
+ F: Fn(MacroError),
+ {
+ let mut parser = Parser::new(macros, &tokens.get(0)?.token)?;
+ for token in tokens[1..].iter().chain(&[MacroToken {
+ token: Token::EndCommand,
+ syntax: String::from(""),
+ }]) {
+ if parser.push(&token.token, &token.syntax, error) == ParseStatus::Complete {
+ return Some(parser.finish());
+ }
+ }
+ return None;
+ }
+
+ pub fn expand<F>(&self, mode: Mode, call_loc: Location, output: &mut Vec<MacroToken>, error: F)
+ where
+ F: Fn(MacroError) + 'a,
+ {
+ let error: Box<dyn Fn(MacroError) + 'a> = Box::new(error);
+ let vars = RefCell::new(BTreeMap::new());
+ let expand = RefCell::new(true);
+ let mut me = Expander {
+ macros: self.0.macros,
+ error: &error,
+ macro_: Some(self.0.macro_),
+ args: Some(&self.0.args),
+ mode,
+ nesting_countdown: Settings::global().macros.max_nest,
+ stack: vec![
+ Frame {
+ name: None,
+ location: Some(call_loc),
+ },
+ Frame {
+ name: Some(self.0.macro_.name.clone()),
+ location: Some(self.0.macro_.location.clone()),
+ },
+ ],
+ vars: &vars,
+ break_: None,
+ expand: &expand,
+ };
+ let mut body = MacroTokens(&self.0.macro_.body);
+ me.expand(&mut body, output);
+ }
+
+ /// Returns the number of tokens consumed from the input for the macro
+ /// invocation. If the result is 0, then there was no macro invocation and
+ /// the expansion will be empty.
+ pub fn len(&self) -> usize {
+ self.0.n_tokens
+ }
+}
--- /dev/null
+/* PSPP - a program for statistical analysis.
+ * Copyright (C) 2023 Free Software Foundation, Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>. */
+
+use anyhow::Result;
+use clap::{Parser, ValueEnum};
+use encoding_rs::Encoding;
+use pspp::raw::{encoding_from_headers, Decoder, Magic, Reader, Record};
+use std::fs::File;
+use std::io::BufReader;
+use std::path::{Path, PathBuf};
+use std::str;
+use thiserror::Error as ThisError;
+
/// A utility to dissect SPSS system files.
// Field-level `//` comments are deliberate: clap turns `///` doc comments
// into --help text, so adding them would change runtime output.
#[derive(Parser, Debug)]
#[command(author, version, about, long_about = None)]
struct Args {
    /// Maximum number of cases to print.
    // Exposed on the command line as `--data`; the default of 0 prints no
    // case data.
    #[arg(long = "data", default_value_t = 0)]
    max_cases: u64,

    /// Files to dissect.
    #[arg(required = true)]
    files: Vec<PathBuf>,

    /// How to dissect the file.
    #[arg(short, long, value_enum, default_value_t)]
    mode: Mode,

    /// The encoding to use.
    // When omitted, `dissect` derives the encoding from the file's headers.
    #[arg(long, value_parser = parse_encoding)]
    encoding: Option<&'static Encoding>,
}
+
/// Error for an encoding label that `encoding_rs` does not recognize.
#[derive(ThisError, Debug)]
#[error("{0}: unknown encoding")]
struct UnknownEncodingError(String);
+
+fn parse_encoding(arg: &str) -> Result<&'static Encoding, UnknownEncodingError> {
+ match Encoding::for_label_no_replacement(arg.as_bytes()) {
+ Some(encoding) => Ok(encoding),
+ None => Err(UnknownEncodingError(arg.to_string())),
+ }
+}
+
/// What to print about each input file.
// Variant comments use `//` because clap's ValueEnum turns `///` doc
// comments into --help text.
#[derive(Clone, Copy, Debug, Default, ValueEnum)]
enum Mode {
    // Print only the file variant identified by the header's magic number.
    Identify,
    // Dump raw records, without character decoding.
    Raw,
    // Dump records after decoding with the detected (or given) encoding.
    Decoded,
    // Fully interpreted output (currently a stub; see `dissect`).
    #[default]
    Cooked,
}
+
+fn main() -> Result<()> {
+ let Args {
+ max_cases,
+ files,
+ mode,
+ encoding,
+ } = Args::parse();
+
+ for file in files {
+ dissect(&file, max_cases, mode, encoding)?;
+ }
+ Ok(())
+}
+
/// Dissects one system file, printing the result to stdout per `mode`.
///
/// `max_cases` bounds how many cases of data are printed (where case
/// printing is implemented); `encoding`, if given, overrides the encoding
/// detected from the file's headers.
fn dissect(
    file_name: &Path,
    max_cases: u64,
    mode: Mode,
    encoding: Option<&'static Encoding>,
) -> Result<()> {
    let reader = File::open(file_name)?;
    let reader = BufReader::new(reader);
    // Raw record reader; warnings are printed as they are encountered.
    let mut reader = Reader::new(reader, |warning| println!("{warning}"))?;

    match mode {
        Mode::Identify => {
            // The first record is always the header; its magic number
            // identifies the file variant.
            let Record::Header(header) = reader.next().unwrap()? else {
                unreachable!()
            };
            match header.magic {
                Magic::Sav => println!("SPSS System File"),
                Magic::Zsav => println!("SPSS System File with Zlib compression"),
                Magic::Ebcdic => println!("EBCDIC-encoded SPSS System File"),
            }
            return Ok(());
        }
        Mode::Raw => {
            // Dump records as read, without character decoding; print up to
            // `max_cases` cases of raw data.
            for header in reader {
                let header = header?;
                println!("{:?}", header);
                if let Record::Cases(cases) = header {
                    let mut cases = cases.borrow_mut();
                    for _ in 0..max_cases {
                        let Some(Ok(record)) = cases.next() else {
                            break;
                        };
                        println!("{:?}", record);
                    }
                }
            }
        }
        Mode::Decoded => {
            // Read every record first, then decode them all using the
            // detected (or user-supplied) character encoding.
            let headers: Vec<Record> = reader.collect::<Result<Vec<_>, _>>()?;
            let encoding = match encoding {
                Some(encoding) => encoding,
                None => encoding_from_headers(&headers, &|e| eprintln!("{e}"))?,
            };
            let decoder = Decoder::new(encoding, |e| eprintln!("{e}"));
            for header in headers {
                let header = header.decode(&decoder);
                println!("{:?}", header);
                // Case printing for decoded records is not implemented yet.
                /*
                if let Record::Cases(cases) = header {
                    let mut cases = cases.borrow_mut();
                    for _ in 0..max_cases {
                        let Some(Ok(record)) = cases.next() else {
                            break;
                        };
                        println!("{:?}", record);
                    }
                }
                */
            }
        }
        Mode::Cooked => {
            // Fully interpreted output is not implemented yet.
            /*
            let headers: Vec<Record> = reader.collect::<Result<Vec<_>, _>>()?;
            let encoding = encoding_from_headers(&headers, &|e| eprintln!("{e}"))?;
            let (headers, _) = decode(headers, encoding, &|e| eprintln!("{e}"))?;
            for header in headers {
                println!("{header:?}");
            }
            */
        }
    }

    Ok(())
}
--- /dev/null
+use std::{
+ cmp::{max, min},
+ fmt::{Display, Formatter, Result as FmtResult},
+ ops::Range,
+ sync::Arc,
+};
+
+use enum_map::Enum;
+use unicode_width::UnicodeWidthStr;
+
/// A line number and optional column number within a source file.
///
/// Ordering is derived, so points compare first by line, then by column.
#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub struct Point {
    /// 1-based line number.
    pub line: i32,

    /// 1-based column number.
    ///
    /// Column numbers are measured according to the width of characters as
    /// shown in a typical fixed-width font, in which CJK characters have width
    /// 2 and combining characters have width 0, as measured by the
    /// `unicode_width` crate.
    pub column: Option<i32>,
}
+
+impl Point {
+ /// Takes `point`, adds to it the syntax in `syntax`, incrementing the line
+ /// number for each new-line in `syntax` and the column number for each
+ /// column, and returns the result.
+ pub fn advance(&self, syntax: &str) -> Self {
+ let mut result = *self;
+ for line in syntax.split_inclusive('\n') {
+ if line.ends_with('\n') {
+ result.line += 1;
+ result.column = Some(1);
+ } else {
+ result.column = result.column.map(|column| column + line.width() as i32);
+ }
+ }
+ result
+ }
+
+ pub fn without_column(&self) -> Self {
+ Self {
+ line: self.line,
+ column: None,
+ }
+ }
+}
+
/// Location relevant to a diagnostic message.
#[derive(Clone, Debug)]
pub struct Location {
    /// File name, if any.
    pub file_name: Option<Arc<String>>,

    /// Starting and ending point, if any.
    pub span: Option<Range<Point>>,

    /// Normally, if `span` contains column information, then displaying the
    /// message will underline the location. Setting this to true disables
    /// displaying underlines.
    pub omit_underlines: bool,
}
+
impl Display for Location {
    /// Formats the location compiler-style: `file`, `file:l1[-l2]`, or
    /// `file:l1.c1-[l2.]c2` depending on what information is present.
    fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult {
        if let Some(file_name) = &self.file_name {
            write!(f, "{}", file_name)?;
        }

        if let Some(span) = &self.span {
            if self.file_name.is_some() {
                write!(f, ":")?;
            }
            let l1 = span.start.line;
            let l2 = span.end.line;
            if let (Some(c1), Some(c2)) = (span.start.column, span.end.column) {
                // `c2 - 1`: apparently the end column is exclusive and is
                // displayed as the last included column — confirm against
                // the producers of `Point` spans.
                if l2 > l1 {
                    write!(f, "{l1}.{c1}-{l2}.{}", c2 - 1)?;
                } else {
                    write!(f, "{l1}.{c1}-{}", c2 - 1)?;
                }
            } else {
                if l2 > l1 {
                    write!(f, "{l1}-{l2}")?;
                } else {
                    write!(f, "{l1}")?;
                }
            }
        }
        Ok(())
    }
}
+
+impl Location {
+ pub fn without_columns(&self) -> Self {
+ Self {
+ file_name: self.file_name.clone(),
+ span: self
+ .span
+ .as_ref()
+ .map(|span| span.start.without_column()..span.end.without_column()),
+ omit_underlines: self.omit_underlines,
+ }
+ }
+ pub fn merge(a: Option<Self>, b: &Option<Self>) -> Option<Self> {
+ let Some(a) = a else { return b.clone() };
+ let Some(b) = b else { return Some(a) };
+ if a.file_name != b.file_name {
+ // Failure.
+ return Some(a);
+ }
+ let span = match (&a.span, &b.span) {
+ (None, None) => None,
+ (Some(r), None) | (None, Some(r)) => Some(r.clone()),
+ (Some(ar), Some(br)) => {
+ Some(min(ar.start, br.start).clone()..max(ar.end, br.end).clone())
+ }
+ };
+ Some(Self {
+ file_name: a.file_name,
+ span,
+ omit_underlines: a.omit_underlines || b.omit_underlines,
+ })
+ }
+ pub fn is_empty(&self) -> bool {
+ self.file_name.is_none() && self.span.is_none()
+ }
+}
+
/// Severity of a [Diagnostic].
#[derive(Copy, Clone, Debug, PartialEq, Eq, Enum)]
pub enum Severity {
    Error,
    Warning,
    Note,
}
+
impl Severity {
    /// Returns the lowercase keyword used when displaying this severity.
    fn as_str(&self) -> &'static str {
        match self {
            Severity::Error => "error",
            Severity::Warning => "warning",
            Severity::Note => "note",
        }
    }
}
+
+impl Display for Severity {
+ fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult {
+ write!(f, "{}", self.as_str())
+ }
+}
+
/// Broad category of a [Diagnostic].
// `General` suppresses the location prefix and `Syntax` adds the command
// name when the diagnostic is displayed (see `Display for Diagnostic`).
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum Category {
    General,
    Syntax,
    Data,
}
+
/// One frame of context (e.g. a macro expansion) attached to a [Diagnostic].
pub struct Stack {
    /// Where this frame arose.
    location: Location,
    /// Human-readable description of the frame.
    description: String,
}
+
/// A diagnostic message ready to be displayed to the user.
pub struct Diagnostic {
    pub severity: Severity,
    pub category: Category,
    /// Primary location of the problem.
    pub location: Location,
    /// Source lines to quote, as `(line number, line text)` pairs.
    pub source: Vec<(i32, String)>,
    /// Outer context frames, e.g. macro expansions.
    pub stack: Vec<Stack>,
    /// Name of the command being parsed, if any.
    pub command_name: Option<&'static str>,
    /// The message text itself.
    pub text: String,
}
+
+impl Display for Diagnostic {
+ fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult {
+ for Stack {
+ location,
+ description,
+ } in &self.stack
+ {
+ if !!location.is_empty() {
+ write!(f, "{location}: ")?;
+ }
+ writeln!(f, "{description}")?;
+ }
+ if self.category != Category::General && !self.location.is_empty() {
+ write!(f, "{}: ", self.location)?;
+ }
+
+ write!(f, "{}: ", self.severity)?;
+
+ match self.command_name {
+ Some(command_name) if self.category == Category::Syntax => {
+ write!(f, "{command_name}: ")?
+ }
+ _ => (),
+ }
+
+ write!(f, "{}", self.text)?;
+
+ if let Some(Range {
+ start: Point {
+ line: l0,
+ column: Some(c0),
+ },
+ end: Point {
+ line: l1,
+ column: Some(c1),
+ },
+ }) = self.location.span
+ {
+ let mut prev_line_number = None;
+ for (line_number, line) in &self.source {
+ if let Some(prev_line_number) = prev_line_number {
+ if *line_number != prev_line_number + 1 {
+ write!(f, "\n ... |")?;
+ }
+ }
+ prev_line_number = Some(line_number);
+
+ write!(f, "\n{line_number:5} | {line}")?;
+
+ if !self.location.omit_underlines {
+ let c0 = if *line_number == l0 { c0 } else { 1 };
+ let c1 = if *line_number == l1 {
+ c1
+ } else {
+ line.width() as i32
+ };
+ write!(f, "\n |")?;
+ for _ in 0..c0 {
+ f.write_str(" ")?;
+ }
+ if *line_number == l0 {
+ f.write_str("^")?;
+ for _ in c0..c1 {
+ f.write_str("~")?;
+ }
+ } else {
+ for _ in c0..=c1 {
+ f.write_str("~")?;
+ }
+ }
+ }
+ }
+ }
+ Ok(())
+ }
+}
--- /dev/null
+use std::sync::Arc;
+
+use self::pivot::Value;
+
+pub mod pivot;
+
/// A single output item.
pub struct Item {
    /// The localized label for the item that appears in the outline pane in the
    /// output viewer and in PDF outlines. This is `None` if no label has been
    /// explicitly set.
    label: Option<String>,

    /// A locale-invariant identifier for the command that produced the output,
    /// which may be `None` if unknown or if a command did not produce this
    /// output.
    command_name: Option<String>,

    /// For a group item, this is true if the group's subtree should
    /// be expanded in an outline view, false otherwise.
    ///
    /// For other kinds of output items, this is true to show the item's
    /// content, false to hide it. The item's label is always shown in an
    /// outline view.
    show: bool,

    /// Item details.
    details: Details,
}
+
/// The kind-specific content of an [Item].
// NOTE(review): the unit variants carry no payload yet and appear to be
// placeholders to be fleshed out later.
pub enum Details {
    Chart,
    Image,
    /// A group of child items, e.g. all the output of a single command.
    Group(Vec<Arc<Item>>),
    Message,
    Table,
    /// A block of text.
    Text(Text),
}
+
/// A text output item.
pub struct Text {
    /// What kind of text this is.
    type_: TextType,

    /// The text content.
    content: Value,
}
+
/// The kind of text carried by a [Text] item.
pub enum TextType {
    /// `TITLE` and `SUBTITLE` commands.
    PageTitle,

    /// Title.
    Title,

    /// Syntax printback logging.
    Syntax,

    /// Other logging.
    Log,
}
--- /dev/null
+//! Pivot tables.
+//!
+//! Pivot tables are PSPP's primary form of output. They are analogous to the
+//! pivot tables you might be familiar with from spreadsheets and databases.
+//! See <https://en.wikipedia.org/wiki/Pivot_table> for a brief introduction to
+//! the overall concept of a pivot table.
+//!
+//! In PSPP, the most important internal pieces of a pivot table are:
+//!
+//! - Title. Every pivot table has a title that is displayed above it. It also
+//! has an optional caption (displayed below it) and corner text (displayed in
+//! the upper left corner).
+//!
+//! - Dimensions. A dimension consists of zero or more categories. A category
+//! has a label, such as "df" or "Asymp. Sig." or 123 or a variable name. The
+//! categories are the leaves of a tree whose non-leaf nodes form groups of
+//! categories. The tree always has a root group whose label is the name of
+//! the dimension.
+//!
+//! - Axes. A table has three axes: column, row, and layer. Each dimension is
+//! assigned to an axis, and each axis has zero or more dimensions. When an
+//! axis has more than one dimension, they are ordered from innermost to
+//! outermost.
+//!
+//! - Data. A table's data consists of zero or more cells. Each cell maps from
+//! a category for each dimension to a value, which is commonly a number but
+//! could also be a variable name or an arbitrary text string.
+//!
+//! Creating a pivot table usually consists of the following steps:
+//!
+//! 1. Create the table with pivot_table_create(), passing in the title.
+//!
+//! 2. Optionally, set the format to use for "count" values with
+//! pivot_table_set_weight_var() or pivot_table_set_weight_format().
+//!
+//! 3. Create each dimension with pivot_dimension_create() and populate it with
+//! categories and, possibly, with groups that contain the categories. This
+//! call also assigns the dimension to an axis.
+//!
+//! In simple cases, only a call to pivot_dimension_create() is needed.
+//! Other functions such as pivot_category_create_group() can be used for
+//! hierarchies of categories.
+//!
+//! Sometimes it's easier to create categories in tandem with inserting data,
+//! for example by adding a category for a variable just before inserting the
+//! first cell for that variable. In that case, creating categories and
+//! inserting data can be interleaved.
+//!
+//! 4. Insert data. For each cell, supply the category indexes, which are
+//! assigned starting from 0 in the order in which the categories were
+//! created in step 3, and the value to go in the cell. If the table has a
+//! small, fixed number of dimensions, functions like, e.g.
+//! pivot_table_put3() for 3 dimensions, can be used. The general function
+//! pivot_table_put() works for other cases.
+//!
+//! 5. Output the table for user consumption. Use pivot_table_submit().
+
+use std::{
+ collections::HashMap,
+ ops::Range,
+ sync::{Arc, OnceLock},
+};
+
+use chrono::NaiveDateTime;
+use enum_map::{enum_map, Enum, EnumMap};
+
+use crate::format::{Format, Settings as FormatSettings};
+
/// Areas of a pivot table for styling purposes.
#[derive(Copy, Clone, Debug, Enum, PartialEq, Eq)]
pub enum Area {
    Title,
    Caption,

    /// Footnotes.
    Footer,

    /// Top-left corner.
    Corner,

    ColumnLabels,
    RowLabels,
    Data,

    /// Layer indication.
    Layers,
}
+
/// Table borders for styling purposes.
#[derive(Debug, Enum)]
pub enum Border {
    Title,
    OuterFrame(BoxBorder),
    InnerFrame(BoxBorder),
    Dimensions(RowColBorder),
    Categories(RowColBorder),
    /// Border to the left of the data area.
    DataLeft,
    /// Border above the data area.
    DataTop,
}
+
/// The borders on a box: one variant per side.
#[derive(Debug, Enum)]
pub enum BoxBorder {
    Left,
    Top,
    Right,
    Bottom,
}
+
/// Borders between rows and columns.
// NOTE(review): presumably RowHorz/RowVert are the horizontal/vertical
// borders in row-label regions and ColHorz/ColVert in column-label regions —
// confirm with the rendering code.
#[derive(Debug, Enum, PartialEq, Eq)]
pub enum RowColBorder {
    RowHorz,
    RowVert,
    ColHorz,
    ColVert,
}
+
/// Sizing for rows or columns of a rendered table.
///
/// The comments below talk about columns and their widths but they apply
/// equally to rows and their heights.
#[derive(Default)]
pub struct Sizing {
    /// Specific column widths, in 1/96" units.
    widths: Vec<i32>,

    /// Specific page breaks: 0-based columns after which a page break must
    /// occur, e.g. a value of 1 requests a break after the second column.
    breaks: Vec<usize>,

    /// Keeps: columns to keep together on a page if possible.
    keeps: Vec<Range<usize>>,
}
+
/// The three axes of a pivot table: column (X), row (Y), and layer (Z).
#[derive(Enum)]
pub enum Axis3 {
    X,
    Y,
    Z,
}
+
/// An axis within a pivot table.
#[derive(Default)]
pub struct TableAxis {
    /// `dimensions[0]` is the innermost dimension.
    dimensions: Vec<Dimension>,

    /// The number of rows or columns along the axis, that is, the product of
    /// `dimensions[*].n_leaves`. It is 0 if any dimension has 0 leaves.
    extent: usize,

    /// Sum of `dimensions[*].label_depth`.
    label_depth: usize,
}
+
/// Dimensions.
///
/// A [Dimension] identifies the categories associated with a single dimension
/// within a multidimensional pivot table.
///
/// A dimension contains a collection of categories, which are the leaves in a
/// tree of groups.
///
/// (A dimension or a group can contain zero categories, but this is unusual.
/// If a dimension contains no categories, then its table cannot contain any
/// data.)
pub struct Dimension {
    /// The axis this dimension is assigned to.
    axis_type: Axis3,
    // NOTE(review): presumably the position of this dimension along its
    // axis, and the data-index offset of its first leaf — confirm.
    level: usize,

    top_index: usize,

    /// Hierarchy of categories within the dimension. The groups and categories
    /// are sorted in the order that should be used for display. This might be
    /// different from the original order produced for output if the user
    /// adjusted it.
    ///
    /// The root must always be a group, although it is allowed to have no
    /// subcategories.
    root: Group,

    /// All of the leaves reachable via the root.
    ///
    /// The indexing for `presentation_leaves` is presentation order, thus
    /// `presentation_leaves[i].presentation_index == i`. This order is the
    /// same as would be produced by an in-order traversal of the groups. It
    /// is the order into which the user reordered or sorted the categories.
    ///
    /// The indexing for `data_leaves` is that used for `idx` in [Cell], thus
    /// `data_leaves[i].data_index == i`. This might differ from what an
    /// in-order traversal of `root` would yield, if the user reordered
    /// categories.
    data_leaves: Vec<Arc<Leaf>>,
    presentation_leaves: Vec<Arc<Leaf>>,

    /// Display.
    hide_all_labels: bool,

    /// Number of rows or columns needed to express the labels.
    label_depth: usize,
}
+
/// A non-leaf node in a dimension's category tree.
pub struct Group {
    /// The group's label.
    name: Value,
    label_depth: usize,
    extra_depth: usize,

    /// The child categories.
    ///
    /// A group usually has multiple children, but it is allowed to have
    /// only one or even (pathologically) none.
    children: Vec<Category>,

    /// Display a label for the group itself?
    show_label: bool,

    show_label_in_corner: bool,
}
+
/// A leaf node (an actual category) in a dimension's category tree.
pub struct Leaf {
    /// The category's label.
    name: Value,
    label_depth: usize,
    extra_depth: usize,

    group_index: usize,
    data_index: usize,
    presentation_index: usize,

    /// Default format for values in this category.
    format: Format,

    /// Honor [Table]'s `small` setting?
    honor_small: bool,
}
+
+/// A pivot_category is a leaf (a category) or a group.
+pub enum Category {
+    /// A (possibly nested) group of categories.
+    Group(Arc<Group>),
+    /// A single category.
+    Leaf(Arc<Leaf>),
+}
+
+/// Accessors common to [Group], [Leaf], and [Category].
+trait CategoryTrait {
+    /// The name used to label the category.
+    fn name(&self) -> &Value;
+    /// The category's `label_depth` field.
+    fn label_depth(&self) -> usize;
+    /// The category's `extra_depth` field.
+    fn extra_depth(&self) -> usize;
+}
+
+// Trivial field accessors.
+impl CategoryTrait for Group {
+    fn name(&self) -> &Value {
+        &self.name
+    }
+
+    fn label_depth(&self) -> usize {
+        self.label_depth
+    }
+
+    fn extra_depth(&self) -> usize {
+        self.extra_depth
+    }
+}
+
+// Trivial field accessors.
+impl CategoryTrait for Leaf {
+    fn name(&self) -> &Value {
+        &self.name
+    }
+
+    fn label_depth(&self) -> usize {
+        self.label_depth
+    }
+
+    fn extra_depth(&self) -> usize {
+        self.extra_depth
+    }
+}
+
+// Delegates each accessor to the underlying group or leaf.
+impl CategoryTrait for Category {
+    fn name(&self) -> &Value {
+        match self {
+            Category::Group(group) => group.name(),
+            Category::Leaf(leaf) => leaf.name(),
+        }
+    }
+
+    fn label_depth(&self) -> usize {
+        match self {
+            Category::Group(group) => group.label_depth(),
+            Category::Leaf(leaf) => leaf.label_depth(),
+        }
+    }
+
+    fn extra_depth(&self) -> usize {
+        match self {
+            Category::Group(group) => group.extra_depth(),
+            Category::Leaf(leaf) => leaf.extra_depth(),
+        }
+    }
+}
+
+/// Styling for a pivot table.
+///
+/// The division between this and the style information in [Table] seems fairly
+/// arbitrary. The ultimate reason for the division is simply because that's
+/// how SPSS documentation and file formats do it.
+struct Look {
+    /// The look's name, if it has one.
+    name: Option<String>,
+
+    /// Omit empty rows and columns?
+    omit_empty: bool,
+    /// Display row labels in the table corner?
+    row_labels_in_corner: bool,
+
+    /// Range of column widths for columns in the row headings and corner, in
+    /// 1/96" units.
+    row_heading_widths: Range<usize>,
+
+    /// Range of column widths for columns in the column headings, in 1/96"
+    /// units.
+    col_heading_widths: Range<usize>,
+
+    /// Kind of markers to use for footnotes.
+    footnote_marker_type: FootnoteMarkerType,
+
+    /// Where to put the footnote markers.
+    footnote_marker_position: FootnoteMarkerPosition,
+
+    /// Styles for areas of the pivot table.
+    areas: EnumMap<Area, AreaStyle>,
+
+    /// Styles for borders in the pivot table.
+    borders: EnumMap<Border, BorderStyle>,
+
+    /// Print all of the layers rather than only the current one?
+    print_all_layers: bool,
+
+    /// Put each layer on a separate page when printing?
+    paginate_layers: bool,
+
+    /// Shrink the table to fit, per axis?
+    shrink_to_fit: EnumMap<Axis2, bool>,
+
+    /// Display continuation text at the top of continued pages?
+    top_continuation: bool,
+
+    /// Display continuation text at the bottom of continued pages?
+    bottom_continuation: bool,
+
+    /// The continuation text itself, if any.
+    continuation: Option<String>,
+
+    // NOTE(review): presumably the minimum number of lines to keep together
+    // when breaking a table across pages — confirm.
+    n_orphan_lines: usize,
+}
+
+impl Default for Look {
+    /// Returns a [Look] with conventional defaults: empty rows and columns
+    /// omitted, alphabetic subscript footnote markers, 9-point "Sans Serif"
+    /// text (bold for the title), and visible strokes only around the frame,
+    /// the data edges, and dimension/category separators.
+    fn default() -> Self {
+        Self {
+            name: None,
+            omit_empty: true,
+            row_labels_in_corner: true,
+            row_heading_widths: 36..72,
+            col_heading_widths: 36..120,
+            footnote_marker_type: FootnoteMarkerType::Alphabetic,
+            footnote_marker_position: FootnoteMarkerPosition::Subscript,
+            areas: EnumMap::from_fn(|area| {
+                use HorzAlign::*;
+                use VertAlign::*;
+                // Alignment and margins ([left, right] and [top, bottom], in
+                // 1/96" units) vary by area; the fonts are otherwise uniform.
+                let (halign, valign, hmargins, vmargins) = match area {
+                    Area::Title => (Center, Middle, [8, 11], [1, 8]),
+                    Area::Caption => (Left, Top, [8, 11], [1, 1]),
+                    Area::Footer => (Left, Top, [11, 8], [2, 3]),
+                    Area::Corner => (Left, Bottom, [8, 11], [1, 1]),
+                    Area::ColumnLabels => (Left, Top, [8, 11], [1, 3]),
+                    Area::RowLabels => (Left, Top, [8, 11], [1, 3]),
+                    Area::Data => (Mixed, Top, [8, 11], [1, 1]),
+                    Area::Layers => (Left, Bottom, [8, 11], [1, 3]),
+                };
+                AreaStyle {
+                    cell_style: CellStyle {
+                        horz_align: halign,
+                        vert_align: valign,
+                        margins: enum_map! { Axis2::X => hmargins, Axis2::Y => vmargins },
+                    },
+                    font_style: FontStyle {
+                        bold: area == Area::Title,
+                        italic: false,
+                        underline: false,
+                        markup: false,
+                        font: String::from("Sans Serif"),
+                        fg: [Color::BLACK; 2],
+                        bg: [Color::WHITE; 2],
+                        size: 9,
+                    },
+                }
+            }),
+            borders: EnumMap::from_fn(|border| {
+                // Only the inner frame, the edges of the data area, and the
+                // dimension/category separators get visible strokes.
+                let stroke = match border {
+                    Border::InnerFrame(_) | Border::DataLeft | Border::DataTop => Stroke::Thick,
+                    Border::Dimensions(side) if side != RowColBorder::RowVert => Stroke::Solid,
+                    Border::Categories(RowColBorder::ColHorz | RowColBorder::ColVert) => {
+                        Stroke::Solid
+                    }
+                    _ => Stroke::None,
+                };
+                BorderStyle {
+                    stroke,
+                    color: Color::BLACK,
+                }
+            }),
+            print_all_layers: false,
+            paginate_layers: false,
+            shrink_to_fit: EnumMap::from_fn(|_| false),
+            top_continuation: false,
+            bottom_continuation: false,
+            continuation: None,
+            n_orphan_lines: 0,
+        }
+    }
+}
+
+impl Look {
+    /// Returns a shared reference to a lazily-created, cached default [Look],
+    /// avoiding constructing a fresh one for every table.
+    fn shared_default() -> Arc<Look> {
+        static LOOK: OnceLock<Arc<Look>> = OnceLock::new();
+        LOOK.get_or_init(|| Arc::new(Look::default())).clone()
+    }
+}
+
+/// Style for one area of a pivot table.
+pub struct AreaStyle {
+    /// Alignment and margins.
+    cell_style: CellStyle,
+    /// Font and colors.
+    font_style: FontStyle,
+}
+
+/// Alignment and margins for table cells.
+pub struct CellStyle {
+    /// Horizontal alignment.
+    horz_align: HorzAlign,
+    /// Vertical alignment.
+    vert_align: VertAlign,
+
+    /// Margins in 1/96" units.
+    ///
+    /// `margins[Axis2::X][0]` is the left margin.
+    /// `margins[Axis2::X][1]` is the right margin.
+    /// `margins[Axis2::Y][0]` is the top margin.
+    /// `margins[Axis2::Y][1]` is the bottom margin.
+    margins: EnumMap<Axis2, [i32; 2]>,
+}
+
+/// Horizontal alignment of content within a cell.
+pub enum HorzAlign {
+    /// Right aligned.
+    Right,
+
+    /// Left aligned.
+    Left,
+
+    /// Centered.
+    Center,
+
+    /// Align strings to the left, other formats to the right.
+    Mixed,
+
+    /// Align the decimal point at the specified position.
+    Decimal {
+        /// Decimal offset from the right side of the cell, in 1/96" units.
+        offset: f64,
+
+        /// Decimal character: either `b'.'` or `b','`.
+        c: char,
+    },
+}
+
+/// Vertical alignment of content within a cell.
+pub enum VertAlign {
+    /// Top alignment.
+    Top,
+
+    /// Centered.
+    Middle,
+
+    /// Bottom alignment.
+    Bottom,
+}
+
+/// Font styling for text in a table area.
+pub struct FontStyle {
+    bold: bool,
+    italic: bool,
+    underline: bool,
+    // NOTE(review): presumably whether the text is interpreted as markup —
+    // confirm.
+    markup: bool,
+    /// Font family name.
+    font: String,
+    // NOTE(review): two foreground and two background colors — possibly for
+    // alternating rows; confirm.
+    fg: [Color; 2],
+    bg: [Color; 2],
+
+    /// In 1/72" units.
+    size: i32,
+}
+
+/// An RGB color with an alpha channel.
+pub struct Color {
+    /// Alpha channel ([Color::new] uses 255).
+    alpha: u8,
+    r: u8,
+    g: u8,
+    b: u8,
+}
+
+impl Color {
+    /// Black, with full (255) alpha.
+    const BLACK: Color = Color::new(0, 0, 0);
+    /// White, with full (255) alpha.
+    const WHITE: Color = Color::new(255, 255, 255);
+
+    /// Creates a color from RGB components with full (255) alpha.
+    const fn new(r: u8, g: u8, b: u8) -> Self {
+        Self {
+            alpha: 255,
+            r,
+            g,
+            b,
+        }
+    }
+}
+
+/// The style of one border in a pivot table.
+pub struct BorderStyle {
+    /// Line style.
+    stroke: Stroke,
+    /// Line color.
+    color: Color,
+}
+
+/// A line style for drawing a border.
+pub enum Stroke {
+    None,
+    Solid,
+    Dashed,
+    Thick,
+    Thin,
+    Double,
+}
+
+/// An axis of a flat table.
+#[derive(Debug, Enum)]
+pub enum Axis2 {
+    X,
+    Y,
+}
+
+/// The kind of markers used to identify footnotes.
+pub enum FootnoteMarkerType {
+    /// a, b, c, ...
+    Alphabetic,
+
+    /// 1, 2, 3, ...
+    Numeric,
+}
+
+/// How footnote markers are rendered.
+pub enum FootnoteMarkerPosition {
+    /// Subscripts.
+    Subscript,
+
+    /// Superscripts.
+    Superscript,
+}
+
+/// A pivot table.
+pub struct Table {
+    /// Styling (usually shared between tables; see [Look]).
+    look: Arc<Look>,
+
+    /// Rotate the labels of the innermost column dimension?
+    rotate_inner_column_labels: bool,
+
+    /// Rotate the labels of the outermost row dimension?
+    rotate_outer_row_labels: bool,
+
+    /// Display grid lines?
+    show_grid_lines: bool,
+
+    /// Display the table's title?
+    show_title: bool,
+
+    /// Display the table's caption?
+    show_caption: bool,
+
+    /// Default for whether values display their value, label, or both, if set.
+    show_value: Option<ValueShow>,
+
+    /// Default for whether variables display their name, label, or both, if set.
+    show_variables: Option<ValueShow>,
+
+    /// Display format for case weights.
+    weight_format: Format,
+
+    /// Current layer indexes, with axes[PIVOT_AXIS_LAYER].n_dimensions
+    /// elements. current_layer[i] is an offset into
+    /// axes[PIVOT_AXIS_LAYER].dimensions[i]->data_leaves[], EXCEPT that a
+    /// dimension can have zero leaves, in which case current_layer[i] is zero
+    /// and there's no corresponding leaf.
+    current_layer: Vec<usize>,
+
+    /// Column and row sizing and page breaks.
+    sizing: EnumMap<Axis2, Sizing>,
+
+    /// Format settings.
+    settings: FormatSettings,
+
+    /// Numeric grouping character (usually `.` or `,`).
+    grouping: Option<char>,
+
+    // NOTE(review): presumably the threshold below which values get special
+    // formatting when a leaf's `honor_small` is set — confirm semantics.
+    small: f64,
+
+    // Provenance metadata for the table.
+    command_local: Option<String>,
+    command_c: Option<String>,
+    language: Option<String>,
+    locale: Option<String>,
+    dataset: Option<String>,
+    datafile: Option<String>,
+    date: Option<NaiveDateTime>,
+    /// Footnotes that the table's values may reference by index.
+    footnotes: Vec<Footnote>,
+    title: Option<Value>,
+    subtype: Option<Value>,
+    corner_text: Option<Value>,
+    caption: Option<Value>,
+    notes: Option<String>,
+    /// All of the table's dimensions, across all axes.
+    dimensions: Vec<Dimension>,
+    /// Assignment of dimensions to the layer, row, and column axes.
+    axes: EnumMap<Axis3, TableAxis>,
+    // NOTE(review): the `u64` key presumably encodes the per-dimension leaf
+    // data indexes of a cell — confirm against the cell lookup code.
+    cells: HashMap<u64, Value>,
+}
+
+impl Table {
+    /// Creates an empty table with default styling and settings.
+    fn new() -> Self {
+        Self {
+            look: Look::shared_default(),
+            rotate_inner_column_labels: false,
+            rotate_outer_row_labels: false,
+            show_grid_lines: false,
+            show_title: true,
+            show_caption: true,
+            show_value: None,
+            show_variables: None,
+            weight_format: Format::F40,
+            current_layer: Vec::new(),
+            sizing: EnumMap::default(),
+            settings: FormatSettings::default(), // XXX from settings
+            grouping: None,
+            small: 0.0001, // XXX from settings.
+            command_local: None,
+            command_c: None, // XXX from current command name.
+            language: None,
+            locale: None,
+            dataset: None,
+            datafile: None,
+            date: None,
+            footnotes: Vec::new(),
+            subtype: None,
+            title: None,
+            corner_text: None,
+            caption: None,
+            notes: None,
+            dimensions: Vec::new(),
+            axes: EnumMap::default(),
+            cells: HashMap::new(),
+        }
+    }
+}
+
+/// Whether to show variable or value labels or the underlying value or variable name.
+pub enum ValueShow {
+    /// Value or variable name only.
+    Value,
+
+    /// Label only.
+    Label,
+
+    /// Value and label.
+    Both,
+}
+
+/// A footnote attached to a table or one of its values.
+pub struct Footnote {
+    /// The text of the footnote.
+    content: Value,
+    /// The marker displayed to reference the footnote.
+    marker: Value,
+    /// Display the footnote?
+    show: bool,
+}
+
+/// The content of a single pivot table cell.
+///
+/// A [Value] is also a pivot table's title, caption, footnote marker and
+/// contents, and so on.
+///
+/// A given [Value] is one of:
+///
+/// 1. A number resulting from a calculation.
+///
+///    A number has an associated display format (usually [F] or [Pct]). This
+///    format can be set directly, but that is not usually the easiest way.
+///    Instead, it is usually true that all of the values in a single category
+///    should have the same format (e.g. all "Significance" values might use
+///    format `F40.3`), so PSPP makes it easy to set the default format for a
+///    category while creating the category. See pivot_dimension_create() for
+///    more details.
+///
+///    [F]: crate::format::Format::F
+///    [Pct]: crate::format::Format::Pct
+///
+/// 2. A numeric or string value obtained from data (PIVOT_VALUE_NUMERIC or
+///    PIVOT_VALUE_STRING). If such a value corresponds to a variable, then the
+///    variable's name can be attached to the pivot_value. If the value has a
+///    value label, then that can also be attached. When a label is present,
+///    the user can control whether to show the value or the label or both.
+///
+/// 3. A variable name (PIVOT_VALUE_VARIABLE). The variable label, if any, can
+///    be attached too, and again the user can control whether to show the value
+///    or the label or both.
+///
+/// 4. A text string (PIVOT_VALUE_TEXT). The value stores the string in English
+///    and translated into the output language (localized). Use
+///    pivot_value_new_text() or pivot_value_new_text_format() for those cases.
+///    In some cases, only an English or a localized version is available for
+///    one reason or another, although this is regrettable; in those cases, use
+///    pivot_value_new_user_text() or pivot_value_new_user_text_nocopy().
+///
+/// 5. A template. PSPP doesn't create these itself yet, but it can read and
+///    interpret those created by SPSS.
+pub struct Value {
+    /// Optional custom styling.
+    styling: Option<Box<ValueStyle>>,
+    /// The value's content.
+    inner: ValueInner,
+}
+
+/// The content of a [Value]; see [Value] for a description of each case.
+pub enum ValueInner {
+    /// A number (case 1 or 2 in [Value]'s docs).
+    Number {
+        /// Show the value, its label, or both.
+        show: ValueShow,
+        /// Display format.
+        format: Format,
+        /// Honor [Table]'s `small` setting?
+        honor_small: bool,
+        /// The number itself.
+        value: f64,
+        /// Name of the variable the number came from, if any.
+        var_name: Option<String>,
+        /// The number's value label, if any.
+        value_label: Option<String>,
+    },
+    /// A string datum (case 2 in [Value]'s docs).
+    String {
+        /// Show the value, its label, or both.
+        show: ValueShow,
+        // NOTE(review): presumably whether to display as hex digits — confirm.
+        hex: bool,
+        // NOTE(review): unclear when this can be `None` — confirm.
+        s: Option<String>,
+        /// Name of the variable the string came from, if any.
+        var_name: Option<String>,
+        /// The string's value label, if any.
+        value_label: Option<String>,
+    },
+    /// A variable name (case 3 in [Value]'s docs).
+    Variable {
+        /// Show the name, its label, or both.
+        show: ValueShow,
+        // NOTE(review): `Option` is surprising here — confirm when the
+        // variable name can be absent.
+        var_name: Option<String>,
+        /// The variable's label, if any.
+        value_label: Option<String>,
+    },
+    /// A text string (case 4 in [Value]'s docs).
+    Text {
+        /// Distinguishes user-provided from generated text.
+        user_provided: bool,
+        /// Localized.
+        local: String,
+        /// English.
+        c: String,
+        /// Identifier.
+        id: String,
+    },
+    /// A template (case 5 in [Value]'s docs).
+    Template {
+        /// Substitution arguments.
+        args: Vec<Vec<Value>>,
+        /// Localized template text.
+        local: String,
+        /// Identifier.
+        id: String,
+    },
+}
+
+/// Styling that can be attached to a [Value].
+pub struct ValueStyle {
+    /// Font and colors.
+    font_style: FontStyle,
+    /// Alignment and margins.
+    cell_style: CellStyle,
+    /// Subscript text for the value.
+    subscripts: Vec<String>,
+    /// Indexes into the table's footnotes.
+    footnote_indexes: Vec<usize>,
+}
--- /dev/null
+/// The style of prompt to display, reflecting the state of command parsing.
+#[derive(Copy, Clone, Debug, Hash, PartialEq, Eq)]
+pub enum PromptStyle {
+    /// First line of command.
+    First,
+
+    /// Second or later line of command.
+    Later,
+
+    /// Between `BEGIN DATA` and `END DATA`.
+    Data,
+
+    /// `COMMENT` or `*` command.
+    Comment,
+
+    /// DOCUMENT command.
+    Document,
+
+    /// `DO REPEAT` command.
+    DoRepeat,
+
+    /// `DEFINE` command.
+    Define,
+}
+
+impl PromptStyle {
+    /// Returns a fixed name for this prompt style.
+    ///
+    /// NOTE(review): an inherent `to_string` shadows the blanket
+    /// [ToString::to_string] and trips Clippy's `inherent_to_string` lint;
+    /// consider renaming to `as_str` (or implementing [std::fmt::Display]) in
+    /// a follow-up. The mixed casing of the returned names appears deliberate,
+    /// so it is preserved here.
+    pub fn to_string(&self) -> &'static str {
+        match self {
+            PromptStyle::First => "first",
+            PromptStyle::Later => "later",
+            PromptStyle::Data => "data",
+            PromptStyle::Comment => "COMMENT",
+            PromptStyle::Document => "DOCUMENT",
+            PromptStyle::DoRepeat => "DO REPEAT",
+            PromptStyle::Define => "DEFINE",
+        }
+    }
+}
--- /dev/null
+use crate::{
+ dictionary::VarWidth,
+ encoding::{default_encoding, get_encoding, Error as EncodingError},
+ endian::{Endian, Parse, ToBytes},
+ identifier::{Error as IdError, Identifier},
+};
+
+use encoding_rs::{mem::decode_latin1, DecoderResult, Encoding};
+use flate2::read::ZlibDecoder;
+use num::Integer;
+use std::{
+ borrow::Cow,
+ cell::RefCell,
+ cmp::Ordering,
+ collections::{HashMap, VecDeque},
+ fmt::{Debug, Display, Formatter, Result as FmtResult},
+ io::{Error as IoError, Read, Seek, SeekFrom},
+ iter::repeat,
+ mem::take,
+ ops::Range,
+ rc::Rc,
+ str::from_utf8,
+};
+use thiserror::Error as ThisError;
+
+/// An error encountered while reading a system file.
+#[derive(ThisError, Debug)]
+pub enum Error {
+    #[error("Not an SPSS system file")]
+    NotASystemFile,
+
+    #[error("Invalid magic number {0:?}")]
+    BadMagic([u8; 4]),
+
+    #[error("I/O error ({0})")]
+    Io(#[from] IoError),
+
+    #[error("Invalid SAV compression code {0}")]
+    InvalidSavCompression(u32),
+
+    #[error("Invalid ZSAV compression code {0}")]
+    InvalidZsavCompression(u32),
+
+    #[error("Document record at offset {offset:#x} has document line count ({n}) greater than the maximum number {max}.")]
+    BadDocumentLength { offset: u64, n: usize, max: usize },
+
+    #[error("At offset {offset:#x}, unrecognized record type {rec_type}.")]
+    BadRecordType { offset: u64, rec_type: u32 },
+
+    #[error("In variable record starting at offset {start_offset:#x}, variable width is not in the valid range -1 to 255.")]
+    BadVariableWidth { start_offset: u64, width: i32 },
+
+    #[error("In variable record starting at offset {start_offset:#x}, variable label code {code} at offset {code_offset:#x} is not 0 or 1.")]
+    BadVariableLabelCode {
+        start_offset: u64,
+        code_offset: u64,
+        code: u32,
+    },
+
+    #[error(
+        "At offset {offset:#x}, numeric missing value code ({code}) is not -3, -2, 0, 1, 2, or 3."
+    )]
+    BadNumericMissingValueCode { offset: u64, code: i32 },
+
+    #[error("At offset {offset:#x}, string missing value code ({code}) is not 0, 1, 2, or 3.")]
+    BadStringMissingValueCode { offset: u64, code: i32 },
+
+    #[error("At offset {offset:#x}, number of value labels ({n}) is greater than the maximum number {max}.")]
+    BadNumberOfValueLabels { offset: u64, n: u32, max: u32 },
+
+    #[error("At offset {offset:#x}, following value label record, found record type {rec_type} instead of expected type 4 for variable index record")]
+    ExpectedVarIndexRecord { offset: u64, rec_type: u32 },
+
+    #[error("At offset {offset:#x}, number of variables indexes for value labels ({n}) is greater than the maximum number ({max}).")]
+    TooManyVarIndexes { offset: u64, n: u32, max: u32 },
+
+    #[error("At offset {offset:#x}, record type 7 subtype {subtype} is too large with element size {size} and {count} elements.")]
+    ExtensionRecordTooLarge {
+        offset: u64,
+        subtype: u32,
+        size: u32,
+        count: u32,
+    },
+
+    #[error("Unexpected end of file at offset {offset:#x}, {case_ofs} bytes into a {case_len}-byte case.")]
+    EofInCase {
+        offset: u64,
+        case_ofs: u64,
+        case_len: usize,
+    },
+
+    #[error(
+        "Unexpected end of file at offset {offset:#x}, {case_ofs} bytes into a compressed case."
+    )]
+    EofInCompressedCase { offset: u64, case_ofs: u64 },
+
+    #[error("Data ends at offset {offset:#x}, {case_ofs} bytes into a compressed case.")]
+    PartialCompressedCase { offset: u64, case_ofs: u64 },
+
+    #[error("At {case_ofs} bytes into compressed case starting at offset {offset:#x}, a string was found where a number was expected.")]
+    CompressedNumberExpected { offset: u64, case_ofs: u64 },
+
+    #[error("At {case_ofs} bytes into compressed case starting at offset {offset:#x}, a number was found where a string was expected.")]
+    CompressedStringExpected { offset: u64, case_ofs: u64 },
+
+    #[error("Block count {n_blocks} in ZLIB trailer at offset {offset:#x} differs from expected block count {expected_n_blocks} calculated from trailer length {ztrailer_len}.")]
+    BadZlibTrailerNBlocks {
+        offset: u64,
+        n_blocks: u32,
+        expected_n_blocks: u64,
+        ztrailer_len: u64,
+    },
+
+    #[error("{0}")]
+    EncodingError(EncodingError),
+}
+
+/// A recoverable problem encountered while reading a system file, reported
+/// through a warning callback rather than aborting the read.
+#[derive(ThisError, Debug)]
+pub enum Warning {
+    #[error("Unexpected end of data inside extension record.")]
+    UnexpectedEndOfData,
+
+    #[error("At offset {offset:#x}, at least one valid variable index for value labels is required but none were specified.")]
+    NoVarIndexes { offset: u64 },
+
+    #[error("At offset {offset:#x}, the first variable index is for a {var_type} variable but the following variable indexes are for {} variables: {wrong_types:?}", var_type.opposite())]
+    MixedVarTypes {
+        offset: u64,
+        var_type: VarType,
+        wrong_types: Vec<u32>,
+    },
+
+    #[error("At offset {offset:#x}, one or more variable indexes for value labels were not in the valid range [1,{max}]: {invalid:?}")]
+    InvalidVarIndexes {
+        offset: u64,
+        max: usize,
+        invalid: Vec<u32>,
+    },
+
+    #[error("At offset {offset:#x}, {record} has bad size {size} bytes instead of the expected {expected_size}.")]
+    BadRecordSize {
+        offset: u64,
+        record: String,
+        size: u32,
+        expected_size: u32,
+    },
+
+    #[error("At offset {offset:#x}, {record} has bad count {count} instead of the expected {expected_count}.")]
+    BadRecordCount {
+        offset: u64,
+        record: String,
+        count: u32,
+        expected_count: u32,
+    },
+
+    #[error("In long string missing values record starting at offset {record_offset:#x}, value length at offset {offset:#x} is {value_len} instead of the expected 8.")]
+    BadLongMissingValueLength {
+        record_offset: u64,
+        offset: u64,
+        value_len: u32,
+    },
+
+    #[error("The encoding record at offset {offset:#x} contains an encoding name that is not valid UTF-8.")]
+    BadEncodingName { offset: u64 },
+
+    // XXX This is risky because `text` might be arbitrarily long.
+    #[error("Text string contains invalid bytes for {encoding} encoding: {text}")]
+    MalformedString { encoding: String, text: String },
+
+    #[error("Invalid variable measurement level value {0}")]
+    InvalidMeasurement(u32),
+
+    #[error("Invalid variable display alignment value {0}")]
+    InvalidAlignment(u32),
+
+    #[error("Invalid attribute name. {0}")]
+    InvalidAttributeName(IdError),
+
+    #[error("Invalid variable name in attribute record. {0}")]
+    InvalidAttributeVariableName(IdError),
+
+    #[error("Invalid short name in long variable name record. {0}")]
+    InvalidShortName(IdError),
+
+    #[error("Invalid name in long variable name record. {0}")]
+    InvalidLongName(IdError),
+
+    #[error("Invalid variable name in very long string record. {0}")]
+    InvalidLongStringName(IdError),
+
+    #[error("Invalid variable name in variable set record. {0}")]
+    InvalidVariableSetName(IdError),
+
+    #[error("Invalid multiple response set name. {0}")]
+    InvalidMrSetName(IdError),
+
+    #[error("Invalid multiple response set variable name. {0}")]
+    InvalidMrSetVariableName(IdError),
+
+    #[error("Invalid variable name in long string missing values record. {0}")]
+    InvalidLongStringMissingValueVariableName(IdError),
+
+    #[error("Invalid variable name in long string value label record. {0}")]
+    InvalidLongStringValueLabelName(IdError),
+
+    #[error("{0}")]
+    EncodingError(EncodingError),
+
+    #[error("Details TBD")]
+    TBD,
+}
+
+// Any read failure inside an extension record is treated as an unexpected end
+// of data.
+impl From<IoError> for Warning {
+    fn from(_source: IoError) -> Self {
+        Self::UnexpectedEndOfData
+    }
+}
+
+/// A record of a system file as read from disk, with strings still in the
+/// file's encoding.
+#[derive(Clone, Debug)]
+pub enum Record {
+    Header(HeaderRecord<RawString>),
+    Variable(VariableRecord<RawString, RawStr<8>>),
+    ValueLabel(ValueLabelRecord<RawStr<8>, RawString>),
+    Document(DocumentRecord<RawDocumentLine>),
+    IntegerInfo(IntegerInfoRecord),
+    FloatInfo(FloatInfoRecord),
+    VarDisplay(VarDisplayRecord),
+    MultipleResponse(MultipleResponseRecord<RawString, RawString>),
+    LongStringValueLabels(LongStringValueLabelRecord<RawString, RawString>),
+    LongStringMissingValues(LongStringMissingValueRecord<RawString, RawStr<8>>),
+    Encoding(EncodingRecord),
+    NumberOfCases(NumberOfCasesRecord),
+    Text(TextRecord),
+    OtherExtension(Extension),
+    EndOfHeaders(u32),
+    ZHeader(ZHeader),
+    ZTrailer(ZTrailer),
+    Cases(Rc<RefCell<Cases>>),
+}
+
+/// A [Record] with its strings decoded into UTF-8 (see [Record::decode]).
+#[derive(Clone, Debug)]
+pub enum DecodedRecord {
+    Header(HeaderRecord<String>),
+    Variable(VariableRecord<String, String>),
+    ValueLabel(ValueLabelRecord<RawStr<8>, String>),
+    Document(DocumentRecord<String>),
+    IntegerInfo(IntegerInfoRecord),
+    FloatInfo(FloatInfoRecord),
+    VarDisplay(VarDisplayRecord),
+    MultipleResponse(MultipleResponseRecord<Identifier, String>),
+    LongStringValueLabels(LongStringValueLabelRecord<Identifier, String>),
+    LongStringMissingValues(LongStringMissingValueRecord<Identifier, String>),
+    Encoding(EncodingRecord),
+    NumberOfCases(NumberOfCasesRecord),
+    // The following variants have no direct counterpart in [Record];
+    // presumably they are produced by decoding `Record::Text` — confirm.
+    VariableSets(VariableSetRecord),
+    ProductInfo(ProductInfoRecord),
+    LongNames(LongNamesRecord),
+    VeryLongStrings(VeryLongStringsRecord),
+    FileAttributes(FileAttributeRecord),
+    VariableAttributes(VariableAttributeRecord),
+    OtherExtension(Extension),
+    EndOfHeaders(u32),
+    ZHeader(ZHeader),
+    ZTrailer(ZTrailer),
+    Cases(Rc<RefCell<Cases>>),
+}
+
+impl Record {
+    /// Reads the next record from `reader`.
+    ///
+    /// `var_types` gives the types of the variables seen so far, which some
+    /// record types need for interpretation. Recoverable problems are
+    /// reported through `warn`.
+    ///
+    /// NOTE(review): `Ok(None)` is propagated from the value-label and
+    /// extension readers — presumably for records to be skipped; confirm.
+    fn read<R>(
+        reader: &mut R,
+        endian: Endian,
+        var_types: &[VarType],
+        warn: &dyn Fn(Warning),
+    ) -> Result<Option<Record>, Error>
+    where
+        R: Read + Seek,
+    {
+        let rec_type: u32 = endian.parse(read_bytes(reader)?);
+        match rec_type {
+            // Record type 2: variable record.
+            2 => Ok(Some(VariableRecord::read(reader, endian)?)),
+            // Record type 3: value label record (the type-4 variable index
+            // record that follows it is read as part of it).
+            3 => Ok(ValueLabelRecord::read(reader, endian, var_types, warn)?),
+            // Record type 6: document record.
+            6 => Ok(Some(DocumentRecord::read(reader, endian)?)),
+            // Record type 7: extension records, distinguished by subtype.
+            7 => Extension::read(reader, endian, var_types.len(), warn),
+            // Record type 999: end of headers (carries one trailing word).
+            999 => Ok(Some(Record::EndOfHeaders(
+                endian.parse(read_bytes(reader)?),
+            ))),
+            _ => Err(Error::BadRecordType {
+                offset: reader.stream_position()?,
+                rec_type,
+            }),
+        }
+    }
+
+    /// Decodes this record's strings from the file's encoding into UTF-8
+    /// using `decoder`, producing the corresponding [DecodedRecord].
+    pub fn decode(self, decoder: &Decoder) -> Result<DecodedRecord, Error> {
+        Ok(match self {
+            Record::Header(record) => record.decode(decoder),
+            Record::Variable(record) => record.decode(decoder),
+            Record::ValueLabel(record) => DecodedRecord::ValueLabel(record.decode(decoder)),
+            Record::Document(record) => record.decode(decoder),
+            Record::IntegerInfo(record) => DecodedRecord::IntegerInfo(record.clone()),
+            Record::FloatInfo(record) => DecodedRecord::FloatInfo(record.clone()),
+            Record::VarDisplay(record) => DecodedRecord::VarDisplay(record.clone()),
+            Record::MultipleResponse(record) => record.decode(decoder),
+            Record::LongStringValueLabels(record) => {
+                DecodedRecord::LongStringValueLabels(record.decode(decoder))
+            }
+            Record::LongStringMissingValues(record) => {
+                DecodedRecord::LongStringMissingValues(record.decode(decoder))
+            }
+            Record::Encoding(record) => DecodedRecord::Encoding(record.clone()),
+            Record::NumberOfCases(record) => DecodedRecord::NumberOfCases(record.clone()),
+            Record::Text(record) => record.decode(decoder),
+            Record::OtherExtension(record) => DecodedRecord::OtherExtension(record.clone()),
+            Record::EndOfHeaders(record) => DecodedRecord::EndOfHeaders(record),
+            Record::ZHeader(record) => DecodedRecord::ZHeader(record.clone()),
+            Record::ZTrailer(record) => DecodedRecord::ZTrailer(record.clone()),
+            Record::Cases(record) => DecodedRecord::Cases(record.clone()),
+        })
+    }
+}
+
+/// Determines the character encoding of a system file from its header
+/// `headers`, based on the encoding record and/or the integer info record's
+/// character code, whichever is present.
+///
+/// If the encoding cannot be determined, falls back to the default encoding
+/// after reporting the problem to `warn`, except that an EBCDIC encoding is a
+/// hard error.
+pub fn encoding_from_headers(
+    headers: &[Record],
+    warn: &impl Fn(Warning),
+) -> Result<&'static Encoding, Error> {
+    // Use the last record of each kind, if there are somehow duplicates.
+    let mut encoding_record = None;
+    let mut integer_info_record = None;
+    for record in headers {
+        match record {
+            Record::Encoding(record) => encoding_record = Some(record),
+            Record::IntegerInfo(record) => integer_info_record = Some(record),
+            _ => (),
+        }
+    }
+    let encoding = encoding_record.map(|record| record.0.as_str());
+    let character_code = integer_info_record.map(|record| record.character_code);
+    match get_encoding(encoding, character_code) {
+        Ok(encoding) => Ok(encoding),
+        Err(err @ EncodingError::Ebcdic) => Err(Error::EncodingError(err)),
+        Err(err) => {
+            // Warn, then fall back to the default encoding.
+            warn(Warning::EncodingError(err));
+            Ok(default_encoding())
+        }
+    }
+}
+
+/// Decodes `s` as UTF-8 when it is valid UTF-8; otherwise decodes it as
+/// Latin-1 (that is, each byte is interpreted directly as a Unicode code
+/// point).
+fn default_decode(s: &[u8]) -> Cow<str> {
+    match from_utf8(s) {
+        Ok(utf8) => Cow::from(utf8),
+        Err(_) => decode_latin1(s),
+    }
+}
+
+/// The compression scheme used for data in a system file.
+#[derive(Copy, Clone, Debug, PartialEq, Eq)]
+pub enum Compression {
+    /// Simple compression (compression code 1 in a non-ZSAV file).
+    Simple,
+    /// ZLIB compression (compression code 2 in a `$FL3` file).
+    ZLib,
+}
+
+/// A record that knows the range of file offsets it was read from.
+trait Header {
+    /// The range of file offsets occupied by the record.
+    fn offsets(&self) -> Range<u64>;
+}
+
+/// The header record at the start of a system file.
+///
+/// `S` is the string type: [RawString] before decoding, [String] after (see
+/// [HeaderRecord::decode]).
+#[derive(Clone)]
+pub struct HeaderRecord<S>
+where
+    S: Debug,
+{
+    /// Offset in file.
+    pub offsets: Range<u64>,
+
+    /// Magic number.
+    pub magic: Magic,
+
+    /// Eye-catcher string, product name, in the file's encoding. Padded
+    /// on the right with spaces.
+    pub eye_catcher: S,
+
+    /// Layout code, normally either 2 or 3.
+    pub layout_code: u32,
+
+    /// Number of variable positions, or `None` if the value in the file is
+    /// questionably trustworthy.
+    pub nominal_case_size: Option<u32>,
+
+    /// Compression type, if any.
+    pub compression: Option<Compression>,
+
+    /// 1-based variable index of the weight variable, or `None` if the file is
+    /// unweighted.
+    pub weight_index: Option<u32>,
+
+    /// Claimed number of cases, if known.
+    pub n_cases: Option<u32>,
+
+    /// Compression bias, usually 100.0.
+    pub bias: f64,
+
+    /// `dd mmm yy` in the file's encoding.
+    pub creation_date: S,
+
+    /// `HH:MM:SS` in the file's encoding.
+    pub creation_time: S,
+
+    /// File label, in the file's encoding. Padded on the right with spaces.
+    pub file_label: S,
+
+    /// Endianness of the data in the file header.
+    pub endian: Endian,
+}
+
+impl<S> HeaderRecord<S>
+where
+    S: Debug,
+{
+    /// Writes one `name: value` line for [Debug] output, right-justifying the
+    /// name so that the values line up.
+    fn debug_field<T>(&self, f: &mut Formatter, name: &str, value: T) -> FmtResult
+    where
+        T: Debug,
+    {
+        writeln!(f, "{name:>17}: {:?}", value)
+    }
+}
+
+// Multi-line, human-readable dump of every header field.
+impl<S> Debug for HeaderRecord<S>
+where
+    S: Debug,
+{
+    fn fmt(&self, f: &mut Formatter) -> FmtResult {
+        writeln!(f, "File header record:")?;
+        self.debug_field(f, "Magic", self.magic)?;
+        self.debug_field(f, "Product name", &self.eye_catcher)?;
+        self.debug_field(f, "Layout code", self.layout_code)?;
+        self.debug_field(f, "Nominal case size", self.nominal_case_size)?;
+        self.debug_field(f, "Compression", self.compression)?;
+        self.debug_field(f, "Weight index", self.weight_index)?;
+        self.debug_field(f, "Number of cases", self.n_cases)?;
+        self.debug_field(f, "Compression bias", self.bias)?;
+        self.debug_field(f, "Creation date", &self.creation_date)?;
+        self.debug_field(f, "Creation time", &self.creation_time)?;
+        self.debug_field(f, "File label", &self.file_label)?;
+        self.debug_field(f, "Endianness", self.endian)
+    }
+}
+
+impl HeaderRecord<RawString> {
+    /// Reads a file header record from `r`, leaving strings in the file's
+    /// encoding.
+    ///
+    /// The file's endianness is detected from the layout code field, which is
+    /// normally 2 or 3, so both are tried.
+    ///
+    /// # Errors
+    ///
+    /// Fails if `r` cannot be read, if the magic number or layout code is
+    /// unrecognized, or if the compression code is invalid for the file's
+    /// variant.
+    fn read<R: Read + Seek>(r: &mut R) -> Result<Self, Error> {
+        let start = r.stream_position()?;
+
+        let magic: [u8; 4] = read_bytes(r)?;
+        let magic: Magic = magic.try_into().map_err(|_| Error::NotASystemFile)?;
+
+        let eye_catcher = RawString(read_vec(r, 60)?);
+        let layout_code: [u8; 4] = read_bytes(r)?;
+        // Identify endianness from the layout code, trying both of its
+        // normal values.  (Previously the fallback retried 2 instead of
+        // trying 3, making the `or_else` a no-op.)
+        let endian = Endian::identify_u32(2, layout_code)
+            .or_else(|| Endian::identify_u32(3, layout_code))
+            .ok_or(Error::NotASystemFile)?;
+        let layout_code = endian.parse(layout_code);
+
+        // Distrust implausibly large nominal case sizes.
+        let nominal_case_size: u32 = endian.parse(read_bytes(r)?);
+        let nominal_case_size =
+            (nominal_case_size <= i32::MAX as u32 / 16).then_some(nominal_case_size);
+
+        // ZSAV files must declare ZLIB compression; other files may be
+        // uncompressed (0) or simple-compressed (1).
+        let compression_code: u32 = endian.parse(read_bytes(r)?);
+        let compression = match (magic, compression_code) {
+            (Magic::Zsav, 2) => Some(Compression::ZLib),
+            (Magic::Zsav, code) => return Err(Error::InvalidZsavCompression(code)),
+            (_, 0) => None,
+            (_, 1) => Some(Compression::Simple),
+            (_, code) => return Err(Error::InvalidSavCompression(code)),
+        };
+
+        // A weight index of 0 means the file is unweighted.
+        let weight_index: u32 = endian.parse(read_bytes(r)?);
+        let weight_index = (weight_index > 0).then_some(weight_index);
+
+        // Distrust implausibly large case counts (e.g. -1 read as unsigned).
+        let n_cases: u32 = endian.parse(read_bytes(r)?);
+        let n_cases = (n_cases < i32::MAX as u32 / 2).then_some(n_cases);
+
+        let bias: f64 = endian.parse(read_bytes(r)?);
+
+        let creation_date = RawString(read_vec(r, 9)?);
+        let creation_time = RawString(read_vec(r, 8)?);
+        let file_label = RawString(read_vec(r, 64)?);
+        // Three unused bytes.
+        let _: [u8; 3] = read_bytes(r)?;
+
+        Ok(HeaderRecord {
+            offsets: start..r.stream_position()?,
+            magic,
+            layout_code,
+            nominal_case_size,
+            compression,
+            weight_index,
+            n_cases,
+            bias,
+            creation_date,
+            creation_time,
+            eye_catcher,
+            file_label,
+            endian,
+        })
+    }
+
+    /// Decodes the header's strings from the file's encoding into UTF-8.
+    pub fn decode(self, decoder: &Decoder) -> DecodedRecord {
+        let eye_catcher = decoder.decode(&self.eye_catcher).to_string();
+        let file_label = decoder.decode(&self.file_label).to_string();
+        let creation_date = decoder.decode(&self.creation_date).to_string();
+        let creation_time = decoder.decode(&self.creation_time).to_string();
+        DecodedRecord::Header(HeaderRecord {
+            eye_catcher,
+            weight_index: self.weight_index,
+            n_cases: self.n_cases,
+            file_label,
+            offsets: self.offsets.clone(),
+            magic: self.magic,
+            layout_code: self.layout_code,
+            nominal_case_size: self.nominal_case_size,
+            compression: self.compression,
+            bias: self.bias,
+            creation_date,
+            creation_time,
+            endian: self.endian,
+        })
+    }
+}
+
+/// Decodes strings from a system file's encoding into UTF-8, reporting
+/// recoverable problems through a warning callback.
+pub struct Decoder {
+    /// The system file's character encoding.
+    pub encoding: &'static Encoding,
+    /// Callback for recoverable problems.
+    pub warn: Box<dyn Fn(Warning)>,
+}
+
+impl Decoder {
+    /// Creates a decoder for `encoding` that reports problems to `warn`.
+    pub fn new<F>(encoding: &'static Encoding, warn: F) -> Self
+    where
+        F: Fn(Warning) + 'static,
+    {
+        Self {
+            encoding,
+            warn: Box::new(warn),
+        }
+    }
+    /// Reports `warning` through the warning callback.
+    fn warn(&self, warning: Warning) {
+        (self.warn)(warning)
+    }
+    /// Decodes `input` from `self.encoding`, warning about (and keeping the
+    /// replacement characters for) any malformed sequences.
+    fn decode_slice<'a>(&self, input: &'a [u8]) -> Cow<'a, str> {
+        let (output, malformed) = self.encoding.decode_without_bom_handling(input);
+        if malformed {
+            self.warn(Warning::MalformedString {
+                encoding: self.encoding.name().into(),
+                text: output.clone().into(),
+            });
+        }
+        output
+    }
+
+    /// Decodes `input` from `self.encoding` (see [Decoder::decode_slice]).
+    fn decode<'a>(&self, input: &'a RawString) -> Cow<'a, str> {
+        self.decode_slice(input.0.as_slice())
+    }
+
+    /// Returns `input` decoded from `self.encoding` into UTF-8 such that
+    /// re-encoding the result back into `self.encoding` will have exactly the
+    /// same length in bytes.
+    ///
+    /// XXX warn about errors?
+    pub fn decode_exact_length<'a>(&self, input: &'a [u8]) -> Cow<'a, str> {
+        if let (s, false) = self.encoding.decode_without_bom_handling(input) {
+            // This is the common case. Usually there will be no errors.
+            s
+        } else {
+            // Unusual case. Don't bother to optimize it much.
+            let mut decoder = self.encoding.new_decoder_without_bom_handling();
+            let mut output = String::with_capacity(
+                decoder
+                    .max_utf8_buffer_length_without_replacement(input.len())
+                    .unwrap(),
+            );
+            let mut rest = input;
+            while !rest.is_empty() {
+                match decoder.decode_to_string_without_replacement(rest, &mut output, true) {
+                    (DecoderResult::InputEmpty, _) => break,
+                    (DecoderResult::OutputFull, _) => unreachable!(),
+                    (DecoderResult::Malformed(a, b), consumed) => {
+                        // Substitute one '?' per malformed byte so that the
+                        // re-encoded length matches the input length (checked
+                        // by the assertion below).
+                        let skipped = a as usize + b as usize;
+                        output.extend(repeat('?').take(skipped));
+                        rest = &rest[consumed..];
+                    }
+                }
+            }
+            assert_eq!(self.encoding.encode(&output).0.len(), input.len());
+            output.into()
+        }
+    }
+
+    /// Decodes `input` and validates it as an [Identifier] for
+    /// `self.encoding`.
+    pub fn decode_identifier(&self, input: &RawString) -> Result<Identifier, IdError> {
+        self.new_identifier(&self.decode(input))
+    }
+
+    /// Validates `name` as an [Identifier] for `self.encoding`.
+    pub fn new_identifier(&self, name: &str) -> Result<Identifier, IdError> {
+        Identifier::from_encoding(name, self.encoding)
+    }
+}
+
+// The header record reports the offset range recorded when it was read.
+impl<S> Header for HeaderRecord<S>
+where
+    S: Debug,
+{
+    fn offsets(&self) -> Range<u64> {
+        self.offsets.clone()
+    }
+}
+
+/// The magic number that identifies a system file and its variant.
+#[derive(Copy, Clone, PartialEq, Eq, Hash)]
+pub enum Magic {
+    /// Regular system file.
+    Sav,
+
+    /// System file with Zlib-compressed data.
+    Zsav,
+
+    /// EBCDIC-encoded system file.
+    Ebcdic,
+}
+
+impl Magic {
+    /// Magic number for a regular system file.
+    pub const SAV: [u8; 4] = *b"$FL2";
+
+    /// Magic number for a system file that contains zlib-compressed data.
+    pub const ZSAV: [u8; 4] = *b"$FL3";
+
+    /// Magic number for an EBCDIC-encoded system file. This is `$FL2` encoded
+    /// in EBCDIC.
+    pub const EBCDIC: [u8; 4] = [0x5b, 0xc6, 0xd3, 0xf2];
+}
+
+impl Debug for Magic {
+ fn fmt(&self, f: &mut Formatter) -> FmtResult {
+ let s = match *self {
+ Magic::Sav => "$FL2",
+ Magic::Zsav => "$FL3",
+ Magic::Ebcdic => "($FL2 in EBCDIC)",
+ };
+ write!(f, "{s}")
+ }
+}
+
+impl TryFrom<[u8; 4]> for Magic {
+ type Error = Error;
+
+ fn try_from(value: [u8; 4]) -> Result<Self, Self::Error> {
+ match value {
+ Magic::SAV => Ok(Magic::Sav),
+ Magic::ZSAV => Ok(Magic::Zsav),
+ Magic::EBCDIC => Ok(Magic::Ebcdic),
+ _ => Err(Error::BadMagic(value)),
+ }
+ }
+}
+
/// The type of a variable: numeric or string.
#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum VarType {
    Numeric,
    String,
}

impl VarType {
    /// Returns the type implied by `width` (zero width is numeric, nonzero is
    /// string).
    pub fn from_width(width: VarWidth) -> VarType {
        match width {
            VarWidth::Numeric => Self::Numeric,
            VarWidth::String(_) => Self::String,
        }
    }

    /// Returns the other type.
    pub fn opposite(self) -> VarType {
        match self {
            Self::Numeric => Self::String,
            Self::String => Self::Numeric,
        }
    }
}

impl Display for VarType {
    fn fmt(&self, f: &mut Formatter) -> FmtResult {
        match self {
            VarType::Numeric => write!(f, "numeric"),
            VarType::String => write!(f, "string"),
        }
    }
}
+
/// A single datum: either a number (where `None` is the system-missing
/// value, printed as `SYSMIS`) or a string.
#[derive(Copy, Clone)]
pub enum Value<S>
where
    S: Debug,
{
    Number(Option<f64>),
    String(S),
}

/// A value as read from the file, where strings are raw 8-byte chunks in the
/// file's own encoding.
type RawValue = Value<RawStr<8>>;

impl<S> Debug for Value<S>
where
    S: Debug,
{
    fn fmt(&self, f: &mut Formatter) -> FmtResult {
        match self {
            Value::Number(Some(number)) => write!(f, "{number:?}"),
            Value::Number(None) => write!(f, "SYSMIS"),
            Value::String(s) => write!(f, "{:?}", s),
        }
    }
}
+
impl RawValue {
    /// Reads one 8-byte value of type `var_type` from `r`.
    fn read<R: Read>(r: &mut R, var_type: VarType, endian: Endian) -> Result<Self, IoError> {
        Ok(Self::from_raw(
            &UntypedValue(read_bytes(r)?),
            var_type,
            endian,
        ))
    }

    /// Interprets the 8 raw bytes of `raw` as a value of type `var_type`.
    /// For numeric values, `-f64::MAX` is the system-missing value and maps
    /// to `Value::Number(None)`.
    pub fn from_raw(raw: &UntypedValue, var_type: VarType, endian: Endian) -> Self {
        match var_type {
            VarType::String => Value::String(RawStr(raw.0)),
            VarType::Numeric => {
                let number: f64 = endian.parse(raw.0);
                Value::Number((number != -f64::MAX).then_some(number))
            }
        }
    }

    /// Reads one uncompressed case: one 8-byte value per entry in
    /// `var_types`.  Returns `Ok(None)` on a clean end of file (EOF before
    /// the first value of a case); EOF mid-case is an error.
    fn read_case<R: Read + Seek>(
        reader: &mut R,
        var_types: &[VarType],
        endian: Endian,
    ) -> Result<Option<Vec<Self>>, Error> {
        let case_start = reader.stream_position()?;
        let mut values = Vec::with_capacity(var_types.len());
        for (i, &var_type) in var_types.iter().enumerate() {
            let Some(raw) = try_read_bytes(reader)? else {
                if i == 0 {
                    // EOF on a case boundary: no more cases.
                    return Ok(None);
                } else {
                    let offset = reader.stream_position()?;
                    return Err(Error::EofInCase {
                        offset,
                        case_ofs: offset - case_start,
                        case_len: var_types.len() * 8,
                    });
                }
            };
            values.push(Value::from_raw(&UntypedValue(raw), var_type, endian));
        }
        Ok(Some(values))
    }

    /// Reads one compressed case.  The compressed stream consists of 1-byte
    /// opcodes, buffered 8 at a time through `codes`, where:
    ///
    /// * 0 is padding and is skipped;
    /// * 1..=251 encodes the number `code - bias` (for a string variable,
    ///   that number's byte representation);
    /// * 252 marks the end of the data;
    /// * 253 means the value is the next 8-byte literal in the data stream;
    /// * 254 encodes a string of 8 spaces;
    /// * 255 encodes the system-missing value.
    fn read_compressed_case<R: Read + Seek>(
        reader: &mut R,
        var_types: &[VarType],
        codes: &mut VecDeque<u8>,
        endian: Endian,
        bias: f64,
    ) -> Result<Option<Vec<Self>>, Error> {
        let case_start = reader.stream_position()?;
        let mut values = Vec::with_capacity(var_types.len());
        for (i, &var_type) in var_types.iter().enumerate() {
            let value = loop {
                // Refill the opcode buffer from the data stream when empty.
                let Some(code) = codes.pop_front() else {
                    let Some(new_codes): Option<[u8; 8]> = try_read_bytes(reader)? else {
                        if i == 0 {
                            return Ok(None);
                        } else {
                            let offset = reader.stream_position()?;
                            return Err(Error::EofInCompressedCase {
                                offset,
                                case_ofs: offset - case_start,
                            });
                        }
                    };
                    codes.extend(new_codes.into_iter());
                    continue;
                };
                match code {
                    0 => (),
                    1..=251 => match var_type {
                        VarType::Numeric => break Self::Number(Some(code as f64 - bias)),
                        VarType::String => {
                            break Self::String(RawStr(endian.to_bytes(code as f64 - bias)))
                        }
                    },
                    252 => {
                        // End of data.  Only valid on a case boundary.
                        if i == 0 {
                            return Ok(None);
                        } else {
                            let offset = reader.stream_position()?;
                            return Err(Error::PartialCompressedCase {
                                offset,
                                case_ofs: offset - case_start,
                            });
                        }
                    }
                    253 => {
                        break Self::from_raw(&UntypedValue(read_bytes(reader)?), var_type, endian)
                    }
                    254 => match var_type {
                        VarType::String => break Self::String(RawStr(*b"        ")), // XXX EBCDIC
                        VarType::Numeric => {
                            return Err(Error::CompressedStringExpected {
                                offset: case_start,
                                case_ofs: reader.stream_position()? - case_start,
                            })
                        }
                    },
                    255 => match var_type {
                        VarType::Numeric => break Self::Number(None),
                        VarType::String => {
                            return Err(Error::CompressedNumberExpected {
                                offset: case_start,
                                case_ofs: reader.stream_position()? - case_start,
                            })
                        }
                    },
                }
            };
            values.push(value);
        }
        Ok(Some(values))
    }

    /// Decodes any string content from the file's encoding into UTF-8,
    /// preserving the encoded length (see `Decoder::decode_exact_length`).
    pub fn decode(self, decoder: &Decoder) -> Value<String> {
        match self {
            Self::Number(x) => Value::Number(x),
            Self::String(s) => Value::String(decoder.decode_exact_length(&s.0).into()),
        }
    }
}
+
/// A reader that decompresses a sequence of concatenated zlib streams: when
/// one stream ends, decompression restarts on the following bytes.
///
/// `reader` is only `Option` so that the inner reader can be moved out
/// temporarily in `read`; it is always `Some` between calls.
struct ZlibDecodeMultiple<R>
where
    R: Read + Seek,
{
    reader: Option<ZlibDecoder<R>>,
}

impl<R> ZlibDecodeMultiple<R>
where
    R: Read + Seek,
{
    fn new(reader: R) -> ZlibDecodeMultiple<R> {
        ZlibDecodeMultiple {
            reader: Some(ZlibDecoder::new(reader)),
        }
    }
}

impl<R> Read for ZlibDecodeMultiple<R>
where
    R: Read + Seek,
{
    fn read(&mut self, buf: &mut [u8]) -> Result<usize, IoError> {
        loop {
            match self.reader.as_mut().unwrap().read(buf)? {
                0 => {
                    // Current zlib stream is exhausted; start a fresh decoder
                    // on the remaining underlying bytes.
                    let inner = self.reader.take().unwrap().into_inner();
                    self.reader = Some(ZlibDecoder::new(inner));
                }
                n => return Ok(n),
            };
        }
    }
}

impl<R> Seek for ZlibDecodeMultiple<R>
where
    R: Read + Seek,
{
    /// Seeks the *underlying* reader, bypassing the decompressor.
    fn seek(&mut self, pos: SeekFrom) -> Result<u64, IoError> {
        self.reader.as_mut().unwrap().get_mut().seek(pos)
    }
}
+
/// Progress of a [`Reader`] through the parts of a system file.
enum ReaderState {
    /// Header record not yet emitted.
    Start,
    /// Emitting the remaining header records.
    Headers,
    /// Next item is the ZLIB header (compressed files only).
    ZlibHeader,
    /// Next item is the ZLIB trailer, located via the ZLIB header.
    ZlibTrailer {
        ztrailer_offset: u64,
        ztrailer_len: u64,
    },
    /// Next item is the case data.
    Cases,
    /// Nothing further to emit.
    End,
}

/// An iterator over the records of a system file.
pub struct Reader<R>
where
    R: Read + Seek + 'static,
{
    // `Option` so the reader can be moved out and handed to `Cases`.
    reader: Option<R>,
    warn: Box<dyn Fn(Warning)>,

    header: HeaderRecord<RawString>,
    // One entry per variable record seen so far; needed to parse later
    // records and the case data.
    var_types: Vec<VarType>,

    state: ReaderState,
}
+
impl<R> Reader<R>
where
    R: Read + Seek + 'static,
{
    /// Creates a new `Reader`, immediately reading the file header from
    /// `reader` and reporting any warnings via `warn`.
    pub fn new<F>(mut reader: R, warn: F) -> Result<Self, Error>
    where
        F: Fn(Warning) + 'static,
    {
        let header = HeaderRecord::read(&mut reader)?;
        Ok(Self {
            reader: Some(reader),
            warn: Box::new(warn),
            header,
            var_types: Vec::new(),
            state: ReaderState::Start,
        })
    }
    /// Moves the underlying reader and accumulated variable types into a
    /// `Cases` iterator; this `Reader` transitions to its final state.
    fn cases(&mut self) -> Cases {
        self.state = ReaderState::End;
        Cases::new(
            self.reader.take().unwrap(),
            take(&mut self.var_types),
            &self.header,
        )
    }
    /// One step of the record state machine: header record first, then the
    /// remaining headers, then (for ZLIB files) the zlib header and trailer,
    /// and finally a single `Record::Cases`.
    fn _next(&mut self) -> Option<<Self as Iterator>::Item> {
        match self.state {
            ReaderState::Start => {
                self.state = ReaderState::Headers;
                Some(Ok(Record::Header(self.header.clone())))
            }
            ReaderState::Headers => {
                // `Record::read` may consume input without producing a
                // record (`Ok(None)`); loop until we get one.
                let record = loop {
                    match Record::read(
                        self.reader.as_mut().unwrap(),
                        self.header.endian,
                        self.var_types.as_slice(),
                        &self.warn,
                    ) {
                        Ok(Some(record)) => break record,
                        Ok(None) => (),
                        Err(error) => return Some(Err(error)),
                    }
                };
                match record {
                    Record::Variable(VariableRecord { width, .. }) => {
                        // Track each variable's type for later records and
                        // for parsing the case data.
                        self.var_types.push(if width == 0 {
                            VarType::Numeric
                        } else {
                            VarType::String
                        });
                    }
                    Record::EndOfHeaders(_) => {
                        self.state = if let Some(Compression::ZLib) = self.header.compression {
                            ReaderState::ZlibHeader
                        } else {
                            ReaderState::Cases
                        };
                    }
                    _ => (),
                };
                Some(Ok(record))
            }
            ReaderState::ZlibHeader => {
                let zheader = match ZHeader::read(self.reader.as_mut().unwrap(), self.header.endian)
                {
                    Ok(zheader) => zheader,
                    Err(error) => return Some(Err(error)),
                };
                self.state = ReaderState::ZlibTrailer {
                    ztrailer_offset: zheader.ztrailer_offset,
                    ztrailer_len: zheader.ztrailer_len,
                };
                Some(Ok(Record::ZHeader(zheader)))
            }
            ReaderState::ZlibTrailer {
                ztrailer_offset,
                ztrailer_len,
            } => {
                // A missing trailer is tolerated: skip straight to cases.
                match ZTrailer::read(
                    self.reader.as_mut().unwrap(),
                    self.header.endian,
                    ztrailer_offset,
                    ztrailer_len,
                ) {
                    Ok(None) => Some(Ok(Record::Cases(Rc::new(RefCell::new(self.cases()))))),
                    Ok(Some(ztrailer)) => Some(Ok(Record::ZTrailer(ztrailer))),
                    Err(error) => Some(Err(error)),
                }
            }
            ReaderState::Cases => Some(Ok(Record::Cases(Rc::new(RefCell::new(self.cases()))))),
            ReaderState::End => None,
        }
    }
}

impl<R> Iterator for Reader<R>
where
    R: Read + Seek + 'static,
{
    type Item = Result<Record, Error>;

    /// Wraps `_next` so that any error is sticky: after an `Err` the
    /// iterator is finished.
    fn next(&mut self) -> Option<Self::Item> {
        let retval = self._next();
        if matches!(retval, Some(Err(_))) {
            self.state = ReaderState::End;
        }
        retval
    }
}
+
/// Object-safe combination of `Read + Seek`, implemented for everything that
/// implements both.
trait ReadSeek: Read + Seek {}
impl<T> ReadSeek for T where T: Read + Seek {}

/// An iterator over the cases (rows) of a system file.
pub struct Cases {
    reader: Box<dyn ReadSeek>,
    var_types: Vec<VarType>,
    compression: Option<Compression>,
    bias: f64,
    endian: Endian,
    // Buffer of pending compression opcodes (see `read_compressed_case`).
    codes: VecDeque<u8>,
    // Set after EOF or an error so iteration stops permanently.
    eof: bool,
}

impl Debug for Cases {
    fn fmt(&self, f: &mut Formatter) -> FmtResult {
        write!(f, "Cases")
    }
}

impl Cases {
    /// Creates a case iterator from `reader` positioned at the case data,
    /// using settings from `header`.  For ZLIB-compressed files, the reader
    /// is wrapped to decompress the concatenated zlib streams.
    fn new<R>(reader: R, var_types: Vec<VarType>, header: &HeaderRecord<RawString>) -> Self
    where
        R: Read + Seek + 'static,
    {
        Self {
            reader: if header.compression == Some(Compression::ZLib) {
                Box::new(ZlibDecodeMultiple::new(reader))
            } else {
                Box::new(reader)
            },
            var_types,
            compression: header.compression,
            bias: header.bias,
            endian: header.endian,
            codes: VecDeque::with_capacity(8),
            eof: false,
        }
    }
}

impl Iterator for Cases {
    type Item = Result<Vec<RawValue>, Error>;

    fn next(&mut self) -> Option<Self::Item> {
        if self.eof {
            return None;
        }

        let retval = if self.compression.is_some() {
            Value::read_compressed_case(
                &mut self.reader,
                &self.var_types,
                &mut self.codes,
                self.endian,
                self.bias,
            )
            .transpose()
        } else {
            Value::read_case(&mut self.reader, &self.var_types, self.endian).transpose()
        };
        // End of data and errors are both terminal.
        self.eof = matches!(retval, None | Some(Err(_)));
        retval
    }
}
+
/// A raw output format specification packed into 32 bits as
/// `type << 16 | width << 8 | decimals`.
#[derive(Copy, Clone, PartialEq, Eq, Hash)]
pub struct Spec(pub u32);

impl Debug for Spec {
    fn fmt(&self, f: &mut Formatter) -> FmtResult {
        // Unpack the three bit-fields: high 16 bits are the format type,
        // the middle byte the field width, the low byte the decimal count.
        let format_type = self.0 >> 16;
        let field_width = (self.0 >> 8) & 0xff;
        let decimals = self.0 & 0xff;
        write!(
            f,
            "{:06x} ({}{field_width}.{decimals})",
            self.0,
            format_name(format_type)
        )
    }
}

/// Returns the name of format type `type_` (e.g. `"F"` for 5), or a
/// `<unknown format …>` placeholder for an unrecognized type.
fn format_name(type_: u32) -> Cow<'static, str> {
    const NAMES: &[(u32, &str)] = &[
        (1, "A"),
        (2, "AHEX"),
        (3, "COMMA"),
        (4, "DOLLAR"),
        (5, "F"),
        (6, "IB"),
        (7, "PIBHEX"),
        (8, "P"),
        (9, "PIB"),
        (10, "PK"),
        (11, "RB"),
        (12, "RBHEX"),
        (15, "Z"),
        (16, "N"),
        (17, "E"),
        (20, "DATE"),
        (21, "TIME"),
        (22, "DATETIME"),
        (23, "ADATE"),
        (24, "JDATE"),
        (25, "DTIME"),
        (26, "WKDAY"),
        (27, "MONTH"),
        (28, "MOYR"),
        (29, "QYR"),
        (30, "WKYR"),
        (31, "PCT"),
        (32, "DOT"),
        (33, "CCA"),
        (34, "CCB"),
        (35, "CCC"),
        (36, "CCD"),
        (37, "CCE"),
        (38, "EDATE"),
        (39, "SDATE"),
        (40, "MTIME"),
        (41, "YMDHMS"),
    ];
    match NAMES.iter().find(|(code, _)| *code == type_) {
        Some((_, name)) => Cow::Borrowed(name),
        None => Cow::Owned(format!("<unknown format {type_}>")),
    }
}
+
/// User-missing values for a variable.
#[derive(Clone)]
pub struct MissingValues<S = String>
where
    S: Debug,
{
    /// Individual missing values, up to 3 of them.
    pub values: Vec<Value<S>>,

    /// Optional range of missing values.
    pub range: Option<(Value<S>, Value<S>)>,
}

impl<S> Debug for MissingValues<S>
where
    S: Debug,
{
    /// Formats as a comma-separated list, with a range rendered as
    /// `LOW THRU HIGH`, or `none` if there are no missing values.
    fn fmt(&self, f: &mut Formatter) -> FmtResult {
        for (i, value) in self.values.iter().enumerate() {
            if i > 0 {
                write!(f, ", ")?;
            }
            write!(f, "{value:?}")?;
        }

        if let Some((low, high)) = &self.range {
            if !self.values.is_empty() {
                write!(f, ", ")?;
            }
            write!(f, "{low:?} THRU {high:?}")?;
        }

        if self.is_empty() {
            write!(f, "none")?;
        }

        Ok(())
    }
}

impl<S> MissingValues<S>
where
    S: Debug,
{
    fn is_empty(&self) -> bool {
        self.values.is_empty() && self.range.is_none()
    }
}

impl<S> Default for MissingValues<S>
where
    S: Debug,
{
    fn default() -> Self {
        Self {
            values: Vec::new(),
            range: None,
        }
    }
}

impl MissingValues<RawStr<8>> {
    /// Reads the missing values for a variable of the given `width` at file
    /// offset `offset`, according to `code` from the variable record:
    ///
    /// * `0..=3`: that many individual missing values;
    /// * `-2` (numeric only): a low/high range;
    /// * `-3` (numeric only): a range plus one individual value.
    fn read<R: Read + Seek>(
        r: &mut R,
        offset: u64,
        width: i32,
        code: i32,
        endian: Endian,
    ) -> Result<Self, Error> {
        let (n_values, has_range) = match (width, code) {
            (_, 0..=3) => (code, false),
            (0, -2) => (0, true),
            (0, -3) => (1, true),
            (0, _) => return Err(Error::BadNumericMissingValueCode { offset, code }),
            (_, _) => return Err(Error::BadStringMissingValueCode { offset, code }),
        };

        let var_type = if width == 0 {
            VarType::Numeric
        } else {
            VarType::String
        };

        let mut values = Vec::new();
        for _ in 0..n_values {
            values.push(RawValue::read(r, var_type, endian)?);
        }
        let range = if has_range {
            let low = RawValue::read(r, var_type, endian)?;
            let high = RawValue::read(r, var_type, endian)?;
            Some((low, high))
        } else {
            None
        };
        Ok(Self { values, range })
    }
    /// Decodes any string missing values into UTF-8.
    fn decode(&self, decoder: &Decoder) -> MissingValues<String> {
        MissingValues {
            values: self
                .values
                .iter()
                .map(|value| value.decode(decoder))
                .collect(),
            range: self
                .range
                .as_ref()
                .map(|(low, high)| (low.decode(decoder), high.decode(decoder))),
        }
    }
}
+
/// A variable record from a system file.
#[derive(Clone)]
pub struct VariableRecord<S, V>
where
    S: Debug,
    V: Debug,
{
    /// Range of offsets in file.
    pub offsets: Range<u64>,

    /// Variable width, in the range -1..=255.
    pub width: i32,

    /// Variable name, padded on the right with spaces.
    pub name: S,

    /// Print format.
    pub print_format: Spec,

    /// Write format.
    pub write_format: Spec,

    /// Missing values.
    pub missing_values: MissingValues<V>,

    /// Optional variable label.
    pub label: Option<S>,
}

impl<S, V> Debug for VariableRecord<S, V>
where
    S: Debug,
    V: Debug,
{
    fn fmt(&self, f: &mut Formatter) -> FmtResult {
        writeln!(
            f,
            "Width: {} ({})",
            self.width,
            // Negative width marks a continuation of a preceding long
            // string variable.
            match self.width.cmp(&0) {
                Ordering::Greater => "string",
                Ordering::Equal => "numeric",
                Ordering::Less => "long string continuation record",
            }
        )?;
        writeln!(f, "Print format: {:?}", self.print_format)?;
        writeln!(f, "Write format: {:?}", self.write_format)?;
        writeln!(f, "Name: {:?}", &self.name)?;
        writeln!(f, "Variable label: {:?}", self.label)?;
        writeln!(f, "Missing values: {:?}", self.missing_values)
    }
}

impl VariableRecord<RawString, RawStr<8>> {
    /// Reads one variable record from `r`: width, label flag, missing-value
    /// code, print/write formats, 8-byte name, then the optional label and
    /// missing values.
    fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<Record, Error> {
        let start_offset = r.stream_position()?;
        let width: i32 = endian.parse(read_bytes(r)?);
        if !(-1..=255).contains(&width) {
            return Err(Error::BadVariableWidth {
                start_offset,
                width,
            });
        }
        // Offset of the `has_variable_label` field, used in error reports.
        let code_offset = r.stream_position()?;
        let has_variable_label: u32 = endian.parse(read_bytes(r)?);
        let missing_value_code: i32 = endian.parse(read_bytes(r)?);
        let print_format = Spec(endian.parse(read_bytes(r)?));
        let write_format = Spec(endian.parse(read_bytes(r)?));
        let name = RawString(read_vec(r, 8)?);

        let label = match has_variable_label {
            0 => None,
            1 => {
                let len: u32 = endian.parse(read_bytes(r)?);
                // Cap the label we keep at 65535 bytes.
                // NOTE(review): if len > 65535 the bytes between read_len
                // and len are never consumed, only the padding to the next
                // multiple of 4 — confirm this is intended for (presumably
                // corrupt) oversized labels.
                let read_len = len.min(65535) as usize;
                let label = RawString(read_vec(r, read_len)?);

                // Labels are padded to a multiple of 4 bytes in the file.
                let padding_bytes = Integer::next_multiple_of(&len, &4) - len;
                let _ = read_vec(r, padding_bytes as usize)?;

                Some(label)
            }
            _ => {
                return Err(Error::BadVariableLabelCode {
                    start_offset,
                    code_offset,
                    code: has_variable_label,
                })
            }
        };

        let missing_values =
            MissingValues::read(r, start_offset, width, missing_value_code, endian)?;

        let end_offset = r.stream_position()?;

        Ok(Record::Variable(VariableRecord {
            offsets: start_offset..end_offset,
            width,
            name,
            print_format,
            write_format,
            missing_values,
            label,
        }))
    }

    /// Decodes the name, label, and missing values into UTF-8.
    pub fn decode(self, decoder: &Decoder) -> DecodedRecord {
        DecodedRecord::Variable(VariableRecord {
            offsets: self.offsets.clone(),
            width: self.width,
            name: decoder.decode(&self.name).to_string(),
            print_format: self.print_format,
            write_format: self.write_format,
            missing_values: self.missing_values.decode(decoder),
            label: self
                .label
                .as_ref()
                .map(|label| decoder.decode(label).to_string()),
        })
    }
}
+
/// 8 bytes read from a system file whose interpretation (number or string)
/// is not yet known.
#[derive(Copy, Clone)]
pub struct UntypedValue(pub [u8; 8]);

impl Debug for UntypedValue {
    /// Shows both plausible interpretations: the bytes as an `f64` (using
    /// whichever endianness gives the shorter rendering) followed by the
    /// leading printable portion decoded as a string.
    fn fmt(&self, f: &mut Formatter) -> FmtResult {
        let little: f64 = Endian::Little.parse(self.0);
        let little = format!("{:?}", little);
        let big: f64 = Endian::Big.parse(self.0);
        let big = format!("{:?}", big);
        let number = if little.len() <= big.len() {
            little
        } else {
            big
        };
        write!(f, "{number}")?;

        let string = default_decode(&self.0);
        // Keep only the part up to the first NUL or control character.
        let string = string
            .split(|c: char| c == '\0' || c.is_control())
            .next()
            .unwrap();
        write!(f, "{string:?}")?;
        Ok(())
    }
}

/// A variable-length string in the file's own (not yet decoded) encoding.
#[derive(Clone)]
pub struct RawString(pub Vec<u8>);

impl From<Vec<u8>> for RawString {
    fn from(source: Vec<u8>) -> Self {
        Self(source)
    }
}

impl From<&[u8]> for RawString {
    fn from(source: &[u8]) -> Self {
        Self(source.into())
    }
}

impl Debug for RawString {
    fn fmt(&self, f: &mut Formatter) -> FmtResult {
        write!(f, "{:?}", default_decode(self.0.as_slice()))
    }
}

/// A fixed-length string in the file's own (not yet decoded) encoding.
#[derive(Copy, Clone)]
pub struct RawStr<const N: usize>(pub [u8; N]);

impl<const N: usize> From<[u8; N]> for RawStr<N> {
    fn from(source: [u8; N]) -> Self {
        Self(source)
    }
}

impl<const N: usize> Debug for RawStr<N> {
    fn fmt(&self, f: &mut Formatter) -> FmtResult {
        write!(f, "{:?}", default_decode(&self.0))
    }
}
+
/// One value-to-label mapping.
#[derive(Clone, Debug)]
pub struct ValueLabel<V, S>
where
    V: Debug,
    S: Debug,
{
    pub value: Value<V>,
    pub label: S,
}

/// A value label record, pairing labels with the variables they apply to.
#[derive(Clone)]
pub struct ValueLabelRecord<V, S>
where
    V: Debug,
    S: Debug,
{
    /// Range of offsets in file.
    pub offsets: Range<u64>,

    /// The labels.
    pub labels: Vec<ValueLabel<V, S>>,

    /// The 1-based indexes of the variables the labels apply to.
    pub dict_indexes: Vec<u32>,

    /// The types of the variables.
    pub var_type: VarType,
}

impl<V, S> Debug for ValueLabelRecord<V, S>
where
    V: Debug,
    S: Debug,
{
    fn fmt(&self, f: &mut Formatter) -> FmtResult {
        writeln!(f, "labels: ")?;
        for label in self.labels.iter() {
            writeln!(f, "{label:?}")?;
        }
        write!(f, "apply to {} variables", self.var_type)?;
        for dict_index in self.dict_indexes.iter() {
            write!(f, " #{dict_index}")?;
        }
        Ok(())
    }
}

impl<V, S> Header for ValueLabelRecord<V, S>
where
    V: Debug,
    S: Debug,
{
    fn offsets(&self) -> Range<u64> {
        self.offsets.clone()
    }
}

impl<V, S> ValueLabelRecord<V, S>
where
    V: Debug,
    S: Debug,
{
    /// Maximum number of value labels in a record.
    pub const MAX_LABELS: u32 = u32::MAX / 8;

    /// Maximum number of variable indexes in a record.
    pub const MAX_INDEXES: u32 = u32::MAX / 8;
}
+
+impl ValueLabelRecord<RawStr<8>, RawString> {
+ fn read<R: Read + Seek>(
+ r: &mut R,
+ endian: Endian,
+ var_types: &[VarType],
+ warn: &dyn Fn(Warning),
+ ) -> Result<Option<Record>, Error> {
+ let label_offset = r.stream_position()?;
+ let n: u32 = endian.parse(read_bytes(r)?);
+ if n > Self::MAX_LABELS {
+ return Err(Error::BadNumberOfValueLabels {
+ offset: label_offset,
+ n,
+ max: Self::MAX_LABELS,
+ });
+ }
+
+ let mut labels = Vec::new();
+ for _ in 0..n {
+ let value = UntypedValue(read_bytes(r)?);
+ let label_len: u8 = endian.parse(read_bytes(r)?);
+ let label_len = label_len as usize;
+ let padded_len = Integer::next_multiple_of(&(label_len + 1), &8);
+
+ let mut label = read_vec(r, padded_len - 1)?;
+ label.truncate(label_len);
+ labels.push((value, RawString(label)));
+ }
+
+ let index_offset = r.stream_position()?;
+ let rec_type: u32 = endian.parse(read_bytes(r)?);
+ if rec_type != 4 {
+ return Err(Error::ExpectedVarIndexRecord {
+ offset: index_offset,
+ rec_type,
+ });
+ }
+
+ let n: u32 = endian.parse(read_bytes(r)?);
+ if n > Self::MAX_INDEXES {
+ return Err(Error::TooManyVarIndexes {
+ offset: index_offset,
+ n,
+ max: Self::MAX_INDEXES,
+ });
+ } else if n == 0 {
+ warn(Warning::NoVarIndexes {
+ offset: index_offset,
+ });
+ return Ok(None);
+ }
+
+ let index_offset = r.stream_position()?;
+ let mut dict_indexes = Vec::with_capacity(n as usize);
+ let mut invalid_indexes = Vec::new();
+ for _ in 0..n {
+ let index: u32 = endian.parse(read_bytes(r)?);
+ if index == 0 || index as usize > var_types.len() {
+ dict_indexes.push(index);
+ } else {
+ invalid_indexes.push(index);
+ }
+ }
+ if !invalid_indexes.is_empty() {
+ warn(Warning::InvalidVarIndexes {
+ offset: index_offset,
+ max: var_types.len(),
+ invalid: invalid_indexes,
+ });
+ }
+
+ let Some(&first_index) = dict_indexes.first() else {
+ return Ok(None);
+ };
+ let var_type = var_types[first_index as usize - 1];
+ let mut wrong_type_indexes = Vec::new();
+ dict_indexes.retain(|&index| {
+ if var_types[index as usize - 1] != var_type {
+ wrong_type_indexes.push(index);
+ false
+ } else {
+ true
+ }
+ });
+ if !wrong_type_indexes.is_empty() {
+ warn(Warning::MixedVarTypes {
+ offset: index_offset,
+ var_type,
+ wrong_types: wrong_type_indexes,
+ });
+ }
+
+ let labels = labels
+ .into_iter()
+ .map(|(value, label)| ValueLabel {
+ value: Value::from_raw(&value, var_type, endian),
+ label,
+ })
+ .collect();
+
+ let end_offset = r.stream_position()?;
+ Ok(Some(Record::ValueLabel(ValueLabelRecord {
+ offsets: label_offset..end_offset,
+ labels,
+ dict_indexes,
+ var_type,
+ })))
+ }
+
+ fn decode(self, decoder: &Decoder) -> ValueLabelRecord<RawStr<8>, String> {
+ let labels = self
+ .labels
+ .iter()
+ .map(|ValueLabel { value, label }| ValueLabel {
+ value: *value,
+ label: decoder.decode(label).to_string(),
+ })
+ .collect();
+ ValueLabelRecord {
+ offsets: self.offsets.clone(),
+ labels,
+ dict_indexes: self.dict_indexes.clone(),
+ var_type: self.var_type,
+ }
+ }
+}
+
/// A document record: free-form text attached to the file's dictionary.
#[derive(Clone, Debug)]
pub struct DocumentRecord<S>
where
    S: Debug,
{
    pub offsets: Range<u64>,

    /// The document, as an array of lines. Raw lines are exactly 80 bytes long
    /// and are right-padded with spaces without any new-line termination.
    pub lines: Vec<S>,
}

pub type RawDocumentLine = RawStr<DOC_LINE_LEN>;

/// Length of a line in a document. Document lines are fixed-length and
/// padded on the right with spaces.
pub const DOC_LINE_LEN: usize = 80;

impl DocumentRecord<RawDocumentLine> {
    /// Maximum number of lines we will accept in a document. This is simply
    /// the maximum number that will fit in a 32-bit space.
    pub const MAX_LINES: usize = i32::MAX as usize / DOC_LINE_LEN;

    /// Reads a document record: a line count followed by that many 80-byte
    /// lines.
    fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<Record, Error> {
        let start_offset = r.stream_position()?;
        let n: u32 = endian.parse(read_bytes(r)?);
        let n = n as usize;
        if n > Self::MAX_LINES {
            Err(Error::BadDocumentLength {
                offset: start_offset,
                n,
                max: Self::MAX_LINES,
            })
        } else {
            let mut lines = Vec::with_capacity(n);
            for _ in 0..n {
                lines.push(RawStr(read_bytes(r)?));
            }
            let end_offset = r.stream_position()?;
            Ok(Record::Document(DocumentRecord {
                offsets: start_offset..end_offset,
                lines,
            }))
        }
    }

    /// Decodes each line into UTF-8.
    pub fn decode(self, decoder: &Decoder) -> DecodedRecord {
        DecodedRecord::Document(DocumentRecord {
            offsets: self.offsets.clone(),
            lines: self
                .lines
                .iter()
                .map(|s| decoder.decode_slice(&s.0).to_string())
                .collect(),
        })
    }
}

impl<S> Header for DocumentRecord<S>
where
    S: Debug,
{
    fn offsets(&self) -> Range<u64> {
        self.offsets.clone()
    }
}
+
/// An extension record (record type 7) parser.  `SIZE`/`COUNT` are the
/// expected element size and count (`None` means unchecked), used by
/// `Extension::check_size`.
trait ExtensionRecord {
    const SUBTYPE: u32;
    const SIZE: Option<u32>;
    const COUNT: Option<u32>;
    const NAME: &'static str;
    fn parse(ext: &Extension, endian: Endian) -> Result<Record, Warning>;
}

/// Machine integer info record (subtype 3): eight 32-bit integers describing
/// the writing machine.
#[derive(Clone, Debug)]
pub struct IntegerInfoRecord {
    pub offsets: Range<u64>,
    pub version: (i32, i32, i32),
    pub machine_code: i32,
    pub floating_point_rep: i32,
    pub compression_code: i32,
    pub endianness: i32,
    pub character_code: i32,
}

impl ExtensionRecord for IntegerInfoRecord {
    const SUBTYPE: u32 = 3;
    const SIZE: Option<u32> = Some(4);
    const COUNT: Option<u32> = Some(8);
    const NAME: &'static str = "integer record";

    fn parse(ext: &Extension, endian: Endian) -> Result<Record, Warning> {
        ext.check_size::<Self>()?;

        // Size/count were checked above, so the 8 reads cannot fail.
        let mut input = &ext.data[..];
        let data: Vec<i32> = (0..8)
            .map(|_| endian.parse(read_bytes(&mut input).unwrap()))
            .collect();
        Ok(Record::IntegerInfo(IntegerInfoRecord {
            offsets: ext.offsets.clone(),
            version: (data[0], data[1], data[2]),
            machine_code: data[3],
            floating_point_rep: data[4],
            compression_code: data[5],
            endianness: data[6],
            character_code: data[7],
        }))
    }
}

/// Machine float info record (subtype 4): the writing machine's special
/// floating-point values.
#[derive(Clone, Debug)]
pub struct FloatInfoRecord {
    pub sysmis: f64,
    pub highest: f64,
    pub lowest: f64,
}

impl ExtensionRecord for FloatInfoRecord {
    const SUBTYPE: u32 = 4;
    const SIZE: Option<u32> = Some(8);
    const COUNT: Option<u32> = Some(3);
    const NAME: &'static str = "floating point record";

    fn parse(ext: &Extension, endian: Endian) -> Result<Record, Warning> {
        ext.check_size::<Self>()?;

        // Size/count were checked above, so the 3 reads cannot fail.
        let mut input = &ext.data[..];
        let data: Vec<f64> = (0..3)
            .map(|_| endian.parse(read_bytes(&mut input).unwrap()))
            .collect();
        Ok(Record::FloatInfo(FloatInfoRecord {
            sysmis: data[0],
            highest: data[1],
            lowest: data[2],
        }))
    }
}
+
/// How a multiple-dichotomy set's category labels are derived.
#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum CategoryLabels {
    VarLabels,
    CountedValues,
}

/// The kind of a multiple response set.
#[derive(Clone, Debug)]
pub enum MultipleResponseType {
    MultipleDichotomy {
        value: RawString,
        labels: CategoryLabels,
    },
    MultipleCategory,
}

impl MultipleResponseType {
    /// Parses the type tag at the start of `input`: `C` (multiple category),
    /// `D<counted-string>` (dichotomy, labels from variable labels), or
    /// `E 1 `/`E 11 ` followed by a counted string (dichotomy with an
    /// explicit label source).  Returns the type and the remaining input.
    fn parse(input: &[u8]) -> Result<(MultipleResponseType, &[u8]), Warning> {
        let (mr_type, input) = match input.split_first() {
            Some((b'C', input)) => (MultipleResponseType::MultipleCategory, input),
            Some((b'D', input)) => {
                let (value, input) = parse_counted_string(input)?;
                (
                    MultipleResponseType::MultipleDichotomy {
                        value,
                        labels: CategoryLabels::VarLabels,
                    },
                    input,
                )
            }
            Some((b'E', input)) => {
                // " 1 " selects counted-value labels; " 11 " variable labels.
                let (labels, input) = if let Some(rest) = input.strip_prefix(b" 1 ") {
                    (CategoryLabels::CountedValues, rest)
                } else if let Some(rest) = input.strip_prefix(b" 11 ") {
                    (CategoryLabels::VarLabels, rest)
                } else {
                    return Err(Warning::TBD);
                };
                let (value, input) = parse_counted_string(input)?;
                (
                    MultipleResponseType::MultipleDichotomy { value, labels },
                    input,
                )
            }
            _ => return Err(Warning::TBD),
        };
        Ok((mr_type, input))
    }
}
+
/// One multiple response set: a named group of variables with a label and a
/// set type.
#[derive(Clone, Debug)]
pub struct MultipleResponseSet<I, S>
where
    I: Debug,
    S: Debug,
{
    pub name: I,
    pub label: S,
    pub mr_type: MultipleResponseType,
    pub short_names: Vec<I>,
}
+
+impl MultipleResponseSet<RawString, RawString> {
+ fn parse(input: &[u8]) -> Result<(Self, &[u8]), Warning> {
+ let Some(equals) = input.iter().position(|&b| b == b'=') else {
+ return Err(Warning::TBD);
+ };
+ let (name, input) = input.split_at(equals);
+ let (mr_type, input) = MultipleResponseType::parse(input)?;
+ let Some(input) = input.strip_prefix(b" ") else {
+ return Err(Warning::TBD);
+ };
+ let (label, mut input) = parse_counted_string(input)?;
+ let mut vars = Vec::new();
+ while input.first() != Some(&b'\n') {
+ match input.split_first() {
+ Some((b' ', rest)) => {
+ let Some(length) = rest.iter().position(|b| b" \n".contains(b)) else {
+ return Err(Warning::TBD);
+ };
+ let (var, rest) = rest.split_at(length);
+ if !var.is_empty() {
+ vars.push(var.into());
+ }
+ input = rest;
+ }
+ _ => return Err(Warning::TBD),
+ }
+ }
+ while input.first() == Some(&b'\n') {
+ input = &input[1..];
+ }
+ Ok((
+ MultipleResponseSet {
+ name: name.into(),
+ label,
+ mr_type,
+ short_names: vars,
+ },
+ input,
+ ))
+ }
+
+ fn decode(
+ &self,
+ decoder: &Decoder,
+ ) -> Result<MultipleResponseSet<Identifier, String>, Warning> {
+ let mut short_names = Vec::with_capacity(self.short_names.len());
+ for short_name in self.short_names.iter() {
+ if let Some(short_name) = decoder
+ .decode_identifier(short_name)
+ .map_err(Warning::InvalidMrSetName)
+ .issue_warning(&decoder.warn)
+ {
+ short_names.push(short_name);
+ }
+ }
+ Ok(MultipleResponseSet {
+ name: decoder
+ .decode_identifier(&self.name)
+ .map_err(Warning::InvalidMrSetVariableName)?,
+ label: decoder.decode(&self.label).to_string(),
+ mr_type: self.mr_type.clone(),
+ short_names,
+ })
+ }
+}
+
/// Multiple response sets record (extension subtype 7).
#[derive(Clone, Debug)]
pub struct MultipleResponseRecord<I, S>(pub Vec<MultipleResponseSet<I, S>>)
where
    I: Debug,
    S: Debug;

impl ExtensionRecord for MultipleResponseRecord<RawString, RawString> {
    const SUBTYPE: u32 = 7;
    const SIZE: Option<u32> = Some(1);
    const COUNT: Option<u32> = None;
    const NAME: &'static str = "multiple response set record";

    /// Parses the record as a sequence of back-to-back set definitions.
    fn parse(ext: &Extension, _endian: Endian) -> Result<Record, Warning> {
        ext.check_size::<Self>()?;

        let mut input = &ext.data[..];
        let mut sets = Vec::new();
        while !input.is_empty() {
            let (set, rest) = MultipleResponseSet::parse(input)?;
            sets.push(set);
            input = rest;
        }
        Ok(Record::MultipleResponse(MultipleResponseRecord(sets)))
    }
}

impl MultipleResponseRecord<RawString, RawString> {
    /// Decodes every set; sets that fail to decode are dropped with a
    /// warning.
    fn decode(self, decoder: &Decoder) -> DecodedRecord {
        let mut sets = Vec::new();
        for set in self.0.iter() {
            if let Some(set) = set.decode(decoder).issue_warning(&decoder.warn) {
                sets.push(set);
            }
        }
        DecodedRecord::MultipleResponse(MultipleResponseRecord(sets))
    }
}

/// Parses a counted string: a decimal length, a space, then that many bytes.
/// Returns the string and the remaining input.
fn parse_counted_string(input: &[u8]) -> Result<(RawString, &[u8]), Warning> {
    let Some(space) = input.iter().position(|&b| b == b' ') else {
        return Err(Warning::TBD);
    };
    let Ok(length) = from_utf8(&input[..space]) else {
        return Err(Warning::TBD);
    };
    let Ok(length): Result<usize, _> = length.parse() else {
        return Err(Warning::TBD);
    };

    let input = &input[space + 1..];
    if input.len() < length {
        return Err(Warning::TBD);
    };

    let (string, rest) = input.split_at(length);
    Ok((string.into(), rest))
}
+
/// A variable's measurement level.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum Measure {
    Nominal,
    Ordinal,
    Scale,
}

impl Measure {
    /// Default measurement level for a variable of `var_type` (none for
    /// numeric variables, nominal for strings).
    pub fn default_for_type(var_type: VarType) -> Option<Measure> {
        match var_type {
            VarType::Numeric => None,
            VarType::String => Some(Self::Nominal),
        }
    }

    /// Decodes the on-disk code: 0 means unspecified; other valid codes map
    /// to a level; anything else is a warning.
    fn try_decode(source: u32) -> Result<Option<Measure>, Warning> {
        match source {
            0 => Ok(None),
            1 => Ok(Some(Measure::Nominal)),
            2 => Ok(Some(Measure::Ordinal)),
            3 => Ok(Some(Measure::Scale)),
            _ => Err(Warning::InvalidMeasurement(source)),
        }
    }
}

/// A variable's display alignment.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum Alignment {
    Left,
    Right,
    Center,
}

impl Alignment {
    /// Decodes the on-disk code: 0 means unspecified; other valid codes map
    /// to an alignment; anything else is a warning.
    fn try_decode(source: u32) -> Result<Option<Alignment>, Warning> {
        match source {
            0 => Ok(None),
            1 => Ok(Some(Alignment::Left)),
            2 => Ok(Some(Alignment::Right)),
            3 => Ok(Some(Alignment::Center)),
            _ => Err(Warning::InvalidAlignment(source)),
        }
    }

    /// Default alignment for a variable of `var_type`.
    pub fn default_for_type(var_type: VarType) -> Self {
        match var_type {
            VarType::Numeric => Self::Right,
            VarType::String => Self::Left,
        }
    }
}

/// Display settings for one variable.
#[derive(Clone, Debug)]
pub struct VarDisplay {
    pub measure: Option<Measure>,
    pub width: Option<u32>,
    pub alignment: Option<Alignment>,
}

/// Variable display record (extension subtype 11): one `VarDisplay` per
/// variable.
#[derive(Clone, Debug)]
pub struct VarDisplayRecord(pub Vec<VarDisplay>);
+
impl VarDisplayRecord {
    const SUBTYPE: u32 = 11;

    /// Parses the variable display record.  Each variable contributes either
    /// 3 fields (measure, width, alignment) or 2 (measure, alignment); which
    /// layout is in use is inferred from `ext.count` relative to `n_vars`.
    /// Invalid measure/alignment codes become warnings, not errors.
    fn parse(
        ext: &Extension,
        n_vars: usize,
        endian: Endian,
        warn: &dyn Fn(Warning),
    ) -> Result<Record, Warning> {
        if ext.size != 4 {
            return Err(Warning::BadRecordSize {
                offset: ext.offsets.start,
                record: String::from("variable display record"),
                size: ext.size,
                expected_size: 4,
            });
        }

        let has_width = if ext.count as usize == 3 * n_vars {
            true
        } else if ext.count as usize == 2 * n_vars {
            false
        } else {
            return Err(Warning::TBD);
        };

        let mut var_displays = Vec::new();
        let mut input = &ext.data[..];
        for _ in 0..n_vars {
            let measure = Measure::try_decode(endian.parse(read_bytes(&mut input).unwrap()))
                .issue_warning(&warn)
                .flatten();
            let width = has_width.then(|| endian.parse(read_bytes(&mut input).unwrap()));
            let alignment = Alignment::try_decode(endian.parse(read_bytes(&mut input).unwrap()))
                .issue_warning(&warn)
                .flatten();
            var_displays.push(VarDisplay {
                measure,
                width,
                alignment,
            });
        }
        Ok(Record::VarDisplay(VarDisplayRecord(var_displays)))
    }
}
+
/// Missing values for one long string variable.
///
/// `N` is the variable-name type and `V` the value type: raw records use
/// `<RawString, RawStr<8>>`, and decoding produces `<Identifier, String>`.
#[derive(Clone, Debug)]
pub struct LongStringMissingValues<N, V>
where
    N: Debug,
    V: Debug,
{
    /// Variable name.
    pub var_name: N,

    /// Missing values.
    pub missing_values: MissingValues<V>,
}

impl LongStringMissingValues<RawString, RawStr<8>> {
    /// Decodes the raw variable name and missing values using the file's
    /// character encoding.  Fails only when the variable name is not a valid
    /// identifier.
    fn decode(
        &self,
        decoder: &Decoder,
    ) -> Result<LongStringMissingValues<Identifier, String>, IdError> {
        Ok(LongStringMissingValues {
            var_name: decoder.decode_identifier(&self.var_name)?,
            missing_values: self.missing_values.decode(decoder),
        })
    }
}
+
/// Extension record (subtype 22) listing missing values for long string
/// variables, which do not fit in the main variable records.
#[derive(Clone, Debug)]
pub struct LongStringMissingValueRecord<N, V>(pub Vec<LongStringMissingValues<N, V>>)
where
    N: Debug,
    V: Debug;

impl ExtensionRecord for LongStringMissingValueRecord<RawString, RawStr<8>> {
    const SUBTYPE: u32 = 22;
    const SIZE: Option<u32> = Some(1);
    const COUNT: Option<u32> = None;
    const NAME: &'static str = "long string missing values record";

    fn parse(ext: &Extension, endian: Endian) -> Result<Record, Warning> {
        ext.check_size::<Self>()?;

        let mut input = &ext.data[..];
        let mut missing_value_set = Vec::new();
        // Entries are packed back to back until the record's data runs out.
        while !input.is_empty() {
            let var_name = read_string(&mut input, endian)?;
            let n_missing_values: u8 = endian.parse(read_bytes(&mut input)?);
            let value_len: u32 = endian.parse(read_bytes(&mut input)?);
            // Only 8-byte missing values are supported here.
            if value_len != 8 {
                // NOTE(review): `- 8` rewinds past more than the 4-byte
                // `value_len` field just consumed — confirm the intended
                // field this offset should point at.
                let offset = (ext.data.len() - input.len() - 8) as u64 + ext.offsets.start;
                return Err(Warning::BadLongMissingValueLength {
                    record_offset: ext.offsets.start,
                    offset,
                    value_len,
                });
            }
            let mut values = Vec::new();
            for i in 0..n_missing_values {
                let value: [u8; 8] = read_bytes(&mut input)?;
                let numeric_value: u64 = endian.parse(value);
                let value = if i > 0 && numeric_value == 8 {
                    // Tolerate files written by old, buggy versions of PSPP
                    // where we believed that the value_length was repeated
                    // before each missing value.
                    read_bytes(&mut input)?
                } else {
                    value
                };
                values.push(Value::String(RawStr(value)));
            }
            let missing_values = MissingValues {
                values,
                range: None,
            };
            missing_value_set.push(LongStringMissingValues {
                var_name,
                missing_values,
            });
        }
        Ok(Record::LongStringMissingValues(
            LongStringMissingValueRecord(missing_value_set),
        ))
    }
}
+
+impl LongStringMissingValueRecord<RawString, RawStr<8>> {
+ pub fn decode(self, decoder: &Decoder) -> LongStringMissingValueRecord<Identifier, String> {
+ let mut mvs = Vec::with_capacity(self.0.len());
+ for mv in self.0.iter() {
+ if let Some(mv) = mv
+ .decode(decoder)
+ .map_err(Warning::InvalidLongStringMissingValueVariableName)
+ .issue_warning(&decoder.warn)
+ {
+ mvs.push(mv);
+ }
+ }
+ LongStringMissingValueRecord(mvs)
+ }
+}
+
/// Extension record (subtype 20) naming the character encoding of the file's
/// text, e.g. "UTF-8".
#[derive(Clone, Debug)]
pub struct EncodingRecord(pub String);

impl ExtensionRecord for EncodingRecord {
    const SUBTYPE: u32 = 20;
    const SIZE: Option<u32> = Some(1);
    const COUNT: Option<u32> = None;
    const NAME: &'static str = "encoding record";

    /// The payload is the encoding name itself, which must be valid UTF-8.
    fn parse(ext: &Extension, _endian: Endian) -> Result<Record, Warning> {
        ext.check_size::<Self>()?;

        Ok(Record::Encoding(EncodingRecord(
            String::from_utf8(ext.data.clone()).map_err(|_| Warning::BadEncodingName {
                offset: ext.offsets.start,
            })?,
        )))
    }
}
+
/// Extension record (subtype 16) carrying a 64-bit case count, for files
/// whose case count does not fit the header's 32-bit field.
#[derive(Clone, Debug)]
pub struct NumberOfCasesRecord {
    /// Always observed as 1.
    pub one: u64,

    /// Number of cases.
    pub n_cases: u64,
}

impl ExtensionRecord for NumberOfCasesRecord {
    const SUBTYPE: u32 = 16;
    const SIZE: Option<u32> = Some(8);
    const COUNT: Option<u32> = Some(2);
    const NAME: &'static str = "extended number of cases record";

    /// Parses the two 64-bit fields; sizes were validated by `check_size`.
    fn parse(ext: &Extension, endian: Endian) -> Result<Record, Warning> {
        ext.check_size::<Self>()?;

        let mut input = &ext.data[..];
        let one = endian.parse(read_bytes(&mut input)?);
        let n_cases = endian.parse(read_bytes(&mut input)?);

        Ok(Record::NumberOfCases(NumberOfCasesRecord { one, n_cases }))
    }
}
+
/// An extension record whose payload is text in the file's encoding; its
/// interpretation is deferred until the encoding is known.
#[derive(Clone, Debug)]
pub struct TextRecord {
    pub offsets: Range<u64>,

    /// Type of record.
    pub rec_type: TextRecordType,

    /// The text content of the record.
    pub text: RawString,
}

/// Which text-based extension record a `TextRecord` holds, determined by the
/// extension subtype at read time.
#[derive(Clone, Copy, Debug)]
pub enum TextRecordType {
    VariableSets,
    ProductInfo,
    LongNames,
    VeryLongStrings,
    FileAttributes,
    VariableAttributes,
}

impl TextRecord {
    /// Wraps an extension record's raw payload as text of the given type.
    fn new(extension: Extension, rec_type: TextRecordType) -> Self {
        Self {
            offsets: extension.offsets,
            rec_type,
            text: extension.data.into(),
        }
    }
    /// Dispatches to the type-specific decoder, now that the file's
    /// character encoding is available through `decoder`.
    pub fn decode(self, decoder: &Decoder) -> DecodedRecord {
        match self.rec_type {
            TextRecordType::VariableSets => {
                DecodedRecord::VariableSets(VariableSetRecord::decode(&self, decoder))
            }
            TextRecordType::ProductInfo => {
                DecodedRecord::ProductInfo(ProductInfoRecord::decode(&self, decoder))
            }
            TextRecordType::LongNames => {
                DecodedRecord::LongNames(LongNamesRecord::decode(&self, decoder))
            }
            TextRecordType::VeryLongStrings => {
                DecodedRecord::VeryLongStrings(VeryLongStringsRecord::decode(&self, decoder))
            }
            TextRecordType::FileAttributes => {
                DecodedRecord::FileAttributes(FileAttributeRecord::decode(&self, decoder))
            }
            TextRecordType::VariableAttributes => {
                DecodedRecord::VariableAttributes(VariableAttributeRecord::decode(&self, decoder))
            }
        }
    }
}
+
+#[derive(Clone, Debug)]
+pub struct VeryLongString {
+ pub short_name: Identifier,
+ pub length: u16,
+}
+
+impl VeryLongString {
+ fn parse(decoder: &Decoder, input: &str) -> Result<VeryLongString, Warning> {
+ let Some((short_name, length)) = input.split_once('=') else {
+ return Err(Warning::TBD);
+ };
+ let short_name = decoder
+ .new_identifier(short_name)
+ .map_err(Warning::InvalidLongStringName)?;
+ let length = length.parse().map_err(|_| Warning::TBD)?;
+ Ok(VeryLongString { short_name, length })
+ }
+}
+
+#[derive(Clone, Debug)]
+pub struct VeryLongStringsRecord(Vec<VeryLongString>);
+
+impl VeryLongStringsRecord {
+ fn decode(source: &TextRecord, decoder: &Decoder) -> Self {
+ let input = decoder.decode(&source.text);
+ let mut very_long_strings = Vec::new();
+ for tuple in input
+ .split('\0')
+ .map(|s| s.trim_end_matches('\t'))
+ .filter(|s| !s.is_empty())
+ {
+ if let Some(vls) = VeryLongString::parse(decoder, tuple).issue_warning(&decoder.warn) {
+ very_long_strings.push(vls)
+ }
+ }
+ VeryLongStringsRecord(very_long_strings)
+ }
+}
+
/// A custom attribute: a name plus one or more string values.
#[derive(Clone, Debug)]
pub struct Attribute {
    pub name: Identifier,
    pub values: Vec<String>,
}

impl Attribute {
    /// Parses one attribute of the form `name('value'⏎'value'⏎...)`, where
    /// `⏎` is a newline, returning the attribute and the text following the
    /// closing `)`.
    fn parse<'a>(decoder: &Decoder, input: &'a str) -> Result<(Attribute, &'a str), Warning> {
        let Some((name, mut input)) = input.split_once('(') else {
            return Err(Warning::TBD);
        };
        let name = decoder
            .new_identifier(name)
            .map_err(Warning::InvalidAttributeName)?;
        let mut values = Vec::new();
        loop {
            // Each value occupies one newline-terminated line.
            let Some((value, rest)) = input.split_once('\n') else {
                return Err(Warning::TBD);
            };
            // Values are conventionally single-quoted; accept, but warn
            // about, an unquoted value.
            if let Some(stripped) = value
                .strip_prefix('\'')
                .and_then(|value| value.strip_suffix('\''))
            {
                values.push(stripped.into());
            } else {
                decoder.warn(Warning::TBD);
                values.push(value.into());
            }
            // A `)` immediately after the newline ends the value list.
            if let Some(rest) = rest.strip_prefix(')') {
                let attribute = Attribute { name, values };
                return Ok((attribute, rest));
            };
            input = rest;
        }
    }
}
+
/// A collection of named attributes, as found in file-attribute and
/// variable-attribute records.
#[derive(Clone, Debug, Default)]
pub struct AttributeSet(pub HashMap<Identifier, Vec<String>>);

impl AttributeSet {
    /// Parses attributes until end of input or until the next character
    /// equals `sentinel` (`None` means only end of input stops the loop).
    /// Returns the set plus the text following the sentinel.
    fn parse<'a>(
        decoder: &Decoder,
        mut input: &'a str,
        sentinel: Option<char>,
    ) -> Result<(AttributeSet, &'a str), Warning> {
        let mut attributes = HashMap::new();
        let rest = loop {
            match input.chars().next() {
                None => break input,
                // NOTE(review): `&input[1..]` assumes a one-byte (ASCII)
                // sentinel, which holds for the `'/'` that callers pass.
                c if c == sentinel => break &input[1..],
                _ => {
                    let (attribute, rest) = Attribute::parse(decoder, input)?;
                    // XXX report duplicate name
                    attributes.insert(attribute.name, attribute.values);
                    input = rest;
                }
            }
        };
        Ok((AttributeSet(attributes), rest))
    }
}
+
/// Data-file attributes from a subtype-17 text record.
#[derive(Clone, Debug, Default)]
pub struct FileAttributeRecord(pub AttributeSet);

impl FileAttributeRecord {
    /// Decodes the entire payload as one attribute set.  Trailing text draws
    /// a warning; a parse failure yields an empty record (after warning).
    fn decode(source: &TextRecord, decoder: &Decoder) -> Self {
        let input = decoder.decode(&source.text);
        match AttributeSet::parse(decoder, &input, None).issue_warning(&decoder.warn) {
            Some((set, rest)) => {
                if !rest.is_empty() {
                    decoder.warn(Warning::TBD);
                }
                FileAttributeRecord(set)
            }
            None => FileAttributeRecord::default(),
        }
    }
}
+
+#[derive(Clone, Debug)]
+pub struct VarAttributeSet {
+ pub long_var_name: Identifier,
+ pub attributes: AttributeSet,
+}
+
+impl VarAttributeSet {
+ fn parse<'a>(decoder: &Decoder, input: &'a str) -> Result<(VarAttributeSet, &'a str), Warning> {
+ let Some((long_var_name, rest)) = input.split_once(':') else {
+ return Err(Warning::TBD);
+ };
+ let long_var_name = decoder
+ .new_identifier(long_var_name)
+ .map_err(Warning::InvalidAttributeVariableName)?;
+ let (attributes, rest) = AttributeSet::parse(decoder, rest, Some('/'))?;
+ let var_attribute = VarAttributeSet {
+ long_var_name,
+ attributes,
+ };
+ Ok((var_attribute, rest))
+ }
+}
+
/// Per-variable attributes from a subtype-18 text record.
#[derive(Clone, Debug)]
pub struct VariableAttributeRecord(Vec<VarAttributeSet>);

impl VariableAttributeRecord {
    /// Decodes as many variable attribute sets as possible.  Decoding stops
    /// at the first set that fails to parse (after issuing a warning), since
    /// the remaining text cannot be reliably resynchronized.
    fn decode(source: &TextRecord, decoder: &Decoder) -> Self {
        let decoded = decoder.decode(&source.text);
        let mut input = decoded.as_ref();
        let mut var_attribute_sets = Vec::new();
        while !input.is_empty() {
            let Some((var_attribute, rest)) =
                VarAttributeSet::parse(decoder, input).issue_warning(&decoder.warn)
            else {
                break;
            };
            var_attribute_sets.push(var_attribute);
            input = rest;
        }
        VariableAttributeRecord(var_attribute_sets)
    }
}
+
+#[derive(Clone, Debug)]
+pub struct LongName {
+ pub short_name: Identifier,
+ pub long_name: Identifier,
+}
+
+impl LongName {
+ fn parse(input: &str, decoder: &Decoder) -> Result<Self, Warning> {
+ let Some((short_name, long_name)) = input.split_once('=') else {
+ return Err(Warning::TBD);
+ };
+ let short_name = decoder
+ .new_identifier(short_name)
+ .map_err(Warning::InvalidShortName)?;
+ let long_name = decoder
+ .new_identifier(long_name)
+ .map_err(Warning::InvalidLongName)?;
+ Ok(LongName {
+ short_name,
+ long_name,
+ })
+ }
+}
+
+#[derive(Clone, Debug)]
+pub struct LongNamesRecord(Vec<LongName>);
+
+impl LongNamesRecord {
+ fn decode(source: &TextRecord, decoder: &Decoder) -> Self {
+ let input = decoder.decode(&source.text);
+ let mut names = Vec::new();
+ for pair in input.split('\t').filter(|s| !s.is_empty()) {
+ if let Some(long_name) = LongName::parse(pair, decoder).issue_warning(&decoder.warn) {
+ names.push(long_name);
+ }
+ }
+ LongNamesRecord(names)
+ }
+}
+
/// Free-text extra product information (subtype 10), decoded into a `String`
/// using the file's character encoding.
#[derive(Clone, Debug)]
pub struct ProductInfoRecord(pub String);

impl ProductInfoRecord {
    /// Decodes the raw record text; this cannot fail.
    fn decode(source: &TextRecord, decoder: &Decoder) -> Self {
        Self(decoder.decode(&source.text).into())
    }
}
+#[derive(Clone, Debug)]
+pub struct VariableSet {
+ pub name: String,
+ pub vars: Vec<Identifier>,
+}
+
+impl VariableSet {
+ fn parse(input: &str, decoder: &Decoder) -> Result<Self, Warning> {
+ let (name, input) = input.split_once('=').ok_or(Warning::TBD)?;
+ let mut vars = Vec::new();
+ for var in input.split_ascii_whitespace() {
+ if let Some(identifier) = decoder
+ .new_identifier(var)
+ .map_err(Warning::InvalidVariableSetName)
+ .issue_warning(&decoder.warn)
+ {
+ vars.push(identifier);
+ }
+ }
+ Ok(VariableSet {
+ name: name.into(),
+ vars,
+ })
+ }
+}
+
+#[derive(Clone, Debug)]
+pub struct VariableSetRecord {
+ pub offsets: Range<u64>,
+ pub sets: Vec<VariableSet>,
+}
+
+impl VariableSetRecord {
+ fn decode(source: &TextRecord, decoder: &Decoder) -> VariableSetRecord {
+ let mut sets = Vec::new();
+ let input = decoder.decode(&source.text);
+ for line in input.lines() {
+ if let Some(set) = VariableSet::parse(line, decoder).issue_warning(&decoder.warn) {
+ sets.push(set)
+ }
+ }
+ VariableSetRecord {
+ offsets: source.offsets.clone(),
+ sets,
+ }
+ }
+}
+
+trait IssueWarning<T> {
+ fn issue_warning<F>(self, warn: &F) -> Option<T>
+ where
+ F: Fn(Warning);
+}
+impl<T> IssueWarning<T> for Result<T, Warning> {
+ fn issue_warning<F>(self, warn: &F) -> Option<T>
+ where
+ F: Fn(Warning),
+ {
+ match self {
+ Ok(result) => Some(result),
+ Err(error) => {
+ warn(error);
+ None
+ }
+ }
+ }
+}
+
/// A raw extension (type 7) record: a subtype plus an uninterpreted payload
/// of `size * count` bytes.
#[derive(Clone, Debug)]
pub struct Extension {
    /// File offsets spanned by the record's data payload (the
    /// subtype/size/count header words precede `offsets.start`).
    pub offsets: Range<u64>,

    /// Record subtype.
    pub subtype: u32,

    /// Size of each data element.
    pub size: u32,

    /// Number of data elements.
    pub count: u32,

    /// `size * count` bytes of data.
    pub data: Vec<u8>,
}
+
impl Extension {
    /// Validates this record's element `size` and `count` against extension
    /// type `E`'s compile-time expectations (`None` means "don't care"),
    /// returning a warning on mismatch.
    fn check_size<E: ExtensionRecord>(&self) -> Result<(), Warning> {
        if let Some(expected_size) = E::SIZE {
            if self.size != expected_size {
                return Err(Warning::BadRecordSize {
                    offset: self.offsets.start,
                    record: E::NAME.into(),
                    size: self.size,
                    expected_size,
                });
            }
        }
        if let Some(expected_count) = E::COUNT {
            if self.count != expected_count {
                return Err(Warning::BadRecordCount {
                    offset: self.offsets.start,
                    record: E::NAME.into(),
                    count: self.count,
                    expected_count,
                });
            }
        }
        Ok(())
    }

    /// Reads one extension record and dispatches on its subtype.  Unknown
    /// subtypes are preserved verbatim as `Record::OtherExtension`; a known
    /// subtype that fails to parse is reported through `warn` and dropped
    /// (`Ok(None)`), so a bad record does not abort reading the file.
    fn read<R: Read + Seek>(
        r: &mut R,
        endian: Endian,
        n_vars: usize,
        warn: &dyn Fn(Warning),
    ) -> Result<Option<Record>, Error> {
        let subtype = endian.parse(read_bytes(r)?);
        let header_offset = r.stream_position()?;
        let size: u32 = endian.parse(read_bytes(r)?);
        let count = endian.parse(read_bytes(r)?);
        // Guard against `size * count` overflowing u32 before allocating.
        let Some(product) = size.checked_mul(count) else {
            return Err(Error::ExtensionRecordTooLarge {
                offset: header_offset,
                subtype,
                size,
                count,
            });
        };
        let start_offset = r.stream_position()?;
        let data = read_vec(r, product as usize)?;
        let end_offset = start_offset + product as u64;
        let extension = Extension {
            offsets: start_offset..end_offset,
            subtype,
            size,
            count,
            data,
        };
        let result = match subtype {
            IntegerInfoRecord::SUBTYPE => IntegerInfoRecord::parse(&extension, endian),
            FloatInfoRecord::SUBTYPE => FloatInfoRecord::parse(&extension, endian),
            VarDisplayRecord::SUBTYPE => VarDisplayRecord::parse(&extension, n_vars, endian, warn),
            // Subtype 19 is handled by the same multiple-response parser.
            MultipleResponseRecord::SUBTYPE | 19 => {
                MultipleResponseRecord::parse(&extension, endian)
            }
            LongStringValueLabelRecord::SUBTYPE => {
                LongStringValueLabelRecord::parse(&extension, endian)
            }
            EncodingRecord::SUBTYPE => EncodingRecord::parse(&extension, endian),
            NumberOfCasesRecord::SUBTYPE => NumberOfCasesRecord::parse(&extension, endian),
            // The remaining known subtypes are text records whose parsing is
            // deferred until the character encoding is known.
            5 => Ok(Record::Text(TextRecord::new(
                extension,
                TextRecordType::VariableSets,
            ))),
            10 => Ok(Record::Text(TextRecord::new(
                extension,
                TextRecordType::ProductInfo,
            ))),
            13 => Ok(Record::Text(TextRecord::new(
                extension,
                TextRecordType::LongNames,
            ))),
            14 => Ok(Record::Text(TextRecord::new(
                extension,
                TextRecordType::VeryLongStrings,
            ))),
            17 => Ok(Record::Text(TextRecord::new(
                extension,
                TextRecordType::FileAttributes,
            ))),
            18 => Ok(Record::Text(TextRecord::new(
                extension,
                TextRecordType::VariableAttributes,
            ))),
            _ => Ok(Record::OtherExtension(extension)),
        };
        // Parse failures are downgraded to warnings.
        match result {
            Ok(result) => Ok(Some(result)),
            Err(error) => {
                warn(error);
                Ok(None)
            }
        }
    }
}
+
/// Header of the ZLIB-compressed data section of a system file.
#[derive(Clone, Debug)]
pub struct ZHeader {
    /// File offset to the start of the record.
    pub offset: u64,

    /// File offset to the ZLIB data header.
    pub zheader_offset: u64,

    /// File offset to the ZLIB trailer.
    pub ztrailer_offset: u64,

    /// Length of the ZLIB trailer in bytes.
    pub ztrailer_len: u64,
}

impl ZHeader {
    /// Reads the three 64-bit header fields at the current file position.
    fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<ZHeader, Error> {
        let offset = r.stream_position()?;
        let zheader_offset: u64 = endian.parse(read_bytes(r)?);
        let ztrailer_offset: u64 = endian.parse(read_bytes(r)?);
        let ztrailer_len: u64 = endian.parse(read_bytes(r)?);

        Ok(ZHeader {
            offset,
            zheader_offset,
            ztrailer_offset,
            ztrailer_len,
        })
    }
}
+
/// ZLIB trailer: global compression parameters plus one descriptor per
/// compressed data block.
#[derive(Clone, Debug)]
pub struct ZTrailer {
    /// File offset to the start of the record.
    pub offset: u64,

    /// Compression bias as a negative integer, e.g. -100.
    pub int_bias: i64,

    /// Always observed as zero.
    pub zero: u64,

    /// Uncompressed size of each block, except possibly the last. Only
    /// `0x3ff000` has been observed so far.
    pub block_size: u32,

    /// Block descriptors, always `(ztrailer_len - 24) / 24)` of them.
    pub blocks: Vec<ZBlock>,
}
+
/// Descriptor for one ZLIB-compressed data block.
#[derive(Clone, Debug)]
pub struct ZBlock {
    /// Offset of block of data if simple compression were used.
    pub uncompressed_ofs: u64,

    /// Actual offset within the file of the compressed data block.
    pub compressed_ofs: u64,

    /// The number of bytes in this data block after decompression. This is
    /// `block_size` in every data block but the last, which may be smaller.
    pub uncompressed_size: u32,

    /// The number of bytes in this data block, as stored compressed in this
    /// file.
    pub compressed_size: u32,
}

impl ZBlock {
    /// Reads one 24-byte block descriptor at the current position.
    fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<ZBlock, Error> {
        Ok(ZBlock {
            uncompressed_ofs: endian.parse(read_bytes(r)?),
            compressed_ofs: endian.parse(read_bytes(r)?),
            uncompressed_size: endian.parse(read_bytes(r)?),
            compressed_size: endian.parse(read_bytes(r)?),
        })
    }
}
+
+impl ZTrailer {
+ fn read<R: Read + Seek>(
+ reader: &mut R,
+ endian: Endian,
+ ztrailer_ofs: u64,
+ ztrailer_len: u64,
+ ) -> Result<Option<ZTrailer>, Error> {
+ let start_offset = reader.stream_position()?;
+ if reader.seek(SeekFrom::Start(ztrailer_ofs)).is_err() {
+ return Ok(None);
+ }
+ let int_bias = endian.parse(read_bytes(reader)?);
+ let zero = endian.parse(read_bytes(reader)?);
+ let block_size = endian.parse(read_bytes(reader)?);
+ let n_blocks: u32 = endian.parse(read_bytes(reader)?);
+ let expected_n_blocks = (ztrailer_len - 24) / 24;
+ if n_blocks as u64 != expected_n_blocks {
+ return Err(Error::BadZlibTrailerNBlocks {
+ offset: ztrailer_ofs,
+ n_blocks,
+ expected_n_blocks,
+ ztrailer_len,
+ });
+ }
+ let blocks = (0..n_blocks)
+ .map(|_| ZBlock::read(reader, endian))
+ .collect::<Result<Vec<_>, _>>()?;
+ reader.seek(SeekFrom::Start(start_offset))?;
+ Ok(Some(ZTrailer {
+ offset: ztrailer_ofs,
+ int_bias,
+ zero,
+ block_size,
+ blocks,
+ }))
+ }
+}
+
/// Reads exactly `N` bytes, except that a clean end-of-stream before the
/// first byte yields `Ok(None)`.  End-of-stream after at least one byte is an
/// error (from `read_exact`).
fn try_read_bytes<const N: usize, R: Read>(r: &mut R) -> Result<Option<[u8; N]>, IoError> {
    let mut buf = [0; N];
    match r.read(&mut buf)? {
        0 => Ok(None),
        // `read_exact` on the already-filled case (`n == N`) is a no-op.
        n => {
            r.read_exact(&mut buf[n..])?;
            Ok(Some(buf))
        }
    }
}
+
/// Reads exactly `N` bytes into a fixed-size array, failing (with
/// `UnexpectedEof`) on short input.
fn read_bytes<const N: usize, R: Read>(r: &mut R) -> Result<[u8; N], IoError> {
    let mut buf = [0u8; N];
    r.read_exact(&mut buf).map(|()| buf)
}

/// Reads exactly `n` bytes into a freshly allocated vector.
fn read_vec<R: Read>(r: &mut R, n: usize) -> Result<Vec<u8>, IoError> {
    let mut vec = vec![0u8; n];
    r.read_exact(&mut vec).map(|()| vec)
}
+
/// Reads a length-prefixed string: a 32-bit byte count followed by that many
/// bytes, returned as an (encoding-agnostic) `RawString`.
fn read_string<R: Read>(r: &mut R, endian: Endian) -> Result<RawString, IoError> {
    let length: u32 = endian.parse(read_bytes(r)?);
    Ok(read_vec(r, length as usize)?.into())
}
+
/// Value labels for one long string variable, whose labels do not fit in the
/// ordinary value-label record.
#[derive(Clone, Debug)]
pub struct LongStringValueLabels<N, S>
where
    S: Debug,
{
    pub var_name: N,
    pub width: u32,

    /// `(value, label)` pairs, where each value is `width` bytes.
    pub labels: Vec<(S, S)>,
}

impl LongStringValueLabels<RawString, RawString> {
    /// Decodes the variable name, values, and labels using the file's
    /// character encoding.  Fails only if the (trimmed) variable name is not
    /// a valid identifier.
    fn decode(
        &self,
        decoder: &Decoder,
    ) -> Result<LongStringValueLabels<Identifier, String>, Warning> {
        let var_name = decoder.decode(&self.var_name);
        let var_name = Identifier::from_encoding(var_name.trim_end(), decoder.encoding)
            .map_err(Warning::InvalidLongStringValueLabelName)?;

        let mut labels = Vec::with_capacity(self.labels.len());
        for (value, label) in self.labels.iter() {
            // Values are fixed-width fields; labels are ordinary strings.
            let value = decoder.decode_exact_length(&value.0).to_string();
            let label = decoder.decode(label).to_string();
            labels.push((value, label));
        }

        Ok(LongStringValueLabels {
            var_name,
            width: self.width,
            labels,
        })
    }
}
+
/// Extension record (subtype 21) holding long string value labels.
#[derive(Clone, Debug)]
pub struct LongStringValueLabelRecord<N, S>(pub Vec<LongStringValueLabels<N, S>>)
where
    N: Debug,
    S: Debug;

impl ExtensionRecord for LongStringValueLabelRecord<RawString, RawString> {
    const SUBTYPE: u32 = 21;
    const SIZE: Option<u32> = Some(1);
    const COUNT: Option<u32> = None;
    const NAME: &'static str = "long string value labels record";

    fn parse(ext: &Extension, endian: Endian) -> Result<Record, Warning> {
        ext.check_size::<Self>()?;

        let mut input = &ext.data[..];
        let mut label_set = Vec::new();
        // Entries are packed back to back until the record's data runs out.
        while !input.is_empty() {
            let var_name = read_string(&mut input, endian)?;
            let width: u32 = endian.parse(read_bytes(&mut input)?);
            let n_labels: u32 = endian.parse(read_bytes(&mut input)?);
            let mut labels = Vec::new();
            for _ in 0..n_labels {
                // Each label is a length-prefixed value followed by a
                // length-prefixed label string.
                let value = read_string(&mut input, endian)?;
                let label = read_string(&mut input, endian)?;
                labels.push((value, label));
            }
            label_set.push(LongStringValueLabels {
                var_name,
                width,
                labels,
            })
        }
        Ok(Record::LongStringValueLabels(LongStringValueLabelRecord(
            label_set,
        )))
    }
}
+
+impl LongStringValueLabelRecord<RawString, RawString> {
+ fn decode(self, decoder: &Decoder) -> LongStringValueLabelRecord<Identifier, String> {
+ let mut labels = Vec::with_capacity(self.0.len());
+ for label in &self.0 {
+ match label.decode(decoder) {
+ Ok(set) => labels.push(set),
+ Err(error) => decoder.warn(error),
+ }
+ }
+ LongStringValueLabelRecord(labels)
+ }
+}
--- /dev/null
+use float_next_after::NextAfter;
+use num::{Bounded, Zero};
+use ordered_float::OrderedFloat;
+use std::{
+ collections::{hash_map::Entry, HashMap},
+ error::Error as StdError,
+ fmt::{Display, Formatter, Result as FmtResult},
+ iter::repeat,
+};
+
+use crate::endian::{Endian, ToBytes};
+
+pub type Result<T, F = Error> = std::result::Result<T, F>;
+
/// An error from assembling sack input, tagged with whatever source location
/// information (file, line, offending token) is known.
#[derive(Debug)]
pub struct Error {
    pub file_name: Option<String>,
    pub line_number: Option<usize>,
    pub token: Option<String>,
    pub message: String,
}

impl Error {
    /// Builds an error, copying the optional borrowed location strings into
    /// owned form.
    fn new(
        file_name: Option<&str>,
        line_number: Option<usize>,
        token: Option<&str>,
        message: String,
    ) -> Error {
        Error {
            file_name: file_name.map(ToOwned::to_owned),
            line_number,
            token: token.map(ToOwned::to_owned),
            message,
        }
    }
}

impl StdError for Error {}

impl Display for Error {
    /// Formats as `FILE:LINE: at 'TOKEN': MESSAGE`, omitting whichever
    /// location parts are unknown (`line N:` when only the line is known).
    fn fmt(&self, f: &mut Formatter) -> FmtResult {
        if let Some(file_name) = &self.file_name {
            match self.line_number {
                Some(line_number) => write!(f, "{file_name}:{line_number}: ")?,
                None => write!(f, "{file_name}: ")?,
            }
        } else if let Some(line_number) = self.line_number {
            write!(f, "line {line_number}: ")?;
        }
        if let Some(token) = &self.token {
            write!(f, "at '{token}': ")?;
        }
        write!(f, "{}", self.message)
    }
}
+
+pub fn sack(input: &str, input_file_name: Option<&str>, endian: Endian) -> Result<Vec<u8>> {
+ let mut symbol_table = HashMap::new();
+ let output = _sack(input, input_file_name, endian, &mut symbol_table)?;
+ let output = if !symbol_table.is_empty() {
+ for (k, v) in symbol_table.iter() {
+ println!("{k} => {v:?}");
+ }
+ for (k, v) in symbol_table.iter() {
+ if v.is_none() {
+ Err(Error::new(
+ input_file_name,
+ None,
+ None,
+ format!("label {k} used but never defined"),
+ ))?
+ }
+ }
+ _sack(input, input_file_name, endian, &mut symbol_table)?
+ } else {
+ output
+ };
+ Ok(output)
+}
+
/// One assembly pass: lexes `input` and emits data items into a byte vector,
/// recording label definitions and resolving label references through
/// `symbol_table`.
fn _sack(
    input: &str,
    input_file_name: Option<&str>,
    endian: Endian,
    symbol_table: &mut HashMap<String, Option<u32>>,
) -> Result<Vec<u8>> {
    let mut lexer = Lexer::new(input, input_file_name, endian)?;
    let mut output = Vec::new();
    while parse_data_item(&mut lexer, &mut output, symbol_table)? {}
    Ok(output)
}
+
+fn parse_data_item(
+ lexer: &mut Lexer,
+ output: &mut Vec<u8>,
+ symbol_table: &mut HashMap<String, Option<u32>>,
+) -> Result<bool> {
+ if lexer.token.is_none() {
+ return Ok(false);
+ };
+
+ let initial_len = output.len();
+ match lexer.take()? {
+ Token::Integer(integer) => {
+ if let Ok(integer) = TryInto::<i32>::try_into(integer) {
+ output.extend_from_slice(&lexer.endian.to_bytes(integer));
+ } else if let Ok(integer) = TryInto::<u32>::try_into(integer) {
+ output.extend_from_slice(&lexer.endian.to_bytes(integer));
+ } else {
+ Err(lexer.error(format!(
+ "{integer} is not in the valid range [{},{}]",
+ i32::min_value(),
+ u32::max_value()
+ )))?;
+ };
+ }
+ Token::Float(float) => output.extend_from_slice(&lexer.endian.to_bytes(float.0)),
+ Token::PcSysmis => {
+ output.extend_from_slice(&[0xf5, 0x1e, 0x26, 0x02, 0x8a, 0x8c, 0xed, 0xff])
+ }
+ Token::I8 => put_integers::<u8, 1>(lexer, "i8", output)?,
+ Token::I16 => put_integers::<u16, 2>(lexer, "i16", output)?,
+ Token::I64 => put_integers::<i64, 8>(lexer, "i64", output)?,
+ Token::String(string) => output.extend_from_slice(string.as_bytes()),
+ Token::S(size) => {
+ let Some((Token::String(ref string), _)) = lexer.token else {
+ Err(lexer.error(format!("string expected after 's{size}'")))?
+ };
+ let len = string.len();
+ if len > size {
+ Err(lexer.error(format!(
+ "{len}-byte string is longer than pad length {size}"
+ )))?
+ }
+ output.extend_from_slice(string.as_bytes());
+ output.extend(repeat(b' ').take(size - len));
+ lexer.get()?;
+ }
+ Token::LParen => {
+ while !matches!(lexer.token, Some((Token::RParen, _))) {
+ parse_data_item(lexer, output, symbol_table)?;
+ }
+ lexer.get()?;
+ }
+ Token::Count => put_counted_items::<u32, 4>(lexer, "COUNT", output, symbol_table)?,
+ Token::Count8 => put_counted_items::<u8, 1>(lexer, "COUNT8", output, symbol_table)?,
+ Token::Hex => {
+ let Some((Token::String(ref string), _)) = lexer.token else {
+ Err(lexer.error(String::from("string expected after 'hex'")))?
+ };
+ let mut string = &string[..];
+ loop {
+ string = string.trim_start();
+ if string.is_empty() {
+ break;
+ };
+
+ let mut i = string.chars();
+ let Some(c0) = i.next() else { return Ok(true) };
+ let Some(c1) = i.next() else {
+ Err(lexer.error(String::from("hex string has odd number of characters")))?
+ };
+
+ let (Some(digit0), Some(digit1)) = (c0.to_digit(16), c1.to_digit(16)) else {
+ Err(lexer.error(String::from("invalid digit in hex string")))?
+ };
+ let byte = digit0 * 16 + digit1;
+ output.push(byte as u8);
+
+ string = i.as_str();
+ }
+ lexer.get()?;
+ }
+ Token::Label(name) => {
+ println!("define {name}");
+ let value = output.len() as u32;
+ match symbol_table.entry(name.clone()) {
+ Entry::Vacant(v) => {
+ v.insert(Some(value));
+ }
+ Entry::Occupied(mut o) => {
+ match o.get() {
+ Some(v) => {
+ if *v != value {
+ Err(lexer.error(format!("{name}: can't redefine label for offset {:#x} with offset {:#x}", *v, value)))?
+ }
+ }
+ None => drop(o.insert(Some(value))),
+ }
+ }
+ };
+ return Ok(true);
+ }
+ Token::At(name) => {
+ let mut value = *symbol_table.entry(name.clone()).or_insert(None);
+ loop {
+ let plus = match lexer.token {
+ Some((Token::Plus, _)) => true,
+ Some((Token::Minus, _)) => false,
+ _ => break,
+ };
+ lexer.get()?;
+
+ let operand = match lexer.token {
+ Some((Token::At(ref name), _)) => {
+ *symbol_table.entry(name.clone()).or_insert(None)
+ }
+ Some((Token::Integer(integer), _)) => Some(
+ integer
+ .try_into()
+ .map_err(|msg| lexer.error(format!("bad offset literal ({msg})")))?,
+ ),
+ _ => Err(lexer.error(String::from("expecting @label or integer literal")))?,
+ };
+ lexer.get()?;
+
+ value = match (value, operand) {
+ (Some(a), Some(b)) => Some(
+ if plus {
+ a.checked_add(b)
+ } else {
+ a.checked_sub(b)
+ }
+ .ok_or_else(|| {
+ lexer.error(String::from("overflow in offset arithmetic"))
+ })?,
+ ),
+ _ => None,
+ };
+ }
+ let value = value.unwrap_or(0);
+ output.extend_from_slice(&lexer.endian.to_bytes(value));
+ }
+ _ => (),
+ };
+ if let Some((Token::Asterisk, _)) = lexer.token {
+ lexer.get()?;
+ let Token::Integer(count) = lexer.take()? else {
+ Err(lexer.error(String::from("positive integer expected after '*'")))?
+ };
+ if count < 1 {
+ Err(lexer.error(String::from("positive integer expected after '*'")))?
+ };
+ let final_len = output.len();
+ for _ in 1..count {
+ output.extend_from_within(initial_len..final_len);
+ }
+ }
+ match lexer.token {
+ Some((Token::Semicolon, _)) => {
+ lexer.get()?;
+ }
+ Some((Token::RParen, _)) => (),
+ _ => Err(lexer.error(String::from("';' expected")))?,
+ }
+ Ok(true)
+}
+
/// Implements `COUNT`/`COUNT8`: emits a zero placeholder of type `T`, parses
/// the following parenthesized group of items, then backpatches the
/// placeholder with the number of bytes the group produced.
fn put_counted_items<T, const N: usize>(
    lexer: &mut Lexer,
    name: &str,
    output: &mut Vec<u8>,
    symbol_table: &mut HashMap<String, Option<u32>>,
) -> Result<()>
where
    T: Zero + TryFrom<usize>,
    Endian: ToBytes<T, N>,
{
    // Reserve space for the count; it is rewritten once the size is known.
    let old_size = output.len();
    output.extend_from_slice(&lexer.endian.to_bytes(T::zero()));
    let start = output.len();
    if !matches!(lexer.token, Some((Token::LParen, _))) {
        Err(lexer.error(format!("'(' expected after '{name}'")))?
    }
    lexer.get()?;
    while !matches!(lexer.token, Some((Token::RParen, _))) {
        parse_data_item(lexer, output, symbol_table)?;
    }
    lexer.get()?;
    // The count must fit in `T` (e.g. 255 for COUNT8).
    let delta = output.len() - start;
    let Ok(delta): Result<T, _> = delta.try_into() else {
        Err(lexer.error(format!("{delta} bytes is too much for '{name}'")))?
    };
    // Backpatch the reserved bytes with the actual count.
    let dest = &mut output[old_size..old_size + N];
    dest.copy_from_slice(&lexer.endian.to_bytes(delta));
    Ok(())
}
+
+fn put_integers<T, const N: usize>(
+ lexer: &mut Lexer,
+ name: &str,
+ output: &mut Vec<u8>,
+) -> Result<()>
+where
+ T: Bounded + Display + TryFrom<i64> + Copy,
+ Endian: ToBytes<T, N>,
+{
+ println!("put_integers {:?}", lexer.token);
+ let mut n = 0;
+ while let Some(integer) = lexer.take_if(|t| match t {
+ Token::Integer(integer) => Some(*integer),
+ _ => None,
+ })? {
+ println!("got integer {integer}");
+ let Ok(integer) = integer.try_into() else {
+ Err(lexer.error(format!(
+ "{integer} is not in the valid range [{},{}]",
+ T::min_value(),
+ T::max_value()
+ )))?
+ };
+ output.extend_from_slice(&lexer.endian.to_bytes(integer));
+ n += 1;
+ }
+ println!("put_integers {:?} {n}", lexer.token);
+ if n == 0 {
+ Err(lexer.error(format!("integer expected after '{name}'")))?
+ }
+ Ok(())
+}
+
/// Lexical token of the sack input language.
#[derive(PartialEq, Eq, Clone, Debug)]
enum Token {
    Integer(i64),
    Float(OrderedFloat<f64>),
    PcSysmis,      // The PC+ system-missing value constant.
    String(String),
    Semicolon,
    Asterisk,
    LParen,
    RParen,
    I8,            // Keywords selecting the width of integer items.
    I16,
    I64,
    S(usize),      // `sN`: string space-padded to N bytes.
    Count,         // 32-bit byte count of the following group.
    Count8,        // 8-bit byte count of the following group.
    Hex,           // Hex-encoded string data.
    Label(String), // `name:` label definition.
    At(String),    // `@name` label reference.
    Minus,
    Plus,
}

/// Hand-written single-lookahead lexer over the sack input text.
struct Lexer<'a> {
    input: &'a str,                  // Unconsumed input.
    token: Option<(Token, &'a str)>, // Lookahead token plus its source text
                                     // (quoted in error messages).
    input_file_name: Option<&'a str>,
    line_number: usize,              // 1-based, for error messages.
    endian: Endian,
}
+
/// Skips leading whitespace, blank lines, and `#`-to-end-of-line comments,
/// returning the remaining text and the number of newlines crossed (so the
/// caller can keep its line count accurate).  The characters `<` and `>` are
/// also treated as ignorable filler.
fn skip_comments(mut s: &str) -> (&str, usize) {
    let mut newlines = 0;
    loop {
        s = s.trim_start_matches([' ', '\t', '\r', '<', '>']);
        if let Some(comment) = s.strip_prefix('#') {
            // A comment runs to the newline; an unterminated one ends input.
            match comment.split_once('\n') {
                Some((_, rest)) => {
                    s = rest;
                    newlines += 1;
                }
                None => return ("", newlines),
            }
        } else if let Some(rest) = s.strip_prefix('\n') {
            s = rest;
            newlines += 1;
        } else {
            return (s, newlines);
        }
    }
}
+
+impl<'a> Lexer<'a> {
    /// Creates a lexer over `input` and primes the one-token lookahead.
    fn new(input: &'a str, input_file_name: Option<&'a str>, endian: Endian) -> Result<Lexer<'a>> {
        let mut lexer = Lexer {
            input,
            token: None,
            input_file_name,
            line_number: 1,
            endian,
        };
        lexer.token = lexer.next()?;
        Ok(lexer)
    }
    /// Builds an `Error` at the current line, quoting the current token's
    /// source text (if any) as context.
    fn error(&self, message: String) -> Error {
        let repr = self.token.as_ref().map(|(_, repr)| *repr);
        Error::new(self.input_file_name, Some(self.line_number), repr, message)
    }
    /// Consumes and returns the current token; fails at end of input.
    fn take(&mut self) -> Result<Token> {
        let Some(token) = self.token.take() else {
            Err(self.error(String::from("unexpected end of input")))?
        };
        self.token = self.next()?;
        Ok(token.0)
    }
    /// Consumes the current token only when `condition` maps it to `Some`,
    /// returning the mapped value; otherwise the token stays in place.
    fn take_if<F, T>(&mut self, condition: F) -> Result<Option<T>>
    where
        F: FnOnce(&Token) -> Option<T>,
    {
        let Some(ref token) = self.token else {
            return Ok(None);
        };
        match condition(&token.0) {
            Some(value) => {
                self.token = self.next()?;
                Ok(Some(value))
            }
            None => Ok(None),
        }
    }
    /// Advances past the current token (which must exist) and returns a
    /// reference to the new current token, or `None` at end of input.
    fn get(&mut self) -> Result<Option<&Token>> {
        if self.token.is_none() {
            Err(self.error(String::from("unexpected end of input")))?
        } else {
            self.token = self.next()?;
            match self.token {
                Some((ref token, _)) => Ok(Some(token)),
                None => Ok(None),
            }
        }
    }
+
+ fn next(&mut self) -> Result<Option<(Token, &'a str)>> {
+ // Get the first character of the token, skipping past white space and
+ // comments.
+ let (s, n_newlines) = skip_comments(self.input);
+ self.line_number += n_newlines;
+ self.input = s;
+
+ let start = s;
+ let mut iter = s.chars();
+ let Some(c) = iter.next() else {
+ return Ok(None);
+ };
+ let (token, rest) = match c {
+ c if c.is_ascii_digit() || c == '-' => {
+ let len = s
+ .find(|c: char| {
+ !(c.is_ascii_digit() || c.is_alphabetic() || c == '.' || c == '-')
+ })
+ .unwrap_or(s.len());
+ let (number, rest) = s.split_at(len);
+ let token = if number == "-" {
+ Token::Minus
+ } else if let Some(digits) = number.strip_prefix("0x") {
+ Token::Integer(i64::from_str_radix(digits, 16).map_err(|msg| {
+ self.error(format!("bad integer literal '{number}' ({msg})"))
+ })?)
+ } else if !number.contains('.') {
+ Token::Integer(number.parse().map_err(|msg| {
+ self.error(format!("bad integer literal '{number}' ({msg})"))
+ })?)
+ } else {
+ Token::Float(number.parse().map_err(|msg| {
+ self.error(format!("bad float literal '{number}' ({msg})"))
+ })?)
+ };
+ (token, rest)
+ }
+ '"' => {
+ let s = iter.as_str();
+ let Some(len) = s.find(['\n', '"']) else {
+ Err(self.error(String::from("end-of-file inside string")))?
+ };
+ let (string, rest) = s.split_at(len);
+ let Some(rest) = rest.strip_prefix('"') else {
+ Err(self.error(format!("new-line inside string ({string}...{rest})")))?
+ };
+ (Token::String(string.into()), rest)
+ }
+ ';' => (Token::Semicolon, iter.as_str()),
+ '*' => (Token::Asterisk, iter.as_str()),
+ '+' => (Token::Plus, iter.as_str()),
+ '(' => (Token::LParen, iter.as_str()),
+ ')' => (Token::RParen, iter.as_str()),
+ c if c.is_alphabetic() || c == '@' || c == '_' => {
+ let len = s
+ .find(|c: char| {
+ !(c.is_ascii_digit()
+ || c.is_alphabetic()
+ || c == '@'
+ || c == '.'
+ || c == '_')
+ })
+ .unwrap_or(s.len());
+ let (s, rest) = s.split_at(len);
+ if let Some(rest) = rest.strip_prefix(':') {
+ (Token::Label(s.into()), rest)
+ } else if let Some(name) = s.strip_prefix('@') {
+ (Token::At(name.into()), rest)
+ } else if let Some(count) = s.strip_prefix('s') {
+ let token =
+ Token::S(count.parse().map_err(|msg| {
+ self.error(format!("bad counted string '{s}' ({msg})"))
+ })?);
+ (token, rest)
+ } else {
+ let token = match s {
+ "i8" => Token::I8,
+ "i16" => Token::I16,
+ "i64" => Token::I64,
+ "SYSMIS" => Token::Float(OrderedFloat(-f64::MAX)),
+ "PCSYSMIS" => Token::PcSysmis,
+ "LOWEST" => Token::Float((-f64::MAX).next_after(0.0).into()),
+ "HIGHEST" => Token::Float(f64::MAX.into()),
+ "ENDIAN" => Token::Integer(if self.endian == Endian::Big { 1 } else { 2 }),
+ "COUNT" => Token::Count,
+ "COUNT8" => Token::Count8,
+ "hex" => Token::Hex,
+ _ => Err(self.error(format!("invalid token '{s}'")))?,
+ };
+ (token, rest)
+ }
+ }
+ _ => Err(self.error(format!("invalid input byte '{c}'")))?,
+ };
+ self.input = rest;
+ let repr = &start[..start.len() - rest.len()];
+ println!("{token:?} {repr}");
+ Ok(Some((token, repr)))
+ }
+}
+
+#[cfg(test)]
+mod test {
+    use crate::endian::Endian;
+    use crate::sack::sack;
+    use anyhow::Result;
+    use hexplay::HexView;
+
+    // NOTE(review): both tests are smoke tests -- they check that `sack`
+    // succeeds and print a hex dump, but do not assert on the output bytes.
+
+    /// Assembles the start of a system-file header from sack syntax.
+    #[test]
+    fn basic_sack() -> Result<()> {
+        let input = r#"
+"$FL2"; s60 "$(#) SPSS DATA FILE PSPP synthetic test file";
+2; # Layout code
+28; # Nominal case size
+0; # Not compressed
+0; # Not weighted
+1; # 1 case.
+100.0; # Bias.
+"01 Jan 11"; "20:53:52";
+"PSPP synthetic test file: "; i8 244; i8 245; i8 246; i8 248; s34 "";
+i8 0 *3;
+"#;
+        let output = sack(input, None, Endian::Big)?;
+        HexView::new(&output).print()?;
+        Ok(())
+    }
+
+    /// Assembles a complete SPSS/PC+ file, exercising labels (`NAME:`),
+    /// `@NAME` references and arithmetic, `COUNT8`, and repeat counts.
+    #[test]
+    fn pcp_sack() -> Result<()> {
+        let input = r#"
+# File header.
+2; 0;
+@MAIN; @MAIN_END - @MAIN;
+@VARS; @VARS_END - @VARS;
+@LABELS; @LABELS_END - @LABELS;
+@DATA; @DATA_END - @DATA;
+(0; 0) * 11;
+i8 0 * 128;
+
+MAIN:
+  i16 1; # Fixed.
+  s62 "PCSPSS PSPP synthetic test product";
+  PCSYSMIS;
+  0; 0; i16 1; # Fixed.
+  i16 0;
+  i16 15;
+  1;
+  i16 0; # Fixed.
+  1;
+  s8 "11/28/14";
+  s8 "15:11:00";
+  s64 "PSPP synthetic test file";
+MAIN_END:
+
+VARS:
+  0; 0; 0; 0x050800; s8 "$CASENUM"; PCSYSMIS;
+  0; 0; 0; 0x010800; s8 "$DATE"; PCSYSMIS;
+  0; 0; 0; 0x050802; s8 "$WEIGHT"; PCSYSMIS;
+
+  # Numeric variable, no label or missing values.
+  0; 0; 0; 0x050800; s8 "NUM1"; PCSYSMIS;
+
+  # Numeric variable, variable label.
+  0; 0; @NUM2_LABEL - @LABELS_OFS; 0x050800; s8 "NUM2"; PCSYSMIS;
+
+  # Numeric variable with missing value.
+  0; 0; 0; 0x050800; s8 "NUM3"; 1.0;
+
+  # Numeric variable, variable label and missing value.
+  0; 0; @NUM4_LABEL - @LABELS_OFS; 0x050800; s8 "NUM4"; 2.0;
+
+  # String variable, no label or missing values.
+  0; 0; 0; 0x010800; s8 "STR1"; PCSYSMIS;
+
+  # String variable, variable label.
+  0; 0; @STR2_LABEL - @LABELS_OFS; 0x010400; s8 "STR2"; PCSYSMIS;
+
+  # String variable with missing value.
+  0; 0; 0; 0x010500; s8 "STR3"; s8 "MISS";
+
+  # String variable, variable label and missing value.
+  0; 0; @STR4_LABEL - @LABELS_OFS; 0x010100; s8 "STR4"; s8 "OTHR";
+
+  # Long string variable
+  0; 0; 0; 0x010b00; s8 "STR5"; PCSYSMIS;
+  0 * 8;
+
+  # Long string variable with variable label
+  0; 0; @STR6_LABEL - @LABELS_OFS; 0x010b00; s8 "STR6"; PCSYSMIS;
+  0 * 8;
+VARS_END:
+
+LABELS:
+  3; i8 0 0 0; LABELS_OFS: i8 0;
+  NUM2_LABEL: COUNT8("Numeric variable 2's label");
+  NUM4_LABEL: COUNT8("Another numeric variable label");
+  STR2_LABEL: COUNT8("STR2's variable label");
+  STR4_LABEL: COUNT8("STR4's variable label");
+  STR6_LABEL: COUNT8("Another string variable's label");
+LABELS_END:
+
+DATA:
+  0.0; "11/28/14"; 1.0;
+  0.0; 1.0; 2.0; PCSYSMIS; s8 "abcdefgh"; s8 "ijkl"; s8 "mnopq"; s8 "r";
+  s16 "stuvwxyzAB"; s16 "CDEFGHIJKLM";
+DATA_END:
+"#;
+        let output = sack(input, None, Endian::Big)?;
+        HexView::new(&output).print()?;
+        Ok(())
+    }
+}
--- /dev/null
+use std::sync::OnceLock;
+
+use enum_map::EnumMap;
+
+use crate::{
+ endian::Endian,
+ format::{Format, Settings as FormatSettings},
+ message::Severity,
+};
+
+/// Global user-visible settings (presumably the state behind the `SET`
+/// command -- TODO confirm against the command implementation).
+pub struct Settings {
+    /// Byte order for reading binary integers.
+    pub input_integer_format: Endian,
+    /// Byte order for reading binary floating-point numbers.
+    pub input_float_format: Endian,
+    /// Byte order for writing binary integers.
+    pub output_integer_format: Endian,
+    /// Byte order for writing binary floating-point numbers.
+    pub output_float_format: Endian,
+
+    /// `MDISPLAY`: how to display matrices in `MATRIX`...`END MATRIX`.
+    pub matrix_display: MatrixDisplay,
+
+    /// Screen height in lines (defaults to 24).
+    pub view_length: usize,
+    /// Screen width in columns (defaults to 79).
+    pub view_width: usize,
+    /// Disable unsafe operations?  (presumably `SET SAFER` -- confirm)
+    pub safer: bool,
+    /// NOTE(review): presumably controls `INCLUDE`-style file handling --
+    /// confirm.
+    pub include: bool,
+    /// Route error messages to the terminal?
+    pub route_errors_to_terminal: bool,
+    /// Route error messages to the listing file?
+    pub route_errors_to_listing: bool,
+    /// Compress system files on output?  (TODO confirm semantics)
+    pub scompress: bool,
+    /// NOTE(review): semantics not visible here -- confirm.
+    pub undefined: bool,
+    /// Value substituted for blank numeric input fields; `None` presumably
+    /// means system-missing -- confirm.
+    pub blanks: Option<f64>,
+    /// Maximum number of messages to report, per severity.
+    pub max_messages: EnumMap<Severity, usize>,
+    /// Print input back to the output?  (TODO confirm)
+    pub printback: bool,
+    /// Macro expansion settings.
+    pub macros: MacroSettings,
+    /// Maximum loop iterations (defaults to 40).
+    pub max_loops: usize,
+    /// Workspace size in bytes (defaults to 64 MiB).
+    pub workspace: usize,
+    /// Default numeric output format (defaults to F8.2).
+    pub default_format: Format,
+    /// Enable testing-only behavior?
+    pub testing: bool,
+    /// NOTE(review): presumably `SET FUZZBITS` -- semantics not visible here.
+    pub fuzz_bits: usize,
+    /// NOTE(review): presumably minimum scale for output -- confirm.
+    pub scale_min: usize,
+    /// Compatibility level for command interpretation.
+    pub commands: Compatibility,
+    /// Overall compatibility level.
+    pub global: Compatibility,
+    /// Compatibility level for syntax interpretation.
+    pub syntax: Compatibility,
+    /// Settings for parsing and displaying data formats.
+    pub formats: FormatSettings,
+    /// "Small" number threshold (defaults to 0.0001); exact use is not
+    /// visible here -- TODO confirm.
+    pub small: f64,
+}
+
+impl Default for Settings {
+    /// Returns the out-of-the-box settings: native byte orders, a 24x79
+    /// screen, 100 messages per severity, 64 MiB workspace, F8.2 default
+    /// format, and enhanced compatibility everywhere.
+    fn default() -> Self {
+        Self {
+            input_integer_format: Endian::NATIVE,
+            input_float_format: Endian::NATIVE,
+            output_integer_format: Endian::NATIVE,
+            output_float_format: Endian::NATIVE,
+            matrix_display: MatrixDisplay::default(),
+            view_length: 24,
+            view_width: 79,
+            safer: false,
+            include: true,
+            route_errors_to_terminal: true,
+            route_errors_to_listing: true,
+            scompress: true,
+            undefined: true,
+            blanks: None,
+            max_messages: EnumMap::from_fn(|_| 100),
+            printback: true,
+            macros: MacroSettings::default(),
+            max_loops: 40,
+            workspace: 64 * 1024 * 1024,
+            default_format: Format::F8_2,
+            testing: false,
+            fuzz_bits: 6,
+            scale_min: 24,
+            commands: Compatibility::Enhanced,
+            global: Compatibility::Enhanced,
+            syntax: Compatibility::Enhanced,
+            formats: FormatSettings::default(),
+            small: 0.0001,
+        }
+    }
+}
+
+impl Settings {
+    /// Returns the process-wide shared [`Settings`], initialized to
+    /// [`Settings::default`] on first use.
+    pub fn global() -> &'static Settings {
+        static GLOBAL: OnceLock<Settings> = OnceLock::new();
+        // `get_or_init` already returns `&'static Settings`; the previous
+        // extra `&` relied on deref coercion, and the closure was redundant.
+        GLOBAL.get_or_init(Settings::default)
+    }
+}
+
+/// Degree of compatibility to maintain (presumably with the original SPSS
+/// behavior -- TODO confirm where this is consulted).
+pub enum Compatibility {
+    /// Favor compatible behavior.
+    Compatible,
+    /// Allow enhanced behavior (the default used in [`Settings::default`]).
+    Enhanced,
+}
+
+/// Settings that control macro expansion.
+pub struct MacroSettings {
+    /// Expand macros?
+    pub expand: bool,
+
+    /// Print macro expansions?
+    pub print_expansions: bool,
+
+    /// Maximum iterations of `!FOR`.
+    pub max_iterations: usize,
+
+    /// Maximum nested macro expansion levels.
+    pub max_nest: usize,
+}
+
+impl Default for MacroSettings {
+    /// Default macro behavior: expand silently, up to 1000 `!FOR`
+    /// iterations and 50 nesting levels.
+    fn default() -> Self {
+        Self {
+            expand: true,
+            print_expansions: false,
+            max_iterations: 1000,
+            max_nest: 50,
+        }
+    }
+}
+
+/// How to display matrices in `MATRIX`...`END MATRIX`.
+#[derive(Default)]
+pub enum MatrixDisplay {
+    /// Output matrices as text (the default).
+    #[default]
+    Text,
+
+    /// Output matrices as pivot tables.
+    Tables,
+}
+
+/// Broad categories of output (presumably used for routing or filtering
+/// messages -- the consumer is not visible here).
+pub enum OutputType {
+    /// Errors and warnings.
+    Error,
+
+    /// Notes.
+    Notes,
+
+    /// Syntax printback.
+    Syntax,
+
+    /// Everything else.
+    Other,
+}
--- /dev/null
+use std::fs::read_to_string;
+use std::path::PathBuf;
+
+use anyhow::{anyhow, Result};
+use clap::Parser;
+use pspp::endian::Endian;
+use pspp::sack::sack;
+
+/// SAv Construction Kit
+///
+/// The input is a sequence of data items, each followed by a semicolon.  Each
+/// data item is converted to the output format and written on stdout.  A data
+/// item is one of the following:
+///
+/// - An integer in decimal, in hexadecimal prefixed by `0x`, or in octal
+///   prefixed by `0`.  Output as a 32-bit binary integer.
+// NOTE(review): the lexer visible in this patch parses only decimal and
+// `0x`-prefixed hex; a `0`-prefixed literal is parsed as decimal.  Either
+// implement octal or drop the claim above -- TODO confirm.
+///
+/// - A floating-point number.  Output in 64-bit IEEE 754 format.
+///
+/// - A string enclosed in double quotes.  Output literally.  There is no
+///   syntax for "escapes".  Strings may not contain new-lines.
+///
+/// - A literal of the form `s<number>` followed by a quoted string as above.
+///   Output as the string's contents followed by enough spaces to fill up
+///   `<number>` bytes.  For example, `s8 "foo"` is output as `foo` followed
+///   by 5 spaces.
+///
+/// - The literal `i8`, `i16`, or `i64` followed by an integer.  Output
+///   as a binary integer with the specified number of bits.
+///
+/// - One of the literals `SYSMIS`, `LOWEST`, or `HIGHEST`.  Output as a
+///   64-bit IEEE 754 float of the appropriate PSPP value.
+///
+/// - `PCSYSMIS`.  Output as SPSS/PC+ system-missing value.
+///
+/// - The literal `ENDIAN`.  Output as a 32-bit binary integer, either with
+///   value 1 if `--be` is in effect or 2 if `--le` is in effect.
+///
+/// - A pair of parentheses enclosing a sequence of data items, each followed
+///   by a semicolon (the last semicolon is optional).  Output as the enclosed
+///   data items in sequence.
+///
+/// - The literal `COUNT` or `COUNT8` followed by a sequence of parenthesized
+///   data items, as above.  Output as a 32-bit or 8-bit binary integer whose
+///   value is the number of bytes enclosed within the parentheses, followed
+///   by the enclosed data items themselves.
+///
+/// Any data item may be optionally followed by an asterisk and a positive
+/// integer, which specifies a repeat count for the data item.
+#[derive(Parser, Debug)]
+struct Args {
+    /// Big-endian output format (default)
+    #[arg(long = "be")]
+    be: bool,
+
+    /// Little-endian output format
+    #[arg(long = "le")]
+    le: bool,
+
+    /// Input file.
+    #[arg(required = true, name = "input")]
+    input_file_name: PathBuf,
+
+    /// Output file.
+    #[arg(required = true, name = "output")]
+    output_file_name: PathBuf,
+}
+
+/// Entry point: parses the command line, reads the sack source file,
+/// assembles it with [`sack`], and writes the binary result to the output
+/// file.
+fn main() -> Result<()> {
+    let Args {
+        be,
+        le,
+        input_file_name,
+        output_file_name,
+    } = Args::parse();
+    // Big-endian is the default when neither flag is given; giving both is
+    // an error.
+    let endian = match (be, le) {
+        (false, false) | (true, false) => Endian::Big,
+        (false, true) => Endian::Little,
+        (true, true) => return Err(anyhow!("can't use both `--be` and `--le`")),
+    };
+
+    let input_file_str = input_file_name.to_string_lossy();
+    let input = read_to_string(&input_file_name)
+        .map_err(|err| anyhow!("{input_file_str}: read failed ({err})"))?;
+
+    let output = sack(&input, Some(&input_file_str), endian)?;
+
+    let output_file_str = output_file_name.to_string_lossy();
+    std::fs::write(&output_file_name, output)
+        .map_err(|err| anyhow!("{output_file_str}: write failed ({err})"))?;
+
+    Ok(())
+}
+++ /dev/null
-use std::{fmt::Write, sync::OnceLock};
-
-use flagset::{flags, FlagSet};
-
-use crate::{
- integer::ToInteger,
- lex::{
- command_name::CommandMatcher,
- lexer::Lexer,
- token::{Punct, Token},
- },
- message::Diagnostic,
-};
-
-flags! {
- enum State: u8 {
- /// No active dataset yet defined.
- Initial,
-
- /// Active dataset has been defined.
- Data,
-
- /// Inside `INPUT PROGRAM`.
- InputProgram,
-
- /// Inside `FILE TYPE`.
- FileType,
-
- /// State nested inside `LOOP` or `DO IF`, inside [State::Data].
- NestedData,
-
- /// State nested inside `LOOP` or `DO IF`, inside [State::InputProgram].
- NestedInputProgram,
- }
-}
-
-struct Command {
- allowed_states: FlagSet<State>,
- enhanced_only: bool,
- testing_only: bool,
- no_abbrev: bool,
- name: &'static str,
- run: Box<dyn Fn(&Context) + Send + Sync>,
-}
-
-fn commands() -> &'static [Command] {
- fn new_commands() -> Vec<Command> {
- vec![Command {
- allowed_states: State::Initial | State::Data,
- enhanced_only: false,
- testing_only: false,
- no_abbrev: false,
- name: "ECHO",
- run: Box::new(|_context| {
- println!("hi");
- }),
- }]
- }
-
- static COMMANDS: OnceLock<Vec<Command>> = OnceLock::new();
- COMMANDS.get_or_init(|| new_commands()).as_slice()
-}
-
-fn parse_command_word(lexer: &mut Lexer, s: &mut String, n: isize) -> bool {
- let separator = match s.chars().next_back() {
- Some(c) if c != '-' => " ",
- _ => "",
- };
-
- match lexer.next(n) {
- Token::Punct(Punct::Dash) => {
- s.push('-');
- true
- }
- Token::Id(id) => {
- write!(s, "{separator}{id}").unwrap();
- true
- }
- Token::Number(number) if number.is_sign_positive() => {
- if let Some(integer) = number.to_exact_usize() {
- write!(s, "{separator}{integer}").unwrap();
- true
- } else {
- false
- }
- }
- _ => false,
- }
-}
-
-fn find_best_match(s: &str) -> (Option<&'static Command>, isize) {
- let mut cm = CommandMatcher::new(s);
- for command in commands() {
- cm.add(command.name, command);
- }
- cm.get_match()
-}
-
-fn parse_command_name(
- lexer: &mut Lexer,
- error: &Box<dyn Fn(Diagnostic)>,
-) -> Result<(&'static Command, isize), ()> {
- let mut s = String::new();
- let mut word = 0;
- let mut missing_words = 0;
- let mut command = None;
- while parse_command_word(lexer, &mut s, word) {
- (command, missing_words) = find_best_match(&s);
- if missing_words <= 0 {
- break;
- }
- word += 1;
- }
- if command.is_none() && missing_words > 0 {
- s.push_str(" .");
- (command, missing_words) = find_best_match(&s);
- s.truncate(s.len() - 2);
- }
-
- match command {
- Some(command) => Ok((command, (word + 1) + missing_words)),
- None => {
- if s.is_empty() {
- error(lexer.error("Syntax error expecting command name"))
- } else {
- error(lexer.error("Unknown command `{s}`."))
- };
- Err(())
- }
- }
-}
-
-pub enum Success {
- Success,
- Eof,
- Finish,
-}
-
-pub fn end_of_command(context: &Context) -> Result<Success, ()> {
- match context.lexer.token() {
- Token::EndCommand | Token::End => Ok(Success::Success),
- _ => {
- context.error(
- context
- .lexer
- .error("Syntax error expecting end of command."),
- );
- Err(())
- }
- }
-}
-
-fn parse_in_state(lexer: &mut Lexer, error: &Box<dyn Fn(Diagnostic)>, _state: State) {
- match lexer.token() {
- Token::End | Token::EndCommand => (),
- _ => {
- if let Ok((command, n_tokens)) = parse_command_name(lexer, error) {
- for _ in 0..n_tokens {
- lexer.get();
- }
- let context = Context {
- error,
- lexer,
- command_name: Some(command.name),
- };
- (command.run)(&context);
- end_of_command(&context);
- }
- lexer.interactive_reset();
- lexer.discard_rest_of_command();
- }
- }
- while let Token::EndCommand = lexer.token() {
- lexer.get();
- }
-}
-
-pub fn parse(lexer: &mut Lexer, error: &Box<dyn Fn(Diagnostic)>) {
- parse_in_state(lexer, error, State::Initial)
-}
-
-pub struct Context<'a> {
- error: &'a Box<dyn Fn(Diagnostic)>,
- lexer: &'a mut Lexer,
- command_name: Option<&'static str>,
-}
-
-impl<'a> Context<'a> {
- pub fn error(&self, diagnostic: Diagnostic) {
- (self.error)(diagnostic);
- }
-}
+++ /dev/null
-use std::{cell::RefCell, collections::HashMap, ops::Range, rc::Rc};
-
-use crate::{
- dictionary::{Dictionary, VarWidth, Variable},
- encoding::Error as EncodingError,
- endian::Endian,
- format::{Error as FormatError, Format, UncheckedFormat},
- identifier::{Error as IdError, Identifier},
- raw::{
- self, Cases, DecodedRecord, DocumentRecord, EncodingRecord, Extension, FileAttributeRecord,
- FloatInfoRecord, HeaderRecord, IntegerInfoRecord, LongNamesRecord,
- LongStringMissingValueRecord, LongStringValueLabelRecord, MultipleResponseRecord,
- NumberOfCasesRecord, ProductInfoRecord, RawStr, ValueLabel, ValueLabelRecord,
- VarDisplayRecord, VariableAttributeRecord, VariableRecord, VariableSetRecord,
- VeryLongStringsRecord, ZHeader, ZTrailer,
- },
-};
-use chrono::{NaiveDate, NaiveDateTime, NaiveTime};
-use encoding_rs::Encoding;
-use num::Integer;
-use thiserror::Error as ThisError;
-
-pub use crate::raw::{CategoryLabels, Compression};
-
-#[derive(ThisError, Debug)]
-pub enum Error {
- #[error("Missing header record")]
- MissingHeaderRecord,
-
- // XXX this is an internal error
- #[error("More than one file header record")]
- DuplicateHeaderRecord,
-
- #[error("{0}")]
- EncodingError(EncodingError),
-
- #[error("Using default encoding {0}.")]
- UsingDefaultEncoding(String),
-
- #[error("Variable record from offset {:x} to {:x} specifies width {width} not in valid range [-1,255).", offsets.start, offsets.end)]
- InvalidVariableWidth { offsets: Range<u64>, width: i32 },
-
- #[error("This file has corrupted metadata written by a buggy version of PSPP. To ensure that other software can read it correctly, save a new copy of the file.")]
- InvalidLongMissingValueFormat,
-
- #[error("File creation date {creation_date} is not in the expected format \"DD MMM YY\" format. Using 01 Jan 1970.")]
- InvalidCreationDate { creation_date: String },
-
- #[error("File creation time {creation_time} is not in the expected format \"HH:MM:SS\" format. Using midnight.")]
- InvalidCreationTime { creation_time: String },
-
- #[error("{id_error} Renaming variable to {new_name}.")]
- InvalidVariableName {
- id_error: IdError,
- new_name: Identifier,
- },
-
- #[error(
- "Substituting {new_spec} for invalid print format on variable {variable}. {format_error}"
- )]
- InvalidPrintFormat {
- new_spec: Format,
- variable: Identifier,
- format_error: FormatError,
- },
-
- #[error(
- "Substituting {new_spec} for invalid write format on variable {variable}. {format_error}"
- )]
- InvalidWriteFormat {
- new_spec: Format,
- variable: Identifier,
- format_error: FormatError,
- },
-
- #[error("Renaming variable with duplicate name {duplicate_name} to {new_name}.")]
- DuplicateVariableName {
- duplicate_name: Identifier,
- new_name: Identifier,
- },
-
- #[error("Dictionary index {dict_index} is outside valid range [1,{max_index}].")]
- InvalidDictIndex { dict_index: usize, max_index: usize },
-
- #[error("Dictionary index {0} refers to a long string continuation.")]
- DictIndexIsContinuation(usize),
-
- #[error("At offset {offset:#x}, one or more variable indexes for value labels referred to long string continuation records: {indexes:?}")]
- LongStringContinuationIndexes { offset: u64, indexes: Vec<u32> },
-
- #[error(
- "At offsets {:#x}...{:#x}, record types 3 and 4 may not add value labels to one or more long string variables: {variables:?}", .offsets.start, .offsets.end
- )]
- InvalidLongStringValueLabels {
- offsets: Range<u64>,
- variables: Vec<Identifier>,
- },
-
- #[error("Variables associated with value label are not all of identical type. Variable {numeric_var} is numeric, but variable {string_var} is string.")]
- ValueLabelsDifferentTypes {
- numeric_var: Identifier,
- string_var: Identifier,
- },
-
- #[error("Invalid multiple response set name. {0}")]
- InvalidMrSetName(IdError),
-
- #[error("Multiple response set {mr_set} includes unknown variable {short_name}.")]
- UnknownMrSetVariable {
- mr_set: Identifier,
- short_name: Identifier,
- },
-
- #[error("Multiple response set {0} has no variables.")]
- EmptyMrSet(Identifier),
-
- #[error("Multiple response set {0} has only one variable.")]
- OneVarMrSet(Identifier),
-
- #[error("Multiple response set {0} contains both string and numeric variables.")]
- MixedMrSet(Identifier),
-
- #[error(
- "Invalid numeric format for counted value {number} in multiple response set {mr_set}."
- )]
- InvalidMDGroupCountedValue { mr_set: Identifier, number: String },
-
- #[error("Counted value {value} has width {width}, but it must be no wider than {max_width}, the width of the narrowest variable in multiple response set {mr_set}.")]
- TooWideMDGroupCountedValue {
- mr_set: Identifier,
- value: String,
- width: usize,
- max_width: u16,
- },
-
- #[error("Long string value label for variable {name} has width {width}, which is not in the valid range [{min_width},{max_width}].")]
- InvalidLongValueLabelWidth {
- name: Identifier,
- width: u32,
- min_width: u16,
- max_width: u16,
- },
-
- #[error("Invalid attribute name. {0}")]
- InvalidAttributeName(IdError),
-
- #[error("Invalid short name in long variable name record. {0}")]
- InvalidShortName(IdError),
-
- #[error("Invalid name in long variable name record. {0}")]
- InvalidLongName(IdError),
-
- #[error("Invalid variable name in very long string record. {0}")]
- InvalidLongStringName(IdError),
-
- #[error("Invalid variable name in long string value label record. {0}")]
- InvalidLongStringValueLabelName(IdError),
-
- #[error("Invalid variable name in attribute record. {0}")]
- InvalidAttributeVariableName(IdError),
-
- // XXX This is risky because `text` might be arbitarily long.
- #[error("Text string contains invalid bytes for {encoding} encoding: {text}")]
- MalformedString { encoding: String, text: String },
-
- #[error("Details TBD")]
- TBD,
-}
-
-type DictIndex = usize;
-
-#[derive(Clone, Debug)]
-pub struct Headers {
- pub header: HeaderRecord<String>,
- pub variable: Vec<VariableRecord<String, String>>,
- pub value_label: Vec<ValueLabelRecord<RawStr<8>, String>>,
- pub document: Vec<DocumentRecord<String>>,
- pub integer_info: Option<IntegerInfoRecord>,
- pub float_info: Option<FloatInfoRecord>,
- pub var_display: Option<VarDisplayRecord>,
- pub multiple_response: Vec<MultipleResponseRecord<Identifier, String>>,
- pub long_string_value_labels: Vec<LongStringValueLabelRecord<Identifier, String>>,
- pub long_string_missing_values: Vec<LongStringMissingValueRecord<Identifier, String>>,
- pub encoding: Option<EncodingRecord>,
- pub number_of_cases: Option<NumberOfCasesRecord>,
- pub variable_sets: Vec<VariableSetRecord>,
- pub product_info: Option<ProductInfoRecord>,
- pub long_names: Vec<LongNamesRecord>,
- pub very_long_strings: Vec<VeryLongStringsRecord>,
- pub file_attributes: Vec<FileAttributeRecord>,
- pub variable_attributes: Vec<VariableAttributeRecord>,
- pub other_extension: Vec<Extension>,
- pub end_of_headers: Option<u32>,
- pub z_header: Option<ZHeader>,
- pub z_trailer: Option<ZTrailer>,
- pub cases: Option<Rc<RefCell<Cases>>>,
-}
-
-fn take_first<T, F>(mut vec: Vec<T>, more_than_one: F) -> Option<T>
-where
- F: FnOnce(),
-{
- if vec.len() > 1 {
- more_than_one();
- }
- vec.drain(..).next()
-}
-
-impl Headers {
- pub fn new(headers: Vec<raw::DecodedRecord>, warn: &impl Fn(Error)) -> Result<Headers, Error> {
- let mut file_header = Vec::new();
- let mut variable = Vec::new();
- let mut value_label = Vec::new();
- let mut document = Vec::new();
- let mut integer_info = Vec::new();
- let mut float_info = Vec::new();
- let mut var_display = Vec::new();
- let mut multiple_response = Vec::new();
- let mut long_string_value_labels = Vec::new();
- let mut long_string_missing_values = Vec::new();
- let mut encoding = Vec::new();
- let mut number_of_cases = Vec::new();
- let mut variable_sets = Vec::new();
- let mut product_info = Vec::new();
- let mut long_names = Vec::new();
- let mut very_long_strings = Vec::new();
- let mut file_attributes = Vec::new();
- let mut variable_attributes = Vec::new();
- let mut other_extension = Vec::new();
- let mut end_of_headers = Vec::new();
- let mut z_header = Vec::new();
- let mut z_trailer = Vec::new();
- let mut cases = Vec::new();
-
- for header in headers {
- match header {
- DecodedRecord::Header(record) => {
- file_header.push(record);
- }
- DecodedRecord::Variable(record) => {
- variable.push(record);
- }
- DecodedRecord::ValueLabel(record) => {
- value_label.push(record);
- }
- DecodedRecord::Document(record) => {
- document.push(record);
- }
- DecodedRecord::IntegerInfo(record) => {
- integer_info.push(record);
- }
- DecodedRecord::FloatInfo(record) => {
- float_info.push(record);
- }
- DecodedRecord::VariableSets(record) => {
- variable_sets.push(record);
- }
- DecodedRecord::VarDisplay(record) => {
- var_display.push(record);
- }
- DecodedRecord::MultipleResponse(record) => {
- multiple_response.push(record);
- }
- DecodedRecord::LongStringValueLabels(record) => {
- long_string_value_labels.push(record)
- }
- DecodedRecord::LongStringMissingValues(record) => {
- long_string_missing_values.push(record);
- }
- DecodedRecord::Encoding(record) => {
- encoding.push(record);
- }
- DecodedRecord::NumberOfCases(record) => {
- number_of_cases.push(record);
- }
- DecodedRecord::ProductInfo(record) => {
- product_info.push(record);
- }
- DecodedRecord::LongNames(record) => {
- long_names.push(record);
- }
- DecodedRecord::VeryLongStrings(record) => {
- very_long_strings.push(record);
- }
- DecodedRecord::FileAttributes(record) => {
- file_attributes.push(record);
- }
- DecodedRecord::VariableAttributes(record) => {
- variable_attributes.push(record);
- }
- DecodedRecord::OtherExtension(record) => {
- other_extension.push(record);
- }
- DecodedRecord::EndOfHeaders(record) => {
- end_of_headers.push(record);
- }
- DecodedRecord::ZHeader(record) => {
- z_header.push(record);
- }
- DecodedRecord::ZTrailer(record) => {
- z_trailer.push(record);
- }
- DecodedRecord::Cases(record) => {
- cases.push(record);
- }
- }
- }
-
- let Some(file_header) = take_first(file_header, || warn(Error::DuplicateHeaderRecord))
- else {
- return Err(Error::MissingHeaderRecord);
- };
-
- Ok(Headers {
- header: file_header,
- variable,
- value_label,
- document,
- integer_info: take_first(integer_info, || warn(Error::TBD)),
- float_info: take_first(float_info, || warn(Error::TBD)),
- var_display: take_first(var_display, || warn(Error::TBD)),
- multiple_response,
- long_string_value_labels,
- long_string_missing_values,
- encoding: take_first(encoding, || warn(Error::TBD)),
- number_of_cases: take_first(number_of_cases, || warn(Error::TBD)),
- variable_sets,
- product_info: take_first(product_info, || warn(Error::TBD)),
- long_names,
- very_long_strings,
- file_attributes,
- variable_attributes,
- other_extension,
- end_of_headers: take_first(end_of_headers, || warn(Error::TBD)),
- z_header: take_first(z_header, || warn(Error::TBD)),
- z_trailer: take_first(z_trailer, || warn(Error::TBD)),
- cases: take_first(cases, || warn(Error::TBD)),
- })
- }
-}
-
-pub struct Metadata {
- creation: NaiveDateTime,
- endian: Endian,
- compression: Option<Compression>,
- n_cases: Option<u64>,
- product: String,
- product_ext: Option<String>,
- version: Option<(i32, i32, i32)>,
-}
-
-impl Metadata {
- fn decode(headers: &Headers, warn: impl Fn(Error)) -> Self {
- let header = &headers.header;
- let creation_date = NaiveDate::parse_from_str(&header.creation_date, "%e %b %Y")
- .unwrap_or_else(|_| {
- warn(Error::InvalidCreationDate {
- creation_date: header.creation_date.to_string(),
- });
- Default::default()
- });
- let creation_time = NaiveTime::parse_from_str(&header.creation_time, "%H:%M:%S")
- .unwrap_or_else(|_| {
- warn(Error::InvalidCreationTime {
- creation_time: header.creation_time.to_string(),
- });
- Default::default()
- });
- let creation = NaiveDateTime::new(creation_date, creation_time);
-
- let product = header
- .eye_catcher
- .trim_start_matches("@(#) SPSS DATA FILE")
- .trim_end()
- .to_string();
-
- Self {
- creation,
- endian: header.endian,
- compression: header.compression,
- n_cases: header.n_cases.map(|n| n as u64),
- product,
- product_ext: headers.product_info.as_ref().map(|pe| fix_line_ends(&pe.0)),
- version: headers.integer_info.as_ref().map(|ii| ii.version),
- }
- }
-}
-
-struct Decoder {
- //pub raw: raw::Decoder,
- pub encoding: &'static Encoding,
- //pub variables: HashMap<DictIndex, Variable>,
- //pub var_names: HashMap<Identifier, DictIndex>,
- //pub dictionary: Dictionary,
- //n_dict_indexes: usize,
- n_generated_names: usize,
-}
-
-impl Decoder {
- fn generate_name(&mut self, dictionary: &Dictionary) -> Identifier {
- loop {
- self.n_generated_names += 1;
- let name = Identifier::from_encoding(&format!("VAR{:03}", self.n_generated_names), self.encoding)
- .unwrap();
- if !dictionary.variables.contains(&name.0) {
- return name;
- }
- assert!(self.n_generated_names < usize::MAX);
- }
- }
-}
-
-pub fn decode(
- mut headers: Headers,
- encoding: &'static Encoding,
- warn: impl Fn(Error),
-) -> Result<(Dictionary, Metadata), Error> {
- let mut dictionary = Dictionary::new(encoding);
-
- let file_label = fix_line_ends(headers.header.file_label.trim_end_matches(' '));
- if !file_label.is_empty() {
- dictionary.file_label = Some(file_label);
- }
-
- for attributes in headers.file_attributes.drain(..) {
- dictionary.attributes.extend(attributes.0 .0.into_iter())
- }
-
- // Concatenate all the document records (really there should only be one)
- // and trim off the trailing spaces that pad them to 80 bytes.
- dictionary.documents = headers
- .document
- .drain(..)
- .flat_map(|record| record.lines)
- .map(trim_end_spaces)
- .collect();
-
- // XXX warn for weird integer format
- // XXX warn for weird floating-point format, etc.
-
- let mut decoder = Decoder {
- encoding,
- n_generated_names: 0,
- };
-
- let mut header_vars = headers.variable.iter().enumerate();
- let mut var_index_map = HashMap::new();
- while let Some((value_index, input)) = header_vars.next() {
- let name = trim_end_spaces(input.name.to_string());
- let name = match Identifier::from_encoding(&name, encoding) {
- Ok(name) => {
- if !dictionary.variables.contains(&name.0) {
- name
- } else {
- let new_name = decoder.generate_name(&dictionary);
- warn(Error::DuplicateVariableName {
- duplicate_name: name.clone(),
- new_name: new_name.clone(),
- });
- new_name
- }
- }
- Err(id_error) => {
- let new_name = decoder.generate_name(&dictionary);
- warn(Error::InvalidVariableName {
- id_error,
- new_name: new_name.clone(),
- });
- new_name
- }
- };
- let mut variable = Variable::new(name.clone(), VarWidth::from_raw(input.width).unwrap());
-
- // Set the short name the same as the long name (even if we renamed it).
- variable.short_names = vec![name];
-
- variable.label = input.label.clone();
-
- variable.missing_values = input.missing_values.clone();
-
- variable.print_format = decode_format(
- input.print_format,
- variable.width,
- |new_spec, format_error| {
- warn(Error::InvalidPrintFormat {
- new_spec,
- variable: variable.name.clone(),
- format_error,
- })
- },
- );
- variable.write_format = decode_format(
- input.write_format,
- variable.width,
- |new_spec, format_error| {
- warn(Error::InvalidWriteFormat {
- new_spec,
- variable: variable.name.clone(),
- format_error,
- })
- },
- );
-
- // Skip long string continuation records.
- if input.width > 0 {
- #[allow(unstable_name_collisions)]
- for _ in 1..input.width.div_ceil(&8) {
- if let Some((_, continuation)) = header_vars.next() {
- if continuation.width == -1 {
- continue;
- }
- }
- return Err(Error::TBD);
- }
- }
-
- let dict_index = dictionary.add_var(variable).unwrap();
- assert_eq!(var_index_map.insert(value_index, dict_index), None);
- }
-
- for record in headers.value_label.drain(..) {
- let mut dict_indexes = Vec::with_capacity(record.dict_indexes.len());
- let mut continuation_indexes = Vec::new();
- let mut long_string_variables = Vec::new();
- for value_index in record.dict_indexes.iter() {
- if let Some(dict_index) = var_index_map.get(&(*value_index as usize - 1)) {
- let variable = &dictionary.variables[*dict_index];
- if variable.width.is_long_string() {
- long_string_variables.push(variable.name.clone());
- } else {
- dict_indexes.push(*dict_index);
- }
- } else {
- continuation_indexes.push(*value_index);
- }
- }
- if !continuation_indexes.is_empty() {
- warn(Error::LongStringContinuationIndexes {
- offset: record.offsets.start,
- indexes: continuation_indexes,
- });
- }
- if !long_string_variables.is_empty() {
- warn(Error::InvalidLongStringValueLabels {
- offsets: record.offsets.clone(),
- variables: long_string_variables,
- });
- }
-
- for dict_index in dict_indexes {
- let mut variable = &dictionary.variables[dict_index];
- for ValueLabel { value, label } in record.labels.iter().cloned() {
-
- }
- }
- }
-
- let metadata = Metadata::decode(&headers, warn);
- Ok((dictionary, metadata))
-}
-
-fn trim_end_spaces(mut s: String) -> String {
- s.truncate(s.trim_end_matches(' ').len());
- s
-}
-
-/// Returns a copy of `s` in which all lone CR and CR LF pairs have been
-/// replaced by LF.
-///
-/// (A product that identifies itself as VOXCO INTERVIEWER 4.3 produces system
-/// files that use CR-only line ends in the file label and extra product info.)
-fn fix_line_ends(s: &str) -> String {
- let mut out = String::with_capacity(s.len());
- let mut s = s.chars().peekable();
- while let Some(c) = s.next() {
- match c {
- '\r' => {
- s.next_if_eq(&'\n');
- out.push('\n')
- }
- c => out.push(c),
- }
- }
- out
-}
-
-fn decode_format(raw: raw::Spec, width: VarWidth, warn: impl Fn(Format, FormatError)) -> Format {
- UncheckedFormat::try_from(raw)
- .and_then(Format::try_from)
- .and_then(|x| x.check_width_compatibility(width))
- .unwrap_or_else(|error| {
- let new_format = Format::default_for_width(width);
- warn(new_format, error);
- new_format
- })
-}
-
-/*
-impl Decoder {
- fn generate_name(&mut self) -> Identifier {
- loop {
- self.n_generated_names += 1;
- let name = Identifier::new(&format!("VAR{:03}", self.n_generated_names), self.encoding)
- .unwrap();
- if !self.var_names.contains_key(&name) {
- return name;
- }
- assert!(self.n_generated_names < usize::MAX);
- }
- }
- fn decode_string_cow<'a>(&self, input: &'a [u8], warn: &impl Fn(Error)) -> Cow<'a, str> {
- let (output, malformed) = self.encoding.decode_without_bom_handling(input);
- if malformed {
- warn(Error::MalformedString {
- encoding: self.encoding.name().into(),
- text: output.clone().into(),
- });
- }
- output
- }
- fn decode_string(&self, input: &[u8], warn: &impl Fn(Error)) -> String {
- self.decode_string_cow(input, warn).into()
- }
- pub fn decode_identifier(
- &self,
- input: &[u8],
- warn: &impl Fn(Error),
- ) -> Result<Identifier, IdError> {
- let s = self.decode_string_cow(input, warn);
- Identifier::new(&s, self.encoding)
- }
- fn get_var_by_index(&self, dict_index: usize) -> Result<&Variable, Error> {
- let max_index = self.n_dict_indexes;
- if dict_index == 0 || dict_index > max_index {
- return Err(Error::InvalidDictIndex {
- dict_index,
- max_index,
- });
- }
- let Some(variable) = self.variables.get(&(dict_index - 1)) else {
- return Err(Error::DictIndexIsContinuation(dict_index));
- };
- Ok(variable)
- }
-
- /// Returns `input` decoded from `self.encoding` into UTF-8 such that
- /// re-encoding the result back into `self.encoding` will have exactly the
- /// same length in bytes.
- ///
- /// XXX warn about errors?
- fn decode_exact_length<'a>(&self, input: &'a [u8]) -> Cow<'a, str> {
- if let (s, false) = self.encoding.decode_without_bom_handling(input) {
- // This is the common case. Usually there will be no errors.
- s
- } else {
- // Unusual case. Don't bother to optimize it much.
- let mut decoder = self.encoding.new_decoder_without_bom_handling();
- let mut output = String::with_capacity(
- decoder
- .max_utf8_buffer_length_without_replacement(input.len())
- .unwrap(),
- );
- let mut rest = input;
- while !rest.is_empty() {
- match decoder.decode_to_string_without_replacement(rest, &mut output, true) {
- (DecoderResult::InputEmpty, _) => break,
- (DecoderResult::OutputFull, _) => unreachable!(),
- (DecoderResult::Malformed(a, b), consumed) => {
- let skipped = a as usize + b as usize;
- output.extend(repeat('?').take(skipped));
- rest = &rest[consumed..];
- }
- }
- }
- assert_eq!(self.encoding.encode(&output).0.len(), input.len());
- output.into()
- }
- }
-}
-
-pub trait TryDecode: Sized {
- type Input<'a>;
- fn try_decode(
- decoder: &mut Decoder,
- input: &Self::Input<'_>,
- warn: impl Fn(Error),
- ) -> Result<Option<Self>, Error>;
-}
-
-pub trait Decode<Input>: Sized {
- fn decode(decoder: &Decoder, input: &Input, warn: impl Fn(Error)) -> Self;
-}
-
-impl<const N: usize> Decode<RawStr<N>> for String {
- fn decode(decoder: &Decoder, input: &RawStr<N>, warn: impl Fn(Error)) -> Self {
- decoder.decode_string(&input.0, &warn)
- }
-}
-*/
-/*
-#[derive(Clone, Debug)]
-pub struct HeaderRecord {
- pub eye_catcher: String,
- pub weight_index: Option<usize>,
- pub n_cases: Option<u64>,
- pub creation: NaiveDateTime,
- pub file_label: String,
-}
-
-fn trim_end_spaces(mut s: String) -> String {
- s.truncate(s.trim_end_matches(' ').len());
- s
-}
-
-/// Data file info that doesn't fit in [Dictionary].
-pub struct Metadata {
- creation: NaiveDateTime,
- endian: Endian,
- compression: Option<Compression>,
- n_cases: Option<u64>,
- product: String,
- product_ext: Option<String>,
- version: Option<(i32, i32, i32)>,
-}
-
-impl Metadata {
- fn decode(
- header: &crate::raw::HeaderRecord<Cow<str>>,
- integer_info: Option<&IntegerInfoRecord>,
- product_ext: Option<&ProductInfoRecord>,
- warn: impl Fn(Error),
- ) -> Self {
- let creation_date = NaiveDate::parse_from_str(&header.creation_date, "%e %b %Y")
- .unwrap_or_else(|_| {
- warn(Error::InvalidCreationDate {
- creation_date: header.creation_date.to_string(),
- });
- Default::default()
- });
- let creation_time = NaiveTime::parse_from_str(&header.creation_time, "%H:%M:%S")
- .unwrap_or_else(|_| {
- warn(Error::InvalidCreationTime {
- creation_time: header.creation_time.to_string(),
- });
- Default::default()
- });
- let creation = NaiveDateTime::new(creation_date, creation_time);
-
- let product = header
- .eye_catcher
- .trim_start_matches("@(#) SPSS DATA FILE")
- .trim_end()
- .to_string();
-
- Self {
- creation,
- endian: header.endian,
- compression: header.compression,
- n_cases: header.n_cases.map(|n| n as u64),
- product,
- product_ext: product_ext.map(|pe| pe.0.clone()),
- version: integer_info.map(|ii| ii.version),
- }
- }
-}
-
-impl TryDecode for HeaderRecord {
- type Input<'a> = crate::raw::HeaderRecord<Cow<'a, str>>;
-
- fn try_decode(
- _decoder: &mut Decoder,
- input: &Self::Input<'_>,
- warn: impl Fn(Error),
- ) -> Result<Option<Self>, Error> {
- let eye_catcher = trim_end_spaces(input.eye_catcher.to_string());
- let file_label = trim_end_spaces(input.file_label.to_string());
- let creation_date = NaiveDate::parse_from_str(&input.creation_date, "%e %b %Y")
- .unwrap_or_else(|_| {
- warn(Error::InvalidCreationDate {
- creation_date: input.creation_date.to_string(),
- });
- Default::default()
- });
- let creation_time = NaiveTime::parse_from_str(&input.creation_time, "%H:%M:%S")
- .unwrap_or_else(|_| {
- warn(Error::InvalidCreationTime {
- creation_time: input.creation_time.to_string(),
- });
- Default::default()
- });
- Ok(Some(HeaderRecord {
- eye_catcher,
- weight_index: input.weight_index.map(|n| n as usize),
- n_cases: input.n_cases.map(|n| n as u64),
- creation: NaiveDateTime::new(creation_date, creation_time),
- file_label,
- }))
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct VariableRecord {
- pub width: VarWidth,
- pub name: Identifier,
- pub print_format: Spec,
- pub write_format: Spec,
- pub missing_values: MissingValues<String>,
- pub label: Option<String>,
-}
-
-
-fn parse_variable_record(
- decoder: &mut Decoder,
- input: &raw::VariableRecord<Cow<str>, String>,
- warn: impl Fn(Error),
-) -> Result<(), Error> {
- let width = match input.width {
- 0 => VarWidth::Numeric,
- w @ 1..=255 => VarWidth::String(w as u16),
- -1 => return Ok(()),
- _ => {
- return Err(Error::InvalidVariableWidth {
- offsets: input.offsets.clone(),
- width: input.width,
- })
- }
- };
- let name = trim_end_spaces(input.name.to_string());
- let name = match Identifier::new(&name, decoder.encoding) {
- Ok(name) => {
- if !decoder.var_names.contains_key(&name) {
- name
- } else {
- let new_name = decoder.generate_name();
- warn(Error::DuplicateVariableName {
- duplicate_name: name.clone(),
- new_name: new_name.clone(),
- });
- new_name
- }
- }
- Err(id_error) => {
- let new_name = decoder.generate_name();
- warn(Error::InvalidVariableName {
- id_error,
- new_name: new_name.clone(),
- });
- new_name
- }
- };
- let variable = Variable {
- dict_index: decoder.n_dict_indexes,
- short_name: name.clone(),
- long_name: None,
- width,
- };
- decoder.n_dict_indexes += width.n_dict_indexes();
- assert!(decoder
- .var_names
- .insert(name.clone(), variable.dict_index)
- .is_none());
- assert!(decoder
- .variables
- .insert(variable.dict_index, variable)
- .is_none());
-
- let print_format = decode_format(input.print_format, width, |new_spec, format_error| {
- warn(Error::InvalidPrintFormat {
- new_spec,
- variable: name.clone(),
- format_error,
- })
- });
- let write_format = decode_format(input.write_format, width, |new_spec, format_error| {
- warn(Error::InvalidWriteFormat {
- new_spec,
- variable: name.clone(),
- format_error,
- })
- });
- let mut variable = dictionary::Variable::new(name, width);
- variable.print_format = print_format;
- variable.write_format = write_format;
- variable.missing_values = input.missing_values.clone();
- if let Some(ref label) = input.label {
- variable.label = Some(label.to_string());
- }
- decoder.dictionary.add_var(variable).unwrap();
- Ok(())
-}
-
-#[derive(Clone, Debug)]
-pub struct DocumentRecord(Vec<String>);
-
-impl TryDecode for DocumentRecord {
- type Input<'a> = crate::raw::DocumentRecord<RawDocumentLine>;
-
- fn try_decode(
- decoder: &mut Decoder,
- input: &Self::Input<'_>,
- warn: impl Fn(Error),
- ) -> Result<Option<Self>, Error> {
- Ok(Some(DocumentRecord(
- input
- .lines
- .iter()
- .map(|s| trim_end_spaces(decoder.decode_string(&s.0, &warn)))
- .collect(),
- )))
- }
-}
-
-trait TextRecord
-where
- Self: Sized,
-{
- const NAME: &'static str;
- fn parse(input: &str, warn: impl Fn(Error)) -> Result<Self, Error>;
-}
-
-#[derive(Clone, Debug)]
-pub struct VariableSet {
- pub name: String,
- pub vars: Vec<String>,
-}
-
-impl VariableSet {
- fn parse(input: &str) -> Result<Self, Error> {
- let (name, input) = input.split_once('=').ok_or(Error::TBD)?;
- let vars = input.split_ascii_whitespace().map(String::from).collect();
- Ok(VariableSet {
- name: name.into(),
- vars,
- })
- }
-}
-
-trait WarnOnError<T> {
- fn warn_on_error<F: Fn(Error)>(self, warn: &F) -> Option<T>;
-}
-impl<T> WarnOnError<T> for Result<T, Error> {
- fn warn_on_error<F: Fn(Error)>(self, warn: &F) -> Option<T> {
- match self {
- Ok(result) => Some(result),
- Err(error) => {
- warn(error);
- None
- }
- }
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct ValueLabel {
- pub value: Value,
- pub label: String,
-}
-
-#[derive(Clone, Debug)]
-pub struct ValueLabelRecord {
- pub var_type: VarType,
- pub labels: Vec<ValueLabel>,
- pub variables: Vec<Identifier>,
-}
-
-impl TryDecode for ValueLabelRecord {
- type Input<'a> = crate::raw::ValueLabelRecord<RawStr<8>, RawString>;
- fn try_decode(
- decoder: &mut Decoder,
- input: &Self::Input<'_>,
- warn: impl Fn(Error),
- ) -> Result<Option<ValueLabelRecord>, Error> {
- let variables: Vec<&Variable> = input
- .dict_indexes
- .iter()
- .filter_map(|&dict_index| {
- decoder
- .get_var_by_index(dict_index as usize)
- .warn_on_error(&warn)
- })
- .filter(|&variable| match variable.width {
- VarWidth::String(width) if width > 8 => {
- warn(Error::InvalidLongStringValueLabel(
- variable.short_name.clone(),
- ));
- false
- }
- _ => true,
- })
- .collect();
- let mut i = variables.iter();
- let Some(&first_var) = i.next() else {
- return Ok(None);
- };
- let var_type: VarType = first_var.width.into();
- for &variable in i {
- let this_type: VarType = variable.width.into();
- if var_type != this_type {
- let (numeric_var, string_var) = match var_type {
- VarType::Numeric => (first_var, variable),
- VarType::String => (variable, first_var),
- };
- warn(Error::ValueLabelsDifferentTypes {
- numeric_var: numeric_var.short_name.clone(),
- string_var: string_var.short_name.clone(),
- });
- return Ok(None);
- }
- }
- let labels = input
- .labels
- .iter()
- .map(|raw::ValueLabel { value, label }| {
- let label = decoder.decode_string(&label.0, &warn);
- let value = Value::decode(value, decoder);
- ValueLabel { value, label }
- })
- .collect();
- let variables = variables
- .iter()
- .map(|&variable| variable.short_name.clone())
- .collect();
- Ok(Some(ValueLabelRecord {
- var_type,
- labels,
- variables,
- }))
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct VariableSetRecord(Vec<VariableSet>);
-
-impl TextRecord for VariableSetRecord {
- const NAME: &'static str = "variable set";
- fn parse(input: &str, warn: impl Fn(Error)) -> Result<Self, Error> {
- let mut sets = Vec::new();
- for line in input.lines() {
- if let Some(set) = VariableSet::parse(line).warn_on_error(&warn) {
- sets.push(set)
- }
- }
- Ok(VariableSetRecord(sets))
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct LongName {
- pub short_name: Identifier,
- pub long_name: Identifier,
-}
-
-impl LongName {
- fn new(decoder: &mut Decoder, short_name: &str, long_name: &str) -> Result<LongName, Error> {
- let short_name =
- Identifier::new(short_name, decoder.encoding).map_err(Error::InvalidShortName)?;
- let long_name =
- Identifier::new(long_name, decoder.encoding).map_err(Error::InvalidLongName)?;
- Ok(LongName {
- short_name,
- long_name,
- })
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct LongNameRecord(Vec<LongName>);
-
-impl LongNameRecord {
- pub fn parse(decoder: &mut Decoder, input: &str, warn: impl Fn(Error)) -> Result<Self, Error> {
- let mut names = Vec::new();
- for pair in input.split('\t').filter(|s| !s.is_empty()) {
- if let Some((short_name, long_name)) = pair.split_once('=') {
- if let Some(long_name) =
- LongName::new(decoder, short_name, long_name).warn_on_error(&warn)
- {
- names.push(long_name);
- }
- } else {
- warn(Error::TBD)
- }
- }
- Ok(LongNameRecord(names))
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct VeryLongString {
- pub short_name: Identifier,
- pub length: u16,
-}
-
-impl VeryLongString {
- fn parse(decoder: &Decoder, input: &str) -> Result<VeryLongString, Error> {
- let Some((short_name, length)) = input.split_once('=') else {
- return Err(Error::TBD);
- };
- let short_name =
- Identifier::new(short_name, decoder.encoding).map_err(Error::InvalidLongStringName)?;
- let length: u16 = length.parse().map_err(|_| Error::TBD)?;
- if length > VarWidth::MAX_STRING {
- return Err(Error::TBD);
- }
- Ok(VeryLongString { short_name, length })
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct VeryLongStringRecord(Vec<VeryLongString>);
-
-impl VeryLongStringRecord {
- pub fn parse(decoder: &Decoder, input: &str, warn: impl Fn(Error)) -> Result<Self, Error> {
- let mut very_long_strings = Vec::new();
- for tuple in input
- .split('\0')
- .map(|s| s.trim_end_matches('\t'))
- .filter(|s| !s.is_empty())
- {
- if let Some(vls) = VeryLongString::parse(decoder, tuple).warn_on_error(&warn) {
- very_long_strings.push(vls)
- }
- }
- Ok(VeryLongStringRecord(very_long_strings))
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct Attribute {
- pub name: Identifier,
- pub values: Vec<String>,
-}
-
-impl Attribute {
- fn parse<'a>(
- decoder: &Decoder,
- input: &'a str,
- warn: &impl Fn(Error),
- ) -> Result<(Option<Attribute>, &'a str), Error> {
- let Some((name, mut input)) = input.split_once('(') else {
- return Err(Error::TBD);
- };
- let mut values = Vec::new();
- loop {
- let Some((value, rest)) = input.split_once('\n') else {
- return Err(Error::TBD);
- };
- if let Some(stripped) = value
- .strip_prefix('\'')
- .and_then(|value| value.strip_suffix('\''))
- {
- values.push(stripped.into());
- } else {
- warn(Error::TBD);
- values.push(value.into());
- }
- if let Some(rest) = rest.strip_prefix(')') {
- let attribute = Identifier::new(name, decoder.encoding)
- .map_err(Error::InvalidAttributeName)
- .warn_on_error(warn)
- .map(|name| Attribute { name, values });
- return Ok((attribute, rest));
- };
- input = rest;
- }
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct AttributeSet(pub Vec<Attribute>);
-
-impl AttributeSet {
- fn parse<'a>(
- decoder: &Decoder,
- mut input: &'a str,
- sentinel: Option<char>,
- warn: &impl Fn(Error),
- ) -> Result<(AttributeSet, &'a str), Error> {
- let mut attributes = Vec::new();
- let rest = loop {
- match input.chars().next() {
- None => break input,
- c if c == sentinel => break &input[1..],
- _ => {
- let (attribute, rest) = Attribute::parse(decoder, input, &warn)?;
- if let Some(attribute) = attribute {
- attributes.push(attribute);
- }
- input = rest;
- }
- }
- };
- Ok((AttributeSet(attributes), rest))
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct FileAttributeRecord(AttributeSet);
-
-impl FileAttributeRecord {
- pub fn parse(decoder: &Decoder, input: &str, warn: impl Fn(Error)) -> Result<Self, Error> {
- let (set, rest) = AttributeSet::parse(decoder, input, None, &warn)?;
- if !rest.is_empty() {
- warn(Error::TBD);
- }
- Ok(FileAttributeRecord(set))
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct VarAttributeSet {
- pub long_var_name: Identifier,
- pub attributes: AttributeSet,
-}
-
-impl VarAttributeSet {
- fn parse<'a>(
- decoder: &Decoder,
- input: &'a str,
- warn: &impl Fn(Error),
- ) -> Result<(Option<VarAttributeSet>, &'a str), Error> {
- let Some((long_var_name, rest)) = input.split_once(':') else {
- return Err(Error::TBD);
- };
- let (attributes, rest) = AttributeSet::parse(decoder, rest, Some('/'), warn)?;
- let var_attribute = Identifier::new(long_var_name, decoder.encoding)
- .map_err(Error::InvalidAttributeVariableName)
- .warn_on_error(warn)
- .map(|name| VarAttributeSet {
- long_var_name: name,
- attributes,
- });
- Ok((var_attribute, rest))
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct VariableAttributeRecord(Vec<VarAttributeSet>);
-
-impl VariableAttributeRecord {
- pub fn parse(decoder: &Decoder, mut input: &str, warn: impl Fn(Error)) -> Result<Self, Error> {
- let mut var_attribute_sets = Vec::new();
- while !input.is_empty() {
- let Some((var_attribute, rest)) =
- VarAttributeSet::parse(decoder, input, &warn).warn_on_error(&warn)
- else {
- break;
- };
- if let Some(var_attribute) = var_attribute {
- var_attribute_sets.push(var_attribute);
- }
- input = rest;
- }
- Ok(VariableAttributeRecord(var_attribute_sets))
- }
-}
-
-#[derive(Clone, Debug)]
-pub enum MultipleResponseType {
- MultipleDichotomy {
- value: Value,
- labels: CategoryLabels,
- },
- MultipleCategory,
-}
-
-impl MultipleResponseType {
- fn decode(
- decoder: &Decoder,
- mr_set: &Identifier,
- input: &raw::MultipleResponseType,
- min_width: VarWidth,
- warn: &impl Fn(Error),
- ) -> Result<Self, Error> {
- let mr_type = match input {
- raw::MultipleResponseType::MultipleDichotomy { value, labels } => {
- let value = decoder.decode_string_cow(&value.0, warn);
- let value = match min_width {
- VarWidth::Numeric => {
- let number: f64 = value.trim().parse().map_err(|_| {
- Error::InvalidMDGroupCountedValue {
- mr_set: mr_set.clone(),
- number: value.into(),
- }
- })?;
- Value::Number(Some(number.into()))
- }
- VarWidth::String(max_width) => {
- let value = value.trim_end_matches(' ');
- let width = value.len();
- if width > max_width as usize {
- return Err(Error::TooWideMDGroupCountedValue {
- mr_set: mr_set.clone(),
- value: value.into(),
- width,
- max_width,
- });
- };
- Value::String(value.into())
- }
- };
- MultipleResponseType::MultipleDichotomy {
- value,
- labels: *labels,
- }
- }
- raw::MultipleResponseType::MultipleCategory => MultipleResponseType::MultipleCategory,
- };
- Ok(mr_type)
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct MultipleResponseSet {
- pub name: Identifier,
- pub min_width: VarWidth,
- pub max_width: VarWidth,
- pub label: String,
- pub mr_type: MultipleResponseType,
- pub dict_indexes: Vec<DictIndex>,
-}
-
-impl MultipleResponseSet {
- fn decode(
- decoder: &Decoder,
- input: &raw::MultipleResponseSet<Identifier, Cow<str>>,
- warn: &impl Fn(Error),
- ) -> Result<Self, Error> {
- let mr_set_name = input.name.clone();
- let mut dict_indexes = Vec::with_capacity(input.short_names.len());
- for short_name in input.short_names.iter() {
- let Some(&dict_index) = decoder.var_names.get(&short_name) else {
- warn(Error::UnknownMrSetVariable {
- mr_set: mr_set_name.clone(),
- short_name: short_name.clone(),
- });
- continue;
- };
- dict_indexes.push(dict_index);
- }
-
- match dict_indexes.len() {
- 0 => return Err(Error::EmptyMrSet(mr_set_name)),
- 1 => return Err(Error::OneVarMrSet(mr_set_name)),
- _ => (),
- }
-
- let Some((Some(min_width), Some(max_width))) = dict_indexes
- .iter()
- .map(|dict_index| decoder.variables[dict_index].width)
- .map(|w| (Some(w), Some(w)))
- .reduce(|(na, wa), (nb, wb)| (VarWidth::narrower(na, nb), VarWidth::wider(wa, wb)))
- else {
- return Err(Error::MixedMrSet(mr_set_name));
- };
-
- let mr_type =
- MultipleResponseType::decode(decoder, &mr_set_name, &input.mr_type, min_width, warn)?;
-
- Ok(MultipleResponseSet {
- name: mr_set_name,
- min_width,
- max_width,
- label: input.label.to_string(),
- mr_type,
- dict_indexes,
- })
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct MultipleResponseRecord(pub Vec<MultipleResponseSet>);
-
-impl TryDecode for MultipleResponseRecord {
- type Input<'a> = raw::MultipleResponseRecord<Identifier, Cow<'a, str>>;
-
- fn try_decode(
- decoder: &mut Decoder,
- input: &Self::Input<'_>,
- warn: impl Fn(Error),
- ) -> Result<Option<Self>, Error> {
- let mut sets = Vec::with_capacity(input.0.len());
- for set in &input.0 {
- match MultipleResponseSet::decode(decoder, set, &warn) {
- Ok(set) => sets.push(set),
- Err(error) => warn(error),
- }
- }
- Ok(Some(MultipleResponseRecord(sets)))
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct LongStringValueLabels {
- pub var_name: Identifier,
- pub width: VarWidth,
- pub labels: Vec<ValueLabel>,
-}
-
-impl LongStringValueLabels {
- fn decode(
- decoder: &Decoder,
- input: &raw::LongStringValueLabels<RawString>,
- warn: &impl Fn(Error),
- ) -> Result<Self, Error> {
- let var_name = decoder.decode_string(&input.var_name.0, warn);
- let var_name = Identifier::new(var_name.trim_end(), decoder.encoding)
- .map_err(Error::InvalidLongStringValueLabelName)?;
-
- let min_width = 9;
- let max_width = VarWidth::MAX_STRING;
- if input.width < 9 || input.width > max_width as u32 {
- return Err(Error::InvalidLongValueLabelWidth {
- name: var_name,
- width: input.width,
- min_width,
- max_width,
- });
- }
- let width = input.width as u16;
-
- let mut labels = Vec::with_capacity(input.labels.len());
- for (value, label) in input.labels.iter() {
- let value = Value::String(decoder.decode_exact_length(&value.0).into());
- let label = decoder.decode_string(&label.0, warn);
- labels.push(ValueLabel { value, label });
- }
-
- Ok(LongStringValueLabels {
- var_name,
- width: VarWidth::String(width),
- labels,
- })
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct LongStringValueLabelRecord(pub Vec<LongStringValueLabels>);
-
-impl TryDecode for LongStringValueLabelRecord {
- type Input<'a> = raw::LongStringValueLabelRecord<RawString>;
-
- fn try_decode(
- decoder: &mut Decoder,
- input: &Self::Input<'_>,
- warn: impl Fn(Error),
- ) -> Result<Option<Self>, Error> {
- let mut labels = Vec::with_capacity(input.0.len());
- for label in &input.0 {
- match LongStringValueLabels::decode(decoder, label, &warn) {
- Ok(set) => labels.push(set),
- Err(error) => warn(error),
- }
- }
- Ok(Some(LongStringValueLabelRecord(labels)))
- }
-}
-
-#[cfg(test)]
-mod test {
- use encoding_rs::WINDOWS_1252;
-
- #[test]
- fn test() {
- let mut s = String::new();
- s.push(char::REPLACEMENT_CHARACTER);
- let encoded = WINDOWS_1252.encode(&s).0;
- let decoded = WINDOWS_1252.decode(&encoded[..]).0;
- println!("{:?}", decoded);
- }
-
- #[test]
- fn test2() {
- let charset: Vec<u8> = (0..=255).collect();
- println!("{}", charset.len());
- let decoded = WINDOWS_1252.decode(&charset[..]).0;
- println!("{}", decoded.len());
- let encoded = WINDOWS_1252.encode(&decoded[..]).0;
- println!("{}", encoded.len());
- assert_eq!(&charset[..], &encoded[..]);
- }
-}
-*/
+++ /dev/null
-use std::{
- cmp::Ordering,
- collections::{HashMap, HashSet},
- fmt::Debug,
- ops::{Bound, RangeBounds},
-};
-
-use encoding_rs::Encoding;
-use indexmap::IndexSet;
-use num::integer::div_ceil;
-use ordered_float::OrderedFloat;
-use unicase::UniCase;
-
-use crate::{
- format::Format,
- identifier::{ByIdentifier, HasIdentifier, Identifier},
- raw::{self, Alignment, CategoryLabels, Decoder, Measure, MissingValues, RawStr, VarType},
-};
-
-pub type DictIndex = usize;
-
-#[derive(Copy, Clone, Debug, PartialEq, Eq)]
-pub enum VarWidth {
- Numeric,
- String(u16),
-}
-
-impl PartialOrd for VarWidth {
- fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
- match (self, other) {
- (VarWidth::Numeric, VarWidth::Numeric) => Some(Ordering::Equal),
- (VarWidth::String(a), VarWidth::String(b)) => Some(a.cmp(b)),
- _ => None,
- }
- }
-}
-
-impl VarWidth {
- pub const MAX_STRING: u16 = 32767;
-
- pub fn n_dict_indexes(self) -> usize {
- match self {
- VarWidth::Numeric => 1,
- VarWidth::String(w) => div_ceil(w as usize, 8),
- }
- }
-
- fn width_predicate(
- a: Option<VarWidth>,
- b: Option<VarWidth>,
- f: impl Fn(u16, u16) -> u16,
- ) -> Option<VarWidth> {
- match (a, b) {
- (Some(VarWidth::Numeric), Some(VarWidth::Numeric)) => Some(VarWidth::Numeric),
- (Some(VarWidth::String(a)), Some(VarWidth::String(b))) => {
- Some(VarWidth::String(f(a, b)))
- }
- _ => None,
- }
- }
-
- /// Returns the wider of `self` and `other`:
- /// - Numerical variable widths are equally wide.
- /// - Longer strings are wider than shorter strings.
- /// - Numerical and string types are incomparable, so result in `None`.
- /// - Any `None` in the input yields `None` in the output.
- pub fn wider(a: Option<VarWidth>, b: Option<VarWidth>) -> Option<VarWidth> {
- Self::width_predicate(a, b, |a, b| a.max(b))
- }
-
- /// Returns the narrower of `self` and `other` (see [`Self::wider`]).
- pub fn narrower(a: Option<VarWidth>, b: Option<VarWidth>) -> Option<VarWidth> {
- Self::width_predicate(a, b, |a, b| a.min(b))
- }
-
- pub fn default_display_width(&self) -> u32 {
- match self {
- VarWidth::Numeric => 8,
- VarWidth::String(width) => *width.min(&32) as u32,
- }
- }
-
- pub fn from_raw(raw: impl Into<i32>) -> Result<Self, ()> {
- let raw: i32 = raw.into();
- match raw {
- 0 => Ok(Self::Numeric),
- 1..=255 => Ok(Self::String(raw as u16)),
- _ => Err(()),
- }
- }
-
- pub fn is_long_string(&self) -> bool {
- if let Self::String(width) = self {
- *width > 8
- } else {
- false
- }
- }
-}
-
-impl From<VarWidth> for VarType {
- fn from(source: VarWidth) -> Self {
- match source {
- VarWidth::Numeric => VarType::Numeric,
- VarWidth::String(_) => VarType::String,
- }
- }
-}
-
-#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
-pub enum Value {
- Number(Option<OrderedFloat<f64>>),
- String(String),
-}
-
-impl Value {
- pub fn decode(raw: &raw::Value<RawStr<8>>, decoder: &Decoder) -> Self {
- match raw {
- raw::Value::Number(x) => Value::Number(x.map(|x| x.into())),
- raw::Value::String(s) => Value::String(decoder.decode_exact_length(&s.0).into()),
- }
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct Dictionary {
- pub variables: IndexSet<ByIdentifier<Variable>>,
- pub split_file: Vec<DictIndex>,
- pub weight: Option<DictIndex>,
- pub filter: Option<DictIndex>,
- pub case_limit: Option<u64>,
- pub file_label: Option<String>,
- pub documents: Vec<String>,
- pub vectors: HashSet<ByIdentifier<Vector>>,
- pub attributes: HashMap<Identifier, Vec<String>>,
- pub mrsets: HashSet<ByIdentifier<MultipleResponseSet>>,
- pub variable_sets: HashSet<ByIdentifier<VariableSet>>,
- pub encoding: &'static Encoding,
-}
-
-#[derive(Debug)]
-pub struct DuplicateVariableName;
-
-impl Dictionary {
- pub fn new(encoding: &'static Encoding) -> Self {
- Self {
- variables: IndexSet::new(),
- split_file: Vec::new(),
- weight: None,
- filter: None,
- case_limit: None,
- file_label: None,
- documents: Vec::new(),
- vectors: HashSet::new(),
- attributes: HashMap::new(),
- mrsets: HashSet::new(),
- variable_sets: HashSet::new(),
- encoding,
- }
- }
-
- pub fn add_var(&mut self, variable: Variable) -> Result<usize, DuplicateVariableName> {
- let (index, inserted) = self.variables.insert_full(ByIdentifier::new(variable));
- if inserted {
- Ok(index)
- } else {
- Err(DuplicateVariableName)
- }
- }
-
- pub fn reorder_var(&mut self, from_index: DictIndex, to_index: DictIndex) {
- if from_index != to_index {
- self.variables.move_index(from_index, to_index);
- self.update_dict_indexes(&|index| {
- #[allow(clippy::collapsible_else_if)]
- if index == from_index {
- Some(to_index)
- } else if from_index < to_index {
- if index > from_index && index <= to_index {
- Some(index - 1)
- } else {
- Some(index)
- }
- } else {
- if index >= to_index && index < from_index {
- Some(index + 1)
- } else {
- Some(index)
- }
- }
- })
- }
- }
-
- pub fn retain_vars<F>(&mut self, keep: F)
- where
- F: Fn(&Variable) -> bool,
- {
- let mut deleted = Vec::new();
- let mut index = 0;
- self.variables.retain(|var_by_id| {
- let keep = keep(&var_by_id.0);
- if !keep {
- deleted.push(index);
- }
- index += 1;
- keep
- });
- if !deleted.is_empty() {
- self.update_dict_indexes(&|index| match deleted.binary_search(&index) {
- Ok(_) => None,
- Err(position) => Some(position),
- })
- }
- }
-
- pub fn delete_vars<R>(&mut self, range: R)
- where
- R: RangeBounds<DictIndex>,
- {
- let start = match range.start_bound() {
- Bound::Included(&start) => start,
- Bound::Excluded(&start) => start + 1,
- Bound::Unbounded => 0,
- };
- let end = match range.end_bound() {
- Bound::Included(&end) => end + 1,
- Bound::Excluded(&end) => end,
- Bound::Unbounded => self.variables.len(),
- };
- if end > start {
- self.variables.drain(start..end);
- self.update_dict_indexes(&|index| {
- if index < start {
- Some(index)
- } else if index < end {
- None
- } else {
- Some(index - end - start)
- }
- })
- }
- }
-
- fn update_dict_indexes<F>(&mut self, f: &F)
- where
- F: Fn(DictIndex) -> Option<DictIndex>,
- {
- update_dict_index_vec(&mut self.split_file, f);
- self.weight = self.weight.and_then(f);
- self.filter = self.filter.and_then(f);
- self.vectors = self
- .vectors
- .drain()
- .filter_map(|vector_by_id| {
- vector_by_id
- .0
- .with_updated_dict_indexes(f)
- .map(ByIdentifier::new)
- })
- .collect();
- self.mrsets = self
- .mrsets
- .drain()
- .filter_map(|mrset_by_id| {
- mrset_by_id
- .0
- .with_updated_dict_indexes(f)
- .map(ByIdentifier::new)
- })
- .collect();
- self.variable_sets = self
- .variable_sets
- .drain()
- .filter_map(|var_set_by_id| {
- var_set_by_id
- .0
- .with_updated_dict_indexes(f)
- .map(ByIdentifier::new)
- })
- .collect();
- }
-}
-
-fn update_dict_index_vec<F>(dict_indexes: &mut Vec<DictIndex>, f: F)
-where
- F: Fn(DictIndex) -> Option<DictIndex>,
-{
- dict_indexes.retain_mut(|index| {
- if let Some(new) = f(*index) {
- *index = new;
- true
- } else {
- false
- }
- });
-}
-
-#[derive(Clone, Copy, Debug, Eq, PartialEq, Hash)]
-pub enum Role {
- Input,
- Target,
- Both,
- None,
- Partition,
- Split,
-}
-
-impl Default for Role {
- fn default() -> Self {
- Self::Input
- }
-}
-
-pub enum DictClass {
- Ordinary,
- System,
- Scratch,
-}
-
-impl DictClass {
- pub fn from_identifier(id: &Identifier) -> Self {
- if id.0.starts_with('$') {
- Self::System
- } else if id.0.starts_with('#') {
- Self::Scratch
- } else {
- Self::Ordinary
- }
- }
-
- pub fn must_leave(self) -> bool {
- match self {
- DictClass::Ordinary => false,
- DictClass::System => false,
- DictClass::Scratch => true,
- }
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct Variable {
- pub name: Identifier,
- pub width: VarWidth,
- pub missing_values: MissingValues,
- pub print_format: Format,
- pub write_format: Format,
- pub value_labels: HashMap<Value, String>,
- pub label: Option<String>,
- pub measure: Option<Measure>,
- pub role: Role,
- pub display_width: u32,
- pub alignment: Alignment,
- pub leave: bool,
- pub short_names: Vec<Identifier>,
- pub attributes: HashSet<ByIdentifier<Attribute>>,
-}
-
-impl Variable {
- pub fn new(name: Identifier, width: VarWidth) -> Self {
- let var_type = VarType::from_width(width);
- let leave = DictClass::from_identifier(&name).must_leave();
- Self {
- name,
- width,
- missing_values: MissingValues::default(),
- print_format: Format::default_for_width(width),
- write_format: Format::default_for_width(width),
- value_labels: HashMap::new(),
- label: None,
- measure: Measure::default_for_type(var_type),
- role: Role::default(),
- display_width: width.default_display_width(),
- alignment: Alignment::default_for_type(var_type),
- leave,
- short_names: Vec::new(),
- attributes: HashSet::new(),
- }
- }
-}
-
-impl HasIdentifier for Variable {
- fn identifier(&self) -> &UniCase<String> {
- &self.name.0
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct Vector {
- pub name: Identifier,
- pub variables: Vec<DictIndex>,
-}
-
-impl Vector {
- fn with_updated_dict_indexes(
- mut self,
- f: impl Fn(DictIndex) -> Option<DictIndex>,
- ) -> Option<Self> {
- update_dict_index_vec(&mut self.variables, f);
- (!self.variables.is_empty()).then_some(self)
- }
-}
-
-impl HasIdentifier for Vector {
- fn identifier(&self) -> &UniCase<String> {
- &self.name.0
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct Attribute {
- pub name: Identifier,
- pub values: Vec<String>,
-}
-
-impl HasIdentifier for Attribute {
- fn identifier(&self) -> &UniCase<String> {
- &self.name.0
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct MultipleResponseSet {
- pub name: Identifier,
- pub label: String,
- pub mr_type: MultipleResponseType,
- pub variables: Vec<DictIndex>,
-}
-
-impl MultipleResponseSet {
- fn with_updated_dict_indexes(
- mut self,
- f: impl Fn(DictIndex) -> Option<DictIndex>,
- ) -> Option<Self> {
- update_dict_index_vec(&mut self.variables, f);
- (self.variables.len() > 1).then_some(self)
- }
-}
-
-impl HasIdentifier for MultipleResponseSet {
- fn identifier(&self) -> &UniCase<String> {
- &self.name.0
- }
-}
-
-#[derive(Clone, Debug)]
-pub enum MultipleResponseType {
- MultipleDichotomy {
- value: Value,
- labels: CategoryLabels,
- },
- MultipleCategory,
-}
-
-#[derive(Clone, Debug)]
-pub struct VariableSet {
- pub name: Identifier,
- pub variables: Vec<DictIndex>,
-}
-
-impl VariableSet {
- fn with_updated_dict_indexes(
- mut self,
- f: impl Fn(DictIndex) -> Option<DictIndex>,
- ) -> Option<Self> {
- update_dict_index_vec(&mut self.variables, f);
- (!self.variables.is_empty()).then_some(self)
- }
-}
-
-impl HasIdentifier for VariableSet {
- fn identifier(&self) -> &UniCase<String> {
- &self.name.0
- }
-}
-
-#[cfg(test)]
-mod test {
- use std::collections::HashSet;
-
- use unicase::UniCase;
-
- use crate::identifier::Identifier;
-
- use super::{ByIdentifier, HasIdentifier};
-
- #[derive(PartialEq, Eq, Debug, Clone)]
- struct Variable {
- name: Identifier,
- value: i32,
- }
-
- impl HasIdentifier for Variable {
- fn identifier(&self) -> &UniCase<String> {
- &self.name.0
- }
- }
-
- #[test]
- fn test() {
- // Variables should not be the same if their values differ.
- let abcd = Identifier::new("abcd").unwrap();
- let abcd1 = Variable {
- name: abcd.clone(),
- value: 1,
- };
- let abcd2 = Variable {
- name: abcd,
- value: 2,
- };
- assert_ne!(abcd1, abcd2);
-
- // But `ByName` should treat them the same.
- let abcd1_by_name = ByIdentifier::new(abcd1);
- let abcd2_by_name = ByIdentifier::new(abcd2);
- assert_eq!(abcd1_by_name, abcd2_by_name);
-
- // And a `HashSet` of `ByName` should also treat them the same.
- let mut vars: HashSet<ByIdentifier<Variable>> = HashSet::new();
- assert!(vars.insert(ByIdentifier::new(abcd1_by_name.0.clone())));
- assert!(!vars.insert(ByIdentifier::new(abcd2_by_name.0.clone())));
- assert_eq!(
- vars.get(&UniCase::new(String::from("abcd")))
- .unwrap()
- .0
- .value,
- 1
- );
- }
-}
+++ /dev/null
-use crate::locale_charset::locale_charset;
-use encoding_rs::{Encoding, UTF_8};
-
-include!(concat!(env!("OUT_DIR"), "/encodings.rs"));
-
-pub fn codepage_from_encoding(encoding: &str) -> Option<u32> {
- CODEPAGE_NAME_TO_NUMBER
- .get(encoding.to_ascii_lowercase().as_str())
- .copied()
-}
-
-use thiserror::Error as ThisError;
-
-#[derive(ThisError, Debug)]
-pub enum Error {
- #[error("This system file does not indicate its own character encoding. For best results, specify an encoding explicitly. Use SYSFILE INFO with ENCODING=\"DETECT\" to analyze the possible encodings.")]
- NoEncoding,
-
- #[error("This system file encodes text strings with unknown code page {0}.")]
- UnknownCodepage(i32),
-
- #[error("This system file encodes text strings with unknown encoding {0}.")]
- UnknownEncoding(String),
-
- #[error("This system file is encoded in EBCDIC, which is not supported.")]
- Ebcdic,
-}
-
-pub fn default_encoding() -> &'static Encoding {
- lazy_static! {
- static ref DEFAULT_ENCODING: &'static Encoding =
- Encoding::for_label(locale_charset().as_bytes()).unwrap_or(UTF_8);
- }
- &DEFAULT_ENCODING
-}
-
-pub fn get_encoding(
- encoding: Option<&str>,
- character_code: Option<i32>,
-) -> Result<&'static Encoding, Error> {
- let label = if let Some(encoding) = encoding {
- encoding
- } else if let Some(codepage) = character_code {
- match codepage {
- 1 => return Err(Error::Ebcdic),
- 2 | 3 => {
- // These ostensibly mean "7-bit ASCII" and "8-bit ASCII"[sic]
- // respectively. However, many files have character code 2 but
- // data which are clearly not ASCII. Therefore, ignore these
- // values.
- return Err(Error::NoEncoding);
- }
- 4 => "MS_KANJI",
- _ => CODEPAGE_NUMBER_TO_NAME
- .get(&codepage)
- .copied()
- .ok_or(Error::UnknownCodepage(codepage))?,
- }
- } else {
- return Err(Error::NoEncoding);
- };
-
- Encoding::for_label(label.as_bytes()).ok_or(Error::UnknownEncoding(label.into()))
-}
+++ /dev/null
-/// The endianness for integer and floating-point numbers in SPSS system files.
-///
-/// SPSS system files can declare IBM 370 and DEC VAX floating-point
-/// representations, but no file that uses either of these has ever been found
-/// in the wild, so this code does not handle them.
-#[derive(Copy, Clone, Debug, PartialEq, Eq)]
-pub enum Endian {
- /// Big-endian: MSB at lowest address.
- Big,
-
- /// Little-endian: LSB at lowest address.
- Little,
-}
-
-impl Endian {
- #[cfg(target_endian = "big")]
- pub const NATIVE: Endian = Endian::Big;
- #[cfg(target_endian = "little")]
- pub const NATIVE: Endian = Endian::Little;
-
- pub fn identify_u32(expected_value: u32, bytes: [u8; 4]) -> Option<Self> {
- let as_big: u32 = Endian::Big.parse(bytes);
- let as_little: u32 = Endian::Little.parse(bytes);
- match (as_big == expected_value, as_little == expected_value) {
- (true, false) => Some(Endian::Big),
- (false, true) => Some(Endian::Little),
- _ => None,
- }
- }
-
- pub fn identify_f64(expected_value: f64, bytes: [u8; 8]) -> Option<Self> {
- let as_big: f64 = Endian::Big.parse(bytes);
- let as_little: f64 = Endian::Little.parse(bytes);
- match (as_big == expected_value, as_little == expected_value) {
- (true, false) => Some(Endian::Big),
- (false, true) => Some(Endian::Little),
- _ => None,
- }
- }
-}
-
-pub trait ToBytes<T, const N: usize> {
- fn to_bytes(self, value: T) -> [u8; N];
-}
-impl ToBytes<i64, 8> for Endian {
- fn to_bytes(self, value: i64) -> [u8; 8] {
- match self {
- Endian::Big => i64::to_be_bytes(value),
- Endian::Little => i64::to_le_bytes(value),
- }
- }
-}
-impl ToBytes<u32, 4> for Endian {
- fn to_bytes(self, value: u32) -> [u8; 4] {
- match self {
- Endian::Big => u32::to_be_bytes(value),
- Endian::Little => u32::to_le_bytes(value),
- }
- }
-}
-impl ToBytes<i32, 4> for Endian {
- fn to_bytes(self, value: i32) -> [u8; 4] {
- match self {
- Endian::Big => i32::to_be_bytes(value),
- Endian::Little => i32::to_le_bytes(value),
- }
- }
-}
-impl ToBytes<u16, 2> for Endian {
- fn to_bytes(self, value: u16) -> [u8; 2] {
- match self {
- Endian::Big => u16::to_be_bytes(value),
- Endian::Little => u16::to_le_bytes(value),
- }
- }
-}
-impl ToBytes<u8, 1> for Endian {
- fn to_bytes(self, value: u8) -> [u8; 1] {
- [value]
- }
-}
-impl ToBytes<f64, 8> for Endian {
- fn to_bytes(self, value: f64) -> [u8; 8] {
- match self {
- Endian::Big => f64::to_be_bytes(value),
- Endian::Little => f64::to_le_bytes(value),
- }
- }
-}
-
-/// Parses an `N`-byte slice in one of the supported formats into native format
-/// as type `T`.
-pub trait Parse<T, const N: usize> {
- /// Given 'bytes', returns `T`.
- fn parse(self, bytes: [u8; N]) -> T;
-}
-impl Parse<u64, 8> for Endian {
- fn parse(self, bytes: [u8; 8]) -> u64 {
- match self {
- Endian::Big => u64::from_be_bytes(bytes),
- Endian::Little => u64::from_le_bytes(bytes),
- }
- }
-}
-impl Parse<u32, 4> for Endian {
- fn parse(self, bytes: [u8; 4]) -> u32 {
- match self {
- Endian::Big => u32::from_be_bytes(bytes),
- Endian::Little => u32::from_le_bytes(bytes),
- }
- }
-}
-impl Parse<u16, 2> for Endian {
- fn parse(self, bytes: [u8; 2]) -> u16 {
- match self {
- Endian::Big => u16::from_be_bytes(bytes),
- Endian::Little => u16::from_le_bytes(bytes),
- }
- }
-}
-impl Parse<u8, 1> for Endian {
- fn parse(self, bytes: [u8; 1]) -> u8 {
- match self {
- Endian::Big => u8::from_be_bytes(bytes),
- Endian::Little => u8::from_le_bytes(bytes),
- }
- }
-}
-impl Parse<i64, 8> for Endian {
- fn parse(self, bytes: [u8; 8]) -> i64 {
- match self {
- Endian::Big => i64::from_be_bytes(bytes),
- Endian::Little => i64::from_le_bytes(bytes),
- }
- }
-}
-impl Parse<i32, 4> for Endian {
- fn parse(self, bytes: [u8; 4]) -> i32 {
- match self {
- Endian::Big => i32::from_be_bytes(bytes),
- Endian::Little => i32::from_le_bytes(bytes),
- }
- }
-}
-impl Parse<i16, 2> for Endian {
- fn parse(self, bytes: [u8; 2]) -> i16 {
- match self {
- Endian::Big => i16::from_be_bytes(bytes),
- Endian::Little => i16::from_le_bytes(bytes),
- }
- }
-}
-impl Parse<i8, 1> for Endian {
- fn parse(self, bytes: [u8; 1]) -> i8 {
- match self {
- Endian::Big => i8::from_be_bytes(bytes),
- Endian::Little => i8::from_le_bytes(bytes),
- }
- }
-}
-impl Parse<f64, 8> for Endian {
- fn parse(self, bytes: [u8; 8]) -> f64 {
- match self {
- Endian::Big => f64::from_be_bytes(bytes),
- Endian::Little => f64::from_le_bytes(bytes),
- }
- }
-}
+++ /dev/null
-use crate::{
- command::parse,
- lex::{lexer::{Lexer, Source}, token::Token},
- message::Diagnostic,
-};
-
-pub struct Engine {
- lexer: Lexer,
-}
-
-impl Engine {
- fn new() -> Self {
- Self {
- lexer: Lexer::new(Box::new(|location, error| println!("{location}: {error}"))),
- }
- }
- fn run(&mut self, source: Source) {
- self.lexer.append(source);
- self.lexer.get();
- while self.lexer.token() != &Token::End {
- let error: Box<dyn Fn(Diagnostic)> = Box::new(|diagnostic| {
- println!("{diagnostic}");
- });
- parse(&mut self.lexer, &error);
- }
- }
-}
-
-#[cfg(test)]
-mod tests {
- use encoding_rs::UTF_8;
-
- use crate::lex::{
- lexer::{ErrorHandling, Source},
- segment::Mode,
- };
-
- use super::Engine;
-
- #[test]
- fn test_echo() {
- let mut engine = Engine::new();
- engine.run(Source::for_file_contents(
- "ECHO 'hi there'.\nECHO 'bye there'.\n".to_string(),
- Some("test.sps".to_string()),
- UTF_8,
- Mode::default(),
- ErrorHandling::default(),
- ));
- }
-}
+++ /dev/null
-use std::{
- fmt::{Display, Formatter, Result as FmtResult},
- ops::RangeInclusive,
-};
-
-use enum_map::{Enum, EnumMap};
-use thiserror::Error as ThisError;
-
-use crate::{
- dictionary::VarWidth,
- raw::{self, VarType},
-};
-
-#[derive(ThisError, Debug)]
-pub enum Error {
- #[error("Unknown format type {value}.")]
- UnknownFormat { value: u16 },
-
- #[error("Output format {0} specifies width {}, but {} requires an even width.", .0.w, .0.type_)]
- OddWidthNotAllowed(UncheckedFormat),
-
- #[error("Output format {0} specifies width {}, but {} requires a width between {} and {}.", .0.w, .0.type_, .0.type_.min_width(), .0.type_.max_width())]
- BadWidth(UncheckedFormat),
-
- #[error("Output format {0} specifies decimal places, but {} format does not allow any decimals.", .0.type_)]
- DecimalsNotAllowedForFormat(UncheckedFormat),
-
- #[error("Output format {0} specifies {} decimal places, but with a width of {}, {} does not allow any decimal places.", .0.d, .0.w, .0.type_)]
- DecimalsNotAllowedForWidth(UncheckedFormat),
-
- #[error("Output format {spec} specifies {} decimal places but, with a width of {}, {} allows at most {max_d} decimal places.", .spec.d, .spec.w, .spec.type_)]
- TooManyDecimalsForWidth {
- spec: UncheckedFormat,
- max_d: Decimals,
- },
-
- #[error("String variable is not compatible with numeric format {0}.")]
- UnnamedVariableNotCompatibleWithNumericFormat(Type),
-
- #[error("Numeric variable is not compatible with string format {0}.")]
- UnnamedVariableNotCompatibleWithStringFormat(Type),
-
- #[error("String variable {variable} with width {width} is not compatible with format {bad_spec}. Use format {good_spec} instead.")]
- NamedStringVariableBadSpecWidth {
- variable: String,
- width: Width,
- bad_spec: Format,
- good_spec: Format,
- },
-
- #[error("String variable with width {width} is not compatible with format {bad_spec}. Use format {good_spec} instead.")]
- UnnamedStringVariableBadSpecWidth {
- width: Width,
- bad_spec: Format,
- good_spec: Format,
- },
-}
-
-#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
-pub enum Category {
- // Numeric formats.
- Basic,
- Custom,
- Legacy,
- Binary,
- Hex,
- Date,
- Time,
- DateComponent,
-
- // String formats.
- String,
-}
-
-impl From<Type> for Category {
- fn from(source: Type) -> Self {
- match source {
- Type::F | Type::Comma | Type::Dot | Type::Dollar | Type::Pct | Type::E => Self::Basic,
- Type::CC(_) => Self::Custom,
- Type::N | Type::Z => Self::Legacy,
- Type::P | Type::PK | Type::IB | Type::PIB | Type::RB => Self::Binary,
- Type::PIBHex | Type::RBHex => Self::Hex,
- Type::Date
- | Type::ADate
- | Type::EDate
- | Type::JDate
- | Type::SDate
- | Type::QYr
- | Type::MoYr
- | Type::WkYr
- | Type::DateTime
- | Type::YMDHMS => Self::Date,
- Type::MTime | Type::Time | Type::DTime => Self::Time,
- Type::WkDay | Type::Month => Self::DateComponent,
- Type::A | Type::AHex => Self::String,
- }
- }
-}
-
-#[derive(Copy, Clone, Debug, Enum, PartialEq, Eq, Hash)]
-pub enum CC {
- A,
- B,
- C,
- D,
- E,
-}
-
-impl Display for CC {
- fn fmt(&self, f: &mut Formatter) -> FmtResult {
- let s = match self {
- CC::A => "A",
- CC::B => "B",
- CC::C => "C",
- CC::D => "D",
- CC::E => "E",
- };
- write!(f, "{}", s)
- }
-}
-
-#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
-pub enum Type {
- // Basic numeric formats.
- F,
- Comma,
- Dot,
- Dollar,
- Pct,
- E,
-
- // Custom currency formats.
- CC(CC),
-
- // Legacy numeric formats.
- N,
- Z,
-
- // Binary and hexadecimal formats.
- P,
- PK,
- IB,
- PIB,
- PIBHex,
- RB,
- RBHex,
-
- // Time and date formats.
- Date,
- ADate,
- EDate,
- JDate,
- SDate,
- QYr,
- MoYr,
- WkYr,
- DateTime,
- YMDHMS,
- MTime,
- Time,
- DTime,
-
- // Date component formats.
- WkDay,
- Month,
-
- // String formats.
- A,
- AHex,
-}
-
-pub type Width = u16;
-pub type SignedWidth = i16;
-
-pub type Decimals = u8;
-
-impl Type {
- pub fn max_width(self) -> Width {
- match self {
- Self::P | Self::PK | Self::PIBHex | Self::RBHex => 16,
- Self::IB | Self::PIB | Self::RB => 8,
- Self::A => 32767,
- Self::AHex => 32767 * 2,
- _ => 40,
- }
- }
-
- pub fn min_width(self) -> Width {
- match self {
- // Basic numeric formats.
- Self::F => 1,
- Self::Comma => 1,
- Self::Dot => 1,
- Self::Dollar => 2,
- Self::Pct => 2,
- Self::E => 6,
-
- // Custom currency formats.
- Self::CC(_) => 2,
-
- // Legacy numeric formats.
- Self::N => 1,
- Self::Z => 1,
-
- // Binary and hexadecimal formats.
- Self::P => 1,
- Self::PK => 1,
- Self::IB => 1,
- Self::PIB => 1,
- Self::PIBHex => 2,
- Self::RB => 2,
- Self::RBHex => 4,
-
- // Time and date formats.
- Self::Date => 9,
- Self::ADate => 8,
- Self::EDate => 8,
- Self::JDate => 5,
- Self::SDate => 8,
- Self::QYr => 6,
- Self::MoYr => 6,
- Self::WkYr => 8,
- Self::DateTime => 17,
- Self::YMDHMS => 16,
- Self::MTime => 5,
- Self::Time => 5,
- Self::DTime => 8,
-
- // Date component formats.
- Self::WkDay => 2,
- Self::Month => 3,
-
- // String formats.
- Self::A => 1,
- Self::AHex => 2,
- }
- }
-
- pub fn width_range(self) -> RangeInclusive<Width> {
- self.min_width()..=self.max_width()
- }
-
- pub fn max_decimals(self, width: Width) -> Decimals {
- let width = width.clamp(1, 40) as SignedWidth;
- let max = match self {
- Self::F | Self::Comma | Self::Dot | Self::CC(_) => width - 1,
- Self::Dollar | Self::Pct => width - 2,
- Self::E => width - 7,
- Self::N | Self::Z => width,
- Self::P => width * 2 - 1,
- Self::PK => width * 2,
- Self::IB | Self::PIB => max_digits_for_bytes(width as usize) as SignedWidth,
- Self::PIBHex => 0,
- Self::RB | Self::RBHex => 16,
- Self::Date
- | Self::ADate
- | Self::EDate
- | Self::JDate
- | Self::SDate
- | Self::QYr
- | Self::MoYr
- | Self::WkYr => 0,
- Self::DateTime => width - 21,
- Self::YMDHMS => width - 20,
- Self::MTime => width - 6,
- Self::Time => width - 9,
- Self::DTime => width - 12,
- Self::WkDay | Self::Month | Self::A | Self::AHex => 0,
- };
- max.clamp(0, 16) as Decimals
- }
-
- pub fn takes_decimals(self) -> bool {
- self.max_decimals(Width::MAX) > 0
- }
-
- pub fn category(self) -> Category {
- self.into()
- }
-
- pub fn width_step(self) -> Width {
- if self.category() == Category::Hex || self == Self::AHex {
- 2
- } else {
- 1
- }
- }
-
- pub fn clamp_width(self, width: Width) -> Width {
- let (min, max) = self.width_range().into_inner();
- let width = width.clamp(min, max);
- if self.width_step() == 2 {
- width / 2 * 2
- } else {
- width
- }
- }
-
- pub fn var_type(self) -> VarType {
- match self {
- Self::A | Self::AHex => VarType::String,
- _ => VarType::Numeric,
- }
- }
-
- /// Checks whether this format is valid for a variable with the given
- /// `var_type`.
- pub fn check_type_compatibility(self, var_type: VarType) -> Result<(), Error> {
- let my_type = self.var_type();
- match (my_type, var_type) {
- (VarType::Numeric, VarType::String) => {
- Err(Error::UnnamedVariableNotCompatibleWithNumericFormat(self))
- }
- (VarType::String, VarType::Numeric) => {
- Err(Error::UnnamedVariableNotCompatibleWithStringFormat(self))
- }
- _ => Ok(()),
- }
- }
-}
-
-impl Display for Type {
- fn fmt(&self, f: &mut Formatter) -> FmtResult {
- let s = match self {
- Self::F => "F",
- Self::Comma => "COMMA",
- Self::Dot => "DOT",
- Self::Dollar => "DOLLAR",
- Self::Pct => "PCT",
- Self::E => "E",
- Self::CC(cc) => return write!(f, "{}", cc),
- Self::N => "N",
- Self::Z => "Z",
- Self::P => "P",
- Self::PK => "PK",
- Self::IB => "IB",
- Self::PIB => "PIB",
- Self::PIBHex => "PIBHEX",
- Self::RB => "RB",
- Self::RBHex => "RBHEX",
- Self::Date => "DATE",
- Self::ADate => "ADATE",
- Self::EDate => "EDATE",
- Self::JDate => "JDATE",
- Self::SDate => "SDATE",
- Self::QYr => "QYR",
- Self::MoYr => "MOYR",
- Self::WkYr => "WKYR",
- Self::DateTime => "DATETIME",
- Self::YMDHMS => "YMDHMS",
- Self::MTime => "MTIME",
- Self::Time => "TIME",
- Self::DTime => "DTIME",
- Self::WkDay => "WKDAY",
- Self::Month => "MONTH",
- Self::A => "A",
- Self::AHex => "AHEX",
- };
- write!(f, "{}", s)
- }
-}
-
-fn max_digits_for_bytes(bytes: usize) -> usize {
- *[0, 3, 5, 8, 10, 13, 15, 17].get(bytes).unwrap_or(&20)
-}
-
-#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
-pub struct Format {
- type_: Type,
- w: Width,
- d: Decimals,
-}
-
-impl Format {
- pub const F40: Format = Format {
- type_: Type::F,
- w: 40,
- d: 0,
- };
-
- pub const F8_2: Format = Format {
- type_: Type::F,
- w: 8,
- d: 2,
- };
-
- pub fn format(self) -> Type {
- self.type_
- }
- pub fn w(self) -> Width {
- self.w
- }
- pub fn d(self) -> Decimals {
- self.d
- }
-
- pub fn default_for_width(var_width: VarWidth) -> Self {
- match var_width {
- VarWidth::Numeric => Format {
- type_: Type::F,
- w: 8,
- d: 2,
- },
- VarWidth::String(w) => Format {
- type_: Type::A,
- w,
- d: 0,
- },
- }
- }
-
- pub fn fixed_from(source: &UncheckedFormat) -> Self {
- let UncheckedFormat {
- type_: format,
- w,
- d,
- } = *source;
- let (min, max) = format.width_range().into_inner();
- let mut w = w.clamp(min, max);
- if d <= format.max_decimals(Width::MAX) {
- while d > format.max_decimals(w) {
- w += 1;
- assert!(w <= 40);
- }
- }
- let d = d.clamp(0, format.max_decimals(w));
- Self {
- type_: format,
- w,
- d,
- }
- }
-
- pub fn var_width(self) -> VarWidth {
- match self.type_ {
- Type::A => VarWidth::String(self.w),
- Type::AHex => VarWidth::String(self.w / 2),
- _ => VarWidth::Numeric,
- }
- }
-
- pub fn var_type(self) -> VarType {
- self.type_.var_type()
- }
-
- /// Checks whether this format specification is valid for a variable with
- /// width `var_width`.
- pub fn check_width_compatibility(self, var_width: VarWidth) -> Result<Self, Error> {
- // Verify that the format is right for the variable's type.
- self.type_.check_type_compatibility(var_width.into())?;
-
- if let VarWidth::String(w) = var_width {
- if var_width != self.var_width() {
- let bad_spec = self;
- let good_spec = if self.type_ == Type::A {
- Format { w, ..self }
- } else {
- Format { w: w * 2, ..self }
- };
- return Err(Error::UnnamedStringVariableBadSpecWidth {
- width: w,
- bad_spec,
- good_spec,
- });
- }
- }
-
- Ok(self)
- }
-}
-
-impl Display for Format {
- fn fmt(&self, f: &mut Formatter) -> FmtResult {
- write!(f, "{}{}", self.type_, self.w)?;
- if self.type_.takes_decimals() || self.d > 0 {
- write!(f, ".{}", self.d)?;
- }
- Ok(())
- }
-}
-
-impl TryFrom<UncheckedFormat> for Format {
- type Error = Error;
-
- fn try_from(source: UncheckedFormat) -> Result<Self, Self::Error> {
- let UncheckedFormat {
- type_: format,
- w,
- d,
- } = source;
- let max_d = format.max_decimals(w);
- if w % format.width_step() != 0 {
- Err(Error::OddWidthNotAllowed(source))
- } else if !format.width_range().contains(&w) {
- Err(Error::BadWidth(source))
- } else if d > max_d {
- if format.takes_decimals() {
- Err(Error::DecimalsNotAllowedForFormat(source))
- } else if max_d > 0 {
- Err(Error::TooManyDecimalsForWidth {
- spec: source,
- max_d,
- })
- } else {
- Err(Error::DecimalsNotAllowedForWidth(source))
- }
- } else {
- Ok(Format {
- type_: format,
- w,
- d,
- })
- }
- }
-}
-
-impl TryFrom<u16> for Type {
- type Error = Error;
-
- fn try_from(source: u16) -> Result<Self, Self::Error> {
- match source {
- 1 => Ok(Self::A),
- 2 => Ok(Self::AHex),
- 3 => Ok(Self::Comma),
- 4 => Ok(Self::Dollar),
- 5 => Ok(Self::F),
- 6 => Ok(Self::IB),
- 7 => Ok(Self::PIBHex),
- 8 => Ok(Self::P),
- 9 => Ok(Self::PIB),
- 10 => Ok(Self::PK),
- 11 => Ok(Self::RB),
- 12 => Ok(Self::RBHex),
- 15 => Ok(Self::Z),
- 16 => Ok(Self::N),
- 17 => Ok(Self::E),
- 20 => Ok(Self::Date),
- 21 => Ok(Self::Time),
- 22 => Ok(Self::DateTime),
- 23 => Ok(Self::ADate),
- 24 => Ok(Self::JDate),
- 25 => Ok(Self::DTime),
- 26 => Ok(Self::WkDay),
- 27 => Ok(Self::Month),
- 28 => Ok(Self::MoYr),
- 29 => Ok(Self::QYr),
- 30 => Ok(Self::WkYr),
- 31 => Ok(Self::Pct),
- 32 => Ok(Self::Dot),
- 33 => Ok(Self::CC(CC::A)),
- 34 => Ok(Self::CC(CC::B)),
- 35 => Ok(Self::CC(CC::C)),
- 36 => Ok(Self::CC(CC::D)),
- 37 => Ok(Self::CC(CC::E)),
- 38 => Ok(Self::EDate),
- 39 => Ok(Self::SDate),
- 40 => Ok(Self::MTime),
- 41 => Ok(Self::YMDHMS),
- _ => Err(Error::UnknownFormat { value: source }),
- }
- }
-}
-
-#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
-pub struct UncheckedFormat {
- pub type_: Type,
-
- pub w: Width,
-
- pub d: Decimals,
-}
-
-impl TryFrom<raw::Spec> for UncheckedFormat {
- type Error = Error;
-
- fn try_from(raw: raw::Spec) -> Result<Self, Self::Error> {
- let raw = raw.0;
- let raw_format = (raw >> 16) as u16;
- let format = raw_format.try_into()?;
- let w = ((raw >> 8) & 0xff) as Width;
- let d = (raw & 0xff) as Decimals;
- Ok(Self {
- type_: format,
- w,
- d,
- })
- }
-}
-
-impl Display for UncheckedFormat {
- fn fmt(&self, f: &mut Formatter) -> FmtResult {
- write!(f, "{}{}", self.type_, self.w)?;
- if self.type_.takes_decimals() || self.d > 0 {
- write!(f, ".{}", self.d)?;
- }
- Ok(())
- }
-}
-
-pub struct Settings {
- epoch: Option<i32>,
-
- /// Either `'.'` or `','`.
- decimal: char,
-
- /// Format `F`, `E`, `COMMA`, and `DOT` with leading zero (e.g. `0.5`
- /// instead of `.5`)?
- include_leading_zero: bool,
-
- /// Custom currency styles.
- ccs: EnumMap<CC, Option<NumberStyle>>,
-}
-
-impl Default for Settings {
- fn default() -> Self {
- Self {
- epoch: None,
- decimal: '.',
- include_leading_zero: false,
- ccs: Default::default(),
- }
- }
-}
-
-/// A numeric output style. This can express numeric formats in
-/// [Category::Basic] and [Category::Custom].
-pub struct NumberStyle {
- neg_prefix: Affix,
- prefix: Affix,
- suffix: Affix,
- neg_suffix: Affix,
-
- /// Decimal point: `'.'` or `','`.
- decimal: char,
-
- /// Grouping character: `'.'` or `','` or `None`.
- grouping: Option<char>,
-
- /// Format as `.5` or `0.5`?
- include_leading_zero: bool,
-
- /// An `Affix` may require more bytes than its display width; for example,
- /// U+00A5 (¥) is 2 bytes in UTF-8 but occupies only one display column.
- /// This member is the sum of the number of bytes required by all of the
- /// `Affix` members in this struct, minus their display widths. Thus, it
- /// can be used to size memory allocations: for example, the formatted
- /// result of `CCA20.5` requires no more than `(20 + extra_bytes)` bytes in
- /// UTF-8.
- extra_bytes: usize,
-}
-
-pub struct Affix {
- /// String contents of affix.
- s: String,
-
- /// Display width in columns (see [unicode_width])
- width: usize,
-}
+++ /dev/null
-use num::Float;
-use std::{num::FpCategory, fmt::{Display, Formatter, Result}};
-
-pub struct HexFloat<T: Float>(pub T);
-
-impl<T: Float> Display for HexFloat<T> {
- fn fmt(&self, f: &mut Formatter<'_>) -> Result {
- let sign = if self.0.is_sign_negative() { "-" } else { "" };
- match self.0.classify() {
- FpCategory::Nan => return write!(f, "NaN"),
- FpCategory::Infinite => return write!(f, "{sign}Infinity"),
- FpCategory::Zero => return write!(f, "{sign}0.0"),
- _ => (),
- };
- let (significand, mut exponent, _) = self.0.integer_decode();
- let mut hex_sig = format!("{:x}", significand);
- while hex_sig.ends_with('0') {
- hex_sig.pop();
- exponent += 4;
- }
- match hex_sig.len() {
- 0 => write!(f, "{sign}0.0"),
- 1 => write!(f, "{sign}0x{hex_sig}.0p{exponent}"),
- len => write!(
- f,
- "{sign}0x{}.{}p{}",
- hex_sig.chars().next().unwrap(),
- &hex_sig[1..],
- exponent + 4 * (len as i16 - 1)
- ),
- }
- }
-}
-
-#[cfg(test)]
-mod hex_float_tests {
- use crate::HexFloat;
- use num::Float;
-
- #[test]
- fn test() {
- assert_eq!(format!("{}", HexFloat(1.0)), "0x1.0p0");
- assert_eq!(format!("{}", HexFloat(123.0)), "0x1.ecp6");
- assert_eq!(format!("{}", HexFloat(1.0 / 16.0)), "0x1.0p-4");
- assert_eq!(format!("{}", HexFloat(f64::infinity())), "Infinity");
- assert_eq!(format!("{}", HexFloat(f64::neg_infinity())), "-Infinity");
- assert_eq!(format!("{}", HexFloat(f64::nan())), "NaN");
- assert_eq!(format!("{}", HexFloat(0.0)), "0.0");
- assert_eq!(format!("{}", HexFloat(f64::neg_zero())), "-0.0");
- }
-}
-
+++ /dev/null
-use std::{
- borrow::Borrow,
- cmp::Ordering,
- fmt::{Debug, Display, Formatter, Result as FmtResult},
- hash::{Hash, Hasher},
- ops::Deref,
-};
-
-use encoding_rs::{EncoderResult, Encoding, UTF_8};
-use finl_unicode::categories::{CharacterCategories, MajorCategory};
-use thiserror::Error as ThisError;
-use unicase::UniCase;
-
-pub trait IdentifierChar {
- /// Returns true if `self` is an ASCII character that may be the first
- /// character in an identifier.
- fn ascii_may_start_id(self) -> bool;
-
- /// Returns true if `self` may be the first character in an identifier.
- fn may_start_id(self) -> bool;
-
- /// Returns true if `self` is an ASCII character that may be a second or
- /// subsequent character in an identifier.
- fn ascii_may_continue_id(self) -> bool;
-
- /// Returns true if `self` may be a second or subsequent character in an
- /// identifier.
- fn may_continue_id(self) -> bool;
-}
-
-impl IdentifierChar for char {
- fn ascii_may_start_id(self) -> bool {
- matches!(self, 'a'..='z' | 'A'..='Z' | '@' | '#' | '$' | '!')
- }
-
- fn may_start_id(self) -> bool {
- if self < '\u{0080}' {
- self.ascii_may_start_id()
- } else {
- use MajorCategory::*;
-
- [L, M, S].contains(&self.get_major_category()) && self != char::REPLACEMENT_CHARACTER
- }
- }
-
- fn ascii_may_continue_id(self) -> bool {
- matches!(self, 'a'..='z' | 'A'..='Z' | '0'..='9' | '@' | '#' | '$' | '.' | '_')
- }
-
- fn may_continue_id(self) -> bool {
- if self < '\u{0080}' {
- self.ascii_may_continue_id()
- } else {
- use MajorCategory::*;
-
- [L, M, S, N].contains(&self.get_major_category()) && self != char::REPLACEMENT_CHARACTER
- }
- }
-}
-
-#[derive(Clone, Debug, ThisError)]
-pub enum Error {
- #[error("Identifier cannot be empty string.")]
- Empty,
-
- #[error("\"{0}\" may not be used as an identifier because it is a reserved word.")]
- Reserved(String),
-
- #[error("\"!\" is not a valid identifier.")]
- Bang,
-
- #[error("\"{0}\" may not be used as an identifier because it begins with disallowed character \"{1}\".")]
- BadFirstCharacter(String, char),
-
- #[error("\"{0}\" may not be used as an identifier because it contains disallowed character \"{1}\".")]
- BadLaterCharacter(String, char),
-
- #[error("Identifier \"{id}\" is {length} bytes in the encoding in use ({encoding}), which exceeds the {max}-byte limit.")]
- TooLong {
- id: String,
- length: usize,
- encoding: &'static str,
- max: usize,
- },
-
- #[error("\"{id}\" may not be used as an identifier because the encoding in use ({encoding}) cannot represent \"{c}\".")]
- NotEncodable {
- id: String,
- encoding: &'static str,
- c: char,
- },
-}
-
-pub enum ReservedWord {
- And,
- Or,
- Not,
- Eq,
- Ge,
- Gt,
- Le,
- Lt,
- Ne,
- All,
- By,
- To,
- With,
-}
-
-impl TryFrom<&str> for ReservedWord {
- type Error = ();
-
- fn try_from(source: &str) -> Result<Self, Self::Error> {
- if !(2..=4).contains(&source.len()) {
- Err(())
- } else {
- let b = source.as_bytes();
- let c0 = b[0].to_ascii_uppercase();
- let c1 = b[1].to_ascii_uppercase();
- match (source.len(), c0, c1) {
- (2, b'B', b'Y') => Ok(Self::By),
- (2, b'E', b'Q') => Ok(Self::Eq),
- (2, b'G', b'T') => Ok(Self::Gt),
- (2, b'G', b'E') => Ok(Self::Ge),
- (2, b'L', b'T') => Ok(Self::Lt),
- (2, b'L', b'E') => Ok(Self::Le),
- (2, b'N', b'E') => Ok(Self::Ne),
- (3, b'N', b'O') if b[2].to_ascii_uppercase() == b'T' => Ok(Self::Not),
- (2, b'O', b'R') => Ok(Self::Or),
- (2, b'T', b'O') => Ok(Self::To),
- (3, b'A', b'L') if b[2].to_ascii_uppercase() == b'L' => Ok(Self::All),
- (3, b'A', b'N') if b[2].to_ascii_uppercase() == b'D' => Ok(Self::And),
- (4, b'W', b'I')
- if b[2].to_ascii_uppercase() == b'T' && b[3].to_ascii_uppercase() == b'H' =>
- {
- Ok(Self::With)
- }
- _ => Err(()),
- }
- }
- }
-}
-
-pub fn is_reserved_word(s: &str) -> bool {
- ReservedWord::try_from(s).is_ok()
-}
-
-#[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
-pub struct Identifier(pub UniCase<String>);
-
-impl Identifier {
- /// Maximum length of an identifier, in bytes. The limit applies in the
- /// encoding used by the dictionary, not in UTF-8.
- pub const MAX_LEN: usize = 64;
-
- pub fn new(s: &str) -> Result<Self, Error> {
- Self::from_encoding(s, UTF_8)
- }
- pub fn from_encoding(s: &str, encoding: &'static Encoding) -> Result<Identifier, Error> {
- Self::is_plausible(s)?;
- let identifier = Identifier(s.into());
- identifier.check_encoding(encoding)?;
- Ok(identifier)
- }
-
- /// Checks whether this is a valid identifier in the given `encoding`. An
- /// identifier that is valid in one encoding might be invalid in another
- /// because some characters are unencodable or because it is too long.
- pub fn check_encoding(&self, encoding: &'static Encoding) -> Result<(), Error> {
- let s = self.0.as_str();
- let (_encoded, _, unencodable) = encoding.encode(s);
- if unencodable {
- let mut encoder = encoding.new_encoder();
- let mut buf = Vec::with_capacity(
- encoder
- .max_buffer_length_from_utf8_without_replacement(s.len())
- .unwrap(),
- );
- let EncoderResult::Unmappable(c) = encoder
- .encode_from_utf8_to_vec_without_replacement(s, &mut buf, true)
- .0
- else {
- unreachable!();
- };
- return Err(Error::NotEncodable {
- id: s.into(),
- encoding: encoding.name(),
- c,
- });
- }
- /*
- if encoded.len() > Self::MAX_LEN {
- return Err(Error::TooLong {
- id: s.into(),
- length: encoded.len(),
- encoding: encoding.name(),
- max: Self::MAX_LEN,
- });
- }*/
- Ok(())
- }
- pub fn is_plausible(s: &str) -> Result<(), Error> {
- if s.is_empty() {
- return Err(Error::Empty);
- }
- if is_reserved_word(s) {
- return Err(Error::Reserved(s.into()));
- }
- if s == "!" {
- return Err(Error::Bang);
- }
-
- let mut i = s.chars();
- let first = i.next().unwrap();
- if !first.may_start_id() {
- return Err(Error::BadFirstCharacter(s.into(), first));
- }
- for c in i {
- if !c.may_continue_id() {
- return Err(Error::BadLaterCharacter(s.into(), c));
- }
- }
- Ok(())
- }
-
- /// Returns true if `token` is a case-insensitive match for `keyword`.
- ///
- /// Keywords match `keyword` and `token` are identical, or `token` is at
- /// least 3 characters long and those characters are identical to `keyword`
- /// or differ only in case.
- ///
- /// `keyword` must be ASCII.
- pub fn matches_keyword(&self, keyword: &str) -> bool {
- id_match_n_nonstatic(keyword, self.0.as_str(), 3)
- }
-
- /// Returns true if `token` is a case-insensitive match for at least the
- /// first `n` characters of `keyword`.
- ///
- /// `keyword` must be ASCII.
- pub fn matches_keyword_n(&self, keyword: &str, n: usize) -> bool {
- id_match_n_nonstatic(keyword, self.0.as_str(), n)
- }
-}
-
-impl PartialEq<str> for Identifier {
- fn eq(&self, other: &str) -> bool {
- self.0.eq(&UniCase::new(other))
- }
-}
-
-/// Returns true if `token` is a case-insensitive match for `keyword`.
-///
-/// Keywords match `keyword` and `token` are identical, or `token` is at least 3
-/// characters long and those characters are identical to `keyword` or differ
-/// only in case.
-///
-/// `keyword` must be ASCII. It's normally a constant string, so it's declared
-/// as `&'static str` to make it harder to reverse the argument order. But
-/// there's no reason that a non-static string won't work, so use
-/// [`id_match_n_nonstatic`] instead if you need it.
-pub fn id_match(keyword: &'static str, token: &str) -> bool {
- id_match_n(keyword, token, 3)
-}
-
-/// Returns true if `token` is a case-insensitive match for at least the first
-/// `n` characters of `keyword`.
-///
-/// `keyword` must be ASCII. It's normally a constant string, so it's declared
-/// as `&'static str` to make it harder to reverse the argument order. But
-/// there's no reason that a non-static string won't work, so use
-/// [`id_match_n_nonstatic`] instead if you need it.
-pub fn id_match_n(keyword: &'static str, token: &str, n: usize) -> bool {
- id_match_n_nonstatic(keyword, token, n)
-}
-
-/// Returns true if `token` is a case-insensitive match for at least the first
-/// `n` characters of `keyword`.
-///
-/// `keyword` must be ASCII.
-pub fn id_match_n_nonstatic(keyword: &str, token: &str, n: usize) -> bool {
- debug_assert!(keyword.is_ascii());
- let keyword_prefix = if (n..keyword.len()).contains(&token.len()) {
- &keyword[..token.len()]
- } else {
- keyword
- };
- keyword_prefix.eq_ignore_ascii_case(token)
-}
-
-impl Display for Identifier {
- fn fmt(&self, f: &mut Formatter) -> FmtResult {
- write!(f, "{}", self.0)
- }
-}
-
-impl Debug for Identifier {
- fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult {
- write!(f, "{}", self.0)
- }
-}
-
-pub trait HasIdentifier {
- fn identifier(&self) -> &UniCase<String>;
-}
-
-pub struct ByIdentifier<T>(pub T)
-where
- T: HasIdentifier;
-
-impl<T> ByIdentifier<T>
-where
- T: HasIdentifier,
-{
- pub fn new(inner: T) -> Self {
- Self(inner)
- }
-}
-
-impl<T> PartialEq for ByIdentifier<T>
-where
- T: HasIdentifier,
-{
- fn eq(&self, other: &Self) -> bool {
- self.0.identifier().eq(other.0.identifier())
- }
-}
-
-impl<T> Eq for ByIdentifier<T> where T: HasIdentifier {}
-
-impl<T> PartialOrd for ByIdentifier<T>
-where
- T: HasIdentifier,
-{
- fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
- Some(self.cmp(other))
- }
-}
-
-impl<T> Ord for ByIdentifier<T>
-where
- T: HasIdentifier,
-{
- fn cmp(&self, other: &Self) -> Ordering {
- self.0.identifier().cmp(other.0.identifier())
- }
-}
-
-impl<T> Hash for ByIdentifier<T>
-where
- T: HasIdentifier,
-{
- fn hash<H: Hasher>(&self, state: &mut H) {
- self.0.identifier().hash(state)
- }
-}
-
-impl<T> Borrow<UniCase<String>> for ByIdentifier<T>
-where
- T: HasIdentifier,
-{
- fn borrow(&self) -> &UniCase<String> {
- self.0.identifier()
- }
-}
-
-impl<T> Debug for ByIdentifier<T>
-where
- T: HasIdentifier + Debug,
-{
- fn fmt(&self, f: &mut Formatter) -> FmtResult {
- self.0.fmt(f)
- }
-}
-
-impl<T> Clone for ByIdentifier<T>
-where
- T: HasIdentifier + Clone,
-{
- fn clone(&self) -> Self {
- Self(self.0.clone())
- }
-}
-
-impl<T> Deref for ByIdentifier<T>
-where
- T: HasIdentifier + Clone,
-{
- type Target = T;
-
- fn deref(&self) -> &Self::Target {
- &self.0
- }
-}
+++ /dev/null
-pub trait ToInteger {
- fn to_exact_integer<T>(&self) -> Option<T>
- where
- T: FromFloat;
- fn to_exact_usize(&self) -> Option<usize> {
- self.to_exact_integer()
- }
- fn to_exact_u8(&self) -> Option<u8> {
- self.to_exact_integer()
- }
- fn to_exact_u16(&self) -> Option<u16> {
- self.to_exact_integer()
- }
- fn to_exact_u32(&self) -> Option<u32> {
- self.to_exact_integer()
- }
- fn to_exact_u64(&self) -> Option<u64> {
- self.to_exact_integer()
- }
- fn to_exact_u128(&self) -> Option<u128> {
- self.to_exact_integer()
- }
- fn to_exact_isize(&self) -> Option<usize> {
- self.to_exact_integer()
- }
- fn to_exact_i8(&self) -> Option<i8> {
- self.to_exact_integer()
- }
- fn to_exact_i16(&self) -> Option<i16> {
- self.to_exact_integer()
- }
- fn to_exact_i32(&self) -> Option<i32> {
- self.to_exact_integer()
- }
- fn to_exact_i64(&self) -> Option<i64> {
- self.to_exact_integer()
- }
- fn to_exact_i128(&self) -> Option<i128> {
- self.to_exact_integer()
- }
-}
-
-impl ToInteger for f64 {
- fn to_exact_integer<T>(&self) -> Option<T>
- where
- T: FromFloat,
- {
- T::from_float(*self)
- }
-}
-
-pub trait FromFloat {
- fn from_float(x: f64) -> Option<Self>
- where
- Self: Sized;
-}
-
-macro_rules! impl_from_float {
- ($T:ident) => {
- impl FromFloat for $T {
- fn from_float(x: f64) -> Option<Self>
- where
- Self: Sized,
- {
- if x.trunc() == x && x >= $T::MIN as f64 && x <= $T::MAX as f64 {
- Some(x as Self)
- } else {
- None
- }
- }
- }
- };
-}
-
-impl_from_float!(usize);
-impl_from_float!(u8);
-impl_from_float!(u16);
-impl_from_float!(u32);
-impl_from_float!(u64);
-impl_from_float!(u128);
-impl_from_float!(isize);
-impl_from_float!(i8);
-impl_from_float!(i16);
-impl_from_float!(i32);
-impl_from_float!(i64);
-impl_from_float!(i128);
+++ /dev/null
-use crate::identifier::id_match_n_nonstatic;
-
-pub struct Match {
- pub exact: bool,
- pub missing_words: isize,
-}
-
-fn count_words(s: &str) -> isize {
- s.split_whitespace().count() as isize
-}
-
-/// Compares `string` obtained from the user against the full name of a `command`,
-/// using this algorithm:
-///
-/// 1. Divide `command` into words `c[0]` through `c[n - 1]`.
-///
-/// 2. Divide `string` into words `s[0]` through `s[m - 1]`.
-///
-/// 3. Compare word `c[i]` against `s[i]` for `0 <= i < min(n, m)`, using the keyword
-/// matching algorithm implemented by lex_id_match(). If any of them fail to
-/// match, then `string` does not match `command` and the function returns false.
-///
-/// 4. Otherwise, `string` and `command` match. Set *MISSING_WORDS to n - m. Set
-/// *EXACT to false if any of the S[i] were found to be abbreviated in the
-/// comparisons done in step 3, or to true if they were all exactly equal
-/// (modulo case). Return true.
-pub fn command_match(command: &str, string: &str) -> Option<Match> {
- let mut command_words = command.split_whitespace();
- let mut string_words = string.split_whitespace();
- let mut exact = true;
- loop {
- let Some(cw) = command_words.next() else {
- return Some(Match {
- exact,
- missing_words: -(string_words.count() as isize),
- });
- };
- let Some(sw) = string_words.next() else {
- return Some(Match {
- exact,
- missing_words: 1 + command_words.count() as isize,
- });
- };
- if !id_match_n_nonstatic(cw, sw, 3) {
- return None;
- }
- if sw.len() < cw.len() {
- exact = false;
- }
- }
-}
-
-/// Matches a string against a collection of command names.
-pub struct CommandMatcher<'a, T> {
- string: &'a str,
- extensible: bool,
- exact_match: Option<T>,
- n_matches: usize,
- match_: Option<T>,
- match_missing_words: isize,
-}
-
-impl<'a, T> CommandMatcher<'a, T> {
- pub fn new(string: &'a str) -> Self {
- Self {
- string,
- extensible: false,
- exact_match: None,
- n_matches: 0,
- match_: None,
- match_missing_words: 0,
- }
- }
-
- /// Consider `command` as a candidate for the command name being parsed. If
- /// `command` is the correct command name, then [Self::get_match] will
- /// return `aux` later.
- pub fn add(&mut self, command: &str, aux: T) {
- if let Some(Match {
- missing_words,
- exact,
- }) = command_match(command, self.string)
- {
- if missing_words > 0 {
- self.extensible = true;
- } else if exact && missing_words == 0 {
- self.exact_match = Some(aux);
- } else {
- if missing_words > self.match_missing_words {
- self.n_matches = 0;
- }
- if missing_words >= self.match_missing_words || self.n_matches == 0 {
- self.n_matches += 1;
- self.match_ = Some(aux);
- self.match_missing_words = missing_words;
- }
- }
- }
- }
-
- pub fn get_match(self) -> (Option<T>, isize) {
- if self.extensible {
- (None, 1)
- } else if let Some(exact_match) = self.exact_match {
- (Some(exact_match), 0)
- } else if self.n_matches == 1 {
- (self.match_, self.match_missing_words)
- } else {
- (None, self.match_missing_words)
- }
- }
-}
-
-pub const COMMAND_NAMES: &'static [&'static str] = &[
- "2SLS",
- "ACF",
- "ADD DOCUMENT",
- "ADD FILES",
- "ADD VALUE LABELS",
- "AGGREGATE",
- "ALSCAL",
- "ANACOR",
- "ANOVA",
- "APPLY DICTIONARY",
- "AUTORECODE",
- "BEGIN DATA",
- "BREAK",
- "CACHE",
- "CASEPLOT",
- "CASESTOVARS",
- "CATPCA",
- "CATREG",
- "CCF",
- "CD",
- "CLEAR TRANSFORMATIONS",
- "CLOSE FILE HANDLE",
- "CLUSTER",
- "COMPUTE",
- "CONJOINT",
- "CORRELATIONS",
- "CORRESPONDENCE",
- "COUNT",
- "COXREG",
- "CREATE",
- "CROSSTABS",
- "CSDESCRIPTIVES",
- "CSGLM",
- "CSLOGISTIC",
- "CSPLAN",
- "CSSELECT",
- "CSTABULATE",
- "CTABLES",
- "CURVEFIT",
- "DATA LIST",
- "DATAFILE ATTRIBUTE",
- "DATASET ACTIVATE",
- "DATASET CLOSE",
- "DATASET COPY",
- "DATASET DECLARE",
- "DATASET DISPLAY",
- "DATASET NAME",
- "DATE",
- "DEBUG EVALUATE",
- "DEBUG EXPAND",
- "DEBUG FLOAT FORMAT",
- "DEBUG FORMAT GUESSER",
- "DEBUG MATRIX READ",
- "DEBUG MOMENTS",
- "DEBUG PAPER SIZE",
- "DEBUG POOL",
- "DEBUG XFORM FAIL",
- "DEFINE",
- "DELETE VARIABLES",
- "DESCRIPTIVES",
- "DETECTANOMALY",
- "DISCRIMINANT",
- "DISPLAY MACROS",
- "DISPLAY VARIABLE SETS",
- "DISPLAY",
- "DO IF",
- "DO REPEAT",
- "DOCUMENT",
- "DROP DOCUMENTS",
- "ECHO",
- "EDIT",
- "ELSE IF",
- "ELSE",
- "END CASE",
- "END FILE TYPE",
- "END FILE",
- "END IF",
- "END LOOP",
- "END REPEAT",
- "ERASE",
- "EXAMINE",
- "EXECUTE",
- "EXIT",
- "EXPORT",
- "FACTOR",
- "FILE HANDLE",
- "FILE LABEL",
- "FILE TYPE",
- "FILTER",
- "FINISH",
- "FIT",
- "FLIP",
- "FORMATS",
- "FREQUENCIES",
- "GENLOG",
- "GET DATA",
- "GET TRANSLATE",
- "GET",
- "GGRAPH",
- "GLM",
- "GRAPH",
- "HILOGLINEAR",
- "HOMALS",
- "HOST",
- "IF",
- "IGRAPH",
- "IMPORT",
- "INCLUDE",
- "INFO",
- "INPUT PROGRAM",
- "INSERT",
- "KEYED DATA LIST",
- "KM",
- "LEAVE",
- "LIST",
- "LOGISTIC REGRESSION",
- "LOGLINEAR",
- "LOOP",
- "MANOVA",
- "MAPS",
- "MATCH FILES",
- "MATRIX DATA",
- "MATRIX",
- "MCONVERT",
- "MEANS",
- "MISSING VALUES",
- "MIXED",
- "MODEL CLOSE",
- "MODEL HANDLE",
- "MODEL LIST",
- "MODEL NAME",
- "MRSETS",
- "MULT RESPONSE",
- "MULTIPLE CORRESPONDENCE",
- "MVA",
- "N OF CASES",
- "N",
- "NAIVEBAYES",
- "NEW FILE",
- "NLR",
- "NOMREG",
- "NONPAR CORR",
- "NPAR TESTS",
- "NUMBERED",
- "NUMERIC",
- "OLAP CUBES",
- "OMS",
- "ONEWAY",
- "ORTHOPLAN",
- "OUTPUT MODIFY",
- "OVERALS",
- "PACF",
- "PARTIAL CORR",
- "PEARSON CORRELATIONS",
- "PERMISSIONS",
- "PLANCARDS",
- "PLUM",
- "POINT",
- "PPLOT",
- "PREDICT",
- "PREFSCAL",
- "PRESERVE",
- "PRINCALS",
- "PRINT EJECT",
- "PRINT FORMATS",
- "PRINT SPACE",
- "PRINT",
- "PROBIT",
- "PROCEDURE OUTPUT",
- "PROXIMITIES",
- "PROXSCAL",
- "Q",
- "QUICK CLUSTER",
- "QUIT",
- "RANK",
- "RATIO STATISTICS",
- "READ MODEL",
- "RECODE",
- "RECORD TYPE",
- "REFORMAT",
- "REGRESSION",
- "RELIABILITY",
- "RENAME VARIABLES",
- "REPEATING DATA",
- "REPORT",
- "REREAD",
- "RESTORE",
- "RMV",
- "ROC",
- "SAMPLE",
- "SAVE DATA COLLECTION",
- "SAVE TRANSLATE",
- "SAVE",
- "SCRIPT",
- "SEASON",
- "SELECT IF",
- "SELECTPRED",
- "SET",
- "SHOW",
- "SORT CASES",
- "SORT VARIABLES",
- "SPCHART",
- "SPECTRA",
- "SPLIT FILE",
- "STEMLEAF",
- "STRING",
- "SUBTITLE",
- "SUMMARIZE",
- "SURVIVAL",
- "SYSFILE INFO",
- "T-TEST",
- "TDISPLAY",
- "TEMPORARY",
- "TITLE",
- "TREE",
- "TSAPPLY",
- "TSET",
- "TSHOW",
- "TSMODEL",
- "TSPLOT",
- "TWOSTEP CLUSTER",
- "UNIANOVA",
- "UNNUMBERED",
- "UPDATE",
- "USE",
- "VALIDATEDATA",
- "VALUE LABELS",
- "VARCOMP",
- "VARIABLE ALIGNMENT",
- "VARIABLE ATTRIBUTE",
- "VARIABLE LABELS",
- "VARIABLE LEVEL",
- "VARIABLE ROLE",
- "VARIABLE WIDTH",
- "VARSTOCASES",
- "VECTOR",
- "VERIFY",
- "WEIGHT",
- "WLS",
- "WRITE FORMATS",
- "WRITE",
- "XEXPORT",
- "XGRAPH",
- "XSAVE",
-];
+++ /dev/null
-use std::{
- borrow::{Borrow, Cow},
- collections::{HashMap, VecDeque},
- fmt::Write,
- fs,
- io::Result as IoResult,
- mem,
- ops::{Range, RangeInclusive},
- path::Path,
- sync::Arc,
-};
-
-use chardetng::EncodingDetector;
-use encoding_rs::{Encoding, UTF_8};
-use thiserror::Error as ThisError;
-use unicode_width::{UnicodeWidthChar, UnicodeWidthStr};
-
-use crate::{
- macros::{macro_tokens_to_syntax, MacroSet, ParseStatus, Parser},
- message::{Category, Diagnostic, Location, Point, Severity},
- prompt::PromptStyle,
- settings::Settings,
-};
-
-use super::{
- scan::{MergeResult, ScanError, ScanToken},
- segment::{Mode, Segment, Segmenter},
- token::Token,
-};
-
-/// Error handling for a [`Reader`].
-#[derive(Copy, Clone, Debug, Default, PartialEq, Eq)]
-pub enum ErrorHandling {
- /// Discard input line and continue reading.
- Terminal,
-
- /// Continue to next command, except for cascading failures.
- #[default]
- Continue,
-
- /// Continue, even for cascading failures.
- Ignore,
-
- /// Stop processing,
- Stop,
-}
-
-/// # Token pipeline
-///
-/// Tokens pass through a pipeline with the following stages. Each token
-/// eventually made available to the parser passes through of these stages.
-/// The stages are named after the processing that happens in each one.
-///
-/// Initially, tokens come from the segmenter and scanner to `pp`:
-///
-/// - `pp`: Tokens that need to pass through the macro preprocessor to end up
-/// in `merge`.
-///
-/// - `merge`: Tokens that need to pass through
-/// [`super::scan::ScanToken::merge`] to end up in `parse`.
-///
-/// - `parse`: Tokens available to the client for parsing.
-///
-/// `pp` and `merge` store tokens only temporarily until they pass into `parse`.
-/// Tokens then live in `parse` until the command is fully consumed, at which
-/// time they are freed together.
-pub struct Source {
- /// Error-handling mode.
- error_handling: ErrorHandling,
-
- /// Encoding.
- encoding: &'static Encoding,
-
- /// `None` if this reader is not associated with a file.
- file_name: Option<Arc<String>>,
-
- /// True if we've reached EOF already.
- eof: bool,
-
- /// Read some input from the source. If successful, returns the input that
- /// was read. At end of file or on error, returns an empty string.
- ///
- /// `prompt` provides a hint to interactive readers as to what kind of
- /// syntax is being read right now.
- read: Box<dyn Fn(PromptStyle) -> String>,
-
- /// Source file contents.
- buffer: String,
-
- /// 0-based line number of the first line not yet written to the journal.
- journal_line: usize,
-
- /// Byte offset of first character not yet scanned as token.
- seg_pos: usize,
-
- /// Byte offsets into `buffer` of starts of lines. The first element is 0.
- lines: Vec<usize>,
-
- /// Tokens that need to pass through the macro preprocessor to end up in
- /// `merge`.
- pp: VecDeque<LexToken>,
-
- /// Tokens that need to pass through [`super::scan::ScanToken::merge`] to
- /// end up in `parse`.
- merge: VecDeque<LexToken>,
-
- /// Tokens available to the client for parsing.
- parse: Vec<LexToken>,
-
- /// Offset in `parse` of the current token.
- parse_ofs: usize,
-
- segmenter: Segmenter,
-
- suppress_next_newline: bool,
-}
-
-impl Default for Source {
- fn default() -> Self {
- Self {
- error_handling: ErrorHandling::default(),
- encoding: UTF_8,
- file_name: None,
- eof: false,
- read: Box::new(|_| String::new()),
- buffer: String::new(),
- journal_line: 0,
- seg_pos: 0,
- lines: vec![0],
- pp: VecDeque::new(),
- merge: VecDeque::new(),
- parse: Vec::new(),
- parse_ofs: 0,
- segmenter: Segmenter::new(Mode::default(), false),
- suppress_next_newline: false,
- }
- }
-}
-
-impl Source {
- pub fn for_file<P>(
- path: P,
- encoding: Option<&'static Encoding>,
- syntax: Mode,
- error_handling: ErrorHandling,
- ) -> IoResult<Self>
- where
- P: AsRef<Path>,
- {
- let bytes = fs::read(path.as_ref())?;
- let encoding = encoding.unwrap_or_else(|| {
- let mut encoding_detector = EncodingDetector::new();
- encoding_detector.feed(&bytes, true);
- encoding_detector.guess(None, true)
- });
- let (contents, _malformed) = encoding.decode_with_bom_removal(&bytes);
- Ok(Self::for_file_contents(
- contents.to_string(),
- Some(path.as_ref().to_string_lossy().to_string()),
- encoding,
- syntax,
- error_handling,
- ))
- }
-
- pub fn for_file_contents(
- contents: String,
- file_name: Option<String>,
- encoding: &'static Encoding,
- syntax: Mode,
- error_handling: ErrorHandling,
- ) -> Self {
- Self {
- buffer: contents,
- file_name: file_name.map(Arc::new),
- encoding,
- error_handling,
- segmenter: Segmenter::new(syntax, false),
- ..Self::default()
- }
- }
-
- pub fn for_string(contents: String, encoding: &'static Encoding) -> Self {
- Self {
- buffer: contents,
- encoding,
- ..Self::default()
- }
- }
-
- pub fn for_function(
- read: Box<dyn Fn(PromptStyle) -> String>,
- file_name: Option<String>,
- encoding: &'static Encoding,
- syntax: Mode,
- error_handling: ErrorHandling,
- ) -> Self {
- Self {
- read,
- file_name: file_name.map(Arc::new),
- encoding,
- segmenter: Segmenter::new(syntax, false),
- error_handling,
- ..Self::default()
- }
- }
-
- fn read(&mut self) {
- loop {
- let prompt = self.segmenter.prompt();
- let s = (self.read)(prompt);
- if s.is_empty() {
- self.eof = true;
- return;
- }
- self.buffer.push_str(&s);
- if self.buffer[self.seg_pos..].contains('\n') {
- return;
- }
- }
- }
- fn try_get_pp(&mut self, context: &Context) -> bool {
- let (seg_len, seg_type) = loop {
- if let Ok(result) = self.segmenter.push(&self.buffer[self.seg_pos..], self.eof) {
- break result;
- }
-
- debug_assert!(!self.eof);
- self.read();
- };
-
- let pos = self.seg_pos..self.seg_pos + seg_len;
- self.seg_pos += seg_len;
- if seg_type == Segment::Newline {
- self.lines.push(self.seg_pos);
- }
-
- let scan_token = ScanToken::from_segment(&self.buffer[pos.clone()], seg_type);
-
- let n_lines = match (seg_type, self.suppress_next_newline) {
- (Segment::EndCommand, false) => {
- self.suppress_next_newline = true;
- 1
- }
- (Segment::Newline, true) => {
- self.suppress_next_newline = false;
- 0
- }
- (Segment::Newline, false) => 1,
- _ => 0,
- };
- for line_num in self.journal_line..self.journal_line + n_lines {
- let start_ofs = self.lines[line_num];
- let end_ofs = self
- .lines
- .get(line_num + 1)
- .copied()
- .unwrap_or(self.buffer.len());
- let line = &self.buffer[start_ofs..end_ofs];
- let _line = line
- .strip_suffix("\r\n")
- .unwrap_or(line.strip_suffix('\n').unwrap_or(line));
- // XXX submit the line as syntax
- }
- self.journal_line += n_lines;
-
- let pos = pos.start..pos.end;
- match scan_token {
- None => false,
- Some(ScanToken::Token(Token::End)) => {
- self.pp.push_back(LexToken {
- token: Token::EndCommand,
- pos,
- macro_rep: None,
- });
- self.eof = true;
- true
- }
- Some(ScanToken::Token(token)) => {
- self.pp.push_back(LexToken {
- token,
- pos,
- macro_rep: None,
- });
- true
- }
- Some(ScanToken::Error(error)) => {
- (context.error)(
- Location {
- file_name: self.file_name.clone(),
- span: Some(self.offset_to_point(pos.start)..self.offset_to_point(pos.end)),
- omit_underlines: false,
- },
- error.into(),
- );
- false
- }
- }
- }
-
- fn get_pp(&mut self, context: &Context) -> bool {
- while !self.eof {
- if self.try_get_pp(context) {
- return true;
- }
- }
- false
- }
-
- fn try_get_merge(&mut self, context: &Context) -> bool {
- if self.pp.is_empty() && !self.get_pp(context) {
- return false;
- }
-
- if !Settings::global().macros.expand {
- self.merge.append(&mut self.pp);
- return true;
- }
-
- // Now pass tokens one-by-one to the macro expander.
- let Some(mut parser) = Parser::new(context.macros, &self.pp[0].token) else {
- // Common case where there is no macro to expand.
- self.merge.push_back(self.pp.pop_front().unwrap());
- return true;
- };
- for ofs in 1.. {
- if self.pp.len() <= ofs && !self.get_pp(context) {
- // This should not be reachable because we always get a
- // `Token::EndCommand` at the end of an input file, which should
- // always terminate macro expansion.
- unreachable!();
- }
- let token = &self.pp[ofs];
- if parser.push(&token.token, &self.buffer[token.pos.clone()], &|e| {
- println!("{e:?}")
- }) == ParseStatus::Complete
- {
- break;
- }
- }
- let call = parser.finish();
- if call.len() == 0 {
- // False alarm: no macro to expand after all.
- self.merge.push_back(self.pp.pop_front().unwrap());
- return true;
- }
-
- // Expand the tokens.
- let c0 = &self.pp[0];
- let c1 = &self.pp[call.len() - 1];
- let mut expansion = Vec::new();
- call.expand(
- self.segmenter.mode(),
- self.token_location(c0..=c1),
- &mut expansion,
- |e| println!("{e:?}"),
- );
- let retval = !expansion.is_empty();
-
- if Settings::global().macros.print_expansions {
- // XXX
- }
-
- // Append the macro expansion tokens to the lookahead.
- let mut macro_rep = String::new();
- let mut pos = Vec::with_capacity(expansion.len());
- for [prefix, token] in macro_tokens_to_syntax(expansion.as_slice()) {
- macro_rep.push_str(prefix);
- let len = macro_rep.len();
- pos.push(len..=len + token.len() - 1);
- }
- let macro_rep = Arc::new(macro_rep);
- for (index, token) in expansion.into_iter().enumerate() {
- let lt = LexToken {
- token: token.token,
- pos: c0.pos.start..c1.pos.end,
- macro_rep: Some(MacroRepresentation {
- expansion: Arc::clone(¯o_rep),
- pos: pos[index].clone(),
- }),
- };
- self.merge.push_back(lt);
- }
- self.pp.drain(..call.len());
- retval
- }
-
- /// Attempts to obtain at least one new token into `self.merge`.
- ///
- /// Returns true if successful, false on failure. In the latter case, this source
- /// exhausted and 'self.eof' is now true.
- fn get_merge(&mut self, context: &Context) -> bool {
- while !self.eof {
- if self.try_get_merge(context) {
- return true;
- }
- }
- false
- }
-
- fn get_parse__(&mut self, context: &Context) -> bool {
- for i in 0.. {
- if self.merge.len() <= i && !self.get_merge(context) {
- // We always get a `Token::EndCommand` at the end of an input
- // file and the merger should return `Some(...)` for that token.
- debug_assert_eq!(self.merge.len(), 0);
- return false;
- }
-
- match ScanToken::merge(&self.merge) {
- None => (),
- Some(MergeResult::Copy) => {
- self.parse.push(self.merge.pop_front().unwrap());
- return true;
- }
- Some(MergeResult::Expand { n, token }) => {
- let first = &self.merge[0];
- let last = &self.merge[n - 1];
- self.parse.push(LexToken {
- token,
- pos: first.pos.start..last.pos.end,
- macro_rep: match (&first.macro_rep, &last.macro_rep) {
- (Some(a), Some(b)) if Arc::ptr_eq(&a.expansion, &b.expansion) => {
- Some(MacroRepresentation {
- expansion: a.expansion.clone(),
- pos: *a.pos.start()..=*b.pos.end(),
- })
- }
- _ => None,
- },
- });
- self.merge.drain(..n);
- return true;
- }
- }
- }
- unreachable!();
- }
-
- fn get_parse(&mut self, context: &Context) -> bool {
- // XXX deal with accumulated messages
- self.get_parse__(context)
- }
-
- fn offset_to_point(&self, offset: usize) -> Point {
- let line = self
- .lines
- .partition_point(|&line_start| line_start <= offset);
- Point {
- line: line as i32,
- column: Some(
- self.buffer
- .get(self.lines[line - 1]..offset)
- .unwrap_or_default()
- .width() as i32
- + 1,
- ),
- }
- }
-
- /// Returns the syntax for 1-based line-number `line_number`.
- fn get_line(&self, line_number: i32) -> &str {
- if (1..=self.lines.len() as i32).contains(&line_number) {
- let line_number = line_number as usize;
- let start = self.lines[line_number - 1];
- let end = self.lines.get(line_number).copied().unwrap_or(
- self.buffer[start..]
- .find('\n')
- .map(|ofs| ofs + start)
- .unwrap_or(self.buffer.len()),
- );
- let line = &self.buffer[start..end];
- line.strip_suffix("\r\n")
- .unwrap_or(line.strip_suffix('\n').unwrap_or(line))
- } else {
- ""
- }
- }
-
- fn token_location(&self, range: RangeInclusive<&LexToken>) -> Location {
- Location {
- file_name: self.file_name.clone(),
- span: Some(
- self.offset_to_point(range.start().pos.start)
- ..self.offset_to_point(range.end().pos.end),
- ),
- omit_underlines: false,
- }
- }
-
- fn ofs_location(&self, range: RangeInclusive<usize>) -> Location {
- if *range.start() <= *range.end() && *range.end() < self.parse.len() {
- self.token_location(&self.parse[*range.start()]..=&self.parse[*range.end()])
- } else {
- Location {
- file_name: self.file_name.clone(),
- span: None,
- omit_underlines: false,
- }
- }
- }
-
- fn token(&self) -> &Token {
- &self.parse[self.parse_ofs].token
- }
-
- fn next(&mut self, offset: isize, context: &Context) -> &Token {
- let Some(index) = offset.checked_add(self.parse_ofs as isize) else {
- return &Token::EndCommand;
- };
- let Ok(index) = usize::try_from(index) else {
- return &Token::EndCommand;
- };
-
- while index >= self.parse.len() {
- if let Some(token) = self.parse.last() {
- match token.token {
- Token::End => return &Token::End,
- Token::EndCommand => return &Token::EndCommand,
- _ => (),
- }
- }
- self.get_parse(context);
- }
- &self.parse[index].token
- }
-
- /// If the tokens in `ofs` contains a macro call, this returns the raw
- /// syntax for the macro call (not for the expansion) and for any other
- /// tokens included in that range. The syntax is encoded in UTF-8 and in
- /// the original form supplied to the lexer so that, for example, it may
- /// include comments, spaces, and new-lines if it spans multiple tokens.
- ///
- /// Returns `None` if the token range doesn't include a macro call.
- fn get_macro_call(&self, ofs: RangeInclusive<usize>) -> Option<&str> {
- if self
- .parse
- .get(ofs.clone())
- .unwrap_or_default()
- .iter()
- .all(|token| token.macro_rep.is_none())
- {
- return None;
- }
-
- let token0 = &self.parse[*ofs.start()];
- let token1 = &self.parse[*ofs.end()];
- Some(&self.buffer[token0.pos.start..token1.pos.end])
- }
-
- fn is_empty(&self) -> bool {
- self.buffer.is_empty() && self.eof
- }
-
- fn diagnostic(
- &self,
- severity: Severity,
- ofs: RangeInclusive<usize>,
- text: String,
- ) -> Diagnostic {
- let mut s = String::with_capacity(text.len() + 16);
- if self.is_empty() {
- s.push_str("At end of input: ");
- } else if let Some(call) = self.get_macro_call(ofs.clone()) {
- write!(&mut s, "In syntax expanded from `{}`: ", ellipsize(call)).unwrap();
- }
-
- if !text.is_empty() {
- s.push_str(&text);
- } else {
- s.push_str("Syntax error.");
- }
-
- if !s.ends_with('.') {
- s.push('.');
- }
-
- let location = self.ofs_location(ofs);
- let mut source = Vec::new();
- if let Some(Range {
- start: Point { line: l0, .. },
- end: Point { line: l1, .. },
- }) = location.span
- {
- let lines = if l1 - l0 > 3 {
- vec![l0, l0 + 1, l1]
- } else {
- (l0..=l1).collect()
- };
- for line_number in lines {
- source.push((line_number, self.get_line(line_number).to_string()));
- }
- }
-
- Diagnostic {
- category: Category::Syntax,
- severity,
- location,
- source,
- stack: Vec::new(),
- command_name: None, // XXX
- text: s,
- }
- }
-
- fn interactive_reset(&mut self) {
- if self.error_handling == ErrorHandling::Terminal {
- let Source {
- error_handling,
- encoding,
- read,
- ..
- } = mem::take(self);
- *self = Self {
- error_handling,
- encoding,
- read,
- ..Source::default()
- };
- }
- }
-}
-
-fn ellipsize(s: &str) -> Cow<str> {
- if s.width() > 64 {
- let mut out = String::new();
- let mut width = 0;
- for c in s.chars() {
- out.push(c);
- width += c.width().unwrap_or(0);
- if width > 64 {
- break;
- }
- }
- out.push_str("...");
- Cow::from(out)
- } else {
- Cow::from(s)
- }
-}
-
-/// A token in a [`Source`].
-struct LexToken {
- /// The regular token.
- token: Token,
-
- /// For a token obtained through the lexer in an ordinary way, this is the
- /// location of the token in the [`Source`]'s buffer.
- ///
- /// For a token produced through macro expansion, this is the entire macro
- /// call.
- pos: Range<usize>,
-
- /// For a token obtained through macro expansion, the part of the macro
- /// expansion that represents this token.
- ///
- /// For a token obtained through the lexer in an ordinary way, this is
- /// `None`.
- macro_rep: Option<MacroRepresentation>,
-}
-
-impl Borrow<Token> for LexToken {
- fn borrow(&self) -> &Token {
- &self.token
- }
-}
-
-struct MacroRepresentation {
- /// An entire macro expansion.
- expansion: Arc<String>,
-
- /// The substring of `expansion` that represents a single token.
- pos: RangeInclusive<usize>,
-}
-
-pub struct Lexer {
- source: Source,
- stack: Vec<Source>,
- macros: MacroSet,
- error: Box<dyn Fn(Location, Error)>,
-}
-
-struct Context<'a> {
- macros: &'a MacroSet,
- error: &'a Box<dyn Fn(Location, Error)>,
-}
-
-impl Lexer {
- pub fn new(error: Box<dyn Fn(Location, Error)>) -> Self {
- Self {
- source: Source::default(),
- stack: Vec::new(),
- macros: HashMap::new(),
- error,
- }
- }
-
- pub fn get(&mut self) -> &Token {
- if self.source.parse_ofs < self.source.parse.len() {
- if let Token::EndCommand = self.source.token() {
- self.source.parse.clear();
- self.source.parse_ofs = 0;
- } else {
- self.source.parse_ofs += 1;
- }
- }
-
- while self.source.parse_ofs == self.source.parse.len() {
- let context = Context {
- macros: &self.macros,
- error: &self.error,
- };
- if !self.source.get_parse(&context) && !self.pop_stack() {
- return &Token::End;
- }
- }
- self.source.token()
- }
-
- fn pop_stack(&mut self) -> bool {
- if let Some(new_source) = self.stack.pop() {
- self.source = new_source;
- true
- } else {
- self.source = Source::default();
- self.source.parse.push(LexToken {
- token: Token::End,
- pos: 0..0,
- macro_rep: None,
- });
- false
- }
- }
-
- /// Inserts `source` so that the next token comes from it. This is only
- /// permitted when the lexer is either empty or at `Token::EndCommand`.
- pub fn include(&mut self, mut source: Source) {
- // XXX what's the right assertion?
- let context = Context {
- macros: &self.macros,
- error: &self.error,
- };
- source.get_parse(&context);
- let old_source = mem::replace(&mut self.source, source);
- self.stack.push(old_source);
- }
-
- /// Inserts `source` so that it will be read after all the other sources.
- pub fn append(&mut self, mut source: Source) {
- let context = Context {
- macros: &self.macros,
- error: &self.error,
- };
- source.get_parse(&context);
- self.stack.insert(0, source);
- }
-
- pub fn token(&self) -> &Token {
- self.source.token()
- }
-
- pub fn next(&mut self, offset: isize) -> &Token {
- let context = Context {
- macros: &self.macros,
- error: &self.error,
- };
- self.source.next(offset, &context)
- }
-
- pub fn error<S>(&self, text: S) -> Diagnostic
- where
- S: ToString,
- {
- self.diagnostic(
- Severity::Error,
- self.source.parse_ofs..=self.source.parse_ofs,
- text,
- )
- }
-
- pub fn diagnostic<S>(
- &self,
- severity: Severity,
- ofs: RangeInclusive<usize>,
- text: S,
- ) -> Diagnostic
- where
- S: ToString,
- {
- self.source.diagnostic(severity, ofs, text.to_string())
- }
-
- pub fn error_handling(&self) -> ErrorHandling {
- self.source.error_handling
- }
-
- /// Discards all lookahead tokens, then discards all input sources
- /// until it encounters one with error mode [ErrorHandling::Terminal] or until it
- /// runs out of input sources.
- pub fn discard_noninteractive(&mut self) {
- while self.source.error_handling != ErrorHandling::Ignore {
- self.source.pp.clear();
- self.source.merge.clear();
- self.source.parse.clear();
- self.source.parse_ofs = 0;
-
- if self.source.error_handling == ErrorHandling::Terminal || !self.pop_stack() {
- return;
- }
- }
- }
-
- /// If the source that the lexer is currently reading has error mode
- /// [ErrorHandling::Terminal], discards all buffered input and tokens, so
- /// that the next token to be read comes directly from whatever is next read
- /// from the stream.
- ///
- /// It makes sense to call this function after encountering an error in a
- /// command entered on the console, because usually the user would prefer
- /// not to have cascading errors.
- pub fn interactive_reset(&mut self) {
- self.source.interactive_reset()
- }
-
- /// Advances past any tokens up to [Token::EndCommand] or [Token::End].
- pub fn discard_rest_of_command(&mut self) {
- while !matches!(self.token(), Token::EndCommand | Token::End) {
- self.get();
- }
- }
-}
-
-#[derive(ThisError, Clone, Debug, PartialEq, Eq)]
-pub enum Error {
- /// Error forming tokens from the input.
- #[error("{0}")]
- TokenError(#[from] ScanError),
-}
-
-#[cfg(test)]
-mod tests {
- use encoding_rs::UTF_8;
-
- use crate::lex::{segment::Mode, token::Token};
-
- use super::{ErrorHandling, Lexer, Source};
-
- #[test]
- fn test() {
- let mut lexer = Lexer::new(Box::new(|location, error| println!("{location}: {error}")));
- lexer.include(Source::for_string(
- String::from(
- r#"#! /usr/local/bin/pspp
-DATA LIST LIST NOTABLE /a.
-BEGIN DATA.
-1
-2
-END DATA.
-LIST.
-"#,
- ),
- UTF_8,
- ));
- loop {
- lexer.get();
- let token = lexer.token();
- println!("{token:?}");
- if let Token::End = token {
- break;
- }
- }
- }
-
- #[test]
- fn test_scan_errors() {
- let mut lexer = Lexer::new(Box::new(|location, error| println!("{location}: {error}")));
- lexer.include(Source::for_file_contents(
- String::from(
- r#"x'123'
-x'1x'
-u''
-u'012345678'
-u'd800'
-u'110000'
-'foo
-'very long unterminated string that be ellipsized in its error message
-1e .x
-^
-�
-"#,
- ),
- Some(String::from("syntax.sps")),
- UTF_8,
- Mode::default(),
- ErrorHandling::default(),
- ));
- loop {
- lexer.get();
- let token = lexer.token();
- println!("{token:?}");
- if let Token::End = token {
- break;
- }
- }
- }
-
- #[test]
- fn test_null_byte() {
- let mut lexer = Lexer::new(Box::new(|location, error| println!("{location}: {error}")));
- lexer.include(Source::for_file_contents(
- String::from(
- "datA dist list notable file='input.txt'/a b c.
-lis|.\0",
- ),
- Some(String::from("syntax.sps")),
- UTF_8,
- Mode::default(),
- ErrorHandling::default(),
- ));
- loop {
- lexer.get();
- let token = lexer.token();
- println!("{token:?}");
- if let Token::End = token {
- break;
- }
- }
- }
-}
+++ /dev/null
-//! PSPP syntax scanning.
-//!
-//! PSPP divides traditional "lexical analysis" or "tokenization" into two
-//! phases: a lower-level phase called "segmentation" and a higher-level phase
-//! called "scanning". [super::segment] implements the segmentation phase and
-//! this module the scanning phase.
-//!
-//! Scanning accepts as input a stream of segments, which are UTF-8 strings each
-//! labeled with a segment type. It outputs a stream of "scan tokens", which
-//! are the same as the tokens used by the PSPP parser with a few additional
-//! types.
-
-pub mod segment;
-pub mod scan;
-pub mod command_name;
-pub mod token;
-pub mod lexer;
+++ /dev/null
-//! PSPP lexical analysis.
-//!
-//! PSPP divides traditional "lexical analysis" or "tokenization" into two
-//! phases: a lower-level phase called "segmentation" and a higher-level phase
-//! called "scanning". [segment] implements the segmentation phase and [scan]
-//! the scanning phase.
-//!
-//! Scanning accepts as input a stream of segments, which are UTF-8 strings each
-//! labeled with a segment type. It outputs a stream of "scan tokens", which
-//! are the same as the tokens used by the PSPP parser with a few additional
-//! types.
-
-use crate::identifier::{Identifier, ReservedWord};
-
-use super::{
- segment::{Mode, Segment, Segmenter},
- token::{Punct, Token},
-};
-use std::{borrow::Borrow, collections::VecDeque};
-use thiserror::Error as ThisError;
-
-#[derive(ThisError, Clone, Debug, PartialEq, Eq)]
-pub enum ScanError {
- /// Unterminated string constant.
- #[error("Unterminated string constant.")]
- ExpectedQuote,
-
- /// Missing exponent.
- #[error("Missing exponent following `{0}`")]
- ExpectedExponent(String),
-
- /// Odd length hex string.
- #[error("String of hex digits has {0} characters, which is not a multiple of 2.")]
- OddLengthHexString(usize),
-
- /// Invalid hex digit.
- #[error("Invalid hex digit {0:?}.")]
- BadHexDigit(char),
-
- /// Incomplete UTF-8 sequence.
- #[error("Incomplete UTF-8 sequence `{substring}` starting {offset} digits into hex string.")]
- IncompleteUtf8 { substring: String, offset: usize },
-
- /// Bad UTF-8 sequence.
- #[error("Invalid UTF-8 sequence `{substring}` starting {offset} digits into hex string.")]
- BadUtf8 { substring: String, offset: usize },
-
- /// Invalid length Unicode string.
- #[error("Unicode string contains {0} bytes, which is not in the valid range of 1 to 8 bytes.")]
- BadLengthUnicodeString(usize),
-
- /// Invalid code point.
- #[error("U+{0:04X} is not a valid Unicode code point.")]
- BadCodePoint(u32),
-
- /// Expected hexadecimal Unicode code point
- #[error("Expected hexadecimal Unicode code point.")]
- ExpectedCodePoint,
-
- /// `DO REPEAT` nested too deeply.
- #[error("`DO REPEAT` nested too deeply.")]
- DoRepeatOverflow,
-
- /// Unexpected character.
- #[error("Unexpected character {0:?} in input.")]
- UnexpectedChar(char),
-}
-
-/// The input or output to token merging.
-#[derive(Clone, Debug, PartialEq)]
-pub enum ScanToken {
- Token(Token),
- Error(ScanError),
-}
-
-/// The result of merging tokens.
-#[derive(Clone, Debug)]
-pub enum MergeResult {
- /// Copy one token literally from input to output.
- Copy,
-
- /// Expand `n` tokens from the input into `token` in the output.
- Expand {
- /// Number of tokens to expand.
- n: usize,
-
- /// Replacement token.
- token: Token,
- },
-}
-
-impl ScanToken {
- pub fn from_segment(s: &str, segment: Segment) -> Option<Self> {
- match segment {
- Segment::Number => Some(Self::Token(Token::Number(s.parse().unwrap()))),
- Segment::QuotedString => {
- // Trim quote mark from front and back.
- let mut chars = s.chars();
- let quote = chars.next().unwrap();
- let s = chars.as_str().strip_suffix(quote).unwrap();
-
- // Replace doubled quotes by single ones.
- let (single_quote, double_quote) = match quote {
- '\'' => ("'", "''"),
- '"' => ("\"", "\"\""),
- _ => unreachable!(),
- };
- Some(Self::Token(Token::String(
- s.replace(double_quote, single_quote),
- )))
- }
- Segment::HexString => {
- // Strip `X"` prefix and `"` suffix (or variations).
- let s = &s[2..s.len() - 1];
- for c in s.chars() {
- if !c.is_ascii_hexdigit() {
- return Some(Self::Error(ScanError::BadHexDigit(c)));
- }
- }
- if s.len() % 2 != 0 {
- return Some(Self::Error(ScanError::OddLengthHexString(s.len())));
- }
- let bytes = s
- .as_bytes()
- .chunks_exact(2)
- .map(|pair| {
- let hi = char::from(pair[0]).to_digit(16).unwrap() as u8;
- let lo = char::from(pair[1]).to_digit(16).unwrap() as u8;
- hi * 16 + lo
- })
- .collect::<Vec<_>>();
- match String::from_utf8(bytes) {
- Ok(string) => Some(Self::Token(Token::String(string))),
- Err(error) => {
- let details = error.utf8_error();
- let offset = details.valid_up_to() * 2;
- let end = details
- .error_len()
- .map(|len| offset + len * 2)
- .unwrap_or(s.len());
- let substring = String::from(&s[offset..end]);
- Some(Self::Error(if details.error_len().is_some() {
- ScanError::BadUtf8 { substring, offset }
- } else {
- ScanError::IncompleteUtf8 { substring, offset }
- }))
- }
- }
- }
- Segment::UnicodeString => {
- // Strip `U"` prefix and `"` suffix (or variations).
- let s = &s[2..s.len() - 1];
- if !(1..=8).contains(&s.len()) {
- return Some(Self::Error(ScanError::BadLengthUnicodeString(s.len())));
- }
- let Ok(code_point) = u32::from_str_radix(s, 16) else {
- return Some(Self::Error(ScanError::ExpectedCodePoint));
- };
- let Some(c) = char::from_u32(code_point) else {
- return Some(Self::Error(ScanError::BadCodePoint(code_point)));
- };
- Some(Self::Token(Token::String(String::from(c))))
- }
-
- Segment::UnquotedString
- | Segment::DoRepeatCommand
- | Segment::InlineData
- | Segment::Document
- | Segment::MacroBody
- | Segment::MacroName => Some(Self::Token(Token::String(String::from(s)))),
-
- Segment::Identifier => {
- if let Ok(reserved_word) = ReservedWord::try_from(s) {
- match reserved_word {
- ReservedWord::And => Some(Self::Token(Token::Punct(Punct::And))),
- ReservedWord::Or => Some(Self::Token(Token::Punct(Punct::Or))),
- ReservedWord::Not => Some(Self::Token(Token::Punct(Punct::Not))),
- ReservedWord::Eq => Some(Self::Token(Token::Punct(Punct::Eq))),
- ReservedWord::Ge => Some(Self::Token(Token::Punct(Punct::Ge))),
- ReservedWord::Gt => Some(Self::Token(Token::Punct(Punct::Gt))),
- ReservedWord::Le => Some(Self::Token(Token::Punct(Punct::Le))),
- ReservedWord::Lt => Some(Self::Token(Token::Punct(Punct::Lt))),
- ReservedWord::Ne => Some(Self::Token(Token::Punct(Punct::Ne))),
- ReservedWord::All => Some(Self::Token(Token::Punct(Punct::All))),
- ReservedWord::By => Some(Self::Token(Token::Punct(Punct::By))),
- ReservedWord::To => Some(Self::Token(Token::Punct(Punct::To))),
- ReservedWord::With => Some(Self::Token(Token::Punct(Punct::With))),
- }
- } else {
- Some(Self::Token(Token::Id(Identifier::new(s).unwrap())))
- }
- }
- Segment::Punct => match s {
- "(" => Some(Self::Token(Token::Punct(Punct::LParen))),
- ")" => Some(Self::Token(Token::Punct(Punct::RParen))),
- "[" => Some(Self::Token(Token::Punct(Punct::LSquare))),
- "]" => Some(Self::Token(Token::Punct(Punct::RSquare))),
- "{" => Some(Self::Token(Token::Punct(Punct::LCurly))),
- "}" => Some(Self::Token(Token::Punct(Punct::RCurly))),
- "," => Some(Self::Token(Token::Punct(Punct::Comma))),
- "=" => Some(Self::Token(Token::Punct(Punct::Equals))),
- "-" => Some(Self::Token(Token::Punct(Punct::Dash))),
- "&" => Some(Self::Token(Token::Punct(Punct::And))),
- "|" => Some(Self::Token(Token::Punct(Punct::Or))),
- "+" => Some(Self::Token(Token::Punct(Punct::Plus))),
- "/" => Some(Self::Token(Token::Punct(Punct::Slash))),
- "*" => Some(Self::Token(Token::Punct(Punct::Asterisk))),
- "<" => Some(Self::Token(Token::Punct(Punct::Lt))),
- ">" => Some(Self::Token(Token::Punct(Punct::Gt))),
- "~" => Some(Self::Token(Token::Punct(Punct::Not))),
- ":" => Some(Self::Token(Token::Punct(Punct::Colon))),
- ";" => Some(Self::Token(Token::Punct(Punct::Semicolon))),
- "**" => Some(Self::Token(Token::Punct(Punct::Exp))),
- "<=" => Some(Self::Token(Token::Punct(Punct::Le))),
- "<>" => Some(Self::Token(Token::Punct(Punct::Ne))),
- "~=" => Some(Self::Token(Token::Punct(Punct::Ne))),
- ">=" => Some(Self::Token(Token::Punct(Punct::Ge))),
- "!" => Some(Self::Token(Token::Punct(Punct::Bang))),
- "%" => Some(Self::Token(Token::Punct(Punct::Percent))),
- "?" => Some(Self::Token(Token::Punct(Punct::Question))),
- "`" => Some(Self::Token(Token::Punct(Punct::Backtick))),
- "_" => Some(Self::Token(Token::Punct(Punct::Underscore))),
- "." => Some(Self::Token(Token::Punct(Punct::Dot))),
- "!*" => Some(Self::Token(Token::Punct(Punct::BangAsterisk))),
- _ => unreachable!("bad punctuator {s:?}"),
- },
- Segment::Shbang
- | Segment::Spaces
- | Segment::Comment
- | Segment::Newline
- | Segment::CommentCommand => None,
- Segment::DoRepeatOverflow => Some(Self::Error(ScanError::DoRepeatOverflow)),
- Segment::StartDocument => {
- Some(Self::Token(Token::Id(Identifier::new("DOCUMENT").unwrap())))
- }
- Segment::StartCommand | Segment::SeparateCommands | Segment::EndCommand => {
- Some(Self::Token(Token::EndCommand))
- }
- Segment::End => Some(Self::Token(Token::End)),
- Segment::ExpectedQuote => Some(Self::Error(ScanError::ExpectedQuote)),
- Segment::ExpectedExponent => {
- Some(Self::Error(ScanError::ExpectedExponent(String::from(s))))
- }
- Segment::UnexpectedChar => Some(Self::Error(ScanError::UnexpectedChar(
- s.chars().next().unwrap(),
- ))),
- }
- }
-
- /// Attempts to merge a sequence of tokens together into a single token. The
- /// tokens are taken from the beginning of `input`. If successful, removes one
- /// or more token from the beginning of `input` and returnss the merged
- /// token. More input tokens might be needed; if so, leaves `input` alone and
- /// returns `None`. In the latter case, the caller should add more tokens to the
- /// input ([Token::End] or [Token::Punct(Punct::EndCmd)] is always sufficient).
- ///
- /// This performs two different kinds of token merging:
- ///
- /// - String concatenation, where syntax like `"a" + "b"` is converted into a
- /// single string token. This is definitely needed because the parser relies
- /// on it.
- ///
- /// - Negative number merging, where syntax like `-5` is converted from a pair
- /// of tokens (a dash and a positive number) into a single token (a negative
- /// number). This might not be needed anymore because the segmenter
- /// directly treats a dash followed by a number, with optional intervening
- /// white space, as a negative number. It's only needed if we want
- /// intervening comments to be allowed or for part of the negative number
- /// token to be produced by macro expansion.
- pub fn merge<T>(tokens: &T) -> Option<MergeResult>
- where
- T: Tokens,
- {
- match tokens.get(0)? {
- Token::Punct(Punct::Dash) => match tokens.get(1)? {
- Token::Number(number) if number.is_sign_positive() => {
- let number = *number;
- return Some(MergeResult::Expand {
- n: 2,
- token: Token::Number(-number),
- });
- }
- _ => Some(MergeResult::Copy),
- },
- Token::String(_) => {
- let mut i = 0;
- while matches!(tokens.get(i * 2 + 1)?, Token::Punct(Punct::Plus))
- && matches!(tokens.get(i * 2 + 2)?, Token::String(_))
- {
- i += 1;
- }
- if i == 0 {
- Some(MergeResult::Copy)
- } else {
- let mut output = String::new();
- for i in 0..=i {
- let Token::String(s) = tokens.get(i * 2).unwrap() else {
- unreachable!()
- };
- output.push_str(&s);
- }
- Some(MergeResult::Expand {
- n: i * 2 + 1,
- token: Token::String(output),
- })
- }
- }
- _ => Some(MergeResult::Copy),
- }
- }
-}
-
-pub trait Tokens {
- fn get(&self, index: usize) -> Option<&Token>;
-}
-
-impl<T> Tokens for VecDeque<T>
-where
- T: Borrow<Token>,
-{
- fn get(&self, index: usize) -> Option<&Token> {
- self.get(index).map(|token| token.borrow())
- }
-}
-
-pub struct StringSegmenter<'a> {
- input: &'a str,
- segmenter: Segmenter,
-}
-
-impl<'a> StringSegmenter<'a> {
- pub fn new(input: &'a str, mode: Mode, is_snippet: bool) -> Self {
- Self {
- input,
- segmenter: Segmenter::new(mode, is_snippet),
- }
- }
-}
-
-impl<'a> Iterator for StringSegmenter<'a> {
- type Item = (&'a str, ScanToken);
-
- fn next(&mut self) -> Option<Self::Item> {
- loop {
- let (seg_len, seg_type) = self.segmenter.push(self.input, true).unwrap();
- if seg_type == Segment::End {
- return None;
- }
- let (s, rest) = self.input.split_at(seg_len);
- self.input = rest;
-
- if let Some(token) = ScanToken::from_segment(s, seg_type) {
- return Some((s, token));
- }
- }
- }
-}
-
-pub struct StringScanner<'a> {
- input: &'a str,
- segmenter: Segmenter,
- tokens: VecDeque<Token>,
-}
-
-impl<'a> StringScanner<'a> {
- pub fn new(input: &'a str, mode: Mode, is_snippet: bool) -> Self {
- Self {
- input,
- segmenter: Segmenter::new(mode, is_snippet),
- tokens: VecDeque::with_capacity(1),
- }
- }
-
- fn merge(&mut self) -> Option<ScanToken> {
- let result = ScanToken::merge(&self.tokens)?;
- match result {
- MergeResult::Copy => Some(ScanToken::Token(self.tokens.pop_front().unwrap())),
- MergeResult::Expand { n, token } => {
- self.tokens.drain(..n);
- Some(ScanToken::Token(token))
- }
- }
- }
-}
-
-impl<'a> Iterator for StringScanner<'a> {
- type Item = ScanToken;
-
- fn next(&mut self) -> Option<Self::Item> {
- if let Some(token) = self.merge() {
- return Some(token);
- }
- loop {
- let (seg_len, seg_type) = self.segmenter.push(self.input, true).unwrap();
- if seg_type == Segment::End && self.tokens.is_empty() {
- return None;
- }
- let (s, rest) = self.input.split_at(seg_len);
- self.input = rest;
-
- match ScanToken::from_segment(s, seg_type) {
- Some(ScanToken::Error(error)) => return Some(ScanToken::Error(error)),
- Some(ScanToken::Token(token)) => {
- self.tokens.push_back(token);
- if let Some(token) = self.merge() {
- return Some(token);
- }
- }
- None => (),
- }
- }
- }
-}
-
-#[cfg(test)]
-mod test;
+++ /dev/null
-use crate::{identifier::Identifier, lex::{
- segment::Mode,
- token::{Punct, Token},
-}};
-
-use super::{ScanError, ScanToken, StringScanner};
-
-fn print_token(token: &Token) {
- match token {
- Token::End => print!("Token::End"),
- Token::Id(s) => print!("Token::Id(String::from({s:?}))"),
- Token::Number(number) => print!("Token::Number({number:?})"),
- Token::String(s) => print!("Token::String(String::from({s:?}))"),
- Token::EndCommand => print!("Token::EndCommand"),
- Token::Punct(punct) => print!("Token::Punct(Punct::{punct:?})"),
- }
-}
-
-fn check_scan(input: &str, mode: Mode, expected: &[ScanToken]) {
- let tokens = StringScanner::new(input, mode, false).collect::<Vec<_>>();
-
- if &tokens != expected {
- for token in &tokens {
- match token {
- ScanToken::Token(token) => {
- print!("ScanToken::Token(");
- print_token(token);
- print!(")");
- }
- ScanToken::Error(error) => print!("ScanToken::Error(ScanError::{error:?})"),
- }
- println!(",");
- }
-
- eprintln!("tokens differ from expected:");
- let difference = diff::slice(expected, &tokens);
- for result in difference {
- match result {
- diff::Result::Left(left) => eprintln!("-{left:?}"),
- diff::Result::Both(left, _right) => eprintln!(" {left:?}"),
- diff::Result::Right(right) => eprintln!("+{right:?}"),
- }
- }
- panic!();
- }
-}
-
-#[test]
-fn test_identifiers() {
- check_scan(
- r#"a aB i5 $x @efg @@. !abcd !* !*a #.# .x _z.
-abcd. abcd.
-QRSTUV./* end of line comment */
-QrStUv./* end of line comment */
-WXYZ. /* unterminated end of line comment
-�. /* U+FFFD is not valid in an identifier
-"#,
- Mode::Auto,
- &[
- ScanToken::Token(Token::Id(Identifier::new("a").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("aB").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("i5").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("$x").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("@efg").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("@@.").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("!abcd").unwrap())),
- ScanToken::Token(Token::Punct(Punct::BangAsterisk)),
- ScanToken::Token(Token::Punct(Punct::BangAsterisk)),
- ScanToken::Token(Token::Id(Identifier::new("a").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("#.#").unwrap())),
- ScanToken::Token(Token::Punct(Punct::Dot)),
- ScanToken::Token(Token::Id(Identifier::new("x").unwrap())),
- ScanToken::Token(Token::Punct(Punct::Underscore)),
- ScanToken::Token(Token::Id(Identifier::new("z").unwrap())),
- ScanToken::Token(Token::EndCommand),
- ScanToken::Token(Token::Id(Identifier::new("abcd.").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("abcd").unwrap())),
- ScanToken::Token(Token::EndCommand),
- ScanToken::Token(Token::Id(Identifier::new("QRSTUV").unwrap())),
- ScanToken::Token(Token::EndCommand),
- ScanToken::Token(Token::Id(Identifier::new("QrStUv").unwrap())),
- ScanToken::Token(Token::EndCommand),
- ScanToken::Token(Token::Id(Identifier::new("WXYZ").unwrap())),
- ScanToken::Token(Token::EndCommand),
- ScanToken::Error(ScanError::UnexpectedChar('�')),
- ScanToken::Token(Token::EndCommand),
- ],
- );
-}
-
-#[test]
-fn test_reserved_words() {
- check_scan(
- r#"and or not eq ge gt le lt ne all by to with
-AND OR NOT EQ GE GT LE LT NE ALL BY TO WITH
-andx orx notx eqx gex gtx lex ltx nex allx byx tox withx
-and. with.
-"#,
- Mode::Auto,
- &[
- ScanToken::Token(Token::Punct(Punct::And)),
- ScanToken::Token(Token::Punct(Punct::Or)),
- ScanToken::Token(Token::Punct(Punct::Not)),
- ScanToken::Token(Token::Punct(Punct::Eq)),
- ScanToken::Token(Token::Punct(Punct::Ge)),
- ScanToken::Token(Token::Punct(Punct::Gt)),
- ScanToken::Token(Token::Punct(Punct::Le)),
- ScanToken::Token(Token::Punct(Punct::Lt)),
- ScanToken::Token(Token::Punct(Punct::Ne)),
- ScanToken::Token(Token::Punct(Punct::All)),
- ScanToken::Token(Token::Punct(Punct::By)),
- ScanToken::Token(Token::Punct(Punct::To)),
- ScanToken::Token(Token::Punct(Punct::With)),
- ScanToken::Token(Token::Punct(Punct::And)),
- ScanToken::Token(Token::Punct(Punct::Or)),
- ScanToken::Token(Token::Punct(Punct::Not)),
- ScanToken::Token(Token::Punct(Punct::Eq)),
- ScanToken::Token(Token::Punct(Punct::Ge)),
- ScanToken::Token(Token::Punct(Punct::Gt)),
- ScanToken::Token(Token::Punct(Punct::Le)),
- ScanToken::Token(Token::Punct(Punct::Lt)),
- ScanToken::Token(Token::Punct(Punct::Ne)),
- ScanToken::Token(Token::Punct(Punct::All)),
- ScanToken::Token(Token::Punct(Punct::By)),
- ScanToken::Token(Token::Punct(Punct::To)),
- ScanToken::Token(Token::Punct(Punct::With)),
- ScanToken::Token(Token::Id(Identifier::new("andx").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("orx").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("notx").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("eqx").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("gex").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("gtx").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("lex").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("ltx").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("nex").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("allx").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("byx").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("tox").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("withx").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("and.").unwrap())),
- ScanToken::Token(Token::Punct(Punct::With)),
- ScanToken::Token(Token::EndCommand),
- ],
- );
-}
-
-#[test]
-fn test_punctuation() {
- check_scan(
- r#"~ & | = >= > <= < ~= <> ( ) , - + * / [ ] **
-~&|=>=><=<~=<>(),-+*/[]**
-% : ; ? _ ` { } ~
-"#,
- Mode::Auto,
- &[
- ScanToken::Token(Token::Punct(Punct::Not)),
- ScanToken::Token(Token::Punct(Punct::And)),
- ScanToken::Token(Token::Punct(Punct::Or)),
- ScanToken::Token(Token::Punct(Punct::Equals)),
- ScanToken::Token(Token::Punct(Punct::Ge)),
- ScanToken::Token(Token::Punct(Punct::Gt)),
- ScanToken::Token(Token::Punct(Punct::Le)),
- ScanToken::Token(Token::Punct(Punct::Lt)),
- ScanToken::Token(Token::Punct(Punct::Ne)),
- ScanToken::Token(Token::Punct(Punct::Ne)),
- ScanToken::Token(Token::Punct(Punct::LParen)),
- ScanToken::Token(Token::Punct(Punct::RParen)),
- ScanToken::Token(Token::Punct(Punct::Comma)),
- ScanToken::Token(Token::Punct(Punct::Dash)),
- ScanToken::Token(Token::Punct(Punct::Plus)),
- ScanToken::Token(Token::Punct(Punct::Asterisk)),
- ScanToken::Token(Token::Punct(Punct::Slash)),
- ScanToken::Token(Token::Punct(Punct::LSquare)),
- ScanToken::Token(Token::Punct(Punct::RSquare)),
- ScanToken::Token(Token::Punct(Punct::Exp)),
- ScanToken::Token(Token::Punct(Punct::Not)),
- ScanToken::Token(Token::Punct(Punct::And)),
- ScanToken::Token(Token::Punct(Punct::Or)),
- ScanToken::Token(Token::Punct(Punct::Equals)),
- ScanToken::Token(Token::Punct(Punct::Ge)),
- ScanToken::Token(Token::Punct(Punct::Gt)),
- ScanToken::Token(Token::Punct(Punct::Le)),
- ScanToken::Token(Token::Punct(Punct::Lt)),
- ScanToken::Token(Token::Punct(Punct::Ne)),
- ScanToken::Token(Token::Punct(Punct::Ne)),
- ScanToken::Token(Token::Punct(Punct::LParen)),
- ScanToken::Token(Token::Punct(Punct::RParen)),
- ScanToken::Token(Token::Punct(Punct::Comma)),
- ScanToken::Token(Token::Punct(Punct::Dash)),
- ScanToken::Token(Token::Punct(Punct::Plus)),
- ScanToken::Token(Token::Punct(Punct::Asterisk)),
- ScanToken::Token(Token::Punct(Punct::Slash)),
- ScanToken::Token(Token::Punct(Punct::LSquare)),
- ScanToken::Token(Token::Punct(Punct::RSquare)),
- ScanToken::Token(Token::Punct(Punct::Exp)),
- ScanToken::Token(Token::Punct(Punct::Percent)),
- ScanToken::Token(Token::Punct(Punct::Colon)),
- ScanToken::Token(Token::Punct(Punct::Semicolon)),
- ScanToken::Token(Token::Punct(Punct::Question)),
- ScanToken::Token(Token::Punct(Punct::Underscore)),
- ScanToken::Token(Token::Punct(Punct::Backtick)),
- ScanToken::Token(Token::Punct(Punct::LCurly)),
- ScanToken::Token(Token::Punct(Punct::RCurly)),
- ScanToken::Token(Token::Punct(Punct::Not)),
- ],
- );
-}
-
-#[test]
-fn test_positive_numbers() {
- check_scan(
- r#"0 1 01 001. 1.
-123. /* comment 1 */ /* comment 2 */
-.1 0.1 00.1 00.10
-5e1 6E-1 7e+1 6E+01 6e-03
-.3E1 .4e-1 .5E+1 .6e+01 .7E-03
-1.23e1 45.6E-1 78.9e+1 99.9E+01 11.2e-03
-. 1e e1 1e+ 1e-
-"#,
- Mode::Auto,
- &[
- ScanToken::Token(Token::Number(0.0)),
- ScanToken::Token(Token::Number(1.0)),
- ScanToken::Token(Token::Number(1.0)),
- ScanToken::Token(Token::Number(1.0)),
- ScanToken::Token(Token::Number(1.0)),
- ScanToken::Token(Token::EndCommand),
- ScanToken::Token(Token::Number(123.0)),
- ScanToken::Token(Token::EndCommand),
- ScanToken::Token(Token::EndCommand),
- ScanToken::Token(Token::Number(1.0)),
- ScanToken::Token(Token::Number(0.1)),
- ScanToken::Token(Token::Number(0.1)),
- ScanToken::Token(Token::Number(0.1)),
- ScanToken::Token(Token::Number(50.0)),
- ScanToken::Token(Token::Number(0.6)),
- ScanToken::Token(Token::Number(70.0)),
- ScanToken::Token(Token::Number(60.0)),
- ScanToken::Token(Token::Number(0.006)),
- ScanToken::Token(Token::EndCommand),
- ScanToken::Token(Token::Number(30.0)),
- ScanToken::Token(Token::Number(0.04)),
- ScanToken::Token(Token::Number(5.0)),
- ScanToken::Token(Token::Number(6.0)),
- ScanToken::Token(Token::Number(0.0007)),
- ScanToken::Token(Token::Number(12.3)),
- ScanToken::Token(Token::Number(4.56)),
- ScanToken::Token(Token::Number(789.0)),
- ScanToken::Token(Token::Number(999.0)),
- ScanToken::Token(Token::Number(0.0112)),
- ScanToken::Token(Token::EndCommand),
- ScanToken::Error(ScanError::ExpectedExponent(String::from("1e"))),
- ScanToken::Token(Token::Id(Identifier::new("e1").unwrap())),
- ScanToken::Error(ScanError::ExpectedExponent(String::from("1e+"))),
- ScanToken::Error(ScanError::ExpectedExponent(String::from("1e-"))),
- ],
- );
-}
-
-#[test]
-fn test_negative_numbers() {
- check_scan(
- r#" -0 -1 -01 -001. -1.
- -123. /* comment 1 */ /* comment 2 */
- -.1 -0.1 -00.1 -00.10
- -5e1 -6E-1 -7e+1 -6E+01 -6e-03
- -.3E1 -.4e-1 -.5E+1 -.6e+01 -.7E-03
- -1.23e1 -45.6E-1 -78.9e+1 -99.9E+01 -11.2e-03
- -/**/1
- -. -1e -e1 -1e+ -1e- -1.
-"#,
- Mode::Auto,
- &[
- ScanToken::Token(Token::Number(-0.0)),
- ScanToken::Token(Token::Number(-1.0)),
- ScanToken::Token(Token::Number(-1.0)),
- ScanToken::Token(Token::Number(-1.0)),
- ScanToken::Token(Token::Number(-1.0)),
- ScanToken::Token(Token::EndCommand),
- ScanToken::Token(Token::Number(-123.0)),
- ScanToken::Token(Token::EndCommand),
- ScanToken::Token(Token::Number(-0.1)),
- ScanToken::Token(Token::Number(-0.1)),
- ScanToken::Token(Token::Number(-0.1)),
- ScanToken::Token(Token::Number(-0.1)),
- ScanToken::Token(Token::Number(-50.0)),
- ScanToken::Token(Token::Number(-0.6)),
- ScanToken::Token(Token::Number(-70.0)),
- ScanToken::Token(Token::Number(-60.0)),
- ScanToken::Token(Token::Number(-0.006)),
- ScanToken::Token(Token::Number(-3.0)),
- ScanToken::Token(Token::Number(-0.04)),
- ScanToken::Token(Token::Number(-5.0)),
- ScanToken::Token(Token::Number(-6.0)),
- ScanToken::Token(Token::Number(-0.0007)),
- ScanToken::Token(Token::Number(-12.3)),
- ScanToken::Token(Token::Number(-4.56)),
- ScanToken::Token(Token::Number(-789.0)),
- ScanToken::Token(Token::Number(-999.0)),
- ScanToken::Token(Token::Number(-0.0112)),
- ScanToken::Token(Token::Number(-1.0)),
- ScanToken::Token(Token::Punct(Punct::Dash)),
- ScanToken::Token(Token::Punct(Punct::Dot)),
- ScanToken::Error(ScanError::ExpectedExponent(String::from("-1e"))),
- ScanToken::Token(Token::Punct(Punct::Dash)),
- ScanToken::Token(Token::Id(Identifier::new("e1").unwrap())),
- ScanToken::Error(ScanError::ExpectedExponent(String::from("-1e+"))),
- ScanToken::Error(ScanError::ExpectedExponent(String::from("-1e-"))),
- ScanToken::Token(Token::Number(-1.0)),
- ScanToken::Token(Token::EndCommand),
- ],
- );
-}
-
-#[test]
-fn test_strings() {
- check_scan(
- r#"'x' "y" 'abc'
-'Don''t' "Can't" 'Won''t'
-"""quoted""" '"quoted"'
-'' "" '''' """"
-'missing end quote
-"missing double quote
-'x' + "y"
-+ 'z' +
-'a' /* abc */ + "b" /*
-+ 'c' +/* */"d"/* */+'e'
-'foo'
-+ /* special case: + in column 0 would ordinarily start a new command
-'bar'
-'foo'
- +
-'bar'
-'foo'
-+
-
-'bar'
-
-+
-x"4142"+'5152'
-"4142"+
-x'5152'
-x"4142"
-+u'304a'
-"�あいうえお"
-"abc"+U"FFFD"+u'3048'+"xyz"
-"#,
- Mode::Auto,
- &[
- ScanToken::Token(Token::String(String::from("x"))),
- ScanToken::Token(Token::String(String::from("y"))),
- ScanToken::Token(Token::String(String::from("abc"))),
- ScanToken::Token(Token::String(String::from("Don't"))),
- ScanToken::Token(Token::String(String::from("Can't"))),
- ScanToken::Token(Token::String(String::from("Won't"))),
- ScanToken::Token(Token::String(String::from("\"quoted\""))),
- ScanToken::Token(Token::String(String::from("\"quoted\""))),
- ScanToken::Token(Token::String(String::from(""))),
- ScanToken::Token(Token::String(String::from(""))),
- ScanToken::Token(Token::String(String::from("'"))),
- ScanToken::Token(Token::String(String::from("\""))),
- ScanToken::Error(ScanError::ExpectedQuote),
- ScanToken::Error(ScanError::ExpectedQuote),
- ScanToken::Token(Token::String(String::from("xyzabcde"))),
- ScanToken::Token(Token::String(String::from("foobar"))),
- ScanToken::Token(Token::String(String::from("foobar"))),
- ScanToken::Token(Token::String(String::from("foo"))),
- ScanToken::Token(Token::Punct(Punct::Plus)),
- ScanToken::Token(Token::EndCommand),
- ScanToken::Token(Token::String(String::from("bar"))),
- ScanToken::Token(Token::EndCommand),
- ScanToken::Token(Token::Punct(Punct::Plus)),
- ScanToken::Token(Token::String(String::from("AB5152"))),
- ScanToken::Token(Token::String(String::from("4142QR"))),
- ScanToken::Token(Token::String(String::from("ABお"))),
- ScanToken::Token(Token::String(String::from("�あいうえお"))),
- ScanToken::Token(Token::String(String::from("abc�えxyz"))),
- ScanToken::Token(Token::End),
- ],
- );
-}
-
-#[test]
-fn test_shbang() {
- check_scan(
- r#"#! /usr/bin/pspp
-#! /usr/bin/pspp
-"#,
- Mode::Auto,
- &[
- ScanToken::Token(Token::Id(Identifier::new("#").unwrap())),
- ScanToken::Token(Token::Punct(Punct::Bang)),
- ScanToken::Token(Token::Punct(Punct::Slash)),
- ScanToken::Token(Token::Id(Identifier::new("usr").unwrap())),
- ScanToken::Token(Token::Punct(Punct::Slash)),
- ScanToken::Token(Token::Id(Identifier::new("bin").unwrap())),
- ScanToken::Token(Token::Punct(Punct::Slash)),
- ScanToken::Token(Token::Id(Identifier::new("pspp").unwrap())),
- ],
- );
-}
-
-#[test]
-fn test_comments() {
- check_scan(
- r#"* Comment commands "don't
-have to contain valid tokens.
-
-** Check ambiguity with ** token.
-****************.
-
-comment keyword works too.
-COMM also.
-com is ambiguous with COMPUTE.
-
- * Comment need not start at left margin.
-
-* Comment ends with blank line
-
-next command.
-
-"#,
- Mode::Auto,
- &[
- ScanToken::Token(Token::EndCommand),
- ScanToken::Token(Token::EndCommand),
- ScanToken::Token(Token::EndCommand),
- ScanToken::Token(Token::EndCommand),
- ScanToken::Token(Token::EndCommand),
- ScanToken::Token(Token::EndCommand),
- ScanToken::Token(Token::EndCommand),
- ScanToken::Token(Token::Id(Identifier::new("com").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("is").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("ambiguous").unwrap())),
- ScanToken::Token(Token::Punct(Punct::With)),
- ScanToken::Token(Token::Id(Identifier::new("COMPUTE").unwrap())),
- ScanToken::Token(Token::EndCommand),
- ScanToken::Token(Token::EndCommand),
- ScanToken::Token(Token::EndCommand),
- ScanToken::Token(Token::EndCommand),
- ScanToken::Token(Token::EndCommand),
- ScanToken::Token(Token::Id(Identifier::new("next").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("command").unwrap())),
- ScanToken::Token(Token::EndCommand),
- ScanToken::Token(Token::EndCommand),
- ],
- );
-}
-
-#[test]
-fn test_document() {
- check_scan(
- r#"DOCUMENT one line.
-DOC more
- than
- one
- line.
-docu
-first.paragraph
-isn't parsed as tokens
-
-second paragraph.
-"#,
- Mode::Auto,
- &[
- ScanToken::Token(Token::Id(Identifier::new("DOCUMENT").unwrap())),
- ScanToken::Token(Token::String(String::from("DOCUMENT one line."))),
- ScanToken::Token(Token::EndCommand),
- ScanToken::Token(Token::EndCommand),
- ScanToken::Token(Token::Id(Identifier::new("DOCUMENT").unwrap())),
- ScanToken::Token(Token::String(String::from("DOC more"))),
- ScanToken::Token(Token::String(String::from(" than"))),
- ScanToken::Token(Token::String(String::from(" one"))),
- ScanToken::Token(Token::String(String::from(" line."))),
- ScanToken::Token(Token::EndCommand),
- ScanToken::Token(Token::EndCommand),
- ScanToken::Token(Token::Id(Identifier::new("DOCUMENT").unwrap())),
- ScanToken::Token(Token::String(String::from("docu"))),
- ScanToken::Token(Token::String(String::from("first.paragraph"))),
- ScanToken::Token(Token::String(String::from("isn't parsed as tokens"))),
- ScanToken::Token(Token::String(String::from(""))),
- ScanToken::Token(Token::String(String::from("second paragraph."))),
- ScanToken::Token(Token::EndCommand),
- ScanToken::Token(Token::EndCommand),
- ],
- );
-}
-
-#[test]
-fn test_file_label() {
- check_scan(
- r#"FIL label isn't quoted.
-FILE
- lab 'is quoted'.
-FILE /*
-/**/ lab not quoted here either
-
-"#,
- Mode::Auto,
- &[
- ScanToken::Token(Token::Id(Identifier::new("FIL").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("label").unwrap())),
- ScanToken::Token(Token::String(String::from("isn't quoted"))),
- ScanToken::Token(Token::EndCommand),
- ScanToken::Token(Token::Id(Identifier::new("FILE").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("lab").unwrap())),
- ScanToken::Token(Token::String(String::from("is quoted"))),
- ScanToken::Token(Token::EndCommand),
- ScanToken::Token(Token::Id(Identifier::new("FILE").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("lab").unwrap())),
- ScanToken::Token(Token::String(String::from("not quoted here either"))),
- ScanToken::Token(Token::EndCommand),
- ],
- );
-}
-
-#[test]
-fn test_begin_data() {
- check_scan(
- r#"begin data.
-123
-xxx
-end data.
-
-BEG /**/ DAT /*
-5 6 7 /* x
-
-end data
-end data
-.
-"#,
- Mode::Auto,
- &[
- ScanToken::Token(Token::Id(Identifier::new("begin").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("data").unwrap())),
- ScanToken::Token(Token::EndCommand),
- ScanToken::Token(Token::String(String::from("123"))),
- ScanToken::Token(Token::String(String::from("xxx"))),
- ScanToken::Token(Token::Id(Identifier::new("end").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("data").unwrap())),
- ScanToken::Token(Token::EndCommand),
- ScanToken::Token(Token::EndCommand),
- ScanToken::Token(Token::Id(Identifier::new("BEG").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("DAT").unwrap())),
- ScanToken::Token(Token::String(String::from("5 6 7 /* x"))),
- ScanToken::Token(Token::String(String::from(""))),
- ScanToken::Token(Token::String(String::from("end data"))),
- ScanToken::Token(Token::Id(Identifier::new("end").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("data").unwrap())),
- ScanToken::Token(Token::EndCommand),
- ],
- );
-}
-
-#[test]
-fn test_do_repeat() {
- check_scan(
- r#"do repeat x=a b c
- y=d e f.
- do repeat a=1 thru 5.
-another command.
-second command
-+ third command.
-end /* x */ /* y */ repeat print.
-end
- repeat.
-"#,
- Mode::Auto,
- &[
- ScanToken::Token(Token::Id(Identifier::new("do").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("repeat").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("x").unwrap())),
- ScanToken::Token(Token::Punct(Punct::Equals)),
- ScanToken::Token(Token::Id(Identifier::new("a").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("b").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("c").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("y").unwrap())),
- ScanToken::Token(Token::Punct(Punct::Equals)),
- ScanToken::Token(Token::Id(Identifier::new("d").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("e").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("f").unwrap())),
- ScanToken::Token(Token::EndCommand),
- ScanToken::Token(Token::String(String::from(" do repeat a=1 thru 5."))),
- ScanToken::Token(Token::String(String::from("another command."))),
- ScanToken::Token(Token::String(String::from("second command"))),
- ScanToken::Token(Token::String(String::from("+ third command."))),
- ScanToken::Token(Token::String(String::from(
- "end /* x */ /* y */ repeat print.",
- ))),
- ScanToken::Token(Token::Id(Identifier::new("end").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("repeat").unwrap())),
- ScanToken::Token(Token::EndCommand),
- ],
- );
-}
-
-#[test]
-fn test_do_repeat_batch() {
- check_scan(
- r#"do repeat x=a b c
- y=d e f
-do repeat a=1 thru 5
-another command
-second command
-+ third command
-end /* x */ /* y */ repeat print
-end
- repeat
-do
- repeat #a=1
-
- inner command
-end repeat
-"#,
- Mode::Batch,
- &[
- ScanToken::Token(Token::Id(Identifier::new("do").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("repeat").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("x").unwrap())),
- ScanToken::Token(Token::Punct(Punct::Equals)),
- ScanToken::Token(Token::Id(Identifier::new("a").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("b").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("c").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("y").unwrap())),
- ScanToken::Token(Token::Punct(Punct::Equals)),
- ScanToken::Token(Token::Id(Identifier::new("d").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("e").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("f").unwrap())),
- ScanToken::Token(Token::EndCommand),
- ScanToken::Token(Token::String(String::from("do repeat a=1 thru 5"))),
- ScanToken::Token(Token::String(String::from("another command"))),
- ScanToken::Token(Token::String(String::from("second command"))),
- ScanToken::Token(Token::String(String::from("+ third command"))),
- ScanToken::Token(Token::String(String::from(
- "end /* x */ /* y */ repeat print",
- ))),
- ScanToken::Token(Token::Id(Identifier::new("end").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("repeat").unwrap())),
- ScanToken::Token(Token::EndCommand),
- ScanToken::Token(Token::Id(Identifier::new("do").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("repeat").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("#a").unwrap())),
- ScanToken::Token(Token::Punct(Punct::Equals)),
- ScanToken::Token(Token::Number(1.0)),
- ScanToken::Token(Token::EndCommand),
- ScanToken::Token(Token::String(String::from(" inner command"))),
- ScanToken::Token(Token::Id(Identifier::new("end").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("repeat").unwrap())),
- ],
- );
-}
-
-#[test]
-fn test_batch_mode() {
- check_scan(
- r#"first command
- another line of first command
-+ second command
-third command
-
-fourth command.
- fifth command.
-"#,
- Mode::Batch,
- &[
- ScanToken::Token(Token::Id(Identifier::new("first").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("command").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("another").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("line").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("of").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("first").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("command").unwrap())),
- ScanToken::Token(Token::EndCommand),
- ScanToken::Token(Token::Id(Identifier::new("second").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("command").unwrap())),
- ScanToken::Token(Token::EndCommand),
- ScanToken::Token(Token::Id(Identifier::new("third").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("command").unwrap())),
- ScanToken::Token(Token::EndCommand),
- ScanToken::Token(Token::Id(Identifier::new("fourth").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("command").unwrap())),
- ScanToken::Token(Token::EndCommand),
- ScanToken::Token(Token::Id(Identifier::new("fifth").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("command").unwrap())),
- ScanToken::Token(Token::EndCommand),
- ],
- );
-}
-
-mod define {
- use crate::{identifier::Identifier, lex::{
- scan::ScanToken,
- segment::Mode,
- token::{Punct, Token},
- }};
-
- use super::check_scan;
-
- #[test]
- fn test_simple() {
- check_scan(
- r#"define !macro1()
-var1 var2 var3
-!enddefine.
-"#,
- Mode::Auto,
- &[
- ScanToken::Token(Token::Id(Identifier::new("define").unwrap())),
- ScanToken::Token(Token::String(String::from("!macro1"))),
- ScanToken::Token(Token::Punct(Punct::LParen)),
- ScanToken::Token(Token::Punct(Punct::RParen)),
- ScanToken::Token(Token::String(String::from("var1 var2 var3"))),
- ScanToken::Token(Token::Id(Identifier::new("!enddefine").unwrap())),
- ScanToken::Token(Token::EndCommand),
- ],
- );
- }
-
- #[test]
- fn test_no_newline_after_parentheses() {
- check_scan(
- r#"define !macro1() var1 var2 var3
-!enddefine.
-"#,
- Mode::Auto,
- &[
- ScanToken::Token(Token::Id(Identifier::new("define").unwrap())),
- ScanToken::Token(Token::String(String::from("!macro1"))),
- ScanToken::Token(Token::Punct(Punct::LParen)),
- ScanToken::Token(Token::Punct(Punct::RParen)),
- ScanToken::Token(Token::String(String::from(" var1 var2 var3"))),
- ScanToken::Token(Token::Id(Identifier::new("!enddefine").unwrap())),
- ScanToken::Token(Token::EndCommand),
- ],
- );
- }
-
- #[test]
- fn test_no_newline_before_enddefine() {
- check_scan(
- r#"define !macro1()
-var1 var2 var3!enddefine.
-"#,
- Mode::Auto,
- &[
- ScanToken::Token(Token::Id(Identifier::new("define").unwrap())),
- ScanToken::Token(Token::String(String::from("!macro1"))),
- ScanToken::Token(Token::Punct(Punct::LParen)),
- ScanToken::Token(Token::Punct(Punct::RParen)),
- ScanToken::Token(Token::String(String::from("var1 var2 var3"))),
- ScanToken::Token(Token::Id(Identifier::new("!enddefine").unwrap())),
- ScanToken::Token(Token::EndCommand),
- ],
- );
- }
-
- #[test]
- fn test_all_on_one_line() {
- check_scan(
- r#"define !macro1()var1 var2 var3!enddefine.
-"#,
- Mode::Auto,
- &[
- ScanToken::Token(Token::Id(Identifier::new("define").unwrap())),
- ScanToken::Token(Token::String(String::from("!macro1"))),
- ScanToken::Token(Token::Punct(Punct::LParen)),
- ScanToken::Token(Token::Punct(Punct::RParen)),
- ScanToken::Token(Token::String(String::from("var1 var2 var3"))),
- ScanToken::Token(Token::Id(Identifier::new("!enddefine").unwrap())),
- ScanToken::Token(Token::EndCommand),
- ],
- );
- }
-
- #[test]
- fn test_empty() {
- check_scan(
- r#"define !macro1()
-!enddefine.
-"#,
- Mode::Auto,
- &[
- ScanToken::Token(Token::Id(Identifier::new("define").unwrap())),
- ScanToken::Token(Token::String(String::from("!macro1"))),
- ScanToken::Token(Token::Punct(Punct::LParen)),
- ScanToken::Token(Token::Punct(Punct::RParen)),
- ScanToken::Token(Token::Id(Identifier::new("!enddefine").unwrap())),
- ScanToken::Token(Token::EndCommand),
- ],
- );
- }
-
- #[test]
- fn test_blank_lines() {
- check_scan(
- r#"define !macro1()
-
-
-!enddefine.
-"#,
- Mode::Auto,
- &[
- ScanToken::Token(Token::Id(Identifier::new("define").unwrap())),
- ScanToken::Token(Token::String(String::from("!macro1"))),
- ScanToken::Token(Token::Punct(Punct::LParen)),
- ScanToken::Token(Token::Punct(Punct::RParen)),
- ScanToken::Token(Token::String(String::from(""))),
- ScanToken::Token(Token::String(String::from(""))),
- ScanToken::Token(Token::Id(Identifier::new("!enddefine").unwrap())),
- ScanToken::Token(Token::EndCommand),
- ],
- );
- }
-
- #[test]
- fn test_arguments() {
- check_scan(
- r#"define !macro1(a(), b(), c())
-!enddefine.
-"#,
- Mode::Auto,
- &[
- ScanToken::Token(Token::Id(Identifier::new("define").unwrap())),
- ScanToken::Token(Token::String(String::from("!macro1"))),
- ScanToken::Token(Token::Punct(Punct::LParen)),
- ScanToken::Token(Token::Id(Identifier::new("a").unwrap())),
- ScanToken::Token(Token::Punct(Punct::LParen)),
- ScanToken::Token(Token::Punct(Punct::RParen)),
- ScanToken::Token(Token::Punct(Punct::Comma)),
- ScanToken::Token(Token::Id(Identifier::new("b").unwrap())),
- ScanToken::Token(Token::Punct(Punct::LParen)),
- ScanToken::Token(Token::Punct(Punct::RParen)),
- ScanToken::Token(Token::Punct(Punct::Comma)),
- ScanToken::Token(Token::Id(Identifier::new("c").unwrap())),
- ScanToken::Token(Token::Punct(Punct::LParen)),
- ScanToken::Token(Token::Punct(Punct::RParen)),
- ScanToken::Token(Token::Punct(Punct::RParen)),
- ScanToken::Token(Token::Id(Identifier::new("!enddefine").unwrap())),
- ScanToken::Token(Token::EndCommand),
- ],
- );
- }
-
- #[test]
- fn test_multiline_arguments() {
- check_scan(
- r#"define !macro1(
- a(), b(
- ),
- c()
-)
-!enddefine.
-"#,
- Mode::Auto,
- &[
- ScanToken::Token(Token::Id(Identifier::new("define").unwrap())),
- ScanToken::Token(Token::String(String::from("!macro1"))),
- ScanToken::Token(Token::Punct(Punct::LParen)),
- ScanToken::Token(Token::Id(Identifier::new("a").unwrap())),
- ScanToken::Token(Token::Punct(Punct::LParen)),
- ScanToken::Token(Token::Punct(Punct::RParen)),
- ScanToken::Token(Token::Punct(Punct::Comma)),
- ScanToken::Token(Token::Id(Identifier::new("b").unwrap())),
- ScanToken::Token(Token::Punct(Punct::LParen)),
- ScanToken::Token(Token::Punct(Punct::RParen)),
- ScanToken::Token(Token::Punct(Punct::Comma)),
- ScanToken::Token(Token::Id(Identifier::new("c").unwrap())),
- ScanToken::Token(Token::Punct(Punct::LParen)),
- ScanToken::Token(Token::Punct(Punct::RParen)),
- ScanToken::Token(Token::Punct(Punct::RParen)),
- ScanToken::Token(Token::Id(Identifier::new("!enddefine").unwrap())),
- ScanToken::Token(Token::EndCommand),
- ],
- );
- }
-
- #[test]
- fn test_arguments_start_on_second_line() {
- check_scan(
- r#"define !macro1
-(x,y,z
-)
-content 1
-content 2
-!enddefine.
-"#,
- Mode::Auto,
- &[
- ScanToken::Token(Token::Id(Identifier::new("define").unwrap())),
- ScanToken::Token(Token::String(String::from("!macro1"))),
- ScanToken::Token(Token::Punct(Punct::LParen)),
- ScanToken::Token(Token::Id(Identifier::new("x").unwrap())),
- ScanToken::Token(Token::Punct(Punct::Comma)),
- ScanToken::Token(Token::Id(Identifier::new("y").unwrap())),
- ScanToken::Token(Token::Punct(Punct::Comma)),
- ScanToken::Token(Token::Id(Identifier::new("z").unwrap())),
- ScanToken::Token(Token::Punct(Punct::RParen)),
- ScanToken::Token(Token::String(String::from("content 1"))),
- ScanToken::Token(Token::String(String::from("content 2"))),
- ScanToken::Token(Token::Id(Identifier::new("!enddefine").unwrap())),
- ScanToken::Token(Token::EndCommand),
- ],
- );
- }
-
- #[test]
- fn test_early_end_of_command_1() {
- check_scan(
- r#"define !macro1.
-data list /x 1.
-"#,
- Mode::Auto,
- &[
- ScanToken::Token(Token::Id(Identifier::new("define").unwrap())),
- ScanToken::Token(Token::String(String::from("!macro1"))),
- ScanToken::Token(Token::EndCommand),
- ScanToken::Token(Token::Id(Identifier::new("data").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("list").unwrap())),
- ScanToken::Token(Token::Punct(Punct::Slash)),
- ScanToken::Token(Token::Id(Identifier::new("x").unwrap())),
- ScanToken::Token(Token::Number(1.0)),
- ScanToken::Token(Token::EndCommand),
- ],
- );
- }
-
- #[test]
- fn test_early_end_of_command_2() {
- check_scan(
- r#"define !macro1
-x.
-data list /x 1.
-"#,
- Mode::Auto,
- &[
- ScanToken::Token(Token::Id(Identifier::new("define").unwrap())),
- ScanToken::Token(Token::String(String::from("!macro1"))),
- ScanToken::Token(Token::Id(Identifier::new("x").unwrap())),
- ScanToken::Token(Token::EndCommand),
- ScanToken::Token(Token::Id(Identifier::new("data").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("list").unwrap())),
- ScanToken::Token(Token::Punct(Punct::Slash)),
- ScanToken::Token(Token::Id(Identifier::new("x").unwrap())),
- ScanToken::Token(Token::Number(1.0)),
- ScanToken::Token(Token::EndCommand),
- ],
- );
- }
-
- #[test]
- fn test_early_end_of_command_3() {
- check_scan(
- r#"define !macro1(.
-x.
-data list /x 1.
-"#,
- Mode::Auto,
- &[
- ScanToken::Token(Token::Id(Identifier::new("define").unwrap())),
- ScanToken::Token(Token::String(String::from("!macro1"))),
- ScanToken::Token(Token::Punct(Punct::LParen)),
- ScanToken::Token(Token::EndCommand),
- ScanToken::Token(Token::Id(Identifier::new("x").unwrap())),
- ScanToken::Token(Token::EndCommand),
- ScanToken::Token(Token::Id(Identifier::new("data").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("list").unwrap())),
- ScanToken::Token(Token::Punct(Punct::Slash)),
- ScanToken::Token(Token::Id(Identifier::new("x").unwrap())),
- ScanToken::Token(Token::Number(1.0)),
- ScanToken::Token(Token::EndCommand),
- ],
- );
- }
-
- #[test]
- fn test_early_end_of_command_4() {
- // Notice the command terminator at the end of the DEFINE command,
- // which should not be there and ends it early.
- check_scan(
- r#"define !macro1.
-data list /x 1.
-"#,
- Mode::Auto,
- &[
- ScanToken::Token(Token::Id(Identifier::new("define").unwrap())),
- ScanToken::Token(Token::String(String::from("!macro1"))),
- ScanToken::Token(Token::EndCommand),
- ScanToken::Token(Token::Id(Identifier::new("data").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("list").unwrap())),
- ScanToken::Token(Token::Punct(Punct::Slash)),
- ScanToken::Token(Token::Id(Identifier::new("x").unwrap())),
- ScanToken::Token(Token::Number(1.0)),
- ScanToken::Token(Token::EndCommand),
- ],
- );
- }
-
- #[test]
- fn test_missing_enddefine() {
- check_scan(
- r#"define !macro1()
-content line 1
-content line 2
-"#,
- Mode::Auto,
- &[
- ScanToken::Token(Token::Id(Identifier::new("define").unwrap())),
- ScanToken::Token(Token::String(String::from("!macro1"))),
- ScanToken::Token(Token::Punct(Punct::LParen)),
- ScanToken::Token(Token::Punct(Punct::RParen)),
- ScanToken::Token(Token::String(String::from("content line 1"))),
- ScanToken::Token(Token::String(String::from("content line 2"))),
- ScanToken::Token(Token::End),
- ],
- );
- }
-}
+++ /dev/null
-//! Syntax segmentation.
-//!
-//! PSPP divides traditional "lexical analysis" or "tokenization" into two
-//! phases: a lower-level phase called "segmentation" and a higher-level phase
-//! called "scanning". This module implements the segmentation phase.
-//! [`super::scan`] contains declarations for the scanning phase.
-//!
-//! Segmentation accepts a stream of UTF-8 bytes as input. It outputs a label
-//! (a segment type) for each byte or contiguous sequence of bytes in the input.
-//! It also, in a few corner cases, outputs zero-width segments that label the
-//! boundary between a pair of bytes in the input.
-//!
-//! Some segment types correspond directly to tokens; for example, an
-//! "identifier" segment (SEG_IDENTIFIER) becomes an identifier token (T_ID)
-//! later in lexical analysis. Other segments contribute to tokens but do not
-//! correspond directly; for example, multiple quoted string segments
-//! (SEG_QUOTED_STRING) separated by spaces (SEG_SPACES) and "+" punctuators
-//! (SEG_PUNCT) may be combined to form a single string token (T_STRING). Still
-//! other segments are ignored (e.g. SEG_SPACES) or trigger special behavior
-//! such as error messages later in tokenization (e.g. SEG_EXPECTED_QUOTE).
-
-use crate::{
- identifier::{id_match, id_match_n, IdentifierChar},
- prompt::PromptStyle,
-};
-use bitflags::bitflags;
-
-use super::command_name::{command_match, COMMAND_NAMES};
-
-/// Segmentation mode.
-///
-/// PSPP syntax is written in one of two modes which are broadly defined as
-/// follows:
-///
-/// - In interactive mode, commands end with a period at the end of the line
-/// or with a blank line.
-///
-/// - In batch mode, the second and subsequent lines of a command are indented
-/// from the left margin.
-///
-/// The segmenter can also try to automatically detect the mode in use, using a
-/// heuristic that is usually correct.
-#[derive(Copy, Clone, Debug, PartialEq, Eq, Default)]
-pub enum Mode {
- /// Try to interpret input correctly regardless of whether it is written
- /// for interactive or batch mode.
- #[default]
- Auto,
-
- /// Interactive syntax mode.
- Interactive,
-
- /// Batch syntax mode.
- Batch,
-}
-
-/// The type of a segment.
-#[derive(Copy, Clone, Debug, PartialEq, Eq)]
-pub enum Segment {
- Number,
- QuotedString,
- HexString,
- UnicodeString,
- UnquotedString,
- Identifier,
- Punct,
- Shbang,
- Spaces,
- Comment,
- Newline,
- CommentCommand,
- DoRepeatCommand,
- DoRepeatOverflow,
- InlineData,
- MacroName,
- MacroBody,
- StartDocument,
- Document,
- StartCommand,
- SeparateCommands,
- EndCommand,
- End,
- ExpectedQuote,
- ExpectedExponent,
- UnexpectedChar,
-}
-
-bitflags! {
- #[derive(Copy, Clone, Debug)]
- pub struct Substate: u8 {
- const START_OF_LINE = 1;
- const START_OF_COMMAND = 2;
- }
-}
-
-#[derive(Copy, Clone)]
-pub struct Segmenter {
- state: (State, Substate),
- nest: u8,
- mode: Mode,
-}
-
-#[derive(Copy, Clone, Debug)]
-pub struct Incomplete;
-
-impl Segmenter {
- /// Returns a segmenter with the given syntax `mode`.
- ///
- /// If `is_snippet` is false, then the segmenter will parse as if it's being
- /// given a whole file. This means, for example, that it will interpret `-`
- /// or `+` at the beginning of the syntax as a separator between commands
- /// (since `-` or `+` at the beginning of a line has this meaning).
- ///
- /// If `is_snippet` is true, then the segmenter will parse as if it's being
- /// given an isolated piece of syntax. This means that, for example, that
- /// it will interpret `-` or `+` at the beginning of the syntax as an
- /// operator token or (if followed by a digit) as part of a number.
- pub fn new(mode: Mode, is_snippet: bool) -> Self {
- Self {
- state: if is_snippet {
- (State::General, Substate::empty())
- } else {
- (State::Shbang, Substate::empty())
- },
- mode,
- nest: 0,
- }
- }
-
- pub fn mode(&self) -> Mode {
- self.mode
- }
-
- fn start_of_line(&self) -> bool {
- self.state.1.contains(Substate::START_OF_LINE)
- }
-
- fn start_of_command(&self) -> bool {
- self.state.1.contains(Substate::START_OF_COMMAND)
- }
-
- /// Returns the style of command prompt to display to an interactive user
- /// for input in the current state.. The return value is most accurate in
- /// mode `Mode::Interactive` and at the beginning of a line (that is, if
- /// [`Segmenter::push`] consumed as much as possible of the input up to a
- /// new-line).
- pub fn prompt(&self) -> PromptStyle {
- match self.state.0 {
- State::Shbang => PromptStyle::First,
- State::General => {
- if self.start_of_command() {
- PromptStyle::First
- } else {
- PromptStyle::Later
- }
- }
- State::Comment1 | State::Comment2 => PromptStyle::Comment,
- State::Document1 | State::Document2 => PromptStyle::Document,
- State::Document3 => PromptStyle::First,
- State::FileLabel1 => PromptStyle::Later,
- State::FileLabel2 | State::FileLabel3 => PromptStyle::First,
- State::DoRepeat1 | State::DoRepeat2 => {
- if self.start_of_command() {
- PromptStyle::First
- } else {
- PromptStyle::Later
- }
- }
- State::DoRepeat3 => PromptStyle::DoRepeat,
- State::DoRepeat4 => PromptStyle::DoRepeat,
- State::Define1 | State::Define2 | State::Define3 => {
- if self.start_of_command() {
- PromptStyle::First
- } else {
- PromptStyle::Later
- }
- }
- State::Define4 | State::Define5 | State::Define6 => PromptStyle::Define,
- State::BeginData1 => PromptStyle::First,
- State::BeginData2 => PromptStyle::Later,
- State::BeginData3 | State::BeginData4 => PromptStyle::Data,
- }
- }
-
- /// Attempts to label a prefix of the remaining input with a segment type.
- /// The caller supplies a prefix of the remaining input as `input`. If
- /// `eof` is true, then `input` is the entire (remainder) of the input; if
- /// `eof` is false, then further input is potentially available.
- ///
- /// The input may contain '\n' or '\r\n' line ends in any combination.
- ///
- /// If successful, returns `Ok((n, type))`, where `n` is the number of bytes
- /// in the segment at the beginning of `input` (a number in
- /// `0..=input.len()`) and the type of that segment. The next call should
- /// not include those bytes in `input`, because they have (figuratively)
- /// been consumed by the segmenter.
- ///
- /// Segments can have zero length, including segment types `Type::End`,
- /// `Type::SeparateCommands`, `Type::StartDocument`, `Type::InlineData`, and
- /// `Type::Spaces`.
- ///
- /// Failure occurs only if the segment type of the bytes in `input` cannot
- /// yet be determined. In this case, this function returns `Err(Incomplete)`. If
- /// more input is available, the caller should obtain some more, then call
- /// again with a longer `input`. If this is not enough, the process might
- /// need to repeat again and again. If input is exhausted, then the caller
- /// may call again setting `eof` to true. This function will never return
- /// `Err(Incomplete)` when `eof` is true.
- ///
- /// The caller must not, in a sequence of calls, supply contradictory input.
- /// That is, bytes provided as part of `input` in one call, but not
- /// consumed, must not be provided with *different* values on subsequent
- /// calls. This is because the function must often make decisions based on
- /// looking ahead beyond the bytes that it consumes.
- fn push_rest<'a>(
- &mut self,
- input: &'a str,
- eof: bool,
- ) -> Result<(&'a str, Segment), Incomplete> {
- if input.is_empty() {
- if eof {
- return Ok((input, Segment::End));
- } else {
- return Err(Incomplete);
- };
- }
-
- match self.state.0 {
- State::Shbang => return self.parse_shbang(input, eof),
- State::General => {
- if self.start_of_line() {
- self.parse_start_of_line(input, eof)
- } else {
- self.parse_mid_line(input, eof)
- }
- }
- State::Comment1 => self.parse_comment_1(input, eof),
- State::Comment2 => self.parse_comment_2(input, eof),
- State::Document1 => self.parse_document_1(input, eof),
- State::Document2 => self.parse_document_2(input, eof),
- State::Document3 => self.parse_document_3(input, eof),
- State::FileLabel1 => self.parse_file_label_1(input, eof),
- State::FileLabel2 => self.parse_file_label_2(input, eof),
- State::FileLabel3 => self.parse_file_label_3(input, eof),
- State::DoRepeat1 => self.parse_do_repeat_1(input, eof),
- State::DoRepeat2 => self.parse_do_repeat_2(input, eof),
- State::DoRepeat3 => self.parse_do_repeat_3(input, eof),
- State::DoRepeat4 => self.parse_do_repeat_4(input),
- State::Define1 => self.parse_define_1_2(input, eof),
- State::Define2 => self.parse_define_1_2(input, eof),
- State::Define3 => self.parse_define_3(input, eof),
- State::Define4 => self.parse_define_4_5(input, eof),
- State::Define5 => self.parse_define_4_5(input, eof),
- State::Define6 => self.parse_define_6(input, eof),
- State::BeginData1 => self.parse_begin_data_1(input, eof),
- State::BeginData2 => self.parse_begin_data_2(input, eof),
- State::BeginData3 => self.parse_begin_data_3(input, eof),
- State::BeginData4 => self.parse_begin_data_4(input, eof),
- }
- }
-
- pub fn push(&mut self, input: &str, eof: bool) -> Result<(usize, Segment), Incomplete> {
- let (rest, seg_type) = self.push_rest(input, eof)?;
- Ok((input.len() - rest.len(), seg_type))
- }
-}
-
-#[derive(Copy, Clone, Debug, PartialEq, Eq)]
-enum State {
- Shbang,
- General,
- Comment1,
- Comment2,
- Document1,
- Document2,
- Document3,
- FileLabel1,
- FileLabel2,
- FileLabel3,
- DoRepeat1,
- DoRepeat2,
- DoRepeat3,
- DoRepeat4,
- Define1,
- Define2,
- Define3,
- Define4,
- Define5,
- Define6,
- BeginData1,
- BeginData2,
- BeginData3,
- BeginData4,
-}
-
-fn take(input: &str, eof: bool) -> Result<(Option<char>, &str), Incomplete> {
- let mut iter = input.chars();
- match iter.next() {
- None if !eof => Err(Incomplete),
- c => Ok((c, iter.as_str())),
- }
-}
-
-fn skip_comment(mut input: &str, eof: bool) -> Result<&str, Incomplete> {
- loop {
- let (Some(c), rest) = take(input, eof)? else {
- return Ok(input);
- };
- match c {
- '\n' | '\r' if is_end_of_line(input, eof)? => return Ok(input),
- '*' => {
- if let (Some('/'), rest) = take(rest, eof)? {
- return Ok(rest);
- }
- }
- _ => (),
- };
- input = rest;
- }
-}
-
-fn skip_matching<F>(f: F, input: &str, eof: bool) -> Result<&str, Incomplete>
-where
- F: Fn(char) -> bool,
-{
- let input = input.trim_start_matches(f);
- if input.is_empty() && !eof {
- Err(Incomplete)
- } else {
- Ok(input)
- }
-}
-
-fn match_char<F>(f: F, input: &str, eof: bool) -> Result<Option<&str>, Incomplete>
-where
- F: Fn(char) -> bool,
-{
- if let (Some(c), rest) = take(input, eof)? {
- if f(c) {
- return Ok(Some(rest));
- }
- }
- Ok(None)
-}
-
-fn skip_spaces(mut input: &str, eof: bool) -> Result<&str, Incomplete> {
- loop {
- let (Some(c), rest) = take(input, eof)? else {
- return Ok(input);
- };
- match c {
- '\r' | '\n' if is_end_of_line(input, eof)? => return Ok(input),
- c if c.is_whitespace() => (),
- _ => return Ok(input),
- }
- input = rest;
- }
-}
-
-fn skip_digits(input: &str, eof: bool) -> Result<&str, Incomplete> {
- skip_matching(|c| c.is_ascii_digit(), input, eof)
-}
-
-fn skip_spaces_and_comments(mut input: &str, eof: bool) -> Result<&str, Incomplete> {
- loop {
- let (Some(c), rest) = take(input, eof)? else {
- return Ok(input);
- };
- match c {
- '/' => {
- let (c, rest2) = take(rest, eof)?;
- match c {
- Some('*') => input = skip_comment(rest2, eof)?,
- Some(_) | None => return Ok(rest),
- }
- }
- '\r' | '\n' if is_end_of_line(input, eof)? => return Ok(input),
- c if c.is_whitespace() => input = rest,
- _ => return Ok(input),
- };
- }
-}
-
-fn is_start_of_string(input: &str, eof: bool) -> Result<bool, Incomplete> {
- let (Some(c), rest) = take(input, eof)? else {
- return Ok(false);
- };
- match c {
- 'x' | 'X' | 'u' | 'U' => {
- let (c, _rest) = take(rest, eof)?;
- Ok(c == Some('\'') || c == Some('"'))
- }
- '\'' | '"' => Ok(true),
- '\n' | '\r' if is_end_of_line(input, eof)? => Ok(true),
- _ => Ok(false),
- }
-}
-
-fn is_end_of_line(input: &str, eof: bool) -> Result<bool, Incomplete> {
- let (Some(c), rest) = take(input, eof)? else {
- return Ok(true);
- };
- Ok(match c {
- '\n' => true,
- '\r' => take(rest, eof)?.0 == Some('\n'),
- _ => false,
- })
-}
-
-fn at_end_of_line(input: &str, eof: bool) -> Result<bool, Incomplete> {
- is_end_of_line(skip_spaces_and_comments(input, eof)?, eof)
-}
-
-fn first(s: &str) -> char {
- s.chars().next().unwrap()
-}
-fn get_command_name_candidates(target: &str) -> &[&'static str] {
- if target.is_empty() {
- return &[];
- }
- let target_first = first(target).to_ascii_uppercase();
- let low = COMMAND_NAMES.partition_point(|s| first(s) < target_first);
- let high = COMMAND_NAMES.partition_point(|s| first(s) <= target_first);
- &COMMAND_NAMES[low..high]
-}
-
-fn detect_command_name(input: &str, eof: bool) -> Result<bool, Incomplete> {
- let command_name = input
- .split(|c: char| {
- !((c.is_whitespace() && c != '\n') || (c.may_continue_id() && c != '.') || c == '-')
- })
- .next()
- .unwrap();
- if !eof && command_name.len() == input.len() {
- return Err(Incomplete);
- }
- let command_name = command_name.trim_end_matches(|c: char| c.is_whitespace() || c == '.');
- for command in get_command_name_candidates(command_name) {
- if let Some(m) = command_match(command, command_name) {
- if m.missing_words <= 0 {
- return Ok(true);
- }
- }
- }
- Ok(false)
-}
-
-impl Segmenter {
- fn parse_shbang<'a>(
- &mut self,
- input: &'a str,
- eof: bool,
- ) -> Result<(&'a str, Segment), Incomplete> {
- if let (Some('#'), rest) = take(input, eof)? {
- if let (Some('!'), rest) = take(rest, eof)? {
- let rest = self.parse_full_line(rest, eof)?;
- self.state = (State::General, Substate::START_OF_COMMAND);
- return Ok((rest, Segment::Shbang));
- }
- }
-
- self.state = (
- State::General,
- Substate::START_OF_COMMAND | Substate::START_OF_LINE,
- );
- self.push_rest(input, eof)
- }
- fn at_command_start(&self, input: &str, eof: bool) -> Result<bool, Incomplete> {
- match self.mode {
- Mode::Auto => detect_command_name(input, eof),
- Mode::Interactive => Ok(false),
- Mode::Batch => Ok(true),
- }
- }
- fn parse_start_of_line<'a>(
- &mut self,
- input: &'a str,
- eof: bool,
- ) -> Result<(&'a str, Segment), Incomplete> {
- debug_assert_eq!(self.state.0, State::General);
- debug_assert!(self.start_of_line());
- debug_assert!(!input.is_empty());
-
- let (Some(c), rest) = take(input, eof).unwrap() else {
- unreachable!()
- };
- match c {
- '+' if is_start_of_string(skip_spaces_and_comments(rest, eof)?, eof)? => {
- // This `+` is punctuation that may separate pieces of a string.
- self.state = (State::General, Substate::empty());
- return Ok((rest, Segment::Punct));
- }
- '+' | '-' | '.' => {
- self.state = (State::General, Substate::START_OF_COMMAND);
- return Ok((rest, Segment::StartCommand));
- }
- _ if c.is_whitespace() => {
- if at_end_of_line(input, eof)? {
- self.state = (State::General, Substate::START_OF_COMMAND);
- return Ok((input, Segment::SeparateCommands));
- }
- }
- _ => {
- if self.at_command_start(input, eof)?
- && !self.state.1.contains(Substate::START_OF_COMMAND)
- {
- self.state = (State::General, Substate::START_OF_COMMAND);
- return Ok((input, Segment::StartCommand));
- }
- }
- }
- self.state.1 = Substate::START_OF_COMMAND;
- self.parse_mid_line(input, eof)
- }
- fn parse_mid_line<'a>(
- &mut self,
- input: &'a str,
- eof: bool,
- ) -> Result<(&'a str, Segment), Incomplete> {
- debug_assert!(self.state.0 == State::General);
- debug_assert!(!self.state.1.contains(Substate::START_OF_LINE));
- let (Some(c), rest) = take(input, eof)? else {
- unreachable!()
- };
- match c {
- '\r' | '\n' if is_end_of_line(input, eof)? => {
- self.state.1 |= Substate::START_OF_LINE;
- Ok((
- self.parse_newline(input, eof).unwrap().unwrap(),
- Segment::Newline,
- ))
- }
- '/' => {
- if let (Some('*'), rest) = take(rest, eof)? {
- let rest = skip_comment(rest, eof)?;
- return Ok((rest, Segment::Comment));
- } else {
- self.state.1 = Substate::empty();
- return Ok((rest, Segment::Punct));
- }
- }
- '-' => {
- let (c, rest2) = take(skip_spaces(rest, eof)?, eof)?;
- match c {
- Some(c) if c.is_ascii_digit() => {
- return self.parse_number(rest, eof);
- }
- Some('.') => {
- if let (Some(c), _rest) = take(rest2, eof)? {
- if c.is_ascii_digit() {
- return self.parse_number(rest, eof);
- }
- }
- }
- None | Some(_) => (),
- }
- self.state.1 = Substate::empty();
- return Ok((rest, Segment::Punct));
- }
- '(' | ')' | '[' | ']' | '{' | '}' | ',' | '=' | ';' | ':' | '&' | '|' | '+' => {
- self.state.1 = Substate::empty();
- return Ok((rest, Segment::Punct));
- }
- '*' => {
- if self.state.1.contains(Substate::START_OF_COMMAND) {
- self.state = (State::Comment1, Substate::empty());
- self.parse_comment_1(input, eof)
- } else {
- self.parse_digraph(&['*'], rest, eof)
- }
- }
- '<' => self.parse_digraph(&['=', '>'], rest, eof),
- '>' => self.parse_digraph(&['='], rest, eof),
- '~' => self.parse_digraph(&['='], rest, eof),
- '.' if at_end_of_line(rest, eof)? => {
- self.state.1 = Substate::START_OF_COMMAND;
- Ok((rest, Segment::EndCommand))
- }
- '.' => match take(rest, eof)? {
- (Some(c), _) if c.is_ascii_digit() => self.parse_number(input, eof),
- _ => Ok((rest, Segment::Punct)),
- },
- '0'..='9' => self.parse_number(input, eof),
- 'u' | 'U' => self.maybe_parse_string(Segment::UnicodeString, (input, rest), eof),
- 'x' | 'X' => self.maybe_parse_string(Segment::HexString, (input, rest), eof),
- '\'' | '"' => self.parse_string(Segment::QuotedString, c, rest, eof),
- '!' => {
- let (c, rest2) = take(rest, eof)?;
- match c {
- Some('*') => Ok((rest2, Segment::Punct)),
- Some(_) => self.parse_id(input, eof),
- None => Ok((rest, Segment::Punct)),
- }
- }
- c if c.is_whitespace() => Ok((skip_spaces(rest, eof)?, Segment::Spaces)),
- c if c.may_start_id() => self.parse_id(input, eof),
- '#'..='~' if c != '\\' && c != '^' => {
- self.state.1 = Substate::empty();
- Ok((rest, Segment::Punct))
- }
- _ => {
- self.state.1 = Substate::empty();
- Ok((rest, Segment::UnexpectedChar))
- }
- }
- }
- fn parse_string<'a>(
- &mut self,
- segment: Segment,
- quote: char,
- mut input: &'a str,
- eof: bool,
- ) -> Result<(&'a str, Segment), Incomplete> {
- while let (Some(c), rest) = take(input, eof)? {
- match c {
- _ if c == quote => {
- let (c, rest2) = take(rest, eof)?;
- if c != Some(quote) {
- self.state.1 = Substate::empty();
- return Ok((rest, segment));
- }
- input = rest2;
- }
- '\r' | '\n' if is_end_of_line(input, eof)? => break,
- _ => input = rest,
- }
- }
- self.state.1 = Substate::empty();
- Ok((input, Segment::ExpectedQuote))
- }
- fn maybe_parse_string<'a>(
- &mut self,
- segment: Segment,
- input: (&'a str, &'a str),
- eof: bool,
- ) -> Result<(&'a str, Segment), Incomplete> {
- match take(input.1, eof)? {
- (Some(c), rest) if c == '\'' || c == '"' => self.parse_string(segment, c, rest, eof),
- _ => self.parse_id(input.0, eof),
- }
- }
- fn next_id_in_command<'a>(
- &self,
- mut input: &'a str,
- eof: bool,
- ) -> Result<(&'a str, &'a str), Incomplete> {
- let mut sub = Segmenter::new(self.mode, true);
- loop {
- let (seg_len, seg_type) = sub.push(input, eof)?;
- let (segment, rest) = input.split_at(seg_len);
- match seg_type {
- Segment::Shbang | Segment::Spaces | Segment::Comment | Segment::Newline => (),
-
- Segment::Identifier => return Ok((segment, rest)),
-
- Segment::Number
- | Segment::QuotedString
- | Segment::HexString
- | Segment::UnicodeString
- | Segment::UnquotedString
- | Segment::Punct
- | Segment::CommentCommand
- | Segment::DoRepeatCommand
- | Segment::DoRepeatOverflow
- | Segment::InlineData
- | Segment::MacroName
- | Segment::MacroBody
- | Segment::StartDocument
- | Segment::Document
- | Segment::StartCommand
- | Segment::SeparateCommands
- | Segment::EndCommand
- | Segment::End
- | Segment::ExpectedQuote
- | Segment::ExpectedExponent
- | Segment::UnexpectedChar => return Ok(("", rest)),
- }
- input = rest;
- }
- }
- fn parse_id<'a>(
- &mut self,
- input: &'a str,
- eof: bool,
- ) -> Result<(&'a str, Segment), Incomplete> {
- let (Some(_), mut end) = take(input, eof).unwrap() else {
- unreachable!()
- };
- while let (Some(c), rest) = take(end, eof)? {
- if !c.may_continue_id() {
- break;
- };
- end = rest;
- }
- let identifier = &input[..input.len() - end.len()];
- let identifier = match identifier.strip_suffix('.') {
- Some(without_dot) if at_end_of_line(end, eof)? => without_dot,
- _ => identifier,
- };
- let rest = &input[identifier.len()..];
-
- if self.state.1.contains(Substate::START_OF_COMMAND) {
- if id_match_n("COMMENT", identifier, 4) {
- self.state = (State::Comment1, Substate::empty());
- return self.parse_comment_1(input, eof);
- } else if id_match("DOCUMENT", identifier) {
- self.state = (State::Document1, Substate::empty());
- return Ok((input, Segment::StartDocument));
- } else if id_match_n("DEFINE", identifier, 6) {
- self.state = (State::Define1, Substate::empty());
- } else if id_match("FILE", identifier) {
- if id_match("LABEL", self.next_id_in_command(rest, eof)?.0) {
- self.state = (State::FileLabel1, Substate::empty());
- return Ok((rest, Segment::Identifier));
- }
- } else if id_match("DO", identifier) {
- if id_match("REPEAT", self.next_id_in_command(rest, eof)?.0) {
- self.state = (State::DoRepeat1, Substate::empty());
- return Ok((rest, Segment::Identifier));
- }
- } else if id_match("BEGIN", identifier) {
- let (next_id, rest2) = self.next_id_in_command(rest, eof)?;
- if id_match("DATA", next_id) {
- let rest2 = skip_spaces_and_comments(rest2, eof)?;
- let rest2 = if let Some(s) = rest2.strip_prefix('.') {
- skip_spaces_and_comments(s, eof)?
- } else {
- rest2
- };
- if is_end_of_line(rest2, eof)? {
- let s = &input[..input.len() - rest2.len()];
- self.state = (
- if s.contains('\n') {
- State::BeginData1
- } else {
- State::BeginData2
- },
- Substate::empty(),
- );
- return Ok((rest, Segment::Identifier));
- }
- }
- }
- }
-
- self.state.1 = Substate::empty();
- Ok((
- rest,
- if identifier != "!" {
- Segment::Identifier
- } else {
- Segment::Punct
- },
- ))
- }
- fn parse_digraph<'a>(
- &mut self,
- seconds: &[char],
- input: &'a str,
- eof: bool,
- ) -> Result<(&'a str, Segment), Incomplete> {
- let (c, rest) = take(input, eof)?;
- self.state.1 = Substate::empty();
- Ok((
- match c {
- Some(c) if seconds.contains(&c) => rest,
- _ => input,
- },
- Segment::Punct,
- ))
- }
- fn parse_number<'a>(
- &mut self,
- input: &'a str,
- eof: bool,
- ) -> Result<(&'a str, Segment), Incomplete> {
- let mut input = skip_digits(input, eof)?;
- if let Some(rest) = match_char(|c| c == '.', input, eof)? {
- let rest2 = skip_digits(rest, eof)?;
- if rest2.len() < rest.len() || !at_end_of_line(rest2, eof)? {
- input = rest2;
- }
- };
- if let Some(rest) = match_char(|c| c == 'e' || c == 'E', input, eof)? {
- let rest = match_char(|c| c == '+' || c == '-', rest, eof)?.unwrap_or(rest);
- let rest2 = skip_digits(rest, eof)?;
- if rest2.len() == rest.len() {
- self.state.1 = Substate::empty();
- return Ok((rest, Segment::ExpectedExponent));
- }
- input = rest2;
- }
- self.state.1 = Substate::empty();
- Ok((input, Segment::Number))
- }
- fn parse_comment_1<'a>(
- &mut self,
- mut input: &'a str,
- eof: bool,
- ) -> Result<(&'a str, Segment), Incomplete> {
- enum CommentState<'a> {
- Blank,
- NotBlank,
- Period(&'a str),
- }
- let mut state = CommentState::Blank;
- loop {
- let (Some(c), rest) = take(input, eof)? else {
- // End of file.
- self.state = (State::General, Substate::START_OF_COMMAND);
- return Ok((input, Segment::SeparateCommands));
- };
- match c {
- '.' => state = CommentState::Period(input),
- '\n' | '\r' if is_end_of_line(input, eof)? => {
- match state {
- CommentState::Blank => {
- // Blank line ends comment command.
- self.state = (State::General, Substate::START_OF_COMMAND);
- return Ok((input, Segment::SeparateCommands));
- }
- CommentState::Period(period) => {
- // '.' at end of line ends comment command.
- self.state = (State::General, Substate::empty());
- return Ok((period, Segment::CommentCommand));
- }
- CommentState::NotBlank => {
- // Comment continues onto next line.
- self.state = (State::Comment2, Substate::empty());
- return Ok((input, Segment::CommentCommand));
- }
- }
- }
- c if c.is_whitespace() => (),
- _ => state = CommentState::NotBlank,
- }
- input = rest;
- }
- }
- fn parse_comment_2<'a>(
- &mut self,
- input: &'a str,
- eof: bool,
- ) -> Result<(&'a str, Segment), Incomplete> {
- let rest = self.parse_newline(input, eof)?.unwrap();
-
- let new_command = match take(rest, eof)?.0 {
- Some('+') | Some('-') | Some('.') => true,
- Some(c) if !c.is_whitespace() => self.at_command_start(rest, eof)?,
- None | Some(_) => false,
- };
- if new_command {
- self.state = (
- State::General,
- Substate::START_OF_LINE | Substate::START_OF_COMMAND,
- );
- } else {
- self.state = (State::Comment1, Substate::empty());
- }
- Ok((rest, Segment::Newline))
- }
- fn parse_document_1<'a>(
- &mut self,
- mut input: &'a str,
- eof: bool,
- ) -> Result<(&'a str, Segment), Incomplete> {
- let mut end_cmd = false;
- loop {
- let (Some(c), rest) = take(input, eof)? else {
- self.state = (State::Document3, Substate::empty());
- return Ok((input, Segment::Document));
- };
- match c {
- '.' => end_cmd = true,
- '\n' | '\r' if is_end_of_line(input, eof)? => {
- self.state.0 = if end_cmd {
- State::Document3
- } else {
- State::Document2
- };
- return Ok((input, Segment::Document));
- }
- c if !c.is_whitespace() => end_cmd = false,
- _ => (),
- }
- input = rest;
- }
- }
- fn parse_document_2<'a>(
- &mut self,
- input: &'a str,
- eof: bool,
- ) -> Result<(&'a str, Segment), Incomplete> {
- let rest = self.parse_newline(input, eof)?.unwrap();
- self.state = (State::Document1, Substate::empty());
- Ok((rest, Segment::Newline))
- }
- fn parse_document_3<'a>(
- &mut self,
- input: &'a str,
- _eof: bool,
- ) -> Result<(&'a str, Segment), Incomplete> {
- self.state = (
- State::General,
- Substate::START_OF_COMMAND | Substate::START_OF_LINE,
- );
- Ok((input, Segment::EndCommand))
- }
- fn quoted_file_label(input: &str, eof: bool) -> Result<bool, Incomplete> {
- let input = skip_spaces_and_comments(input, eof)?;
- match take(input, eof)?.0 {
- Some('\'') | Some('"') | Some('\n') => Ok(true),
- _ => Ok(false),
- }
- }
- fn parse_file_label_1<'a>(
- &mut self,
- input: &'a str,
- eof: bool,
- ) -> Result<(&'a str, Segment), Incomplete> {
- let mut sub = Segmenter {
- state: (State::General, self.state.1),
- ..*self
- };
- let (rest, segment) = sub.push_rest(input, eof)?;
- if segment == Segment::Identifier {
- let id = &input[..input.len() - rest.len()];
- debug_assert!(id_match("LABEL", id), "{id} should be LABEL");
- if Self::quoted_file_label(rest, eof)? {
- *self = sub;
- } else {
- self.state.0 = State::FileLabel2;
- }
- } else {
- self.state.1 = sub.state.1;
- }
- Ok((rest, segment))
- }
- fn parse_file_label_2<'a>(
- &mut self,
- input: &'a str,
- eof: bool,
- ) -> Result<(&'a str, Segment), Incomplete> {
- let input = skip_spaces(input, eof)?;
- self.state = (State::FileLabel3, Substate::empty());
- Ok((input, Segment::Spaces))
- }
- fn parse_file_label_3<'a>(
- &mut self,
- mut input: &'a str,
- eof: bool,
- ) -> Result<(&'a str, Segment), Incomplete> {
- let mut end_cmd = None;
- loop {
- let (c, rest) = take(input, eof)?;
- match c {
- None | Some('\n') | Some('\r') if is_end_of_line(input, eof)? => {
- self.state = (State::General, Substate::empty());
- return Ok((end_cmd.unwrap_or(input), Segment::UnquotedString));
- }
- None => unreachable!(),
- Some('.') => end_cmd = Some(input),
- Some(c) if !c.is_whitespace() => end_cmd = None,
- Some(_) => (),
- }
- input = rest;
- }
- }
- fn subparse<'a>(
- &mut self,
- input: &'a str,
- eof: bool,
- ) -> Result<(&'a str, Segment), Incomplete> {
- let mut sub = Segmenter {
- mode: self.mode,
- state: (State::General, self.state.1),
- nest: 0,
- };
- let result = sub.push_rest(input, eof)?;
- self.state.1 = sub.state.1;
- Ok(result)
- }
- /// We are segmenting a `DO REPEAT` command, currently reading the syntax
- /// that defines the stand-in variables (the head) before the lines of
- /// syntax to be repeated (the body).
- fn parse_do_repeat_1<'a>(
- &mut self,
- input: &'a str,
- eof: bool,
- ) -> Result<(&'a str, Segment), Incomplete> {
- let (rest, segment) = self.subparse(input, eof)?;
- if segment == Segment::SeparateCommands {
- // We reached a blank line that separates the head from the body.
- self.state.0 = State::DoRepeat2;
- } else if segment == Segment::EndCommand || segment == Segment::StartCommand {
- // We reached the body.
- self.state.0 = State::DoRepeat3;
- self.nest = 1;
- }
- Ok((rest, segment))
- }
- /// We are segmenting a `DO REPEAT` command, currently reading a blank line
- /// that separates the head from the body.
- fn parse_do_repeat_2<'a>(
- &mut self,
- input: &'a str,
- eof: bool,
- ) -> Result<(&'a str, Segment), Incomplete> {
- let (rest, segment) = self.subparse(input, eof)?;
- if segment == Segment::Newline {
- // We reached the body.
- self.state.0 = State::DoRepeat3;
- self.nest = 1;
- }
- Ok((rest, segment))
- }
- fn parse_newline<'a>(
- &mut self,
- input: &'a str,
- eof: bool,
- ) -> Result<Option<&'a str>, Incomplete> {
- let (Some(c), rest) = take(input, eof)? else {
- return Ok(None);
- };
- match c {
- '\n' => Ok(Some(rest)),
- '\r' => {
- if let (Some('\n'), rest) = take(rest, eof)? {
- Ok(Some(rest))
- } else {
- Ok(None)
- }
- }
- _ => Ok(None),
- }
- }
-
- fn parse_full_line<'a>(
- &mut self,
- mut input: &'a str,
- eof: bool,
- ) -> Result<&'a str, Incomplete> {
- loop {
- if is_end_of_line(input, eof)? {
- return Ok(input);
- }
- input = take(input, eof).unwrap().1;
- }
- }
- fn check_repeat_command<'a>(&mut self, input: &'a str, eof: bool) -> Result<isize, Incomplete> {
- let input = input.strip_prefix(&['-', '+']).unwrap_or(input);
- let (id1, input) = self.next_id_in_command(input, eof)?;
- if id_match("DO", id1) && id_match("REPEAT", self.next_id_in_command(input, eof)?.0) {
- Ok(1)
- } else if id_match("END", id1) && id_match("REPEAT", self.next_id_in_command(input, eof)?.0)
- {
- Ok(-1)
- } else {
- Ok(0)
- }
- }
- /// We are in the body of `DO REPEAT`, segmenting the lines of syntax that
- /// are to be repeated. Report each line of syntax as a single
- /// [`Type::DoRepeatCommand`].
- ///
- /// `DO REPEAT` can be nested, so we look for `DO REPEAT...END REPEAT`
- /// blocks inside the lines we're segmenting. `self.nest` counts the
- /// nesting level, starting at 1.
- fn parse_do_repeat_3<'a>(
- &mut self,
- input: &'a str,
- eof: bool,
- ) -> Result<(&'a str, Segment), Incomplete> {
- if let Some(rest) = self.parse_newline(input, eof)? {
- return Ok((rest, Segment::Newline));
- }
- let rest = self.parse_full_line(input, eof)?;
- let direction = self.check_repeat_command(input, eof)?;
- if direction > 0 {
- if let Some(nest) = self.nest.checked_add(1) {
- self.nest = nest;
- } else {
- self.state.0 = State::DoRepeat4;
- }
- } else if direction < 0 {
- self.nest -= 1;
- if self.nest == 0 {
- // Nesting level dropped to 0, so we've finished reading the `DO
- // REPEAT` body.
- self.state = (
- State::General,
- Substate::START_OF_COMMAND | Substate::START_OF_LINE,
- );
- return self.push_rest(input, eof);
- }
- }
- return Ok((rest, Segment::DoRepeatCommand));
- }
- fn parse_do_repeat_4<'a>(&mut self, input: &'a str) -> Result<(&'a str, Segment), Incomplete> {
- self.state.0 = State::DoRepeat3;
- Ok((input, Segment::DoRepeatOverflow))
- }
- /// We are segmenting a `DEFINE` command, which consists of:
- ///
- /// - The `DEFINE` keyword.
- ///
- /// - An identifier. We transform this into `Type::MacroName` instead of
- /// `Type::Identifier` because this identifier must never be macro-expanded.
- ///
- /// - Anything but `(`.
- ///
- /// - `(` followed by a sequence of tokens possibly including balanced
- /// parentheses up to a final `)`.
- ///
- /// - A sequence of any number of lines, one string per line, ending with
- /// `!ENDDEFINE`. The first line is usually blank (that is, a newline
- /// follows the `(`). The last line usually just has `!ENDDEFINE.` on
- /// it, but it can start with other tokens. The whole
- /// DEFINE...!ENDDEFINE can be on a single line, even.
- fn parse_define_1_2<'a>(
- &mut self,
- input: &'a str,
- eof: bool,
- ) -> Result<(&'a str, Segment), Incomplete> {
- let (rest, segment) = self.subparse(input, eof)?;
- match segment {
- Segment::Identifier if self.state.0 == State::Define1 => {
- self.state.0 = State::Define2;
- return Ok((rest, Segment::MacroName));
- }
- Segment::SeparateCommands | Segment::EndCommand | Segment::StartCommand => {
- // The DEFINE command is malformed because we reached its end
- // without ever hitting a `(` token. Transition back to general
- // parsing.
- self.state.0 = State::General;
- }
- Segment::Punct if input.starts_with('(') => {
- self.state.0 = State::Define3;
- self.nest = 1;
- }
- _ => (),
- }
- Ok((rest, segment))
- }
- fn parse_define_3<'a>(
- &mut self,
- input: &'a str,
- eof: bool,
- ) -> Result<(&'a str, Segment), Incomplete> {
- let (rest, segment) = self.subparse(input, eof)?;
- match segment {
- Segment::SeparateCommands | Segment::EndCommand | Segment::StartCommand => {
- // The DEFINE command is malformed because we reached its end
- // without ever hitting a `(` token. Transition back to general
- // parsing.
- self.state.0 = State::General;
- }
- Segment::Punct if input.starts_with('(') => {
- self.nest += 1;
- }
- Segment::Punct if input.starts_with(')') => {
- self.nest -= 1;
- if self.nest == 0 {
- self.state = (State::Define4, Substate::empty());
- }
- }
- _ => (),
- }
- Ok((rest, segment))
- }
- fn find_enddefine<'a>(mut input: &'a str) -> Option<&'a str> {
- loop {
- input = skip_spaces_and_comments(input, true).unwrap();
- let (Some(c), rest) = take(input, true).unwrap() else {
- return None;
- };
- match c {
- '!' if strip_prefix_ignore_ascii_case(input, "!ENDDEFINE").is_some() => {
- return Some(input)
- }
- '\'' | '"' => {
- let index = rest.find(c)?;
- input = &rest[index + 1..];
- }
- _ => input = rest,
- }
- }
- }
-
- /// We are in the body of a macro definition, looking for additional lines
- /// of the body or `!ENDDEFINE`.
- ///
- /// In `State::Define4`, we're parsing the first line of the macro body (the
- /// same line as the closing parenthesis in the argument definition). In
- /// `State::Define5`, we're on a later line.
- fn parse_define_4_5<'a>(
- &mut self,
- input: &'a str,
- eof: bool,
- ) -> Result<(&'a str, Segment), Incomplete> {
- let rest = self.parse_full_line(input, eof)?;
- let line = &input[..input.len() - rest.len()];
- if let Some(end) = Self::find_enddefine(line) {
- // Macro ends at the !ENDDEFINE on this line.
- self.state = (State::General, Substate::empty());
- let (prefix, rest) = input.split_at(line.len() - end.len());
- if prefix.is_empty() {
- // Line starts with `!ENDDEFINE`.
- self.push_rest(input, eof)
- } else if prefix.trim_start().is_empty() {
- // Line starts with spaces followed by `!ENDDEFINE`.
- Ok((rest, Segment::Spaces))
- } else {
- // Line starts with some content followed by `!ENDDEFINE`.
- Ok((rest, Segment::MacroBody))
- }
- } else {
- // No `!ENDDEFINE`. We have a full line of macro body.
- //
- // If the first line of the macro body is blank, we just report it
- // as spaces, or not at all if there are no spaces, because it's not
- // significant.
- //
- // However, if it's a later line, we need to report it because blank
- // lines can have significance.
- let segment = if self.state.0 == State::Define4 && line.trim_start().is_empty() {
- if line.is_empty() {
- return self.parse_define_6(input, eof);
- }
- Segment::Spaces
- } else {
- Segment::MacroBody
- };
- self.state.0 = State::Define6;
- Ok((rest, segment))
- }
- }
- fn parse_define_6<'a>(
- &mut self,
- input: &'a str,
- eof: bool,
- ) -> Result<(&'a str, Segment), Incomplete> {
- let rest = self.parse_newline(input, eof)?.unwrap();
- self.state.0 = State::Define5;
- Ok((rest, Segment::Newline))
- }
- fn parse_begin_data_1<'a>(
- &mut self,
- input: &'a str,
- eof: bool,
- ) -> Result<(&'a str, Segment), Incomplete> {
- let (rest, segment) = self.subparse(input, eof)?;
- if segment == Segment::Newline {
- self.state.0 = State::BeginData2;
- }
- Ok((rest, segment))
- }
- fn parse_begin_data_2<'a>(
- &mut self,
- input: &'a str,
- eof: bool,
- ) -> Result<(&'a str, Segment), Incomplete> {
- let (rest, segment) = self.subparse(input, eof)?;
- if segment == Segment::Newline {
- self.state.0 = State::BeginData3;
- }
- Ok((rest, segment))
- }
- fn is_end_data(line: &str) -> bool {
- let Some(rest) = strip_prefix_ignore_ascii_case(line, "END") else {
- return false;
- };
- let (Some(c), rest) = take(rest, true).unwrap() else {
- return false;
- };
- if !c.is_whitespace() {
- return false;
- };
- let Some(rest) = strip_prefix_ignore_ascii_case(rest, "DATA") else {
- return false;
- };
-
- let mut endcmd = false;
- for c in rest.chars() {
- match c {
- '.' if endcmd => return false,
- '.' => endcmd = true,
- c if c.is_whitespace() => (),
- _ => return false,
- }
- }
- true
- }
- fn parse_begin_data_3<'a>(
- &mut self,
- input: &'a str,
- eof: bool,
- ) -> Result<(&'a str, Segment), Incomplete> {
- let rest = self.parse_full_line(input, eof)?;
- let line = &input[..input.len() - rest.len()];
- if Self::is_end_data(line) {
- self.state = (
- State::General,
- Substate::START_OF_COMMAND | Substate::START_OF_LINE,
- );
- self.push_rest(input, eof)
- } else {
- self.state.0 = State::BeginData4;
- Ok((rest, Segment::InlineData))
- }
- }
- fn parse_begin_data_4<'a>(
- &mut self,
- input: &'a str,
- eof: bool,
- ) -> Result<(&'a str, Segment), Incomplete> {
- let rest = self.parse_newline(input, eof)?.unwrap();
- self.state.0 = State::BeginData3;
- Ok((rest, Segment::Newline))
- }
-}
-
-fn strip_prefix_ignore_ascii_case<'a>(line: &'a str, pattern: &str) -> Option<&'a str> {
- line.get(..pattern.len())
- .map(|prefix| {
- prefix
- .eq_ignore_ascii_case(pattern)
- .then(|| &line[pattern.len()..])
- })
- .flatten()
-}
-
-#[cfg(test)]
-mod test;
+++ /dev/null
-use crate::prompt::PromptStyle;
-
-use super::{Mode, Segment, Segmenter};
-
-fn push_segment<'a>(
- segmenter: &mut Segmenter,
- input: &'a str,
- one_byte: bool,
-) -> (usize, Segment) {
- if one_byte {
- for len in input.char_indices().map(|(pos, _c)| pos) {
- if let Ok(result) = segmenter.push(&input[..len], false) {
- return result;
- }
- }
- }
- segmenter.push(input, true).unwrap()
-}
-
-fn _check_segmentation(
- mut input: &str,
- mode: Mode,
- expect_segments: &[(Segment, &str)],
- expect_prompts: &[PromptStyle],
- one_byte: bool,
-) {
- let mut segments = Vec::with_capacity(expect_segments.len());
- let mut prompts = Vec::new();
- let mut segmenter = Segmenter::new(mode, false);
- loop {
- let (seg_len, seg_type) = push_segment(&mut segmenter, input, one_byte);
- let (token, rest) = input.split_at(seg_len);
- segments.push((seg_type, token));
- match seg_type {
- Segment::End => break,
- Segment::Newline => prompts.push(segmenter.prompt()),
- _ => (),
- }
- input = rest;
- }
-
- if &segments != expect_segments {
- eprintln!("segments differ from expected:");
- let difference = diff::slice(expect_segments, &segments);
- for result in difference {
- match result {
- diff::Result::Left(left) => eprintln!("-{left:?}"),
- diff::Result::Both(left, _right) => eprintln!(" {left:?}"),
- diff::Result::Right(right) => eprintln!("+{right:?}"),
- }
- }
- panic!();
- }
-
- if &prompts != expect_prompts {
- eprintln!("prompts differ from expected:");
- let difference = diff::slice(expect_prompts, &prompts);
- for result in difference {
- match result {
- diff::Result::Left(left) => eprintln!("-{left:?}"),
- diff::Result::Both(left, _right) => eprintln!(" {left:?}"),
- diff::Result::Right(right) => eprintln!("+{right:?}"),
- }
- }
- panic!();
- }
-}
-
-fn check_segmentation(
- input: &str,
- mode: Mode,
- expect_segments: &[(Segment, &str)],
- expect_prompts: &[PromptStyle],
-) {
- for (one_byte, one_byte_name) in [(false, "full-string"), (true, "byte-by-byte")] {
- println!("running {one_byte_name} segmentation test with LF newlines...");
- _check_segmentation(input, mode, expect_segments, expect_prompts, one_byte);
-
- println!("running {one_byte_name} segmentation test with CRLF newlines...");
- _check_segmentation(
- &input.replace('\n', "\r\n"),
- mode,
- &expect_segments
- .iter()
- .map(|(segment, s)| match *segment {
- Segment::Newline => (Segment::Newline, "\r\n"),
- _ => (*segment, *s),
- })
- .collect::<Vec<_>>(),
- expect_prompts,
- one_byte,
- );
-
- if let Some(input) = input.strip_suffix('\n') {
- println!("running {one_byte_name} segmentation test without final newline...");
- let mut expect_segments: Vec<_> = expect_segments.iter().copied().collect();
- assert_eq!(expect_segments.pop(), Some((Segment::End, "")));
- assert_eq!(expect_segments.pop(), Some((Segment::Newline, "\n")));
- while let Some((Segment::SeparateCommands | Segment::EndCommand, "")) =
- expect_segments.last()
- {
- expect_segments.pop();
- }
- expect_segments.push((Segment::End, ""));
- _check_segmentation(
- input,
- mode,
- &expect_segments,
- &expect_prompts[..expect_prompts.len() - 1],
- one_byte,
- );
- }
- }
-}
-
-#[allow(dead_code)]
-fn print_segmentation(mut input: &str) {
- let mut segmenter = Segmenter::new(Mode::Interactive, false);
- loop {
- let (seg_len, seg_type) = segmenter.push(input, true).unwrap();
- let (token, rest) = input.split_at(seg_len);
- print!("{seg_type:?} {token:?}");
- match seg_type {
- Segment::Newline => print!(" ({:?})", segmenter.prompt()),
- Segment::End => break,
- _ => (),
- }
- println!();
- input = rest;
- }
-}
-
-#[test]
-fn test_identifiers() {
- check_segmentation(
- r#"a ab abc abcd !abcd
-A AB ABC ABCD !ABCD
-aB aBC aBcD !aBcD
-$x $y $z !$z
-grève Ângstrom poté
-#a #b #c ## #d !#d
-@efg @ @@. @#@ !@
-## # #12345 #.#
-f@#_.#6
-GhIjK
-.x 1y _z
-!abc abc!
-"#,
- Mode::Auto,
- &[
- (Segment::Identifier, "a"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "ab"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "abc"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "abcd"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "!abcd"),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "A"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "AB"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "ABC"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "ABCD"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "!ABCD"),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "aB"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "aBC"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "aBcD"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "!aBcD"),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "$x"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "$y"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "$z"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "!$z"),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "grève"),
- (Segment::Spaces, "\u{00a0}"),
- (Segment::Identifier, "Ângstrom"),
- (Segment::Spaces, "\u{00a0}"),
- (Segment::Identifier, "poté"),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "#a"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "#b"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "#c"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "##"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "#d"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "!#d"),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "@efg"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "@"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "@@."),
- (Segment::Spaces, " "),
- (Segment::Identifier, "@#@"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "!@"),
- (Segment::Spaces, " "),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "##"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "#"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "#12345"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "#.#"),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "f@#_.#6"),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "GhIjK"),
- (Segment::Newline, "\n"),
- (Segment::StartCommand, "."),
- (Segment::Identifier, "x"),
- (Segment::Spaces, " "),
- (Segment::Number, "1"),
- (Segment::Identifier, "y"),
- (Segment::Spaces, " "),
- (Segment::Punct, "_"),
- (Segment::Identifier, "z"),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "!abc"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "abc"),
- (Segment::Punct, "!"),
- (Segment::Newline, "\n"),
- (Segment::End, ""),
- ],
- &[
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::Later,
- ],
- );
-}
-
-#[test]
-fn test_identifiers_ending_in_dot() {
- check_segmentation(
- r#"abcd. abcd.
-ABCD. ABCD.
-aBcD. aBcD.
-$y. $z. あいうえお.
-#c. #d..
-@@. @@....
-#.#.
-#abcd.
-.
-.
-LMNOP.
-QRSTUV./* end of line comment */
-qrstuv. /* end of line comment */
-QrStUv./* end of line comment */
-wxyz./* unterminated end of line comment
-WXYZ. /* unterminated end of line comment
-WxYz./* unterminated end of line comment
-"#,
- Mode::Auto,
- &[
- (Segment::Identifier, "abcd."),
- (Segment::Spaces, " "),
- (Segment::Identifier, "abcd"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "ABCD."),
- (Segment::Spaces, " "),
- (Segment::Identifier, "ABCD"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "aBcD."),
- (Segment::Spaces, " "),
- (Segment::Identifier, "aBcD"),
- (Segment::EndCommand, "."),
- (Segment::Spaces, " "),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "$y."),
- (Segment::Spaces, " "),
- (Segment::Identifier, "$z."),
- (Segment::Spaces, " "),
- (Segment::Identifier, "あいうえお"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "#c."),
- (Segment::Spaces, " "),
- (Segment::Identifier, "#d."),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "@@."),
- (Segment::Spaces, " "),
- (Segment::Identifier, "@@..."),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "#.#"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "#abcd"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::StartCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::StartCommand, "."),
- (Segment::Spaces, " "),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "LMNOP"),
- (Segment::EndCommand, "."),
- (Segment::Spaces, " "),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "QRSTUV"),
- (Segment::EndCommand, "."),
- (Segment::Comment, "/* end of line comment */"),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "qrstuv"),
- (Segment::EndCommand, "."),
- (Segment::Spaces, " "),
- (Segment::Comment, "/* end of line comment */"),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "QrStUv"),
- (Segment::EndCommand, "."),
- (Segment::Comment, "/* end of line comment */"),
- (Segment::Spaces, " "),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "wxyz"),
- (Segment::EndCommand, "."),
- (Segment::Comment, "/* unterminated end of line comment"),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "WXYZ"),
- (Segment::EndCommand, "."),
- (Segment::Spaces, " "),
- (Segment::Comment, "/* unterminated end of line comment"),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "WxYz"),
- (Segment::EndCommand, "."),
- (Segment::Comment, "/* unterminated end of line comment "),
- (Segment::Newline, "\n"),
- (Segment::End, ""),
- ],
- &[
- PromptStyle::First,
- PromptStyle::First,
- PromptStyle::First,
- PromptStyle::First,
- PromptStyle::First,
- PromptStyle::First,
- PromptStyle::First,
- PromptStyle::First,
- PromptStyle::First,
- PromptStyle::First,
- PromptStyle::First,
- PromptStyle::First,
- PromptStyle::First,
- PromptStyle::First,
- PromptStyle::First,
- PromptStyle::First,
- PromptStyle::First,
- ],
- );
-}
-
-#[test]
-fn test_reserved_words() {
- check_segmentation(
- r#"and or not eq ge gt le lt ne all by to with
-AND OR NOT EQ GE GT LE LT NE ALL BY TO WITH
-andx orx notx eqx gex gtx lex ltx nex allx byx tox withx
-and. with.
-"#,
- Mode::Auto,
- &[
- (Segment::Identifier, "and"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "or"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "not"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "eq"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "ge"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "gt"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "le"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "lt"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "ne"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "all"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "by"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "to"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "with"),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "AND"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "OR"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "NOT"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "EQ"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "GE"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "GT"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "LE"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "LT"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "NE"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "ALL"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "BY"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "TO"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "WITH"),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "andx"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "orx"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "notx"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "eqx"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "gex"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "gtx"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "lex"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "ltx"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "nex"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "allx"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "byx"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "tox"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "withx"),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "and."),
- (Segment::Spaces, " "),
- (Segment::Identifier, "with"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::End, ""),
- ],
- &[
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::First,
- ],
- );
-}
-
-#[test]
-fn test_punctuation() {
- check_segmentation(
- r#"~ & | = >= > <= < ~= <> ( ) , - + * / [ ] **
-~&|=>=><=<~=<>(),-+*/[]**!*
-% : ; ? _ ` { } ~ !*
-"#,
- Mode::Auto,
- &[
- (Segment::Punct, "~"),
- (Segment::Spaces, " "),
- (Segment::Punct, "&"),
- (Segment::Spaces, " "),
- (Segment::Punct, "|"),
- (Segment::Spaces, " "),
- (Segment::Punct, "="),
- (Segment::Spaces, " "),
- (Segment::Punct, ">="),
- (Segment::Spaces, " "),
- (Segment::Punct, ">"),
- (Segment::Spaces, " "),
- (Segment::Punct, "<="),
- (Segment::Spaces, " "),
- (Segment::Punct, "<"),
- (Segment::Spaces, " "),
- (Segment::Punct, "~="),
- (Segment::Spaces, " "),
- (Segment::Punct, "<>"),
- (Segment::Spaces, " "),
- (Segment::Punct, "("),
- (Segment::Spaces, " "),
- (Segment::Punct, ")"),
- (Segment::Spaces, " "),
- (Segment::Punct, ","),
- (Segment::Spaces, " "),
- (Segment::Punct, "-"),
- (Segment::Spaces, " "),
- (Segment::Punct, "+"),
- (Segment::Spaces, " "),
- (Segment::Punct, "*"),
- (Segment::Spaces, " "),
- (Segment::Punct, "/"),
- (Segment::Spaces, " "),
- (Segment::Punct, "["),
- (Segment::Spaces, " "),
- (Segment::Punct, "]"),
- (Segment::Spaces, " "),
- (Segment::Punct, "**"),
- (Segment::Newline, "\n"),
- (Segment::Punct, "~"),
- (Segment::Punct, "&"),
- (Segment::Punct, "|"),
- (Segment::Punct, "="),
- (Segment::Punct, ">="),
- (Segment::Punct, ">"),
- (Segment::Punct, "<="),
- (Segment::Punct, "<"),
- (Segment::Punct, "~="),
- (Segment::Punct, "<>"),
- (Segment::Punct, "("),
- (Segment::Punct, ")"),
- (Segment::Punct, ","),
- (Segment::Punct, "-"),
- (Segment::Punct, "+"),
- (Segment::Punct, "*"),
- (Segment::Punct, "/"),
- (Segment::Punct, "["),
- (Segment::Punct, "]"),
- (Segment::Punct, "**"),
- (Segment::Punct, "!*"),
- (Segment::Newline, "\n"),
- (Segment::Punct, "%"),
- (Segment::Spaces, " "),
- (Segment::Punct, ":"),
- (Segment::Spaces, " "),
- (Segment::Punct, ";"),
- (Segment::Spaces, " "),
- (Segment::Punct, "?"),
- (Segment::Spaces, " "),
- (Segment::Punct, "_"),
- (Segment::Spaces, " "),
- (Segment::Punct, "`"),
- (Segment::Spaces, " "),
- (Segment::Punct, "{"),
- (Segment::Spaces, " "),
- (Segment::Punct, "}"),
- (Segment::Spaces, " "),
- (Segment::Punct, "~"),
- (Segment::Spaces, " "),
- (Segment::Punct, "!*"),
- (Segment::Newline, "\n"),
- (Segment::End, ""),
- ],
- &[PromptStyle::Later, PromptStyle::Later, PromptStyle::Later],
- );
-}
-
-#[test]
-fn test_positive_numbers() {
- check_segmentation(
- r#"0 1 01 001. 1.
-123. /* comment 1 */ /* comment 2 */
-.1 0.1 00.1 00.10
-5e1 6E-1 7e+1 6E+01 6e-03
-.3E1 .4e-1 .5E+1 .6e+01 .7E-03
-1.23e1 45.6E-1 78.9e+1 99.9E+01 11.2e-03
-. 1e e1 1e+ 1e- 1.
-"#,
- Mode::Auto,
- &[
- (Segment::Number, "0"),
- (Segment::Spaces, " "),
- (Segment::Number, "1"),
- (Segment::Spaces, " "),
- (Segment::Number, "01"),
- (Segment::Spaces, " "),
- (Segment::Number, "001."),
- (Segment::Spaces, " "),
- (Segment::Number, "1"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::Number, "123"),
- (Segment::EndCommand, "."),
- (Segment::Spaces, " "),
- (Segment::Comment, "/* comment 1 */"),
- (Segment::Spaces, " "),
- (Segment::Comment, "/* comment 2 */"),
- (Segment::Newline, "\n"),
- (Segment::StartCommand, "."),
- (Segment::Number, "1"),
- (Segment::Spaces, " "),
- (Segment::Number, "0.1"),
- (Segment::Spaces, " "),
- (Segment::Number, "00.1"),
- (Segment::Spaces, " "),
- (Segment::Number, "00.10"),
- (Segment::Newline, "\n"),
- (Segment::Number, "5e1"),
- (Segment::Spaces, " "),
- (Segment::Number, "6E-1"),
- (Segment::Spaces, " "),
- (Segment::Number, "7e+1"),
- (Segment::Spaces, " "),
- (Segment::Number, "6E+01"),
- (Segment::Spaces, " "),
- (Segment::Number, "6e-03"),
- (Segment::Newline, "\n"),
- (Segment::StartCommand, "."),
- (Segment::Number, "3E1"),
- (Segment::Spaces, " "),
- (Segment::Number, ".4e-1"),
- (Segment::Spaces, " "),
- (Segment::Number, ".5E+1"),
- (Segment::Spaces, " "),
- (Segment::Number, ".6e+01"),
- (Segment::Spaces, " "),
- (Segment::Number, ".7E-03"),
- (Segment::Newline, "\n"),
- (Segment::Number, "1.23e1"),
- (Segment::Spaces, " "),
- (Segment::Number, "45.6E-1"),
- (Segment::Spaces, " "),
- (Segment::Number, "78.9e+1"),
- (Segment::Spaces, " "),
- (Segment::Number, "99.9E+01"),
- (Segment::Spaces, " "),
- (Segment::Number, "11.2e-03"),
- (Segment::Newline, "\n"),
- (Segment::StartCommand, "."),
- (Segment::Spaces, " "),
- (Segment::ExpectedExponent, "1e"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "e1"),
- (Segment::Spaces, " "),
- (Segment::ExpectedExponent, "1e+"),
- (Segment::Spaces, " "),
- (Segment::ExpectedExponent, "1e-"),
- (Segment::Spaces, " "),
- (Segment::Number, "1"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::End, ""),
- ],
- &[
- PromptStyle::First,
- PromptStyle::First,
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::First,
- ],
- );
-}
-
-#[test]
-fn test_negative_numbers() {
- check_segmentation(
- r#" -0 -1 -01 -001. -1.
- -123. /* comment 1 */ /* comment 2 */
- -.1 -0.1 -00.1 -00.10
- -5e1 -6E-1 -7e+1 -6E+01 -6e-03
- -.3E1 -.4e-1 -.5E+1 -.6e+01 -.7E-03
- -1.23e1 -45.6E-1 -78.9e+1 -99.9E+01 -11.2e-03
- -/**/1
- -. -1e -e1 -1e+ -1e- -1.
-"#,
- Mode::Auto,
- &[
- (Segment::Spaces, " "),
- (Segment::Number, "-0"),
- (Segment::Spaces, " "),
- (Segment::Number, "-1"),
- (Segment::Spaces, " "),
- (Segment::Number, "-01"),
- (Segment::Spaces, " "),
- (Segment::Number, "-001."),
- (Segment::Spaces, " "),
- (Segment::Number, "-1"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::Spaces, " "),
- (Segment::Number, "-123"),
- (Segment::EndCommand, "."),
- (Segment::Spaces, " "),
- (Segment::Comment, "/* comment 1 */"),
- (Segment::Spaces, " "),
- (Segment::Comment, "/* comment 2 */"),
- (Segment::Newline, "\n"),
- (Segment::Spaces, " "),
- (Segment::Number, "-.1"),
- (Segment::Spaces, " "),
- (Segment::Number, "-0.1"),
- (Segment::Spaces, " "),
- (Segment::Number, "-00.1"),
- (Segment::Spaces, " "),
- (Segment::Number, "-00.10"),
- (Segment::Newline, "\n"),
- (Segment::Spaces, " "),
- (Segment::Number, "-5e1"),
- (Segment::Spaces, " "),
- (Segment::Number, "-6E-1"),
- (Segment::Spaces, " "),
- (Segment::Number, "-7e+1"),
- (Segment::Spaces, " "),
- (Segment::Number, "-6E+01"),
- (Segment::Spaces, " "),
- (Segment::Number, "-6e-03"),
- (Segment::Newline, "\n"),
- (Segment::Spaces, " "),
- (Segment::Number, "-.3E1"),
- (Segment::Spaces, " "),
- (Segment::Number, "-.4e-1"),
- (Segment::Spaces, " "),
- (Segment::Number, "-.5E+1"),
- (Segment::Spaces, " "),
- (Segment::Number, "-.6e+01"),
- (Segment::Spaces, " "),
- (Segment::Number, "-.7E-03"),
- (Segment::Newline, "\n"),
- (Segment::Spaces, " "),
- (Segment::Number, "-1.23e1"),
- (Segment::Spaces, " "),
- (Segment::Number, "-45.6E-1"),
- (Segment::Spaces, " "),
- (Segment::Number, "-78.9e+1"),
- (Segment::Spaces, " "),
- (Segment::Number, "-99.9E+01"),
- (Segment::Spaces, " "),
- (Segment::Number, "-11.2e-03"),
- (Segment::Newline, "\n"),
- (Segment::Spaces, " "),
- (Segment::Punct, "-"),
- (Segment::Comment, "/**/"),
- (Segment::Number, "1"),
- (Segment::Newline, "\n"),
- (Segment::Spaces, " "),
- (Segment::Punct, "-"),
- (Segment::Punct, "."),
- (Segment::Spaces, " "),
- (Segment::ExpectedExponent, "-1e"),
- (Segment::Spaces, " "),
- (Segment::Punct, "-"),
- (Segment::Identifier, "e1"),
- (Segment::Spaces, " "),
- (Segment::ExpectedExponent, "-1e+"),
- (Segment::Spaces, " "),
- (Segment::ExpectedExponent, "-1e-"),
- (Segment::Spaces, " "),
- (Segment::Number, "-1"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::End, ""),
- ],
- &[
- PromptStyle::First,
- PromptStyle::First,
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::First,
- ],
- );
-}
-
-#[test]
-fn test_strings() {
- check_segmentation(
- r#"'x' "y" 'abc'
-'Don''t' "Can't" 'Won''t'
-"""quoted""" '"quoted"'
-'' ""
-'missing end quote
-"missing double quote
-x"4142" X'5152'
-u'fffd' U"041"
-+ new command
-+ /* comment */ 'string continuation'
-+ /* also a punctuator on blank line
-- 'new command'
-"#,
- Mode::Auto,
- &[
- (Segment::QuotedString, "'x'"),
- (Segment::Spaces, " "),
- (Segment::QuotedString, "\"y\""),
- (Segment::Spaces, " "),
- (Segment::QuotedString, "'abc'"),
- (Segment::Newline, "\n"),
- (Segment::QuotedString, "'Don''t'"),
- (Segment::Spaces, " "),
- (Segment::QuotedString, "\"Can't\""),
- (Segment::Spaces, " "),
- (Segment::QuotedString, "'Won''t'"),
- (Segment::Newline, "\n"),
- (Segment::QuotedString, "\"\"\"quoted\"\"\""),
- (Segment::Spaces, " "),
- (Segment::QuotedString, "'\"quoted\"'"),
- (Segment::Newline, "\n"),
- (Segment::QuotedString, "''"),
- (Segment::Spaces, " "),
- (Segment::QuotedString, "\"\""),
- (Segment::Newline, "\n"),
- (Segment::ExpectedQuote, "'missing end quote"),
- (Segment::Newline, "\n"),
- (Segment::ExpectedQuote, "\"missing double quote"),
- (Segment::Newline, "\n"),
- (Segment::HexString, "x\"4142\""),
- (Segment::Spaces, " "),
- (Segment::HexString, "X'5152'"),
- (Segment::Newline, "\n"),
- (Segment::UnicodeString, "u'fffd'"),
- (Segment::Spaces, " "),
- (Segment::UnicodeString, "U\"041\""),
- (Segment::Newline, "\n"),
- (Segment::StartCommand, "+"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "new"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "command"),
- (Segment::Newline, "\n"),
- (Segment::Punct, "+"),
- (Segment::Spaces, " "),
- (Segment::Comment, "/* comment */"),
- (Segment::Spaces, " "),
- (Segment::QuotedString, "'string continuation'"),
- (Segment::Newline, "\n"),
- (Segment::Punct, "+"),
- (Segment::Spaces, " "),
- (Segment::Comment, "/* also a punctuator on blank line"),
- (Segment::Newline, "\n"),
- (Segment::StartCommand, "-"),
- (Segment::Spaces, " "),
- (Segment::QuotedString, "'new command'"),
- (Segment::Newline, "\n"),
- (Segment::End, ""),
- ],
- &[
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::Later,
- ],
- );
-}
-
-#[test]
-fn test_shbang() {
- check_segmentation(
- r#"#! /usr/bin/pspp
-title my title.
-#! /usr/bin/pspp
-"#,
- Mode::Interactive,
- &[
- (Segment::Shbang, "#! /usr/bin/pspp"),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "title"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "my"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "title"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "#"),
- (Segment::Punct, "!"),
- (Segment::Spaces, " "),
- (Segment::Punct, "/"),
- (Segment::Identifier, "usr"),
- (Segment::Punct, "/"),
- (Segment::Identifier, "bin"),
- (Segment::Punct, "/"),
- (Segment::Identifier, "pspp"),
- (Segment::Newline, "\n"),
- (Segment::End, ""),
- ],
- &[PromptStyle::First, PromptStyle::First, PromptStyle::Later],
- );
-}
-
-#[test]
-fn test_comment_command() {
- check_segmentation(
- r#"* Comment commands "don't
-have to contain valid tokens.
-
-** Check ambiguity with ** token.
-****************.
-
-comment keyword works too.
-COMM also.
-com is ambiguous with COMPUTE.
-
- * Comment need not start at left margin.
-
-* Comment ends with blank line
-
-next command.
-
-"#,
- Mode::Interactive,
- &[
- (Segment::CommentCommand, "* Comment commands \"don't"),
- (Segment::Newline, "\n"),
- (Segment::CommentCommand, "have to contain valid tokens"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::SeparateCommands, ""),
- (Segment::Newline, "\n"),
- (Segment::CommentCommand, "** Check ambiguity with ** token"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::CommentCommand, "****************"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::SeparateCommands, ""),
- (Segment::Newline, "\n"),
- (Segment::CommentCommand, "comment keyword works too"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::CommentCommand, "COMM also"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "com"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "is"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "ambiguous"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "with"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "COMPUTE"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::SeparateCommands, ""),
- (Segment::Newline, "\n"),
- (Segment::Spaces, " "),
- (
- Segment::CommentCommand,
- "* Comment need not start at left margin",
- ),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::SeparateCommands, ""),
- (Segment::Newline, "\n"),
- (Segment::CommentCommand, "* Comment ends with blank line"),
- (Segment::Newline, "\n"),
- (Segment::SeparateCommands, ""),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "next"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "command"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::SeparateCommands, ""),
- (Segment::Newline, "\n"),
- (Segment::End, ""),
- ],
- &[
- PromptStyle::Comment,
- PromptStyle::First,
- PromptStyle::First,
- PromptStyle::First,
- PromptStyle::First,
- PromptStyle::First,
- PromptStyle::First,
- PromptStyle::First,
- PromptStyle::First,
- PromptStyle::First,
- PromptStyle::First,
- PromptStyle::First,
- PromptStyle::Comment,
- PromptStyle::First,
- PromptStyle::First,
- PromptStyle::First,
- ],
- );
-}
-
-#[test]
-fn test_document_command() {
- check_segmentation(
- r#"DOCUMENT one line.
-DOC more
- than
- one
- line.
-docu
-first.paragraph
-isn't parsed as tokens
-
-second paragraph.
-"#,
- Mode::Interactive,
- &[
- (Segment::StartDocument, ""),
- (Segment::Document, "DOCUMENT one line."),
- (Segment::EndCommand, ""),
- (Segment::SeparateCommands, ""),
- (Segment::Newline, "\n"),
- (Segment::StartDocument, ""),
- (Segment::Document, "DOC more"),
- (Segment::Newline, "\n"),
- (Segment::Document, " than"),
- (Segment::Newline, "\n"),
- (Segment::Document, " one"),
- (Segment::Newline, "\n"),
- (Segment::Document, " line."),
- (Segment::EndCommand, ""),
- (Segment::SeparateCommands, ""),
- (Segment::Newline, "\n"),
- (Segment::StartDocument, ""),
- (Segment::Document, "docu"),
- (Segment::Newline, "\n"),
- (Segment::Document, "first.paragraph"),
- (Segment::Newline, "\n"),
- (Segment::Document, "isn't parsed as tokens"),
- (Segment::Newline, "\n"),
- (Segment::Document, ""),
- (Segment::Newline, "\n"),
- (Segment::Document, "second paragraph."),
- (Segment::EndCommand, ""),
- (Segment::SeparateCommands, ""),
- (Segment::Newline, "\n"),
- (Segment::End, ""),
- ],
- &[
- PromptStyle::First,
- PromptStyle::Document,
- PromptStyle::Document,
- PromptStyle::Document,
- PromptStyle::First,
- PromptStyle::Document,
- PromptStyle::Document,
- PromptStyle::Document,
- PromptStyle::Document,
- PromptStyle::First,
- ],
- );
-}
-
-#[test]
-fn test_file_label_command() {
- check_segmentation(
- r#"FIL label isn't quoted.
-FILE
- lab 'is quoted'.
-FILE /*
-/**/ lab not quoted here either
-
-"#,
- Mode::Interactive,
- &[
- (Segment::Identifier, "FIL"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "label"),
- (Segment::Spaces, " "),
- (Segment::UnquotedString, "isn't quoted"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "FILE"),
- (Segment::Newline, "\n"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "lab"),
- (Segment::Spaces, " "),
- (Segment::QuotedString, "'is quoted'"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "FILE"),
- (Segment::Spaces, " "),
- (Segment::Comment, "/*"),
- (Segment::Newline, "\n"),
- (Segment::Comment, "/**/"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "lab"),
- (Segment::Spaces, " "),
- (Segment::UnquotedString, "not quoted here either"),
- (Segment::Newline, "\n"),
- (Segment::SeparateCommands, ""),
- (Segment::Newline, "\n"),
- (Segment::End, ""),
- ],
- &[
- PromptStyle::First,
- PromptStyle::Later,
- PromptStyle::First,
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::First,
- ],
- );
-}
-
-#[test]
-fn test_begin_data() {
- check_segmentation(
- r#"begin data.
-end data.
-
-begin data. /*
-123
-xxx
-end data.
-
-BEG /**/ DAT /*
-5 6 7 /* x
-
-end data
-end data
-.
-
-begin
- data.
-data
-end data.
-
-begin data "xxx".
-begin data 123.
-not data
-"#,
- Mode::Interactive,
- &[
- (Segment::Identifier, "begin"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "data"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "end"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "data"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::SeparateCommands, ""),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "begin"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "data"),
- (Segment::EndCommand, "."),
- (Segment::Spaces, " "),
- (Segment::Comment, "/*"),
- (Segment::Newline, "\n"),
- (Segment::InlineData, "123"),
- (Segment::Newline, "\n"),
- (Segment::InlineData, "xxx"),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "end"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "data"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::SeparateCommands, ""),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "BEG"),
- (Segment::Spaces, " "),
- (Segment::Comment, "/**/"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "DAT"),
- (Segment::Spaces, " "),
- (Segment::Comment, "/*"),
- (Segment::Newline, "\n"),
- (Segment::InlineData, "5 6 7 /* x"),
- (Segment::Newline, "\n"),
- (Segment::InlineData, ""),
- (Segment::Newline, "\n"),
- (Segment::InlineData, "end data"),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "end"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "data"),
- (Segment::Newline, "\n"),
- (Segment::StartCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::SeparateCommands, ""),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "begin"),
- (Segment::Newline, "\n"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "data"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::InlineData, "data"),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "end"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "data"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::SeparateCommands, ""),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "begin"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "data"),
- (Segment::Spaces, " "),
- (Segment::QuotedString, "\"xxx\""),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "begin"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "data"),
- (Segment::Spaces, " "),
- (Segment::Number, "123"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "not"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "data"),
- (Segment::Newline, "\n"),
- (Segment::End, ""),
- ],
- &[
- PromptStyle::Data,
- PromptStyle::First,
- PromptStyle::First,
- PromptStyle::Data,
- PromptStyle::Data,
- PromptStyle::Data,
- PromptStyle::First,
- PromptStyle::First,
- PromptStyle::Data,
- PromptStyle::Data,
- PromptStyle::Data,
- PromptStyle::Data,
- PromptStyle::Later,
- PromptStyle::First,
- PromptStyle::First,
- PromptStyle::Later,
- PromptStyle::Data,
- PromptStyle::Data,
- PromptStyle::First,
- PromptStyle::First,
- PromptStyle::First,
- PromptStyle::First,
- PromptStyle::Later,
- ],
- );
-}
-
-#[test]
-fn test_do_repeat() {
- check_segmentation(
- r#"do repeat x=a b c
- y=d e f.
- do repeat a=1 thru 5.
-another command.
-second command
-+ third command.
-end /* x */ /* y */ repeat print.
-end
- repeat.
-do
- repeat #a=1.
- inner command.
-end repeat.
-"#,
- Mode::Interactive,
- &[
- (Segment::Identifier, "do"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "repeat"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "x"),
- (Segment::Punct, "="),
- (Segment::Identifier, "a"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "b"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "c"),
- (Segment::Newline, "\n"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "y"),
- (Segment::Punct, "="),
- (Segment::Identifier, "d"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "e"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "f"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::DoRepeatCommand, " do repeat a=1 thru 5."),
- (Segment::Newline, "\n"),
- (Segment::DoRepeatCommand, "another command."),
- (Segment::Newline, "\n"),
- (Segment::DoRepeatCommand, "second command"),
- (Segment::Newline, "\n"),
- (Segment::DoRepeatCommand, "+ third command."),
- (Segment::Newline, "\n"),
- (
- Segment::DoRepeatCommand,
- "end /* x */ /* y */ repeat print.",
- ),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "end"),
- (Segment::Newline, "\n"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "repeat"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "do"),
- (Segment::Newline, "\n"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "repeat"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "#a"),
- (Segment::Punct, "="),
- (Segment::Number, "1"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::DoRepeatCommand, " inner command."),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "end"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "repeat"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::End, ""),
- ],
- &[
- PromptStyle::Later,
- PromptStyle::DoRepeat,
- PromptStyle::DoRepeat,
- PromptStyle::DoRepeat,
- PromptStyle::DoRepeat,
- PromptStyle::DoRepeat,
- PromptStyle::DoRepeat,
- PromptStyle::Later,
- PromptStyle::First,
- PromptStyle::Later,
- PromptStyle::DoRepeat,
- PromptStyle::DoRepeat,
- PromptStyle::First,
- ],
- );
-}
-
-#[test]
-fn test_do_repeat_overflow() {
- const N: usize = 257;
- let do_repeat: Vec<String> = (0..N)
- .map(|i| format!("do repeat v{i}={i} thru {}.\n", i + 5))
- .collect();
- let end_repeat: Vec<String> = (0..N)
- .rev()
- .map(|i| format!("end repeat. /* {i}\n"))
- .collect();
-
- let s: String = do_repeat
- .iter()
- .chain(end_repeat.iter())
- .map(|s| s.as_str())
- .collect();
- let mut expect_output = vec![
- (Segment::Identifier, "do"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "repeat"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "v0"),
- (Segment::Punct, "="),
- (Segment::Number, "0"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "thru"),
- (Segment::Spaces, " "),
- (Segment::Number, "5"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- ];
- for i in 1..N {
- expect_output.push((Segment::DoRepeatCommand, &do_repeat[i].trim_end()));
- if i >= 255 {
- expect_output.push((Segment::DoRepeatOverflow, ""));
- }
- expect_output.push((Segment::Newline, "\n"));
- }
- for i in 0..254 {
- expect_output.push((Segment::DoRepeatCommand, &end_repeat[i].trim_end()));
- expect_output.push((Segment::Newline, "\n"));
- }
- let comments: Vec<String> = (0..(N - 254)).rev().map(|i| format!("/* {i}")).collect();
- for comment in &comments {
- expect_output.extend([
- (Segment::Identifier, "end"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "repeat"),
- (Segment::EndCommand, "."),
- (Segment::Spaces, " "),
- (Segment::Comment, comment),
- (Segment::Newline, "\n"),
- ]);
- }
- expect_output.push((Segment::End, ""));
-
- let expect_prompts: Vec<_> = (0..N * 2 - 3)
- .map(|_| PromptStyle::DoRepeat)
- .chain([PromptStyle::First, PromptStyle::First, PromptStyle::First])
- .collect();
- check_segmentation(&s, Mode::Interactive, &expect_output, &expect_prompts);
-}
-
-#[test]
-fn test_do_repeat_batch() {
- check_segmentation(
- r#"do repeat x=a b c
- y=d e f
-do repeat a=1 thru 5
-another command
-second command
-+ third command
-end /* x */ /* y */ repeat print
-end
- repeat
-do
- repeat #a=1
-
- inner command
-end repeat
-"#,
- Mode::Batch,
- &[
- (Segment::Identifier, "do"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "repeat"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "x"),
- (Segment::Punct, "="),
- (Segment::Identifier, "a"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "b"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "c"),
- (Segment::Newline, "\n"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "y"),
- (Segment::Punct, "="),
- (Segment::Identifier, "d"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "e"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "f"),
- (Segment::Newline, "\n"),
- (Segment::StartCommand, ""),
- (Segment::DoRepeatCommand, "do repeat a=1 thru 5"),
- (Segment::Newline, "\n"),
- (Segment::DoRepeatCommand, "another command"),
- (Segment::Newline, "\n"),
- (Segment::DoRepeatCommand, "second command"),
- (Segment::Newline, "\n"),
- (Segment::DoRepeatCommand, "+ third command"),
- (Segment::Newline, "\n"),
- (Segment::DoRepeatCommand, "end /* x */ /* y */ repeat print"),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "end"),
- (Segment::Newline, "\n"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "repeat"),
- (Segment::Newline, "\n"),
- (Segment::StartCommand, ""),
- (Segment::Identifier, "do"),
- (Segment::Newline, "\n"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "repeat"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "#a"),
- (Segment::Punct, "="),
- (Segment::Number, "1"),
- (Segment::Newline, "\n"),
- (Segment::SeparateCommands, ""),
- (Segment::Newline, "\n"),
- (Segment::DoRepeatCommand, " inner command"),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "end"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "repeat"),
- (Segment::Newline, "\n"),
- (Segment::End, ""),
- ],
- &[
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::DoRepeat,
- PromptStyle::DoRepeat,
- PromptStyle::DoRepeat,
- PromptStyle::DoRepeat,
- PromptStyle::DoRepeat,
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::DoRepeat,
- PromptStyle::DoRepeat,
- PromptStyle::Later,
- ],
- );
-}
-
-mod define {
- use crate::{
- lex::segment::{Mode, Segment},
- prompt::PromptStyle,
- };
-
- use super::check_segmentation;
-
- #[test]
- fn test_simple() {
- check_segmentation(
- r#"define !macro1()
-var1 var2 var3 "!enddefine"
-!enddefine.
-"#,
- Mode::Interactive,
- &[
- (Segment::Identifier, "define"),
- (Segment::Spaces, " "),
- (Segment::MacroName, "!macro1"),
- (Segment::Punct, "("),
- (Segment::Punct, ")"),
- (Segment::Newline, "\n"),
- (Segment::MacroBody, "var1 var2 var3 \"!enddefine\""),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "!enddefine"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::End, ""),
- ],
- &[PromptStyle::Define, PromptStyle::Define, PromptStyle::First],
- );
- }
-
- #[test]
- fn test_no_newline_after_parentheses() {
- check_segmentation(
- r#"define !macro1() var1 var2 var3 /* !enddefine
-!enddefine.
-"#,
- Mode::Interactive,
- &[
- (Segment::Identifier, "define"),
- (Segment::Spaces, " "),
- (Segment::MacroName, "!macro1"),
- (Segment::Punct, "("),
- (Segment::Punct, ")"),
- (Segment::MacroBody, " var1 var2 var3 /* !enddefine"),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "!enddefine"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::End, ""),
- ],
- &[PromptStyle::Define, PromptStyle::First],
- );
- }
-
- #[test]
- fn test_no_newline_before_enddefine() {
- check_segmentation(
- r#"define !macro1()
-var1 var2 var3!enddefine.
-"#,
- Mode::Interactive,
- &[
- (Segment::Identifier, "define"),
- (Segment::Spaces, " "),
- (Segment::MacroName, "!macro1"),
- (Segment::Punct, "("),
- (Segment::Punct, ")"),
- (Segment::Newline, "\n"),
- (Segment::MacroBody, "var1 var2 var3"),
- (Segment::Identifier, "!enddefine"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::End, ""),
- ],
- &[PromptStyle::Define, PromptStyle::First],
- );
- }
-
- #[test]
- fn test_all_on_one_line() {
- check_segmentation(
- r#"define !macro1()var1 var2 var3!enddefine.
-"#,
- Mode::Interactive,
- &[
- (Segment::Identifier, "define"),
- (Segment::Spaces, " "),
- (Segment::MacroName, "!macro1"),
- (Segment::Punct, "("),
- (Segment::Punct, ")"),
- (Segment::MacroBody, "var1 var2 var3"),
- (Segment::Identifier, "!enddefine"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::End, ""),
- ],
- &[PromptStyle::First],
- );
- }
-
- #[test]
- fn test_empty() {
- check_segmentation(
- r#"define !macro1()
-!enddefine.
-"#,
- Mode::Interactive,
- &[
- (Segment::Identifier, "define"),
- (Segment::Spaces, " "),
- (Segment::MacroName, "!macro1"),
- (Segment::Punct, "("),
- (Segment::Punct, ")"),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "!enddefine"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::End, ""),
- ],
- &[PromptStyle::Define, PromptStyle::First],
- );
- }
-
- #[test]
- fn test_blank_lines() {
- check_segmentation(
- r#"define !macro1()
-
-
-!enddefine.
-"#,
- Mode::Interactive,
- &[
- (Segment::Identifier, "define"),
- (Segment::Spaces, " "),
- (Segment::MacroName, "!macro1"),
- (Segment::Punct, "("),
- (Segment::Punct, ")"),
- (Segment::Newline, "\n"),
- (Segment::MacroBody, ""),
- (Segment::Newline, "\n"),
- (Segment::MacroBody, ""),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "!enddefine"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::End, ""),
- ],
- &[
- PromptStyle::Define,
- PromptStyle::Define,
- PromptStyle::Define,
- PromptStyle::First,
- ],
- );
- }
-
- #[test]
- fn test_arguments() {
- check_segmentation(
- r#"define !macro1(a(), b(), c())
-!enddefine.
-"#,
- Mode::Interactive,
- &[
- (Segment::Identifier, "define"),
- (Segment::Spaces, " "),
- (Segment::MacroName, "!macro1"),
- (Segment::Punct, "("),
- (Segment::Identifier, "a"),
- (Segment::Punct, "("),
- (Segment::Punct, ")"),
- (Segment::Punct, ","),
- (Segment::Spaces, " "),
- (Segment::Identifier, "b"),
- (Segment::Punct, "("),
- (Segment::Punct, ")"),
- (Segment::Punct, ","),
- (Segment::Spaces, " "),
- (Segment::Identifier, "c"),
- (Segment::Punct, "("),
- (Segment::Punct, ")"),
- (Segment::Punct, ")"),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "!enddefine"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::End, ""),
- ],
- &[PromptStyle::Define, PromptStyle::First],
- );
- }
-
- #[test]
- fn test_multiline_arguments() {
- check_segmentation(
- r#"define !macro1(
- a(), b(
- ),
- c()
-)
-!enddefine.
-"#,
- Mode::Interactive,
- &[
- (Segment::Identifier, "define"),
- (Segment::Spaces, " "),
- (Segment::MacroName, "!macro1"),
- (Segment::Punct, "("),
- (Segment::Newline, "\n"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "a"),
- (Segment::Punct, "("),
- (Segment::Punct, ")"),
- (Segment::Punct, ","),
- (Segment::Spaces, " "),
- (Segment::Identifier, "b"),
- (Segment::Punct, "("),
- (Segment::Newline, "\n"),
- (Segment::Spaces, " "),
- (Segment::Punct, ")"),
- (Segment::Punct, ","),
- (Segment::Newline, "\n"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "c"),
- (Segment::Punct, "("),
- (Segment::Punct, ")"),
- (Segment::Newline, "\n"),
- (Segment::Punct, ")"),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "!enddefine"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::End, ""),
- ],
- &[
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::Define,
- PromptStyle::First,
- ],
- );
- }
-
- #[test]
- fn test_arguments_start_on_second_line() {
- check_segmentation(
- r#"define !macro1
-(x,y,z
-)
-content 1
-content 2
-!enddefine.
-"#,
- Mode::Interactive,
- &[
- (Segment::Identifier, "define"),
- (Segment::Spaces, " "),
- (Segment::MacroName, "!macro1"),
- (Segment::Newline, "\n"),
- (Segment::Punct, "("),
- (Segment::Identifier, "x"),
- (Segment::Punct, ","),
- (Segment::Identifier, "y"),
- (Segment::Punct, ","),
- (Segment::Identifier, "z"),
- (Segment::Newline, "\n"),
- (Segment::Punct, ")"),
- (Segment::Newline, "\n"),
- (Segment::MacroBody, "content 1"),
- (Segment::Newline, "\n"),
- (Segment::MacroBody, "content 2"),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "!enddefine"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::End, ""),
- ],
- &[
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::Define,
- PromptStyle::Define,
- PromptStyle::Define,
- PromptStyle::First,
- ],
- );
- }
-
- #[test]
- fn test_early_end_of_command_1() {
- check_segmentation(
- r#"define !macro1.
-data list /x 1.
-"#,
- Mode::Interactive,
- &[
- (Segment::Identifier, "define"),
- (Segment::Spaces, " "),
- (Segment::MacroName, "!macro1"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "data"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "list"),
- (Segment::Spaces, " "),
- (Segment::Punct, "/"),
- (Segment::Identifier, "x"),
- (Segment::Spaces, " "),
- (Segment::Number, "1"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::End, ""),
- ],
- &[PromptStyle::First, PromptStyle::First],
- );
- }
-
- #[test]
- fn test_early_end_of_command_2() {
- check_segmentation(
- r#"define !macro1
-x.
-data list /x 1.
-"#,
- Mode::Interactive,
- &[
- (Segment::Identifier, "define"),
- (Segment::Spaces, " "),
- (Segment::MacroName, "!macro1"),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "x"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "data"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "list"),
- (Segment::Spaces, " "),
- (Segment::Punct, "/"),
- (Segment::Identifier, "x"),
- (Segment::Spaces, " "),
- (Segment::Number, "1"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::End, ""),
- ],
- &[PromptStyle::Later, PromptStyle::First, PromptStyle::First],
- );
- }
-
- #[test]
- fn test_early_end_of_command_3() {
- check_segmentation(
- r#"define !macro1(.
-x.
-data list /x 1.
-"#,
- Mode::Interactive,
- &[
- (Segment::Identifier, "define"),
- (Segment::Spaces, " "),
- (Segment::MacroName, "!macro1"),
- (Segment::Punct, "("),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "x"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "data"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "list"),
- (Segment::Spaces, " "),
- (Segment::Punct, "/"),
- (Segment::Identifier, "x"),
- (Segment::Spaces, " "),
- (Segment::Number, "1"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::End, ""),
- ],
- &[PromptStyle::First, PromptStyle::First, PromptStyle::First],
- );
- }
-
- #[test]
- fn test_early_end_of_command_4() {
- // Notice the command terminator at the end of the `DEFINE` command,
- // which should not be there and ends it early.
- check_segmentation(
- r#"define !macro1.
-data list /x 1.
-"#,
- Mode::Interactive,
- &[
- (Segment::Identifier, "define"),
- (Segment::Spaces, " "),
- (Segment::MacroName, "!macro1"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "data"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "list"),
- (Segment::Spaces, " "),
- (Segment::Punct, "/"),
- (Segment::Identifier, "x"),
- (Segment::Spaces, " "),
- (Segment::Number, "1"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::End, ""),
- ],
- &[PromptStyle::First, PromptStyle::First],
- );
- }
-
- #[test]
- fn test_missing_enddefine() {
- check_segmentation(
- r#"define !macro1()
-content line 1
-content line 2
-"#,
- Mode::Interactive,
- &[
- (Segment::Identifier, "define"),
- (Segment::Spaces, " "),
- (Segment::MacroName, "!macro1"),
- (Segment::Punct, "("),
- (Segment::Punct, ")"),
- (Segment::Newline, "\n"),
- (Segment::MacroBody, "content line 1"),
- (Segment::Newline, "\n"),
- (Segment::MacroBody, "content line 2"),
- (Segment::Newline, "\n"),
- (Segment::End, ""),
- ],
- &[
- PromptStyle::Define,
- PromptStyle::Define,
- PromptStyle::Define,
- ],
- );
- }
-
- #[test]
- fn test_missing_enddefine_2() {
- check_segmentation(
- r#"define !macro1()
-"#,
- Mode::Interactive,
- &[
- (Segment::Identifier, "define"),
- (Segment::Spaces, " "),
- (Segment::MacroName, "!macro1"),
- (Segment::Punct, "("),
- (Segment::Punct, ")"),
- (Segment::Newline, "\n"),
- (Segment::End, ""),
- ],
- &[PromptStyle::Define],
- );
- }
-}
-
-#[test]
-fn test_batch_mode() {
- check_segmentation(
- r#"first command
- another line of first command
-+ second command
-third command
-
-fourth command.
- fifth command.
-"#,
- Mode::Batch,
- &[
- (Segment::Identifier, "first"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "command"),
- (Segment::Newline, "\n"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "another"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "line"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "of"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "first"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "command"),
- (Segment::Newline, "\n"),
- (Segment::StartCommand, "+"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "second"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "command"),
- (Segment::Newline, "\n"),
- (Segment::StartCommand, ""),
- (Segment::Identifier, "third"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "command"),
- (Segment::Newline, "\n"),
- (Segment::SeparateCommands, ""),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "fourth"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "command"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "fifth"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "command"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::End, ""),
- ],
- &[
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::First,
- PromptStyle::First,
- PromptStyle::First,
- ],
- );
-}
-
-#[test]
-fn test_auto_mode() {
- check_segmentation(
- r#"command
- another line of command
-2sls
-+ another command
-another line of second command
-data list /x 1
-aggregate.
-print eject.
-twostep cluster
-
-
-fourth command.
- fifth command.
-"#,
- Mode::Auto,
- &[
- (Segment::Identifier, "command"),
- (Segment::Newline, "\n"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "another"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "line"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "of"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "command"),
- (Segment::Newline, "\n"),
- (Segment::StartCommand, ""),
- (Segment::Number, "2"),
- (Segment::Identifier, "sls"),
- (Segment::Newline, "\n"),
- (Segment::StartCommand, "+"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "another"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "command"),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "another"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "line"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "of"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "second"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "command"),
- (Segment::Newline, "\n"),
- (Segment::StartCommand, ""),
- (Segment::Identifier, "data"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "list"),
- (Segment::Spaces, " "),
- (Segment::Punct, "/"),
- (Segment::Identifier, "x"),
- (Segment::Spaces, " "),
- (Segment::Number, "1"),
- (Segment::Newline, "\n"),
- (Segment::StartCommand, ""),
- (Segment::Identifier, "aggregate"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "print"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "eject"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "twostep"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "cluster"),
- (Segment::Newline, "\n"),
- (Segment::SeparateCommands, ""),
- (Segment::Newline, "\n"),
- (Segment::SeparateCommands, ""),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "fourth"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "command"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "fifth"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "command"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::End, ""),
- ],
- &[
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::First,
- PromptStyle::First,
- PromptStyle::Later,
- PromptStyle::First,
- PromptStyle::First,
- PromptStyle::First,
- PromptStyle::First,
- ],
- );
-}
+++ /dev/null
-use std::fmt::{Display, Formatter, Result as FmtResult};
-
-use crate::identifier::Identifier;
-
-#[derive(Clone, Debug, PartialEq)]
-pub enum Token {
- /// End of input.
- End,
-
- /// Identifier.
- Id(Identifier),
-
- /// Number.
- Number(f64),
-
- /// Quoted string.
- String(String),
-
- /// Command terminator or separator.
- ///
- /// Usually this is `.`, but a blank line also separates commands, and in
- /// batch mode any line that begins with a non-blank starts a new command.
- EndCommand,
-
- /// Operators, punctuators, and reserved words.
- Punct(Punct),
-}
-
-impl Token {
- pub fn id(&self) -> Option<&Identifier> {
- match self {
- Self::Id(identifier) => Some(identifier),
- _ => None,
- }
- }
-}
-
-fn is_printable(c: char) -> bool {
- !c.is_control() || ['\t', '\r', '\n'].contains(&c)
-}
-
-fn string_representation(s: &str, quote: char, f: &mut Formatter<'_>) -> FmtResult {
- write!(f, "{quote}")?;
- for section in s.split_inclusive(quote) {
- if let Some(rest) = section.strip_suffix(quote) {
- write!(f, "{rest}{quote}{quote}")?;
- } else {
- write!(f, "{section}")?;
- }
- }
- write!(f, "{quote}")
-}
-
-impl Display for Token {
- fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult {
- match self {
- Token::End => Ok(()),
- Token::Id(s) => write!(f, "{s}"),
- Token::Number(number) => {
- if number.is_sign_negative() {
- write!(f, "-{}", number.abs())
- } else {
- write!(f, "{number}")
- }
- }
- Token::String(s) => {
- if s.chars().all(|c| is_printable(c)) {
- if s.contains('"') {
- string_representation(s, '\'', f)
- } else {
- string_representation(s, '"', f)
- }
- } else {
- write!(f, "X\"")?;
- for byte in s.bytes() {
- let c1 = char::from_digit((byte >> 4) as u32, 16)
- .unwrap()
- .to_ascii_uppercase();
- let c2 = char::from_digit((byte & 0xf) as u32, 16)
- .unwrap()
- .to_ascii_uppercase()
- .to_ascii_lowercase();
- write!(f, "{c1}{c2}")?;
- }
- write!(f, "\"")
- }
- }
- Token::EndCommand => write!(f, "."),
- Token::Punct(punct) => punct.fmt(f),
- }
- }
-}
-
-/// Check that all negative numbers, even -0, get formatted with a leading `-`.
-#[cfg(test)]
-mod test {
- use crate::lex::token::Token;
-
- #[test]
- fn test_string() {
- assert_eq!(Token::String(String::from("abc")).to_string(), "\"abc\"");
- assert_eq!(
- Token::String(String::from("\u{0080}")).to_string(),
- "X\"C280\""
- );
- }
-
- #[test]
- fn test_neg0() {
- assert_eq!(Token::Number(-0.0).to_string(), "-0");
- }
-}
-
-#[derive(Copy, Clone, Debug, PartialEq, Eq)]
-pub enum Punct {
- /// `+`.
- Plus,
-
- /// `-`.
- Dash,
-
- /// `*`.
- Asterisk,
-
- /// `/`.
- Slash,
-
- /// `=`.
- Equals,
-
- /// `(`.
- LParen,
-
- /// `)`.
- RParen,
-
- /// `[`.
- LSquare,
-
- /// `]`.
- RSquare,
-
- /// `{`.
- LCurly,
-
- /// `}`.
- RCurly,
-
- /// `,`.
- Comma,
-
- /// `;`.
- Semicolon,
-
- /// `:`.
- Colon,
-
- /// `AND` or `&`.
- And,
-
- /// `OR` or `|`.
- Or,
-
- /// `NOT` or `~`.
- Not,
-
- /// `EQ` or `=`.
- Eq,
-
- /// `GE` or '>=`
- Ge,
-
- /// `GT` or `>`.
- Gt,
-
- /// `LE` or `<=`.
- Le,
-
- /// `LT` or `<`.
- Lt,
-
- /// `NE` or `~=` or `<>`.
- Ne,
-
- /// `ALL`.
- All,
-
- /// `BY`.
- By,
-
- /// `TO`.
- To,
-
- /// `WITH`.
- With,
-
- /// `**`.
- Exp,
-
- /// `!` (only appears in macros).
- Bang,
-
- /// `%` (only appears in macros).
- Percent,
-
- /// `?` (only appears in macros).
- Question,
-
- /// ```` (only appears in macros).
- Backtick,
-
- /// `.`.
- ///
- /// This represents a dot in the middle of a line by itself, where it does not end a command.
- Dot,
-
- /// `_` (only appears in macros).
- ///
- /// Although underscores may appear within identifiers, they can't be the
- /// first character, so this represents an underscore found on its own.
- Underscore,
-
- /// `!*` (only appears in macros).
- BangAsterisk,
-}
-
-impl Punct {
- pub fn as_str(&self) -> &'static str {
- match self {
- Self::Plus => "+",
- Self::Dash => "-",
- Self::Asterisk => "*",
- Self::Slash => "/",
- Self::Equals => "=",
- Self::LParen => "(",
- Self::RParen => ")",
- Self::LSquare => "[",
- Self::RSquare => "]",
- Self::LCurly => "{",
- Self::RCurly => "}",
- Self::Comma => ",",
- Self::Semicolon => ";",
- Self::Colon => ":",
- Self::And => "AND",
- Self::Or => "OR",
- Self::Not => "NOT",
- Self::Eq => "EQ",
- Self::Ge => ">=",
- Self::Gt => ">",
- Self::Le => "<=",
- Self::Lt => "<",
- Self::Ne => "~=",
- Self::All => "ALL",
- Self::By => "BY",
- Self::To => "TO",
- Self::With => "WITH",
- Self::Exp => "**",
- Self::Bang => "!",
- Self::Percent => "%",
- Self::Question => "?",
- Self::Backtick => "`",
- Self::Dot => ".",
- Self::Underscore => "_",
- Self::BangAsterisk => "!*",
- }
- }
-}
-impl Display for Punct {
- fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult {
- write!(f, "{}", self.as_str())
- }
-}
+++ /dev/null
-#[allow(unused_variables, unused_mut, dead_code)]
-pub mod cooked;
-pub mod dictionary;
-pub mod encoding;
-pub mod endian;
-pub mod format;
-pub mod identifier;
-pub mod locale_charset;
-pub mod output;
-#[allow(unused_variables, unused_mut, dead_code)]
-pub mod raw;
-pub mod sack;
-pub mod lex;
-pub mod prompt;
-pub mod message;
-pub mod macros;
-pub mod settings;
-pub mod command;
-pub mod integer;
-pub mod engine;
+++ /dev/null
-// Determine a canonical name for the current locale's character encoding.
-//
-// Copyright (C) 2000-2006, 2008-2023 Free Software Foundation, Inc.
-//
-// This file is free software: you can redistribute it and/or modify it under
-// the terms of the GNU Lesser General Public License as published by the Free
-// Software Foundation; either version 2.1 of the License, or (at your option)
-// any later version.
-//
-// This file is distributed in the hope that it will be useful, but WITHOUT ANY
-// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
-// A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
-// details.
-//
-// You should have received a copy of the GNU Lesser General Public License
-// along with this program. If not, see <https://www.gnu.org/licenses/>.
-//
-// Written by Bruno Haible <bruno@clisp.org>. Translated to Rust by Ben Pfaff
-// <blp@cs.stanford.edu>.
-
-use lazy_static::lazy_static;
-
-fn map_aliases(s: &str) -> &'static str {
- #[cfg(target_os = "freebsd")]
- match s {
- "ARMSCII-8" => return "ARMSCII-8",
- "Big5" => return "BIG5",
- "C" => return "ASCII",
- "CP1131" => return "CP1131",
- "CP1251" => return "CP1251",
- "CP866" => return "CP866",
- "GB18030" => return "GB18030",
- "GB2312" => return "GB2312",
- "GBK" => return "GBK",
- "ISCII-DEV" => return "?",
- "ISO8859-1" => return "ISO-8859-1",
- "ISO8859-13" => return "ISO-8859-13",
- "ISO8859-15" => return "ISO-8859-15",
- "ISO8859-2" => return "ISO-8859-2",
- "ISO8859-5" => return "ISO-8859-5",
- "ISO8859-7" => return "ISO-8859-7",
- "ISO8859-9" => return "ISO-8859-9",
- "KOI8-R" => return "KOI8-R",
- "KOI8-U" => return "KOI8-U",
- "SJIS" => return "SHIFT_JIS",
- "US-ASCII" => return "ASCII",
- "eucCN" => return "GB2312",
- "eucJP" => return "EUC-JP",
- "eucKR" => return "EUC-KR",
- _ => (),
- };
-
- #[cfg(target_os = "netbsd")]
- match s {
- "646" => return "ASCII",
- "ARMSCII-8" => return "ARMSCII-8",
- "BIG5" => return "BIG5",
- "Big5-HKSCS" => return "BIG5-HKSCS",
- "CP1251" => return "CP1251",
- "CP866" => return "CP866",
- "GB18030" => return "GB18030",
- "GB2312" => return "GB2312",
- "ISO8859-1" => return "ISO-8859-1",
- "ISO8859-13" => return "ISO-8859-13",
- "ISO8859-15" => return "ISO-8859-15",
- "ISO8859-2" => return "ISO-8859-2",
- "ISO8859-4" => return "ISO-8859-4",
- "ISO8859-5" => return "ISO-8859-5",
- "ISO8859-7" => return "ISO-8859-7",
- "KOI8-R" => return "KOI8-R",
- "KOI8-U" => return "KOI8-U",
- "PT154" => return "PT154",
- "SJIS" => return "SHIFT_JIS",
- "eucCN" => return "GB2312",
- "eucJP" => return "EUC-JP",
- "eucKR" => return "EUC-KR",
- "eucTW" => return "EUC-TW",
- _ => (),
- };
-
- #[cfg(target_os = "openbsd")]
- match s {
- "646" => return "ASCII",
- "ISO8859-1" => return "ISO-8859-1",
- "ISO8859-13" => return "ISO-8859-13",
- "ISO8859-15" => return "ISO-8859-15",
- "ISO8859-2" => return "ISO-8859-2",
- "ISO8859-4" => return "ISO-8859-4",
- "ISO8859-5" => return "ISO-8859-5",
- "ISO8859-7" => return "ISO-8859-7",
- "US-ASCII" => return "ASCII",
- _ => (),
- };
-
- /* Darwin 7.5 has nl_langinfo(CODESET), but sometimes its value is
- useless:
- - It returns the empty string when LANG is set to a locale of the
- form ll_CC, although ll_CC/LC_CTYPE is a symlink to an UTF-8
- LC_CTYPE file.
- - The environment variables LANG, LC_CTYPE, LC_ALL are not set by
- the system; nl_langinfo(CODESET) returns "US-ASCII" in this case.
- - The documentation says:
- "... all code that calls BSD system routines should ensure
- that the const *char parameters of these routines are in UTF-8
- encoding. All BSD system functions expect their string
- parameters to be in UTF-8 encoding and nothing else."
- It also says
- "An additional caveat is that string parameters for files,
- paths, and other file-system entities must be in canonical
- UTF-8. In a canonical UTF-8 Unicode string, all decomposable
- characters are decomposed ..."
- but this is not true: You can pass non-decomposed UTF-8 strings
- to file system functions, and it is the OS which will convert
- them to decomposed UTF-8 before accessing the file system.
- - The Apple Terminal application displays UTF-8 by default.
- - However, other applications are free to use different encodings:
- - xterm uses ISO-8859-1 by default.
- - TextEdit uses MacRoman by default.
- We prefer UTF-8 over decomposed UTF-8-MAC because one should
- minimize the use of decomposed Unicode. Unfortunately, through the
- Darwin file system, decomposed UTF-8 strings are leaked into user
- space nevertheless.
- Then there are also the locales with encodings other than US-ASCII
- and UTF-8. These locales can be occasionally useful to users (e.g.
- when grepping through ISO-8859-1 encoded text files), when all their
- file names are in US-ASCII.
- */
-
- #[cfg(target_os = "macos")]
- match s {
- "ARMSCII-8" => return "ARMSCII-8",
- "Big5" => return "BIG5",
- "Big5HKSCS" => return "BIG5-HKSCS",
- "CP1131" => return "CP1131",
- "CP1251" => return "CP1251",
- "CP866" => return "CP866",
- "CP949" => return "CP949",
- "GB18030" => return "GB18030",
- "GB2312" => return "GB2312",
- "GBK" => return "GBK",
- "ISO8859-1" => return "ISO-8859-1",
- "ISO8859-13" => return "ISO-8859-13",
- "ISO8859-15" => return "ISO-8859-15",
- "ISO8859-2" => return "ISO-8859-2",
- "ISO8859-4" => return "ISO-8859-4",
- "ISO8859-5" => return "ISO-8859-5",
- "ISO8859-7" => return "ISO-8859-7",
- "ISO8859-9" => return "ISO-8859-9",
- "KOI8-R" => return "KOI8-R",
- "KOI8-U" => return "KOI8-U",
- "PT154" => return "PT154",
- "SJIS" => return "SHIFT_JIS",
- "eucCN" => return "GB2312",
- "eucJP" => return "EUC-JP",
- "eucKR" => return "EUC-KR",
- _ => (),
- };
-
- #[cfg(target_os = "aix")]
- match s {
- "GBK" => return "GBK",
- "IBM-1046" => return "CP1046",
- "IBM-1124" => return "CP1124",
- "IBM-1129" => return "CP1129",
- "IBM-1252" => return "CP1252",
- "IBM-850" => return "CP850",
- "IBM-856" => return "CP856",
- "IBM-921" => return "ISO-8859-13",
- "IBM-922" => return "CP922",
- "IBM-932" => return "CP932",
- "IBM-943" => return "CP943",
- "IBM-eucCN" => return "GB2312",
- "IBM-eucJP" => return "EUC-JP",
- "IBM-eucKR" => return "EUC-KR",
- "IBM-eucTW" => return "EUC-TW",
- "ISO8859-1" => return "ISO-8859-1",
- "ISO8859-15" => return "ISO-8859-15",
- "ISO8859-2" => return "ISO-8859-2",
- "ISO8859-5" => return "ISO-8859-5",
- "ISO8859-6" => return "ISO-8859-6",
- "ISO8859-7" => return "ISO-8859-7",
- "ISO8859-8" => return "ISO-8859-8",
- "ISO8859-9" => return "ISO-8859-9",
- "TIS-620" => return "TIS-620",
- "UTF-8" => return "UTF-8",
- "big5" => return "BIG5",
- _ => (),
- };
-
- #[cfg(windows)]
- match s {
- "CP1361" => return "JOHAB",
- "CP20127" => return "ASCII",
- "CP20866" => return "KOI8-R",
- "CP20936" => return "GB2312",
- "CP21866" => return "KOI8-RU",
- "CP28591" => return "ISO-8859-1",
- "CP28592" => return "ISO-8859-2",
- "CP28593" => return "ISO-8859-3",
- "CP28594" => return "ISO-8859-4",
- "CP28595" => return "ISO-8859-5",
- "CP28596" => return "ISO-8859-6",
- "CP28597" => return "ISO-8859-7",
- "CP28598" => return "ISO-8859-8",
- "CP28599" => return "ISO-8859-9",
- "CP28605" => return "ISO-8859-15",
- "CP38598" => return "ISO-8859-8",
- "CP51932" => return "EUC-JP",
- "CP51936" => return "GB2312",
- "CP51949" => return "EUC-KR",
- "CP51950" => return "EUC-TW",
- "CP54936" => return "GB18030",
- "CP65001" => return "UTF-8",
- "CP936" => return "GBK",
- _ => (),
- };
-
- String::from(s).leak()
-}
-
-#[cfg(unix)]
-mod inner {
- use std::{
- ffi::{c_int, CStr, CString},
- ptr::null,
- };
-
- use libc::{self, nl_langinfo, setlocale, CODESET, LC_CTYPE};
-
- unsafe fn string_from_pointer(s: *const i8) -> Option<String> {
- if s.is_null() {
- None
- } else {
- Some(CStr::from_ptr(s).to_string_lossy().into())
- }
- }
-
- fn set_locale(category: c_int, locale: Option<&str>) -> Option<String> {
- unsafe {
- let locale = locale.map(|s| CString::new(s).unwrap());
- let locale_ptr = locale.as_ref().map_or(null(), |s| s.as_ptr());
- string_from_pointer(setlocale(category, locale_ptr))
- }
- }
-
- pub fn locale_charset() -> Option<String> {
- unsafe {
- let saved_locale = set_locale(LC_CTYPE, None);
- set_locale(LC_CTYPE, Some(""));
- let codeset = string_from_pointer(nl_langinfo(CODESET));
- set_locale(LC_CTYPE, saved_locale.as_deref());
- codeset
- }
- }
-}
-
-#[cfg(windows)]
-mod inner {
- use libc::{setlocale, LC_CTYPE};
- use std::ffi::{CStr, CString};
- use windows_sys::Win32::Globalization::GetACP;
-
- fn current_locale() -> Option<String> {
- unsafe {
- let empty_cstr = CString::new("").unwrap();
- let locale = setlocale(LC_CTYPE, empty_cstr.as_ptr());
- if locale.is_null() {
- None
- } else {
- Some(CStr::from_ptr(locale).to_string_lossy().into())
- }
- }
- }
-
- pub fn locale_charset() -> Option<String> {
- let Some(current_locale) = current_locale() else {
- return None;
- };
- let codepage = if let Some((_, pdot)) = current_locale.rsplit_once('.') {
- format!("CP{pdot}")
- } else {
- format!("CP{}", unsafe { GetACP() })
- };
- Some(match codepage.as_str() {
- "CP65001" | "CPutf8" => String::from("UTF-8"),
- _ => codepage,
- })
- }
-}
-
-#[cfg(not(any(unix, windows)))]
-mod inner {
- pub fn locale_charse() -> String {
- String::from("UTF-8")
- }
-}
-
-/// Returns the character set used by the locale configured in the operating
-/// system.
-pub fn locale_charset() -> &'static str {
- lazy_static! {
- static ref LOCALE_CHARSET: &'static str =
- map_aliases(&inner::locale_charset().unwrap_or(String::from("UTF-8")));
- }
- &LOCALE_CHARSET
-}
+++ /dev/null
-use lazy_static::lazy_static;
-use num::Integer;
-use std::{
- cell::RefCell,
- cmp::Ordering,
- collections::{BTreeMap, HashMap, HashSet},
- mem::take,
- num::NonZeroUsize,
- ops::RangeInclusive,
-};
-use thiserror::Error as ThisError;
-use unicase::UniCase;
-
-use crate::{
- identifier::Identifier,
- lex::{
- scan::{ScanError, ScanToken, StringScanner, StringSegmenter},
- segment::Mode,
- token::{Punct, Token},
- },
- message::Location,
- settings::Settings,
-};
-
-#[derive(Clone, Debug, ThisError)]
-pub enum MacroError {
- /// Expected more tokens.
- #[error(
- "Reached end of command expecting {n} more tokens in argument {arg} to macro {macro_}."
- )]
- ExpectedMoreTokens {
- n: usize,
- arg: Identifier,
- macro_: Identifier,
- },
-
- /// Expected a particular token at end of command.
- #[error("Reached end of command expecting {token:?} in argument {arg} to macro {macro_}.")]
- ExpectedToken {
- token: String,
- arg: Identifier,
- macro_: Identifier,
- },
-
- /// Expected a particular token, got a different one.
- #[error(
- "Found `{actual}` while expecting `{expected}` reading argument {arg} to macro {macro_}."
- )]
- UnexpectedToken {
- actual: String,
- expected: String,
- arg: Identifier,
- macro_: Identifier,
- },
-
- /// Argument specified multiple times,
- #[error("Argument {arg} specified multiple times in call to macro {macro_}.")]
- DuplicateArg { arg: Identifier, macro_: Identifier },
-
- /// Maximum nesting limit exceeded.
- #[error("Maximum nesting level {limit} exceeded. (Use `SET MNEST` to change the limit.)")]
- TooDeep { limit: usize },
-
- /// Invalid `!*`.
- #[error("`!*` may only be used within the expansion of a macro.")]
- InvalidBangAsterisk,
-
- /// Error tokenizing during expansion.
- #[error(transparent)]
- ScanError(ScanError),
-
- /// Expecting `)` in macro expression.
- #[error("Expecting `)` in macro expression.")]
- ExpectingRParen,
-
- /// Expecting literal.
- #[error("Expecting literal or function invocation in macro expression.")]
- ExpectingLiteral,
-
- /// Expecting `!THEN`.
- #[error("`!THEN` expected in macro `!IF` construct.")]
- ExpectingThen,
-
- /// Expecting `!ELSE` or `!THEN`.
- #[error("`!ELSE` or `!THEN` expected in macro `!IF` construct.")]
- ExpectingElseOrIfEnd,
-
- /// Expecting `!IFEND`.
- #[error("`!IFEND` expected in macro `!IF` construct.")]
- ExpectingIfEnd,
-
- /// Expecting macro variable name.
- #[error("Expecting macro variable name following `{0}`.")]
- ExpectingMacroVarName(&'static str),
-
- /// Invalid macro variable name.
- #[error("Cannot use argument name or macro keyword {name} as `{construct}` variable name.")]
- BadMacroVarName {
- name: Identifier,
- construct: &'static str,
- },
-
- /// Expecting `=` following `!LET`.
- #[error("Expecting `=` following `!LET`.")]
- ExpectingEquals,
-
- /// Expecting `=` or `!IN` in `!DO` loop.
- #[error("Expecting `=` or `!IN` in `!DO` loop.")]
- ExpectingEqualsOrIn,
-
- /// Missing `!DOEND`.
- #[error("Missing `!DOEND`.")]
- MissingDoEnd,
-
- /// Bad numberic macro expression.
- #[error("Macro expression must evaluate to a number (not {0:?})")]
- BadNumericMacroExpression(String),
-
- /// Too many iteration for list-based loop.
- #[error("`!DO` loop over list exceeded maximum number of iterations {0}. (Use `SET MITERATE` to change the limit.)")]
- MiterateList(usize),
-
- /// Too many iteration for numerical loop.
- #[error("Numerical `!DO` loop exceeded maximum number of iterations {0}. (Use `SET MITERATE` to change the limit.)")]
- MiterateNumeric(usize),
-
- /// Expecting `!TO` in numerical `!DO` loop.
- #[error("Expecting `!TO` in numerical `!DO` loop.")]
- ExpectingTo,
-
- /// `!BY` value cannot be zero.
- #[error("`!BY` value cannot be zero.")]
- ZeroBy,
-
- /// `!BREAK` outside `!DO`.
- #[error("`!BREAK` outside `!DO`.")]
- BreakOutsideDo,
-
- /// `,` or `)` expected in call to macro function.
- #[error("`,` or `)` expected in call to macro function `{0}`.")]
- ExpectingCommaOrRParen(Identifier),
-
- /// Macro function takes one argument.
- #[error("Macro function `{name}` takes one argument (not {n_args}).")]
- ExpectingOneArg { name: Identifier, n_args: usize },
-
- /// Macro function takes two arguments.
- #[error("Macro function `{name}` takes two arguments (not {n_args}).")]
- ExpectingTwoArgs { name: Identifier, n_args: usize },
-
- /// Macro function takes two or three arguments.
- #[error("Macro function `{name}` takes two or three arguments (not {n_args}).")]
- ExpectingTwoOrThreeArgs { name: Identifier, n_args: usize },
-
- /// Macro function needs at least one argument).
- #[error("Macro function `{name}` needs at least one argument).")]
- ExpectingOneOrMoreArgs { name: Identifier },
-
- /// Argument to `!BLANKS` must be non-negative integer (not `{0}`).
- #[error("Argument to `!BLANKS` must be non-negative integer (not `{0}`).")]
- InvalidBlanks(String),
-
- /// Second argument of `!SUBSTR` must be positive integer (not `{0}`).
- #[error("Second argument of `!SUBSTR` must be positive integer (not `{0}`).")]
- InvalidSubstr2(String),
-
- /// Third argument of `!SUBSTR` must be non-negative integer (not `{0}`).
- #[error("Third argument of `!SUBSTR` must be non-negative integer (not `{0}`).")]
- InvalidSubstr3(String),
-}
-
-/// A PSPP macro as defined with `!DEFINE`.
-pub struct Macro {
- /// The macro's name. This is an ordinary identifier except that it is
- /// allowed (but not required) to begin with `!`.
- pub name: Identifier,
-
- /// Source code location of macro definition, for error reporting.
- pub location: Location,
-
- /// Parameters.
- parameters: Vec<Parameter>,
-
- /// Body.
- body: Vec<MacroToken>,
-}
-
-impl Macro {
- fn initial_state(&self) -> ParserState {
- if self.parameters.is_empty() {
- ParserState::Finished
- } else if self.parameters[0].is_positional() {
- ParserState::Keyword
- } else if let ValueType::Enclose(_, _) = self.parameters[0].arg {
- ParserState::Enclose
- } else {
- ParserState::Arg
- }
- }
-
- fn find_parameter(&self, name: &Identifier) -> Option<usize> {
- self.parameters.iter().position(|param| ¶m.name == name)
- }
-}
-
-struct Parameter {
- /// `!name` or `!1`.
- name: Identifier,
-
- /// Default value.
- ///
- /// The tokens don't include white space, etc. between them.
- default: Vec<MacroToken>,
-
- /// Macro-expand the value?
- expand_value: bool,
-
- /// How the argument is specified.
- arg: ValueType,
-}
-
-impl Parameter {
- /// Returns true if this is a positional parameter. Positional parameters
- /// are expanded by index (position) rather than by name.
- fn is_positional(&self) -> bool {
- self.name.0.as_bytes()[1].is_ascii_digit()
- }
-}
-
-enum ValueType {
- /// Argument consists of `.0` tokens.
- NTokens(usize),
-
- /// Argument runs until token `.0`.
- CharEnd(Token),
-
- /// Argument starts with token `.0` and ends with token `.1`.
- Enclose(Token, Token),
-
- /// Argument runs until the end of the command.
- CmdEnd,
-}
-
-/// A token and the syntax that was tokenized to produce it. The syntax allows
-/// the token to be turned back into syntax accurately.
-#[derive(Clone)]
-pub struct MacroToken {
- /// The token.
- pub token: Token,
-
- /// The syntax that produces `token`.
- pub syntax: String,
-}
-
-fn tokenize_string_into(
- s: &str,
- mode: Mode,
- error: &impl Fn(MacroError),
- output: &mut Vec<MacroToken>,
-) {
- for (syntax, token) in StringSegmenter::new(s, mode, true) {
- match token {
- ScanToken::Token(token) => output.push(MacroToken {
- token,
- syntax: String::from(syntax),
- }),
- ScanToken::Error(scan_error) => error(MacroError::ScanError(scan_error)),
- }
- }
-}
-
-fn tokenize_string(s: &str, mode: Mode, error: &impl Fn(MacroError)) -> Vec<MacroToken> {
- let mut tokens = Vec::new();
- tokenize_string_into(s, mode, error, &mut tokens);
- tokens
-}
-
-fn try_unquote_string(input: &String, mode: Mode) -> Option<String> {
- let mut scanner = StringScanner::new(input, mode, true);
- let Some(ScanToken::Token(Token::String(unquoted))) = scanner.next() else {
- return None;
- };
- let None = scanner.next() else { return None };
- return Some(unquoted);
-}
-
-fn unquote_string(input: String, mode: Mode) -> String {
- try_unquote_string(&input, mode).unwrap_or(input)
-}
-
-#[derive(Clone)]
-struct MacroTokens<'a>(&'a [MacroToken]);
-
-impl<'a> MacroTokens<'a> {
- fn is_empty(&self) -> bool {
- self.0.is_empty()
- }
- fn match_(&mut self, s: &str) -> bool {
- if let Some((first, rest)) = self.0.split_first() {
- if first.syntax.eq_ignore_ascii_case(s) {
- self.0 = rest;
- return true;
- }
- }
- false
- }
- fn take_relop(&mut self) -> Option<RelOp> {
- if let Some((first, rest)) = self.0.split_first() {
- if let Ok(relop) = first.syntax.as_str().try_into() {
- self.0 = rest;
- return Some(relop);
- }
- }
- None
- }
- fn macro_id(&self) -> Option<&Identifier> {
- self.0.get(0).map(|mt| mt.token.macro_id()).flatten()
- }
- fn take_macro_id(&mut self) -> Option<&Identifier> {
- let result = self.0.get(0).map(|mt| mt.token.macro_id()).flatten();
- if result.is_some() {
- self.advance();
- }
- result
- }
- fn take(&mut self) -> Option<&MacroToken> {
- match self.0.split_first() {
- Some((first, rest)) => {
- self.0 = rest;
- Some(first)
- }
- None => None,
- }
- }
- fn advance(&mut self) -> &MacroToken {
- let (first, rest) = self.0.split_first().unwrap();
- self.0 = rest;
- first
- }
-}
-
-#[derive(Copy, Clone, Debug, PartialEq, Eq)]
-enum TokenClass {
- /// No space before or after (new-line after).
- EndCommand,
-
- /// Space on both sides.
- BinaryOperator,
-
- /// Space afterward.
- Comma,
-
- /// Don't need spaces except sequentially.
- Id,
-
- /// Don't need spaces except sequentially.
- Punct,
-}
-
-impl TokenClass {
- fn separator(prev: Self, next: Self) -> &'static str {
- match (prev, next) {
- // Don't need a separator before the end of a command, but we
- // need a new-line afterward.
- (_, Self::EndCommand) => "",
- (Self::EndCommand, _) => "\n",
-
- // Binary operators always have a space on both sides, and a comma always has a space afterward.
- (Self::BinaryOperator, _) | (_, Self::BinaryOperator) | (Self::Comma, _) => " ",
-
- // Otherwise, `prev` is `Self::Punct`, which only need a space if
- // there are two or them in a row.
- (Self::Punct, Self::Punct) => " ",
- _ => "",
- }
- }
-}
-
-impl From<&Token> for TokenClass {
- fn from(source: &Token) -> Self {
- match source {
- Token::End => Self::Punct,
- Token::Id(_) | Token::Number(_) | Token::String(_) => Self::Id,
- Token::EndCommand => Self::EndCommand,
- Token::Punct(punct) => match punct {
- Punct::LParen
- | Punct::RParen
- | Punct::LSquare
- | Punct::RSquare
- | Punct::LCurly
- | Punct::RCurly => Self::Punct,
-
- Punct::Plus
- | Punct::Dash
- | Punct::Asterisk
- | Punct::Slash
- | Punct::Equals
- | Punct::Colon
- | Punct::And
- | Punct::Or
- | Punct::Not
- | Punct::Eq
- | Punct::Ge
- | Punct::Gt
- | Punct::Le
- | Punct::Lt
- | Punct::Ne
- | Punct::All
- | Punct::By
- | Punct::To
- | Punct::With
- | Punct::Exp
- | Punct::Bang
- | Punct::Percent
- | Punct::Question
- | Punct::Backtick
- | Punct::Dot
- | Punct::Underscore
- | Punct::BangAsterisk => Self::BinaryOperator,
-
- Punct::Comma | Punct::Semicolon => Self::Comma,
- },
- }
- }
-}
-
-pub fn macro_tokens_to_syntax(input: &[MacroToken]) -> impl Iterator<Item = [&str; 2]> {
- input
- .iter()
- .take(1)
- .map(|token| ["", token.syntax.as_str()])
- .chain(input.windows(2).map(|w| {
- let c0 = (&w[0].token).into();
- let c1 = (&w[1].token).into();
- [TokenClass::separator(c0, c1), w[1].syntax.as_str()]
- }))
-}
-
-trait MacroId {
- fn macro_id(&self) -> Option<&Identifier>;
-}
-
-impl MacroId for Token {
- fn macro_id(&self) -> Option<&Identifier> {
- let id = self.id()?;
- id.0.starts_with('!').then_some(id)
- }
-}
-
-enum RelOp {
- Eq,
- Ne,
- Lt,
- Gt,
- Le,
- Ge,
-}
-
-impl TryFrom<&str> for RelOp {
- type Error = ();
-
- fn try_from(source: &str) -> Result<Self, Self::Error> {
- match source {
- "=" => Ok(Self::Eq),
- "~=" | "<>" => Ok(Self::Ne),
- "<" => Ok(Self::Lt),
- ">" => Ok(Self::Gt),
- "<=" => Ok(Self::Le),
- ">=" => Ok(Self::Ge),
- _ if source.len() == 3 && source.as_bytes()[0] == b'!' => match (
- source.as_bytes()[0].to_ascii_uppercase(),
- source.as_bytes()[1].to_ascii_uppercase(),
- ) {
- (b'E', b'Q') => Ok(Self::Eq),
- (b'N', b'E') => Ok(Self::Ne),
- (b'L', b'T') => Ok(Self::Lt),
- (b'L', b'E') => Ok(Self::Le),
- (b'G', b'T') => Ok(Self::Gt),
- (b'G', b'E') => Ok(Self::Ge),
- _ => Err(()),
- },
- _ => Err(()),
- }
- }
-}
-
-impl RelOp {
- fn evaluate(&self, cmp: Ordering) -> bool {
- match self {
- RelOp::Eq => cmp == Ordering::Equal,
- RelOp::Ne => cmp != Ordering::Equal,
- RelOp::Lt => cmp == Ordering::Less,
- RelOp::Gt => cmp == Ordering::Greater,
- RelOp::Le => cmp != Ordering::Greater,
- RelOp::Ge => cmp != Ordering::Less,
- }
- }
-}
-
-pub type MacroSet = HashMap<UniCase<String>, Macro>;
-
-enum ParserState {
- /// Accumulating tokens toward the end of any type of argument.
- Arg,
-
- /// Expecting the opening delimiter of an ARG_ENCLOSE argument.
- Enclose,
-
- /// Expecting a keyword for a keyword argument.
- Keyword,
-
- /// Expecting an equal sign for a keyword argument.
- Equals,
-
- /// Macro fully parsed and ready for expansion.
- Finished,
-}
-
-/// Macro call parser FSM.
-pub struct Parser<'a> {
- macros: &'a MacroSet,
- macro_: &'a Macro,
- state: ParserState,
- args: Box<[Option<Vec<MacroToken>>]>,
- arg_index: usize,
-
- /// Length of macro call so far.
- n_tokens: usize,
-}
-
-#[derive(Copy, Clone, Debug, PartialEq, Eq)]
-pub enum ParseStatus {
- Complete,
- Incomplete,
-}
-
-impl<'a> Parser<'a> {
- pub fn new(macros: &'a MacroSet, token: &Token) -> Option<Self> {
- let macro_ = macros.get(&token.id()?.0)?;
- Some(Self {
- macros,
- macro_,
- state: macro_.initial_state(),
- args: (0..macro_.parameters.len()).map(|_| None).collect(),
- arg_index: 0,
- n_tokens: 1,
- })
- }
-
- fn finished(&mut self) {
- self.state = ParserState::Finished;
- for (i, arg) in self.args.iter_mut().enumerate() {
- if arg.is_none() {
- *arg = Some(self.macro_.parameters[i].default.clone());
- }
- }
- self.state = ParserState::Finished;
- }
-
- fn next_arg(&mut self) {
- if self.macro_.parameters.is_empty() {
- self.finished()
- } else {
- let param = &self.macro_.parameters[self.arg_index];
- if param.is_positional() {
- self.arg_index += 1;
- if self.arg_index >= self.args.len() {
- self.finished()
- } else {
- let param = &self.macro_.parameters[self.arg_index];
- self.state = if !param.is_positional() {
- ParserState::Keyword
- } else if let ValueType::Enclose(_, _) = param.arg {
- ParserState::Enclose
- } else {
- ParserState::Arg
- };
- }
- } else {
- if self.args.iter().any(|arg| arg.is_none()) {
- self.state = ParserState::Keyword;
- } else {
- self.finished();
- }
- }
- }
- }
-
- fn push_arg(&mut self, token: &Token, syntax: &str, error: &impl Fn(MacroError)) {
- let param = &self.macro_.parameters[self.args.len() - 1];
- if let Token::EndCommand | Token::End = token {
- if let Some(arg) = &self.args[self.arg_index] {
- let param = &self.macro_.parameters[self.args.len() - 1];
-
- match ¶m.arg {
- ValueType::NTokens(n) => error(MacroError::ExpectedMoreTokens {
- n: n - arg.len(),
- arg: param.name.clone(),
- macro_: self.macro_.name.clone(),
- }),
- ValueType::CharEnd(end) | ValueType::Enclose(_, end) => {
- error(MacroError::ExpectedToken {
- token: end.to_string(),
- arg: param.name.clone(),
- macro_: self.macro_.name.clone(),
- })
- }
- ValueType::CmdEnd => {
- // This is OK, it's the expected way to end the argument.
- }
- }
- }
- self.finished();
- }
-
- self.n_tokens += 1;
- let arg = self.args[self.arg_index].get_or_insert(Vec::new());
- let (
- add_token, // Should we add `mt` to the current arg?
- next_arg, // Should we advance to the next arg?
- ) = match ¶m.arg {
- ValueType::NTokens(n) => (arg.len() + 1 >= *n, true),
- ValueType::CharEnd(end) | ValueType::Enclose(_, end) => {
- let at_end = token == end;
- (at_end, !at_end)
- }
- ValueType::CmdEnd => (false, true),
- };
- if add_token {
- if true
- // !macro_expand_arg (&mt->token, mc->me, *argp)
- {
- arg.push(MacroToken {
- token: token.clone(),
- syntax: String::from(syntax),
- });
- }
- }
- if next_arg {
- self.next_arg()
- }
- }
-
- fn push_enclose(&mut self, token: &Token, syntax: &str, error: &impl Fn(MacroError)) {
- let param = &self.macro_.parameters[self.arg_index];
- let ValueType::Enclose(start, _) = ¶m.arg else {
- unreachable!()
- };
- if token == start {
- self.n_tokens += 1;
- self.args[self.arg_index].get_or_insert(Vec::new());
- self.state = ParserState::Arg;
- } else if param.is_positional() && matches!(token, Token::End | Token::EndCommand) {
- self.finished();
- } else {
- error(MacroError::UnexpectedToken {
- actual: String::from(syntax),
- expected: start.to_string(),
- arg: param.name.clone(),
- macro_: self.macro_.name.clone(),
- });
- self.finished();
- }
- }
-
- fn push_keyword(&mut self, token: &Token, _syntax: &str, error: &impl Fn(MacroError)) {
- let Some(id) = token.id() else {
- return self.finished();
- };
- let Some(arg_index) = self.macro_.find_parameter(id) else {
- return self.finished();
- };
- self.arg_index = arg_index;
- if self.args[arg_index].is_some() {
- error(MacroError::DuplicateArg {
- arg: id.clone(),
- macro_: self.macro_.name.clone(),
- });
- }
- self.args[arg_index] = Some(Vec::new());
- }
-
- fn push_equals(&mut self, token: &Token, syntax: &str, error: &impl Fn(MacroError)) {
- let param = &self.macro_.parameters[self.arg_index];
- if let Token::Punct(Punct::Eq) = token {
- self.n_tokens += 1;
- self.state = if let ValueType::Enclose(_, _) = param.arg {
- ParserState::Enclose
- } else {
- ParserState::Arg
- };
- } else {
- error(MacroError::UnexpectedToken {
- actual: syntax.into(),
- expected: String::from("="),
- arg: param.name.clone(),
- macro_: self.macro_.name.clone(),
- });
- self.finished()
- }
- }
-
- /// Adds `token`, which has the given `syntax`, to the collection of tokens
- /// in `self` that potentially need to be macro expanded.
- ///
- /// Returns [ParseStatus::Incomplete] if the macro expander needs more
- /// tokens, for macro arguments or to decide whether this is actually a
- /// macro invocation. The caller should call `push` again with the next
- /// token.
- ///
- /// Returns [ParseStatus::Complete] if the macro invocation is now complete.
- /// The caller should call [`Self::finish()`] to obtain the expansion.
- pub fn push(
- &mut self,
- token: &Token,
- syntax: &str,
- error: &impl Fn(MacroError),
- ) -> ParseStatus {
- match self.state {
- ParserState::Arg => self.push_arg(token, syntax, error),
- ParserState::Enclose => self.push_enclose(token, syntax, error),
- ParserState::Keyword => self.push_keyword(token, syntax, error),
- ParserState::Equals => self.push_equals(token, syntax, error),
- ParserState::Finished => (),
- }
- if let ParserState::Finished = self.state {
- ParseStatus::Complete
- } else {
- ParseStatus::Incomplete
- }
- }
-
- pub fn finish(self) -> Call<'a> {
- let ParserState::Finished = self.state else {
- panic!()
- };
- Call(self)
- }
-}
-
-/// Expansion stack entry.
-struct Frame {
- /// A macro name or `!IF`, `!DO`, etc.
- name: Option<Identifier>,
-
- /// Source location, if available.
- location: Option<Location>,
-}
-
-struct Expander<'a> {
- /// Macros to expand recursively.
- macros: &'a MacroSet,
-
- /// Error reporting callback.
- error: &'a Box<dyn Fn(MacroError) + 'a>,
-
- /// Tokenization mode.
- mode: Mode,
-
- /// Remaining nesting levels.
- nesting_countdown: usize,
-
- /// Stack for error reporting.
- stack: Vec<Frame>,
-
- // May macro calls be expanded?
- expand: &'a RefCell<bool>,
-
- /// Variables from `!DO` and `!LET`.
- vars: &'a RefCell<BTreeMap<Identifier, String>>,
-
- // Only set if inside a `!DO` loop. If true, break out of the loop.
- break_: Option<&'a mut bool>,
-
- /// Only set if expanding a macro (and not, say, a macro argument).
- macro_: Option<&'a Macro>,
-
- /// Only set if expanding a macro (and not, say, a macro argument).
- args: Option<&'a [Option<Vec<MacroToken>>]>,
-}
-
-fn bool_to_string(b: bool) -> String {
- if b {
- String::from("1")
- } else {
- String::from("0")
- }
-}
-
-#[derive(Copy, Clone, Debug, PartialEq, Eq)]
-enum IfEndClause {
- Else,
- IfEnd,
-}
-
-fn macro_keywords() -> HashSet<Identifier> {
- let mut keywords = HashSet::new();
- for kw in [
- "!BREAK",
- "!CHAREND",
- "!CMDEND",
- "!DEFAULT",
- "!DO",
- "!DOEND",
- "!ELSE",
- "!ENCLOSE",
- "!ENDDEFINE",
- "!IF",
- "!IFEND",
- "!IN",
- "!LET",
- "!NOEXPAND",
- "!OFFEXPAND",
- "!ONEXPAND",
- "!POSITIONAL",
- "!THEN",
- "!TOKENS",
- ] {
- keywords.insert(Identifier::new(kw).unwrap());
- }
- keywords
-}
-
-fn is_macro_keyword(s: &Identifier) -> bool {
- lazy_static! {
- static ref KEYWORDS: HashSet<Identifier> = macro_keywords();
- }
- KEYWORDS.contains(s)
-}
-
-enum DoInput {
- List(Vec<String>),
- Up { first: f64, last: f64, by: f64 },
- Down { first: f64, last: f64, by: f64 },
- Empty,
-}
-
-impl DoInput {
- fn from_list(items: Vec<MacroToken>) -> Self {
- Self::List(
- items
- .into_iter()
- .rev()
- .take(Settings::global().macros.max_iterations + 1)
- .map(|mt| mt.syntax)
- .collect(),
- )
- }
-
- fn from_by(first: f64, last: f64, by: f64) -> Self {
- if by > 0.0 && first <= last {
- Self::Up { first, last, by }
- } else if by > 0.0 && first <= last {
- Self::Down { first, last, by }
- } else {
- Self::Empty
- }
- }
-}
-
-impl Iterator for DoInput {
- type Item = String;
-
- fn next(&mut self) -> Option<Self::Item> {
- match self {
- DoInput::List(vec) => vec.pop(),
- DoInput::Up { first, last, by } => {
- if first <= last {
- let value = *first;
- *first += *by;
- Some(format!("{value}"))
- } else {
- None
- }
- }
- DoInput::Down { first, last, by } => {
- if first >= last {
- let value = *first;
- *first += *by;
- Some(format!("{value}"))
- } else {
- None
- }
- }
- DoInput::Empty => None,
- }
- }
-}
-
-impl<'a> Expander<'a> {
- fn may_expand(&self) -> bool {
- *self.expand.borrow()
- }
-
- fn should_break(&self) -> bool {
- self.break_.as_ref().map(|b| **b).unwrap_or(false)
- }
-
- fn expand(&mut self, input: &mut MacroTokens, output: &mut Vec<MacroToken>) {
- if self.nesting_countdown == 0 {
- (self.error)(MacroError::TooDeep {
- limit: Settings::global().macros.max_nest,
- });
- output.extend(take(&mut input.0).iter().cloned());
- } else {
- while !input.is_empty() && !self.should_break() {
- self.expand__(input, output);
- }
- }
- }
-
- fn expand_arg(&mut self, param_idx: usize, output: &mut Vec<MacroToken>) {
- let param = &self.macro_.unwrap().parameters[param_idx];
- let arg = &self.args.unwrap()[param_idx].as_ref().unwrap();
- if self.may_expand() && param.expand_value {
- let vars = RefCell::new(BTreeMap::new());
- let mut stack = take(&mut self.stack);
- stack.push(Frame {
- name: Some(param.name.clone()),
- location: None,
- });
- let mut subexpander = Expander {
- stack,
- vars: &vars,
- break_: None,
- macro_: None,
- args: None,
- ..*self
- };
- let mut arg_tokens = MacroTokens(&arg);
- subexpander.expand(&mut arg_tokens, output);
- self.stack = subexpander.stack;
- self.stack.pop();
- } else {
- output.extend(arg.iter().cloned());
- }
- }
- fn parse_function_args(
- &mut self,
- function: &Identifier,
- input: &mut MacroTokens,
- ) -> Option<Vec<String>> {
- input.advance();
- input.advance();
- let mut args = Vec::new();
- if input.match_(")") {
- return Some(args);
- }
- loop {
- args.push(self.parse_function_arg(input)?);
- match input.take() {
- Some(MacroToken {
- token: Token::Punct(Punct::Comma),
- ..
- }) => (),
- Some(MacroToken {
- token: Token::Punct(Punct::RParen),
- ..
- }) => return Some(args),
- _ => {
- (self.error)(MacroError::ExpectingCommaOrRParen(function.clone()));
- return None;
- }
- }
- }
- }
-
- fn expand_blanks(e: &mut Expander, args: Vec<String>) -> Option<String> {
- let Ok(n) = args[0].trim().parse::<usize>() else {
- (e.error)(MacroError::InvalidBlanks(args[0].clone()));
- return None;
- };
- Some(std::iter::repeat(' ').take(n).collect())
- }
-
- fn expand_concat(e: &mut Expander, args: Vec<String>) -> Option<String> {
- Some(
- args.into_iter()
- .map(|arg| unquote_string(arg, e.mode))
- .collect(),
- )
- }
-
- fn expand_eval(e: &mut Expander, args: Vec<String>) -> Option<String> {
- let tokens = tokenize_string(&args[0], e.mode, e.error);
- let mut stack = take(&mut e.stack);
- stack.push(Frame {
- name: Some(Identifier::new("!EVAL").unwrap()),
- location: None,
- });
- let mut break_ = false;
- let mut subexpander = Expander {
- break_: Some(&mut break_),
- stack,
- vars: e.vars,
- ..*e
- };
- let mut output = Vec::new();
- subexpander.expand(&mut MacroTokens(tokens.as_slice()), &mut output);
- subexpander.stack.pop();
- e.stack = subexpander.stack;
- Some(macro_tokens_to_syntax(&output).flatten().collect())
- }
-
- fn expand_head(e: &mut Expander, mut args: Vec<String>) -> Option<String> {
- let arg = unquote_string(args.remove(0), e.mode);
- let mut output = tokenize_string(&arg, e.mode, e.error);
- if output.is_empty() {
- Some(String::new())
- } else {
- Some(output.swap_remove(0).syntax)
- }
- }
-
- fn expand_index(_e: &mut Expander, args: Vec<String>) -> Option<String> {
- let haystack = &args[0];
- let needle = &args[1];
- let position = haystack.find(needle);
- Some(format!(
- "{}",
- position.map_or(0, |position| &haystack[0..position].chars().count() + 1)
- ))
- }
-
- fn expand_length(_e: &mut Expander, args: Vec<String>) -> Option<String> {
- Some(format!("{}", args[0].chars().count()))
- }
-
- fn expand_quote(e: &mut Expander, mut args: Vec<String>) -> Option<String> {
- let arg = args.remove(0);
- if try_unquote_string(&arg, e.mode).is_some() {
- Some(arg)
- } else {
- let mut output = String::with_capacity(arg.len() + 2);
- output.push('\'');
- for c in arg.chars() {
- if c == '"' {
- output.push('\'');
- }
- output.push(c);
- }
- output.push('\'');
- Some(output)
- }
- }
-
- fn expand_substr(e: &mut Expander, args: Vec<String>) -> Option<String> {
- let Ok(start) = args[1].trim().parse::<NonZeroUsize>() else {
- (e.error)(MacroError::InvalidSubstr3(args[0].clone()));
- return None;
- };
- let start = start.get();
- let Ok(count) = args[2].trim().parse::<usize>() else {
- (e.error)(MacroError::InvalidSubstr2(args[0].clone()));
- return None;
- };
-
- Some(args[0].chars().skip(start - 1).take(count).collect())
- }
-
- fn expand_tail(e: &mut Expander, mut args: Vec<String>) -> Option<String> {
- let arg = unquote_string(args.remove(0), e.mode);
- let mut output = tokenize_string(&arg, e.mode, e.error);
- Some(
- output
- .pop()
- .map_or_else(|| String::new(), |tail| tail.syntax),
- )
- }
-
- fn expand_unquote(e: &mut Expander, mut args: Vec<String>) -> Option<String> {
- Some(unquote_string(args.remove(0), e.mode))
- }
-
- fn expand_upcase(e: &mut Expander, mut args: Vec<String>) -> Option<String> {
- Some(unquote_string(args.remove(0), e.mode).to_uppercase())
- }
-
- fn expand_macro_function(&mut self, orig_input: &mut MacroTokens) -> Option<String> {
- let mut input = orig_input.clone();
- let name = input.macro_id()?;
- if name == "!NULL" {
- return Some(String::new());
- }
- if input.0.len() < 2 || !matches!(input.0[1].token, Token::Punct(Punct::LParen)) {
- return None;
- }
-
- struct MacroFunction {
- name: Identifier,
- args: RangeInclusive<usize>,
- parser: fn(&mut Expander, Vec<String>) -> Option<String>,
- }
- impl MacroFunction {
- fn new(
- name: &str,
- args: RangeInclusive<usize>,
- parser: fn(&mut Expander, Vec<String>) -> Option<String>,
- ) -> Self {
- Self {
- name: Identifier::new(name).unwrap(),
- args,
- parser,
- }
- }
- }
- lazy_static! {
- static ref MACRO_FUNCTIONS: [MacroFunction; 11] = [
- MacroFunction::new("!BLANKS", 1..=1, Expander::expand_blanks),
- MacroFunction::new("!CONCAT", 1..=usize::MAX, Expander::expand_concat),
- MacroFunction::new("!HEAD", 1..=1, Expander::expand_head),
- MacroFunction::new("!INDEX", 2..=2, Expander::expand_index),
- MacroFunction::new("!LENGTH", 1..=1, Expander::expand_length),
- MacroFunction::new("!QUOTE", 1..=1, Expander::expand_quote),
- MacroFunction::new("!SUBSTR", 2..=3, Expander::expand_substr),
- MacroFunction::new("!TAIL", 1..=1, Expander::expand_tail),
- MacroFunction::new("!UNQUOTE", 1..=1, Expander::expand_unquote),
- MacroFunction::new("!UPCASE", 1..=1, Expander::expand_upcase),
- MacroFunction::new("!EVAL", 1..=1, Expander::expand_eval),
- ];
- }
-
- let function = MACRO_FUNCTIONS.iter().find(|mf| &mf.name == name)?;
-
- let args = self.parse_function_args(&function.name, &mut input)?;
-
- let n_args = args.len();
- if !function.args.contains(&n_args) {
- let name = function.name.clone();
- let error = match &function.args {
- x if x == &(1..=1) => MacroError::ExpectingOneArg { name, n_args },
- x if x == &(2..=2) => MacroError::ExpectingTwoArgs { name, n_args },
- x if x == &(2..=3) => MacroError::ExpectingTwoOrThreeArgs { name, n_args },
- x if x == &(1..=usize::MAX) => MacroError::ExpectingOneOrMoreArgs { name },
- _ => unreachable!(),
- };
- (self.error)(error);
- return None;
- }
-
- *orig_input = input;
- (function.parser)(self, args)
- }
-
- /// Parses one function argument from `input`. Each argument to a macro
- /// function is one of:
- ///
- /// - A quoted string or other single literal token.
- ///
- /// - An argument to the macro being expanded, e.g. `!1` or a named
- /// argument.
- ///
- /// - `!*`.
- ///
- /// - A function invocation.
- ///
- /// Each function invocation yields a character sequence to be turned into a
- /// sequence of tokens. The case where that character sequence is a single
- /// quoted string is an important special case.
- fn parse_function_arg(&mut self, input: &mut MacroTokens) -> Option<String> {
- if let Some(macro_) = self.macro_ {
- match &input.0.get(0)?.token {
- Token::Id(id) if id.0.starts_with('!') => {
- if let Some(param_idx) = macro_.find_parameter(id) {
- input.advance();
- return Some(
- macro_tokens_to_syntax(self.args.unwrap()[param_idx].as_ref().unwrap())
- .flatten()
- .collect(),
- );
- }
- if let Some(value) = self.vars.borrow().get(id) {
- return Some(value.clone());
- }
-
- if let Some(output) = self.expand_macro_function(input) {
- return Some(output);
- }
- }
- Token::Punct(Punct::BangAsterisk) => {
- let mut arg = String::new();
- for i in 0..macro_.parameters.len() {
- if !macro_.parameters[i].is_positional() {
- break;
- }
- if i > 0 {
- arg.push(' ')
- }
- arg.extend(
- macro_tokens_to_syntax(self.args.unwrap()[i].as_ref().unwrap())
- .flatten(),
- );
- }
- input.advance();
- return Some(arg);
- }
- _ => (),
- }
- }
- Some(input.advance().syntax.clone())
- }
-
- fn evaluate_literal(&mut self, input: &mut MacroTokens) -> Option<String> {
- if input.match_("(") {
- let value = self.evaluate_or(input)?;
- if input.match_(")") {
- Some(value)
- } else {
- (self.error)(MacroError::ExpectingRParen);
- None
- }
- } else if input.match_(")") {
- (self.error)(MacroError::ExpectingLiteral);
- None
- } else {
- Some(unquote_string(self.parse_function_arg(input)?, self.mode))
- }
- }
-
- fn evaluate_relational(&mut self, input: &mut MacroTokens) -> Option<String> {
- let lhs = self.evaluate_literal(input)?;
- let Some(relop) = input.take_relop() else {
- return Some(lhs);
- };
- let rhs = self.evaluate_literal(input)?;
- let cmp = unquote_string(lhs, self.mode).cmp(&unquote_string(rhs, self.mode));
- Some(bool_to_string(relop.evaluate(cmp)))
- }
-
- fn evaluate_not(&mut self, input: &mut MacroTokens) -> Option<String> {
- let mut negations = 0;
- while input.match_("!AND") || input.match_("&") {
- negations += 1;
- }
-
- let operand = self.evaluate_relational(input)?;
- if negations == 0 {
- return Some(operand);
- }
-
- let mut b = operand != "0";
- if negations.is_odd() {
- b = !b;
- }
- Some(bool_to_string(b))
- }
-
- fn evaluate_and(&mut self, input: &mut MacroTokens) -> Option<String> {
- let mut lhs = self.evaluate_not(input)?;
- while input.match_("!AND") || input.match_("&") {
- let rhs = self.evaluate_not(input)?;
- lhs = bool_to_string(lhs != "0" && rhs != "0");
- }
- Some(lhs)
- }
- fn evaluate_or(&mut self, input: &mut MacroTokens) -> Option<String> {
- let mut lhs = self.evaluate_and(input)?;
- while input.match_("!OR") || input.match_("|") {
- let rhs = self.evaluate_and(input)?;
- lhs = bool_to_string(lhs != "0" || rhs != "0");
- }
- Some(lhs)
- }
-
- fn evaluate_expression(&mut self, input: &mut MacroTokens) -> Option<String> {
- self.evaluate_or(input)
- }
-
- fn evaluate_number(&mut self, input: &mut MacroTokens) -> Option<f64> {
- let s = self.evaluate_expression(input)?;
- let tokens = tokenize_string(&s, self.mode, self.error);
- let (
- Some(MacroToken {
- token: Token::Number(number),
- ..
- }),
- 1,
- ) = (tokens.get(0), tokens.len())
- else {
- (self.error)(MacroError::BadNumericMacroExpression(s));
- return None;
- };
-
- Some(*number)
- }
-
- fn find_ifend_clause<'b>(
- input: &mut MacroTokens<'b>,
- ) -> Option<(MacroTokens<'b>, IfEndClause)> {
- let input_copy = input.clone();
- let mut nesting = 0;
- while !input.is_empty() {
- if input.match_("!IF") {
- nesting += 1;
- } else if input.match_("!IFEND") {
- if nesting == 0 {
- return Some((
- MacroTokens(&input_copy.0[..input_copy.0.len() - input.0.len() - 1]),
- IfEndClause::IfEnd,
- ));
- }
- nesting -= 1;
- } else if input.match_("!ELSE") && nesting == 0 {
- return Some((
- MacroTokens(&input_copy.0[..input_copy.0.len() - input.0.len() - 1]),
- IfEndClause::Else,
- ));
- } else {
- input.advance();
- }
- }
- return None;
- }
- fn expand_if(&mut self, orig_input: &mut MacroTokens, output: &mut Vec<MacroToken>) -> bool {
- let mut input = orig_input.clone();
- if !input.match_("!IF") {
- return false;
- }
- let Some(result) = self.evaluate_expression(&mut input) else {
- return false;
- };
- if !input.match_("!THEN") {
- (self.error)(MacroError::ExpectingThen);
- return false;
- }
-
- let Some((if_tokens, clause)) = Self::find_ifend_clause(&mut input) else {
- (self.error)(MacroError::ExpectingElseOrIfEnd);
- return false;
- };
-
- let else_tokens = match clause {
- IfEndClause::Else => {
- let Some((else_tokens, IfEndClause::IfEnd)) = Self::find_ifend_clause(&mut input)
- else {
- (self.error)(MacroError::ExpectingIfEnd);
- return false;
- };
- Some(else_tokens)
- }
- IfEndClause::IfEnd => None,
- };
-
- let subinput = match result.as_str() {
- "0" => else_tokens,
- _ => Some(if_tokens),
- };
- if let Some(mut subinput) = subinput {
- self.stack.push(Frame {
- name: Some(Identifier::new("!IF").unwrap()),
- location: None,
- });
- self.expand(&mut subinput, output);
- self.stack.pop();
- }
- *orig_input = input;
- true
- }
-
- fn take_macro_var_name(
- &mut self,
- input: &mut MacroTokens,
- construct: &'static str,
- ) -> Option<Identifier> {
- let Some(var_name) = input.take_macro_id() else {
- (self.error)(MacroError::ExpectingMacroVarName(construct));
- return None;
- };
- if is_macro_keyword(var_name)
- || self
- .macro_
- .map(|m| m.find_parameter(var_name))
- .flatten()
- .is_some()
- {
- (self.error)(MacroError::BadMacroVarName {
- name: var_name.clone(),
- construct,
- });
- None
- } else {
- Some(var_name.clone())
- }
- }
-
- fn expand_let(&mut self, orig_input: &mut MacroTokens) -> bool {
- let mut input = orig_input.clone();
- if !input.match_("!LET") {
- return false;
- }
-
- let Some(var_name) = self.take_macro_var_name(&mut input, "!LET") else {
- return false;
- };
- input.advance();
-
- if !input.match_("=") {
- (self.error)(MacroError::ExpectingEquals);
- return false;
- }
-
- let Some(value) = self.evaluate_expression(&mut input) else {
- return false;
- };
- self.vars.borrow_mut().insert(var_name.clone(), value);
- *orig_input = input;
- true
- }
-
- fn find_doend<'b>(&mut self, input: &mut MacroTokens<'b>) -> Option<MacroTokens<'b>> {
- let input_copy = input.clone();
- let mut nesting = 0;
- while !input.is_empty() {
- if input.match_("!DO") {
- nesting += 1;
- } else if input.match_("!DOEND") {
- if nesting == 0 {
- return Some(MacroTokens(
- &input_copy.0[..input_copy.0.len() - input.0.len() - 1],
- ));
- }
- nesting -= 1;
- } else {
- input.advance();
- }
- }
- (self.error)(MacroError::MissingDoEnd);
- return None;
- }
-
- fn expand_do(&mut self, orig_input: &mut MacroTokens, output: &mut Vec<MacroToken>) -> bool {
- let mut input = orig_input.clone();
- if !input.match_("!DO") {
- return false;
- }
-
- let Some(var_name) = self.take_macro_var_name(&mut input, "!DO") else {
- return false;
- };
-
- let (items, miterate_error) = if input.match_("!IN") {
- let Some(list) = self.evaluate_expression(&mut input) else {
- return false;
- };
- let items = tokenize_string(list.as_str(), self.mode, &self.error);
- (
- DoInput::from_list(items),
- MacroError::MiterateList(Settings::global().macros.max_iterations),
- )
- } else if input.match_("=") {
- let Some(first) = self.evaluate_number(&mut input) else {
- return false;
- };
- if !input.match_("!TO") {
- (self.error)(MacroError::ExpectingTo);
- return false;
- }
- let Some(last) = self.evaluate_number(&mut input) else {
- return false;
- };
- let by = if input.match_("!BY") {
- let Some(by) = self.evaluate_number(&mut input) else {
- return false;
- };
- if by == 0.0 {
- (self.error)(MacroError::ZeroBy);
- return false;
- }
- by
- } else {
- 1.0
- };
- (
- DoInput::from_by(first, last, by),
- MacroError::MiterateNumeric(Settings::global().macros.max_iterations),
- )
- } else {
- (self.error)(MacroError::ExpectingEqualsOrIn);
- return false;
- };
-
- let Some(body) = self.find_doend(&mut input) else {
- return false;
- };
-
- let mut stack = take(&mut self.stack);
- stack.push(Frame {
- name: Some(Identifier::new("!DO").unwrap()),
- location: None,
- });
- let mut break_ = false;
- let mut subexpander = Expander {
- break_: Some(&mut break_),
- stack,
- vars: self.vars,
- ..*self
- };
-
- for (i, item) in items.enumerate() {
- if subexpander.should_break() {
- break;
- }
- if i >= Settings::global().macros.max_iterations {
- (self.error)(miterate_error);
- break;
- }
- let mut vars = self.vars.borrow_mut();
- if let Some(value) = vars.get_mut(&var_name) {
- *value = item;
- } else {
- vars.insert(var_name.clone(), item);
- }
- subexpander.expand(&mut body.clone(), output);
- }
- *orig_input = input;
- true
- }
-
- fn expand__(&mut self, input: &mut MacroTokens, output: &mut Vec<MacroToken>) {
- // Recursive macro calls.
- if self.may_expand() {
- if let Some(call) = Call::for_tokens(self.macros, &input.0, &self.error) {
- let vars = RefCell::new(BTreeMap::new());
- let mut stack = take(&mut self.stack);
- stack.push(Frame {
- name: Some(call.0.macro_.name.clone()),
- location: Some(call.0.macro_.location.clone()),
- });
- let mut subexpander = Expander {
- break_: None,
- vars: &vars,
- nesting_countdown: self.nesting_countdown.saturating_sub(1),
- stack,
- ..*self
- };
- let mut body = MacroTokens(call.0.macro_.body.as_slice());
- subexpander.expand(&mut body, output);
- self.stack = subexpander.stack;
- self.stack.pop();
- input.0 = &input.0[call.len()..];
- return;
- }
- }
-
- // Only identifiers beginning with `!` receive further processing.
- let id = match &input.0[0].token {
- Token::Id(id) if id.0.starts_with('!') => id,
- Token::Punct(Punct::BangAsterisk) => {
- if let Some(macro_) = self.macro_ {
- for i in 0..macro_.parameters.len() {
- self.expand_arg(i, output);
- }
- } else {
- (self.error)(MacroError::InvalidBangAsterisk);
- }
- input.advance();
- return;
- }
- _ => {
- output.push(input.advance().clone());
- return;
- }
- };
-
- // Macro arguments.
- if let Some(macro_) = self.macro_ {
- if let Some(param_idx) = macro_.find_parameter(id) {
- self.expand_arg(param_idx, output);
- input.advance();
- return;
- }
- }
-
- // Variables set by `!DO` or `!LET`.
- if let Some(value) = self.vars.borrow().get(id) {
- tokenize_string_into(value.as_str(), self.mode, &self.error, output);
- input.advance();
- return;
- }
-
- // Macro functions.
- if self.expand_if(input, output) {
- return;
- }
- if self.expand_let(input) {
- return;
- }
- if self.expand_do(input, output) {
- return;
- }
-
- if input.match_("!BREAK") {
- if let Some(ref mut break_) = self.break_ {
- **break_ = true;
- } else {
- (self.error)(MacroError::BreakOutsideDo);
- }
- return;
- }
-
- if input.match_("!ONEXPAND") {
- *self.expand.borrow_mut() = true;
- } else if input.match_("!OFFEXPAND") {
- *self.expand.borrow_mut() = false;
- } else {
- output.push(input.advance().clone());
- }
- }
-}
-
-pub struct Call<'a>(Parser<'a>);
-
-impl<'a> Call<'a> {
- pub fn for_tokens<F>(macros: &'a MacroSet, tokens: &[MacroToken], error: &F) -> Option<Self>
- where
- F: Fn(MacroError),
- {
- let mut parser = Parser::new(macros, &tokens.get(0)?.token)?;
- for token in tokens[1..].iter().chain(&[MacroToken {
- token: Token::EndCommand,
- syntax: String::from(""),
- }]) {
- if parser.push(&token.token, &token.syntax, error) == ParseStatus::Complete {
- return Some(parser.finish());
- }
- }
- return None;
- }
-
- pub fn expand<F>(&self, mode: Mode, call_loc: Location, output: &mut Vec<MacroToken>, error: F)
- where
- F: Fn(MacroError) + 'a,
- {
- let error: Box<dyn Fn(MacroError) + 'a> = Box::new(error);
- let vars = RefCell::new(BTreeMap::new());
- let expand = RefCell::new(true);
- let mut me = Expander {
- macros: self.0.macros,
- error: &error,
- macro_: Some(self.0.macro_),
- args: Some(&self.0.args),
- mode,
- nesting_countdown: Settings::global().macros.max_nest,
- stack: vec![
- Frame {
- name: None,
- location: Some(call_loc),
- },
- Frame {
- name: Some(self.0.macro_.name.clone()),
- location: Some(self.0.macro_.location.clone()),
- },
- ],
- vars: &vars,
- break_: None,
- expand: &expand,
- };
- let mut body = MacroTokens(&self.0.macro_.body);
- me.expand(&mut body, output);
- }
-
- /// Returns the number of tokens consumed from the input for the macro
- /// invocation. If the result is 0, then there was no macro invocation and
- /// the expansion will be empty.
- pub fn len(&self) -> usize {
- self.0.n_tokens
- }
-}
+++ /dev/null
-/* PSPP - a program for statistical analysis.
- * Copyright (C) 2023 Free Software Foundation, Inc.
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program. If not, see <http://www.gnu.org/licenses/>. */
-
-use anyhow::Result;
-use clap::{Parser, ValueEnum};
-use encoding_rs::Encoding;
-use pspp::raw::{encoding_from_headers, Decoder, Magic, Reader, Record};
-use std::fs::File;
-use std::io::BufReader;
-use std::path::{Path, PathBuf};
-use std::str;
-use thiserror::Error as ThisError;
-
-/// A utility to dissect SPSS system files.
-#[derive(Parser, Debug)]
-#[command(author, version, about, long_about = None)]
-struct Args {
- /// Maximum number of cases to print.
- #[arg(long = "data", default_value_t = 0)]
- max_cases: u64,
-
- /// Files to dissect.
- #[arg(required = true)]
- files: Vec<PathBuf>,
-
- /// How to dissect the file.
- #[arg(short, long, value_enum, default_value_t)]
- mode: Mode,
-
- /// The encoding to use.
- #[arg(long, value_parser = parse_encoding)]
- encoding: Option<&'static Encoding>,
-}
-
-#[derive(ThisError, Debug)]
-#[error("{0}: unknown encoding")]
-struct UnknownEncodingError(String);
-
-fn parse_encoding(arg: &str) -> Result<&'static Encoding, UnknownEncodingError> {
- match Encoding::for_label_no_replacement(arg.as_bytes()) {
- Some(encoding) => Ok(encoding),
- None => Err(UnknownEncodingError(arg.to_string())),
- }
-}
-
-#[derive(Clone, Copy, Debug, Default, ValueEnum)]
-enum Mode {
- Identify,
- Raw,
- Decoded,
- #[default]
- Cooked,
-}
-
-fn main() -> Result<()> {
- let Args {
- max_cases,
- files,
- mode,
- encoding,
- } = Args::parse();
-
- for file in files {
- dissect(&file, max_cases, mode, encoding)?;
- }
- Ok(())
-}
-
-fn dissect(
- file_name: &Path,
- max_cases: u64,
- mode: Mode,
- encoding: Option<&'static Encoding>,
-) -> Result<()> {
- let reader = File::open(file_name)?;
- let reader = BufReader::new(reader);
- let mut reader = Reader::new(reader, |warning| println!("{warning}"))?;
-
- match mode {
- Mode::Identify => {
- let Record::Header(header) = reader.next().unwrap()? else {
- unreachable!()
- };
- match header.magic {
- Magic::Sav => println!("SPSS System File"),
- Magic::Zsav => println!("SPSS System File with Zlib compression"),
- Magic::Ebcdic => println!("EBCDIC-encoded SPSS System File"),
- }
- return Ok(());
- }
- Mode::Raw => {
- for header in reader {
- let header = header?;
- println!("{:?}", header);
- if let Record::Cases(cases) = header {
- let mut cases = cases.borrow_mut();
- for _ in 0..max_cases {
- let Some(Ok(record)) = cases.next() else {
- break;
- };
- println!("{:?}", record);
- }
- }
- }
- }
- Mode::Decoded => {
- let headers: Vec<Record> = reader.collect::<Result<Vec<_>, _>>()?;
- let encoding = match encoding {
- Some(encoding) => encoding,
- None => encoding_from_headers(&headers, &|e| eprintln!("{e}"))?,
- };
- let decoder = Decoder::new(encoding, |e| eprintln!("{e}"));
- for header in headers {
- let header = header.decode(&decoder);
- println!("{:?}", header);
- /*
- if let Record::Cases(cases) = header {
- let mut cases = cases.borrow_mut();
- for _ in 0..max_cases {
- let Some(Ok(record)) = cases.next() else {
- break;
- };
- println!("{:?}", record);
- }
- }
- */
- }
- }
- Mode::Cooked => {
- /*
- let headers: Vec<Record> = reader.collect::<Result<Vec<_>, _>>()?;
- let encoding = encoding_from_headers(&headers, &|e| eprintln!("{e}"))?;
- let (headers, _) = decode(headers, encoding, &|e| eprintln!("{e}"))?;
- for header in headers {
- println!("{header:?}");
- }
- */
- }
- }
-
- Ok(())
-}
+++ /dev/null
-use std::{
- cmp::{max, min},
- fmt::{Display, Formatter, Result as FmtResult},
- ops::Range,
- sync::Arc,
-};
-
-use enum_map::Enum;
-use unicode_width::UnicodeWidthStr;
-
-/// A line number and optional column number within a source file.
-#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord)]
-pub struct Point {
- /// 1-based line number.
- pub line: i32,
-
- /// 1-based column number.
- ///
- /// Column numbers are measured according to the width of characters as
- /// shown in a typical fixed-width font, in which CJK characters have width
- /// 2 and combining characters have width 0, as measured by the
- /// `unicode_width` crate.
- pub column: Option<i32>,
-}
-
-impl Point {
- /// Takes `point`, adds to it the syntax in `syntax`, incrementing the line
- /// number for each new-line in `syntax` and the column number for each
- /// column, and returns the result.
- pub fn advance(&self, syntax: &str) -> Self {
- let mut result = *self;
- for line in syntax.split_inclusive('\n') {
- if line.ends_with('\n') {
- result.line += 1;
- result.column = Some(1);
- } else {
- result.column = result.column.map(|column| column + line.width() as i32);
- }
- }
- result
- }
-
- pub fn without_column(&self) -> Self {
- Self {
- line: self.line,
- column: None,
- }
- }
-}
-
-/// Location relevant to an diagnostic message.
-#[derive(Clone, Debug)]
-pub struct Location {
- /// File name, if any.
- pub file_name: Option<Arc<String>>,
-
- /// Starting and ending point, if any.
- pub span: Option<Range<Point>>,
-
- /// Normally, if `span` contains column information, then displaying the
- /// message will underline the location. Setting this to true disables
- /// displaying underlines.
- pub omit_underlines: bool,
-}
-
-impl Display for Location {
- fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult {
- if let Some(file_name) = &self.file_name {
- write!(f, "{}", file_name)?;
- }
-
- if let Some(span) = &self.span {
- if self.file_name.is_some() {
- write!(f, ":")?;
- }
- let l1 = span.start.line;
- let l2 = span.end.line;
- if let (Some(c1), Some(c2)) = (span.start.column, span.end.column) {
- if l2 > l1 {
- write!(f, "{l1}.{c1}-{l2}.{}", c2 - 1)?;
- } else {
- write!(f, "{l1}.{c1}-{}", c2 - 1)?;
- }
- } else {
- if l2 > l1 {
- write!(f, "{l1}-{l2}")?;
- } else {
- write!(f, "{l1}")?;
- }
- }
- }
- Ok(())
- }
-}
-
-impl Location {
- pub fn without_columns(&self) -> Self {
- Self {
- file_name: self.file_name.clone(),
- span: self
- .span
- .as_ref()
- .map(|span| span.start.without_column()..span.end.without_column()),
- omit_underlines: self.omit_underlines,
- }
- }
- pub fn merge(a: Option<Self>, b: &Option<Self>) -> Option<Self> {
- let Some(a) = a else { return b.clone() };
- let Some(b) = b else { return Some(a) };
- if a.file_name != b.file_name {
- // Failure.
- return Some(a);
- }
- let span = match (&a.span, &b.span) {
- (None, None) => None,
- (Some(r), None) | (None, Some(r)) => Some(r.clone()),
- (Some(ar), Some(br)) => {
- Some(min(ar.start, br.start).clone()..max(ar.end, br.end).clone())
- }
- };
- Some(Self {
- file_name: a.file_name,
- span,
- omit_underlines: a.omit_underlines || b.omit_underlines,
- })
- }
- pub fn is_empty(&self) -> bool {
- self.file_name.is_none() && self.span.is_none()
- }
-}
-
-#[derive(Copy, Clone, Debug, PartialEq, Eq, Enum)]
-pub enum Severity {
- Error,
- Warning,
- Note,
-}
-
-impl Severity {
- fn as_str(&self) -> &'static str {
- match self {
- Severity::Error => "error",
- Severity::Warning => "warning",
- Severity::Note => "note",
- }
- }
-}
-
-impl Display for Severity {
- fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult {
- write!(f, "{}", self.as_str())
- }
-}
-
-#[derive(Copy, Clone, Debug, PartialEq, Eq)]
-pub enum Category {
- General,
- Syntax,
- Data,
-}
-
-pub struct Stack {
- location: Location,
- description: String,
-}
-
-pub struct Diagnostic {
- pub severity: Severity,
- pub category: Category,
- pub location: Location,
- pub source: Vec<(i32, String)>,
- pub stack: Vec<Stack>,
- pub command_name: Option<&'static str>,
- pub text: String,
-}
-
-impl Display for Diagnostic {
- fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult {
- for Stack {
- location,
- description,
- } in &self.stack
- {
- if !!location.is_empty() {
- write!(f, "{location}: ")?;
- }
- writeln!(f, "{description}")?;
- }
- if self.category != Category::General && !self.location.is_empty() {
- write!(f, "{}: ", self.location)?;
- }
-
- write!(f, "{}: ", self.severity)?;
-
- match self.command_name {
- Some(command_name) if self.category == Category::Syntax => {
- write!(f, "{command_name}: ")?
- }
- _ => (),
- }
-
- write!(f, "{}", self.text)?;
-
- if let Some(Range {
- start: Point {
- line: l0,
- column: Some(c0),
- },
- end: Point {
- line: l1,
- column: Some(c1),
- },
- }) = self.location.span
- {
- let mut prev_line_number = None;
- for (line_number, line) in &self.source {
- if let Some(prev_line_number) = prev_line_number {
- if *line_number != prev_line_number + 1 {
- write!(f, "\n ... |")?;
- }
- }
- prev_line_number = Some(line_number);
-
- write!(f, "\n{line_number:5} | {line}")?;
-
- if !self.location.omit_underlines {
- let c0 = if *line_number == l0 { c0 } else { 1 };
- let c1 = if *line_number == l1 {
- c1
- } else {
- line.width() as i32
- };
- write!(f, "\n |")?;
- for _ in 0..c0 {
- f.write_str(" ")?;
- }
- if *line_number == l0 {
- f.write_str("^")?;
- for _ in c0..c1 {
- f.write_str("~")?;
- }
- } else {
- for _ in c0..=c1 {
- f.write_str("~")?;
- }
- }
- }
- }
- }
- Ok(())
- }
-}
+++ /dev/null
-use std::sync::Arc;
-
-use self::pivot::Value;
-
-pub mod pivot;
-
-/// A single output item.
-pub struct Item {
- /// The localized label for the item that appears in the outline pane in the
- /// output viewer and in PDF outlines. This is `None` if no label has been
- /// explicitly set.
- label: Option<String>,
-
- /// A locale-invariant identifier for the command that produced the output,
- /// which may be `None` if unknown or if a command did not produce this
- /// output.
- command_name: Option<String>,
-
- /// For a group item, this is true if the group's subtree should
- /// be expanded in an outline view, false otherwise.
- ///
- /// For other kinds of output items, this is true to show the item's
- /// content, false to hide it. The item's label is always shown in an
- /// outline view.
- show: bool,
-
- /// Item details.
- details: Details,
-}
-
-pub enum Details {
- Chart,
- Image,
- Group(Vec<Arc<Item>>),
- Message,
- Table,
- Text(Text),
-}
-
-pub struct Text {
- type_: TextType,
-
- content: Value,
-}
-
-pub enum TextType {
- /// `TITLE` and `SUBTITLE` commands.
- PageTitle,
-
- /// Title,
- Title,
-
- /// Syntax printback logging.
- Syntax,
-
- /// Other logging.
- Log,
-}
+++ /dev/null
-//! Pivot tables.
-//!
-//! Pivot tables are PSPP's primary form of output. They are analogous to the
-//! pivot tables you might be familiar with from spreadsheets and databases.
-//! See <https://en.wikipedia.org/wiki/Pivot_table> for a brief introduction to
-//! the overall concept of a pivot table.
-//!
-//! In PSPP, the most important internal pieces of a pivot table are:
-//!
-//! - Title. Every pivot table has a title that is displayed above it. It also
-//! has an optional caption (displayed below it) and corner text (displayed in
-//! the upper left corner).
-//!
-//! - Dimensions. A dimension consists of zero or more categories. A category
-//! has a label, such as "df" or "Asymp. Sig." or 123 or a variable name. The
-//! categories are the leaves of a tree whose non-leaf nodes form groups of
-//! categories. The tree always has a root group whose label is the name of
-//! the dimension.
-//!
-//! - Axes. A table has three axes: column, row, and layer. Each dimension is
-//! assigned to an axis, and each axis has zero or more dimensions. When an
-//! axis has more than one dimension, they are ordered from innermost to
-//! outermost.
-//!
-//! - Data. A table's data consists of zero or more cells. Each cell maps from
-//! a category for each dimension to a value, which is commonly a number but
-//! could also be a variable name or an arbitrary text string.
-//!
-//! Creating a pivot table usually consists of the following steps:
-//!
-//! 1. Create the table with pivot_table_create(), passing in the title.
-//!
-//! 2. Optionally, set the format to use for "count" values with
-//! pivot_table_set_weight_var() or pivot_table_set_weight_format().
-//!
-//! 3. Create each dimension with pivot_dimension_create() and populate it with
-//! categories and, possibly, with groups that contain the categories. This
-//! call also assigns the dimension to an axis.
-//!
-//! In simple cases, only a call to pivot_dimension_create() is needed.
-//! Other functions such as pivot_category_create_group() can be used for
-//! hierarchies of categories.
-//!
-//! Sometimes it's easier to create categories in tandem with inserting data,
-//! for example by adding a category for a variable just before inserting the
-//! first cell for that variable. In that case, creating categories and
-//! inserting data can be interleaved.
-//!
-//! 4. Insert data. For each cell, supply the category indexes, which are
-//! assigned starting from 0 in the order in which the categories were
-//! created in step 2, and the value to go in the cell. If the table has a
-//! small, fixed number of dimensions, functions like, e.g.
-//! pivot_table_put3() for 3 dimensions, can be used. The general function
-//! pivot_table_put() works for other cases.
-//!
-//! 5. Output the table for user consumption. Use pivot_table_submit().
-
-use std::{
- collections::HashMap,
- ops::Range,
- sync::{Arc, OnceLock},
-};
-
-use chrono::NaiveDateTime;
-use enum_map::{enum_map, Enum, EnumMap};
-
-use crate::format::{Format, Settings as FormatSettings};
-
-/// Areas of a pivot table for styling purposes.
-#[derive(Copy, Clone, Debug, Enum, PartialEq, Eq)]
-pub enum Area {
- Title,
- Caption,
-
- /// Footnotes,
- Footer,
-
- // Top-left corner.
- Corner,
-
- ColumnLabels,
- RowLabels,
- Data,
-
- /// Layer indication.
- Layers,
-}
-
-/// Table borders for styling purposes.
-#[derive(Debug, Enum)]
-pub enum Border {
- Title,
- OuterFrame(BoxBorder),
- InnerFrame(BoxBorder),
- Dimensions(RowColBorder),
- Categories(RowColBorder),
- DataLeft,
- DataTop,
-}
-
-/// The borders on a box.
-#[derive(Debug, Enum)]
-pub enum BoxBorder {
- Left,
- Top,
- Right,
- Bottom,
-}
-
-/// Borders between rows and columns.
-#[derive(Debug, Enum, PartialEq, Eq)]
-pub enum RowColBorder {
- RowHorz,
- RowVert,
- ColHorz,
- ColVert,
-}
-
-/// Sizing for rows or columns of a rendered table.
-///
-/// The comments below talk about columns and their widths but they apply
-/// equally to rows and their heights.
-#[derive(Default)]
-pub struct Sizing {
- /// Specific column widths, in 1/96" units.
- widths: Vec<i32>,
-
- /// Specific page breaks: 0-based columns after which a page break must
- /// occur, e.g. a value of 1 requests a break after the second column.
- breaks: Vec<usize>,
-
- /// Keeps: columns to keep together on a page if possible.
- keeps: Vec<Range<usize>>,
-}
-
-#[derive(Enum)]
-pub enum Axis3 {
- X,
- Y,
- Z,
-}
-
-/// An axis within a pivot table.
-#[derive(Default)]
-pub struct TableAxis {
- /// `dimensions[0]` is the innermost dimension.
- dimensions: Vec<Dimension>,
-
- /// The number of rows or columns along the axis, that is, the product of
- /// `dimensions[*].n_leaves`. It is 0 if any dimension has 0 leaves.
- extent: usize,
-
- /// Sum of `dimensions[*].label_depth`.
- label_depth: usize,
-}
-
-/// Dimensions.
-///
-/// A [Dimension] identifies the categories associated with a single dimension
-/// within a multidimensional pivot table.
-///
-/// A dimension contains a collection of categories, which are the leaves in a
-/// tree of groups.
-///
-/// (A dimension or a group can contain zero categories, but this is unusual.
-/// If a dimension contains no categories, then its table cannot contain any
-/// data.)
-pub struct Dimension {
- axis_type: Axis3,
- level: usize,
-
- top_index: usize,
-
- /// Hierarchy of categories within the dimension. The groups and categories
- /// are sorted in the order that should be used for display. This might be
- /// different from the original order produced for output if the user
- /// adjusted it.
- ///
- /// The root must always be a group, although it is allowed to have no
- /// subcategories.
- root: Group,
-
- /// All of the leaves reachable via the root.
- ///
- /// The indexing for presentation_leaves is presentation order, thus
- /// `presentation_leaves[i]->presentation_index == i`. This order is the
- /// same as would be produced by an in-order traversal of the groups. It
- /// is the order into which the user reordered or sorted the categories.
- ///
- /// The indexing for `data_leaves` is that used for `idx` in [Cell], thus
- /// `data_leaves[i]->data_index == i`. This might differ from what an
- /// in-order traversal of `root` would yield, if the user reordered
- /// categories.
- data_leaves: Vec<Arc<Leaf>>,
- presentation_leaves: Vec<Arc<Leaf>>,
-
- /// Display.
- hide_all_labels: bool,
-
- /// Number of rows or columns needed to express the labels.
- label_depth: usize,
-}
-
-pub struct Group {
- name: Value,
- label_depth: usize,
- extra_depth: usize,
-
- /// The child categories.
- ///
- /// A group usually has multiple children, but it is allowed to have
- /// only one or even (pathologically) none.
- children: Vec<Category>,
-
- /// Display a label for the group itself?
- show_label: bool,
-
- show_label_in_corner: bool,
-}
-
-pub struct Leaf {
- name: Value,
- label_depth: usize,
- extra_depth: usize,
-
- group_index: usize,
- data_index: usize,
- presentation_index: usize,
-
- /// Default format for values in this category.
- format: Format,
-
- /// Honor [Table]'s `small` setting?
- honor_small: bool,
-}
-
-/// A pivot_category is a leaf (a category) or a group.
-pub enum Category {
- Group(Arc<Group>),
- Leaf(Arc<Leaf>),
-}
-
-trait CategoryTrait {
- fn name(&self) -> &Value;
- fn label_depth(&self) -> usize;
- fn extra_depth(&self) -> usize;
-}
-
-impl CategoryTrait for Group {
- fn name(&self) -> &Value {
- &self.name
- }
-
- fn label_depth(&self) -> usize {
- self.label_depth
- }
-
- fn extra_depth(&self) -> usize {
- self.extra_depth
- }
-}
-
-impl CategoryTrait for Leaf {
- fn name(&self) -> &Value {
- &self.name
- }
-
- fn label_depth(&self) -> usize {
- self.label_depth
- }
-
- fn extra_depth(&self) -> usize {
- self.extra_depth
- }
-}
-
-impl CategoryTrait for Category {
- fn name(&self) -> &Value {
- match self {
- Category::Group(group) => group.name(),
- Category::Leaf(leaf) => leaf.name(),
- }
- }
-
- fn label_depth(&self) -> usize {
- match self {
- Category::Group(group) => group.label_depth(),
- Category::Leaf(leaf) => leaf.label_depth(),
- }
- }
-
- fn extra_depth(&self) -> usize {
- match self {
- Category::Group(group) => group.extra_depth(),
- Category::Leaf(leaf) => leaf.extra_depth(),
- }
- }
-}
-
-/// Styling for a pivot table.
-///
-/// The division between this and the style information in [Table] seems fairly
-/// arbitrary. The ultimate reason for the division is simply because that's
-/// how SPSS documentation and file formats do it.
-struct Look {
- name: Option<String>,
-
- omit_empty: bool,
- row_labels_in_corner: bool,
-
- /// Range of column widths for columns in the row headings and corner , in 1/96"
- /// units.
- row_heading_widths: Range<usize>,
-
- /// Range of column widths for columns in the column headings , in 1/96"
- /// units.
- col_heading_widths: Range<usize>,
-
- /// Kind of markers to use for footnotes.
- footnote_marker_type: FootnoteMarkerType,
-
- /// Where to put the footnote markers.
- footnote_marker_position: FootnoteMarkerPosition,
-
- /// Styles for areas of the pivot table.
- areas: EnumMap<Area, AreaStyle>,
-
- /// Styles for borders in the pivot table.
- borders: EnumMap<Border, BorderStyle>,
-
- print_all_layers: bool,
-
- paginate_layers: bool,
-
- shrink_to_fit: EnumMap<Axis2, bool>,
-
- top_continuation: bool,
-
- bottom_continuation: bool,
-
- continuation: Option<String>,
-
- n_orphan_lines: usize,
-}
-
-impl Default for Look {
- fn default() -> Self {
- Self {
- name: None,
- omit_empty: true,
- row_labels_in_corner: true,
- row_heading_widths: 36..72,
- col_heading_widths: 36..120,
- footnote_marker_type: FootnoteMarkerType::Alphabetic,
- footnote_marker_position: FootnoteMarkerPosition::Subscript,
- areas: EnumMap::from_fn(|area| {
- use HorzAlign::*;
- use VertAlign::*;
- let (halign, valign, hmargins, vmargins) = match area {
- Area::Title => (Center, Middle, [8, 11], [1, 8]),
- Area::Caption => (Left, Top, [8, 11], [1, 1]),
- Area::Footer => (Left, Top, [11, 8], [2, 3]),
- Area::Corner => (Left, Bottom, [8, 11], [1, 1]),
- Area::ColumnLabels => (Left, Top, [8, 11], [1, 3]),
- Area::RowLabels => (Left, Top, [8, 11], [1, 3]),
- Area::Data => (Mixed, Top, [8, 11], [1, 1]),
- Area::Layers => (Left, Bottom, [8, 11], [1, 3]),
- };
- AreaStyle {
- cell_style: CellStyle {
- horz_align: halign,
- vert_align: valign,
- margins: enum_map! { Axis2::X => hmargins, Axis2::Y => vmargins },
- },
- font_style: FontStyle {
- bold: area == Area::Title,
- italic: false,
- underline: false,
- markup: false,
- font: String::from("Sans Serif"),
- fg: [Color::BLACK; 2],
- bg: [Color::WHITE; 2],
- size: 9,
- },
- }
- }),
- borders: EnumMap::from_fn(|border| {
- let stroke = match border {
- Border::InnerFrame(_) | Border::DataLeft | Border::DataTop => Stroke::Thick,
- Border::Dimensions(side) if side != RowColBorder::RowVert => Stroke::Solid,
- Border::Categories(RowColBorder::ColHorz | RowColBorder::ColVert) => {
- Stroke::Solid
- }
- _ => Stroke::None,
- };
- BorderStyle {
- stroke,
- color: Color::BLACK,
- }
- }),
- print_all_layers: false,
- paginate_layers: false,
- shrink_to_fit: EnumMap::from_fn(|_| false),
- top_continuation: false,
- bottom_continuation: false,
- continuation: None,
- n_orphan_lines: 0,
- }
- }
-}
-
-impl Look {
- fn shared_default() -> Arc<Look> {
- static LOOK: OnceLock<Arc<Look>> = OnceLock::new();
- LOOK.get_or_init(|| Arc::new(Look::default())).clone()
- }
-}
-
-pub struct AreaStyle {
- cell_style: CellStyle,
- font_style: FontStyle,
-}
-
-pub struct CellStyle {
- horz_align: HorzAlign,
- vert_align: VertAlign,
-
- /// Margins in 1/96" units.
- ///
- /// `margins[Axis2::X][0]` is the left margin.
- /// `margins[Axis2::X][1]` is the right margin.
- /// `margins[Axis2::Y][0]` is the top margin.
- /// `margins[Axis2::Y][1]` is the bottom margin.
- margins: EnumMap<Axis2, [i32; 2]>,
-}
-
-pub enum HorzAlign {
- /// Right aligned.
- Right,
-
- /// Left aligned.
- Left,
-
- /// Centered.
- Center,
-
- /// Align strings to the left, other formats to the right.
- Mixed,
-
- /// Align the decimal point at the specified position.
- Decimal {
- /// Decimal offset from the right side of the cell, in 1/96" units.
- offset: f64,
-
- /// Decimal character: either `b'.'` or `b','`.
- c: char,
- },
-}
-
-pub enum VertAlign {
- /// Top alignment.
- Top,
-
- /// Centered,
- Middle,
-
- /// Bottom alignment.
- Bottom,
-}
-
-pub struct FontStyle {
- bold: bool,
- italic: bool,
- underline: bool,
- markup: bool,
- font: String,
- fg: [Color; 2],
- bg: [Color; 2],
-
- /// In 1/72" units.
- size: i32,
-}
-
-pub struct Color {
- alpha: u8,
- r: u8,
- g: u8,
- b: u8,
-}
-
-impl Color {
- const BLACK: Color = Color::new(0, 0, 0);
- const WHITE: Color = Color::new(255, 255, 255);
-
- const fn new(r: u8, g: u8, b: u8) -> Self {
- Self {
- alpha: 255,
- r,
- g,
- b,
- }
- }
-}
-
-pub struct BorderStyle {
- stroke: Stroke,
- color: Color,
-}
-
-pub enum Stroke {
- None,
- Solid,
- Dashed,
- Thick,
- Thin,
- Double,
-}
-
-/// An axis of a flat table.
-#[derive(Debug, Enum)]
-pub enum Axis2 {
- X,
- Y,
-}
-
-pub enum FootnoteMarkerType {
- /// a, b, c, ...
- Alphabetic,
-
- /// 1, 2, 3, ...
- Numeric,
-}
-
-pub enum FootnoteMarkerPosition {
- /// Subscripts.
- Subscript,
-
- /// Superscripts.
- Superscript,
-}
-
-pub struct Table {
- look: Arc<Look>,
-
- rotate_inner_column_labels: bool,
-
- rotate_outer_row_labels: bool,
-
- show_grid_lines: bool,
-
- show_title: bool,
-
- show_caption: bool,
-
- show_value: Option<ValueShow>,
-
- show_variables: Option<ValueShow>,
-
- weight_format: Format,
-
- /// Current layer indexes, with axes[PIVOT_AXIS_LAYER].n_dimensions
- /// elements. current_layer[i] is an offset into
- /// axes[PIVOT_AXIS_LAYER].dimensions[i]->data_leaves[], EXCEPT that a
- /// dimension can have zero leaves, in which case current_layer[i] is zero
- /// and there's no corresponding leaf.
- current_layer: Vec<usize>,
-
- /// Column and row sizing and page breaks.
- sizing: EnumMap<Axis2, Sizing>,
-
- /// Format settings.
- settings: FormatSettings,
-
- /// Numeric grouping character (usually `.` or `,`).
- grouping: Option<char>,
-
- small: f64,
-
- command_local: Option<String>,
- command_c: Option<String>,
- language: Option<String>,
- locale: Option<String>,
- dataset: Option<String>,
- datafile: Option<String>,
- date: Option<NaiveDateTime>,
- footnotes: Vec<Footnote>,
- title: Option<Value>,
- subtype: Option<Value>,
- corner_text: Option<Value>,
- caption: Option<Value>,
- notes: Option<String>,
- dimensions: Vec<Dimension>,
- axes: EnumMap<Axis3, TableAxis>,
- cells: HashMap<u64, Value>,
-}
-
-impl Table {
- fn new() -> Self {
- Self {
- look: Look::shared_default(),
- rotate_inner_column_labels: false,
- rotate_outer_row_labels: false,
- show_grid_lines: false,
- show_title: true,
- show_caption: true,
- show_value: None,
- show_variables: None,
- weight_format: Format::F40,
- current_layer: Vec::new(),
- sizing: EnumMap::default(),
- settings: FormatSettings::default(), // XXX from settings
- grouping: None,
- small: 0.0001, // XXX from settings.
- command_local: None,
- command_c: None, // XXX from current command name.
- language: None,
- locale: None,
- dataset: None,
- datafile: None,
- date: None,
- footnotes: Vec::new(),
- subtype: None,
- title: None,
- corner_text: None,
- caption: None,
- notes: None,
- dimensions: Vec::new(),
- axes: EnumMap::default(),
- cells: HashMap::new(),
- }
- }
-}
-
-/// Whether to show variable or value labels or the underlying value or variable name.
-pub enum ValueShow {
- /// Value or variable name only.
- Value,
-
- /// Label only.
- Label,
-
- /// Value and label.
- Both,
-}
-
-pub struct Footnote {
- content: Value,
- marker: Value,
- show: bool,
-}
-
-/// The content of a single pivot table cell.
-///
-/// A [Value] is also a pivot table's title, caption, footnote marker and
-/// contents, and so on.
-///
-/// A given [Value] is one of:
-///
-/// 1. A number resulting from a calculation.
-///
-/// A number has an associated display format (usually [F] or [Pct]). This
-/// format can be set directly, but that is not usually the easiest way.
-/// Instead, it is usually true that all of the values in a single category
-/// should have the same format (e.g. all "Significance" values might use
-/// format `F40.3`), so PSPP makes it easy to set the default format for a
-/// category while creating the category. See pivot_dimension_create() for
-/// more details.
-///
-/// [F]: crate::format::Format::F
-/// [Pct]: crate::format::Format::Pct
-///
-/// 2. A numeric or string value obtained from data (PIVOT_VALUE_NUMERIC or
-/// PIVOT_VALUE_STRING). If such a value corresponds to a variable, then the
-/// variable's name can be attached to the pivot_value. If the value has a
-/// value label, then that can also be attached. When a label is present,
-/// the user can control whether to show the value or the label or both.
-///
-/// 3. A variable name (PIVOT_VALUE_VARIABLE). The variable label, if any, can
-/// be attached too, and again the user can control whether to show the value
-/// or the label or both.
-///
-/// 4. A text string (PIVOT_VALUE_TEXT). The value stores the string in English
-/// and translated into the output language (localized). Use
-/// pivot_value_new_text() or pivot_value_new_text_format() for those cases.
-/// In some cases, only an English or a localized version is available for
-/// one reason or another, although this is regrettable; in those cases, use
-/// pivot_value_new_user_text() or pivot_value_new_user_text_nocopy().
-///
-/// 5. A template. PSPP doesn't create these itself yet, but it can read and
-/// interpret those created by SPSS.
-pub struct Value {
- styling: Option<Box<ValueStyle>>,
- inner: ValueInner,
-}
-
-pub enum ValueInner {
- Number {
- show: ValueShow,
- format: Format,
- honor_small: bool,
- value: f64,
- var_name: Option<String>,
- value_label: Option<String>,
- },
- String {
- show: ValueShow,
- hex: bool,
- s: Option<String>,
- var_name: Option<String>,
- value_label: Option<String>,
- },
- Variable {
- show: ValueShow,
- var_name: Option<String>,
- value_label: Option<String>,
- },
- Text {
- user_provided: bool,
- /// Localized.
- local: String,
- /// English.
- c: String,
- /// Identifier.
- id: String,
- },
- Template {
- args: Vec<Vec<Value>>,
- local: String,
- id: String,
- },
-}
-
-pub struct ValueStyle {
- font_style: FontStyle,
- cell_style: CellStyle,
- subscripts: Vec<String>,
- footnote_indexes: Vec<usize>,
-}
+++ /dev/null
-#[derive(Copy, Clone, Debug, Hash, PartialEq, Eq)]
-pub enum PromptStyle {
- /// First line of command.
- First,
-
- /// Second or later line of command.
- Later,
-
- /// Between `BEGIN DATA` and `END DATA`.
- Data,
-
- /// `COMMENT` or `*` command.
- Comment,
-
- /// DOCUMENT command.
- Document,
-
- /// `DO REPEAT` command.
- DoRepeat,
-
- /// `DEFINE` command.
- Define,
-}
-
-impl PromptStyle {
- pub fn to_string(&self) -> &'static str {
- match self {
- PromptStyle::First => "first",
- PromptStyle::Later => "later",
- PromptStyle::Data => "data",
- PromptStyle::Comment => "COMMENT",
- PromptStyle::Document => "DOCUMENT",
- PromptStyle::DoRepeat => "DO REPEAT",
- PromptStyle::Define => "DEFINE",
- }
- }
-}
+++ /dev/null
-use crate::{
- dictionary::VarWidth,
- encoding::{default_encoding, get_encoding, Error as EncodingError},
- endian::{Endian, Parse, ToBytes},
- identifier::{Error as IdError, Identifier},
-};
-
-use encoding_rs::{mem::decode_latin1, DecoderResult, Encoding};
-use flate2::read::ZlibDecoder;
-use num::Integer;
-use std::{
- borrow::Cow,
- cell::RefCell,
- cmp::Ordering,
- collections::{HashMap, VecDeque},
- fmt::{Debug, Display, Formatter, Result as FmtResult},
- io::{Error as IoError, Read, Seek, SeekFrom},
- iter::repeat,
- mem::take,
- ops::Range,
- rc::Rc,
- str::from_utf8,
-};
-use thiserror::Error as ThisError;
-
-#[derive(ThisError, Debug)]
-pub enum Error {
- #[error("Not an SPSS system file")]
- NotASystemFile,
-
- #[error("Invalid magic number {0:?}")]
- BadMagic([u8; 4]),
-
- #[error("I/O error ({0})")]
- Io(#[from] IoError),
-
- #[error("Invalid SAV compression code {0}")]
- InvalidSavCompression(u32),
-
- #[error("Invalid ZSAV compression code {0}")]
- InvalidZsavCompression(u32),
-
- #[error("Document record at offset {offset:#x} has document line count ({n}) greater than the maximum number {max}.")]
- BadDocumentLength { offset: u64, n: usize, max: usize },
-
- #[error("At offset {offset:#x}, unrecognized record type {rec_type}.")]
- BadRecordType { offset: u64, rec_type: u32 },
-
- #[error("In variable record starting at offset {start_offset:#x}, variable width is not in the valid range -1 to 255.")]
- BadVariableWidth { start_offset: u64, width: i32 },
-
- #[error("In variable record starting at offset {start_offset:#x}, variable label code {code} at offset {code_offset:#x} is not 0 or 1.")]
- BadVariableLabelCode {
- start_offset: u64,
- code_offset: u64,
- code: u32,
- },
-
- #[error(
- "At offset {offset:#x}, numeric missing value code ({code}) is not -3, -2, 0, 1, 2, or 3."
- )]
- BadNumericMissingValueCode { offset: u64, code: i32 },
-
- #[error("At offset {offset:#x}, string missing value code ({code}) is not 0, 1, 2, or 3.")]
- BadStringMissingValueCode { offset: u64, code: i32 },
-
- #[error("At offset {offset:#x}, number of value labels ({n}) is greater than the maximum number {max}.")]
- BadNumberOfValueLabels { offset: u64, n: u32, max: u32 },
-
- #[error("At offset {offset:#x}, following value label record, found record type {rec_type} instead of expected type 4 for variable index record")]
- ExpectedVarIndexRecord { offset: u64, rec_type: u32 },
-
- #[error("At offset {offset:#x}, number of variables indexes for value labels ({n}) is greater than the maximum number ({max}).")]
- TooManyVarIndexes { offset: u64, n: u32, max: u32 },
-
- #[error("At offset {offset:#x}, record type 7 subtype {subtype} is too large with element size {size} and {count} elements.")]
- ExtensionRecordTooLarge {
- offset: u64,
- subtype: u32,
- size: u32,
- count: u32,
- },
-
- #[error("Unexpected end of file at offset {offset:#x}, {case_ofs} bytes into a {case_len}-byte case.")]
- EofInCase {
- offset: u64,
- case_ofs: u64,
- case_len: usize,
- },
-
- #[error(
- "Unexpected end of file at offset {offset:#x}, {case_ofs} bytes into a compressed case."
- )]
- EofInCompressedCase { offset: u64, case_ofs: u64 },
-
- #[error("Data ends at offset {offset:#x}, {case_ofs} bytes into a compressed case.")]
- PartialCompressedCase { offset: u64, case_ofs: u64 },
-
- #[error("At {case_ofs} bytes into compressed case starting at offset {offset:#x}, a string was found where a number was expected.")]
- CompressedNumberExpected { offset: u64, case_ofs: u64 },
-
- #[error("At {case_ofs} bytes into compressed case starting at offset {offset:#x}, a number was found where a string was expected.")]
- CompressedStringExpected { offset: u64, case_ofs: u64 },
-
- #[error("Block count {n_blocks} in ZLIB trailer at offset {offset:#x} differs from expected block count {expected_n_blocks} calculated from trailer length {ztrailer_len}.")]
- BadZlibTrailerNBlocks {
- offset: u64,
- n_blocks: u32,
- expected_n_blocks: u64,
- ztrailer_len: u64,
- },
-
- #[error("{0}")]
- EncodingError(EncodingError),
-}
-
-#[derive(ThisError, Debug)]
-pub enum Warning {
- #[error("Unexpected end of data inside extension record.")]
- UnexpectedEndOfData,
-
- #[error("At offset {offset:#x}, at least one valid variable index for value labels is required but none were specified.")]
- NoVarIndexes { offset: u64 },
-
- #[error("At offset {offset:#x}, the first variable index is for a {var_type} variable but the following variable indexes are for {} variables: {wrong_types:?}", var_type.opposite())]
- MixedVarTypes {
- offset: u64,
- var_type: VarType,
- wrong_types: Vec<u32>,
- },
-
- #[error("At offset {offset:#x}, one or more variable indexes for value labels were not in the valid range [1,{max}]: {invalid:?}")]
- InvalidVarIndexes {
- offset: u64,
- max: usize,
- invalid: Vec<u32>,
- },
-
- #[error("At offset {offset:#x}, {record} has bad size {size} bytes instead of the expected {expected_size}.")]
- BadRecordSize {
- offset: u64,
- record: String,
- size: u32,
- expected_size: u32,
- },
-
- #[error("At offset {offset:#x}, {record} has bad count {count} instead of the expected {expected_count}.")]
- BadRecordCount {
- offset: u64,
- record: String,
- count: u32,
- expected_count: u32,
- },
-
- #[error("In long string missing values record starting at offset {record_offset:#x}, value length at offset {offset:#x} is {value_len} instead of the expected 8.")]
- BadLongMissingValueLength {
- record_offset: u64,
- offset: u64,
- value_len: u32,
- },
-
- #[error("The encoding record at offset {offset:#x} contains an encoding name that is not valid UTF-8.")]
- BadEncodingName { offset: u64 },
-
- // XXX This is risky because `text` might be arbitarily long.
- #[error("Text string contains invalid bytes for {encoding} encoding: {text}")]
- MalformedString { encoding: String, text: String },
-
- #[error("Invalid variable measurement level value {0}")]
- InvalidMeasurement(u32),
-
- #[error("Invalid variable display alignment value {0}")]
- InvalidAlignment(u32),
-
- #[error("Invalid attribute name. {0}")]
- InvalidAttributeName(IdError),
-
- #[error("Invalid variable name in attribute record. {0}")]
- InvalidAttributeVariableName(IdError),
-
- #[error("Invalid short name in long variable name record. {0}")]
- InvalidShortName(IdError),
-
- #[error("Invalid name in long variable name record. {0}")]
- InvalidLongName(IdError),
-
- #[error("Invalid variable name in very long string record. {0}")]
- InvalidLongStringName(IdError),
-
- #[error("Invalid variable name in variable set record. {0}")]
- InvalidVariableSetName(IdError),
-
- #[error("Invalid multiple response set name. {0}")]
- InvalidMrSetName(IdError),
-
- #[error("Invalid multiple response set variable name. {0}")]
- InvalidMrSetVariableName(IdError),
-
- #[error("Invalid variable name in long string missing values record. {0}")]
- InvalidLongStringMissingValueVariableName(IdError),
-
- #[error("Invalid variable name in long string value label record. {0}")]
- InvalidLongStringValueLabelName(IdError),
-
- #[error("{0}")]
- EncodingError(EncodingError),
-
- #[error("Details TBD")]
- TBD,
-}
-
-impl From<IoError> for Warning {
- fn from(_source: IoError) -> Self {
- Self::UnexpectedEndOfData
- }
-}
-
-#[derive(Clone, Debug)]
-pub enum Record {
- Header(HeaderRecord<RawString>),
- Variable(VariableRecord<RawString, RawStr<8>>),
- ValueLabel(ValueLabelRecord<RawStr<8>, RawString>),
- Document(DocumentRecord<RawDocumentLine>),
- IntegerInfo(IntegerInfoRecord),
- FloatInfo(FloatInfoRecord),
- VarDisplay(VarDisplayRecord),
- MultipleResponse(MultipleResponseRecord<RawString, RawString>),
- LongStringValueLabels(LongStringValueLabelRecord<RawString, RawString>),
- LongStringMissingValues(LongStringMissingValueRecord<RawString, RawStr<8>>),
- Encoding(EncodingRecord),
- NumberOfCases(NumberOfCasesRecord),
- Text(TextRecord),
- OtherExtension(Extension),
- EndOfHeaders(u32),
- ZHeader(ZHeader),
- ZTrailer(ZTrailer),
- Cases(Rc<RefCell<Cases>>),
-}
-
-#[derive(Clone, Debug)]
-pub enum DecodedRecord {
- Header(HeaderRecord<String>),
- Variable(VariableRecord<String, String>),
- ValueLabel(ValueLabelRecord<RawStr<8>, String>),
- Document(DocumentRecord<String>),
- IntegerInfo(IntegerInfoRecord),
- FloatInfo(FloatInfoRecord),
- VarDisplay(VarDisplayRecord),
- MultipleResponse(MultipleResponseRecord<Identifier, String>),
- LongStringValueLabels(LongStringValueLabelRecord<Identifier, String>),
- LongStringMissingValues(LongStringMissingValueRecord<Identifier, String>),
- Encoding(EncodingRecord),
- NumberOfCases(NumberOfCasesRecord),
- VariableSets(VariableSetRecord),
- ProductInfo(ProductInfoRecord),
- LongNames(LongNamesRecord),
- VeryLongStrings(VeryLongStringsRecord),
- FileAttributes(FileAttributeRecord),
- VariableAttributes(VariableAttributeRecord),
- OtherExtension(Extension),
- EndOfHeaders(u32),
- ZHeader(ZHeader),
- ZTrailer(ZTrailer),
- Cases(Rc<RefCell<Cases>>),
-}
-
-impl Record {
- fn read<R>(
- reader: &mut R,
- endian: Endian,
- var_types: &[VarType],
- warn: &dyn Fn(Warning),
- ) -> Result<Option<Record>, Error>
- where
- R: Read + Seek,
- {
- let rec_type: u32 = endian.parse(read_bytes(reader)?);
- match rec_type {
- 2 => Ok(Some(VariableRecord::read(reader, endian)?)),
- 3 => Ok(ValueLabelRecord::read(reader, endian, var_types, warn)?),
- 6 => Ok(Some(DocumentRecord::read(reader, endian)?)),
- 7 => Extension::read(reader, endian, var_types.len(), warn),
- 999 => Ok(Some(Record::EndOfHeaders(
- endian.parse(read_bytes(reader)?),
- ))),
- _ => Err(Error::BadRecordType {
- offset: reader.stream_position()?,
- rec_type,
- }),
- }
- }
-
- pub fn decode(self, decoder: &Decoder) -> Result<DecodedRecord, Error> {
- Ok(match self {
- Record::Header(record) => record.decode(decoder),
- Record::Variable(record) => record.decode(decoder),
- Record::ValueLabel(record) => DecodedRecord::ValueLabel(record.decode(decoder)),
- Record::Document(record) => record.decode(decoder),
- Record::IntegerInfo(record) => DecodedRecord::IntegerInfo(record.clone()),
- Record::FloatInfo(record) => DecodedRecord::FloatInfo(record.clone()),
- Record::VarDisplay(record) => DecodedRecord::VarDisplay(record.clone()),
- Record::MultipleResponse(record) => record.decode(decoder),
- Record::LongStringValueLabels(record) => {
- DecodedRecord::LongStringValueLabels(record.decode(decoder))
- }
- Record::LongStringMissingValues(record) => {
- DecodedRecord::LongStringMissingValues(record.decode(decoder))
- }
- Record::Encoding(record) => DecodedRecord::Encoding(record.clone()),
- Record::NumberOfCases(record) => DecodedRecord::NumberOfCases(record.clone()),
- Record::Text(record) => record.decode(decoder),
- Record::OtherExtension(record) => DecodedRecord::OtherExtension(record.clone()),
- Record::EndOfHeaders(record) => DecodedRecord::EndOfHeaders(record),
- Record::ZHeader(record) => DecodedRecord::ZHeader(record.clone()),
- Record::ZTrailer(record) => DecodedRecord::ZTrailer(record.clone()),
- Record::Cases(record) => DecodedRecord::Cases(record.clone()),
- })
- }
-}
-
-pub fn encoding_from_headers(
- headers: &Vec<Record>,
- warn: &impl Fn(Warning),
-) -> Result<&'static Encoding, Error> {
- let mut encoding_record = None;
- let mut integer_info_record = None;
- for record in headers {
- match record {
- Record::Encoding(record) => encoding_record = Some(record),
- Record::IntegerInfo(record) => integer_info_record = Some(record),
- _ => (),
- }
- }
- let encoding = encoding_record.map(|record| record.0.as_str());
- let character_code = integer_info_record.map(|record| record.character_code);
- match get_encoding(encoding, character_code) {
- Ok(encoding) => Ok(encoding),
- Err(err @ EncodingError::Ebcdic) => Err(Error::EncodingError(err)),
- Err(err) => {
- warn(Warning::EncodingError(err));
- // Warn that we're using the default encoding.
- Ok(default_encoding())
- }
- }
-}
-
-// If `s` is valid UTF-8, returns it decoded as UTF-8, otherwise returns it
-// decoded as Latin-1 (actually bytes interpreted as Unicode code points).
-fn default_decode(s: &[u8]) -> Cow<str> {
- from_utf8(s).map_or_else(|_| decode_latin1(s), Cow::from)
-}
-
-#[derive(Copy, Clone, Debug, PartialEq, Eq)]
-pub enum Compression {
- Simple,
- ZLib,
-}
-
-trait Header {
- fn offsets(&self) -> Range<u64>;
-}
-
-#[derive(Clone)]
-pub struct HeaderRecord<S>
-where
- S: Debug,
-{
- /// Offset in file.
- pub offsets: Range<u64>,
-
- /// Magic number.
- pub magic: Magic,
-
- /// Eye-catcher string, product name, in the file's encoding. Padded
- /// on the right with spaces.
- pub eye_catcher: S,
-
- /// Layout code, normally either 2 or 3.
- pub layout_code: u32,
-
- /// Number of variable positions, or `None` if the value in the file is
- /// questionably trustworthy.
- pub nominal_case_size: Option<u32>,
-
- /// Compression type, if any,
- pub compression: Option<Compression>,
-
- /// 1-based variable index of the weight variable, or `None` if the file is
- /// unweighted.
- pub weight_index: Option<u32>,
-
- /// Claimed number of cases, if known.
- pub n_cases: Option<u32>,
-
- /// Compression bias, usually 100.0.
- pub bias: f64,
-
- /// `dd mmm yy` in the file's encoding.
- pub creation_date: S,
-
- /// `HH:MM:SS` in the file's encoding.
- pub creation_time: S,
-
- /// File label, in the file's encoding. Padded on the right with spaces.
- pub file_label: S,
-
- /// Endianness of the data in the file header.
- pub endian: Endian,
-}
-
-impl<S> HeaderRecord<S>
-where
- S: Debug,
-{
- fn debug_field<T>(&self, f: &mut Formatter, name: &str, value: T) -> FmtResult
- where
- T: Debug,
- {
- writeln!(f, "{name:>17}: {:?}", value)
- }
-}
-
-impl<S> Debug for HeaderRecord<S>
-where
- S: Debug,
-{
- fn fmt(&self, f: &mut Formatter) -> FmtResult {
- writeln!(f, "File header record:")?;
- self.debug_field(f, "Magic", self.magic)?;
- self.debug_field(f, "Product name", &self.eye_catcher)?;
- self.debug_field(f, "Layout code", self.layout_code)?;
- self.debug_field(f, "Nominal case size", self.nominal_case_size)?;
- self.debug_field(f, "Compression", self.compression)?;
- self.debug_field(f, "Weight index", self.weight_index)?;
- self.debug_field(f, "Number of cases", self.n_cases)?;
- self.debug_field(f, "Compression bias", self.bias)?;
- self.debug_field(f, "Creation date", &self.creation_date)?;
- self.debug_field(f, "Creation time", &self.creation_time)?;
- self.debug_field(f, "File label", &self.file_label)?;
- self.debug_field(f, "Endianness", self.endian)
- }
-}
-
-impl HeaderRecord<RawString> {
- fn read<R: Read + Seek>(r: &mut R) -> Result<Self, Error> {
- let start = r.stream_position()?;
-
- let magic: [u8; 4] = read_bytes(r)?;
- let magic: Magic = magic.try_into().map_err(|_| Error::NotASystemFile)?;
-
- let eye_catcher = RawString(read_vec(r, 60)?);
- let layout_code: [u8; 4] = read_bytes(r)?;
- let endian = Endian::identify_u32(2, layout_code)
- .or_else(|| Endian::identify_u32(2, layout_code))
- .ok_or_else(|| Error::NotASystemFile)?;
- let layout_code = endian.parse(layout_code);
-
- let nominal_case_size: u32 = endian.parse(read_bytes(r)?);
- let nominal_case_size =
- (nominal_case_size <= i32::MAX as u32 / 16).then_some(nominal_case_size);
-
- let compression_code: u32 = endian.parse(read_bytes(r)?);
- let compression = match (magic, compression_code) {
- (Magic::Zsav, 2) => Some(Compression::ZLib),
- (Magic::Zsav, code) => return Err(Error::InvalidZsavCompression(code)),
- (_, 0) => None,
- (_, 1) => Some(Compression::Simple),
- (_, code) => return Err(Error::InvalidSavCompression(code)),
- };
-
- let weight_index: u32 = endian.parse(read_bytes(r)?);
- let weight_index = (weight_index > 0).then_some(weight_index);
-
- let n_cases: u32 = endian.parse(read_bytes(r)?);
- let n_cases = (n_cases < i32::MAX as u32 / 2).then_some(n_cases);
-
- let bias: f64 = endian.parse(read_bytes(r)?);
-
- let creation_date = RawString(read_vec(r, 9)?);
- let creation_time = RawString(read_vec(r, 8)?);
- let file_label = RawString(read_vec(r, 64)?);
- let _: [u8; 3] = read_bytes(r)?;
-
- Ok(HeaderRecord {
- offsets: start..r.stream_position()?,
- magic,
- layout_code,
- nominal_case_size,
- compression,
- weight_index,
- n_cases,
- bias,
- creation_date,
- creation_time,
- eye_catcher,
- file_label,
- endian,
- })
- }
-
- pub fn decode(self, decoder: &Decoder) -> DecodedRecord {
- let eye_catcher = decoder.decode(&self.eye_catcher).to_string();
- let file_label = decoder.decode(&self.file_label).to_string();
- let creation_date = decoder.decode(&self.creation_date).to_string();
- let creation_time = decoder.decode(&self.creation_time).to_string();
- DecodedRecord::Header(HeaderRecord {
- eye_catcher,
- weight_index: self.weight_index,
- n_cases: self.n_cases,
- file_label,
- offsets: self.offsets.clone(),
- magic: self.magic,
- layout_code: self.layout_code,
- nominal_case_size: self.nominal_case_size,
- compression: self.compression,
- bias: self.bias,
- creation_date,
- creation_time,
- endian: self.endian,
- })
- }
-}
-
-pub struct Decoder {
- pub encoding: &'static Encoding,
- pub warn: Box<dyn Fn(Warning)>,
-}
-
-impl Decoder {
- pub fn new<F>(encoding: &'static Encoding, warn: F) -> Self
- where
- F: Fn(Warning) + 'static,
- {
- Self {
- encoding,
- warn: Box::new(warn),
- }
- }
- fn warn(&self, warning: Warning) {
- (self.warn)(warning)
- }
- fn decode_slice<'a>(&self, input: &'a [u8]) -> Cow<'a, str> {
- let (output, malformed) = self.encoding.decode_without_bom_handling(input);
- if malformed {
- self.warn(Warning::MalformedString {
- encoding: self.encoding.name().into(),
- text: output.clone().into(),
- });
- }
- output
- }
-
- fn decode<'a>(&self, input: &'a RawString) -> Cow<'a, str> {
- self.decode_slice(input.0.as_slice())
- }
-
- /// Returns `input` decoded from `self.encoding` into UTF-8 such that
- /// re-encoding the result back into `self.encoding` will have exactly the
- /// same length in bytes.
- ///
- /// XXX warn about errors?
- pub fn decode_exact_length<'a>(&self, input: &'a [u8]) -> Cow<'a, str> {
- if let (s, false) = self.encoding.decode_without_bom_handling(input) {
- // This is the common case. Usually there will be no errors.
- s
- } else {
- // Unusual case. Don't bother to optimize it much.
- let mut decoder = self.encoding.new_decoder_without_bom_handling();
- let mut output = String::with_capacity(
- decoder
- .max_utf8_buffer_length_without_replacement(input.len())
- .unwrap(),
- );
- let mut rest = input;
- while !rest.is_empty() {
- match decoder.decode_to_string_without_replacement(rest, &mut output, true) {
- (DecoderResult::InputEmpty, _) => break,
- (DecoderResult::OutputFull, _) => unreachable!(),
- (DecoderResult::Malformed(a, b), consumed) => {
- let skipped = a as usize + b as usize;
- output.extend(repeat('?').take(skipped));
- rest = &rest[consumed..];
- }
- }
- }
- assert_eq!(self.encoding.encode(&output).0.len(), input.len());
- output.into()
- }
- }
-
- pub fn decode_identifier(&self, input: &RawString) -> Result<Identifier, IdError> {
- self.new_identifier(&self.decode(input))
- }
-
- pub fn new_identifier(&self, name: &str) -> Result<Identifier, IdError> {
- Identifier::from_encoding(name, self.encoding)
- }
-}
-
-impl<S> Header for HeaderRecord<S>
-where
- S: Debug,
-{
- fn offsets(&self) -> Range<u64> {
- self.offsets.clone()
- }
-}
-
-#[derive(Copy, Clone, PartialEq, Eq, Hash)]
-pub enum Magic {
- /// Regular system file.
- Sav,
-
- /// System file with Zlib-compressed data.
- Zsav,
-
- /// EBCDIC-encoded system file.
- Ebcdic,
-}
-
-impl Magic {
- /// Magic number for a regular system file.
- pub const SAV: [u8; 4] = *b"$FL2";
-
- /// Magic number for a system file that contains zlib-compressed data.
- pub const ZSAV: [u8; 4] = *b"$FL3";
-
- /// Magic number for an EBCDIC-encoded system file. This is `$FL2` encoded
- /// in EBCDIC.
- pub const EBCDIC: [u8; 4] = [0x5b, 0xc6, 0xd3, 0xf2];
-}
-
-impl Debug for Magic {
- fn fmt(&self, f: &mut Formatter) -> FmtResult {
- let s = match *self {
- Magic::Sav => "$FL2",
- Magic::Zsav => "$FL3",
- Magic::Ebcdic => "($FL2 in EBCDIC)",
- };
- write!(f, "{s}")
- }
-}
-
-impl TryFrom<[u8; 4]> for Magic {
- type Error = Error;
-
- fn try_from(value: [u8; 4]) -> Result<Self, Self::Error> {
- match value {
- Magic::SAV => Ok(Magic::Sav),
- Magic::ZSAV => Ok(Magic::Zsav),
- Magic::EBCDIC => Ok(Magic::Ebcdic),
- _ => Err(Error::BadMagic(value)),
- }
- }
-}
-
-#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
-pub enum VarType {
- Numeric,
- String,
-}
-
-impl VarType {
- pub fn from_width(width: VarWidth) -> VarType {
- match width {
- VarWidth::Numeric => Self::Numeric,
- VarWidth::String(_) => Self::String,
- }
- }
-
- pub fn opposite(self) -> VarType {
- match self {
- Self::Numeric => Self::String,
- Self::String => Self::Numeric,
- }
- }
-}
-
-impl Display for VarType {
- fn fmt(&self, f: &mut Formatter) -> FmtResult {
- match self {
- VarType::Numeric => write!(f, "numeric"),
- VarType::String => write!(f, "string"),
- }
- }
-}
-
-#[derive(Copy, Clone)]
-pub enum Value<S>
-where
- S: Debug,
-{
- Number(Option<f64>),
- String(S),
-}
-
-type RawValue = Value<RawStr<8>>;
-
-impl<S> Debug for Value<S>
-where
- S: Debug,
-{
- fn fmt(&self, f: &mut Formatter) -> FmtResult {
- match self {
- Value::Number(Some(number)) => write!(f, "{number:?}"),
- Value::Number(None) => write!(f, "SYSMIS"),
- Value::String(s) => write!(f, "{:?}", s),
- }
- }
-}
-
-impl RawValue {
- fn read<R: Read>(r: &mut R, var_type: VarType, endian: Endian) -> Result<Self, IoError> {
- Ok(Self::from_raw(
- &UntypedValue(read_bytes(r)?),
- var_type,
- endian,
- ))
- }
-
- pub fn from_raw(raw: &UntypedValue, var_type: VarType, endian: Endian) -> Self {
- match var_type {
- VarType::String => Value::String(RawStr(raw.0)),
- VarType::Numeric => {
- let number: f64 = endian.parse(raw.0);
- Value::Number((number != -f64::MAX).then_some(number))
- }
- }
- }
-
- fn read_case<R: Read + Seek>(
- reader: &mut R,
- var_types: &[VarType],
- endian: Endian,
- ) -> Result<Option<Vec<Self>>, Error> {
- let case_start = reader.stream_position()?;
- let mut values = Vec::with_capacity(var_types.len());
- for (i, &var_type) in var_types.iter().enumerate() {
- let Some(raw) = try_read_bytes(reader)? else {
- if i == 0 {
- return Ok(None);
- } else {
- let offset = reader.stream_position()?;
- return Err(Error::EofInCase {
- offset,
- case_ofs: offset - case_start,
- case_len: var_types.len() * 8,
- });
- }
- };
- values.push(Value::from_raw(&UntypedValue(raw), var_type, endian));
- }
- Ok(Some(values))
- }
-
- fn read_compressed_case<R: Read + Seek>(
- reader: &mut R,
- var_types: &[VarType],
- codes: &mut VecDeque<u8>,
- endian: Endian,
- bias: f64,
- ) -> Result<Option<Vec<Self>>, Error> {
- let case_start = reader.stream_position()?;
- let mut values = Vec::with_capacity(var_types.len());
- for (i, &var_type) in var_types.iter().enumerate() {
- let value = loop {
- let Some(code) = codes.pop_front() else {
- let Some(new_codes): Option<[u8; 8]> = try_read_bytes(reader)? else {
- if i == 0 {
- return Ok(None);
- } else {
- let offset = reader.stream_position()?;
- return Err(Error::EofInCompressedCase {
- offset,
- case_ofs: offset - case_start,
- });
- }
- };
- codes.extend(new_codes.into_iter());
- continue;
- };
- match code {
- 0 => (),
- 1..=251 => match var_type {
- VarType::Numeric => break Self::Number(Some(code as f64 - bias)),
- VarType::String => {
- break Self::String(RawStr(endian.to_bytes(code as f64 - bias)))
- }
- },
- 252 => {
- if i == 0 {
- return Ok(None);
- } else {
- let offset = reader.stream_position()?;
- return Err(Error::PartialCompressedCase {
- offset,
- case_ofs: offset - case_start,
- });
- }
- }
- 253 => {
- break Self::from_raw(&UntypedValue(read_bytes(reader)?), var_type, endian)
- }
- 254 => match var_type {
- VarType::String => break Self::String(RawStr(*b" ")), // XXX EBCDIC
- VarType::Numeric => {
- return Err(Error::CompressedStringExpected {
- offset: case_start,
- case_ofs: reader.stream_position()? - case_start,
- })
- }
- },
- 255 => match var_type {
- VarType::Numeric => break Self::Number(None),
- VarType::String => {
- return Err(Error::CompressedNumberExpected {
- offset: case_start,
- case_ofs: reader.stream_position()? - case_start,
- })
- }
- },
- }
- };
- values.push(value);
- }
- Ok(Some(values))
- }
-
- pub fn decode(self, decoder: &Decoder) -> Value<String> {
- match self {
- Self::Number(x) => Value::Number(x),
- Self::String(s) => Value::String(decoder.decode_exact_length(&s.0).into()),
- }
- }
-}
-
-struct ZlibDecodeMultiple<R>
-where
- R: Read + Seek,
-{
- reader: Option<ZlibDecoder<R>>,
-}
-
-impl<R> ZlibDecodeMultiple<R>
-where
- R: Read + Seek,
-{
- fn new(reader: R) -> ZlibDecodeMultiple<R> {
- ZlibDecodeMultiple {
- reader: Some(ZlibDecoder::new(reader)),
- }
- }
-}
-
-impl<R> Read for ZlibDecodeMultiple<R>
-where
- R: Read + Seek,
-{
- fn read(&mut self, buf: &mut [u8]) -> Result<usize, IoError> {
- loop {
- match self.reader.as_mut().unwrap().read(buf)? {
- 0 => {
- let inner = self.reader.take().unwrap().into_inner();
- self.reader = Some(ZlibDecoder::new(inner));
- }
- n => return Ok(n),
- };
- }
- }
-}
-
-impl<R> Seek for ZlibDecodeMultiple<R>
-where
- R: Read + Seek,
-{
- fn seek(&mut self, pos: SeekFrom) -> Result<u64, IoError> {
- self.reader.as_mut().unwrap().get_mut().seek(pos)
- }
-}
-
-enum ReaderState {
- Start,
- Headers,
- ZlibHeader,
- ZlibTrailer {
- ztrailer_offset: u64,
- ztrailer_len: u64,
- },
- Cases,
- End,
-}
-
-pub struct Reader<R>
-where
- R: Read + Seek + 'static,
-{
- reader: Option<R>,
- warn: Box<dyn Fn(Warning)>,
-
- header: HeaderRecord<RawString>,
- var_types: Vec<VarType>,
-
- state: ReaderState,
-}
-
-impl<R> Reader<R>
-where
- R: Read + Seek + 'static,
-{
- pub fn new<F>(mut reader: R, warn: F) -> Result<Self, Error>
- where
- F: Fn(Warning) + 'static,
- {
- let header = HeaderRecord::read(&mut reader)?;
- Ok(Self {
- reader: Some(reader),
- warn: Box::new(warn),
- header,
- var_types: Vec::new(),
- state: ReaderState::Start,
- })
- }
- fn cases(&mut self) -> Cases {
- self.state = ReaderState::End;
- Cases::new(
- self.reader.take().unwrap(),
- take(&mut self.var_types),
- &self.header,
- )
- }
- fn _next(&mut self) -> Option<<Self as Iterator>::Item> {
- match self.state {
- ReaderState::Start => {
- self.state = ReaderState::Headers;
- Some(Ok(Record::Header(self.header.clone())))
- }
- ReaderState::Headers => {
- let record = loop {
- match Record::read(
- self.reader.as_mut().unwrap(),
- self.header.endian,
- self.var_types.as_slice(),
- &self.warn,
- ) {
- Ok(Some(record)) => break record,
- Ok(None) => (),
- Err(error) => return Some(Err(error)),
- }
- };
- match record {
- Record::Variable(VariableRecord { width, .. }) => {
- self.var_types.push(if width == 0 {
- VarType::Numeric
- } else {
- VarType::String
- });
- }
- Record::EndOfHeaders(_) => {
- self.state = if let Some(Compression::ZLib) = self.header.compression {
- ReaderState::ZlibHeader
- } else {
- ReaderState::Cases
- };
- }
- _ => (),
- };
- Some(Ok(record))
- }
- ReaderState::ZlibHeader => {
- let zheader = match ZHeader::read(self.reader.as_mut().unwrap(), self.header.endian)
- {
- Ok(zheader) => zheader,
- Err(error) => return Some(Err(error)),
- };
- self.state = ReaderState::ZlibTrailer {
- ztrailer_offset: zheader.ztrailer_offset,
- ztrailer_len: zheader.ztrailer_len,
- };
- Some(Ok(Record::ZHeader(zheader)))
- }
- ReaderState::ZlibTrailer {
- ztrailer_offset,
- ztrailer_len,
- } => {
- match ZTrailer::read(
- self.reader.as_mut().unwrap(),
- self.header.endian,
- ztrailer_offset,
- ztrailer_len,
- ) {
- Ok(None) => Some(Ok(Record::Cases(Rc::new(RefCell::new(self.cases()))))),
- Ok(Some(ztrailer)) => Some(Ok(Record::ZTrailer(ztrailer))),
- Err(error) => Some(Err(error)),
- }
- }
- ReaderState::Cases => Some(Ok(Record::Cases(Rc::new(RefCell::new(self.cases()))))),
- ReaderState::End => None,
- }
- }
-}
-
-impl<R> Iterator for Reader<R>
-where
- R: Read + Seek + 'static,
-{
- type Item = Result<Record, Error>;
-
- fn next(&mut self) -> Option<Self::Item> {
- let retval = self._next();
- if matches!(retval, Some(Err(_))) {
- self.state = ReaderState::End;
- }
- retval
- }
-}
-
-trait ReadSeek: Read + Seek {}
-impl<T> ReadSeek for T where T: Read + Seek {}
-
-pub struct Cases {
- reader: Box<dyn ReadSeek>,
- var_types: Vec<VarType>,
- compression: Option<Compression>,
- bias: f64,
- endian: Endian,
- codes: VecDeque<u8>,
- eof: bool,
-}
-
-impl Debug for Cases {
- fn fmt(&self, f: &mut Formatter) -> FmtResult {
- write!(f, "Cases")
- }
-}
-
-impl Cases {
- fn new<R>(reader: R, var_types: Vec<VarType>, header: &HeaderRecord<RawString>) -> Self
- where
- R: Read + Seek + 'static,
- {
- Self {
- reader: if header.compression == Some(Compression::ZLib) {
- Box::new(ZlibDecodeMultiple::new(reader))
- } else {
- Box::new(reader)
- },
- var_types,
- compression: header.compression,
- bias: header.bias,
- endian: header.endian,
- codes: VecDeque::with_capacity(8),
- eof: false,
- }
- }
-}
-
-impl Iterator for Cases {
- type Item = Result<Vec<RawValue>, Error>;
-
- fn next(&mut self) -> Option<Self::Item> {
- if self.eof {
- return None;
- }
-
- let retval = if self.compression.is_some() {
- Value::read_compressed_case(
- &mut self.reader,
- &self.var_types,
- &mut self.codes,
- self.endian,
- self.bias,
- )
- .transpose()
- } else {
- Value::read_case(&mut self.reader, &self.var_types, self.endian).transpose()
- };
- self.eof = matches!(retval, None | Some(Err(_)));
- retval
- }
-}
-
-#[derive(Copy, Clone, PartialEq, Eq, Hash)]
-pub struct Spec(pub u32);
-
-impl Debug for Spec {
- fn fmt(&self, f: &mut Formatter) -> FmtResult {
- let type_ = format_name(self.0 >> 16);
- let w = (self.0 >> 8) & 0xff;
- let d = self.0 & 0xff;
- write!(f, "{:06x} ({type_}{w}.{d})", self.0)
- }
-}
-
-fn format_name(type_: u32) -> Cow<'static, str> {
- match type_ {
- 1 => "A",
- 2 => "AHEX",
- 3 => "COMMA",
- 4 => "DOLLAR",
- 5 => "F",
- 6 => "IB",
- 7 => "PIBHEX",
- 8 => "P",
- 9 => "PIB",
- 10 => "PK",
- 11 => "RB",
- 12 => "RBHEX",
- 15 => "Z",
- 16 => "N",
- 17 => "E",
- 20 => "DATE",
- 21 => "TIME",
- 22 => "DATETIME",
- 23 => "ADATE",
- 24 => "JDATE",
- 25 => "DTIME",
- 26 => "WKDAY",
- 27 => "MONTH",
- 28 => "MOYR",
- 29 => "QYR",
- 30 => "WKYR",
- 31 => "PCT",
- 32 => "DOT",
- 33 => "CCA",
- 34 => "CCB",
- 35 => "CCC",
- 36 => "CCD",
- 37 => "CCE",
- 38 => "EDATE",
- 39 => "SDATE",
- 40 => "MTIME",
- 41 => "YMDHMS",
- _ => return format!("<unknown format {type_}>").into(),
- }
- .into()
-}
-
-#[derive(Clone)]
-pub struct MissingValues<S = String>
-where
- S: Debug,
-{
- /// Individual missing values, up to 3 of them.
- pub values: Vec<Value<S>>,
-
- /// Optional range of missing values.
- pub range: Option<(Value<S>, Value<S>)>,
-}
-
-impl<S> Debug for MissingValues<S>
-where
- S: Debug,
-{
- fn fmt(&self, f: &mut Formatter) -> FmtResult {
- for (i, value) in self.values.iter().enumerate() {
- if i > 0 {
- write!(f, ", ")?;
- }
- write!(f, "{value:?}")?;
- }
-
- if let Some((low, high)) = &self.range {
- if !self.values.is_empty() {
- write!(f, ", ")?;
- }
- write!(f, "{low:?} THRU {high:?}")?;
- }
-
- if self.is_empty() {
- write!(f, "none")?;
- }
-
- Ok(())
- }
-}
-
-impl<S> MissingValues<S>
-where
- S: Debug,
-{
- fn is_empty(&self) -> bool {
- self.values.is_empty() && self.range.is_none()
- }
-}
-
-impl<S> Default for MissingValues<S>
-where
- S: Debug,
-{
- fn default() -> Self {
- Self {
- values: Vec::new(),
- range: None,
- }
- }
-}
-
-impl MissingValues<RawStr<8>> {
- fn read<R: Read + Seek>(
- r: &mut R,
- offset: u64,
- width: i32,
- code: i32,
- endian: Endian,
- ) -> Result<Self, Error> {
- let (n_values, has_range) = match (width, code) {
- (_, 0..=3) => (code, false),
- (0, -2) => (0, true),
- (0, -3) => (1, true),
- (0, _) => return Err(Error::BadNumericMissingValueCode { offset, code }),
- (_, _) => return Err(Error::BadStringMissingValueCode { offset, code }),
- };
-
- let var_type = if width == 0 {
- VarType::Numeric
- } else {
- VarType::String
- };
-
- let mut values = Vec::new();
- for _ in 0..n_values {
- values.push(RawValue::read(r, var_type, endian)?);
- }
- let range = if has_range {
- let low = RawValue::read(r, var_type, endian)?;
- let high = RawValue::read(r, var_type, endian)?;
- Some((low, high))
- } else {
- None
- };
- Ok(Self { values, range })
- }
- fn decode(&self, decoder: &Decoder) -> MissingValues<String> {
- MissingValues {
- values: self
- .values
- .iter()
- .map(|value| value.decode(decoder))
- .collect(),
- range: self
- .range
- .as_ref()
- .map(|(low, high)| (low.decode(decoder), high.decode(decoder))),
- }
- }
-}
-
-#[derive(Clone)]
-pub struct VariableRecord<S, V>
-where
- S: Debug,
- V: Debug,
-{
- /// Range of offsets in file.
- pub offsets: Range<u64>,
-
- /// Variable width, in the range -1..=255.
- pub width: i32,
-
- /// Variable name, padded on the right with spaces.
- pub name: S,
-
- /// Print format.
- pub print_format: Spec,
-
- /// Write format.
- pub write_format: Spec,
-
- /// Missing values.
- pub missing_values: MissingValues<V>,
-
- /// Optional variable label.
- pub label: Option<S>,
-}
-
-impl<S, V> Debug for VariableRecord<S, V>
-where
- S: Debug,
- V: Debug,
-{
- fn fmt(&self, f: &mut Formatter) -> FmtResult {
- writeln!(
- f,
- "Width: {} ({})",
- self.width,
- match self.width.cmp(&0) {
- Ordering::Greater => "string",
- Ordering::Equal => "numeric",
- Ordering::Less => "long string continuation record",
- }
- )?;
- writeln!(f, "Print format: {:?}", self.print_format)?;
- writeln!(f, "Write format: {:?}", self.write_format)?;
- writeln!(f, "Name: {:?}", &self.name)?;
- writeln!(f, "Variable label: {:?}", self.label)?;
- writeln!(f, "Missing values: {:?}", self.missing_values)
- }
-}
-
-impl VariableRecord<RawString, RawStr<8>> {
- fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<Record, Error> {
- let start_offset = r.stream_position()?;
- let width: i32 = endian.parse(read_bytes(r)?);
- if !(-1..=255).contains(&width) {
- return Err(Error::BadVariableWidth {
- start_offset,
- width,
- });
- }
- let code_offset = r.stream_position()?;
- let has_variable_label: u32 = endian.parse(read_bytes(r)?);
- let missing_value_code: i32 = endian.parse(read_bytes(r)?);
- let print_format = Spec(endian.parse(read_bytes(r)?));
- let write_format = Spec(endian.parse(read_bytes(r)?));
- let name = RawString(read_vec(r, 8)?);
-
- let label = match has_variable_label {
- 0 => None,
- 1 => {
- let len: u32 = endian.parse(read_bytes(r)?);
- let read_len = len.min(65535) as usize;
- let label = RawString(read_vec(r, read_len)?);
-
- let padding_bytes = Integer::next_multiple_of(&len, &4) - len;
- let _ = read_vec(r, padding_bytes as usize)?;
-
- Some(label)
- }
- _ => {
- return Err(Error::BadVariableLabelCode {
- start_offset,
- code_offset,
- code: has_variable_label,
- })
- }
- };
-
- let missing_values =
- MissingValues::read(r, start_offset, width, missing_value_code, endian)?;
-
- let end_offset = r.stream_position()?;
-
- Ok(Record::Variable(VariableRecord {
- offsets: start_offset..end_offset,
- width,
- name,
- print_format,
- write_format,
- missing_values,
- label,
- }))
- }
-
- pub fn decode(self, decoder: &Decoder) -> DecodedRecord {
- DecodedRecord::Variable(VariableRecord {
- offsets: self.offsets.clone(),
- width: self.width,
- name: decoder.decode(&self.name).to_string(),
- print_format: self.print_format,
- write_format: self.write_format,
- missing_values: self.missing_values.decode(decoder),
- label: self
- .label
- .as_ref()
- .map(|label| decoder.decode(label).to_string()),
- })
- }
-}
-
-#[derive(Copy, Clone)]
-pub struct UntypedValue(pub [u8; 8]);
-
-impl Debug for UntypedValue {
- fn fmt(&self, f: &mut Formatter) -> FmtResult {
- let little: f64 = Endian::Little.parse(self.0);
- let little = format!("{:?}", little);
- let big: f64 = Endian::Big.parse(self.0);
- let big = format!("{:?}", big);
- let number = if little.len() <= big.len() {
- little
- } else {
- big
- };
- write!(f, "{number}")?;
-
- let string = default_decode(&self.0);
- let string = string
- .split(|c: char| c == '\0' || c.is_control())
- .next()
- .unwrap();
- write!(f, "{string:?}")?;
- Ok(())
- }
-}
-
-#[derive(Clone)]
-pub struct RawString(pub Vec<u8>);
-
-impl From<Vec<u8>> for RawString {
- fn from(source: Vec<u8>) -> Self {
- Self(source)
- }
-}
-
-impl From<&[u8]> for RawString {
- fn from(source: &[u8]) -> Self {
- Self(source.into())
- }
-}
-
-impl Debug for RawString {
- fn fmt(&self, f: &mut Formatter) -> FmtResult {
- write!(f, "{:?}", default_decode(self.0.as_slice()))
- }
-}
-
-#[derive(Copy, Clone)]
-pub struct RawStr<const N: usize>(pub [u8; N]);
-
-impl<const N: usize> From<[u8; N]> for RawStr<N> {
- fn from(source: [u8; N]) -> Self {
- Self(source)
- }
-}
-
-impl<const N: usize> Debug for RawStr<N> {
- fn fmt(&self, f: &mut Formatter) -> FmtResult {
- write!(f, "{:?}", default_decode(&self.0))
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct ValueLabel<V, S>
-where
- V: Debug,
- S: Debug,
-{
- pub value: Value<V>,
- pub label: S,
-}
-
-#[derive(Clone)]
-pub struct ValueLabelRecord<V, S>
-where
- V: Debug,
- S: Debug,
-{
- /// Range of offsets in file.
- pub offsets: Range<u64>,
-
- /// The labels.
- pub labels: Vec<ValueLabel<V, S>>,
-
- /// The 1-based indexes of the variable indexes.
- pub dict_indexes: Vec<u32>,
-
- /// The types of the variables.
- pub var_type: VarType,
-}
-
-impl<V, S> Debug for ValueLabelRecord<V, S>
-where
- V: Debug,
- S: Debug,
-{
- fn fmt(&self, f: &mut Formatter) -> FmtResult {
- writeln!(f, "labels: ")?;
- for label in self.labels.iter() {
- writeln!(f, "{label:?}")?;
- }
- write!(f, "apply to {} variables", self.var_type)?;
- for dict_index in self.dict_indexes.iter() {
- write!(f, " #{dict_index}")?;
- }
- Ok(())
- }
-}
-
-impl<V, S> Header for ValueLabelRecord<V, S>
-where
- V: Debug,
- S: Debug,
-{
- fn offsets(&self) -> Range<u64> {
- self.offsets.clone()
- }
-}
-
-impl<V, S> ValueLabelRecord<V, S>
-where
- V: Debug,
- S: Debug,
-{
- /// Maximum number of value labels in a record.
- pub const MAX_LABELS: u32 = u32::MAX / 8;
-
- /// Maximum number of variable indexes in a record.
- pub const MAX_INDEXES: u32 = u32::MAX / 8;
-}
-
-impl ValueLabelRecord<RawStr<8>, RawString> {
- fn read<R: Read + Seek>(
- r: &mut R,
- endian: Endian,
- var_types: &[VarType],
- warn: &dyn Fn(Warning),
- ) -> Result<Option<Record>, Error> {
- let label_offset = r.stream_position()?;
- let n: u32 = endian.parse(read_bytes(r)?);
- if n > Self::MAX_LABELS {
- return Err(Error::BadNumberOfValueLabels {
- offset: label_offset,
- n,
- max: Self::MAX_LABELS,
- });
- }
-
- let mut labels = Vec::new();
- for _ in 0..n {
- let value = UntypedValue(read_bytes(r)?);
- let label_len: u8 = endian.parse(read_bytes(r)?);
- let label_len = label_len as usize;
- let padded_len = Integer::next_multiple_of(&(label_len + 1), &8);
-
- let mut label = read_vec(r, padded_len - 1)?;
- label.truncate(label_len);
- labels.push((value, RawString(label)));
- }
-
- let index_offset = r.stream_position()?;
- let rec_type: u32 = endian.parse(read_bytes(r)?);
- if rec_type != 4 {
- return Err(Error::ExpectedVarIndexRecord {
- offset: index_offset,
- rec_type,
- });
- }
-
- let n: u32 = endian.parse(read_bytes(r)?);
- if n > Self::MAX_INDEXES {
- return Err(Error::TooManyVarIndexes {
- offset: index_offset,
- n,
- max: Self::MAX_INDEXES,
- });
- } else if n == 0 {
- warn(Warning::NoVarIndexes {
- offset: index_offset,
- });
- return Ok(None);
- }
-
- let index_offset = r.stream_position()?;
- let mut dict_indexes = Vec::with_capacity(n as usize);
- let mut invalid_indexes = Vec::new();
- for _ in 0..n {
- let index: u32 = endian.parse(read_bytes(r)?);
- if index == 0 || index as usize > var_types.len() {
- dict_indexes.push(index);
- } else {
- invalid_indexes.push(index);
- }
- }
- if !invalid_indexes.is_empty() {
- warn(Warning::InvalidVarIndexes {
- offset: index_offset,
- max: var_types.len(),
- invalid: invalid_indexes,
- });
- }
-
- let Some(&first_index) = dict_indexes.first() else {
- return Ok(None);
- };
- let var_type = var_types[first_index as usize - 1];
- let mut wrong_type_indexes = Vec::new();
- dict_indexes.retain(|&index| {
- if var_types[index as usize - 1] != var_type {
- wrong_type_indexes.push(index);
- false
- } else {
- true
- }
- });
- if !wrong_type_indexes.is_empty() {
- warn(Warning::MixedVarTypes {
- offset: index_offset,
- var_type,
- wrong_types: wrong_type_indexes,
- });
- }
-
- let labels = labels
- .into_iter()
- .map(|(value, label)| ValueLabel {
- value: Value::from_raw(&value, var_type, endian),
- label,
- })
- .collect();
-
- let end_offset = r.stream_position()?;
- Ok(Some(Record::ValueLabel(ValueLabelRecord {
- offsets: label_offset..end_offset,
- labels,
- dict_indexes,
- var_type,
- })))
- }
-
- fn decode(self, decoder: &Decoder) -> ValueLabelRecord<RawStr<8>, String> {
- let labels = self
- .labels
- .iter()
- .map(|ValueLabel { value, label }| ValueLabel {
- value: *value,
- label: decoder.decode(label).to_string(),
- })
- .collect();
- ValueLabelRecord {
- offsets: self.offsets.clone(),
- labels,
- dict_indexes: self.dict_indexes.clone(),
- var_type: self.var_type,
- }
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct DocumentRecord<S>
-where
- S: Debug,
-{
- pub offsets: Range<u64>,
-
- /// The document, as an array of lines. Raw lines are exactly 80 bytes long
- /// and are right-padded with spaces without any new-line termination.
- pub lines: Vec<S>,
-}
-
-pub type RawDocumentLine = RawStr<DOC_LINE_LEN>;
-
-/// Length of a line in a document. Document lines are fixed-length and
-/// padded on the right with spaces.
-pub const DOC_LINE_LEN: usize = 80;
-
-impl DocumentRecord<RawDocumentLine> {
- /// Maximum number of lines we will accept in a document. This is simply
- /// the maximum number that will fit in a 32-bit space.
- pub const MAX_LINES: usize = i32::MAX as usize / DOC_LINE_LEN;
-
- fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<Record, Error> {
- let start_offset = r.stream_position()?;
- let n: u32 = endian.parse(read_bytes(r)?);
- let n = n as usize;
- if n > Self::MAX_LINES {
- Err(Error::BadDocumentLength {
- offset: start_offset,
- n,
- max: Self::MAX_LINES,
- })
- } else {
- let mut lines = Vec::with_capacity(n);
- for _ in 0..n {
- lines.push(RawStr(read_bytes(r)?));
- }
- let end_offset = r.stream_position()?;
- Ok(Record::Document(DocumentRecord {
- offsets: start_offset..end_offset,
- lines,
- }))
- }
- }
-
- pub fn decode(self, decoder: &Decoder) -> DecodedRecord {
- DecodedRecord::Document(DocumentRecord {
- offsets: self.offsets.clone(),
- lines: self
- .lines
- .iter()
- .map(|s| decoder.decode_slice(&s.0).to_string())
- .collect(),
- })
- }
-}
-
-impl<S> Header for DocumentRecord<S>
-where
- S: Debug,
-{
- fn offsets(&self) -> Range<u64> {
- self.offsets.clone()
- }
-}
-
-trait ExtensionRecord {
- const SUBTYPE: u32;
- const SIZE: Option<u32>;
- const COUNT: Option<u32>;
- const NAME: &'static str;
- fn parse(ext: &Extension, endian: Endian) -> Result<Record, Warning>;
-}
-
-#[derive(Clone, Debug)]
-pub struct IntegerInfoRecord {
- pub offsets: Range<u64>,
- pub version: (i32, i32, i32),
- pub machine_code: i32,
- pub floating_point_rep: i32,
- pub compression_code: i32,
- pub endianness: i32,
- pub character_code: i32,
-}
-
-impl ExtensionRecord for IntegerInfoRecord {
- const SUBTYPE: u32 = 3;
- const SIZE: Option<u32> = Some(4);
- const COUNT: Option<u32> = Some(8);
- const NAME: &'static str = "integer record";
-
- fn parse(ext: &Extension, endian: Endian) -> Result<Record, Warning> {
- ext.check_size::<Self>()?;
-
- let mut input = &ext.data[..];
- let data: Vec<i32> = (0..8)
- .map(|_| endian.parse(read_bytes(&mut input).unwrap()))
- .collect();
- Ok(Record::IntegerInfo(IntegerInfoRecord {
- offsets: ext.offsets.clone(),
- version: (data[0], data[1], data[2]),
- machine_code: data[3],
- floating_point_rep: data[4],
- compression_code: data[5],
- endianness: data[6],
- character_code: data[7],
- }))
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct FloatInfoRecord {
- pub sysmis: f64,
- pub highest: f64,
- pub lowest: f64,
-}
-
-impl ExtensionRecord for FloatInfoRecord {
- const SUBTYPE: u32 = 4;
- const SIZE: Option<u32> = Some(8);
- const COUNT: Option<u32> = Some(3);
- const NAME: &'static str = "floating point record";
-
- fn parse(ext: &Extension, endian: Endian) -> Result<Record, Warning> {
- ext.check_size::<Self>()?;
-
- let mut input = &ext.data[..];
- let data: Vec<f64> = (0..3)
- .map(|_| endian.parse(read_bytes(&mut input).unwrap()))
- .collect();
- Ok(Record::FloatInfo(FloatInfoRecord {
- sysmis: data[0],
- highest: data[1],
- lowest: data[2],
- }))
- }
-}
-
-#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
-pub enum CategoryLabels {
- VarLabels,
- CountedValues,
-}
-
-#[derive(Clone, Debug)]
-pub enum MultipleResponseType {
- MultipleDichotomy {
- value: RawString,
- labels: CategoryLabels,
- },
- MultipleCategory,
-}
-
-impl MultipleResponseType {
- fn parse(input: &[u8]) -> Result<(MultipleResponseType, &[u8]), Warning> {
- let (mr_type, input) = match input.split_first() {
- Some((b'C', input)) => (MultipleResponseType::MultipleCategory, input),
- Some((b'D', input)) => {
- let (value, input) = parse_counted_string(input)?;
- (
- MultipleResponseType::MultipleDichotomy {
- value,
- labels: CategoryLabels::VarLabels,
- },
- input,
- )
- }
- Some((b'E', input)) => {
- let (labels, input) = if let Some(rest) = input.strip_prefix(b" 1 ") {
- (CategoryLabels::CountedValues, rest)
- } else if let Some(rest) = input.strip_prefix(b" 11 ") {
- (CategoryLabels::VarLabels, rest)
- } else {
- return Err(Warning::TBD);
- };
- let (value, input) = parse_counted_string(input)?;
- (
- MultipleResponseType::MultipleDichotomy { value, labels },
- input,
- )
- }
- _ => return Err(Warning::TBD),
- };
- Ok((mr_type, input))
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct MultipleResponseSet<I, S>
-where
- I: Debug,
- S: Debug,
-{
- pub name: I,
- pub label: S,
- pub mr_type: MultipleResponseType,
- pub short_names: Vec<I>,
-}
-
-impl MultipleResponseSet<RawString, RawString> {
- fn parse(input: &[u8]) -> Result<(Self, &[u8]), Warning> {
- let Some(equals) = input.iter().position(|&b| b == b'=') else {
- return Err(Warning::TBD);
- };
- let (name, input) = input.split_at(equals);
- let (mr_type, input) = MultipleResponseType::parse(input)?;
- let Some(input) = input.strip_prefix(b" ") else {
- return Err(Warning::TBD);
- };
- let (label, mut input) = parse_counted_string(input)?;
- let mut vars = Vec::new();
- while input.first() != Some(&b'\n') {
- match input.split_first() {
- Some((b' ', rest)) => {
- let Some(length) = rest.iter().position(|b| b" \n".contains(b)) else {
- return Err(Warning::TBD);
- };
- let (var, rest) = rest.split_at(length);
- if !var.is_empty() {
- vars.push(var.into());
- }
- input = rest;
- }
- _ => return Err(Warning::TBD),
- }
- }
- while input.first() == Some(&b'\n') {
- input = &input[1..];
- }
- Ok((
- MultipleResponseSet {
- name: name.into(),
- label,
- mr_type,
- short_names: vars,
- },
- input,
- ))
- }
-
- fn decode(
- &self,
- decoder: &Decoder,
- ) -> Result<MultipleResponseSet<Identifier, String>, Warning> {
- let mut short_names = Vec::with_capacity(self.short_names.len());
- for short_name in self.short_names.iter() {
- if let Some(short_name) = decoder
- .decode_identifier(short_name)
- .map_err(Warning::InvalidMrSetName)
- .issue_warning(&decoder.warn)
- {
- short_names.push(short_name);
- }
- }
- Ok(MultipleResponseSet {
- name: decoder
- .decode_identifier(&self.name)
- .map_err(Warning::InvalidMrSetVariableName)?,
- label: decoder.decode(&self.label).to_string(),
- mr_type: self.mr_type.clone(),
- short_names,
- })
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct MultipleResponseRecord<I, S>(pub Vec<MultipleResponseSet<I, S>>)
-where
- I: Debug,
- S: Debug;
-
-impl ExtensionRecord for MultipleResponseRecord<RawString, RawString> {
- const SUBTYPE: u32 = 7;
- const SIZE: Option<u32> = Some(1);
- const COUNT: Option<u32> = None;
- const NAME: &'static str = "multiple response set record";
-
- fn parse(ext: &Extension, _endian: Endian) -> Result<Record, Warning> {
- ext.check_size::<Self>()?;
-
- let mut input = &ext.data[..];
- let mut sets = Vec::new();
- while !input.is_empty() {
- let (set, rest) = MultipleResponseSet::parse(input)?;
- sets.push(set);
- input = rest;
- }
- Ok(Record::MultipleResponse(MultipleResponseRecord(sets)))
- }
-}
-
-impl MultipleResponseRecord<RawString, RawString> {
- fn decode(self, decoder: &Decoder) -> DecodedRecord {
- let mut sets = Vec::new();
- for set in self.0.iter() {
- if let Some(set) = set.decode(decoder).issue_warning(&decoder.warn) {
- sets.push(set);
- }
- }
- DecodedRecord::MultipleResponse(MultipleResponseRecord(sets))
- }
-}
-
-fn parse_counted_string(input: &[u8]) -> Result<(RawString, &[u8]), Warning> {
- let Some(space) = input.iter().position(|&b| b == b' ') else {
- return Err(Warning::TBD);
- };
- let Ok(length) = from_utf8(&input[..space]) else {
- return Err(Warning::TBD);
- };
- let Ok(length): Result<usize, _> = length.parse() else {
- return Err(Warning::TBD);
- };
-
- let input = &input[space + 1..];
- if input.len() < length {
- return Err(Warning::TBD);
- };
-
- let (string, rest) = input.split_at(length);
- Ok((string.into(), rest))
-}
-
-#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
-pub enum Measure {
- Nominal,
- Ordinal,
- Scale,
-}
-
-impl Measure {
- pub fn default_for_type(var_type: VarType) -> Option<Measure> {
- match var_type {
- VarType::Numeric => None,
- VarType::String => Some(Self::Nominal),
- }
- }
-
- fn try_decode(source: u32) -> Result<Option<Measure>, Warning> {
- match source {
- 0 => Ok(None),
- 1 => Ok(Some(Measure::Nominal)),
- 2 => Ok(Some(Measure::Ordinal)),
- 3 => Ok(Some(Measure::Scale)),
- _ => Err(Warning::InvalidMeasurement(source)),
- }
- }
-}
-
-#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
-pub enum Alignment {
- Left,
- Right,
- Center,
-}
-
-impl Alignment {
- fn try_decode(source: u32) -> Result<Option<Alignment>, Warning> {
- match source {
- 0 => Ok(None),
- 1 => Ok(Some(Alignment::Left)),
- 2 => Ok(Some(Alignment::Right)),
- 3 => Ok(Some(Alignment::Center)),
- _ => Err(Warning::InvalidAlignment(source)),
- }
- }
-
- pub fn default_for_type(var_type: VarType) -> Self {
- match var_type {
- VarType::Numeric => Self::Right,
- VarType::String => Self::Left,
- }
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct VarDisplay {
- pub measure: Option<Measure>,
- pub width: Option<u32>,
- pub alignment: Option<Alignment>,
-}
-
-#[derive(Clone, Debug)]
-pub struct VarDisplayRecord(pub Vec<VarDisplay>);
-
-impl VarDisplayRecord {
- const SUBTYPE: u32 = 11;
-
- fn parse(
- ext: &Extension,
- n_vars: usize,
- endian: Endian,
- warn: &dyn Fn(Warning),
- ) -> Result<Record, Warning> {
- if ext.size != 4 {
- return Err(Warning::BadRecordSize {
- offset: ext.offsets.start,
- record: String::from("variable display record"),
- size: ext.size,
- expected_size: 4,
- });
- }
-
- let has_width = if ext.count as usize == 3 * n_vars {
- true
- } else if ext.count as usize == 2 * n_vars {
- false
- } else {
- return Err(Warning::TBD);
- };
-
- let mut var_displays = Vec::new();
- let mut input = &ext.data[..];
- for _ in 0..n_vars {
- let measure = Measure::try_decode(endian.parse(read_bytes(&mut input).unwrap()))
- .issue_warning(&warn)
- .flatten();
- let width = has_width.then(|| endian.parse(read_bytes(&mut input).unwrap()));
- let alignment = Alignment::try_decode(endian.parse(read_bytes(&mut input).unwrap()))
- .issue_warning(&warn)
- .flatten();
- var_displays.push(VarDisplay {
- measure,
- width,
- alignment,
- });
- }
- Ok(Record::VarDisplay(VarDisplayRecord(var_displays)))
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct LongStringMissingValues<N, V>
-where
- N: Debug,
- V: Debug,
-{
- /// Variable name.
- pub var_name: N,
-
- /// Missing values.
- pub missing_values: MissingValues<V>,
-}
-
-impl LongStringMissingValues<RawString, RawStr<8>> {
- fn decode(
- &self,
- decoder: &Decoder,
- ) -> Result<LongStringMissingValues<Identifier, String>, IdError> {
- Ok(LongStringMissingValues {
- var_name: decoder.decode_identifier(&self.var_name)?,
- missing_values: self.missing_values.decode(decoder),
- })
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct LongStringMissingValueRecord<N, V>(pub Vec<LongStringMissingValues<N, V>>)
-where
- N: Debug,
- V: Debug;
-
-impl ExtensionRecord for LongStringMissingValueRecord<RawString, RawStr<8>> {
- const SUBTYPE: u32 = 22;
- const SIZE: Option<u32> = Some(1);
- const COUNT: Option<u32> = None;
- const NAME: &'static str = "long string missing values record";
-
- fn parse(ext: &Extension, endian: Endian) -> Result<Record, Warning> {
- ext.check_size::<Self>()?;
-
- let mut input = &ext.data[..];
- let mut missing_value_set = Vec::new();
- while !input.is_empty() {
- let var_name = read_string(&mut input, endian)?;
- let n_missing_values: u8 = endian.parse(read_bytes(&mut input)?);
- let value_len: u32 = endian.parse(read_bytes(&mut input)?);
- if value_len != 8 {
- let offset = (ext.data.len() - input.len() - 8) as u64 + ext.offsets.start;
- return Err(Warning::BadLongMissingValueLength {
- record_offset: ext.offsets.start,
- offset,
- value_len,
- });
- }
- let mut values = Vec::new();
- for i in 0..n_missing_values {
- let value: [u8; 8] = read_bytes(&mut input)?;
- let numeric_value: u64 = endian.parse(value);
- let value = if i > 0 && numeric_value == 8 {
- // Tolerate files written by old, buggy versions of PSPP
- // where we believed that the value_length was repeated
- // before each missing value.
- read_bytes(&mut input)?
- } else {
- value
- };
- values.push(Value::String(RawStr(value)));
- }
- let missing_values = MissingValues {
- values,
- range: None,
- };
- missing_value_set.push(LongStringMissingValues {
- var_name,
- missing_values,
- });
- }
- Ok(Record::LongStringMissingValues(
- LongStringMissingValueRecord(missing_value_set),
- ))
- }
-}
-
-impl LongStringMissingValueRecord<RawString, RawStr<8>> {
- pub fn decode(self, decoder: &Decoder) -> LongStringMissingValueRecord<Identifier, String> {
- let mut mvs = Vec::with_capacity(self.0.len());
- for mv in self.0.iter() {
- if let Some(mv) = mv
- .decode(decoder)
- .map_err(Warning::InvalidLongStringMissingValueVariableName)
- .issue_warning(&decoder.warn)
- {
- mvs.push(mv);
- }
- }
- LongStringMissingValueRecord(mvs)
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct EncodingRecord(pub String);
-
-impl ExtensionRecord for EncodingRecord {
- const SUBTYPE: u32 = 20;
- const SIZE: Option<u32> = Some(1);
- const COUNT: Option<u32> = None;
- const NAME: &'static str = "encoding record";
-
- fn parse(ext: &Extension, _endian: Endian) -> Result<Record, Warning> {
- ext.check_size::<Self>()?;
-
- Ok(Record::Encoding(EncodingRecord(
- String::from_utf8(ext.data.clone()).map_err(|_| Warning::BadEncodingName {
- offset: ext.offsets.start,
- })?,
- )))
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct NumberOfCasesRecord {
- /// Always observed as 1.
- pub one: u64,
-
- /// Number of cases.
- pub n_cases: u64,
-}
-
-impl ExtensionRecord for NumberOfCasesRecord {
- const SUBTYPE: u32 = 16;
- const SIZE: Option<u32> = Some(8);
- const COUNT: Option<u32> = Some(2);
- const NAME: &'static str = "extended number of cases record";
-
- fn parse(ext: &Extension, endian: Endian) -> Result<Record, Warning> {
- ext.check_size::<Self>()?;
-
- let mut input = &ext.data[..];
- let one = endian.parse(read_bytes(&mut input)?);
- let n_cases = endian.parse(read_bytes(&mut input)?);
-
- Ok(Record::NumberOfCases(NumberOfCasesRecord { one, n_cases }))
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct TextRecord {
- pub offsets: Range<u64>,
-
- /// Type of record.
- pub rec_type: TextRecordType,
-
- /// The text content of the record.
- pub text: RawString,
-}
-
-#[derive(Clone, Copy, Debug)]
-pub enum TextRecordType {
- VariableSets,
- ProductInfo,
- LongNames,
- VeryLongStrings,
- FileAttributes,
- VariableAttributes,
-}
-
-impl TextRecord {
- fn new(extension: Extension, rec_type: TextRecordType) -> Self {
- Self {
- offsets: extension.offsets,
- rec_type,
- text: extension.data.into(),
- }
- }
- pub fn decode(self, decoder: &Decoder) -> DecodedRecord {
- match self.rec_type {
- TextRecordType::VariableSets => {
- DecodedRecord::VariableSets(VariableSetRecord::decode(&self, decoder))
- }
- TextRecordType::ProductInfo => {
- DecodedRecord::ProductInfo(ProductInfoRecord::decode(&self, decoder))
- }
- TextRecordType::LongNames => {
- DecodedRecord::LongNames(LongNamesRecord::decode(&self, decoder))
- }
- TextRecordType::VeryLongStrings => {
- DecodedRecord::VeryLongStrings(VeryLongStringsRecord::decode(&self, decoder))
- }
- TextRecordType::FileAttributes => {
- DecodedRecord::FileAttributes(FileAttributeRecord::decode(&self, decoder))
- }
- TextRecordType::VariableAttributes => {
- DecodedRecord::VariableAttributes(VariableAttributeRecord::decode(&self, decoder))
- }
- }
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct VeryLongString {
- pub short_name: Identifier,
- pub length: u16,
-}
-
-impl VeryLongString {
- fn parse(decoder: &Decoder, input: &str) -> Result<VeryLongString, Warning> {
- let Some((short_name, length)) = input.split_once('=') else {
- return Err(Warning::TBD);
- };
- let short_name = decoder
- .new_identifier(short_name)
- .map_err(Warning::InvalidLongStringName)?;
- let length = length.parse().map_err(|_| Warning::TBD)?;
- Ok(VeryLongString { short_name, length })
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct VeryLongStringsRecord(Vec<VeryLongString>);
-
-impl VeryLongStringsRecord {
- fn decode(source: &TextRecord, decoder: &Decoder) -> Self {
- let input = decoder.decode(&source.text);
- let mut very_long_strings = Vec::new();
- for tuple in input
- .split('\0')
- .map(|s| s.trim_end_matches('\t'))
- .filter(|s| !s.is_empty())
- {
- if let Some(vls) = VeryLongString::parse(decoder, tuple).issue_warning(&decoder.warn) {
- very_long_strings.push(vls)
- }
- }
- VeryLongStringsRecord(very_long_strings)
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct Attribute {
- pub name: Identifier,
- pub values: Vec<String>,
-}
-
-impl Attribute {
- fn parse<'a>(decoder: &Decoder, input: &'a str) -> Result<(Attribute, &'a str), Warning> {
- let Some((name, mut input)) = input.split_once('(') else {
- return Err(Warning::TBD);
- };
- let name = decoder
- .new_identifier(name)
- .map_err(Warning::InvalidAttributeName)?;
- let mut values = Vec::new();
- loop {
- let Some((value, rest)) = input.split_once('\n') else {
- return Err(Warning::TBD);
- };
- if let Some(stripped) = value
- .strip_prefix('\'')
- .and_then(|value| value.strip_suffix('\''))
- {
- values.push(stripped.into());
- } else {
- decoder.warn(Warning::TBD);
- values.push(value.into());
- }
- if let Some(rest) = rest.strip_prefix(')') {
- let attribute = Attribute { name, values };
- return Ok((attribute, rest));
- };
- input = rest;
- }
- }
-}
-
-#[derive(Clone, Debug, Default)]
-pub struct AttributeSet(pub HashMap<Identifier, Vec<String>>);
-
-impl AttributeSet {
- fn parse<'a>(
- decoder: &Decoder,
- mut input: &'a str,
- sentinel: Option<char>,
- ) -> Result<(AttributeSet, &'a str), Warning> {
- let mut attributes = HashMap::new();
- let rest = loop {
- match input.chars().next() {
- None => break input,
- c if c == sentinel => break &input[1..],
- _ => {
- let (attribute, rest) = Attribute::parse(decoder, input)?;
- // XXX report duplicate name
- attributes.insert(attribute.name, attribute.values);
- input = rest;
- }
- }
- };
- Ok((AttributeSet(attributes), rest))
- }
-}
-
-#[derive(Clone, Debug, Default)]
-pub struct FileAttributeRecord(pub AttributeSet);
-
-impl FileAttributeRecord {
- fn decode(source: &TextRecord, decoder: &Decoder) -> Self {
- let input = decoder.decode(&source.text);
- match AttributeSet::parse(decoder, &input, None).issue_warning(&decoder.warn) {
- Some((set, rest)) => {
- if !rest.is_empty() {
- decoder.warn(Warning::TBD);
- }
- FileAttributeRecord(set)
- }
- None => FileAttributeRecord::default(),
- }
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct VarAttributeSet {
- pub long_var_name: Identifier,
- pub attributes: AttributeSet,
-}
-
-impl VarAttributeSet {
- fn parse<'a>(decoder: &Decoder, input: &'a str) -> Result<(VarAttributeSet, &'a str), Warning> {
- let Some((long_var_name, rest)) = input.split_once(':') else {
- return Err(Warning::TBD);
- };
- let long_var_name = decoder
- .new_identifier(long_var_name)
- .map_err(Warning::InvalidAttributeVariableName)?;
- let (attributes, rest) = AttributeSet::parse(decoder, rest, Some('/'))?;
- let var_attribute = VarAttributeSet {
- long_var_name,
- attributes,
- };
- Ok((var_attribute, rest))
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct VariableAttributeRecord(Vec<VarAttributeSet>);
-
-impl VariableAttributeRecord {
- fn decode(source: &TextRecord, decoder: &Decoder) -> Self {
- let decoded = decoder.decode(&source.text);
- let mut input = decoded.as_ref();
- let mut var_attribute_sets = Vec::new();
- while !input.is_empty() {
- let Some((var_attribute, rest)) =
- VarAttributeSet::parse(decoder, input).issue_warning(&decoder.warn)
- else {
- break;
- };
- var_attribute_sets.push(var_attribute);
- input = rest;
- }
- VariableAttributeRecord(var_attribute_sets)
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct LongName {
- pub short_name: Identifier,
- pub long_name: Identifier,
-}
-
-impl LongName {
- fn parse(input: &str, decoder: &Decoder) -> Result<Self, Warning> {
- let Some((short_name, long_name)) = input.split_once('=') else {
- return Err(Warning::TBD);
- };
- let short_name = decoder
- .new_identifier(short_name)
- .map_err(Warning::InvalidShortName)?;
- let long_name = decoder
- .new_identifier(long_name)
- .map_err(Warning::InvalidLongName)?;
- Ok(LongName {
- short_name,
- long_name,
- })
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct LongNamesRecord(Vec<LongName>);
-
-impl LongNamesRecord {
- fn decode(source: &TextRecord, decoder: &Decoder) -> Self {
- let input = decoder.decode(&source.text);
- let mut names = Vec::new();
- for pair in input.split('\t').filter(|s| !s.is_empty()) {
- if let Some(long_name) = LongName::parse(pair, decoder).issue_warning(&decoder.warn) {
- names.push(long_name);
- }
- }
- LongNamesRecord(names)
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct ProductInfoRecord(pub String);
-
-impl ProductInfoRecord {
- fn decode(source: &TextRecord, decoder: &Decoder) -> Self {
- Self(decoder.decode(&source.text).into())
- }
-}
-#[derive(Clone, Debug)]
-pub struct VariableSet {
- pub name: String,
- pub vars: Vec<Identifier>,
-}
-
-impl VariableSet {
- fn parse(input: &str, decoder: &Decoder) -> Result<Self, Warning> {
- let (name, input) = input.split_once('=').ok_or(Warning::TBD)?;
- let mut vars = Vec::new();
- for var in input.split_ascii_whitespace() {
- if let Some(identifier) = decoder
- .new_identifier(var)
- .map_err(Warning::InvalidVariableSetName)
- .issue_warning(&decoder.warn)
- {
- vars.push(identifier);
- }
- }
- Ok(VariableSet {
- name: name.into(),
- vars,
- })
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct VariableSetRecord {
- pub offsets: Range<u64>,
- pub sets: Vec<VariableSet>,
-}
-
-impl VariableSetRecord {
- fn decode(source: &TextRecord, decoder: &Decoder) -> VariableSetRecord {
- let mut sets = Vec::new();
- let input = decoder.decode(&source.text);
- for line in input.lines() {
- if let Some(set) = VariableSet::parse(line, decoder).issue_warning(&decoder.warn) {
- sets.push(set)
- }
- }
- VariableSetRecord {
- offsets: source.offsets.clone(),
- sets,
- }
- }
-}
-
-trait IssueWarning<T> {
- fn issue_warning<F>(self, warn: &F) -> Option<T>
- where
- F: Fn(Warning);
-}
-impl<T> IssueWarning<T> for Result<T, Warning> {
- fn issue_warning<F>(self, warn: &F) -> Option<T>
- where
- F: Fn(Warning),
- {
- match self {
- Ok(result) => Some(result),
- Err(error) => {
- warn(error);
- None
- }
- }
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct Extension {
- pub offsets: Range<u64>,
-
- /// Record subtype.
- pub subtype: u32,
-
- /// Size of each data element.
- pub size: u32,
-
- /// Number of data elements.
- pub count: u32,
-
- /// `size * count` bytes of data.
- pub data: Vec<u8>,
-}
-
-impl Extension {
- fn check_size<E: ExtensionRecord>(&self) -> Result<(), Warning> {
- if let Some(expected_size) = E::SIZE {
- if self.size != expected_size {
- return Err(Warning::BadRecordSize {
- offset: self.offsets.start,
- record: E::NAME.into(),
- size: self.size,
- expected_size,
- });
- }
- }
- if let Some(expected_count) = E::COUNT {
- if self.count != expected_count {
- return Err(Warning::BadRecordCount {
- offset: self.offsets.start,
- record: E::NAME.into(),
- count: self.count,
- expected_count,
- });
- }
- }
- Ok(())
- }
-
- fn read<R: Read + Seek>(
- r: &mut R,
- endian: Endian,
- n_vars: usize,
- warn: &dyn Fn(Warning),
- ) -> Result<Option<Record>, Error> {
- let subtype = endian.parse(read_bytes(r)?);
- let header_offset = r.stream_position()?;
- let size: u32 = endian.parse(read_bytes(r)?);
- let count = endian.parse(read_bytes(r)?);
- let Some(product) = size.checked_mul(count) else {
- return Err(Error::ExtensionRecordTooLarge {
- offset: header_offset,
- subtype,
- size,
- count,
- });
- };
- let start_offset = r.stream_position()?;
- let data = read_vec(r, product as usize)?;
- let end_offset = start_offset + product as u64;
- let extension = Extension {
- offsets: start_offset..end_offset,
- subtype,
- size,
- count,
- data,
- };
- let result = match subtype {
- IntegerInfoRecord::SUBTYPE => IntegerInfoRecord::parse(&extension, endian),
- FloatInfoRecord::SUBTYPE => FloatInfoRecord::parse(&extension, endian),
- VarDisplayRecord::SUBTYPE => VarDisplayRecord::parse(&extension, n_vars, endian, warn),
- MultipleResponseRecord::SUBTYPE | 19 => {
- MultipleResponseRecord::parse(&extension, endian)
- }
- LongStringValueLabelRecord::SUBTYPE => {
- LongStringValueLabelRecord::parse(&extension, endian)
- }
- EncodingRecord::SUBTYPE => EncodingRecord::parse(&extension, endian),
- NumberOfCasesRecord::SUBTYPE => NumberOfCasesRecord::parse(&extension, endian),
- 5 => Ok(Record::Text(TextRecord::new(
- extension,
- TextRecordType::VariableSets,
- ))),
- 10 => Ok(Record::Text(TextRecord::new(
- extension,
- TextRecordType::ProductInfo,
- ))),
- 13 => Ok(Record::Text(TextRecord::new(
- extension,
- TextRecordType::LongNames,
- ))),
- 14 => Ok(Record::Text(TextRecord::new(
- extension,
- TextRecordType::VeryLongStrings,
- ))),
- 17 => Ok(Record::Text(TextRecord::new(
- extension,
- TextRecordType::FileAttributes,
- ))),
- 18 => Ok(Record::Text(TextRecord::new(
- extension,
- TextRecordType::VariableAttributes,
- ))),
- _ => Ok(Record::OtherExtension(extension)),
- };
- match result {
- Ok(result) => Ok(Some(result)),
- Err(error) => {
- warn(error);
- Ok(None)
- }
- }
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct ZHeader {
- /// File offset to the start of the record.
- pub offset: u64,
-
- /// File offset to the ZLIB data header.
- pub zheader_offset: u64,
-
- /// File offset to the ZLIB trailer.
- pub ztrailer_offset: u64,
-
- /// Length of the ZLIB trailer in bytes.
- pub ztrailer_len: u64,
-}
-
-impl ZHeader {
- fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<ZHeader, Error> {
- let offset = r.stream_position()?;
- let zheader_offset: u64 = endian.parse(read_bytes(r)?);
- let ztrailer_offset: u64 = endian.parse(read_bytes(r)?);
- let ztrailer_len: u64 = endian.parse(read_bytes(r)?);
-
- Ok(ZHeader {
- offset,
- zheader_offset,
- ztrailer_offset,
- ztrailer_len,
- })
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct ZTrailer {
- /// File offset to the start of the record.
- pub offset: u64,
-
- /// Compression bias as a negative integer, e.g. -100.
- pub int_bias: i64,
-
- /// Always observed as zero.
- pub zero: u64,
-
- /// Uncompressed size of each block, except possibly the last. Only
- /// `0x3ff000` has been observed so far.
- pub block_size: u32,
-
- /// Block descriptors, always `(ztrailer_len - 24) / 24)` of them.
- pub blocks: Vec<ZBlock>,
-}
-
-#[derive(Clone, Debug)]
-pub struct ZBlock {
- /// Offset of block of data if simple compression were used.
- pub uncompressed_ofs: u64,
-
- /// Actual offset within the file of the compressed data block.
- pub compressed_ofs: u64,
-
- /// The number of bytes in this data block after decompression. This is
- /// `block_size` in every data block but the last, which may be smaller.
- pub uncompressed_size: u32,
-
- /// The number of bytes in this data block, as stored compressed in this
- /// file.
- pub compressed_size: u32,
-}
-
-impl ZBlock {
- fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<ZBlock, Error> {
- Ok(ZBlock {
- uncompressed_ofs: endian.parse(read_bytes(r)?),
- compressed_ofs: endian.parse(read_bytes(r)?),
- uncompressed_size: endian.parse(read_bytes(r)?),
- compressed_size: endian.parse(read_bytes(r)?),
- })
- }
-}
-
-impl ZTrailer {
- fn read<R: Read + Seek>(
- reader: &mut R,
- endian: Endian,
- ztrailer_ofs: u64,
- ztrailer_len: u64,
- ) -> Result<Option<ZTrailer>, Error> {
- let start_offset = reader.stream_position()?;
- if reader.seek(SeekFrom::Start(ztrailer_ofs)).is_err() {
- return Ok(None);
- }
- let int_bias = endian.parse(read_bytes(reader)?);
- let zero = endian.parse(read_bytes(reader)?);
- let block_size = endian.parse(read_bytes(reader)?);
- let n_blocks: u32 = endian.parse(read_bytes(reader)?);
- let expected_n_blocks = (ztrailer_len - 24) / 24;
- if n_blocks as u64 != expected_n_blocks {
- return Err(Error::BadZlibTrailerNBlocks {
- offset: ztrailer_ofs,
- n_blocks,
- expected_n_blocks,
- ztrailer_len,
- });
- }
- let blocks = (0..n_blocks)
- .map(|_| ZBlock::read(reader, endian))
- .collect::<Result<Vec<_>, _>>()?;
- reader.seek(SeekFrom::Start(start_offset))?;
- Ok(Some(ZTrailer {
- offset: ztrailer_ofs,
- int_bias,
- zero,
- block_size,
- blocks,
- }))
- }
-}
-
-fn try_read_bytes<const N: usize, R: Read>(r: &mut R) -> Result<Option<[u8; N]>, IoError> {
- let mut buf = [0; N];
- let n = r.read(&mut buf)?;
- if n > 0 {
- if n < N {
- r.read_exact(&mut buf[n..])?;
- }
- Ok(Some(buf))
- } else {
- Ok(None)
- }
-}
-
-fn read_bytes<const N: usize, R: Read>(r: &mut R) -> Result<[u8; N], IoError> {
- let mut buf = [0; N];
- r.read_exact(&mut buf)?;
- Ok(buf)
-}
-
-fn read_vec<R: Read>(r: &mut R, n: usize) -> Result<Vec<u8>, IoError> {
- let mut vec = vec![0; n];
- r.read_exact(&mut vec)?;
- Ok(vec)
-}
-
-fn read_string<R: Read>(r: &mut R, endian: Endian) -> Result<RawString, IoError> {
- let length: u32 = endian.parse(read_bytes(r)?);
- Ok(read_vec(r, length as usize)?.into())
-}
-
-#[derive(Clone, Debug)]
-pub struct LongStringValueLabels<N, S>
-where
- S: Debug,
-{
- pub var_name: N,
- pub width: u32,
-
- /// `(value, label)` pairs, where each value is `width` bytes.
- pub labels: Vec<(S, S)>,
-}
-
-impl LongStringValueLabels<RawString, RawString> {
- fn decode(
- &self,
- decoder: &Decoder,
- ) -> Result<LongStringValueLabels<Identifier, String>, Warning> {
- let var_name = decoder.decode(&self.var_name);
- let var_name = Identifier::from_encoding(var_name.trim_end(), decoder.encoding)
- .map_err(Warning::InvalidLongStringValueLabelName)?;
-
- let mut labels = Vec::with_capacity(self.labels.len());
- for (value, label) in self.labels.iter() {
- let value = decoder.decode_exact_length(&value.0).to_string();
- let label = decoder.decode(label).to_string();
- labels.push((value, label));
- }
-
- Ok(LongStringValueLabels {
- var_name,
- width: self.width,
- labels,
- })
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct LongStringValueLabelRecord<N, S>(pub Vec<LongStringValueLabels<N, S>>)
-where
- N: Debug,
- S: Debug;
-
-impl ExtensionRecord for LongStringValueLabelRecord<RawString, RawString> {
- const SUBTYPE: u32 = 21;
- const SIZE: Option<u32> = Some(1);
- const COUNT: Option<u32> = None;
- const NAME: &'static str = "long string value labels record";
-
- fn parse(ext: &Extension, endian: Endian) -> Result<Record, Warning> {
- ext.check_size::<Self>()?;
-
- let mut input = &ext.data[..];
- let mut label_set = Vec::new();
- while !input.is_empty() {
- let var_name = read_string(&mut input, endian)?;
- let width: u32 = endian.parse(read_bytes(&mut input)?);
- let n_labels: u32 = endian.parse(read_bytes(&mut input)?);
- let mut labels = Vec::new();
- for _ in 0..n_labels {
- let value = read_string(&mut input, endian)?;
- let label = read_string(&mut input, endian)?;
- labels.push((value, label));
- }
- label_set.push(LongStringValueLabels {
- var_name,
- width,
- labels,
- })
- }
- Ok(Record::LongStringValueLabels(LongStringValueLabelRecord(
- label_set,
- )))
- }
-}
-
-impl LongStringValueLabelRecord<RawString, RawString> {
- fn decode(self, decoder: &Decoder) -> LongStringValueLabelRecord<Identifier, String> {
- let mut labels = Vec::with_capacity(self.0.len());
- for label in &self.0 {
- match label.decode(decoder) {
- Ok(set) => labels.push(set),
- Err(error) => decoder.warn(error),
- }
- }
- LongStringValueLabelRecord(labels)
- }
-}
+++ /dev/null
-use float_next_after::NextAfter;
-use num::{Bounded, Zero};
-use ordered_float::OrderedFloat;
-use std::{
- collections::{hash_map::Entry, HashMap},
- error::Error as StdError,
- fmt::{Display, Formatter, Result as FmtResult},
- iter::repeat,
-};
-
-use crate::endian::{Endian, ToBytes};
-
-pub type Result<T, F = Error> = std::result::Result<T, F>;
-
-#[derive(Debug)]
-pub struct Error {
- pub file_name: Option<String>,
- pub line_number: Option<usize>,
- pub token: Option<String>,
- pub message: String,
-}
-
-impl Error {
- fn new(
- file_name: Option<&str>,
- line_number: Option<usize>,
- token: Option<&str>,
- message: String,
- ) -> Error {
- Error {
- file_name: file_name.map(String::from),
- line_number,
- token: token.map(String::from),
- message,
- }
- }
-}
-
-impl StdError for Error {}
-
-impl Display for Error {
- fn fmt(&self, f: &mut Formatter) -> FmtResult {
- match (self.file_name.as_ref(), self.line_number) {
- (Some(ref file_name), Some(line_number)) => write!(f, "{file_name}:{line_number}: ")?,
- (Some(ref file_name), None) => write!(f, "{file_name}: ")?,
- (None, Some(line_number)) => write!(f, "line {line_number}: ")?,
- (None, None) => (),
- }
- if let Some(ref token) = self.token {
- write!(f, "at '{token}': ")?;
- }
- write!(f, "{}", self.message)
- }
-}
-
-pub fn sack(input: &str, input_file_name: Option<&str>, endian: Endian) -> Result<Vec<u8>> {
- let mut symbol_table = HashMap::new();
- let output = _sack(input, input_file_name, endian, &mut symbol_table)?;
- let output = if !symbol_table.is_empty() {
- for (k, v) in symbol_table.iter() {
- println!("{k} => {v:?}");
- }
- for (k, v) in symbol_table.iter() {
- if v.is_none() {
- Err(Error::new(
- input_file_name,
- None,
- None,
- format!("label {k} used but never defined"),
- ))?
- }
- }
- _sack(input, input_file_name, endian, &mut symbol_table)?
- } else {
- output
- };
- Ok(output)
-}
-
-fn _sack(
- input: &str,
- input_file_name: Option<&str>,
- endian: Endian,
- symbol_table: &mut HashMap<String, Option<u32>>,
-) -> Result<Vec<u8>> {
- let mut lexer = Lexer::new(input, input_file_name, endian)?;
- let mut output = Vec::new();
- while parse_data_item(&mut lexer, &mut output, symbol_table)? {}
- Ok(output)
-}
-
-fn parse_data_item(
- lexer: &mut Lexer,
- output: &mut Vec<u8>,
- symbol_table: &mut HashMap<String, Option<u32>>,
-) -> Result<bool> {
- if lexer.token.is_none() {
- return Ok(false);
- };
-
- let initial_len = output.len();
- match lexer.take()? {
- Token::Integer(integer) => {
- if let Ok(integer) = TryInto::<i32>::try_into(integer) {
- output.extend_from_slice(&lexer.endian.to_bytes(integer));
- } else if let Ok(integer) = TryInto::<u32>::try_into(integer) {
- output.extend_from_slice(&lexer.endian.to_bytes(integer));
- } else {
- Err(lexer.error(format!(
- "{integer} is not in the valid range [{},{}]",
- i32::min_value(),
- u32::max_value()
- )))?;
- };
- }
- Token::Float(float) => output.extend_from_slice(&lexer.endian.to_bytes(float.0)),
- Token::PcSysmis => {
- output.extend_from_slice(&[0xf5, 0x1e, 0x26, 0x02, 0x8a, 0x8c, 0xed, 0xff])
- }
- Token::I8 => put_integers::<u8, 1>(lexer, "i8", output)?,
- Token::I16 => put_integers::<u16, 2>(lexer, "i16", output)?,
- Token::I64 => put_integers::<i64, 8>(lexer, "i64", output)?,
- Token::String(string) => output.extend_from_slice(string.as_bytes()),
- Token::S(size) => {
- let Some((Token::String(ref string), _)) = lexer.token else {
- Err(lexer.error(format!("string expected after 's{size}'")))?
- };
- let len = string.len();
- if len > size {
- Err(lexer.error(format!(
- "{len}-byte string is longer than pad length {size}"
- )))?
- }
- output.extend_from_slice(string.as_bytes());
- output.extend(repeat(b' ').take(size - len));
- lexer.get()?;
- }
- Token::LParen => {
- while !matches!(lexer.token, Some((Token::RParen, _))) {
- parse_data_item(lexer, output, symbol_table)?;
- }
- lexer.get()?;
- }
- Token::Count => put_counted_items::<u32, 4>(lexer, "COUNT", output, symbol_table)?,
- Token::Count8 => put_counted_items::<u8, 1>(lexer, "COUNT8", output, symbol_table)?,
- Token::Hex => {
- let Some((Token::String(ref string), _)) = lexer.token else {
- Err(lexer.error(String::from("string expected after 'hex'")))?
- };
- let mut string = &string[..];
- loop {
- string = string.trim_start();
- if string.is_empty() {
- break;
- };
-
- let mut i = string.chars();
- let Some(c0) = i.next() else { return Ok(true) };
- let Some(c1) = i.next() else {
- Err(lexer.error(String::from("hex string has odd number of characters")))?
- };
-
- let (Some(digit0), Some(digit1)) = (c0.to_digit(16), c1.to_digit(16)) else {
- Err(lexer.error(String::from("invalid digit in hex string")))?
- };
- let byte = digit0 * 16 + digit1;
- output.push(byte as u8);
-
- string = i.as_str();
- }
- lexer.get()?;
- }
- Token::Label(name) => {
- println!("define {name}");
- let value = output.len() as u32;
- match symbol_table.entry(name.clone()) {
- Entry::Vacant(v) => {
- v.insert(Some(value));
- }
- Entry::Occupied(mut o) => {
- match o.get() {
- Some(v) => {
- if *v != value {
- Err(lexer.error(format!("{name}: can't redefine label for offset {:#x} with offset {:#x}", *v, value)))?
- }
- }
- None => drop(o.insert(Some(value))),
- }
- }
- };
- return Ok(true);
- }
- Token::At(name) => {
- let mut value = *symbol_table.entry(name.clone()).or_insert(None);
- loop {
- let plus = match lexer.token {
- Some((Token::Plus, _)) => true,
- Some((Token::Minus, _)) => false,
- _ => break,
- };
- lexer.get()?;
-
- let operand = match lexer.token {
- Some((Token::At(ref name), _)) => {
- *symbol_table.entry(name.clone()).or_insert(None)
- }
- Some((Token::Integer(integer), _)) => Some(
- integer
- .try_into()
- .map_err(|msg| lexer.error(format!("bad offset literal ({msg})")))?,
- ),
- _ => Err(lexer.error(String::from("expecting @label or integer literal")))?,
- };
- lexer.get()?;
-
- value = match (value, operand) {
- (Some(a), Some(b)) => Some(
- if plus {
- a.checked_add(b)
- } else {
- a.checked_sub(b)
- }
- .ok_or_else(|| {
- lexer.error(String::from("overflow in offset arithmetic"))
- })?,
- ),
- _ => None,
- };
- }
- let value = value.unwrap_or(0);
- output.extend_from_slice(&lexer.endian.to_bytes(value));
- }
- _ => (),
- };
- if let Some((Token::Asterisk, _)) = lexer.token {
- lexer.get()?;
- let Token::Integer(count) = lexer.take()? else {
- Err(lexer.error(String::from("positive integer expected after '*'")))?
- };
- if count < 1 {
- Err(lexer.error(String::from("positive integer expected after '*'")))?
- };
- let final_len = output.len();
- for _ in 1..count {
- output.extend_from_within(initial_len..final_len);
- }
- }
- match lexer.token {
- Some((Token::Semicolon, _)) => {
- lexer.get()?;
- }
- Some((Token::RParen, _)) => (),
- _ => Err(lexer.error(String::from("';' expected")))?,
- }
- Ok(true)
-}
-
-fn put_counted_items<T, const N: usize>(
- lexer: &mut Lexer,
- name: &str,
- output: &mut Vec<u8>,
- symbol_table: &mut HashMap<String, Option<u32>>,
-) -> Result<()>
-where
- T: Zero + TryFrom<usize>,
- Endian: ToBytes<T, N>,
-{
- let old_size = output.len();
- output.extend_from_slice(&lexer.endian.to_bytes(T::zero()));
- let start = output.len();
- if !matches!(lexer.token, Some((Token::LParen, _))) {
- Err(lexer.error(format!("'(' expected after '{name}'")))?
- }
- lexer.get()?;
- while !matches!(lexer.token, Some((Token::RParen, _))) {
- parse_data_item(lexer, output, symbol_table)?;
- }
- lexer.get()?;
- let delta = output.len() - start;
- let Ok(delta): Result<T, _> = delta.try_into() else {
- Err(lexer.error(format!("{delta} bytes is too much for '{name}'")))?
- };
- let dest = &mut output[old_size..old_size + N];
- dest.copy_from_slice(&lexer.endian.to_bytes(delta));
- Ok(())
-}
-
-fn put_integers<T, const N: usize>(
- lexer: &mut Lexer,
- name: &str,
- output: &mut Vec<u8>,
-) -> Result<()>
-where
- T: Bounded + Display + TryFrom<i64> + Copy,
- Endian: ToBytes<T, N>,
-{
- println!("put_integers {:?}", lexer.token);
- let mut n = 0;
- while let Some(integer) = lexer.take_if(|t| match t {
- Token::Integer(integer) => Some(*integer),
- _ => None,
- })? {
- println!("got integer {integer}");
- let Ok(integer) = integer.try_into() else {
- Err(lexer.error(format!(
- "{integer} is not in the valid range [{},{}]",
- T::min_value(),
- T::max_value()
- )))?
- };
- output.extend_from_slice(&lexer.endian.to_bytes(integer));
- n += 1;
- }
- println!("put_integers {:?} {n}", lexer.token);
- if n == 0 {
- Err(lexer.error(format!("integer expected after '{name}'")))?
- }
- Ok(())
-}
-
-#[derive(PartialEq, Eq, Clone, Debug)]
-enum Token {
- Integer(i64),
- Float(OrderedFloat<f64>),
- PcSysmis,
- String(String),
- Semicolon,
- Asterisk,
- LParen,
- RParen,
- I8,
- I16,
- I64,
- S(usize),
- Count,
- Count8,
- Hex,
- Label(String),
- At(String),
- Minus,
- Plus,
-}
-
-struct Lexer<'a> {
- input: &'a str,
- token: Option<(Token, &'a str)>,
- input_file_name: Option<&'a str>,
- line_number: usize,
- endian: Endian,
-}
-
-fn skip_comments(mut s: &str) -> (&str, usize) {
- let mut n_newlines = 0;
- let s = loop {
- s = s.trim_start_matches([' ', '\t', '\r', '<', '>']);
- if let Some(remainder) = s.strip_prefix('#') {
- let Some((_, remainder)) = remainder.split_once('\n') else {
- break "";
- };
- s = remainder;
- n_newlines += 1;
- } else if let Some(remainder) = s.strip_prefix('\n') {
- s = remainder;
- n_newlines += 1;
- } else {
- break s;
- }
- };
- (s, n_newlines)
-}
-
-impl<'a> Lexer<'a> {
- fn new(input: &'a str, input_file_name: Option<&'a str>, endian: Endian) -> Result<Lexer<'a>> {
- let mut lexer = Lexer {
- input,
- token: None,
- input_file_name,
- line_number: 1,
- endian,
- };
- lexer.token = lexer.next()?;
- Ok(lexer)
- }
- fn error(&self, message: String) -> Error {
- let repr = self.token.as_ref().map(|(_, repr)| *repr);
- Error::new(self.input_file_name, Some(self.line_number), repr, message)
- }
- fn take(&mut self) -> Result<Token> {
- let Some(token) = self.token.take() else {
- Err(self.error(String::from("unexpected end of input")))?
- };
- self.token = self.next()?;
- Ok(token.0)
- }
- fn take_if<F, T>(&mut self, condition: F) -> Result<Option<T>>
- where
- F: FnOnce(&Token) -> Option<T>,
- {
- let Some(ref token) = self.token else {
- return Ok(None);
- };
- match condition(&token.0) {
- Some(value) => {
- self.token = self.next()?;
- Ok(Some(value))
- }
- None => Ok(None),
- }
- }
- fn get(&mut self) -> Result<Option<&Token>> {
- if self.token.is_none() {
- Err(self.error(String::from("unexpected end of input")))?
- } else {
- self.token = self.next()?;
- match self.token {
- Some((ref token, _)) => Ok(Some(token)),
- None => Ok(None),
- }
- }
- }
-
- fn next(&mut self) -> Result<Option<(Token, &'a str)>> {
- // Get the first character of the token, skipping past white space and
- // comments.
- let (s, n_newlines) = skip_comments(self.input);
- self.line_number += n_newlines;
- self.input = s;
-
- let start = s;
- let mut iter = s.chars();
- let Some(c) = iter.next() else {
- return Ok(None);
- };
- let (token, rest) = match c {
- c if c.is_ascii_digit() || c == '-' => {
- let len = s
- .find(|c: char| {
- !(c.is_ascii_digit() || c.is_alphabetic() || c == '.' || c == '-')
- })
- .unwrap_or(s.len());
- let (number, rest) = s.split_at(len);
- let token = if number == "-" {
- Token::Minus
- } else if let Some(digits) = number.strip_prefix("0x") {
- Token::Integer(i64::from_str_radix(digits, 16).map_err(|msg| {
- self.error(format!("bad integer literal '{number}' ({msg})"))
- })?)
- } else if !number.contains('.') {
- Token::Integer(number.parse().map_err(|msg| {
- self.error(format!("bad integer literal '{number}' ({msg})"))
- })?)
- } else {
- Token::Float(number.parse().map_err(|msg| {
- self.error(format!("bad float literal '{number}' ({msg})"))
- })?)
- };
- (token, rest)
- }
- '"' => {
- let s = iter.as_str();
- let Some(len) = s.find(['\n', '"']) else {
- Err(self.error(String::from("end-of-file inside string")))?
- };
- let (string, rest) = s.split_at(len);
- let Some(rest) = rest.strip_prefix('"') else {
- Err(self.error(format!("new-line inside string ({string}...{rest})")))?
- };
- (Token::String(string.into()), rest)
- }
- ';' => (Token::Semicolon, iter.as_str()),
- '*' => (Token::Asterisk, iter.as_str()),
- '+' => (Token::Plus, iter.as_str()),
- '(' => (Token::LParen, iter.as_str()),
- ')' => (Token::RParen, iter.as_str()),
- c if c.is_alphabetic() || c == '@' || c == '_' => {
- let len = s
- .find(|c: char| {
- !(c.is_ascii_digit()
- || c.is_alphabetic()
- || c == '@'
- || c == '.'
- || c == '_')
- })
- .unwrap_or(s.len());
- let (s, rest) = s.split_at(len);
- if let Some(rest) = rest.strip_prefix(':') {
- (Token::Label(s.into()), rest)
- } else if let Some(name) = s.strip_prefix('@') {
- (Token::At(name.into()), rest)
- } else if let Some(count) = s.strip_prefix('s') {
- let token =
- Token::S(count.parse().map_err(|msg| {
- self.error(format!("bad counted string '{s}' ({msg})"))
- })?);
- (token, rest)
- } else {
- let token = match s {
- "i8" => Token::I8,
- "i16" => Token::I16,
- "i64" => Token::I64,
- "SYSMIS" => Token::Float(OrderedFloat(-f64::MAX)),
- "PCSYSMIS" => Token::PcSysmis,
- "LOWEST" => Token::Float((-f64::MAX).next_after(0.0).into()),
- "HIGHEST" => Token::Float(f64::MAX.into()),
- "ENDIAN" => Token::Integer(if self.endian == Endian::Big { 1 } else { 2 }),
- "COUNT" => Token::Count,
- "COUNT8" => Token::Count8,
- "hex" => Token::Hex,
- _ => Err(self.error(format!("invalid token '{s}'")))?,
- };
- (token, rest)
- }
- }
- _ => Err(self.error(format!("invalid input byte '{c}'")))?,
- };
- self.input = rest;
- let repr = &start[..start.len() - rest.len()];
- println!("{token:?} {repr}");
- Ok(Some((token, repr)))
- }
-}
-
-#[cfg(test)]
-mod test {
- use crate::endian::Endian;
- use crate::sack::sack;
- use anyhow::Result;
- use hexplay::HexView;
-
- #[test]
- fn basic_sack() -> Result<()> {
- let input = r#"
-"$FL2"; s60 "$(#) SPSS DATA FILE PSPP synthetic test file";
-2; # Layout code
-28; # Nominal case size
-0; # Not compressed
-0; # Not weighted
-1; # 1 case.
-100.0; # Bias.
-"01 Jan 11"; "20:53:52";
-"PSPP synthetic test file: "; i8 244; i8 245; i8 246; i8 248; s34 "";
-i8 0 *3;
-"#;
- let output = sack(input, None, Endian::Big)?;
- HexView::new(&output).print()?;
- Ok(())
- }
-
- #[test]
- fn pcp_sack() -> Result<()> {
- let input = r#"
-# File header.
-2; 0;
-@MAIN; @MAIN_END - @MAIN;
-@VARS; @VARS_END - @VARS;
-@LABELS; @LABELS_END - @LABELS;
-@DATA; @DATA_END - @DATA;
-(0; 0) * 11;
-i8 0 * 128;
-
-MAIN:
- i16 1; # Fixed.
- s62 "PCSPSS PSPP synthetic test product";
- PCSYSMIS;
- 0; 0; i16 1; # Fixed.
- i16 0;
- i16 15;
- 1;
- i16 0; # Fixed.
- 1;
- s8 "11/28/14";
- s8 "15:11:00";
- s64 "PSPP synthetic test file";
-MAIN_END:
-
-VARS:
- 0; 0; 0; 0x050800; s8 "$CASENUM"; PCSYSMIS;
- 0; 0; 0; 0x010800; s8 "$DATE"; PCSYSMIS;
- 0; 0; 0; 0x050802; s8 "$WEIGHT"; PCSYSMIS;
-
- # Numeric variable, no label or missing values.
- 0; 0; 0; 0x050800; s8 "NUM1"; PCSYSMIS;
-
- # Numeric variable, variable label.
- 0; 0; @NUM2_LABEL - @LABELS_OFS; 0x050800; s8 "NUM2"; PCSYSMIS;
-
- # Numeric variable with missing value.
- 0; 0; 0; 0x050800; s8 "NUM3"; 1.0;
-
- # Numeric variable, variable label and missing value.
- 0; 0; @NUM4_LABEL - @LABELS_OFS; 0x050800; s8 "NUM4"; 2.0;
-
- # String variable, no label or missing values.
- 0; 0; 0; 0x010800; s8 "STR1"; PCSYSMIS;
-
- # String variable, variable label.
- 0; 0; @STR2_LABEL - @LABELS_OFS; 0x010400; s8 "STR2"; PCSYSMIS;
-
- # String variable with missing value.
- 0; 0; 0; 0x010500; s8 "STR3"; s8 "MISS";
-
- # String variable, variable label and missing value.
- 0; 0; @STR4_LABEL - @LABELS_OFS; 0x010100; s8 "STR4"; s8 "OTHR";
-
- # Long string variable
- 0; 0; 0; 0x010b00; s8 "STR5"; PCSYSMIS;
- 0 * 8;
-
- # Long string variable with variable label
- 0; 0; @STR6_LABEL - @LABELS_OFS; 0x010b00; s8 "STR6"; PCSYSMIS;
- 0 * 8;
-VARS_END:
-
-LABELS:
- 3; i8 0 0 0; LABELS_OFS: i8 0;
- NUM2_LABEL: COUNT8("Numeric variable 2's label");
- NUM4_LABEL: COUNT8("Another numeric variable label");
- STR2_LABEL: COUNT8("STR2's variable label");
- STR4_LABEL: COUNT8("STR4's variable label");
- STR6_LABEL: COUNT8("Another string variable's label");
-LABELS_END:
-
-DATA:
- 0.0; "11/28/14"; 1.0;
- 0.0; 1.0; 2.0; PCSYSMIS; s8 "abcdefgh"; s8 "ijkl"; s8 "mnopq"; s8 "r";
- s16 "stuvwxyzAB"; s16 "CDEFGHIJKLM";
-DATA_END:
-"#;
- let output = sack(input, None, Endian::Big)?;
- HexView::new(&output).print()?;
- Ok(())
- }
-}
+++ /dev/null
-use std::sync::OnceLock;
-
-use enum_map::EnumMap;
-
-use crate::{
- endian::Endian,
- format::{Format, Settings as FormatSettings},
- message::Severity,
-};
-
-pub struct Settings {
- pub input_integer_format: Endian,
- pub input_float_format: Endian,
- pub output_integer_format: Endian,
- pub output_float_format: Endian,
-
- /// `MDISPLAY`: how to display matrices in `MATRIX`...`END MATRIX`.
- pub matrix_display: MatrixDisplay,
-
- pub view_length: usize,
- pub view_width: usize,
- pub safer: bool,
- pub include: bool,
- pub route_errors_to_terminal: bool,
- pub route_errors_to_listing: bool,
- pub scompress: bool,
- pub undefined: bool,
- pub blanks: Option<f64>,
- pub max_messages: EnumMap<Severity, usize>,
- pub printback: bool,
- pub macros: MacroSettings,
- pub max_loops: usize,
- pub workspace: usize,
- pub default_format: Format,
- pub testing: bool,
- pub fuzz_bits: usize,
- pub scale_min: usize,
- pub commands: Compatibility,
- pub global: Compatibility,
- pub syntax: Compatibility,
- pub formats: FormatSettings,
- pub small: f64,
-}
-
-impl Default for Settings {
- fn default() -> Self {
- Self {
- input_integer_format: Endian::NATIVE,
- input_float_format: Endian::NATIVE,
- output_integer_format: Endian::NATIVE,
- output_float_format: Endian::NATIVE,
- matrix_display: MatrixDisplay::default(),
- view_length: 24,
- view_width: 79,
- safer: false,
- include: true,
- route_errors_to_terminal: true,
- route_errors_to_listing: true,
- scompress: true,
- undefined: true,
- blanks: None,
- max_messages: EnumMap::from_fn(|_| 100),
- printback: true,
- macros: MacroSettings::default(),
- max_loops: 40,
- workspace: 64 * 1024 * 1024,
- default_format: Format::F8_2,
- testing: false,
- fuzz_bits: 6,
- scale_min: 24,
- commands: Compatibility::Enhanced,
- global: Compatibility::Enhanced,
- syntax: Compatibility::Enhanced,
- formats: FormatSettings::default(),
- small: 0.0001,
- }
- }
-}
-
-impl Settings {
- pub fn global() -> &'static Settings {
- static GLOBAL: OnceLock<Settings> = OnceLock::new();
- &GLOBAL.get_or_init(|| Settings::default())
- }
-}
-
-pub enum Compatibility {
- Compatible,
- Enhanced,
-}
-
-pub struct MacroSettings {
- /// Expand macros?
- pub expand: bool,
-
- /// Print macro expansions?
- pub print_expansions: bool,
-
- /// Maximum iterations of `!FOR`.
- pub max_iterations: usize,
-
- /// Maximum nested macro expansion levels.
- pub max_nest: usize,
-}
-
-impl Default for MacroSettings {
- fn default() -> Self {
- Self {
- expand: true,
- print_expansions: false,
- max_iterations: 1000,
- max_nest: 50,
- }
- }
-}
-
-/// How to display matrices in `MATRIX`...`END MATRIX`.
-#[derive(Default)]
-pub enum MatrixDisplay {
- /// Output matrices as text.
- #[default]
- Text,
-
- /// Output matrices as pivot tables.
- Tables,
-}
-
-pub enum OutputType {
- /// Errors and warnings.
- Error,
-
- /// Notes.
- Notes,
-
- /// Syntax printback.
- Syntax,
-
- /// Everything else.
- Other,
-}
+++ /dev/null
-use std::fs::read_to_string;
-use std::path::PathBuf;
-
-use anyhow::{anyhow, Result};
-use clap::Parser;
-use pspp::endian::Endian;
-use pspp::sack::sack;
-
-/// SAv Construction Kit
-///
-/// The input is a sequence of data items, each followed by a semicolon. Each
-/// data item is converted to the output format and written on stdout. A data
-/// item is one of the following:
-///
-/// - An integer in decimal, in hexadecimal prefixed by `0x`, or in octal
-/// prefixed by `0`. Output as a 32-bit binary integer.
-///
-/// - A floating-point number. Output in 64-bit IEEE 754 format.
-///
-/// - A string enclosed in double quotes. Output literally. There is no
-/// syntax for "escapes". Strings may not contain new-lines.
-///
-/// - A literal of the form `s<number>` followed by a quoted string as above.
-/// Output as the string's contents followed by enough spaces to fill up
-/// `<number>` bytes. For example, `s8 "foo"` is output as `foo` followed
-/// by 5 spaces.
-///
-/// - The literal `i8`, `i16`, or `i64` followed by an integer. Output
-/// as a binary integer with the specified number of bits.
-///
-/// - One of the literals `SYSMIS`, `LOWEST`, or `HIGHEST`. Output as a
-/// 64-bit IEEE 754 float of the appropriate PSPP value.
-///
-/// - `PCSYSMIS`. Output as SPSS/PC+ system-missing value.
-///
-/// - The literal `ENDIAN`. Output as a 32-bit binary integer, either with
-/// value 1 if `--be` is in effect or 2 if `--le` is in effect.
-///
-/// - A pair of parentheses enclosing a sequence of data items, each followed
-/// by a semicolon (the last semicolon is optional). Output as the enclosed
-/// data items in sequence.
-///
-/// - The literal `COUNT` or `COUNT8` followed by a sequence of parenthesized
-/// data items, as above. Output as a 32-bit or 8-bit binary integer whose
-/// value is the number of bytes enclosed within the parentheses, followed
-/// by the enclosed data items themselves.
-///
-/// optionally followed by an asterisk and a positive integer, which specifies a
-/// repeat count for the data item.
-#[derive(Parser, Debug)]
-struct Args {
- /// Big-endian output format (default)
- #[arg(long = "be")]
- be: bool,
-
- /// Little-endian output format
- #[arg(long = "le")]
- le: bool,
-
- /// Input file.
- #[arg(required = true, name = "input")]
- input_file_name: PathBuf,
-
- /// Output file.
- #[arg(required = true, name = "output")]
- output_file_name: PathBuf,
-}
-
-fn main() -> Result<()> {
- let Args {
- be,
- le,
- input_file_name,
- output_file_name,
- } = Args::parse();
- let endian = match (be, le) {
- (false, false) | (true, false) => Endian::Big,
- (false, true) => Endian::Little,
- (true, true) => return Err(anyhow!("can't use both `--be` and `--le`")),
- };
-
- let input_file_str = input_file_name.to_string_lossy();
- let input = read_to_string(&input_file_name)
- .map_err(|err| anyhow!("{input_file_str}: read failed ({err})"))?;
-
- let output = sack(&input, Some(&input_file_str), endian)?;
-
- let output_file_str = output_file_name.to_string_lossy();
- std::fs::write(&output_file_name, output)
- .map_err(|err| anyhow!("{output_file_str}: write failed ({err})"))?;
-
- Ok(())
-}