From: Ben Pfaff Date: Tue, 20 Aug 2024 04:31:52 +0000 (-0700) Subject: Start working on lsp server. X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=fdef55ea64bcde709efdf85e9064246a901a5b59;p=pspp Start working on lsp server. --- diff --git a/rust/Cargo.lock b/rust/Cargo.lock index 2c9fed4fa1..3d7a9ebc7d 100644 --- a/rust/Cargo.lock +++ b/rust/Cargo.lock @@ -2,12 +2,30 @@ # It is not intended for manual editing. version = 3 +[[package]] +name = "addr2line" +version = "0.22.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e4503c46a5c0c7844e948c9a4d6acd9f50cccb4de1c48eb9e291ea17470c678" +dependencies = [ + "gimli", +] + [[package]] name = "adler" version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" +[[package]] +name = "aho-corasick" +version = "1.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" +dependencies = [ + "memchr", +] + [[package]] name = "android-tzdata" version = "0.1.1" @@ -23,11 +41,71 @@ dependencies = [ "libc", ] +[[package]] +name = "anstream" +version = "0.6.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "64e15c1ab1f89faffbf04a634d5e1962e9074f2741eef6d97f3c4e322426d526" +dependencies = [ + "anstyle", + "anstyle-parse", + "anstyle-query", + "anstyle-wincon", + "colorchoice", + "is_terminal_polyfill", + "utf8parse", +] + +[[package]] +name = "anstyle" +version = "1.0.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1bec1de6f59aedf83baf9ff929c98f2ad654b97c9510f4e70cf6f661d49fd5b1" + +[[package]] +name = "anstyle-parse" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eb47de1e80c2b463c735db5b217a0ddc39d612e7ac9e2e96a5aed1f57616c1cb" +dependencies = [ + "utf8parse", +] + 
+[[package]] +name = "anstyle-query" +version = "1.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d36fc52c7f6c869915e99412912f22093507da8d9e942ceaf66fe4b7c14422a" +dependencies = [ + "windows-sys 0.52.0", +] + +[[package]] +name = "anstyle-wincon" +version = "3.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5bf74e1b6e971609db8ca7a9ce79fd5768ab6ae46441c572e46cf596f59e57f8" +dependencies = [ + "anstyle", + "windows-sys 0.52.0", +] + [[package]] name = "anyhow" -version = "1.0.69" +version = "1.0.86" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "224afbd727c3d6e4b90103ece64b8d1b67fbb1973b1046c2281eed3f3803f800" +checksum = "b3d1d046238990b9cf5bcde22a3fb3584ee5cf65fb2765f454ed428c7a0063da" + +[[package]] +name = "async-trait" +version = "0.1.81" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e0c28dcc82d7c8ead5cb13beb15405b57b8546e93215673ff8ca0349a028107" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] [[package]] name = "atty" @@ -40,11 +118,37 @@ dependencies = [ "winapi", ] +[[package]] +name = "auto_impl" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c87f3f15e7794432337fc718554eaa4dc8f04c9677a950ffe366f20a162ae42" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "autocfg" -version = "1.1.0" +version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" +checksum = "0c4b4d0bd25bd0b74681c0ad21497610ce1b7c91b1022cd21c80c6fbdd9476b0" + +[[package]] +name = "backtrace" +version = "0.3.73" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5cc23269a4f8976d0a4d2e7109211a419fe30e8d88d677cd60b6bc79c5732e0a" +dependencies = [ + "addr2line", + "cc", + "cfg-if", + "libc", + "miniz_oxide", + "object", + "rustc-demangle", +] 
[[package]] name = "bitflags" @@ -54,21 +158,30 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" [[package]] name = "bitflags" -version = "2.5.0" +version = "2.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cf4b9d6a944f767f8e5e0db018570623c85f3d925ac718db4e06d0187adb21c1" +checksum = "b048fb63fd8b5923fc5aa7b340d8e156aec7ec02f0c78fa8a6ddc2613f6f71de" [[package]] name = "bumpalo" -version = "3.13.0" +version = "3.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c" + +[[package]] +name = "bytes" +version = "1.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a3e2c3daef883ecc1b5d58c15adae93470a91d425f3532ba1695849656af3fc1" +checksum = "8318a53db07bb3f8dca91a600466bdb3f2eaadeedfdbcf02e1accbad9271ba50" [[package]] name = "cc" -version = "1.0.79" +version = "1.1.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "50d30906286121d95be3d479533b458f87493b30a4b5f79a607db8f5d11aa91f" +checksum = "72db2f7947ecee9b03b510377e8bb9077afa27176fdbff55c51027e976fdcc48" +dependencies = [ + "shlex", +] [[package]] name = "cfg-if" @@ -89,72 +202,93 @@ dependencies = [ [[package]] name = "chrono" -version = "0.4.26" +version = "0.4.38" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec837a71355b28f6556dbd569b37b3f363091c0bd4b2e735674521b4c5fd9bc5" +checksum = "a21f936df1771bf62b77f047b726c4625ff2e8aa607c01ec06e5a05bd8463401" dependencies = [ "android-tzdata", "iana-time-zone", "js-sys", "num-traits", - "time", "wasm-bindgen", - "winapi", + "windows-targets 0.52.6", ] [[package]] name = "clap" -version = "4.1.7" +version = "4.5.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2f3061d6db6d8fcbbd4b05e057f2acace52e64e96b498c08c2d7a4e65addd340" +checksum = 
"ed6719fffa43d0d87e5fd8caeab59be1554fb028cd30edc88fc4369b17971019" dependencies = [ - "bitflags 1.3.2", + "clap_builder", "clap_derive", +] + +[[package]] +name = "clap_builder" +version = "4.5.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "216aec2b177652e3846684cbfe25c9964d18ec45234f0f5da5157b207ed1aab6" +dependencies = [ + "anstream", + "anstyle", "clap_lex", - "is-terminal", - "once_cell", "strsim", - "termcolor 1.2.0", "terminal_size", ] [[package]] name = "clap_derive" -version = "4.1.7" +version = "4.5.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "34d122164198950ba84a918270a3bb3f7ededd25e15f7451673d986f55bd2667" +checksum = "501d359d5f3dcaf6ecdeee48833ae73ec6e42723a1e52419c79abf9507eec0a0" dependencies = [ "heck", - "proc-macro-error", "proc-macro2", "quote", - "syn 1.0.109", + "syn", ] [[package]] name = "clap_lex" -version = "0.3.2" +version = "0.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "350b9cf31731f9957399229e9b2adc51eeabdfbe9d71d9a0552275fd12710d09" -dependencies = [ - "os_str_bytes", -] +checksum = "1462739cb27611015575c0c11df5df7601141071f07518d56fcc1be504cbec97" + +[[package]] +name = "colorchoice" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3fd119d74b830634cea2a0f58bbd0d54540518a14397557951e79340abc28c0" [[package]] name = "core-foundation-sys" -version = "0.8.4" +version = "0.8.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e496a50fda8aacccc86d7529e2c1e0892dbd0f898a6b5645b5561b89c3210efa" +checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" [[package]] name = "crc32fast" -version = "1.3.2" +version = "1.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b540bd8bc810d3885c6ea91e2018302f68baba2129ab3e88f32389ee9370880d" +checksum = 
"a97769d94ddab943e4510d138150169a2758b5ef3eb191a9ee688de3e23ef7b3" dependencies = [ "cfg-if", ] +[[package]] +name = "dashmap" +version = "5.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "978747c1d849a7d2ee5e8adc0159961c48fb7e5db2f06af6723b80123bb53856" +dependencies = [ + "cfg-if", + "hashbrown", + "lock_api", + "once_cell", + "parking_lot_core", +] + [[package]] name = "diff" version = "0.1.13" @@ -163,9 +297,9 @@ checksum = "56254986775e3233ffa9c4d7d3faaf6d36a2c09d30b20687e9f88bc8bafc16c8" [[package]] name = "encoding_rs" -version = "0.8.32" +version = "0.8.34" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "071a31f4ee85403370b58aca746f01041ede6f0da2730960ad001edc2b71b394" +checksum = "b45de904aa0b010bce2ab45264d0631681847fa7b6f2eaa7dab7619943bc4f59" dependencies = [ "cfg-if", ] @@ -187,45 +321,46 @@ checksum = "f282cfdfe92516eb26c2af8589c274c7c17681f5ecc03c18255fe741c6aa64eb" dependencies = [ "proc-macro2", "quote", - "syn 2.0.27", + "syn", ] [[package]] -name = "equivalent" -version = "1.0.1" +name = "env_filter" +version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" +checksum = "4f2c92ceda6ceec50f43169f9ee8424fe2db276791afde7b2cd8bc084cb376ab" +dependencies = [ + "log", + "regex", +] [[package]] -name = "errno" -version = "0.2.8" +name = "env_logger" +version = "0.11.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f639046355ee4f37944e44f60642c6f3a7efa3cf6b78c78a0d989a8ce6c396a1" +checksum = "e13fa619b91fb2381732789fc5de83b45675e882f66623b7d8cb4f643017018d" dependencies = [ - "errno-dragonfly", - "libc", - "winapi", + "anstream", + "anstyle", + "env_filter", + "humantime", + "log", ] [[package]] -name = "errno" -version = "0.3.1" +name = "equivalent" +version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"4bcfec3a70f97c962c307b2d2c56e358cf1d00b558d74262b5f929ee8cc7e73a" -dependencies = [ - "errno-dragonfly", - "libc", - "windows-sys 0.48.0", -] +checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" [[package]] -name = "errno-dragonfly" -version = "0.1.2" +name = "errno" +version = "0.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf" +checksum = "534c5cf6194dfab3db3242765c03bbe257cf92f22b38f6bc0c58d59108a820ba" dependencies = [ - "cc", "libc", + "windows-sys 0.52.0", ] [[package]] @@ -242,9 +377,9 @@ checksum = "b3ea1ec5f8307826a5b71094dd91fc04d4ae75d5709b20ad351c7fb4815c86ec" [[package]] name = "flate2" -version = "1.0.26" +version = "1.0.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3b9429470923de8e8cbd4d2dc513535400b4b3fef0319fb5c4e1f520a7bef743" +checksum = "7f211bbe8e69bbd0cfdea405084f128ae8b4aaa6b0b522fc8f2b009084797920" dependencies = [ "crc32fast", "miniz_oxide", @@ -256,17 +391,109 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8bf7cc16383c4b8d58b9905a8509f02926ce3058053c056376248d958c9df1e8" +[[package]] +name = "form_urlencoded" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e13624c2627564efccf4934284bdd98cbaa14e79b0b5a141218e507b3a823456" +dependencies = [ + "percent-encoding", +] + +[[package]] +name = "futures" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "645c6916888f6cb6350d2550b80fb63e734897a8498abe35cfb732b6487804b0" +dependencies = [ + "futures-channel", + "futures-core", + "futures-io", + "futures-sink", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-channel" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"eac8f7d7865dcb88bd4373ab671c8cf4508703796caa2b1985a9ca867b3fcb78" +dependencies = [ + "futures-core", + "futures-sink", +] + +[[package]] +name = "futures-core" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dfc6580bb841c5a68e9ef15c77ccc837b40a7504914d52e47b8b0e9bbda25a1d" + +[[package]] +name = "futures-io" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a44623e20b9681a318efdd71c299b6b222ed6f231972bfe2f224ebad6311f0c1" + +[[package]] +name = "futures-macro" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "futures-sink" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9fb8e00e87438d937621c1c6269e53f536c14d3fbd6a042bb24879e57d474fb5" + +[[package]] +name = "futures-task" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38d84fa142264698cdce1a9f9172cf383a0c82de1bddcf3092901442c4097004" + +[[package]] +name = "futures-util" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d6401deb83407ab3da39eba7e33987a73c3df0c82b4bb5813ee871c19c41d48" +dependencies = [ + "futures-channel", + "futures-core", + "futures-io", + "futures-macro", + "futures-sink", + "futures-task", + "memchr", + "pin-project-lite", + "pin-utils", + "slab", +] + +[[package]] +name = "gimli" +version = "0.29.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "40ecd4077b5ae9fd2e9e169b102c6c330d0605168eb0e8bf79952b256dbefffd" + [[package]] name = "hashbrown" -version = "0.14.3" +version = "0.14.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"290f1a1d9242c78d09ce40a5e87e7554ee637af1351968159f4952f028f75604" +checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" [[package]] name = "heck" -version = "0.4.1" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" [[package]] name = "hermit-abi" @@ -279,9 +506,9 @@ dependencies = [ [[package]] name = "hermit-abi" -version = "0.3.1" +version = "0.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fed44880c466736ef9a5c5b5facefb5ed0785676d0c02d612db14e54f0d84286" +checksum = "d231dfb89cfffdbc30e7fc41579ed6066ad03abda9e567ccafae602b97ec5024" [[package]] name = "hexplay" @@ -290,21 +517,33 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0962bea6731e28b5a443ba4aa00fe3e4fe7555dadf12012435efb738eeac5898" dependencies = [ "atty", - "termcolor 0.3.6", + "termcolor", ] +[[package]] +name = "httparse" +version = "1.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fcc0b4a115bf80b728eb8ea024ad5bd707b615bfed49e0665b6e0f86fd082d9" + +[[package]] +name = "humantime" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" + [[package]] name = "iana-time-zone" -version = "0.1.57" +version = "0.1.60" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2fad5b825842d2b38bd206f3e81d6957625fd7f0a361e345c30e01a0ae2dd613" +checksum = "e7ffbb5a1b541ea2561f8c41c087286cc091e21e556a4f09a8f6cbf17b69b141" dependencies = [ "android_system_properties", "core-foundation-sys", "iana-time-zone-haiku", "js-sys", "wasm-bindgen", - "windows", + "windows-core", ] [[package]] @@ -316,76 +555,93 @@ dependencies = [ "cc", ] +[[package]] +name = "idna" +version = 
"0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "634d9b1461af396cad843f47fdba5597a4f9e6ddd4bfb6ff5d85028c25cb12f6" +dependencies = [ + "unicode-bidi", + "unicode-normalization", +] + [[package]] name = "indexmap" -version = "2.1.0" +version = "2.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d530e1a18b1cb4c484e6e34556a0d948706958449fca0cab753d649f2bce3d1f" +checksum = "93ead53efc7ea8ed3cfb0c79fc8023fbb782a5432b52830b6518941cebe6505c" dependencies = [ "equivalent", "hashbrown", ] [[package]] -name = "io-lifetimes" -version = "1.0.5" +name = "is_terminal_polyfill" +version = "1.70.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1abeb7a0dd0f8181267ff8adc397075586500b81b28a73e8a0208b00fc170fb3" -dependencies = [ - "libc", - "windows-sys 0.45.0", -] +checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" [[package]] -name = "is-terminal" -version = "0.4.4" +name = "itoa" +version = "1.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "21b6b32576413a8e69b90e952e4a026476040d81017b80445deda5f2d3921857" -dependencies = [ - "hermit-abi 0.3.1", - "io-lifetimes", - "rustix 0.36.8", - "windows-sys 0.45.0", -] +checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b" [[package]] name = "js-sys" -version = "0.3.64" +version = "0.3.70" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c5f195fe497f702db0f318b07fdd68edb16955aed830df8363d837542f8f935a" +checksum = "1868808506b929d7b0cfa8f75951347aa71bb21144b7791bae35d9bccfcfe37a" dependencies = [ "wasm-bindgen", ] [[package]] name = "lazy_static" -version = "1.4.0" +version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" +checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" [[package]] name = 
"libc" -version = "0.2.147" +version = "0.2.158" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b4668fb0ea861c1df094127ac5f1da3409a82116a4ba74fca2e58ef927159bb3" +checksum = "d8adc4bb1803a324070e64a98ae98f38934d91957a99cfb3a43dcbc01bc56439" [[package]] name = "linux-raw-sys" -version = "0.1.4" +version = "0.4.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f051f77a7c8e6957c0696eac88f26b0117e54f52d3fc682ab19397a8812846a4" +checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89" [[package]] -name = "linux-raw-sys" -version = "0.3.8" +name = "lock_api" +version = "0.4.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ef53942eb7bf7ff43a617b3e2c1c4a5ecf5944a7c1bc12d7ee39bbb15e5c1519" +checksum = "07af8b9cdd281b7915f413fa73f29ebd5d55d0d3f0155584dade1ff18cea1b17" +dependencies = [ + "autocfg", + "scopeguard", +] [[package]] name = "log" -version = "0.4.19" +version = "0.4.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b06a4cde4c0f271a446782e3eff8de789548ce57dbc8eca9292c27f4a42004b4" +checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24" + +[[package]] +name = "lsp-types" +version = "0.94.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c66bfd44a06ae10647fe3f8214762e9369fd4248df1350924b4ef9e770a85ea1" +dependencies = [ + "bitflags 1.3.2", + "serde", + "serde_json", + "serde_repr", + "url", +] [[package]] name = "memchr" @@ -395,18 +651,30 @@ checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" [[package]] name = "miniz_oxide" -version = "0.7.1" +version = "0.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e7810e0be55b428ada41041c41f32c9f1a42817901b4ccf45fa3d4b6561e74c7" +checksum = "b8a240ddb74feaf34a79a7add65a741f3167852fba007066dcac1ca548d89c08" dependencies = [ "adler", ] +[[package]] +name = "mio" 
+version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "80e04d1dcff3aae0704555fe5fee3bcfaf3d1fdf8a7e521d5b9d2b42acb52cec" +dependencies = [ + "hermit-abi 0.3.9", + "libc", + "wasi", + "windows-sys 0.52.0", +] + [[package]] name = "num" -version = "0.4.0" +version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "43db66d1170d347f9a065114077f7dccb00c1b9478c89384490a3425279a4606" +checksum = "35bd024e8b2ff75562e5f34e7f4905839deb4b22955ef5e73d2fea1b9813cb23" dependencies = [ "num-bigint", "num-complex", @@ -418,50 +686,48 @@ dependencies = [ [[package]] name = "num-bigint" -version = "0.4.3" +version = "0.4.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f93ab6289c7b344a8a9f60f88d80aa20032336fe78da341afc91c8a2341fc75f" +checksum = "a5e44f723f1133c9deac646763579fdb3ac745e418f2a7af9cd0c431da1f20b9" dependencies = [ - "autocfg", "num-integer", "num-traits", ] [[package]] name = "num-complex" -version = "0.4.3" +version = "0.4.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "02e0d21255c828d6f128a1e41534206671e8c3ea0c62f32291e808dc82cff17d" +checksum = "73f88a1307638156682bada9d7604135552957b7818057dcef22705b4d509495" dependencies = [ "num-traits", ] [[package]] name = "num-derive" -version = "0.4.0" +version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e6a0fd4f737c707bd9086cc16c925f294943eb62eb71499e9fd4cf71f8b9f4e" +checksum = "ed3955f1a9c7c0c15e092f9c887db08b1fc683305fdf6eb6684f22555355e202" dependencies = [ "proc-macro2", "quote", - "syn 2.0.27", + "syn", ] [[package]] name = "num-integer" -version = "0.1.45" +version = "0.1.46" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "225d3389fb3509a24c93f5c29eb6bde2586b98d9f016636dff58d7c6f7569cd9" +checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f" dependencies = [ - "autocfg", 
"num-traits", ] [[package]] name = "num-iter" -version = "0.1.43" +version = "0.1.45" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7d03e6c028c5dc5cac6e2dec0efda81fc887605bb3d884578bb6d6bf7514e252" +checksum = "1429034a0490724d0075ebb2bc9e875d6503c3cf69e235a8941aa757d83ef5bf" dependencies = [ "autocfg", "num-integer", @@ -470,11 +736,10 @@ dependencies = [ [[package]] name = "num-rational" -version = "0.4.1" +version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0638a1c9d0a3c0914158145bc76cff373a75a627e6ecbfb71cbe6f453a5a19b0" +checksum = "f83d14da390562dca69fc84082e73e548e1ad308d24accdedd2720017cb37824" dependencies = [ - "autocfg", "num-bigint", "num-integer", "num-traits", @@ -482,63 +747,103 @@ dependencies = [ [[package]] name = "num-traits" -version = "0.2.16" +version = "0.2.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f30b0abd723be7e2ffca1272140fac1a2f084c77ec3e123c192b66af1ee9e6c2" +checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" dependencies = [ "autocfg", ] +[[package]] +name = "object" +version = "0.36.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "27b64972346851a39438c60b341ebc01bba47464ae329e55cf343eb93964efd9" +dependencies = [ + "memchr", +] + [[package]] name = "once_cell" -version = "1.17.1" +version = "1.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b7e5500299e16ebb147ae15a00a942af264cf3688f47923b8fc2cd5858f23ad3" +checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" [[package]] name = "ordered-float" -version = "3.7.0" +version = "3.9.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2fc2dbde8f8a79f2102cc474ceb0ad68e3b80b85289ea62389b60e66777e4213" +checksum = "f1e1c390732d15f1d48471625cd92d154e66db2c56645e29a9cd26f4699f72dc" dependencies = [ "num-traits", ] [[package]] -name = 
"os_str_bytes" -version = "6.4.1" +name = "parking_lot" +version = "0.12.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b7820b9daea5457c9f21c69448905d723fbd21136ccf521748f23fd49e723ee" +checksum = "f1bf18183cf54e8d6059647fc3063646a1801cf30896933ec2311622cc4b9a27" +dependencies = [ + "lock_api", + "parking_lot_core", +] [[package]] -name = "proc-macro-error" -version = "1.0.4" +name = "parking_lot_core" +version = "0.9.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c" +checksum = "1e401f977ab385c9e4e3ab30627d6f26d00e2c73eef317493c4ec6d468726cf8" dependencies = [ - "proc-macro-error-attr", - "proc-macro2", - "quote", - "syn 1.0.109", - "version_check", + "cfg-if", + "libc", + "redox_syscall", + "smallvec", + "windows-targets 0.52.6", +] + +[[package]] +name = "percent-encoding" +version = "2.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" + +[[package]] +name = "pin-project" +version = "1.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6bf43b791c5b9e34c3d182969b4abb522f9343702850a2e57f460d00d09b4b3" +dependencies = [ + "pin-project-internal", ] [[package]] -name = "proc-macro-error-attr" -version = "1.0.4" +name = "pin-project-internal" +version = "1.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869" +checksum = "2f38a4412a78282e09a2cf38d195ea5420d15ba0602cb375210efbc877243965" dependencies = [ "proc-macro2", "quote", - "version_check", + "syn", ] +[[package]] +name = "pin-project-lite" +version = "0.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bda66fc9667c18cb2758a2ac84d1167245054bcf85d5d1aaa6923f45801bdd02" + +[[package]] +name = "pin-utils" +version = "0.1.0" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" + [[package]] name = "proc-macro2" -version = "1.0.66" +version = "1.0.86" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "18fb31db3f9bddb2ea821cde30a9f70117e3f119938b5ee630b7403aa6e2ead9" +checksum = "5e719e8df665df0d1c8fbfd238015744736151d4445ec0836b8e628aae103b77" dependencies = [ "unicode-ident", ] @@ -548,7 +853,7 @@ name = "pspp" version = "1.0.0" dependencies = [ "anyhow", - "bitflags 2.5.0", + "bitflags 2.6.0", "chardetng", "chrono", "clap", @@ -574,144 +879,411 @@ dependencies = [ "windows-sys 0.48.0", ] +[[package]] +name = "pspp-lsp" +version = "0.1.0" +dependencies = [ + "env_logger", + "log", + "pspp", + "tokio", + "tower-lsp", +] + [[package]] name = "quote" -version = "1.0.32" +version = "1.0.36" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "50f3b39ccfb720540debaa0164757101c08ecb8d326b15358ce76a62c7e85965" +checksum = "0fa76aaf39101c457836aec0ce2316dbdc3ab723cdda1c6bd4e6ad4208acaca7" dependencies = [ "proc-macro2", ] [[package]] -name = "rustix" -version = "0.36.8" +name = "redox_syscall" +version = "0.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f43abb88211988493c1abb44a70efa56ff0ce98f233b7b276146f1f3f7ba9644" +checksum = "2a908a6e00f1fdd0dfd9c0eb08ce85126f6d8bbda50017e74bc4a4b7d4a926a4" dependencies = [ - "bitflags 1.3.2", - "errno 0.2.8", - "io-lifetimes", - "libc", - "linux-raw-sys 0.1.4", - "windows-sys 0.45.0", + "bitflags 2.6.0", +] + +[[package]] +name = "regex" +version = "1.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4219d74c6b67a3654a9fbebc4b419e22126d13d2f3c4a07ee0cb61ff79a79619" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.4.7" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "38caf58cc5ef2fed281f89292ef23f6365465ed9a41b7a7754eb4e26496c92df" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", ] +[[package]] +name = "regex-syntax" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a66a03ae7c801facd77a29370b4faec201768915ac14a721ba36f20bc9c209b" + +[[package]] +name = "rustc-demangle" +version = "0.1.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f" + [[package]] name = "rustix" -version = "0.37.3" +version = "0.38.34" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "62b24138615de35e32031d041a09032ef3487a616d901ca4db224e7d557efae2" +checksum = "70dc5ec042f7a43c4a73241207cecc9873a06d45debb38b329f8541d85c2730f" dependencies = [ - "bitflags 1.3.2", - "errno 0.3.1", - "io-lifetimes", + "bitflags 2.6.0", + "errno", "libc", - "linux-raw-sys 0.3.8", - "windows-sys 0.45.0", + "linux-raw-sys", + "windows-sys 0.52.0", ] [[package]] -name = "strsim" -version = "0.10.0" +name = "ryu" +version = "1.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" +checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f" [[package]] -name = "syn" -version = "1.0.109" +name = "scopeguard" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" + +[[package]] +name = "serde" +version = "1.0.208" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" +checksum = "cff085d2cb684faa248efb494c39b68e522822ac0de72ccf08109abde717cfb2" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" 
+version = "1.0.208" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24008e81ff7613ed8e5ba0cfaf24e2c2f1e5b8a0495711e44fcd4882fca62bcf" dependencies = [ "proc-macro2", "quote", - "unicode-ident", + "syn", ] [[package]] -name = "syn" -version = "2.0.27" +name = "serde_json" +version = "1.0.125" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "83c8e735a073ccf5be70aa8066aa984eaf2fa000db6c8d0100ae605b366d31ed" +dependencies = [ + "itoa", + "memchr", + "ryu", + "serde", +] + +[[package]] +name = "serde_repr" +version = "0.1.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b60f673f44a8255b9c8c657daf66a596d435f2da81a555b06dc644d080ba45e0" +checksum = "6c64451ba24fc7a6a2d60fc75dd9c83c90903b19028d4eff35e88fc1e86564e9" dependencies = [ "proc-macro2", "quote", - "unicode-ident", + "syn", ] [[package]] -name = "termcolor" -version = "0.3.6" +name = "shlex" +version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "adc4587ead41bf016f11af03e55a624c06568b5a19db4e90fde573d805074f83" +checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" + +[[package]] +name = "signal-hook-registry" +version = "1.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a9e9e0b4211b72e7b8b6e85c807d36c212bdb33ea8587f7569562a84df5465b1" dependencies = [ - "wincolor", + "libc", +] + +[[package]] +name = "slab" +version = "0.4.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f92a496fb766b417c996b9c5e57daf2f7ad3b0bebe1ccfca4856390e3d3bb67" +dependencies = [ + "autocfg", +] + +[[package]] +name = "smallvec" +version = "1.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67" + +[[package]] +name = "socket2" +version = "0.5.7" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "ce305eb0b4296696835b71df73eb912e0f1ffd2556a501fcede6e0c50349191c" +dependencies = [ + "libc", + "windows-sys 0.52.0", +] + +[[package]] +name = "strsim" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" + +[[package]] +name = "syn" +version = "2.0.75" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f6af063034fc1935ede7be0122941bafa9bacb949334d090b77ca98b5817c7d9" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", ] [[package]] name = "termcolor" -version = "1.2.0" +version = "0.3.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "be55cf8942feac5c765c2c993422806843c9a9a45d4d5c407ad6dd2ea95eb9b6" +checksum = "adc4587ead41bf016f11af03e55a624c06568b5a19db4e90fde573d805074f83" dependencies = [ - "winapi-util", + "wincolor", ] [[package]] name = "terminal_size" -version = "0.2.6" +version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e6bf6f19e9f8ed8d4048dc22981458ebcf406d67e94cd422e5ecd73d63b3237" +checksum = "21bebf2b7c9e0a515f6e0f8c51dc0f8e4696391e6f1ff30379559f8365fb0df7" dependencies = [ - "rustix 0.37.3", + "rustix", "windows-sys 0.48.0", ] [[package]] name = "thiserror" -version = "1.0.39" +version = "1.0.63" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a5ab016db510546d856297882807df8da66a16fb8c4101cb8b30054b0d5b2d9c" +checksum = "c0342370b38b6a11b6cc11d6a805569958d54cfa061a29969c3b5ce2ea405724" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.39" +version = "1.0.63" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5420d42e90af0c38c3290abcca25b9b3bdf379fc9f55c528f53a269d9c9a267e" +checksum = "a4558b58466b9ad7ca0f102865eccc95938dca1a74a856f2b57b6629050da261" dependencies = [ "proc-macro2", "quote", - "syn 1.0.109", + "syn", 
] [[package]] -name = "time" -version = "0.1.45" +name = "tinyvec" +version = "1.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "445e881f4f6d382d5f27c034e25eb92edd7c784ceab92a0937db7f2e9471b938" +dependencies = [ + "tinyvec_macros", +] + +[[package]] +name = "tinyvec_macros" +version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b797afad3f312d1c66a56d11d0316f916356d11bd158fbc6ca6389ff6bf805a" +checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" + +[[package]] +name = "tokio" +version = "1.39.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9babc99b9923bfa4804bd74722ff02c0381021eafa4db9949217e3be8e84fff5" dependencies = [ + "backtrace", + "bytes", "libc", - "wasi", - "winapi", + "mio", + "parking_lot", + "pin-project-lite", + "signal-hook-registry", + "socket2", + "tokio-macros", + "windows-sys 0.52.0", +] + +[[package]] +name = "tokio-macros" +version = "2.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "693d596312e88961bc67d7f1f97af8a70227d9f90c31bba5806eec004978d752" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "tokio-util" +version = "0.7.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9cf6b47b3771c49ac75ad09a6162f53ad4b8088b76ac60e8ec1455b31a189fe1" +dependencies = [ + "bytes", + "futures-core", + "futures-sink", + "pin-project-lite", + "tokio", +] + +[[package]] +name = "tower" +version = "0.4.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8fa9be0de6cf49e536ce1851f987bd21a43b771b09473c3549a6c853db37c1c" +dependencies = [ + "futures-core", + "futures-util", + "pin-project", + "pin-project-lite", + "tower-layer", + "tower-service", +] + +[[package]] +name = "tower-layer" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"121c2a6cda46980bb0fcd1647ffaf6cd3fc79a013de288782836f6df9c48780e" + +[[package]] +name = "tower-lsp" +version = "0.20.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d4ba052b54a6627628d9b3c34c176e7eda8359b7da9acd497b9f20998d118508" +dependencies = [ + "async-trait", + "auto_impl", + "bytes", + "dashmap", + "futures", + "httparse", + "lsp-types", + "memchr", + "serde", + "serde_json", + "tokio", + "tokio-util", + "tower", + "tower-lsp-macros", + "tracing", +] + +[[package]] +name = "tower-lsp-macros" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "84fd902d4e0b9a4b27f2f440108dc034e1758628a9b702f8ec61ad66355422fa" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "tower-service" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8df9b6e13f2d32c91b9bd719c00d1958837bc7dec474d94952798cc8e69eeec3" + +[[package]] +name = "tracing" +version = "0.1.40" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c3523ab5a71916ccf420eebdf5521fcef02141234bbc0b8a49f2fdc4544364ef" +dependencies = [ + "pin-project-lite", + "tracing-attributes", + "tracing-core", +] + +[[package]] +name = "tracing-attributes" +version = "0.1.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "tracing-core" +version = "0.1.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c06d3da6113f116aaee68e4d601191614c9053067f9ab7f6edbcb161237daa54" +dependencies = [ + "once_cell", ] [[package]] name = "unicase" -version = "2.6.0" +version = "2.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "50f37be617794602aabbeee0be4f259dc1778fabe05e2d67ee8f79326d5cb4f6" +checksum = 
"f7d2d4dafb69621809a81864c9c1b864479e1235c0dd4e199924b9742439ed89" dependencies = [ "version_check", ] +[[package]] +name = "unicode-bidi" +version = "0.3.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08f95100a766bf4f8f28f90d77e0a5461bbdb219042e7679bebe79004fed8d75" + [[package]] name = "unicode-ident" -version = "1.0.6" +version = "1.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "84a22b9f218b40614adcb3f4ff08b703773ad44fa9423e4e0d346d5db86e4ebc" +checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" + +[[package]] +name = "unicode-normalization" +version = "0.1.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a56d1686db2308d901306f92a263857ef59ea39678a5458e7cb17f01415101f5" +dependencies = [ + "tinyvec", +] [[package]] name = "unicode-width" @@ -719,54 +1291,73 @@ version = "0.1.13" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0336d538f7abc86d282a4189614dfaa90810dfc2c6f6427eaf88e16311dd225d" +[[package]] +name = "url" +version = "2.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22784dbdf76fdde8af1aeda5622b546b422b6fc585325248a2bf9f5e41e94d6c" +dependencies = [ + "form_urlencoded", + "idna", + "percent-encoding", + "serde", +] + [[package]] name = "utf8-decode" version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ca61eb27fa339aa08826a29f03e87b99b4d8f0fc2255306fd266bb1b6a9de498" +[[package]] +name = "utf8parse" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" + [[package]] name = "version_check" -version = "0.9.4" +version = "0.9.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" +checksum = 
"0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" [[package]] name = "wasi" -version = "0.10.0+wasi-snapshot-preview1" +version = "0.11.0+wasi-snapshot-preview1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a143597ca7c7793eff794def352d41792a93c481eb1042423ff7ff72ba2c31f" +checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" [[package]] name = "wasm-bindgen" -version = "0.2.87" +version = "0.2.93" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7706a72ab36d8cb1f80ffbf0e071533974a60d0a308d01a5d0375bf60499a342" +checksum = "a82edfc16a6c469f5f44dc7b571814045d60404b55a0ee849f9bcfa2e63dd9b5" dependencies = [ "cfg-if", + "once_cell", "wasm-bindgen-macro", ] [[package]] name = "wasm-bindgen-backend" -version = "0.2.87" +version = "0.2.93" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ef2b6d3c510e9625e5fe6f509ab07d66a760f0885d858736483c32ed7809abd" +checksum = "9de396da306523044d3302746f1208fa71d7532227f15e347e2d93e4145dd77b" dependencies = [ "bumpalo", "log", "once_cell", "proc-macro2", "quote", - "syn 2.0.27", + "syn", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-macro" -version = "0.2.87" +version = "0.2.93" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dee495e55982a3bd48105a7b947fd2a9b4a8ae3010041b9e0faab3f9cd028f1d" +checksum = "585c4c91a46b072c92e908d99cb1dcdf95c5218eeb6f3bf1efa991ee7a68cccf" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -774,22 +1365,22 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.87" +version = "0.2.93" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "54681b18a46765f095758388f2d0cf16eb8d4169b639ab575a8f5693af210c7b" +checksum = "afc340c74d9005395cf9dd098506f7f44e38f2b4a21c6aaacf9a105ea5e1e836" dependencies = [ "proc-macro2", "quote", - "syn 2.0.27", + "syn", "wasm-bindgen-backend", 
"wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-shared" -version = "0.2.87" +version = "0.2.93" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ca6ad05a4870b2bf5fe995117d3728437bd27d7cd5f06f13c17443ef369775a1" +checksum = "c62a0a307cb4a311d3a07867860911ca130c3494e8c2719593806c08bc5d0484" [[package]] name = "winapi" @@ -807,15 +1398,6 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" -[[package]] -name = "winapi-util" -version = "0.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178" -dependencies = [ - "winapi", -] - [[package]] name = "winapi-x86_64-pc-windows-gnu" version = "0.4.0" @@ -832,142 +1414,149 @@ dependencies = [ ] [[package]] -name = "windows" -version = "0.48.0" +name = "windows-core" +version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e686886bc078bc1b0b600cac0147aadb815089b6e4da64016cbd754b6342700f" +checksum = "33ab640c8d7e35bf8ba19b884ba838ceb4fba93a4e8c65a9059d08afcfc683d9" dependencies = [ - "windows-targets 0.48.1", + "windows-targets 0.52.6", ] [[package]] name = "windows-sys" -version = "0.45.0" +version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "75283be5efb2831d37ea142365f009c02ec203cd29a3ebecbc093d52315b66d0" +checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" dependencies = [ - "windows-targets 0.42.1", + "windows-targets 0.48.5", ] [[package]] name = "windows-sys" -version = "0.48.0" +version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" +checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" dependencies = [ - "windows-targets 0.48.1", + 
"windows-targets 0.52.6", ] [[package]] name = "windows-targets" -version = "0.42.1" +version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e2522491fbfcd58cc84d47aeb2958948c4b8982e9a2d8a2a35bbaed431390e7" +checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c" dependencies = [ - "windows_aarch64_gnullvm 0.42.1", - "windows_aarch64_msvc 0.42.1", - "windows_i686_gnu 0.42.1", - "windows_i686_msvc 0.42.1", - "windows_x86_64_gnu 0.42.1", - "windows_x86_64_gnullvm 0.42.1", - "windows_x86_64_msvc 0.42.1", + "windows_aarch64_gnullvm 0.48.5", + "windows_aarch64_msvc 0.48.5", + "windows_i686_gnu 0.48.5", + "windows_i686_msvc 0.48.5", + "windows_x86_64_gnu 0.48.5", + "windows_x86_64_gnullvm 0.48.5", + "windows_x86_64_msvc 0.48.5", ] [[package]] name = "windows-targets" -version = "0.48.1" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05d4b17490f70499f20b9e791dcf6a299785ce8af4d709018206dc5b4953e95f" +checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" dependencies = [ - "windows_aarch64_gnullvm 0.48.0", - "windows_aarch64_msvc 0.48.0", - "windows_i686_gnu 0.48.0", - "windows_i686_msvc 0.48.0", - "windows_x86_64_gnu 0.48.0", - "windows_x86_64_gnullvm 0.48.0", - "windows_x86_64_msvc 0.48.0", + "windows_aarch64_gnullvm 0.52.6", + "windows_aarch64_msvc 0.52.6", + "windows_i686_gnu 0.52.6", + "windows_i686_gnullvm", + "windows_i686_msvc 0.52.6", + "windows_x86_64_gnu 0.52.6", + "windows_x86_64_gnullvm 0.52.6", + "windows_x86_64_msvc 0.52.6", ] [[package]] name = "windows_aarch64_gnullvm" -version = "0.42.1" +version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c9864e83243fdec7fc9c5444389dcbbfd258f745e7853198f365e3c4968a608" +checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" [[package]] name = "windows_aarch64_gnullvm" -version = "0.48.0" +version = "0.52.6" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "91ae572e1b79dba883e0d315474df7305d12f569b400fcf90581b06062f7e1bc" +checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" [[package]] name = "windows_aarch64_msvc" -version = "0.42.1" +version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c8b1b673ffc16c47a9ff48570a9d85e25d265735c503681332589af6253c6c7" +checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" [[package]] name = "windows_aarch64_msvc" -version = "0.48.0" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b2ef27e0d7bdfcfc7b868b317c1d32c641a6fe4629c171b8928c7b08d98d7cf3" +checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" [[package]] name = "windows_i686_gnu" -version = "0.42.1" +version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "de3887528ad530ba7bdbb1faa8275ec7a1155a45ffa57c37993960277145d640" +checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" [[package]] name = "windows_i686_gnu" -version = "0.48.0" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "622a1962a7db830d6fd0a69683c80a18fda201879f0f447f065a3b7467daa241" +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" [[package]] name = "windows_i686_msvc" -version = "0.42.1" +version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bf4d1122317eddd6ff351aa852118a2418ad4214e6613a50e0191f7004372605" +checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" [[package]] name = "windows_i686_msvc" -version = "0.48.0" 
+version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4542c6e364ce21bf45d69fdd2a8e455fa38d316158cfd43b3ac1c5b1b19f8e00" +checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" [[package]] name = "windows_x86_64_gnu" -version = "0.42.1" +version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c1040f221285e17ebccbc2591ffdc2d44ee1f9186324dd3e84e99ac68d699c45" +checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" [[package]] name = "windows_x86_64_gnu" -version = "0.48.0" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ca2b8a661f7628cbd23440e50b05d705db3686f894fc9580820623656af974b1" +checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" [[package]] name = "windows_x86_64_gnullvm" -version = "0.42.1" +version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "628bfdf232daa22b0d64fdb62b09fcc36bb01f05a3939e20ab73aaf9470d0463" +checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" [[package]] name = "windows_x86_64_gnullvm" -version = "0.48.0" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7896dbc1f41e08872e9d5e8f8baa8fdd2677f29468c4e156210174edc7f7b953" +checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" [[package]] name = "windows_x86_64_msvc" -version = "0.42.1" +version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "447660ad36a13288b1db4d4248e857b510e8c3a225c822ba4fb748c0aafecffd" +checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" [[package]] name = "windows_x86_64_msvc" -version = "0.48.0" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a515f5799fe4961cb532f983ce2b23082366b898e52ffbce459c86f67c8378a" 
+checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" diff --git a/rust/Cargo.toml b/rust/Cargo.toml index 41b2f02c6f..3aa9d37c26 100644 --- a/rust/Cargo.toml +++ b/rust/Cargo.toml @@ -1,51 +1,6 @@ -[package] -name = "pspp" -version = "1.0.0" -edition = "2021" -authors = [ "Ben Pfaff", "John Darrington" ] - -[dependencies] -anyhow = "1.0.69" -clap = { version = "4.1.7", features = ["derive", "wrap_help"] } -encoding_rs = "0.8.32" -flate2 = "1.0.26" -float_next_after = "1.0.0" -hexplay = "0.2.1" -lazy_static = "1.4.0" -num = "0.4.0" -num-derive = "0.4.0" -num-traits = "0.2.16" -ordered-float = "3.7.0" -thiserror = "1.0" -chrono = "0.4.26" -finl_unicode = "1.2.0" -unicase = "2.6.0" -libc = "0.2.147" -indexmap = "2.1.0" -utf8-decode = "1.0.1" -bitflags = "2.5.0" -unicode-width = "0.1.13" -chardetng = "0.1.17" -enum-map = "2.7.3" -flagset = "0.4.6" - -[target.'cfg(windows)'.dependencies] -windows-sys = { version = "0.48.0", features = ["Win32_Globalization"] } - -[build-dependencies] -anyhow = "1.0.69" - -[[bin]] -name = "pspp-dump-sav" -path = "src/main.rs" - -[lib] -path = "src/lib.rs" - -[[test]] -name = "sack" -path = "tests/sack.rs" -harness = false - -[dev-dependencies] -diff = "0.1.13" +[workspace] +members = [ + "pspp", + "pspp-lsp", +] +resolver = "2" diff --git a/rust/build.rs b/rust/build.rs deleted file mode 100644 index f8cb9efa13..0000000000 --- a/rust/build.rs +++ /dev/null @@ -1,184 +0,0 @@ -use anyhow::{anyhow, Result as AnyResult}; -use std::{ - collections::{BTreeMap, HashSet, VecDeque}, - env::var_os, - fs::{read_to_string, File}, - io::{Error as IoError, Write}, - path::{Path, PathBuf}, -}; - -#[derive(Copy, Clone, PartialEq, Eq, Ord, PartialOrd)] -enum Source { - Codepage, - Ibm, - Windows, -} - -// Code page number. 
-type CodepageNumber = usize; - -fn process_converter<'a>( - fields: &Vec<&'a str>, - codepages: &mut BTreeMap>>, -) { - if fields.is_empty() || fields[0] == "{" { - return; - } - - let mut cps: BTreeMap = BTreeMap::new(); - let mut iana = VecDeque::new(); - let mut other = VecDeque::new(); - - let mut iter = fields.iter().peekable(); - while let Some(&name) = iter.next() { - if iter.next_if(|&&s| s == "{").is_some() { - let mut standards = HashSet::new(); - loop { - let &standard = iter.next().expect("missing `}` in list of standards"); - if standard == "}" { - break; - } - standards.insert(standard); - } - - if standards.contains("IANA*") { - iana.push_front(name); - } else if standards.contains("IANA") { - iana.push_back(name); - } else if standards.iter().any(|&s| s.ends_with('*')) { - other.push_front(name); - } else { - other.push_back(name); - } - } else { - // Untagged names are completely nonstandard. - continue; - } - - if let Some(number) = name.strip_prefix("cp") { - if let Ok(number) = number.parse::() { - cps.insert(Source::Codepage, number); - } - } - - if let Some(number) = name.strip_prefix("windows-") { - if let Ok(number) = number.parse::() { - cps.insert(Source::Windows, number); - } - } - - if let Some(number) = name.strip_prefix("ibm-") { - if let Ok(number) = number.parse::() { - cps.insert(Source::Ibm, number); - } - } - } - - // If there are no tagged names then this is completely nonstandard. - if iana.is_empty() && other.is_empty() { - return; - } - - let all: Vec<&str> = iana.into_iter().chain(other).collect(); - for (source, number) in cps { - codepages - .entry(number) - .or_default() - .insert(source, all.clone()); - } -} - -fn write_output( - codepages: &BTreeMap>>, - file_name: &PathBuf, -) -> Result<(), IoError> { - let mut file = File::create(file_name)?; - - file.write_all( - "\ -use lazy_static::lazy_static; -use std::collections::HashMap; - -lazy_static! 
{ - static ref CODEPAGE_NUMBER_TO_NAME: HashMap = { - let mut map = HashMap::new(); -" - .as_bytes(), - )?; - - for (&cpnumber, value) in codepages.iter() { - let source = value.keys().max().unwrap(); - let name = value[source][0]; - writeln!(file, " map.insert({cpnumber}, \"{name}\");")?; - } - file.write_all( - " map - }; - - static ref CODEPAGE_NAME_TO_NUMBER: HashMap<&'static str, u32> = { - let mut map = HashMap::new(); -" - .as_bytes(), - )?; - - let mut names: BTreeMap>> = BTreeMap::new(); - for (&cpnumber, value) in codepages.iter() { - for (&source, value2) in value.iter() { - for name in value2.iter().map(|name| name.to_ascii_lowercase()) { - names - .entry(name) - .or_default() - .entry(source) - .or_default() - .push(cpnumber); - } - } - } - - for (name, value) in names.iter() { - for (_source, numbers) in value.iter().rev().take(1) { - writeln!(file, " map.insert(\"{name}\", {});", numbers[0])?; - } - } - file.write_all( - " map - }; -} -" - .as_bytes(), - )?; - - Ok(()) -} - -fn main() -> AnyResult<()> { - println!("cargo:rerun-if-changed=build.rs"); - - let input_file = Path::new(env!("CARGO_MANIFEST_DIR")).join("convrtrs.txt"); - println!("cargo:rerun-if-changed={}", input_file.to_string_lossy()); - let input = read_to_string(&input_file) - .map_err(|e| anyhow!("{}: read failed ({e})", input_file.to_string_lossy()))?; - - let mut codepages: BTreeMap>> = BTreeMap::new(); - let mut converter: Vec<&str> = Vec::new(); - for line in input.lines() { - let line = line - .find('#') - .map(|position| &line[..position]) - .unwrap_or(line) - .trim_end(); - if !line.starts_with([' ', '\t']) { - process_converter(&converter, &mut codepages); - converter.clear(); - } - converter.extend(line.split_whitespace()); - } - process_converter(&converter, &mut codepages); - - let output_file_name = Path::new(&var_os("OUT_DIR").unwrap()).join("encodings.rs"); - - write_output(&codepages, &output_file_name) - .map_err(|e| anyhow!("{}: write failed ({e})", 
output_file_name.to_string_lossy()))?; - - Ok(()) -} diff --git a/rust/convrtrs.txt b/rust/convrtrs.txt deleted file mode 100644 index 4aaa592a53..0000000000 --- a/rust/convrtrs.txt +++ /dev/null @@ -1,1269 +0,0 @@ -# Copyright (C) 2016 and later: Unicode, Inc. and others. -# License & terms of use: http://www.unicode.org/copyright.html -# ****************************************************************************** -# * -# * Copyright (C) 1995-2014, International Business Machines -# * Corporation and others. All Rights Reserved. -# * -# ****************************************************************************** - -# If this converter alias table looks very confusing, a much easier to -# understand view can be found at this demo: -# http://demo.icu-project.org/icu-bin/convexp - -# IMPORTANT NOTE -# -# This file is not read directly by ICU. If you change it, you need to -# run gencnval, and eventually run pkgdata to update the representation that -# ICU uses for aliases. The gencnval tool will normally compile this file into -# cnvalias.icu. The gencnval -v verbose option will help you when you edit -# this file. - -# Please be friendly to the rest of us that edit this table by -# keeping this table free of tabs. - -# This is an alias file used by the character set converter. -# A lot of converter information can be found in unicode/ucnv.h, but here -# is more information about this file. -# -# If you are adding a new converter to this list and want to include it in the -# icu data library, please be sure to add an entry to the appropriate ucm*.mk file -# (see ucmfiles.mk for more information). 
-# -# Here is the file format using BNF-like syntax: -# -# converterTable ::= tags { converterLine* } -# converterLine ::= converterName [ tags ] { taggedAlias* }'\n' -# taggedAlias ::= alias [ tags ] -# tags ::= '{' { tag+ } '}' -# tag ::= standard['*'] -# converterName ::= [0-9a-zA-Z:_'-']+ -# alias ::= converterName -# -# Except for the converter name, aliases are case insensitive. -# Names are separated by whitespace. -# Line continuation and comment sytax are similar to the GNU make syntax. -# Any lines beginning with whitespace (e.g. U+0020 SPACE or U+0009 HORIZONTAL -# TABULATION) are presumed to be a continuation of the previous line. -# The # symbol starts a comment and the comment continues till the end of -# the line. -# -# The converter -# -# All names can be tagged by including a space-separated list of tags in -# curly braces, as in ISO_8859-1:1987{IANA*} iso-8859-1 { MIME* } or -# some-charset{MIME* IANA*}. The order of tags does not matter, and -# whitespace is allowed between the tagged name and the tags list. -# -# The tags can be used to get standard names using ucnv_getStandardName(). -# -# The complete list of recognized tags used in this file is defined in -# the affinity list near the beginning of the file. -# -# The * after the standard tag denotes that the previous alias is the -# preferred (default) charset name for that standard. There can only -# be one of these default charset names per converter. - - - -# The world is getting more complicated... -# Supporting XML parsers, HTML, MIME, and similar applications -# that mark encodings with a charset name can be difficult. -# Many of these applications and operating systems will update -# their codepages over time. - -# It means that a new codepage, one that differs from an -# old one by changing a code point, e.g., to the Euro sign, -# must not get an old alias, because it would mean that -# old files with this alias would be interpreted differently. 
- -# If an codepage gets updated by assigning characters to previously -# unassigned code points, then a new name is not necessary. -# Also, some codepages map unassigned codepage byte values -# to the same numbers in Unicode for roundtripping. It may be -# industry practice to keep the encoding name in such a case, too -# (example: Windows codepages). - -# The aliases listed in the list of character sets -# that is maintained by the IANA (http://www.iana.org/) must -# not be changed to mean encodings different from what this -# list shows. Currently, the IANA list is at -# http://www.iana.org/assignments/character-sets -# It should also be mentioned that the exact mapping table used for each -# IANA names usually isn't specified. This means that some other applications -# and operating systems are left to interpret the exact mappings for the -# underspecified aliases. For instance, Shift-JIS on a Solaris platform -# may be different from Shift-JIS on a Windows platform. This is why -# some of the aliases can be tagged to differentiate different mapping -# tables with the same alias. If an alias is given to more than one converter, -# it is considered to be an ambiguous alias, and the affinity list will -# choose the converter to use when a standard isn't specified with the alias. - -# Name matching is case-insensitive. Also, dashes '-', underscores '_' -# and spaces ' ' are ignored in names (thus cs-iso_latin-1, csisolatin1 -# and "cs iso latin 1" are the same). -# However, the names in the left column are directly file names -# or names of algorithmic converters, and their case must not -# be changed - or else code and/or file names must also be changed. -# For example, the converter ibm-921 is expected to be the file ibm-921.cnv. - - - -# The immediately following list is the affinity list of supported standard tags. 
-# When multiple converters have the same alias under different standards, -# the standard nearest to the top of this list with that alias will -# be the first converter that will be opened. The ordering of the aliases -# after this affinity list does not affect the preferred alias, but it may -# affect the order of the returned list of aliases for a given converter. -# -# The general ordering is from specific and frequently used to more general -# or rarely used at the bottom. -{ UTR22 # Name format specified by https://www.unicode.org/reports/tr22/ - # ICU # Can also use ICU_FEATURE - IBM # The IBM CCSID number is specified by ibm-* - WINDOWS # The Microsoft code page identifier number is specified by windows-*. The rest are recognized IE names. - JAVA # Source: Sun JDK. Alias name case is ignored, but dashes are not ignored. - # GLIBC - # AIX - # DB2 - # SOLARIS - # APPLE - # HPUX - IANA # Source: http://www.iana.org/assignments/character-sets - MIME # Source: http://www.iana.org/assignments/character-sets - # MSIE # MSIE is Internet Explorer, which can be different from Windows (From the IMultiLanguage COM interface) - # ZOS_USS # z/OS (os/390) Unix System Services (USS), which has NL<->LF swapping. They have the same format as the IBM tag. - } - - - -# Fully algorithmic converters - -UTF-8 { IANA* MIME* JAVA* WINDOWS } - ibm-1208 { IBM* } # UTF-8 with IBM PUA - ibm-1209 { IBM } # UTF-8 - ibm-5304 { IBM } # Unicode 2.0, UTF-8 with IBM PUA - ibm-5305 { IBM } # Unicode 2.0, UTF-8 - ibm-13496 { IBM } # Unicode 3.0, UTF-8 with IBM PUA - ibm-13497 { IBM } # Unicode 3.0, UTF-8 - ibm-17592 { IBM } # Unicode 4.0, UTF-8 with IBM PUA - ibm-17593 { IBM } # Unicode 4.0, UTF-8 - windows-65001 { WINDOWS* } - cp1208 - x-UTF_8J - unicode-1-1-utf-8 - unicode-2-0-utf-8 - -# The ICU 2.2 UTF-16/32 converters detect and write a BOM. 
-UTF-16 { IANA* MIME* JAVA* } ISO-10646-UCS-2 { IANA } - ibm-1204 { IBM* } # UTF-16 with IBM PUA and BOM sensitive - ibm-1205 { IBM } # UTF-16 BOM sensitive - unicode - csUnicode - ucs-2 -# The following Unicode CCSIDs (IBM) are not valid in ICU because they are -# considered pure DBCS (exactly 2 bytes) of Unicode, -# and they are a subset of Unicode. ICU does not support their encoding structures. -# 1400 1401 1402 1410 1414 1415 1446 1447 1448 1449 64770 64771 65520 5496 5497 5498 9592 13688 -UTF-16BE { IANA* MIME* JAVA* } x-utf-16be { JAVA } - UnicodeBigUnmarked { JAVA } # java.io name - ibm-1200 { IBM* } # UTF-16 BE with IBM PUA - ibm-1201 { IBM } # UTF-16 BE - ibm-13488 { IBM } # Unicode 2.0, UTF-16 BE with IBM PUA - ibm-13489 { IBM } # Unicode 2.0, UTF-16 BE - ibm-17584 { IBM } # Unicode 3.0, UTF-16 BE with IBM PUA - ibm-17585 { IBM } # Unicode 3.0, UTF-16 BE - ibm-21680 { IBM } # Unicode 4.0, UTF-16 BE with IBM PUA - ibm-21681 { IBM } # Unicode 4.0, UTF-16 BE - ibm-25776 { IBM } # Unicode 4.1, UTF-16 BE with IBM PUA - ibm-25777 { IBM } # Unicode 4.1, UTF-16 BE - ibm-29872 { IBM } # Unicode 5.0, UTF-16 BE with IBM PUA - ibm-29873 { IBM } # Unicode 5.0, UTF-16 BE - ibm-61955 { IBM } # UTF-16BE with Gaidai University (Japan) PUA - ibm-61956 { IBM } # UTF-16BE with Microsoft HKSCS-Big 5 PUA - windows-1201 { WINDOWS* } - cp1200 - cp1201 - UTF16_BigEndian - # ibm-5297 { IBM } # Unicode 2.0, UTF-16 (BE) (reserved, never used) - # iso-10646-ucs-2 { JAVA } # This is ambiguous - # ibm-61952 is not a valid CCSID because it's Unicode 1.1 - # ibm-61953 is not a valid CCSID because it's Unicode 1.0 -UTF-16LE { IANA* MIME* JAVA* } x-utf-16le { JAVA } - UnicodeLittleUnmarked { JAVA } # java.io name - ibm-1202 { IBM* } # UTF-16 LE with IBM PUA - ibm-1203 { IBM } # UTF-16 LE - ibm-13490 { IBM } # Unicode 2.0, UTF-16 LE with IBM PUA - ibm-13491 { IBM } # Unicode 2.0, UTF-16 LE - ibm-17586 { IBM } # Unicode 3.0, UTF-16 LE with IBM PUA - ibm-17587 { IBM } # Unicode 3.0, UTF-16 
LE - ibm-21682 { IBM } # Unicode 4.0, UTF-16 LE with IBM PUA - ibm-21683 { IBM } # Unicode 4.0, UTF-16 LE - ibm-25778 { IBM } # Unicode 4.1, UTF-16 LE with IBM PUA - ibm-25779 { IBM } # Unicode 4.1, UTF-16 LE - ibm-29874 { IBM } # Unicode 5.0, UTF-16 LE with IBM PUA - ibm-29875 { IBM } # Unicode 5.0, UTF-16 LE - UTF16_LittleEndian - windows-1200 { WINDOWS* } - -UTF-32 { IANA* MIME* } ISO-10646-UCS-4 { IANA } - ibm-1236 { IBM* } # UTF-32 with IBM PUA and BOM sensitive - ibm-1237 { IBM } # UTF-32 BOM sensitive - csUCS4 - ucs-4 -UTF-32BE { IANA* } UTF32_BigEndian - ibm-1232 { IBM* } # UTF-32 BE with IBM PUA - ibm-1233 { IBM } # UTF-32 BE - ibm-9424 { IBM } # Unicode 4.1, UTF-32 BE with IBM PUA -UTF-32LE { IANA* } UTF32_LittleEndian - ibm-1234 { IBM* } # UTF-32 LE, with IBM PUA - ibm-1235 { IBM } # UTF-32 LE - -# ICU-specific names for special uses -UTF16_PlatformEndian -UTF16_OppositeEndian - -UTF32_PlatformEndian -UTF32_OppositeEndian - - -# Java-specific, non-Unicode-standard UTF-16 variants. -# These are in the Java "Basic Encoding Set (contained in lib/rt.jar)". -# See the "Supported Encodings" at -# http://java.sun.com/javase/6/docs/technotes/guides/intl/encoding.doc.html -# or a newer version of this document. -# -# Aliases marked with { JAVA* } are canonical names for java.io and java.lang APIs. -# Aliases marked with { JAVA } are canonical names for the java.nio API. -# -# "BOM" means the Unicode Byte Order Mark, which is the encoding-scheme-specific -# byte sequence for U+FEFF. -# "Reverse BOM" means the BOM for the sibling encoding scheme with the -# opposite endianness. (LE<->BE) - -# "Sixteen-bit Unicode (or UCS) Transformation Format, big-endian byte order, -# with byte-order mark" -# -# From Unicode: Writes BOM. -# To Unicode: Detects and consumes BOM. -# If there is a "reverse BOM", Java throws -# MalformedInputException: Incorrect byte-order mark. 
-# In this case, ICU4C sets a U_ILLEGAL_ESCAPE_SEQUENCE UErrorCode value -# and a UCNV_ILLEGAL UConverterCallbackReason. -UTF-16BE,version=1 UnicodeBig { JAVA* } - -# "Sixteen-bit Unicode (or UCS) Transformation Format, little-endian byte order, -# with byte-order mark" -# -# From Unicode: Writes BOM. -# To Unicode: Detects and consumes BOM. -# If there is a "reverse BOM", Java throws -# MalformedInputException: Incorrect byte-order mark. -# In this case, ICU4C sets a U_ILLEGAL_ESCAPE_SEQUENCE UErrorCode value -# and a UCNV_ILLEGAL UConverterCallbackReason. -UTF-16LE,version=1 UnicodeLittle { JAVA* } x-UTF-16LE-BOM { JAVA } - -# This one is not mentioned on the "Supported Encodings" page -# but is available in Java. -# In Java, this is called "Unicode" but we cannot give it that alias -# because the standard UTF-16 converter already has a "unicode" alias. -# -# From Unicode: Writes BOM. -# To Unicode: Detects and consumes BOM. -# If there is no BOM, rather than defaulting to BE, Java throws -# MalformedInputException: Missing byte-order mark. -# In this case, ICU4C sets a U_ILLEGAL_ESCAPE_SEQUENCE UErrorCode value -# and a UCNV_ILLEGAL UConverterCallbackReason. -UTF-16,version=1 - -# This is the same as standard UTF-16 but always writes a big-endian byte stream, -# regardless of the platform endianness, as expected by the Java compatibility tests. -# See the java.nio.charset.Charset API documentation at -# http://java.sun.com/javase/6/docs/api/java/nio/charset/Charset.html -# or a newer version of this document. -# -# From Unicode: Write BE BOM and BE bytes -# To Unicode: Detects and consumes BOM. Defaults to BE. -UTF-16,version=2 - -# Note: ICU does not currently support Java-specific, non-Unicode-standard UTF-32 variants. -# Presumably, these behave analogously to the UTF-16 variants with similar names. -# UTF_32BE_BOM x-UTF-32BE-BOM -# UTF_32LE_BOM x-UTF-32LE-BOM - -# End of Java-specific, non-Unicode-standard UTF variants. 
- - -# On UTF-7: -# RFC 2152 (http://www.imc.org/rfc2152) allows encoding some US-ASCII -# characters directly or in base64. In particular, the characters in set O -# as defined in the RFC (!"#$%&*;<=>@[]^_`{|}) may be encoded directly -# but are not allowed in, e.g., email headers. -# By default, the ICU UTF-7 converter encodes set O directly. -# By choosing the option "version=1", set O will be escaped instead. -# For example: -# utf7Converter=ucnv_open("UTF-7,version=1"); -# -# For details about email headers see RFC 2047. -UTF-7 { IANA* MIME* WINDOWS } windows-65000 { WINDOWS* } - unicode-1-1-utf-7 - unicode-2-0-utf-7 - -# UTF-EBCDIC doesn't exist in ICU, but the aliases are here for reference. -#UTF-EBCDIC ibm-1210 { IBM* } ibm-1211 { IBM } - -# IMAP-mailbox-name is an ICU-specific name for the encoding of IMAP mailbox names. -# It is a substantially modified UTF-7 encoding. See the specification in: -# -# RFC 2060: INTERNET MESSAGE ACCESS PROTOCOL - VERSION 4rev1 -# (http://www.ietf.org/rfc/rfc2060.txt) -# Section 5.1.3. Mailbox International Naming Convention -IMAP-mailbox-name - -SCSU { IANA* } - ibm-1212 { IBM } # SCSU with IBM PUA - ibm-1213 { IBM* } # SCSU -BOCU-1 { IANA* } - csBOCU-1 { IANA } - ibm-1214 { IBM } # BOCU-1 with IBM PUA - ibm-1215 { IBM* } # BOCU-1 - -# See https://www.unicode.org/reports/tr26 for this Compatibility Encoding Scheme for UTF-16 -# The Unicode Consortium does not encourage the use of CESU-8 -CESU-8 { IANA* } ibm-9400 { IBM* } - -# Standard iso-8859-1, which does not have the Euro update. -# See iso-8859-15 (latin9) for the Euro update -ISO-8859-1 { MIME* IANA JAVA* } - ibm-819 { IBM* JAVA } # This is not truly ibm-819 because it's missing the fallbacks. 
- IBM819 { IANA } - cp819 { IANA JAVA } - latin1 { IANA JAVA } - 8859_1 { JAVA } - csISOLatin1 { IANA JAVA } - iso-ir-100 { IANA JAVA } - ISO_8859-1:1987 { IANA* JAVA } - l1 { IANA JAVA } - 819 { JAVA } - # windows-28591 { WINDOWS* } # This has odd behavior because it has the Euro update, which isn't correct. - # LATIN_1 # Old ICU name - # ANSI_X3.110-1983 # This is for a different IANA alias. This isn't iso-8859-1. - -US-ASCII { MIME* IANA JAVA WINDOWS } - ASCII { JAVA* IANA WINDOWS } - ANSI_X3.4-1968 { IANA* WINDOWS } - ANSI_X3.4-1986 { IANA WINDOWS } - ISO_646.irv:1991 { IANA WINDOWS } - iso_646.irv:1983 { JAVA } - ISO646-US { JAVA IANA WINDOWS } - us { IANA } - csASCII { IANA WINDOWS } - iso-ir-6 { IANA } - cp367 { IANA WINDOWS } - ascii7 { JAVA } - 646 { JAVA } - windows-20127 { WINDOWS* } - ibm-367 { IBM* } IBM367 { IANA WINDOWS } # This is not truly ibm-367 because it's missing the fallbacks. - -# GB 18030 is partly algorithmic, using the MBCS converter -gb18030 { IANA* } ibm-1392 { IBM* } windows-54936 { WINDOWS* } GB18030 { MIME* } - -# Table-based interchange codepages - -# Central Europe -ibm-912_P100-1995 { UTR22* } - ibm-912 { IBM* JAVA } - ISO-8859-2 { MIME* IANA JAVA* WINDOWS } - ISO_8859-2:1987 { IANA* WINDOWS JAVA } - latin2 { IANA WINDOWS JAVA } - csISOLatin2 { IANA WINDOWS JAVA } - iso-ir-101 { IANA WINDOWS JAVA } - l2 { IANA WINDOWS JAVA } - 8859_2 { JAVA } - cp912 { JAVA } - 912 { JAVA } - windows-28592 { WINDOWS* } - -# Maltese Esperanto -ibm-913_P100-2000 { UTR22* } - ibm-913 { IBM* JAVA } - ISO-8859-3 { MIME* IANA WINDOWS JAVA* } - ISO_8859-3:1988 { IANA* WINDOWS JAVA } - latin3 { IANA JAVA WINDOWS } - csISOLatin3 { IANA WINDOWS } - iso-ir-109 { IANA WINDOWS JAVA } - l3 { IANA WINDOWS JAVA } - 8859_3 { JAVA } - cp913 { JAVA } - 913 { JAVA } - windows-28593 { WINDOWS* } - -# Baltic -ibm-914_P100-1995 { UTR22* } - ibm-914 { IBM* JAVA } - ISO-8859-4 { MIME* IANA WINDOWS JAVA* } - latin4 { IANA WINDOWS JAVA } - csISOLatin4 { IANA WINDOWS JAVA } 
- iso-ir-110 { IANA WINDOWS JAVA } - ISO_8859-4:1988 { IANA* WINDOWS JAVA } - l4 { IANA WINDOWS JAVA } - 8859_4 { JAVA } - cp914 { JAVA } - 914 { JAVA } - windows-28594 { WINDOWS* } - -# Cyrillic -ibm-915_P100-1995 { UTR22* } - ibm-915 { IBM* JAVA } - ISO-8859-5 { MIME* IANA WINDOWS JAVA* } - cyrillic { IANA WINDOWS JAVA } - csISOLatinCyrillic { IANA WINDOWS JAVA } - iso-ir-144 { IANA WINDOWS JAVA } - ISO_8859-5:1988 { IANA* WINDOWS JAVA } - 8859_5 { JAVA } - cp915 { JAVA } - 915 { JAVA } - windows-28595 { WINDOWS* } - -glibc-PT154-2.3.3 { UTR22* } - PTCP154 { IANA* } - csPTCP154 - PT154 - CP154 - Cyrillic-Asian - -# Arabic -# ISO_8859-6-E and ISO_8859-6-I are similar to this charset, but BiDi is done differently -# From a narrow mapping point of view, there is no difference. -# -E means explicit. -I means implicit. -# -E requires the client to handle the ISO 6429 bidirectional controls -ibm-1089_P100-1995 { UTR22* } - ibm-1089 { IBM* JAVA } - ISO-8859-6 { MIME* IANA WINDOWS JAVA* } - arabic { IANA WINDOWS JAVA } - csISOLatinArabic { IANA WINDOWS JAVA } - iso-ir-127 { IANA WINDOWS JAVA } - ISO_8859-6:1987 { IANA* WINDOWS JAVA } - ECMA-114 { IANA JAVA } - ASMO-708 { IANA JAVA } - 8859_6 { JAVA } - cp1089 { JAVA } - 1089 { JAVA } - windows-28596 { WINDOWS* } - ISO-8859-6-I { IANA MIME } # IANA considers this alias different and BiDi needs to be applied. - ISO-8859-6-E { IANA MIME } # IANA considers this alias different and BiDi needs to be applied. - x-ISO-8859-6S { JAVA } - -# ISO Greek (with euro update). 
This is really ISO_8859-7:2003 -ibm-9005_X110-2007 { UTR22* } - ibm-9005 { IBM* } - ISO-8859-7 { MIME* IANA JAVA* WINDOWS } - 8859_7 { JAVA } - greek { IANA JAVA WINDOWS } - greek8 { IANA JAVA WINDOWS } - ELOT_928 { IANA JAVA WINDOWS } - ECMA-118 { IANA JAVA WINDOWS } - csISOLatinGreek { IANA JAVA WINDOWS } - iso-ir-126 { IANA JAVA WINDOWS } - ISO_8859-7:1987 { IANA* JAVA WINDOWS } - windows-28597 { WINDOWS* } - sun_eu_greek # For Solaris - -# ISO Greek (w/o euro update) -# JDK 1.5 has these aliases. -ibm-813_P100-1995 { UTR22* } - ibm-813 { IBM* JAVA* } - cp813 { JAVA } - 813 { JAVA } - -# hebrew -# ISO_8859-8-E and ISO_8859-8-I are similar to this charset, but BiDi is done differently -# From a narrow mapping point of view, there is no difference. -# -E means explicit. -I means implicit. -# -E requires the client to handle the ISO 6429 bidirectional controls -# This matches the official mapping on unicode.org -ibm-5012_P100-1999 { UTR22* } - ibm-5012 { IBM* } - ISO-8859-8 { MIME* IANA WINDOWS JAVA* } - hebrew { IANA WINDOWS JAVA } - csISOLatinHebrew { IANA WINDOWS JAVA } - iso-ir-138 { IANA WINDOWS JAVA } - ISO_8859-8:1988 { IANA* WINDOWS JAVA } - ISO-8859-8-I { IANA MIME } # IANA and Windows considers this alias different and BiDi needs to be applied. - ISO-8859-8-E { IANA MIME } # IANA and Windows considers this alias different and BiDi needs to be applied. - 8859_8 { JAVA } - windows-28598 { WINDOWS* } # Hebrew (ISO-Visual). A hybrid between ibm-5012 and ibm-916 with extra PUA mappings. 
- hebrew8 # Reflect HP-UX code page update - -# Unfortunately, the Java aliases are split across ibm-916 and ibm-5012 -# Also many platforms are a combination between ibm-916 and ibm-5012 behaviors -ibm-916_P100-1995 { UTR22* } - ibm-916 { IBM* JAVA* } - cp916 { JAVA } - 916 { JAVA } - -# Turkish -ibm-920_P100-1995 { UTR22* } - ibm-920 { IBM* JAVA } - ISO-8859-9 { MIME* IANA WINDOWS JAVA* } - latin5 { IANA WINDOWS JAVA } - csISOLatin5 { IANA JAVA } - iso-ir-148 { IANA WINDOWS JAVA } - ISO_8859-9:1989 { IANA* WINDOWS } - l5 { IANA WINDOWS JAVA } - 8859_9 { JAVA } - cp920 { JAVA } - 920 { JAVA } - windows-28599 { WINDOWS* } - ECMA-128 # IANA doesn't have this alias 6/24/2002 - turkish8 # Reflect HP-UX codepage update 8/1/2008 - turkish # Reflect HP-UX codepage update 8/1/2008 - -# Nordic languages -iso-8859_10-1998 { UTR22* } ISO-8859-10 { MIME* IANA* } - iso-ir-157 { IANA } - l6 { IANA } - ISO_8859-10:1992 { IANA } - csISOLatin6 { IANA } - latin6 { IANA } - -# Thai -# Be warned. There are several iso-8859-11 codepage variants, and they are all incompatible. -# ISO-8859-11 is a superset of TIS-620. The difference is that ISO-8859-11 contains the C1 control codes. -iso-8859_11-2001 { UTR22* } ISO-8859-11 - thai8 # HP-UX alias. HP-UX says TIS-620, but it's closer to ISO-8859-11. 
- x-iso-8859-11 { JAVA* } - -# iso-8859-13, PC Baltic (w/o euro update) -ibm-921_P100-1995 { UTR22* } - ibm-921 { IBM* } - ISO-8859-13 { IANA* MIME* JAVA* } - 8859_13 { JAVA } - windows-28603 { WINDOWS* } - cp921 - 921 - x-IBM921 { JAVA } - -# Celtic -iso-8859_14-1998 { UTR22* } ISO-8859-14 { IANA* } - iso-ir-199 { IANA } - ISO_8859-14:1998 { IANA } - latin8 { IANA } - iso-celtic { IANA } - l8 { IANA } - -# Latin 9 -ibm-923_P100-1998 { UTR22* } - ibm-923 { IBM* JAVA } - ISO-8859-15 { IANA* MIME* WINDOWS JAVA* } - Latin-9 { IANA WINDOWS } - l9 { WINDOWS } - 8859_15 { JAVA } - latin0 { JAVA } - csisolatin0 { JAVA } - csisolatin9 { JAVA } - iso8859_15_fdis { JAVA } - cp923 { JAVA } - 923 { JAVA } - windows-28605 { WINDOWS* } - -# CJK encodings - -ibm-942_P12A-1999 { UTR22* } # ibm-942_P120 is a rarely used alternate mapping (sjis78 is already old) - ibm-942 { IBM* } - ibm-932 { IBM } - cp932 - shift_jis78 - sjis78 - ibm-942_VSUB_VPUA - ibm-932_VSUB_VPUA - x-IBM942 { JAVA* } - x-IBM942C { JAVA } - # Is this "JIS_C6226-1978"? - -# ibm-943_P15A-2003 differs from windows-932-2000 only in a few roundtrip mappings: -# - the usual IBM PC control code rotation (1A-1C-7F) -# - the Windows table has roundtrips for bytes 80, A0, and FD-FF to U+0080 and PUA -ibm-943_P15A-2003 { UTR22* } - ibm-943 # Leave untagged because this isn't the default - Shift_JIS { IANA* MIME* WINDOWS JAVA } - MS_Kanji { IANA WINDOWS JAVA } - csShiftJIS { IANA WINDOWS JAVA } - windows-31j { IANA JAVA } # A further extension of Shift_JIS to include NEC special characters (Row 13) - csWindows31J { IANA WINDOWS JAVA } # A further extension of Shift_JIS to include NEC special characters (Row 13) - x-sjis { WINDOWS JAVA } - x-ms-cp932 { WINDOWS } - cp932 { WINDOWS } - windows-932 { WINDOWS* } - cp943c { JAVA* } # This is slightly different, but the backslash mapping is the same. 
- IBM-943C #{ AIX* } # Add this tag once AIX aliases becomes available - ms932 - pck # Probably SOLARIS - sjis # This might be for ibm-1351 - ibm-943_VSUB_VPUA - x-MS932_0213 { JAVA } - x-JISAutoDetect { JAVA } - # cp943 # This isn't Windows, and no one else uses it. - # IANA says that Windows-31J is an extension to csshiftjis ibm-932 -ibm-943_P130-1999 { UTR22* } - ibm-943 { IBM* JAVA } - Shift_JIS # Leave untagged because this isn't the default - cp943 { JAVA* } # This is slightly different, but the backslash mapping is the same. - 943 { JAVA } - ibm-943_VASCII_VSUB_VPUA - x-IBM943 { JAVA } - # japanese. Unicode name is \u30b7\u30d5\u30c8\u7b26\u53f7\u5316\u8868\u73fe -ibm-33722_P12A_P12A-2009_U2 { UTR22* } - ibm-33722 # Leave untagged because this isn't the default - ibm-5050 # Leave untagged because this isn't the default, and yes this alias is correct - ibm-33722_VPUA - IBM-eucJP -windows-51932-2006 { UTR22* } - windows-51932 { WINDOWS* } - CP51932 { IANA* } - csCP51932 -ibm-33722_P120-1999 { UTR22* } # Japan EUC with \ <-> Yen mapping - ibm-33722 { IBM* JAVA } - ibm-5050 { IBM } # Yes this is correct - cp33722 { JAVA* } - 33722 { JAVA } - ibm-33722_VASCII_VPUA - x-IBM33722 { JAVA } - x-IBM33722A { JAVA } - x-IBM33722C { JAVA } -# ibm-954 seems to be almost a superset of ibm-33722 and ibm-1350 -# ibm-1350 seems to be almost a superset of ibm-33722 -# ibm-954 contains more PUA characters than the others. -ibm-954_P101-2007 { UTR22* } - ibm-954 { IBM* } - x-IBM954 { JAVA* } - x-IBM954C { JAVA } - # eucJP # This is closest to Solaris EUC-JP. -euc-jp-2007 { UTR22* } - EUC-JP { MIME* IANA JAVA* WINDOWS* } - Extended_UNIX_Code_Packed_Format_for_Japanese { IANA* JAVA WINDOWS } - csEUCPkdFmtJapanese { IANA JAVA WINDOWS } - X-EUC-JP { MIME JAVA WINDOWS } # Japan EUC. x-euc-jp is a MIME name - eucjis {JAVA} - ujis # Linux sometimes uses this name. This is an unfortunate generic and rarely used name. Its use is discouraged. 
- -aix-IBM_udcJP-4.3.6 { UTR22* } - x-IBM-udcJP { JAVA* } - -java-euc_jp_linux-1.6_P { UTR22* } - euc-jp-linux - x-EUC_JP_LINUX { JAVA* } - -java-sjis_0213-1.6_P { UTR22* } - x-SJIS_0213 { JAVA* } - -# Here are various interpretations and extensions of Big5 -ibm-1373_P100-2002 { UTR22* } # IBM's interpretation of Windows' Taiwan Big-5 without HKSCS extensions - ibm-1373 { IBM* } - windows-950 # Alternate mapping. Leave untagged. This is the IBM interpretation of a Windows codepage. -windows-950-2000 { UTR22* } - Big5 { IANA* MIME* JAVA* WINDOWS } - csBig5 { IANA WINDOWS } - windows-950 { WINDOWS* } - x-windows-950 { JAVA } - x-big5 - ms950 -ibm-950_P110-1999 { UTR22* } # Taiwan Big-5 (w/o euro update) - ibm-950 { IBM* JAVA } - cp950 { JAVA* } - 950 { JAVA } - x-IBM950 { JAVA } -ibm-1375_P100-2008 { UTR22* } # Big5-HKSCS-2004 with Unicode 3.1 mappings. This uses supplementary characters. - ibm-1375 { IBM* } - Big5-HKSCS { IANA* JAVA* } - big5hk { JAVA } - HKSCS-BIG5 # From http://www.openi18n.org/localenameguide/ -ibm-5471_P100-2006 { UTR22* } # Big5-HKSCS-2001 with Unicode 3.0 mappings. This uses many PUA characters. - ibm-5471 { IBM* } - Big5-HKSCS - MS950_HKSCS { JAVA* } - hkbig5 # from HP-UX 11i, which can't handle supplementary characters. - big5-hkscs:unicode3.0 - x-MS950-HKSCS { JAVA } - # windows-950 # Windows-950 can be w/ or w/o HKSCS extensions. By default it's not. - # windows-950_hkscs -solaris-zh_TW_big5-2.7 { UTR22* } - Big5_Solaris { JAVA* } - x-Big5-Solaris { JAVA } -# GBK -ibm-1386_P100-2001 { UTR22* } - ibm-1386 { IBM* } - cp1386 - windows-936 # Alternate mapping. Leave untagged. This is the IBM interpretation of a Windows codepage. - ibm-1386_VSUB_VPUA -windows-936-2000 { UTR22* } - GBK { IANA* WINDOWS JAVA* } - CP936 { IANA JAVA } - MS936 { IANA } # In JDK 1.5, this goes to x-mswin-936. This is an IANA name split. - windows-936 { IANA WINDOWS* JAVA } - -# Java has two different tables for ibm-1383 and gb2312. We pick closest set for tagging. 
-ibm-1383_P110-1999 { UTR22* } # China EUC. - ibm-1383 { IBM* JAVA } - GB2312 { IANA* MIME* } - csGB2312 { IANA } - cp1383 { JAVA* } - 1383 { JAVA } - EUC-CN # According to other platforms, windows-20936 looks more like euc-cn. x-euc-cn is also a MIME name - ibm-eucCN - hp15CN # From HP-UX? - ibm-1383_VPUA - # gb # This is not an IANA name. gb in IANA means Great Britain. - -ibm-5478_P100-1995 { UTR22* } ibm-5478 { IBM* } # This gb_2312_80 DBCS mapping is needed by iso-2022. - GB_2312-80 { IANA* } # Windows maps this alias incorrectly - chinese { IANA } - iso-ir-58 { IANA } - csISO58GB231280 { IANA } - gb2312-1980 - GB2312.1980-0 # From X11R6 - -euc-tw-2014 { UTR22* } # Updated EUC-TW converter based on ibm-964 - EUC-TW - -ibm-964_P110-1999 { UTR22* } # Taiwan EUC. x-euc-tw is a MIME name - ibm-964 { IBM* JAVA } - ibm-eucTW - cns11643 - cp964 { JAVA* } - 964 { JAVA } - ibm-964_VPUA - x-IBM964 { JAVA } - -# ISO-2022 needs one, and other people may need others. -ibm-949_P110-1999 { UTR22* } - ibm-949 { IBM* JAVA } - cp949 { JAVA* } - 949 { JAVA } - ibm-949_VASCII_VSUB_VPUA - x-IBM949 { JAVA } -ibm-949_P11A-1999 { UTR22* } - ibm-949 # Leave untagged because this isn't the default - cp949c { JAVA* } - ibm-949_VSUB_VPUA - x-IBM949C { JAVA } - IBM-949C { JAVA } - -# Korean EUC. -# -# -# EUC-KR = KS X 1003/ISO 646-KR or ISO 646-IRV/US-ASCII in GL and KS X 1001:1998 (formerly KS C 5601-1987) in GR. -# -# Although widely spread on MS Windows, using -# KS C 5601 or related names to denote EUC-KR or -# windows-949 is very much misleading. KS C 5601-1987 -# is NOT suitable as a designation for MIME charset -# and MBCS. It's just the name of a 94 x 94 Korean -# coded character set standard which can be invoked -# on either GL (with MSB reset) or GR (with MSB set). -# Note that JOHAB (windows-1361) specified in -# KS X 1001:1998 annex 3 (KS C 5601-1992 annex 3) -# is a _seprate_ MBCS with a _completely different_ -# mapping. 
-# -# -# The following aliases try to mirror the poor state of alias recognition -# on these platforms. -# -# ibm-970 is almost a subset of ibm-1363. -# Java, Solaris and AIX use euc-kr to also mean ksc5601. -# Java has both ibm-970 and EUC-KR as separate converters. -ibm-970_P110_P110-2006_U2 { UTR22* } - ibm-970 { IBM* JAVA } - EUC-KR { IANA* MIME* WINDOWS JAVA } - KS_C_5601-1987 { JAVA } - windows-51949 { WINDOWS* } - csEUCKR { IANA WINDOWS } # x-euc-kr is also a MIME name - ibm-eucKR { JAVA } - KSC_5601 { JAVA } # Needed by iso-2022 - 5601 { JAVA } - cp970 { JAVA* } - 970 { JAVA } - ibm-970_VPUA - x-IBM970 { JAVA } - -# ibm-971 is almost the set of DBCS mappings of ibm-970 -ibm-971_P100-1995 ibm-971 { IBM* } ibm-971_VPUA x-IBM971 { JAVA* } - -# Java, Solaris and AIX use euc-kr to also mean ksc5601, and _sometimes_ for Windows too. -# ibm-1363 is almost a superset of ibm-970. -ibm-1363_P11B-1998 { UTR22* } - ibm-1363 # Leave untagged because this isn't the default - KS_C_5601-1987 { IANA* } - KS_C_5601-1989 { IANA } - KSC_5601 { IANA } - csKSC56011987 { IANA } - korean { IANA } - iso-ir-149 { IANA } - cp1363 { MIME* } - 5601 - ksc - windows-949 # Alternate mapping. Leave untagged. This is the IBM interpretation of a Windows codepage. 
- ibm-1363_VSUB_VPUA - x-IBM1363C { JAVA* } - # ks_x_1001:1992 - # ksc5601-1992 - -ibm-1363_P110-1997 { UTR22* } # Korean KSC MBCS with \ <-> Won mapping - ibm-1363 { IBM* } - ibm-1363_VASCII_VSUB_VPUA - x-IBM1363 { JAVA* } - -windows-949-2000 { UTR22* } - windows-949 { JAVA* WINDOWS* } - KS_C_5601-1987 { WINDOWS } - KS_C_5601-1989 { WINDOWS } - KSC_5601 { MIME* WINDOWS } # Needed by iso-2022 - csKSC56011987 { WINDOWS } - korean { WINDOWS } - iso-ir-149 { WINDOWS } - ms949 { JAVA } - x-KSC5601 { JAVA } - -windows-1361-2000 { UTR22* } - ksc5601_1992 - ms1361 - johab - x-Johab { JAVA* } - -windows-874-2000 { UTR22* } # Thai (w/ euro update) - TIS-620 { WINDOWS } - windows-874 { JAVA* WINDOWS* } - MS874 { JAVA } - x-windows-874 { JAVA } - # iso-8859-11 { WINDOWS } # iso-8859-11 is similar to TIS-620. ibm-13162 is a closer match. - -ibm-874_P100-1995 { UTR22* } # Thai PC (w/o euro update). - ibm-874 { IBM* JAVA } - ibm-9066 { IBM } # Yes ibm-874 == ibm-9066. ibm-1161 has the euro update. - cp874 { JAVA* } - TIS-620 { IANA* JAVA } # This is actually separate from ibm-874, which is similar to this table - tis620.2533 { JAVA } # This is actually separate from ibm-874, which is similar to this table - eucTH # eucTH is an unusual alias from Solaris. eucTH has fewer mappings than TIS620 - x-IBM874 { JAVA } - -ibm-1162_P100-1999 { UTR22* } # Thai (w/ euro update) - ibm-1162 { IBM* } - -windows-864-2000 { UTR22* } - ibm-864s - cp864s - x-IBM864S { JAVA* } - -# Platform codepages -# If Java supports the IBM prefix, it should also support the ibm- prefix too. 
-ibm-437_P100-1995 { UTR22* } ibm-437 { IBM* } IBM437 { IANA* WINDOWS JAVA } cp437 { IANA WINDOWS JAVA* } 437 { IANA WINDOWS JAVA } csPC8CodePage437 { IANA JAVA } windows-437 { WINDOWS* } # PC US -ibm-720_P100-1997 { UTR22* } ibm-720 { IBM* } windows-720 { WINDOWS* } DOS-720 { WINDOWS } x-IBM720 { JAVA* } # PC Arabic -ibm-737_P100-1997 { UTR22* } ibm-737 { IBM* } IBM737 { WINDOWS JAVA } cp737 { JAVA* } windows-737 { WINDOWS* } 737 { JAVA } x-IBM737 { JAVA } # PC Greek -ibm-775_P100-1996 { UTR22* } ibm-775 { IBM* } IBM775 { IANA* WINDOWS JAVA } cp775 { IANA WINDOWS JAVA* } csPC775Baltic { IANA } windows-775 { WINDOWS* } 775 { JAVA } # PC Baltic -ibm-850_P100-1995 { UTR22* } ibm-850 { IBM* } IBM850 { IANA* MIME* WINDOWS JAVA } cp850 { IANA MIME WINDOWS JAVA* } 850 { IANA JAVA } csPC850Multilingual { IANA JAVA } windows-850 { WINDOWS* } # PC latin1 -ibm-851_P100-1995 { UTR22* } ibm-851 { IBM* } IBM851 { IANA* } cp851 { IANA MIME* } 851 { IANA } csPC851 { IANA } # PC DOS Greek (w/o euro) -ibm-852_P100-1995 { UTR22* } ibm-852 { IBM* } IBM852 { IANA* WINDOWS JAVA } cp852 { IANA WINDOWS JAVA* } 852 { IANA WINDOWS JAVA } csPCp852 { IANA JAVA } windows-852 { WINDOWS* } # PC latin2 (w/o euro update) -ibm-855_P100-1995 { UTR22* } ibm-855 { IBM* } IBM855 { IANA* JAVA } cp855 { IANA JAVA* } 855 { IANA } csIBM855 { IANA } csPCp855 { JAVA } windows-855 { WINDOWS* } # PC cyrillic (w/o euro update) -ibm-856_P100-1995 { UTR22* } ibm-856 { IBM* } IBM856 { JAVA } cp856 { JAVA* } 856 { JAVA } x-IBM856 { JAVA } # PC Hebrew implicit order -ibm-857_P100-1995 { UTR22* } ibm-857 { IBM* } IBM857 { IANA* MIME* WINDOWS JAVA } cp857 { IANA MIME JAVA* } 857 { IANA JAVA } csIBM857 { IANA JAVA } windows-857 { WINDOWS* } # PC Latin 5 (w/o euro update) -ibm-858_P100-1997 { UTR22* } ibm-858 { IBM* } IBM00858 { IANA* MIME* JAVA } CCSID00858 { IANA JAVA } CP00858 { IANA JAVA } PC-Multilingual-850+euro { IANA } cp858 { MIME JAVA* } windows-858 { WINDOWS* } # PC latin1 with Euro -ibm-860_P100-1995 { 
UTR22* } ibm-860 { IBM* } IBM860 { IANA* MIME* JAVA } cp860 { IANA MIME JAVA* } 860 { IANA JAVA } csIBM860 { IANA JAVA } # PC Portugal -ibm-861_P100-1995 { UTR22* } ibm-861 { IBM* } IBM861 { IANA* MIME* WINDOWS JAVA } cp861 { IANA MIME JAVA* } 861 { IANA JAVA } cp-is { IANA JAVA } csIBM861 { IANA JAVA } windows-861 { WINDOWS* } # PC Iceland -ibm-862_P100-1995 { UTR22* } ibm-862 { IBM* } IBM862 { IANA* MIME* JAVA } cp862 { IANA MIME JAVA* } 862 { IANA JAVA } csPC862LatinHebrew { IANA JAVA } DOS-862 { WINDOWS } windows-862 { WINDOWS* } # PC Hebrew visual order (w/o euro update) -ibm-863_P100-1995 { UTR22* } ibm-863 { IBM* } IBM863 { IANA* MIME* JAVA } cp863 { IANA MIME JAVA* } 863 { IANA JAVA } csIBM863 { IANA JAVA } # PC Canadian French -ibm-864_X110-1999 { UTR22* } ibm-864 { IBM* } IBM864 { IANA* MIME* JAVA } cp864 { IANA MIME JAVA* } csIBM864 { IANA JAVA } # PC Arabic (w/o euro update) -ibm-865_P100-1995 { UTR22* } ibm-865 { IBM* } IBM865 { IANA* MIME* JAVA } cp865 { IANA MIME JAVA* } 865 { IANA JAVA } csIBM865 { IANA JAVA } # PC Nordic -ibm-866_P100-1995 { UTR22* } ibm-866 { IBM* } IBM866 { IANA* MIME* JAVA } cp866 { IANA MIME WINDOWS JAVA* } 866 { IANA JAVA } csIBM866 { IANA JAVA } windows-866 { WINDOWS* } # PC Russian (w/o euro update) -ibm-867_P100-1998 { UTR22* } ibm-867 { IBM* } x-IBM867 { JAVA* } # PC Hebrew (w/ euro update) Updated version of ibm-862 -ibm-868_P100-1995 { UTR22* } ibm-868 { IBM* } IBM868 { IANA* MIME* JAVA } CP868 { IANA MIME JAVA* } 868 { JAVA } csIBM868 { IANA } cp-ar { IANA } # PC Urdu -ibm-869_P100-1995 { UTR22* } ibm-869 { IBM* } IBM869 { IANA* MIME* WINDOWS JAVA } cp869 { IANA MIME JAVA* } 869 { IANA JAVA } cp-gr { IANA JAVA } csIBM869 { IANA JAVA } windows-869 { WINDOWS* } # PC Greek (w/o euro update) -ibm-878_P100-1996 { UTR22* } ibm-878 { IBM* } KOI8-R { IANA* MIME* WINDOWS JAVA* } koi8 { WINDOWS JAVA } csKOI8R { IANA WINDOWS JAVA } windows-20866 { WINDOWS* } cp878 # Russian internet -ibm-901_P100-1999 { UTR22* } ibm-901 { IBM* } # 
PC Baltic (w/ euro update), update of ibm-921 -ibm-902_P100-1999 { UTR22* } ibm-902 { IBM* } # PC Estonian (w/ euro update), update of ibm-922 -ibm-922_P100-1999 { UTR22* } ibm-922 { IBM* } IBM922 { JAVA } cp922 { JAVA* } 922 { JAVA } x-IBM922 { JAVA } # PC Estonian (w/o euro update) -ibm-1168_P100-2002 { UTR22* } ibm-1168 { IBM* } KOI8-U { IANA* WINDOWS } windows-21866 { WINDOWS* } # Ukrainian KOI8. koi8-ru != KOI8-U and Microsoft is wrong for aliasing them as the same. -ibm-4909_P100-1999 { UTR22* } ibm-4909 { IBM* } # ISO Greek (w/ euro update), update of ibm-813 - -# The cp aliases in this section aren't really windows aliases, but it was used by ICU for Windows. -# cp is usually used to denote IBM in Java, and that is why we don't do that anymore. -# The windows-* aliases mean windows codepages. -ibm-5346_P100-1998 { UTR22* } ibm-5346 { IBM* } windows-1250 { IANA* JAVA* WINDOWS* } cp1250 { WINDOWS JAVA } # Windows Latin2 (w/ euro update) -ibm-5347_P100-1998 { UTR22* } ibm-5347 { IBM* } windows-1251 { IANA* JAVA* WINDOWS* } cp1251 { WINDOWS JAVA } ANSI1251 # Windows Cyrillic (w/ euro update). 
ANSI1251 is from Solaris -ibm-5348_P100-1997 { UTR22* } ibm-5348 { IBM* } windows-1252 { IANA* JAVA* WINDOWS* } cp1252 { JAVA } # Windows Latin1 (w/ euro update) -ibm-5349_P100-1998 { UTR22* } ibm-5349 { IBM* } windows-1253 { IANA* JAVA* WINDOWS* } cp1253 { JAVA } # Windows Greek (w/ euro update) -ibm-5350_P100-1998 { UTR22* } ibm-5350 { IBM* } windows-1254 { IANA* JAVA* WINDOWS* } cp1254 { JAVA } # Windows Turkish (w/ euro update) -ibm-9447_P100-2002 { UTR22* } ibm-9447 { IBM* } windows-1255 { IANA* JAVA* WINDOWS* } cp1255 { JAVA } # Windows Hebrew (w/ euro update) -ibm-9448_X100-2005 { UTR22* } ibm-9448 { IBM* } windows-1256 { IANA* JAVA* WINDOWS* } cp1256 { WINDOWS JAVA } x-windows-1256S { JAVA } # Windows Arabic (w/ euro update) -ibm-9449_P100-2002 { UTR22* } ibm-9449 { IBM* } windows-1257 { IANA* JAVA* WINDOWS* } cp1257 { JAVA } # Windows Baltic (w/ euro update) -ibm-5354_P100-1998 { UTR22* } ibm-5354 { IBM* } windows-1258 { IANA* JAVA* WINDOWS* } cp1258 { JAVA } # Windows Vietnamese (w/ euro update) - -# These tables are out of date, and most don't have the Euro -# Leave the windows- variants untagged. They are alternate tables of the newer ones above. 
-ibm-1250_P100-1995 { UTR22* } ibm-1250 { IBM* } windows-1250 # Old Windows Latin2 (w/o euro update) -ibm-1251_P100-1995 { UTR22* } ibm-1251 { IBM* } windows-1251 # Old Windows Cyrillic (w/o euro update) -ibm-1252_P100-2000 { UTR22* } ibm-1252 { IBM* } windows-1252 # Old Windows Latin 1 without Euro -ibm-1253_P100-1995 { UTR22* } ibm-1253 { IBM* } windows-1253 # Old Windows Greek (w/o euro update) -ibm-1254_P100-1995 { UTR22* } ibm-1254 { IBM* } windows-1254 # Old Windows Turkish (w/o euro update) -ibm-1255_P100-1995 { UTR22* } ibm-1255 { IBM* } # Very old Windows Hebrew (w/o euro update) -ibm-5351_P100-1998 { UTR22* } ibm-5351 { IBM* } windows-1255 # Old Windows Hebrew (w/ euro update) -ibm-1256_P110-1997 { UTR22* } ibm-1256 { IBM* } # Old Windows Arabic (w/o euro update) -ibm-5352_P100-1998 { UTR22* } ibm-5352 { IBM* } windows-1256 # Somewhat old Windows Arabic (w/ euro update) -ibm-1257_P100-1995 { UTR22* } ibm-1257 { IBM* } # Old Windows Baltic (w/o euro update) -ibm-5353_P100-1998 { UTR22* } ibm-5353 { IBM* } windows-1257 # Somewhat old Windows Baltic (w/ euro update) -ibm-1258_P100-1997 { UTR22* } ibm-1258 { IBM* } windows-1258 # Old Windows Vietnamese (w/o euro update) - -macos-0_2-10.2 { UTR22* } macintosh { IANA* MIME* WINDOWS } mac { IANA } csMacintosh { IANA } windows-10000 { WINDOWS* } macroman { JAVA } x-macroman { JAVA* } # Apple latin 1 -macos-6_2-10.4 { UTR22* } x-mac-greek { MIME* WINDOWS } windows-10006 { WINDOWS* } macgr x-MacGreek { JAVA* } # Apple Greek -macos-7_3-10.2 { UTR22* } x-mac-cyrillic { MIME* WINDOWS } windows-10007 { WINDOWS* } mac-cyrillic maccy x-MacCyrillic { JAVA } x-MacUkraine { JAVA* } # Apple Cyrillic -macos-21-10.5 { UTR22* } x-mac-thai { MIME* } x-MacThai { JAVA* } MacThai { JAVA } -macos-29-10.2 { UTR22* } x-mac-centraleurroman { MIME* } windows-10029 { WINDOWS* } x-mac-ce { WINDOWS } macce maccentraleurope x-MacCentralEurope { JAVA* } # Apple Central Europe -macos-33-10.5 { UTR22* } x-mac-symbol { MIME* } x-MacSymbol { 
JAVA* } MacSymbol { JAVA } -macos-34-10.2 { UTR22* } x-mac-dingbat { MIME* } x-MacDingbat { JAVA* } MacDingbat { JAVA } -macos-35-10.2 { UTR22* } x-mac-turkish { MIME* WINDOWS } windows-10081 { WINDOWS* } mactr x-MacTurkish { JAVA* } # Apple Turkish -macos-36_2-10.2 { UTR22* } x-mac-croatian { MIME* } x-MacCroatian { JAVA* } MacCroatian { JAVA } -macos-37_5-10.2 { UTR22* } x-mac-iceland { MIME* } x-MacIceland { JAVA* } MacIceland { JAVA } -macos-38_2-10.2 { UTR22* } x-mac-romania { MIME* } x-MacRomania { JAVA* } MacRomania { JAVA } -macos-518-10.2 { UTR22* } x-mac-arabic { MIME* } x-MacArabic { JAVA* } MacArabic { JAVA } -macos-1285-10.2 { UTR22* } x-mac-hebrew { MIME* } x-MacHebrew { JAVA* } MacHebrew { JAVA } - -ibm-1051_P100-1995 { UTR22* } ibm-1051 { IBM* } hp-roman8 { IANA* } roman8 { IANA } r8 { IANA } csHPRoman8 { IANA } x-roman8 { JAVA* } # HP Latin1 -ibm-1276_P100-1995 { UTR22* } ibm-1276 { IBM* } Adobe-Standard-Encoding { IANA* } csAdobeStandardEncoding { IANA } # Different from ISO-Unicode-IBM-1276 (GCSGID: 1276) - -ibm-1006_P100-1995 { UTR22* } ibm-1006 { IBM* } IBM1006 { JAVA } cp1006 { JAVA* } 1006 { JAVA } x-IBM1006 { JAVA } # Urdu -ibm-1098_P100-1995 { UTR22* } ibm-1098 { IBM* } IBM1098 { JAVA } cp1098 { JAVA* } 1098 { JAVA } x-IBM1098 { JAVA } # PC Farsi -ibm-1124_P100-1996 { UTR22* } ibm-1124 { IBM* JAVA } cp1124 { JAVA* } 1124 { JAVA } x-IBM1124 { JAVA } # ISO Cyrillic Ukraine -ibm-1125_P100-1997 { UTR22* } ibm-1125 { IBM* } cp1125 # Cyrillic Ukraine PC -ibm-1129_P100-1997 { UTR22* } ibm-1129 { IBM* } # ISO Vietnamese -ibm-1131_P100-1997 { UTR22* } ibm-1131 { IBM* } cp1131 # Cyrillic Belarus PC -ibm-1133_P100-1997 { UTR22* } ibm-1133 { IBM* } # ISO Lao - -# GSM 03.38 -gsm-03.38-2009 { UTR22* } GSM0338 # GSM0338 alias is from Perl - -# Partially algorithmic converters - -# [U_ENABLE_GENERIC_ISO_2022] -# The _generic_ ISO-2022 converter is disabled starting 2003-dec-03 (ICU 2.8). 
-# For details see the icu mailing list from 2003-dec-01 and the ucnv2022.c file. -# Language-specific variants of ISO-2022 continue to be available as listed below. -# ISO_2022 ISO-2022 - -ISO_2022,locale=ja,version=0 ISO-2022-JP { IANA* MIME* JAVA* } csISO2022JP { IANA JAVA } x-windows-iso2022jp { JAVA } x-windows-50220 { JAVA } -ISO_2022,locale=ja,version=1 ISO-2022-JP-1 { MIME* } JIS_Encoding { IANA* } csJISEncoding { IANA } ibm-5054 { IBM* } JIS x-windows-50221 { JAVA* } -ISO_2022,locale=ja,version=2 ISO-2022-JP-2 { IANA* MIME* } csISO2022JP2 { IANA } -ISO_2022,locale=ja,version=3 JIS7 -ISO_2022,locale=ja,version=4 JIS8 -ISO_2022,locale=ko,version=0 ISO-2022-KR { IANA* MIME* JAVA* } csISO2022KR { IANA JAVA } # This uses ibm-949 -ISO_2022,locale=ko,version=1 ibm-25546 { IBM* } -ISO_2022,locale=zh,version=0 ISO-2022-CN { IANA* JAVA* } csISO2022CN { JAVA } x-ISO-2022-CN-GB { JAVA } -ISO_2022,locale=zh,version=1 ISO-2022-CN-EXT { IANA* } -ISO_2022,locale=zh,version=2 ISO-2022-CN-CNS x-ISO-2022-CN-CNS { JAVA* } -HZ HZ-GB-2312 { IANA* } -x11-compound-text COMPOUND_TEXT x-compound-text { JAVA* } - -ISCII,version=0 x-ISCII91 { JAVA* } x-iscii-de { WINDOWS } windows-57002 { WINDOWS* } iscii-dev ibm-4902 { IBM* } # ibm-806 contains non-standard box drawing symbols. -ISCII,version=1 x-iscii-be { WINDOWS } windows-57003 { WINDOWS* } iscii-bng windows-57006 { WINDOWS } x-iscii-as { WINDOWS } # be is different from as on Windows. 
-ISCII,version=2 x-iscii-pa { WINDOWS } windows-57011 { WINDOWS* } iscii-gur -ISCII,version=3 x-iscii-gu { WINDOWS } windows-57010 { WINDOWS* } iscii-guj -ISCII,version=4 x-iscii-or { WINDOWS } windows-57007 { WINDOWS* } iscii-ori -ISCII,version=5 x-iscii-ta { WINDOWS } windows-57004 { WINDOWS* } iscii-tml -ISCII,version=6 x-iscii-te { WINDOWS } windows-57005 { WINDOWS* } iscii-tlg -ISCII,version=7 x-iscii-ka { WINDOWS } windows-57008 { WINDOWS* } iscii-knd -ISCII,version=8 x-iscii-ma { WINDOWS } windows-57009 { WINDOWS* } iscii-mlm - -# Lotus specific -LMBCS-1 lmbcs ibm-65025 { IBM* } - -# These Lotus specific converters still work, but they aren't advertised in this alias table. -# These are almost never used outside of Lotus software, -# and they take a lot of time when creating the available converter list. -# Also Lotus doesn't really use them anyway. It was a mistake to create these LMBCS variant converters in ICU. -#LMBCS-2 -#LMBCS-3 -#LMBCS-4 -#LMBCS-5 -#LMBCS-6 -#LMBCS-8 -#LMBCS-11 -#LMBCS-16 -#LMBCS-17 -#LMBCS-18 -#LMBCS-19 - -# EBCDIC codepages according to the CDRA - -# without Euro -ibm-37_P100-1995 { UTR22* } # EBCDIC US - ibm-37 { IBM* } - IBM037 { IANA* JAVA } - ibm-037 # { JAVA } - ebcdic-cp-us { IANA JAVA } - ebcdic-cp-ca { IANA JAVA } - ebcdic-cp-wt { IANA JAVA } - ebcdic-cp-nl { IANA JAVA } - csIBM037 { IANA JAVA } - cp037 { JAVA* } - 037 { JAVA } - cpibm37 { JAVA } - cp37 - -ibm-273_P100-1995 { UTR22* } ibm-273 { IBM* } IBM273 { IANA* JAVA } CP273 { IANA JAVA* } csIBM273 { IANA } ebcdic-de 273 { JAVA } # EBCDIC Germanay, Austria -ibm-277_P100-1995 { UTR22* } ibm-277 { IBM* } IBM277 { IANA* JAVA } cp277 { JAVA* } EBCDIC-CP-DK { IANA } EBCDIC-CP-NO { IANA } csIBM277 { IANA } ebcdic-dk 277 { JAVA } # EBCDIC Denmark -ibm-278_P100-1995 { UTR22* } ibm-278 { IBM* } IBM278 { IANA* JAVA } cp278 { JAVA* } ebcdic-cp-fi { IANA } ebcdic-cp-se { IANA } csIBM278 { IANA } ebcdic-sv { JAVA } 278 { JAVA } # EBCDIC Sweden -ibm-280_P100-1995 { UTR22* } ibm-280 { 
IBM* } IBM280 { IANA* JAVA } CP280 { IANA JAVA* } ebcdic-cp-it { IANA } csIBM280 { IANA } 280 { JAVA } # EBCDIC Italy -ibm-284_P100-1995 { UTR22* } ibm-284 { IBM* } IBM284 { IANA* JAVA } CP284 { IANA JAVA* } ebcdic-cp-es { IANA } csIBM284 { IANA } cpibm284 { JAVA } 284 { JAVA } # EBCDIC Spain -ibm-285_P100-1995 { UTR22* } ibm-285 { IBM* } IBM285 { IANA* JAVA } CP285 { IANA JAVA* } ebcdic-cp-gb { IANA } csIBM285 { IANA } cpibm285 { JAVA } ebcdic-gb { JAVA } 285 { JAVA } # EBCDIC UK Ireland -ibm-290_P100-1995 { UTR22* } ibm-290 { IBM* } IBM290 { IANA* } cp290 { IANA } EBCDIC-JP-kana { IANA } csIBM290 { IANA } # host SBCS (Katakana) -ibm-297_P100-1995 { UTR22* } ibm-297 { IBM* } IBM297 { IANA* JAVA } cp297 { IANA JAVA* } ebcdic-cp-fr { IANA } csIBM297 { IANA } cpibm297 { JAVA } 297 { JAVA } # EBCDIC France -ibm-420_X120-1999 { UTR22* } ibm-420 { IBM* } IBM420 { IANA* JAVA } cp420 { IANA JAVA* } ebcdic-cp-ar1 { IANA } csIBM420 { IANA } 420 { JAVA } # EBCDIC Arabic (all presentation shapes) -ibm-424_P100-1995 { UTR22* } ibm-424 { IBM* } IBM424 { IANA* JAVA } cp424 { IANA JAVA* } ebcdic-cp-he { IANA } csIBM424 { IANA } 424 { JAVA } # EBCDIC Hebrew -ibm-500_P100-1995 { UTR22* } ibm-500 { IBM* } IBM500 { IANA* JAVA } CP500 { IANA JAVA* } ebcdic-cp-be { IANA } csIBM500 { IANA } ebcdic-cp-ch { IANA } 500 # EBCDIC International Latin1 -ibm-803_P100-1999 { UTR22* } ibm-803 { IBM* } cp803 # Old EBCDIC Hebrew -ibm-838_P100-1995 { UTR22* } ibm-838 { IBM* } IBM838 { JAVA } IBM-Thai { IANA* JAVA } csIBMThai { IANA } cp838 { JAVA* } 838 { JAVA } ibm-9030 { IBM } # EBCDIC Thai. Yes ibm-9030 is an alias. 
-ibm-870_P100-1995 { UTR22* } ibm-870 { IBM* } IBM870 { IANA* JAVA } CP870 { IANA JAVA* } ebcdic-cp-roece { IANA } ebcdic-cp-yu { IANA } csIBM870 { IANA } # EBCDIC Latin 2 -ibm-871_P100-1995 { UTR22* } ibm-871 { IBM* } IBM871 { IANA* JAVA } ebcdic-cp-is { IANA JAVA } csIBM871 { IANA JAVA } CP871 { IANA JAVA* } ebcdic-is { JAVA } 871 { JAVA } # EBCDIC Iceland -ibm-875_P100-1995 { UTR22* } ibm-875 { IBM* } IBM875 { JAVA } cp875 { JAVA* } 875 { JAVA } x-IBM875 { JAVA } # EBCDIC Greek -ibm-918_P100-1995 { UTR22* } ibm-918 { IBM* } IBM918 { IANA* JAVA } CP918 { IANA JAVA* } ebcdic-cp-ar2 { IANA } csIBM918 { IANA } # EBCDIC Urdu -ibm-930_P120-1999 { UTR22* } # EBCDIC_STATEFUL Katakana-Kanji Host Mixed. - ibm-930 { IBM* } - ibm-5026 { IBM } # Yes this is correct - IBM930 { JAVA } - cp930 { JAVA* } - 930 { JAVA } - x-IBM930 { JAVA } - x-IBM930A { JAVA } -ibm-933_P110-1995 { UTR22* } ibm-933 { IBM* JAVA } cp933 { JAVA* } 933 { JAVA } x-IBM933 { JAVA } # Korea EBCDIC MIXED -ibm-935_P110-1999 { UTR22* } ibm-935 { IBM* JAVA } cp935 { JAVA* } 935 { JAVA } x-IBM935 { JAVA } # China EBCDIC MIXED. Need to use Unicode, ibm-1388 or gb18030 instead because it is required by the government of China. -ibm-937_P110-1999 { UTR22* } ibm-937 { IBM* JAVA } cp937 { JAVA* } 937 { JAVA } x-IBM937 { JAVA } # Taiwan EBCDIC MIXED -ibm-939_P120-1999 { UTR22* } # EBCDIC_STATEFUL Latin-Kanji Host Mixed. 
- ibm-939 { IBM* } - ibm-931 { IBM } # Yes this is correct - ibm-5035 { IBM } # Yes this is also correct - IBM939 { JAVA } - cp939 { JAVA* } - 939 { JAVA } - x-IBM939 { JAVA } - x-IBM939A { JAVA } -ibm-1025_P100-1995 { UTR22* } ibm-1025 { IBM* JAVA } cp1025 { JAVA* } 1025 { JAVA } x-IBM1025 { JAVA } # EBCDIC Cyrillic -ibm-1026_P100-1995 { UTR22* } ibm-1026 { IBM* } IBM1026 { IANA* JAVA } CP1026 { IANA JAVA* } csIBM1026 { IANA } 1026 { JAVA } # EBCDIC Turkey -ibm-1047_P100-1995 { UTR22* } ibm-1047 { IBM* } IBM1047 { IANA* JAVA } cp1047 { JAVA* } 1047 { JAVA } # EBCDIC Open systems Latin1 -ibm-1097_P100-1995 { UTR22* } ibm-1097 { IBM* JAVA } cp1097 { JAVA* } 1097 { JAVA } x-IBM1097 { JAVA } # EBCDIC Farsi -ibm-1112_P100-1995 { UTR22* } ibm-1112 { IBM* JAVA } cp1112 { JAVA* } 1112 { JAVA } x-IBM1112 { JAVA } # EBCDIC Baltic -ibm-1114_P100-2001 { UTR22* } ibm-1114 { IBM* } x-IBM1114 { JAVA* } -ibm-1115_P100-1995 { UTR22* } ibm-1115 { IBM* } x-IBM1115 { JAVA* } -ibm-1122_P100-1999 { UTR22* } ibm-1122 { IBM* JAVA } cp1122 { JAVA* } 1122 { JAVA } x-IBM1122 { JAVA } # EBCDIC Estonia -ibm-1123_P100-1995 { UTR22* } ibm-1123 { IBM* JAVA } cp1123 { JAVA* } 1123 { JAVA } x-IBM1123 { JAVA } # EBCDIC Cyrillic Ukraine -ibm-1130_P100-1997 { UTR22* } ibm-1130 { IBM* } # EBCDIC Vietnamese -ibm-1132_P100-1998 { UTR22* } ibm-1132 { IBM* } # EBCDIC Lao -ibm-1137_P100-1999 { UTR22* } ibm-1137 { IBM* } # Devanagari EBCDIC (based on Unicode character set) -ibm-4517_P100-2005 { UTR22* } ibm-4517 { IBM* } # EBCDIC Arabic. 
Update of ibm-421 - -# with Euro -ibm-1140_P100-1997 { UTR22* } ibm-1140 { IBM* } IBM01140 { IANA* JAVA } CCSID01140 { IANA JAVA } CP01140 { IANA JAVA } cp1140 { JAVA* } ebcdic-us-37+euro { IANA } # EBCDIC US -ibm-1141_P100-1997 { UTR22* } ibm-1141 { IBM* } IBM01141 { IANA* JAVA } CCSID01141 { IANA JAVA } CP01141 { IANA JAVA } cp1141 { JAVA* } ebcdic-de-273+euro { IANA } # EBCDIC Germanay, Austria -ibm-1142_P100-1997 { UTR22* } ibm-1142 { IBM* } IBM01142 { IANA* JAVA } CCSID01142 { IANA JAVA } CP01142 { IANA JAVA } cp1142 { JAVA* } ebcdic-dk-277+euro { IANA } ebcdic-no-277+euro { IANA } # EBCDIC Denmark -ibm-1143_P100-1997 { UTR22* } ibm-1143 { IBM* } IBM01143 { IANA* JAVA } CCSID01143 { IANA JAVA } CP01143 { IANA JAVA } cp1143 { JAVA* } ebcdic-fi-278+euro { IANA } ebcdic-se-278+euro { IANA } # EBCDIC Sweden -ibm-1144_P100-1997 { UTR22* } ibm-1144 { IBM* } IBM01144 { IANA* JAVA } CCSID01144 { IANA JAVA } CP01144 { IANA JAVA } cp1144 { JAVA* } ebcdic-it-280+euro { IANA } # EBCDIC Italy -ibm-1145_P100-1997 { UTR22* } ibm-1145 { IBM* } IBM01145 { IANA* JAVA } CCSID01145 { IANA JAVA } CP01145 { IANA JAVA } cp1145 { JAVA* } ebcdic-es-284+euro { IANA } # EBCDIC Spain -ibm-1146_P100-1997 { UTR22* } ibm-1146 { IBM* } IBM01146 { IANA* JAVA } CCSID01146 { IANA JAVA } CP01146 { IANA JAVA } cp1146 { JAVA* } ebcdic-gb-285+euro { IANA } # EBCDIC UK Ireland -ibm-1147_P100-1997 { UTR22* } ibm-1147 { IBM* } IBM01147 { IANA* JAVA } CCSID01147 { IANA JAVA } CP01147 { IANA JAVA } cp1147 { JAVA* } ebcdic-fr-297+euro { IANA } # EBCDIC France -ibm-1148_P100-1997 { UTR22* } ibm-1148 { IBM* } IBM01148 { IANA* JAVA } CCSID01148 { IANA JAVA } CP01148 { IANA JAVA } cp1148 { JAVA* } ebcdic-international-500+euro { IANA } # EBCDIC International Latin1 -ibm-1149_P100-1997 { UTR22* } ibm-1149 { IBM* } IBM01149 { IANA* JAVA } CCSID01149 { IANA JAVA } CP01149 { IANA JAVA } cp1149 { JAVA* } ebcdic-is-871+euro { IANA } # EBCDIC Iceland -ibm-1153_P100-1999 { UTR22* } ibm-1153 { IBM* } IBM1153 { JAVA } 
x-IBM1153 { JAVA* } # EBCDIC latin 2 -ibm-1154_P100-1999 { UTR22* } ibm-1154 { IBM* } # EBCDIC Cyrillic Multilingual -ibm-1155_P100-1999 { UTR22* } ibm-1155 { IBM* } # EBCDIC Turkey -ibm-1156_P100-1999 { UTR22* } ibm-1156 { IBM* } # EBCDIC Baltic Multilingual -ibm-1157_P100-1999 { UTR22* } ibm-1157 { IBM* } # EBCDIC Estonia -ibm-1158_P100-1999 { UTR22* } ibm-1158 { IBM* } # EBCDIC Cyrillic Ukraine -ibm-1160_P100-1999 { UTR22* } ibm-1160 { IBM* } # EBCDIC Thailand -ibm-1164_P100-1999 { UTR22* } ibm-1164 { IBM* } # EBCDIC Viet Nam -ibm-1364_P110-2007 { UTR22* } ibm-1364 { IBM* } x-IBM1364 { JAVA* } # Korean Host Mixed -ibm-1370_P100-1999 { UTR22* } ibm-1370 { IBM* } x-IBM1370 { JAVA* } -ibm-1371_P100-1999 { UTR22* } ibm-1371 { IBM* } x-IBM1371 { JAVA* } # Taiwan EBCDIC MIXED (Euro update of ibm-937) -ibm-1388_P103-2001 { UTR22* } ibm-1388 { IBM* } ibm-9580 { IBM } x-IBM1388 { JAVA* } # S-Ch DBCS-Host Data GBK EBCDIC_STATEFUL. Yes ibm-9580 is an alias. -ibm-1390_P110-2003 { UTR22* } ibm-1390 { IBM* } x-IBM1390 { JAVA* } # Japan EBCDIC MIXED (JIS X 0213) -ibm-1399_P110-2003 { UTR22* } ibm-1399 { IBM* } x-IBM1399 { JAVA* } # Host MBCS (Latin-Kanji) (JIS X 0213) -ibm-5123_P100-1999 { UTR22* } ibm-5123 { IBM* } # Host Roman Jis. Euro update of ibm-1027. SBCS portion of ibm-1390. -ibm-8482_P100-1999 { UTR22* } ibm-8482 { IBM* } # host SBCS (Katakana). Euro update of ibm-290. SBCS portion of ibm-1399. -# Yes ibm-20780 is the same as ibm-16684 -ibm-16684_P110-2003 { UTR22* } ibm-16684 { IBM* } ibm-20780 { IBM } # DBCS Jis + Roman Jis Host. This is the DBCS portion of ibm-1390 and ibm-1399 (JIS X 0213). -ibm-4899_P100-1998 { UTR22* } ibm-4899 { IBM* } # Old EBCDIC Hebrew. Update of ibm-803 -ibm-4971_P100-1999 { UTR22* } ibm-4971 { IBM* } # EBCDIC Greek. Update of ibm-875 and superceded by ibm-9067 -ibm-9067_X100-2005 { UTR22* } ibm-9067 { IBM* } # EBCDIC Greek. 
Update of ibm-875 and ibm-4971 -ibm-12712_P100-1998 { UTR22* } ibm-12712 { IBM* } ebcdic-he # EBCDIC Hebrew (new sheqel, control characters update). Update of ibm-424 -ibm-16804_X110-1999 { UTR22* } ibm-16804 { IBM* } ebcdic-ar # EBCDIC Arabic. Update of ibm-420 - -java-Cp1399A-1.6_P { UTR22* } x-IBM1399A { JAVA* } -java-Cp420s-1.6_P { UTR22* } x-IBM420S { JAVA* } -java-Cp1390A-1.6_P { UTR22* } x-IBM1390A { JAVA* } - -# EBCDIC codepages for S/390, with LF and NL codes swapped -# Starting with ICU 2.4, the swapping is done by modifying the -# normal tables at runtime instead of at build time. -# Append UCNV_SWAP_LFNL_OPTION_STRING to the "ibm-CCSID" name to select this. -# -# Example: "ibm-1047,swaplfnl" or "ibm-1047" UCNV_SWAP_LFNL_OPTION_STRING -# -# This avoids the duplication of all EBCDIC SBCS and mixed-SBCS/DBCS -# mapping files. - -# Some examples below for declaring old-style, obsolete aliases with the "-s390" -# suffix to map to the new-style, recommended names with the option added. -# These are listed here for backward compatibility. -# Do not use these; instead use the normal converter name with the option -# added as recommended above. - -# Note: It is not possible to define an alias (non-initial name in a line here) -# that itself contains a converter option like this one for swapping LF<->NL. -# Such names would never be found because ucnv_open() will first parse and strip -# options before looking up a name in this table. -# ucnv_open() then parses the lookup result (the canonical name on the left -# in lines here) as well. - -# This also means that it is not necessary to add anything to convrtrs.txt -# for converter names like "ibm-1026,swaplfnl" to work - -# they are already covered by the normal option parsing together with the -# regular, option-less alias elsewhere in this file. 
- -ibm-37_P100-1995,swaplfnl ibm-37-s390 # ibm037-s390 also matches ibm-37-s390 -ibm-924_P100-1998,swaplfnl ibm-924-s390 IBM924_LF { JAVA* } -ibm-1047_P100-1995,swaplfnl ibm-1047-s390 IBM1047_LF { JAVA* } -ibm-1140_P100-1997,swaplfnl ibm-1140-s390 -ibm-1141_P100-1997,swaplfnl ibm-1141-s390 IBM1141_LF { JAVA* } -ibm-1142_P100-1997,swaplfnl ibm-1142-s390 -ibm-1143_P100-1997,swaplfnl ibm-1143-s390 -ibm-1144_P100-1997,swaplfnl ibm-1144-s390 -ibm-1145_P100-1997,swaplfnl ibm-1145-s390 -ibm-1146_P100-1997,swaplfnl ibm-1146-s390 -ibm-1147_P100-1997,swaplfnl ibm-1147-s390 -ibm-1148_P100-1997,swaplfnl ibm-1148-s390 -ibm-1149_P100-1997,swaplfnl ibm-1149-s390 -ibm-1153_P100-1999,swaplfnl ibm-1153-s390 -ibm-12712_P100-1998,swaplfnl ibm-12712-s390 -ibm-16804_X110-1999,swaplfnl ibm-16804-s390 - -# This is a special version of ibm-1140 that the XML4C (Xerces) parser team -# requested in 2000. -# It maps both EBCDIC LF and NL controls to Unicode LF U+000A. - -ebcdic-xml-us - -# These are not installed by default. They are rarely used. 
-# Many of them can be added through the online ICU Data Library Customization tool - -ibm-1004_P100-1995 { UTR22* } ibm-1004 { IBM* } -ibm-1008_P100-1995 { UTR22* } ibm-1008 { IBM* } # cp1008, 8-bit Arabic (w/o euro update) -ibm-1009_P100-1995 { UTR22* } ibm-1009 { IBM* } -ibm-1010_P100-1995 { UTR22* } ibm-1010 { IBM* } NF_Z_62-010 { IANA* } iso-ir-69 { IANA } ISO646-FR { IANA } fr { IANA } csISO69French { IANA } -ibm-1011_P100-1995 { UTR22* } ibm-1011 { IBM* } DIN_66003 { IANA* } iso-ir-21 { IANA } de { IANA } ISO646-DE { IANA } csISO21German { IANA } -ibm-1012_P100-1995 { UTR22* } ibm-1012 { IBM* } IT { IANA* } iso-ir-15 { IANA } ISO646-IT { IANA } csISO15Italian { IANA } -ibm-1013_P100-1995 { UTR22* } ibm-1013 { IBM* } BS_4730 { IANA* } iso-ir-4 { IANA } ISO646-GB { IANA } gb { IANA } uk { IANA } csISO4UnitedKingdom { IANA } -ibm-1014_P100-1995 { UTR22* } ibm-1014 { IBM* } ES2 { IANA* } iso-ir-85 { IANA } ISO646-ES2 { IANA } csISO85Spanish2 { IANA } -ibm-1015_P100-1995 { UTR22* } ibm-1015 { IBM* } PT2 { IANA* } iso-ir-84 { IANA } ISO646-PT2 { IANA } csISO84Portuguese2 { IANA } -ibm-1016_P100-1995 { UTR22* } ibm-1016 { IBM* } NS_4551-1 { IANA* } iso-ir-60 { IANA } ISO646-NO { IANA } no { IANA } csISO60DanishNorwegian { IANA } csISO60Norwegian1 { IANA } -ibm-1017_P100-1995 { UTR22* } ibm-1017 { IBM* } -ibm-1018_P100-1995 { UTR22* } ibm-1018 { IBM* } SEN_850200_B { IANA* } iso-ir-10 { IANA } FI { IANA } ISO646-FI { IANA } ISO646-SE { IANA } se { IANA } csISO10Swedish { IANA } -ibm-1019_P100-1995 { UTR22* } ibm-1019 { IBM* } -ibm-1020_P100-2003 { UTR22* } ibm-1020 { IBM* } CSA_Z243.4-1985-1 { IANA* } iso-ir-121 { IANA } ISO646-CA { IANA } csa7-1 { IANA } ca { IANA } csISO121Canadian1 { IANA } -ibm-1021_P100-2003 { UTR22* } ibm-1021 { IBM* } -ibm-1023_P100-2003 { UTR22* } ibm-1023 { IBM* } ES { IANA* } iso-ir-17 { IANA } ISO646-ES { IANA } csISO17Spanish { IANA } -ibm-1027_P100-1995 { UTR22* } ibm-1027 { IBM* } x-IBM1027 { JAVA* } -ibm-1041_P100-1995 { UTR22* } 
ibm-1041 { IBM* } x-IBM1041 { JAVA* } -ibm-1043_P100-1995 { UTR22* } ibm-1043 { IBM* } x-IBM1043 { JAVA* } -ibm-1046_X110-1999 { UTR22* } ibm-1046 { IBM* } x-IBM1046 { JAVA* } x-IBM1046S { JAVA } # Arabic -ibm-1088_P100-1995 { UTR22* } ibm-1088 { IBM* } x-IBM1088 { JAVA* } -ibm-1100_P100-2003 { UTR22* } ibm-1100 { IBM* } DEC-MCS { IANA* } dec { IANA } csDECMCS { IANA } -ibm-1101_P100-2003 { UTR22* } ibm-1101 { IBM* } -ibm-1102_P100-2003 { UTR22* } ibm-1102 { IBM* } -ibm-1103_P100-2003 { UTR22* } ibm-1103 { IBM* } -ibm-1104_P100-2003 { UTR22* } ibm-1104 { IBM* } NF_Z_62-010_1973 iso-ir-25 { IANA* } ISO646-FR1 { IANA } csISO25French { IANA } # NF_Z_62-010_(1973) is the real IANA alias, but () aren't invariant characters. -ibm-1105_P100-2003 { UTR22* } ibm-1105 { IBM* } -ibm-1106_P100-2003 { UTR22* } ibm-1106 { IBM* } -ibm-1107_P100-2003 { UTR22* } ibm-1107 { IBM* } DS_2089 { IANA* } ISO646-DK { IANA } dk { IANA } csISO646Danish { IANA } -ibm-1127_P100-2004 { UTR22* } ibm-1127 { IBM* } -ibm-1161_P100-1999 { UTR22* } ibm-1161 { IBM* } # Thai (Euro update of ibm-1129) -ibm-1163_P100-1999 { UTR22* } ibm-1163 { IBM* } # Vietnamese -ibm-1165_P101-2000 { UTR22* } ibm-1165 { IBM* } # Vietnamese (EBCDIC) -ibm-1166_P100-2002 { UTR22* } ibm-1166 { IBM* } # Cyrillic for Kazakhstan -ibm-1167_P100-2002 { UTR22* } ibm-1167 { IBM* } KOI8-RU x-KOI8_RU { JAVA* } -ibm-1174_X100-2007 { UTR22* } ibm-1174 { IBM* } KZ-1048 { IANA* } STRK1048-2002 { IANA } RK1048 { IANA } csKZ1048 { IANA } -ibm-1277_P100-1995 { UTR22* } ibm-1277 { IBM* } # Adobe (Postscript) Latin-1 -ibm-13125_P100-1997 { UTR22* } ibm-13125 { IBM* } # S-Ch (DBCS subset of ibm-4933, ibm-1388) -ibm-13140_P101-2000 { UTR22* } ibm-13140 { IBM* } -ibm-13218_P100-1996 { UTR22* } ibm-13218 { IBM* } # Japanese (EBCDIC update of ibm-930) -ibm-1350_P110-1997 { UTR22* } ibm-1350 { IBM* } x-eucJP-Open { JAVA* } eucJP-Open { JAVA } # Japanese (EUC-JP variant) -ibm-1351_P110-1997 { UTR22* } ibm-1351 { IBM* } x-IBM1351 { JAVA* } # 
Japanese (DBCS subset of ibm-5039) -ibm-1362_P110-1999 { UTR22* } ibm-1362 { IBM* } x-IBM1362 { JAVA* } # Korean (DBCS subset of ibm-1363) -ibm-13676_P102-2001 { UTR22* } ibm-13676 { IBM* } # Simplified Chinese (EBCDIC) -ibm-1380_P100-1995 { UTR22* } ibm-1380 { IBM* } x-IBM1380 { JAVA* } # Simplified Chinese (DBCS subset of ibm-1381) -ibm-1381_P110-1999 { UTR22* } ibm-1381 { IBM* JAVA } cp1381 { JAVA* } 1381 { JAVA } x-IBM1381 { JAVA } # Simplified Chinese PC Data mixed (IBM GB) -ibm-1382_P100-1995 { UTR22* } ibm-1382 { IBM* } x-IBM1382 { JAVA* } # Simplified Chinese (DBCS subset of ibm-1383) -ibm-17221_P100-2001 { UTR22* } ibm-17221 { IBM* } # Simplified Chinese (EBCDIC) -ibm-17248_X110-1999 { UTR22* } ibm-17248 { IBM* } # PC Arabic (w/ euro update) Updated version of ibm-864 -ibm-21344_P101-2000 { UTR22* } ibm-21344 { IBM* } # PC Arabic. Updated version of ibm-864 -ibm-21427_P100-1999 { UTR22* } ibm-21427 { IBM* } # Traditional Chinese (DBCS subset of ibm-1370) -ibm-256_P100-1995 { UTR22* } ibm-256 { IBM* } # Latin 1 EBCDIC -ibm-259_P100-1995 { UTR22* } ibm-259 { IBM* } IBM-Symbols { IANA* } csIBMSymbols { IANA } -ibm-274_P100-2000 { UTR22* } ibm-274 { IBM* } IBM274 { IANA* } EBCDIC-BE { IANA } CP274 { IANA } csIBM274 { IANA } -ibm-275_P100-1995 { UTR22* } ibm-275 { IBM* } IBM275 { IANA* } EBCDIC-BR { IANA } cp275 { IANA } csIBM275 { IANA } -ibm-286_P100-2003 { UTR22* } ibm-286 { IBM* } EBCDIC-AT-DE-A { IANA* } csEBCDICATDEA { IANA } -ibm-293_P100-1995 { UTR22* } ibm-293 { IBM* } # APL EBCDIC (APL: A Programming Language) -ibm-300_P120-2006 { UTR22* } ibm-300 { IBM* } x-IBM300 { JAVA* } # Japanese (DBCS subset of ibm-930 and ibm-939) -ibm-301_P110-1997 { UTR22* } ibm-301 { IBM* } x-IBM301 { JAVA* } # Japanese (DBCS subset of ibm-943) -ibm-33058_P100-2000 { UTR22* } ibm-33058 { IBM* } # SBCS (Katakana) -ibm-425_P101-2000 { UTR22* } ibm-425 { IBM* } # Arabic (EBCDIC) -ibm-4930_P110-1999 { UTR22* } ibm-4930 { IBM* } # Korean (DBCS subset of ibm-1364) 
-ibm-4933_P100-2002 { UTR22* } ibm-4933 { IBM* } # S-Ch (DBCS subset of ibm-1388) -ibm-4948_P100-1995 { UTR22* } ibm-4948 { IBM* } -ibm-4951_P100-1995 { UTR22* } ibm-4951 { IBM* } -ibm-4952_P100-1995 { UTR22* } ibm-4952 { IBM* } -ibm-4960_P100-1995 { UTR22* } ibm-4960 { IBM* } -ibm-5039_P11A-1998 { UTR22* } ibm-5039 { IBM* } # Japanese (HP Shift-JIS variant) -ibm-5048_P100-1995 { UTR22* } ibm-5048 { IBM* } # Japanese (DBCS subset of ibm-1350, JIS X208-1990) -ibm-5049_P100-1995 { UTR22* } ibm-5049 { IBM* } # Japanese (DBCS subset of ibm-1350, JIS X212) -ibm-5067_P100-1995 { UTR22* } ibm-5067 { IBM* } # Korean (DBCS subset of ibm-21450) -ibm-5104_X110-1999 { UTR22* } ibm-5104 { IBM* } # cp1008, 8-bit Arabic (w/ euro update) -ibm-5233_P100-2011 { UTR22* } ibm-5233 { IBM* } # Devanagari EBCDIC, including Indian Rupee -ibm-806_P100-1998 { UTR22* } ibm-806 { IBM* } # Hindi (ISCII variant) -ibm-808_P100-1999 { UTR22* } ibm-808 { IBM* } x-IBM808 { JAVA* } # Cyrillic -ibm-833_P100-1995 { UTR22* } ibm-833 { IBM* } x-IBM833 { JAVA* } -ibm-834_P100-1995 { UTR22* } ibm-834 { IBM* } x-IBM834 { JAVA* } # Korean (DBCS subset of ibm-933) -ibm-835_P100-1995 { UTR22* } ibm-835 { IBM* } x-IBM835 { JAVA* } # Traditional Chinese (DBCS subset of ibm-5033) -ibm-836_P100-1995 { UTR22* } ibm-836 { IBM* } x-IBM836 { JAVA* } -ibm-837_P100-2011 { UTR22* } ibm-837 { IBM* } x-IBM837 { JAVA* } # Simplified Chinese (DBCS subset of ibm-5031) -ibm-848_P100-1999 { UTR22* } ibm-848 { IBM* } # Cyrillic (euro update of ibm-1125) -ibm-849_P100-1999 { UTR22* } ibm-849 { IBM* } # Cyrillic Belarus (euro update of ibm-1131) -ibm-859_P100-1999 { UTR22* } ibm-859 { IBM* } x-IBM859 { JAVA* } # PC Latin 9 (w/ euro update) -ibm-8612_P100-1995 { UTR22* } ibm-8612 { IBM* } # Arabic (EBCDIC update of ibm-420) -ibm-872_P100-1999 { UTR22* } ibm-872 { IBM* } # Cyrillic (Euro update of ibm-855) -ibm-880_P100-1995 { UTR22* } ibm-880 { IBM* } IBM880 { IANA* } cp880 { IANA } EBCDIC-Cyrillic { IANA } csIBM880 { IANA } 
windows-20880 { WINDOWS* } -ibm-896_P100-1995 { UTR22* } ibm-896 { IBM* } # SBCS Katakana -ibm-897_P100-1995 { UTR22* } ibm-897 { IBM* } JIS_X0201 { IANA* } X0201 { IANA } csHalfWidthKatakana { IANA } x-IBM897 { JAVA* } -ibm-9027_P100-1999 { UTR22* } ibm-9027 { IBM* } # DBCS T-Ch Host. Euro update of ibm-835. DBCS portion of ibm-1371. -ibm-9048_P100-1998 { UTR22* } ibm-9048 { IBM* } # Hebrew (Euro and Sequel update of ibm-856) -ibm-905_P100-1995 { UTR22* } ibm-905 { IBM* } IBM905 { IANA* } CP905 { IANA } ebcdic-cp-tr { IANA } csIBM905 { IANA } windows-20905 { WINDOWS* } -ibm-9056_P100-1995 { UTR22* } ibm-9056 { IBM* } # Arabic -ibm-9061_P100-1999 { UTR22* } ibm-9061 { IBM* } # Greek (w/ euro update) -ibm-9145_P110-1997 { UTR22* } ibm-9145 { IBM* } # Japanese (DBCS subset of ibm-5050) -ibm-9238_X110-1999 { UTR22* } ibm-9238 { IBM* } # cp1046, PC Arabic Extended (w/ euro update) -ibm-924_P100-1998 { UTR22* } ibm-924 { IBM* } IBM00924 { IANA* } CCSID00924 { IANA } CP00924 { IANA } ebcdic-Latin9--euro { IANA } -ibm-926_P100-2000 { UTR22* } ibm-926 { IBM* } # Korean (DBCS subset of ibm-944) -ibm-927_P100-1995 { UTR22* } ibm-927 { IBM* } x-IBM927 { JAVA* } # Traditional Chinese (DBCS subset of ibm-948) -ibm-928_P100-1995 { UTR22* } ibm-928 { IBM* } # Simplified Chinese (DBCS subset of ibm-936) -ibm-941_P13A-2001 { UTR22* } ibm-941 { IBM* } # DBCS portion of ibm-943 -ibm-944_P100-1995 { UTR22* } ibm-944 { IBM* } # Korean -ibm-946_P100-1995 { UTR22* } ibm-946 { IBM* } # Simplified Chinese -ibm-947_P100-1995 { UTR22* } ibm-947 { IBM* } x-IBM947 { JAVA* } # Traditional Chinese (DBCS subset of ibm-950) -ibm-948_P110-1999 { UTR22* } ibm-948 { IBM* } x-IBM948 { JAVA* } # Traditional Chinese -ibm-951_P100-1995 { UTR22* } ibm-951 { IBM* } x-IBM951 { JAVA* } # Korean (DBCS subset of ibm-949) -ibm-952_P110-1997 { UTR22* } ibm-952 { IBM* } x-JIS0208 # Pure DBCS, Japanese EUC, G1 - JIS X208-1990 -ibm-953_P100-2000 { UTR22* } ibm-953 { IBM* } JIS_X0212-1990 { IANA* } # Pure DBCS, 
Japanese EUC, G3 - JIS X 0212-1990 -ibm-955_P110-1997 { UTR22* } ibm-955 { IBM* } # Pure DBCS, Japanese EUC, G0 - JIS X208-1978 -ibm-9577_P100-2001 { UTR22* } ibm-9577 { IBM* } ibm-1385 { IBM } x-IBM1385 { JAVA* } # ibm-9577 and ibm-1385 are identical DBCS tables. -iso-8859_16-2001 { UTR22* } ISO-8859-16 { IANA* } iso-ir-226 { IANA } ISO_8859-16:2001 { IANA } latin10 { IANA } l10 { IANA } - -# To be considered for listing at a later date for the data library customization tool -#ibm-1159_P100-1999 { UTR22* } ibm-1159 { IBM* } # SBCS T-Ch Host. Euro update of ibm-28709. This is used in combination with another CCSID mapping. -#ibm-960_P100-2000 { UTR22* } ibm-960 { IBM* } # Pure DBCS, CNS11643 plane 1 -#ibm-963_P100-1995 { UTR22* } ibm-963 { IBM* } # Pure DBCS, CNS11643 plane 2 Traditional Chinese (DBCS subset of ibm-965) diff --git a/rust/fuzz/.gitignore b/rust/fuzz/.gitignore deleted file mode 100644 index 1a45eee776..0000000000 --- a/rust/fuzz/.gitignore +++ /dev/null @@ -1,4 +0,0 @@ -target -corpus -artifacts -coverage diff --git a/rust/fuzz/Cargo.lock b/rust/fuzz/Cargo.lock deleted file mode 100644 index c840c28160..0000000000 --- a/rust/fuzz/Cargo.lock +++ /dev/null @@ -1,872 +0,0 @@ -# This file is automatically @generated by Cargo. -# It is not intended for manual editing. 
-version = 3 - -[[package]] -name = "adler" -version = "1.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" - -[[package]] -name = "android-tzdata" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e999941b234f3131b00bc13c22d06e8c5ff726d1b6318ac7eb276997bbb4fef0" - -[[package]] -name = "android_system_properties" -version = "0.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311" -dependencies = [ - "libc", -] - -[[package]] -name = "anstream" -version = "0.6.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "418c75fa768af9c03be99d17643f93f79bbba589895012a80e3452a19ddda15b" -dependencies = [ - "anstyle", - "anstyle-parse", - "anstyle-query", - "anstyle-wincon", - "colorchoice", - "is_terminal_polyfill", - "utf8parse", -] - -[[package]] -name = "anstyle" -version = "1.0.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "038dfcf04a5feb68e9c60b21c9625a54c2c0616e79b72b0fd87075a056ae1d1b" - -[[package]] -name = "anstyle-parse" -version = "0.2.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c03a11a9034d92058ceb6ee011ce58af4a9bf61491aa7e1e59ecd24bd40d22d4" -dependencies = [ - "utf8parse", -] - -[[package]] -name = "anstyle-query" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ad186efb764318d35165f1758e7dcef3b10628e26d41a44bc5550652e6804391" -dependencies = [ - "windows-sys 0.52.0", -] - -[[package]] -name = "anstyle-wincon" -version = "3.0.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "61a38449feb7068f52bb06c12759005cf459ee52bb4adc1d5a7c4322d716fb19" -dependencies = [ - "anstyle", - "windows-sys 0.52.0", -] - -[[package]] -name = "anyhow" -version = 
"1.0.86" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b3d1d046238990b9cf5bcde22a3fb3584ee5cf65fb2765f454ed428c7a0063da" - -[[package]] -name = "arbitrary" -version = "1.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7d5a26814d8dcb93b0e5a0ff3c6d80a8843bafb21b39e8e18a6f05471870e110" - -[[package]] -name = "atty" -version = "0.2.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8" -dependencies = [ - "hermit-abi", - "libc", - "winapi", -] - -[[package]] -name = "autocfg" -version = "1.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c4b4d0bd25bd0b74681c0ad21497610ce1b7c91b1022cd21c80c6fbdd9476b0" - -[[package]] -name = "bitflags" -version = "2.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b048fb63fd8b5923fc5aa7b340d8e156aec7ec02f0c78fa8a6ddc2613f6f71de" - -[[package]] -name = "bumpalo" -version = "3.16.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c" - -[[package]] -name = "cc" -version = "1.0.106" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "066fce287b1d4eafef758e89e09d724a24808a9196fe9756b8ca90e86d0719a2" -dependencies = [ - "jobserver", - "libc", - "once_cell", -] - -[[package]] -name = "cfg-if" -version = "1.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" - -[[package]] -name = "chrono" -version = "0.4.38" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a21f936df1771bf62b77f047b726c4625ff2e8aa607c01ec06e5a05bd8463401" -dependencies = [ - "android-tzdata", - "iana-time-zone", - "js-sys", - "num-traits", - "wasm-bindgen", - "windows-targets 0.52.6", -] - -[[package]] 
-name = "clap" -version = "4.5.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "84b3edb18336f4df585bc9aa31dd99c036dfa5dc5e9a2939a722a188f3a8970d" -dependencies = [ - "clap_builder", - "clap_derive", -] - -[[package]] -name = "clap_builder" -version = "4.5.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c1c09dd5ada6c6c78075d6fd0da3f90d8080651e2d6cc8eb2f1aaa4034ced708" -dependencies = [ - "anstream", - "anstyle", - "clap_lex", - "strsim", - "terminal_size", -] - -[[package]] -name = "clap_derive" -version = "4.5.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2bac35c6dafb060fd4d275d9a4ffae97917c13a6327903a8be2153cd964f7085" -dependencies = [ - "heck", - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "clap_lex" -version = "0.7.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4b82cf0babdbd58558212896d1a4272303a57bdb245c2bf1147185fb45640e70" - -[[package]] -name = "colorchoice" -version = "1.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b6a852b24ab71dffc585bcb46eaf7959d175cb865a7152e35b348d1b2960422" - -[[package]] -name = "core-foundation-sys" -version = "0.8.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "06ea2b9bc92be3c2baa9334a323ebca2d6f074ff852cd1d7b11064035cd3868f" - -[[package]] -name = "crc32fast" -version = "1.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a97769d94ddab943e4510d138150169a2758b5ef3eb191a9ee688de3e23ef7b3" -dependencies = [ - "cfg-if", -] - -[[package]] -name = "encoding_rs" -version = "0.8.34" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b45de904aa0b010bce2ab45264d0631681847fa7b6f2eaa7dab7619943bc4f59" -dependencies = [ - "cfg-if", -] - -[[package]] -name = "equivalent" -version = "1.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" - -[[package]] -name = "errno" -version = "0.3.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "534c5cf6194dfab3db3242765c03bbe257cf92f22b38f6bc0c58d59108a820ba" -dependencies = [ - "libc", - "windows-sys 0.52.0", -] - -[[package]] -name = "finl_unicode" -version = "1.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8fcfdc7a0362c9f4444381a9e697c79d435fe65b52a37466fc2c1184cee9edc6" - -[[package]] -name = "flate2" -version = "1.0.30" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f54427cfd1c7829e2a139fcefea601bf088ebca651d2bf53ebc600eac295dae" -dependencies = [ - "crc32fast", - "miniz_oxide", -] - -[[package]] -name = "float_next_after" -version = "1.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8bf7cc16383c4b8d58b9905a8509f02926ce3058053c056376248d958c9df1e8" - -[[package]] -name = "hashbrown" -version = "0.14.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" - -[[package]] -name = "heck" -version = "0.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" - -[[package]] -name = "hermit-abi" -version = "0.1.19" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33" -dependencies = [ - "libc", -] - -[[package]] -name = "hexplay" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0962bea6731e28b5a443ba4aa00fe3e4fe7555dadf12012435efb738eeac5898" -dependencies = [ - "atty", - "termcolor", -] - -[[package]] -name = "iana-time-zone" -version = "0.1.60" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"e7ffbb5a1b541ea2561f8c41c087286cc091e21e556a4f09a8f6cbf17b69b141" -dependencies = [ - "android_system_properties", - "core-foundation-sys", - "iana-time-zone-haiku", - "js-sys", - "wasm-bindgen", - "windows-core", -] - -[[package]] -name = "iana-time-zone-haiku" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f" -dependencies = [ - "cc", -] - -[[package]] -name = "indexmap" -version = "2.2.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "168fb715dda47215e360912c096649d23d58bf392ac62f73919e831745e40f26" -dependencies = [ - "equivalent", - "hashbrown", -] - -[[package]] -name = "is_terminal_polyfill" -version = "1.70.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f8478577c03552c21db0e2724ffb8986a5ce7af88107e6be5d2ee6e158c12800" - -[[package]] -name = "jobserver" -version = "0.1.31" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d2b099aaa34a9751c5bf0878add70444e1ed2dd73f347be99003d4577277de6e" -dependencies = [ - "libc", -] - -[[package]] -name = "js-sys" -version = "0.3.69" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "29c15563dc2726973df627357ce0c9ddddbea194836909d655df6a75d2cf296d" -dependencies = [ - "wasm-bindgen", -] - -[[package]] -name = "lazy_static" -version = "1.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" - -[[package]] -name = "libc" -version = "0.2.155" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97b3888a4aecf77e811145cadf6eef5901f4782c53886191b2f693f24761847c" - -[[package]] -name = "libfuzzer-sys" -version = "0.4.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a96cfd5557eb82f2b83fed4955246c988d331975a002961b07c81584d107e7f7" -dependencies 
= [ - "arbitrary", - "cc", - "once_cell", -] - -[[package]] -name = "linux-raw-sys" -version = "0.4.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89" - -[[package]] -name = "log" -version = "0.4.22" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24" - -[[package]] -name = "miniz_oxide" -version = "0.7.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b8a240ddb74feaf34a79a7add65a741f3167852fba007066dcac1ca548d89c08" -dependencies = [ - "adler", -] - -[[package]] -name = "num" -version = "0.4.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "35bd024e8b2ff75562e5f34e7f4905839deb4b22955ef5e73d2fea1b9813cb23" -dependencies = [ - "num-bigint", - "num-complex", - "num-integer", - "num-iter", - "num-rational", - "num-traits", -] - -[[package]] -name = "num-bigint" -version = "0.4.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a5e44f723f1133c9deac646763579fdb3ac745e418f2a7af9cd0c431da1f20b9" -dependencies = [ - "num-integer", - "num-traits", -] - -[[package]] -name = "num-complex" -version = "0.4.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "73f88a1307638156682bada9d7604135552957b7818057dcef22705b4d509495" -dependencies = [ - "num-traits", -] - -[[package]] -name = "num-derive" -version = "0.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ed3955f1a9c7c0c15e092f9c887db08b1fc683305fdf6eb6684f22555355e202" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "num-integer" -version = "0.1.46" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f" -dependencies = [ - "num-traits", -] - -[[package]] -name = 
"num-iter" -version = "0.1.45" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1429034a0490724d0075ebb2bc9e875d6503c3cf69e235a8941aa757d83ef5bf" -dependencies = [ - "autocfg", - "num-integer", - "num-traits", -] - -[[package]] -name = "num-rational" -version = "0.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f83d14da390562dca69fc84082e73e548e1ad308d24accdedd2720017cb37824" -dependencies = [ - "num-bigint", - "num-integer", - "num-traits", -] - -[[package]] -name = "num-traits" -version = "0.2.19" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" -dependencies = [ - "autocfg", -] - -[[package]] -name = "once_cell" -version = "1.19.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" - -[[package]] -name = "ordered-float" -version = "3.9.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f1e1c390732d15f1d48471625cd92d154e66db2c56645e29a9cd26f4699f72dc" -dependencies = [ - "num-traits", -] - -[[package]] -name = "proc-macro2" -version = "1.0.86" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5e719e8df665df0d1c8fbfd238015744736151d4445ec0836b8e628aae103b77" -dependencies = [ - "unicode-ident", -] - -[[package]] -name = "pspp" -version = "1.0.0" -dependencies = [ - "anyhow", - "bitflags", - "chrono", - "clap", - "encoding_rs", - "finl_unicode", - "flate2", - "float_next_after", - "hexplay", - "indexmap", - "lazy_static", - "libc", - "num", - "num-derive", - "num-traits", - "ordered-float", - "thiserror", - "unicase", - "utf8-decode", - "windows-sys 0.48.0", -] - -[[package]] -name = "pspp-fuzz" -version = "0.0.0" -dependencies = [ - "libfuzzer-sys", - "pspp", -] - -[[package]] -name = "quote" -version = "1.0.36" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "0fa76aaf39101c457836aec0ce2316dbdc3ab723cdda1c6bd4e6ad4208acaca7" -dependencies = [ - "proc-macro2", -] - -[[package]] -name = "rustix" -version = "0.38.34" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "70dc5ec042f7a43c4a73241207cecc9873a06d45debb38b329f8541d85c2730f" -dependencies = [ - "bitflags", - "errno", - "libc", - "linux-raw-sys", - "windows-sys 0.52.0", -] - -[[package]] -name = "strsim" -version = "0.11.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" - -[[package]] -name = "syn" -version = "2.0.69" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "201fcda3845c23e8212cd466bfebf0bd20694490fc0356ae8e428e0824a915a6" -dependencies = [ - "proc-macro2", - "quote", - "unicode-ident", -] - -[[package]] -name = "termcolor" -version = "0.3.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "adc4587ead41bf016f11af03e55a624c06568b5a19db4e90fde573d805074f83" -dependencies = [ - "wincolor", -] - -[[package]] -name = "terminal_size" -version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "21bebf2b7c9e0a515f6e0f8c51dc0f8e4696391e6f1ff30379559f8365fb0df7" -dependencies = [ - "rustix", - "windows-sys 0.48.0", -] - -[[package]] -name = "thiserror" -version = "1.0.61" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c546c80d6be4bc6a00c0f01730c08df82eaa7a7a61f11d656526506112cc1709" -dependencies = [ - "thiserror-impl", -] - -[[package]] -name = "thiserror-impl" -version = "1.0.61" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "46c3384250002a6d5af4d114f2845d37b57521033f30d5c3f46c4d70e1197533" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "unicase" -version = "2.7.0" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "f7d2d4dafb69621809a81864c9c1b864479e1235c0dd4e199924b9742439ed89" -dependencies = [ - "version_check", -] - -[[package]] -name = "unicode-ident" -version = "1.0.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" - -[[package]] -name = "utf8-decode" -version = "1.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ca61eb27fa339aa08826a29f03e87b99b4d8f0fc2255306fd266bb1b6a9de498" - -[[package]] -name = "utf8parse" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" - -[[package]] -name = "version_check" -version = "0.9.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" - -[[package]] -name = "wasm-bindgen" -version = "0.2.92" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4be2531df63900aeb2bca0daaaddec08491ee64ceecbee5076636a3b026795a8" -dependencies = [ - "cfg-if", - "wasm-bindgen-macro", -] - -[[package]] -name = "wasm-bindgen-backend" -version = "0.2.92" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "614d787b966d3989fa7bb98a654e369c762374fd3213d212cfc0251257e747da" -dependencies = [ - "bumpalo", - "log", - "once_cell", - "proc-macro2", - "quote", - "syn", - "wasm-bindgen-shared", -] - -[[package]] -name = "wasm-bindgen-macro" -version = "0.2.92" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a1f8823de937b71b9460c0c34e25f3da88250760bec0ebac694b49997550d726" -dependencies = [ - "quote", - "wasm-bindgen-macro-support", -] - -[[package]] -name = "wasm-bindgen-macro-support" -version = "0.2.92" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"e94f17b526d0a461a191c78ea52bbce64071ed5c04c9ffe424dcb38f74171bb7" -dependencies = [ - "proc-macro2", - "quote", - "syn", - "wasm-bindgen-backend", - "wasm-bindgen-shared", -] - -[[package]] -name = "wasm-bindgen-shared" -version = "0.2.92" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "af190c94f2773fdb3729c55b007a722abb5384da03bc0986df4c289bf5567e96" - -[[package]] -name = "winapi" -version = "0.3.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" -dependencies = [ - "winapi-i686-pc-windows-gnu", - "winapi-x86_64-pc-windows-gnu", -] - -[[package]] -name = "winapi-i686-pc-windows-gnu" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" - -[[package]] -name = "winapi-x86_64-pc-windows-gnu" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" - -[[package]] -name = "wincolor" -version = "0.1.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eeb06499a3a4d44302791052df005d5232b927ed1a9658146d842165c4de7767" -dependencies = [ - "winapi", -] - -[[package]] -name = "windows-core" -version = "0.52.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "33ab640c8d7e35bf8ba19b884ba838ceb4fba93a4e8c65a9059d08afcfc683d9" -dependencies = [ - "windows-targets 0.52.6", -] - -[[package]] -name = "windows-sys" -version = "0.48.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" -dependencies = [ - "windows-targets 0.48.5", -] - -[[package]] -name = "windows-sys" -version = "0.52.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" -dependencies = [ - "windows-targets 0.52.6", -] - -[[package]] -name = "windows-targets" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c" -dependencies = [ - "windows_aarch64_gnullvm 0.48.5", - "windows_aarch64_msvc 0.48.5", - "windows_i686_gnu 0.48.5", - "windows_i686_msvc 0.48.5", - "windows_x86_64_gnu 0.48.5", - "windows_x86_64_gnullvm 0.48.5", - "windows_x86_64_msvc 0.48.5", -] - -[[package]] -name = "windows-targets" -version = "0.52.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" -dependencies = [ - "windows_aarch64_gnullvm 0.52.6", - "windows_aarch64_msvc 0.52.6", - "windows_i686_gnu 0.52.6", - "windows_i686_gnullvm", - "windows_i686_msvc 0.52.6", - "windows_x86_64_gnu 0.52.6", - "windows_x86_64_gnullvm 0.52.6", - "windows_x86_64_msvc 0.52.6", -] - -[[package]] -name = "windows_aarch64_gnullvm" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" - -[[package]] -name = "windows_aarch64_gnullvm" -version = "0.52.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" - -[[package]] -name = "windows_aarch64_msvc" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" - -[[package]] -name = "windows_aarch64_msvc" -version = "0.52.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" - -[[package]] -name = "windows_i686_gnu" -version = "0.48.5" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" - -[[package]] -name = "windows_i686_gnu" -version = "0.52.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" - -[[package]] -name = "windows_i686_gnullvm" -version = "0.52.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" - -[[package]] -name = "windows_i686_msvc" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" - -[[package]] -name = "windows_i686_msvc" -version = "0.52.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" - -[[package]] -name = "windows_x86_64_gnu" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" - -[[package]] -name = "windows_x86_64_gnu" -version = "0.52.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" - -[[package]] -name = "windows_x86_64_gnullvm" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" - -[[package]] -name = "windows_x86_64_gnullvm" -version = "0.52.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" - -[[package]] -name = "windows_x86_64_msvc" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" - -[[package]] -name = "windows_x86_64_msvc" -version = "0.52.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" diff --git a/rust/fuzz/Cargo.toml b/rust/fuzz/Cargo.toml deleted file mode 100644 index 8b44789bad..0000000000 --- a/rust/fuzz/Cargo.toml +++ /dev/null @@ -1,28 +0,0 @@ -[package] -name = "pspp-fuzz" -version = "0.0.0" -publish = false -edition = "2021" - -[package.metadata] -cargo-fuzz = true - -[dependencies] -libfuzzer-sys = "0.4" - -[dependencies.pspp] -path = ".." - -[[bin]] -name = "fuzz_target_1" -path = "fuzz_targets/fuzz_target_1.rs" -test = false -doc = false -bench = false - -[[bin]] -name = "segment" -path = "fuzz_targets/segment.rs" -test = false -doc = false -bench = false diff --git a/rust/fuzz/fuzz_targets/fuzz_target_1.rs b/rust/fuzz/fuzz_targets/fuzz_target_1.rs deleted file mode 100644 index 43a88c14f3..0000000000 --- a/rust/fuzz/fuzz_targets/fuzz_target_1.rs +++ /dev/null @@ -1,7 +0,0 @@ -#![no_main] - -use libfuzzer_sys::fuzz_target; - -fuzz_target!(|data: &[u8]| { - // fuzzed code goes here -}); diff --git a/rust/fuzz/fuzz_targets/segment.rs b/rust/fuzz/fuzz_targets/segment.rs deleted file mode 100644 index 1e5a109449..0000000000 --- a/rust/fuzz/fuzz_targets/segment.rs +++ /dev/null @@ -1,18 +0,0 @@ -#![no_main] - -use libfuzzer_sys::fuzz_target; -use pspp::lex::segment::{Segmenter, Mode, Type}; - -fuzz_target!(|data: &[u8]| { - if let Ok(mut input) = std::str::from_utf8(data) { - let mut segmenter = Segmenter::new(Mode::Auto, false); - loop { - let (rest, type_) = segmenter.push(input, true).unwrap(); - match type_ { - Type::End => break, - _ => (), - } - input = rest; - } - } -}); diff --git a/rust/pspp-lsp/Cargo.toml b/rust/pspp-lsp/Cargo.toml new file mode 100644 index 0000000000..44dd02290c --- /dev/null +++ b/rust/pspp-lsp/Cargo.toml @@ -0,0 +1,11 @@ +[package] 
+name = "pspp-lsp" +version = "0.1.0" +edition = "2021" + +[dependencies] +env_logger = "0.11.5" +log = "0.4.22" +pspp = { version = "1.0.0", path = "../pspp" } +tokio = { version = "1.39.3", features = ["full"] } +tower-lsp = "0.20.0" diff --git a/rust/pspp-lsp/src/main.rs b/rust/pspp-lsp/src/main.rs new file mode 100644 index 0000000000..3876550ed2 --- /dev/null +++ b/rust/pspp-lsp/src/main.rs @@ -0,0 +1,86 @@ +use std::collections::HashMap; + +use tokio::sync::Mutex; +use tower_lsp::{ + jsonrpc::Result, + lsp_types::*, + Client, LanguageServer, LspService, Server, +}; + +#[tokio::main] +async fn main() { + env_logger::init(); + + let stdin = tokio::io::stdin(); + let stdout = tokio::io::stdout(); + + let (service, socket) = LspService::build(|client| Backend { + client, + document_map: Mutex::new(HashMap::new()), + }) + .finish(); + + Server::new(stdin, stdout, socket).serve(service).await; +} + +#[derive(Debug)] +struct Backend { + client: Client, + document_map: Mutex>, +} + +#[tower_lsp::async_trait] +impl LanguageServer for Backend { + async fn initialize(&self, params: InitializeParams) -> Result { + Ok(InitializeResult { + server_info: None, + capabilities: ServerCapabilities { + text_document_sync: Some(TextDocumentSyncCapability::Kind( + TextDocumentSyncKind::FULL, + )), + workspace: Some(WorkspaceServerCapabilities { + workspace_folders: Some(WorkspaceFoldersServerCapabilities { + supported: Some(true), + change_notifications: Some(OneOf::Left(true)), + }), + file_operations: None, + }), +/* + semantic_tokens_provider: Some( + SemanticTokensServerCapabilities::SemanticTokensRegistrationOptions( + SemanticTokensRegistrationOptions { + text_document_registration_options: { + TextDocumentRegistrationOptions { + document_selector: Some(vec![DocumentFilter { + language: Some("pspp".to_string()), + scheme: Some("file".to_string()), + pattern: None, + }]), + } + }, + semantic_tokens_options: SemanticTokensOptions { + work_done_progress_options: 
WorkDoneProgressOptions::default(), + legend: SemanticTokensLegend { + token_types: LEGEND_TYPE.into(), + token_modifiers: vec![], + }, + range: Some(true), + full: Some(SemanticTokensFullOptions::Bool(true)), + }, + static_registration_options: StaticRegistrationOptions::default(), + }, + ), + ), +*/ + definition_provider: Some(OneOf::Left(true)), + references_provider: Some(OneOf::Left(true)), + rename_provider: Some(OneOf::Left(true)), + ..ServerCapabilities::default() + }, + }) + } + + async fn shutdown(&self) -> Result<()> { + Ok(()) + } +} diff --git a/rust/pspp/Cargo.lock b/rust/pspp/Cargo.lock new file mode 100644 index 0000000000..2c9fed4fa1 --- /dev/null +++ b/rust/pspp/Cargo.lock @@ -0,0 +1,973 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 3 + +[[package]] +name = "adler" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" + +[[package]] +name = "android-tzdata" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e999941b234f3131b00bc13c22d06e8c5ff726d1b6318ac7eb276997bbb4fef0" + +[[package]] +name = "android_system_properties" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311" +dependencies = [ + "libc", +] + +[[package]] +name = "anyhow" +version = "1.0.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "224afbd727c3d6e4b90103ece64b8d1b67fbb1973b1046c2281eed3f3803f800" + +[[package]] +name = "atty" +version = "0.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8" +dependencies = [ + "hermit-abi 0.1.19", + "libc", + "winapi", +] + +[[package]] +name = "autocfg" +version = "1.1.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" + +[[package]] +name = "bitflags" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + +[[package]] +name = "bitflags" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf4b9d6a944f767f8e5e0db018570623c85f3d925ac718db4e06d0187adb21c1" + +[[package]] +name = "bumpalo" +version = "3.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a3e2c3daef883ecc1b5d58c15adae93470a91d425f3532ba1695849656af3fc1" + +[[package]] +name = "cc" +version = "1.0.79" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "50d30906286121d95be3d479533b458f87493b30a4b5f79a607db8f5d11aa91f" + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "chardetng" +version = "0.1.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "14b8f0b65b7b08ae3c8187e8d77174de20cb6777864c6b832d8ad365999cf1ea" +dependencies = [ + "cfg-if", + "encoding_rs", + "memchr", +] + +[[package]] +name = "chrono" +version = "0.4.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec837a71355b28f6556dbd569b37b3f363091c0bd4b2e735674521b4c5fd9bc5" +dependencies = [ + "android-tzdata", + "iana-time-zone", + "js-sys", + "num-traits", + "time", + "wasm-bindgen", + "winapi", +] + +[[package]] +name = "clap" +version = "4.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2f3061d6db6d8fcbbd4b05e057f2acace52e64e96b498c08c2d7a4e65addd340" +dependencies = [ + "bitflags 1.3.2", + "clap_derive", + "clap_lex", + "is-terminal", + 
"once_cell", + "strsim", + "termcolor 1.2.0", + "terminal_size", +] + +[[package]] +name = "clap_derive" +version = "4.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34d122164198950ba84a918270a3bb3f7ededd25e15f7451673d986f55bd2667" +dependencies = [ + "heck", + "proc-macro-error", + "proc-macro2", + "quote", + "syn 1.0.109", +] + +[[package]] +name = "clap_lex" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "350b9cf31731f9957399229e9b2adc51eeabdfbe9d71d9a0552275fd12710d09" +dependencies = [ + "os_str_bytes", +] + +[[package]] +name = "core-foundation-sys" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e496a50fda8aacccc86d7529e2c1e0892dbd0f898a6b5645b5561b89c3210efa" + +[[package]] +name = "crc32fast" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b540bd8bc810d3885c6ea91e2018302f68baba2129ab3e88f32389ee9370880d" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "diff" +version = "0.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "56254986775e3233ffa9c4d7d3faaf6d36a2c09d30b20687e9f88bc8bafc16c8" + +[[package]] +name = "encoding_rs" +version = "0.8.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "071a31f4ee85403370b58aca746f01041ede6f0da2730960ad001edc2b71b394" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "enum-map" +version = "2.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6866f3bfdf8207509a033af1a75a7b08abda06bbaaeae6669323fd5a097df2e9" +dependencies = [ + "enum-map-derive", +] + +[[package]] +name = "enum-map-derive" +version = "0.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f282cfdfe92516eb26c2af8589c274c7c17681f5ecc03c18255fe741c6aa64eb" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.27", +] + 
+[[package]] +name = "equivalent" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" + +[[package]] +name = "errno" +version = "0.2.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f639046355ee4f37944e44f60642c6f3a7efa3cf6b78c78a0d989a8ce6c396a1" +dependencies = [ + "errno-dragonfly", + "libc", + "winapi", +] + +[[package]] +name = "errno" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4bcfec3a70f97c962c307b2d2c56e358cf1d00b558d74262b5f929ee8cc7e73a" +dependencies = [ + "errno-dragonfly", + "libc", + "windows-sys 0.48.0", +] + +[[package]] +name = "errno-dragonfly" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf" +dependencies = [ + "cc", + "libc", +] + +[[package]] +name = "finl_unicode" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fcfdc7a0362c9f4444381a9e697c79d435fe65b52a37466fc2c1184cee9edc6" + +[[package]] +name = "flagset" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b3ea1ec5f8307826a5b71094dd91fc04d4ae75d5709b20ad351c7fb4815c86ec" + +[[package]] +name = "flate2" +version = "1.0.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b9429470923de8e8cbd4d2dc513535400b4b3fef0319fb5c4e1f520a7bef743" +dependencies = [ + "crc32fast", + "miniz_oxide", +] + +[[package]] +name = "float_next_after" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8bf7cc16383c4b8d58b9905a8509f02926ce3058053c056376248d958c9df1e8" + +[[package]] +name = "hashbrown" +version = "0.14.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"290f1a1d9242c78d09ce40a5e87e7554ee637af1351968159f4952f028f75604" + +[[package]] +name = "heck" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" + +[[package]] +name = "hermit-abi" +version = "0.1.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33" +dependencies = [ + "libc", +] + +[[package]] +name = "hermit-abi" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fed44880c466736ef9a5c5b5facefb5ed0785676d0c02d612db14e54f0d84286" + +[[package]] +name = "hexplay" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0962bea6731e28b5a443ba4aa00fe3e4fe7555dadf12012435efb738eeac5898" +dependencies = [ + "atty", + "termcolor 0.3.6", +] + +[[package]] +name = "iana-time-zone" +version = "0.1.57" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2fad5b825842d2b38bd206f3e81d6957625fd7f0a361e345c30e01a0ae2dd613" +dependencies = [ + "android_system_properties", + "core-foundation-sys", + "iana-time-zone-haiku", + "js-sys", + "wasm-bindgen", + "windows", +] + +[[package]] +name = "iana-time-zone-haiku" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f" +dependencies = [ + "cc", +] + +[[package]] +name = "indexmap" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d530e1a18b1cb4c484e6e34556a0d948706958449fca0cab753d649f2bce3d1f" +dependencies = [ + "equivalent", + "hashbrown", +] + +[[package]] +name = "io-lifetimes" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1abeb7a0dd0f8181267ff8adc397075586500b81b28a73e8a0208b00fc170fb3" +dependencies = 
[ + "libc", + "windows-sys 0.45.0", +] + +[[package]] +name = "is-terminal" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "21b6b32576413a8e69b90e952e4a026476040d81017b80445deda5f2d3921857" +dependencies = [ + "hermit-abi 0.3.1", + "io-lifetimes", + "rustix 0.36.8", + "windows-sys 0.45.0", +] + +[[package]] +name = "js-sys" +version = "0.3.64" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c5f195fe497f702db0f318b07fdd68edb16955aed830df8363d837542f8f935a" +dependencies = [ + "wasm-bindgen", +] + +[[package]] +name = "lazy_static" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" + +[[package]] +name = "libc" +version = "0.2.147" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4668fb0ea861c1df094127ac5f1da3409a82116a4ba74fca2e58ef927159bb3" + +[[package]] +name = "linux-raw-sys" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f051f77a7c8e6957c0696eac88f26b0117e54f52d3fc682ab19397a8812846a4" + +[[package]] +name = "linux-raw-sys" +version = "0.3.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ef53942eb7bf7ff43a617b3e2c1c4a5ecf5944a7c1bc12d7ee39bbb15e5c1519" + +[[package]] +name = "log" +version = "0.4.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b06a4cde4c0f271a446782e3eff8de789548ce57dbc8eca9292c27f4a42004b4" + +[[package]] +name = "memchr" +version = "2.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" + +[[package]] +name = "miniz_oxide" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e7810e0be55b428ada41041c41f32c9f1a42817901b4ccf45fa3d4b6561e74c7" +dependencies = [ + 
"adler", +] + +[[package]] +name = "num" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43db66d1170d347f9a065114077f7dccb00c1b9478c89384490a3425279a4606" +dependencies = [ + "num-bigint", + "num-complex", + "num-integer", + "num-iter", + "num-rational", + "num-traits", +] + +[[package]] +name = "num-bigint" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f93ab6289c7b344a8a9f60f88d80aa20032336fe78da341afc91c8a2341fc75f" +dependencies = [ + "autocfg", + "num-integer", + "num-traits", +] + +[[package]] +name = "num-complex" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "02e0d21255c828d6f128a1e41534206671e8c3ea0c62f32291e808dc82cff17d" +dependencies = [ + "num-traits", +] + +[[package]] +name = "num-derive" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e6a0fd4f737c707bd9086cc16c925f294943eb62eb71499e9fd4cf71f8b9f4e" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.27", +] + +[[package]] +name = "num-integer" +version = "0.1.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "225d3389fb3509a24c93f5c29eb6bde2586b98d9f016636dff58d7c6f7569cd9" +dependencies = [ + "autocfg", + "num-traits", +] + +[[package]] +name = "num-iter" +version = "0.1.43" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7d03e6c028c5dc5cac6e2dec0efda81fc887605bb3d884578bb6d6bf7514e252" +dependencies = [ + "autocfg", + "num-integer", + "num-traits", +] + +[[package]] +name = "num-rational" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0638a1c9d0a3c0914158145bc76cff373a75a627e6ecbfb71cbe6f453a5a19b0" +dependencies = [ + "autocfg", + "num-bigint", + "num-integer", + "num-traits", +] + +[[package]] +name = "num-traits" +version = "0.2.16" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "f30b0abd723be7e2ffca1272140fac1a2f084c77ec3e123c192b66af1ee9e6c2" +dependencies = [ + "autocfg", +] + +[[package]] +name = "once_cell" +version = "1.17.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b7e5500299e16ebb147ae15a00a942af264cf3688f47923b8fc2cd5858f23ad3" + +[[package]] +name = "ordered-float" +version = "3.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2fc2dbde8f8a79f2102cc474ceb0ad68e3b80b85289ea62389b60e66777e4213" +dependencies = [ + "num-traits", +] + +[[package]] +name = "os_str_bytes" +version = "6.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b7820b9daea5457c9f21c69448905d723fbd21136ccf521748f23fd49e723ee" + +[[package]] +name = "proc-macro-error" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c" +dependencies = [ + "proc-macro-error-attr", + "proc-macro2", + "quote", + "syn 1.0.109", + "version_check", +] + +[[package]] +name = "proc-macro-error-attr" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869" +dependencies = [ + "proc-macro2", + "quote", + "version_check", +] + +[[package]] +name = "proc-macro2" +version = "1.0.66" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "18fb31db3f9bddb2ea821cde30a9f70117e3f119938b5ee630b7403aa6e2ead9" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "pspp" +version = "1.0.0" +dependencies = [ + "anyhow", + "bitflags 2.5.0", + "chardetng", + "chrono", + "clap", + "diff", + "encoding_rs", + "enum-map", + "finl_unicode", + "flagset", + "flate2", + "float_next_after", + "hexplay", + "indexmap", + "lazy_static", + "libc", + "num", + "num-derive", + "num-traits", + 
"ordered-float", + "thiserror", + "unicase", + "unicode-width", + "utf8-decode", + "windows-sys 0.48.0", +] + +[[package]] +name = "quote" +version = "1.0.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "50f3b39ccfb720540debaa0164757101c08ecb8d326b15358ce76a62c7e85965" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "rustix" +version = "0.36.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f43abb88211988493c1abb44a70efa56ff0ce98f233b7b276146f1f3f7ba9644" +dependencies = [ + "bitflags 1.3.2", + "errno 0.2.8", + "io-lifetimes", + "libc", + "linux-raw-sys 0.1.4", + "windows-sys 0.45.0", +] + +[[package]] +name = "rustix" +version = "0.37.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "62b24138615de35e32031d041a09032ef3487a616d901ca4db224e7d557efae2" +dependencies = [ + "bitflags 1.3.2", + "errno 0.3.1", + "io-lifetimes", + "libc", + "linux-raw-sys 0.3.8", + "windows-sys 0.45.0", +] + +[[package]] +name = "strsim" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" + +[[package]] +name = "syn" +version = "1.0.109" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "syn" +version = "2.0.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b60f673f44a8255b9c8c657daf66a596d435f2da81a555b06dc644d080ba45e0" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "termcolor" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "adc4587ead41bf016f11af03e55a624c06568b5a19db4e90fde573d805074f83" +dependencies = [ + "wincolor", +] + +[[package]] +name = "termcolor" 
+version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be55cf8942feac5c765c2c993422806843c9a9a45d4d5c407ad6dd2ea95eb9b6" +dependencies = [ + "winapi-util", +] + +[[package]] +name = "terminal_size" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e6bf6f19e9f8ed8d4048dc22981458ebcf406d67e94cd422e5ecd73d63b3237" +dependencies = [ + "rustix 0.37.3", + "windows-sys 0.48.0", +] + +[[package]] +name = "thiserror" +version = "1.0.39" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a5ab016db510546d856297882807df8da66a16fb8c4101cb8b30054b0d5b2d9c" +dependencies = [ + "thiserror-impl", +] + +[[package]] +name = "thiserror-impl" +version = "1.0.39" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5420d42e90af0c38c3290abcca25b9b3bdf379fc9f55c528f53a269d9c9a267e" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", +] + +[[package]] +name = "time" +version = "0.1.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b797afad3f312d1c66a56d11d0316f916356d11bd158fbc6ca6389ff6bf805a" +dependencies = [ + "libc", + "wasi", + "winapi", +] + +[[package]] +name = "unicase" +version = "2.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "50f37be617794602aabbeee0be4f259dc1778fabe05e2d67ee8f79326d5cb4f6" +dependencies = [ + "version_check", +] + +[[package]] +name = "unicode-ident" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "84a22b9f218b40614adcb3f4ff08b703773ad44fa9423e4e0d346d5db86e4ebc" + +[[package]] +name = "unicode-width" +version = "0.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0336d538f7abc86d282a4189614dfaa90810dfc2c6f6427eaf88e16311dd225d" + +[[package]] +name = "utf8-decode" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "ca61eb27fa339aa08826a29f03e87b99b4d8f0fc2255306fd266bb1b6a9de498" + +[[package]] +name = "version_check" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" + +[[package]] +name = "wasi" +version = "0.10.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a143597ca7c7793eff794def352d41792a93c481eb1042423ff7ff72ba2c31f" + +[[package]] +name = "wasm-bindgen" +version = "0.2.87" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7706a72ab36d8cb1f80ffbf0e071533974a60d0a308d01a5d0375bf60499a342" +dependencies = [ + "cfg-if", + "wasm-bindgen-macro", +] + +[[package]] +name = "wasm-bindgen-backend" +version = "0.2.87" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5ef2b6d3c510e9625e5fe6f509ab07d66a760f0885d858736483c32ed7809abd" +dependencies = [ + "bumpalo", + "log", + "once_cell", + "proc-macro2", + "quote", + "syn 2.0.27", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-macro" +version = "0.2.87" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dee495e55982a3bd48105a7b947fd2a9b4a8ae3010041b9e0faab3f9cd028f1d" +dependencies = [ + "quote", + "wasm-bindgen-macro-support", +] + +[[package]] +name = "wasm-bindgen-macro-support" +version = "0.2.87" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "54681b18a46765f095758388f2d0cf16eb8d4169b639ab575a8f5693af210c7b" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.27", + "wasm-bindgen-backend", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-shared" +version = "0.2.87" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca6ad05a4870b2bf5fe995117d3728437bd27d7cd5f06f13c17443ef369775a1" + +[[package]] +name = "winapi" +version = "0.3.9" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + +[[package]] +name = "winapi-util" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178" +dependencies = [ + "winapi", +] + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + +[[package]] +name = "wincolor" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eeb06499a3a4d44302791052df005d5232b927ed1a9658146d842165c4de7767" +dependencies = [ + "winapi", +] + +[[package]] +name = "windows" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e686886bc078bc1b0b600cac0147aadb815089b6e4da64016cbd754b6342700f" +dependencies = [ + "windows-targets 0.48.1", +] + +[[package]] +name = "windows-sys" +version = "0.45.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "75283be5efb2831d37ea142365f009c02ec203cd29a3ebecbc093d52315b66d0" +dependencies = [ + "windows-targets 0.42.1", +] + +[[package]] +name = "windows-sys" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" +dependencies = [ + "windows-targets 0.48.1", +] + +[[package]] +name = "windows-targets" +version = "0.42.1" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "8e2522491fbfcd58cc84d47aeb2958948c4b8982e9a2d8a2a35bbaed431390e7" +dependencies = [ + "windows_aarch64_gnullvm 0.42.1", + "windows_aarch64_msvc 0.42.1", + "windows_i686_gnu 0.42.1", + "windows_i686_msvc 0.42.1", + "windows_x86_64_gnu 0.42.1", + "windows_x86_64_gnullvm 0.42.1", + "windows_x86_64_msvc 0.42.1", +] + +[[package]] +name = "windows-targets" +version = "0.48.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05d4b17490f70499f20b9e791dcf6a299785ce8af4d709018206dc5b4953e95f" +dependencies = [ + "windows_aarch64_gnullvm 0.48.0", + "windows_aarch64_msvc 0.48.0", + "windows_i686_gnu 0.48.0", + "windows_i686_msvc 0.48.0", + "windows_x86_64_gnu 0.48.0", + "windows_x86_64_gnullvm 0.48.0", + "windows_x86_64_msvc 0.48.0", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.42.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8c9864e83243fdec7fc9c5444389dcbbfd258f745e7853198f365e3c4968a608" + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91ae572e1b79dba883e0d315474df7305d12f569b400fcf90581b06062f7e1bc" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.42.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c8b1b673ffc16c47a9ff48570a9d85e25d265735c503681332589af6253c6c7" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2ef27e0d7bdfcfc7b868b317c1d32c641a6fe4629c171b8928c7b08d98d7cf3" + +[[package]] +name = "windows_i686_gnu" +version = "0.42.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "de3887528ad530ba7bdbb1faa8275ec7a1155a45ffa57c37993960277145d640" + +[[package]] +name = "windows_i686_gnu" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"622a1962a7db830d6fd0a69683c80a18fda201879f0f447f065a3b7467daa241" + +[[package]] +name = "windows_i686_msvc" +version = "0.42.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf4d1122317eddd6ff351aa852118a2418ad4214e6613a50e0191f7004372605" + +[[package]] +name = "windows_i686_msvc" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4542c6e364ce21bf45d69fdd2a8e455fa38d316158cfd43b3ac1c5b1b19f8e00" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.42.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c1040f221285e17ebccbc2591ffdc2d44ee1f9186324dd3e84e99ac68d699c45" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca2b8a661f7628cbd23440e50b05d705db3686f894fc9580820623656af974b1" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.42.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "628bfdf232daa22b0d64fdb62b09fcc36bb01f05a3939e20ab73aaf9470d0463" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7896dbc1f41e08872e9d5e8f8baa8fdd2677f29468c4e156210174edc7f7b953" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.42.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "447660ad36a13288b1db4d4248e857b510e8c3a225c822ba4fb748c0aafecffd" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a515f5799fe4961cb532f983ce2b23082366b898e52ffbce459c86f67c8378a" diff --git a/rust/pspp/Cargo.toml b/rust/pspp/Cargo.toml new file mode 100644 index 0000000000..41b2f02c6f --- /dev/null +++ b/rust/pspp/Cargo.toml @@ -0,0 +1,51 @@ +[package] +name = "pspp" +version = "1.0.0" +edition = "2021" +authors = [ "Ben 
Pfaff", "John Darrington" ] + +[dependencies] +anyhow = "1.0.69" +clap = { version = "4.1.7", features = ["derive", "wrap_help"] } +encoding_rs = "0.8.32" +flate2 = "1.0.26" +float_next_after = "1.0.0" +hexplay = "0.2.1" +lazy_static = "1.4.0" +num = "0.4.0" +num-derive = "0.4.0" +num-traits = "0.2.16" +ordered-float = "3.7.0" +thiserror = "1.0" +chrono = "0.4.26" +finl_unicode = "1.2.0" +unicase = "2.6.0" +libc = "0.2.147" +indexmap = "2.1.0" +utf8-decode = "1.0.1" +bitflags = "2.5.0" +unicode-width = "0.1.13" +chardetng = "0.1.17" +enum-map = "2.7.3" +flagset = "0.4.6" + +[target.'cfg(windows)'.dependencies] +windows-sys = { version = "0.48.0", features = ["Win32_Globalization"] } + +[build-dependencies] +anyhow = "1.0.69" + +[[bin]] +name = "pspp-dump-sav" +path = "src/main.rs" + +[lib] +path = "src/lib.rs" + +[[test]] +name = "sack" +path = "tests/sack.rs" +harness = false + +[dev-dependencies] +diff = "0.1.13" diff --git a/rust/pspp/build.rs b/rust/pspp/build.rs new file mode 100644 index 0000000000..f8cb9efa13 --- /dev/null +++ b/rust/pspp/build.rs @@ -0,0 +1,184 @@ +use anyhow::{anyhow, Result as AnyResult}; +use std::{ + collections::{BTreeMap, HashSet, VecDeque}, + env::var_os, + fs::{read_to_string, File}, + io::{Error as IoError, Write}, + path::{Path, PathBuf}, +}; + +#[derive(Copy, Clone, PartialEq, Eq, Ord, PartialOrd)] +enum Source { + Codepage, + Ibm, + Windows, +} + +// Code page number. 
+type CodepageNumber = usize; + +fn process_converter<'a>( + fields: &Vec<&'a str>, + codepages: &mut BTreeMap>>, +) { + if fields.is_empty() || fields[0] == "{" { + return; + } + + let mut cps: BTreeMap = BTreeMap::new(); + let mut iana = VecDeque::new(); + let mut other = VecDeque::new(); + + let mut iter = fields.iter().peekable(); + while let Some(&name) = iter.next() { + if iter.next_if(|&&s| s == "{").is_some() { + let mut standards = HashSet::new(); + loop { + let &standard = iter.next().expect("missing `}` in list of standards"); + if standard == "}" { + break; + } + standards.insert(standard); + } + + if standards.contains("IANA*") { + iana.push_front(name); + } else if standards.contains("IANA") { + iana.push_back(name); + } else if standards.iter().any(|&s| s.ends_with('*')) { + other.push_front(name); + } else { + other.push_back(name); + } + } else { + // Untagged names are completely nonstandard. + continue; + } + + if let Some(number) = name.strip_prefix("cp") { + if let Ok(number) = number.parse::() { + cps.insert(Source::Codepage, number); + } + } + + if let Some(number) = name.strip_prefix("windows-") { + if let Ok(number) = number.parse::() { + cps.insert(Source::Windows, number); + } + } + + if let Some(number) = name.strip_prefix("ibm-") { + if let Ok(number) = number.parse::() { + cps.insert(Source::Ibm, number); + } + } + } + + // If there are no tagged names then this is completely nonstandard. + if iana.is_empty() && other.is_empty() { + return; + } + + let all: Vec<&str> = iana.into_iter().chain(other).collect(); + for (source, number) in cps { + codepages + .entry(number) + .or_default() + .insert(source, all.clone()); + } +} + +fn write_output( + codepages: &BTreeMap>>, + file_name: &PathBuf, +) -> Result<(), IoError> { + let mut file = File::create(file_name)?; + + file.write_all( + "\ +use lazy_static::lazy_static; +use std::collections::HashMap; + +lazy_static! 
{ + static ref CODEPAGE_NUMBER_TO_NAME: HashMap = { + let mut map = HashMap::new(); +" + .as_bytes(), + )?; + + for (&cpnumber, value) in codepages.iter() { + let source = value.keys().max().unwrap(); + let name = value[source][0]; + writeln!(file, " map.insert({cpnumber}, \"{name}\");")?; + } + file.write_all( + " map + }; + + static ref CODEPAGE_NAME_TO_NUMBER: HashMap<&'static str, u32> = { + let mut map = HashMap::new(); +" + .as_bytes(), + )?; + + let mut names: BTreeMap>> = BTreeMap::new(); + for (&cpnumber, value) in codepages.iter() { + for (&source, value2) in value.iter() { + for name in value2.iter().map(|name| name.to_ascii_lowercase()) { + names + .entry(name) + .or_default() + .entry(source) + .or_default() + .push(cpnumber); + } + } + } + + for (name, value) in names.iter() { + for (_source, numbers) in value.iter().rev().take(1) { + writeln!(file, " map.insert(\"{name}\", {});", numbers[0])?; + } + } + file.write_all( + " map + }; +} +" + .as_bytes(), + )?; + + Ok(()) +} + +fn main() -> AnyResult<()> { + println!("cargo:rerun-if-changed=build.rs"); + + let input_file = Path::new(env!("CARGO_MANIFEST_DIR")).join("convrtrs.txt"); + println!("cargo:rerun-if-changed={}", input_file.to_string_lossy()); + let input = read_to_string(&input_file) + .map_err(|e| anyhow!("{}: read failed ({e})", input_file.to_string_lossy()))?; + + let mut codepages: BTreeMap>> = BTreeMap::new(); + let mut converter: Vec<&str> = Vec::new(); + for line in input.lines() { + let line = line + .find('#') + .map(|position| &line[..position]) + .unwrap_or(line) + .trim_end(); + if !line.starts_with([' ', '\t']) { + process_converter(&converter, &mut codepages); + converter.clear(); + } + converter.extend(line.split_whitespace()); + } + process_converter(&converter, &mut codepages); + + let output_file_name = Path::new(&var_os("OUT_DIR").unwrap()).join("encodings.rs"); + + write_output(&codepages, &output_file_name) + .map_err(|e| anyhow!("{}: write failed ({e})", 
output_file_name.to_string_lossy()))?; + + Ok(()) +} diff --git a/rust/pspp/convrtrs.txt b/rust/pspp/convrtrs.txt new file mode 100644 index 0000000000..4aaa592a53 --- /dev/null +++ b/rust/pspp/convrtrs.txt @@ -0,0 +1,1269 @@ +# Copyright (C) 2016 and later: Unicode, Inc. and others. +# License & terms of use: http://www.unicode.org/copyright.html +# ****************************************************************************** +# * +# * Copyright (C) 1995-2014, International Business Machines +# * Corporation and others. All Rights Reserved. +# * +# ****************************************************************************** + +# If this converter alias table looks very confusing, a much easier to +# understand view can be found at this demo: +# http://demo.icu-project.org/icu-bin/convexp + +# IMPORTANT NOTE +# +# This file is not read directly by ICU. If you change it, you need to +# run gencnval, and eventually run pkgdata to update the representation that +# ICU uses for aliases. The gencnval tool will normally compile this file into +# cnvalias.icu. The gencnval -v verbose option will help you when you edit +# this file. + +# Please be friendly to the rest of us that edit this table by +# keeping this table free of tabs. + +# This is an alias file used by the character set converter. +# A lot of converter information can be found in unicode/ucnv.h, but here +# is more information about this file. +# +# If you are adding a new converter to this list and want to include it in the +# icu data library, please be sure to add an entry to the appropriate ucm*.mk file +# (see ucmfiles.mk for more information). 
+# +# Here is the file format using BNF-like syntax: +# +# converterTable ::= tags { converterLine* } +# converterLine ::= converterName [ tags ] { taggedAlias* }'\n' +# taggedAlias ::= alias [ tags ] +# tags ::= '{' { tag+ } '}' +# tag ::= standard['*'] +# converterName ::= [0-9a-zA-Z:_'-']+ +# alias ::= converterName +# +# Except for the converter name, aliases are case insensitive. +# Names are separated by whitespace. +# Line continuation and comment sytax are similar to the GNU make syntax. +# Any lines beginning with whitespace (e.g. U+0020 SPACE or U+0009 HORIZONTAL +# TABULATION) are presumed to be a continuation of the previous line. +# The # symbol starts a comment and the comment continues till the end of +# the line. +# +# The converter +# +# All names can be tagged by including a space-separated list of tags in +# curly braces, as in ISO_8859-1:1987{IANA*} iso-8859-1 { MIME* } or +# some-charset{MIME* IANA*}. The order of tags does not matter, and +# whitespace is allowed between the tagged name and the tags list. +# +# The tags can be used to get standard names using ucnv_getStandardName(). +# +# The complete list of recognized tags used in this file is defined in +# the affinity list near the beginning of the file. +# +# The * after the standard tag denotes that the previous alias is the +# preferred (default) charset name for that standard. There can only +# be one of these default charset names per converter. + + + +# The world is getting more complicated... +# Supporting XML parsers, HTML, MIME, and similar applications +# that mark encodings with a charset name can be difficult. +# Many of these applications and operating systems will update +# their codepages over time. + +# It means that a new codepage, one that differs from an +# old one by changing a code point, e.g., to the Euro sign, +# must not get an old alias, because it would mean that +# old files with this alias would be interpreted differently. 
+ +# If an codepage gets updated by assigning characters to previously +# unassigned code points, then a new name is not necessary. +# Also, some codepages map unassigned codepage byte values +# to the same numbers in Unicode for roundtripping. It may be +# industry practice to keep the encoding name in such a case, too +# (example: Windows codepages). + +# The aliases listed in the list of character sets +# that is maintained by the IANA (http://www.iana.org/) must +# not be changed to mean encodings different from what this +# list shows. Currently, the IANA list is at +# http://www.iana.org/assignments/character-sets +# It should also be mentioned that the exact mapping table used for each +# IANA names usually isn't specified. This means that some other applications +# and operating systems are left to interpret the exact mappings for the +# underspecified aliases. For instance, Shift-JIS on a Solaris platform +# may be different from Shift-JIS on a Windows platform. This is why +# some of the aliases can be tagged to differentiate different mapping +# tables with the same alias. If an alias is given to more than one converter, +# it is considered to be an ambiguous alias, and the affinity list will +# choose the converter to use when a standard isn't specified with the alias. + +# Name matching is case-insensitive. Also, dashes '-', underscores '_' +# and spaces ' ' are ignored in names (thus cs-iso_latin-1, csisolatin1 +# and "cs iso latin 1" are the same). +# However, the names in the left column are directly file names +# or names of algorithmic converters, and their case must not +# be changed - or else code and/or file names must also be changed. +# For example, the converter ibm-921 is expected to be the file ibm-921.cnv. + + + +# The immediately following list is the affinity list of supported standard tags. 
+# When multiple converters have the same alias under different standards, +# the standard nearest to the top of this list with that alias will +# be the first converter that will be opened. The ordering of the aliases +# after this affinity list does not affect the preferred alias, but it may +# affect the order of the returned list of aliases for a given converter. +# +# The general ordering is from specific and frequently used to more general +# or rarely used at the bottom. +{ UTR22 # Name format specified by https://www.unicode.org/reports/tr22/ + # ICU # Can also use ICU_FEATURE + IBM # The IBM CCSID number is specified by ibm-* + WINDOWS # The Microsoft code page identifier number is specified by windows-*. The rest are recognized IE names. + JAVA # Source: Sun JDK. Alias name case is ignored, but dashes are not ignored. + # GLIBC + # AIX + # DB2 + # SOLARIS + # APPLE + # HPUX + IANA # Source: http://www.iana.org/assignments/character-sets + MIME # Source: http://www.iana.org/assignments/character-sets + # MSIE # MSIE is Internet Explorer, which can be different from Windows (From the IMultiLanguage COM interface) + # ZOS_USS # z/OS (os/390) Unix System Services (USS), which has NL<->LF swapping. They have the same format as the IBM tag. + } + + + +# Fully algorithmic converters + +UTF-8 { IANA* MIME* JAVA* WINDOWS } + ibm-1208 { IBM* } # UTF-8 with IBM PUA + ibm-1209 { IBM } # UTF-8 + ibm-5304 { IBM } # Unicode 2.0, UTF-8 with IBM PUA + ibm-5305 { IBM } # Unicode 2.0, UTF-8 + ibm-13496 { IBM } # Unicode 3.0, UTF-8 with IBM PUA + ibm-13497 { IBM } # Unicode 3.0, UTF-8 + ibm-17592 { IBM } # Unicode 4.0, UTF-8 with IBM PUA + ibm-17593 { IBM } # Unicode 4.0, UTF-8 + windows-65001 { WINDOWS* } + cp1208 + x-UTF_8J + unicode-1-1-utf-8 + unicode-2-0-utf-8 + +# The ICU 2.2 UTF-16/32 converters detect and write a BOM. 
+UTF-16 { IANA* MIME* JAVA* } ISO-10646-UCS-2 { IANA } + ibm-1204 { IBM* } # UTF-16 with IBM PUA and BOM sensitive + ibm-1205 { IBM } # UTF-16 BOM sensitive + unicode + csUnicode + ucs-2 +# The following Unicode CCSIDs (IBM) are not valid in ICU because they are +# considered pure DBCS (exactly 2 bytes) of Unicode, +# and they are a subset of Unicode. ICU does not support their encoding structures. +# 1400 1401 1402 1410 1414 1415 1446 1447 1448 1449 64770 64771 65520 5496 5497 5498 9592 13688 +UTF-16BE { IANA* MIME* JAVA* } x-utf-16be { JAVA } + UnicodeBigUnmarked { JAVA } # java.io name + ibm-1200 { IBM* } # UTF-16 BE with IBM PUA + ibm-1201 { IBM } # UTF-16 BE + ibm-13488 { IBM } # Unicode 2.0, UTF-16 BE with IBM PUA + ibm-13489 { IBM } # Unicode 2.0, UTF-16 BE + ibm-17584 { IBM } # Unicode 3.0, UTF-16 BE with IBM PUA + ibm-17585 { IBM } # Unicode 3.0, UTF-16 BE + ibm-21680 { IBM } # Unicode 4.0, UTF-16 BE with IBM PUA + ibm-21681 { IBM } # Unicode 4.0, UTF-16 BE + ibm-25776 { IBM } # Unicode 4.1, UTF-16 BE with IBM PUA + ibm-25777 { IBM } # Unicode 4.1, UTF-16 BE + ibm-29872 { IBM } # Unicode 5.0, UTF-16 BE with IBM PUA + ibm-29873 { IBM } # Unicode 5.0, UTF-16 BE + ibm-61955 { IBM } # UTF-16BE with Gaidai University (Japan) PUA + ibm-61956 { IBM } # UTF-16BE with Microsoft HKSCS-Big 5 PUA + windows-1201 { WINDOWS* } + cp1200 + cp1201 + UTF16_BigEndian + # ibm-5297 { IBM } # Unicode 2.0, UTF-16 (BE) (reserved, never used) + # iso-10646-ucs-2 { JAVA } # This is ambiguous + # ibm-61952 is not a valid CCSID because it's Unicode 1.1 + # ibm-61953 is not a valid CCSID because it's Unicode 1.0 +UTF-16LE { IANA* MIME* JAVA* } x-utf-16le { JAVA } + UnicodeLittleUnmarked { JAVA } # java.io name + ibm-1202 { IBM* } # UTF-16 LE with IBM PUA + ibm-1203 { IBM } # UTF-16 LE + ibm-13490 { IBM } # Unicode 2.0, UTF-16 LE with IBM PUA + ibm-13491 { IBM } # Unicode 2.0, UTF-16 LE + ibm-17586 { IBM } # Unicode 3.0, UTF-16 LE with IBM PUA + ibm-17587 { IBM } # Unicode 3.0, UTF-16 
LE + ibm-21682 { IBM } # Unicode 4.0, UTF-16 LE with IBM PUA + ibm-21683 { IBM } # Unicode 4.0, UTF-16 LE + ibm-25778 { IBM } # Unicode 4.1, UTF-16 LE with IBM PUA + ibm-25779 { IBM } # Unicode 4.1, UTF-16 LE + ibm-29874 { IBM } # Unicode 5.0, UTF-16 LE with IBM PUA + ibm-29875 { IBM } # Unicode 5.0, UTF-16 LE + UTF16_LittleEndian + windows-1200 { WINDOWS* } + +UTF-32 { IANA* MIME* } ISO-10646-UCS-4 { IANA } + ibm-1236 { IBM* } # UTF-32 with IBM PUA and BOM sensitive + ibm-1237 { IBM } # UTF-32 BOM sensitive + csUCS4 + ucs-4 +UTF-32BE { IANA* } UTF32_BigEndian + ibm-1232 { IBM* } # UTF-32 BE with IBM PUA + ibm-1233 { IBM } # UTF-32 BE + ibm-9424 { IBM } # Unicode 4.1, UTF-32 BE with IBM PUA +UTF-32LE { IANA* } UTF32_LittleEndian + ibm-1234 { IBM* } # UTF-32 LE, with IBM PUA + ibm-1235 { IBM } # UTF-32 LE + +# ICU-specific names for special uses +UTF16_PlatformEndian +UTF16_OppositeEndian + +UTF32_PlatformEndian +UTF32_OppositeEndian + + +# Java-specific, non-Unicode-standard UTF-16 variants. +# These are in the Java "Basic Encoding Set (contained in lib/rt.jar)". +# See the "Supported Encodings" at +# http://java.sun.com/javase/6/docs/technotes/guides/intl/encoding.doc.html +# or a newer version of this document. +# +# Aliases marked with { JAVA* } are canonical names for java.io and java.lang APIs. +# Aliases marked with { JAVA } are canonical names for the java.nio API. +# +# "BOM" means the Unicode Byte Order Mark, which is the encoding-scheme-specific +# byte sequence for U+FEFF. +# "Reverse BOM" means the BOM for the sibling encoding scheme with the +# opposite endianness. (LE<->BE) + +# "Sixteen-bit Unicode (or UCS) Transformation Format, big-endian byte order, +# with byte-order mark" +# +# From Unicode: Writes BOM. +# To Unicode: Detects and consumes BOM. +# If there is a "reverse BOM", Java throws +# MalformedInputException: Incorrect byte-order mark. 
+# In this case, ICU4C sets a U_ILLEGAL_ESCAPE_SEQUENCE UErrorCode value +# and a UCNV_ILLEGAL UConverterCallbackReason. +UTF-16BE,version=1 UnicodeBig { JAVA* } + +# "Sixteen-bit Unicode (or UCS) Transformation Format, little-endian byte order, +# with byte-order mark" +# +# From Unicode: Writes BOM. +# To Unicode: Detects and consumes BOM. +# If there is a "reverse BOM", Java throws +# MalformedInputException: Incorrect byte-order mark. +# In this case, ICU4C sets a U_ILLEGAL_ESCAPE_SEQUENCE UErrorCode value +# and a UCNV_ILLEGAL UConverterCallbackReason. +UTF-16LE,version=1 UnicodeLittle { JAVA* } x-UTF-16LE-BOM { JAVA } + +# This one is not mentioned on the "Supported Encodings" page +# but is available in Java. +# In Java, this is called "Unicode" but we cannot give it that alias +# because the standard UTF-16 converter already has a "unicode" alias. +# +# From Unicode: Writes BOM. +# To Unicode: Detects and consumes BOM. +# If there is no BOM, rather than defaulting to BE, Java throws +# MalformedInputException: Missing byte-order mark. +# In this case, ICU4C sets a U_ILLEGAL_ESCAPE_SEQUENCE UErrorCode value +# and a UCNV_ILLEGAL UConverterCallbackReason. +UTF-16,version=1 + +# This is the same as standard UTF-16 but always writes a big-endian byte stream, +# regardless of the platform endianness, as expected by the Java compatibility tests. +# See the java.nio.charset.Charset API documentation at +# http://java.sun.com/javase/6/docs/api/java/nio/charset/Charset.html +# or a newer version of this document. +# +# From Unicode: Write BE BOM and BE bytes +# To Unicode: Detects and consumes BOM. Defaults to BE. +UTF-16,version=2 + +# Note: ICU does not currently support Java-specific, non-Unicode-standard UTF-32 variants. +# Presumably, these behave analogously to the UTF-16 variants with similar names. +# UTF_32BE_BOM x-UTF-32BE-BOM +# UTF_32LE_BOM x-UTF-32LE-BOM + +# End of Java-specific, non-Unicode-standard UTF variants. 
+
+
+# On UTF-7:
+# RFC 2152 (http://www.imc.org/rfc2152) allows encoding some US-ASCII
+# characters directly or in base64. Especially, the characters in set O
+# as defined in the RFC (!"#$%&*;<=>@[]^_`{|}) may be encoded directly
+# but are not allowed in, e.g., email headers.
+# By default, the ICU UTF-7 converter encodes set O directly.
+# By choosing the option "version=1", set O will be escaped instead.
+# For example:
+# utf7Converter=ucnv_open("UTF-7,version=1");
+#
+# For details about email headers see RFC 2047.
+UTF-7 { IANA* MIME* WINDOWS } windows-65000 { WINDOWS* }
+ unicode-1-1-utf-7
+ unicode-2-0-utf-7
+
+# UTF-EBCDIC doesn't exist in ICU, but the aliases are here for reference.
+#UTF-EBCDIC ibm-1210 { IBM* } ibm-1211 { IBM }
+
+# IMAP-mailbox-name is an ICU-specific name for the encoding of IMAP mailbox names.
+# It is a substantially modified UTF-7 encoding. See the specification in:
+#
+# RFC 2060: INTERNET MESSAGE ACCESS PROTOCOL - VERSION 4rev1
+# (http://www.ietf.org/rfc/rfc2060.txt)
+# Section 5.1.3. Mailbox International Naming Convention
+IMAP-mailbox-name
+
+SCSU { IANA* }
+ ibm-1212 { IBM } # SCSU with IBM PUA
+ ibm-1213 { IBM* } # SCSU
+BOCU-1 { IANA* }
+ csBOCU-1 { IANA }
+ ibm-1214 { IBM } # BOCU-1 with IBM PUA
+ ibm-1215 { IBM* } # BOCU-1
+
+# See https://www.unicode.org/reports/tr26 for this Compatibility Encoding Scheme for UTF-16
+# The Unicode Consortium does not encourage the use of CESU-8
+CESU-8 { IANA* } ibm-9400 { IBM* }
+
+# Standard iso-8859-1, which does not have the Euro update.
+# See iso-8859-15 (latin9) for the Euro update
+ISO-8859-1 { MIME* IANA JAVA* }
+ ibm-819 { IBM* JAVA } # This is not truly ibm-819 because it's missing the fallbacks.
+ IBM819 { IANA }
+ cp819 { IANA JAVA }
+ latin1 { IANA JAVA }
+ 8859_1 { JAVA }
+ csISOLatin1 { IANA JAVA }
+ iso-ir-100 { IANA JAVA }
+ ISO_8859-1:1987 { IANA* JAVA }
+ l1 { IANA JAVA }
+ 819 { JAVA }
+ # windows-28591 { WINDOWS* } # This has odd behavior because it has the Euro update, which isn't correct.
+ # LATIN_1 # Old ICU name
+ # ANSI_X3.110-1983 # This is for a different IANA alias. This isn't iso-8859-1.
+
+US-ASCII { MIME* IANA JAVA WINDOWS }
+ ASCII { JAVA* IANA WINDOWS }
+ ANSI_X3.4-1968 { IANA* WINDOWS }
+ ANSI_X3.4-1986 { IANA WINDOWS }
+ ISO_646.irv:1991 { IANA WINDOWS }
+ iso_646.irv:1983 { JAVA }
+ ISO646-US { JAVA IANA WINDOWS }
+ us { IANA }
+ csASCII { IANA WINDOWS }
+ iso-ir-6 { IANA }
+ cp367 { IANA WINDOWS }
+ ascii7 { JAVA }
+ 646 { JAVA }
+ windows-20127 { WINDOWS* }
+ ibm-367 { IBM* } IBM367 { IANA WINDOWS } # This is not truly ibm-367 because it's missing the fallbacks.
+
+# GB 18030 is partly algorithmic, using the MBCS converter
+gb18030 { IANA* } ibm-1392 { IBM* } windows-54936 { WINDOWS* } GB18030 { MIME* }
+
+# Table-based interchange codepages
+
+# Central Europe
+ibm-912_P100-1995 { UTR22* }
+ ibm-912 { IBM* JAVA }
+ ISO-8859-2 { MIME* IANA JAVA* WINDOWS }
+ ISO_8859-2:1987 { IANA* WINDOWS JAVA }
+ latin2 { IANA WINDOWS JAVA }
+ csISOLatin2 { IANA WINDOWS JAVA }
+ iso-ir-101 { IANA WINDOWS JAVA }
+ l2 { IANA WINDOWS JAVA }
+ 8859_2 { JAVA }
+ cp912 { JAVA }
+ 912 { JAVA }
+ windows-28592 { WINDOWS* }
+
+# Maltese Esperanto
+ibm-913_P100-2000 { UTR22* }
+ ibm-913 { IBM* JAVA }
+ ISO-8859-3 { MIME* IANA WINDOWS JAVA* }
+ ISO_8859-3:1988 { IANA* WINDOWS JAVA }
+ latin3 { IANA JAVA WINDOWS }
+ csISOLatin3 { IANA WINDOWS }
+ iso-ir-109 { IANA WINDOWS JAVA }
+ l3 { IANA WINDOWS JAVA }
+ 8859_3 { JAVA }
+ cp913 { JAVA }
+ 913 { JAVA }
+ windows-28593 { WINDOWS* }
+
+# Baltic
+ibm-914_P100-1995 { UTR22* }
+ ibm-914 { IBM* JAVA }
+ ISO-8859-4 { MIME* IANA WINDOWS JAVA* }
+ latin4 { IANA WINDOWS JAVA }
+ csISOLatin4 { IANA WINDOWS JAVA }
+ iso-ir-110 { IANA WINDOWS JAVA } + ISO_8859-4:1988 { IANA* WINDOWS JAVA } + l4 { IANA WINDOWS JAVA } + 8859_4 { JAVA } + cp914 { JAVA } + 914 { JAVA } + windows-28594 { WINDOWS* } + +# Cyrillic +ibm-915_P100-1995 { UTR22* } + ibm-915 { IBM* JAVA } + ISO-8859-5 { MIME* IANA WINDOWS JAVA* } + cyrillic { IANA WINDOWS JAVA } + csISOLatinCyrillic { IANA WINDOWS JAVA } + iso-ir-144 { IANA WINDOWS JAVA } + ISO_8859-5:1988 { IANA* WINDOWS JAVA } + 8859_5 { JAVA } + cp915 { JAVA } + 915 { JAVA } + windows-28595 { WINDOWS* } + +glibc-PT154-2.3.3 { UTR22* } + PTCP154 { IANA* } + csPTCP154 + PT154 + CP154 + Cyrillic-Asian + +# Arabic +# ISO_8859-6-E and ISO_8859-6-I are similar to this charset, but BiDi is done differently +# From a narrow mapping point of view, there is no difference. +# -E means explicit. -I means implicit. +# -E requires the client to handle the ISO 6429 bidirectional controls +ibm-1089_P100-1995 { UTR22* } + ibm-1089 { IBM* JAVA } + ISO-8859-6 { MIME* IANA WINDOWS JAVA* } + arabic { IANA WINDOWS JAVA } + csISOLatinArabic { IANA WINDOWS JAVA } + iso-ir-127 { IANA WINDOWS JAVA } + ISO_8859-6:1987 { IANA* WINDOWS JAVA } + ECMA-114 { IANA JAVA } + ASMO-708 { IANA JAVA } + 8859_6 { JAVA } + cp1089 { JAVA } + 1089 { JAVA } + windows-28596 { WINDOWS* } + ISO-8859-6-I { IANA MIME } # IANA considers this alias different and BiDi needs to be applied. + ISO-8859-6-E { IANA MIME } # IANA considers this alias different and BiDi needs to be applied. + x-ISO-8859-6S { JAVA } + +# ISO Greek (with euro update). 
This is really ISO_8859-7:2003 +ibm-9005_X110-2007 { UTR22* } + ibm-9005 { IBM* } + ISO-8859-7 { MIME* IANA JAVA* WINDOWS } + 8859_7 { JAVA } + greek { IANA JAVA WINDOWS } + greek8 { IANA JAVA WINDOWS } + ELOT_928 { IANA JAVA WINDOWS } + ECMA-118 { IANA JAVA WINDOWS } + csISOLatinGreek { IANA JAVA WINDOWS } + iso-ir-126 { IANA JAVA WINDOWS } + ISO_8859-7:1987 { IANA* JAVA WINDOWS } + windows-28597 { WINDOWS* } + sun_eu_greek # For Solaris + +# ISO Greek (w/o euro update) +# JDK 1.5 has these aliases. +ibm-813_P100-1995 { UTR22* } + ibm-813 { IBM* JAVA* } + cp813 { JAVA } + 813 { JAVA } + +# hebrew +# ISO_8859-8-E and ISO_8859-8-I are similar to this charset, but BiDi is done differently +# From a narrow mapping point of view, there is no difference. +# -E means explicit. -I means implicit. +# -E requires the client to handle the ISO 6429 bidirectional controls +# This matches the official mapping on unicode.org +ibm-5012_P100-1999 { UTR22* } + ibm-5012 { IBM* } + ISO-8859-8 { MIME* IANA WINDOWS JAVA* } + hebrew { IANA WINDOWS JAVA } + csISOLatinHebrew { IANA WINDOWS JAVA } + iso-ir-138 { IANA WINDOWS JAVA } + ISO_8859-8:1988 { IANA* WINDOWS JAVA } + ISO-8859-8-I { IANA MIME } # IANA and Windows considers this alias different and BiDi needs to be applied. + ISO-8859-8-E { IANA MIME } # IANA and Windows considers this alias different and BiDi needs to be applied. + 8859_8 { JAVA } + windows-28598 { WINDOWS* } # Hebrew (ISO-Visual). A hybrid between ibm-5012 and ibm-916 with extra PUA mappings. 
+ hebrew8 # Reflect HP-UX code page update
+
+# Unfortunately, the Java aliases are split across ibm-916 and ibm-5012
+# Also many platforms are a combination between ibm-916 and ibm-5012 behaviors
+ibm-916_P100-1995 { UTR22* }
+ ibm-916 { IBM* JAVA* }
+ cp916 { JAVA }
+ 916 { JAVA }
+
+# Turkish
+ibm-920_P100-1995 { UTR22* }
+ ibm-920 { IBM* JAVA }
+ ISO-8859-9 { MIME* IANA WINDOWS JAVA* }
+ latin5 { IANA WINDOWS JAVA }
+ csISOLatin5 { IANA JAVA }
+ iso-ir-148 { IANA WINDOWS JAVA }
+ ISO_8859-9:1989 { IANA* WINDOWS }
+ l5 { IANA WINDOWS JAVA }
+ 8859_9 { JAVA }
+ cp920 { JAVA }
+ 920 { JAVA }
+ windows-28599 { WINDOWS* }
+ ECMA-128 # IANA doesn't have this alias 6/24/2002
+ turkish8 # Reflect HP-UX codepage update 8/1/2008
+ turkish # Reflect HP-UX codepage update 8/1/2008
+
+# Nordic languages
+iso-8859_10-1998 { UTR22* } ISO-8859-10 { MIME* IANA* }
+ iso-ir-157 { IANA }
+ l6 { IANA }
+ ISO_8859-10:1992 { IANA }
+ csISOLatin6 { IANA }
+ latin6 { IANA }
+
+# Thai
+# Be warned. There are several iso-8859-11 codepage variants, and they are all incompatible.
+# ISO-8859-11 is a superset of TIS-620. The difference is that ISO-8859-11 contains the C1 control codes.
+iso-8859_11-2001 { UTR22* } ISO-8859-11
+ thai8 # HP-UX alias. HP-UX says TIS-620, but it's closer to ISO-8859-11.
+ x-iso-8859-11 { JAVA* } + +# iso-8859-13, PC Baltic (w/o euro update) +ibm-921_P100-1995 { UTR22* } + ibm-921 { IBM* } + ISO-8859-13 { IANA* MIME* JAVA* } + 8859_13 { JAVA } + windows-28603 { WINDOWS* } + cp921 + 921 + x-IBM921 { JAVA } + +# Celtic +iso-8859_14-1998 { UTR22* } ISO-8859-14 { IANA* } + iso-ir-199 { IANA } + ISO_8859-14:1998 { IANA } + latin8 { IANA } + iso-celtic { IANA } + l8 { IANA } + +# Latin 9 +ibm-923_P100-1998 { UTR22* } + ibm-923 { IBM* JAVA } + ISO-8859-15 { IANA* MIME* WINDOWS JAVA* } + Latin-9 { IANA WINDOWS } + l9 { WINDOWS } + 8859_15 { JAVA } + latin0 { JAVA } + csisolatin0 { JAVA } + csisolatin9 { JAVA } + iso8859_15_fdis { JAVA } + cp923 { JAVA } + 923 { JAVA } + windows-28605 { WINDOWS* } + +# CJK encodings + +ibm-942_P12A-1999 { UTR22* } # ibm-942_P120 is a rarely used alternate mapping (sjis78 is already old) + ibm-942 { IBM* } + ibm-932 { IBM } + cp932 + shift_jis78 + sjis78 + ibm-942_VSUB_VPUA + ibm-932_VSUB_VPUA + x-IBM942 { JAVA* } + x-IBM942C { JAVA } + # Is this "JIS_C6226-1978"? + +# ibm-943_P15A-2003 differs from windows-932-2000 only in a few roundtrip mappings: +# - the usual IBM PC control code rotation (1A-1C-7F) +# - the Windows table has roundtrips for bytes 80, A0, and FD-FF to U+0080 and PUA +ibm-943_P15A-2003 { UTR22* } + ibm-943 # Leave untagged because this isn't the default + Shift_JIS { IANA* MIME* WINDOWS JAVA } + MS_Kanji { IANA WINDOWS JAVA } + csShiftJIS { IANA WINDOWS JAVA } + windows-31j { IANA JAVA } # A further extension of Shift_JIS to include NEC special characters (Row 13) + csWindows31J { IANA WINDOWS JAVA } # A further extension of Shift_JIS to include NEC special characters (Row 13) + x-sjis { WINDOWS JAVA } + x-ms-cp932 { WINDOWS } + cp932 { WINDOWS } + windows-932 { WINDOWS* } + cp943c { JAVA* } # This is slightly different, but the backslash mapping is the same. 
+ IBM-943C #{ AIX* } # Add this tag once AIX aliases becomes available + ms932 + pck # Probably SOLARIS + sjis # This might be for ibm-1351 + ibm-943_VSUB_VPUA + x-MS932_0213 { JAVA } + x-JISAutoDetect { JAVA } + # cp943 # This isn't Windows, and no one else uses it. + # IANA says that Windows-31J is an extension to csshiftjis ibm-932 +ibm-943_P130-1999 { UTR22* } + ibm-943 { IBM* JAVA } + Shift_JIS # Leave untagged because this isn't the default + cp943 { JAVA* } # This is slightly different, but the backslash mapping is the same. + 943 { JAVA } + ibm-943_VASCII_VSUB_VPUA + x-IBM943 { JAVA } + # japanese. Unicode name is \u30b7\u30d5\u30c8\u7b26\u53f7\u5316\u8868\u73fe +ibm-33722_P12A_P12A-2009_U2 { UTR22* } + ibm-33722 # Leave untagged because this isn't the default + ibm-5050 # Leave untagged because this isn't the default, and yes this alias is correct + ibm-33722_VPUA + IBM-eucJP +windows-51932-2006 { UTR22* } + windows-51932 { WINDOWS* } + CP51932 { IANA* } + csCP51932 +ibm-33722_P120-1999 { UTR22* } # Japan EUC with \ <-> Yen mapping + ibm-33722 { IBM* JAVA } + ibm-5050 { IBM } # Yes this is correct + cp33722 { JAVA* } + 33722 { JAVA } + ibm-33722_VASCII_VPUA + x-IBM33722 { JAVA } + x-IBM33722A { JAVA } + x-IBM33722C { JAVA } +# ibm-954 seems to be almost a superset of ibm-33722 and ibm-1350 +# ibm-1350 seems to be almost a superset of ibm-33722 +# ibm-954 contains more PUA characters than the others. +ibm-954_P101-2007 { UTR22* } + ibm-954 { IBM* } + x-IBM954 { JAVA* } + x-IBM954C { JAVA } + # eucJP # This is closest to Solaris EUC-JP. +euc-jp-2007 { UTR22* } + EUC-JP { MIME* IANA JAVA* WINDOWS* } + Extended_UNIX_Code_Packed_Format_for_Japanese { IANA* JAVA WINDOWS } + csEUCPkdFmtJapanese { IANA JAVA WINDOWS } + X-EUC-JP { MIME JAVA WINDOWS } # Japan EUC. x-euc-jp is a MIME name + eucjis {JAVA} + ujis # Linux sometimes uses this name. This is an unfortunate generic and rarely used name. Its use is discouraged. 
+ +aix-IBM_udcJP-4.3.6 { UTR22* } + x-IBM-udcJP { JAVA* } + +java-euc_jp_linux-1.6_P { UTR22* } + euc-jp-linux + x-EUC_JP_LINUX { JAVA* } + +java-sjis_0213-1.6_P { UTR22* } + x-SJIS_0213 { JAVA* } + +# Here are various interpretations and extensions of Big5 +ibm-1373_P100-2002 { UTR22* } # IBM's interpretation of Windows' Taiwan Big-5 without HKSCS extensions + ibm-1373 { IBM* } + windows-950 # Alternate mapping. Leave untagged. This is the IBM interpretation of a Windows codepage. +windows-950-2000 { UTR22* } + Big5 { IANA* MIME* JAVA* WINDOWS } + csBig5 { IANA WINDOWS } + windows-950 { WINDOWS* } + x-windows-950 { JAVA } + x-big5 + ms950 +ibm-950_P110-1999 { UTR22* } # Taiwan Big-5 (w/o euro update) + ibm-950 { IBM* JAVA } + cp950 { JAVA* } + 950 { JAVA } + x-IBM950 { JAVA } +ibm-1375_P100-2008 { UTR22* } # Big5-HKSCS-2004 with Unicode 3.1 mappings. This uses supplementary characters. + ibm-1375 { IBM* } + Big5-HKSCS { IANA* JAVA* } + big5hk { JAVA } + HKSCS-BIG5 # From http://www.openi18n.org/localenameguide/ +ibm-5471_P100-2006 { UTR22* } # Big5-HKSCS-2001 with Unicode 3.0 mappings. This uses many PUA characters. + ibm-5471 { IBM* } + Big5-HKSCS + MS950_HKSCS { JAVA* } + hkbig5 # from HP-UX 11i, which can't handle supplementary characters. + big5-hkscs:unicode3.0 + x-MS950-HKSCS { JAVA } + # windows-950 # Windows-950 can be w/ or w/o HKSCS extensions. By default it's not. + # windows-950_hkscs +solaris-zh_TW_big5-2.7 { UTR22* } + Big5_Solaris { JAVA* } + x-Big5-Solaris { JAVA } +# GBK +ibm-1386_P100-2001 { UTR22* } + ibm-1386 { IBM* } + cp1386 + windows-936 # Alternate mapping. Leave untagged. This is the IBM interpretation of a Windows codepage. + ibm-1386_VSUB_VPUA +windows-936-2000 { UTR22* } + GBK { IANA* WINDOWS JAVA* } + CP936 { IANA JAVA } + MS936 { IANA } # In JDK 1.5, this goes to x-mswin-936. This is an IANA name split. + windows-936 { IANA WINDOWS* JAVA } + +# Java has two different tables for ibm-1383 and gb2312. We pick closest set for tagging. 
+ibm-1383_P110-1999 { UTR22* } # China EUC.
+ ibm-1383 { IBM* JAVA }
+ GB2312 { IANA* MIME* }
+ csGB2312 { IANA }
+ cp1383 { JAVA* }
+ 1383 { JAVA }
+ EUC-CN # According to other platforms, windows-20936 looks more like euc-cn. x-euc-cn is also a MIME name
+ ibm-eucCN
+ hp15CN # From HP-UX?
+ ibm-1383_VPUA
+ # gb # This is not an IANA name. gb in IANA means Great Britain.
+
+ibm-5478_P100-1995 { UTR22* } ibm-5478 { IBM* } # This gb_2312_80 DBCS mapping is needed by iso-2022.
+ GB_2312-80 { IANA* } # Windows maps this alias incorrectly
+ chinese { IANA }
+ iso-ir-58 { IANA }
+ csISO58GB231280 { IANA }
+ gb2312-1980
+ GB2312.1980-0 # From X11R6
+
+euc-tw-2014 { UTR22* } # Updated EUC-TW converter based on ibm-964
+ EUC-TW
+
+ibm-964_P110-1999 { UTR22* } # Taiwan EUC. x-euc-tw is a MIME name
+ ibm-964 { IBM* JAVA }
+ ibm-eucTW
+ cns11643
+ cp964 { JAVA* }
+ 964 { JAVA }
+ ibm-964_VPUA
+ x-IBM964 { JAVA }
+
+# ISO-2022 needs one, and other people may need others.
+ibm-949_P110-1999 { UTR22* }
+ ibm-949 { IBM* JAVA }
+ cp949 { JAVA* }
+ 949 { JAVA }
+ ibm-949_VASCII_VSUB_VPUA
+ x-IBM949 { JAVA }
+ibm-949_P11A-1999 { UTR22* }
+ ibm-949 # Leave untagged because this isn't the default
+ cp949c { JAVA* }
+ ibm-949_VSUB_VPUA
+ x-IBM949C { JAVA }
+ IBM-949C { JAVA }
+
+# Korean EUC.
+#
+#
+# EUC-KR = KS X 1003/ISO 646-KR or ISO 646-IRV/US-ASCII in GL and KS X 1001:1998 (formerly KS C 5601-1987) in GR.
+#
+# Although widely spread on MS Windows, using
+# KS C 5601 or related names to denote EUC-KR or
+# windows-949 is very much misleading. KS C 5601-1987
+# is NOT suitable as a designation for MIME charset
+# and MBCS. It's just the name of a 94 x 94 Korean
+# coded character set standard which can be invoked
+# on either GL (with MSB reset) or GR (with MSB set).
+# Note that JOHAB (windows-1361) specified in
+# KS X 1001:1998 annex 3 (KS C 5601-1992 annex 3)
+# is a _separate_ MBCS with a _completely different_
+# mapping.
+#
+#
+# The following aliases try to mirror the poor state of alias recognition
+# on these platforms.
+#
+# ibm-970 is almost a subset of ibm-1363.
+# Java, Solaris and AIX use euc-kr to also mean ksc5601.
+# Java has both ibm-970 and EUC-KR as separate converters.
+ibm-970_P110_P110-2006_U2 { UTR22* }
+ ibm-970 { IBM* JAVA }
+ EUC-KR { IANA* MIME* WINDOWS JAVA }
+ KS_C_5601-1987 { JAVA }
+ windows-51949 { WINDOWS* }
+ csEUCKR { IANA WINDOWS } # x-euc-kr is also a MIME name
+ ibm-eucKR { JAVA }
+ KSC_5601 { JAVA } # Needed by iso-2022
+ 5601 { JAVA }
+ cp970 { JAVA* }
+ 970 { JAVA }
+ ibm-970_VPUA
+ x-IBM970 { JAVA }
+
+# ibm-971 is almost the set of DBCS mappings of ibm-970
+ibm-971_P100-1995 ibm-971 { IBM* } ibm-971_VPUA x-IBM971 { JAVA* }
+
+# Java, Solaris and AIX use euc-kr to also mean ksc5601, and _sometimes_ for Windows too.
+# ibm-1363 is almost a superset of ibm-970.
+ibm-1363_P11B-1998 { UTR22* }
+ ibm-1363 # Leave untagged because this isn't the default
+ KS_C_5601-1987 { IANA* }
+ KS_C_5601-1989 { IANA }
+ KSC_5601 { IANA }
+ csKSC56011987 { IANA }
+ korean { IANA }
+ iso-ir-149 { IANA }
+ cp1363 { MIME* }
+ 5601
+ ksc
+ windows-949 # Alternate mapping. Leave untagged. This is the IBM interpretation of a Windows codepage.
+ ibm-1363_VSUB_VPUA + x-IBM1363C { JAVA* } + # ks_x_1001:1992 + # ksc5601-1992 + +ibm-1363_P110-1997 { UTR22* } # Korean KSC MBCS with \ <-> Won mapping + ibm-1363 { IBM* } + ibm-1363_VASCII_VSUB_VPUA + x-IBM1363 { JAVA* } + +windows-949-2000 { UTR22* } + windows-949 { JAVA* WINDOWS* } + KS_C_5601-1987 { WINDOWS } + KS_C_5601-1989 { WINDOWS } + KSC_5601 { MIME* WINDOWS } # Needed by iso-2022 + csKSC56011987 { WINDOWS } + korean { WINDOWS } + iso-ir-149 { WINDOWS } + ms949 { JAVA } + x-KSC5601 { JAVA } + +windows-1361-2000 { UTR22* } + ksc5601_1992 + ms1361 + johab + x-Johab { JAVA* } + +windows-874-2000 { UTR22* } # Thai (w/ euro update) + TIS-620 { WINDOWS } + windows-874 { JAVA* WINDOWS* } + MS874 { JAVA } + x-windows-874 { JAVA } + # iso-8859-11 { WINDOWS } # iso-8859-11 is similar to TIS-620. ibm-13162 is a closer match. + +ibm-874_P100-1995 { UTR22* } # Thai PC (w/o euro update). + ibm-874 { IBM* JAVA } + ibm-9066 { IBM } # Yes ibm-874 == ibm-9066. ibm-1161 has the euro update. + cp874 { JAVA* } + TIS-620 { IANA* JAVA } # This is actually separate from ibm-874, which is similar to this table + tis620.2533 { JAVA } # This is actually separate from ibm-874, which is similar to this table + eucTH # eucTH is an unusual alias from Solaris. eucTH has fewer mappings than TIS620 + x-IBM874 { JAVA } + +ibm-1162_P100-1999 { UTR22* } # Thai (w/ euro update) + ibm-1162 { IBM* } + +windows-864-2000 { UTR22* } + ibm-864s + cp864s + x-IBM864S { JAVA* } + +# Platform codepages +# If Java supports the IBM prefix, it should also support the ibm- prefix too. 
+ibm-437_P100-1995 { UTR22* } ibm-437 { IBM* } IBM437 { IANA* WINDOWS JAVA } cp437 { IANA WINDOWS JAVA* } 437 { IANA WINDOWS JAVA } csPC8CodePage437 { IANA JAVA } windows-437 { WINDOWS* } # PC US +ibm-720_P100-1997 { UTR22* } ibm-720 { IBM* } windows-720 { WINDOWS* } DOS-720 { WINDOWS } x-IBM720 { JAVA* } # PC Arabic +ibm-737_P100-1997 { UTR22* } ibm-737 { IBM* } IBM737 { WINDOWS JAVA } cp737 { JAVA* } windows-737 { WINDOWS* } 737 { JAVA } x-IBM737 { JAVA } # PC Greek +ibm-775_P100-1996 { UTR22* } ibm-775 { IBM* } IBM775 { IANA* WINDOWS JAVA } cp775 { IANA WINDOWS JAVA* } csPC775Baltic { IANA } windows-775 { WINDOWS* } 775 { JAVA } # PC Baltic +ibm-850_P100-1995 { UTR22* } ibm-850 { IBM* } IBM850 { IANA* MIME* WINDOWS JAVA } cp850 { IANA MIME WINDOWS JAVA* } 850 { IANA JAVA } csPC850Multilingual { IANA JAVA } windows-850 { WINDOWS* } # PC latin1 +ibm-851_P100-1995 { UTR22* } ibm-851 { IBM* } IBM851 { IANA* } cp851 { IANA MIME* } 851 { IANA } csPC851 { IANA } # PC DOS Greek (w/o euro) +ibm-852_P100-1995 { UTR22* } ibm-852 { IBM* } IBM852 { IANA* WINDOWS JAVA } cp852 { IANA WINDOWS JAVA* } 852 { IANA WINDOWS JAVA } csPCp852 { IANA JAVA } windows-852 { WINDOWS* } # PC latin2 (w/o euro update) +ibm-855_P100-1995 { UTR22* } ibm-855 { IBM* } IBM855 { IANA* JAVA } cp855 { IANA JAVA* } 855 { IANA } csIBM855 { IANA } csPCp855 { JAVA } windows-855 { WINDOWS* } # PC cyrillic (w/o euro update) +ibm-856_P100-1995 { UTR22* } ibm-856 { IBM* } IBM856 { JAVA } cp856 { JAVA* } 856 { JAVA } x-IBM856 { JAVA } # PC Hebrew implicit order +ibm-857_P100-1995 { UTR22* } ibm-857 { IBM* } IBM857 { IANA* MIME* WINDOWS JAVA } cp857 { IANA MIME JAVA* } 857 { IANA JAVA } csIBM857 { IANA JAVA } windows-857 { WINDOWS* } # PC Latin 5 (w/o euro update) +ibm-858_P100-1997 { UTR22* } ibm-858 { IBM* } IBM00858 { IANA* MIME* JAVA } CCSID00858 { IANA JAVA } CP00858 { IANA JAVA } PC-Multilingual-850+euro { IANA } cp858 { MIME JAVA* } windows-858 { WINDOWS* } # PC latin1 with Euro +ibm-860_P100-1995 { 
UTR22* } ibm-860 { IBM* } IBM860 { IANA* MIME* JAVA } cp860 { IANA MIME JAVA* } 860 { IANA JAVA } csIBM860 { IANA JAVA } # PC Portugal +ibm-861_P100-1995 { UTR22* } ibm-861 { IBM* } IBM861 { IANA* MIME* WINDOWS JAVA } cp861 { IANA MIME JAVA* } 861 { IANA JAVA } cp-is { IANA JAVA } csIBM861 { IANA JAVA } windows-861 { WINDOWS* } # PC Iceland +ibm-862_P100-1995 { UTR22* } ibm-862 { IBM* } IBM862 { IANA* MIME* JAVA } cp862 { IANA MIME JAVA* } 862 { IANA JAVA } csPC862LatinHebrew { IANA JAVA } DOS-862 { WINDOWS } windows-862 { WINDOWS* } # PC Hebrew visual order (w/o euro update) +ibm-863_P100-1995 { UTR22* } ibm-863 { IBM* } IBM863 { IANA* MIME* JAVA } cp863 { IANA MIME JAVA* } 863 { IANA JAVA } csIBM863 { IANA JAVA } # PC Canadian French +ibm-864_X110-1999 { UTR22* } ibm-864 { IBM* } IBM864 { IANA* MIME* JAVA } cp864 { IANA MIME JAVA* } csIBM864 { IANA JAVA } # PC Arabic (w/o euro update) +ibm-865_P100-1995 { UTR22* } ibm-865 { IBM* } IBM865 { IANA* MIME* JAVA } cp865 { IANA MIME JAVA* } 865 { IANA JAVA } csIBM865 { IANA JAVA } # PC Nordic +ibm-866_P100-1995 { UTR22* } ibm-866 { IBM* } IBM866 { IANA* MIME* JAVA } cp866 { IANA MIME WINDOWS JAVA* } 866 { IANA JAVA } csIBM866 { IANA JAVA } windows-866 { WINDOWS* } # PC Russian (w/o euro update) +ibm-867_P100-1998 { UTR22* } ibm-867 { IBM* } x-IBM867 { JAVA* } # PC Hebrew (w/ euro update) Updated version of ibm-862 +ibm-868_P100-1995 { UTR22* } ibm-868 { IBM* } IBM868 { IANA* MIME* JAVA } CP868 { IANA MIME JAVA* } 868 { JAVA } csIBM868 { IANA } cp-ar { IANA } # PC Urdu +ibm-869_P100-1995 { UTR22* } ibm-869 { IBM* } IBM869 { IANA* MIME* WINDOWS JAVA } cp869 { IANA MIME JAVA* } 869 { IANA JAVA } cp-gr { IANA JAVA } csIBM869 { IANA JAVA } windows-869 { WINDOWS* } # PC Greek (w/o euro update) +ibm-878_P100-1996 { UTR22* } ibm-878 { IBM* } KOI8-R { IANA* MIME* WINDOWS JAVA* } koi8 { WINDOWS JAVA } csKOI8R { IANA WINDOWS JAVA } windows-20866 { WINDOWS* } cp878 # Russian internet +ibm-901_P100-1999 { UTR22* } ibm-901 { IBM* } # 
PC Baltic (w/ euro update), update of ibm-921 +ibm-902_P100-1999 { UTR22* } ibm-902 { IBM* } # PC Estonian (w/ euro update), update of ibm-922 +ibm-922_P100-1999 { UTR22* } ibm-922 { IBM* } IBM922 { JAVA } cp922 { JAVA* } 922 { JAVA } x-IBM922 { JAVA } # PC Estonian (w/o euro update) +ibm-1168_P100-2002 { UTR22* } ibm-1168 { IBM* } KOI8-U { IANA* WINDOWS } windows-21866 { WINDOWS* } # Ukrainian KOI8. koi8-ru != KOI8-U and Microsoft is wrong for aliasing them as the same. +ibm-4909_P100-1999 { UTR22* } ibm-4909 { IBM* } # ISO Greek (w/ euro update), update of ibm-813 + +# The cp aliases in this section aren't really windows aliases, but it was used by ICU for Windows. +# cp is usually used to denote IBM in Java, and that is why we don't do that anymore. +# The windows-* aliases mean windows codepages. +ibm-5346_P100-1998 { UTR22* } ibm-5346 { IBM* } windows-1250 { IANA* JAVA* WINDOWS* } cp1250 { WINDOWS JAVA } # Windows Latin2 (w/ euro update) +ibm-5347_P100-1998 { UTR22* } ibm-5347 { IBM* } windows-1251 { IANA* JAVA* WINDOWS* } cp1251 { WINDOWS JAVA } ANSI1251 # Windows Cyrillic (w/ euro update). 
ANSI1251 is from Solaris +ibm-5348_P100-1997 { UTR22* } ibm-5348 { IBM* } windows-1252 { IANA* JAVA* WINDOWS* } cp1252 { JAVA } # Windows Latin1 (w/ euro update) +ibm-5349_P100-1998 { UTR22* } ibm-5349 { IBM* } windows-1253 { IANA* JAVA* WINDOWS* } cp1253 { JAVA } # Windows Greek (w/ euro update) +ibm-5350_P100-1998 { UTR22* } ibm-5350 { IBM* } windows-1254 { IANA* JAVA* WINDOWS* } cp1254 { JAVA } # Windows Turkish (w/ euro update) +ibm-9447_P100-2002 { UTR22* } ibm-9447 { IBM* } windows-1255 { IANA* JAVA* WINDOWS* } cp1255 { JAVA } # Windows Hebrew (w/ euro update) +ibm-9448_X100-2005 { UTR22* } ibm-9448 { IBM* } windows-1256 { IANA* JAVA* WINDOWS* } cp1256 { WINDOWS JAVA } x-windows-1256S { JAVA } # Windows Arabic (w/ euro update) +ibm-9449_P100-2002 { UTR22* } ibm-9449 { IBM* } windows-1257 { IANA* JAVA* WINDOWS* } cp1257 { JAVA } # Windows Baltic (w/ euro update) +ibm-5354_P100-1998 { UTR22* } ibm-5354 { IBM* } windows-1258 { IANA* JAVA* WINDOWS* } cp1258 { JAVA } # Windows Vietnamese (w/ euro update) + +# These tables are out of date, and most don't have the Euro +# Leave the windows- variants untagged. They are alternate tables of the newer ones above. 
+ibm-1250_P100-1995 { UTR22* } ibm-1250 { IBM* } windows-1250 # Old Windows Latin2 (w/o euro update) +ibm-1251_P100-1995 { UTR22* } ibm-1251 { IBM* } windows-1251 # Old Windows Cyrillic (w/o euro update) +ibm-1252_P100-2000 { UTR22* } ibm-1252 { IBM* } windows-1252 # Old Windows Latin 1 without Euro +ibm-1253_P100-1995 { UTR22* } ibm-1253 { IBM* } windows-1253 # Old Windows Greek (w/o euro update) +ibm-1254_P100-1995 { UTR22* } ibm-1254 { IBM* } windows-1254 # Old Windows Turkish (w/o euro update) +ibm-1255_P100-1995 { UTR22* } ibm-1255 { IBM* } # Very old Windows Hebrew (w/o euro update) +ibm-5351_P100-1998 { UTR22* } ibm-5351 { IBM* } windows-1255 # Old Windows Hebrew (w/ euro update) +ibm-1256_P110-1997 { UTR22* } ibm-1256 { IBM* } # Old Windows Arabic (w/o euro update) +ibm-5352_P100-1998 { UTR22* } ibm-5352 { IBM* } windows-1256 # Somewhat old Windows Arabic (w/ euro update) +ibm-1257_P100-1995 { UTR22* } ibm-1257 { IBM* } # Old Windows Baltic (w/o euro update) +ibm-5353_P100-1998 { UTR22* } ibm-5353 { IBM* } windows-1257 # Somewhat old Windows Baltic (w/ euro update) +ibm-1258_P100-1997 { UTR22* } ibm-1258 { IBM* } windows-1258 # Old Windows Vietnamese (w/o euro update) + +macos-0_2-10.2 { UTR22* } macintosh { IANA* MIME* WINDOWS } mac { IANA } csMacintosh { IANA } windows-10000 { WINDOWS* } macroman { JAVA } x-macroman { JAVA* } # Apple latin 1 +macos-6_2-10.4 { UTR22* } x-mac-greek { MIME* WINDOWS } windows-10006 { WINDOWS* } macgr x-MacGreek { JAVA* } # Apple Greek +macos-7_3-10.2 { UTR22* } x-mac-cyrillic { MIME* WINDOWS } windows-10007 { WINDOWS* } mac-cyrillic maccy x-MacCyrillic { JAVA } x-MacUkraine { JAVA* } # Apple Cyrillic +macos-21-10.5 { UTR22* } x-mac-thai { MIME* } x-MacThai { JAVA* } MacThai { JAVA } +macos-29-10.2 { UTR22* } x-mac-centraleurroman { MIME* } windows-10029 { WINDOWS* } x-mac-ce { WINDOWS } macce maccentraleurope x-MacCentralEurope { JAVA* } # Apple Central Europe +macos-33-10.5 { UTR22* } x-mac-symbol { MIME* } x-MacSymbol { 
JAVA* } MacSymbol { JAVA } +macos-34-10.2 { UTR22* } x-mac-dingbat { MIME* } x-MacDingbat { JAVA* } MacDingbat { JAVA } +macos-35-10.2 { UTR22* } x-mac-turkish { MIME* WINDOWS } windows-10081 { WINDOWS* } mactr x-MacTurkish { JAVA* } # Apple Turkish +macos-36_2-10.2 { UTR22* } x-mac-croatian { MIME* } x-MacCroatian { JAVA* } MacCroatian { JAVA } +macos-37_5-10.2 { UTR22* } x-mac-iceland { MIME* } x-MacIceland { JAVA* } MacIceland { JAVA } +macos-38_2-10.2 { UTR22* } x-mac-romania { MIME* } x-MacRomania { JAVA* } MacRomania { JAVA } +macos-518-10.2 { UTR22* } x-mac-arabic { MIME* } x-MacArabic { JAVA* } MacArabic { JAVA } +macos-1285-10.2 { UTR22* } x-mac-hebrew { MIME* } x-MacHebrew { JAVA* } MacHebrew { JAVA } + +ibm-1051_P100-1995 { UTR22* } ibm-1051 { IBM* } hp-roman8 { IANA* } roman8 { IANA } r8 { IANA } csHPRoman8 { IANA } x-roman8 { JAVA* } # HP Latin1 +ibm-1276_P100-1995 { UTR22* } ibm-1276 { IBM* } Adobe-Standard-Encoding { IANA* } csAdobeStandardEncoding { IANA } # Different from ISO-Unicode-IBM-1276 (GCSGID: 1276) + +ibm-1006_P100-1995 { UTR22* } ibm-1006 { IBM* } IBM1006 { JAVA } cp1006 { JAVA* } 1006 { JAVA } x-IBM1006 { JAVA } # Urdu +ibm-1098_P100-1995 { UTR22* } ibm-1098 { IBM* } IBM1098 { JAVA } cp1098 { JAVA* } 1098 { JAVA } x-IBM1098 { JAVA } # PC Farsi +ibm-1124_P100-1996 { UTR22* } ibm-1124 { IBM* JAVA } cp1124 { JAVA* } 1124 { JAVA } x-IBM1124 { JAVA } # ISO Cyrillic Ukraine +ibm-1125_P100-1997 { UTR22* } ibm-1125 { IBM* } cp1125 # Cyrillic Ukraine PC +ibm-1129_P100-1997 { UTR22* } ibm-1129 { IBM* } # ISO Vietnamese +ibm-1131_P100-1997 { UTR22* } ibm-1131 { IBM* } cp1131 # Cyrillic Belarus PC +ibm-1133_P100-1997 { UTR22* } ibm-1133 { IBM* } # ISO Lao + +# GSM 03.38 +gsm-03.38-2009 { UTR22* } GSM0338 # GSM0338 alias is from Perl + +# Partially algorithmic converters + +# [U_ENABLE_GENERIC_ISO_2022] +# The _generic_ ISO-2022 converter is disabled starting 2003-dec-03 (ICU 2.8). 
+# For details see the icu mailing list from 2003-dec-01 and the ucnv2022.c file. +# Language-specific variants of ISO-2022 continue to be available as listed below. +# ISO_2022 ISO-2022 + +ISO_2022,locale=ja,version=0 ISO-2022-JP { IANA* MIME* JAVA* } csISO2022JP { IANA JAVA } x-windows-iso2022jp { JAVA } x-windows-50220 { JAVA } +ISO_2022,locale=ja,version=1 ISO-2022-JP-1 { MIME* } JIS_Encoding { IANA* } csJISEncoding { IANA } ibm-5054 { IBM* } JIS x-windows-50221 { JAVA* } +ISO_2022,locale=ja,version=2 ISO-2022-JP-2 { IANA* MIME* } csISO2022JP2 { IANA } +ISO_2022,locale=ja,version=3 JIS7 +ISO_2022,locale=ja,version=4 JIS8 +ISO_2022,locale=ko,version=0 ISO-2022-KR { IANA* MIME* JAVA* } csISO2022KR { IANA JAVA } # This uses ibm-949 +ISO_2022,locale=ko,version=1 ibm-25546 { IBM* } +ISO_2022,locale=zh,version=0 ISO-2022-CN { IANA* JAVA* } csISO2022CN { JAVA } x-ISO-2022-CN-GB { JAVA } +ISO_2022,locale=zh,version=1 ISO-2022-CN-EXT { IANA* } +ISO_2022,locale=zh,version=2 ISO-2022-CN-CNS x-ISO-2022-CN-CNS { JAVA* } +HZ HZ-GB-2312 { IANA* } +x11-compound-text COMPOUND_TEXT x-compound-text { JAVA* } + +ISCII,version=0 x-ISCII91 { JAVA* } x-iscii-de { WINDOWS } windows-57002 { WINDOWS* } iscii-dev ibm-4902 { IBM* } # ibm-806 contains non-standard box drawing symbols. +ISCII,version=1 x-iscii-be { WINDOWS } windows-57003 { WINDOWS* } iscii-bng windows-57006 { WINDOWS } x-iscii-as { WINDOWS } # be is different from as on Windows. 
+ISCII,version=2 x-iscii-pa { WINDOWS } windows-57011 { WINDOWS* } iscii-gur
+ISCII,version=3 x-iscii-gu { WINDOWS } windows-57010 { WINDOWS* } iscii-guj
+ISCII,version=4 x-iscii-or { WINDOWS } windows-57007 { WINDOWS* } iscii-ori
+ISCII,version=5 x-iscii-ta { WINDOWS } windows-57004 { WINDOWS* } iscii-tml
+ISCII,version=6 x-iscii-te { WINDOWS } windows-57005 { WINDOWS* } iscii-tlg
+ISCII,version=7 x-iscii-ka { WINDOWS } windows-57008 { WINDOWS* } iscii-knd
+ISCII,version=8 x-iscii-ma { WINDOWS } windows-57009 { WINDOWS* } iscii-mlm
+
+# Lotus specific
+LMBCS-1 lmbcs ibm-65025 { IBM* }
+
+# These Lotus specific converters still work, but they aren't advertised in this alias table.
+# These are almost never used outside of Lotus software,
+# and they take a lot of time when creating the available converter list.
+# Also Lotus doesn't really use them anyway. It was a mistake to create these LMBCS variant converters in ICU.
+#LMBCS-2
+#LMBCS-3
+#LMBCS-4
+#LMBCS-5
+#LMBCS-6
+#LMBCS-8
+#LMBCS-11
+#LMBCS-16
+#LMBCS-17
+#LMBCS-18
+#LMBCS-19
+
+# EBCDIC codepages according to the CDRA
+
+# without Euro
+ibm-37_P100-1995 { UTR22* } # EBCDIC US
+ ibm-37 { IBM* }
+ IBM037 { IANA* JAVA }
+ ibm-037 # { JAVA }
+ ebcdic-cp-us { IANA JAVA }
+ ebcdic-cp-ca { IANA JAVA }
+ ebcdic-cp-wt { IANA JAVA }
+ ebcdic-cp-nl { IANA JAVA }
+ csIBM037 { IANA JAVA }
+ cp037 { JAVA* }
+ 037 { JAVA }
+ cpibm37 { JAVA }
+ cp37
+
+ibm-273_P100-1995 { UTR22* } ibm-273 { IBM* } IBM273 { IANA* JAVA } CP273 { IANA JAVA* } csIBM273 { IANA } ebcdic-de 273 { JAVA } # EBCDIC Germany, Austria
+ibm-277_P100-1995 { UTR22* } ibm-277 { IBM* } IBM277 { IANA* JAVA } cp277 { JAVA* } EBCDIC-CP-DK { IANA } EBCDIC-CP-NO { IANA } csIBM277 { IANA } ebcdic-dk 277 { JAVA } # EBCDIC Denmark
+ibm-278_P100-1995 { UTR22* } ibm-278 { IBM* } IBM278 { IANA* JAVA } cp278 { JAVA* } ebcdic-cp-fi { IANA } ebcdic-cp-se { IANA } csIBM278 { IANA } ebcdic-sv { JAVA } 278 { JAVA } # EBCDIC Sweden
+ibm-280_P100-1995 { UTR22* } ibm-280 { 
IBM* } IBM280 { IANA* JAVA } CP280 { IANA JAVA* } ebcdic-cp-it { IANA } csIBM280 { IANA } 280 { JAVA } # EBCDIC Italy +ibm-284_P100-1995 { UTR22* } ibm-284 { IBM* } IBM284 { IANA* JAVA } CP284 { IANA JAVA* } ebcdic-cp-es { IANA } csIBM284 { IANA } cpibm284 { JAVA } 284 { JAVA } # EBCDIC Spain +ibm-285_P100-1995 { UTR22* } ibm-285 { IBM* } IBM285 { IANA* JAVA } CP285 { IANA JAVA* } ebcdic-cp-gb { IANA } csIBM285 { IANA } cpibm285 { JAVA } ebcdic-gb { JAVA } 285 { JAVA } # EBCDIC UK Ireland +ibm-290_P100-1995 { UTR22* } ibm-290 { IBM* } IBM290 { IANA* } cp290 { IANA } EBCDIC-JP-kana { IANA } csIBM290 { IANA } # host SBCS (Katakana) +ibm-297_P100-1995 { UTR22* } ibm-297 { IBM* } IBM297 { IANA* JAVA } cp297 { IANA JAVA* } ebcdic-cp-fr { IANA } csIBM297 { IANA } cpibm297 { JAVA } 297 { JAVA } # EBCDIC France +ibm-420_X120-1999 { UTR22* } ibm-420 { IBM* } IBM420 { IANA* JAVA } cp420 { IANA JAVA* } ebcdic-cp-ar1 { IANA } csIBM420 { IANA } 420 { JAVA } # EBCDIC Arabic (all presentation shapes) +ibm-424_P100-1995 { UTR22* } ibm-424 { IBM* } IBM424 { IANA* JAVA } cp424 { IANA JAVA* } ebcdic-cp-he { IANA } csIBM424 { IANA } 424 { JAVA } # EBCDIC Hebrew +ibm-500_P100-1995 { UTR22* } ibm-500 { IBM* } IBM500 { IANA* JAVA } CP500 { IANA JAVA* } ebcdic-cp-be { IANA } csIBM500 { IANA } ebcdic-cp-ch { IANA } 500 # EBCDIC International Latin1 +ibm-803_P100-1999 { UTR22* } ibm-803 { IBM* } cp803 # Old EBCDIC Hebrew +ibm-838_P100-1995 { UTR22* } ibm-838 { IBM* } IBM838 { JAVA } IBM-Thai { IANA* JAVA } csIBMThai { IANA } cp838 { JAVA* } 838 { JAVA } ibm-9030 { IBM } # EBCDIC Thai. Yes ibm-9030 is an alias. 
+ibm-870_P100-1995 { UTR22* } ibm-870 { IBM* } IBM870 { IANA* JAVA } CP870 { IANA JAVA* } ebcdic-cp-roece { IANA } ebcdic-cp-yu { IANA } csIBM870 { IANA } # EBCDIC Latin 2 +ibm-871_P100-1995 { UTR22* } ibm-871 { IBM* } IBM871 { IANA* JAVA } ebcdic-cp-is { IANA JAVA } csIBM871 { IANA JAVA } CP871 { IANA JAVA* } ebcdic-is { JAVA } 871 { JAVA } # EBCDIC Iceland +ibm-875_P100-1995 { UTR22* } ibm-875 { IBM* } IBM875 { JAVA } cp875 { JAVA* } 875 { JAVA } x-IBM875 { JAVA } # EBCDIC Greek +ibm-918_P100-1995 { UTR22* } ibm-918 { IBM* } IBM918 { IANA* JAVA } CP918 { IANA JAVA* } ebcdic-cp-ar2 { IANA } csIBM918 { IANA } # EBCDIC Urdu +ibm-930_P120-1999 { UTR22* } # EBCDIC_STATEFUL Katakana-Kanji Host Mixed. + ibm-930 { IBM* } + ibm-5026 { IBM } # Yes this is correct + IBM930 { JAVA } + cp930 { JAVA* } + 930 { JAVA } + x-IBM930 { JAVA } + x-IBM930A { JAVA } +ibm-933_P110-1995 { UTR22* } ibm-933 { IBM* JAVA } cp933 { JAVA* } 933 { JAVA } x-IBM933 { JAVA } # Korea EBCDIC MIXED +ibm-935_P110-1999 { UTR22* } ibm-935 { IBM* JAVA } cp935 { JAVA* } 935 { JAVA } x-IBM935 { JAVA } # China EBCDIC MIXED. Need to use Unicode, ibm-1388 or gb18030 instead because it is required by the government of China. +ibm-937_P110-1999 { UTR22* } ibm-937 { IBM* JAVA } cp937 { JAVA* } 937 { JAVA } x-IBM937 { JAVA } # Taiwan EBCDIC MIXED +ibm-939_P120-1999 { UTR22* } # EBCDIC_STATEFUL Latin-Kanji Host Mixed. 
+ ibm-939 { IBM* } + ibm-931 { IBM } # Yes this is correct + ibm-5035 { IBM } # Yes this is also correct + IBM939 { JAVA } + cp939 { JAVA* } + 939 { JAVA } + x-IBM939 { JAVA } + x-IBM939A { JAVA } +ibm-1025_P100-1995 { UTR22* } ibm-1025 { IBM* JAVA } cp1025 { JAVA* } 1025 { JAVA } x-IBM1025 { JAVA } # EBCDIC Cyrillic +ibm-1026_P100-1995 { UTR22* } ibm-1026 { IBM* } IBM1026 { IANA* JAVA } CP1026 { IANA JAVA* } csIBM1026 { IANA } 1026 { JAVA } # EBCDIC Turkey +ibm-1047_P100-1995 { UTR22* } ibm-1047 { IBM* } IBM1047 { IANA* JAVA } cp1047 { JAVA* } 1047 { JAVA } # EBCDIC Open systems Latin1 +ibm-1097_P100-1995 { UTR22* } ibm-1097 { IBM* JAVA } cp1097 { JAVA* } 1097 { JAVA } x-IBM1097 { JAVA } # EBCDIC Farsi +ibm-1112_P100-1995 { UTR22* } ibm-1112 { IBM* JAVA } cp1112 { JAVA* } 1112 { JAVA } x-IBM1112 { JAVA } # EBCDIC Baltic +ibm-1114_P100-2001 { UTR22* } ibm-1114 { IBM* } x-IBM1114 { JAVA* } +ibm-1115_P100-1995 { UTR22* } ibm-1115 { IBM* } x-IBM1115 { JAVA* } +ibm-1122_P100-1999 { UTR22* } ibm-1122 { IBM* JAVA } cp1122 { JAVA* } 1122 { JAVA } x-IBM1122 { JAVA } # EBCDIC Estonia +ibm-1123_P100-1995 { UTR22* } ibm-1123 { IBM* JAVA } cp1123 { JAVA* } 1123 { JAVA } x-IBM1123 { JAVA } # EBCDIC Cyrillic Ukraine +ibm-1130_P100-1997 { UTR22* } ibm-1130 { IBM* } # EBCDIC Vietnamese +ibm-1132_P100-1998 { UTR22* } ibm-1132 { IBM* } # EBCDIC Lao +ibm-1137_P100-1999 { UTR22* } ibm-1137 { IBM* } # Devanagari EBCDIC (based on Unicode character set) +ibm-4517_P100-2005 { UTR22* } ibm-4517 { IBM* } # EBCDIC Arabic. 
Update of ibm-421 + +# with Euro +ibm-1140_P100-1997 { UTR22* } ibm-1140 { IBM* } IBM01140 { IANA* JAVA } CCSID01140 { IANA JAVA } CP01140 { IANA JAVA } cp1140 { JAVA* } ebcdic-us-37+euro { IANA } # EBCDIC US +ibm-1141_P100-1997 { UTR22* } ibm-1141 { IBM* } IBM01141 { IANA* JAVA } CCSID01141 { IANA JAVA } CP01141 { IANA JAVA } cp1141 { JAVA* } ebcdic-de-273+euro { IANA } # EBCDIC Germany, Austria +ibm-1142_P100-1997 { UTR22* } ibm-1142 { IBM* } IBM01142 { IANA* JAVA } CCSID01142 { IANA JAVA } CP01142 { IANA JAVA } cp1142 { JAVA* } ebcdic-dk-277+euro { IANA } ebcdic-no-277+euro { IANA } # EBCDIC Denmark +ibm-1143_P100-1997 { UTR22* } ibm-1143 { IBM* } IBM01143 { IANA* JAVA } CCSID01143 { IANA JAVA } CP01143 { IANA JAVA } cp1143 { JAVA* } ebcdic-fi-278+euro { IANA } ebcdic-se-278+euro { IANA } # EBCDIC Sweden +ibm-1144_P100-1997 { UTR22* } ibm-1144 { IBM* } IBM01144 { IANA* JAVA } CCSID01144 { IANA JAVA } CP01144 { IANA JAVA } cp1144 { JAVA* } ebcdic-it-280+euro { IANA } # EBCDIC Italy +ibm-1145_P100-1997 { UTR22* } ibm-1145 { IBM* } IBM01145 { IANA* JAVA } CCSID01145 { IANA JAVA } CP01145 { IANA JAVA } cp1145 { JAVA* } ebcdic-es-284+euro { IANA } # EBCDIC Spain +ibm-1146_P100-1997 { UTR22* } ibm-1146 { IBM* } IBM01146 { IANA* JAVA } CCSID01146 { IANA JAVA } CP01146 { IANA JAVA } cp1146 { JAVA* } ebcdic-gb-285+euro { IANA } # EBCDIC UK Ireland +ibm-1147_P100-1997 { UTR22* } ibm-1147 { IBM* } IBM01147 { IANA* JAVA } CCSID01147 { IANA JAVA } CP01147 { IANA JAVA } cp1147 { JAVA* } ebcdic-fr-297+euro { IANA } # EBCDIC France +ibm-1148_P100-1997 { UTR22* } ibm-1148 { IBM* } IBM01148 { IANA* JAVA } CCSID01148 { IANA JAVA } CP01148 { IANA JAVA } cp1148 { JAVA* } ebcdic-international-500+euro { IANA } # EBCDIC International Latin1 +ibm-1149_P100-1997 { UTR22* } ibm-1149 { IBM* } IBM01149 { IANA* JAVA } CCSID01149 { IANA JAVA } CP01149 { IANA JAVA } cp1149 { JAVA* } ebcdic-is-871+euro { IANA } # EBCDIC Iceland +ibm-1153_P100-1999 { UTR22* } ibm-1153 { IBM* } IBM1153 { JAVA } 
x-IBM1153 { JAVA* } # EBCDIC latin 2 +ibm-1154_P100-1999 { UTR22* } ibm-1154 { IBM* } # EBCDIC Cyrillic Multilingual +ibm-1155_P100-1999 { UTR22* } ibm-1155 { IBM* } # EBCDIC Turkey +ibm-1156_P100-1999 { UTR22* } ibm-1156 { IBM* } # EBCDIC Baltic Multilingual +ibm-1157_P100-1999 { UTR22* } ibm-1157 { IBM* } # EBCDIC Estonia +ibm-1158_P100-1999 { UTR22* } ibm-1158 { IBM* } # EBCDIC Cyrillic Ukraine +ibm-1160_P100-1999 { UTR22* } ibm-1160 { IBM* } # EBCDIC Thailand +ibm-1164_P100-1999 { UTR22* } ibm-1164 { IBM* } # EBCDIC Viet Nam +ibm-1364_P110-2007 { UTR22* } ibm-1364 { IBM* } x-IBM1364 { JAVA* } # Korean Host Mixed +ibm-1370_P100-1999 { UTR22* } ibm-1370 { IBM* } x-IBM1370 { JAVA* } +ibm-1371_P100-1999 { UTR22* } ibm-1371 { IBM* } x-IBM1371 { JAVA* } # Taiwan EBCDIC MIXED (Euro update of ibm-937) +ibm-1388_P103-2001 { UTR22* } ibm-1388 { IBM* } ibm-9580 { IBM } x-IBM1388 { JAVA* } # S-Ch DBCS-Host Data GBK EBCDIC_STATEFUL. Yes ibm-9580 is an alias. +ibm-1390_P110-2003 { UTR22* } ibm-1390 { IBM* } x-IBM1390 { JAVA* } # Japan EBCDIC MIXED (JIS X 0213) +ibm-1399_P110-2003 { UTR22* } ibm-1399 { IBM* } x-IBM1399 { JAVA* } # Host MBCS (Latin-Kanji) (JIS X 0213) +ibm-5123_P100-1999 { UTR22* } ibm-5123 { IBM* } # Host Roman Jis. Euro update of ibm-1027. SBCS portion of ibm-1390. +ibm-8482_P100-1999 { UTR22* } ibm-8482 { IBM* } # host SBCS (Katakana). Euro update of ibm-290. SBCS portion of ibm-1399. +# Yes ibm-20780 is the same as ibm-16684 +ibm-16684_P110-2003 { UTR22* } ibm-16684 { IBM* } ibm-20780 { IBM } # DBCS Jis + Roman Jis Host. This is the DBCS portion of ibm-1390 and ibm-1399 (JIS X 0213). +ibm-4899_P100-1998 { UTR22* } ibm-4899 { IBM* } # Old EBCDIC Hebrew. Update of ibm-803 +ibm-4971_P100-1999 { UTR22* } ibm-4971 { IBM* } # EBCDIC Greek. Update of ibm-875 and superseded by ibm-9067 +ibm-9067_X100-2005 { UTR22* } ibm-9067 { IBM* } # EBCDIC Greek. 
Update of ibm-875 and ibm-4971 +ibm-12712_P100-1998 { UTR22* } ibm-12712 { IBM* } ebcdic-he # EBCDIC Hebrew (new sheqel, control characters update). Update of ibm-424 +ibm-16804_X110-1999 { UTR22* } ibm-16804 { IBM* } ebcdic-ar # EBCDIC Arabic. Update of ibm-420 + +java-Cp1399A-1.6_P { UTR22* } x-IBM1399A { JAVA* } +java-Cp420s-1.6_P { UTR22* } x-IBM420S { JAVA* } +java-Cp1390A-1.6_P { UTR22* } x-IBM1390A { JAVA* } + +# EBCDIC codepages for S/390, with LF and NL codes swapped +# Starting with ICU 2.4, the swapping is done by modifying the +# normal tables at runtime instead of at build time. +# Append UCNV_SWAP_LFNL_OPTION_STRING to the "ibm-CCSID" name to select this. +# +# Example: "ibm-1047,swaplfnl" or "ibm-1047" UCNV_SWAP_LFNL_OPTION_STRING +# +# This avoids the duplication of all EBCDIC SBCS and mixed-SBCS/DBCS +# mapping files. + +# Some examples below for declaring old-style, obsolete aliases with the "-s390" +# suffix to map to the new-style, recommended names with the option added. +# These are listed here for backward compatibility. +# Do not use these; instead use the normal converter name with the option +# added as recommended above. + +# Note: It is not possible to define an alias (non-initial name in a line here) +# that itself contains a converter option like this one for swapping LF<->NL. +# Such names would never be found because ucnv_open() will first parse and strip +# options before looking up a name in this table. +# ucnv_open() then parses the lookup result (the canonical name on the left +# in lines here) as well. + +# This also means that it is not necessary to add anything to convrtrs.txt +# for converter names like "ibm-1026,swaplfnl" to work - +# they are already covered by the normal option parsing together with the +# regular, option-less alias elsewhere in this file. 
+ +ibm-37_P100-1995,swaplfnl ibm-37-s390 # ibm037-s390 also matches ibm-37-s390 +ibm-924_P100-1998,swaplfnl ibm-924-s390 IBM924_LF { JAVA* } +ibm-1047_P100-1995,swaplfnl ibm-1047-s390 IBM1047_LF { JAVA* } +ibm-1140_P100-1997,swaplfnl ibm-1140-s390 +ibm-1141_P100-1997,swaplfnl ibm-1141-s390 IBM1141_LF { JAVA* } +ibm-1142_P100-1997,swaplfnl ibm-1142-s390 +ibm-1143_P100-1997,swaplfnl ibm-1143-s390 +ibm-1144_P100-1997,swaplfnl ibm-1144-s390 +ibm-1145_P100-1997,swaplfnl ibm-1145-s390 +ibm-1146_P100-1997,swaplfnl ibm-1146-s390 +ibm-1147_P100-1997,swaplfnl ibm-1147-s390 +ibm-1148_P100-1997,swaplfnl ibm-1148-s390 +ibm-1149_P100-1997,swaplfnl ibm-1149-s390 +ibm-1153_P100-1999,swaplfnl ibm-1153-s390 +ibm-12712_P100-1998,swaplfnl ibm-12712-s390 +ibm-16804_X110-1999,swaplfnl ibm-16804-s390 + +# This is a special version of ibm-1140 that the XML4C (Xerces) parser team +# requested in 2000. +# It maps both EBCDIC LF and NL controls to Unicode LF U+000A. + +ebcdic-xml-us + +# These are not installed by default. They are rarely used. 
+# Many of them can be added through the online ICU Data Library Customization tool + +ibm-1004_P100-1995 { UTR22* } ibm-1004 { IBM* } +ibm-1008_P100-1995 { UTR22* } ibm-1008 { IBM* } # cp1008, 8-bit Arabic (w/o euro update) +ibm-1009_P100-1995 { UTR22* } ibm-1009 { IBM* } +ibm-1010_P100-1995 { UTR22* } ibm-1010 { IBM* } NF_Z_62-010 { IANA* } iso-ir-69 { IANA } ISO646-FR { IANA } fr { IANA } csISO69French { IANA } +ibm-1011_P100-1995 { UTR22* } ibm-1011 { IBM* } DIN_66003 { IANA* } iso-ir-21 { IANA } de { IANA } ISO646-DE { IANA } csISO21German { IANA } +ibm-1012_P100-1995 { UTR22* } ibm-1012 { IBM* } IT { IANA* } iso-ir-15 { IANA } ISO646-IT { IANA } csISO15Italian { IANA } +ibm-1013_P100-1995 { UTR22* } ibm-1013 { IBM* } BS_4730 { IANA* } iso-ir-4 { IANA } ISO646-GB { IANA } gb { IANA } uk { IANA } csISO4UnitedKingdom { IANA } +ibm-1014_P100-1995 { UTR22* } ibm-1014 { IBM* } ES2 { IANA* } iso-ir-85 { IANA } ISO646-ES2 { IANA } csISO85Spanish2 { IANA } +ibm-1015_P100-1995 { UTR22* } ibm-1015 { IBM* } PT2 { IANA* } iso-ir-84 { IANA } ISO646-PT2 { IANA } csISO84Portuguese2 { IANA } +ibm-1016_P100-1995 { UTR22* } ibm-1016 { IBM* } NS_4551-1 { IANA* } iso-ir-60 { IANA } ISO646-NO { IANA } no { IANA } csISO60DanishNorwegian { IANA } csISO60Norwegian1 { IANA } +ibm-1017_P100-1995 { UTR22* } ibm-1017 { IBM* } +ibm-1018_P100-1995 { UTR22* } ibm-1018 { IBM* } SEN_850200_B { IANA* } iso-ir-10 { IANA } FI { IANA } ISO646-FI { IANA } ISO646-SE { IANA } se { IANA } csISO10Swedish { IANA } +ibm-1019_P100-1995 { UTR22* } ibm-1019 { IBM* } +ibm-1020_P100-2003 { UTR22* } ibm-1020 { IBM* } CSA_Z243.4-1985-1 { IANA* } iso-ir-121 { IANA } ISO646-CA { IANA } csa7-1 { IANA } ca { IANA } csISO121Canadian1 { IANA } +ibm-1021_P100-2003 { UTR22* } ibm-1021 { IBM* } +ibm-1023_P100-2003 { UTR22* } ibm-1023 { IBM* } ES { IANA* } iso-ir-17 { IANA } ISO646-ES { IANA } csISO17Spanish { IANA } +ibm-1027_P100-1995 { UTR22* } ibm-1027 { IBM* } x-IBM1027 { JAVA* } +ibm-1041_P100-1995 { UTR22* } 
ibm-1041 { IBM* } x-IBM1041 { JAVA* } +ibm-1043_P100-1995 { UTR22* } ibm-1043 { IBM* } x-IBM1043 { JAVA* } +ibm-1046_X110-1999 { UTR22* } ibm-1046 { IBM* } x-IBM1046 { JAVA* } x-IBM1046S { JAVA } # Arabic +ibm-1088_P100-1995 { UTR22* } ibm-1088 { IBM* } x-IBM1088 { JAVA* } +ibm-1100_P100-2003 { UTR22* } ibm-1100 { IBM* } DEC-MCS { IANA* } dec { IANA } csDECMCS { IANA } +ibm-1101_P100-2003 { UTR22* } ibm-1101 { IBM* } +ibm-1102_P100-2003 { UTR22* } ibm-1102 { IBM* } +ibm-1103_P100-2003 { UTR22* } ibm-1103 { IBM* } +ibm-1104_P100-2003 { UTR22* } ibm-1104 { IBM* } NF_Z_62-010_1973 iso-ir-25 { IANA* } ISO646-FR1 { IANA } csISO25French { IANA } # NF_Z_62-010_(1973) is the real IANA alias, but () aren't invariant characters. +ibm-1105_P100-2003 { UTR22* } ibm-1105 { IBM* } +ibm-1106_P100-2003 { UTR22* } ibm-1106 { IBM* } +ibm-1107_P100-2003 { UTR22* } ibm-1107 { IBM* } DS_2089 { IANA* } ISO646-DK { IANA } dk { IANA } csISO646Danish { IANA } +ibm-1127_P100-2004 { UTR22* } ibm-1127 { IBM* } +ibm-1161_P100-1999 { UTR22* } ibm-1161 { IBM* } # Thai (Euro update of ibm-1129) +ibm-1163_P100-1999 { UTR22* } ibm-1163 { IBM* } # Vietnamese +ibm-1165_P101-2000 { UTR22* } ibm-1165 { IBM* } # Vietnamese (EBCDIC) +ibm-1166_P100-2002 { UTR22* } ibm-1166 { IBM* } # Cyrillic for Kazakhstan +ibm-1167_P100-2002 { UTR22* } ibm-1167 { IBM* } KOI8-RU x-KOI8_RU { JAVA* } +ibm-1174_X100-2007 { UTR22* } ibm-1174 { IBM* } KZ-1048 { IANA* } STRK1048-2002 { IANA } RK1048 { IANA } csKZ1048 { IANA } +ibm-1277_P100-1995 { UTR22* } ibm-1277 { IBM* } # Adobe (Postscript) Latin-1 +ibm-13125_P100-1997 { UTR22* } ibm-13125 { IBM* } # S-Ch (DBCS subset of ibm-4933, ibm-1388) +ibm-13140_P101-2000 { UTR22* } ibm-13140 { IBM* } +ibm-13218_P100-1996 { UTR22* } ibm-13218 { IBM* } # Japanese (EBCDIC update of ibm-930) +ibm-1350_P110-1997 { UTR22* } ibm-1350 { IBM* } x-eucJP-Open { JAVA* } eucJP-Open { JAVA } # Japanese (EUC-JP variant) +ibm-1351_P110-1997 { UTR22* } ibm-1351 { IBM* } x-IBM1351 { JAVA* } # 
Japanese (DBCS subset of ibm-5039) +ibm-1362_P110-1999 { UTR22* } ibm-1362 { IBM* } x-IBM1362 { JAVA* } # Korean (DBCS subset of ibm-1363) +ibm-13676_P102-2001 { UTR22* } ibm-13676 { IBM* } # Simplified Chinese (EBCDIC) +ibm-1380_P100-1995 { UTR22* } ibm-1380 { IBM* } x-IBM1380 { JAVA* } # Simplified Chinese (DBCS subset of ibm-1381) +ibm-1381_P110-1999 { UTR22* } ibm-1381 { IBM* JAVA } cp1381 { JAVA* } 1381 { JAVA } x-IBM1381 { JAVA } # Simplified Chinese PC Data mixed (IBM GB) +ibm-1382_P100-1995 { UTR22* } ibm-1382 { IBM* } x-IBM1382 { JAVA* } # Simplified Chinese (DBCS subset of ibm-1383) +ibm-17221_P100-2001 { UTR22* } ibm-17221 { IBM* } # Simplified Chinese (EBCDIC) +ibm-17248_X110-1999 { UTR22* } ibm-17248 { IBM* } # PC Arabic (w/ euro update) Updated version of ibm-864 +ibm-21344_P101-2000 { UTR22* } ibm-21344 { IBM* } # PC Arabic. Updated version of ibm-864 +ibm-21427_P100-1999 { UTR22* } ibm-21427 { IBM* } # Traditional Chinese (DBCS subset of ibm-1370) +ibm-256_P100-1995 { UTR22* } ibm-256 { IBM* } # Latin 1 EBCDIC +ibm-259_P100-1995 { UTR22* } ibm-259 { IBM* } IBM-Symbols { IANA* } csIBMSymbols { IANA } +ibm-274_P100-2000 { UTR22* } ibm-274 { IBM* } IBM274 { IANA* } EBCDIC-BE { IANA } CP274 { IANA } csIBM274 { IANA } +ibm-275_P100-1995 { UTR22* } ibm-275 { IBM* } IBM275 { IANA* } EBCDIC-BR { IANA } cp275 { IANA } csIBM275 { IANA } +ibm-286_P100-2003 { UTR22* } ibm-286 { IBM* } EBCDIC-AT-DE-A { IANA* } csEBCDICATDEA { IANA } +ibm-293_P100-1995 { UTR22* } ibm-293 { IBM* } # APL EBCDIC (APL: A Programming Language) +ibm-300_P120-2006 { UTR22* } ibm-300 { IBM* } x-IBM300 { JAVA* } # Japanese (DBCS subset of ibm-930 and ibm-939) +ibm-301_P110-1997 { UTR22* } ibm-301 { IBM* } x-IBM301 { JAVA* } # Japanese (DBCS subset of ibm-943) +ibm-33058_P100-2000 { UTR22* } ibm-33058 { IBM* } # SBCS (Katakana) +ibm-425_P101-2000 { UTR22* } ibm-425 { IBM* } # Arabic (EBCDIC) +ibm-4930_P110-1999 { UTR22* } ibm-4930 { IBM* } # Korean (DBCS subset of ibm-1364) 
+ibm-4933_P100-2002 { UTR22* } ibm-4933 { IBM* } # S-Ch (DBCS subset of ibm-1388) +ibm-4948_P100-1995 { UTR22* } ibm-4948 { IBM* } +ibm-4951_P100-1995 { UTR22* } ibm-4951 { IBM* } +ibm-4952_P100-1995 { UTR22* } ibm-4952 { IBM* } +ibm-4960_P100-1995 { UTR22* } ibm-4960 { IBM* } +ibm-5039_P11A-1998 { UTR22* } ibm-5039 { IBM* } # Japanese (HP Shift-JIS variant) +ibm-5048_P100-1995 { UTR22* } ibm-5048 { IBM* } # Japanese (DBCS subset of ibm-1350, JIS X208-1990) +ibm-5049_P100-1995 { UTR22* } ibm-5049 { IBM* } # Japanese (DBCS subset of ibm-1350, JIS X212) +ibm-5067_P100-1995 { UTR22* } ibm-5067 { IBM* } # Korean (DBCS subset of ibm-21450) +ibm-5104_X110-1999 { UTR22* } ibm-5104 { IBM* } # cp1008, 8-bit Arabic (w/ euro update) +ibm-5233_P100-2011 { UTR22* } ibm-5233 { IBM* } # Devanagari EBCDIC, including Indian Rupee +ibm-806_P100-1998 { UTR22* } ibm-806 { IBM* } # Hindi (ISCII variant) +ibm-808_P100-1999 { UTR22* } ibm-808 { IBM* } x-IBM808 { JAVA* } # Cyrillic +ibm-833_P100-1995 { UTR22* } ibm-833 { IBM* } x-IBM833 { JAVA* } +ibm-834_P100-1995 { UTR22* } ibm-834 { IBM* } x-IBM834 { JAVA* } # Korean (DBCS subset of ibm-933) +ibm-835_P100-1995 { UTR22* } ibm-835 { IBM* } x-IBM835 { JAVA* } # Traditional Chinese (DBCS subset of ibm-5033) +ibm-836_P100-1995 { UTR22* } ibm-836 { IBM* } x-IBM836 { JAVA* } +ibm-837_P100-2011 { UTR22* } ibm-837 { IBM* } x-IBM837 { JAVA* } # Simplified Chinese (DBCS subset of ibm-5031) +ibm-848_P100-1999 { UTR22* } ibm-848 { IBM* } # Cyrillic (euro update of ibm-1125) +ibm-849_P100-1999 { UTR22* } ibm-849 { IBM* } # Cyrillic Belarus (euro update of ibm-1131) +ibm-859_P100-1999 { UTR22* } ibm-859 { IBM* } x-IBM859 { JAVA* } # PC Latin 9 (w/ euro update) +ibm-8612_P100-1995 { UTR22* } ibm-8612 { IBM* } # Arabic (EBCDIC update of ibm-420) +ibm-872_P100-1999 { UTR22* } ibm-872 { IBM* } # Cyrillic (Euro update of ibm-855) +ibm-880_P100-1995 { UTR22* } ibm-880 { IBM* } IBM880 { IANA* } cp880 { IANA } EBCDIC-Cyrillic { IANA } csIBM880 { IANA } 
windows-20880 { WINDOWS* } +ibm-896_P100-1995 { UTR22* } ibm-896 { IBM* } # SBCS Katakana +ibm-897_P100-1995 { UTR22* } ibm-897 { IBM* } JIS_X0201 { IANA* } X0201 { IANA } csHalfWidthKatakana { IANA } x-IBM897 { JAVA* } +ibm-9027_P100-1999 { UTR22* } ibm-9027 { IBM* } # DBCS T-Ch Host. Euro update of ibm-835. DBCS portion of ibm-1371. +ibm-9048_P100-1998 { UTR22* } ibm-9048 { IBM* } # Hebrew (Euro and Sequel update of ibm-856) +ibm-905_P100-1995 { UTR22* } ibm-905 { IBM* } IBM905 { IANA* } CP905 { IANA } ebcdic-cp-tr { IANA } csIBM905 { IANA } windows-20905 { WINDOWS* } +ibm-9056_P100-1995 { UTR22* } ibm-9056 { IBM* } # Arabic +ibm-9061_P100-1999 { UTR22* } ibm-9061 { IBM* } # Greek (w/ euro update) +ibm-9145_P110-1997 { UTR22* } ibm-9145 { IBM* } # Japanese (DBCS subset of ibm-5050) +ibm-9238_X110-1999 { UTR22* } ibm-9238 { IBM* } # cp1046, PC Arabic Extended (w/ euro update) +ibm-924_P100-1998 { UTR22* } ibm-924 { IBM* } IBM00924 { IANA* } CCSID00924 { IANA } CP00924 { IANA } ebcdic-Latin9--euro { IANA } +ibm-926_P100-2000 { UTR22* } ibm-926 { IBM* } # Korean (DBCS subset of ibm-944) +ibm-927_P100-1995 { UTR22* } ibm-927 { IBM* } x-IBM927 { JAVA* } # Traditional Chinese (DBCS subset of ibm-948) +ibm-928_P100-1995 { UTR22* } ibm-928 { IBM* } # Simplified Chinese (DBCS subset of ibm-936) +ibm-941_P13A-2001 { UTR22* } ibm-941 { IBM* } # DBCS portion of ibm-943 +ibm-944_P100-1995 { UTR22* } ibm-944 { IBM* } # Korean +ibm-946_P100-1995 { UTR22* } ibm-946 { IBM* } # Simplified Chinese +ibm-947_P100-1995 { UTR22* } ibm-947 { IBM* } x-IBM947 { JAVA* } # Traditional Chinese (DBCS subset of ibm-950) +ibm-948_P110-1999 { UTR22* } ibm-948 { IBM* } x-IBM948 { JAVA* } # Traditional Chinese +ibm-951_P100-1995 { UTR22* } ibm-951 { IBM* } x-IBM951 { JAVA* } # Korean (DBCS subset of ibm-949) +ibm-952_P110-1997 { UTR22* } ibm-952 { IBM* } x-JIS0208 # Pure DBCS, Japanese EUC, G1 - JIS X208-1990 +ibm-953_P100-2000 { UTR22* } ibm-953 { IBM* } JIS_X0212-1990 { IANA* } # Pure DBCS, 
Japanese EUC, G3 - JIS X 0212-1990 +ibm-955_P110-1997 { UTR22* } ibm-955 { IBM* } # Pure DBCS, Japanese EUC, G0 - JIS X208-1978 +ibm-9577_P100-2001 { UTR22* } ibm-9577 { IBM* } ibm-1385 { IBM } x-IBM1385 { JAVA* } # ibm-9577 and ibm-1385 are identical DBCS tables. +iso-8859_16-2001 { UTR22* } ISO-8859-16 { IANA* } iso-ir-226 { IANA } ISO_8859-16:2001 { IANA } latin10 { IANA } l10 { IANA } + +# To be considered for listing at a later date for the data library customization tool +#ibm-1159_P100-1999 { UTR22* } ibm-1159 { IBM* } # SBCS T-Ch Host. Euro update of ibm-28709. This is used in combination with another CCSID mapping. +#ibm-960_P100-2000 { UTR22* } ibm-960 { IBM* } # Pure DBCS, CNS11643 plane 1 +#ibm-963_P100-1995 { UTR22* } ibm-963 { IBM* } # Pure DBCS, CNS11643 plane 2 Traditional Chinese (DBCS subset of ibm-965) diff --git a/rust/pspp/fuzz/.gitignore b/rust/pspp/fuzz/.gitignore new file mode 100644 index 0000000000..1a45eee776 --- /dev/null +++ b/rust/pspp/fuzz/.gitignore @@ -0,0 +1,4 @@ +target +corpus +artifacts +coverage diff --git a/rust/pspp/fuzz/Cargo.lock b/rust/pspp/fuzz/Cargo.lock new file mode 100644 index 0000000000..c840c28160 --- /dev/null +++ b/rust/pspp/fuzz/Cargo.lock @@ -0,0 +1,872 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. 
+version = 3 + +[[package]] +name = "adler" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" + +[[package]] +name = "android-tzdata" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e999941b234f3131b00bc13c22d06e8c5ff726d1b6318ac7eb276997bbb4fef0" + +[[package]] +name = "android_system_properties" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311" +dependencies = [ + "libc", +] + +[[package]] +name = "anstream" +version = "0.6.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "418c75fa768af9c03be99d17643f93f79bbba589895012a80e3452a19ddda15b" +dependencies = [ + "anstyle", + "anstyle-parse", + "anstyle-query", + "anstyle-wincon", + "colorchoice", + "is_terminal_polyfill", + "utf8parse", +] + +[[package]] +name = "anstyle" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "038dfcf04a5feb68e9c60b21c9625a54c2c0616e79b72b0fd87075a056ae1d1b" + +[[package]] +name = "anstyle-parse" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c03a11a9034d92058ceb6ee011ce58af4a9bf61491aa7e1e59ecd24bd40d22d4" +dependencies = [ + "utf8parse", +] + +[[package]] +name = "anstyle-query" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ad186efb764318d35165f1758e7dcef3b10628e26d41a44bc5550652e6804391" +dependencies = [ + "windows-sys 0.52.0", +] + +[[package]] +name = "anstyle-wincon" +version = "3.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61a38449feb7068f52bb06c12759005cf459ee52bb4adc1d5a7c4322d716fb19" +dependencies = [ + "anstyle", + "windows-sys 0.52.0", +] + +[[package]] +name = "anyhow" +version = 
"1.0.86" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b3d1d046238990b9cf5bcde22a3fb3584ee5cf65fb2765f454ed428c7a0063da" + +[[package]] +name = "arbitrary" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7d5a26814d8dcb93b0e5a0ff3c6d80a8843bafb21b39e8e18a6f05471870e110" + +[[package]] +name = "atty" +version = "0.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8" +dependencies = [ + "hermit-abi", + "libc", + "winapi", +] + +[[package]] +name = "autocfg" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c4b4d0bd25bd0b74681c0ad21497610ce1b7c91b1022cd21c80c6fbdd9476b0" + +[[package]] +name = "bitflags" +version = "2.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b048fb63fd8b5923fc5aa7b340d8e156aec7ec02f0c78fa8a6ddc2613f6f71de" + +[[package]] +name = "bumpalo" +version = "3.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c" + +[[package]] +name = "cc" +version = "1.0.106" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "066fce287b1d4eafef758e89e09d724a24808a9196fe9756b8ca90e86d0719a2" +dependencies = [ + "jobserver", + "libc", + "once_cell", +] + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "chrono" +version = "0.4.38" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a21f936df1771bf62b77f047b726c4625ff2e8aa607c01ec06e5a05bd8463401" +dependencies = [ + "android-tzdata", + "iana-time-zone", + "js-sys", + "num-traits", + "wasm-bindgen", + "windows-targets 0.52.6", +] + +[[package]] 
+name = "clap" +version = "4.5.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "84b3edb18336f4df585bc9aa31dd99c036dfa5dc5e9a2939a722a188f3a8970d" +dependencies = [ + "clap_builder", + "clap_derive", +] + +[[package]] +name = "clap_builder" +version = "4.5.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c1c09dd5ada6c6c78075d6fd0da3f90d8080651e2d6cc8eb2f1aaa4034ced708" +dependencies = [ + "anstream", + "anstyle", + "clap_lex", + "strsim", + "terminal_size", +] + +[[package]] +name = "clap_derive" +version = "4.5.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2bac35c6dafb060fd4d275d9a4ffae97917c13a6327903a8be2153cd964f7085" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "clap_lex" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4b82cf0babdbd58558212896d1a4272303a57bdb245c2bf1147185fb45640e70" + +[[package]] +name = "colorchoice" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b6a852b24ab71dffc585bcb46eaf7959d175cb865a7152e35b348d1b2960422" + +[[package]] +name = "core-foundation-sys" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06ea2b9bc92be3c2baa9334a323ebca2d6f074ff852cd1d7b11064035cd3868f" + +[[package]] +name = "crc32fast" +version = "1.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a97769d94ddab943e4510d138150169a2758b5ef3eb191a9ee688de3e23ef7b3" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "encoding_rs" +version = "0.8.34" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b45de904aa0b010bce2ab45264d0631681847fa7b6f2eaa7dab7619943bc4f59" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "equivalent" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" + +[[package]] +name = "errno" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "534c5cf6194dfab3db3242765c03bbe257cf92f22b38f6bc0c58d59108a820ba" +dependencies = [ + "libc", + "windows-sys 0.52.0", +] + +[[package]] +name = "finl_unicode" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fcfdc7a0362c9f4444381a9e697c79d435fe65b52a37466fc2c1184cee9edc6" + +[[package]] +name = "flate2" +version = "1.0.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f54427cfd1c7829e2a139fcefea601bf088ebca651d2bf53ebc600eac295dae" +dependencies = [ + "crc32fast", + "miniz_oxide", +] + +[[package]] +name = "float_next_after" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8bf7cc16383c4b8d58b9905a8509f02926ce3058053c056376248d958c9df1e8" + +[[package]] +name = "hashbrown" +version = "0.14.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" + +[[package]] +name = "heck" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" + +[[package]] +name = "hermit-abi" +version = "0.1.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33" +dependencies = [ + "libc", +] + +[[package]] +name = "hexplay" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0962bea6731e28b5a443ba4aa00fe3e4fe7555dadf12012435efb738eeac5898" +dependencies = [ + "atty", + "termcolor", +] + +[[package]] +name = "iana-time-zone" +version = "0.1.60" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"e7ffbb5a1b541ea2561f8c41c087286cc091e21e556a4f09a8f6cbf17b69b141" +dependencies = [ + "android_system_properties", + "core-foundation-sys", + "iana-time-zone-haiku", + "js-sys", + "wasm-bindgen", + "windows-core", +] + +[[package]] +name = "iana-time-zone-haiku" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f" +dependencies = [ + "cc", +] + +[[package]] +name = "indexmap" +version = "2.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "168fb715dda47215e360912c096649d23d58bf392ac62f73919e831745e40f26" +dependencies = [ + "equivalent", + "hashbrown", +] + +[[package]] +name = "is_terminal_polyfill" +version = "1.70.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8478577c03552c21db0e2724ffb8986a5ce7af88107e6be5d2ee6e158c12800" + +[[package]] +name = "jobserver" +version = "0.1.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2b099aaa34a9751c5bf0878add70444e1ed2dd73f347be99003d4577277de6e" +dependencies = [ + "libc", +] + +[[package]] +name = "js-sys" +version = "0.3.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29c15563dc2726973df627357ce0c9ddddbea194836909d655df6a75d2cf296d" +dependencies = [ + "wasm-bindgen", +] + +[[package]] +name = "lazy_static" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" + +[[package]] +name = "libc" +version = "0.2.155" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97b3888a4aecf77e811145cadf6eef5901f4782c53886191b2f693f24761847c" + +[[package]] +name = "libfuzzer-sys" +version = "0.4.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a96cfd5557eb82f2b83fed4955246c988d331975a002961b07c81584d107e7f7" +dependencies 
= [ + "arbitrary", + "cc", + "once_cell", +] + +[[package]] +name = "linux-raw-sys" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89" + +[[package]] +name = "log" +version = "0.4.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24" + +[[package]] +name = "miniz_oxide" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8a240ddb74feaf34a79a7add65a741f3167852fba007066dcac1ca548d89c08" +dependencies = [ + "adler", +] + +[[package]] +name = "num" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "35bd024e8b2ff75562e5f34e7f4905839deb4b22955ef5e73d2fea1b9813cb23" +dependencies = [ + "num-bigint", + "num-complex", + "num-integer", + "num-iter", + "num-rational", + "num-traits", +] + +[[package]] +name = "num-bigint" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a5e44f723f1133c9deac646763579fdb3ac745e418f2a7af9cd0c431da1f20b9" +dependencies = [ + "num-integer", + "num-traits", +] + +[[package]] +name = "num-complex" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73f88a1307638156682bada9d7604135552957b7818057dcef22705b4d509495" +dependencies = [ + "num-traits", +] + +[[package]] +name = "num-derive" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed3955f1a9c7c0c15e092f9c887db08b1fc683305fdf6eb6684f22555355e202" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "num-integer" +version = "0.1.46" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f" +dependencies = [ + "num-traits", +] + +[[package]] +name = 
"num-iter" +version = "0.1.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1429034a0490724d0075ebb2bc9e875d6503c3cf69e235a8941aa757d83ef5bf" +dependencies = [ + "autocfg", + "num-integer", + "num-traits", +] + +[[package]] +name = "num-rational" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f83d14da390562dca69fc84082e73e548e1ad308d24accdedd2720017cb37824" +dependencies = [ + "num-bigint", + "num-integer", + "num-traits", +] + +[[package]] +name = "num-traits" +version = "0.2.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" +dependencies = [ + "autocfg", +] + +[[package]] +name = "once_cell" +version = "1.19.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" + +[[package]] +name = "ordered-float" +version = "3.9.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1e1c390732d15f1d48471625cd92d154e66db2c56645e29a9cd26f4699f72dc" +dependencies = [ + "num-traits", +] + +[[package]] +name = "proc-macro2" +version = "1.0.86" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e719e8df665df0d1c8fbfd238015744736151d4445ec0836b8e628aae103b77" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "pspp" +version = "1.0.0" +dependencies = [ + "anyhow", + "bitflags", + "chrono", + "clap", + "encoding_rs", + "finl_unicode", + "flate2", + "float_next_after", + "hexplay", + "indexmap", + "lazy_static", + "libc", + "num", + "num-derive", + "num-traits", + "ordered-float", + "thiserror", + "unicase", + "utf8-decode", + "windows-sys 0.48.0", +] + +[[package]] +name = "pspp-fuzz" +version = "0.0.0" +dependencies = [ + "libfuzzer-sys", + "pspp", +] + +[[package]] +name = "quote" +version = "1.0.36" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fa76aaf39101c457836aec0ce2316dbdc3ab723cdda1c6bd4e6ad4208acaca7" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "rustix" +version = "0.38.34" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70dc5ec042f7a43c4a73241207cecc9873a06d45debb38b329f8541d85c2730f" +dependencies = [ + "bitflags", + "errno", + "libc", + "linux-raw-sys", + "windows-sys 0.52.0", +] + +[[package]] +name = "strsim" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" + +[[package]] +name = "syn" +version = "2.0.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "201fcda3845c23e8212cd466bfebf0bd20694490fc0356ae8e428e0824a915a6" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "termcolor" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "adc4587ead41bf016f11af03e55a624c06568b5a19db4e90fde573d805074f83" +dependencies = [ + "wincolor", +] + +[[package]] +name = "terminal_size" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "21bebf2b7c9e0a515f6e0f8c51dc0f8e4696391e6f1ff30379559f8365fb0df7" +dependencies = [ + "rustix", + "windows-sys 0.48.0", +] + +[[package]] +name = "thiserror" +version = "1.0.61" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c546c80d6be4bc6a00c0f01730c08df82eaa7a7a61f11d656526506112cc1709" +dependencies = [ + "thiserror-impl", +] + +[[package]] +name = "thiserror-impl" +version = "1.0.61" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "46c3384250002a6d5af4d114f2845d37b57521033f30d5c3f46c4d70e1197533" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "unicase" +version = "2.7.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "f7d2d4dafb69621809a81864c9c1b864479e1235c0dd4e199924b9742439ed89" +dependencies = [ + "version_check", +] + +[[package]] +name = "unicode-ident" +version = "1.0.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" + +[[package]] +name = "utf8-decode" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca61eb27fa339aa08826a29f03e87b99b4d8f0fc2255306fd266bb1b6a9de498" + +[[package]] +name = "utf8parse" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" + +[[package]] +name = "version_check" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" + +[[package]] +name = "wasm-bindgen" +version = "0.2.92" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4be2531df63900aeb2bca0daaaddec08491ee64ceecbee5076636a3b026795a8" +dependencies = [ + "cfg-if", + "wasm-bindgen-macro", +] + +[[package]] +name = "wasm-bindgen-backend" +version = "0.2.92" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "614d787b966d3989fa7bb98a654e369c762374fd3213d212cfc0251257e747da" +dependencies = [ + "bumpalo", + "log", + "once_cell", + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-macro" +version = "0.2.92" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1f8823de937b71b9460c0c34e25f3da88250760bec0ebac694b49997550d726" +dependencies = [ + "quote", + "wasm-bindgen-macro-support", +] + +[[package]] +name = "wasm-bindgen-macro-support" +version = "0.2.92" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"e94f17b526d0a461a191c78ea52bbce64071ed5c04c9ffe424dcb38f74171bb7" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-backend", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-shared" +version = "0.2.92" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af190c94f2773fdb3729c55b007a722abb5384da03bc0986df4c289bf5567e96" + +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + +[[package]] +name = "wincolor" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eeb06499a3a4d44302791052df005d5232b927ed1a9658146d842165c4de7767" +dependencies = [ + "winapi", +] + +[[package]] +name = "windows-core" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "33ab640c8d7e35bf8ba19b884ba838ceb4fba93a4e8c65a9059d08afcfc683d9" +dependencies = [ + "windows-targets 0.52.6", +] + +[[package]] +name = "windows-sys" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" +dependencies = [ + "windows-targets 0.48.5", +] + +[[package]] +name = "windows-sys" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" +dependencies = [ + "windows-targets 0.52.6", +] + +[[package]] +name = "windows-targets" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c" +dependencies = [ + "windows_aarch64_gnullvm 0.48.5", + "windows_aarch64_msvc 0.48.5", + "windows_i686_gnu 0.48.5", + "windows_i686_msvc 0.48.5", + "windows_x86_64_gnu 0.48.5", + "windows_x86_64_gnullvm 0.48.5", + "windows_x86_64_msvc 0.48.5", +] + +[[package]] +name = "windows-targets" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" +dependencies = [ + "windows_aarch64_gnullvm 0.52.6", + "windows_aarch64_msvc 0.52.6", + "windows_i686_gnu 0.52.6", + "windows_i686_gnullvm", + "windows_i686_msvc 0.52.6", + "windows_x86_64_gnu 0.52.6", + "windows_x86_64_gnullvm 0.52.6", + "windows_x86_64_msvc 0.52.6", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" + +[[package]] +name = "windows_i686_gnu" +version = "0.48.5" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" + +[[package]] +name = "windows_i686_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" + +[[package]] +name = "windows_i686_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" + +[[package]] +name = "windows_i686_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" diff --git a/rust/pspp/fuzz/Cargo.toml b/rust/pspp/fuzz/Cargo.toml new file mode 100644 index 0000000000..8b44789bad --- /dev/null +++ b/rust/pspp/fuzz/Cargo.toml @@ -0,0 +1,28 @@ +[package] +name = "pspp-fuzz" +version = "0.0.0" +publish = false +edition = "2021" + +[package.metadata] +cargo-fuzz = true + +[dependencies] +libfuzzer-sys = "0.4" + +[dependencies.pspp] +path = ".." + +[[bin]] +name = "fuzz_target_1" +path = "fuzz_targets/fuzz_target_1.rs" +test = false +doc = false +bench = false + +[[bin]] +name = "segment" +path = "fuzz_targets/segment.rs" +test = false +doc = false +bench = false diff --git a/rust/pspp/fuzz/fuzz_targets/fuzz_target_1.rs b/rust/pspp/fuzz/fuzz_targets/fuzz_target_1.rs new file mode 100644 index 0000000000..43a88c14f3 --- /dev/null +++ b/rust/pspp/fuzz/fuzz_targets/fuzz_target_1.rs @@ -0,0 +1,7 @@ +#![no_main] + +use libfuzzer_sys::fuzz_target; + +fuzz_target!(|data: &[u8]| { + // fuzzed code goes here +}); diff --git a/rust/pspp/fuzz/fuzz_targets/segment.rs b/rust/pspp/fuzz/fuzz_targets/segment.rs new file mode 100644 index 0000000000..1e5a109449 --- /dev/null +++ b/rust/pspp/fuzz/fuzz_targets/segment.rs @@ -0,0 +1,18 @@ +#![no_main] + +use libfuzzer_sys::fuzz_target; +use pspp::lex::segment::{Segmenter, Mode, Type}; + +fuzz_target!(|data: &[u8]| { + if let Ok(mut input) = std::str::from_utf8(data) { + let mut segmenter = Segmenter::new(Mode::Auto, false); + loop { + let (rest, type_) = segmenter.push(input, true).unwrap(); + match type_ { + Type::End => break, + _ => (), + } + input = rest; + } + } +}); diff --git a/rust/pspp/src/command.rs b/rust/pspp/src/command.rs new file mode 100644 index 0000000000..d337d1823a --- /dev/null +++ b/rust/pspp/src/command.rs 
@@ -0,0 +1,192 @@ +use std::{fmt::Write, sync::OnceLock}; + +use flagset::{flags, FlagSet}; + +use crate::{ + integer::ToInteger, + lex::{ + command_name::CommandMatcher, + lexer::Lexer, + token::{Punct, Token}, + }, + message::Diagnostic, +}; + +flags! { + enum State: u8 { + /// No active dataset yet defined. + Initial, + + /// Active dataset has been defined. + Data, + + /// Inside `INPUT PROGRAM`. + InputProgram, + + /// Inside `FILE TYPE`. + FileType, + + /// State nested inside `LOOP` or `DO IF`, inside [State::Data]. + NestedData, + + /// State nested inside `LOOP` or `DO IF`, inside [State::InputProgram]. + NestedInputProgram, + } +} + +struct Command { + allowed_states: FlagSet, + enhanced_only: bool, + testing_only: bool, + no_abbrev: bool, + name: &'static str, + run: Box, +} + +fn commands() -> &'static [Command] { + fn new_commands() -> Vec { + vec![Command { + allowed_states: State::Initial | State::Data, + enhanced_only: false, + testing_only: false, + no_abbrev: false, + name: "ECHO", + run: Box::new(|_context| { + println!("hi"); + }), + }] + } + + static COMMANDS: OnceLock> = OnceLock::new(); + COMMANDS.get_or_init(|| new_commands()).as_slice() +} + +fn parse_command_word(lexer: &mut Lexer, s: &mut String, n: isize) -> bool { + let separator = match s.chars().next_back() { + Some(c) if c != '-' => " ", + _ => "", + }; + + match lexer.next(n) { + Token::Punct(Punct::Dash) => { + s.push('-'); + true + } + Token::Id(id) => { + write!(s, "{separator}{id}").unwrap(); + true + } + Token::Number(number) if number.is_sign_positive() => { + if let Some(integer) = number.to_exact_usize() { + write!(s, "{separator}{integer}").unwrap(); + true + } else { + false + } + } + _ => false, + } +} + +fn find_best_match(s: &str) -> (Option<&'static Command>, isize) { + let mut cm = CommandMatcher::new(s); + for command in commands() { + cm.add(command.name, command); + } + cm.get_match() +} + +fn parse_command_name( + lexer: &mut Lexer, + error: &Box, +) -> 
Result<(&'static Command, isize), ()> { + let mut s = String::new(); + let mut word = 0; + let mut missing_words = 0; + let mut command = None; + while parse_command_word(lexer, &mut s, word) { + (command, missing_words) = find_best_match(&s); + if missing_words <= 0 { + break; + } + word += 1; + } + if command.is_none() && missing_words > 0 { + s.push_str(" ."); + (command, missing_words) = find_best_match(&s); + s.truncate(s.len() - 2); + } + + match command { + Some(command) => Ok((command, (word + 1) + missing_words)), + None => { + if s.is_empty() { + error(lexer.error("Syntax error expecting command name")) + } else { + error(lexer.error("Unknown command `{s}`.")) + }; + Err(()) + } + } +} + +pub enum Success { + Success, + Eof, + Finish, +} + +pub fn end_of_command(context: &Context) -> Result { + match context.lexer.token() { + Token::EndCommand | Token::End => Ok(Success::Success), + _ => { + context.error( + context + .lexer + .error("Syntax error expecting end of command."), + ); + Err(()) + } + } +} + +fn parse_in_state(lexer: &mut Lexer, error: &Box, _state: State) { + match lexer.token() { + Token::End | Token::EndCommand => (), + _ => { + if let Ok((command, n_tokens)) = parse_command_name(lexer, error) { + for _ in 0..n_tokens { + lexer.get(); + } + let context = Context { + error, + lexer, + command_name: Some(command.name), + }; + (command.run)(&context); + end_of_command(&context); + } + lexer.interactive_reset(); + lexer.discard_rest_of_command(); + } + } + while let Token::EndCommand = lexer.token() { + lexer.get(); + } +} + +pub fn parse(lexer: &mut Lexer, error: &Box) { + parse_in_state(lexer, error, State::Initial) +} + +pub struct Context<'a> { + error: &'a Box, + lexer: &'a mut Lexer, + command_name: Option<&'static str>, +} + +impl<'a> Context<'a> { + pub fn error(&self, diagnostic: Diagnostic) { + (self.error)(diagnostic); + } +} diff --git a/rust/pspp/src/cooked.rs b/rust/pspp/src/cooked.rs new file mode 100644 index 
0000000000..d2617df528 --- /dev/null +++ b/rust/pspp/src/cooked.rs @@ -0,0 +1,1482 @@ +use std::{cell::RefCell, collections::HashMap, ops::Range, rc::Rc}; + +use crate::{ + dictionary::{Dictionary, VarWidth, Variable}, + encoding::Error as EncodingError, + endian::Endian, + format::{Error as FormatError, Format, UncheckedFormat}, + identifier::{Error as IdError, Identifier}, + raw::{ + self, Cases, DecodedRecord, DocumentRecord, EncodingRecord, Extension, FileAttributeRecord, + FloatInfoRecord, HeaderRecord, IntegerInfoRecord, LongNamesRecord, + LongStringMissingValueRecord, LongStringValueLabelRecord, MultipleResponseRecord, + NumberOfCasesRecord, ProductInfoRecord, RawStr, ValueLabel, ValueLabelRecord, + VarDisplayRecord, VariableAttributeRecord, VariableRecord, VariableSetRecord, + VeryLongStringsRecord, ZHeader, ZTrailer, + }, +}; +use chrono::{NaiveDate, NaiveDateTime, NaiveTime}; +use encoding_rs::Encoding; +use num::Integer; +use thiserror::Error as ThisError; + +pub use crate::raw::{CategoryLabels, Compression}; + +#[derive(ThisError, Debug)] +pub enum Error { + #[error("Missing header record")] + MissingHeaderRecord, + + // XXX this is an internal error + #[error("More than one file header record")] + DuplicateHeaderRecord, + + #[error("{0}")] + EncodingError(EncodingError), + + #[error("Using default encoding {0}.")] + UsingDefaultEncoding(String), + + #[error("Variable record from offset {:x} to {:x} specifies width {width} not in valid range [-1,255).", offsets.start, offsets.end)] + InvalidVariableWidth { offsets: Range, width: i32 }, + + #[error("This file has corrupted metadata written by a buggy version of PSPP. To ensure that other software can read it correctly, save a new copy of the file.")] + InvalidLongMissingValueFormat, + + #[error("File creation date {creation_date} is not in the expected format \"DD MMM YY\" format. 
Using 01 Jan 1970.")] + InvalidCreationDate { creation_date: String }, + + #[error("File creation time {creation_time} is not in the expected format \"HH:MM:SS\" format. Using midnight.")] + InvalidCreationTime { creation_time: String }, + + #[error("{id_error} Renaming variable to {new_name}.")] + InvalidVariableName { + id_error: IdError, + new_name: Identifier, + }, + + #[error( + "Substituting {new_spec} for invalid print format on variable {variable}. {format_error}" + )] + InvalidPrintFormat { + new_spec: Format, + variable: Identifier, + format_error: FormatError, + }, + + #[error( + "Substituting {new_spec} for invalid write format on variable {variable}. {format_error}" + )] + InvalidWriteFormat { + new_spec: Format, + variable: Identifier, + format_error: FormatError, + }, + + #[error("Renaming variable with duplicate name {duplicate_name} to {new_name}.")] + DuplicateVariableName { + duplicate_name: Identifier, + new_name: Identifier, + }, + + #[error("Dictionary index {dict_index} is outside valid range [1,{max_index}].")] + InvalidDictIndex { dict_index: usize, max_index: usize }, + + #[error("Dictionary index {0} refers to a long string continuation.")] + DictIndexIsContinuation(usize), + + #[error("At offset {offset:#x}, one or more variable indexes for value labels referred to long string continuation records: {indexes:?}")] + LongStringContinuationIndexes { offset: u64, indexes: Vec }, + + #[error( + "At offsets {:#x}...{:#x}, record types 3 and 4 may not add value labels to one or more long string variables: {variables:?}", .offsets.start, .offsets.end + )] + InvalidLongStringValueLabels { + offsets: Range, + variables: Vec, + }, + + #[error("Variables associated with value label are not all of identical type. Variable {numeric_var} is numeric, but variable {string_var} is string.")] + ValueLabelsDifferentTypes { + numeric_var: Identifier, + string_var: Identifier, + }, + + #[error("Invalid multiple response set name. 
{0}")] + InvalidMrSetName(IdError), + + #[error("Multiple response set {mr_set} includes unknown variable {short_name}.")] + UnknownMrSetVariable { + mr_set: Identifier, + short_name: Identifier, + }, + + #[error("Multiple response set {0} has no variables.")] + EmptyMrSet(Identifier), + + #[error("Multiple response set {0} has only one variable.")] + OneVarMrSet(Identifier), + + #[error("Multiple response set {0} contains both string and numeric variables.")] + MixedMrSet(Identifier), + + #[error( + "Invalid numeric format for counted value {number} in multiple response set {mr_set}." + )] + InvalidMDGroupCountedValue { mr_set: Identifier, number: String }, + + #[error("Counted value {value} has width {width}, but it must be no wider than {max_width}, the width of the narrowest variable in multiple response set {mr_set}.")] + TooWideMDGroupCountedValue { + mr_set: Identifier, + value: String, + width: usize, + max_width: u16, + }, + + #[error("Long string value label for variable {name} has width {width}, which is not in the valid range [{min_width},{max_width}].")] + InvalidLongValueLabelWidth { + name: Identifier, + width: u32, + min_width: u16, + max_width: u16, + }, + + #[error("Invalid attribute name. {0}")] + InvalidAttributeName(IdError), + + #[error("Invalid short name in long variable name record. {0}")] + InvalidShortName(IdError), + + #[error("Invalid name in long variable name record. {0}")] + InvalidLongName(IdError), + + #[error("Invalid variable name in very long string record. {0}")] + InvalidLongStringName(IdError), + + #[error("Invalid variable name in long string value label record. {0}")] + InvalidLongStringValueLabelName(IdError), + + #[error("Invalid variable name in attribute record. {0}")] + InvalidAttributeVariableName(IdError), + + // XXX This is risky because `text` might be arbitarily long. 
+ #[error("Text string contains invalid bytes for {encoding} encoding: {text}")] + MalformedString { encoding: String, text: String }, + + #[error("Details TBD")] + TBD, +} + +type DictIndex = usize; + +#[derive(Clone, Debug)] +pub struct Headers { + pub header: HeaderRecord, + pub variable: Vec>, + pub value_label: Vec, String>>, + pub document: Vec>, + pub integer_info: Option, + pub float_info: Option, + pub var_display: Option, + pub multiple_response: Vec>, + pub long_string_value_labels: Vec>, + pub long_string_missing_values: Vec>, + pub encoding: Option, + pub number_of_cases: Option, + pub variable_sets: Vec, + pub product_info: Option, + pub long_names: Vec, + pub very_long_strings: Vec, + pub file_attributes: Vec, + pub variable_attributes: Vec, + pub other_extension: Vec, + pub end_of_headers: Option, + pub z_header: Option, + pub z_trailer: Option, + pub cases: Option>>, +} + +fn take_first(mut vec: Vec, more_than_one: F) -> Option +where + F: FnOnce(), +{ + if vec.len() > 1 { + more_than_one(); + } + vec.drain(..).next() +} + +impl Headers { + pub fn new(headers: Vec, warn: &impl Fn(Error)) -> Result { + let mut file_header = Vec::new(); + let mut variable = Vec::new(); + let mut value_label = Vec::new(); + let mut document = Vec::new(); + let mut integer_info = Vec::new(); + let mut float_info = Vec::new(); + let mut var_display = Vec::new(); + let mut multiple_response = Vec::new(); + let mut long_string_value_labels = Vec::new(); + let mut long_string_missing_values = Vec::new(); + let mut encoding = Vec::new(); + let mut number_of_cases = Vec::new(); + let mut variable_sets = Vec::new(); + let mut product_info = Vec::new(); + let mut long_names = Vec::new(); + let mut very_long_strings = Vec::new(); + let mut file_attributes = Vec::new(); + let mut variable_attributes = Vec::new(); + let mut other_extension = Vec::new(); + let mut end_of_headers = Vec::new(); + let mut z_header = Vec::new(); + let mut z_trailer = Vec::new(); + let mut cases = 
Vec::new(); + + for header in headers { + match header { + DecodedRecord::Header(record) => { + file_header.push(record); + } + DecodedRecord::Variable(record) => { + variable.push(record); + } + DecodedRecord::ValueLabel(record) => { + value_label.push(record); + } + DecodedRecord::Document(record) => { + document.push(record); + } + DecodedRecord::IntegerInfo(record) => { + integer_info.push(record); + } + DecodedRecord::FloatInfo(record) => { + float_info.push(record); + } + DecodedRecord::VariableSets(record) => { + variable_sets.push(record); + } + DecodedRecord::VarDisplay(record) => { + var_display.push(record); + } + DecodedRecord::MultipleResponse(record) => { + multiple_response.push(record); + } + DecodedRecord::LongStringValueLabels(record) => { + long_string_value_labels.push(record) + } + DecodedRecord::LongStringMissingValues(record) => { + long_string_missing_values.push(record); + } + DecodedRecord::Encoding(record) => { + encoding.push(record); + } + DecodedRecord::NumberOfCases(record) => { + number_of_cases.push(record); + } + DecodedRecord::ProductInfo(record) => { + product_info.push(record); + } + DecodedRecord::LongNames(record) => { + long_names.push(record); + } + DecodedRecord::VeryLongStrings(record) => { + very_long_strings.push(record); + } + DecodedRecord::FileAttributes(record) => { + file_attributes.push(record); + } + DecodedRecord::VariableAttributes(record) => { + variable_attributes.push(record); + } + DecodedRecord::OtherExtension(record) => { + other_extension.push(record); + } + DecodedRecord::EndOfHeaders(record) => { + end_of_headers.push(record); + } + DecodedRecord::ZHeader(record) => { + z_header.push(record); + } + DecodedRecord::ZTrailer(record) => { + z_trailer.push(record); + } + DecodedRecord::Cases(record) => { + cases.push(record); + } + } + } + + let Some(file_header) = take_first(file_header, || warn(Error::DuplicateHeaderRecord)) + else { + return Err(Error::MissingHeaderRecord); + }; + + Ok(Headers { + header: 
file_header, + variable, + value_label, + document, + integer_info: take_first(integer_info, || warn(Error::TBD)), + float_info: take_first(float_info, || warn(Error::TBD)), + var_display: take_first(var_display, || warn(Error::TBD)), + multiple_response, + long_string_value_labels, + long_string_missing_values, + encoding: take_first(encoding, || warn(Error::TBD)), + number_of_cases: take_first(number_of_cases, || warn(Error::TBD)), + variable_sets, + product_info: take_first(product_info, || warn(Error::TBD)), + long_names, + very_long_strings, + file_attributes, + variable_attributes, + other_extension, + end_of_headers: take_first(end_of_headers, || warn(Error::TBD)), + z_header: take_first(z_header, || warn(Error::TBD)), + z_trailer: take_first(z_trailer, || warn(Error::TBD)), + cases: take_first(cases, || warn(Error::TBD)), + }) + } +} + +pub struct Metadata { + creation: NaiveDateTime, + endian: Endian, + compression: Option, + n_cases: Option, + product: String, + product_ext: Option, + version: Option<(i32, i32, i32)>, +} + +impl Metadata { + fn decode(headers: &Headers, warn: impl Fn(Error)) -> Self { + let header = &headers.header; + let creation_date = NaiveDate::parse_from_str(&header.creation_date, "%e %b %Y") + .unwrap_or_else(|_| { + warn(Error::InvalidCreationDate { + creation_date: header.creation_date.to_string(), + }); + Default::default() + }); + let creation_time = NaiveTime::parse_from_str(&header.creation_time, "%H:%M:%S") + .unwrap_or_else(|_| { + warn(Error::InvalidCreationTime { + creation_time: header.creation_time.to_string(), + }); + Default::default() + }); + let creation = NaiveDateTime::new(creation_date, creation_time); + + let product = header + .eye_catcher + .trim_start_matches("@(#) SPSS DATA FILE") + .trim_end() + .to_string(); + + Self { + creation, + endian: header.endian, + compression: header.compression, + n_cases: header.n_cases.map(|n| n as u64), + product, + product_ext: headers.product_info.as_ref().map(|pe| 
fix_line_ends(&pe.0)), + version: headers.integer_info.as_ref().map(|ii| ii.version), + } + } +} + +struct Decoder { + //pub raw: raw::Decoder, + pub encoding: &'static Encoding, + //pub variables: HashMap, + //pub var_names: HashMap, + //pub dictionary: Dictionary, + //n_dict_indexes: usize, + n_generated_names: usize, +} + +impl Decoder { + fn generate_name(&mut self, dictionary: &Dictionary) -> Identifier { + loop { + self.n_generated_names += 1; + let name = Identifier::from_encoding(&format!("VAR{:03}", self.n_generated_names), self.encoding) + .unwrap(); + if !dictionary.variables.contains(&name.0) { + return name; + } + assert!(self.n_generated_names < usize::MAX); + } + } +} + +pub fn decode( + mut headers: Headers, + encoding: &'static Encoding, + warn: impl Fn(Error), +) -> Result<(Dictionary, Metadata), Error> { + let mut dictionary = Dictionary::new(encoding); + + let file_label = fix_line_ends(headers.header.file_label.trim_end_matches(' ')); + if !file_label.is_empty() { + dictionary.file_label = Some(file_label); + } + + for attributes in headers.file_attributes.drain(..) { + dictionary.attributes.extend(attributes.0 .0.into_iter()) + } + + // Concatenate all the document records (really there should only be one) + // and trim off the trailing spaces that pad them to 80 bytes. + dictionary.documents = headers + .document + .drain(..) + .flat_map(|record| record.lines) + .map(trim_end_spaces) + .collect(); + + // XXX warn for weird integer format + // XXX warn for weird floating-point format, etc. 
+ + let mut decoder = Decoder { + encoding, + n_generated_names: 0, + }; + + let mut header_vars = headers.variable.iter().enumerate(); + let mut var_index_map = HashMap::new(); + while let Some((value_index, input)) = header_vars.next() { + let name = trim_end_spaces(input.name.to_string()); + let name = match Identifier::from_encoding(&name, encoding) { + Ok(name) => { + if !dictionary.variables.contains(&name.0) { + name + } else { + let new_name = decoder.generate_name(&dictionary); + warn(Error::DuplicateVariableName { + duplicate_name: name.clone(), + new_name: new_name.clone(), + }); + new_name + } + } + Err(id_error) => { + let new_name = decoder.generate_name(&dictionary); + warn(Error::InvalidVariableName { + id_error, + new_name: new_name.clone(), + }); + new_name + } + }; + let mut variable = Variable::new(name.clone(), VarWidth::from_raw(input.width).unwrap()); + + // Set the short name the same as the long name (even if we renamed it). + variable.short_names = vec![name]; + + variable.label = input.label.clone(); + + variable.missing_values = input.missing_values.clone(); + + variable.print_format = decode_format( + input.print_format, + variable.width, + |new_spec, format_error| { + warn(Error::InvalidPrintFormat { + new_spec, + variable: variable.name.clone(), + format_error, + }) + }, + ); + variable.write_format = decode_format( + input.write_format, + variable.width, + |new_spec, format_error| { + warn(Error::InvalidWriteFormat { + new_spec, + variable: variable.name.clone(), + format_error, + }) + }, + ); + + // Skip long string continuation records. 
+ if input.width > 0 { + #[allow(unstable_name_collisions)] + for _ in 1..input.width.div_ceil(&8) { + if let Some((_, continuation)) = header_vars.next() { + if continuation.width == -1 { + continue; + } + } + return Err(Error::TBD); + } + } + + let dict_index = dictionary.add_var(variable).unwrap(); + assert_eq!(var_index_map.insert(value_index, dict_index), None); + } + + for record in headers.value_label.drain(..) { + let mut dict_indexes = Vec::with_capacity(record.dict_indexes.len()); + let mut continuation_indexes = Vec::new(); + let mut long_string_variables = Vec::new(); + for value_index in record.dict_indexes.iter() { + if let Some(dict_index) = var_index_map.get(&(*value_index as usize - 1)) { + let variable = &dictionary.variables[*dict_index]; + if variable.width.is_long_string() { + long_string_variables.push(variable.name.clone()); + } else { + dict_indexes.push(*dict_index); + } + } else { + continuation_indexes.push(*value_index); + } + } + if !continuation_indexes.is_empty() { + warn(Error::LongStringContinuationIndexes { + offset: record.offsets.start, + indexes: continuation_indexes, + }); + } + if !long_string_variables.is_empty() { + warn(Error::InvalidLongStringValueLabels { + offsets: record.offsets.clone(), + variables: long_string_variables, + }); + } + + for dict_index in dict_indexes { + let mut variable = &dictionary.variables[dict_index]; + for ValueLabel { value, label } in record.labels.iter().cloned() { + + } + } + } + + let metadata = Metadata::decode(&headers, warn); + Ok((dictionary, metadata)) +} + +fn trim_end_spaces(mut s: String) -> String { + s.truncate(s.trim_end_matches(' ').len()); + s +} + +/// Returns a copy of `s` in which all lone CR and CR LF pairs have been +/// replaced by LF. +/// +/// (A product that identifies itself as VOXCO INTERVIEWER 4.3 produces system +/// files that use CR-only line ends in the file label and extra product info.) 
fn fix_line_ends(s: &str) -> String {
    let mut fixed = String::with_capacity(s.len());
    let mut chars = s.chars().peekable();
    while let Some(ch) = chars.next() {
        if ch == '\r' {
            // Swallow the LF of a CR LF pair, if present; either way the
            // carriage return is rewritten as a single LF.
            chars.next_if_eq(&'\n');
            fixed.push('\n');
        } else {
            fixed.push(ch);
        }
    }
    fixed
}

/// Decodes a raw format specification against `width`, falling back to the
/// default format for that width (and reporting the substitution through
/// `warn`) when the raw spec is invalid or incompatible with the variable.
fn decode_format(raw: raw::Spec, width: VarWidth, warn: impl Fn(Format, FormatError)) -> Format {
    let checked = UncheckedFormat::try_from(raw)
        .and_then(Format::try_from)
        .and_then(|format| format.check_width_compatibility(width));
    match checked {
        Ok(format) => format,
        Err(error) => {
            let fallback = Format::default_for_width(width);
            warn(fallback, error);
            fallback
        }
    }
}

/*
impl Decoder {
    fn generate_name(&mut self) -> Identifier {
        loop {
            self.n_generated_names += 1;
            let name = Identifier::new(&format!("VAR{:03}", self.n_generated_names), self.encoding)
                .unwrap();
            if !self.var_names.contains_key(&name) {
                return name;
            }
            assert!(self.n_generated_names < usize::MAX);
        }
    }
    fn decode_string_cow<'a>(&self, input: &'a [u8], warn: &impl Fn(Error)) -> Cow<'a, str> {
        let (output, malformed) = self.encoding.decode_without_bom_handling(input);
        if malformed {
            warn(Error::MalformedString {
                encoding: self.encoding.name().into(),
                text: output.clone().into(),
            });
        }
        output
    }
    fn decode_string(&self, input: &[u8], warn: &impl Fn(Error)) -> String {
        self.decode_string_cow(input, warn).into()
    }
    pub fn decode_identifier(
        &self,
        input: &[u8],
        warn: &impl Fn(Error),
    ) -> Result {
        let s = self.decode_string_cow(input, warn);
        Identifier::new(&s, self.encoding)
    }
    fn get_var_by_index(&self, dict_index: usize) -> Result<&Variable, Error> {
        let max_index = self.n_dict_indexes;
        if dict_index == 0 || dict_index > max_index {
            return Err(Error::InvalidDictIndex {
                dict_index,
                max_index,
            });
        }
        let Some(variable) = self.variables.get(&(dict_index - 1)) else {
            return Err(Error::DictIndexIsContinuation(dict_index));
        };
        Ok(variable)
    }

    /// Returns `input` decoded from `self.encoding`
into UTF-8 such that + /// re-encoding the result back into `self.encoding` will have exactly the + /// same length in bytes. + /// + /// XXX warn about errors? + fn decode_exact_length<'a>(&self, input: &'a [u8]) -> Cow<'a, str> { + if let (s, false) = self.encoding.decode_without_bom_handling(input) { + // This is the common case. Usually there will be no errors. + s + } else { + // Unusual case. Don't bother to optimize it much. + let mut decoder = self.encoding.new_decoder_without_bom_handling(); + let mut output = String::with_capacity( + decoder + .max_utf8_buffer_length_without_replacement(input.len()) + .unwrap(), + ); + let mut rest = input; + while !rest.is_empty() { + match decoder.decode_to_string_without_replacement(rest, &mut output, true) { + (DecoderResult::InputEmpty, _) => break, + (DecoderResult::OutputFull, _) => unreachable!(), + (DecoderResult::Malformed(a, b), consumed) => { + let skipped = a as usize + b as usize; + output.extend(repeat('?').take(skipped)); + rest = &rest[consumed..]; + } + } + } + assert_eq!(self.encoding.encode(&output).0.len(), input.len()); + output.into() + } + } +} + +pub trait TryDecode: Sized { + type Input<'a>; + fn try_decode( + decoder: &mut Decoder, + input: &Self::Input<'_>, + warn: impl Fn(Error), + ) -> Result, Error>; +} + +pub trait Decode: Sized { + fn decode(decoder: &Decoder, input: &Input, warn: impl Fn(Error)) -> Self; +} + +impl Decode> for String { + fn decode(decoder: &Decoder, input: &RawStr, warn: impl Fn(Error)) -> Self { + decoder.decode_string(&input.0, &warn) + } +} +*/ +/* +#[derive(Clone, Debug)] +pub struct HeaderRecord { + pub eye_catcher: String, + pub weight_index: Option, + pub n_cases: Option, + pub creation: NaiveDateTime, + pub file_label: String, +} + +fn trim_end_spaces(mut s: String) -> String { + s.truncate(s.trim_end_matches(' ').len()); + s +} + +/// Data file info that doesn't fit in [Dictionary]. 
+pub struct Metadata { + creation: NaiveDateTime, + endian: Endian, + compression: Option, + n_cases: Option, + product: String, + product_ext: Option, + version: Option<(i32, i32, i32)>, +} + +impl Metadata { + fn decode( + header: &crate::raw::HeaderRecord>, + integer_info: Option<&IntegerInfoRecord>, + product_ext: Option<&ProductInfoRecord>, + warn: impl Fn(Error), + ) -> Self { + let creation_date = NaiveDate::parse_from_str(&header.creation_date, "%e %b %Y") + .unwrap_or_else(|_| { + warn(Error::InvalidCreationDate { + creation_date: header.creation_date.to_string(), + }); + Default::default() + }); + let creation_time = NaiveTime::parse_from_str(&header.creation_time, "%H:%M:%S") + .unwrap_or_else(|_| { + warn(Error::InvalidCreationTime { + creation_time: header.creation_time.to_string(), + }); + Default::default() + }); + let creation = NaiveDateTime::new(creation_date, creation_time); + + let product = header + .eye_catcher + .trim_start_matches("@(#) SPSS DATA FILE") + .trim_end() + .to_string(); + + Self { + creation, + endian: header.endian, + compression: header.compression, + n_cases: header.n_cases.map(|n| n as u64), + product, + product_ext: product_ext.map(|pe| pe.0.clone()), + version: integer_info.map(|ii| ii.version), + } + } +} + +impl TryDecode for HeaderRecord { + type Input<'a> = crate::raw::HeaderRecord>; + + fn try_decode( + _decoder: &mut Decoder, + input: &Self::Input<'_>, + warn: impl Fn(Error), + ) -> Result, Error> { + let eye_catcher = trim_end_spaces(input.eye_catcher.to_string()); + let file_label = trim_end_spaces(input.file_label.to_string()); + let creation_date = NaiveDate::parse_from_str(&input.creation_date, "%e %b %Y") + .unwrap_or_else(|_| { + warn(Error::InvalidCreationDate { + creation_date: input.creation_date.to_string(), + }); + Default::default() + }); + let creation_time = NaiveTime::parse_from_str(&input.creation_time, "%H:%M:%S") + .unwrap_or_else(|_| { + warn(Error::InvalidCreationTime { + creation_time: 
input.creation_time.to_string(), + }); + Default::default() + }); + Ok(Some(HeaderRecord { + eye_catcher, + weight_index: input.weight_index.map(|n| n as usize), + n_cases: input.n_cases.map(|n| n as u64), + creation: NaiveDateTime::new(creation_date, creation_time), + file_label, + })) + } +} + +#[derive(Clone, Debug)] +pub struct VariableRecord { + pub width: VarWidth, + pub name: Identifier, + pub print_format: Spec, + pub write_format: Spec, + pub missing_values: MissingValues, + pub label: Option, +} + + +fn parse_variable_record( + decoder: &mut Decoder, + input: &raw::VariableRecord, String>, + warn: impl Fn(Error), +) -> Result<(), Error> { + let width = match input.width { + 0 => VarWidth::Numeric, + w @ 1..=255 => VarWidth::String(w as u16), + -1 => return Ok(()), + _ => { + return Err(Error::InvalidVariableWidth { + offsets: input.offsets.clone(), + width: input.width, + }) + } + }; + let name = trim_end_spaces(input.name.to_string()); + let name = match Identifier::new(&name, decoder.encoding) { + Ok(name) => { + if !decoder.var_names.contains_key(&name) { + name + } else { + let new_name = decoder.generate_name(); + warn(Error::DuplicateVariableName { + duplicate_name: name.clone(), + new_name: new_name.clone(), + }); + new_name + } + } + Err(id_error) => { + let new_name = decoder.generate_name(); + warn(Error::InvalidVariableName { + id_error, + new_name: new_name.clone(), + }); + new_name + } + }; + let variable = Variable { + dict_index: decoder.n_dict_indexes, + short_name: name.clone(), + long_name: None, + width, + }; + decoder.n_dict_indexes += width.n_dict_indexes(); + assert!(decoder + .var_names + .insert(name.clone(), variable.dict_index) + .is_none()); + assert!(decoder + .variables + .insert(variable.dict_index, variable) + .is_none()); + + let print_format = decode_format(input.print_format, width, |new_spec, format_error| { + warn(Error::InvalidPrintFormat { + new_spec, + variable: name.clone(), + format_error, + }) + }); + let 
write_format = decode_format(input.write_format, width, |new_spec, format_error| { + warn(Error::InvalidWriteFormat { + new_spec, + variable: name.clone(), + format_error, + }) + }); + let mut variable = dictionary::Variable::new(name, width); + variable.print_format = print_format; + variable.write_format = write_format; + variable.missing_values = input.missing_values.clone(); + if let Some(ref label) = input.label { + variable.label = Some(label.to_string()); + } + decoder.dictionary.add_var(variable).unwrap(); + Ok(()) +} + +#[derive(Clone, Debug)] +pub struct DocumentRecord(Vec); + +impl TryDecode for DocumentRecord { + type Input<'a> = crate::raw::DocumentRecord; + + fn try_decode( + decoder: &mut Decoder, + input: &Self::Input<'_>, + warn: impl Fn(Error), + ) -> Result, Error> { + Ok(Some(DocumentRecord( + input + .lines + .iter() + .map(|s| trim_end_spaces(decoder.decode_string(&s.0, &warn))) + .collect(), + ))) + } +} + +trait TextRecord +where + Self: Sized, +{ + const NAME: &'static str; + fn parse(input: &str, warn: impl Fn(Error)) -> Result; +} + +#[derive(Clone, Debug)] +pub struct VariableSet { + pub name: String, + pub vars: Vec, +} + +impl VariableSet { + fn parse(input: &str) -> Result { + let (name, input) = input.split_once('=').ok_or(Error::TBD)?; + let vars = input.split_ascii_whitespace().map(String::from).collect(); + Ok(VariableSet { + name: name.into(), + vars, + }) + } +} + +trait WarnOnError { + fn warn_on_error(self, warn: &F) -> Option; +} +impl WarnOnError for Result { + fn warn_on_error(self, warn: &F) -> Option { + match self { + Ok(result) => Some(result), + Err(error) => { + warn(error); + None + } + } + } +} + +#[derive(Clone, Debug)] +pub struct ValueLabel { + pub value: Value, + pub label: String, +} + +#[derive(Clone, Debug)] +pub struct ValueLabelRecord { + pub var_type: VarType, + pub labels: Vec, + pub variables: Vec, +} + +impl TryDecode for ValueLabelRecord { + type Input<'a> = crate::raw::ValueLabelRecord, RawString>; + 
fn try_decode( + decoder: &mut Decoder, + input: &Self::Input<'_>, + warn: impl Fn(Error), + ) -> Result, Error> { + let variables: Vec<&Variable> = input + .dict_indexes + .iter() + .filter_map(|&dict_index| { + decoder + .get_var_by_index(dict_index as usize) + .warn_on_error(&warn) + }) + .filter(|&variable| match variable.width { + VarWidth::String(width) if width > 8 => { + warn(Error::InvalidLongStringValueLabel( + variable.short_name.clone(), + )); + false + } + _ => true, + }) + .collect(); + let mut i = variables.iter(); + let Some(&first_var) = i.next() else { + return Ok(None); + }; + let var_type: VarType = first_var.width.into(); + for &variable in i { + let this_type: VarType = variable.width.into(); + if var_type != this_type { + let (numeric_var, string_var) = match var_type { + VarType::Numeric => (first_var, variable), + VarType::String => (variable, first_var), + }; + warn(Error::ValueLabelsDifferentTypes { + numeric_var: numeric_var.short_name.clone(), + string_var: string_var.short_name.clone(), + }); + return Ok(None); + } + } + let labels = input + .labels + .iter() + .map(|raw::ValueLabel { value, label }| { + let label = decoder.decode_string(&label.0, &warn); + let value = Value::decode(value, decoder); + ValueLabel { value, label } + }) + .collect(); + let variables = variables + .iter() + .map(|&variable| variable.short_name.clone()) + .collect(); + Ok(Some(ValueLabelRecord { + var_type, + labels, + variables, + })) + } +} + +#[derive(Clone, Debug)] +pub struct VariableSetRecord(Vec); + +impl TextRecord for VariableSetRecord { + const NAME: &'static str = "variable set"; + fn parse(input: &str, warn: impl Fn(Error)) -> Result { + let mut sets = Vec::new(); + for line in input.lines() { + if let Some(set) = VariableSet::parse(line).warn_on_error(&warn) { + sets.push(set) + } + } + Ok(VariableSetRecord(sets)) + } +} + +#[derive(Clone, Debug)] +pub struct LongName { + pub short_name: Identifier, + pub long_name: Identifier, +} + +impl 
LongName { + fn new(decoder: &mut Decoder, short_name: &str, long_name: &str) -> Result { + let short_name = + Identifier::new(short_name, decoder.encoding).map_err(Error::InvalidShortName)?; + let long_name = + Identifier::new(long_name, decoder.encoding).map_err(Error::InvalidLongName)?; + Ok(LongName { + short_name, + long_name, + }) + } +} + +#[derive(Clone, Debug)] +pub struct LongNameRecord(Vec); + +impl LongNameRecord { + pub fn parse(decoder: &mut Decoder, input: &str, warn: impl Fn(Error)) -> Result { + let mut names = Vec::new(); + for pair in input.split('\t').filter(|s| !s.is_empty()) { + if let Some((short_name, long_name)) = pair.split_once('=') { + if let Some(long_name) = + LongName::new(decoder, short_name, long_name).warn_on_error(&warn) + { + names.push(long_name); + } + } else { + warn(Error::TBD) + } + } + Ok(LongNameRecord(names)) + } +} + +#[derive(Clone, Debug)] +pub struct VeryLongString { + pub short_name: Identifier, + pub length: u16, +} + +impl VeryLongString { + fn parse(decoder: &Decoder, input: &str) -> Result { + let Some((short_name, length)) = input.split_once('=') else { + return Err(Error::TBD); + }; + let short_name = + Identifier::new(short_name, decoder.encoding).map_err(Error::InvalidLongStringName)?; + let length: u16 = length.parse().map_err(|_| Error::TBD)?; + if length > VarWidth::MAX_STRING { + return Err(Error::TBD); + } + Ok(VeryLongString { short_name, length }) + } +} + +#[derive(Clone, Debug)] +pub struct VeryLongStringRecord(Vec); + +impl VeryLongStringRecord { + pub fn parse(decoder: &Decoder, input: &str, warn: impl Fn(Error)) -> Result { + let mut very_long_strings = Vec::new(); + for tuple in input + .split('\0') + .map(|s| s.trim_end_matches('\t')) + .filter(|s| !s.is_empty()) + { + if let Some(vls) = VeryLongString::parse(decoder, tuple).warn_on_error(&warn) { + very_long_strings.push(vls) + } + } + Ok(VeryLongStringRecord(very_long_strings)) + } +} + +#[derive(Clone, Debug)] +pub struct Attribute { + pub 
name: Identifier, + pub values: Vec, +} + +impl Attribute { + fn parse<'a>( + decoder: &Decoder, + input: &'a str, + warn: &impl Fn(Error), + ) -> Result<(Option, &'a str), Error> { + let Some((name, mut input)) = input.split_once('(') else { + return Err(Error::TBD); + }; + let mut values = Vec::new(); + loop { + let Some((value, rest)) = input.split_once('\n') else { + return Err(Error::TBD); + }; + if let Some(stripped) = value + .strip_prefix('\'') + .and_then(|value| value.strip_suffix('\'')) + { + values.push(stripped.into()); + } else { + warn(Error::TBD); + values.push(value.into()); + } + if let Some(rest) = rest.strip_prefix(')') { + let attribute = Identifier::new(name, decoder.encoding) + .map_err(Error::InvalidAttributeName) + .warn_on_error(warn) + .map(|name| Attribute { name, values }); + return Ok((attribute, rest)); + }; + input = rest; + } + } +} + +#[derive(Clone, Debug)] +pub struct AttributeSet(pub Vec); + +impl AttributeSet { + fn parse<'a>( + decoder: &Decoder, + mut input: &'a str, + sentinel: Option, + warn: &impl Fn(Error), + ) -> Result<(AttributeSet, &'a str), Error> { + let mut attributes = Vec::new(); + let rest = loop { + match input.chars().next() { + None => break input, + c if c == sentinel => break &input[1..], + _ => { + let (attribute, rest) = Attribute::parse(decoder, input, &warn)?; + if let Some(attribute) = attribute { + attributes.push(attribute); + } + input = rest; + } + } + }; + Ok((AttributeSet(attributes), rest)) + } +} + +#[derive(Clone, Debug)] +pub struct FileAttributeRecord(AttributeSet); + +impl FileAttributeRecord { + pub fn parse(decoder: &Decoder, input: &str, warn: impl Fn(Error)) -> Result { + let (set, rest) = AttributeSet::parse(decoder, input, None, &warn)?; + if !rest.is_empty() { + warn(Error::TBD); + } + Ok(FileAttributeRecord(set)) + } +} + +#[derive(Clone, Debug)] +pub struct VarAttributeSet { + pub long_var_name: Identifier, + pub attributes: AttributeSet, +} + +impl VarAttributeSet { + fn 
parse<'a>( + decoder: &Decoder, + input: &'a str, + warn: &impl Fn(Error), + ) -> Result<(Option, &'a str), Error> { + let Some((long_var_name, rest)) = input.split_once(':') else { + return Err(Error::TBD); + }; + let (attributes, rest) = AttributeSet::parse(decoder, rest, Some('/'), warn)?; + let var_attribute = Identifier::new(long_var_name, decoder.encoding) + .map_err(Error::InvalidAttributeVariableName) + .warn_on_error(warn) + .map(|name| VarAttributeSet { + long_var_name: name, + attributes, + }); + Ok((var_attribute, rest)) + } +} + +#[derive(Clone, Debug)] +pub struct VariableAttributeRecord(Vec); + +impl VariableAttributeRecord { + pub fn parse(decoder: &Decoder, mut input: &str, warn: impl Fn(Error)) -> Result { + let mut var_attribute_sets = Vec::new(); + while !input.is_empty() { + let Some((var_attribute, rest)) = + VarAttributeSet::parse(decoder, input, &warn).warn_on_error(&warn) + else { + break; + }; + if let Some(var_attribute) = var_attribute { + var_attribute_sets.push(var_attribute); + } + input = rest; + } + Ok(VariableAttributeRecord(var_attribute_sets)) + } +} + +#[derive(Clone, Debug)] +pub enum MultipleResponseType { + MultipleDichotomy { + value: Value, + labels: CategoryLabels, + }, + MultipleCategory, +} + +impl MultipleResponseType { + fn decode( + decoder: &Decoder, + mr_set: &Identifier, + input: &raw::MultipleResponseType, + min_width: VarWidth, + warn: &impl Fn(Error), + ) -> Result { + let mr_type = match input { + raw::MultipleResponseType::MultipleDichotomy { value, labels } => { + let value = decoder.decode_string_cow(&value.0, warn); + let value = match min_width { + VarWidth::Numeric => { + let number: f64 = value.trim().parse().map_err(|_| { + Error::InvalidMDGroupCountedValue { + mr_set: mr_set.clone(), + number: value.into(), + } + })?; + Value::Number(Some(number.into())) + } + VarWidth::String(max_width) => { + let value = value.trim_end_matches(' '); + let width = value.len(); + if width > max_width as usize { + 
return Err(Error::TooWideMDGroupCountedValue { + mr_set: mr_set.clone(), + value: value.into(), + width, + max_width, + }); + }; + Value::String(value.into()) + } + }; + MultipleResponseType::MultipleDichotomy { + value, + labels: *labels, + } + } + raw::MultipleResponseType::MultipleCategory => MultipleResponseType::MultipleCategory, + }; + Ok(mr_type) + } +} + +#[derive(Clone, Debug)] +pub struct MultipleResponseSet { + pub name: Identifier, + pub min_width: VarWidth, + pub max_width: VarWidth, + pub label: String, + pub mr_type: MultipleResponseType, + pub dict_indexes: Vec, +} + +impl MultipleResponseSet { + fn decode( + decoder: &Decoder, + input: &raw::MultipleResponseSet>, + warn: &impl Fn(Error), + ) -> Result { + let mr_set_name = input.name.clone(); + let mut dict_indexes = Vec::with_capacity(input.short_names.len()); + for short_name in input.short_names.iter() { + let Some(&dict_index) = decoder.var_names.get(&short_name) else { + warn(Error::UnknownMrSetVariable { + mr_set: mr_set_name.clone(), + short_name: short_name.clone(), + }); + continue; + }; + dict_indexes.push(dict_index); + } + + match dict_indexes.len() { + 0 => return Err(Error::EmptyMrSet(mr_set_name)), + 1 => return Err(Error::OneVarMrSet(mr_set_name)), + _ => (), + } + + let Some((Some(min_width), Some(max_width))) = dict_indexes + .iter() + .map(|dict_index| decoder.variables[dict_index].width) + .map(|w| (Some(w), Some(w))) + .reduce(|(na, wa), (nb, wb)| (VarWidth::narrower(na, nb), VarWidth::wider(wa, wb))) + else { + return Err(Error::MixedMrSet(mr_set_name)); + }; + + let mr_type = + MultipleResponseType::decode(decoder, &mr_set_name, &input.mr_type, min_width, warn)?; + + Ok(MultipleResponseSet { + name: mr_set_name, + min_width, + max_width, + label: input.label.to_string(), + mr_type, + dict_indexes, + }) + } +} + +#[derive(Clone, Debug)] +pub struct MultipleResponseRecord(pub Vec); + +impl TryDecode for MultipleResponseRecord { + type Input<'a> = raw::MultipleResponseRecord>; + 
+ fn try_decode( + decoder: &mut Decoder, + input: &Self::Input<'_>, + warn: impl Fn(Error), + ) -> Result, Error> { + let mut sets = Vec::with_capacity(input.0.len()); + for set in &input.0 { + match MultipleResponseSet::decode(decoder, set, &warn) { + Ok(set) => sets.push(set), + Err(error) => warn(error), + } + } + Ok(Some(MultipleResponseRecord(sets))) + } +} + +#[derive(Clone, Debug)] +pub struct LongStringValueLabels { + pub var_name: Identifier, + pub width: VarWidth, + pub labels: Vec, +} + +impl LongStringValueLabels { + fn decode( + decoder: &Decoder, + input: &raw::LongStringValueLabels, + warn: &impl Fn(Error), + ) -> Result { + let var_name = decoder.decode_string(&input.var_name.0, warn); + let var_name = Identifier::new(var_name.trim_end(), decoder.encoding) + .map_err(Error::InvalidLongStringValueLabelName)?; + + let min_width = 9; + let max_width = VarWidth::MAX_STRING; + if input.width < 9 || input.width > max_width as u32 { + return Err(Error::InvalidLongValueLabelWidth { + name: var_name, + width: input.width, + min_width, + max_width, + }); + } + let width = input.width as u16; + + let mut labels = Vec::with_capacity(input.labels.len()); + for (value, label) in input.labels.iter() { + let value = Value::String(decoder.decode_exact_length(&value.0).into()); + let label = decoder.decode_string(&label.0, warn); + labels.push(ValueLabel { value, label }); + } + + Ok(LongStringValueLabels { + var_name, + width: VarWidth::String(width), + labels, + }) + } +} + +#[derive(Clone, Debug)] +pub struct LongStringValueLabelRecord(pub Vec); + +impl TryDecode for LongStringValueLabelRecord { + type Input<'a> = raw::LongStringValueLabelRecord; + + fn try_decode( + decoder: &mut Decoder, + input: &Self::Input<'_>, + warn: impl Fn(Error), + ) -> Result, Error> { + let mut labels = Vec::with_capacity(input.0.len()); + for label in &input.0 { + match LongStringValueLabels::decode(decoder, label, &warn) { + Ok(set) => labels.push(set), + Err(error) => warn(error), 
+ } + } + Ok(Some(LongStringValueLabelRecord(labels))) + } +} + +#[cfg(test)] +mod test { + use encoding_rs::WINDOWS_1252; + + #[test] + fn test() { + let mut s = String::new(); + s.push(char::REPLACEMENT_CHARACTER); + let encoded = WINDOWS_1252.encode(&s).0; + let decoded = WINDOWS_1252.decode(&encoded[..]).0; + println!("{:?}", decoded); + } + + #[test] + fn test2() { + let charset: Vec = (0..=255).collect(); + println!("{}", charset.len()); + let decoded = WINDOWS_1252.decode(&charset[..]).0; + println!("{}", decoded.len()); + let encoded = WINDOWS_1252.encode(&decoded[..]).0; + println!("{}", encoded.len()); + assert_eq!(&charset[..], &encoded[..]); + } +} +*/ diff --git a/rust/pspp/src/dictionary.rs b/rust/pspp/src/dictionary.rs new file mode 100644 index 0000000000..c26009921b --- /dev/null +++ b/rust/pspp/src/dictionary.rs @@ -0,0 +1,530 @@ +use std::{ + cmp::Ordering, + collections::{HashMap, HashSet}, + fmt::Debug, + ops::{Bound, RangeBounds}, +}; + +use encoding_rs::Encoding; +use indexmap::IndexSet; +use num::integer::div_ceil; +use ordered_float::OrderedFloat; +use unicase::UniCase; + +use crate::{ + format::Format, + identifier::{ByIdentifier, HasIdentifier, Identifier}, + raw::{self, Alignment, CategoryLabels, Decoder, Measure, MissingValues, RawStr, VarType}, +}; + +pub type DictIndex = usize; + +#[derive(Copy, Clone, Debug, PartialEq, Eq)] +pub enum VarWidth { + Numeric, + String(u16), +} + +impl PartialOrd for VarWidth { + fn partial_cmp(&self, other: &Self) -> Option { + match (self, other) { + (VarWidth::Numeric, VarWidth::Numeric) => Some(Ordering::Equal), + (VarWidth::String(a), VarWidth::String(b)) => Some(a.cmp(b)), + _ => None, + } + } +} + +impl VarWidth { + pub const MAX_STRING: u16 = 32767; + + pub fn n_dict_indexes(self) -> usize { + match self { + VarWidth::Numeric => 1, + VarWidth::String(w) => div_ceil(w as usize, 8), + } + } + + fn width_predicate( + a: Option, + b: Option, + f: impl Fn(u16, u16) -> u16, + ) -> Option { + match (a, 
b) { + (Some(VarWidth::Numeric), Some(VarWidth::Numeric)) => Some(VarWidth::Numeric), + (Some(VarWidth::String(a)), Some(VarWidth::String(b))) => { + Some(VarWidth::String(f(a, b))) + } + _ => None, + } + } + + /// Returns the wider of `self` and `other`: + /// - Numerical variable widths are equally wide. + /// - Longer strings are wider than shorter strings. + /// - Numerical and string types are incomparable, so result in `None`. + /// - Any `None` in the input yields `None` in the output. + pub fn wider(a: Option, b: Option) -> Option { + Self::width_predicate(a, b, |a, b| a.max(b)) + } + + /// Returns the narrower of `self` and `other` (see [`Self::wider`]). + pub fn narrower(a: Option, b: Option) -> Option { + Self::width_predicate(a, b, |a, b| a.min(b)) + } + + pub fn default_display_width(&self) -> u32 { + match self { + VarWidth::Numeric => 8, + VarWidth::String(width) => *width.min(&32) as u32, + } + } + + pub fn from_raw(raw: impl Into) -> Result { + let raw: i32 = raw.into(); + match raw { + 0 => Ok(Self::Numeric), + 1..=255 => Ok(Self::String(raw as u16)), + _ => Err(()), + } + } + + pub fn is_long_string(&self) -> bool { + if let Self::String(width) = self { + *width > 8 + } else { + false + } + } +} + +impl From for VarType { + fn from(source: VarWidth) -> Self { + match source { + VarWidth::Numeric => VarType::Numeric, + VarWidth::String(_) => VarType::String, + } + } +} + +#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub enum Value { + Number(Option>), + String(String), +} + +impl Value { + pub fn decode(raw: &raw::Value>, decoder: &Decoder) -> Self { + match raw { + raw::Value::Number(x) => Value::Number(x.map(|x| x.into())), + raw::Value::String(s) => Value::String(decoder.decode_exact_length(&s.0).into()), + } + } +} + +#[derive(Clone, Debug)] +pub struct Dictionary { + pub variables: IndexSet>, + pub split_file: Vec, + pub weight: Option, + pub filter: Option, + pub case_limit: Option, + pub file_label: Option, + pub 
documents: Vec, + pub vectors: HashSet>, + pub attributes: HashMap>, + pub mrsets: HashSet>, + pub variable_sets: HashSet>, + pub encoding: &'static Encoding, +} + +#[derive(Debug)] +pub struct DuplicateVariableName; + +impl Dictionary { + pub fn new(encoding: &'static Encoding) -> Self { + Self { + variables: IndexSet::new(), + split_file: Vec::new(), + weight: None, + filter: None, + case_limit: None, + file_label: None, + documents: Vec::new(), + vectors: HashSet::new(), + attributes: HashMap::new(), + mrsets: HashSet::new(), + variable_sets: HashSet::new(), + encoding, + } + } + + pub fn add_var(&mut self, variable: Variable) -> Result { + let (index, inserted) = self.variables.insert_full(ByIdentifier::new(variable)); + if inserted { + Ok(index) + } else { + Err(DuplicateVariableName) + } + } + + pub fn reorder_var(&mut self, from_index: DictIndex, to_index: DictIndex) { + if from_index != to_index { + self.variables.move_index(from_index, to_index); + self.update_dict_indexes(&|index| { + #[allow(clippy::collapsible_else_if)] + if index == from_index { + Some(to_index) + } else if from_index < to_index { + if index > from_index && index <= to_index { + Some(index - 1) + } else { + Some(index) + } + } else { + if index >= to_index && index < from_index { + Some(index + 1) + } else { + Some(index) + } + } + }) + } + } + + pub fn retain_vars(&mut self, keep: F) + where + F: Fn(&Variable) -> bool, + { + let mut deleted = Vec::new(); + let mut index = 0; + self.variables.retain(|var_by_id| { + let keep = keep(&var_by_id.0); + if !keep { + deleted.push(index); + } + index += 1; + keep + }); + if !deleted.is_empty() { + self.update_dict_indexes(&|index| match deleted.binary_search(&index) { + Ok(_) => None, + Err(position) => Some(position), + }) + } + } + + pub fn delete_vars(&mut self, range: R) + where + R: RangeBounds, + { + let start = match range.start_bound() { + Bound::Included(&start) => start, + Bound::Excluded(&start) => start + 1, + Bound::Unbounded => 
0, + }; + let end = match range.end_bound() { + Bound::Included(&end) => end + 1, + Bound::Excluded(&end) => end, + Bound::Unbounded => self.variables.len(), + }; + if end > start { + self.variables.drain(start..end); + self.update_dict_indexes(&|index| { + if index < start { + Some(index) + } else if index < end { + None + } else { + Some(index - end - start) + } + }) + } + } + + fn update_dict_indexes(&mut self, f: &F) + where + F: Fn(DictIndex) -> Option, + { + update_dict_index_vec(&mut self.split_file, f); + self.weight = self.weight.and_then(f); + self.filter = self.filter.and_then(f); + self.vectors = self + .vectors + .drain() + .filter_map(|vector_by_id| { + vector_by_id + .0 + .with_updated_dict_indexes(f) + .map(ByIdentifier::new) + }) + .collect(); + self.mrsets = self + .mrsets + .drain() + .filter_map(|mrset_by_id| { + mrset_by_id + .0 + .with_updated_dict_indexes(f) + .map(ByIdentifier::new) + }) + .collect(); + self.variable_sets = self + .variable_sets + .drain() + .filter_map(|var_set_by_id| { + var_set_by_id + .0 + .with_updated_dict_indexes(f) + .map(ByIdentifier::new) + }) + .collect(); + } +} + +fn update_dict_index_vec(dict_indexes: &mut Vec, f: F) +where + F: Fn(DictIndex) -> Option, +{ + dict_indexes.retain_mut(|index| { + if let Some(new) = f(*index) { + *index = new; + true + } else { + false + } + }); +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq, Hash)] +pub enum Role { + Input, + Target, + Both, + None, + Partition, + Split, +} + +impl Default for Role { + fn default() -> Self { + Self::Input + } +} + +pub enum DictClass { + Ordinary, + System, + Scratch, +} + +impl DictClass { + pub fn from_identifier(id: &Identifier) -> Self { + if id.0.starts_with('$') { + Self::System + } else if id.0.starts_with('#') { + Self::Scratch + } else { + Self::Ordinary + } + } + + pub fn must_leave(self) -> bool { + match self { + DictClass::Ordinary => false, + DictClass::System => false, + DictClass::Scratch => true, + } + } +} + +#[derive(Clone, 
Debug)] +pub struct Variable { + pub name: Identifier, + pub width: VarWidth, + pub missing_values: MissingValues, + pub print_format: Format, + pub write_format: Format, + pub value_labels: HashMap, + pub label: Option, + pub measure: Option, + pub role: Role, + pub display_width: u32, + pub alignment: Alignment, + pub leave: bool, + pub short_names: Vec, + pub attributes: HashSet>, +} + +impl Variable { + pub fn new(name: Identifier, width: VarWidth) -> Self { + let var_type = VarType::from_width(width); + let leave = DictClass::from_identifier(&name).must_leave(); + Self { + name, + width, + missing_values: MissingValues::default(), + print_format: Format::default_for_width(width), + write_format: Format::default_for_width(width), + value_labels: HashMap::new(), + label: None, + measure: Measure::default_for_type(var_type), + role: Role::default(), + display_width: width.default_display_width(), + alignment: Alignment::default_for_type(var_type), + leave, + short_names: Vec::new(), + attributes: HashSet::new(), + } + } +} + +impl HasIdentifier for Variable { + fn identifier(&self) -> &UniCase { + &self.name.0 + } +} + +#[derive(Clone, Debug)] +pub struct Vector { + pub name: Identifier, + pub variables: Vec, +} + +impl Vector { + fn with_updated_dict_indexes( + mut self, + f: impl Fn(DictIndex) -> Option, + ) -> Option { + update_dict_index_vec(&mut self.variables, f); + (!self.variables.is_empty()).then_some(self) + } +} + +impl HasIdentifier for Vector { + fn identifier(&self) -> &UniCase { + &self.name.0 + } +} + +#[derive(Clone, Debug)] +pub struct Attribute { + pub name: Identifier, + pub values: Vec, +} + +impl HasIdentifier for Attribute { + fn identifier(&self) -> &UniCase { + &self.name.0 + } +} + +#[derive(Clone, Debug)] +pub struct MultipleResponseSet { + pub name: Identifier, + pub label: String, + pub mr_type: MultipleResponseType, + pub variables: Vec, +} + +impl MultipleResponseSet { + fn with_updated_dict_indexes( + mut self, + f: impl 
Fn(DictIndex) -> Option, + ) -> Option { + update_dict_index_vec(&mut self.variables, f); + (self.variables.len() > 1).then_some(self) + } +} + +impl HasIdentifier for MultipleResponseSet { + fn identifier(&self) -> &UniCase { + &self.name.0 + } +} + +#[derive(Clone, Debug)] +pub enum MultipleResponseType { + MultipleDichotomy { + value: Value, + labels: CategoryLabels, + }, + MultipleCategory, +} + +#[derive(Clone, Debug)] +pub struct VariableSet { + pub name: Identifier, + pub variables: Vec, +} + +impl VariableSet { + fn with_updated_dict_indexes( + mut self, + f: impl Fn(DictIndex) -> Option, + ) -> Option { + update_dict_index_vec(&mut self.variables, f); + (!self.variables.is_empty()).then_some(self) + } +} + +impl HasIdentifier for VariableSet { + fn identifier(&self) -> &UniCase { + &self.name.0 + } +} + +#[cfg(test)] +mod test { + use std::collections::HashSet; + + use unicase::UniCase; + + use crate::identifier::Identifier; + + use super::{ByIdentifier, HasIdentifier}; + + #[derive(PartialEq, Eq, Debug, Clone)] + struct Variable { + name: Identifier, + value: i32, + } + + impl HasIdentifier for Variable { + fn identifier(&self) -> &UniCase { + &self.name.0 + } + } + + #[test] + fn test() { + // Variables should not be the same if their values differ. + let abcd = Identifier::new("abcd").unwrap(); + let abcd1 = Variable { + name: abcd.clone(), + value: 1, + }; + let abcd2 = Variable { + name: abcd, + value: 2, + }; + assert_ne!(abcd1, abcd2); + + // But `ByName` should treat them the same. + let abcd1_by_name = ByIdentifier::new(abcd1); + let abcd2_by_name = ByIdentifier::new(abcd2); + assert_eq!(abcd1_by_name, abcd2_by_name); + + // And a `HashSet` of `ByName` should also treat them the same. 
+ let mut vars: HashSet> = HashSet::new(); + assert!(vars.insert(ByIdentifier::new(abcd1_by_name.0.clone()))); + assert!(!vars.insert(ByIdentifier::new(abcd2_by_name.0.clone()))); + assert_eq!( + vars.get(&UniCase::new(String::from("abcd"))) + .unwrap() + .0 + .value, + 1 + ); + } +} diff --git a/rust/pspp/src/encoding.rs b/rust/pspp/src/encoding.rs new file mode 100644 index 0000000000..aaed5fd4ca --- /dev/null +++ b/rust/pspp/src/encoding.rs @@ -0,0 +1,64 @@ +use crate::locale_charset::locale_charset; +use encoding_rs::{Encoding, UTF_8}; + +include!(concat!(env!("OUT_DIR"), "/encodings.rs")); + +pub fn codepage_from_encoding(encoding: &str) -> Option { + CODEPAGE_NAME_TO_NUMBER + .get(encoding.to_ascii_lowercase().as_str()) + .copied() +} + +use thiserror::Error as ThisError; + +#[derive(ThisError, Debug)] +pub enum Error { + #[error("This system file does not indicate its own character encoding. For best results, specify an encoding explicitly. Use SYSFILE INFO with ENCODING=\"DETECT\" to analyze the possible encodings.")] + NoEncoding, + + #[error("This system file encodes text strings with unknown code page {0}.")] + UnknownCodepage(i32), + + #[error("This system file encodes text strings with unknown encoding {0}.")] + UnknownEncoding(String), + + #[error("This system file is encoded in EBCDIC, which is not supported.")] + Ebcdic, +} + +pub fn default_encoding() -> &'static Encoding { + lazy_static! { + static ref DEFAULT_ENCODING: &'static Encoding = + Encoding::for_label(locale_charset().as_bytes()).unwrap_or(UTF_8); + } + &DEFAULT_ENCODING +} + +pub fn get_encoding( + encoding: Option<&str>, + character_code: Option, +) -> Result<&'static Encoding, Error> { + let label = if let Some(encoding) = encoding { + encoding + } else if let Some(codepage) = character_code { + match codepage { + 1 => return Err(Error::Ebcdic), + 2 | 3 => { + // These ostensibly mean "7-bit ASCII" and "8-bit ASCII"[sic] + // respectively. 
However, many files have character code 2 but + // data which are clearly not ASCII. Therefore, ignore these + // values. + return Err(Error::NoEncoding); + } + 4 => "MS_KANJI", + _ => CODEPAGE_NUMBER_TO_NAME + .get(&codepage) + .copied() + .ok_or(Error::UnknownCodepage(codepage))?, + } + } else { + return Err(Error::NoEncoding); + }; + + Encoding::for_label(label.as_bytes()).ok_or(Error::UnknownEncoding(label.into())) +} diff --git a/rust/pspp/src/endian.rs b/rust/pspp/src/endian.rs new file mode 100644 index 0000000000..dd89a6cc1d --- /dev/null +++ b/rust/pspp/src/endian.rs @@ -0,0 +1,168 @@ +/// The endianness for integer and floating-point numbers in SPSS system files. +/// +/// SPSS system files can declare IBM 370 and DEC VAX floating-point +/// representations, but no file that uses either of these has ever been found +/// in the wild, so this code does not handle them. +#[derive(Copy, Clone, Debug, PartialEq, Eq)] +pub enum Endian { + /// Big-endian: MSB at lowest address. + Big, + + /// Little-endian: LSB at lowest address. 
/// The endianness for integer and floating-point numbers in SPSS system files.
///
/// SPSS system files can declare IBM 370 and DEC VAX floating-point
/// representations, but no file that uses either of these has ever been found
/// in the wild, so this code does not handle them.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum Endian {
    /// Big-endian: MSB at lowest address.
    Big,

    /// Little-endian: LSB at lowest address.
    Little,
}

impl Endian {
    /// The byte order of the machine this code runs on.
    #[cfg(target_endian = "big")]
    pub const NATIVE: Endian = Endian::Big;
    #[cfg(target_endian = "little")]
    pub const NATIVE: Endian = Endian::Little;

    /// Returns the byte order under which `bytes` decodes to
    /// `expected_value`, or `None` if neither (or both, i.e. the value is
    /// palindromic) order matches.
    pub fn identify_u32(expected_value: u32, bytes: [u8; 4]) -> Option<Self> {
        let as_big: u32 = Endian::Big.parse(bytes);
        let as_little: u32 = Endian::Little.parse(bytes);
        match (as_big == expected_value, as_little == expected_value) {
            (true, false) => Some(Endian::Big),
            (false, true) => Some(Endian::Little),
            _ => None,
        }
    }

    /// As [Self::identify_u32], for a `f64` value.
    pub fn identify_f64(expected_value: f64, bytes: [u8; 8]) -> Option<Self> {
        let as_big: f64 = Endian::Big.parse(bytes);
        let as_little: f64 = Endian::Little.parse(bytes);
        match (as_big == expected_value, as_little == expected_value) {
            (true, false) => Some(Endian::Big),
            (false, true) => Some(Endian::Little),
            _ => None,
        }
    }
}

/// Encodes a `T` into its `N`-byte representation in this byte order.
pub trait ToBytes<T, const N: usize> {
    fn to_bytes(self, value: T) -> [u8; N];
}
impl ToBytes<i64, 8> for Endian {
    fn to_bytes(self, value: i64) -> [u8; 8] {
        match self {
            Endian::Big => i64::to_be_bytes(value),
            Endian::Little => i64::to_le_bytes(value),
        }
    }
}
impl ToBytes<u32, 4> for Endian {
    fn to_bytes(self, value: u32) -> [u8; 4] {
        match self {
            Endian::Big => u32::to_be_bytes(value),
            Endian::Little => u32::to_le_bytes(value),
        }
    }
}
impl ToBytes<i32, 4> for Endian {
    fn to_bytes(self, value: i32) -> [u8; 4] {
        match self {
            Endian::Big => i32::to_be_bytes(value),
            Endian::Little => i32::to_le_bytes(value),
        }
    }
}
impl ToBytes<u16, 2> for Endian {
    fn to_bytes(self, value: u16) -> [u8; 2] {
        match self {
            Endian::Big => u16::to_be_bytes(value),
            Endian::Little => u16::to_le_bytes(value),
        }
    }
}
impl ToBytes<u8, 1> for Endian {
    fn to_bytes(self, value: u8) -> [u8; 1] {
        // A single byte is the same in either byte order.
        [value]
    }
}
impl ToBytes<f64, 8> for Endian {
    fn to_bytes(self, value: f64) -> [u8; 8] {
        match self {
            Endian::Big => f64::to_be_bytes(value),
            Endian::Little => f64::to_le_bytes(value),
        }
    }
}

/// Parses an `N`-byte slice in one of the supported formats into native format
/// as type `T`.
pub trait Parse<T, const N: usize> {
    /// Given 'bytes', returns `T`.
    fn parse(self, bytes: [u8; N]) -> T;
}
impl Parse<u64, 8> for Endian {
    fn parse(self, bytes: [u8; 8]) -> u64 {
        match self {
            Endian::Big => u64::from_be_bytes(bytes),
            Endian::Little => u64::from_le_bytes(bytes),
        }
    }
}
impl Parse<u32, 4> for Endian {
    fn parse(self, bytes: [u8; 4]) -> u32 {
        match self {
            Endian::Big => u32::from_be_bytes(bytes),
            Endian::Little => u32::from_le_bytes(bytes),
        }
    }
}
impl Parse<u16, 2> for Endian {
    fn parse(self, bytes: [u8; 2]) -> u16 {
        match self {
            Endian::Big => u16::from_be_bytes(bytes),
            Endian::Little => u16::from_le_bytes(bytes),
        }
    }
}
impl Parse<u8, 1> for Endian {
    fn parse(self, bytes: [u8; 1]) -> u8 {
        match self {
            Endian::Big => u8::from_be_bytes(bytes),
            Endian::Little => u8::from_le_bytes(bytes),
        }
    }
}
impl Parse<i64, 8> for Endian {
    fn parse(self, bytes: [u8; 8]) -> i64 {
        match self {
            Endian::Big => i64::from_be_bytes(bytes),
            Endian::Little => i64::from_le_bytes(bytes),
        }
    }
}
impl Parse<i32, 4> for Endian {
    fn parse(self, bytes: [u8; 4]) -> i32 {
        match self {
            Endian::Big => i32::from_be_bytes(bytes),
            Endian::Little => i32::from_le_bytes(bytes),
        }
    }
}
impl Parse<i16, 2> for Endian {
    fn parse(self, bytes: [u8; 2]) -> i16 {
        match self {
            Endian::Big => i16::from_be_bytes(bytes),
            Endian::Little => i16::from_le_bytes(bytes),
        }
    }
}
impl Parse<i8, 1> for Endian {
    fn parse(self, bytes: [u8; 1]) -> i8 {
        match self {
            Endian::Big => i8::from_be_bytes(bytes),
            Endian::Little => i8::from_le_bytes(bytes),
        }
    }
}
impl Parse<f64, 8> for Endian {
    fn parse(self, bytes: [u8; 8]) -> f64 {
        match self {
            Endian::Big => f64::from_be_bytes(bytes),
            Endian::Little => f64::from_le_bytes(bytes),
        }
    }
}
lex::{lexer::{Lexer, Source}, token::Token}, + message::Diagnostic, +}; + +pub struct Engine { + lexer: Lexer, +} + +impl Engine { + fn new() -> Self { + Self { + lexer: Lexer::new(Box::new(|location, error| println!("{location}: {error}"))), + } + } + fn run(&mut self, source: Source) { + self.lexer.append(source); + self.lexer.get(); + while self.lexer.token() != &Token::End { + let error: Box = Box::new(|diagnostic| { + println!("{diagnostic}"); + }); + parse(&mut self.lexer, &error); + } + } +} + +#[cfg(test)] +mod tests { + use encoding_rs::UTF_8; + + use crate::lex::{ + lexer::{ErrorHandling, Source}, + segment::Mode, + }; + + use super::Engine; + + #[test] + fn test_echo() { + let mut engine = Engine::new(); + engine.run(Source::for_file_contents( + "ECHO 'hi there'.\nECHO 'bye there'.\n".to_string(), + Some("test.sps".to_string()), + UTF_8, + Mode::default(), + ErrorHandling::default(), + )); + } +} diff --git a/rust/pspp/src/format.rs b/rust/pspp/src/format.rs new file mode 100644 index 0000000000..bafdf2726c --- /dev/null +++ b/rust/pspp/src/format.rs @@ -0,0 +1,658 @@ +use std::{ + fmt::{Display, Formatter, Result as FmtResult}, + ops::RangeInclusive, +}; + +use enum_map::{Enum, EnumMap}; +use thiserror::Error as ThisError; + +use crate::{ + dictionary::VarWidth, + raw::{self, VarType}, +}; + +#[derive(ThisError, Debug)] +pub enum Error { + #[error("Unknown format type {value}.")] + UnknownFormat { value: u16 }, + + #[error("Output format {0} specifies width {}, but {} requires an even width.", .0.w, .0.type_)] + OddWidthNotAllowed(UncheckedFormat), + + #[error("Output format {0} specifies width {}, but {} requires a width between {} and {}.", .0.w, .0.type_, .0.type_.min_width(), .0.type_.max_width())] + BadWidth(UncheckedFormat), + + #[error("Output format {0} specifies decimal places, but {} format does not allow any decimals.", .0.type_)] + DecimalsNotAllowedForFormat(UncheckedFormat), + + #[error("Output format {0} specifies {} decimal places, but 
with a width of {}, {} does not allow any decimal places.", .0.d, .0.w, .0.type_)] + DecimalsNotAllowedForWidth(UncheckedFormat), + + #[error("Output format {spec} specifies {} decimal places but, with a width of {}, {} allows at most {max_d} decimal places.", .spec.d, .spec.w, .spec.type_)] + TooManyDecimalsForWidth { + spec: UncheckedFormat, + max_d: Decimals, + }, + + #[error("String variable is not compatible with numeric format {0}.")] + UnnamedVariableNotCompatibleWithNumericFormat(Type), + + #[error("Numeric variable is not compatible with string format {0}.")] + UnnamedVariableNotCompatibleWithStringFormat(Type), + + #[error("String variable {variable} with width {width} is not compatible with format {bad_spec}. Use format {good_spec} instead.")] + NamedStringVariableBadSpecWidth { + variable: String, + width: Width, + bad_spec: Format, + good_spec: Format, + }, + + #[error("String variable with width {width} is not compatible with format {bad_spec}. Use format {good_spec} instead.")] + UnnamedStringVariableBadSpecWidth { + width: Width, + bad_spec: Format, + good_spec: Format, + }, +} + +#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] +pub enum Category { + // Numeric formats. + Basic, + Custom, + Legacy, + Binary, + Hex, + Date, + Time, + DateComponent, + + // String formats. 
+ String, +} + +impl From for Category { + fn from(source: Type) -> Self { + match source { + Type::F | Type::Comma | Type::Dot | Type::Dollar | Type::Pct | Type::E => Self::Basic, + Type::CC(_) => Self::Custom, + Type::N | Type::Z => Self::Legacy, + Type::P | Type::PK | Type::IB | Type::PIB | Type::RB => Self::Binary, + Type::PIBHex | Type::RBHex => Self::Hex, + Type::Date + | Type::ADate + | Type::EDate + | Type::JDate + | Type::SDate + | Type::QYr + | Type::MoYr + | Type::WkYr + | Type::DateTime + | Type::YMDHMS => Self::Date, + Type::MTime | Type::Time | Type::DTime => Self::Time, + Type::WkDay | Type::Month => Self::DateComponent, + Type::A | Type::AHex => Self::String, + } + } +} + +#[derive(Copy, Clone, Debug, Enum, PartialEq, Eq, Hash)] +pub enum CC { + A, + B, + C, + D, + E, +} + +impl Display for CC { + fn fmt(&self, f: &mut Formatter) -> FmtResult { + let s = match self { + CC::A => "A", + CC::B => "B", + CC::C => "C", + CC::D => "D", + CC::E => "E", + }; + write!(f, "{}", s) + } +} + +#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] +pub enum Type { + // Basic numeric formats. + F, + Comma, + Dot, + Dollar, + Pct, + E, + + // Custom currency formats. + CC(CC), + + // Legacy numeric formats. + N, + Z, + + // Binary and hexadecimal formats. + P, + PK, + IB, + PIB, + PIBHex, + RB, + RBHex, + + // Time and date formats. + Date, + ADate, + EDate, + JDate, + SDate, + QYr, + MoYr, + WkYr, + DateTime, + YMDHMS, + MTime, + Time, + DTime, + + // Date component formats. + WkDay, + Month, + + // String formats. + A, + AHex, +} + +pub type Width = u16; +pub type SignedWidth = i16; + +pub type Decimals = u8; + +impl Type { + pub fn max_width(self) -> Width { + match self { + Self::P | Self::PK | Self::PIBHex | Self::RBHex => 16, + Self::IB | Self::PIB | Self::RB => 8, + Self::A => 32767, + Self::AHex => 32767 * 2, + _ => 40, + } + } + + pub fn min_width(self) -> Width { + match self { + // Basic numeric formats. 
+ Self::F => 1, + Self::Comma => 1, + Self::Dot => 1, + Self::Dollar => 2, + Self::Pct => 2, + Self::E => 6, + + // Custom currency formats. + Self::CC(_) => 2, + + // Legacy numeric formats. + Self::N => 1, + Self::Z => 1, + + // Binary and hexadecimal formats. + Self::P => 1, + Self::PK => 1, + Self::IB => 1, + Self::PIB => 1, + Self::PIBHex => 2, + Self::RB => 2, + Self::RBHex => 4, + + // Time and date formats. + Self::Date => 9, + Self::ADate => 8, + Self::EDate => 8, + Self::JDate => 5, + Self::SDate => 8, + Self::QYr => 6, + Self::MoYr => 6, + Self::WkYr => 8, + Self::DateTime => 17, + Self::YMDHMS => 16, + Self::MTime => 5, + Self::Time => 5, + Self::DTime => 8, + + // Date component formats. + Self::WkDay => 2, + Self::Month => 3, + + // String formats. + Self::A => 1, + Self::AHex => 2, + } + } + + pub fn width_range(self) -> RangeInclusive { + self.min_width()..=self.max_width() + } + + pub fn max_decimals(self, width: Width) -> Decimals { + let width = width.clamp(1, 40) as SignedWidth; + let max = match self { + Self::F | Self::Comma | Self::Dot | Self::CC(_) => width - 1, + Self::Dollar | Self::Pct => width - 2, + Self::E => width - 7, + Self::N | Self::Z => width, + Self::P => width * 2 - 1, + Self::PK => width * 2, + Self::IB | Self::PIB => max_digits_for_bytes(width as usize) as SignedWidth, + Self::PIBHex => 0, + Self::RB | Self::RBHex => 16, + Self::Date + | Self::ADate + | Self::EDate + | Self::JDate + | Self::SDate + | Self::QYr + | Self::MoYr + | Self::WkYr => 0, + Self::DateTime => width - 21, + Self::YMDHMS => width - 20, + Self::MTime => width - 6, + Self::Time => width - 9, + Self::DTime => width - 12, + Self::WkDay | Self::Month | Self::A | Self::AHex => 0, + }; + max.clamp(0, 16) as Decimals + } + + pub fn takes_decimals(self) -> bool { + self.max_decimals(Width::MAX) > 0 + } + + pub fn category(self) -> Category { + self.into() + } + + pub fn width_step(self) -> Width { + if self.category() == Category::Hex || self == Self::AHex { + 2 + 
} else { + 1 + } + } + + pub fn clamp_width(self, width: Width) -> Width { + let (min, max) = self.width_range().into_inner(); + let width = width.clamp(min, max); + if self.width_step() == 2 { + width / 2 * 2 + } else { + width + } + } + + pub fn var_type(self) -> VarType { + match self { + Self::A | Self::AHex => VarType::String, + _ => VarType::Numeric, + } + } + + /// Checks whether this format is valid for a variable with the given + /// `var_type`. + pub fn check_type_compatibility(self, var_type: VarType) -> Result<(), Error> { + let my_type = self.var_type(); + match (my_type, var_type) { + (VarType::Numeric, VarType::String) => { + Err(Error::UnnamedVariableNotCompatibleWithNumericFormat(self)) + } + (VarType::String, VarType::Numeric) => { + Err(Error::UnnamedVariableNotCompatibleWithStringFormat(self)) + } + _ => Ok(()), + } + } +} + +impl Display for Type { + fn fmt(&self, f: &mut Formatter) -> FmtResult { + let s = match self { + Self::F => "F", + Self::Comma => "COMMA", + Self::Dot => "DOT", + Self::Dollar => "DOLLAR", + Self::Pct => "PCT", + Self::E => "E", + Self::CC(cc) => return write!(f, "{}", cc), + Self::N => "N", + Self::Z => "Z", + Self::P => "P", + Self::PK => "PK", + Self::IB => "IB", + Self::PIB => "PIB", + Self::PIBHex => "PIBHEX", + Self::RB => "RB", + Self::RBHex => "RBHEX", + Self::Date => "DATE", + Self::ADate => "ADATE", + Self::EDate => "EDATE", + Self::JDate => "JDATE", + Self::SDate => "SDATE", + Self::QYr => "QYR", + Self::MoYr => "MOYR", + Self::WkYr => "WKYR", + Self::DateTime => "DATETIME", + Self::YMDHMS => "YMDHMS", + Self::MTime => "MTIME", + Self::Time => "TIME", + Self::DTime => "DTIME", + Self::WkDay => "WKDAY", + Self::Month => "MONTH", + Self::A => "A", + Self::AHex => "AHEX", + }; + write!(f, "{}", s) + } +} + +fn max_digits_for_bytes(bytes: usize) -> usize { + *[0, 3, 5, 8, 10, 13, 15, 17].get(bytes).unwrap_or(&20) +} + +#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] +pub struct Format { + type_: Type, + w: 
Width, + d: Decimals, +} + +impl Format { + pub const F40: Format = Format { + type_: Type::F, + w: 40, + d: 0, + }; + + pub const F8_2: Format = Format { + type_: Type::F, + w: 8, + d: 2, + }; + + pub fn format(self) -> Type { + self.type_ + } + pub fn w(self) -> Width { + self.w + } + pub fn d(self) -> Decimals { + self.d + } + + pub fn default_for_width(var_width: VarWidth) -> Self { + match var_width { + VarWidth::Numeric => Format { + type_: Type::F, + w: 8, + d: 2, + }, + VarWidth::String(w) => Format { + type_: Type::A, + w, + d: 0, + }, + } + } + + pub fn fixed_from(source: &UncheckedFormat) -> Self { + let UncheckedFormat { + type_: format, + w, + d, + } = *source; + let (min, max) = format.width_range().into_inner(); + let mut w = w.clamp(min, max); + if d <= format.max_decimals(Width::MAX) { + while d > format.max_decimals(w) { + w += 1; + assert!(w <= 40); + } + } + let d = d.clamp(0, format.max_decimals(w)); + Self { + type_: format, + w, + d, + } + } + + pub fn var_width(self) -> VarWidth { + match self.type_ { + Type::A => VarWidth::String(self.w), + Type::AHex => VarWidth::String(self.w / 2), + _ => VarWidth::Numeric, + } + } + + pub fn var_type(self) -> VarType { + self.type_.var_type() + } + + /// Checks whether this format specification is valid for a variable with + /// width `var_width`. + pub fn check_width_compatibility(self, var_width: VarWidth) -> Result { + // Verify that the format is right for the variable's type. 
+ self.type_.check_type_compatibility(var_width.into())?; + + if let VarWidth::String(w) = var_width { + if var_width != self.var_width() { + let bad_spec = self; + let good_spec = if self.type_ == Type::A { + Format { w, ..self } + } else { + Format { w: w * 2, ..self } + }; + return Err(Error::UnnamedStringVariableBadSpecWidth { + width: w, + bad_spec, + good_spec, + }); + } + } + + Ok(self) + } +} + +impl Display for Format { + fn fmt(&self, f: &mut Formatter) -> FmtResult { + write!(f, "{}{}", self.type_, self.w)?; + if self.type_.takes_decimals() || self.d > 0 { + write!(f, ".{}", self.d)?; + } + Ok(()) + } +} + +impl TryFrom for Format { + type Error = Error; + + fn try_from(source: UncheckedFormat) -> Result { + let UncheckedFormat { + type_: format, + w, + d, + } = source; + let max_d = format.max_decimals(w); + if w % format.width_step() != 0 { + Err(Error::OddWidthNotAllowed(source)) + } else if !format.width_range().contains(&w) { + Err(Error::BadWidth(source)) + } else if d > max_d { + if format.takes_decimals() { + Err(Error::DecimalsNotAllowedForFormat(source)) + } else if max_d > 0 { + Err(Error::TooManyDecimalsForWidth { + spec: source, + max_d, + }) + } else { + Err(Error::DecimalsNotAllowedForWidth(source)) + } + } else { + Ok(Format { + type_: format, + w, + d, + }) + } + } +} + +impl TryFrom for Type { + type Error = Error; + + fn try_from(source: u16) -> Result { + match source { + 1 => Ok(Self::A), + 2 => Ok(Self::AHex), + 3 => Ok(Self::Comma), + 4 => Ok(Self::Dollar), + 5 => Ok(Self::F), + 6 => Ok(Self::IB), + 7 => Ok(Self::PIBHex), + 8 => Ok(Self::P), + 9 => Ok(Self::PIB), + 10 => Ok(Self::PK), + 11 => Ok(Self::RB), + 12 => Ok(Self::RBHex), + 15 => Ok(Self::Z), + 16 => Ok(Self::N), + 17 => Ok(Self::E), + 20 => Ok(Self::Date), + 21 => Ok(Self::Time), + 22 => Ok(Self::DateTime), + 23 => Ok(Self::ADate), + 24 => Ok(Self::JDate), + 25 => Ok(Self::DTime), + 26 => Ok(Self::WkDay), + 27 => Ok(Self::Month), + 28 => Ok(Self::MoYr), + 29 => 
Ok(Self::QYr), + 30 => Ok(Self::WkYr), + 31 => Ok(Self::Pct), + 32 => Ok(Self::Dot), + 33 => Ok(Self::CC(CC::A)), + 34 => Ok(Self::CC(CC::B)), + 35 => Ok(Self::CC(CC::C)), + 36 => Ok(Self::CC(CC::D)), + 37 => Ok(Self::CC(CC::E)), + 38 => Ok(Self::EDate), + 39 => Ok(Self::SDate), + 40 => Ok(Self::MTime), + 41 => Ok(Self::YMDHMS), + _ => Err(Error::UnknownFormat { value: source }), + } + } +} + +#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] +pub struct UncheckedFormat { + pub type_: Type, + + pub w: Width, + + pub d: Decimals, +} + +impl TryFrom for UncheckedFormat { + type Error = Error; + + fn try_from(raw: raw::Spec) -> Result { + let raw = raw.0; + let raw_format = (raw >> 16) as u16; + let format = raw_format.try_into()?; + let w = ((raw >> 8) & 0xff) as Width; + let d = (raw & 0xff) as Decimals; + Ok(Self { + type_: format, + w, + d, + }) + } +} + +impl Display for UncheckedFormat { + fn fmt(&self, f: &mut Formatter) -> FmtResult { + write!(f, "{}{}", self.type_, self.w)?; + if self.type_.takes_decimals() || self.d > 0 { + write!(f, ".{}", self.d)?; + } + Ok(()) + } +} + +pub struct Settings { + epoch: Option, + + /// Either `'.'` or `','`. + decimal: char, + + /// Format `F`, `E`, `COMMA`, and `DOT` with leading zero (e.g. `0.5` + /// instead of `.5`)? + include_leading_zero: bool, + + /// Custom currency styles. + ccs: EnumMap>, +} + +impl Default for Settings { + fn default() -> Self { + Self { + epoch: None, + decimal: '.', + include_leading_zero: false, + ccs: Default::default(), + } + } +} + +/// A numeric output style. This can express numeric formats in +/// [Category::Basic] and [Category::Custom]. +pub struct NumberStyle { + neg_prefix: Affix, + prefix: Affix, + suffix: Affix, + neg_suffix: Affix, + + /// Decimal point: `'.'` or `','`. + decimal: char, + + /// Grouping character: `'.'` or `','` or `None`. + grouping: Option, + + /// Format as `.5` or `0.5`? 
+ include_leading_zero: bool, + + /// An `Affix` may require more bytes than its display width; for example, + /// U+00A5 (Â¥) is 2 bytes in UTF-8 but occupies only one display column. + /// This member is the sum of the number of bytes required by all of the + /// `Affix` members in this struct, minus their display widths. Thus, it + /// can be used to size memory allocations: for example, the formatted + /// result of `CCA20.5` requires no more than `(20 + extra_bytes)` bytes in + /// UTF-8. + extra_bytes: usize, +} + +pub struct Affix { + /// String contents of affix. + s: String, + + /// Display width in columns (see [unicode_width]) + width: usize, +} diff --git a/rust/pspp/src/hexfloat.rs b/rust/pspp/src/hexfloat.rs new file mode 100644 index 0000000000..b885fb2266 --- /dev/null +++ b/rust/pspp/src/hexfloat.rs @@ -0,0 +1,52 @@ +use num::Float; +use std::{num::FpCategory, fmt::{Display, Formatter, Result}}; + +pub struct HexFloat(pub T); + +impl Display for HexFloat { + fn fmt(&self, f: &mut Formatter<'_>) -> Result { + let sign = if self.0.is_sign_negative() { "-" } else { "" }; + match self.0.classify() { + FpCategory::Nan => return write!(f, "NaN"), + FpCategory::Infinite => return write!(f, "{sign}Infinity"), + FpCategory::Zero => return write!(f, "{sign}0.0"), + _ => (), + }; + let (significand, mut exponent, _) = self.0.integer_decode(); + let mut hex_sig = format!("{:x}", significand); + while hex_sig.ends_with('0') { + hex_sig.pop(); + exponent += 4; + } + match hex_sig.len() { + 0 => write!(f, "{sign}0.0"), + 1 => write!(f, "{sign}0x{hex_sig}.0p{exponent}"), + len => write!( + f, + "{sign}0x{}.{}p{}", + hex_sig.chars().next().unwrap(), + &hex_sig[1..], + exponent + 4 * (len as i16 - 1) + ), + } + } +} + +#[cfg(test)] +mod hex_float_tests { + use crate::HexFloat; + use num::Float; + + #[test] + fn test() { + assert_eq!(format!("{}", HexFloat(1.0)), "0x1.0p0"); + assert_eq!(format!("{}", HexFloat(123.0)), "0x1.ecp6"); + assert_eq!(format!("{}", 
HexFloat(1.0 / 16.0)), "0x1.0p-4"); + assert_eq!(format!("{}", HexFloat(f64::infinity())), "Infinity"); + assert_eq!(format!("{}", HexFloat(f64::neg_infinity())), "-Infinity"); + assert_eq!(format!("{}", HexFloat(f64::nan())), "NaN"); + assert_eq!(format!("{}", HexFloat(0.0)), "0.0"); + assert_eq!(format!("{}", HexFloat(f64::neg_zero())), "-0.0"); + } +} + diff --git a/rust/pspp/src/identifier.rs b/rust/pspp/src/identifier.rs new file mode 100644 index 0000000000..2d5c0317ec --- /dev/null +++ b/rust/pspp/src/identifier.rs @@ -0,0 +1,394 @@ +use std::{ + borrow::Borrow, + cmp::Ordering, + fmt::{Debug, Display, Formatter, Result as FmtResult}, + hash::{Hash, Hasher}, + ops::Deref, +}; + +use encoding_rs::{EncoderResult, Encoding, UTF_8}; +use finl_unicode::categories::{CharacterCategories, MajorCategory}; +use thiserror::Error as ThisError; +use unicase::UniCase; + +pub trait IdentifierChar { + /// Returns true if `self` is an ASCII character that may be the first + /// character in an identifier. + fn ascii_may_start_id(self) -> bool; + + /// Returns true if `self` may be the first character in an identifier. + fn may_start_id(self) -> bool; + + /// Returns true if `self` is an ASCII character that may be a second or + /// subsequent character in an identifier. + fn ascii_may_continue_id(self) -> bool; + + /// Returns true if `self` may be a second or subsequent character in an + /// identifier. + fn may_continue_id(self) -> bool; +} + +impl IdentifierChar for char { + fn ascii_may_start_id(self) -> bool { + matches!(self, 'a'..='z' | 'A'..='Z' | '@' | '#' | '$' | '!') + } + + fn may_start_id(self) -> bool { + if self < '\u{0080}' { + self.ascii_may_start_id() + } else { + use MajorCategory::*; + + [L, M, S].contains(&self.get_major_category()) && self != char::REPLACEMENT_CHARACTER + } + } + + fn ascii_may_continue_id(self) -> bool { + matches!(self, 'a'..='z' | 'A'..='Z' | '0'..='9' | '@' | '#' | '$' | '.' 
| '_') + } + + fn may_continue_id(self) -> bool { + if self < '\u{0080}' { + self.ascii_may_continue_id() + } else { + use MajorCategory::*; + + [L, M, S, N].contains(&self.get_major_category()) && self != char::REPLACEMENT_CHARACTER + } + } +} + +#[derive(Clone, Debug, ThisError)] +pub enum Error { + #[error("Identifier cannot be empty string.")] + Empty, + + #[error("\"{0}\" may not be used as an identifier because it is a reserved word.")] + Reserved(String), + + #[error("\"!\" is not a valid identifier.")] + Bang, + + #[error("\"{0}\" may not be used as an identifier because it begins with disallowed character \"{1}\".")] + BadFirstCharacter(String, char), + + #[error("\"{0}\" may not be used as an identifier because it contains disallowed character \"{1}\".")] + BadLaterCharacter(String, char), + + #[error("Identifier \"{id}\" is {length} bytes in the encoding in use ({encoding}), which exceeds the {max}-byte limit.")] + TooLong { + id: String, + length: usize, + encoding: &'static str, + max: usize, + }, + + #[error("\"{id}\" may not be used as an identifier because the encoding in use ({encoding}) cannot represent \"{c}\".")] + NotEncodable { + id: String, + encoding: &'static str, + c: char, + }, +} + +pub enum ReservedWord { + And, + Or, + Not, + Eq, + Ge, + Gt, + Le, + Lt, + Ne, + All, + By, + To, + With, +} + +impl TryFrom<&str> for ReservedWord { + type Error = (); + + fn try_from(source: &str) -> Result { + if !(2..=4).contains(&source.len()) { + Err(()) + } else { + let b = source.as_bytes(); + let c0 = b[0].to_ascii_uppercase(); + let c1 = b[1].to_ascii_uppercase(); + match (source.len(), c0, c1) { + (2, b'B', b'Y') => Ok(Self::By), + (2, b'E', b'Q') => Ok(Self::Eq), + (2, b'G', b'T') => Ok(Self::Gt), + (2, b'G', b'E') => Ok(Self::Ge), + (2, b'L', b'T') => Ok(Self::Lt), + (2, b'L', b'E') => Ok(Self::Le), + (2, b'N', b'E') => Ok(Self::Ne), + (3, b'N', b'O') if b[2].to_ascii_uppercase() == b'T' => Ok(Self::Not), + (2, b'O', b'R') => Ok(Self::Or), + (2, 
b'T', b'O') => Ok(Self::To), + (3, b'A', b'L') if b[2].to_ascii_uppercase() == b'L' => Ok(Self::All), + (3, b'A', b'N') if b[2].to_ascii_uppercase() == b'D' => Ok(Self::And), + (4, b'W', b'I') + if b[2].to_ascii_uppercase() == b'T' && b[3].to_ascii_uppercase() == b'H' => + { + Ok(Self::With) + } + _ => Err(()), + } + } + } +} + +pub fn is_reserved_word(s: &str) -> bool { + ReservedWord::try_from(s).is_ok() +} + +#[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub struct Identifier(pub UniCase); + +impl Identifier { + /// Maximum length of an identifier, in bytes. The limit applies in the + /// encoding used by the dictionary, not in UTF-8. + pub const MAX_LEN: usize = 64; + + pub fn new(s: &str) -> Result { + Self::from_encoding(s, UTF_8) + } + pub fn from_encoding(s: &str, encoding: &'static Encoding) -> Result { + Self::is_plausible(s)?; + let identifier = Identifier(s.into()); + identifier.check_encoding(encoding)?; + Ok(identifier) + } + + /// Checks whether this is a valid identifier in the given `encoding`. An + /// identifier that is valid in one encoding might be invalid in another + /// because some characters are unencodable or because it is too long. 
+ pub fn check_encoding(&self, encoding: &'static Encoding) -> Result<(), Error> { + let s = self.0.as_str(); + let (_encoded, _, unencodable) = encoding.encode(s); + if unencodable { + let mut encoder = encoding.new_encoder(); + let mut buf = Vec::with_capacity( + encoder + .max_buffer_length_from_utf8_without_replacement(s.len()) + .unwrap(), + ); + let EncoderResult::Unmappable(c) = encoder + .encode_from_utf8_to_vec_without_replacement(s, &mut buf, true) + .0 + else { + unreachable!(); + }; + return Err(Error::NotEncodable { + id: s.into(), + encoding: encoding.name(), + c, + }); + } + /* + if encoded.len() > Self::MAX_LEN { + return Err(Error::TooLong { + id: s.into(), + length: encoded.len(), + encoding: encoding.name(), + max: Self::MAX_LEN, + }); + }*/ + Ok(()) + } + pub fn is_plausible(s: &str) -> Result<(), Error> { + if s.is_empty() { + return Err(Error::Empty); + } + if is_reserved_word(s) { + return Err(Error::Reserved(s.into())); + } + if s == "!" { + return Err(Error::Bang); + } + + let mut i = s.chars(); + let first = i.next().unwrap(); + if !first.may_start_id() { + return Err(Error::BadFirstCharacter(s.into(), first)); + } + for c in i { + if !c.may_continue_id() { + return Err(Error::BadLaterCharacter(s.into(), c)); + } + } + Ok(()) + } + + /// Returns true if `token` is a case-insensitive match for `keyword`. + /// + /// Keywords match `keyword` and `token` are identical, or `token` is at + /// least 3 characters long and those characters are identical to `keyword` + /// or differ only in case. + /// + /// `keyword` must be ASCII. + pub fn matches_keyword(&self, keyword: &str) -> bool { + id_match_n_nonstatic(keyword, self.0.as_str(), 3) + } + + /// Returns true if `token` is a case-insensitive match for at least the + /// first `n` characters of `keyword`. + /// + /// `keyword` must be ASCII. 
+ pub fn matches_keyword_n(&self, keyword: &str, n: usize) -> bool { + id_match_n_nonstatic(keyword, self.0.as_str(), n) + } +} + +impl PartialEq for Identifier { + fn eq(&self, other: &str) -> bool { + self.0.eq(&UniCase::new(other)) + } +} + +/// Returns true if `token` is a case-insensitive match for `keyword`. +/// +/// Keywords match `keyword` and `token` are identical, or `token` is at least 3 +/// characters long and those characters are identical to `keyword` or differ +/// only in case. +/// +/// `keyword` must be ASCII. It's normally a constant string, so it's declared +/// as `&'static str` to make it harder to reverse the argument order. But +/// there's no reason that a non-static string won't work, so use +/// [`id_match_n_nonstatic`] instead if you need it. +pub fn id_match(keyword: &'static str, token: &str) -> bool { + id_match_n(keyword, token, 3) +} + +/// Returns true if `token` is a case-insensitive match for at least the first +/// `n` characters of `keyword`. +/// +/// `keyword` must be ASCII. It's normally a constant string, so it's declared +/// as `&'static str` to make it harder to reverse the argument order. But +/// there's no reason that a non-static string won't work, so use +/// [`id_match_n_nonstatic`] instead if you need it. +pub fn id_match_n(keyword: &'static str, token: &str, n: usize) -> bool { + id_match_n_nonstatic(keyword, token, n) +} + +/// Returns true if `token` is a case-insensitive match for at least the first +/// `n` characters of `keyword`. +/// +/// `keyword` must be ASCII. 
+pub fn id_match_n_nonstatic(keyword: &str, token: &str, n: usize) -> bool { + debug_assert!(keyword.is_ascii()); + let keyword_prefix = if (n..keyword.len()).contains(&token.len()) { + &keyword[..token.len()] + } else { + keyword + }; + keyword_prefix.eq_ignore_ascii_case(token) +} + +impl Display for Identifier { + fn fmt(&self, f: &mut Formatter) -> FmtResult { + write!(f, "{}", self.0) + } +} + +impl Debug for Identifier { + fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult { + write!(f, "{}", self.0) + } +} + +pub trait HasIdentifier { + fn identifier(&self) -> &UniCase; +} + +pub struct ByIdentifier(pub T) +where + T: HasIdentifier; + +impl ByIdentifier +where + T: HasIdentifier, +{ + pub fn new(inner: T) -> Self { + Self(inner) + } +} + +impl PartialEq for ByIdentifier +where + T: HasIdentifier, +{ + fn eq(&self, other: &Self) -> bool { + self.0.identifier().eq(other.0.identifier()) + } +} + +impl Eq for ByIdentifier where T: HasIdentifier {} + +impl PartialOrd for ByIdentifier +where + T: HasIdentifier, +{ + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl Ord for ByIdentifier +where + T: HasIdentifier, +{ + fn cmp(&self, other: &Self) -> Ordering { + self.0.identifier().cmp(other.0.identifier()) + } +} + +impl Hash for ByIdentifier +where + T: HasIdentifier, +{ + fn hash(&self, state: &mut H) { + self.0.identifier().hash(state) + } +} + +impl Borrow> for ByIdentifier +where + T: HasIdentifier, +{ + fn borrow(&self) -> &UniCase { + self.0.identifier() + } +} + +impl Debug for ByIdentifier +where + T: HasIdentifier + Debug, +{ + fn fmt(&self, f: &mut Formatter) -> FmtResult { + self.0.fmt(f) + } +} + +impl Clone for ByIdentifier +where + T: HasIdentifier + Clone, +{ + fn clone(&self) -> Self { + Self(self.0.clone()) + } +} + +impl Deref for ByIdentifier +where + T: HasIdentifier + Clone, +{ + type Target = T; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} diff --git a/rust/pspp/src/integer.rs 
b/rust/pspp/src/integer.rs new file mode 100644 index 0000000000..6c76839927 --- /dev/null +++ b/rust/pspp/src/integer.rs @@ -0,0 +1,86 @@ +pub trait ToInteger { + fn to_exact_integer(&self) -> Option + where + T: FromFloat; + fn to_exact_usize(&self) -> Option { + self.to_exact_integer() + } + fn to_exact_u8(&self) -> Option { + self.to_exact_integer() + } + fn to_exact_u16(&self) -> Option { + self.to_exact_integer() + } + fn to_exact_u32(&self) -> Option { + self.to_exact_integer() + } + fn to_exact_u64(&self) -> Option { + self.to_exact_integer() + } + fn to_exact_u128(&self) -> Option { + self.to_exact_integer() + } + fn to_exact_isize(&self) -> Option { + self.to_exact_integer() + } + fn to_exact_i8(&self) -> Option { + self.to_exact_integer() + } + fn to_exact_i16(&self) -> Option { + self.to_exact_integer() + } + fn to_exact_i32(&self) -> Option { + self.to_exact_integer() + } + fn to_exact_i64(&self) -> Option { + self.to_exact_integer() + } + fn to_exact_i128(&self) -> Option { + self.to_exact_integer() + } +} + +impl ToInteger for f64 { + fn to_exact_integer(&self) -> Option + where + T: FromFloat, + { + T::from_float(*self) + } +} + +pub trait FromFloat { + fn from_float(x: f64) -> Option + where + Self: Sized; +} + +macro_rules! 
impl_from_float { + ($T:ident) => { + impl FromFloat for $T { + fn from_float(x: f64) -> Option + where + Self: Sized, + { + if x.trunc() == x && x >= $T::MIN as f64 && x <= $T::MAX as f64 { + Some(x as Self) + } else { + None + } + } + } + }; +} + +impl_from_float!(usize); +impl_from_float!(u8); +impl_from_float!(u16); +impl_from_float!(u32); +impl_from_float!(u64); +impl_from_float!(u128); +impl_from_float!(isize); +impl_from_float!(i8); +impl_from_float!(i16); +impl_from_float!(i32); +impl_from_float!(i64); +impl_from_float!(i128); diff --git a/rust/pspp/src/lex/command_name.rs b/rust/pspp/src/lex/command_name.rs new file mode 100644 index 0000000000..bccea1483b --- /dev/null +++ b/rust/pspp/src/lex/command_name.rs @@ -0,0 +1,359 @@ +use crate::identifier::id_match_n_nonstatic; + +pub struct Match { + pub exact: bool, + pub missing_words: isize, +} + +fn count_words(s: &str) -> isize { + s.split_whitespace().count() as isize +} + +/// Compares `string` obtained from the user against the full name of a `command`, +/// using this algorithm: +/// +/// 1. Divide `command` into words `c[0]` through `c[n - 1]`. +/// +/// 2. Divide `string` into words `s[0]` through `s[m - 1]`. +/// +/// 3. Compare word `c[i]` against `s[i]` for `0 <= i < min(n, m)`, using the keyword +/// matching algorithm implemented by lex_id_match(). If any of them fail to +/// match, then `string` does not match `command` and the function returns false. +/// +/// 4. Otherwise, `string` and `command` match. Set *MISSING_WORDS to n - m. Set +/// *EXACT to false if any of the S[i] were found to be abbreviated in the +/// comparisons done in step 3, or to true if they were all exactly equal +/// (modulo case). Return true. 
+pub fn command_match(command: &str, string: &str) -> Option { + let mut command_words = command.split_whitespace(); + let mut string_words = string.split_whitespace(); + let mut exact = true; + loop { + let Some(cw) = command_words.next() else { + return Some(Match { + exact, + missing_words: -(string_words.count() as isize), + }); + }; + let Some(sw) = string_words.next() else { + return Some(Match { + exact, + missing_words: 1 + command_words.count() as isize, + }); + }; + if !id_match_n_nonstatic(cw, sw, 3) { + return None; + } + if sw.len() < cw.len() { + exact = false; + } + } +} + +/// Matches a string against a collection of command names. +pub struct CommandMatcher<'a, T> { + string: &'a str, + extensible: bool, + exact_match: Option, + n_matches: usize, + match_: Option, + match_missing_words: isize, +} + +impl<'a, T> CommandMatcher<'a, T> { + pub fn new(string: &'a str) -> Self { + Self { + string, + extensible: false, + exact_match: None, + n_matches: 0, + match_: None, + match_missing_words: 0, + } + } + + /// Consider `command` as a candidate for the command name being parsed. If + /// `command` is the correct command name, then [Self::get_match] will + /// return `aux` later. 
+ pub fn add(&mut self, command: &str, aux: T) { + if let Some(Match { + missing_words, + exact, + }) = command_match(command, self.string) + { + if missing_words > 0 { + self.extensible = true; + } else if exact && missing_words == 0 { + self.exact_match = Some(aux); + } else { + if missing_words > self.match_missing_words { + self.n_matches = 0; + } + if missing_words >= self.match_missing_words || self.n_matches == 0 { + self.n_matches += 1; + self.match_ = Some(aux); + self.match_missing_words = missing_words; + } + } + } + } + + pub fn get_match(self) -> (Option, isize) { + if self.extensible { + (None, 1) + } else if let Some(exact_match) = self.exact_match { + (Some(exact_match), 0) + } else if self.n_matches == 1 { + (self.match_, self.match_missing_words) + } else { + (None, self.match_missing_words) + } + } +} + +pub const COMMAND_NAMES: &'static [&'static str] = &[ + "2SLS", + "ACF", + "ADD DOCUMENT", + "ADD FILES", + "ADD VALUE LABELS", + "AGGREGATE", + "ALSCAL", + "ANACOR", + "ANOVA", + "APPLY DICTIONARY", + "AUTORECODE", + "BEGIN DATA", + "BREAK", + "CACHE", + "CASEPLOT", + "CASESTOVARS", + "CATPCA", + "CATREG", + "CCF", + "CD", + "CLEAR TRANSFORMATIONS", + "CLOSE FILE HANDLE", + "CLUSTER", + "COMPUTE", + "CONJOINT", + "CORRELATIONS", + "CORRESPONDENCE", + "COUNT", + "COXREG", + "CREATE", + "CROSSTABS", + "CSDESCRIPTIVES", + "CSGLM", + "CSLOGISTIC", + "CSPLAN", + "CSSELECT", + "CSTABULATE", + "CTABLES", + "CURVEFIT", + "DATA LIST", + "DATAFILE ATTRIBUTE", + "DATASET ACTIVATE", + "DATASET CLOSE", + "DATASET COPY", + "DATASET DECLARE", + "DATASET DISPLAY", + "DATASET NAME", + "DATE", + "DEBUG EVALUATE", + "DEBUG EXPAND", + "DEBUG FLOAT FORMAT", + "DEBUG FORMAT GUESSER", + "DEBUG MATRIX READ", + "DEBUG MOMENTS", + "DEBUG PAPER SIZE", + "DEBUG POOL", + "DEBUG XFORM FAIL", + "DEFINE", + "DELETE VARIABLES", + "DESCRIPTIVES", + "DETECTANOMALY", + "DISCRIMINANT", + "DISPLAY MACROS", + "DISPLAY VARIABLE SETS", + "DISPLAY", + "DO IF", + "DO REPEAT", + 
"DOCUMENT", + "DROP DOCUMENTS", + "ECHO", + "EDIT", + "ELSE IF", + "ELSE", + "END CASE", + "END FILE TYPE", + "END FILE", + "END IF", + "END LOOP", + "END REPEAT", + "ERASE", + "EXAMINE", + "EXECUTE", + "EXIT", + "EXPORT", + "FACTOR", + "FILE HANDLE", + "FILE LABEL", + "FILE TYPE", + "FILTER", + "FINISH", + "FIT", + "FLIP", + "FORMATS", + "FREQUENCIES", + "GENLOG", + "GET DATA", + "GET TRANSLATE", + "GET", + "GGRAPH", + "GLM", + "GRAPH", + "HILOGLINEAR", + "HOMALS", + "HOST", + "IF", + "IGRAPH", + "IMPORT", + "INCLUDE", + "INFO", + "INPUT PROGRAM", + "INSERT", + "KEYED DATA LIST", + "KM", + "LEAVE", + "LIST", + "LOGISTIC REGRESSION", + "LOGLINEAR", + "LOOP", + "MANOVA", + "MAPS", + "MATCH FILES", + "MATRIX DATA", + "MATRIX", + "MCONVERT", + "MEANS", + "MISSING VALUES", + "MIXED", + "MODEL CLOSE", + "MODEL HANDLE", + "MODEL LIST", + "MODEL NAME", + "MRSETS", + "MULT RESPONSE", + "MULTIPLE CORRESPONDENCE", + "MVA", + "N OF CASES", + "N", + "NAIVEBAYES", + "NEW FILE", + "NLR", + "NOMREG", + "NONPAR CORR", + "NPAR TESTS", + "NUMBERED", + "NUMERIC", + "OLAP CUBES", + "OMS", + "ONEWAY", + "ORTHOPLAN", + "OUTPUT MODIFY", + "OVERALS", + "PACF", + "PARTIAL CORR", + "PEARSON CORRELATIONS", + "PERMISSIONS", + "PLANCARDS", + "PLUM", + "POINT", + "PPLOT", + "PREDICT", + "PREFSCAL", + "PRESERVE", + "PRINCALS", + "PRINT EJECT", + "PRINT FORMATS", + "PRINT SPACE", + "PRINT", + "PROBIT", + "PROCEDURE OUTPUT", + "PROXIMITIES", + "PROXSCAL", + "Q", + "QUICK CLUSTER", + "QUIT", + "RANK", + "RATIO STATISTICS", + "READ MODEL", + "RECODE", + "RECORD TYPE", + "REFORMAT", + "REGRESSION", + "RELIABILITY", + "RENAME VARIABLES", + "REPEATING DATA", + "REPORT", + "REREAD", + "RESTORE", + "RMV", + "ROC", + "SAMPLE", + "SAVE DATA COLLECTION", + "SAVE TRANSLATE", + "SAVE", + "SCRIPT", + "SEASON", + "SELECT IF", + "SELECTPRED", + "SET", + "SHOW", + "SORT CASES", + "SORT VARIABLES", + "SPCHART", + "SPECTRA", + "SPLIT FILE", + "STEMLEAF", + "STRING", + "SUBTITLE", + "SUMMARIZE", + "SURVIVAL", + 
"SYSFILE INFO", + "T-TEST", + "TDISPLAY", + "TEMPORARY", + "TITLE", + "TREE", + "TSAPPLY", + "TSET", + "TSHOW", + "TSMODEL", + "TSPLOT", + "TWOSTEP CLUSTER", + "UNIANOVA", + "UNNUMBERED", + "UPDATE", + "USE", + "VALIDATEDATA", + "VALUE LABELS", + "VARCOMP", + "VARIABLE ALIGNMENT", + "VARIABLE ATTRIBUTE", + "VARIABLE LABELS", + "VARIABLE LEVEL", + "VARIABLE ROLE", + "VARIABLE WIDTH", + "VARSTOCASES", + "VECTOR", + "VERIFY", + "WEIGHT", + "WLS", + "WRITE FORMATS", + "WRITE", + "XEXPORT", + "XGRAPH", + "XSAVE", +]; diff --git a/rust/pspp/src/lex/lexer.rs b/rust/pspp/src/lex/lexer.rs new file mode 100644 index 0000000000..82ef008aef --- /dev/null +++ b/rust/pspp/src/lex/lexer.rs @@ -0,0 +1,929 @@ +use std::{ + borrow::{Borrow, Cow}, + collections::{HashMap, VecDeque}, + fmt::Write, + fs, + io::Result as IoResult, + mem, + ops::{Range, RangeInclusive}, + path::Path, + sync::Arc, +}; + +use chardetng::EncodingDetector; +use encoding_rs::{Encoding, UTF_8}; +use thiserror::Error as ThisError; +use unicode_width::{UnicodeWidthChar, UnicodeWidthStr}; + +use crate::{ + macros::{macro_tokens_to_syntax, MacroSet, ParseStatus, Parser}, + message::{Category, Diagnostic, Location, Point, Severity}, + prompt::PromptStyle, + settings::Settings, +}; + +use super::{ + scan::{MergeResult, ScanError, ScanToken}, + segment::{Mode, Segment, Segmenter}, + token::Token, +}; + +/// Error handling for a [`Reader`]. +#[derive(Copy, Clone, Debug, Default, PartialEq, Eq)] +pub enum ErrorHandling { + /// Discard input line and continue reading. + Terminal, + + /// Continue to next command, except for cascading failures. + #[default] + Continue, + + /// Continue, even for cascading failures. + Ignore, + + /// Stop processing, + Stop, +} + +/// # Token pipeline +/// +/// Tokens pass through a pipeline with the following stages. Each token +/// eventually made available to the parser passes through of these stages. +/// The stages are named after the processing that happens in each one. 
+/// +/// Initially, tokens come from the segmenter and scanner to `pp`: +/// +/// - `pp`: Tokens that need to pass through the macro preprocessor to end up +/// in `merge`. +/// +/// - `merge`: Tokens that need to pass through +/// [`super::scan::ScanToken::merge`] to end up in `parse`. +/// +/// - `parse`: Tokens available to the client for parsing. +/// +/// `pp` and `merge` store tokens only temporarily until they pass into `parse`. +/// Tokens then live in `parse` until the command is fully consumed, at which +/// time they are freed together. +pub struct Source { + /// Error-handling mode. + error_handling: ErrorHandling, + + /// Encoding. + encoding: &'static Encoding, + + /// `None` if this reader is not associated with a file. + file_name: Option>, + + /// True if we've reached EOF already. + eof: bool, + + /// Read some input from the source. If successful, returns the input that + /// was read. At end of file or on error, returns an empty string. + /// + /// `prompt` provides a hint to interactive readers as to what kind of + /// syntax is being read right now. + read: Box String>, + + /// Source file contents. + buffer: String, + + /// 0-based line number of the first line not yet written to the journal. + journal_line: usize, + + /// Byte offset of first character not yet scanned as token. + seg_pos: usize, + + /// Byte offsets into `buffer` of starts of lines. The first element is 0. + lines: Vec, + + /// Tokens that need to pass through the macro preprocessor to end up in + /// `merge`. + pp: VecDeque, + + /// Tokens that need to pass through [`super::scan::ScanToken::merge`] to + /// end up in `parse`. + merge: VecDeque, + + /// Tokens available to the client for parsing. + parse: Vec, + + /// Offset in `parse` of the current token. 
+ parse_ofs: usize, + + segmenter: Segmenter, + + suppress_next_newline: bool, +} + +impl Default for Source { + fn default() -> Self { + Self { + error_handling: ErrorHandling::default(), + encoding: UTF_8, + file_name: None, + eof: false, + read: Box::new(|_| String::new()), + buffer: String::new(), + journal_line: 0, + seg_pos: 0, + lines: vec![0], + pp: VecDeque::new(), + merge: VecDeque::new(), + parse: Vec::new(), + parse_ofs: 0, + segmenter: Segmenter::new(Mode::default(), false), + suppress_next_newline: false, + } + } +} + +impl Source { + pub fn for_file

( + path: P, + encoding: Option<&'static Encoding>, + syntax: Mode, + error_handling: ErrorHandling, + ) -> IoResult + where + P: AsRef, + { + let bytes = fs::read(path.as_ref())?; + let encoding = encoding.unwrap_or_else(|| { + let mut encoding_detector = EncodingDetector::new(); + encoding_detector.feed(&bytes, true); + encoding_detector.guess(None, true) + }); + let (contents, _malformed) = encoding.decode_with_bom_removal(&bytes); + Ok(Self::for_file_contents( + contents.to_string(), + Some(path.as_ref().to_string_lossy().to_string()), + encoding, + syntax, + error_handling, + )) + } + + pub fn for_file_contents( + contents: String, + file_name: Option, + encoding: &'static Encoding, + syntax: Mode, + error_handling: ErrorHandling, + ) -> Self { + Self { + buffer: contents, + file_name: file_name.map(Arc::new), + encoding, + error_handling, + segmenter: Segmenter::new(syntax, false), + ..Self::default() + } + } + + pub fn for_string(contents: String, encoding: &'static Encoding) -> Self { + Self { + buffer: contents, + encoding, + ..Self::default() + } + } + + pub fn for_function( + read: Box String>, + file_name: Option, + encoding: &'static Encoding, + syntax: Mode, + error_handling: ErrorHandling, + ) -> Self { + Self { + read, + file_name: file_name.map(Arc::new), + encoding, + segmenter: Segmenter::new(syntax, false), + error_handling, + ..Self::default() + } + } + + fn read(&mut self) { + loop { + let prompt = self.segmenter.prompt(); + let s = (self.read)(prompt); + if s.is_empty() { + self.eof = true; + return; + } + self.buffer.push_str(&s); + if self.buffer[self.seg_pos..].contains('\n') { + return; + } + } + } + fn try_get_pp(&mut self, context: &Context) -> bool { + let (seg_len, seg_type) = loop { + if let Ok(result) = self.segmenter.push(&self.buffer[self.seg_pos..], self.eof) { + break result; + } + + debug_assert!(!self.eof); + self.read(); + }; + + let pos = self.seg_pos..self.seg_pos + seg_len; + self.seg_pos += seg_len; + if seg_type == 
Segment::Newline { + self.lines.push(self.seg_pos); + } + + let scan_token = ScanToken::from_segment(&self.buffer[pos.clone()], seg_type); + + let n_lines = match (seg_type, self.suppress_next_newline) { + (Segment::EndCommand, false) => { + self.suppress_next_newline = true; + 1 + } + (Segment::Newline, true) => { + self.suppress_next_newline = false; + 0 + } + (Segment::Newline, false) => 1, + _ => 0, + }; + for line_num in self.journal_line..self.journal_line + n_lines { + let start_ofs = self.lines[line_num]; + let end_ofs = self + .lines + .get(line_num + 1) + .copied() + .unwrap_or(self.buffer.len()); + let line = &self.buffer[start_ofs..end_ofs]; + let _line = line + .strip_suffix("\r\n") + .unwrap_or(line.strip_suffix('\n').unwrap_or(line)); + // XXX submit the line as syntax + } + self.journal_line += n_lines; + + let pos = pos.start..pos.end; + match scan_token { + None => false, + Some(ScanToken::Token(Token::End)) => { + self.pp.push_back(LexToken { + token: Token::EndCommand, + pos, + macro_rep: None, + }); + self.eof = true; + true + } + Some(ScanToken::Token(token)) => { + self.pp.push_back(LexToken { + token, + pos, + macro_rep: None, + }); + true + } + Some(ScanToken::Error(error)) => { + (context.error)( + Location { + file_name: self.file_name.clone(), + span: Some(self.offset_to_point(pos.start)..self.offset_to_point(pos.end)), + omit_underlines: false, + }, + error.into(), + ); + false + } + } + } + + fn get_pp(&mut self, context: &Context) -> bool { + while !self.eof { + if self.try_get_pp(context) { + return true; + } + } + false + } + + fn try_get_merge(&mut self, context: &Context) -> bool { + if self.pp.is_empty() && !self.get_pp(context) { + return false; + } + + if !Settings::global().macros.expand { + self.merge.append(&mut self.pp); + return true; + } + + // Now pass tokens one-by-one to the macro expander. + let Some(mut parser) = Parser::new(context.macros, &self.pp[0].token) else { + // Common case where there is no macro to expand. 
+ self.merge.push_back(self.pp.pop_front().unwrap()); + return true; + }; + for ofs in 1.. { + if self.pp.len() <= ofs && !self.get_pp(context) { + // This should not be reachable because we always get a + // `Token::EndCommand` at the end of an input file, which should + // always terminate macro expansion. + unreachable!(); + } + let token = &self.pp[ofs]; + if parser.push(&token.token, &self.buffer[token.pos.clone()], &|e| { + println!("{e:?}") + }) == ParseStatus::Complete + { + break; + } + } + let call = parser.finish(); + if call.len() == 0 { + // False alarm: no macro to expand after all. + self.merge.push_back(self.pp.pop_front().unwrap()); + return true; + } + + // Expand the tokens. + let c0 = &self.pp[0]; + let c1 = &self.pp[call.len() - 1]; + let mut expansion = Vec::new(); + call.expand( + self.segmenter.mode(), + self.token_location(c0..=c1), + &mut expansion, + |e| println!("{e:?}"), + ); + let retval = !expansion.is_empty(); + + if Settings::global().macros.print_expansions { + // XXX + } + + // Append the macro expansion tokens to the lookahead. + let mut macro_rep = String::new(); + let mut pos = Vec::with_capacity(expansion.len()); + for [prefix, token] in macro_tokens_to_syntax(expansion.as_slice()) { + macro_rep.push_str(prefix); + let len = macro_rep.len(); + pos.push(len..=len + token.len() - 1); + } + let macro_rep = Arc::new(macro_rep); + for (index, token) in expansion.into_iter().enumerate() { + let lt = LexToken { + token: token.token, + pos: c0.pos.start..c1.pos.end, + macro_rep: Some(MacroRepresentation { + expansion: Arc::clone(¯o_rep), + pos: pos[index].clone(), + }), + }; + self.merge.push_back(lt); + } + self.pp.drain(..call.len()); + retval + } + + /// Attempts to obtain at least one new token into `self.merge`. + /// + /// Returns true if successful, false on failure. In the latter case, this source + /// exhausted and 'self.eof' is now true. 
+ fn get_merge(&mut self, context: &Context) -> bool { + while !self.eof { + if self.try_get_merge(context) { + return true; + } + } + false + } + + fn get_parse__(&mut self, context: &Context) -> bool { + for i in 0.. { + if self.merge.len() <= i && !self.get_merge(context) { + // We always get a `Token::EndCommand` at the end of an input + // file and the merger should return `Some(...)` for that token. + debug_assert_eq!(self.merge.len(), 0); + return false; + } + + match ScanToken::merge(&self.merge) { + None => (), + Some(MergeResult::Copy) => { + self.parse.push(self.merge.pop_front().unwrap()); + return true; + } + Some(MergeResult::Expand { n, token }) => { + let first = &self.merge[0]; + let last = &self.merge[n - 1]; + self.parse.push(LexToken { + token, + pos: first.pos.start..last.pos.end, + macro_rep: match (&first.macro_rep, &last.macro_rep) { + (Some(a), Some(b)) if Arc::ptr_eq(&a.expansion, &b.expansion) => { + Some(MacroRepresentation { + expansion: a.expansion.clone(), + pos: *a.pos.start()..=*b.pos.end(), + }) + } + _ => None, + }, + }); + self.merge.drain(..n); + return true; + } + } + } + unreachable!(); + } + + fn get_parse(&mut self, context: &Context) -> bool { + // XXX deal with accumulated messages + self.get_parse__(context) + } + + fn offset_to_point(&self, offset: usize) -> Point { + let line = self + .lines + .partition_point(|&line_start| line_start <= offset); + Point { + line: line as i32, + column: Some( + self.buffer + .get(self.lines[line - 1]..offset) + .unwrap_or_default() + .width() as i32 + + 1, + ), + } + } + + /// Returns the syntax for 1-based line-number `line_number`. + fn get_line(&self, line_number: i32) -> &str { + if (1..=self.lines.len() as i32).contains(&line_number) { + let line_number = line_number as usize; + let start = self.lines[line_number - 1]; + let end = self.lines.get(line_number).copied().unwrap_or( + self.buffer[start..] 
+ .find('\n') + .map(|ofs| ofs + start) + .unwrap_or(self.buffer.len()), + ); + let line = &self.buffer[start..end]; + line.strip_suffix("\r\n") + .unwrap_or(line.strip_suffix('\n').unwrap_or(line)) + } else { + "" + } + } + + fn token_location(&self, range: RangeInclusive<&LexToken>) -> Location { + Location { + file_name: self.file_name.clone(), + span: Some( + self.offset_to_point(range.start().pos.start) + ..self.offset_to_point(range.end().pos.end), + ), + omit_underlines: false, + } + } + + fn ofs_location(&self, range: RangeInclusive) -> Location { + if *range.start() <= *range.end() && *range.end() < self.parse.len() { + self.token_location(&self.parse[*range.start()]..=&self.parse[*range.end()]) + } else { + Location { + file_name: self.file_name.clone(), + span: None, + omit_underlines: false, + } + } + } + + fn token(&self) -> &Token { + &self.parse[self.parse_ofs].token + } + + fn next(&mut self, offset: isize, context: &Context) -> &Token { + let Some(index) = offset.checked_add(self.parse_ofs as isize) else { + return &Token::EndCommand; + }; + let Ok(index) = usize::try_from(index) else { + return &Token::EndCommand; + }; + + while index >= self.parse.len() { + if let Some(token) = self.parse.last() { + match token.token { + Token::End => return &Token::End, + Token::EndCommand => return &Token::EndCommand, + _ => (), + } + } + self.get_parse(context); + } + &self.parse[index].token + } + + /// If the tokens in `ofs` contains a macro call, this returns the raw + /// syntax for the macro call (not for the expansion) and for any other + /// tokens included in that range. The syntax is encoded in UTF-8 and in + /// the original form supplied to the lexer so that, for example, it may + /// include comments, spaces, and new-lines if it spans multiple tokens. + /// + /// Returns `None` if the token range doesn't include a macro call. 
+ fn get_macro_call(&self, ofs: RangeInclusive) -> Option<&str> { + if self + .parse + .get(ofs.clone()) + .unwrap_or_default() + .iter() + .all(|token| token.macro_rep.is_none()) + { + return None; + } + + let token0 = &self.parse[*ofs.start()]; + let token1 = &self.parse[*ofs.end()]; + Some(&self.buffer[token0.pos.start..token1.pos.end]) + } + + fn is_empty(&self) -> bool { + self.buffer.is_empty() && self.eof + } + + fn diagnostic( + &self, + severity: Severity, + ofs: RangeInclusive, + text: String, + ) -> Diagnostic { + let mut s = String::with_capacity(text.len() + 16); + if self.is_empty() { + s.push_str("At end of input: "); + } else if let Some(call) = self.get_macro_call(ofs.clone()) { + write!(&mut s, "In syntax expanded from `{}`: ", ellipsize(call)).unwrap(); + } + + if !text.is_empty() { + s.push_str(&text); + } else { + s.push_str("Syntax error."); + } + + if !s.ends_with('.') { + s.push('.'); + } + + let location = self.ofs_location(ofs); + let mut source = Vec::new(); + if let Some(Range { + start: Point { line: l0, .. }, + end: Point { line: l1, .. }, + }) = location.span + { + let lines = if l1 - l0 > 3 { + vec![l0, l0 + 1, l1] + } else { + (l0..=l1).collect() + }; + for line_number in lines { + source.push((line_number, self.get_line(line_number).to_string())); + } + } + + Diagnostic { + category: Category::Syntax, + severity, + location, + source, + stack: Vec::new(), + command_name: None, // XXX + text: s, + } + } + + fn interactive_reset(&mut self) { + if self.error_handling == ErrorHandling::Terminal { + let Source { + error_handling, + encoding, + read, + .. 
// NOTE(review): reconstructed from an HTML-scraped diff in which every
// generic-parameter list (`<...>`) was eaten by the extraction.  Parameters
// below are restored from how each item is used in this chunk; anything that
// cannot be grounded in the visible code is hedged with a TODO-confirm
// comment rather than asserted.

// --- Truncated fragment: the tail of a `Source` method whose beginning lies
// before this chunk.  It destructures `mem::take(self)` and rebuilds `*self`
// keeping only `error_handling`, `encoding`, and `read`.  Preserved verbatim
// apart from re-indentation. ---
            } = mem::take(self);
            *self = Self {
                error_handling,
                encoding,
                read,
                ..Source::default()
            };
        }
    }
}

/// Returns `s` unchanged when its display width is at most 64 columns;
/// otherwise returns an owned copy cut off just past 64 columns with `"..."`
/// appended.
///
/// Width is display width (the `width()` calls resolve to the unicode-width
/// traits imported elsewhere in this file — TODO confirm), so wide CJK
/// characters count as two columns and zero-width characters as none.
fn ellipsize(s: &str) -> Cow<str> {
    if s.width() > 64 {
        let mut out = String::new();
        let mut width = 0;
        for c in s.chars() {
            out.push(c);
            width += c.width().unwrap_or(0);
            // Stop once we have passed the limit; the char that crossed it is
            // kept, so the result can be slightly wider than 64 plus "...".
            if width > 64 {
                break;
            }
        }
        out.push_str("...");
        Cow::from(out)
    } else {
        Cow::from(s)
    }
}

/// A token in a [`Source`].
struct LexToken {
    /// The regular token.
    token: Token,

    /// For a token obtained through the lexer in an ordinary way, this is the
    /// location of the token in the [`Source`]'s buffer.
    ///
    /// For a token produced through macro expansion, this is the entire macro
    /// call.
    pos: Range<usize>, // element type restored as byte offsets — TODO confirm

    /// For a token obtained through macro expansion, the part of the macro
    /// expansion that represents this token.
    ///
    /// For a token obtained through the lexer in an ordinary way, this is
    /// `None`.
    macro_rep: Option<MacroRepresentation>,
}

// `Borrow<Token>` lets collections of `LexToken` be used wherever plain
// `&Token` lookups are needed (see the `Tokens` impl for `VecDeque` in the
// scan module).
impl Borrow<Token> for LexToken {
    fn borrow(&self) -> &Token {
        &self.token
    }
}

struct MacroRepresentation {
    /// An entire macro expansion.
    expansion: Arc<String>, // inner type stripped by the scrape — TODO confirm

    /// The substring of `expansion` that represents a single token.
    pos: RangeInclusive<usize>,
}

/// A lexer for PSPP syntax: the current token [`Source`], a stack of sources
/// to return to, the macro set used for expansion, and an error callback.
pub struct Lexer {
    /// The source currently being read.
    source: Source,
    /// Sources to resume when `source` is exhausted.  `include` pushes on
    /// top; `append` inserts at the bottom.
    stack: Vec<Source>,
    /// Defined macros (initialized with `HashMap::new()`, so `MacroSet` is
    /// presumably a `HashMap` alias — TODO confirm).
    macros: MacroSet,
    /// Error callback.  Signature restored as `Fn(Location, Error)` from the
    /// two-argument `|location, error|` closures in the tests — TODO confirm.
    error: Box<dyn Fn(Location, Error)>,
}

/// Borrowed view of the pieces of [`Lexer`] that [`Source::get_parse`] needs,
/// so `source` can be borrowed mutably at the same time.
struct Context<'a> {
    macros: &'a MacroSet,
    error: &'a Box<dyn Fn(Location, Error)>,
}

impl Lexer {
    /// Creates an empty lexer that reports errors through `error`.
    pub fn new(error: Box<dyn Fn(Location, Error)>) -> Self {
        Self {
            source: Source::default(),
            stack: Vec::new(),
            macros: HashMap::new(),
            error,
        }
    }

    /// Advances past the current token and returns the new one.
    ///
    /// Consuming [`Token::EndCommand`] clears the whole parsed-token buffer;
    /// any other token just advances `parse_ofs`.  While the buffer is
    /// exhausted, asks the current source to parse more, popping the source
    /// stack at end of input.  Returns [`Token::End`] once every source is
    /// exhausted.
    pub fn get(&mut self) -> &Token {
        if self.source.parse_ofs < self.source.parse.len() {
            if let Token::EndCommand = self.source.token() {
                self.source.parse.clear();
                self.source.parse_ofs = 0;
            } else {
                self.source.parse_ofs += 1;
            }
        }

        while self.source.parse_ofs == self.source.parse.len() {
            let context = Context {
                macros: &self.macros,
                error: &self.error,
            };
            if !self.source.get_parse(&context) && !self.pop_stack() {
                return &Token::End;
            }
        }
        self.source.token()
    }

    /// Pops the topmost stacked source into `self.source`, returning `true`.
    /// With an empty stack, installs a fresh source holding a single
    /// [`Token::End`] sentinel and returns `false`.
    fn pop_stack(&mut self) -> bool {
        if let Some(new_source) = self.stack.pop() {
            self.source = new_source;
            true
        } else {
            self.source = Source::default();
            self.source.parse.push(LexToken {
                token: Token::End,
                pos: 0..0,
                macro_rep: None,
            });
            false
        }
    }

    /// Inserts `source` so that the next token comes from it.  This is only
    /// permitted when the lexer is either empty or at `Token::EndCommand`.
    pub fn include(&mut self, mut source: Source) {
        // XXX what's the right assertion?
        let context = Context {
            macros: &self.macros,
            error: &self.error,
        };
        source.get_parse(&context);
        let old_source = mem::replace(&mut self.source, source);
        self.stack.push(old_source);
    }

    /// Inserts `source` so that it will be read after all the other sources.
    pub fn append(&mut self, mut source: Source) {
        let context = Context {
            macros: &self.macros,
            error: &self.error,
        };
        source.get_parse(&context);
        self.stack.insert(0, source);
    }

    /// Returns the current token without advancing.
    pub fn token(&self) -> &Token {
        self.source.token()
    }

    /// Returns the token `offset` positions away from the current one.
    pub fn next(&mut self, offset: isize) -> &Token {
        let context = Context {
            macros: &self.macros,
            error: &self.error,
        };
        self.source.next(offset, &context)
    }

    /// Builds an error-severity [`Diagnostic`] at the current token.
    pub fn error<S>(&self, text: S) -> Diagnostic
    where
        S: ToString,
    {
        self.diagnostic(
            Severity::Error,
            self.source.parse_ofs..=self.source.parse_ofs,
            text,
        )
    }

    /// Builds a [`Diagnostic`] covering token offsets `ofs` in the current
    /// source.
    pub fn diagnostic<S>(
        &self,
        severity: Severity,
        ofs: RangeInclusive<usize>,
        text: S,
    ) -> Diagnostic
    where
        S: ToString,
    {
        self.source.diagnostic(severity, ofs, text.to_string())
    }

    /// Returns the error-handling mode of the current source.
    pub fn error_handling(&self) -> ErrorHandling {
        self.source.error_handling
    }

    /// Discards all lookahead tokens, then discards all input sources
    /// until it encounters one with error mode [ErrorHandling::Terminal] or until it
    /// runs out of input sources.
    pub fn discard_noninteractive(&mut self) {
        while self.source.error_handling != ErrorHandling::Ignore {
            self.source.pp.clear();
            self.source.merge.clear();
            self.source.parse.clear();
            self.source.parse_ofs = 0;

            if self.source.error_handling == ErrorHandling::Terminal || !self.pop_stack() {
                return;
            }
        }
    }

    /// If the source that the lexer is currently reading has error mode
    /// [ErrorHandling::Terminal], discards all buffered input and tokens, so
    /// that the next token to be read comes directly from whatever is next read
    /// from the stream.
    ///
    /// It makes sense to call this function after encountering an error in a
    /// command entered on the console, because usually the user would prefer
    /// not to have cascading errors.
    pub fn interactive_reset(&mut self) {
        self.source.interactive_reset()
    }

    /// Advances past any tokens up to [Token::EndCommand] or [Token::End].
    pub fn discard_rest_of_command(&mut self) {
        while !matches!(self.token(), Token::EndCommand | Token::End) {
            self.get();
        }
    }
}

#[derive(ThisError, Clone, Debug, PartialEq, Eq)]
pub enum Error {
    /// Error forming tokens from the input.
    #[error("{0}")]
    TokenError(#[from] ScanError),
}

#[cfg(test)]
mod tests {
    use encoding_rs::UTF_8;

    use crate::lex::{segment::Mode, token::Token};

    use super::{ErrorHandling, Lexer, Source};

    #[test]
    fn test() {
        let mut lexer = Lexer::new(Box::new(|location, error| println!("{location}: {error}")));
        lexer.include(Source::for_string(
            String::from(
                r#"#! /usr/local/bin/pspp
DATA LIST LIST NOTABLE /a.
BEGIN DATA.
1
2
END DATA.
LIST.
"#,
            ),
            UTF_8,
        ));
        loop {
            lexer.get();
            let token = lexer.token();
            println!("{token:?}");
            if let Token::End = token {
                break;
            }
        }
    }

    #[test]
    fn test_scan_errors() {
        let mut lexer = Lexer::new(Box::new(|location, error| println!("{location}: {error}")));
        lexer.include(Source::for_file_contents(
            String::from(
                r#"x'123'
x'1x'
u''
u'012345678'
u'd800'
u'110000'
'foo
'very long unterminated string that be ellipsized in its error message
1e .x
^
�
"#,
            ),
            Some(String::from("syntax.sps")),
            UTF_8,
            Mode::default(),
            ErrorHandling::default(),
        ));
        loop {
            lexer.get();
            let token = lexer.token();
            println!("{token:?}");
            if let Token::End = token {
                break;
            }
        }
    }

    #[test]
    fn test_null_byte() {
        let mut lexer = Lexer::new(Box::new(|location, error| println!("{location}: {error}")));
        lexer.include(Source::for_file_contents(
            String::from(
                "datA dist list notable file='input.txt'/a b c.
lis|.\0",
            ),
            Some(String::from("syntax.sps")),
            UTF_8,
            Mode::default(),
            ErrorHandling::default(),
        ));
        loop {
            lexer.get();
            let token = lexer.token();
            println!("{token:?}");
            if let Token::End = token {
                break;
            }
        }
    }
}

// ===== file boundary (from the scraped diff): rust/pspp/src/lex/mod.rs =====

//! PSPP syntax scanning.
//!
//! PSPP divides traditional "lexical analysis" or "tokenization" into two
//! phases: a lower-level phase called "segmentation" and a higher-level phase
//! called "scanning".  [super::segment] implements the segmentation phase and
//! this module the scanning phase.
//!
//! NOTE(review): the two paragraphs above appear copied from the scan
//! submodule's docs; "this module" here is the whole `lex` module, which also
//! contains segmentation — confirm and reword against the repository.
//!
//! Scanning accepts as input a stream of segments, which are UTF-8 strings each
//! labeled with a segment type.  It outputs a stream of "scan tokens", which
//! are the same as the tokens used by the PSPP parser with a few additional
//! types.

pub mod segment;
pub mod scan;
pub mod command_name;
pub mod token;
pub mod lexer;

// == file boundary (from the scraped diff): rust/pspp/src/lex/scan/mod.rs ==
// Only the module doc header falls in this span; the module body continues in
// the next chunk.

//! PSPP lexical analysis.
//!
//! PSPP divides traditional "lexical analysis" or "tokenization" into two
//! phases: a lower-level phase called "segmentation" and a higher-level phase
//! called "scanning".  [segment] implements the segmentation phase and [scan]
//! the scanning phase.
//!
//! Scanning accepts as input a stream of segments, which are UTF-8 strings each
//! labeled with a segment type.  It outputs a stream of "scan tokens", which
//! are the same as the tokens used by the PSPP parser with a few additional
//! types.
+ +use crate::identifier::{Identifier, ReservedWord}; + +use super::{ + segment::{Mode, Segment, Segmenter}, + token::{Punct, Token}, +}; +use std::{borrow::Borrow, collections::VecDeque}; +use thiserror::Error as ThisError; + +#[derive(ThisError, Clone, Debug, PartialEq, Eq)] +pub enum ScanError { + /// Unterminated string constant. + #[error("Unterminated string constant.")] + ExpectedQuote, + + /// Missing exponent. + #[error("Missing exponent following `{0}`")] + ExpectedExponent(String), + + /// Odd length hex string. + #[error("String of hex digits has {0} characters, which is not a multiple of 2.")] + OddLengthHexString(usize), + + /// Invalid hex digit. + #[error("Invalid hex digit {0:?}.")] + BadHexDigit(char), + + /// Incomplete UTF-8 sequence. + #[error("Incomplete UTF-8 sequence `{substring}` starting {offset} digits into hex string.")] + IncompleteUtf8 { substring: String, offset: usize }, + + /// Bad UTF-8 sequence. + #[error("Invalid UTF-8 sequence `{substring}` starting {offset} digits into hex string.")] + BadUtf8 { substring: String, offset: usize }, + + /// Invalid length Unicode string. + #[error("Unicode string contains {0} bytes, which is not in the valid range of 1 to 8 bytes.")] + BadLengthUnicodeString(usize), + + /// Invalid code point. + #[error("U+{0:04X} is not a valid Unicode code point.")] + BadCodePoint(u32), + + /// Expected hexadecimal Unicode code point + #[error("Expected hexadecimal Unicode code point.")] + ExpectedCodePoint, + + /// `DO REPEAT` nested too deeply. + #[error("`DO REPEAT` nested too deeply.")] + DoRepeatOverflow, + + /// Unexpected character. + #[error("Unexpected character {0:?} in input.")] + UnexpectedChar(char), +} + +/// The input or output to token merging. +#[derive(Clone, Debug, PartialEq)] +pub enum ScanToken { + Token(Token), + Error(ScanError), +} + +/// The result of merging tokens. +#[derive(Clone, Debug)] +pub enum MergeResult { + /// Copy one token literally from input to output. 
+ Copy, + + /// Expand `n` tokens from the input into `token` in the output. + Expand { + /// Number of tokens to expand. + n: usize, + + /// Replacement token. + token: Token, + }, +} + +impl ScanToken { + pub fn from_segment(s: &str, segment: Segment) -> Option { + match segment { + Segment::Number => Some(Self::Token(Token::Number(s.parse().unwrap()))), + Segment::QuotedString => { + // Trim quote mark from front and back. + let mut chars = s.chars(); + let quote = chars.next().unwrap(); + let s = chars.as_str().strip_suffix(quote).unwrap(); + + // Replace doubled quotes by single ones. + let (single_quote, double_quote) = match quote { + '\'' => ("'", "''"), + '"' => ("\"", "\"\""), + _ => unreachable!(), + }; + Some(Self::Token(Token::String( + s.replace(double_quote, single_quote), + ))) + } + Segment::HexString => { + // Strip `X"` prefix and `"` suffix (or variations). + let s = &s[2..s.len() - 1]; + for c in s.chars() { + if !c.is_ascii_hexdigit() { + return Some(Self::Error(ScanError::BadHexDigit(c))); + } + } + if s.len() % 2 != 0 { + return Some(Self::Error(ScanError::OddLengthHexString(s.len()))); + } + let bytes = s + .as_bytes() + .chunks_exact(2) + .map(|pair| { + let hi = char::from(pair[0]).to_digit(16).unwrap() as u8; + let lo = char::from(pair[1]).to_digit(16).unwrap() as u8; + hi * 16 + lo + }) + .collect::>(); + match String::from_utf8(bytes) { + Ok(string) => Some(Self::Token(Token::String(string))), + Err(error) => { + let details = error.utf8_error(); + let offset = details.valid_up_to() * 2; + let end = details + .error_len() + .map(|len| offset + len * 2) + .unwrap_or(s.len()); + let substring = String::from(&s[offset..end]); + Some(Self::Error(if details.error_len().is_some() { + ScanError::BadUtf8 { substring, offset } + } else { + ScanError::IncompleteUtf8 { substring, offset } + })) + } + } + } + Segment::UnicodeString => { + // Strip `U"` prefix and `"` suffix (or variations). 
+ let s = &s[2..s.len() - 1]; + if !(1..=8).contains(&s.len()) { + return Some(Self::Error(ScanError::BadLengthUnicodeString(s.len()))); + } + let Ok(code_point) = u32::from_str_radix(s, 16) else { + return Some(Self::Error(ScanError::ExpectedCodePoint)); + }; + let Some(c) = char::from_u32(code_point) else { + return Some(Self::Error(ScanError::BadCodePoint(code_point))); + }; + Some(Self::Token(Token::String(String::from(c)))) + } + + Segment::UnquotedString + | Segment::DoRepeatCommand + | Segment::InlineData + | Segment::Document + | Segment::MacroBody + | Segment::MacroName => Some(Self::Token(Token::String(String::from(s)))), + + Segment::Identifier => { + if let Ok(reserved_word) = ReservedWord::try_from(s) { + match reserved_word { + ReservedWord::And => Some(Self::Token(Token::Punct(Punct::And))), + ReservedWord::Or => Some(Self::Token(Token::Punct(Punct::Or))), + ReservedWord::Not => Some(Self::Token(Token::Punct(Punct::Not))), + ReservedWord::Eq => Some(Self::Token(Token::Punct(Punct::Eq))), + ReservedWord::Ge => Some(Self::Token(Token::Punct(Punct::Ge))), + ReservedWord::Gt => Some(Self::Token(Token::Punct(Punct::Gt))), + ReservedWord::Le => Some(Self::Token(Token::Punct(Punct::Le))), + ReservedWord::Lt => Some(Self::Token(Token::Punct(Punct::Lt))), + ReservedWord::Ne => Some(Self::Token(Token::Punct(Punct::Ne))), + ReservedWord::All => Some(Self::Token(Token::Punct(Punct::All))), + ReservedWord::By => Some(Self::Token(Token::Punct(Punct::By))), + ReservedWord::To => Some(Self::Token(Token::Punct(Punct::To))), + ReservedWord::With => Some(Self::Token(Token::Punct(Punct::With))), + } + } else { + Some(Self::Token(Token::Id(Identifier::new(s).unwrap()))) + } + } + Segment::Punct => match s { + "(" => Some(Self::Token(Token::Punct(Punct::LParen))), + ")" => Some(Self::Token(Token::Punct(Punct::RParen))), + "[" => Some(Self::Token(Token::Punct(Punct::LSquare))), + "]" => Some(Self::Token(Token::Punct(Punct::RSquare))), + "{" => 
Some(Self::Token(Token::Punct(Punct::LCurly))), + "}" => Some(Self::Token(Token::Punct(Punct::RCurly))), + "," => Some(Self::Token(Token::Punct(Punct::Comma))), + "=" => Some(Self::Token(Token::Punct(Punct::Equals))), + "-" => Some(Self::Token(Token::Punct(Punct::Dash))), + "&" => Some(Self::Token(Token::Punct(Punct::And))), + "|" => Some(Self::Token(Token::Punct(Punct::Or))), + "+" => Some(Self::Token(Token::Punct(Punct::Plus))), + "/" => Some(Self::Token(Token::Punct(Punct::Slash))), + "*" => Some(Self::Token(Token::Punct(Punct::Asterisk))), + "<" => Some(Self::Token(Token::Punct(Punct::Lt))), + ">" => Some(Self::Token(Token::Punct(Punct::Gt))), + "~" => Some(Self::Token(Token::Punct(Punct::Not))), + ":" => Some(Self::Token(Token::Punct(Punct::Colon))), + ";" => Some(Self::Token(Token::Punct(Punct::Semicolon))), + "**" => Some(Self::Token(Token::Punct(Punct::Exp))), + "<=" => Some(Self::Token(Token::Punct(Punct::Le))), + "<>" => Some(Self::Token(Token::Punct(Punct::Ne))), + "~=" => Some(Self::Token(Token::Punct(Punct::Ne))), + ">=" => Some(Self::Token(Token::Punct(Punct::Ge))), + "!" => Some(Self::Token(Token::Punct(Punct::Bang))), + "%" => Some(Self::Token(Token::Punct(Punct::Percent))), + "?" => Some(Self::Token(Token::Punct(Punct::Question))), + "`" => Some(Self::Token(Token::Punct(Punct::Backtick))), + "_" => Some(Self::Token(Token::Punct(Punct::Underscore))), + "." 
=> Some(Self::Token(Token::Punct(Punct::Dot))), + "!*" => Some(Self::Token(Token::Punct(Punct::BangAsterisk))), + _ => unreachable!("bad punctuator {s:?}"), + }, + Segment::Shbang + | Segment::Spaces + | Segment::Comment + | Segment::Newline + | Segment::CommentCommand => None, + Segment::DoRepeatOverflow => Some(Self::Error(ScanError::DoRepeatOverflow)), + Segment::StartDocument => { + Some(Self::Token(Token::Id(Identifier::new("DOCUMENT").unwrap()))) + } + Segment::StartCommand | Segment::SeparateCommands | Segment::EndCommand => { + Some(Self::Token(Token::EndCommand)) + } + Segment::End => Some(Self::Token(Token::End)), + Segment::ExpectedQuote => Some(Self::Error(ScanError::ExpectedQuote)), + Segment::ExpectedExponent => { + Some(Self::Error(ScanError::ExpectedExponent(String::from(s)))) + } + Segment::UnexpectedChar => Some(Self::Error(ScanError::UnexpectedChar( + s.chars().next().unwrap(), + ))), + } + } + + /// Attempts to merge a sequence of tokens together into a single token. The + /// tokens are taken from the beginning of `input`. If successful, removes one + /// or more token from the beginning of `input` and returnss the merged + /// token. More input tokens might be needed; if so, leaves `input` alone and + /// returns `None`. In the latter case, the caller should add more tokens to the + /// input ([Token::End] or [Token::Punct(Punct::EndCmd)] is always sufficient). + /// + /// This performs two different kinds of token merging: + /// + /// - String concatenation, where syntax like `"a" + "b"` is converted into a + /// single string token. This is definitely needed because the parser relies + /// on it. + /// + /// - Negative number merging, where syntax like `-5` is converted from a pair + /// of tokens (a dash and a positive number) into a single token (a negative + /// number). 
This might not be needed anymore because the segmenter + /// directly treats a dash followed by a number, with optional intervening + /// white space, as a negative number. It's only needed if we want + /// intervening comments to be allowed or for part of the negative number + /// token to be produced by macro expansion. + pub fn merge(tokens: &T) -> Option + where + T: Tokens, + { + match tokens.get(0)? { + Token::Punct(Punct::Dash) => match tokens.get(1)? { + Token::Number(number) if number.is_sign_positive() => { + let number = *number; + return Some(MergeResult::Expand { + n: 2, + token: Token::Number(-number), + }); + } + _ => Some(MergeResult::Copy), + }, + Token::String(_) => { + let mut i = 0; + while matches!(tokens.get(i * 2 + 1)?, Token::Punct(Punct::Plus)) + && matches!(tokens.get(i * 2 + 2)?, Token::String(_)) + { + i += 1; + } + if i == 0 { + Some(MergeResult::Copy) + } else { + let mut output = String::new(); + for i in 0..=i { + let Token::String(s) = tokens.get(i * 2).unwrap() else { + unreachable!() + }; + output.push_str(&s); + } + Some(MergeResult::Expand { + n: i * 2 + 1, + token: Token::String(output), + }) + } + } + _ => Some(MergeResult::Copy), + } + } +} + +pub trait Tokens { + fn get(&self, index: usize) -> Option<&Token>; +} + +impl Tokens for VecDeque +where + T: Borrow, +{ + fn get(&self, index: usize) -> Option<&Token> { + self.get(index).map(|token| token.borrow()) + } +} + +pub struct StringSegmenter<'a> { + input: &'a str, + segmenter: Segmenter, +} + +impl<'a> StringSegmenter<'a> { + pub fn new(input: &'a str, mode: Mode, is_snippet: bool) -> Self { + Self { + input, + segmenter: Segmenter::new(mode, is_snippet), + } + } +} + +impl<'a> Iterator for StringSegmenter<'a> { + type Item = (&'a str, ScanToken); + + fn next(&mut self) -> Option { + loop { + let (seg_len, seg_type) = self.segmenter.push(self.input, true).unwrap(); + if seg_type == Segment::End { + return None; + } + let (s, rest) = self.input.split_at(seg_len); + 
self.input = rest; + + if let Some(token) = ScanToken::from_segment(s, seg_type) { + return Some((s, token)); + } + } + } +} + +pub struct StringScanner<'a> { + input: &'a str, + segmenter: Segmenter, + tokens: VecDeque, +} + +impl<'a> StringScanner<'a> { + pub fn new(input: &'a str, mode: Mode, is_snippet: bool) -> Self { + Self { + input, + segmenter: Segmenter::new(mode, is_snippet), + tokens: VecDeque::with_capacity(1), + } + } + + fn merge(&mut self) -> Option { + let result = ScanToken::merge(&self.tokens)?; + match result { + MergeResult::Copy => Some(ScanToken::Token(self.tokens.pop_front().unwrap())), + MergeResult::Expand { n, token } => { + self.tokens.drain(..n); + Some(ScanToken::Token(token)) + } + } + } +} + +impl<'a> Iterator for StringScanner<'a> { + type Item = ScanToken; + + fn next(&mut self) -> Option { + if let Some(token) = self.merge() { + return Some(token); + } + loop { + let (seg_len, seg_type) = self.segmenter.push(self.input, true).unwrap(); + if seg_type == Segment::End && self.tokens.is_empty() { + return None; + } + let (s, rest) = self.input.split_at(seg_len); + self.input = rest; + + match ScanToken::from_segment(s, seg_type) { + Some(ScanToken::Error(error)) => return Some(ScanToken::Error(error)), + Some(ScanToken::Token(token)) => { + self.tokens.push_back(token); + if let Some(token) = self.merge() { + return Some(token); + } + } + None => (), + } + } + } +} + +#[cfg(test)] +mod test; diff --git a/rust/pspp/src/lex/scan/test.rs b/rust/pspp/src/lex/scan/test.rs new file mode 100644 index 0000000000..0ed9be6555 --- /dev/null +++ b/rust/pspp/src/lex/scan/test.rs @@ -0,0 +1,1017 @@ +use crate::{identifier::Identifier, lex::{ + segment::Mode, + token::{Punct, Token}, +}}; + +use super::{ScanError, ScanToken, StringScanner}; + +fn print_token(token: &Token) { + match token { + Token::End => print!("Token::End"), + Token::Id(s) => print!("Token::Id(String::from({s:?}))"), + Token::Number(number) => print!("Token::Number({number:?})"), 
+ Token::String(s) => print!("Token::String(String::from({s:?}))"), + Token::EndCommand => print!("Token::EndCommand"), + Token::Punct(punct) => print!("Token::Punct(Punct::{punct:?})"), + } +} + +fn check_scan(input: &str, mode: Mode, expected: &[ScanToken]) { + let tokens = StringScanner::new(input, mode, false).collect::>(); + + if &tokens != expected { + for token in &tokens { + match token { + ScanToken::Token(token) => { + print!("ScanToken::Token("); + print_token(token); + print!(")"); + } + ScanToken::Error(error) => print!("ScanToken::Error(ScanError::{error:?})"), + } + println!(","); + } + + eprintln!("tokens differ from expected:"); + let difference = diff::slice(expected, &tokens); + for result in difference { + match result { + diff::Result::Left(left) => eprintln!("-{left:?}"), + diff::Result::Both(left, _right) => eprintln!(" {left:?}"), + diff::Result::Right(right) => eprintln!("+{right:?}"), + } + } + panic!(); + } +} + +#[test] +fn test_identifiers() { + check_scan( + r#"a aB i5 $x @efg @@. !abcd !* !*a #.# .x _z. +abcd. abcd. +QRSTUV./* end of line comment */ +QrStUv./* end of line comment */ +WXYZ. /* unterminated end of line comment +�. 
/* U+FFFD is not valid in an identifier +"#, + Mode::Auto, + &[ + ScanToken::Token(Token::Id(Identifier::new("a").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("aB").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("i5").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("$x").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("@efg").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("@@.").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("!abcd").unwrap())), + ScanToken::Token(Token::Punct(Punct::BangAsterisk)), + ScanToken::Token(Token::Punct(Punct::BangAsterisk)), + ScanToken::Token(Token::Id(Identifier::new("a").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("#.#").unwrap())), + ScanToken::Token(Token::Punct(Punct::Dot)), + ScanToken::Token(Token::Id(Identifier::new("x").unwrap())), + ScanToken::Token(Token::Punct(Punct::Underscore)), + ScanToken::Token(Token::Id(Identifier::new("z").unwrap())), + ScanToken::Token(Token::EndCommand), + ScanToken::Token(Token::Id(Identifier::new("abcd.").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("abcd").unwrap())), + ScanToken::Token(Token::EndCommand), + ScanToken::Token(Token::Id(Identifier::new("QRSTUV").unwrap())), + ScanToken::Token(Token::EndCommand), + ScanToken::Token(Token::Id(Identifier::new("QrStUv").unwrap())), + ScanToken::Token(Token::EndCommand), + ScanToken::Token(Token::Id(Identifier::new("WXYZ").unwrap())), + ScanToken::Token(Token::EndCommand), + ScanToken::Error(ScanError::UnexpectedChar('�')), + ScanToken::Token(Token::EndCommand), + ], + ); +} + +#[test] +fn test_reserved_words() { + check_scan( + r#"and or not eq ge gt le lt ne all by to with +AND OR NOT EQ GE GT LE LT NE ALL BY TO WITH +andx orx notx eqx gex gtx lex ltx nex allx byx tox withx +and. with. 
+"#, + Mode::Auto, + &[ + ScanToken::Token(Token::Punct(Punct::And)), + ScanToken::Token(Token::Punct(Punct::Or)), + ScanToken::Token(Token::Punct(Punct::Not)), + ScanToken::Token(Token::Punct(Punct::Eq)), + ScanToken::Token(Token::Punct(Punct::Ge)), + ScanToken::Token(Token::Punct(Punct::Gt)), + ScanToken::Token(Token::Punct(Punct::Le)), + ScanToken::Token(Token::Punct(Punct::Lt)), + ScanToken::Token(Token::Punct(Punct::Ne)), + ScanToken::Token(Token::Punct(Punct::All)), + ScanToken::Token(Token::Punct(Punct::By)), + ScanToken::Token(Token::Punct(Punct::To)), + ScanToken::Token(Token::Punct(Punct::With)), + ScanToken::Token(Token::Punct(Punct::And)), + ScanToken::Token(Token::Punct(Punct::Or)), + ScanToken::Token(Token::Punct(Punct::Not)), + ScanToken::Token(Token::Punct(Punct::Eq)), + ScanToken::Token(Token::Punct(Punct::Ge)), + ScanToken::Token(Token::Punct(Punct::Gt)), + ScanToken::Token(Token::Punct(Punct::Le)), + ScanToken::Token(Token::Punct(Punct::Lt)), + ScanToken::Token(Token::Punct(Punct::Ne)), + ScanToken::Token(Token::Punct(Punct::All)), + ScanToken::Token(Token::Punct(Punct::By)), + ScanToken::Token(Token::Punct(Punct::To)), + ScanToken::Token(Token::Punct(Punct::With)), + ScanToken::Token(Token::Id(Identifier::new("andx").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("orx").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("notx").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("eqx").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("gex").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("gtx").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("lex").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("ltx").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("nex").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("allx").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("byx").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("tox").unwrap())), + 
ScanToken::Token(Token::Id(Identifier::new("withx").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("and.").unwrap())), + ScanToken::Token(Token::Punct(Punct::With)), + ScanToken::Token(Token::EndCommand), + ], + ); +} + +#[test] +fn test_punctuation() { + check_scan( + r#"~ & | = >= > <= < ~= <> ( ) , - + * / [ ] ** +~&|=>=><=<~=<>(),-+*/[]** +% : ; ? _ ` { } ~ +"#, + Mode::Auto, + &[ + ScanToken::Token(Token::Punct(Punct::Not)), + ScanToken::Token(Token::Punct(Punct::And)), + ScanToken::Token(Token::Punct(Punct::Or)), + ScanToken::Token(Token::Punct(Punct::Equals)), + ScanToken::Token(Token::Punct(Punct::Ge)), + ScanToken::Token(Token::Punct(Punct::Gt)), + ScanToken::Token(Token::Punct(Punct::Le)), + ScanToken::Token(Token::Punct(Punct::Lt)), + ScanToken::Token(Token::Punct(Punct::Ne)), + ScanToken::Token(Token::Punct(Punct::Ne)), + ScanToken::Token(Token::Punct(Punct::LParen)), + ScanToken::Token(Token::Punct(Punct::RParen)), + ScanToken::Token(Token::Punct(Punct::Comma)), + ScanToken::Token(Token::Punct(Punct::Dash)), + ScanToken::Token(Token::Punct(Punct::Plus)), + ScanToken::Token(Token::Punct(Punct::Asterisk)), + ScanToken::Token(Token::Punct(Punct::Slash)), + ScanToken::Token(Token::Punct(Punct::LSquare)), + ScanToken::Token(Token::Punct(Punct::RSquare)), + ScanToken::Token(Token::Punct(Punct::Exp)), + ScanToken::Token(Token::Punct(Punct::Not)), + ScanToken::Token(Token::Punct(Punct::And)), + ScanToken::Token(Token::Punct(Punct::Or)), + ScanToken::Token(Token::Punct(Punct::Equals)), + ScanToken::Token(Token::Punct(Punct::Ge)), + ScanToken::Token(Token::Punct(Punct::Gt)), + ScanToken::Token(Token::Punct(Punct::Le)), + ScanToken::Token(Token::Punct(Punct::Lt)), + ScanToken::Token(Token::Punct(Punct::Ne)), + ScanToken::Token(Token::Punct(Punct::Ne)), + ScanToken::Token(Token::Punct(Punct::LParen)), + ScanToken::Token(Token::Punct(Punct::RParen)), + ScanToken::Token(Token::Punct(Punct::Comma)), + ScanToken::Token(Token::Punct(Punct::Dash)), + 
ScanToken::Token(Token::Punct(Punct::Plus)), + ScanToken::Token(Token::Punct(Punct::Asterisk)), + ScanToken::Token(Token::Punct(Punct::Slash)), + ScanToken::Token(Token::Punct(Punct::LSquare)), + ScanToken::Token(Token::Punct(Punct::RSquare)), + ScanToken::Token(Token::Punct(Punct::Exp)), + ScanToken::Token(Token::Punct(Punct::Percent)), + ScanToken::Token(Token::Punct(Punct::Colon)), + ScanToken::Token(Token::Punct(Punct::Semicolon)), + ScanToken::Token(Token::Punct(Punct::Question)), + ScanToken::Token(Token::Punct(Punct::Underscore)), + ScanToken::Token(Token::Punct(Punct::Backtick)), + ScanToken::Token(Token::Punct(Punct::LCurly)), + ScanToken::Token(Token::Punct(Punct::RCurly)), + ScanToken::Token(Token::Punct(Punct::Not)), + ], + ); +} + +#[test] +fn test_positive_numbers() { + check_scan( + r#"0 1 01 001. 1. +123. /* comment 1 */ /* comment 2 */ +.1 0.1 00.1 00.10 +5e1 6E-1 7e+1 6E+01 6e-03 +.3E1 .4e-1 .5E+1 .6e+01 .7E-03 +1.23e1 45.6E-1 78.9e+1 99.9E+01 11.2e-03 +. 1e e1 1e+ 1e- +"#, + Mode::Auto, + &[ + ScanToken::Token(Token::Number(0.0)), + ScanToken::Token(Token::Number(1.0)), + ScanToken::Token(Token::Number(1.0)), + ScanToken::Token(Token::Number(1.0)), + ScanToken::Token(Token::Number(1.0)), + ScanToken::Token(Token::EndCommand), + ScanToken::Token(Token::Number(123.0)), + ScanToken::Token(Token::EndCommand), + ScanToken::Token(Token::EndCommand), + ScanToken::Token(Token::Number(1.0)), + ScanToken::Token(Token::Number(0.1)), + ScanToken::Token(Token::Number(0.1)), + ScanToken::Token(Token::Number(0.1)), + ScanToken::Token(Token::Number(50.0)), + ScanToken::Token(Token::Number(0.6)), + ScanToken::Token(Token::Number(70.0)), + ScanToken::Token(Token::Number(60.0)), + ScanToken::Token(Token::Number(0.006)), + ScanToken::Token(Token::EndCommand), + ScanToken::Token(Token::Number(30.0)), + ScanToken::Token(Token::Number(0.04)), + ScanToken::Token(Token::Number(5.0)), + ScanToken::Token(Token::Number(6.0)), + ScanToken::Token(Token::Number(0.0007)), + 
ScanToken::Token(Token::Number(12.3)), + ScanToken::Token(Token::Number(4.56)), + ScanToken::Token(Token::Number(789.0)), + ScanToken::Token(Token::Number(999.0)), + ScanToken::Token(Token::Number(0.0112)), + ScanToken::Token(Token::EndCommand), + ScanToken::Error(ScanError::ExpectedExponent(String::from("1e"))), + ScanToken::Token(Token::Id(Identifier::new("e1").unwrap())), + ScanToken::Error(ScanError::ExpectedExponent(String::from("1e+"))), + ScanToken::Error(ScanError::ExpectedExponent(String::from("1e-"))), + ], + ); +} + +#[test] +fn test_negative_numbers() { + check_scan( + r#" -0 -1 -01 -001. -1. + -123. /* comment 1 */ /* comment 2 */ + -.1 -0.1 -00.1 -00.10 + -5e1 -6E-1 -7e+1 -6E+01 -6e-03 + -.3E1 -.4e-1 -.5E+1 -.6e+01 -.7E-03 + -1.23e1 -45.6E-1 -78.9e+1 -99.9E+01 -11.2e-03 + -/**/1 + -. -1e -e1 -1e+ -1e- -1. +"#, + Mode::Auto, + &[ + ScanToken::Token(Token::Number(-0.0)), + ScanToken::Token(Token::Number(-1.0)), + ScanToken::Token(Token::Number(-1.0)), + ScanToken::Token(Token::Number(-1.0)), + ScanToken::Token(Token::Number(-1.0)), + ScanToken::Token(Token::EndCommand), + ScanToken::Token(Token::Number(-123.0)), + ScanToken::Token(Token::EndCommand), + ScanToken::Token(Token::Number(-0.1)), + ScanToken::Token(Token::Number(-0.1)), + ScanToken::Token(Token::Number(-0.1)), + ScanToken::Token(Token::Number(-0.1)), + ScanToken::Token(Token::Number(-50.0)), + ScanToken::Token(Token::Number(-0.6)), + ScanToken::Token(Token::Number(-70.0)), + ScanToken::Token(Token::Number(-60.0)), + ScanToken::Token(Token::Number(-0.006)), + ScanToken::Token(Token::Number(-3.0)), + ScanToken::Token(Token::Number(-0.04)), + ScanToken::Token(Token::Number(-5.0)), + ScanToken::Token(Token::Number(-6.0)), + ScanToken::Token(Token::Number(-0.0007)), + ScanToken::Token(Token::Number(-12.3)), + ScanToken::Token(Token::Number(-4.56)), + ScanToken::Token(Token::Number(-789.0)), + ScanToken::Token(Token::Number(-999.0)), + ScanToken::Token(Token::Number(-0.0112)), + 
ScanToken::Token(Token::Number(-1.0)), + ScanToken::Token(Token::Punct(Punct::Dash)), + ScanToken::Token(Token::Punct(Punct::Dot)), + ScanToken::Error(ScanError::ExpectedExponent(String::from("-1e"))), + ScanToken::Token(Token::Punct(Punct::Dash)), + ScanToken::Token(Token::Id(Identifier::new("e1").unwrap())), + ScanToken::Error(ScanError::ExpectedExponent(String::from("-1e+"))), + ScanToken::Error(ScanError::ExpectedExponent(String::from("-1e-"))), + ScanToken::Token(Token::Number(-1.0)), + ScanToken::Token(Token::EndCommand), + ], + ); +} + +#[test] +fn test_strings() { + check_scan( + r#"'x' "y" 'abc' +'Don''t' "Can't" 'Won''t' +"""quoted""" '"quoted"' +'' "" '''' """" +'missing end quote +"missing double quote +'x' + "y" ++ 'z' + +'a' /* abc */ + "b" /* ++ 'c' +/* */"d"/* */+'e' +'foo' ++ /* special case: + in column 0 would ordinarily start a new command +'bar' +'foo' + + +'bar' +'foo' ++ + +'bar' + ++ +x"4142"+'5152' +"4142"+ +x'5152' +x"4142" ++u'304a' +"�あいうえお" +"abc"+U"FFFD"+u'3048'+"xyz" +"#, + Mode::Auto, + &[ + ScanToken::Token(Token::String(String::from("x"))), + ScanToken::Token(Token::String(String::from("y"))), + ScanToken::Token(Token::String(String::from("abc"))), + ScanToken::Token(Token::String(String::from("Don't"))), + ScanToken::Token(Token::String(String::from("Can't"))), + ScanToken::Token(Token::String(String::from("Won't"))), + ScanToken::Token(Token::String(String::from("\"quoted\""))), + ScanToken::Token(Token::String(String::from("\"quoted\""))), + ScanToken::Token(Token::String(String::from(""))), + ScanToken::Token(Token::String(String::from(""))), + ScanToken::Token(Token::String(String::from("'"))), + ScanToken::Token(Token::String(String::from("\""))), + ScanToken::Error(ScanError::ExpectedQuote), + ScanToken::Error(ScanError::ExpectedQuote), + ScanToken::Token(Token::String(String::from("xyzabcde"))), + ScanToken::Token(Token::String(String::from("foobar"))), + ScanToken::Token(Token::String(String::from("foobar"))), + 
ScanToken::Token(Token::String(String::from("foo"))), + ScanToken::Token(Token::Punct(Punct::Plus)), + ScanToken::Token(Token::EndCommand), + ScanToken::Token(Token::String(String::from("bar"))), + ScanToken::Token(Token::EndCommand), + ScanToken::Token(Token::Punct(Punct::Plus)), + ScanToken::Token(Token::String(String::from("AB5152"))), + ScanToken::Token(Token::String(String::from("4142QR"))), + ScanToken::Token(Token::String(String::from("ABお"))), + ScanToken::Token(Token::String(String::from("�あいうえお"))), + ScanToken::Token(Token::String(String::from("abc�えxyz"))), + ScanToken::Token(Token::End), + ], + ); +} + +#[test] +fn test_shbang() { + check_scan( + r#"#! /usr/bin/pspp +#! /usr/bin/pspp +"#, + Mode::Auto, + &[ + ScanToken::Token(Token::Id(Identifier::new("#").unwrap())), + ScanToken::Token(Token::Punct(Punct::Bang)), + ScanToken::Token(Token::Punct(Punct::Slash)), + ScanToken::Token(Token::Id(Identifier::new("usr").unwrap())), + ScanToken::Token(Token::Punct(Punct::Slash)), + ScanToken::Token(Token::Id(Identifier::new("bin").unwrap())), + ScanToken::Token(Token::Punct(Punct::Slash)), + ScanToken::Token(Token::Id(Identifier::new("pspp").unwrap())), + ], + ); +} + +#[test] +fn test_comments() { + check_scan( + r#"* Comment commands "don't +have to contain valid tokens. + +** Check ambiguity with ** token. +****************. + +comment keyword works too. +COMM also. +com is ambiguous with COMPUTE. + + * Comment need not start at left margin. + +* Comment ends with blank line + +next command. 
+ +"#, + Mode::Auto, + &[ + ScanToken::Token(Token::EndCommand), + ScanToken::Token(Token::EndCommand), + ScanToken::Token(Token::EndCommand), + ScanToken::Token(Token::EndCommand), + ScanToken::Token(Token::EndCommand), + ScanToken::Token(Token::EndCommand), + ScanToken::Token(Token::EndCommand), + ScanToken::Token(Token::Id(Identifier::new("com").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("is").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("ambiguous").unwrap())), + ScanToken::Token(Token::Punct(Punct::With)), + ScanToken::Token(Token::Id(Identifier::new("COMPUTE").unwrap())), + ScanToken::Token(Token::EndCommand), + ScanToken::Token(Token::EndCommand), + ScanToken::Token(Token::EndCommand), + ScanToken::Token(Token::EndCommand), + ScanToken::Token(Token::EndCommand), + ScanToken::Token(Token::Id(Identifier::new("next").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("command").unwrap())), + ScanToken::Token(Token::EndCommand), + ScanToken::Token(Token::EndCommand), + ], + ); +} + +#[test] +fn test_document() { + check_scan( + r#"DOCUMENT one line. +DOC more + than + one + line. +docu +first.paragraph +isn't parsed as tokens + +second paragraph. 
+"#, + Mode::Auto, + &[ + ScanToken::Token(Token::Id(Identifier::new("DOCUMENT").unwrap())), + ScanToken::Token(Token::String(String::from("DOCUMENT one line."))), + ScanToken::Token(Token::EndCommand), + ScanToken::Token(Token::EndCommand), + ScanToken::Token(Token::Id(Identifier::new("DOCUMENT").unwrap())), + ScanToken::Token(Token::String(String::from("DOC more"))), + ScanToken::Token(Token::String(String::from(" than"))), + ScanToken::Token(Token::String(String::from(" one"))), + ScanToken::Token(Token::String(String::from(" line."))), + ScanToken::Token(Token::EndCommand), + ScanToken::Token(Token::EndCommand), + ScanToken::Token(Token::Id(Identifier::new("DOCUMENT").unwrap())), + ScanToken::Token(Token::String(String::from("docu"))), + ScanToken::Token(Token::String(String::from("first.paragraph"))), + ScanToken::Token(Token::String(String::from("isn't parsed as tokens"))), + ScanToken::Token(Token::String(String::from(""))), + ScanToken::Token(Token::String(String::from("second paragraph."))), + ScanToken::Token(Token::EndCommand), + ScanToken::Token(Token::EndCommand), + ], + ); +} + +#[test] +fn test_file_label() { + check_scan( + r#"FIL label isn't quoted. +FILE + lab 'is quoted'. 
+FILE /* +/**/ lab not quoted here either + +"#, + Mode::Auto, + &[ + ScanToken::Token(Token::Id(Identifier::new("FIL").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("label").unwrap())), + ScanToken::Token(Token::String(String::from("isn't quoted"))), + ScanToken::Token(Token::EndCommand), + ScanToken::Token(Token::Id(Identifier::new("FILE").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("lab").unwrap())), + ScanToken::Token(Token::String(String::from("is quoted"))), + ScanToken::Token(Token::EndCommand), + ScanToken::Token(Token::Id(Identifier::new("FILE").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("lab").unwrap())), + ScanToken::Token(Token::String(String::from("not quoted here either"))), + ScanToken::Token(Token::EndCommand), + ], + ); +} + +#[test] +fn test_begin_data() { + check_scan( + r#"begin data. +123 +xxx +end data. + +BEG /**/ DAT /* +5 6 7 /* x + +end data +end data +. +"#, + Mode::Auto, + &[ + ScanToken::Token(Token::Id(Identifier::new("begin").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("data").unwrap())), + ScanToken::Token(Token::EndCommand), + ScanToken::Token(Token::String(String::from("123"))), + ScanToken::Token(Token::String(String::from("xxx"))), + ScanToken::Token(Token::Id(Identifier::new("end").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("data").unwrap())), + ScanToken::Token(Token::EndCommand), + ScanToken::Token(Token::EndCommand), + ScanToken::Token(Token::Id(Identifier::new("BEG").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("DAT").unwrap())), + ScanToken::Token(Token::String(String::from("5 6 7 /* x"))), + ScanToken::Token(Token::String(String::from(""))), + ScanToken::Token(Token::String(String::from("end data"))), + ScanToken::Token(Token::Id(Identifier::new("end").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("data").unwrap())), + ScanToken::Token(Token::EndCommand), + ], + ); +} + +#[test] +fn test_do_repeat() { + check_scan( + r#"do repeat x=a b 
c + y=d e f. + do repeat a=1 thru 5. +another command. +second command ++ third command. +end /* x */ /* y */ repeat print. +end + repeat. +"#, + Mode::Auto, + &[ + ScanToken::Token(Token::Id(Identifier::new("do").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("repeat").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("x").unwrap())), + ScanToken::Token(Token::Punct(Punct::Equals)), + ScanToken::Token(Token::Id(Identifier::new("a").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("b").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("c").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("y").unwrap())), + ScanToken::Token(Token::Punct(Punct::Equals)), + ScanToken::Token(Token::Id(Identifier::new("d").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("e").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("f").unwrap())), + ScanToken::Token(Token::EndCommand), + ScanToken::Token(Token::String(String::from(" do repeat a=1 thru 5."))), + ScanToken::Token(Token::String(String::from("another command."))), + ScanToken::Token(Token::String(String::from("second command"))), + ScanToken::Token(Token::String(String::from("+ third command."))), + ScanToken::Token(Token::String(String::from( + "end /* x */ /* y */ repeat print.", + ))), + ScanToken::Token(Token::Id(Identifier::new("end").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("repeat").unwrap())), + ScanToken::Token(Token::EndCommand), + ], + ); +} + +#[test] +fn test_do_repeat_batch() { + check_scan( + r#"do repeat x=a b c + y=d e f +do repeat a=1 thru 5 +another command +second command ++ third command +end /* x */ /* y */ repeat print +end + repeat +do + repeat #a=1 + + inner command +end repeat +"#, + Mode::Batch, + &[ + ScanToken::Token(Token::Id(Identifier::new("do").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("repeat").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("x").unwrap())), + 
ScanToken::Token(Token::Punct(Punct::Equals)), + ScanToken::Token(Token::Id(Identifier::new("a").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("b").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("c").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("y").unwrap())), + ScanToken::Token(Token::Punct(Punct::Equals)), + ScanToken::Token(Token::Id(Identifier::new("d").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("e").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("f").unwrap())), + ScanToken::Token(Token::EndCommand), + ScanToken::Token(Token::String(String::from("do repeat a=1 thru 5"))), + ScanToken::Token(Token::String(String::from("another command"))), + ScanToken::Token(Token::String(String::from("second command"))), + ScanToken::Token(Token::String(String::from("+ third command"))), + ScanToken::Token(Token::String(String::from( + "end /* x */ /* y */ repeat print", + ))), + ScanToken::Token(Token::Id(Identifier::new("end").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("repeat").unwrap())), + ScanToken::Token(Token::EndCommand), + ScanToken::Token(Token::Id(Identifier::new("do").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("repeat").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("#a").unwrap())), + ScanToken::Token(Token::Punct(Punct::Equals)), + ScanToken::Token(Token::Number(1.0)), + ScanToken::Token(Token::EndCommand), + ScanToken::Token(Token::String(String::from(" inner command"))), + ScanToken::Token(Token::Id(Identifier::new("end").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("repeat").unwrap())), + ], + ); +} + +#[test] +fn test_batch_mode() { + check_scan( + r#"first command + another line of first command ++ second command +third command + +fourth command. + fifth command. 
+"#, + Mode::Batch, + &[ + ScanToken::Token(Token::Id(Identifier::new("first").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("command").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("another").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("line").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("of").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("first").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("command").unwrap())), + ScanToken::Token(Token::EndCommand), + ScanToken::Token(Token::Id(Identifier::new("second").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("command").unwrap())), + ScanToken::Token(Token::EndCommand), + ScanToken::Token(Token::Id(Identifier::new("third").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("command").unwrap())), + ScanToken::Token(Token::EndCommand), + ScanToken::Token(Token::Id(Identifier::new("fourth").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("command").unwrap())), + ScanToken::Token(Token::EndCommand), + ScanToken::Token(Token::Id(Identifier::new("fifth").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("command").unwrap())), + ScanToken::Token(Token::EndCommand), + ], + ); +} + +mod define { + use crate::{identifier::Identifier, lex::{ + scan::ScanToken, + segment::Mode, + token::{Punct, Token}, + }}; + + use super::check_scan; + + #[test] + fn test_simple() { + check_scan( + r#"define !macro1() +var1 var2 var3 +!enddefine. 
+"#, + Mode::Auto, + &[ + ScanToken::Token(Token::Id(Identifier::new("define").unwrap())), + ScanToken::Token(Token::String(String::from("!macro1"))), + ScanToken::Token(Token::Punct(Punct::LParen)), + ScanToken::Token(Token::Punct(Punct::RParen)), + ScanToken::Token(Token::String(String::from("var1 var2 var3"))), + ScanToken::Token(Token::Id(Identifier::new("!enddefine").unwrap())), + ScanToken::Token(Token::EndCommand), + ], + ); + } + + #[test] + fn test_no_newline_after_parentheses() { + check_scan( + r#"define !macro1() var1 var2 var3 +!enddefine. +"#, + Mode::Auto, + &[ + ScanToken::Token(Token::Id(Identifier::new("define").unwrap())), + ScanToken::Token(Token::String(String::from("!macro1"))), + ScanToken::Token(Token::Punct(Punct::LParen)), + ScanToken::Token(Token::Punct(Punct::RParen)), + ScanToken::Token(Token::String(String::from(" var1 var2 var3"))), + ScanToken::Token(Token::Id(Identifier::new("!enddefine").unwrap())), + ScanToken::Token(Token::EndCommand), + ], + ); + } + + #[test] + fn test_no_newline_before_enddefine() { + check_scan( + r#"define !macro1() +var1 var2 var3!enddefine. +"#, + Mode::Auto, + &[ + ScanToken::Token(Token::Id(Identifier::new("define").unwrap())), + ScanToken::Token(Token::String(String::from("!macro1"))), + ScanToken::Token(Token::Punct(Punct::LParen)), + ScanToken::Token(Token::Punct(Punct::RParen)), + ScanToken::Token(Token::String(String::from("var1 var2 var3"))), + ScanToken::Token(Token::Id(Identifier::new("!enddefine").unwrap())), + ScanToken::Token(Token::EndCommand), + ], + ); + } + + #[test] + fn test_all_on_one_line() { + check_scan( + r#"define !macro1()var1 var2 var3!enddefine. 
+"#, + Mode::Auto, + &[ + ScanToken::Token(Token::Id(Identifier::new("define").unwrap())), + ScanToken::Token(Token::String(String::from("!macro1"))), + ScanToken::Token(Token::Punct(Punct::LParen)), + ScanToken::Token(Token::Punct(Punct::RParen)), + ScanToken::Token(Token::String(String::from("var1 var2 var3"))), + ScanToken::Token(Token::Id(Identifier::new("!enddefine").unwrap())), + ScanToken::Token(Token::EndCommand), + ], + ); + } + + #[test] + fn test_empty() { + check_scan( + r#"define !macro1() +!enddefine. +"#, + Mode::Auto, + &[ + ScanToken::Token(Token::Id(Identifier::new("define").unwrap())), + ScanToken::Token(Token::String(String::from("!macro1"))), + ScanToken::Token(Token::Punct(Punct::LParen)), + ScanToken::Token(Token::Punct(Punct::RParen)), + ScanToken::Token(Token::Id(Identifier::new("!enddefine").unwrap())), + ScanToken::Token(Token::EndCommand), + ], + ); + } + + #[test] + fn test_blank_lines() { + check_scan( + r#"define !macro1() + + +!enddefine. +"#, + Mode::Auto, + &[ + ScanToken::Token(Token::Id(Identifier::new("define").unwrap())), + ScanToken::Token(Token::String(String::from("!macro1"))), + ScanToken::Token(Token::Punct(Punct::LParen)), + ScanToken::Token(Token::Punct(Punct::RParen)), + ScanToken::Token(Token::String(String::from(""))), + ScanToken::Token(Token::String(String::from(""))), + ScanToken::Token(Token::Id(Identifier::new("!enddefine").unwrap())), + ScanToken::Token(Token::EndCommand), + ], + ); + } + + #[test] + fn test_arguments() { + check_scan( + r#"define !macro1(a(), b(), c()) +!enddefine. 
+"#, + Mode::Auto, + &[ + ScanToken::Token(Token::Id(Identifier::new("define").unwrap())), + ScanToken::Token(Token::String(String::from("!macro1"))), + ScanToken::Token(Token::Punct(Punct::LParen)), + ScanToken::Token(Token::Id(Identifier::new("a").unwrap())), + ScanToken::Token(Token::Punct(Punct::LParen)), + ScanToken::Token(Token::Punct(Punct::RParen)), + ScanToken::Token(Token::Punct(Punct::Comma)), + ScanToken::Token(Token::Id(Identifier::new("b").unwrap())), + ScanToken::Token(Token::Punct(Punct::LParen)), + ScanToken::Token(Token::Punct(Punct::RParen)), + ScanToken::Token(Token::Punct(Punct::Comma)), + ScanToken::Token(Token::Id(Identifier::new("c").unwrap())), + ScanToken::Token(Token::Punct(Punct::LParen)), + ScanToken::Token(Token::Punct(Punct::RParen)), + ScanToken::Token(Token::Punct(Punct::RParen)), + ScanToken::Token(Token::Id(Identifier::new("!enddefine").unwrap())), + ScanToken::Token(Token::EndCommand), + ], + ); + } + + #[test] + fn test_multiline_arguments() { + check_scan( + r#"define !macro1( + a(), b( + ), + c() +) +!enddefine. 
+"#, + Mode::Auto, + &[ + ScanToken::Token(Token::Id(Identifier::new("define").unwrap())), + ScanToken::Token(Token::String(String::from("!macro1"))), + ScanToken::Token(Token::Punct(Punct::LParen)), + ScanToken::Token(Token::Id(Identifier::new("a").unwrap())), + ScanToken::Token(Token::Punct(Punct::LParen)), + ScanToken::Token(Token::Punct(Punct::RParen)), + ScanToken::Token(Token::Punct(Punct::Comma)), + ScanToken::Token(Token::Id(Identifier::new("b").unwrap())), + ScanToken::Token(Token::Punct(Punct::LParen)), + ScanToken::Token(Token::Punct(Punct::RParen)), + ScanToken::Token(Token::Punct(Punct::Comma)), + ScanToken::Token(Token::Id(Identifier::new("c").unwrap())), + ScanToken::Token(Token::Punct(Punct::LParen)), + ScanToken::Token(Token::Punct(Punct::RParen)), + ScanToken::Token(Token::Punct(Punct::RParen)), + ScanToken::Token(Token::Id(Identifier::new("!enddefine").unwrap())), + ScanToken::Token(Token::EndCommand), + ], + ); + } + + #[test] + fn test_arguments_start_on_second_line() { + check_scan( + r#"define !macro1 +(x,y,z +) +content 1 +content 2 +!enddefine. +"#, + Mode::Auto, + &[ + ScanToken::Token(Token::Id(Identifier::new("define").unwrap())), + ScanToken::Token(Token::String(String::from("!macro1"))), + ScanToken::Token(Token::Punct(Punct::LParen)), + ScanToken::Token(Token::Id(Identifier::new("x").unwrap())), + ScanToken::Token(Token::Punct(Punct::Comma)), + ScanToken::Token(Token::Id(Identifier::new("y").unwrap())), + ScanToken::Token(Token::Punct(Punct::Comma)), + ScanToken::Token(Token::Id(Identifier::new("z").unwrap())), + ScanToken::Token(Token::Punct(Punct::RParen)), + ScanToken::Token(Token::String(String::from("content 1"))), + ScanToken::Token(Token::String(String::from("content 2"))), + ScanToken::Token(Token::Id(Identifier::new("!enddefine").unwrap())), + ScanToken::Token(Token::EndCommand), + ], + ); + } + + #[test] + fn test_early_end_of_command_1() { + check_scan( + r#"define !macro1. +data list /x 1. 
+"#, + Mode::Auto, + &[ + ScanToken::Token(Token::Id(Identifier::new("define").unwrap())), + ScanToken::Token(Token::String(String::from("!macro1"))), + ScanToken::Token(Token::EndCommand), + ScanToken::Token(Token::Id(Identifier::new("data").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("list").unwrap())), + ScanToken::Token(Token::Punct(Punct::Slash)), + ScanToken::Token(Token::Id(Identifier::new("x").unwrap())), + ScanToken::Token(Token::Number(1.0)), + ScanToken::Token(Token::EndCommand), + ], + ); + } + + #[test] + fn test_early_end_of_command_2() { + check_scan( + r#"define !macro1 +x. +data list /x 1. +"#, + Mode::Auto, + &[ + ScanToken::Token(Token::Id(Identifier::new("define").unwrap())), + ScanToken::Token(Token::String(String::from("!macro1"))), + ScanToken::Token(Token::Id(Identifier::new("x").unwrap())), + ScanToken::Token(Token::EndCommand), + ScanToken::Token(Token::Id(Identifier::new("data").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("list").unwrap())), + ScanToken::Token(Token::Punct(Punct::Slash)), + ScanToken::Token(Token::Id(Identifier::new("x").unwrap())), + ScanToken::Token(Token::Number(1.0)), + ScanToken::Token(Token::EndCommand), + ], + ); + } + + #[test] + fn test_early_end_of_command_3() { + check_scan( + r#"define !macro1(. +x. +data list /x 1. 
+"#, + Mode::Auto, + &[ + ScanToken::Token(Token::Id(Identifier::new("define").unwrap())), + ScanToken::Token(Token::String(String::from("!macro1"))), + ScanToken::Token(Token::Punct(Punct::LParen)), + ScanToken::Token(Token::EndCommand), + ScanToken::Token(Token::Id(Identifier::new("x").unwrap())), + ScanToken::Token(Token::EndCommand), + ScanToken::Token(Token::Id(Identifier::new("data").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("list").unwrap())), + ScanToken::Token(Token::Punct(Punct::Slash)), + ScanToken::Token(Token::Id(Identifier::new("x").unwrap())), + ScanToken::Token(Token::Number(1.0)), + ScanToken::Token(Token::EndCommand), + ], + ); + } + + #[test] + fn test_early_end_of_command_4() { + // Notice the command terminator at the end of the DEFINE command, + // which should not be there and ends it early. + check_scan( + r#"define !macro1. +data list /x 1. +"#, + Mode::Auto, + &[ + ScanToken::Token(Token::Id(Identifier::new("define").unwrap())), + ScanToken::Token(Token::String(String::from("!macro1"))), + ScanToken::Token(Token::EndCommand), + ScanToken::Token(Token::Id(Identifier::new("data").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("list").unwrap())), + ScanToken::Token(Token::Punct(Punct::Slash)), + ScanToken::Token(Token::Id(Identifier::new("x").unwrap())), + ScanToken::Token(Token::Number(1.0)), + ScanToken::Token(Token::EndCommand), + ], + ); + } + + #[test] + fn test_missing_enddefine() { + check_scan( + r#"define !macro1() +content line 1 +content line 2 +"#, + Mode::Auto, + &[ + ScanToken::Token(Token::Id(Identifier::new("define").unwrap())), + ScanToken::Token(Token::String(String::from("!macro1"))), + ScanToken::Token(Token::Punct(Punct::LParen)), + ScanToken::Token(Token::Punct(Punct::RParen)), + ScanToken::Token(Token::String(String::from("content line 1"))), + ScanToken::Token(Token::String(String::from("content line 2"))), + ScanToken::Token(Token::End), + ], + ); + } +} diff --git 
a/rust/pspp/src/lex/segment/mod.rs b/rust/pspp/src/lex/segment/mod.rs new file mode 100644 index 0000000000..befe5b0c53 --- /dev/null +++ b/rust/pspp/src/lex/segment/mod.rs @@ -0,0 +1,1334 @@ +//! Syntax segmentation. +//! +//! PSPP divides traditional "lexical analysis" or "tokenization" into two +//! phases: a lower-level phase called "segmentation" and a higher-level phase +//! called "scanning". This module implements the segmentation phase. +//! [`super::scan`] contains declarations for the scanning phase. +//! +//! Segmentation accepts a stream of UTF-8 bytes as input. It outputs a label +//! (a segment type) for each byte or contiguous sequence of bytes in the input. +//! It also, in a few corner cases, outputs zero-width segments that label the +//! boundary between a pair of bytes in the input. +//! +//! Some segment types correspond directly to tokens; for example, an +//! "identifier" segment (SEG_IDENTIFIER) becomes an identifier token (T_ID) +//! later in lexical analysis. Other segments contribute to tokens but do not +//! correspond directly; for example, multiple quoted string segments +//! (SEG_QUOTED_STRING) separated by spaces (SEG_SPACES) and "+" punctuators +//! (SEG_PUNCT) may be combined to form a single string token (T_STRING). Still +//! other segments are ignored (e.g. SEG_SPACES) or trigger special behavior +//! such as error messages later in tokenization (e.g. SEG_EXPECTED_QUOTE). + +use crate::{ + identifier::{id_match, id_match_n, IdentifierChar}, + prompt::PromptStyle, +}; +use bitflags::bitflags; + +use super::command_name::{command_match, COMMAND_NAMES}; + +/// Segmentation mode. +/// +/// PSPP syntax is written in one of two modes which are broadly defined as +/// follows: +/// +/// - In interactive mode, commands end with a period at the end of the line +/// or with a blank line. +/// +/// - In batch mode, the second and subsequent lines of a command are indented +/// from the left margin. 
+/// +/// The segmenter can also try to automatically detect the mode in use, using a +/// heuristic that is usually correct. +#[derive(Copy, Clone, Debug, PartialEq, Eq, Default)] +pub enum Mode { + /// Try to interpret input correctly regardless of whether it is written + /// for interactive or batch mode. + #[default] + Auto, + + /// Interactive syntax mode. + Interactive, + + /// Batch syntax mode. + Batch, +} + +/// The type of a segment. +#[derive(Copy, Clone, Debug, PartialEq, Eq)] +pub enum Segment { + Number, + QuotedString, + HexString, + UnicodeString, + UnquotedString, + Identifier, + Punct, + Shbang, + Spaces, + Comment, + Newline, + CommentCommand, + DoRepeatCommand, + DoRepeatOverflow, + InlineData, + MacroName, + MacroBody, + StartDocument, + Document, + StartCommand, + SeparateCommands, + EndCommand, + End, + ExpectedQuote, + ExpectedExponent, + UnexpectedChar, +} + +bitflags! { + #[derive(Copy, Clone, Debug)] + pub struct Substate: u8 { + const START_OF_LINE = 1; + const START_OF_COMMAND = 2; + } +} + +#[derive(Copy, Clone)] +pub struct Segmenter { + state: (State, Substate), + nest: u8, + mode: Mode, +} + +#[derive(Copy, Clone, Debug)] +pub struct Incomplete; + +impl Segmenter { + /// Returns a segmenter with the given syntax `mode`. + /// + /// If `is_snippet` is false, then the segmenter will parse as if it's being + /// given a whole file. This means, for example, that it will interpret `-` + /// or `+` at the beginning of the syntax as a separator between commands + /// (since `-` or `+` at the beginning of a line has this meaning). + /// + /// If `is_snippet` is true, then the segmenter will parse as if it's being + /// given an isolated piece of syntax. This means that, for example, that + /// it will interpret `-` or `+` at the beginning of the syntax as an + /// operator token or (if followed by a digit) as part of a number. 
+ pub fn new(mode: Mode, is_snippet: bool) -> Self { + Self { + state: if is_snippet { + (State::General, Substate::empty()) + } else { + (State::Shbang, Substate::empty()) + }, + mode, + nest: 0, + } + } + + pub fn mode(&self) -> Mode { + self.mode + } + + fn start_of_line(&self) -> bool { + self.state.1.contains(Substate::START_OF_LINE) + } + + fn start_of_command(&self) -> bool { + self.state.1.contains(Substate::START_OF_COMMAND) + } + + /// Returns the style of command prompt to display to an interactive user + /// for input in the current state.. The return value is most accurate in + /// mode `Mode::Interactive` and at the beginning of a line (that is, if + /// [`Segmenter::push`] consumed as much as possible of the input up to a + /// new-line). + pub fn prompt(&self) -> PromptStyle { + match self.state.0 { + State::Shbang => PromptStyle::First, + State::General => { + if self.start_of_command() { + PromptStyle::First + } else { + PromptStyle::Later + } + } + State::Comment1 | State::Comment2 => PromptStyle::Comment, + State::Document1 | State::Document2 => PromptStyle::Document, + State::Document3 => PromptStyle::First, + State::FileLabel1 => PromptStyle::Later, + State::FileLabel2 | State::FileLabel3 => PromptStyle::First, + State::DoRepeat1 | State::DoRepeat2 => { + if self.start_of_command() { + PromptStyle::First + } else { + PromptStyle::Later + } + } + State::DoRepeat3 => PromptStyle::DoRepeat, + State::DoRepeat4 => PromptStyle::DoRepeat, + State::Define1 | State::Define2 | State::Define3 => { + if self.start_of_command() { + PromptStyle::First + } else { + PromptStyle::Later + } + } + State::Define4 | State::Define5 | State::Define6 => PromptStyle::Define, + State::BeginData1 => PromptStyle::First, + State::BeginData2 => PromptStyle::Later, + State::BeginData3 | State::BeginData4 => PromptStyle::Data, + } + } + + /// Attempts to label a prefix of the remaining input with a segment type. 
+ /// The caller supplies a prefix of the remaining input as `input`. If + /// `eof` is true, then `input` is the entire (remainder) of the input; if + /// `eof` is false, then further input is potentially available. + /// + /// The input may contain '\n' or '\r\n' line ends in any combination. + /// + /// If successful, returns `Ok((n, type))`, where `n` is the number of bytes + /// in the segment at the beginning of `input` (a number in + /// `0..=input.len()`) and the type of that segment. The next call should + /// not include those bytes in `input`, because they have (figuratively) + /// been consumed by the segmenter. + /// + /// Segments can have zero length, including segment types `Type::End`, + /// `Type::SeparateCommands`, `Type::StartDocument`, `Type::InlineData`, and + /// `Type::Spaces`. + /// + /// Failure occurs only if the segment type of the bytes in `input` cannot + /// yet be determined. In this case, this function returns `Err(Incomplete)`. If + /// more input is available, the caller should obtain some more, then call + /// again with a longer `input`. If this is not enough, the process might + /// need to repeat again and again. If input is exhausted, then the caller + /// may call again setting `eof` to true. This function will never return + /// `Err(Incomplete)` when `eof` is true. + /// + /// The caller must not, in a sequence of calls, supply contradictory input. + /// That is, bytes provided as part of `input` in one call, but not + /// consumed, must not be provided with *different* values on subsequent + /// calls. This is because the function must often make decisions based on + /// looking ahead beyond the bytes that it consumes. 
+ fn push_rest<'a>( + &mut self, + input: &'a str, + eof: bool, + ) -> Result<(&'a str, Segment), Incomplete> { + if input.is_empty() { + if eof { + return Ok((input, Segment::End)); + } else { + return Err(Incomplete); + }; + } + + match self.state.0 { + State::Shbang => return self.parse_shbang(input, eof), + State::General => { + if self.start_of_line() { + self.parse_start_of_line(input, eof) + } else { + self.parse_mid_line(input, eof) + } + } + State::Comment1 => self.parse_comment_1(input, eof), + State::Comment2 => self.parse_comment_2(input, eof), + State::Document1 => self.parse_document_1(input, eof), + State::Document2 => self.parse_document_2(input, eof), + State::Document3 => self.parse_document_3(input, eof), + State::FileLabel1 => self.parse_file_label_1(input, eof), + State::FileLabel2 => self.parse_file_label_2(input, eof), + State::FileLabel3 => self.parse_file_label_3(input, eof), + State::DoRepeat1 => self.parse_do_repeat_1(input, eof), + State::DoRepeat2 => self.parse_do_repeat_2(input, eof), + State::DoRepeat3 => self.parse_do_repeat_3(input, eof), + State::DoRepeat4 => self.parse_do_repeat_4(input), + State::Define1 => self.parse_define_1_2(input, eof), + State::Define2 => self.parse_define_1_2(input, eof), + State::Define3 => self.parse_define_3(input, eof), + State::Define4 => self.parse_define_4_5(input, eof), + State::Define5 => self.parse_define_4_5(input, eof), + State::Define6 => self.parse_define_6(input, eof), + State::BeginData1 => self.parse_begin_data_1(input, eof), + State::BeginData2 => self.parse_begin_data_2(input, eof), + State::BeginData3 => self.parse_begin_data_3(input, eof), + State::BeginData4 => self.parse_begin_data_4(input, eof), + } + } + + pub fn push(&mut self, input: &str, eof: bool) -> Result<(usize, Segment), Incomplete> { + let (rest, seg_type) = self.push_rest(input, eof)?; + Ok((input.len() - rest.len(), seg_type)) + } +} + +#[derive(Copy, Clone, Debug, PartialEq, Eq)] +enum State { + Shbang, + General, + 
Comment1, + Comment2, + Document1, + Document2, + Document3, + FileLabel1, + FileLabel2, + FileLabel3, + DoRepeat1, + DoRepeat2, + DoRepeat3, + DoRepeat4, + Define1, + Define2, + Define3, + Define4, + Define5, + Define6, + BeginData1, + BeginData2, + BeginData3, + BeginData4, +} + +fn take(input: &str, eof: bool) -> Result<(Option<char>, &str), Incomplete> { + let mut iter = input.chars(); + match iter.next() { + None if !eof => Err(Incomplete), + c => Ok((c, iter.as_str())), + } +} + +fn skip_comment(mut input: &str, eof: bool) -> Result<&str, Incomplete> { + loop { + let (Some(c), rest) = take(input, eof)? else { + return Ok(input); + }; + match c { + '\n' | '\r' if is_end_of_line(input, eof)? => return Ok(input), + '*' => { + if let (Some('/'), rest) = take(rest, eof)? { + return Ok(rest); + } + } + _ => (), + }; + input = rest; + } +} + +fn skip_matching<F>(f: F, input: &str, eof: bool) -> Result<&str, Incomplete> +where + F: Fn(char) -> bool, +{ + let input = input.trim_start_matches(f); + if input.is_empty() && !eof { + Err(Incomplete) + } else { + Ok(input) + } +} + +fn match_char<F>(f: F, input: &str, eof: bool) -> Result<Option<&str>, Incomplete> +where + F: Fn(char) -> bool, +{ + if let (Some(c), rest) = take(input, eof)? { + if f(c) { + return Ok(Some(rest)); + } + } + Ok(None) +} + +fn skip_spaces(mut input: &str, eof: bool) -> Result<&str, Incomplete> { + loop { + let (Some(c), rest) = take(input, eof)? else { + return Ok(input); + }; + match c { + '\r' | '\n' if is_end_of_line(input, eof)? => return Ok(input), + c if c.is_whitespace() => (), + _ => return Ok(input), + } + input = rest; + } +} + +fn skip_digits(input: &str, eof: bool) -> Result<&str, Incomplete> { + skip_matching(|c| c.is_ascii_digit(), input, eof) +} + +fn skip_spaces_and_comments(mut input: &str, eof: bool) -> Result<&str, Incomplete> { + loop { + let (Some(c), rest) = take(input, eof)? 
else { + return Ok(input); + }; + match c { + '/' => { + let (c, rest2) = take(rest, eof)?; + match c { + Some('*') => input = skip_comment(rest2, eof)?, + Some(_) | None => return Ok(rest), + } + } + '\r' | '\n' if is_end_of_line(input, eof)? => return Ok(input), + c if c.is_whitespace() => input = rest, + _ => return Ok(input), + }; + } +} + +fn is_start_of_string(input: &str, eof: bool) -> Result<bool, Incomplete> { + let (Some(c), rest) = take(input, eof)? else { + return Ok(false); + }; + match c { + 'x' | 'X' | 'u' | 'U' => { + let (c, _rest) = take(rest, eof)?; + Ok(c == Some('\'') || c == Some('"')) + } + '\'' | '"' => Ok(true), + '\n' | '\r' if is_end_of_line(input, eof)? => Ok(true), + _ => Ok(false), + } +} + +fn is_end_of_line(input: &str, eof: bool) -> Result<bool, Incomplete> { + let (Some(c), rest) = take(input, eof)? else { + return Ok(true); + }; + Ok(match c { + '\n' => true, + '\r' => take(rest, eof)?.0 == Some('\n'), + _ => false, + }) +} + +fn at_end_of_line(input: &str, eof: bool) -> Result<bool, Incomplete> { + is_end_of_line(skip_spaces_and_comments(input, eof)?, eof) +} + +fn first(s: &str) -> char { + s.chars().next().unwrap() +} +fn get_command_name_candidates(target: &str) -> &[&'static str] { + if target.is_empty() { + return &[]; + } + let target_first = first(target).to_ascii_uppercase(); + let low = COMMAND_NAMES.partition_point(|s| first(s) < target_first); + let high = COMMAND_NAMES.partition_point(|s| first(s) <= target_first); + &COMMAND_NAMES[low..high] +} + +fn detect_command_name(input: &str, eof: bool) -> Result<bool, Incomplete> { + let command_name = input + .split(|c: char| { + !((c.is_whitespace() && c != '\n') || (c.may_continue_id() && c != '.') || c == '-') + }) + .next() + .unwrap(); + if !eof && command_name.len() == input.len() { + return Err(Incomplete); + } + let command_name = command_name.trim_end_matches(|c: char| c.is_whitespace() || c == '.'); + for command in get_command_name_candidates(command_name) { + if let Some(m) = command_match(command, command_name) { + if 
m.missing_words <= 0 { + return Ok(true); + } + } + } + Ok(false) +} + +impl Segmenter { + fn parse_shbang<'a>( + &mut self, + input: &'a str, + eof: bool, + ) -> Result<(&'a str, Segment), Incomplete> { + if let (Some('#'), rest) = take(input, eof)? { + if let (Some('!'), rest) = take(rest, eof)? { + let rest = self.parse_full_line(rest, eof)?; + self.state = (State::General, Substate::START_OF_COMMAND); + return Ok((rest, Segment::Shbang)); + } + } + + self.state = ( + State::General, + Substate::START_OF_COMMAND | Substate::START_OF_LINE, + ); + self.push_rest(input, eof) + } + fn at_command_start(&self, input: &str, eof: bool) -> Result { + match self.mode { + Mode::Auto => detect_command_name(input, eof), + Mode::Interactive => Ok(false), + Mode::Batch => Ok(true), + } + } + fn parse_start_of_line<'a>( + &mut self, + input: &'a str, + eof: bool, + ) -> Result<(&'a str, Segment), Incomplete> { + debug_assert_eq!(self.state.0, State::General); + debug_assert!(self.start_of_line()); + debug_assert!(!input.is_empty()); + + let (Some(c), rest) = take(input, eof).unwrap() else { + unreachable!() + }; + match c { + '+' if is_start_of_string(skip_spaces_and_comments(rest, eof)?, eof)? => { + // This `+` is punctuation that may separate pieces of a string. + self.state = (State::General, Substate::empty()); + return Ok((rest, Segment::Punct)); + } + '+' | '-' | '.' => { + self.state = (State::General, Substate::START_OF_COMMAND); + return Ok((rest, Segment::StartCommand)); + } + _ if c.is_whitespace() => { + if at_end_of_line(input, eof)? { + self.state = (State::General, Substate::START_OF_COMMAND); + return Ok((input, Segment::SeparateCommands)); + } + } + _ => { + if self.at_command_start(input, eof)? 
+ && !self.state.1.contains(Substate::START_OF_COMMAND) + { + self.state = (State::General, Substate::START_OF_COMMAND); + return Ok((input, Segment::StartCommand)); + } + } + } + self.state.1 = Substate::START_OF_COMMAND; + self.parse_mid_line(input, eof) + } + fn parse_mid_line<'a>( + &mut self, + input: &'a str, + eof: bool, + ) -> Result<(&'a str, Segment), Incomplete> { + debug_assert!(self.state.0 == State::General); + debug_assert!(!self.state.1.contains(Substate::START_OF_LINE)); + let (Some(c), rest) = take(input, eof)? else { + unreachable!() + }; + match c { + '\r' | '\n' if is_end_of_line(input, eof)? => { + self.state.1 |= Substate::START_OF_LINE; + Ok(( + self.parse_newline(input, eof).unwrap().unwrap(), + Segment::Newline, + )) + } + '/' => { + if let (Some('*'), rest) = take(rest, eof)? { + let rest = skip_comment(rest, eof)?; + return Ok((rest, Segment::Comment)); + } else { + self.state.1 = Substate::empty(); + return Ok((rest, Segment::Punct)); + } + } + '-' => { + let (c, rest2) = take(skip_spaces(rest, eof)?, eof)?; + match c { + Some(c) if c.is_ascii_digit() => { + return self.parse_number(rest, eof); + } + Some('.') => { + if let (Some(c), _rest) = take(rest2, eof)? { + if c.is_ascii_digit() { + return self.parse_number(rest, eof); + } + } + } + None | Some(_) => (), + } + self.state.1 = Substate::empty(); + return Ok((rest, Segment::Punct)); + } + '(' | ')' | '[' | ']' | '{' | '}' | ',' | '=' | ';' | ':' | '&' | '|' | '+' => { + self.state.1 = Substate::empty(); + return Ok((rest, Segment::Punct)); + } + '*' => { + if self.state.1.contains(Substate::START_OF_COMMAND) { + self.state = (State::Comment1, Substate::empty()); + self.parse_comment_1(input, eof) + } else { + self.parse_digraph(&['*'], rest, eof) + } + } + '<' => self.parse_digraph(&['=', '>'], rest, eof), + '>' => self.parse_digraph(&['='], rest, eof), + '~' => self.parse_digraph(&['='], rest, eof), + '.' if at_end_of_line(rest, eof)? 
=> { + self.state.1 = Substate::START_OF_COMMAND; + Ok((rest, Segment::EndCommand)) + } + '.' => match take(rest, eof)? { + (Some(c), _) if c.is_ascii_digit() => self.parse_number(input, eof), + _ => Ok((rest, Segment::Punct)), + }, + '0'..='9' => self.parse_number(input, eof), + 'u' | 'U' => self.maybe_parse_string(Segment::UnicodeString, (input, rest), eof), + 'x' | 'X' => self.maybe_parse_string(Segment::HexString, (input, rest), eof), + '\'' | '"' => self.parse_string(Segment::QuotedString, c, rest, eof), + '!' => { + let (c, rest2) = take(rest, eof)?; + match c { + Some('*') => Ok((rest2, Segment::Punct)), + Some(_) => self.parse_id(input, eof), + None => Ok((rest, Segment::Punct)), + } + } + c if c.is_whitespace() => Ok((skip_spaces(rest, eof)?, Segment::Spaces)), + c if c.may_start_id() => self.parse_id(input, eof), + '#'..='~' if c != '\\' && c != '^' => { + self.state.1 = Substate::empty(); + Ok((rest, Segment::Punct)) + } + _ => { + self.state.1 = Substate::empty(); + Ok((rest, Segment::UnexpectedChar)) + } + } + } + fn parse_string<'a>( + &mut self, + segment: Segment, + quote: char, + mut input: &'a str, + eof: bool, + ) -> Result<(&'a str, Segment), Incomplete> { + while let (Some(c), rest) = take(input, eof)? { + match c { + _ if c == quote => { + let (c, rest2) = take(rest, eof)?; + if c != Some(quote) { + self.state.1 = Substate::empty(); + return Ok((rest, segment)); + } + input = rest2; + } + '\r' | '\n' if is_end_of_line(input, eof)? => break, + _ => input = rest, + } + } + self.state.1 = Substate::empty(); + Ok((input, Segment::ExpectedQuote)) + } + fn maybe_parse_string<'a>( + &mut self, + segment: Segment, + input: (&'a str, &'a str), + eof: bool, + ) -> Result<(&'a str, Segment), Incomplete> { + match take(input.1, eof)? 
{ + (Some(c), rest) if c == '\'' || c == '"' => self.parse_string(segment, c, rest, eof), + _ => self.parse_id(input.0, eof), + } + } + fn next_id_in_command<'a>( + &self, + mut input: &'a str, + eof: bool, + ) -> Result<(&'a str, &'a str), Incomplete> { + let mut sub = Segmenter::new(self.mode, true); + loop { + let (seg_len, seg_type) = sub.push(input, eof)?; + let (segment, rest) = input.split_at(seg_len); + match seg_type { + Segment::Shbang | Segment::Spaces | Segment::Comment | Segment::Newline => (), + + Segment::Identifier => return Ok((segment, rest)), + + Segment::Number + | Segment::QuotedString + | Segment::HexString + | Segment::UnicodeString + | Segment::UnquotedString + | Segment::Punct + | Segment::CommentCommand + | Segment::DoRepeatCommand + | Segment::DoRepeatOverflow + | Segment::InlineData + | Segment::MacroName + | Segment::MacroBody + | Segment::StartDocument + | Segment::Document + | Segment::StartCommand + | Segment::SeparateCommands + | Segment::EndCommand + | Segment::End + | Segment::ExpectedQuote + | Segment::ExpectedExponent + | Segment::UnexpectedChar => return Ok(("", rest)), + } + input = rest; + } + } + fn parse_id<'a>( + &mut self, + input: &'a str, + eof: bool, + ) -> Result<(&'a str, Segment), Incomplete> { + let (Some(_), mut end) = take(input, eof).unwrap() else { + unreachable!() + }; + while let (Some(c), rest) = take(end, eof)? { + if !c.may_continue_id() { + break; + }; + end = rest; + } + let identifier = &input[..input.len() - end.len()]; + let identifier = match identifier.strip_suffix('.') { + Some(without_dot) if at_end_of_line(end, eof)? 
=> without_dot, + _ => identifier, + }; + let rest = &input[identifier.len()..]; + + if self.state.1.contains(Substate::START_OF_COMMAND) { + if id_match_n("COMMENT", identifier, 4) { + self.state = (State::Comment1, Substate::empty()); + return self.parse_comment_1(input, eof); + } else if id_match("DOCUMENT", identifier) { + self.state = (State::Document1, Substate::empty()); + return Ok((input, Segment::StartDocument)); + } else if id_match_n("DEFINE", identifier, 6) { + self.state = (State::Define1, Substate::empty()); + } else if id_match("FILE", identifier) { + if id_match("LABEL", self.next_id_in_command(rest, eof)?.0) { + self.state = (State::FileLabel1, Substate::empty()); + return Ok((rest, Segment::Identifier)); + } + } else if id_match("DO", identifier) { + if id_match("REPEAT", self.next_id_in_command(rest, eof)?.0) { + self.state = (State::DoRepeat1, Substate::empty()); + return Ok((rest, Segment::Identifier)); + } + } else if id_match("BEGIN", identifier) { + let (next_id, rest2) = self.next_id_in_command(rest, eof)?; + if id_match("DATA", next_id) { + let rest2 = skip_spaces_and_comments(rest2, eof)?; + let rest2 = if let Some(s) = rest2.strip_prefix('.') { + skip_spaces_and_comments(s, eof)? + } else { + rest2 + }; + if is_end_of_line(rest2, eof)? { + let s = &input[..input.len() - rest2.len()]; + self.state = ( + if s.contains('\n') { + State::BeginData1 + } else { + State::BeginData2 + }, + Substate::empty(), + ); + return Ok((rest, Segment::Identifier)); + } + } + } + } + + self.state.1 = Substate::empty(); + Ok(( + rest, + if identifier != "!" 
{ + Segment::Identifier + } else { + Segment::Punct + }, + )) + } + fn parse_digraph<'a>( + &mut self, + seconds: &[char], + input: &'a str, + eof: bool, + ) -> Result<(&'a str, Segment), Incomplete> { + let (c, rest) = take(input, eof)?; + self.state.1 = Substate::empty(); + Ok(( + match c { + Some(c) if seconds.contains(&c) => rest, + _ => input, + }, + Segment::Punct, + )) + } + fn parse_number<'a>( + &mut self, + input: &'a str, + eof: bool, + ) -> Result<(&'a str, Segment), Incomplete> { + let mut input = skip_digits(input, eof)?; + if let Some(rest) = match_char(|c| c == '.', input, eof)? { + let rest2 = skip_digits(rest, eof)?; + if rest2.len() < rest.len() || !at_end_of_line(rest2, eof)? { + input = rest2; + } + }; + if let Some(rest) = match_char(|c| c == 'e' || c == 'E', input, eof)? { + let rest = match_char(|c| c == '+' || c == '-', rest, eof)?.unwrap_or(rest); + let rest2 = skip_digits(rest, eof)?; + if rest2.len() == rest.len() { + self.state.1 = Substate::empty(); + return Ok((rest, Segment::ExpectedExponent)); + } + input = rest2; + } + self.state.1 = Substate::empty(); + Ok((input, Segment::Number)) + } + fn parse_comment_1<'a>( + &mut self, + mut input: &'a str, + eof: bool, + ) -> Result<(&'a str, Segment), Incomplete> { + enum CommentState<'a> { + Blank, + NotBlank, + Period(&'a str), + } + let mut state = CommentState::Blank; + loop { + let (Some(c), rest) = take(input, eof)? else { + // End of file. + self.state = (State::General, Substate::START_OF_COMMAND); + return Ok((input, Segment::SeparateCommands)); + }; + match c { + '.' => state = CommentState::Period(input), + '\n' | '\r' if is_end_of_line(input, eof)? => { + match state { + CommentState::Blank => { + // Blank line ends comment command. + self.state = (State::General, Substate::START_OF_COMMAND); + return Ok((input, Segment::SeparateCommands)); + } + CommentState::Period(period) => { + // '.' at end of line ends comment command. 
+ self.state = (State::General, Substate::empty()); + return Ok((period, Segment::CommentCommand)); + } + CommentState::NotBlank => { + // Comment continues onto next line. + self.state = (State::Comment2, Substate::empty()); + return Ok((input, Segment::CommentCommand)); + } + } + } + c if c.is_whitespace() => (), + _ => state = CommentState::NotBlank, + } + input = rest; + } + } + fn parse_comment_2<'a>( + &mut self, + input: &'a str, + eof: bool, + ) -> Result<(&'a str, Segment), Incomplete> { + let rest = self.parse_newline(input, eof)?.unwrap(); + + let new_command = match take(rest, eof)?.0 { + Some('+') | Some('-') | Some('.') => true, + Some(c) if !c.is_whitespace() => self.at_command_start(rest, eof)?, + None | Some(_) => false, + }; + if new_command { + self.state = ( + State::General, + Substate::START_OF_LINE | Substate::START_OF_COMMAND, + ); + } else { + self.state = (State::Comment1, Substate::empty()); + } + Ok((rest, Segment::Newline)) + } + fn parse_document_1<'a>( + &mut self, + mut input: &'a str, + eof: bool, + ) -> Result<(&'a str, Segment), Incomplete> { + let mut end_cmd = false; + loop { + let (Some(c), rest) = take(input, eof)? else { + self.state = (State::Document3, Substate::empty()); + return Ok((input, Segment::Document)); + }; + match c { + '.' => end_cmd = true, + '\n' | '\r' if is_end_of_line(input, eof)? 
=> { + self.state.0 = if end_cmd { + State::Document3 + } else { + State::Document2 + }; + return Ok((input, Segment::Document)); + } + c if !c.is_whitespace() => end_cmd = false, + _ => (), + } + input = rest; + } + } + fn parse_document_2<'a>( + &mut self, + input: &'a str, + eof: bool, + ) -> Result<(&'a str, Segment), Incomplete> { + let rest = self.parse_newline(input, eof)?.unwrap(); + self.state = (State::Document1, Substate::empty()); + Ok((rest, Segment::Newline)) + } + fn parse_document_3<'a>( + &mut self, + input: &'a str, + _eof: bool, + ) -> Result<(&'a str, Segment), Incomplete> { + self.state = ( + State::General, + Substate::START_OF_COMMAND | Substate::START_OF_LINE, + ); + Ok((input, Segment::EndCommand)) + } + fn quoted_file_label(input: &str, eof: bool) -> Result { + let input = skip_spaces_and_comments(input, eof)?; + match take(input, eof)?.0 { + Some('\'') | Some('"') | Some('\n') => Ok(true), + _ => Ok(false), + } + } + fn parse_file_label_1<'a>( + &mut self, + input: &'a str, + eof: bool, + ) -> Result<(&'a str, Segment), Incomplete> { + let mut sub = Segmenter { + state: (State::General, self.state.1), + ..*self + }; + let (rest, segment) = sub.push_rest(input, eof)?; + if segment == Segment::Identifier { + let id = &input[..input.len() - rest.len()]; + debug_assert!(id_match("LABEL", id), "{id} should be LABEL"); + if Self::quoted_file_label(rest, eof)? 
{ + *self = sub; + } else { + self.state.0 = State::FileLabel2; + } + } else { + self.state.1 = sub.state.1; + } + Ok((rest, segment)) + } + fn parse_file_label_2<'a>( + &mut self, + input: &'a str, + eof: bool, + ) -> Result<(&'a str, Segment), Incomplete> { + let input = skip_spaces(input, eof)?; + self.state = (State::FileLabel3, Substate::empty()); + Ok((input, Segment::Spaces)) + } + fn parse_file_label_3<'a>( + &mut self, + mut input: &'a str, + eof: bool, + ) -> Result<(&'a str, Segment), Incomplete> { + let mut end_cmd = None; + loop { + let (c, rest) = take(input, eof)?; + match c { + None | Some('\n') | Some('\r') if is_end_of_line(input, eof)? => { + self.state = (State::General, Substate::empty()); + return Ok((end_cmd.unwrap_or(input), Segment::UnquotedString)); + } + None => unreachable!(), + Some('.') => end_cmd = Some(input), + Some(c) if !c.is_whitespace() => end_cmd = None, + Some(_) => (), + } + input = rest; + } + } + fn subparse<'a>( + &mut self, + input: &'a str, + eof: bool, + ) -> Result<(&'a str, Segment), Incomplete> { + let mut sub = Segmenter { + mode: self.mode, + state: (State::General, self.state.1), + nest: 0, + }; + let result = sub.push_rest(input, eof)?; + self.state.1 = sub.state.1; + Ok(result) + } + /// We are segmenting a `DO REPEAT` command, currently reading the syntax + /// that defines the stand-in variables (the head) before the lines of + /// syntax to be repeated (the body). + fn parse_do_repeat_1<'a>( + &mut self, + input: &'a str, + eof: bool, + ) -> Result<(&'a str, Segment), Incomplete> { + let (rest, segment) = self.subparse(input, eof)?; + if segment == Segment::SeparateCommands { + // We reached a blank line that separates the head from the body. + self.state.0 = State::DoRepeat2; + } else if segment == Segment::EndCommand || segment == Segment::StartCommand { + // We reached the body. 
+ self.state.0 = State::DoRepeat3; + self.nest = 1; + } + Ok((rest, segment)) + } + /// We are segmenting a `DO REPEAT` command, currently reading a blank line + /// that separates the head from the body. + fn parse_do_repeat_2<'a>( + &mut self, + input: &'a str, + eof: bool, + ) -> Result<(&'a str, Segment), Incomplete> { + let (rest, segment) = self.subparse(input, eof)?; + if segment == Segment::Newline { + // We reached the body. + self.state.0 = State::DoRepeat3; + self.nest = 1; + } + Ok((rest, segment)) + } + fn parse_newline<'a>( + &mut self, + input: &'a str, + eof: bool, + ) -> Result, Incomplete> { + let (Some(c), rest) = take(input, eof)? else { + return Ok(None); + }; + match c { + '\n' => Ok(Some(rest)), + '\r' => { + if let (Some('\n'), rest) = take(rest, eof)? { + Ok(Some(rest)) + } else { + Ok(None) + } + } + _ => Ok(None), + } + } + + fn parse_full_line<'a>( + &mut self, + mut input: &'a str, + eof: bool, + ) -> Result<&'a str, Incomplete> { + loop { + if is_end_of_line(input, eof)? { + return Ok(input); + } + input = take(input, eof).unwrap().1; + } + } + fn check_repeat_command<'a>(&mut self, input: &'a str, eof: bool) -> Result { + let input = input.strip_prefix(&['-', '+']).unwrap_or(input); + let (id1, input) = self.next_id_in_command(input, eof)?; + if id_match("DO", id1) && id_match("REPEAT", self.next_id_in_command(input, eof)?.0) { + Ok(1) + } else if id_match("END", id1) && id_match("REPEAT", self.next_id_in_command(input, eof)?.0) + { + Ok(-1) + } else { + Ok(0) + } + } + /// We are in the body of `DO REPEAT`, segmenting the lines of syntax that + /// are to be repeated. Report each line of syntax as a single + /// [`Type::DoRepeatCommand`]. + /// + /// `DO REPEAT` can be nested, so we look for `DO REPEAT...END REPEAT` + /// blocks inside the lines we're segmenting. `self.nest` counts the + /// nesting level, starting at 1. 
+ fn parse_do_repeat_3<'a>( + &mut self, + input: &'a str, + eof: bool, + ) -> Result<(&'a str, Segment), Incomplete> { + if let Some(rest) = self.parse_newline(input, eof)? { + return Ok((rest, Segment::Newline)); + } + let rest = self.parse_full_line(input, eof)?; + let direction = self.check_repeat_command(input, eof)?; + if direction > 0 { + if let Some(nest) = self.nest.checked_add(1) { + self.nest = nest; + } else { + self.state.0 = State::DoRepeat4; + } + } else if direction < 0 { + self.nest -= 1; + if self.nest == 0 { + // Nesting level dropped to 0, so we've finished reading the `DO + // REPEAT` body. + self.state = ( + State::General, + Substate::START_OF_COMMAND | Substate::START_OF_LINE, + ); + return self.push_rest(input, eof); + } + } + return Ok((rest, Segment::DoRepeatCommand)); + } + fn parse_do_repeat_4<'a>(&mut self, input: &'a str) -> Result<(&'a str, Segment), Incomplete> { + self.state.0 = State::DoRepeat3; + Ok((input, Segment::DoRepeatOverflow)) + } + /// We are segmenting a `DEFINE` command, which consists of: + /// + /// - The `DEFINE` keyword. + /// + /// - An identifier. We transform this into `Type::MacroName` instead of + /// `Type::Identifier` because this identifier must never be macro-expanded. + /// + /// - Anything but `(`. + /// + /// - `(` followed by a sequence of tokens possibly including balanced + /// parentheses up to a final `)`. + /// + /// - A sequence of any number of lines, one string per line, ending with + /// `!ENDDEFINE`. The first line is usually blank (that is, a newline + /// follows the `(`). The last line usually just has `!ENDDEFINE.` on + /// it, but it can start with other tokens. The whole + /// DEFINE...!ENDDEFINE can be on a single line, even. 
+ fn parse_define_1_2<'a>( + &mut self, + input: &'a str, + eof: bool, + ) -> Result<(&'a str, Segment), Incomplete> { + let (rest, segment) = self.subparse(input, eof)?; + match segment { + Segment::Identifier if self.state.0 == State::Define1 => { + self.state.0 = State::Define2; + return Ok((rest, Segment::MacroName)); + } + Segment::SeparateCommands | Segment::EndCommand | Segment::StartCommand => { + // The DEFINE command is malformed because we reached its end + // without ever hitting a `(` token. Transition back to general + // parsing. + self.state.0 = State::General; + } + Segment::Punct if input.starts_with('(') => { + self.state.0 = State::Define3; + self.nest = 1; + } + _ => (), + } + Ok((rest, segment)) + } + fn parse_define_3<'a>( + &mut self, + input: &'a str, + eof: bool, + ) -> Result<(&'a str, Segment), Incomplete> { + let (rest, segment) = self.subparse(input, eof)?; + match segment { + Segment::SeparateCommands | Segment::EndCommand | Segment::StartCommand => { + // The DEFINE command is malformed because we reached its end + // without ever hitting a `(` token. Transition back to general + // parsing. + self.state.0 = State::General; + } + Segment::Punct if input.starts_with('(') => { + self.nest += 1; + } + Segment::Punct if input.starts_with(')') => { + self.nest -= 1; + if self.nest == 0 { + self.state = (State::Define4, Substate::empty()); + } + } + _ => (), + } + Ok((rest, segment)) + } + fn find_enddefine<'a>(mut input: &'a str) -> Option<&'a str> { + loop { + input = skip_spaces_and_comments(input, true).unwrap(); + let (Some(c), rest) = take(input, true).unwrap() else { + return None; + }; + match c { + '!' if strip_prefix_ignore_ascii_case(input, "!ENDDEFINE").is_some() => { + return Some(input) + } + '\'' | '"' => { + let index = rest.find(c)?; + input = &rest[index + 1..]; + } + _ => input = rest, + } + } + } + + /// We are in the body of a macro definition, looking for additional lines + /// of the body or `!ENDDEFINE`. 
+ /// + /// In `State::Define4`, we're parsing the first line of the macro body (the + /// same line as the closing parenthesis in the argument definition). In + /// `State::Define5`, we're on a later line. + fn parse_define_4_5<'a>( + &mut self, + input: &'a str, + eof: bool, + ) -> Result<(&'a str, Segment), Incomplete> { + let rest = self.parse_full_line(input, eof)?; + let line = &input[..input.len() - rest.len()]; + if let Some(end) = Self::find_enddefine(line) { + // Macro ends at the !ENDDEFINE on this line. + self.state = (State::General, Substate::empty()); + let (prefix, rest) = input.split_at(line.len() - end.len()); + if prefix.is_empty() { + // Line starts with `!ENDDEFINE`. + self.push_rest(input, eof) + } else if prefix.trim_start().is_empty() { + // Line starts with spaces followed by `!ENDDEFINE`. + Ok((rest, Segment::Spaces)) + } else { + // Line starts with some content followed by `!ENDDEFINE`. + Ok((rest, Segment::MacroBody)) + } + } else { + // No `!ENDDEFINE`. We have a full line of macro body. + // + // If the first line of the macro body is blank, we just report it + // as spaces, or not at all if there are no spaces, because it's not + // significant. + // + // However, if it's a later line, we need to report it because blank + // lines can have significance. 
+ let segment = if self.state.0 == State::Define4 && line.trim_start().is_empty() { + if line.is_empty() { + return self.parse_define_6(input, eof); + } + Segment::Spaces + } else { + Segment::MacroBody + }; + self.state.0 = State::Define6; + Ok((rest, segment)) + } + } + fn parse_define_6<'a>( + &mut self, + input: &'a str, + eof: bool, + ) -> Result<(&'a str, Segment), Incomplete> { + let rest = self.parse_newline(input, eof)?.unwrap(); + self.state.0 = State::Define5; + Ok((rest, Segment::Newline)) + } + fn parse_begin_data_1<'a>( + &mut self, + input: &'a str, + eof: bool, + ) -> Result<(&'a str, Segment), Incomplete> { + let (rest, segment) = self.subparse(input, eof)?; + if segment == Segment::Newline { + self.state.0 = State::BeginData2; + } + Ok((rest, segment)) + } + fn parse_begin_data_2<'a>( + &mut self, + input: &'a str, + eof: bool, + ) -> Result<(&'a str, Segment), Incomplete> { + let (rest, segment) = self.subparse(input, eof)?; + if segment == Segment::Newline { + self.state.0 = State::BeginData3; + } + Ok((rest, segment)) + } + fn is_end_data(line: &str) -> bool { + let Some(rest) = strip_prefix_ignore_ascii_case(line, "END") else { + return false; + }; + let (Some(c), rest) = take(rest, true).unwrap() else { + return false; + }; + if !c.is_whitespace() { + return false; + }; + let Some(rest) = strip_prefix_ignore_ascii_case(rest, "DATA") else { + return false; + }; + + let mut endcmd = false; + for c in rest.chars() { + match c { + '.' if endcmd => return false, + '.' 
=> endcmd = true, + c if c.is_whitespace() => (), + _ => return false, + } + } + true + } + fn parse_begin_data_3<'a>( + &mut self, + input: &'a str, + eof: bool, + ) -> Result<(&'a str, Segment), Incomplete> { + let rest = self.parse_full_line(input, eof)?; + let line = &input[..input.len() - rest.len()]; + if Self::is_end_data(line) { + self.state = ( + State::General, + Substate::START_OF_COMMAND | Substate::START_OF_LINE, + ); + self.push_rest(input, eof) + } else { + self.state.0 = State::BeginData4; + Ok((rest, Segment::InlineData)) + } + } + fn parse_begin_data_4<'a>( + &mut self, + input: &'a str, + eof: bool, + ) -> Result<(&'a str, Segment), Incomplete> { + let rest = self.parse_newline(input, eof)?.unwrap(); + self.state.0 = State::BeginData3; + Ok((rest, Segment::Newline)) + } +} + +fn strip_prefix_ignore_ascii_case<'a>(line: &'a str, pattern: &str) -> Option<&'a str> { + line.get(..pattern.len()) + .map(|prefix| { + prefix + .eq_ignore_ascii_case(pattern) + .then(|| &line[pattern.len()..]) + }) + .flatten() +} + +#[cfg(test)] +mod test; diff --git a/rust/pspp/src/lex/segment/test.rs b/rust/pspp/src/lex/segment/test.rs new file mode 100644 index 0000000000..d8c337dcdf --- /dev/null +++ b/rust/pspp/src/lex/segment/test.rs @@ -0,0 +1,2172 @@ +use crate::prompt::PromptStyle; + +use super::{Mode, Segment, Segmenter}; + +fn push_segment<'a>( + segmenter: &mut Segmenter, + input: &'a str, + one_byte: bool, +) -> (usize, Segment) { + if one_byte { + for len in input.char_indices().map(|(pos, _c)| pos) { + if let Ok(result) = segmenter.push(&input[..len], false) { + return result; + } + } + } + segmenter.push(input, true).unwrap() +} + +fn _check_segmentation( + mut input: &str, + mode: Mode, + expect_segments: &[(Segment, &str)], + expect_prompts: &[PromptStyle], + one_byte: bool, +) { + let mut segments = Vec::with_capacity(expect_segments.len()); + let mut prompts = Vec::new(); + let mut segmenter = Segmenter::new(mode, false); + loop { + let (seg_len, 
seg_type) = push_segment(&mut segmenter, input, one_byte); + let (token, rest) = input.split_at(seg_len); + segments.push((seg_type, token)); + match seg_type { + Segment::End => break, + Segment::Newline => prompts.push(segmenter.prompt()), + _ => (), + } + input = rest; + } + + if &segments != expect_segments { + eprintln!("segments differ from expected:"); + let difference = diff::slice(expect_segments, &segments); + for result in difference { + match result { + diff::Result::Left(left) => eprintln!("-{left:?}"), + diff::Result::Both(left, _right) => eprintln!(" {left:?}"), + diff::Result::Right(right) => eprintln!("+{right:?}"), + } + } + panic!(); + } + + if &prompts != expect_prompts { + eprintln!("prompts differ from expected:"); + let difference = diff::slice(expect_prompts, &prompts); + for result in difference { + match result { + diff::Result::Left(left) => eprintln!("-{left:?}"), + diff::Result::Both(left, _right) => eprintln!(" {left:?}"), + diff::Result::Right(right) => eprintln!("+{right:?}"), + } + } + panic!(); + } +} + +fn check_segmentation( + input: &str, + mode: Mode, + expect_segments: &[(Segment, &str)], + expect_prompts: &[PromptStyle], +) { + for (one_byte, one_byte_name) in [(false, "full-string"), (true, "byte-by-byte")] { + println!("running {one_byte_name} segmentation test with LF newlines..."); + _check_segmentation(input, mode, expect_segments, expect_prompts, one_byte); + + println!("running {one_byte_name} segmentation test with CRLF newlines..."); + _check_segmentation( + &input.replace('\n', "\r\n"), + mode, + &expect_segments + .iter() + .map(|(segment, s)| match *segment { + Segment::Newline => (Segment::Newline, "\r\n"), + _ => (*segment, *s), + }) + .collect::>(), + expect_prompts, + one_byte, + ); + + if let Some(input) = input.strip_suffix('\n') { + println!("running {one_byte_name} segmentation test without final newline..."); + let mut expect_segments: Vec<_> = expect_segments.iter().copied().collect(); + 
assert_eq!(expect_segments.pop(), Some((Segment::End, ""))); + assert_eq!(expect_segments.pop(), Some((Segment::Newline, "\n"))); + while let Some((Segment::SeparateCommands | Segment::EndCommand, "")) = + expect_segments.last() + { + expect_segments.pop(); + } + expect_segments.push((Segment::End, "")); + _check_segmentation( + input, + mode, + &expect_segments, + &expect_prompts[..expect_prompts.len() - 1], + one_byte, + ); + } + } +} + +#[allow(dead_code)] +fn print_segmentation(mut input: &str) { + let mut segmenter = Segmenter::new(Mode::Interactive, false); + loop { + let (seg_len, seg_type) = segmenter.push(input, true).unwrap(); + let (token, rest) = input.split_at(seg_len); + print!("{seg_type:?} {token:?}"); + match seg_type { + Segment::Newline => print!(" ({:?})", segmenter.prompt()), + Segment::End => break, + _ => (), + } + println!(); + input = rest; + } +} + +#[test] +fn test_identifiers() { + check_segmentation( + r#"a ab abc abcd !abcd +A AB ABC ABCD !ABCD +aB aBC aBcD !aBcD +$x $y $z !$z +grève Ângstrom poté +#a #b #c ## #d !#d +@efg @ @@. @#@ !@ +## # #12345 #.# +f@#_.#6 +GhIjK +.x 1y _z +!abc abc! 
+"#, + Mode::Auto, + &[ + (Segment::Identifier, "a"), + (Segment::Spaces, " "), + (Segment::Identifier, "ab"), + (Segment::Spaces, " "), + (Segment::Identifier, "abc"), + (Segment::Spaces, " "), + (Segment::Identifier, "abcd"), + (Segment::Spaces, " "), + (Segment::Identifier, "!abcd"), + (Segment::Newline, "\n"), + (Segment::Identifier, "A"), + (Segment::Spaces, " "), + (Segment::Identifier, "AB"), + (Segment::Spaces, " "), + (Segment::Identifier, "ABC"), + (Segment::Spaces, " "), + (Segment::Identifier, "ABCD"), + (Segment::Spaces, " "), + (Segment::Identifier, "!ABCD"), + (Segment::Newline, "\n"), + (Segment::Identifier, "aB"), + (Segment::Spaces, " "), + (Segment::Identifier, "aBC"), + (Segment::Spaces, " "), + (Segment::Identifier, "aBcD"), + (Segment::Spaces, " "), + (Segment::Identifier, "!aBcD"), + (Segment::Newline, "\n"), + (Segment::Identifier, "$x"), + (Segment::Spaces, " "), + (Segment::Identifier, "$y"), + (Segment::Spaces, " "), + (Segment::Identifier, "$z"), + (Segment::Spaces, " "), + (Segment::Identifier, "!$z"), + (Segment::Newline, "\n"), + (Segment::Identifier, "grève"), + (Segment::Spaces, "\u{00a0}"), + (Segment::Identifier, "Ângstrom"), + (Segment::Spaces, "\u{00a0}"), + (Segment::Identifier, "poté"), + (Segment::Newline, "\n"), + (Segment::Identifier, "#a"), + (Segment::Spaces, " "), + (Segment::Identifier, "#b"), + (Segment::Spaces, " "), + (Segment::Identifier, "#c"), + (Segment::Spaces, " "), + (Segment::Identifier, "##"), + (Segment::Spaces, " "), + (Segment::Identifier, "#d"), + (Segment::Spaces, " "), + (Segment::Identifier, "!#d"), + (Segment::Newline, "\n"), + (Segment::Identifier, "@efg"), + (Segment::Spaces, " "), + (Segment::Identifier, "@"), + (Segment::Spaces, " "), + (Segment::Identifier, "@@."), + (Segment::Spaces, " "), + (Segment::Identifier, "@#@"), + (Segment::Spaces, " "), + (Segment::Identifier, "!@"), + (Segment::Spaces, " "), + (Segment::Newline, "\n"), + (Segment::Identifier, "##"), + (Segment::Spaces, " "), + 
(Segment::Identifier, "#"), + (Segment::Spaces, " "), + (Segment::Identifier, "#12345"), + (Segment::Spaces, " "), + (Segment::Identifier, "#.#"), + (Segment::Newline, "\n"), + (Segment::Identifier, "f@#_.#6"), + (Segment::Newline, "\n"), + (Segment::Identifier, "GhIjK"), + (Segment::Newline, "\n"), + (Segment::StartCommand, "."), + (Segment::Identifier, "x"), + (Segment::Spaces, " "), + (Segment::Number, "1"), + (Segment::Identifier, "y"), + (Segment::Spaces, " "), + (Segment::Punct, "_"), + (Segment::Identifier, "z"), + (Segment::Newline, "\n"), + (Segment::Identifier, "!abc"), + (Segment::Spaces, " "), + (Segment::Identifier, "abc"), + (Segment::Punct, "!"), + (Segment::Newline, "\n"), + (Segment::End, ""), + ], + &[ + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Later, + ], + ); +} + +#[test] +fn test_identifiers_ending_in_dot() { + check_segmentation( + r#"abcd. abcd. +ABCD. ABCD. +aBcD. aBcD. +$y. $z. あいうえお. +#c. #d.. +@@. @@.... +#.#. +#abcd. +. +. +LMNOP. +QRSTUV./* end of line comment */ +qrstuv. /* end of line comment */ +QrStUv./* end of line comment */ +wxyz./* unterminated end of line comment +WXYZ. 
/* unterminated end of line comment +WxYz./* unterminated end of line comment +"#, + Mode::Auto, + &[ + (Segment::Identifier, "abcd."), + (Segment::Spaces, " "), + (Segment::Identifier, "abcd"), + (Segment::EndCommand, "."), + (Segment::Newline, "\n"), + (Segment::Identifier, "ABCD."), + (Segment::Spaces, " "), + (Segment::Identifier, "ABCD"), + (Segment::EndCommand, "."), + (Segment::Newline, "\n"), + (Segment::Identifier, "aBcD."), + (Segment::Spaces, " "), + (Segment::Identifier, "aBcD"), + (Segment::EndCommand, "."), + (Segment::Spaces, " "), + (Segment::Newline, "\n"), + (Segment::Identifier, "$y."), + (Segment::Spaces, " "), + (Segment::Identifier, "$z."), + (Segment::Spaces, " "), + (Segment::Identifier, "あいうえお"), + (Segment::EndCommand, "."), + (Segment::Newline, "\n"), + (Segment::Identifier, "#c."), + (Segment::Spaces, " "), + (Segment::Identifier, "#d."), + (Segment::EndCommand, "."), + (Segment::Newline, "\n"), + (Segment::Identifier, "@@."), + (Segment::Spaces, " "), + (Segment::Identifier, "@@..."), + (Segment::EndCommand, "."), + (Segment::Newline, "\n"), + (Segment::Identifier, "#.#"), + (Segment::EndCommand, "."), + (Segment::Newline, "\n"), + (Segment::Identifier, "#abcd"), + (Segment::EndCommand, "."), + (Segment::Newline, "\n"), + (Segment::StartCommand, "."), + (Segment::Newline, "\n"), + (Segment::StartCommand, "."), + (Segment::Spaces, " "), + (Segment::Newline, "\n"), + (Segment::Identifier, "LMNOP"), + (Segment::EndCommand, "."), + (Segment::Spaces, " "), + (Segment::Newline, "\n"), + (Segment::Identifier, "QRSTUV"), + (Segment::EndCommand, "."), + (Segment::Comment, "/* end of line comment */"), + (Segment::Newline, "\n"), + (Segment::Identifier, "qrstuv"), + (Segment::EndCommand, "."), + (Segment::Spaces, " "), + (Segment::Comment, "/* end of line comment */"), + (Segment::Newline, "\n"), + (Segment::Identifier, "QrStUv"), + (Segment::EndCommand, "."), + (Segment::Comment, "/* end of line comment */"), + (Segment::Spaces, " "), + 
(Segment::Newline, "\n"), + (Segment::Identifier, "wxyz"), + (Segment::EndCommand, "."), + (Segment::Comment, "/* unterminated end of line comment"), + (Segment::Newline, "\n"), + (Segment::Identifier, "WXYZ"), + (Segment::EndCommand, "."), + (Segment::Spaces, " "), + (Segment::Comment, "/* unterminated end of line comment"), + (Segment::Newline, "\n"), + (Segment::Identifier, "WxYz"), + (Segment::EndCommand, "."), + (Segment::Comment, "/* unterminated end of line comment "), + (Segment::Newline, "\n"), + (Segment::End, ""), + ], + &[ + PromptStyle::First, + PromptStyle::First, + PromptStyle::First, + PromptStyle::First, + PromptStyle::First, + PromptStyle::First, + PromptStyle::First, + PromptStyle::First, + PromptStyle::First, + PromptStyle::First, + PromptStyle::First, + PromptStyle::First, + PromptStyle::First, + PromptStyle::First, + PromptStyle::First, + PromptStyle::First, + PromptStyle::First, + ], + ); +} + +#[test] +fn test_reserved_words() { + check_segmentation( + r#"and or not eq ge gt le lt ne all by to with +AND OR NOT EQ GE GT LE LT NE ALL BY TO WITH +andx orx notx eqx gex gtx lex ltx nex allx byx tox withx +and. with. 
+"#, + Mode::Auto, + &[ + (Segment::Identifier, "and"), + (Segment::Spaces, " "), + (Segment::Identifier, "or"), + (Segment::Spaces, " "), + (Segment::Identifier, "not"), + (Segment::Spaces, " "), + (Segment::Identifier, "eq"), + (Segment::Spaces, " "), + (Segment::Identifier, "ge"), + (Segment::Spaces, " "), + (Segment::Identifier, "gt"), + (Segment::Spaces, " "), + (Segment::Identifier, "le"), + (Segment::Spaces, " "), + (Segment::Identifier, "lt"), + (Segment::Spaces, " "), + (Segment::Identifier, "ne"), + (Segment::Spaces, " "), + (Segment::Identifier, "all"), + (Segment::Spaces, " "), + (Segment::Identifier, "by"), + (Segment::Spaces, " "), + (Segment::Identifier, "to"), + (Segment::Spaces, " "), + (Segment::Identifier, "with"), + (Segment::Newline, "\n"), + (Segment::Identifier, "AND"), + (Segment::Spaces, " "), + (Segment::Identifier, "OR"), + (Segment::Spaces, " "), + (Segment::Identifier, "NOT"), + (Segment::Spaces, " "), + (Segment::Identifier, "EQ"), + (Segment::Spaces, " "), + (Segment::Identifier, "GE"), + (Segment::Spaces, " "), + (Segment::Identifier, "GT"), + (Segment::Spaces, " "), + (Segment::Identifier, "LE"), + (Segment::Spaces, " "), + (Segment::Identifier, "LT"), + (Segment::Spaces, " "), + (Segment::Identifier, "NE"), + (Segment::Spaces, " "), + (Segment::Identifier, "ALL"), + (Segment::Spaces, " "), + (Segment::Identifier, "BY"), + (Segment::Spaces, " "), + (Segment::Identifier, "TO"), + (Segment::Spaces, " "), + (Segment::Identifier, "WITH"), + (Segment::Newline, "\n"), + (Segment::Identifier, "andx"), + (Segment::Spaces, " "), + (Segment::Identifier, "orx"), + (Segment::Spaces, " "), + (Segment::Identifier, "notx"), + (Segment::Spaces, " "), + (Segment::Identifier, "eqx"), + (Segment::Spaces, " "), + (Segment::Identifier, "gex"), + (Segment::Spaces, " "), + (Segment::Identifier, "gtx"), + (Segment::Spaces, " "), + (Segment::Identifier, "lex"), + (Segment::Spaces, " "), + (Segment::Identifier, "ltx"), + (Segment::Spaces, " "), + 
(Segment::Identifier, "nex"), + (Segment::Spaces, " "), + (Segment::Identifier, "allx"), + (Segment::Spaces, " "), + (Segment::Identifier, "byx"), + (Segment::Spaces, " "), + (Segment::Identifier, "tox"), + (Segment::Spaces, " "), + (Segment::Identifier, "withx"), + (Segment::Newline, "\n"), + (Segment::Identifier, "and."), + (Segment::Spaces, " "), + (Segment::Identifier, "with"), + (Segment::EndCommand, "."), + (Segment::Newline, "\n"), + (Segment::End, ""), + ], + &[ + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::First, + ], + ); +} + +#[test] +fn test_punctuation() { + check_segmentation( + r#"~ & | = >= > <= < ~= <> ( ) , - + * / [ ] ** +~&|=>=><=<~=<>(),-+*/[]**!* +% : ; ? _ ` { } ~ !* +"#, + Mode::Auto, + &[ + (Segment::Punct, "~"), + (Segment::Spaces, " "), + (Segment::Punct, "&"), + (Segment::Spaces, " "), + (Segment::Punct, "|"), + (Segment::Spaces, " "), + (Segment::Punct, "="), + (Segment::Spaces, " "), + (Segment::Punct, ">="), + (Segment::Spaces, " "), + (Segment::Punct, ">"), + (Segment::Spaces, " "), + (Segment::Punct, "<="), + (Segment::Spaces, " "), + (Segment::Punct, "<"), + (Segment::Spaces, " "), + (Segment::Punct, "~="), + (Segment::Spaces, " "), + (Segment::Punct, "<>"), + (Segment::Spaces, " "), + (Segment::Punct, "("), + (Segment::Spaces, " "), + (Segment::Punct, ")"), + (Segment::Spaces, " "), + (Segment::Punct, ","), + (Segment::Spaces, " "), + (Segment::Punct, "-"), + (Segment::Spaces, " "), + (Segment::Punct, "+"), + (Segment::Spaces, " "), + (Segment::Punct, "*"), + (Segment::Spaces, " "), + (Segment::Punct, "/"), + (Segment::Spaces, " "), + (Segment::Punct, "["), + (Segment::Spaces, " "), + (Segment::Punct, "]"), + (Segment::Spaces, " "), + (Segment::Punct, "**"), + (Segment::Newline, "\n"), + (Segment::Punct, "~"), + (Segment::Punct, "&"), + (Segment::Punct, "|"), + (Segment::Punct, "="), + (Segment::Punct, ">="), + (Segment::Punct, ">"), + (Segment::Punct, "<="), + (Segment::Punct, "<"), + 
(Segment::Punct, "~="), + (Segment::Punct, "<>"), + (Segment::Punct, "("), + (Segment::Punct, ")"), + (Segment::Punct, ","), + (Segment::Punct, "-"), + (Segment::Punct, "+"), + (Segment::Punct, "*"), + (Segment::Punct, "/"), + (Segment::Punct, "["), + (Segment::Punct, "]"), + (Segment::Punct, "**"), + (Segment::Punct, "!*"), + (Segment::Newline, "\n"), + (Segment::Punct, "%"), + (Segment::Spaces, " "), + (Segment::Punct, ":"), + (Segment::Spaces, " "), + (Segment::Punct, ";"), + (Segment::Spaces, " "), + (Segment::Punct, "?"), + (Segment::Spaces, " "), + (Segment::Punct, "_"), + (Segment::Spaces, " "), + (Segment::Punct, "`"), + (Segment::Spaces, " "), + (Segment::Punct, "{"), + (Segment::Spaces, " "), + (Segment::Punct, "}"), + (Segment::Spaces, " "), + (Segment::Punct, "~"), + (Segment::Spaces, " "), + (Segment::Punct, "!*"), + (Segment::Newline, "\n"), + (Segment::End, ""), + ], + &[PromptStyle::Later, PromptStyle::Later, PromptStyle::Later], + ); +} + +#[test] +fn test_positive_numbers() { + check_segmentation( + r#"0 1 01 001. 1. +123. /* comment 1 */ /* comment 2 */ +.1 0.1 00.1 00.10 +5e1 6E-1 7e+1 6E+01 6e-03 +.3E1 .4e-1 .5E+1 .6e+01 .7E-03 +1.23e1 45.6E-1 78.9e+1 99.9E+01 11.2e-03 +. 1e e1 1e+ 1e- 1. 
+"#, + Mode::Auto, + &[ + (Segment::Number, "0"), + (Segment::Spaces, " "), + (Segment::Number, "1"), + (Segment::Spaces, " "), + (Segment::Number, "01"), + (Segment::Spaces, " "), + (Segment::Number, "001."), + (Segment::Spaces, " "), + (Segment::Number, "1"), + (Segment::EndCommand, "."), + (Segment::Newline, "\n"), + (Segment::Number, "123"), + (Segment::EndCommand, "."), + (Segment::Spaces, " "), + (Segment::Comment, "/* comment 1 */"), + (Segment::Spaces, " "), + (Segment::Comment, "/* comment 2 */"), + (Segment::Newline, "\n"), + (Segment::StartCommand, "."), + (Segment::Number, "1"), + (Segment::Spaces, " "), + (Segment::Number, "0.1"), + (Segment::Spaces, " "), + (Segment::Number, "00.1"), + (Segment::Spaces, " "), + (Segment::Number, "00.10"), + (Segment::Newline, "\n"), + (Segment::Number, "5e1"), + (Segment::Spaces, " "), + (Segment::Number, "6E-1"), + (Segment::Spaces, " "), + (Segment::Number, "7e+1"), + (Segment::Spaces, " "), + (Segment::Number, "6E+01"), + (Segment::Spaces, " "), + (Segment::Number, "6e-03"), + (Segment::Newline, "\n"), + (Segment::StartCommand, "."), + (Segment::Number, "3E1"), + (Segment::Spaces, " "), + (Segment::Number, ".4e-1"), + (Segment::Spaces, " "), + (Segment::Number, ".5E+1"), + (Segment::Spaces, " "), + (Segment::Number, ".6e+01"), + (Segment::Spaces, " "), + (Segment::Number, ".7E-03"), + (Segment::Newline, "\n"), + (Segment::Number, "1.23e1"), + (Segment::Spaces, " "), + (Segment::Number, "45.6E-1"), + (Segment::Spaces, " "), + (Segment::Number, "78.9e+1"), + (Segment::Spaces, " "), + (Segment::Number, "99.9E+01"), + (Segment::Spaces, " "), + (Segment::Number, "11.2e-03"), + (Segment::Newline, "\n"), + (Segment::StartCommand, "."), + (Segment::Spaces, " "), + (Segment::ExpectedExponent, "1e"), + (Segment::Spaces, " "), + (Segment::Identifier, "e1"), + (Segment::Spaces, " "), + (Segment::ExpectedExponent, "1e+"), + (Segment::Spaces, " "), + (Segment::ExpectedExponent, "1e-"), + (Segment::Spaces, " "), + 
(Segment::Number, "1"), + (Segment::EndCommand, "."), + (Segment::Newline, "\n"), + (Segment::End, ""), + ], + &[ + PromptStyle::First, + PromptStyle::First, + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::First, + ], + ); +} + +#[test] +fn test_negative_numbers() { + check_segmentation( + r#" -0 -1 -01 -001. -1. + -123. /* comment 1 */ /* comment 2 */ + -.1 -0.1 -00.1 -00.10 + -5e1 -6E-1 -7e+1 -6E+01 -6e-03 + -.3E1 -.4e-1 -.5E+1 -.6e+01 -.7E-03 + -1.23e1 -45.6E-1 -78.9e+1 -99.9E+01 -11.2e-03 + -/**/1 + -. -1e -e1 -1e+ -1e- -1. +"#, + Mode::Auto, + &[ + (Segment::Spaces, " "), + (Segment::Number, "-0"), + (Segment::Spaces, " "), + (Segment::Number, "-1"), + (Segment::Spaces, " "), + (Segment::Number, "-01"), + (Segment::Spaces, " "), + (Segment::Number, "-001."), + (Segment::Spaces, " "), + (Segment::Number, "-1"), + (Segment::EndCommand, "."), + (Segment::Newline, "\n"), + (Segment::Spaces, " "), + (Segment::Number, "-123"), + (Segment::EndCommand, "."), + (Segment::Spaces, " "), + (Segment::Comment, "/* comment 1 */"), + (Segment::Spaces, " "), + (Segment::Comment, "/* comment 2 */"), + (Segment::Newline, "\n"), + (Segment::Spaces, " "), + (Segment::Number, "-.1"), + (Segment::Spaces, " "), + (Segment::Number, "-0.1"), + (Segment::Spaces, " "), + (Segment::Number, "-00.1"), + (Segment::Spaces, " "), + (Segment::Number, "-00.10"), + (Segment::Newline, "\n"), + (Segment::Spaces, " "), + (Segment::Number, "-5e1"), + (Segment::Spaces, " "), + (Segment::Number, "-6E-1"), + (Segment::Spaces, " "), + (Segment::Number, "-7e+1"), + (Segment::Spaces, " "), + (Segment::Number, "-6E+01"), + (Segment::Spaces, " "), + (Segment::Number, "-6e-03"), + (Segment::Newline, "\n"), + (Segment::Spaces, " "), + (Segment::Number, "-.3E1"), + (Segment::Spaces, " "), + (Segment::Number, "-.4e-1"), + (Segment::Spaces, " "), + (Segment::Number, "-.5E+1"), + (Segment::Spaces, " "), + (Segment::Number, "-.6e+01"), + (Segment::Spaces, " "), 
+ (Segment::Number, "-.7E-03"), + (Segment::Newline, "\n"), + (Segment::Spaces, " "), + (Segment::Number, "-1.23e1"), + (Segment::Spaces, " "), + (Segment::Number, "-45.6E-1"), + (Segment::Spaces, " "), + (Segment::Number, "-78.9e+1"), + (Segment::Spaces, " "), + (Segment::Number, "-99.9E+01"), + (Segment::Spaces, " "), + (Segment::Number, "-11.2e-03"), + (Segment::Newline, "\n"), + (Segment::Spaces, " "), + (Segment::Punct, "-"), + (Segment::Comment, "/**/"), + (Segment::Number, "1"), + (Segment::Newline, "\n"), + (Segment::Spaces, " "), + (Segment::Punct, "-"), + (Segment::Punct, "."), + (Segment::Spaces, " "), + (Segment::ExpectedExponent, "-1e"), + (Segment::Spaces, " "), + (Segment::Punct, "-"), + (Segment::Identifier, "e1"), + (Segment::Spaces, " "), + (Segment::ExpectedExponent, "-1e+"), + (Segment::Spaces, " "), + (Segment::ExpectedExponent, "-1e-"), + (Segment::Spaces, " "), + (Segment::Number, "-1"), + (Segment::EndCommand, "."), + (Segment::Newline, "\n"), + (Segment::End, ""), + ], + &[ + PromptStyle::First, + PromptStyle::First, + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::First, + ], + ); +} + +#[test] +fn test_strings() { + check_segmentation( + r#"'x' "y" 'abc' +'Don''t' "Can't" 'Won''t' +"""quoted""" '"quoted"' +'' "" +'missing end quote +"missing double quote +x"4142" X'5152' +u'fffd' U"041" ++ new command ++ /* comment */ 'string continuation' ++ /* also a punctuator on blank line +- 'new command' +"#, + Mode::Auto, + &[ + (Segment::QuotedString, "'x'"), + (Segment::Spaces, " "), + (Segment::QuotedString, "\"y\""), + (Segment::Spaces, " "), + (Segment::QuotedString, "'abc'"), + (Segment::Newline, "\n"), + (Segment::QuotedString, "'Don''t'"), + (Segment::Spaces, " "), + (Segment::QuotedString, "\"Can't\""), + (Segment::Spaces, " "), + (Segment::QuotedString, "'Won''t'"), + (Segment::Newline, "\n"), + (Segment::QuotedString, "\"\"\"quoted\"\"\""), + (Segment::Spaces, " 
"), + (Segment::QuotedString, "'\"quoted\"'"), + (Segment::Newline, "\n"), + (Segment::QuotedString, "''"), + (Segment::Spaces, " "), + (Segment::QuotedString, "\"\""), + (Segment::Newline, "\n"), + (Segment::ExpectedQuote, "'missing end quote"), + (Segment::Newline, "\n"), + (Segment::ExpectedQuote, "\"missing double quote"), + (Segment::Newline, "\n"), + (Segment::HexString, "x\"4142\""), + (Segment::Spaces, " "), + (Segment::HexString, "X'5152'"), + (Segment::Newline, "\n"), + (Segment::UnicodeString, "u'fffd'"), + (Segment::Spaces, " "), + (Segment::UnicodeString, "U\"041\""), + (Segment::Newline, "\n"), + (Segment::StartCommand, "+"), + (Segment::Spaces, " "), + (Segment::Identifier, "new"), + (Segment::Spaces, " "), + (Segment::Identifier, "command"), + (Segment::Newline, "\n"), + (Segment::Punct, "+"), + (Segment::Spaces, " "), + (Segment::Comment, "/* comment */"), + (Segment::Spaces, " "), + (Segment::QuotedString, "'string continuation'"), + (Segment::Newline, "\n"), + (Segment::Punct, "+"), + (Segment::Spaces, " "), + (Segment::Comment, "/* also a punctuator on blank line"), + (Segment::Newline, "\n"), + (Segment::StartCommand, "-"), + (Segment::Spaces, " "), + (Segment::QuotedString, "'new command'"), + (Segment::Newline, "\n"), + (Segment::End, ""), + ], + &[ + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Later, + ], + ); +} + +#[test] +fn test_shbang() { + check_segmentation( + r#"#! /usr/bin/pspp +title my title. +#! /usr/bin/pspp +"#, + Mode::Interactive, + &[ + (Segment::Shbang, "#! 
/usr/bin/pspp"), + (Segment::Newline, "\n"), + (Segment::Identifier, "title"), + (Segment::Spaces, " "), + (Segment::Identifier, "my"), + (Segment::Spaces, " "), + (Segment::Identifier, "title"), + (Segment::EndCommand, "."), + (Segment::Newline, "\n"), + (Segment::Identifier, "#"), + (Segment::Punct, "!"), + (Segment::Spaces, " "), + (Segment::Punct, "/"), + (Segment::Identifier, "usr"), + (Segment::Punct, "/"), + (Segment::Identifier, "bin"), + (Segment::Punct, "/"), + (Segment::Identifier, "pspp"), + (Segment::Newline, "\n"), + (Segment::End, ""), + ], + &[PromptStyle::First, PromptStyle::First, PromptStyle::Later], + ); +} + +#[test] +fn test_comment_command() { + check_segmentation( + r#"* Comment commands "don't +have to contain valid tokens. + +** Check ambiguity with ** token. +****************. + +comment keyword works too. +COMM also. +com is ambiguous with COMPUTE. + + * Comment need not start at left margin. + +* Comment ends with blank line + +next command. + +"#, + Mode::Interactive, + &[ + (Segment::CommentCommand, "* Comment commands \"don't"), + (Segment::Newline, "\n"), + (Segment::CommentCommand, "have to contain valid tokens"), + (Segment::EndCommand, "."), + (Segment::Newline, "\n"), + (Segment::SeparateCommands, ""), + (Segment::Newline, "\n"), + (Segment::CommentCommand, "** Check ambiguity with ** token"), + (Segment::EndCommand, "."), + (Segment::Newline, "\n"), + (Segment::CommentCommand, "****************"), + (Segment::EndCommand, "."), + (Segment::Newline, "\n"), + (Segment::SeparateCommands, ""), + (Segment::Newline, "\n"), + (Segment::CommentCommand, "comment keyword works too"), + (Segment::EndCommand, "."), + (Segment::Newline, "\n"), + (Segment::CommentCommand, "COMM also"), + (Segment::EndCommand, "."), + (Segment::Newline, "\n"), + (Segment::Identifier, "com"), + (Segment::Spaces, " "), + (Segment::Identifier, "is"), + (Segment::Spaces, " "), + (Segment::Identifier, "ambiguous"), + (Segment::Spaces, " "), + (Segment::Identifier, 
"with"), + (Segment::Spaces, " "), + (Segment::Identifier, "COMPUTE"), + (Segment::EndCommand, "."), + (Segment::Newline, "\n"), + (Segment::SeparateCommands, ""), + (Segment::Newline, "\n"), + (Segment::Spaces, " "), + ( + Segment::CommentCommand, + "* Comment need not start at left margin", + ), + (Segment::EndCommand, "."), + (Segment::Newline, "\n"), + (Segment::SeparateCommands, ""), + (Segment::Newline, "\n"), + (Segment::CommentCommand, "* Comment ends with blank line"), + (Segment::Newline, "\n"), + (Segment::SeparateCommands, ""), + (Segment::Newline, "\n"), + (Segment::Identifier, "next"), + (Segment::Spaces, " "), + (Segment::Identifier, "command"), + (Segment::EndCommand, "."), + (Segment::Newline, "\n"), + (Segment::SeparateCommands, ""), + (Segment::Newline, "\n"), + (Segment::End, ""), + ], + &[ + PromptStyle::Comment, + PromptStyle::First, + PromptStyle::First, + PromptStyle::First, + PromptStyle::First, + PromptStyle::First, + PromptStyle::First, + PromptStyle::First, + PromptStyle::First, + PromptStyle::First, + PromptStyle::First, + PromptStyle::First, + PromptStyle::Comment, + PromptStyle::First, + PromptStyle::First, + PromptStyle::First, + ], + ); +} + +#[test] +fn test_document_command() { + check_segmentation( + r#"DOCUMENT one line. +DOC more + than + one + line. +docu +first.paragraph +isn't parsed as tokens + +second paragraph. 
+"#, + Mode::Interactive, + &[ + (Segment::StartDocument, ""), + (Segment::Document, "DOCUMENT one line."), + (Segment::EndCommand, ""), + (Segment::SeparateCommands, ""), + (Segment::Newline, "\n"), + (Segment::StartDocument, ""), + (Segment::Document, "DOC more"), + (Segment::Newline, "\n"), + (Segment::Document, " than"), + (Segment::Newline, "\n"), + (Segment::Document, " one"), + (Segment::Newline, "\n"), + (Segment::Document, " line."), + (Segment::EndCommand, ""), + (Segment::SeparateCommands, ""), + (Segment::Newline, "\n"), + (Segment::StartDocument, ""), + (Segment::Document, "docu"), + (Segment::Newline, "\n"), + (Segment::Document, "first.paragraph"), + (Segment::Newline, "\n"), + (Segment::Document, "isn't parsed as tokens"), + (Segment::Newline, "\n"), + (Segment::Document, ""), + (Segment::Newline, "\n"), + (Segment::Document, "second paragraph."), + (Segment::EndCommand, ""), + (Segment::SeparateCommands, ""), + (Segment::Newline, "\n"), + (Segment::End, ""), + ], + &[ + PromptStyle::First, + PromptStyle::Document, + PromptStyle::Document, + PromptStyle::Document, + PromptStyle::First, + PromptStyle::Document, + PromptStyle::Document, + PromptStyle::Document, + PromptStyle::Document, + PromptStyle::First, + ], + ); +} + +#[test] +fn test_file_label_command() { + check_segmentation( + r#"FIL label isn't quoted. +FILE + lab 'is quoted'. 
+FILE /* +/**/ lab not quoted here either + +"#, + Mode::Interactive, + &[ + (Segment::Identifier, "FIL"), + (Segment::Spaces, " "), + (Segment::Identifier, "label"), + (Segment::Spaces, " "), + (Segment::UnquotedString, "isn't quoted"), + (Segment::EndCommand, "."), + (Segment::Newline, "\n"), + (Segment::Identifier, "FILE"), + (Segment::Newline, "\n"), + (Segment::Spaces, " "), + (Segment::Identifier, "lab"), + (Segment::Spaces, " "), + (Segment::QuotedString, "'is quoted'"), + (Segment::EndCommand, "."), + (Segment::Newline, "\n"), + (Segment::Identifier, "FILE"), + (Segment::Spaces, " "), + (Segment::Comment, "/*"), + (Segment::Newline, "\n"), + (Segment::Comment, "/**/"), + (Segment::Spaces, " "), + (Segment::Identifier, "lab"), + (Segment::Spaces, " "), + (Segment::UnquotedString, "not quoted here either"), + (Segment::Newline, "\n"), + (Segment::SeparateCommands, ""), + (Segment::Newline, "\n"), + (Segment::End, ""), + ], + &[ + PromptStyle::First, + PromptStyle::Later, + PromptStyle::First, + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::First, + ], + ); +} + +#[test] +fn test_begin_data() { + check_segmentation( + r#"begin data. +end data. + +begin data. /* +123 +xxx +end data. + +BEG /**/ DAT /* +5 6 7 /* x + +end data +end data +. + +begin + data. +data +end data. + +begin data "xxx". +begin data 123. 
+not data +"#, + Mode::Interactive, + &[ + (Segment::Identifier, "begin"), + (Segment::Spaces, " "), + (Segment::Identifier, "data"), + (Segment::EndCommand, "."), + (Segment::Newline, "\n"), + (Segment::Identifier, "end"), + (Segment::Spaces, " "), + (Segment::Identifier, "data"), + (Segment::EndCommand, "."), + (Segment::Newline, "\n"), + (Segment::SeparateCommands, ""), + (Segment::Newline, "\n"), + (Segment::Identifier, "begin"), + (Segment::Spaces, " "), + (Segment::Identifier, "data"), + (Segment::EndCommand, "."), + (Segment::Spaces, " "), + (Segment::Comment, "/*"), + (Segment::Newline, "\n"), + (Segment::InlineData, "123"), + (Segment::Newline, "\n"), + (Segment::InlineData, "xxx"), + (Segment::Newline, "\n"), + (Segment::Identifier, "end"), + (Segment::Spaces, " "), + (Segment::Identifier, "data"), + (Segment::EndCommand, "."), + (Segment::Newline, "\n"), + (Segment::SeparateCommands, ""), + (Segment::Newline, "\n"), + (Segment::Identifier, "BEG"), + (Segment::Spaces, " "), + (Segment::Comment, "/**/"), + (Segment::Spaces, " "), + (Segment::Identifier, "DAT"), + (Segment::Spaces, " "), + (Segment::Comment, "/*"), + (Segment::Newline, "\n"), + (Segment::InlineData, "5 6 7 /* x"), + (Segment::Newline, "\n"), + (Segment::InlineData, ""), + (Segment::Newline, "\n"), + (Segment::InlineData, "end data"), + (Segment::Newline, "\n"), + (Segment::Identifier, "end"), + (Segment::Spaces, " "), + (Segment::Identifier, "data"), + (Segment::Newline, "\n"), + (Segment::StartCommand, "."), + (Segment::Newline, "\n"), + (Segment::SeparateCommands, ""), + (Segment::Newline, "\n"), + (Segment::Identifier, "begin"), + (Segment::Newline, "\n"), + (Segment::Spaces, " "), + (Segment::Identifier, "data"), + (Segment::EndCommand, "."), + (Segment::Newline, "\n"), + (Segment::InlineData, "data"), + (Segment::Newline, "\n"), + (Segment::Identifier, "end"), + (Segment::Spaces, " "), + (Segment::Identifier, "data"), + (Segment::EndCommand, "."), + (Segment::Newline, "\n"), + 
(Segment::SeparateCommands, ""), + (Segment::Newline, "\n"), + (Segment::Identifier, "begin"), + (Segment::Spaces, " "), + (Segment::Identifier, "data"), + (Segment::Spaces, " "), + (Segment::QuotedString, "\"xxx\""), + (Segment::EndCommand, "."), + (Segment::Newline, "\n"), + (Segment::Identifier, "begin"), + (Segment::Spaces, " "), + (Segment::Identifier, "data"), + (Segment::Spaces, " "), + (Segment::Number, "123"), + (Segment::EndCommand, "."), + (Segment::Newline, "\n"), + (Segment::Identifier, "not"), + (Segment::Spaces, " "), + (Segment::Identifier, "data"), + (Segment::Newline, "\n"), + (Segment::End, ""), + ], + &[ + PromptStyle::Data, + PromptStyle::First, + PromptStyle::First, + PromptStyle::Data, + PromptStyle::Data, + PromptStyle::Data, + PromptStyle::First, + PromptStyle::First, + PromptStyle::Data, + PromptStyle::Data, + PromptStyle::Data, + PromptStyle::Data, + PromptStyle::Later, + PromptStyle::First, + PromptStyle::First, + PromptStyle::Later, + PromptStyle::Data, + PromptStyle::Data, + PromptStyle::First, + PromptStyle::First, + PromptStyle::First, + PromptStyle::First, + PromptStyle::Later, + ], + ); +} + +#[test] +fn test_do_repeat() { + check_segmentation( + r#"do repeat x=a b c + y=d e f. + do repeat a=1 thru 5. +another command. +second command ++ third command. +end /* x */ /* y */ repeat print. +end + repeat. +do + repeat #a=1. + inner command. +end repeat. 
+"#, + Mode::Interactive, + &[ + (Segment::Identifier, "do"), + (Segment::Spaces, " "), + (Segment::Identifier, "repeat"), + (Segment::Spaces, " "), + (Segment::Identifier, "x"), + (Segment::Punct, "="), + (Segment::Identifier, "a"), + (Segment::Spaces, " "), + (Segment::Identifier, "b"), + (Segment::Spaces, " "), + (Segment::Identifier, "c"), + (Segment::Newline, "\n"), + (Segment::Spaces, " "), + (Segment::Identifier, "y"), + (Segment::Punct, "="), + (Segment::Identifier, "d"), + (Segment::Spaces, " "), + (Segment::Identifier, "e"), + (Segment::Spaces, " "), + (Segment::Identifier, "f"), + (Segment::EndCommand, "."), + (Segment::Newline, "\n"), + (Segment::DoRepeatCommand, " do repeat a=1 thru 5."), + (Segment::Newline, "\n"), + (Segment::DoRepeatCommand, "another command."), + (Segment::Newline, "\n"), + (Segment::DoRepeatCommand, "second command"), + (Segment::Newline, "\n"), + (Segment::DoRepeatCommand, "+ third command."), + (Segment::Newline, "\n"), + ( + Segment::DoRepeatCommand, + "end /* x */ /* y */ repeat print.", + ), + (Segment::Newline, "\n"), + (Segment::Identifier, "end"), + (Segment::Newline, "\n"), + (Segment::Spaces, " "), + (Segment::Identifier, "repeat"), + (Segment::EndCommand, "."), + (Segment::Newline, "\n"), + (Segment::Identifier, "do"), + (Segment::Newline, "\n"), + (Segment::Spaces, " "), + (Segment::Identifier, "repeat"), + (Segment::Spaces, " "), + (Segment::Identifier, "#a"), + (Segment::Punct, "="), + (Segment::Number, "1"), + (Segment::EndCommand, "."), + (Segment::Newline, "\n"), + (Segment::DoRepeatCommand, " inner command."), + (Segment::Newline, "\n"), + (Segment::Identifier, "end"), + (Segment::Spaces, " "), + (Segment::Identifier, "repeat"), + (Segment::EndCommand, "."), + (Segment::Newline, "\n"), + (Segment::End, ""), + ], + &[ + PromptStyle::Later, + PromptStyle::DoRepeat, + PromptStyle::DoRepeat, + PromptStyle::DoRepeat, + PromptStyle::DoRepeat, + PromptStyle::DoRepeat, + PromptStyle::DoRepeat, + PromptStyle::Later, + 
PromptStyle::First, + PromptStyle::Later, + PromptStyle::DoRepeat, + PromptStyle::DoRepeat, + PromptStyle::First, + ], + ); +} + +#[test] +fn test_do_repeat_overflow() { + const N: usize = 257; + let do_repeat: Vec = (0..N) + .map(|i| format!("do repeat v{i}={i} thru {}.\n", i + 5)) + .collect(); + let end_repeat: Vec = (0..N) + .rev() + .map(|i| format!("end repeat. /* {i}\n")) + .collect(); + + let s: String = do_repeat + .iter() + .chain(end_repeat.iter()) + .map(|s| s.as_str()) + .collect(); + let mut expect_output = vec![ + (Segment::Identifier, "do"), + (Segment::Spaces, " "), + (Segment::Identifier, "repeat"), + (Segment::Spaces, " "), + (Segment::Identifier, "v0"), + (Segment::Punct, "="), + (Segment::Number, "0"), + (Segment::Spaces, " "), + (Segment::Identifier, "thru"), + (Segment::Spaces, " "), + (Segment::Number, "5"), + (Segment::EndCommand, "."), + (Segment::Newline, "\n"), + ]; + for i in 1..N { + expect_output.push((Segment::DoRepeatCommand, &do_repeat[i].trim_end())); + if i >= 255 { + expect_output.push((Segment::DoRepeatOverflow, "")); + } + expect_output.push((Segment::Newline, "\n")); + } + for i in 0..254 { + expect_output.push((Segment::DoRepeatCommand, &end_repeat[i].trim_end())); + expect_output.push((Segment::Newline, "\n")); + } + let comments: Vec = (0..(N - 254)).rev().map(|i| format!("/* {i}")).collect(); + for comment in &comments { + expect_output.extend([ + (Segment::Identifier, "end"), + (Segment::Spaces, " "), + (Segment::Identifier, "repeat"), + (Segment::EndCommand, "."), + (Segment::Spaces, " "), + (Segment::Comment, comment), + (Segment::Newline, "\n"), + ]); + } + expect_output.push((Segment::End, "")); + + let expect_prompts: Vec<_> = (0..N * 2 - 3) + .map(|_| PromptStyle::DoRepeat) + .chain([PromptStyle::First, PromptStyle::First, PromptStyle::First]) + .collect(); + check_segmentation(&s, Mode::Interactive, &expect_output, &expect_prompts); +} + +#[test] +fn test_do_repeat_batch() { + check_segmentation( + r#"do repeat 
x=a b c + y=d e f +do repeat a=1 thru 5 +another command +second command ++ third command +end /* x */ /* y */ repeat print +end + repeat +do + repeat #a=1 + + inner command +end repeat +"#, + Mode::Batch, + &[ + (Segment::Identifier, "do"), + (Segment::Spaces, " "), + (Segment::Identifier, "repeat"), + (Segment::Spaces, " "), + (Segment::Identifier, "x"), + (Segment::Punct, "="), + (Segment::Identifier, "a"), + (Segment::Spaces, " "), + (Segment::Identifier, "b"), + (Segment::Spaces, " "), + (Segment::Identifier, "c"), + (Segment::Newline, "\n"), + (Segment::Spaces, " "), + (Segment::Identifier, "y"), + (Segment::Punct, "="), + (Segment::Identifier, "d"), + (Segment::Spaces, " "), + (Segment::Identifier, "e"), + (Segment::Spaces, " "), + (Segment::Identifier, "f"), + (Segment::Newline, "\n"), + (Segment::StartCommand, ""), + (Segment::DoRepeatCommand, "do repeat a=1 thru 5"), + (Segment::Newline, "\n"), + (Segment::DoRepeatCommand, "another command"), + (Segment::Newline, "\n"), + (Segment::DoRepeatCommand, "second command"), + (Segment::Newline, "\n"), + (Segment::DoRepeatCommand, "+ third command"), + (Segment::Newline, "\n"), + (Segment::DoRepeatCommand, "end /* x */ /* y */ repeat print"), + (Segment::Newline, "\n"), + (Segment::Identifier, "end"), + (Segment::Newline, "\n"), + (Segment::Spaces, " "), + (Segment::Identifier, "repeat"), + (Segment::Newline, "\n"), + (Segment::StartCommand, ""), + (Segment::Identifier, "do"), + (Segment::Newline, "\n"), + (Segment::Spaces, " "), + (Segment::Identifier, "repeat"), + (Segment::Spaces, " "), + (Segment::Identifier, "#a"), + (Segment::Punct, "="), + (Segment::Number, "1"), + (Segment::Newline, "\n"), + (Segment::SeparateCommands, ""), + (Segment::Newline, "\n"), + (Segment::DoRepeatCommand, " inner command"), + (Segment::Newline, "\n"), + (Segment::Identifier, "end"), + (Segment::Spaces, " "), + (Segment::Identifier, "repeat"), + (Segment::Newline, "\n"), + (Segment::End, ""), + ], + &[ + PromptStyle::Later, + 
PromptStyle::Later, + PromptStyle::DoRepeat, + PromptStyle::DoRepeat, + PromptStyle::DoRepeat, + PromptStyle::DoRepeat, + PromptStyle::DoRepeat, + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::DoRepeat, + PromptStyle::DoRepeat, + PromptStyle::Later, + ], + ); +} + +mod define { + use crate::{ + lex::segment::{Mode, Segment}, + prompt::PromptStyle, + }; + + use super::check_segmentation; + + #[test] + fn test_simple() { + check_segmentation( + r#"define !macro1() +var1 var2 var3 "!enddefine" +!enddefine. +"#, + Mode::Interactive, + &[ + (Segment::Identifier, "define"), + (Segment::Spaces, " "), + (Segment::MacroName, "!macro1"), + (Segment::Punct, "("), + (Segment::Punct, ")"), + (Segment::Newline, "\n"), + (Segment::MacroBody, "var1 var2 var3 \"!enddefine\""), + (Segment::Newline, "\n"), + (Segment::Identifier, "!enddefine"), + (Segment::EndCommand, "."), + (Segment::Newline, "\n"), + (Segment::End, ""), + ], + &[PromptStyle::Define, PromptStyle::Define, PromptStyle::First], + ); + } + + #[test] + fn test_no_newline_after_parentheses() { + check_segmentation( + r#"define !macro1() var1 var2 var3 /* !enddefine +!enddefine. +"#, + Mode::Interactive, + &[ + (Segment::Identifier, "define"), + (Segment::Spaces, " "), + (Segment::MacroName, "!macro1"), + (Segment::Punct, "("), + (Segment::Punct, ")"), + (Segment::MacroBody, " var1 var2 var3 /* !enddefine"), + (Segment::Newline, "\n"), + (Segment::Identifier, "!enddefine"), + (Segment::EndCommand, "."), + (Segment::Newline, "\n"), + (Segment::End, ""), + ], + &[PromptStyle::Define, PromptStyle::First], + ); + } + + #[test] + fn test_no_newline_before_enddefine() { + check_segmentation( + r#"define !macro1() +var1 var2 var3!enddefine. 
+"#, + Mode::Interactive, + &[ + (Segment::Identifier, "define"), + (Segment::Spaces, " "), + (Segment::MacroName, "!macro1"), + (Segment::Punct, "("), + (Segment::Punct, ")"), + (Segment::Newline, "\n"), + (Segment::MacroBody, "var1 var2 var3"), + (Segment::Identifier, "!enddefine"), + (Segment::EndCommand, "."), + (Segment::Newline, "\n"), + (Segment::End, ""), + ], + &[PromptStyle::Define, PromptStyle::First], + ); + } + + #[test] + fn test_all_on_one_line() { + check_segmentation( + r#"define !macro1()var1 var2 var3!enddefine. +"#, + Mode::Interactive, + &[ + (Segment::Identifier, "define"), + (Segment::Spaces, " "), + (Segment::MacroName, "!macro1"), + (Segment::Punct, "("), + (Segment::Punct, ")"), + (Segment::MacroBody, "var1 var2 var3"), + (Segment::Identifier, "!enddefine"), + (Segment::EndCommand, "."), + (Segment::Newline, "\n"), + (Segment::End, ""), + ], + &[PromptStyle::First], + ); + } + + #[test] + fn test_empty() { + check_segmentation( + r#"define !macro1() +!enddefine. +"#, + Mode::Interactive, + &[ + (Segment::Identifier, "define"), + (Segment::Spaces, " "), + (Segment::MacroName, "!macro1"), + (Segment::Punct, "("), + (Segment::Punct, ")"), + (Segment::Newline, "\n"), + (Segment::Identifier, "!enddefine"), + (Segment::EndCommand, "."), + (Segment::Newline, "\n"), + (Segment::End, ""), + ], + &[PromptStyle::Define, PromptStyle::First], + ); + } + + #[test] + fn test_blank_lines() { + check_segmentation( + r#"define !macro1() + + +!enddefine. 
+"#, + Mode::Interactive, + &[ + (Segment::Identifier, "define"), + (Segment::Spaces, " "), + (Segment::MacroName, "!macro1"), + (Segment::Punct, "("), + (Segment::Punct, ")"), + (Segment::Newline, "\n"), + (Segment::MacroBody, ""), + (Segment::Newline, "\n"), + (Segment::MacroBody, ""), + (Segment::Newline, "\n"), + (Segment::Identifier, "!enddefine"), + (Segment::EndCommand, "."), + (Segment::Newline, "\n"), + (Segment::End, ""), + ], + &[ + PromptStyle::Define, + PromptStyle::Define, + PromptStyle::Define, + PromptStyle::First, + ], + ); + } + + #[test] + fn test_arguments() { + check_segmentation( + r#"define !macro1(a(), b(), c()) +!enddefine. +"#, + Mode::Interactive, + &[ + (Segment::Identifier, "define"), + (Segment::Spaces, " "), + (Segment::MacroName, "!macro1"), + (Segment::Punct, "("), + (Segment::Identifier, "a"), + (Segment::Punct, "("), + (Segment::Punct, ")"), + (Segment::Punct, ","), + (Segment::Spaces, " "), + (Segment::Identifier, "b"), + (Segment::Punct, "("), + (Segment::Punct, ")"), + (Segment::Punct, ","), + (Segment::Spaces, " "), + (Segment::Identifier, "c"), + (Segment::Punct, "("), + (Segment::Punct, ")"), + (Segment::Punct, ")"), + (Segment::Newline, "\n"), + (Segment::Identifier, "!enddefine"), + (Segment::EndCommand, "."), + (Segment::Newline, "\n"), + (Segment::End, ""), + ], + &[PromptStyle::Define, PromptStyle::First], + ); + } + + #[test] + fn test_multiline_arguments() { + check_segmentation( + r#"define !macro1( + a(), b( + ), + c() +) +!enddefine. 
+"#, + Mode::Interactive, + &[ + (Segment::Identifier, "define"), + (Segment::Spaces, " "), + (Segment::MacroName, "!macro1"), + (Segment::Punct, "("), + (Segment::Newline, "\n"), + (Segment::Spaces, " "), + (Segment::Identifier, "a"), + (Segment::Punct, "("), + (Segment::Punct, ")"), + (Segment::Punct, ","), + (Segment::Spaces, " "), + (Segment::Identifier, "b"), + (Segment::Punct, "("), + (Segment::Newline, "\n"), + (Segment::Spaces, " "), + (Segment::Punct, ")"), + (Segment::Punct, ","), + (Segment::Newline, "\n"), + (Segment::Spaces, " "), + (Segment::Identifier, "c"), + (Segment::Punct, "("), + (Segment::Punct, ")"), + (Segment::Newline, "\n"), + (Segment::Punct, ")"), + (Segment::Newline, "\n"), + (Segment::Identifier, "!enddefine"), + (Segment::EndCommand, "."), + (Segment::Newline, "\n"), + (Segment::End, ""), + ], + &[ + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Define, + PromptStyle::First, + ], + ); + } + + #[test] + fn test_arguments_start_on_second_line() { + check_segmentation( + r#"define !macro1 +(x,y,z +) +content 1 +content 2 +!enddefine. 
+"#, + Mode::Interactive, + &[ + (Segment::Identifier, "define"), + (Segment::Spaces, " "), + (Segment::MacroName, "!macro1"), + (Segment::Newline, "\n"), + (Segment::Punct, "("), + (Segment::Identifier, "x"), + (Segment::Punct, ","), + (Segment::Identifier, "y"), + (Segment::Punct, ","), + (Segment::Identifier, "z"), + (Segment::Newline, "\n"), + (Segment::Punct, ")"), + (Segment::Newline, "\n"), + (Segment::MacroBody, "content 1"), + (Segment::Newline, "\n"), + (Segment::MacroBody, "content 2"), + (Segment::Newline, "\n"), + (Segment::Identifier, "!enddefine"), + (Segment::EndCommand, "."), + (Segment::Newline, "\n"), + (Segment::End, ""), + ], + &[ + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Define, + PromptStyle::Define, + PromptStyle::Define, + PromptStyle::First, + ], + ); + } + + #[test] + fn test_early_end_of_command_1() { + check_segmentation( + r#"define !macro1. +data list /x 1. +"#, + Mode::Interactive, + &[ + (Segment::Identifier, "define"), + (Segment::Spaces, " "), + (Segment::MacroName, "!macro1"), + (Segment::EndCommand, "."), + (Segment::Newline, "\n"), + (Segment::Identifier, "data"), + (Segment::Spaces, " "), + (Segment::Identifier, "list"), + (Segment::Spaces, " "), + (Segment::Punct, "/"), + (Segment::Identifier, "x"), + (Segment::Spaces, " "), + (Segment::Number, "1"), + (Segment::EndCommand, "."), + (Segment::Newline, "\n"), + (Segment::End, ""), + ], + &[PromptStyle::First, PromptStyle::First], + ); + } + + #[test] + fn test_early_end_of_command_2() { + check_segmentation( + r#"define !macro1 +x. +data list /x 1. 
+"#, + Mode::Interactive, + &[ + (Segment::Identifier, "define"), + (Segment::Spaces, " "), + (Segment::MacroName, "!macro1"), + (Segment::Newline, "\n"), + (Segment::Identifier, "x"), + (Segment::EndCommand, "."), + (Segment::Newline, "\n"), + (Segment::Identifier, "data"), + (Segment::Spaces, " "), + (Segment::Identifier, "list"), + (Segment::Spaces, " "), + (Segment::Punct, "/"), + (Segment::Identifier, "x"), + (Segment::Spaces, " "), + (Segment::Number, "1"), + (Segment::EndCommand, "."), + (Segment::Newline, "\n"), + (Segment::End, ""), + ], + &[PromptStyle::Later, PromptStyle::First, PromptStyle::First], + ); + } + + #[test] + fn test_early_end_of_command_3() { + check_segmentation( + r#"define !macro1(. +x. +data list /x 1. +"#, + Mode::Interactive, + &[ + (Segment::Identifier, "define"), + (Segment::Spaces, " "), + (Segment::MacroName, "!macro1"), + (Segment::Punct, "("), + (Segment::EndCommand, "."), + (Segment::Newline, "\n"), + (Segment::Identifier, "x"), + (Segment::EndCommand, "."), + (Segment::Newline, "\n"), + (Segment::Identifier, "data"), + (Segment::Spaces, " "), + (Segment::Identifier, "list"), + (Segment::Spaces, " "), + (Segment::Punct, "/"), + (Segment::Identifier, "x"), + (Segment::Spaces, " "), + (Segment::Number, "1"), + (Segment::EndCommand, "."), + (Segment::Newline, "\n"), + (Segment::End, ""), + ], + &[PromptStyle::First, PromptStyle::First, PromptStyle::First], + ); + } + + #[test] + fn test_early_end_of_command_4() { + // Notice the command terminator at the end of the `DEFINE` command, + // which should not be there and ends it early. + check_segmentation( + r#"define !macro1. +data list /x 1. 
+"#, + Mode::Interactive, + &[ + (Segment::Identifier, "define"), + (Segment::Spaces, " "), + (Segment::MacroName, "!macro1"), + (Segment::EndCommand, "."), + (Segment::Newline, "\n"), + (Segment::Identifier, "data"), + (Segment::Spaces, " "), + (Segment::Identifier, "list"), + (Segment::Spaces, " "), + (Segment::Punct, "/"), + (Segment::Identifier, "x"), + (Segment::Spaces, " "), + (Segment::Number, "1"), + (Segment::EndCommand, "."), + (Segment::Newline, "\n"), + (Segment::End, ""), + ], + &[PromptStyle::First, PromptStyle::First], + ); + } + + #[test] + fn test_missing_enddefine() { + check_segmentation( + r#"define !macro1() +content line 1 +content line 2 +"#, + Mode::Interactive, + &[ + (Segment::Identifier, "define"), + (Segment::Spaces, " "), + (Segment::MacroName, "!macro1"), + (Segment::Punct, "("), + (Segment::Punct, ")"), + (Segment::Newline, "\n"), + (Segment::MacroBody, "content line 1"), + (Segment::Newline, "\n"), + (Segment::MacroBody, "content line 2"), + (Segment::Newline, "\n"), + (Segment::End, ""), + ], + &[ + PromptStyle::Define, + PromptStyle::Define, + PromptStyle::Define, + ], + ); + } + + #[test] + fn test_missing_enddefine_2() { + check_segmentation( + r#"define !macro1() +"#, + Mode::Interactive, + &[ + (Segment::Identifier, "define"), + (Segment::Spaces, " "), + (Segment::MacroName, "!macro1"), + (Segment::Punct, "("), + (Segment::Punct, ")"), + (Segment::Newline, "\n"), + (Segment::End, ""), + ], + &[PromptStyle::Define], + ); + } +} + +#[test] +fn test_batch_mode() { + check_segmentation( + r#"first command + another line of first command ++ second command +third command + +fourth command. + fifth command. 
+"#, + Mode::Batch, + &[ + (Segment::Identifier, "first"), + (Segment::Spaces, " "), + (Segment::Identifier, "command"), + (Segment::Newline, "\n"), + (Segment::Spaces, " "), + (Segment::Identifier, "another"), + (Segment::Spaces, " "), + (Segment::Identifier, "line"), + (Segment::Spaces, " "), + (Segment::Identifier, "of"), + (Segment::Spaces, " "), + (Segment::Identifier, "first"), + (Segment::Spaces, " "), + (Segment::Identifier, "command"), + (Segment::Newline, "\n"), + (Segment::StartCommand, "+"), + (Segment::Spaces, " "), + (Segment::Identifier, "second"), + (Segment::Spaces, " "), + (Segment::Identifier, "command"), + (Segment::Newline, "\n"), + (Segment::StartCommand, ""), + (Segment::Identifier, "third"), + (Segment::Spaces, " "), + (Segment::Identifier, "command"), + (Segment::Newline, "\n"), + (Segment::SeparateCommands, ""), + (Segment::Newline, "\n"), + (Segment::Identifier, "fourth"), + (Segment::Spaces, " "), + (Segment::Identifier, "command"), + (Segment::EndCommand, "."), + (Segment::Newline, "\n"), + (Segment::Spaces, " "), + (Segment::Identifier, "fifth"), + (Segment::Spaces, " "), + (Segment::Identifier, "command"), + (Segment::EndCommand, "."), + (Segment::Newline, "\n"), + (Segment::End, ""), + ], + &[ + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::First, + PromptStyle::First, + PromptStyle::First, + ], + ); +} + +#[test] +fn test_auto_mode() { + check_segmentation( + r#"command + another line of command +2sls ++ another command +another line of second command +data list /x 1 +aggregate. +print eject. +twostep cluster + + +fourth command. + fifth command. 
+"#, + Mode::Auto, + &[ + (Segment::Identifier, "command"), + (Segment::Newline, "\n"), + (Segment::Spaces, " "), + (Segment::Identifier, "another"), + (Segment::Spaces, " "), + (Segment::Identifier, "line"), + (Segment::Spaces, " "), + (Segment::Identifier, "of"), + (Segment::Spaces, " "), + (Segment::Identifier, "command"), + (Segment::Newline, "\n"), + (Segment::StartCommand, ""), + (Segment::Number, "2"), + (Segment::Identifier, "sls"), + (Segment::Newline, "\n"), + (Segment::StartCommand, "+"), + (Segment::Spaces, " "), + (Segment::Identifier, "another"), + (Segment::Spaces, " "), + (Segment::Identifier, "command"), + (Segment::Newline, "\n"), + (Segment::Identifier, "another"), + (Segment::Spaces, " "), + (Segment::Identifier, "line"), + (Segment::Spaces, " "), + (Segment::Identifier, "of"), + (Segment::Spaces, " "), + (Segment::Identifier, "second"), + (Segment::Spaces, " "), + (Segment::Identifier, "command"), + (Segment::Newline, "\n"), + (Segment::StartCommand, ""), + (Segment::Identifier, "data"), + (Segment::Spaces, " "), + (Segment::Identifier, "list"), + (Segment::Spaces, " "), + (Segment::Punct, "/"), + (Segment::Identifier, "x"), + (Segment::Spaces, " "), + (Segment::Number, "1"), + (Segment::Newline, "\n"), + (Segment::StartCommand, ""), + (Segment::Identifier, "aggregate"), + (Segment::EndCommand, "."), + (Segment::Newline, "\n"), + (Segment::Identifier, "print"), + (Segment::Spaces, " "), + (Segment::Identifier, "eject"), + (Segment::EndCommand, "."), + (Segment::Newline, "\n"), + (Segment::Identifier, "twostep"), + (Segment::Spaces, " "), + (Segment::Identifier, "cluster"), + (Segment::Newline, "\n"), + (Segment::SeparateCommands, ""), + (Segment::Newline, "\n"), + (Segment::SeparateCommands, ""), + (Segment::Newline, "\n"), + (Segment::Identifier, "fourth"), + (Segment::Spaces, " "), + (Segment::Identifier, "command"), + (Segment::EndCommand, "."), + (Segment::Newline, "\n"), + (Segment::Spaces, " "), + (Segment::Identifier, "fifth"), + 
(Segment::Spaces, " "), + (Segment::Identifier, "command"), + (Segment::EndCommand, "."), + (Segment::Newline, "\n"), + (Segment::End, ""), + ], + &[ + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::First, + PromptStyle::First, + PromptStyle::Later, + PromptStyle::First, + PromptStyle::First, + PromptStyle::First, + PromptStyle::First, + ], + ); +} diff --git a/rust/pspp/src/lex/token.rs b/rust/pspp/src/lex/token.rs new file mode 100644 index 0000000000..2b59423b5f --- /dev/null +++ b/rust/pspp/src/lex/token.rs @@ -0,0 +1,272 @@ +use std::fmt::{Display, Formatter, Result as FmtResult}; + +use crate::identifier::Identifier; + +#[derive(Clone, Debug, PartialEq)] +pub enum Token { + /// End of input. + End, + + /// Identifier. + Id(Identifier), + + /// Number. + Number(f64), + + /// Quoted string. + String(String), + + /// Command terminator or separator. + /// + /// Usually this is `.`, but a blank line also separates commands, and in + /// batch mode any line that begins with a non-blank starts a new command. + EndCommand, + + /// Operators, punctuators, and reserved words. 
+ Punct(Punct), +} + +impl Token { + pub fn id(&self) -> Option<&Identifier> { + match self { + Self::Id(identifier) => Some(identifier), + _ => None, + } + } +} + +fn is_printable(c: char) -> bool { + !c.is_control() || ['\t', '\r', '\n'].contains(&c) +} + +fn string_representation(s: &str, quote: char, f: &mut Formatter<'_>) -> FmtResult { + write!(f, "{quote}")?; + for section in s.split_inclusive(quote) { + if let Some(rest) = section.strip_suffix(quote) { + write!(f, "{rest}{quote}{quote}")?; + } else { + write!(f, "{section}")?; + } + } + write!(f, "{quote}") +} + +impl Display for Token { + fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult { + match self { + Token::End => Ok(()), + Token::Id(s) => write!(f, "{s}"), + Token::Number(number) => { + if number.is_sign_negative() { + write!(f, "-{}", number.abs()) + } else { + write!(f, "{number}") + } + } + Token::String(s) => { + if s.chars().all(|c| is_printable(c)) { + if s.contains('"') { + string_representation(s, '\'', f) + } else { + string_representation(s, '"', f) + } + } else { + write!(f, "X\"")?; + for byte in s.bytes() { + let c1 = char::from_digit((byte >> 4) as u32, 16) + .unwrap() + .to_ascii_uppercase(); + let c2 = char::from_digit((byte & 0xf) as u32, 16) + .unwrap() + .to_ascii_uppercase() + .to_ascii_lowercase(); + write!(f, "{c1}{c2}")?; + } + write!(f, "\"") + } + } + Token::EndCommand => write!(f, "."), + Token::Punct(punct) => punct.fmt(f), + } + } +} + +/// Check that all negative numbers, even -0, get formatted with a leading `-`. +#[cfg(test)] +mod test { + use crate::lex::token::Token; + + #[test] + fn test_string() { + assert_eq!(Token::String(String::from("abc")).to_string(), "\"abc\""); + assert_eq!( + Token::String(String::from("\u{0080}")).to_string(), + "X\"C280\"" + ); + } + + #[test] + fn test_neg0() { + assert_eq!(Token::Number(-0.0).to_string(), "-0"); + } +} + +#[derive(Copy, Clone, Debug, PartialEq, Eq)] +pub enum Punct { + /// `+`. + Plus, + + /// `-`. + Dash, + + /// `*`. 
+ Asterisk, + + /// `/`. + Slash, + + /// `=`. + Equals, + + /// `(`. + LParen, + + /// `)`. + RParen, + + /// `[`. + LSquare, + + /// `]`. + RSquare, + + /// `{`. + LCurly, + + /// `}`. + RCurly, + + /// `,`. + Comma, + + /// `;`. + Semicolon, + + /// `:`. + Colon, + + /// `AND` or `&`. + And, + + /// `OR` or `|`. + Or, + + /// `NOT` or `~`. + Not, + + /// `EQ` or `=`. + Eq, + + /// `GE` or '>=` + Ge, + + /// `GT` or `>`. + Gt, + + /// `LE` or `<=`. + Le, + + /// `LT` or `<`. + Lt, + + /// `NE` or `~=` or `<>`. + Ne, + + /// `ALL`. + All, + + /// `BY`. + By, + + /// `TO`. + To, + + /// `WITH`. + With, + + /// `**`. + Exp, + + /// `!` (only appears in macros). + Bang, + + /// `%` (only appears in macros). + Percent, + + /// `?` (only appears in macros). + Question, + + /// ```` (only appears in macros). + Backtick, + + /// `.`. + /// + /// This represents a dot in the middle of a line by itself, where it does not end a command. + Dot, + + /// `_` (only appears in macros). + /// + /// Although underscores may appear within identifiers, they can't be the + /// first character, so this represents an underscore found on its own. + Underscore, + + /// `!*` (only appears in macros). 
+ BangAsterisk, +} + +impl Punct { + pub fn as_str(&self) -> &'static str { + match self { + Self::Plus => "+", + Self::Dash => "-", + Self::Asterisk => "*", + Self::Slash => "/", + Self::Equals => "=", + Self::LParen => "(", + Self::RParen => ")", + Self::LSquare => "[", + Self::RSquare => "]", + Self::LCurly => "{", + Self::RCurly => "}", + Self::Comma => ",", + Self::Semicolon => ";", + Self::Colon => ":", + Self::And => "AND", + Self::Or => "OR", + Self::Not => "NOT", + Self::Eq => "EQ", + Self::Ge => ">=", + Self::Gt => ">", + Self::Le => "<=", + Self::Lt => "<", + Self::Ne => "~=", + Self::All => "ALL", + Self::By => "BY", + Self::To => "TO", + Self::With => "WITH", + Self::Exp => "**", + Self::Bang => "!", + Self::Percent => "%", + Self::Question => "?", + Self::Backtick => "`", + Self::Dot => ".", + Self::Underscore => "_", + Self::BangAsterisk => "!*", + } + } +} +impl Display for Punct { + fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult { + write!(f, "{}", self.as_str()) + } +} diff --git a/rust/pspp/src/lib.rs b/rust/pspp/src/lib.rs new file mode 100644 index 0000000000..3548e020ee --- /dev/null +++ b/rust/pspp/src/lib.rs @@ -0,0 +1,20 @@ +#[allow(unused_variables, unused_mut, dead_code)] +pub mod cooked; +pub mod dictionary; +pub mod encoding; +pub mod endian; +pub mod format; +pub mod identifier; +pub mod locale_charset; +pub mod output; +#[allow(unused_variables, unused_mut, dead_code)] +pub mod raw; +pub mod sack; +pub mod lex; +pub mod prompt; +pub mod message; +pub mod macros; +pub mod settings; +pub mod command; +pub mod integer; +pub mod engine; diff --git a/rust/pspp/src/locale_charset.rs b/rust/pspp/src/locale_charset.rs new file mode 100644 index 0000000000..596fd62406 --- /dev/null +++ b/rust/pspp/src/locale_charset.rs @@ -0,0 +1,306 @@ +// Determine a canonical name for the current locale's character encoding. +// +// Copyright (C) 2000-2006, 2008-2023 Free Software Foundation, Inc. 
+// +// This file is free software: you can redistribute it and/or modify it under +// the terms of the GNU Lesser General Public License as published by the Free +// Software Foundation; either version 2.1 of the License, or (at your option) +// any later version. +// +// This file is distributed in the hope that it will be useful, but WITHOUT ANY +// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR +// A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more +// details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with this program. If not, see . +// +// Written by Bruno Haible . Translated to Rust by Ben Pfaff +// . + +use lazy_static::lazy_static; + +fn map_aliases(s: &str) -> &'static str { + #[cfg(target_os = "freebsd")] + match s { + "ARMSCII-8" => return "ARMSCII-8", + "Big5" => return "BIG5", + "C" => return "ASCII", + "CP1131" => return "CP1131", + "CP1251" => return "CP1251", + "CP866" => return "CP866", + "GB18030" => return "GB18030", + "GB2312" => return "GB2312", + "GBK" => return "GBK", + "ISCII-DEV" => return "?", + "ISO8859-1" => return "ISO-8859-1", + "ISO8859-13" => return "ISO-8859-13", + "ISO8859-15" => return "ISO-8859-15", + "ISO8859-2" => return "ISO-8859-2", + "ISO8859-5" => return "ISO-8859-5", + "ISO8859-7" => return "ISO-8859-7", + "ISO8859-9" => return "ISO-8859-9", + "KOI8-R" => return "KOI8-R", + "KOI8-U" => return "KOI8-U", + "SJIS" => return "SHIFT_JIS", + "US-ASCII" => return "ASCII", + "eucCN" => return "GB2312", + "eucJP" => return "EUC-JP", + "eucKR" => return "EUC-KR", + _ => (), + }; + + #[cfg(target_os = "netbsd")] + match s { + "646" => return "ASCII", + "ARMSCII-8" => return "ARMSCII-8", + "BIG5" => return "BIG5", + "Big5-HKSCS" => return "BIG5-HKSCS", + "CP1251" => return "CP1251", + "CP866" => return "CP866", + "GB18030" => return "GB18030", + "GB2312" => return "GB2312", + "ISO8859-1" => return "ISO-8859-1", + "ISO8859-13" => 
return "ISO-8859-13", + "ISO8859-15" => return "ISO-8859-15", + "ISO8859-2" => return "ISO-8859-2", + "ISO8859-4" => return "ISO-8859-4", + "ISO8859-5" => return "ISO-8859-5", + "ISO8859-7" => return "ISO-8859-7", + "KOI8-R" => return "KOI8-R", + "KOI8-U" => return "KOI8-U", + "PT154" => return "PT154", + "SJIS" => return "SHIFT_JIS", + "eucCN" => return "GB2312", + "eucJP" => return "EUC-JP", + "eucKR" => return "EUC-KR", + "eucTW" => return "EUC-TW", + _ => (), + }; + + #[cfg(target_os = "openbsd")] + match s { + "646" => return "ASCII", + "ISO8859-1" => return "ISO-8859-1", + "ISO8859-13" => return "ISO-8859-13", + "ISO8859-15" => return "ISO-8859-15", + "ISO8859-2" => return "ISO-8859-2", + "ISO8859-4" => return "ISO-8859-4", + "ISO8859-5" => return "ISO-8859-5", + "ISO8859-7" => return "ISO-8859-7", + "US-ASCII" => return "ASCII", + _ => (), + }; + + /* Darwin 7.5 has nl_langinfo(CODESET), but sometimes its value is + useless: + - It returns the empty string when LANG is set to a locale of the + form ll_CC, although ll_CC/LC_CTYPE is a symlink to an UTF-8 + LC_CTYPE file. + - The environment variables LANG, LC_CTYPE, LC_ALL are not set by + the system; nl_langinfo(CODESET) returns "US-ASCII" in this case. + - The documentation says: + "... all code that calls BSD system routines should ensure + that the const *char parameters of these routines are in UTF-8 + encoding. All BSD system functions expect their string + parameters to be in UTF-8 encoding and nothing else." + It also says + "An additional caveat is that string parameters for files, + paths, and other file-system entities must be in canonical + UTF-8. In a canonical UTF-8 Unicode string, all decomposable + characters are decomposed ..." + but this is not true: You can pass non-decomposed UTF-8 strings + to file system functions, and it is the OS which will convert + them to decomposed UTF-8 before accessing the file system. + - The Apple Terminal application displays UTF-8 by default. 
+ - However, other applications are free to use different encodings: + - xterm uses ISO-8859-1 by default. + - TextEdit uses MacRoman by default. + We prefer UTF-8 over decomposed UTF-8-MAC because one should + minimize the use of decomposed Unicode. Unfortunately, through the + Darwin file system, decomposed UTF-8 strings are leaked into user + space nevertheless. + Then there are also the locales with encodings other than US-ASCII + and UTF-8. These locales can be occasionally useful to users (e.g. + when grepping through ISO-8859-1 encoded text files), when all their + file names are in US-ASCII. + */ + + #[cfg(target_os = "macos")] + match s { + "ARMSCII-8" => return "ARMSCII-8", + "Big5" => return "BIG5", + "Big5HKSCS" => return "BIG5-HKSCS", + "CP1131" => return "CP1131", + "CP1251" => return "CP1251", + "CP866" => return "CP866", + "CP949" => return "CP949", + "GB18030" => return "GB18030", + "GB2312" => return "GB2312", + "GBK" => return "GBK", + "ISO8859-1" => return "ISO-8859-1", + "ISO8859-13" => return "ISO-8859-13", + "ISO8859-15" => return "ISO-8859-15", + "ISO8859-2" => return "ISO-8859-2", + "ISO8859-4" => return "ISO-8859-4", + "ISO8859-5" => return "ISO-8859-5", + "ISO8859-7" => return "ISO-8859-7", + "ISO8859-9" => return "ISO-8859-9", + "KOI8-R" => return "KOI8-R", + "KOI8-U" => return "KOI8-U", + "PT154" => return "PT154", + "SJIS" => return "SHIFT_JIS", + "eucCN" => return "GB2312", + "eucJP" => return "EUC-JP", + "eucKR" => return "EUC-KR", + _ => (), + }; + + #[cfg(target_os = "aix")] + match s { + "GBK" => return "GBK", + "IBM-1046" => return "CP1046", + "IBM-1124" => return "CP1124", + "IBM-1129" => return "CP1129", + "IBM-1252" => return "CP1252", + "IBM-850" => return "CP850", + "IBM-856" => return "CP856", + "IBM-921" => return "ISO-8859-13", + "IBM-922" => return "CP922", + "IBM-932" => return "CP932", + "IBM-943" => return "CP943", + "IBM-eucCN" => return "GB2312", + "IBM-eucJP" => return "EUC-JP", + "IBM-eucKR" => return "EUC-KR", + 
"IBM-eucTW" => return "EUC-TW", + "ISO8859-1" => return "ISO-8859-1", + "ISO8859-15" => return "ISO-8859-15", + "ISO8859-2" => return "ISO-8859-2", + "ISO8859-5" => return "ISO-8859-5", + "ISO8859-6" => return "ISO-8859-6", + "ISO8859-7" => return "ISO-8859-7", + "ISO8859-8" => return "ISO-8859-8", + "ISO8859-9" => return "ISO-8859-9", + "TIS-620" => return "TIS-620", + "UTF-8" => return "UTF-8", + "big5" => return "BIG5", + _ => (), + }; + + #[cfg(windows)] + match s { + "CP1361" => return "JOHAB", + "CP20127" => return "ASCII", + "CP20866" => return "KOI8-R", + "CP20936" => return "GB2312", + "CP21866" => return "KOI8-RU", + "CP28591" => return "ISO-8859-1", + "CP28592" => return "ISO-8859-2", + "CP28593" => return "ISO-8859-3", + "CP28594" => return "ISO-8859-4", + "CP28595" => return "ISO-8859-5", + "CP28596" => return "ISO-8859-6", + "CP28597" => return "ISO-8859-7", + "CP28598" => return "ISO-8859-8", + "CP28599" => return "ISO-8859-9", + "CP28605" => return "ISO-8859-15", + "CP38598" => return "ISO-8859-8", + "CP51932" => return "EUC-JP", + "CP51936" => return "GB2312", + "CP51949" => return "EUC-KR", + "CP51950" => return "EUC-TW", + "CP54936" => return "GB18030", + "CP65001" => return "UTF-8", + "CP936" => return "GBK", + _ => (), + }; + + String::from(s).leak() +} + +#[cfg(unix)] +mod inner { + use std::{ + ffi::{c_int, CStr, CString}, + ptr::null, + }; + + use libc::{self, nl_langinfo, setlocale, CODESET, LC_CTYPE}; + + unsafe fn string_from_pointer(s: *const i8) -> Option { + if s.is_null() { + None + } else { + Some(CStr::from_ptr(s).to_string_lossy().into()) + } + } + + fn set_locale(category: c_int, locale: Option<&str>) -> Option { + unsafe { + let locale = locale.map(|s| CString::new(s).unwrap()); + let locale_ptr = locale.as_ref().map_or(null(), |s| s.as_ptr()); + string_from_pointer(setlocale(category, locale_ptr)) + } + } + + pub fn locale_charset() -> Option { + unsafe { + let saved_locale = set_locale(LC_CTYPE, None); + set_locale(LC_CTYPE, 
Some("")); + let codeset = string_from_pointer(nl_langinfo(CODESET)); + set_locale(LC_CTYPE, saved_locale.as_deref()); + codeset + } + } +} + +#[cfg(windows)] +mod inner { + use libc::{setlocale, LC_CTYPE}; + use std::ffi::{CStr, CString}; + use windows_sys::Win32::Globalization::GetACP; + + fn current_locale() -> Option { + unsafe { + let empty_cstr = CString::new("").unwrap(); + let locale = setlocale(LC_CTYPE, empty_cstr.as_ptr()); + if locale.is_null() { + None + } else { + Some(CStr::from_ptr(locale).to_string_lossy().into()) + } + } + } + + pub fn locale_charset() -> Option { + let Some(current_locale) = current_locale() else { + return None; + }; + let codepage = if let Some((_, pdot)) = current_locale.rsplit_once('.') { + format!("CP{pdot}") + } else { + format!("CP{}", unsafe { GetACP() }) + }; + Some(match codepage.as_str() { + "CP65001" | "CPutf8" => String::from("UTF-8"), + _ => codepage, + }) + } +} + +#[cfg(not(any(unix, windows)))] +mod inner { + pub fn locale_charse() -> String { + String::from("UTF-8") + } +} + +/// Returns the character set used by the locale configured in the operating +/// system. +pub fn locale_charset() -> &'static str { + lazy_static! 
{ + static ref LOCALE_CHARSET: &'static str = + map_aliases(&inner::locale_charset().unwrap_or(String::from("UTF-8"))); + } + &LOCALE_CHARSET +} diff --git a/rust/pspp/src/macros.rs b/rust/pspp/src/macros.rs new file mode 100644 index 0000000000..85671b05a5 --- /dev/null +++ b/rust/pspp/src/macros.rs @@ -0,0 +1,1668 @@ +use lazy_static::lazy_static; +use num::Integer; +use std::{ + cell::RefCell, + cmp::Ordering, + collections::{BTreeMap, HashMap, HashSet}, + mem::take, + num::NonZeroUsize, + ops::RangeInclusive, +}; +use thiserror::Error as ThisError; +use unicase::UniCase; + +use crate::{ + identifier::Identifier, + lex::{ + scan::{ScanError, ScanToken, StringScanner, StringSegmenter}, + segment::Mode, + token::{Punct, Token}, + }, + message::Location, + settings::Settings, +}; + +#[derive(Clone, Debug, ThisError)] +pub enum MacroError { + /// Expected more tokens. + #[error( + "Reached end of command expecting {n} more tokens in argument {arg} to macro {macro_}." + )] + ExpectedMoreTokens { + n: usize, + arg: Identifier, + macro_: Identifier, + }, + + /// Expected a particular token at end of command. + #[error("Reached end of command expecting {token:?} in argument {arg} to macro {macro_}.")] + ExpectedToken { + token: String, + arg: Identifier, + macro_: Identifier, + }, + + /// Expected a particular token, got a different one. + #[error( + "Found `{actual}` while expecting `{expected}` reading argument {arg} to macro {macro_}." + )] + UnexpectedToken { + actual: String, + expected: String, + arg: Identifier, + macro_: Identifier, + }, + + /// Argument specified multiple times, + #[error("Argument {arg} specified multiple times in call to macro {macro_}.")] + DuplicateArg { arg: Identifier, macro_: Identifier }, + + /// Maximum nesting limit exceeded. + #[error("Maximum nesting level {limit} exceeded. (Use `SET MNEST` to change the limit.)")] + TooDeep { limit: usize }, + + /// Invalid `!*`. 
+ #[error("`!*` may only be used within the expansion of a macro.")] + InvalidBangAsterisk, + + /// Error tokenizing during expansion. + #[error(transparent)] + ScanError(ScanError), + + /// Expecting `)` in macro expression. + #[error("Expecting `)` in macro expression.")] + ExpectingRParen, + + /// Expecting literal. + #[error("Expecting literal or function invocation in macro expression.")] + ExpectingLiteral, + + /// Expecting `!THEN`. + #[error("`!THEN` expected in macro `!IF` construct.")] + ExpectingThen, + + /// Expecting `!ELSE` or `!THEN`. + #[error("`!ELSE` or `!THEN` expected in macro `!IF` construct.")] + ExpectingElseOrIfEnd, + + /// Expecting `!IFEND`. + #[error("`!IFEND` expected in macro `!IF` construct.")] + ExpectingIfEnd, + + /// Expecting macro variable name. + #[error("Expecting macro variable name following `{0}`.")] + ExpectingMacroVarName(&'static str), + + /// Invalid macro variable name. + #[error("Cannot use argument name or macro keyword {name} as `{construct}` variable name.")] + BadMacroVarName { + name: Identifier, + construct: &'static str, + }, + + /// Expecting `=` following `!LET`. + #[error("Expecting `=` following `!LET`.")] + ExpectingEquals, + + /// Expecting `=` or `!IN` in `!DO` loop. + #[error("Expecting `=` or `!IN` in `!DO` loop.")] + ExpectingEqualsOrIn, + + /// Missing `!DOEND`. + #[error("Missing `!DOEND`.")] + MissingDoEnd, + + /// Bad numberic macro expression. + #[error("Macro expression must evaluate to a number (not {0:?})")] + BadNumericMacroExpression(String), + + /// Too many iteration for list-based loop. + #[error("`!DO` loop over list exceeded maximum number of iterations {0}. (Use `SET MITERATE` to change the limit.)")] + MiterateList(usize), + + /// Too many iteration for numerical loop. + #[error("Numerical `!DO` loop exceeded maximum number of iterations {0}. (Use `SET MITERATE` to change the limit.)")] + MiterateNumeric(usize), + + /// Expecting `!TO` in numerical `!DO` loop. 
+ #[error("Expecting `!TO` in numerical `!DO` loop.")] + ExpectingTo, + + /// `!BY` value cannot be zero. + #[error("`!BY` value cannot be zero.")] + ZeroBy, + + /// `!BREAK` outside `!DO`. + #[error("`!BREAK` outside `!DO`.")] + BreakOutsideDo, + + /// `,` or `)` expected in call to macro function. + #[error("`,` or `)` expected in call to macro function `{0}`.")] + ExpectingCommaOrRParen(Identifier), + + /// Macro function takes one argument. + #[error("Macro function `{name}` takes one argument (not {n_args}).")] + ExpectingOneArg { name: Identifier, n_args: usize }, + + /// Macro function takes two arguments. + #[error("Macro function `{name}` takes two arguments (not {n_args}).")] + ExpectingTwoArgs { name: Identifier, n_args: usize }, + + /// Macro function takes two or three arguments. + #[error("Macro function `{name}` takes two or three arguments (not {n_args}).")] + ExpectingTwoOrThreeArgs { name: Identifier, n_args: usize }, + + /// Macro function needs at least one argument). + #[error("Macro function `{name}` needs at least one argument).")] + ExpectingOneOrMoreArgs { name: Identifier }, + + /// Argument to `!BLANKS` must be non-negative integer (not `{0}`). + #[error("Argument to `!BLANKS` must be non-negative integer (not `{0}`).")] + InvalidBlanks(String), + + /// Second argument of `!SUBSTR` must be positive integer (not `{0}`). + #[error("Second argument of `!SUBSTR` must be positive integer (not `{0}`).")] + InvalidSubstr2(String), + + /// Third argument of `!SUBSTR` must be non-negative integer (not `{0}`). + #[error("Third argument of `!SUBSTR` must be non-negative integer (not `{0}`).")] + InvalidSubstr3(String), +} + +/// A PSPP macro as defined with `!DEFINE`. +pub struct Macro { + /// The macro's name. This is an ordinary identifier except that it is + /// allowed (but not required) to begin with `!`. + pub name: Identifier, + + /// Source code location of macro definition, for error reporting. + pub location: Location, + + /// Parameters. 
+ parameters: Vec, + + /// Body. + body: Vec, +} + +impl Macro { + fn initial_state(&self) -> ParserState { + if self.parameters.is_empty() { + ParserState::Finished + } else if self.parameters[0].is_positional() { + ParserState::Keyword + } else if let ValueType::Enclose(_, _) = self.parameters[0].arg { + ParserState::Enclose + } else { + ParserState::Arg + } + } + + fn find_parameter(&self, name: &Identifier) -> Option { + self.parameters.iter().position(|param| ¶m.name == name) + } +} + +struct Parameter { + /// `!name` or `!1`. + name: Identifier, + + /// Default value. + /// + /// The tokens don't include white space, etc. between them. + default: Vec, + + /// Macro-expand the value? + expand_value: bool, + + /// How the argument is specified. + arg: ValueType, +} + +impl Parameter { + /// Returns true if this is a positional parameter. Positional parameters + /// are expanded by index (position) rather than by name. + fn is_positional(&self) -> bool { + self.name.0.as_bytes()[1].is_ascii_digit() + } +} + +enum ValueType { + /// Argument consists of `.0` tokens. + NTokens(usize), + + /// Argument runs until token `.0`. + CharEnd(Token), + + /// Argument starts with token `.0` and ends with token `.1`. + Enclose(Token, Token), + + /// Argument runs until the end of the command. + CmdEnd, +} + +/// A token and the syntax that was tokenized to produce it. The syntax allows +/// the token to be turned back into syntax accurately. +#[derive(Clone)] +pub struct MacroToken { + /// The token. + pub token: Token, + + /// The syntax that produces `token`. 
+ pub syntax: String, +} + +fn tokenize_string_into( + s: &str, + mode: Mode, + error: &impl Fn(MacroError), + output: &mut Vec, +) { + for (syntax, token) in StringSegmenter::new(s, mode, true) { + match token { + ScanToken::Token(token) => output.push(MacroToken { + token, + syntax: String::from(syntax), + }), + ScanToken::Error(scan_error) => error(MacroError::ScanError(scan_error)), + } + } +} + +fn tokenize_string(s: &str, mode: Mode, error: &impl Fn(MacroError)) -> Vec { + let mut tokens = Vec::new(); + tokenize_string_into(s, mode, error, &mut tokens); + tokens +} + +fn try_unquote_string(input: &String, mode: Mode) -> Option { + let mut scanner = StringScanner::new(input, mode, true); + let Some(ScanToken::Token(Token::String(unquoted))) = scanner.next() else { + return None; + }; + let None = scanner.next() else { return None }; + return Some(unquoted); +} + +fn unquote_string(input: String, mode: Mode) -> String { + try_unquote_string(&input, mode).unwrap_or(input) +} + +#[derive(Clone)] +struct MacroTokens<'a>(&'a [MacroToken]); + +impl<'a> MacroTokens<'a> { + fn is_empty(&self) -> bool { + self.0.is_empty() + } + fn match_(&mut self, s: &str) -> bool { + if let Some((first, rest)) = self.0.split_first() { + if first.syntax.eq_ignore_ascii_case(s) { + self.0 = rest; + return true; + } + } + false + } + fn take_relop(&mut self) -> Option { + if let Some((first, rest)) = self.0.split_first() { + if let Ok(relop) = first.syntax.as_str().try_into() { + self.0 = rest; + return Some(relop); + } + } + None + } + fn macro_id(&self) -> Option<&Identifier> { + self.0.get(0).map(|mt| mt.token.macro_id()).flatten() + } + fn take_macro_id(&mut self) -> Option<&Identifier> { + let result = self.0.get(0).map(|mt| mt.token.macro_id()).flatten(); + if result.is_some() { + self.advance(); + } + result + } + fn take(&mut self) -> Option<&MacroToken> { + match self.0.split_first() { + Some((first, rest)) => { + self.0 = rest; + Some(first) + } + None => None, + } + } + fn 
advance(&mut self) -> &MacroToken { + let (first, rest) = self.0.split_first().unwrap(); + self.0 = rest; + first + } +} + +#[derive(Copy, Clone, Debug, PartialEq, Eq)] +enum TokenClass { + /// No space before or after (new-line after). + EndCommand, + + /// Space on both sides. + BinaryOperator, + + /// Space afterward. + Comma, + + /// Don't need spaces except sequentially. + Id, + + /// Don't need spaces except sequentially. + Punct, +} + +impl TokenClass { + fn separator(prev: Self, next: Self) -> &'static str { + match (prev, next) { + // Don't need a separator before the end of a command, but we + // need a new-line afterward. + (_, Self::EndCommand) => "", + (Self::EndCommand, _) => "\n", + + // Binary operators always have a space on both sides, and a comma always has a space afterward. + (Self::BinaryOperator, _) | (_, Self::BinaryOperator) | (Self::Comma, _) => " ", + + // Otherwise, `prev` is `Self::Punct`, which only need a space if + // there are two or them in a row. + (Self::Punct, Self::Punct) => " ", + _ => "", + } + } +} + +impl From<&Token> for TokenClass { + fn from(source: &Token) -> Self { + match source { + Token::End => Self::Punct, + Token::Id(_) | Token::Number(_) | Token::String(_) => Self::Id, + Token::EndCommand => Self::EndCommand, + Token::Punct(punct) => match punct { + Punct::LParen + | Punct::RParen + | Punct::LSquare + | Punct::RSquare + | Punct::LCurly + | Punct::RCurly => Self::Punct, + + Punct::Plus + | Punct::Dash + | Punct::Asterisk + | Punct::Slash + | Punct::Equals + | Punct::Colon + | Punct::And + | Punct::Or + | Punct::Not + | Punct::Eq + | Punct::Ge + | Punct::Gt + | Punct::Le + | Punct::Lt + | Punct::Ne + | Punct::All + | Punct::By + | Punct::To + | Punct::With + | Punct::Exp + | Punct::Bang + | Punct::Percent + | Punct::Question + | Punct::Backtick + | Punct::Dot + | Punct::Underscore + | Punct::BangAsterisk => Self::BinaryOperator, + + Punct::Comma | Punct::Semicolon => Self::Comma, + }, + } + } +} + +pub fn 
macro_tokens_to_syntax(input: &[MacroToken]) -> impl Iterator { + input + .iter() + .take(1) + .map(|token| ["", token.syntax.as_str()]) + .chain(input.windows(2).map(|w| { + let c0 = (&w[0].token).into(); + let c1 = (&w[1].token).into(); + [TokenClass::separator(c0, c1), w[1].syntax.as_str()] + })) +} + +trait MacroId { + fn macro_id(&self) -> Option<&Identifier>; +} + +impl MacroId for Token { + fn macro_id(&self) -> Option<&Identifier> { + let id = self.id()?; + id.0.starts_with('!').then_some(id) + } +} + +enum RelOp { + Eq, + Ne, + Lt, + Gt, + Le, + Ge, +} + +impl TryFrom<&str> for RelOp { + type Error = (); + + fn try_from(source: &str) -> Result { + match source { + "=" => Ok(Self::Eq), + "~=" | "<>" => Ok(Self::Ne), + "<" => Ok(Self::Lt), + ">" => Ok(Self::Gt), + "<=" => Ok(Self::Le), + ">=" => Ok(Self::Ge), + _ if source.len() == 3 && source.as_bytes()[0] == b'!' => match ( + source.as_bytes()[0].to_ascii_uppercase(), + source.as_bytes()[1].to_ascii_uppercase(), + ) { + (b'E', b'Q') => Ok(Self::Eq), + (b'N', b'E') => Ok(Self::Ne), + (b'L', b'T') => Ok(Self::Lt), + (b'L', b'E') => Ok(Self::Le), + (b'G', b'T') => Ok(Self::Gt), + (b'G', b'E') => Ok(Self::Ge), + _ => Err(()), + }, + _ => Err(()), + } + } +} + +impl RelOp { + fn evaluate(&self, cmp: Ordering) -> bool { + match self { + RelOp::Eq => cmp == Ordering::Equal, + RelOp::Ne => cmp != Ordering::Equal, + RelOp::Lt => cmp == Ordering::Less, + RelOp::Gt => cmp == Ordering::Greater, + RelOp::Le => cmp != Ordering::Greater, + RelOp::Ge => cmp != Ordering::Less, + } + } +} + +pub type MacroSet = HashMap, Macro>; + +enum ParserState { + /// Accumulating tokens toward the end of any type of argument. + Arg, + + /// Expecting the opening delimiter of an ARG_ENCLOSE argument. + Enclose, + + /// Expecting a keyword for a keyword argument. + Keyword, + + /// Expecting an equal sign for a keyword argument. + Equals, + + /// Macro fully parsed and ready for expansion. + Finished, +} + +/// Macro call parser FSM. 
+pub struct Parser<'a> { + macros: &'a MacroSet, + macro_: &'a Macro, + state: ParserState, + args: Box<[Option>]>, + arg_index: usize, + + /// Length of macro call so far. + n_tokens: usize, +} + +#[derive(Copy, Clone, Debug, PartialEq, Eq)] +pub enum ParseStatus { + Complete, + Incomplete, +} + +impl<'a> Parser<'a> { + pub fn new(macros: &'a MacroSet, token: &Token) -> Option { + let macro_ = macros.get(&token.id()?.0)?; + Some(Self { + macros, + macro_, + state: macro_.initial_state(), + args: (0..macro_.parameters.len()).map(|_| None).collect(), + arg_index: 0, + n_tokens: 1, + }) + } + + fn finished(&mut self) { + self.state = ParserState::Finished; + for (i, arg) in self.args.iter_mut().enumerate() { + if arg.is_none() { + *arg = Some(self.macro_.parameters[i].default.clone()); + } + } + self.state = ParserState::Finished; + } + + fn next_arg(&mut self) { + if self.macro_.parameters.is_empty() { + self.finished() + } else { + let param = &self.macro_.parameters[self.arg_index]; + if param.is_positional() { + self.arg_index += 1; + if self.arg_index >= self.args.len() { + self.finished() + } else { + let param = &self.macro_.parameters[self.arg_index]; + self.state = if !param.is_positional() { + ParserState::Keyword + } else if let ValueType::Enclose(_, _) = param.arg { + ParserState::Enclose + } else { + ParserState::Arg + }; + } + } else { + if self.args.iter().any(|arg| arg.is_none()) { + self.state = ParserState::Keyword; + } else { + self.finished(); + } + } + } + } + + fn push_arg(&mut self, token: &Token, syntax: &str, error: &impl Fn(MacroError)) { + let param = &self.macro_.parameters[self.args.len() - 1]; + if let Token::EndCommand | Token::End = token { + if let Some(arg) = &self.args[self.arg_index] { + let param = &self.macro_.parameters[self.args.len() - 1]; + + match ¶m.arg { + ValueType::NTokens(n) => error(MacroError::ExpectedMoreTokens { + n: n - arg.len(), + arg: param.name.clone(), + macro_: self.macro_.name.clone(), + }), + 
ValueType::CharEnd(end) | ValueType::Enclose(_, end) => { + error(MacroError::ExpectedToken { + token: end.to_string(), + arg: param.name.clone(), + macro_: self.macro_.name.clone(), + }) + } + ValueType::CmdEnd => { + // This is OK, it's the expected way to end the argument. + } + } + } + self.finished(); + } + + self.n_tokens += 1; + let arg = self.args[self.arg_index].get_or_insert(Vec::new()); + let ( + add_token, // Should we add `mt` to the current arg? + next_arg, // Should we advance to the next arg? + ) = match ¶m.arg { + ValueType::NTokens(n) => (arg.len() + 1 >= *n, true), + ValueType::CharEnd(end) | ValueType::Enclose(_, end) => { + let at_end = token == end; + (at_end, !at_end) + } + ValueType::CmdEnd => (false, true), + }; + if add_token { + if true + // !macro_expand_arg (&mt->token, mc->me, *argp) + { + arg.push(MacroToken { + token: token.clone(), + syntax: String::from(syntax), + }); + } + } + if next_arg { + self.next_arg() + } + } + + fn push_enclose(&mut self, token: &Token, syntax: &str, error: &impl Fn(MacroError)) { + let param = &self.macro_.parameters[self.arg_index]; + let ValueType::Enclose(start, _) = ¶m.arg else { + unreachable!() + }; + if token == start { + self.n_tokens += 1; + self.args[self.arg_index].get_or_insert(Vec::new()); + self.state = ParserState::Arg; + } else if param.is_positional() && matches!(token, Token::End | Token::EndCommand) { + self.finished(); + } else { + error(MacroError::UnexpectedToken { + actual: String::from(syntax), + expected: start.to_string(), + arg: param.name.clone(), + macro_: self.macro_.name.clone(), + }); + self.finished(); + } + } + + fn push_keyword(&mut self, token: &Token, _syntax: &str, error: &impl Fn(MacroError)) { + let Some(id) = token.id() else { + return self.finished(); + }; + let Some(arg_index) = self.macro_.find_parameter(id) else { + return self.finished(); + }; + self.arg_index = arg_index; + if self.args[arg_index].is_some() { + error(MacroError::DuplicateArg { + arg: 
id.clone(), + macro_: self.macro_.name.clone(), + }); + } + self.args[arg_index] = Some(Vec::new()); + } + + fn push_equals(&mut self, token: &Token, syntax: &str, error: &impl Fn(MacroError)) { + let param = &self.macro_.parameters[self.arg_index]; + if let Token::Punct(Punct::Eq) = token { + self.n_tokens += 1; + self.state = if let ValueType::Enclose(_, _) = param.arg { + ParserState::Enclose + } else { + ParserState::Arg + }; + } else { + error(MacroError::UnexpectedToken { + actual: syntax.into(), + expected: String::from("="), + arg: param.name.clone(), + macro_: self.macro_.name.clone(), + }); + self.finished() + } + } + + /// Adds `token`, which has the given `syntax`, to the collection of tokens + /// in `self` that potentially need to be macro expanded. + /// + /// Returns [ParseStatus::Incomplete] if the macro expander needs more + /// tokens, for macro arguments or to decide whether this is actually a + /// macro invocation. The caller should call `push` again with the next + /// token. + /// + /// Returns [ParseStatus::Complete] if the macro invocation is now complete. + /// The caller should call [`Self::finish()`] to obtain the expansion. + pub fn push( + &mut self, + token: &Token, + syntax: &str, + error: &impl Fn(MacroError), + ) -> ParseStatus { + match self.state { + ParserState::Arg => self.push_arg(token, syntax, error), + ParserState::Enclose => self.push_enclose(token, syntax, error), + ParserState::Keyword => self.push_keyword(token, syntax, error), + ParserState::Equals => self.push_equals(token, syntax, error), + ParserState::Finished => (), + } + if let ParserState::Finished = self.state { + ParseStatus::Complete + } else { + ParseStatus::Incomplete + } + } + + pub fn finish(self) -> Call<'a> { + let ParserState::Finished = self.state else { + panic!() + }; + Call(self) + } +} + +/// Expansion stack entry. +struct Frame { + /// A macro name or `!IF`, `!DO`, etc. + name: Option, + + /// Source location, if available. 
+ location: Option, +} + +struct Expander<'a> { + /// Macros to expand recursively. + macros: &'a MacroSet, + + /// Error reporting callback. + error: &'a Box, + + /// Tokenization mode. + mode: Mode, + + /// Remaining nesting levels. + nesting_countdown: usize, + + /// Stack for error reporting. + stack: Vec, + + // May macro calls be expanded? + expand: &'a RefCell, + + /// Variables from `!DO` and `!LET`. + vars: &'a RefCell>, + + // Only set if inside a `!DO` loop. If true, break out of the loop. + break_: Option<&'a mut bool>, + + /// Only set if expanding a macro (and not, say, a macro argument). + macro_: Option<&'a Macro>, + + /// Only set if expanding a macro (and not, say, a macro argument). + args: Option<&'a [Option>]>, +} + +fn bool_to_string(b: bool) -> String { + if b { + String::from("1") + } else { + String::from("0") + } +} + +#[derive(Copy, Clone, Debug, PartialEq, Eq)] +enum IfEndClause { + Else, + IfEnd, +} + +fn macro_keywords() -> HashSet { + let mut keywords = HashSet::new(); + for kw in [ + "!BREAK", + "!CHAREND", + "!CMDEND", + "!DEFAULT", + "!DO", + "!DOEND", + "!ELSE", + "!ENCLOSE", + "!ENDDEFINE", + "!IF", + "!IFEND", + "!IN", + "!LET", + "!NOEXPAND", + "!OFFEXPAND", + "!ONEXPAND", + "!POSITIONAL", + "!THEN", + "!TOKENS", + ] { + keywords.insert(Identifier::new(kw).unwrap()); + } + keywords +} + +fn is_macro_keyword(s: &Identifier) -> bool { + lazy_static! 
{ + static ref KEYWORDS: HashSet = macro_keywords(); + } + KEYWORDS.contains(s) +} + +enum DoInput { + List(Vec), + Up { first: f64, last: f64, by: f64 }, + Down { first: f64, last: f64, by: f64 }, + Empty, +} + +impl DoInput { + fn from_list(items: Vec) -> Self { + Self::List( + items + .into_iter() + .rev() + .take(Settings::global().macros.max_iterations + 1) + .map(|mt| mt.syntax) + .collect(), + ) + } + + fn from_by(first: f64, last: f64, by: f64) -> Self { + if by > 0.0 && first <= last { + Self::Up { first, last, by } + } else if by > 0.0 && first <= last { + Self::Down { first, last, by } + } else { + Self::Empty + } + } +} + +impl Iterator for DoInput { + type Item = String; + + fn next(&mut self) -> Option { + match self { + DoInput::List(vec) => vec.pop(), + DoInput::Up { first, last, by } => { + if first <= last { + let value = *first; + *first += *by; + Some(format!("{value}")) + } else { + None + } + } + DoInput::Down { first, last, by } => { + if first >= last { + let value = *first; + *first += *by; + Some(format!("{value}")) + } else { + None + } + } + DoInput::Empty => None, + } + } +} + +impl<'a> Expander<'a> { + fn may_expand(&self) -> bool { + *self.expand.borrow() + } + + fn should_break(&self) -> bool { + self.break_.as_ref().map(|b| **b).unwrap_or(false) + } + + fn expand(&mut self, input: &mut MacroTokens, output: &mut Vec) { + if self.nesting_countdown == 0 { + (self.error)(MacroError::TooDeep { + limit: Settings::global().macros.max_nest, + }); + output.extend(take(&mut input.0).iter().cloned()); + } else { + while !input.is_empty() && !self.should_break() { + self.expand__(input, output); + } + } + } + + fn expand_arg(&mut self, param_idx: usize, output: &mut Vec) { + let param = &self.macro_.unwrap().parameters[param_idx]; + let arg = &self.args.unwrap()[param_idx].as_ref().unwrap(); + if self.may_expand() && param.expand_value { + let vars = RefCell::new(BTreeMap::new()); + let mut stack = take(&mut self.stack); + stack.push(Frame { + 
name: Some(param.name.clone()), + location: None, + }); + let mut subexpander = Expander { + stack, + vars: &vars, + break_: None, + macro_: None, + args: None, + ..*self + }; + let mut arg_tokens = MacroTokens(&arg); + subexpander.expand(&mut arg_tokens, output); + self.stack = subexpander.stack; + self.stack.pop(); + } else { + output.extend(arg.iter().cloned()); + } + } + fn parse_function_args( + &mut self, + function: &Identifier, + input: &mut MacroTokens, + ) -> Option> { + input.advance(); + input.advance(); + let mut args = Vec::new(); + if input.match_(")") { + return Some(args); + } + loop { + args.push(self.parse_function_arg(input)?); + match input.take() { + Some(MacroToken { + token: Token::Punct(Punct::Comma), + .. + }) => (), + Some(MacroToken { + token: Token::Punct(Punct::RParen), + .. + }) => return Some(args), + _ => { + (self.error)(MacroError::ExpectingCommaOrRParen(function.clone())); + return None; + } + } + } + } + + fn expand_blanks(e: &mut Expander, args: Vec) -> Option { + let Ok(n) = args[0].trim().parse::() else { + (e.error)(MacroError::InvalidBlanks(args[0].clone())); + return None; + }; + Some(std::iter::repeat(' ').take(n).collect()) + } + + fn expand_concat(e: &mut Expander, args: Vec) -> Option { + Some( + args.into_iter() + .map(|arg| unquote_string(arg, e.mode)) + .collect(), + ) + } + + fn expand_eval(e: &mut Expander, args: Vec) -> Option { + let tokens = tokenize_string(&args[0], e.mode, e.error); + let mut stack = take(&mut e.stack); + stack.push(Frame { + name: Some(Identifier::new("!EVAL").unwrap()), + location: None, + }); + let mut break_ = false; + let mut subexpander = Expander { + break_: Some(&mut break_), + stack, + vars: e.vars, + ..*e + }; + let mut output = Vec::new(); + subexpander.expand(&mut MacroTokens(tokens.as_slice()), &mut output); + subexpander.stack.pop(); + e.stack = subexpander.stack; + Some(macro_tokens_to_syntax(&output).flatten().collect()) + } + + fn expand_head(e: &mut Expander, mut args: Vec) 
-> Option { + let arg = unquote_string(args.remove(0), e.mode); + let mut output = tokenize_string(&arg, e.mode, e.error); + if output.is_empty() { + Some(String::new()) + } else { + Some(output.swap_remove(0).syntax) + } + } + + fn expand_index(_e: &mut Expander, args: Vec) -> Option { + let haystack = &args[0]; + let needle = &args[1]; + let position = haystack.find(needle); + Some(format!( + "{}", + position.map_or(0, |position| &haystack[0..position].chars().count() + 1) + )) + } + + fn expand_length(_e: &mut Expander, args: Vec) -> Option { + Some(format!("{}", args[0].chars().count())) + } + + fn expand_quote(e: &mut Expander, mut args: Vec) -> Option { + let arg = args.remove(0); + if try_unquote_string(&arg, e.mode).is_some() { + Some(arg) + } else { + let mut output = String::with_capacity(arg.len() + 2); + output.push('\''); + for c in arg.chars() { + if c == '"' { + output.push('\''); + } + output.push(c); + } + output.push('\''); + Some(output) + } + } + + fn expand_substr(e: &mut Expander, args: Vec) -> Option { + let Ok(start) = args[1].trim().parse::() else { + (e.error)(MacroError::InvalidSubstr3(args[0].clone())); + return None; + }; + let start = start.get(); + let Ok(count) = args[2].trim().parse::() else { + (e.error)(MacroError::InvalidSubstr2(args[0].clone())); + return None; + }; + + Some(args[0].chars().skip(start - 1).take(count).collect()) + } + + fn expand_tail(e: &mut Expander, mut args: Vec) -> Option { + let arg = unquote_string(args.remove(0), e.mode); + let mut output = tokenize_string(&arg, e.mode, e.error); + Some( + output + .pop() + .map_or_else(|| String::new(), |tail| tail.syntax), + ) + } + + fn expand_unquote(e: &mut Expander, mut args: Vec) -> Option { + Some(unquote_string(args.remove(0), e.mode)) + } + + fn expand_upcase(e: &mut Expander, mut args: Vec) -> Option { + Some(unquote_string(args.remove(0), e.mode).to_uppercase()) + } + + fn expand_macro_function(&mut self, orig_input: &mut MacroTokens) -> Option { + let mut 
input = orig_input.clone(); + let name = input.macro_id()?; + if name == "!NULL" { + return Some(String::new()); + } + if input.0.len() < 2 || !matches!(input.0[1].token, Token::Punct(Punct::LParen)) { + return None; + } + + struct MacroFunction { + name: Identifier, + args: RangeInclusive, + parser: fn(&mut Expander, Vec) -> Option, + } + impl MacroFunction { + fn new( + name: &str, + args: RangeInclusive, + parser: fn(&mut Expander, Vec) -> Option, + ) -> Self { + Self { + name: Identifier::new(name).unwrap(), + args, + parser, + } + } + } + lazy_static! { + static ref MACRO_FUNCTIONS: [MacroFunction; 11] = [ + MacroFunction::new("!BLANKS", 1..=1, Expander::expand_blanks), + MacroFunction::new("!CONCAT", 1..=usize::MAX, Expander::expand_concat), + MacroFunction::new("!HEAD", 1..=1, Expander::expand_head), + MacroFunction::new("!INDEX", 2..=2, Expander::expand_index), + MacroFunction::new("!LENGTH", 1..=1, Expander::expand_length), + MacroFunction::new("!QUOTE", 1..=1, Expander::expand_quote), + MacroFunction::new("!SUBSTR", 2..=3, Expander::expand_substr), + MacroFunction::new("!TAIL", 1..=1, Expander::expand_tail), + MacroFunction::new("!UNQUOTE", 1..=1, Expander::expand_unquote), + MacroFunction::new("!UPCASE", 1..=1, Expander::expand_upcase), + MacroFunction::new("!EVAL", 1..=1, Expander::expand_eval), + ]; + } + + let function = MACRO_FUNCTIONS.iter().find(|mf| &mf.name == name)?; + + let args = self.parse_function_args(&function.name, &mut input)?; + + let n_args = args.len(); + if !function.args.contains(&n_args) { + let name = function.name.clone(); + let error = match &function.args { + x if x == &(1..=1) => MacroError::ExpectingOneArg { name, n_args }, + x if x == &(2..=2) => MacroError::ExpectingTwoArgs { name, n_args }, + x if x == &(2..=3) => MacroError::ExpectingTwoOrThreeArgs { name, n_args }, + x if x == &(1..=usize::MAX) => MacroError::ExpectingOneOrMoreArgs { name }, + _ => unreachable!(), + }; + (self.error)(error); + return None; + } + + 
*orig_input = input; + (function.parser)(self, args) + } + + /// Parses one function argument from `input`. Each argument to a macro + /// function is one of: + /// + /// - A quoted string or other single literal token. + /// + /// - An argument to the macro being expanded, e.g. `!1` or a named + /// argument. + /// + /// - `!*`. + /// + /// - A function invocation. + /// + /// Each function invocation yields a character sequence to be turned into a + /// sequence of tokens. The case where that character sequence is a single + /// quoted string is an important special case. + fn parse_function_arg(&mut self, input: &mut MacroTokens) -> Option { + if let Some(macro_) = self.macro_ { + match &input.0.get(0)?.token { + Token::Id(id) if id.0.starts_with('!') => { + if let Some(param_idx) = macro_.find_parameter(id) { + input.advance(); + return Some( + macro_tokens_to_syntax(self.args.unwrap()[param_idx].as_ref().unwrap()) + .flatten() + .collect(), + ); + } + if let Some(value) = self.vars.borrow().get(id) { + return Some(value.clone()); + } + + if let Some(output) = self.expand_macro_function(input) { + return Some(output); + } + } + Token::Punct(Punct::BangAsterisk) => { + let mut arg = String::new(); + for i in 0..macro_.parameters.len() { + if !macro_.parameters[i].is_positional() { + break; + } + if i > 0 { + arg.push(' ') + } + arg.extend( + macro_tokens_to_syntax(self.args.unwrap()[i].as_ref().unwrap()) + .flatten(), + ); + } + input.advance(); + return Some(arg); + } + _ => (), + } + } + Some(input.advance().syntax.clone()) + } + + fn evaluate_literal(&mut self, input: &mut MacroTokens) -> Option { + if input.match_("(") { + let value = self.evaluate_or(input)?; + if input.match_(")") { + Some(value) + } else { + (self.error)(MacroError::ExpectingRParen); + None + } + } else if input.match_(")") { + (self.error)(MacroError::ExpectingLiteral); + None + } else { + Some(unquote_string(self.parse_function_arg(input)?, self.mode)) + } + } + + fn 
evaluate_relational(&mut self, input: &mut MacroTokens) -> Option { + let lhs = self.evaluate_literal(input)?; + let Some(relop) = input.take_relop() else { + return Some(lhs); + }; + let rhs = self.evaluate_literal(input)?; + let cmp = unquote_string(lhs, self.mode).cmp(&unquote_string(rhs, self.mode)); + Some(bool_to_string(relop.evaluate(cmp))) + } + + fn evaluate_not(&mut self, input: &mut MacroTokens) -> Option { + let mut negations = 0; + while input.match_("!AND") || input.match_("&") { + negations += 1; + } + + let operand = self.evaluate_relational(input)?; + if negations == 0 { + return Some(operand); + } + + let mut b = operand != "0"; + if negations.is_odd() { + b = !b; + } + Some(bool_to_string(b)) + } + + fn evaluate_and(&mut self, input: &mut MacroTokens) -> Option { + let mut lhs = self.evaluate_not(input)?; + while input.match_("!AND") || input.match_("&") { + let rhs = self.evaluate_not(input)?; + lhs = bool_to_string(lhs != "0" && rhs != "0"); + } + Some(lhs) + } + fn evaluate_or(&mut self, input: &mut MacroTokens) -> Option { + let mut lhs = self.evaluate_and(input)?; + while input.match_("!OR") || input.match_("|") { + let rhs = self.evaluate_and(input)?; + lhs = bool_to_string(lhs != "0" || rhs != "0"); + } + Some(lhs) + } + + fn evaluate_expression(&mut self, input: &mut MacroTokens) -> Option { + self.evaluate_or(input) + } + + fn evaluate_number(&mut self, input: &mut MacroTokens) -> Option { + let s = self.evaluate_expression(input)?; + let tokens = tokenize_string(&s, self.mode, self.error); + let ( + Some(MacroToken { + token: Token::Number(number), + .. 
+ }), + 1, + ) = (tokens.get(0), tokens.len()) + else { + (self.error)(MacroError::BadNumericMacroExpression(s)); + return None; + }; + + Some(*number) + } + + fn find_ifend_clause<'b>( + input: &mut MacroTokens<'b>, + ) -> Option<(MacroTokens<'b>, IfEndClause)> { + let input_copy = input.clone(); + let mut nesting = 0; + while !input.is_empty() { + if input.match_("!IF") { + nesting += 1; + } else if input.match_("!IFEND") { + if nesting == 0 { + return Some(( + MacroTokens(&input_copy.0[..input_copy.0.len() - input.0.len() - 1]), + IfEndClause::IfEnd, + )); + } + nesting -= 1; + } else if input.match_("!ELSE") && nesting == 0 { + return Some(( + MacroTokens(&input_copy.0[..input_copy.0.len() - input.0.len() - 1]), + IfEndClause::Else, + )); + } else { + input.advance(); + } + } + return None; + } + fn expand_if(&mut self, orig_input: &mut MacroTokens, output: &mut Vec) -> bool { + let mut input = orig_input.clone(); + if !input.match_("!IF") { + return false; + } + let Some(result) = self.evaluate_expression(&mut input) else { + return false; + }; + if !input.match_("!THEN") { + (self.error)(MacroError::ExpectingThen); + return false; + } + + let Some((if_tokens, clause)) = Self::find_ifend_clause(&mut input) else { + (self.error)(MacroError::ExpectingElseOrIfEnd); + return false; + }; + + let else_tokens = match clause { + IfEndClause::Else => { + let Some((else_tokens, IfEndClause::IfEnd)) = Self::find_ifend_clause(&mut input) + else { + (self.error)(MacroError::ExpectingIfEnd); + return false; + }; + Some(else_tokens) + } + IfEndClause::IfEnd => None, + }; + + let subinput = match result.as_str() { + "0" => else_tokens, + _ => Some(if_tokens), + }; + if let Some(mut subinput) = subinput { + self.stack.push(Frame { + name: Some(Identifier::new("!IF").unwrap()), + location: None, + }); + self.expand(&mut subinput, output); + self.stack.pop(); + } + *orig_input = input; + true + } + + fn take_macro_var_name( + &mut self, + input: &mut MacroTokens, + construct: 
&'static str, + ) -> Option { + let Some(var_name) = input.take_macro_id() else { + (self.error)(MacroError::ExpectingMacroVarName(construct)); + return None; + }; + if is_macro_keyword(var_name) + || self + .macro_ + .map(|m| m.find_parameter(var_name)) + .flatten() + .is_some() + { + (self.error)(MacroError::BadMacroVarName { + name: var_name.clone(), + construct, + }); + None + } else { + Some(var_name.clone()) + } + } + + fn expand_let(&mut self, orig_input: &mut MacroTokens) -> bool { + let mut input = orig_input.clone(); + if !input.match_("!LET") { + return false; + } + + let Some(var_name) = self.take_macro_var_name(&mut input, "!LET") else { + return false; + }; + input.advance(); + + if !input.match_("=") { + (self.error)(MacroError::ExpectingEquals); + return false; + } + + let Some(value) = self.evaluate_expression(&mut input) else { + return false; + }; + self.vars.borrow_mut().insert(var_name.clone(), value); + *orig_input = input; + true + } + + fn find_doend<'b>(&mut self, input: &mut MacroTokens<'b>) -> Option> { + let input_copy = input.clone(); + let mut nesting = 0; + while !input.is_empty() { + if input.match_("!DO") { + nesting += 1; + } else if input.match_("!DOEND") { + if nesting == 0 { + return Some(MacroTokens( + &input_copy.0[..input_copy.0.len() - input.0.len() - 1], + )); + } + nesting -= 1; + } else { + input.advance(); + } + } + (self.error)(MacroError::MissingDoEnd); + return None; + } + + fn expand_do(&mut self, orig_input: &mut MacroTokens, output: &mut Vec) -> bool { + let mut input = orig_input.clone(); + if !input.match_("!DO") { + return false; + } + + let Some(var_name) = self.take_macro_var_name(&mut input, "!DO") else { + return false; + }; + + let (items, miterate_error) = if input.match_("!IN") { + let Some(list) = self.evaluate_expression(&mut input) else { + return false; + }; + let items = tokenize_string(list.as_str(), self.mode, &self.error); + ( + DoInput::from_list(items), + 
MacroError::MiterateList(Settings::global().macros.max_iterations), + ) + } else if input.match_("=") { + let Some(first) = self.evaluate_number(&mut input) else { + return false; + }; + if !input.match_("!TO") { + (self.error)(MacroError::ExpectingTo); + return false; + } + let Some(last) = self.evaluate_number(&mut input) else { + return false; + }; + let by = if input.match_("!BY") { + let Some(by) = self.evaluate_number(&mut input) else { + return false; + }; + if by == 0.0 { + (self.error)(MacroError::ZeroBy); + return false; + } + by + } else { + 1.0 + }; + ( + DoInput::from_by(first, last, by), + MacroError::MiterateNumeric(Settings::global().macros.max_iterations), + ) + } else { + (self.error)(MacroError::ExpectingEqualsOrIn); + return false; + }; + + let Some(body) = self.find_doend(&mut input) else { + return false; + }; + + let mut stack = take(&mut self.stack); + stack.push(Frame { + name: Some(Identifier::new("!DO").unwrap()), + location: None, + }); + let mut break_ = false; + let mut subexpander = Expander { + break_: Some(&mut break_), + stack, + vars: self.vars, + ..*self + }; + + for (i, item) in items.enumerate() { + if subexpander.should_break() { + break; + } + if i >= Settings::global().macros.max_iterations { + (self.error)(miterate_error); + break; + } + let mut vars = self.vars.borrow_mut(); + if let Some(value) = vars.get_mut(&var_name) { + *value = item; + } else { + vars.insert(var_name.clone(), item); + } + subexpander.expand(&mut body.clone(), output); + } + *orig_input = input; + true + } + + fn expand__(&mut self, input: &mut MacroTokens, output: &mut Vec) { + // Recursive macro calls. 
+ if self.may_expand() { + if let Some(call) = Call::for_tokens(self.macros, &input.0, &self.error) { + let vars = RefCell::new(BTreeMap::new()); + let mut stack = take(&mut self.stack); + stack.push(Frame { + name: Some(call.0.macro_.name.clone()), + location: Some(call.0.macro_.location.clone()), + }); + let mut subexpander = Expander { + break_: None, + vars: &vars, + nesting_countdown: self.nesting_countdown.saturating_sub(1), + stack, + ..*self + }; + let mut body = MacroTokens(call.0.macro_.body.as_slice()); + subexpander.expand(&mut body, output); + self.stack = subexpander.stack; + self.stack.pop(); + input.0 = &input.0[call.len()..]; + return; + } + } + + // Only identifiers beginning with `!` receive further processing. + let id = match &input.0[0].token { + Token::Id(id) if id.0.starts_with('!') => id, + Token::Punct(Punct::BangAsterisk) => { + if let Some(macro_) = self.macro_ { + for i in 0..macro_.parameters.len() { + self.expand_arg(i, output); + } + } else { + (self.error)(MacroError::InvalidBangAsterisk); + } + input.advance(); + return; + } + _ => { + output.push(input.advance().clone()); + return; + } + }; + + // Macro arguments. + if let Some(macro_) = self.macro_ { + if let Some(param_idx) = macro_.find_parameter(id) { + self.expand_arg(param_idx, output); + input.advance(); + return; + } + } + + // Variables set by `!DO` or `!LET`. + if let Some(value) = self.vars.borrow().get(id) { + tokenize_string_into(value.as_str(), self.mode, &self.error, output); + input.advance(); + return; + } + + // Macro functions. 
+ if self.expand_if(input, output) { + return; + } + if self.expand_let(input) { + return; + } + if self.expand_do(input, output) { + return; + } + + if input.match_("!BREAK") { + if let Some(ref mut break_) = self.break_ { + **break_ = true; + } else { + (self.error)(MacroError::BreakOutsideDo); + } + return; + } + + if input.match_("!ONEXPAND") { + *self.expand.borrow_mut() = true; + } else if input.match_("!OFFEXPAND") { + *self.expand.borrow_mut() = false; + } else { + output.push(input.advance().clone()); + } + } +} + +pub struct Call<'a>(Parser<'a>); + +impl<'a> Call<'a> { + pub fn for_tokens(macros: &'a MacroSet, tokens: &[MacroToken], error: &F) -> Option + where + F: Fn(MacroError), + { + let mut parser = Parser::new(macros, &tokens.get(0)?.token)?; + for token in tokens[1..].iter().chain(&[MacroToken { + token: Token::EndCommand, + syntax: String::from(""), + }]) { + if parser.push(&token.token, &token.syntax, error) == ParseStatus::Complete { + return Some(parser.finish()); + } + } + return None; + } + + pub fn expand(&self, mode: Mode, call_loc: Location, output: &mut Vec, error: F) + where + F: Fn(MacroError) + 'a, + { + let error: Box = Box::new(error); + let vars = RefCell::new(BTreeMap::new()); + let expand = RefCell::new(true); + let mut me = Expander { + macros: self.0.macros, + error: &error, + macro_: Some(self.0.macro_), + args: Some(&self.0.args), + mode, + nesting_countdown: Settings::global().macros.max_nest, + stack: vec![ + Frame { + name: None, + location: Some(call_loc), + }, + Frame { + name: Some(self.0.macro_.name.clone()), + location: Some(self.0.macro_.location.clone()), + }, + ], + vars: &vars, + break_: None, + expand: &expand, + }; + let mut body = MacroTokens(&self.0.macro_.body); + me.expand(&mut body, output); + } + + /// Returns the number of tokens consumed from the input for the macro + /// invocation. If the result is 0, then there was no macro invocation and + /// the expansion will be empty. 
+ pub fn len(&self) -> usize { + self.0.n_tokens + } +} diff --git a/rust/pspp/src/main.rs b/rust/pspp/src/main.rs new file mode 100644 index 0000000000..a3b3145bed --- /dev/null +++ b/rust/pspp/src/main.rs @@ -0,0 +1,155 @@ +/* PSPP - a program for statistical analysis. + * Copyright (C) 2023 Free Software Foundation, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . */ + +use anyhow::Result; +use clap::{Parser, ValueEnum}; +use encoding_rs::Encoding; +use pspp::raw::{encoding_from_headers, Decoder, Magic, Reader, Record}; +use std::fs::File; +use std::io::BufReader; +use std::path::{Path, PathBuf}; +use std::str; +use thiserror::Error as ThisError; + +/// A utility to dissect SPSS system files. +#[derive(Parser, Debug)] +#[command(author, version, about, long_about = None)] +struct Args { + /// Maximum number of cases to print. + #[arg(long = "data", default_value_t = 0)] + max_cases: u64, + + /// Files to dissect. + #[arg(required = true)] + files: Vec, + + /// How to dissect the file. + #[arg(short, long, value_enum, default_value_t)] + mode: Mode, + + /// The encoding to use. 
+ #[arg(long, value_parser = parse_encoding)] + encoding: Option<&'static Encoding>, +} + +#[derive(ThisError, Debug)] +#[error("{0}: unknown encoding")] +struct UnknownEncodingError(String); + +fn parse_encoding(arg: &str) -> Result<&'static Encoding, UnknownEncodingError> { + match Encoding::for_label_no_replacement(arg.as_bytes()) { + Some(encoding) => Ok(encoding), + None => Err(UnknownEncodingError(arg.to_string())), + } +} + +#[derive(Clone, Copy, Debug, Default, ValueEnum)] +enum Mode { + Identify, + Raw, + Decoded, + #[default] + Cooked, +} + +fn main() -> Result<()> { + let Args { + max_cases, + files, + mode, + encoding, + } = Args::parse(); + + for file in files { + dissect(&file, max_cases, mode, encoding)?; + } + Ok(()) +} + +fn dissect( + file_name: &Path, + max_cases: u64, + mode: Mode, + encoding: Option<&'static Encoding>, +) -> Result<()> { + let reader = File::open(file_name)?; + let reader = BufReader::new(reader); + let mut reader = Reader::new(reader, |warning| println!("{warning}"))?; + + match mode { + Mode::Identify => { + let Record::Header(header) = reader.next().unwrap()? 
else { + unreachable!() + }; + match header.magic { + Magic::Sav => println!("SPSS System File"), + Magic::Zsav => println!("SPSS System File with Zlib compression"), + Magic::Ebcdic => println!("EBCDIC-encoded SPSS System File"), + } + return Ok(()); + } + Mode::Raw => { + for header in reader { + let header = header?; + println!("{:?}", header); + if let Record::Cases(cases) = header { + let mut cases = cases.borrow_mut(); + for _ in 0..max_cases { + let Some(Ok(record)) = cases.next() else { + break; + }; + println!("{:?}", record); + } + } + } + } + Mode::Decoded => { + let headers: Vec = reader.collect::, _>>()?; + let encoding = match encoding { + Some(encoding) => encoding, + None => encoding_from_headers(&headers, &|e| eprintln!("{e}"))?, + }; + let decoder = Decoder::new(encoding, |e| eprintln!("{e}")); + for header in headers { + let header = header.decode(&decoder); + println!("{:?}", header); + /* + if let Record::Cases(cases) = header { + let mut cases = cases.borrow_mut(); + for _ in 0..max_cases { + let Some(Ok(record)) = cases.next() else { + break; + }; + println!("{:?}", record); + } + } + */ + } + } + Mode::Cooked => { + /* + let headers: Vec = reader.collect::, _>>()?; + let encoding = encoding_from_headers(&headers, &|e| eprintln!("{e}"))?; + let (headers, _) = decode(headers, encoding, &|e| eprintln!("{e}"))?; + for header in headers { + println!("{header:?}"); + } + */ + } + } + + Ok(()) +} diff --git a/rust/pspp/src/message.rs b/rust/pspp/src/message.rs new file mode 100644 index 0000000000..a3ba1d8e9f --- /dev/null +++ b/rust/pspp/src/message.rs @@ -0,0 +1,252 @@ +use std::{ + cmp::{max, min}, + fmt::{Display, Formatter, Result as FmtResult}, + ops::Range, + sync::Arc, +}; + +use enum_map::Enum; +use unicode_width::UnicodeWidthStr; + +/// A line number and optional column number within a source file. +#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord)] +pub struct Point { + /// 1-based line number. 
+ pub line: i32, + + /// 1-based column number. + /// + /// Column numbers are measured according to the width of characters as + /// shown in a typical fixed-width font, in which CJK characters have width + /// 2 and combining characters have width 0, as measured by the + /// `unicode_width` crate. + pub column: Option, +} + +impl Point { + /// Takes `point`, adds to it the syntax in `syntax`, incrementing the line + /// number for each new-line in `syntax` and the column number for each + /// column, and returns the result. + pub fn advance(&self, syntax: &str) -> Self { + let mut result = *self; + for line in syntax.split_inclusive('\n') { + if line.ends_with('\n') { + result.line += 1; + result.column = Some(1); + } else { + result.column = result.column.map(|column| column + line.width() as i32); + } + } + result + } + + pub fn without_column(&self) -> Self { + Self { + line: self.line, + column: None, + } + } +} + +/// Location relevant to an diagnostic message. +#[derive(Clone, Debug)] +pub struct Location { + /// File name, if any. + pub file_name: Option>, + + /// Starting and ending point, if any. + pub span: Option>, + + /// Normally, if `span` contains column information, then displaying the + /// message will underline the location. Setting this to true disables + /// displaying underlines. 
+ pub omit_underlines: bool, +} + +impl Display for Location { + fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult { + if let Some(file_name) = &self.file_name { + write!(f, "{}", file_name)?; + } + + if let Some(span) = &self.span { + if self.file_name.is_some() { + write!(f, ":")?; + } + let l1 = span.start.line; + let l2 = span.end.line; + if let (Some(c1), Some(c2)) = (span.start.column, span.end.column) { + if l2 > l1 { + write!(f, "{l1}.{c1}-{l2}.{}", c2 - 1)?; + } else { + write!(f, "{l1}.{c1}-{}", c2 - 1)?; + } + } else { + if l2 > l1 { + write!(f, "{l1}-{l2}")?; + } else { + write!(f, "{l1}")?; + } + } + } + Ok(()) + } +} + +impl Location { + pub fn without_columns(&self) -> Self { + Self { + file_name: self.file_name.clone(), + span: self + .span + .as_ref() + .map(|span| span.start.without_column()..span.end.without_column()), + omit_underlines: self.omit_underlines, + } + } + pub fn merge(a: Option, b: &Option) -> Option { + let Some(a) = a else { return b.clone() }; + let Some(b) = b else { return Some(a) }; + if a.file_name != b.file_name { + // Failure. 
return Some(a);
        }
        let span = match (&a.span, &b.span) {
            (None, None) => None,
            (Some(r), None) | (None, Some(r)) => Some(r.clone()),
            (Some(ar), Some(br)) => {
                Some(min(ar.start, br.start).clone()..max(ar.end, br.end).clone())
            }
        };
        Some(Self {
            file_name: a.file_name,
            span,
            omit_underlines: a.omit_underlines || b.omit_underlines,
        })
    }

    /// Returns true if this location names neither a file nor a span.
    pub fn is_empty(&self) -> bool {
        self.file_name.is_none() && self.span.is_none()
    }
}

/// Importance of a diagnostic message.
#[derive(Copy, Clone, Debug, PartialEq, Eq, Enum)]
pub enum Severity {
    Error,
    Warning,
    Note,
}

impl Severity {
    fn as_str(&self) -> &'static str {
        match self {
            Severity::Error => "error",
            Severity::Warning => "warning",
            Severity::Note => "note",
        }
    }
}

impl Display for Severity {
    fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult {
        write!(f, "{}", self.as_str())
    }
}

/// General category of a diagnostic message.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum Category {
    General,
    Syntax,
    Data,
}

/// One frame of context (e.g. a macro expansion) for a diagnostic.
pub struct Stack {
    location: Location,
    description: String,
}

/// A diagnostic message.
pub struct Diagnostic {
    pub severity: Severity,
    pub category: Category,
    pub location: Location,
    /// Numbered source lines quoted beneath the message.
    pub source: Vec<(i32, String)>,
    /// Enclosing contexts, printed before the message itself.
    pub stack: Vec<Stack>,
    pub command_name: Option<&'static str>,
    pub text: String,
}

impl Display for Diagnostic {
    fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult {
        for Stack {
            location,
            description,
        } in &self.stack
        {
            // Print the location prefix only when there is one to print.
            // (Fixed: was `!!location.is_empty()`, whose double negation
            // printed the prefix exactly when the location was empty.)
            if !location.is_empty() {
                write!(f, "{location}: ")?;
            }
            writeln!(f, "{description}")?;
        }
        if self.category != Category::General && !self.location.is_empty() {
            write!(f, "{}: ", self.location)?;
        }

        write!(f, "{}: ", self.severity)?;

        match self.command_name {
            Some(command_name) if self.category == Category::Syntax => {
                write!(f, "{command_name}: ")?
+ } + _ => (), + } + + write!(f, "{}", self.text)?; + + if let Some(Range { + start: Point { + line: l0, + column: Some(c0), + }, + end: Point { + line: l1, + column: Some(c1), + }, + }) = self.location.span + { + let mut prev_line_number = None; + for (line_number, line) in &self.source { + if let Some(prev_line_number) = prev_line_number { + if *line_number != prev_line_number + 1 { + write!(f, "\n ... |")?; + } + } + prev_line_number = Some(line_number); + + write!(f, "\n{line_number:5} | {line}")?; + + if !self.location.omit_underlines { + let c0 = if *line_number == l0 { c0 } else { 1 }; + let c1 = if *line_number == l1 { + c1 + } else { + line.width() as i32 + }; + write!(f, "\n |")?; + for _ in 0..c0 { + f.write_str(" ")?; + } + if *line_number == l0 { + f.write_str("^")?; + for _ in c0..c1 { + f.write_str("~")?; + } + } else { + for _ in c0..=c1 { + f.write_str("~")?; + } + } + } + } + } + Ok(()) + } +} diff --git a/rust/pspp/src/output/mod.rs b/rust/pspp/src/output/mod.rs new file mode 100644 index 0000000000..944cbe75d9 --- /dev/null +++ b/rust/pspp/src/output/mod.rs @@ -0,0 +1,58 @@ +use std::sync::Arc; + +use self::pivot::Value; + +pub mod pivot; + +/// A single output item. +pub struct Item { + /// The localized label for the item that appears in the outline pane in the + /// output viewer and in PDF outlines. This is `None` if no label has been + /// explicitly set. + label: Option, + + /// A locale-invariant identifier for the command that produced the output, + /// which may be `None` if unknown or if a command did not produce this + /// output. + command_name: Option, + + /// For a group item, this is true if the group's subtree should + /// be expanded in an outline view, false otherwise. + /// + /// For other kinds of output items, this is true to show the item's + /// content, false to hide it. The item's label is always shown in an + /// outline view. + show: bool, + + /// Item details. 
+ details: Details, +} + +pub enum Details { + Chart, + Image, + Group(Vec>), + Message, + Table, + Text(Text), +} + +pub struct Text { + type_: TextType, + + content: Value, +} + +pub enum TextType { + /// `TITLE` and `SUBTITLE` commands. + PageTitle, + + /// Title, + Title, + + /// Syntax printback logging. + Syntax, + + /// Other logging. + Log, +} diff --git a/rust/pspp/src/output/pivot/mod.rs b/rust/pspp/src/output/pivot/mod.rs new file mode 100644 index 0000000000..d8f5c9f17f --- /dev/null +++ b/rust/pspp/src/output/pivot/mod.rs @@ -0,0 +1,738 @@ +//! Pivot tables. +//! +//! Pivot tables are PSPP's primary form of output. They are analogous to the +//! pivot tables you might be familiar with from spreadsheets and databases. +//! See for a brief introduction to +//! the overall concept of a pivot table. +//! +//! In PSPP, the most important internal pieces of a pivot table are: +//! +//! - Title. Every pivot table has a title that is displayed above it. It also +//! has an optional caption (displayed below it) and corner text (displayed in +//! the upper left corner). +//! +//! - Dimensions. A dimension consists of zero or more categories. A category +//! has a label, such as "df" or "Asymp. Sig." or 123 or a variable name. The +//! categories are the leaves of a tree whose non-leaf nodes form groups of +//! categories. The tree always has a root group whose label is the name of +//! the dimension. +//! +//! - Axes. A table has three axes: column, row, and layer. Each dimension is +//! assigned to an axis, and each axis has zero or more dimensions. When an +//! axis has more than one dimension, they are ordered from innermost to +//! outermost. +//! +//! - Data. A table's data consists of zero or more cells. Each cell maps from +//! a category for each dimension to a value, which is commonly a number but +//! could also be a variable name or an arbitrary text string. +//! +//! Creating a pivot table usually consists of the following steps: +//! +//! 1. 
Create the table with pivot_table_create(), passing in the title. +//! +//! 2. Optionally, set the format to use for "count" values with +//! pivot_table_set_weight_var() or pivot_table_set_weight_format(). +//! +//! 3. Create each dimension with pivot_dimension_create() and populate it with +//! categories and, possibly, with groups that contain the categories. This +//! call also assigns the dimension to an axis. +//! +//! In simple cases, only a call to pivot_dimension_create() is needed. +//! Other functions such as pivot_category_create_group() can be used for +//! hierarchies of categories. +//! +//! Sometimes it's easier to create categories in tandem with inserting data, +//! for example by adding a category for a variable just before inserting the +//! first cell for that variable. In that case, creating categories and +//! inserting data can be interleaved. +//! +//! 4. Insert data. For each cell, supply the category indexes, which are +//! assigned starting from 0 in the order in which the categories were +//! created in step 2, and the value to go in the cell. If the table has a +//! small, fixed number of dimensions, functions like, e.g. +//! pivot_table_put3() for 3 dimensions, can be used. The general function +//! pivot_table_put() works for other cases. +//! +//! 5. Output the table for user consumption. Use pivot_table_submit(). + +use std::{ + collections::HashMap, + ops::Range, + sync::{Arc, OnceLock}, +}; + +use chrono::NaiveDateTime; +use enum_map::{enum_map, Enum, EnumMap}; + +use crate::format::{Format, Settings as FormatSettings}; + +/// Areas of a pivot table for styling purposes. +#[derive(Copy, Clone, Debug, Enum, PartialEq, Eq)] +pub enum Area { + Title, + Caption, + + /// Footnotes, + Footer, + + // Top-left corner. + Corner, + + ColumnLabels, + RowLabels, + Data, + + /// Layer indication. + Layers, +} + +/// Table borders for styling purposes. 
+#[derive(Debug, Enum)] +pub enum Border { + Title, + OuterFrame(BoxBorder), + InnerFrame(BoxBorder), + Dimensions(RowColBorder), + Categories(RowColBorder), + DataLeft, + DataTop, +} + +/// The borders on a box. +#[derive(Debug, Enum)] +pub enum BoxBorder { + Left, + Top, + Right, + Bottom, +} + +/// Borders between rows and columns. +#[derive(Debug, Enum, PartialEq, Eq)] +pub enum RowColBorder { + RowHorz, + RowVert, + ColHorz, + ColVert, +} + +/// Sizing for rows or columns of a rendered table. +/// +/// The comments below talk about columns and their widths but they apply +/// equally to rows and their heights. +#[derive(Default)] +pub struct Sizing { + /// Specific column widths, in 1/96" units. + widths: Vec, + + /// Specific page breaks: 0-based columns after which a page break must + /// occur, e.g. a value of 1 requests a break after the second column. + breaks: Vec, + + /// Keeps: columns to keep together on a page if possible. + keeps: Vec>, +} + +#[derive(Enum)] +pub enum Axis3 { + X, + Y, + Z, +} + +/// An axis within a pivot table. +#[derive(Default)] +pub struct TableAxis { + /// `dimensions[0]` is the innermost dimension. + dimensions: Vec, + + /// The number of rows or columns along the axis, that is, the product of + /// `dimensions[*].n_leaves`. It is 0 if any dimension has 0 leaves. + extent: usize, + + /// Sum of `dimensions[*].label_depth`. + label_depth: usize, +} + +/// Dimensions. +/// +/// A [Dimension] identifies the categories associated with a single dimension +/// within a multidimensional pivot table. +/// +/// A dimension contains a collection of categories, which are the leaves in a +/// tree of groups. +/// +/// (A dimension or a group can contain zero categories, but this is unusual. +/// If a dimension contains no categories, then its table cannot contain any +/// data.) +pub struct Dimension { + axis_type: Axis3, + level: usize, + + top_index: usize, + + /// Hierarchy of categories within the dimension. 
The groups and categories + /// are sorted in the order that should be used for display. This might be + /// different from the original order produced for output if the user + /// adjusted it. + /// + /// The root must always be a group, although it is allowed to have no + /// subcategories. + root: Group, + + /// All of the leaves reachable via the root. + /// + /// The indexing for presentation_leaves is presentation order, thus + /// `presentation_leaves[i]->presentation_index == i`. This order is the + /// same as would be produced by an in-order traversal of the groups. It + /// is the order into which the user reordered or sorted the categories. + /// + /// The indexing for `data_leaves` is that used for `idx` in [Cell], thus + /// `data_leaves[i]->data_index == i`. This might differ from what an + /// in-order traversal of `root` would yield, if the user reordered + /// categories. + data_leaves: Vec>, + presentation_leaves: Vec>, + + /// Display. + hide_all_labels: bool, + + /// Number of rows or columns needed to express the labels. + label_depth: usize, +} + +pub struct Group { + name: Value, + label_depth: usize, + extra_depth: usize, + + /// The child categories. + /// + /// A group usually has multiple children, but it is allowed to have + /// only one or even (pathologically) none. + children: Vec, + + /// Display a label for the group itself? + show_label: bool, + + show_label_in_corner: bool, +} + +pub struct Leaf { + name: Value, + label_depth: usize, + extra_depth: usize, + + group_index: usize, + data_index: usize, + presentation_index: usize, + + /// Default format for values in this category. + format: Format, + + /// Honor [Table]'s `small` setting? + honor_small: bool, +} + +/// A pivot_category is a leaf (a category) or a group. 
+pub enum Category { + Group(Arc), + Leaf(Arc), +} + +trait CategoryTrait { + fn name(&self) -> &Value; + fn label_depth(&self) -> usize; + fn extra_depth(&self) -> usize; +} + +impl CategoryTrait for Group { + fn name(&self) -> &Value { + &self.name + } + + fn label_depth(&self) -> usize { + self.label_depth + } + + fn extra_depth(&self) -> usize { + self.extra_depth + } +} + +impl CategoryTrait for Leaf { + fn name(&self) -> &Value { + &self.name + } + + fn label_depth(&self) -> usize { + self.label_depth + } + + fn extra_depth(&self) -> usize { + self.extra_depth + } +} + +impl CategoryTrait for Category { + fn name(&self) -> &Value { + match self { + Category::Group(group) => group.name(), + Category::Leaf(leaf) => leaf.name(), + } + } + + fn label_depth(&self) -> usize { + match self { + Category::Group(group) => group.label_depth(), + Category::Leaf(leaf) => leaf.label_depth(), + } + } + + fn extra_depth(&self) -> usize { + match self { + Category::Group(group) => group.extra_depth(), + Category::Leaf(leaf) => leaf.extra_depth(), + } + } +} + +/// Styling for a pivot table. +/// +/// The division between this and the style information in [Table] seems fairly +/// arbitrary. The ultimate reason for the division is simply because that's +/// how SPSS documentation and file formats do it. +struct Look { + name: Option, + + omit_empty: bool, + row_labels_in_corner: bool, + + /// Range of column widths for columns in the row headings and corner , in 1/96" + /// units. + row_heading_widths: Range, + + /// Range of column widths for columns in the column headings , in 1/96" + /// units. + col_heading_widths: Range, + + /// Kind of markers to use for footnotes. + footnote_marker_type: FootnoteMarkerType, + + /// Where to put the footnote markers. + footnote_marker_position: FootnoteMarkerPosition, + + /// Styles for areas of the pivot table. + areas: EnumMap, + + /// Styles for borders in the pivot table. 
+ borders: EnumMap, + + print_all_layers: bool, + + paginate_layers: bool, + + shrink_to_fit: EnumMap, + + top_continuation: bool, + + bottom_continuation: bool, + + continuation: Option, + + n_orphan_lines: usize, +} + +impl Default for Look { + fn default() -> Self { + Self { + name: None, + omit_empty: true, + row_labels_in_corner: true, + row_heading_widths: 36..72, + col_heading_widths: 36..120, + footnote_marker_type: FootnoteMarkerType::Alphabetic, + footnote_marker_position: FootnoteMarkerPosition::Subscript, + areas: EnumMap::from_fn(|area| { + use HorzAlign::*; + use VertAlign::*; + let (halign, valign, hmargins, vmargins) = match area { + Area::Title => (Center, Middle, [8, 11], [1, 8]), + Area::Caption => (Left, Top, [8, 11], [1, 1]), + Area::Footer => (Left, Top, [11, 8], [2, 3]), + Area::Corner => (Left, Bottom, [8, 11], [1, 1]), + Area::ColumnLabels => (Left, Top, [8, 11], [1, 3]), + Area::RowLabels => (Left, Top, [8, 11], [1, 3]), + Area::Data => (Mixed, Top, [8, 11], [1, 1]), + Area::Layers => (Left, Bottom, [8, 11], [1, 3]), + }; + AreaStyle { + cell_style: CellStyle { + horz_align: halign, + vert_align: valign, + margins: enum_map! 
{ Axis2::X => hmargins, Axis2::Y => vmargins }, + }, + font_style: FontStyle { + bold: area == Area::Title, + italic: false, + underline: false, + markup: false, + font: String::from("Sans Serif"), + fg: [Color::BLACK; 2], + bg: [Color::WHITE; 2], + size: 9, + }, + } + }), + borders: EnumMap::from_fn(|border| { + let stroke = match border { + Border::InnerFrame(_) | Border::DataLeft | Border::DataTop => Stroke::Thick, + Border::Dimensions(side) if side != RowColBorder::RowVert => Stroke::Solid, + Border::Categories(RowColBorder::ColHorz | RowColBorder::ColVert) => { + Stroke::Solid + } + _ => Stroke::None, + }; + BorderStyle { + stroke, + color: Color::BLACK, + } + }), + print_all_layers: false, + paginate_layers: false, + shrink_to_fit: EnumMap::from_fn(|_| false), + top_continuation: false, + bottom_continuation: false, + continuation: None, + n_orphan_lines: 0, + } + } +} + +impl Look { + fn shared_default() -> Arc { + static LOOK: OnceLock> = OnceLock::new(); + LOOK.get_or_init(|| Arc::new(Look::default())).clone() + } +} + +pub struct AreaStyle { + cell_style: CellStyle, + font_style: FontStyle, +} + +pub struct CellStyle { + horz_align: HorzAlign, + vert_align: VertAlign, + + /// Margins in 1/96" units. + /// + /// `margins[Axis2::X][0]` is the left margin. + /// `margins[Axis2::X][1]` is the right margin. + /// `margins[Axis2::Y][0]` is the top margin. + /// `margins[Axis2::Y][1]` is the bottom margin. + margins: EnumMap, +} + +pub enum HorzAlign { + /// Right aligned. + Right, + + /// Left aligned. + Left, + + /// Centered. + Center, + + /// Align strings to the left, other formats to the right. + Mixed, + + /// Align the decimal point at the specified position. + Decimal { + /// Decimal offset from the right side of the cell, in 1/96" units. + offset: f64, + + /// Decimal character: either `b'.'` or `b','`. + c: char, + }, +} + +pub enum VertAlign { + /// Top alignment. + Top, + + /// Centered, + Middle, + + /// Bottom alignment. 
+ Bottom, +} + +pub struct FontStyle { + bold: bool, + italic: bool, + underline: bool, + markup: bool, + font: String, + fg: [Color; 2], + bg: [Color; 2], + + /// In 1/72" units. + size: i32, +} + +pub struct Color { + alpha: u8, + r: u8, + g: u8, + b: u8, +} + +impl Color { + const BLACK: Color = Color::new(0, 0, 0); + const WHITE: Color = Color::new(255, 255, 255); + + const fn new(r: u8, g: u8, b: u8) -> Self { + Self { + alpha: 255, + r, + g, + b, + } + } +} + +pub struct BorderStyle { + stroke: Stroke, + color: Color, +} + +pub enum Stroke { + None, + Solid, + Dashed, + Thick, + Thin, + Double, +} + +/// An axis of a flat table. +#[derive(Debug, Enum)] +pub enum Axis2 { + X, + Y, +} + +pub enum FootnoteMarkerType { + /// a, b, c, ... + Alphabetic, + + /// 1, 2, 3, ... + Numeric, +} + +pub enum FootnoteMarkerPosition { + /// Subscripts. + Subscript, + + /// Superscripts. + Superscript, +} + +pub struct Table { + look: Arc, + + rotate_inner_column_labels: bool, + + rotate_outer_row_labels: bool, + + show_grid_lines: bool, + + show_title: bool, + + show_caption: bool, + + show_value: Option, + + show_variables: Option, + + weight_format: Format, + + /// Current layer indexes, with axes[PIVOT_AXIS_LAYER].n_dimensions + /// elements. current_layer[i] is an offset into + /// axes[PIVOT_AXIS_LAYER].dimensions[i]->data_leaves[], EXCEPT that a + /// dimension can have zero leaves, in which case current_layer[i] is zero + /// and there's no corresponding leaf. + current_layer: Vec, + + /// Column and row sizing and page breaks. + sizing: EnumMap, + + /// Format settings. + settings: FormatSettings, + + /// Numeric grouping character (usually `.` or `,`). 
+ grouping: Option, + + small: f64, + + command_local: Option, + command_c: Option, + language: Option, + locale: Option, + dataset: Option, + datafile: Option, + date: Option, + footnotes: Vec, + title: Option, + subtype: Option, + corner_text: Option, + caption: Option, + notes: Option, + dimensions: Vec, + axes: EnumMap, + cells: HashMap, +} + +impl Table { + fn new() -> Self { + Self { + look: Look::shared_default(), + rotate_inner_column_labels: false, + rotate_outer_row_labels: false, + show_grid_lines: false, + show_title: true, + show_caption: true, + show_value: None, + show_variables: None, + weight_format: Format::F40, + current_layer: Vec::new(), + sizing: EnumMap::default(), + settings: FormatSettings::default(), // XXX from settings + grouping: None, + small: 0.0001, // XXX from settings. + command_local: None, + command_c: None, // XXX from current command name. + language: None, + locale: None, + dataset: None, + datafile: None, + date: None, + footnotes: Vec::new(), + subtype: None, + title: None, + corner_text: None, + caption: None, + notes: None, + dimensions: Vec::new(), + axes: EnumMap::default(), + cells: HashMap::new(), + } + } +} + +/// Whether to show variable or value labels or the underlying value or variable name. +pub enum ValueShow { + /// Value or variable name only. + Value, + + /// Label only. + Label, + + /// Value and label. + Both, +} + +pub struct Footnote { + content: Value, + marker: Value, + show: bool, +} + +/// The content of a single pivot table cell. +/// +/// A [Value] is also a pivot table's title, caption, footnote marker and +/// contents, and so on. +/// +/// A given [Value] is one of: +/// +/// 1. A number resulting from a calculation. +/// +/// A number has an associated display format (usually [F] or [Pct]). This +/// format can be set directly, but that is not usually the easiest way. +/// Instead, it is usually true that all of the values in a single category +/// should have the same format (e.g. 
all "Significance" values might use +/// format `F40.3`), so PSPP makes it easy to set the default format for a +/// category while creating the category. See pivot_dimension_create() for +/// more details. +/// +/// [F]: crate::format::Format::F +/// [Pct]: crate::format::Format::Pct +/// +/// 2. A numeric or string value obtained from data (PIVOT_VALUE_NUMERIC or +/// PIVOT_VALUE_STRING). If such a value corresponds to a variable, then the +/// variable's name can be attached to the pivot_value. If the value has a +/// value label, then that can also be attached. When a label is present, +/// the user can control whether to show the value or the label or both. +/// +/// 3. A variable name (PIVOT_VALUE_VARIABLE). The variable label, if any, can +/// be attached too, and again the user can control whether to show the value +/// or the label or both. +/// +/// 4. A text string (PIVOT_VALUE_TEXT). The value stores the string in English +/// and translated into the output language (localized). Use +/// pivot_value_new_text() or pivot_value_new_text_format() for those cases. +/// In some cases, only an English or a localized version is available for +/// one reason or another, although this is regrettable; in those cases, use +/// pivot_value_new_user_text() or pivot_value_new_user_text_nocopy(). +/// +/// 5. A template. PSPP doesn't create these itself yet, but it can read and +/// interpret those created by SPSS. +pub struct Value { + styling: Option>, + inner: ValueInner, +} + +pub enum ValueInner { + Number { + show: ValueShow, + format: Format, + honor_small: bool, + value: f64, + var_name: Option, + value_label: Option, + }, + String { + show: ValueShow, + hex: bool, + s: Option, + var_name: Option, + value_label: Option, + }, + Variable { + show: ValueShow, + var_name: Option, + value_label: Option, + }, + Text { + user_provided: bool, + /// Localized. + local: String, + /// English. + c: String, + /// Identifier. 
+ id: String, + }, + Template { + args: Vec>, + local: String, + id: String, + }, +} + +pub struct ValueStyle { + font_style: FontStyle, + cell_style: CellStyle, + subscripts: Vec, + footnote_indexes: Vec, +} diff --git a/rust/pspp/src/prompt.rs b/rust/pspp/src/prompt.rs new file mode 100644 index 0000000000..c02ca9b367 --- /dev/null +++ b/rust/pspp/src/prompt.rs @@ -0,0 +1,37 @@ +#[derive(Copy, Clone, Debug, Hash, PartialEq, Eq)] +pub enum PromptStyle { + /// First line of command. + First, + + /// Second or later line of command. + Later, + + /// Between `BEGIN DATA` and `END DATA`. + Data, + + /// `COMMENT` or `*` command. + Comment, + + /// DOCUMENT command. + Document, + + /// `DO REPEAT` command. + DoRepeat, + + /// `DEFINE` command. + Define, +} + +impl PromptStyle { + pub fn to_string(&self) -> &'static str { + match self { + PromptStyle::First => "first", + PromptStyle::Later => "later", + PromptStyle::Data => "data", + PromptStyle::Comment => "COMMENT", + PromptStyle::Document => "DOCUMENT", + PromptStyle::DoRepeat => "DO REPEAT", + PromptStyle::Define => "DEFINE", + } + } +} diff --git a/rust/pspp/src/raw.rs b/rust/pspp/src/raw.rs new file mode 100644 index 0000000000..c9b04773ff --- /dev/null +++ b/rust/pspp/src/raw.rs @@ -0,0 +1,2888 @@ +use crate::{ + dictionary::VarWidth, + encoding::{default_encoding, get_encoding, Error as EncodingError}, + endian::{Endian, Parse, ToBytes}, + identifier::{Error as IdError, Identifier}, +}; + +use encoding_rs::{mem::decode_latin1, DecoderResult, Encoding}; +use flate2::read::ZlibDecoder; +use num::Integer; +use std::{ + borrow::Cow, + cell::RefCell, + cmp::Ordering, + collections::{HashMap, VecDeque}, + fmt::{Debug, Display, Formatter, Result as FmtResult}, + io::{Error as IoError, Read, Seek, SeekFrom}, + iter::repeat, + mem::take, + ops::Range, + rc::Rc, + str::from_utf8, +}; +use thiserror::Error as ThisError; + +#[derive(ThisError, Debug)] +pub enum Error { + #[error("Not an SPSS system file")] + NotASystemFile, 
+ + #[error("Invalid magic number {0:?}")] + BadMagic([u8; 4]), + + #[error("I/O error ({0})")] + Io(#[from] IoError), + + #[error("Invalid SAV compression code {0}")] + InvalidSavCompression(u32), + + #[error("Invalid ZSAV compression code {0}")] + InvalidZsavCompression(u32), + + #[error("Document record at offset {offset:#x} has document line count ({n}) greater than the maximum number {max}.")] + BadDocumentLength { offset: u64, n: usize, max: usize }, + + #[error("At offset {offset:#x}, unrecognized record type {rec_type}.")] + BadRecordType { offset: u64, rec_type: u32 }, + + #[error("In variable record starting at offset {start_offset:#x}, variable width is not in the valid range -1 to 255.")] + BadVariableWidth { start_offset: u64, width: i32 }, + + #[error("In variable record starting at offset {start_offset:#x}, variable label code {code} at offset {code_offset:#x} is not 0 or 1.")] + BadVariableLabelCode { + start_offset: u64, + code_offset: u64, + code: u32, + }, + + #[error( + "At offset {offset:#x}, numeric missing value code ({code}) is not -3, -2, 0, 1, 2, or 3." 
+ )] + BadNumericMissingValueCode { offset: u64, code: i32 }, + + #[error("At offset {offset:#x}, string missing value code ({code}) is not 0, 1, 2, or 3.")] + BadStringMissingValueCode { offset: u64, code: i32 }, + + #[error("At offset {offset:#x}, number of value labels ({n}) is greater than the maximum number {max}.")] + BadNumberOfValueLabels { offset: u64, n: u32, max: u32 }, + + #[error("At offset {offset:#x}, following value label record, found record type {rec_type} instead of expected type 4 for variable index record")] + ExpectedVarIndexRecord { offset: u64, rec_type: u32 }, + + #[error("At offset {offset:#x}, number of variables indexes for value labels ({n}) is greater than the maximum number ({max}).")] + TooManyVarIndexes { offset: u64, n: u32, max: u32 }, + + #[error("At offset {offset:#x}, record type 7 subtype {subtype} is too large with element size {size} and {count} elements.")] + ExtensionRecordTooLarge { + offset: u64, + subtype: u32, + size: u32, + count: u32, + }, + + #[error("Unexpected end of file at offset {offset:#x}, {case_ofs} bytes into a {case_len}-byte case.")] + EofInCase { + offset: u64, + case_ofs: u64, + case_len: usize, + }, + + #[error( + "Unexpected end of file at offset {offset:#x}, {case_ofs} bytes into a compressed case." 
+ )] + EofInCompressedCase { offset: u64, case_ofs: u64 }, + + #[error("Data ends at offset {offset:#x}, {case_ofs} bytes into a compressed case.")] + PartialCompressedCase { offset: u64, case_ofs: u64 }, + + #[error("At {case_ofs} bytes into compressed case starting at offset {offset:#x}, a string was found where a number was expected.")] + CompressedNumberExpected { offset: u64, case_ofs: u64 }, + + #[error("At {case_ofs} bytes into compressed case starting at offset {offset:#x}, a number was found where a string was expected.")] + CompressedStringExpected { offset: u64, case_ofs: u64 }, + + #[error("Block count {n_blocks} in ZLIB trailer at offset {offset:#x} differs from expected block count {expected_n_blocks} calculated from trailer length {ztrailer_len}.")] + BadZlibTrailerNBlocks { + offset: u64, + n_blocks: u32, + expected_n_blocks: u64, + ztrailer_len: u64, + }, + + #[error("{0}")] + EncodingError(EncodingError), +} + +#[derive(ThisError, Debug)] +pub enum Warning { + #[error("Unexpected end of data inside extension record.")] + UnexpectedEndOfData, + + #[error("At offset {offset:#x}, at least one valid variable index for value labels is required but none were specified.")] + NoVarIndexes { offset: u64 }, + + #[error("At offset {offset:#x}, the first variable index is for a {var_type} variable but the following variable indexes are for {} variables: {wrong_types:?}", var_type.opposite())] + MixedVarTypes { + offset: u64, + var_type: VarType, + wrong_types: Vec, + }, + + #[error("At offset {offset:#x}, one or more variable indexes for value labels were not in the valid range [1,{max}]: {invalid:?}")] + InvalidVarIndexes { + offset: u64, + max: usize, + invalid: Vec, + }, + + #[error("At offset {offset:#x}, {record} has bad size {size} bytes instead of the expected {expected_size}.")] + BadRecordSize { + offset: u64, + record: String, + size: u32, + expected_size: u32, + }, + + #[error("At offset {offset:#x}, {record} has bad count {count} instead of the 
expected {expected_count}.")] + BadRecordCount { + offset: u64, + record: String, + count: u32, + expected_count: u32, + }, + + #[error("In long string missing values record starting at offset {record_offset:#x}, value length at offset {offset:#x} is {value_len} instead of the expected 8.")] + BadLongMissingValueLength { + record_offset: u64, + offset: u64, + value_len: u32, + }, + + #[error("The encoding record at offset {offset:#x} contains an encoding name that is not valid UTF-8.")] + BadEncodingName { offset: u64 }, + + // XXX This is risky because `text` might be arbitarily long. + #[error("Text string contains invalid bytes for {encoding} encoding: {text}")] + MalformedString { encoding: String, text: String }, + + #[error("Invalid variable measurement level value {0}")] + InvalidMeasurement(u32), + + #[error("Invalid variable display alignment value {0}")] + InvalidAlignment(u32), + + #[error("Invalid attribute name. {0}")] + InvalidAttributeName(IdError), + + #[error("Invalid variable name in attribute record. {0}")] + InvalidAttributeVariableName(IdError), + + #[error("Invalid short name in long variable name record. {0}")] + InvalidShortName(IdError), + + #[error("Invalid name in long variable name record. {0}")] + InvalidLongName(IdError), + + #[error("Invalid variable name in very long string record. {0}")] + InvalidLongStringName(IdError), + + #[error("Invalid variable name in variable set record. {0}")] + InvalidVariableSetName(IdError), + + #[error("Invalid multiple response set name. {0}")] + InvalidMrSetName(IdError), + + #[error("Invalid multiple response set variable name. {0}")] + InvalidMrSetVariableName(IdError), + + #[error("Invalid variable name in long string missing values record. {0}")] + InvalidLongStringMissingValueVariableName(IdError), + + #[error("Invalid variable name in long string value label record. 
{0}")] + InvalidLongStringValueLabelName(IdError), + + #[error("{0}")] + EncodingError(EncodingError), + + #[error("Details TBD")] + TBD, +} + +impl From for Warning { + fn from(_source: IoError) -> Self { + Self::UnexpectedEndOfData + } +} + +#[derive(Clone, Debug)] +pub enum Record { + Header(HeaderRecord), + Variable(VariableRecord>), + ValueLabel(ValueLabelRecord, RawString>), + Document(DocumentRecord), + IntegerInfo(IntegerInfoRecord), + FloatInfo(FloatInfoRecord), + VarDisplay(VarDisplayRecord), + MultipleResponse(MultipleResponseRecord), + LongStringValueLabels(LongStringValueLabelRecord), + LongStringMissingValues(LongStringMissingValueRecord>), + Encoding(EncodingRecord), + NumberOfCases(NumberOfCasesRecord), + Text(TextRecord), + OtherExtension(Extension), + EndOfHeaders(u32), + ZHeader(ZHeader), + ZTrailer(ZTrailer), + Cases(Rc>), +} + +#[derive(Clone, Debug)] +pub enum DecodedRecord { + Header(HeaderRecord), + Variable(VariableRecord), + ValueLabel(ValueLabelRecord, String>), + Document(DocumentRecord), + IntegerInfo(IntegerInfoRecord), + FloatInfo(FloatInfoRecord), + VarDisplay(VarDisplayRecord), + MultipleResponse(MultipleResponseRecord), + LongStringValueLabels(LongStringValueLabelRecord), + LongStringMissingValues(LongStringMissingValueRecord), + Encoding(EncodingRecord), + NumberOfCases(NumberOfCasesRecord), + VariableSets(VariableSetRecord), + ProductInfo(ProductInfoRecord), + LongNames(LongNamesRecord), + VeryLongStrings(VeryLongStringsRecord), + FileAttributes(FileAttributeRecord), + VariableAttributes(VariableAttributeRecord), + OtherExtension(Extension), + EndOfHeaders(u32), + ZHeader(ZHeader), + ZTrailer(ZTrailer), + Cases(Rc>), +} + +impl Record { + fn read( + reader: &mut R, + endian: Endian, + var_types: &[VarType], + warn: &dyn Fn(Warning), + ) -> Result, Error> + where + R: Read + Seek, + { + let rec_type: u32 = endian.parse(read_bytes(reader)?); + match rec_type { + 2 => Ok(Some(VariableRecord::read(reader, endian)?)), + 3 => 
Ok(ValueLabelRecord::read(reader, endian, var_types, warn)?), + 6 => Ok(Some(DocumentRecord::read(reader, endian)?)), + 7 => Extension::read(reader, endian, var_types.len(), warn), + 999 => Ok(Some(Record::EndOfHeaders( + endian.parse(read_bytes(reader)?), + ))), + _ => Err(Error::BadRecordType { + offset: reader.stream_position()?, + rec_type, + }), + } + } + + pub fn decode(self, decoder: &Decoder) -> Result { + Ok(match self { + Record::Header(record) => record.decode(decoder), + Record::Variable(record) => record.decode(decoder), + Record::ValueLabel(record) => DecodedRecord::ValueLabel(record.decode(decoder)), + Record::Document(record) => record.decode(decoder), + Record::IntegerInfo(record) => DecodedRecord::IntegerInfo(record.clone()), + Record::FloatInfo(record) => DecodedRecord::FloatInfo(record.clone()), + Record::VarDisplay(record) => DecodedRecord::VarDisplay(record.clone()), + Record::MultipleResponse(record) => record.decode(decoder), + Record::LongStringValueLabels(record) => { + DecodedRecord::LongStringValueLabels(record.decode(decoder)) + } + Record::LongStringMissingValues(record) => { + DecodedRecord::LongStringMissingValues(record.decode(decoder)) + } + Record::Encoding(record) => DecodedRecord::Encoding(record.clone()), + Record::NumberOfCases(record) => DecodedRecord::NumberOfCases(record.clone()), + Record::Text(record) => record.decode(decoder), + Record::OtherExtension(record) => DecodedRecord::OtherExtension(record.clone()), + Record::EndOfHeaders(record) => DecodedRecord::EndOfHeaders(record), + Record::ZHeader(record) => DecodedRecord::ZHeader(record.clone()), + Record::ZTrailer(record) => DecodedRecord::ZTrailer(record.clone()), + Record::Cases(record) => DecodedRecord::Cases(record.clone()), + }) + } +} + +pub fn encoding_from_headers( + headers: &Vec, + warn: &impl Fn(Warning), +) -> Result<&'static Encoding, Error> { + let mut encoding_record = None; + let mut integer_info_record = None; + for record in headers { + match record { + 
Record::Encoding(record) => encoding_record = Some(record), + Record::IntegerInfo(record) => integer_info_record = Some(record), + _ => (), + } + } + let encoding = encoding_record.map(|record| record.0.as_str()); + let character_code = integer_info_record.map(|record| record.character_code); + match get_encoding(encoding, character_code) { + Ok(encoding) => Ok(encoding), + Err(err @ EncodingError::Ebcdic) => Err(Error::EncodingError(err)), + Err(err) => { + warn(Warning::EncodingError(err)); + // Warn that we're using the default encoding. + Ok(default_encoding()) + } + } +} + +// If `s` is valid UTF-8, returns it decoded as UTF-8, otherwise returns it +// decoded as Latin-1 (actually bytes interpreted as Unicode code points). +fn default_decode(s: &[u8]) -> Cow { + from_utf8(s).map_or_else(|_| decode_latin1(s), Cow::from) +} + +#[derive(Copy, Clone, Debug, PartialEq, Eq)] +pub enum Compression { + Simple, + ZLib, +} + +trait Header { + fn offsets(&self) -> Range; +} + +#[derive(Clone)] +pub struct HeaderRecord +where + S: Debug, +{ + /// Offset in file. + pub offsets: Range, + + /// Magic number. + pub magic: Magic, + + /// Eye-catcher string, product name, in the file's encoding. Padded + /// on the right with spaces. + pub eye_catcher: S, + + /// Layout code, normally either 2 or 3. + pub layout_code: u32, + + /// Number of variable positions, or `None` if the value in the file is + /// questionably trustworthy. + pub nominal_case_size: Option, + + /// Compression type, if any, + pub compression: Option, + + /// 1-based variable index of the weight variable, or `None` if the file is + /// unweighted. + pub weight_index: Option, + + /// Claimed number of cases, if known. + pub n_cases: Option, + + /// Compression bias, usually 100.0. + pub bias: f64, + + /// `dd mmm yy` in the file's encoding. + pub creation_date: S, + + /// `HH:MM:SS` in the file's encoding. + pub creation_time: S, + + /// File label, in the file's encoding. Padded on the right with spaces. 
+ pub file_label: S, + + /// Endianness of the data in the file header. + pub endian: Endian, +} + +impl HeaderRecord +where + S: Debug, +{ + fn debug_field(&self, f: &mut Formatter, name: &str, value: T) -> FmtResult + where + T: Debug, + { + writeln!(f, "{name:>17}: {:?}", value) + } +} + +impl Debug for HeaderRecord +where + S: Debug, +{ + fn fmt(&self, f: &mut Formatter) -> FmtResult { + writeln!(f, "File header record:")?; + self.debug_field(f, "Magic", self.magic)?; + self.debug_field(f, "Product name", &self.eye_catcher)?; + self.debug_field(f, "Layout code", self.layout_code)?; + self.debug_field(f, "Nominal case size", self.nominal_case_size)?; + self.debug_field(f, "Compression", self.compression)?; + self.debug_field(f, "Weight index", self.weight_index)?; + self.debug_field(f, "Number of cases", self.n_cases)?; + self.debug_field(f, "Compression bias", self.bias)?; + self.debug_field(f, "Creation date", &self.creation_date)?; + self.debug_field(f, "Creation time", &self.creation_time)?; + self.debug_field(f, "File label", &self.file_label)?; + self.debug_field(f, "Endianness", self.endian) + } +} + +impl HeaderRecord { + fn read(r: &mut R) -> Result { + let start = r.stream_position()?; + + let magic: [u8; 4] = read_bytes(r)?; + let magic: Magic = magic.try_into().map_err(|_| Error::NotASystemFile)?; + + let eye_catcher = RawString(read_vec(r, 60)?); + let layout_code: [u8; 4] = read_bytes(r)?; + let endian = Endian::identify_u32(2, layout_code) + .or_else(|| Endian::identify_u32(2, layout_code)) + .ok_or_else(|| Error::NotASystemFile)?; + let layout_code = endian.parse(layout_code); + + let nominal_case_size: u32 = endian.parse(read_bytes(r)?); + let nominal_case_size = + (nominal_case_size <= i32::MAX as u32 / 16).then_some(nominal_case_size); + + let compression_code: u32 = endian.parse(read_bytes(r)?); + let compression = match (magic, compression_code) { + (Magic::Zsav, 2) => Some(Compression::ZLib), + (Magic::Zsav, code) => return 
Err(Error::InvalidZsavCompression(code)), + (_, 0) => None, + (_, 1) => Some(Compression::Simple), + (_, code) => return Err(Error::InvalidSavCompression(code)), + }; + + let weight_index: u32 = endian.parse(read_bytes(r)?); + let weight_index = (weight_index > 0).then_some(weight_index); + + let n_cases: u32 = endian.parse(read_bytes(r)?); + let n_cases = (n_cases < i32::MAX as u32 / 2).then_some(n_cases); + + let bias: f64 = endian.parse(read_bytes(r)?); + + let creation_date = RawString(read_vec(r, 9)?); + let creation_time = RawString(read_vec(r, 8)?); + let file_label = RawString(read_vec(r, 64)?); + let _: [u8; 3] = read_bytes(r)?; + + Ok(HeaderRecord { + offsets: start..r.stream_position()?, + magic, + layout_code, + nominal_case_size, + compression, + weight_index, + n_cases, + bias, + creation_date, + creation_time, + eye_catcher, + file_label, + endian, + }) + } + + pub fn decode(self, decoder: &Decoder) -> DecodedRecord { + let eye_catcher = decoder.decode(&self.eye_catcher).to_string(); + let file_label = decoder.decode(&self.file_label).to_string(); + let creation_date = decoder.decode(&self.creation_date).to_string(); + let creation_time = decoder.decode(&self.creation_time).to_string(); + DecodedRecord::Header(HeaderRecord { + eye_catcher, + weight_index: self.weight_index, + n_cases: self.n_cases, + file_label, + offsets: self.offsets.clone(), + magic: self.magic, + layout_code: self.layout_code, + nominal_case_size: self.nominal_case_size, + compression: self.compression, + bias: self.bias, + creation_date, + creation_time, + endian: self.endian, + }) + } +} + +pub struct Decoder { + pub encoding: &'static Encoding, + pub warn: Box, +} + +impl Decoder { + pub fn new(encoding: &'static Encoding, warn: F) -> Self + where + F: Fn(Warning) + 'static, + { + Self { + encoding, + warn: Box::new(warn), + } + } + fn warn(&self, warning: Warning) { + (self.warn)(warning) + } + fn decode_slice<'a>(&self, input: &'a [u8]) -> Cow<'a, str> { + let (output, 
malformed) = self.encoding.decode_without_bom_handling(input); + if malformed { + self.warn(Warning::MalformedString { + encoding: self.encoding.name().into(), + text: output.clone().into(), + }); + } + output + } + + fn decode<'a>(&self, input: &'a RawString) -> Cow<'a, str> { + self.decode_slice(input.0.as_slice()) + } + + /// Returns `input` decoded from `self.encoding` into UTF-8 such that + /// re-encoding the result back into `self.encoding` will have exactly the + /// same length in bytes. + /// + /// XXX warn about errors? + pub fn decode_exact_length<'a>(&self, input: &'a [u8]) -> Cow<'a, str> { + if let (s, false) = self.encoding.decode_without_bom_handling(input) { + // This is the common case. Usually there will be no errors. + s + } else { + // Unusual case. Don't bother to optimize it much. + let mut decoder = self.encoding.new_decoder_without_bom_handling(); + let mut output = String::with_capacity( + decoder + .max_utf8_buffer_length_without_replacement(input.len()) + .unwrap(), + ); + let mut rest = input; + while !rest.is_empty() { + match decoder.decode_to_string_without_replacement(rest, &mut output, true) { + (DecoderResult::InputEmpty, _) => break, + (DecoderResult::OutputFull, _) => unreachable!(), + (DecoderResult::Malformed(a, b), consumed) => { + let skipped = a as usize + b as usize; + output.extend(repeat('?').take(skipped)); + rest = &rest[consumed..]; + } + } + } + assert_eq!(self.encoding.encode(&output).0.len(), input.len()); + output.into() + } + } + + pub fn decode_identifier(&self, input: &RawString) -> Result { + self.new_identifier(&self.decode(input)) + } + + pub fn new_identifier(&self, name: &str) -> Result { + Identifier::from_encoding(name, self.encoding) + } +} + +impl Header for HeaderRecord +where + S: Debug, +{ + fn offsets(&self) -> Range { + self.offsets.clone() + } +} + +#[derive(Copy, Clone, PartialEq, Eq, Hash)] +pub enum Magic { + /// Regular system file. + Sav, + + /// System file with Zlib-compressed data. 
+ Zsav, + + /// EBCDIC-encoded system file. + Ebcdic, +} + +impl Magic { + /// Magic number for a regular system file. + pub const SAV: [u8; 4] = *b"$FL2"; + + /// Magic number for a system file that contains zlib-compressed data. + pub const ZSAV: [u8; 4] = *b"$FL3"; + + /// Magic number for an EBCDIC-encoded system file. This is `$FL2` encoded + /// in EBCDIC. + pub const EBCDIC: [u8; 4] = [0x5b, 0xc6, 0xd3, 0xf2]; +} + +impl Debug for Magic { + fn fmt(&self, f: &mut Formatter) -> FmtResult { + let s = match *self { + Magic::Sav => "$FL2", + Magic::Zsav => "$FL3", + Magic::Ebcdic => "($FL2 in EBCDIC)", + }; + write!(f, "{s}") + } +} + +impl TryFrom<[u8; 4]> for Magic { + type Error = Error; + + fn try_from(value: [u8; 4]) -> Result { + match value { + Magic::SAV => Ok(Magic::Sav), + Magic::ZSAV => Ok(Magic::Zsav), + Magic::EBCDIC => Ok(Magic::Ebcdic), + _ => Err(Error::BadMagic(value)), + } + } +} + +#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub enum VarType { + Numeric, + String, +} + +impl VarType { + pub fn from_width(width: VarWidth) -> VarType { + match width { + VarWidth::Numeric => Self::Numeric, + VarWidth::String(_) => Self::String, + } + } + + pub fn opposite(self) -> VarType { + match self { + Self::Numeric => Self::String, + Self::String => Self::Numeric, + } + } +} + +impl Display for VarType { + fn fmt(&self, f: &mut Formatter) -> FmtResult { + match self { + VarType::Numeric => write!(f, "numeric"), + VarType::String => write!(f, "string"), + } + } +} + +#[derive(Copy, Clone)] +pub enum Value +where + S: Debug, +{ + Number(Option), + String(S), +} + +type RawValue = Value>; + +impl Debug for Value +where + S: Debug, +{ + fn fmt(&self, f: &mut Formatter) -> FmtResult { + match self { + Value::Number(Some(number)) => write!(f, "{number:?}"), + Value::Number(None) => write!(f, "SYSMIS"), + Value::String(s) => write!(f, "{:?}", s), + } + } +} + +impl RawValue { + fn read(r: &mut R, var_type: VarType, endian: Endian) -> Result 
{ + Ok(Self::from_raw( + &UntypedValue(read_bytes(r)?), + var_type, + endian, + )) + } + + pub fn from_raw(raw: &UntypedValue, var_type: VarType, endian: Endian) -> Self { + match var_type { + VarType::String => Value::String(RawStr(raw.0)), + VarType::Numeric => { + let number: f64 = endian.parse(raw.0); + Value::Number((number != -f64::MAX).then_some(number)) + } + } + } + + fn read_case( + reader: &mut R, + var_types: &[VarType], + endian: Endian, + ) -> Result>, Error> { + let case_start = reader.stream_position()?; + let mut values = Vec::with_capacity(var_types.len()); + for (i, &var_type) in var_types.iter().enumerate() { + let Some(raw) = try_read_bytes(reader)? else { + if i == 0 { + return Ok(None); + } else { + let offset = reader.stream_position()?; + return Err(Error::EofInCase { + offset, + case_ofs: offset - case_start, + case_len: var_types.len() * 8, + }); + } + }; + values.push(Value::from_raw(&UntypedValue(raw), var_type, endian)); + } + Ok(Some(values)) + } + + fn read_compressed_case( + reader: &mut R, + var_types: &[VarType], + codes: &mut VecDeque, + endian: Endian, + bias: f64, + ) -> Result>, Error> { + let case_start = reader.stream_position()?; + let mut values = Vec::with_capacity(var_types.len()); + for (i, &var_type) in var_types.iter().enumerate() { + let value = loop { + let Some(code) = codes.pop_front() else { + let Some(new_codes): Option<[u8; 8]> = try_read_bytes(reader)? 
else { + if i == 0 { + return Ok(None); + } else { + let offset = reader.stream_position()?; + return Err(Error::EofInCompressedCase { + offset, + case_ofs: offset - case_start, + }); + } + }; + codes.extend(new_codes.into_iter()); + continue; + }; + match code { + 0 => (), + 1..=251 => match var_type { + VarType::Numeric => break Self::Number(Some(code as f64 - bias)), + VarType::String => { + break Self::String(RawStr(endian.to_bytes(code as f64 - bias))) + } + }, + 252 => { + if i == 0 { + return Ok(None); + } else { + let offset = reader.stream_position()?; + return Err(Error::PartialCompressedCase { + offset, + case_ofs: offset - case_start, + }); + } + } + 253 => { + break Self::from_raw(&UntypedValue(read_bytes(reader)?), var_type, endian) + } + 254 => match var_type { + VarType::String => break Self::String(RawStr(*b" ")), // XXX EBCDIC + VarType::Numeric => { + return Err(Error::CompressedStringExpected { + offset: case_start, + case_ofs: reader.stream_position()? - case_start, + }) + } + }, + 255 => match var_type { + VarType::Numeric => break Self::Number(None), + VarType::String => { + return Err(Error::CompressedNumberExpected { + offset: case_start, + case_ofs: reader.stream_position()? - case_start, + }) + } + }, + } + }; + values.push(value); + } + Ok(Some(values)) + } + + pub fn decode(self, decoder: &Decoder) -> Value { + match self { + Self::Number(x) => Value::Number(x), + Self::String(s) => Value::String(decoder.decode_exact_length(&s.0).into()), + } + } +} + +struct ZlibDecodeMultiple +where + R: Read + Seek, +{ + reader: Option>, +} + +impl ZlibDecodeMultiple +where + R: Read + Seek, +{ + fn new(reader: R) -> ZlibDecodeMultiple { + ZlibDecodeMultiple { + reader: Some(ZlibDecoder::new(reader)), + } + } +} + +impl Read for ZlibDecodeMultiple +where + R: Read + Seek, +{ + fn read(&mut self, buf: &mut [u8]) -> Result { + loop { + match self.reader.as_mut().unwrap().read(buf)? 
{ + 0 => { + let inner = self.reader.take().unwrap().into_inner(); + self.reader = Some(ZlibDecoder::new(inner)); + } + n => return Ok(n), + }; + } + } +} + +impl Seek for ZlibDecodeMultiple +where + R: Read + Seek, +{ + fn seek(&mut self, pos: SeekFrom) -> Result { + self.reader.as_mut().unwrap().get_mut().seek(pos) + } +} + +enum ReaderState { + Start, + Headers, + ZlibHeader, + ZlibTrailer { + ztrailer_offset: u64, + ztrailer_len: u64, + }, + Cases, + End, +} + +pub struct Reader +where + R: Read + Seek + 'static, +{ + reader: Option, + warn: Box, + + header: HeaderRecord, + var_types: Vec, + + state: ReaderState, +} + +impl Reader +where + R: Read + Seek + 'static, +{ + pub fn new(mut reader: R, warn: F) -> Result + where + F: Fn(Warning) + 'static, + { + let header = HeaderRecord::read(&mut reader)?; + Ok(Self { + reader: Some(reader), + warn: Box::new(warn), + header, + var_types: Vec::new(), + state: ReaderState::Start, + }) + } + fn cases(&mut self) -> Cases { + self.state = ReaderState::End; + Cases::new( + self.reader.take().unwrap(), + take(&mut self.var_types), + &self.header, + ) + } + fn _next(&mut self) -> Option<::Item> { + match self.state { + ReaderState::Start => { + self.state = ReaderState::Headers; + Some(Ok(Record::Header(self.header.clone()))) + } + ReaderState::Headers => { + let record = loop { + match Record::read( + self.reader.as_mut().unwrap(), + self.header.endian, + self.var_types.as_slice(), + &self.warn, + ) { + Ok(Some(record)) => break record, + Ok(None) => (), + Err(error) => return Some(Err(error)), + } + }; + match record { + Record::Variable(VariableRecord { width, .. 
}) => { + self.var_types.push(if width == 0 { + VarType::Numeric + } else { + VarType::String + }); + } + Record::EndOfHeaders(_) => { + self.state = if let Some(Compression::ZLib) = self.header.compression { + ReaderState::ZlibHeader + } else { + ReaderState::Cases + }; + } + _ => (), + }; + Some(Ok(record)) + } + ReaderState::ZlibHeader => { + let zheader = match ZHeader::read(self.reader.as_mut().unwrap(), self.header.endian) + { + Ok(zheader) => zheader, + Err(error) => return Some(Err(error)), + }; + self.state = ReaderState::ZlibTrailer { + ztrailer_offset: zheader.ztrailer_offset, + ztrailer_len: zheader.ztrailer_len, + }; + Some(Ok(Record::ZHeader(zheader))) + } + ReaderState::ZlibTrailer { + ztrailer_offset, + ztrailer_len, + } => { + match ZTrailer::read( + self.reader.as_mut().unwrap(), + self.header.endian, + ztrailer_offset, + ztrailer_len, + ) { + Ok(None) => Some(Ok(Record::Cases(Rc::new(RefCell::new(self.cases()))))), + Ok(Some(ztrailer)) => Some(Ok(Record::ZTrailer(ztrailer))), + Err(error) => Some(Err(error)), + } + } + ReaderState::Cases => Some(Ok(Record::Cases(Rc::new(RefCell::new(self.cases()))))), + ReaderState::End => None, + } + } +} + +impl Iterator for Reader +where + R: Read + Seek + 'static, +{ + type Item = Result; + + fn next(&mut self) -> Option { + let retval = self._next(); + if matches!(retval, Some(Err(_))) { + self.state = ReaderState::End; + } + retval + } +} + +trait ReadSeek: Read + Seek {} +impl ReadSeek for T where T: Read + Seek {} + +pub struct Cases { + reader: Box, + var_types: Vec, + compression: Option, + bias: f64, + endian: Endian, + codes: VecDeque, + eof: bool, +} + +impl Debug for Cases { + fn fmt(&self, f: &mut Formatter) -> FmtResult { + write!(f, "Cases") + } +} + +impl Cases { + fn new(reader: R, var_types: Vec, header: &HeaderRecord) -> Self + where + R: Read + Seek + 'static, + { + Self { + reader: if header.compression == Some(Compression::ZLib) { + Box::new(ZlibDecodeMultiple::new(reader)) + } else { + 
Box::new(reader) + }, + var_types, + compression: header.compression, + bias: header.bias, + endian: header.endian, + codes: VecDeque::with_capacity(8), + eof: false, + } + } +} + +impl Iterator for Cases { + type Item = Result, Error>; + + fn next(&mut self) -> Option { + if self.eof { + return None; + } + + let retval = if self.compression.is_some() { + Value::read_compressed_case( + &mut self.reader, + &self.var_types, + &mut self.codes, + self.endian, + self.bias, + ) + .transpose() + } else { + Value::read_case(&mut self.reader, &self.var_types, self.endian).transpose() + }; + self.eof = matches!(retval, None | Some(Err(_))); + retval + } +} + +#[derive(Copy, Clone, PartialEq, Eq, Hash)] +pub struct Spec(pub u32); + +impl Debug for Spec { + fn fmt(&self, f: &mut Formatter) -> FmtResult { + let type_ = format_name(self.0 >> 16); + let w = (self.0 >> 8) & 0xff; + let d = self.0 & 0xff; + write!(f, "{:06x} ({type_}{w}.{d})", self.0) + } +} + +fn format_name(type_: u32) -> Cow<'static, str> { + match type_ { + 1 => "A", + 2 => "AHEX", + 3 => "COMMA", + 4 => "DOLLAR", + 5 => "F", + 6 => "IB", + 7 => "PIBHEX", + 8 => "P", + 9 => "PIB", + 10 => "PK", + 11 => "RB", + 12 => "RBHEX", + 15 => "Z", + 16 => "N", + 17 => "E", + 20 => "DATE", + 21 => "TIME", + 22 => "DATETIME", + 23 => "ADATE", + 24 => "JDATE", + 25 => "DTIME", + 26 => "WKDAY", + 27 => "MONTH", + 28 => "MOYR", + 29 => "QYR", + 30 => "WKYR", + 31 => "PCT", + 32 => "DOT", + 33 => "CCA", + 34 => "CCB", + 35 => "CCC", + 36 => "CCD", + 37 => "CCE", + 38 => "EDATE", + 39 => "SDATE", + 40 => "MTIME", + 41 => "YMDHMS", + _ => return format!("").into(), + } + .into() +} + +#[derive(Clone)] +pub struct MissingValues +where + S: Debug, +{ + /// Individual missing values, up to 3 of them. + pub values: Vec>, + + /// Optional range of missing values. 
+ pub range: Option<(Value, Value)>, +} + +impl Debug for MissingValues +where + S: Debug, +{ + fn fmt(&self, f: &mut Formatter) -> FmtResult { + for (i, value) in self.values.iter().enumerate() { + if i > 0 { + write!(f, ", ")?; + } + write!(f, "{value:?}")?; + } + + if let Some((low, high)) = &self.range { + if !self.values.is_empty() { + write!(f, ", ")?; + } + write!(f, "{low:?} THRU {high:?}")?; + } + + if self.is_empty() { + write!(f, "none")?; + } + + Ok(()) + } +} + +impl MissingValues +where + S: Debug, +{ + fn is_empty(&self) -> bool { + self.values.is_empty() && self.range.is_none() + } +} + +impl Default for MissingValues +where + S: Debug, +{ + fn default() -> Self { + Self { + values: Vec::new(), + range: None, + } + } +} + +impl MissingValues> { + fn read( + r: &mut R, + offset: u64, + width: i32, + code: i32, + endian: Endian, + ) -> Result { + let (n_values, has_range) = match (width, code) { + (_, 0..=3) => (code, false), + (0, -2) => (0, true), + (0, -3) => (1, true), + (0, _) => return Err(Error::BadNumericMissingValueCode { offset, code }), + (_, _) => return Err(Error::BadStringMissingValueCode { offset, code }), + }; + + let var_type = if width == 0 { + VarType::Numeric + } else { + VarType::String + }; + + let mut values = Vec::new(); + for _ in 0..n_values { + values.push(RawValue::read(r, var_type, endian)?); + } + let range = if has_range { + let low = RawValue::read(r, var_type, endian)?; + let high = RawValue::read(r, var_type, endian)?; + Some((low, high)) + } else { + None + }; + Ok(Self { values, range }) + } + fn decode(&self, decoder: &Decoder) -> MissingValues { + MissingValues { + values: self + .values + .iter() + .map(|value| value.decode(decoder)) + .collect(), + range: self + .range + .as_ref() + .map(|(low, high)| (low.decode(decoder), high.decode(decoder))), + } + } +} + +#[derive(Clone)] +pub struct VariableRecord +where + S: Debug, + V: Debug, +{ + /// Range of offsets in file. 
+ pub offsets: Range, + + /// Variable width, in the range -1..=255. + pub width: i32, + + /// Variable name, padded on the right with spaces. + pub name: S, + + /// Print format. + pub print_format: Spec, + + /// Write format. + pub write_format: Spec, + + /// Missing values. + pub missing_values: MissingValues, + + /// Optional variable label. + pub label: Option, +} + +impl Debug for VariableRecord +where + S: Debug, + V: Debug, +{ + fn fmt(&self, f: &mut Formatter) -> FmtResult { + writeln!( + f, + "Width: {} ({})", + self.width, + match self.width.cmp(&0) { + Ordering::Greater => "string", + Ordering::Equal => "numeric", + Ordering::Less => "long string continuation record", + } + )?; + writeln!(f, "Print format: {:?}", self.print_format)?; + writeln!(f, "Write format: {:?}", self.write_format)?; + writeln!(f, "Name: {:?}", &self.name)?; + writeln!(f, "Variable label: {:?}", self.label)?; + writeln!(f, "Missing values: {:?}", self.missing_values) + } +} + +impl VariableRecord> { + fn read(r: &mut R, endian: Endian) -> Result { + let start_offset = r.stream_position()?; + let width: i32 = endian.parse(read_bytes(r)?); + if !(-1..=255).contains(&width) { + return Err(Error::BadVariableWidth { + start_offset, + width, + }); + } + let code_offset = r.stream_position()?; + let has_variable_label: u32 = endian.parse(read_bytes(r)?); + let missing_value_code: i32 = endian.parse(read_bytes(r)?); + let print_format = Spec(endian.parse(read_bytes(r)?)); + let write_format = Spec(endian.parse(read_bytes(r)?)); + let name = RawString(read_vec(r, 8)?); + + let label = match has_variable_label { + 0 => None, + 1 => { + let len: u32 = endian.parse(read_bytes(r)?); + let read_len = len.min(65535) as usize; + let label = RawString(read_vec(r, read_len)?); + + let padding_bytes = Integer::next_multiple_of(&len, &4) - len; + let _ = read_vec(r, padding_bytes as usize)?; + + Some(label) + } + _ => { + return Err(Error::BadVariableLabelCode { + start_offset, + code_offset, + code: 
has_variable_label, + }) + } + }; + + let missing_values = + MissingValues::read(r, start_offset, width, missing_value_code, endian)?; + + let end_offset = r.stream_position()?; + + Ok(Record::Variable(VariableRecord { + offsets: start_offset..end_offset, + width, + name, + print_format, + write_format, + missing_values, + label, + })) + } + + pub fn decode(self, decoder: &Decoder) -> DecodedRecord { + DecodedRecord::Variable(VariableRecord { + offsets: self.offsets.clone(), + width: self.width, + name: decoder.decode(&self.name).to_string(), + print_format: self.print_format, + write_format: self.write_format, + missing_values: self.missing_values.decode(decoder), + label: self + .label + .as_ref() + .map(|label| decoder.decode(label).to_string()), + }) + } +} + +#[derive(Copy, Clone)] +pub struct UntypedValue(pub [u8; 8]); + +impl Debug for UntypedValue { + fn fmt(&self, f: &mut Formatter) -> FmtResult { + let little: f64 = Endian::Little.parse(self.0); + let little = format!("{:?}", little); + let big: f64 = Endian::Big.parse(self.0); + let big = format!("{:?}", big); + let number = if little.len() <= big.len() { + little + } else { + big + }; + write!(f, "{number}")?; + + let string = default_decode(&self.0); + let string = string + .split(|c: char| c == '\0' || c.is_control()) + .next() + .unwrap(); + write!(f, "{string:?}")?; + Ok(()) + } +} + +#[derive(Clone)] +pub struct RawString(pub Vec); + +impl From> for RawString { + fn from(source: Vec) -> Self { + Self(source) + } +} + +impl From<&[u8]> for RawString { + fn from(source: &[u8]) -> Self { + Self(source.into()) + } +} + +impl Debug for RawString { + fn fmt(&self, f: &mut Formatter) -> FmtResult { + write!(f, "{:?}", default_decode(self.0.as_slice())) + } +} + +#[derive(Copy, Clone)] +pub struct RawStr(pub [u8; N]); + +impl From<[u8; N]> for RawStr { + fn from(source: [u8; N]) -> Self { + Self(source) + } +} + +impl Debug for RawStr { + fn fmt(&self, f: &mut Formatter) -> FmtResult { + write!(f, "{:?}", 
default_decode(&self.0)) + } +} + +#[derive(Clone, Debug)] +pub struct ValueLabel +where + V: Debug, + S: Debug, +{ + pub value: Value, + pub label: S, +} + +#[derive(Clone)] +pub struct ValueLabelRecord +where + V: Debug, + S: Debug, +{ + /// Range of offsets in file. + pub offsets: Range, + + /// The labels. + pub labels: Vec>, + + /// The 1-based indexes of the variable indexes. + pub dict_indexes: Vec, + + /// The types of the variables. + pub var_type: VarType, +} + +impl Debug for ValueLabelRecord +where + V: Debug, + S: Debug, +{ + fn fmt(&self, f: &mut Formatter) -> FmtResult { + writeln!(f, "labels: ")?; + for label in self.labels.iter() { + writeln!(f, "{label:?}")?; + } + write!(f, "apply to {} variables", self.var_type)?; + for dict_index in self.dict_indexes.iter() { + write!(f, " #{dict_index}")?; + } + Ok(()) + } +} + +impl Header for ValueLabelRecord +where + V: Debug, + S: Debug, +{ + fn offsets(&self) -> Range { + self.offsets.clone() + } +} + +impl ValueLabelRecord +where + V: Debug, + S: Debug, +{ + /// Maximum number of value labels in a record. + pub const MAX_LABELS: u32 = u32::MAX / 8; + + /// Maximum number of variable indexes in a record. 
+ pub const MAX_INDEXES: u32 = u32::MAX / 8; +} + +impl ValueLabelRecord, RawString> { + fn read( + r: &mut R, + endian: Endian, + var_types: &[VarType], + warn: &dyn Fn(Warning), + ) -> Result, Error> { + let label_offset = r.stream_position()?; + let n: u32 = endian.parse(read_bytes(r)?); + if n > Self::MAX_LABELS { + return Err(Error::BadNumberOfValueLabels { + offset: label_offset, + n, + max: Self::MAX_LABELS, + }); + } + + let mut labels = Vec::new(); + for _ in 0..n { + let value = UntypedValue(read_bytes(r)?); + let label_len: u8 = endian.parse(read_bytes(r)?); + let label_len = label_len as usize; + let padded_len = Integer::next_multiple_of(&(label_len + 1), &8); + + let mut label = read_vec(r, padded_len - 1)?; + label.truncate(label_len); + labels.push((value, RawString(label))); + } + + let index_offset = r.stream_position()?; + let rec_type: u32 = endian.parse(read_bytes(r)?); + if rec_type != 4 { + return Err(Error::ExpectedVarIndexRecord { + offset: index_offset, + rec_type, + }); + } + + let n: u32 = endian.parse(read_bytes(r)?); + if n > Self::MAX_INDEXES { + return Err(Error::TooManyVarIndexes { + offset: index_offset, + n, + max: Self::MAX_INDEXES, + }); + } else if n == 0 { + warn(Warning::NoVarIndexes { + offset: index_offset, + }); + return Ok(None); + } + + let index_offset = r.stream_position()?; + let mut dict_indexes = Vec::with_capacity(n as usize); + let mut invalid_indexes = Vec::new(); + for _ in 0..n { + let index: u32 = endian.parse(read_bytes(r)?); + if index == 0 || index as usize > var_types.len() { + dict_indexes.push(index); + } else { + invalid_indexes.push(index); + } + } + if !invalid_indexes.is_empty() { + warn(Warning::InvalidVarIndexes { + offset: index_offset, + max: var_types.len(), + invalid: invalid_indexes, + }); + } + + let Some(&first_index) = dict_indexes.first() else { + return Ok(None); + }; + let var_type = var_types[first_index as usize - 1]; + let mut wrong_type_indexes = Vec::new(); + 
dict_indexes.retain(|&index| { + if var_types[index as usize - 1] != var_type { + wrong_type_indexes.push(index); + false + } else { + true + } + }); + if !wrong_type_indexes.is_empty() { + warn(Warning::MixedVarTypes { + offset: index_offset, + var_type, + wrong_types: wrong_type_indexes, + }); + } + + let labels = labels + .into_iter() + .map(|(value, label)| ValueLabel { + value: Value::from_raw(&value, var_type, endian), + label, + }) + .collect(); + + let end_offset = r.stream_position()?; + Ok(Some(Record::ValueLabel(ValueLabelRecord { + offsets: label_offset..end_offset, + labels, + dict_indexes, + var_type, + }))) + } + + fn decode(self, decoder: &Decoder) -> ValueLabelRecord, String> { + let labels = self + .labels + .iter() + .map(|ValueLabel { value, label }| ValueLabel { + value: *value, + label: decoder.decode(label).to_string(), + }) + .collect(); + ValueLabelRecord { + offsets: self.offsets.clone(), + labels, + dict_indexes: self.dict_indexes.clone(), + var_type: self.var_type, + } + } +} + +#[derive(Clone, Debug)] +pub struct DocumentRecord +where + S: Debug, +{ + pub offsets: Range, + + /// The document, as an array of lines. Raw lines are exactly 80 bytes long + /// and are right-padded with spaces without any new-line termination. + pub lines: Vec, +} + +pub type RawDocumentLine = RawStr; + +/// Length of a line in a document. Document lines are fixed-length and +/// padded on the right with spaces. +pub const DOC_LINE_LEN: usize = 80; + +impl DocumentRecord { + /// Maximum number of lines we will accept in a document. This is simply + /// the maximum number that will fit in a 32-bit space. 
+ pub const MAX_LINES: usize = i32::MAX as usize / DOC_LINE_LEN; + + fn read(r: &mut R, endian: Endian) -> Result { + let start_offset = r.stream_position()?; + let n: u32 = endian.parse(read_bytes(r)?); + let n = n as usize; + if n > Self::MAX_LINES { + Err(Error::BadDocumentLength { + offset: start_offset, + n, + max: Self::MAX_LINES, + }) + } else { + let mut lines = Vec::with_capacity(n); + for _ in 0..n { + lines.push(RawStr(read_bytes(r)?)); + } + let end_offset = r.stream_position()?; + Ok(Record::Document(DocumentRecord { + offsets: start_offset..end_offset, + lines, + })) + } + } + + pub fn decode(self, decoder: &Decoder) -> DecodedRecord { + DecodedRecord::Document(DocumentRecord { + offsets: self.offsets.clone(), + lines: self + .lines + .iter() + .map(|s| decoder.decode_slice(&s.0).to_string()) + .collect(), + }) + } +} + +impl Header for DocumentRecord +where + S: Debug, +{ + fn offsets(&self) -> Range { + self.offsets.clone() + } +} + +trait ExtensionRecord { + const SUBTYPE: u32; + const SIZE: Option; + const COUNT: Option; + const NAME: &'static str; + fn parse(ext: &Extension, endian: Endian) -> Result; +} + +#[derive(Clone, Debug)] +pub struct IntegerInfoRecord { + pub offsets: Range, + pub version: (i32, i32, i32), + pub machine_code: i32, + pub floating_point_rep: i32, + pub compression_code: i32, + pub endianness: i32, + pub character_code: i32, +} + +impl ExtensionRecord for IntegerInfoRecord { + const SUBTYPE: u32 = 3; + const SIZE: Option = Some(4); + const COUNT: Option = Some(8); + const NAME: &'static str = "integer record"; + + fn parse(ext: &Extension, endian: Endian) -> Result { + ext.check_size::()?; + + let mut input = &ext.data[..]; + let data: Vec = (0..8) + .map(|_| endian.parse(read_bytes(&mut input).unwrap())) + .collect(); + Ok(Record::IntegerInfo(IntegerInfoRecord { + offsets: ext.offsets.clone(), + version: (data[0], data[1], data[2]), + machine_code: data[3], + floating_point_rep: data[4], + compression_code: data[5], + 
endianness: data[6], + character_code: data[7], + })) + } +} + +#[derive(Clone, Debug)] +pub struct FloatInfoRecord { + pub sysmis: f64, + pub highest: f64, + pub lowest: f64, +} + +impl ExtensionRecord for FloatInfoRecord { + const SUBTYPE: u32 = 4; + const SIZE: Option = Some(8); + const COUNT: Option = Some(3); + const NAME: &'static str = "floating point record"; + + fn parse(ext: &Extension, endian: Endian) -> Result { + ext.check_size::()?; + + let mut input = &ext.data[..]; + let data: Vec = (0..3) + .map(|_| endian.parse(read_bytes(&mut input).unwrap())) + .collect(); + Ok(Record::FloatInfo(FloatInfoRecord { + sysmis: data[0], + highest: data[1], + lowest: data[2], + })) + } +} + +#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub enum CategoryLabels { + VarLabels, + CountedValues, +} + +#[derive(Clone, Debug)] +pub enum MultipleResponseType { + MultipleDichotomy { + value: RawString, + labels: CategoryLabels, + }, + MultipleCategory, +} + +impl MultipleResponseType { + fn parse(input: &[u8]) -> Result<(MultipleResponseType, &[u8]), Warning> { + let (mr_type, input) = match input.split_first() { + Some((b'C', input)) => (MultipleResponseType::MultipleCategory, input), + Some((b'D', input)) => { + let (value, input) = parse_counted_string(input)?; + ( + MultipleResponseType::MultipleDichotomy { + value, + labels: CategoryLabels::VarLabels, + }, + input, + ) + } + Some((b'E', input)) => { + let (labels, input) = if let Some(rest) = input.strip_prefix(b" 1 ") { + (CategoryLabels::CountedValues, rest) + } else if let Some(rest) = input.strip_prefix(b" 11 ") { + (CategoryLabels::VarLabels, rest) + } else { + return Err(Warning::TBD); + }; + let (value, input) = parse_counted_string(input)?; + ( + MultipleResponseType::MultipleDichotomy { value, labels }, + input, + ) + } + _ => return Err(Warning::TBD), + }; + Ok((mr_type, input)) + } +} + +#[derive(Clone, Debug)] +pub struct MultipleResponseSet +where + I: Debug, + S: Debug, +{ + pub name: 
I, + pub label: S, + pub mr_type: MultipleResponseType, + pub short_names: Vec, +} + +impl MultipleResponseSet { + fn parse(input: &[u8]) -> Result<(Self, &[u8]), Warning> { + let Some(equals) = input.iter().position(|&b| b == b'=') else { + return Err(Warning::TBD); + }; + let (name, input) = input.split_at(equals); + let (mr_type, input) = MultipleResponseType::parse(input)?; + let Some(input) = input.strip_prefix(b" ") else { + return Err(Warning::TBD); + }; + let (label, mut input) = parse_counted_string(input)?; + let mut vars = Vec::new(); + while input.first() != Some(&b'\n') { + match input.split_first() { + Some((b' ', rest)) => { + let Some(length) = rest.iter().position(|b| b" \n".contains(b)) else { + return Err(Warning::TBD); + }; + let (var, rest) = rest.split_at(length); + if !var.is_empty() { + vars.push(var.into()); + } + input = rest; + } + _ => return Err(Warning::TBD), + } + } + while input.first() == Some(&b'\n') { + input = &input[1..]; + } + Ok(( + MultipleResponseSet { + name: name.into(), + label, + mr_type, + short_names: vars, + }, + input, + )) + } + + fn decode( + &self, + decoder: &Decoder, + ) -> Result, Warning> { + let mut short_names = Vec::with_capacity(self.short_names.len()); + for short_name in self.short_names.iter() { + if let Some(short_name) = decoder + .decode_identifier(short_name) + .map_err(Warning::InvalidMrSetName) + .issue_warning(&decoder.warn) + { + short_names.push(short_name); + } + } + Ok(MultipleResponseSet { + name: decoder + .decode_identifier(&self.name) + .map_err(Warning::InvalidMrSetVariableName)?, + label: decoder.decode(&self.label).to_string(), + mr_type: self.mr_type.clone(), + short_names, + }) + } +} + +#[derive(Clone, Debug)] +pub struct MultipleResponseRecord(pub Vec>) +where + I: Debug, + S: Debug; + +impl ExtensionRecord for MultipleResponseRecord { + const SUBTYPE: u32 = 7; + const SIZE: Option = Some(1); + const COUNT: Option = None; + const NAME: &'static str = "multiple response set record"; 
+ + fn parse(ext: &Extension, _endian: Endian) -> Result { + ext.check_size::()?; + + let mut input = &ext.data[..]; + let mut sets = Vec::new(); + while !input.is_empty() { + let (set, rest) = MultipleResponseSet::parse(input)?; + sets.push(set); + input = rest; + } + Ok(Record::MultipleResponse(MultipleResponseRecord(sets))) + } +} + +impl MultipleResponseRecord { + fn decode(self, decoder: &Decoder) -> DecodedRecord { + let mut sets = Vec::new(); + for set in self.0.iter() { + if let Some(set) = set.decode(decoder).issue_warning(&decoder.warn) { + sets.push(set); + } + } + DecodedRecord::MultipleResponse(MultipleResponseRecord(sets)) + } +} + +fn parse_counted_string(input: &[u8]) -> Result<(RawString, &[u8]), Warning> { + let Some(space) = input.iter().position(|&b| b == b' ') else { + return Err(Warning::TBD); + }; + let Ok(length) = from_utf8(&input[..space]) else { + return Err(Warning::TBD); + }; + let Ok(length): Result = length.parse() else { + return Err(Warning::TBD); + }; + + let input = &input[space + 1..]; + if input.len() < length { + return Err(Warning::TBD); + }; + + let (string, rest) = input.split_at(length); + Ok((string.into(), rest)) +} + +#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub enum Measure { + Nominal, + Ordinal, + Scale, +} + +impl Measure { + pub fn default_for_type(var_type: VarType) -> Option { + match var_type { + VarType::Numeric => None, + VarType::String => Some(Self::Nominal), + } + } + + fn try_decode(source: u32) -> Result, Warning> { + match source { + 0 => Ok(None), + 1 => Ok(Some(Measure::Nominal)), + 2 => Ok(Some(Measure::Ordinal)), + 3 => Ok(Some(Measure::Scale)), + _ => Err(Warning::InvalidMeasurement(source)), + } + } +} + +#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub enum Alignment { + Left, + Right, + Center, +} + +impl Alignment { + fn try_decode(source: u32) -> Result, Warning> { + match source { + 0 => Ok(None), + 1 => Ok(Some(Alignment::Left)), + 2 => 
Ok(Some(Alignment::Right)), + 3 => Ok(Some(Alignment::Center)), + _ => Err(Warning::InvalidAlignment(source)), + } + } + + pub fn default_for_type(var_type: VarType) -> Self { + match var_type { + VarType::Numeric => Self::Right, + VarType::String => Self::Left, + } + } +} + +#[derive(Clone, Debug)] +pub struct VarDisplay { + pub measure: Option, + pub width: Option, + pub alignment: Option, +} + +#[derive(Clone, Debug)] +pub struct VarDisplayRecord(pub Vec); + +impl VarDisplayRecord { + const SUBTYPE: u32 = 11; + + fn parse( + ext: &Extension, + n_vars: usize, + endian: Endian, + warn: &dyn Fn(Warning), + ) -> Result { + if ext.size != 4 { + return Err(Warning::BadRecordSize { + offset: ext.offsets.start, + record: String::from("variable display record"), + size: ext.size, + expected_size: 4, + }); + } + + let has_width = if ext.count as usize == 3 * n_vars { + true + } else if ext.count as usize == 2 * n_vars { + false + } else { + return Err(Warning::TBD); + }; + + let mut var_displays = Vec::new(); + let mut input = &ext.data[..]; + for _ in 0..n_vars { + let measure = Measure::try_decode(endian.parse(read_bytes(&mut input).unwrap())) + .issue_warning(&warn) + .flatten(); + let width = has_width.then(|| endian.parse(read_bytes(&mut input).unwrap())); + let alignment = Alignment::try_decode(endian.parse(read_bytes(&mut input).unwrap())) + .issue_warning(&warn) + .flatten(); + var_displays.push(VarDisplay { + measure, + width, + alignment, + }); + } + Ok(Record::VarDisplay(VarDisplayRecord(var_displays))) + } +} + +#[derive(Clone, Debug)] +pub struct LongStringMissingValues +where + N: Debug, + V: Debug, +{ + /// Variable name. + pub var_name: N, + + /// Missing values. 
+ pub missing_values: MissingValues, +} + +impl LongStringMissingValues> { + fn decode( + &self, + decoder: &Decoder, + ) -> Result, IdError> { + Ok(LongStringMissingValues { + var_name: decoder.decode_identifier(&self.var_name)?, + missing_values: self.missing_values.decode(decoder), + }) + } +} + +#[derive(Clone, Debug)] +pub struct LongStringMissingValueRecord(pub Vec>) +where + N: Debug, + V: Debug; + +impl ExtensionRecord for LongStringMissingValueRecord> { + const SUBTYPE: u32 = 22; + const SIZE: Option = Some(1); + const COUNT: Option = None; + const NAME: &'static str = "long string missing values record"; + + fn parse(ext: &Extension, endian: Endian) -> Result { + ext.check_size::()?; + + let mut input = &ext.data[..]; + let mut missing_value_set = Vec::new(); + while !input.is_empty() { + let var_name = read_string(&mut input, endian)?; + let n_missing_values: u8 = endian.parse(read_bytes(&mut input)?); + let value_len: u32 = endian.parse(read_bytes(&mut input)?); + if value_len != 8 { + let offset = (ext.data.len() - input.len() - 8) as u64 + ext.offsets.start; + return Err(Warning::BadLongMissingValueLength { + record_offset: ext.offsets.start, + offset, + value_len, + }); + } + let mut values = Vec::new(); + for i in 0..n_missing_values { + let value: [u8; 8] = read_bytes(&mut input)?; + let numeric_value: u64 = endian.parse(value); + let value = if i > 0 && numeric_value == 8 { + // Tolerate files written by old, buggy versions of PSPP + // where we believed that the value_length was repeated + // before each missing value. + read_bytes(&mut input)? 
+ } else { + value + }; + values.push(Value::String(RawStr(value))); + } + let missing_values = MissingValues { + values, + range: None, + }; + missing_value_set.push(LongStringMissingValues { + var_name, + missing_values, + }); + } + Ok(Record::LongStringMissingValues( + LongStringMissingValueRecord(missing_value_set), + )) + } +} + +impl LongStringMissingValueRecord> { + pub fn decode(self, decoder: &Decoder) -> LongStringMissingValueRecord { + let mut mvs = Vec::with_capacity(self.0.len()); + for mv in self.0.iter() { + if let Some(mv) = mv + .decode(decoder) + .map_err(Warning::InvalidLongStringMissingValueVariableName) + .issue_warning(&decoder.warn) + { + mvs.push(mv); + } + } + LongStringMissingValueRecord(mvs) + } +} + +#[derive(Clone, Debug)] +pub struct EncodingRecord(pub String); + +impl ExtensionRecord for EncodingRecord { + const SUBTYPE: u32 = 20; + const SIZE: Option = Some(1); + const COUNT: Option = None; + const NAME: &'static str = "encoding record"; + + fn parse(ext: &Extension, _endian: Endian) -> Result { + ext.check_size::()?; + + Ok(Record::Encoding(EncodingRecord( + String::from_utf8(ext.data.clone()).map_err(|_| Warning::BadEncodingName { + offset: ext.offsets.start, + })?, + ))) + } +} + +#[derive(Clone, Debug)] +pub struct NumberOfCasesRecord { + /// Always observed as 1. + pub one: u64, + + /// Number of cases. + pub n_cases: u64, +} + +impl ExtensionRecord for NumberOfCasesRecord { + const SUBTYPE: u32 = 16; + const SIZE: Option = Some(8); + const COUNT: Option = Some(2); + const NAME: &'static str = "extended number of cases record"; + + fn parse(ext: &Extension, endian: Endian) -> Result { + ext.check_size::()?; + + let mut input = &ext.data[..]; + let one = endian.parse(read_bytes(&mut input)?); + let n_cases = endian.parse(read_bytes(&mut input)?); + + Ok(Record::NumberOfCases(NumberOfCasesRecord { one, n_cases })) + } +} + +#[derive(Clone, Debug)] +pub struct TextRecord { + pub offsets: Range, + + /// Type of record. 
+ pub rec_type: TextRecordType, + + /// The text content of the record. + pub text: RawString, +} + +#[derive(Clone, Copy, Debug)] +pub enum TextRecordType { + VariableSets, + ProductInfo, + LongNames, + VeryLongStrings, + FileAttributes, + VariableAttributes, +} + +impl TextRecord { + fn new(extension: Extension, rec_type: TextRecordType) -> Self { + Self { + offsets: extension.offsets, + rec_type, + text: extension.data.into(), + } + } + pub fn decode(self, decoder: &Decoder) -> DecodedRecord { + match self.rec_type { + TextRecordType::VariableSets => { + DecodedRecord::VariableSets(VariableSetRecord::decode(&self, decoder)) + } + TextRecordType::ProductInfo => { + DecodedRecord::ProductInfo(ProductInfoRecord::decode(&self, decoder)) + } + TextRecordType::LongNames => { + DecodedRecord::LongNames(LongNamesRecord::decode(&self, decoder)) + } + TextRecordType::VeryLongStrings => { + DecodedRecord::VeryLongStrings(VeryLongStringsRecord::decode(&self, decoder)) + } + TextRecordType::FileAttributes => { + DecodedRecord::FileAttributes(FileAttributeRecord::decode(&self, decoder)) + } + TextRecordType::VariableAttributes => { + DecodedRecord::VariableAttributes(VariableAttributeRecord::decode(&self, decoder)) + } + } + } +} + +#[derive(Clone, Debug)] +pub struct VeryLongString { + pub short_name: Identifier, + pub length: u16, +} + +impl VeryLongString { + fn parse(decoder: &Decoder, input: &str) -> Result { + let Some((short_name, length)) = input.split_once('=') else { + return Err(Warning::TBD); + }; + let short_name = decoder + .new_identifier(short_name) + .map_err(Warning::InvalidLongStringName)?; + let length = length.parse().map_err(|_| Warning::TBD)?; + Ok(VeryLongString { short_name, length }) + } +} + +#[derive(Clone, Debug)] +pub struct VeryLongStringsRecord(Vec); + +impl VeryLongStringsRecord { + fn decode(source: &TextRecord, decoder: &Decoder) -> Self { + let input = decoder.decode(&source.text); + let mut very_long_strings = Vec::new(); + for tuple in 
input + .split('\0') + .map(|s| s.trim_end_matches('\t')) + .filter(|s| !s.is_empty()) + { + if let Some(vls) = VeryLongString::parse(decoder, tuple).issue_warning(&decoder.warn) { + very_long_strings.push(vls) + } + } + VeryLongStringsRecord(very_long_strings) + } +} + +#[derive(Clone, Debug)] +pub struct Attribute { + pub name: Identifier, + pub values: Vec, +} + +impl Attribute { + fn parse<'a>(decoder: &Decoder, input: &'a str) -> Result<(Attribute, &'a str), Warning> { + let Some((name, mut input)) = input.split_once('(') else { + return Err(Warning::TBD); + }; + let name = decoder + .new_identifier(name) + .map_err(Warning::InvalidAttributeName)?; + let mut values = Vec::new(); + loop { + let Some((value, rest)) = input.split_once('\n') else { + return Err(Warning::TBD); + }; + if let Some(stripped) = value + .strip_prefix('\'') + .and_then(|value| value.strip_suffix('\'')) + { + values.push(stripped.into()); + } else { + decoder.warn(Warning::TBD); + values.push(value.into()); + } + if let Some(rest) = rest.strip_prefix(')') { + let attribute = Attribute { name, values }; + return Ok((attribute, rest)); + }; + input = rest; + } + } +} + +#[derive(Clone, Debug, Default)] +pub struct AttributeSet(pub HashMap>); + +impl AttributeSet { + fn parse<'a>( + decoder: &Decoder, + mut input: &'a str, + sentinel: Option, + ) -> Result<(AttributeSet, &'a str), Warning> { + let mut attributes = HashMap::new(); + let rest = loop { + match input.chars().next() { + None => break input, + c if c == sentinel => break &input[1..], + _ => { + let (attribute, rest) = Attribute::parse(decoder, input)?; + // XXX report duplicate name + attributes.insert(attribute.name, attribute.values); + input = rest; + } + } + }; + Ok((AttributeSet(attributes), rest)) + } +} + +#[derive(Clone, Debug, Default)] +pub struct FileAttributeRecord(pub AttributeSet); + +impl FileAttributeRecord { + fn decode(source: &TextRecord, decoder: &Decoder) -> Self { + let input = decoder.decode(&source.text); + 
match AttributeSet::parse(decoder, &input, None).issue_warning(&decoder.warn) { + Some((set, rest)) => { + if !rest.is_empty() { + decoder.warn(Warning::TBD); + } + FileAttributeRecord(set) + } + None => FileAttributeRecord::default(), + } + } +} + +#[derive(Clone, Debug)] +pub struct VarAttributeSet { + pub long_var_name: Identifier, + pub attributes: AttributeSet, +} + +impl VarAttributeSet { + fn parse<'a>(decoder: &Decoder, input: &'a str) -> Result<(VarAttributeSet, &'a str), Warning> { + let Some((long_var_name, rest)) = input.split_once(':') else { + return Err(Warning::TBD); + }; + let long_var_name = decoder + .new_identifier(long_var_name) + .map_err(Warning::InvalidAttributeVariableName)?; + let (attributes, rest) = AttributeSet::parse(decoder, rest, Some('/'))?; + let var_attribute = VarAttributeSet { + long_var_name, + attributes, + }; + Ok((var_attribute, rest)) + } +} + +#[derive(Clone, Debug)] +pub struct VariableAttributeRecord(Vec); + +impl VariableAttributeRecord { + fn decode(source: &TextRecord, decoder: &Decoder) -> Self { + let decoded = decoder.decode(&source.text); + let mut input = decoded.as_ref(); + let mut var_attribute_sets = Vec::new(); + while !input.is_empty() { + let Some((var_attribute, rest)) = + VarAttributeSet::parse(decoder, input).issue_warning(&decoder.warn) + else { + break; + }; + var_attribute_sets.push(var_attribute); + input = rest; + } + VariableAttributeRecord(var_attribute_sets) + } +} + +#[derive(Clone, Debug)] +pub struct LongName { + pub short_name: Identifier, + pub long_name: Identifier, +} + +impl LongName { + fn parse(input: &str, decoder: &Decoder) -> Result { + let Some((short_name, long_name)) = input.split_once('=') else { + return Err(Warning::TBD); + }; + let short_name = decoder + .new_identifier(short_name) + .map_err(Warning::InvalidShortName)?; + let long_name = decoder + .new_identifier(long_name) + .map_err(Warning::InvalidLongName)?; + Ok(LongName { + short_name, + long_name, + }) + } +} + 
+#[derive(Clone, Debug)] +pub struct LongNamesRecord(Vec); + +impl LongNamesRecord { + fn decode(source: &TextRecord, decoder: &Decoder) -> Self { + let input = decoder.decode(&source.text); + let mut names = Vec::new(); + for pair in input.split('\t').filter(|s| !s.is_empty()) { + if let Some(long_name) = LongName::parse(pair, decoder).issue_warning(&decoder.warn) { + names.push(long_name); + } + } + LongNamesRecord(names) + } +} + +#[derive(Clone, Debug)] +pub struct ProductInfoRecord(pub String); + +impl ProductInfoRecord { + fn decode(source: &TextRecord, decoder: &Decoder) -> Self { + Self(decoder.decode(&source.text).into()) + } +} +#[derive(Clone, Debug)] +pub struct VariableSet { + pub name: String, + pub vars: Vec, +} + +impl VariableSet { + fn parse(input: &str, decoder: &Decoder) -> Result { + let (name, input) = input.split_once('=').ok_or(Warning::TBD)?; + let mut vars = Vec::new(); + for var in input.split_ascii_whitespace() { + if let Some(identifier) = decoder + .new_identifier(var) + .map_err(Warning::InvalidVariableSetName) + .issue_warning(&decoder.warn) + { + vars.push(identifier); + } + } + Ok(VariableSet { + name: name.into(), + vars, + }) + } +} + +#[derive(Clone, Debug)] +pub struct VariableSetRecord { + pub offsets: Range, + pub sets: Vec, +} + +impl VariableSetRecord { + fn decode(source: &TextRecord, decoder: &Decoder) -> VariableSetRecord { + let mut sets = Vec::new(); + let input = decoder.decode(&source.text); + for line in input.lines() { + if let Some(set) = VariableSet::parse(line, decoder).issue_warning(&decoder.warn) { + sets.push(set) + } + } + VariableSetRecord { + offsets: source.offsets.clone(), + sets, + } + } +} + +trait IssueWarning { + fn issue_warning(self, warn: &F) -> Option + where + F: Fn(Warning); +} +impl IssueWarning for Result { + fn issue_warning(self, warn: &F) -> Option + where + F: Fn(Warning), + { + match self { + Ok(result) => Some(result), + Err(error) => { + warn(error); + None + } + } + } +} + 
+#[derive(Clone, Debug)] +pub struct Extension { + pub offsets: Range, + + /// Record subtype. + pub subtype: u32, + + /// Size of each data element. + pub size: u32, + + /// Number of data elements. + pub count: u32, + + /// `size * count` bytes of data. + pub data: Vec, +} + +impl Extension { + fn check_size(&self) -> Result<(), Warning> { + if let Some(expected_size) = E::SIZE { + if self.size != expected_size { + return Err(Warning::BadRecordSize { + offset: self.offsets.start, + record: E::NAME.into(), + size: self.size, + expected_size, + }); + } + } + if let Some(expected_count) = E::COUNT { + if self.count != expected_count { + return Err(Warning::BadRecordCount { + offset: self.offsets.start, + record: E::NAME.into(), + count: self.count, + expected_count, + }); + } + } + Ok(()) + } + + fn read( + r: &mut R, + endian: Endian, + n_vars: usize, + warn: &dyn Fn(Warning), + ) -> Result, Error> { + let subtype = endian.parse(read_bytes(r)?); + let header_offset = r.stream_position()?; + let size: u32 = endian.parse(read_bytes(r)?); + let count = endian.parse(read_bytes(r)?); + let Some(product) = size.checked_mul(count) else { + return Err(Error::ExtensionRecordTooLarge { + offset: header_offset, + subtype, + size, + count, + }); + }; + let start_offset = r.stream_position()?; + let data = read_vec(r, product as usize)?; + let end_offset = start_offset + product as u64; + let extension = Extension { + offsets: start_offset..end_offset, + subtype, + size, + count, + data, + }; + let result = match subtype { + IntegerInfoRecord::SUBTYPE => IntegerInfoRecord::parse(&extension, endian), + FloatInfoRecord::SUBTYPE => FloatInfoRecord::parse(&extension, endian), + VarDisplayRecord::SUBTYPE => VarDisplayRecord::parse(&extension, n_vars, endian, warn), + MultipleResponseRecord::SUBTYPE | 19 => { + MultipleResponseRecord::parse(&extension, endian) + } + LongStringValueLabelRecord::SUBTYPE => { + LongStringValueLabelRecord::parse(&extension, endian) + } + 
EncodingRecord::SUBTYPE => EncodingRecord::parse(&extension, endian), + NumberOfCasesRecord::SUBTYPE => NumberOfCasesRecord::parse(&extension, endian), + 5 => Ok(Record::Text(TextRecord::new( + extension, + TextRecordType::VariableSets, + ))), + 10 => Ok(Record::Text(TextRecord::new( + extension, + TextRecordType::ProductInfo, + ))), + 13 => Ok(Record::Text(TextRecord::new( + extension, + TextRecordType::LongNames, + ))), + 14 => Ok(Record::Text(TextRecord::new( + extension, + TextRecordType::VeryLongStrings, + ))), + 17 => Ok(Record::Text(TextRecord::new( + extension, + TextRecordType::FileAttributes, + ))), + 18 => Ok(Record::Text(TextRecord::new( + extension, + TextRecordType::VariableAttributes, + ))), + _ => Ok(Record::OtherExtension(extension)), + }; + match result { + Ok(result) => Ok(Some(result)), + Err(error) => { + warn(error); + Ok(None) + } + } + } +} + +#[derive(Clone, Debug)] +pub struct ZHeader { + /// File offset to the start of the record. + pub offset: u64, + + /// File offset to the ZLIB data header. + pub zheader_offset: u64, + + /// File offset to the ZLIB trailer. + pub ztrailer_offset: u64, + + /// Length of the ZLIB trailer in bytes. + pub ztrailer_len: u64, +} + +impl ZHeader { + fn read(r: &mut R, endian: Endian) -> Result { + let offset = r.stream_position()?; + let zheader_offset: u64 = endian.parse(read_bytes(r)?); + let ztrailer_offset: u64 = endian.parse(read_bytes(r)?); + let ztrailer_len: u64 = endian.parse(read_bytes(r)?); + + Ok(ZHeader { + offset, + zheader_offset, + ztrailer_offset, + ztrailer_len, + }) + } +} + +#[derive(Clone, Debug)] +pub struct ZTrailer { + /// File offset to the start of the record. + pub offset: u64, + + /// Compression bias as a negative integer, e.g. -100. + pub int_bias: i64, + + /// Always observed as zero. + pub zero: u64, + + /// Uncompressed size of each block, except possibly the last. Only + /// `0x3ff000` has been observed so far. 
+ pub block_size: u32, + + /// Block descriptors, always `(ztrailer_len - 24) / 24)` of them. + pub blocks: Vec, +} + +#[derive(Clone, Debug)] +pub struct ZBlock { + /// Offset of block of data if simple compression were used. + pub uncompressed_ofs: u64, + + /// Actual offset within the file of the compressed data block. + pub compressed_ofs: u64, + + /// The number of bytes in this data block after decompression. This is + /// `block_size` in every data block but the last, which may be smaller. + pub uncompressed_size: u32, + + /// The number of bytes in this data block, as stored compressed in this + /// file. + pub compressed_size: u32, +} + +impl ZBlock { + fn read(r: &mut R, endian: Endian) -> Result { + Ok(ZBlock { + uncompressed_ofs: endian.parse(read_bytes(r)?), + compressed_ofs: endian.parse(read_bytes(r)?), + uncompressed_size: endian.parse(read_bytes(r)?), + compressed_size: endian.parse(read_bytes(r)?), + }) + } +} + +impl ZTrailer { + fn read( + reader: &mut R, + endian: Endian, + ztrailer_ofs: u64, + ztrailer_len: u64, + ) -> Result, Error> { + let start_offset = reader.stream_position()?; + if reader.seek(SeekFrom::Start(ztrailer_ofs)).is_err() { + return Ok(None); + } + let int_bias = endian.parse(read_bytes(reader)?); + let zero = endian.parse(read_bytes(reader)?); + let block_size = endian.parse(read_bytes(reader)?); + let n_blocks: u32 = endian.parse(read_bytes(reader)?); + let expected_n_blocks = (ztrailer_len - 24) / 24; + if n_blocks as u64 != expected_n_blocks { + return Err(Error::BadZlibTrailerNBlocks { + offset: ztrailer_ofs, + n_blocks, + expected_n_blocks, + ztrailer_len, + }); + } + let blocks = (0..n_blocks) + .map(|_| ZBlock::read(reader, endian)) + .collect::, _>>()?; + reader.seek(SeekFrom::Start(start_offset))?; + Ok(Some(ZTrailer { + offset: ztrailer_ofs, + int_bias, + zero, + block_size, + blocks, + })) + } +} + +fn try_read_bytes(r: &mut R) -> Result, IoError> { + let mut buf = [0; N]; + let n = r.read(&mut buf)?; + if n > 0 { 
+ if n < N { + r.read_exact(&mut buf[n..])?; + } + Ok(Some(buf)) + } else { + Ok(None) + } +} + +fn read_bytes(r: &mut R) -> Result<[u8; N], IoError> { + let mut buf = [0; N]; + r.read_exact(&mut buf)?; + Ok(buf) +} + +fn read_vec(r: &mut R, n: usize) -> Result, IoError> { + let mut vec = vec![0; n]; + r.read_exact(&mut vec)?; + Ok(vec) +} + +fn read_string(r: &mut R, endian: Endian) -> Result { + let length: u32 = endian.parse(read_bytes(r)?); + Ok(read_vec(r, length as usize)?.into()) +} + +#[derive(Clone, Debug)] +pub struct LongStringValueLabels +where + S: Debug, +{ + pub var_name: N, + pub width: u32, + + /// `(value, label)` pairs, where each value is `width` bytes. + pub labels: Vec<(S, S)>, +} + +impl LongStringValueLabels { + fn decode( + &self, + decoder: &Decoder, + ) -> Result, Warning> { + let var_name = decoder.decode(&self.var_name); + let var_name = Identifier::from_encoding(var_name.trim_end(), decoder.encoding) + .map_err(Warning::InvalidLongStringValueLabelName)?; + + let mut labels = Vec::with_capacity(self.labels.len()); + for (value, label) in self.labels.iter() { + let value = decoder.decode_exact_length(&value.0).to_string(); + let label = decoder.decode(label).to_string(); + labels.push((value, label)); + } + + Ok(LongStringValueLabels { + var_name, + width: self.width, + labels, + }) + } +} + +#[derive(Clone, Debug)] +pub struct LongStringValueLabelRecord(pub Vec>) +where + N: Debug, + S: Debug; + +impl ExtensionRecord for LongStringValueLabelRecord { + const SUBTYPE: u32 = 21; + const SIZE: Option = Some(1); + const COUNT: Option = None; + const NAME: &'static str = "long string value labels record"; + + fn parse(ext: &Extension, endian: Endian) -> Result { + ext.check_size::()?; + + let mut input = &ext.data[..]; + let mut label_set = Vec::new(); + while !input.is_empty() { + let var_name = read_string(&mut input, endian)?; + let width: u32 = endian.parse(read_bytes(&mut input)?); + let n_labels: u32 = endian.parse(read_bytes(&mut 
input)?); + let mut labels = Vec::new(); + for _ in 0..n_labels { + let value = read_string(&mut input, endian)?; + let label = read_string(&mut input, endian)?; + labels.push((value, label)); + } + label_set.push(LongStringValueLabels { + var_name, + width, + labels, + }) + } + Ok(Record::LongStringValueLabels(LongStringValueLabelRecord( + label_set, + ))) + } +} + +impl LongStringValueLabelRecord { + fn decode(self, decoder: &Decoder) -> LongStringValueLabelRecord { + let mut labels = Vec::with_capacity(self.0.len()); + for label in &self.0 { + match label.decode(decoder) { + Ok(set) => labels.push(set), + Err(error) => decoder.warn(error), + } + } + LongStringValueLabelRecord(labels) + } +} diff --git a/rust/pspp/src/sack.rs b/rust/pspp/src/sack.rs new file mode 100644 index 0000000000..c6be5d1eef --- /dev/null +++ b/rust/pspp/src/sack.rs @@ -0,0 +1,633 @@ +use float_next_after::NextAfter; +use num::{Bounded, Zero}; +use ordered_float::OrderedFloat; +use std::{ + collections::{hash_map::Entry, HashMap}, + error::Error as StdError, + fmt::{Display, Formatter, Result as FmtResult}, + iter::repeat, +}; + +use crate::endian::{Endian, ToBytes}; + +pub type Result = std::result::Result; + +#[derive(Debug)] +pub struct Error { + pub file_name: Option, + pub line_number: Option, + pub token: Option, + pub message: String, +} + +impl Error { + fn new( + file_name: Option<&str>, + line_number: Option, + token: Option<&str>, + message: String, + ) -> Error { + Error { + file_name: file_name.map(String::from), + line_number, + token: token.map(String::from), + message, + } + } +} + +impl StdError for Error {} + +impl Display for Error { + fn fmt(&self, f: &mut Formatter) -> FmtResult { + match (self.file_name.as_ref(), self.line_number) { + (Some(ref file_name), Some(line_number)) => write!(f, "{file_name}:{line_number}: ")?, + (Some(ref file_name), None) => write!(f, "{file_name}: ")?, + (None, Some(line_number)) => write!(f, "line {line_number}: ")?, + (None, None) => (), 
+ } + if let Some(ref token) = self.token { + write!(f, "at '{token}': ")?; + } + write!(f, "{}", self.message) + } +} + +pub fn sack(input: &str, input_file_name: Option<&str>, endian: Endian) -> Result> { + let mut symbol_table = HashMap::new(); + let output = _sack(input, input_file_name, endian, &mut symbol_table)?; + let output = if !symbol_table.is_empty() { + for (k, v) in symbol_table.iter() { + println!("{k} => {v:?}"); + } + for (k, v) in symbol_table.iter() { + if v.is_none() { + Err(Error::new( + input_file_name, + None, + None, + format!("label {k} used but never defined"), + ))? + } + } + _sack(input, input_file_name, endian, &mut symbol_table)? + } else { + output + }; + Ok(output) +} + +fn _sack( + input: &str, + input_file_name: Option<&str>, + endian: Endian, + symbol_table: &mut HashMap>, +) -> Result> { + let mut lexer = Lexer::new(input, input_file_name, endian)?; + let mut output = Vec::new(); + while parse_data_item(&mut lexer, &mut output, symbol_table)? {} + Ok(output) +} + +fn parse_data_item( + lexer: &mut Lexer, + output: &mut Vec, + symbol_table: &mut HashMap>, +) -> Result { + if lexer.token.is_none() { + return Ok(false); + }; + + let initial_len = output.len(); + match lexer.take()? 
{ + Token::Integer(integer) => { + if let Ok(integer) = TryInto::::try_into(integer) { + output.extend_from_slice(&lexer.endian.to_bytes(integer)); + } else if let Ok(integer) = TryInto::::try_into(integer) { + output.extend_from_slice(&lexer.endian.to_bytes(integer)); + } else { + Err(lexer.error(format!( + "{integer} is not in the valid range [{},{}]", + i32::min_value(), + u32::max_value() + )))?; + }; + } + Token::Float(float) => output.extend_from_slice(&lexer.endian.to_bytes(float.0)), + Token::PcSysmis => { + output.extend_from_slice(&[0xf5, 0x1e, 0x26, 0x02, 0x8a, 0x8c, 0xed, 0xff]) + } + Token::I8 => put_integers::(lexer, "i8", output)?, + Token::I16 => put_integers::(lexer, "i16", output)?, + Token::I64 => put_integers::(lexer, "i64", output)?, + Token::String(string) => output.extend_from_slice(string.as_bytes()), + Token::S(size) => { + let Some((Token::String(ref string), _)) = lexer.token else { + Err(lexer.error(format!("string expected after 's{size}'")))? + }; + let len = string.len(); + if len > size { + Err(lexer.error(format!( + "{len}-byte string is longer than pad length {size}" + )))? + } + output.extend_from_slice(string.as_bytes()); + output.extend(repeat(b' ').take(size - len)); + lexer.get()?; + } + Token::LParen => { + while !matches!(lexer.token, Some((Token::RParen, _))) { + parse_data_item(lexer, output, symbol_table)?; + } + lexer.get()?; + } + Token::Count => put_counted_items::(lexer, "COUNT", output, symbol_table)?, + Token::Count8 => put_counted_items::(lexer, "COUNT8", output, symbol_table)?, + Token::Hex => { + let Some((Token::String(ref string), _)) = lexer.token else { + Err(lexer.error(String::from("string expected after 'hex'")))? 
+ }; + let mut string = &string[..]; + loop { + string = string.trim_start(); + if string.is_empty() { + break; + }; + + let mut i = string.chars(); + let Some(c0) = i.next() else { return Ok(true) }; + let Some(c1) = i.next() else { + Err(lexer.error(String::from("hex string has odd number of characters")))? + }; + + let (Some(digit0), Some(digit1)) = (c0.to_digit(16), c1.to_digit(16)) else { + Err(lexer.error(String::from("invalid digit in hex string")))? + }; + let byte = digit0 * 16 + digit1; + output.push(byte as u8); + + string = i.as_str(); + } + lexer.get()?; + } + Token::Label(name) => { + println!("define {name}"); + let value = output.len() as u32; + match symbol_table.entry(name.clone()) { + Entry::Vacant(v) => { + v.insert(Some(value)); + } + Entry::Occupied(mut o) => { + match o.get() { + Some(v) => { + if *v != value { + Err(lexer.error(format!("{name}: can't redefine label for offset {:#x} with offset {:#x}", *v, value)))? + } + } + None => drop(o.insert(Some(value))), + } + } + }; + return Ok(true); + } + Token::At(name) => { + let mut value = *symbol_table.entry(name.clone()).or_insert(None); + loop { + let plus = match lexer.token { + Some((Token::Plus, _)) => true, + Some((Token::Minus, _)) => false, + _ => break, + }; + lexer.get()?; + + let operand = match lexer.token { + Some((Token::At(ref name), _)) => { + *symbol_table.entry(name.clone()).or_insert(None) + } + Some((Token::Integer(integer), _)) => Some( + integer + .try_into() + .map_err(|msg| lexer.error(format!("bad offset literal ({msg})")))?, + ), + _ => Err(lexer.error(String::from("expecting @label or integer literal")))?, + }; + lexer.get()?; + + value = match (value, operand) { + (Some(a), Some(b)) => Some( + if plus { + a.checked_add(b) + } else { + a.checked_sub(b) + } + .ok_or_else(|| { + lexer.error(String::from("overflow in offset arithmetic")) + })?, + ), + _ => None, + }; + } + let value = value.unwrap_or(0); + output.extend_from_slice(&lexer.endian.to_bytes(value)); + } + _ 
=> (), + }; + if let Some((Token::Asterisk, _)) = lexer.token { + lexer.get()?; + let Token::Integer(count) = lexer.take()? else { + Err(lexer.error(String::from("positive integer expected after '*'")))? + }; + if count < 1 { + Err(lexer.error(String::from("positive integer expected after '*'")))? + }; + let final_len = output.len(); + for _ in 1..count { + output.extend_from_within(initial_len..final_len); + } + } + match lexer.token { + Some((Token::Semicolon, _)) => { + lexer.get()?; + } + Some((Token::RParen, _)) => (), + _ => Err(lexer.error(String::from("';' expected")))?, + } + Ok(true) +} + +fn put_counted_items( + lexer: &mut Lexer, + name: &str, + output: &mut Vec, + symbol_table: &mut HashMap>, +) -> Result<()> +where + T: Zero + TryFrom, + Endian: ToBytes, +{ + let old_size = output.len(); + output.extend_from_slice(&lexer.endian.to_bytes(T::zero())); + let start = output.len(); + if !matches!(lexer.token, Some((Token::LParen, _))) { + Err(lexer.error(format!("'(' expected after '{name}'")))? + } + lexer.get()?; + while !matches!(lexer.token, Some((Token::RParen, _))) { + parse_data_item(lexer, output, symbol_table)?; + } + lexer.get()?; + let delta = output.len() - start; + let Ok(delta): Result = delta.try_into() else { + Err(lexer.error(format!("{delta} bytes is too much for '{name}'")))? + }; + let dest = &mut output[old_size..old_size + N]; + dest.copy_from_slice(&lexer.endian.to_bytes(delta)); + Ok(()) +} + +fn put_integers( + lexer: &mut Lexer, + name: &str, + output: &mut Vec, +) -> Result<()> +where + T: Bounded + Display + TryFrom + Copy, + Endian: ToBytes, +{ + println!("put_integers {:?}", lexer.token); + let mut n = 0; + while let Some(integer) = lexer.take_if(|t| match t { + Token::Integer(integer) => Some(*integer), + _ => None, + })? { + println!("got integer {integer}"); + let Ok(integer) = integer.try_into() else { + Err(lexer.error(format!( + "{integer} is not in the valid range [{},{}]", + T::min_value(), + T::max_value() + )))? 
+ }; + output.extend_from_slice(&lexer.endian.to_bytes(integer)); + n += 1; + } + println!("put_integers {:?} {n}", lexer.token); + if n == 0 { + Err(lexer.error(format!("integer expected after '{name}'")))? + } + Ok(()) +} + +#[derive(PartialEq, Eq, Clone, Debug)] +enum Token { + Integer(i64), + Float(OrderedFloat), + PcSysmis, + String(String), + Semicolon, + Asterisk, + LParen, + RParen, + I8, + I16, + I64, + S(usize), + Count, + Count8, + Hex, + Label(String), + At(String), + Minus, + Plus, +} + +struct Lexer<'a> { + input: &'a str, + token: Option<(Token, &'a str)>, + input_file_name: Option<&'a str>, + line_number: usize, + endian: Endian, +} + +fn skip_comments(mut s: &str) -> (&str, usize) { + let mut n_newlines = 0; + let s = loop { + s = s.trim_start_matches([' ', '\t', '\r', '<', '>']); + if let Some(remainder) = s.strip_prefix('#') { + let Some((_, remainder)) = remainder.split_once('\n') else { + break ""; + }; + s = remainder; + n_newlines += 1; + } else if let Some(remainder) = s.strip_prefix('\n') { + s = remainder; + n_newlines += 1; + } else { + break s; + } + }; + (s, n_newlines) +} + +impl<'a> Lexer<'a> { + fn new(input: &'a str, input_file_name: Option<&'a str>, endian: Endian) -> Result> { + let mut lexer = Lexer { + input, + token: None, + input_file_name, + line_number: 1, + endian, + }; + lexer.token = lexer.next()?; + Ok(lexer) + } + fn error(&self, message: String) -> Error { + let repr = self.token.as_ref().map(|(_, repr)| *repr); + Error::new(self.input_file_name, Some(self.line_number), repr, message) + } + fn take(&mut self) -> Result { + let Some(token) = self.token.take() else { + Err(self.error(String::from("unexpected end of input")))? 
+ }; + self.token = self.next()?; + Ok(token.0) + } + fn take_if(&mut self, condition: F) -> Result> + where + F: FnOnce(&Token) -> Option, + { + let Some(ref token) = self.token else { + return Ok(None); + }; + match condition(&token.0) { + Some(value) => { + self.token = self.next()?; + Ok(Some(value)) + } + None => Ok(None), + } + } + fn get(&mut self) -> Result> { + if self.token.is_none() { + Err(self.error(String::from("unexpected end of input")))? + } else { + self.token = self.next()?; + match self.token { + Some((ref token, _)) => Ok(Some(token)), + None => Ok(None), + } + } + } + + fn next(&mut self) -> Result> { + // Get the first character of the token, skipping past white space and + // comments. + let (s, n_newlines) = skip_comments(self.input); + self.line_number += n_newlines; + self.input = s; + + let start = s; + let mut iter = s.chars(); + let Some(c) = iter.next() else { + return Ok(None); + }; + let (token, rest) = match c { + c if c.is_ascii_digit() || c == '-' => { + let len = s + .find(|c: char| { + !(c.is_ascii_digit() || c.is_alphabetic() || c == '.' || c == '-') + }) + .unwrap_or(s.len()); + let (number, rest) = s.split_at(len); + let token = if number == "-" { + Token::Minus + } else if let Some(digits) = number.strip_prefix("0x") { + Token::Integer(i64::from_str_radix(digits, 16).map_err(|msg| { + self.error(format!("bad integer literal '{number}' ({msg})")) + })?) + } else if !number.contains('.') { + Token::Integer(number.parse().map_err(|msg| { + self.error(format!("bad integer literal '{number}' ({msg})")) + })?) + } else { + Token::Float(number.parse().map_err(|msg| { + self.error(format!("bad float literal '{number}' ({msg})")) + })?) + }; + (token, rest) + } + '"' => { + let s = iter.as_str(); + let Some(len) = s.find(['\n', '"']) else { + Err(self.error(String::from("end-of-file inside string")))? 
+ }; + let (string, rest) = s.split_at(len); + let Some(rest) = rest.strip_prefix('"') else { + Err(self.error(format!("new-line inside string ({string}...{rest})")))? + }; + (Token::String(string.into()), rest) + } + ';' => (Token::Semicolon, iter.as_str()), + '*' => (Token::Asterisk, iter.as_str()), + '+' => (Token::Plus, iter.as_str()), + '(' => (Token::LParen, iter.as_str()), + ')' => (Token::RParen, iter.as_str()), + c if c.is_alphabetic() || c == '@' || c == '_' => { + let len = s + .find(|c: char| { + !(c.is_ascii_digit() + || c.is_alphabetic() + || c == '@' + || c == '.' + || c == '_') + }) + .unwrap_or(s.len()); + let (s, rest) = s.split_at(len); + if let Some(rest) = rest.strip_prefix(':') { + (Token::Label(s.into()), rest) + } else if let Some(name) = s.strip_prefix('@') { + (Token::At(name.into()), rest) + } else if let Some(count) = s.strip_prefix('s') { + let token = + Token::S(count.parse().map_err(|msg| { + self.error(format!("bad counted string '{s}' ({msg})")) + })?); + (token, rest) + } else { + let token = match s { + "i8" => Token::I8, + "i16" => Token::I16, + "i64" => Token::I64, + "SYSMIS" => Token::Float(OrderedFloat(-f64::MAX)), + "PCSYSMIS" => Token::PcSysmis, + "LOWEST" => Token::Float((-f64::MAX).next_after(0.0).into()), + "HIGHEST" => Token::Float(f64::MAX.into()), + "ENDIAN" => Token::Integer(if self.endian == Endian::Big { 1 } else { 2 }), + "COUNT" => Token::Count, + "COUNT8" => Token::Count8, + "hex" => Token::Hex, + _ => Err(self.error(format!("invalid token '{s}'")))?, + }; + (token, rest) + } + } + _ => Err(self.error(format!("invalid input byte '{c}'")))?, + }; + self.input = rest; + let repr = &start[..start.len() - rest.len()]; + println!("{token:?} {repr}"); + Ok(Some((token, repr))) + } +} + +#[cfg(test)] +mod test { + use crate::endian::Endian; + use crate::sack::sack; + use anyhow::Result; + use hexplay::HexView; + + #[test] + fn basic_sack() -> Result<()> { + let input = r#" +"$FL2"; s60 "$(#) SPSS DATA FILE PSPP 
synthetic test file"; +2; # Layout code +28; # Nominal case size +0; # Not compressed +0; # Not weighted +1; # 1 case. +100.0; # Bias. +"01 Jan 11"; "20:53:52"; +"PSPP synthetic test file: "; i8 244; i8 245; i8 246; i8 248; s34 ""; +i8 0 *3; +"#; + let output = sack(input, None, Endian::Big)?; + HexView::new(&output).print()?; + Ok(()) + } + + #[test] + fn pcp_sack() -> Result<()> { + let input = r#" +# File header. +2; 0; +@MAIN; @MAIN_END - @MAIN; +@VARS; @VARS_END - @VARS; +@LABELS; @LABELS_END - @LABELS; +@DATA; @DATA_END - @DATA; +(0; 0) * 11; +i8 0 * 128; + +MAIN: + i16 1; # Fixed. + s62 "PCSPSS PSPP synthetic test product"; + PCSYSMIS; + 0; 0; i16 1; # Fixed. + i16 0; + i16 15; + 1; + i16 0; # Fixed. + 1; + s8 "11/28/14"; + s8 "15:11:00"; + s64 "PSPP synthetic test file"; +MAIN_END: + +VARS: + 0; 0; 0; 0x050800; s8 "$CASENUM"; PCSYSMIS; + 0; 0; 0; 0x010800; s8 "$DATE"; PCSYSMIS; + 0; 0; 0; 0x050802; s8 "$WEIGHT"; PCSYSMIS; + + # Numeric variable, no label or missing values. + 0; 0; 0; 0x050800; s8 "NUM1"; PCSYSMIS; + + # Numeric variable, variable label. + 0; 0; @NUM2_LABEL - @LABELS_OFS; 0x050800; s8 "NUM2"; PCSYSMIS; + + # Numeric variable with missing value. + 0; 0; 0; 0x050800; s8 "NUM3"; 1.0; + + # Numeric variable, variable label and missing value. + 0; 0; @NUM4_LABEL - @LABELS_OFS; 0x050800; s8 "NUM4"; 2.0; + + # String variable, no label or missing values. + 0; 0; 0; 0x010800; s8 "STR1"; PCSYSMIS; + + # String variable, variable label. + 0; 0; @STR2_LABEL - @LABELS_OFS; 0x010400; s8 "STR2"; PCSYSMIS; + + # String variable with missing value. + 0; 0; 0; 0x010500; s8 "STR3"; s8 "MISS"; + + # String variable, variable label and missing value. 
+ 0; 0; @STR4_LABEL - @LABELS_OFS; 0x010100; s8 "STR4"; s8 "OTHR"; + + # Long string variable + 0; 0; 0; 0x010b00; s8 "STR5"; PCSYSMIS; + 0 * 8; + + # Long string variable with variable label + 0; 0; @STR6_LABEL - @LABELS_OFS; 0x010b00; s8 "STR6"; PCSYSMIS; + 0 * 8; +VARS_END: + +LABELS: + 3; i8 0 0 0; LABELS_OFS: i8 0; + NUM2_LABEL: COUNT8("Numeric variable 2's label"); + NUM4_LABEL: COUNT8("Another numeric variable label"); + STR2_LABEL: COUNT8("STR2's variable label"); + STR4_LABEL: COUNT8("STR4's variable label"); + STR6_LABEL: COUNT8("Another string variable's label"); +LABELS_END: + +DATA: + 0.0; "11/28/14"; 1.0; + 0.0; 1.0; 2.0; PCSYSMIS; s8 "abcdefgh"; s8 "ijkl"; s8 "mnopq"; s8 "r"; + s16 "stuvwxyzAB"; s16 "CDEFGHIJKLM"; +DATA_END: +"#; + let output = sack(input, None, Endian::Big)?; + HexView::new(&output).print()?; + Ok(()) + } +} diff --git a/rust/pspp/src/settings.rs b/rust/pspp/src/settings.rs new file mode 100644 index 0000000000..de51951202 --- /dev/null +++ b/rust/pspp/src/settings.rs @@ -0,0 +1,140 @@ +use std::sync::OnceLock; + +use enum_map::EnumMap; + +use crate::{ + endian::Endian, + format::{Format, Settings as FormatSettings}, + message::Severity, +}; + +pub struct Settings { + pub input_integer_format: Endian, + pub input_float_format: Endian, + pub output_integer_format: Endian, + pub output_float_format: Endian, + + /// `MDISPLAY`: how to display matrices in `MATRIX`...`END MATRIX`. 
+ pub matrix_display: MatrixDisplay, + + pub view_length: usize, + pub view_width: usize, + pub safer: bool, + pub include: bool, + pub route_errors_to_terminal: bool, + pub route_errors_to_listing: bool, + pub scompress: bool, + pub undefined: bool, + pub blanks: Option, + pub max_messages: EnumMap, + pub printback: bool, + pub macros: MacroSettings, + pub max_loops: usize, + pub workspace: usize, + pub default_format: Format, + pub testing: bool, + pub fuzz_bits: usize, + pub scale_min: usize, + pub commands: Compatibility, + pub global: Compatibility, + pub syntax: Compatibility, + pub formats: FormatSettings, + pub small: f64, +} + +impl Default for Settings { + fn default() -> Self { + Self { + input_integer_format: Endian::NATIVE, + input_float_format: Endian::NATIVE, + output_integer_format: Endian::NATIVE, + output_float_format: Endian::NATIVE, + matrix_display: MatrixDisplay::default(), + view_length: 24, + view_width: 79, + safer: false, + include: true, + route_errors_to_terminal: true, + route_errors_to_listing: true, + scompress: true, + undefined: true, + blanks: None, + max_messages: EnumMap::from_fn(|_| 100), + printback: true, + macros: MacroSettings::default(), + max_loops: 40, + workspace: 64 * 1024 * 1024, + default_format: Format::F8_2, + testing: false, + fuzz_bits: 6, + scale_min: 24, + commands: Compatibility::Enhanced, + global: Compatibility::Enhanced, + syntax: Compatibility::Enhanced, + formats: FormatSettings::default(), + small: 0.0001, + } + } +} + +impl Settings { + pub fn global() -> &'static Settings { + static GLOBAL: OnceLock = OnceLock::new(); + &GLOBAL.get_or_init(|| Settings::default()) + } +} + +pub enum Compatibility { + Compatible, + Enhanced, +} + +pub struct MacroSettings { + /// Expand macros? + pub expand: bool, + + /// Print macro expansions? + pub print_expansions: bool, + + /// Maximum iterations of `!FOR`. + pub max_iterations: usize, + + /// Maximum nested macro expansion levels. 
+ pub max_nest: usize, +} + +impl Default for MacroSettings { + fn default() -> Self { + Self { + expand: true, + print_expansions: false, + max_iterations: 1000, + max_nest: 50, + } + } +} + +/// How to display matrices in `MATRIX`...`END MATRIX`. +#[derive(Default)] +pub enum MatrixDisplay { + /// Output matrices as text. + #[default] + Text, + + /// Output matrices as pivot tables. + Tables, +} + +pub enum OutputType { + /// Errors and warnings. + Error, + + /// Notes. + Notes, + + /// Syntax printback. + Syntax, + + /// Everything else. + Other, +} diff --git a/rust/pspp/tests/sack.rs b/rust/pspp/tests/sack.rs new file mode 100644 index 0000000000..49b10e77ac --- /dev/null +++ b/rust/pspp/tests/sack.rs @@ -0,0 +1,93 @@ +use std::fs::read_to_string; +use std::path::PathBuf; + +use anyhow::{anyhow, Result}; +use clap::Parser; +use pspp::endian::Endian; +use pspp::sack::sack; + +/// SAv Construction Kit +/// +/// The input is a sequence of data items, each followed by a semicolon. Each +/// data item is converted to the output format and written on stdout. A data +/// item is one of the following: +/// +/// - An integer in decimal, in hexadecimal prefixed by `0x`, or in octal +/// prefixed by `0`. Output as a 32-bit binary integer. +/// +/// - A floating-point number. Output in 64-bit IEEE 754 format. +/// +/// - A string enclosed in double quotes. Output literally. There is no +/// syntax for "escapes". Strings may not contain new-lines. +/// +/// - A literal of the form `s` followed by a quoted string as above. +/// Output as the string's contents followed by enough spaces to fill up +/// `` bytes. For example, `s8 "foo"` is output as `foo` followed +/// by 5 spaces. +/// +/// - The literal `i8`, `i16`, or `i64` followed by an integer. Output +/// as a binary integer with the specified number of bits. +/// +/// - One of the literals `SYSMIS`, `LOWEST`, or `HIGHEST`. Output as a +/// 64-bit IEEE 754 float of the appropriate PSPP value. +/// +/// - `PCSYSMIS`. 
Output as SPSS/PC+ system-missing value. +/// +/// - The literal `ENDIAN`. Output as a 32-bit binary integer, either with +/// value 1 if `--be` is in effect or 2 if `--le` is in effect. +/// +/// - A pair of parentheses enclosing a sequence of data items, each followed +/// by a semicolon (the last semicolon is optional). Output as the enclosed +/// data items in sequence. +/// +/// - The literal `COUNT` or `COUNT8` followed by a sequence of parenthesized +/// data items, as above. Output as a 32-bit or 8-bit binary integer whose +/// value is the number of bytes enclosed within the parentheses, followed +/// by the enclosed data items themselves. +/// +/// optionally followed by an asterisk and a positive integer, which specifies a +/// repeat count for the data item. +#[derive(Parser, Debug)] +struct Args { + /// Big-endian output format (default) + #[arg(long = "be")] + be: bool, + + /// Little-endian output format + #[arg(long = "le")] + le: bool, + + /// Input file. + #[arg(required = true, name = "input")] + input_file_name: PathBuf, + + /// Output file. 
+ #[arg(required = true, name = "output")] + output_file_name: PathBuf, +} + +fn main() -> Result<()> { + let Args { + be, + le, + input_file_name, + output_file_name, + } = Args::parse(); + let endian = match (be, le) { + (false, false) | (true, false) => Endian::Big, + (false, true) => Endian::Little, + (true, true) => return Err(anyhow!("can't use both `--be` and `--le`")), + }; + + let input_file_str = input_file_name.to_string_lossy(); + let input = read_to_string(&input_file_name) + .map_err(|err| anyhow!("{input_file_str}: read failed ({err})"))?; + + let output = sack(&input, Some(&input_file_str), endian)?; + + let output_file_str = output_file_name.to_string_lossy(); + std::fs::write(&output_file_name, output) + .map_err(|err| anyhow!("{output_file_str}: write failed ({err})"))?; + + Ok(()) +} diff --git a/rust/src/command.rs b/rust/src/command.rs deleted file mode 100644 index d337d1823a..0000000000 --- a/rust/src/command.rs +++ /dev/null @@ -1,192 +0,0 @@ -use std::{fmt::Write, sync::OnceLock}; - -use flagset::{flags, FlagSet}; - -use crate::{ - integer::ToInteger, - lex::{ - command_name::CommandMatcher, - lexer::Lexer, - token::{Punct, Token}, - }, - message::Diagnostic, -}; - -flags! { - enum State: u8 { - /// No active dataset yet defined. - Initial, - - /// Active dataset has been defined. - Data, - - /// Inside `INPUT PROGRAM`. - InputProgram, - - /// Inside `FILE TYPE`. - FileType, - - /// State nested inside `LOOP` or `DO IF`, inside [State::Data]. - NestedData, - - /// State nested inside `LOOP` or `DO IF`, inside [State::InputProgram]. 
- NestedInputProgram, - } -} - -struct Command { - allowed_states: FlagSet, - enhanced_only: bool, - testing_only: bool, - no_abbrev: bool, - name: &'static str, - run: Box, -} - -fn commands() -> &'static [Command] { - fn new_commands() -> Vec { - vec![Command { - allowed_states: State::Initial | State::Data, - enhanced_only: false, - testing_only: false, - no_abbrev: false, - name: "ECHO", - run: Box::new(|_context| { - println!("hi"); - }), - }] - } - - static COMMANDS: OnceLock> = OnceLock::new(); - COMMANDS.get_or_init(|| new_commands()).as_slice() -} - -fn parse_command_word(lexer: &mut Lexer, s: &mut String, n: isize) -> bool { - let separator = match s.chars().next_back() { - Some(c) if c != '-' => " ", - _ => "", - }; - - match lexer.next(n) { - Token::Punct(Punct::Dash) => { - s.push('-'); - true - } - Token::Id(id) => { - write!(s, "{separator}{id}").unwrap(); - true - } - Token::Number(number) if number.is_sign_positive() => { - if let Some(integer) = number.to_exact_usize() { - write!(s, "{separator}{integer}").unwrap(); - true - } else { - false - } - } - _ => false, - } -} - -fn find_best_match(s: &str) -> (Option<&'static Command>, isize) { - let mut cm = CommandMatcher::new(s); - for command in commands() { - cm.add(command.name, command); - } - cm.get_match() -} - -fn parse_command_name( - lexer: &mut Lexer, - error: &Box, -) -> Result<(&'static Command, isize), ()> { - let mut s = String::new(); - let mut word = 0; - let mut missing_words = 0; - let mut command = None; - while parse_command_word(lexer, &mut s, word) { - (command, missing_words) = find_best_match(&s); - if missing_words <= 0 { - break; - } - word += 1; - } - if command.is_none() && missing_words > 0 { - s.push_str(" ."); - (command, missing_words) = find_best_match(&s); - s.truncate(s.len() - 2); - } - - match command { - Some(command) => Ok((command, (word + 1) + missing_words)), - None => { - if s.is_empty() { - error(lexer.error("Syntax error expecting command name")) - } else 
{ - error(lexer.error("Unknown command `{s}`.")) - }; - Err(()) - } - } -} - -pub enum Success { - Success, - Eof, - Finish, -} - -pub fn end_of_command(context: &Context) -> Result { - match context.lexer.token() { - Token::EndCommand | Token::End => Ok(Success::Success), - _ => { - context.error( - context - .lexer - .error("Syntax error expecting end of command."), - ); - Err(()) - } - } -} - -fn parse_in_state(lexer: &mut Lexer, error: &Box, _state: State) { - match lexer.token() { - Token::End | Token::EndCommand => (), - _ => { - if let Ok((command, n_tokens)) = parse_command_name(lexer, error) { - for _ in 0..n_tokens { - lexer.get(); - } - let context = Context { - error, - lexer, - command_name: Some(command.name), - }; - (command.run)(&context); - end_of_command(&context); - } - lexer.interactive_reset(); - lexer.discard_rest_of_command(); - } - } - while let Token::EndCommand = lexer.token() { - lexer.get(); - } -} - -pub fn parse(lexer: &mut Lexer, error: &Box) { - parse_in_state(lexer, error, State::Initial) -} - -pub struct Context<'a> { - error: &'a Box, - lexer: &'a mut Lexer, - command_name: Option<&'static str>, -} - -impl<'a> Context<'a> { - pub fn error(&self, diagnostic: Diagnostic) { - (self.error)(diagnostic); - } -} diff --git a/rust/src/cooked.rs b/rust/src/cooked.rs deleted file mode 100644 index d2617df528..0000000000 --- a/rust/src/cooked.rs +++ /dev/null @@ -1,1482 +0,0 @@ -use std::{cell::RefCell, collections::HashMap, ops::Range, rc::Rc}; - -use crate::{ - dictionary::{Dictionary, VarWidth, Variable}, - encoding::Error as EncodingError, - endian::Endian, - format::{Error as FormatError, Format, UncheckedFormat}, - identifier::{Error as IdError, Identifier}, - raw::{ - self, Cases, DecodedRecord, DocumentRecord, EncodingRecord, Extension, FileAttributeRecord, - FloatInfoRecord, HeaderRecord, IntegerInfoRecord, LongNamesRecord, - LongStringMissingValueRecord, LongStringValueLabelRecord, MultipleResponseRecord, - NumberOfCasesRecord, 
ProductInfoRecord, RawStr, ValueLabel, ValueLabelRecord, - VarDisplayRecord, VariableAttributeRecord, VariableRecord, VariableSetRecord, - VeryLongStringsRecord, ZHeader, ZTrailer, - }, -}; -use chrono::{NaiveDate, NaiveDateTime, NaiveTime}; -use encoding_rs::Encoding; -use num::Integer; -use thiserror::Error as ThisError; - -pub use crate::raw::{CategoryLabels, Compression}; - -#[derive(ThisError, Debug)] -pub enum Error { - #[error("Missing header record")] - MissingHeaderRecord, - - // XXX this is an internal error - #[error("More than one file header record")] - DuplicateHeaderRecord, - - #[error("{0}")] - EncodingError(EncodingError), - - #[error("Using default encoding {0}.")] - UsingDefaultEncoding(String), - - #[error("Variable record from offset {:x} to {:x} specifies width {width} not in valid range [-1,255).", offsets.start, offsets.end)] - InvalidVariableWidth { offsets: Range, width: i32 }, - - #[error("This file has corrupted metadata written by a buggy version of PSPP. To ensure that other software can read it correctly, save a new copy of the file.")] - InvalidLongMissingValueFormat, - - #[error("File creation date {creation_date} is not in the expected format \"DD MMM YY\" format. Using 01 Jan 1970.")] - InvalidCreationDate { creation_date: String }, - - #[error("File creation time {creation_time} is not in the expected format \"HH:MM:SS\" format. Using midnight.")] - InvalidCreationTime { creation_time: String }, - - #[error("{id_error} Renaming variable to {new_name}.")] - InvalidVariableName { - id_error: IdError, - new_name: Identifier, - }, - - #[error( - "Substituting {new_spec} for invalid print format on variable {variable}. {format_error}" - )] - InvalidPrintFormat { - new_spec: Format, - variable: Identifier, - format_error: FormatError, - }, - - #[error( - "Substituting {new_spec} for invalid write format on variable {variable}. 
{format_error}" - )] - InvalidWriteFormat { - new_spec: Format, - variable: Identifier, - format_error: FormatError, - }, - - #[error("Renaming variable with duplicate name {duplicate_name} to {new_name}.")] - DuplicateVariableName { - duplicate_name: Identifier, - new_name: Identifier, - }, - - #[error("Dictionary index {dict_index} is outside valid range [1,{max_index}].")] - InvalidDictIndex { dict_index: usize, max_index: usize }, - - #[error("Dictionary index {0} refers to a long string continuation.")] - DictIndexIsContinuation(usize), - - #[error("At offset {offset:#x}, one or more variable indexes for value labels referred to long string continuation records: {indexes:?}")] - LongStringContinuationIndexes { offset: u64, indexes: Vec }, - - #[error( - "At offsets {:#x}...{:#x}, record types 3 and 4 may not add value labels to one or more long string variables: {variables:?}", .offsets.start, .offsets.end - )] - InvalidLongStringValueLabels { - offsets: Range, - variables: Vec, - }, - - #[error("Variables associated with value label are not all of identical type. Variable {numeric_var} is numeric, but variable {string_var} is string.")] - ValueLabelsDifferentTypes { - numeric_var: Identifier, - string_var: Identifier, - }, - - #[error("Invalid multiple response set name. {0}")] - InvalidMrSetName(IdError), - - #[error("Multiple response set {mr_set} includes unknown variable {short_name}.")] - UnknownMrSetVariable { - mr_set: Identifier, - short_name: Identifier, - }, - - #[error("Multiple response set {0} has no variables.")] - EmptyMrSet(Identifier), - - #[error("Multiple response set {0} has only one variable.")] - OneVarMrSet(Identifier), - - #[error("Multiple response set {0} contains both string and numeric variables.")] - MixedMrSet(Identifier), - - #[error( - "Invalid numeric format for counted value {number} in multiple response set {mr_set}." 
- )] - InvalidMDGroupCountedValue { mr_set: Identifier, number: String }, - - #[error("Counted value {value} has width {width}, but it must be no wider than {max_width}, the width of the narrowest variable in multiple response set {mr_set}.")] - TooWideMDGroupCountedValue { - mr_set: Identifier, - value: String, - width: usize, - max_width: u16, - }, - - #[error("Long string value label for variable {name} has width {width}, which is not in the valid range [{min_width},{max_width}].")] - InvalidLongValueLabelWidth { - name: Identifier, - width: u32, - min_width: u16, - max_width: u16, - }, - - #[error("Invalid attribute name. {0}")] - InvalidAttributeName(IdError), - - #[error("Invalid short name in long variable name record. {0}")] - InvalidShortName(IdError), - - #[error("Invalid name in long variable name record. {0}")] - InvalidLongName(IdError), - - #[error("Invalid variable name in very long string record. {0}")] - InvalidLongStringName(IdError), - - #[error("Invalid variable name in long string value label record. {0}")] - InvalidLongStringValueLabelName(IdError), - - #[error("Invalid variable name in attribute record. {0}")] - InvalidAttributeVariableName(IdError), - - // XXX This is risky because `text` might be arbitarily long. 
- #[error("Text string contains invalid bytes for {encoding} encoding: {text}")] - MalformedString { encoding: String, text: String }, - - #[error("Details TBD")] - TBD, -} - -type DictIndex = usize; - -#[derive(Clone, Debug)] -pub struct Headers { - pub header: HeaderRecord, - pub variable: Vec>, - pub value_label: Vec, String>>, - pub document: Vec>, - pub integer_info: Option, - pub float_info: Option, - pub var_display: Option, - pub multiple_response: Vec>, - pub long_string_value_labels: Vec>, - pub long_string_missing_values: Vec>, - pub encoding: Option, - pub number_of_cases: Option, - pub variable_sets: Vec, - pub product_info: Option, - pub long_names: Vec, - pub very_long_strings: Vec, - pub file_attributes: Vec, - pub variable_attributes: Vec, - pub other_extension: Vec, - pub end_of_headers: Option, - pub z_header: Option, - pub z_trailer: Option, - pub cases: Option>>, -} - -fn take_first(mut vec: Vec, more_than_one: F) -> Option -where - F: FnOnce(), -{ - if vec.len() > 1 { - more_than_one(); - } - vec.drain(..).next() -} - -impl Headers { - pub fn new(headers: Vec, warn: &impl Fn(Error)) -> Result { - let mut file_header = Vec::new(); - let mut variable = Vec::new(); - let mut value_label = Vec::new(); - let mut document = Vec::new(); - let mut integer_info = Vec::new(); - let mut float_info = Vec::new(); - let mut var_display = Vec::new(); - let mut multiple_response = Vec::new(); - let mut long_string_value_labels = Vec::new(); - let mut long_string_missing_values = Vec::new(); - let mut encoding = Vec::new(); - let mut number_of_cases = Vec::new(); - let mut variable_sets = Vec::new(); - let mut product_info = Vec::new(); - let mut long_names = Vec::new(); - let mut very_long_strings = Vec::new(); - let mut file_attributes = Vec::new(); - let mut variable_attributes = Vec::new(); - let mut other_extension = Vec::new(); - let mut end_of_headers = Vec::new(); - let mut z_header = Vec::new(); - let mut z_trailer = Vec::new(); - let mut cases = 
Vec::new(); - - for header in headers { - match header { - DecodedRecord::Header(record) => { - file_header.push(record); - } - DecodedRecord::Variable(record) => { - variable.push(record); - } - DecodedRecord::ValueLabel(record) => { - value_label.push(record); - } - DecodedRecord::Document(record) => { - document.push(record); - } - DecodedRecord::IntegerInfo(record) => { - integer_info.push(record); - } - DecodedRecord::FloatInfo(record) => { - float_info.push(record); - } - DecodedRecord::VariableSets(record) => { - variable_sets.push(record); - } - DecodedRecord::VarDisplay(record) => { - var_display.push(record); - } - DecodedRecord::MultipleResponse(record) => { - multiple_response.push(record); - } - DecodedRecord::LongStringValueLabels(record) => { - long_string_value_labels.push(record) - } - DecodedRecord::LongStringMissingValues(record) => { - long_string_missing_values.push(record); - } - DecodedRecord::Encoding(record) => { - encoding.push(record); - } - DecodedRecord::NumberOfCases(record) => { - number_of_cases.push(record); - } - DecodedRecord::ProductInfo(record) => { - product_info.push(record); - } - DecodedRecord::LongNames(record) => { - long_names.push(record); - } - DecodedRecord::VeryLongStrings(record) => { - very_long_strings.push(record); - } - DecodedRecord::FileAttributes(record) => { - file_attributes.push(record); - } - DecodedRecord::VariableAttributes(record) => { - variable_attributes.push(record); - } - DecodedRecord::OtherExtension(record) => { - other_extension.push(record); - } - DecodedRecord::EndOfHeaders(record) => { - end_of_headers.push(record); - } - DecodedRecord::ZHeader(record) => { - z_header.push(record); - } - DecodedRecord::ZTrailer(record) => { - z_trailer.push(record); - } - DecodedRecord::Cases(record) => { - cases.push(record); - } - } - } - - let Some(file_header) = take_first(file_header, || warn(Error::DuplicateHeaderRecord)) - else { - return Err(Error::MissingHeaderRecord); - }; - - Ok(Headers { - header: 
file_header, - variable, - value_label, - document, - integer_info: take_first(integer_info, || warn(Error::TBD)), - float_info: take_first(float_info, || warn(Error::TBD)), - var_display: take_first(var_display, || warn(Error::TBD)), - multiple_response, - long_string_value_labels, - long_string_missing_values, - encoding: take_first(encoding, || warn(Error::TBD)), - number_of_cases: take_first(number_of_cases, || warn(Error::TBD)), - variable_sets, - product_info: take_first(product_info, || warn(Error::TBD)), - long_names, - very_long_strings, - file_attributes, - variable_attributes, - other_extension, - end_of_headers: take_first(end_of_headers, || warn(Error::TBD)), - z_header: take_first(z_header, || warn(Error::TBD)), - z_trailer: take_first(z_trailer, || warn(Error::TBD)), - cases: take_first(cases, || warn(Error::TBD)), - }) - } -} - -pub struct Metadata { - creation: NaiveDateTime, - endian: Endian, - compression: Option, - n_cases: Option, - product: String, - product_ext: Option, - version: Option<(i32, i32, i32)>, -} - -impl Metadata { - fn decode(headers: &Headers, warn: impl Fn(Error)) -> Self { - let header = &headers.header; - let creation_date = NaiveDate::parse_from_str(&header.creation_date, "%e %b %Y") - .unwrap_or_else(|_| { - warn(Error::InvalidCreationDate { - creation_date: header.creation_date.to_string(), - }); - Default::default() - }); - let creation_time = NaiveTime::parse_from_str(&header.creation_time, "%H:%M:%S") - .unwrap_or_else(|_| { - warn(Error::InvalidCreationTime { - creation_time: header.creation_time.to_string(), - }); - Default::default() - }); - let creation = NaiveDateTime::new(creation_date, creation_time); - - let product = header - .eye_catcher - .trim_start_matches("@(#) SPSS DATA FILE") - .trim_end() - .to_string(); - - Self { - creation, - endian: header.endian, - compression: header.compression, - n_cases: header.n_cases.map(|n| n as u64), - product, - product_ext: headers.product_info.as_ref().map(|pe| 
fix_line_ends(&pe.0)), - version: headers.integer_info.as_ref().map(|ii| ii.version), - } - } -} - -struct Decoder { - //pub raw: raw::Decoder, - pub encoding: &'static Encoding, - //pub variables: HashMap, - //pub var_names: HashMap, - //pub dictionary: Dictionary, - //n_dict_indexes: usize, - n_generated_names: usize, -} - -impl Decoder { - fn generate_name(&mut self, dictionary: &Dictionary) -> Identifier { - loop { - self.n_generated_names += 1; - let name = Identifier::from_encoding(&format!("VAR{:03}", self.n_generated_names), self.encoding) - .unwrap(); - if !dictionary.variables.contains(&name.0) { - return name; - } - assert!(self.n_generated_names < usize::MAX); - } - } -} - -pub fn decode( - mut headers: Headers, - encoding: &'static Encoding, - warn: impl Fn(Error), -) -> Result<(Dictionary, Metadata), Error> { - let mut dictionary = Dictionary::new(encoding); - - let file_label = fix_line_ends(headers.header.file_label.trim_end_matches(' ')); - if !file_label.is_empty() { - dictionary.file_label = Some(file_label); - } - - for attributes in headers.file_attributes.drain(..) { - dictionary.attributes.extend(attributes.0 .0.into_iter()) - } - - // Concatenate all the document records (really there should only be one) - // and trim off the trailing spaces that pad them to 80 bytes. - dictionary.documents = headers - .document - .drain(..) - .flat_map(|record| record.lines) - .map(trim_end_spaces) - .collect(); - - // XXX warn for weird integer format - // XXX warn for weird floating-point format, etc. 
- - let mut decoder = Decoder { - encoding, - n_generated_names: 0, - }; - - let mut header_vars = headers.variable.iter().enumerate(); - let mut var_index_map = HashMap::new(); - while let Some((value_index, input)) = header_vars.next() { - let name = trim_end_spaces(input.name.to_string()); - let name = match Identifier::from_encoding(&name, encoding) { - Ok(name) => { - if !dictionary.variables.contains(&name.0) { - name - } else { - let new_name = decoder.generate_name(&dictionary); - warn(Error::DuplicateVariableName { - duplicate_name: name.clone(), - new_name: new_name.clone(), - }); - new_name - } - } - Err(id_error) => { - let new_name = decoder.generate_name(&dictionary); - warn(Error::InvalidVariableName { - id_error, - new_name: new_name.clone(), - }); - new_name - } - }; - let mut variable = Variable::new(name.clone(), VarWidth::from_raw(input.width).unwrap()); - - // Set the short name the same as the long name (even if we renamed it). - variable.short_names = vec![name]; - - variable.label = input.label.clone(); - - variable.missing_values = input.missing_values.clone(); - - variable.print_format = decode_format( - input.print_format, - variable.width, - |new_spec, format_error| { - warn(Error::InvalidPrintFormat { - new_spec, - variable: variable.name.clone(), - format_error, - }) - }, - ); - variable.write_format = decode_format( - input.write_format, - variable.width, - |new_spec, format_error| { - warn(Error::InvalidWriteFormat { - new_spec, - variable: variable.name.clone(), - format_error, - }) - }, - ); - - // Skip long string continuation records. 
- if input.width > 0 { - #[allow(unstable_name_collisions)] - for _ in 1..input.width.div_ceil(&8) { - if let Some((_, continuation)) = header_vars.next() { - if continuation.width == -1 { - continue; - } - } - return Err(Error::TBD); - } - } - - let dict_index = dictionary.add_var(variable).unwrap(); - assert_eq!(var_index_map.insert(value_index, dict_index), None); - } - - for record in headers.value_label.drain(..) { - let mut dict_indexes = Vec::with_capacity(record.dict_indexes.len()); - let mut continuation_indexes = Vec::new(); - let mut long_string_variables = Vec::new(); - for value_index in record.dict_indexes.iter() { - if let Some(dict_index) = var_index_map.get(&(*value_index as usize - 1)) { - let variable = &dictionary.variables[*dict_index]; - if variable.width.is_long_string() { - long_string_variables.push(variable.name.clone()); - } else { - dict_indexes.push(*dict_index); - } - } else { - continuation_indexes.push(*value_index); - } - } - if !continuation_indexes.is_empty() { - warn(Error::LongStringContinuationIndexes { - offset: record.offsets.start, - indexes: continuation_indexes, - }); - } - if !long_string_variables.is_empty() { - warn(Error::InvalidLongStringValueLabels { - offsets: record.offsets.clone(), - variables: long_string_variables, - }); - } - - for dict_index in dict_indexes { - let mut variable = &dictionary.variables[dict_index]; - for ValueLabel { value, label } in record.labels.iter().cloned() { - - } - } - } - - let metadata = Metadata::decode(&headers, warn); - Ok((dictionary, metadata)) -} - -fn trim_end_spaces(mut s: String) -> String { - s.truncate(s.trim_end_matches(' ').len()); - s -} - -/// Returns a copy of `s` in which all lone CR and CR LF pairs have been -/// replaced by LF. -/// -/// (A product that identifies itself as VOXCO INTERVIEWER 4.3 produces system -/// files that use CR-only line ends in the file label and extra product info.) 
-fn fix_line_ends(s: &str) -> String { - let mut out = String::with_capacity(s.len()); - let mut s = s.chars().peekable(); - while let Some(c) = s.next() { - match c { - '\r' => { - s.next_if_eq(&'\n'); - out.push('\n') - } - c => out.push(c), - } - } - out -} - -fn decode_format(raw: raw::Spec, width: VarWidth, warn: impl Fn(Format, FormatError)) -> Format { - UncheckedFormat::try_from(raw) - .and_then(Format::try_from) - .and_then(|x| x.check_width_compatibility(width)) - .unwrap_or_else(|error| { - let new_format = Format::default_for_width(width); - warn(new_format, error); - new_format - }) -} - -/* -impl Decoder { - fn generate_name(&mut self) -> Identifier { - loop { - self.n_generated_names += 1; - let name = Identifier::new(&format!("VAR{:03}", self.n_generated_names), self.encoding) - .unwrap(); - if !self.var_names.contains_key(&name) { - return name; - } - assert!(self.n_generated_names < usize::MAX); - } - } - fn decode_string_cow<'a>(&self, input: &'a [u8], warn: &impl Fn(Error)) -> Cow<'a, str> { - let (output, malformed) = self.encoding.decode_without_bom_handling(input); - if malformed { - warn(Error::MalformedString { - encoding: self.encoding.name().into(), - text: output.clone().into(), - }); - } - output - } - fn decode_string(&self, input: &[u8], warn: &impl Fn(Error)) -> String { - self.decode_string_cow(input, warn).into() - } - pub fn decode_identifier( - &self, - input: &[u8], - warn: &impl Fn(Error), - ) -> Result { - let s = self.decode_string_cow(input, warn); - Identifier::new(&s, self.encoding) - } - fn get_var_by_index(&self, dict_index: usize) -> Result<&Variable, Error> { - let max_index = self.n_dict_indexes; - if dict_index == 0 || dict_index > max_index { - return Err(Error::InvalidDictIndex { - dict_index, - max_index, - }); - } - let Some(variable) = self.variables.get(&(dict_index - 1)) else { - return Err(Error::DictIndexIsContinuation(dict_index)); - }; - Ok(variable) - } - - /// Returns `input` decoded from `self.encoding` 
into UTF-8 such that - /// re-encoding the result back into `self.encoding` will have exactly the - /// same length in bytes. - /// - /// XXX warn about errors? - fn decode_exact_length<'a>(&self, input: &'a [u8]) -> Cow<'a, str> { - if let (s, false) = self.encoding.decode_without_bom_handling(input) { - // This is the common case. Usually there will be no errors. - s - } else { - // Unusual case. Don't bother to optimize it much. - let mut decoder = self.encoding.new_decoder_without_bom_handling(); - let mut output = String::with_capacity( - decoder - .max_utf8_buffer_length_without_replacement(input.len()) - .unwrap(), - ); - let mut rest = input; - while !rest.is_empty() { - match decoder.decode_to_string_without_replacement(rest, &mut output, true) { - (DecoderResult::InputEmpty, _) => break, - (DecoderResult::OutputFull, _) => unreachable!(), - (DecoderResult::Malformed(a, b), consumed) => { - let skipped = a as usize + b as usize; - output.extend(repeat('?').take(skipped)); - rest = &rest[consumed..]; - } - } - } - assert_eq!(self.encoding.encode(&output).0.len(), input.len()); - output.into() - } - } -} - -pub trait TryDecode: Sized { - type Input<'a>; - fn try_decode( - decoder: &mut Decoder, - input: &Self::Input<'_>, - warn: impl Fn(Error), - ) -> Result, Error>; -} - -pub trait Decode: Sized { - fn decode(decoder: &Decoder, input: &Input, warn: impl Fn(Error)) -> Self; -} - -impl Decode> for String { - fn decode(decoder: &Decoder, input: &RawStr, warn: impl Fn(Error)) -> Self { - decoder.decode_string(&input.0, &warn) - } -} -*/ -/* -#[derive(Clone, Debug)] -pub struct HeaderRecord { - pub eye_catcher: String, - pub weight_index: Option, - pub n_cases: Option, - pub creation: NaiveDateTime, - pub file_label: String, -} - -fn trim_end_spaces(mut s: String) -> String { - s.truncate(s.trim_end_matches(' ').len()); - s -} - -/// Data file info that doesn't fit in [Dictionary]. 
-pub struct Metadata { - creation: NaiveDateTime, - endian: Endian, - compression: Option, - n_cases: Option, - product: String, - product_ext: Option, - version: Option<(i32, i32, i32)>, -} - -impl Metadata { - fn decode( - header: &crate::raw::HeaderRecord>, - integer_info: Option<&IntegerInfoRecord>, - product_ext: Option<&ProductInfoRecord>, - warn: impl Fn(Error), - ) -> Self { - let creation_date = NaiveDate::parse_from_str(&header.creation_date, "%e %b %Y") - .unwrap_or_else(|_| { - warn(Error::InvalidCreationDate { - creation_date: header.creation_date.to_string(), - }); - Default::default() - }); - let creation_time = NaiveTime::parse_from_str(&header.creation_time, "%H:%M:%S") - .unwrap_or_else(|_| { - warn(Error::InvalidCreationTime { - creation_time: header.creation_time.to_string(), - }); - Default::default() - }); - let creation = NaiveDateTime::new(creation_date, creation_time); - - let product = header - .eye_catcher - .trim_start_matches("@(#) SPSS DATA FILE") - .trim_end() - .to_string(); - - Self { - creation, - endian: header.endian, - compression: header.compression, - n_cases: header.n_cases.map(|n| n as u64), - product, - product_ext: product_ext.map(|pe| pe.0.clone()), - version: integer_info.map(|ii| ii.version), - } - } -} - -impl TryDecode for HeaderRecord { - type Input<'a> = crate::raw::HeaderRecord>; - - fn try_decode( - _decoder: &mut Decoder, - input: &Self::Input<'_>, - warn: impl Fn(Error), - ) -> Result, Error> { - let eye_catcher = trim_end_spaces(input.eye_catcher.to_string()); - let file_label = trim_end_spaces(input.file_label.to_string()); - let creation_date = NaiveDate::parse_from_str(&input.creation_date, "%e %b %Y") - .unwrap_or_else(|_| { - warn(Error::InvalidCreationDate { - creation_date: input.creation_date.to_string(), - }); - Default::default() - }); - let creation_time = NaiveTime::parse_from_str(&input.creation_time, "%H:%M:%S") - .unwrap_or_else(|_| { - warn(Error::InvalidCreationTime { - creation_time: 
input.creation_time.to_string(), - }); - Default::default() - }); - Ok(Some(HeaderRecord { - eye_catcher, - weight_index: input.weight_index.map(|n| n as usize), - n_cases: input.n_cases.map(|n| n as u64), - creation: NaiveDateTime::new(creation_date, creation_time), - file_label, - })) - } -} - -#[derive(Clone, Debug)] -pub struct VariableRecord { - pub width: VarWidth, - pub name: Identifier, - pub print_format: Spec, - pub write_format: Spec, - pub missing_values: MissingValues, - pub label: Option, -} - - -fn parse_variable_record( - decoder: &mut Decoder, - input: &raw::VariableRecord, String>, - warn: impl Fn(Error), -) -> Result<(), Error> { - let width = match input.width { - 0 => VarWidth::Numeric, - w @ 1..=255 => VarWidth::String(w as u16), - -1 => return Ok(()), - _ => { - return Err(Error::InvalidVariableWidth { - offsets: input.offsets.clone(), - width: input.width, - }) - } - }; - let name = trim_end_spaces(input.name.to_string()); - let name = match Identifier::new(&name, decoder.encoding) { - Ok(name) => { - if !decoder.var_names.contains_key(&name) { - name - } else { - let new_name = decoder.generate_name(); - warn(Error::DuplicateVariableName { - duplicate_name: name.clone(), - new_name: new_name.clone(), - }); - new_name - } - } - Err(id_error) => { - let new_name = decoder.generate_name(); - warn(Error::InvalidVariableName { - id_error, - new_name: new_name.clone(), - }); - new_name - } - }; - let variable = Variable { - dict_index: decoder.n_dict_indexes, - short_name: name.clone(), - long_name: None, - width, - }; - decoder.n_dict_indexes += width.n_dict_indexes(); - assert!(decoder - .var_names - .insert(name.clone(), variable.dict_index) - .is_none()); - assert!(decoder - .variables - .insert(variable.dict_index, variable) - .is_none()); - - let print_format = decode_format(input.print_format, width, |new_spec, format_error| { - warn(Error::InvalidPrintFormat { - new_spec, - variable: name.clone(), - format_error, - }) - }); - let 
write_format = decode_format(input.write_format, width, |new_spec, format_error| { - warn(Error::InvalidWriteFormat { - new_spec, - variable: name.clone(), - format_error, - }) - }); - let mut variable = dictionary::Variable::new(name, width); - variable.print_format = print_format; - variable.write_format = write_format; - variable.missing_values = input.missing_values.clone(); - if let Some(ref label) = input.label { - variable.label = Some(label.to_string()); - } - decoder.dictionary.add_var(variable).unwrap(); - Ok(()) -} - -#[derive(Clone, Debug)] -pub struct DocumentRecord(Vec); - -impl TryDecode for DocumentRecord { - type Input<'a> = crate::raw::DocumentRecord; - - fn try_decode( - decoder: &mut Decoder, - input: &Self::Input<'_>, - warn: impl Fn(Error), - ) -> Result, Error> { - Ok(Some(DocumentRecord( - input - .lines - .iter() - .map(|s| trim_end_spaces(decoder.decode_string(&s.0, &warn))) - .collect(), - ))) - } -} - -trait TextRecord -where - Self: Sized, -{ - const NAME: &'static str; - fn parse(input: &str, warn: impl Fn(Error)) -> Result; -} - -#[derive(Clone, Debug)] -pub struct VariableSet { - pub name: String, - pub vars: Vec, -} - -impl VariableSet { - fn parse(input: &str) -> Result { - let (name, input) = input.split_once('=').ok_or(Error::TBD)?; - let vars = input.split_ascii_whitespace().map(String::from).collect(); - Ok(VariableSet { - name: name.into(), - vars, - }) - } -} - -trait WarnOnError { - fn warn_on_error(self, warn: &F) -> Option; -} -impl WarnOnError for Result { - fn warn_on_error(self, warn: &F) -> Option { - match self { - Ok(result) => Some(result), - Err(error) => { - warn(error); - None - } - } - } -} - -#[derive(Clone, Debug)] -pub struct ValueLabel { - pub value: Value, - pub label: String, -} - -#[derive(Clone, Debug)] -pub struct ValueLabelRecord { - pub var_type: VarType, - pub labels: Vec, - pub variables: Vec, -} - -impl TryDecode for ValueLabelRecord { - type Input<'a> = crate::raw::ValueLabelRecord, RawString>; - 
fn try_decode( - decoder: &mut Decoder, - input: &Self::Input<'_>, - warn: impl Fn(Error), - ) -> Result, Error> { - let variables: Vec<&Variable> = input - .dict_indexes - .iter() - .filter_map(|&dict_index| { - decoder - .get_var_by_index(dict_index as usize) - .warn_on_error(&warn) - }) - .filter(|&variable| match variable.width { - VarWidth::String(width) if width > 8 => { - warn(Error::InvalidLongStringValueLabel( - variable.short_name.clone(), - )); - false - } - _ => true, - }) - .collect(); - let mut i = variables.iter(); - let Some(&first_var) = i.next() else { - return Ok(None); - }; - let var_type: VarType = first_var.width.into(); - for &variable in i { - let this_type: VarType = variable.width.into(); - if var_type != this_type { - let (numeric_var, string_var) = match var_type { - VarType::Numeric => (first_var, variable), - VarType::String => (variable, first_var), - }; - warn(Error::ValueLabelsDifferentTypes { - numeric_var: numeric_var.short_name.clone(), - string_var: string_var.short_name.clone(), - }); - return Ok(None); - } - } - let labels = input - .labels - .iter() - .map(|raw::ValueLabel { value, label }| { - let label = decoder.decode_string(&label.0, &warn); - let value = Value::decode(value, decoder); - ValueLabel { value, label } - }) - .collect(); - let variables = variables - .iter() - .map(|&variable| variable.short_name.clone()) - .collect(); - Ok(Some(ValueLabelRecord { - var_type, - labels, - variables, - })) - } -} - -#[derive(Clone, Debug)] -pub struct VariableSetRecord(Vec); - -impl TextRecord for VariableSetRecord { - const NAME: &'static str = "variable set"; - fn parse(input: &str, warn: impl Fn(Error)) -> Result { - let mut sets = Vec::new(); - for line in input.lines() { - if let Some(set) = VariableSet::parse(line).warn_on_error(&warn) { - sets.push(set) - } - } - Ok(VariableSetRecord(sets)) - } -} - -#[derive(Clone, Debug)] -pub struct LongName { - pub short_name: Identifier, - pub long_name: Identifier, -} - -impl 
LongName { - fn new(decoder: &mut Decoder, short_name: &str, long_name: &str) -> Result { - let short_name = - Identifier::new(short_name, decoder.encoding).map_err(Error::InvalidShortName)?; - let long_name = - Identifier::new(long_name, decoder.encoding).map_err(Error::InvalidLongName)?; - Ok(LongName { - short_name, - long_name, - }) - } -} - -#[derive(Clone, Debug)] -pub struct LongNameRecord(Vec); - -impl LongNameRecord { - pub fn parse(decoder: &mut Decoder, input: &str, warn: impl Fn(Error)) -> Result { - let mut names = Vec::new(); - for pair in input.split('\t').filter(|s| !s.is_empty()) { - if let Some((short_name, long_name)) = pair.split_once('=') { - if let Some(long_name) = - LongName::new(decoder, short_name, long_name).warn_on_error(&warn) - { - names.push(long_name); - } - } else { - warn(Error::TBD) - } - } - Ok(LongNameRecord(names)) - } -} - -#[derive(Clone, Debug)] -pub struct VeryLongString { - pub short_name: Identifier, - pub length: u16, -} - -impl VeryLongString { - fn parse(decoder: &Decoder, input: &str) -> Result { - let Some((short_name, length)) = input.split_once('=') else { - return Err(Error::TBD); - }; - let short_name = - Identifier::new(short_name, decoder.encoding).map_err(Error::InvalidLongStringName)?; - let length: u16 = length.parse().map_err(|_| Error::TBD)?; - if length > VarWidth::MAX_STRING { - return Err(Error::TBD); - } - Ok(VeryLongString { short_name, length }) - } -} - -#[derive(Clone, Debug)] -pub struct VeryLongStringRecord(Vec); - -impl VeryLongStringRecord { - pub fn parse(decoder: &Decoder, input: &str, warn: impl Fn(Error)) -> Result { - let mut very_long_strings = Vec::new(); - for tuple in input - .split('\0') - .map(|s| s.trim_end_matches('\t')) - .filter(|s| !s.is_empty()) - { - if let Some(vls) = VeryLongString::parse(decoder, tuple).warn_on_error(&warn) { - very_long_strings.push(vls) - } - } - Ok(VeryLongStringRecord(very_long_strings)) - } -} - -#[derive(Clone, Debug)] -pub struct Attribute { - pub 
name: Identifier, - pub values: Vec, -} - -impl Attribute { - fn parse<'a>( - decoder: &Decoder, - input: &'a str, - warn: &impl Fn(Error), - ) -> Result<(Option, &'a str), Error> { - let Some((name, mut input)) = input.split_once('(') else { - return Err(Error::TBD); - }; - let mut values = Vec::new(); - loop { - let Some((value, rest)) = input.split_once('\n') else { - return Err(Error::TBD); - }; - if let Some(stripped) = value - .strip_prefix('\'') - .and_then(|value| value.strip_suffix('\'')) - { - values.push(stripped.into()); - } else { - warn(Error::TBD); - values.push(value.into()); - } - if let Some(rest) = rest.strip_prefix(')') { - let attribute = Identifier::new(name, decoder.encoding) - .map_err(Error::InvalidAttributeName) - .warn_on_error(warn) - .map(|name| Attribute { name, values }); - return Ok((attribute, rest)); - }; - input = rest; - } - } -} - -#[derive(Clone, Debug)] -pub struct AttributeSet(pub Vec); - -impl AttributeSet { - fn parse<'a>( - decoder: &Decoder, - mut input: &'a str, - sentinel: Option, - warn: &impl Fn(Error), - ) -> Result<(AttributeSet, &'a str), Error> { - let mut attributes = Vec::new(); - let rest = loop { - match input.chars().next() { - None => break input, - c if c == sentinel => break &input[1..], - _ => { - let (attribute, rest) = Attribute::parse(decoder, input, &warn)?; - if let Some(attribute) = attribute { - attributes.push(attribute); - } - input = rest; - } - } - }; - Ok((AttributeSet(attributes), rest)) - } -} - -#[derive(Clone, Debug)] -pub struct FileAttributeRecord(AttributeSet); - -impl FileAttributeRecord { - pub fn parse(decoder: &Decoder, input: &str, warn: impl Fn(Error)) -> Result { - let (set, rest) = AttributeSet::parse(decoder, input, None, &warn)?; - if !rest.is_empty() { - warn(Error::TBD); - } - Ok(FileAttributeRecord(set)) - } -} - -#[derive(Clone, Debug)] -pub struct VarAttributeSet { - pub long_var_name: Identifier, - pub attributes: AttributeSet, -} - -impl VarAttributeSet { - fn 
parse<'a>( - decoder: &Decoder, - input: &'a str, - warn: &impl Fn(Error), - ) -> Result<(Option, &'a str), Error> { - let Some((long_var_name, rest)) = input.split_once(':') else { - return Err(Error::TBD); - }; - let (attributes, rest) = AttributeSet::parse(decoder, rest, Some('/'), warn)?; - let var_attribute = Identifier::new(long_var_name, decoder.encoding) - .map_err(Error::InvalidAttributeVariableName) - .warn_on_error(warn) - .map(|name| VarAttributeSet { - long_var_name: name, - attributes, - }); - Ok((var_attribute, rest)) - } -} - -#[derive(Clone, Debug)] -pub struct VariableAttributeRecord(Vec); - -impl VariableAttributeRecord { - pub fn parse(decoder: &Decoder, mut input: &str, warn: impl Fn(Error)) -> Result { - let mut var_attribute_sets = Vec::new(); - while !input.is_empty() { - let Some((var_attribute, rest)) = - VarAttributeSet::parse(decoder, input, &warn).warn_on_error(&warn) - else { - break; - }; - if let Some(var_attribute) = var_attribute { - var_attribute_sets.push(var_attribute); - } - input = rest; - } - Ok(VariableAttributeRecord(var_attribute_sets)) - } -} - -#[derive(Clone, Debug)] -pub enum MultipleResponseType { - MultipleDichotomy { - value: Value, - labels: CategoryLabels, - }, - MultipleCategory, -} - -impl MultipleResponseType { - fn decode( - decoder: &Decoder, - mr_set: &Identifier, - input: &raw::MultipleResponseType, - min_width: VarWidth, - warn: &impl Fn(Error), - ) -> Result { - let mr_type = match input { - raw::MultipleResponseType::MultipleDichotomy { value, labels } => { - let value = decoder.decode_string_cow(&value.0, warn); - let value = match min_width { - VarWidth::Numeric => { - let number: f64 = value.trim().parse().map_err(|_| { - Error::InvalidMDGroupCountedValue { - mr_set: mr_set.clone(), - number: value.into(), - } - })?; - Value::Number(Some(number.into())) - } - VarWidth::String(max_width) => { - let value = value.trim_end_matches(' '); - let width = value.len(); - if width > max_width as usize { - 
return Err(Error::TooWideMDGroupCountedValue { - mr_set: mr_set.clone(), - value: value.into(), - width, - max_width, - }); - }; - Value::String(value.into()) - } - }; - MultipleResponseType::MultipleDichotomy { - value, - labels: *labels, - } - } - raw::MultipleResponseType::MultipleCategory => MultipleResponseType::MultipleCategory, - }; - Ok(mr_type) - } -} - -#[derive(Clone, Debug)] -pub struct MultipleResponseSet { - pub name: Identifier, - pub min_width: VarWidth, - pub max_width: VarWidth, - pub label: String, - pub mr_type: MultipleResponseType, - pub dict_indexes: Vec, -} - -impl MultipleResponseSet { - fn decode( - decoder: &Decoder, - input: &raw::MultipleResponseSet>, - warn: &impl Fn(Error), - ) -> Result { - let mr_set_name = input.name.clone(); - let mut dict_indexes = Vec::with_capacity(input.short_names.len()); - for short_name in input.short_names.iter() { - let Some(&dict_index) = decoder.var_names.get(&short_name) else { - warn(Error::UnknownMrSetVariable { - mr_set: mr_set_name.clone(), - short_name: short_name.clone(), - }); - continue; - }; - dict_indexes.push(dict_index); - } - - match dict_indexes.len() { - 0 => return Err(Error::EmptyMrSet(mr_set_name)), - 1 => return Err(Error::OneVarMrSet(mr_set_name)), - _ => (), - } - - let Some((Some(min_width), Some(max_width))) = dict_indexes - .iter() - .map(|dict_index| decoder.variables[dict_index].width) - .map(|w| (Some(w), Some(w))) - .reduce(|(na, wa), (nb, wb)| (VarWidth::narrower(na, nb), VarWidth::wider(wa, wb))) - else { - return Err(Error::MixedMrSet(mr_set_name)); - }; - - let mr_type = - MultipleResponseType::decode(decoder, &mr_set_name, &input.mr_type, min_width, warn)?; - - Ok(MultipleResponseSet { - name: mr_set_name, - min_width, - max_width, - label: input.label.to_string(), - mr_type, - dict_indexes, - }) - } -} - -#[derive(Clone, Debug)] -pub struct MultipleResponseRecord(pub Vec); - -impl TryDecode for MultipleResponseRecord { - type Input<'a> = raw::MultipleResponseRecord>; - 
- fn try_decode( - decoder: &mut Decoder, - input: &Self::Input<'_>, - warn: impl Fn(Error), - ) -> Result, Error> { - let mut sets = Vec::with_capacity(input.0.len()); - for set in &input.0 { - match MultipleResponseSet::decode(decoder, set, &warn) { - Ok(set) => sets.push(set), - Err(error) => warn(error), - } - } - Ok(Some(MultipleResponseRecord(sets))) - } -} - -#[derive(Clone, Debug)] -pub struct LongStringValueLabels { - pub var_name: Identifier, - pub width: VarWidth, - pub labels: Vec, -} - -impl LongStringValueLabels { - fn decode( - decoder: &Decoder, - input: &raw::LongStringValueLabels, - warn: &impl Fn(Error), - ) -> Result { - let var_name = decoder.decode_string(&input.var_name.0, warn); - let var_name = Identifier::new(var_name.trim_end(), decoder.encoding) - .map_err(Error::InvalidLongStringValueLabelName)?; - - let min_width = 9; - let max_width = VarWidth::MAX_STRING; - if input.width < 9 || input.width > max_width as u32 { - return Err(Error::InvalidLongValueLabelWidth { - name: var_name, - width: input.width, - min_width, - max_width, - }); - } - let width = input.width as u16; - - let mut labels = Vec::with_capacity(input.labels.len()); - for (value, label) in input.labels.iter() { - let value = Value::String(decoder.decode_exact_length(&value.0).into()); - let label = decoder.decode_string(&label.0, warn); - labels.push(ValueLabel { value, label }); - } - - Ok(LongStringValueLabels { - var_name, - width: VarWidth::String(width), - labels, - }) - } -} - -#[derive(Clone, Debug)] -pub struct LongStringValueLabelRecord(pub Vec); - -impl TryDecode for LongStringValueLabelRecord { - type Input<'a> = raw::LongStringValueLabelRecord; - - fn try_decode( - decoder: &mut Decoder, - input: &Self::Input<'_>, - warn: impl Fn(Error), - ) -> Result, Error> { - let mut labels = Vec::with_capacity(input.0.len()); - for label in &input.0 { - match LongStringValueLabels::decode(decoder, label, &warn) { - Ok(set) => labels.push(set), - Err(error) => warn(error), 
- } - } - Ok(Some(LongStringValueLabelRecord(labels))) - } -} - -#[cfg(test)] -mod test { - use encoding_rs::WINDOWS_1252; - - #[test] - fn test() { - let mut s = String::new(); - s.push(char::REPLACEMENT_CHARACTER); - let encoded = WINDOWS_1252.encode(&s).0; - let decoded = WINDOWS_1252.decode(&encoded[..]).0; - println!("{:?}", decoded); - } - - #[test] - fn test2() { - let charset: Vec = (0..=255).collect(); - println!("{}", charset.len()); - let decoded = WINDOWS_1252.decode(&charset[..]).0; - println!("{}", decoded.len()); - let encoded = WINDOWS_1252.encode(&decoded[..]).0; - println!("{}", encoded.len()); - assert_eq!(&charset[..], &encoded[..]); - } -} -*/ diff --git a/rust/src/dictionary.rs b/rust/src/dictionary.rs deleted file mode 100644 index c26009921b..0000000000 --- a/rust/src/dictionary.rs +++ /dev/null @@ -1,530 +0,0 @@ -use std::{ - cmp::Ordering, - collections::{HashMap, HashSet}, - fmt::Debug, - ops::{Bound, RangeBounds}, -}; - -use encoding_rs::Encoding; -use indexmap::IndexSet; -use num::integer::div_ceil; -use ordered_float::OrderedFloat; -use unicase::UniCase; - -use crate::{ - format::Format, - identifier::{ByIdentifier, HasIdentifier, Identifier}, - raw::{self, Alignment, CategoryLabels, Decoder, Measure, MissingValues, RawStr, VarType}, -}; - -pub type DictIndex = usize; - -#[derive(Copy, Clone, Debug, PartialEq, Eq)] -pub enum VarWidth { - Numeric, - String(u16), -} - -impl PartialOrd for VarWidth { - fn partial_cmp(&self, other: &Self) -> Option { - match (self, other) { - (VarWidth::Numeric, VarWidth::Numeric) => Some(Ordering::Equal), - (VarWidth::String(a), VarWidth::String(b)) => Some(a.cmp(b)), - _ => None, - } - } -} - -impl VarWidth { - pub const MAX_STRING: u16 = 32767; - - pub fn n_dict_indexes(self) -> usize { - match self { - VarWidth::Numeric => 1, - VarWidth::String(w) => div_ceil(w as usize, 8), - } - } - - fn width_predicate( - a: Option, - b: Option, - f: impl Fn(u16, u16) -> u16, - ) -> Option { - match (a, b) { - 
(Some(VarWidth::Numeric), Some(VarWidth::Numeric)) => Some(VarWidth::Numeric), - (Some(VarWidth::String(a)), Some(VarWidth::String(b))) => { - Some(VarWidth::String(f(a, b))) - } - _ => None, - } - } - - /// Returns the wider of `self` and `other`: - /// - Numerical variable widths are equally wide. - /// - Longer strings are wider than shorter strings. - /// - Numerical and string types are incomparable, so result in `None`. - /// - Any `None` in the input yields `None` in the output. - pub fn wider(a: Option, b: Option) -> Option { - Self::width_predicate(a, b, |a, b| a.max(b)) - } - - /// Returns the narrower of `self` and `other` (see [`Self::wider`]). - pub fn narrower(a: Option, b: Option) -> Option { - Self::width_predicate(a, b, |a, b| a.min(b)) - } - - pub fn default_display_width(&self) -> u32 { - match self { - VarWidth::Numeric => 8, - VarWidth::String(width) => *width.min(&32) as u32, - } - } - - pub fn from_raw(raw: impl Into) -> Result { - let raw: i32 = raw.into(); - match raw { - 0 => Ok(Self::Numeric), - 1..=255 => Ok(Self::String(raw as u16)), - _ => Err(()), - } - } - - pub fn is_long_string(&self) -> bool { - if let Self::String(width) = self { - *width > 8 - } else { - false - } - } -} - -impl From for VarType { - fn from(source: VarWidth) -> Self { - match source { - VarWidth::Numeric => VarType::Numeric, - VarWidth::String(_) => VarType::String, - } - } -} - -#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] -pub enum Value { - Number(Option>), - String(String), -} - -impl Value { - pub fn decode(raw: &raw::Value>, decoder: &Decoder) -> Self { - match raw { - raw::Value::Number(x) => Value::Number(x.map(|x| x.into())), - raw::Value::String(s) => Value::String(decoder.decode_exact_length(&s.0).into()), - } - } -} - -#[derive(Clone, Debug)] -pub struct Dictionary { - pub variables: IndexSet>, - pub split_file: Vec, - pub weight: Option, - pub filter: Option, - pub case_limit: Option, - pub file_label: Option, - pub documents: Vec, 
- pub vectors: HashSet>, - pub attributes: HashMap>, - pub mrsets: HashSet>, - pub variable_sets: HashSet>, - pub encoding: &'static Encoding, -} - -#[derive(Debug)] -pub struct DuplicateVariableName; - -impl Dictionary { - pub fn new(encoding: &'static Encoding) -> Self { - Self { - variables: IndexSet::new(), - split_file: Vec::new(), - weight: None, - filter: None, - case_limit: None, - file_label: None, - documents: Vec::new(), - vectors: HashSet::new(), - attributes: HashMap::new(), - mrsets: HashSet::new(), - variable_sets: HashSet::new(), - encoding, - } - } - - pub fn add_var(&mut self, variable: Variable) -> Result { - let (index, inserted) = self.variables.insert_full(ByIdentifier::new(variable)); - if inserted { - Ok(index) - } else { - Err(DuplicateVariableName) - } - } - - pub fn reorder_var(&mut self, from_index: DictIndex, to_index: DictIndex) { - if from_index != to_index { - self.variables.move_index(from_index, to_index); - self.update_dict_indexes(&|index| { - #[allow(clippy::collapsible_else_if)] - if index == from_index { - Some(to_index) - } else if from_index < to_index { - if index > from_index && index <= to_index { - Some(index - 1) - } else { - Some(index) - } - } else { - if index >= to_index && index < from_index { - Some(index + 1) - } else { - Some(index) - } - } - }) - } - } - - pub fn retain_vars(&mut self, keep: F) - where - F: Fn(&Variable) -> bool, - { - let mut deleted = Vec::new(); - let mut index = 0; - self.variables.retain(|var_by_id| { - let keep = keep(&var_by_id.0); - if !keep { - deleted.push(index); - } - index += 1; - keep - }); - if !deleted.is_empty() { - self.update_dict_indexes(&|index| match deleted.binary_search(&index) { - Ok(_) => None, - Err(position) => Some(position), - }) - } - } - - pub fn delete_vars(&mut self, range: R) - where - R: RangeBounds, - { - let start = match range.start_bound() { - Bound::Included(&start) => start, - Bound::Excluded(&start) => start + 1, - Bound::Unbounded => 0, - }; - let end 
= match range.end_bound() { - Bound::Included(&end) => end + 1, - Bound::Excluded(&end) => end, - Bound::Unbounded => self.variables.len(), - }; - if end > start { - self.variables.drain(start..end); - self.update_dict_indexes(&|index| { - if index < start { - Some(index) - } else if index < end { - None - } else { - Some(index - end - start) - } - }) - } - } - - fn update_dict_indexes(&mut self, f: &F) - where - F: Fn(DictIndex) -> Option, - { - update_dict_index_vec(&mut self.split_file, f); - self.weight = self.weight.and_then(f); - self.filter = self.filter.and_then(f); - self.vectors = self - .vectors - .drain() - .filter_map(|vector_by_id| { - vector_by_id - .0 - .with_updated_dict_indexes(f) - .map(ByIdentifier::new) - }) - .collect(); - self.mrsets = self - .mrsets - .drain() - .filter_map(|mrset_by_id| { - mrset_by_id - .0 - .with_updated_dict_indexes(f) - .map(ByIdentifier::new) - }) - .collect(); - self.variable_sets = self - .variable_sets - .drain() - .filter_map(|var_set_by_id| { - var_set_by_id - .0 - .with_updated_dict_indexes(f) - .map(ByIdentifier::new) - }) - .collect(); - } -} - -fn update_dict_index_vec(dict_indexes: &mut Vec, f: F) -where - F: Fn(DictIndex) -> Option, -{ - dict_indexes.retain_mut(|index| { - if let Some(new) = f(*index) { - *index = new; - true - } else { - false - } - }); -} - -#[derive(Clone, Copy, Debug, Eq, PartialEq, Hash)] -pub enum Role { - Input, - Target, - Both, - None, - Partition, - Split, -} - -impl Default for Role { - fn default() -> Self { - Self::Input - } -} - -pub enum DictClass { - Ordinary, - System, - Scratch, -} - -impl DictClass { - pub fn from_identifier(id: &Identifier) -> Self { - if id.0.starts_with('$') { - Self::System - } else if id.0.starts_with('#') { - Self::Scratch - } else { - Self::Ordinary - } - } - - pub fn must_leave(self) -> bool { - match self { - DictClass::Ordinary => false, - DictClass::System => false, - DictClass::Scratch => true, - } - } -} - -#[derive(Clone, Debug)] -pub struct 
Variable { - pub name: Identifier, - pub width: VarWidth, - pub missing_values: MissingValues, - pub print_format: Format, - pub write_format: Format, - pub value_labels: HashMap, - pub label: Option, - pub measure: Option, - pub role: Role, - pub display_width: u32, - pub alignment: Alignment, - pub leave: bool, - pub short_names: Vec, - pub attributes: HashSet>, -} - -impl Variable { - pub fn new(name: Identifier, width: VarWidth) -> Self { - let var_type = VarType::from_width(width); - let leave = DictClass::from_identifier(&name).must_leave(); - Self { - name, - width, - missing_values: MissingValues::default(), - print_format: Format::default_for_width(width), - write_format: Format::default_for_width(width), - value_labels: HashMap::new(), - label: None, - measure: Measure::default_for_type(var_type), - role: Role::default(), - display_width: width.default_display_width(), - alignment: Alignment::default_for_type(var_type), - leave, - short_names: Vec::new(), - attributes: HashSet::new(), - } - } -} - -impl HasIdentifier for Variable { - fn identifier(&self) -> &UniCase { - &self.name.0 - } -} - -#[derive(Clone, Debug)] -pub struct Vector { - pub name: Identifier, - pub variables: Vec, -} - -impl Vector { - fn with_updated_dict_indexes( - mut self, - f: impl Fn(DictIndex) -> Option, - ) -> Option { - update_dict_index_vec(&mut self.variables, f); - (!self.variables.is_empty()).then_some(self) - } -} - -impl HasIdentifier for Vector { - fn identifier(&self) -> &UniCase { - &self.name.0 - } -} - -#[derive(Clone, Debug)] -pub struct Attribute { - pub name: Identifier, - pub values: Vec, -} - -impl HasIdentifier for Attribute { - fn identifier(&self) -> &UniCase { - &self.name.0 - } -} - -#[derive(Clone, Debug)] -pub struct MultipleResponseSet { - pub name: Identifier, - pub label: String, - pub mr_type: MultipleResponseType, - pub variables: Vec, -} - -impl MultipleResponseSet { - fn with_updated_dict_indexes( - mut self, - f: impl Fn(DictIndex) -> Option, - ) 
-> Option { - update_dict_index_vec(&mut self.variables, f); - (self.variables.len() > 1).then_some(self) - } -} - -impl HasIdentifier for MultipleResponseSet { - fn identifier(&self) -> &UniCase { - &self.name.0 - } -} - -#[derive(Clone, Debug)] -pub enum MultipleResponseType { - MultipleDichotomy { - value: Value, - labels: CategoryLabels, - }, - MultipleCategory, -} - -#[derive(Clone, Debug)] -pub struct VariableSet { - pub name: Identifier, - pub variables: Vec, -} - -impl VariableSet { - fn with_updated_dict_indexes( - mut self, - f: impl Fn(DictIndex) -> Option, - ) -> Option { - update_dict_index_vec(&mut self.variables, f); - (!self.variables.is_empty()).then_some(self) - } -} - -impl HasIdentifier for VariableSet { - fn identifier(&self) -> &UniCase { - &self.name.0 - } -} - -#[cfg(test)] -mod test { - use std::collections::HashSet; - - use unicase::UniCase; - - use crate::identifier::Identifier; - - use super::{ByIdentifier, HasIdentifier}; - - #[derive(PartialEq, Eq, Debug, Clone)] - struct Variable { - name: Identifier, - value: i32, - } - - impl HasIdentifier for Variable { - fn identifier(&self) -> &UniCase { - &self.name.0 - } - } - - #[test] - fn test() { - // Variables should not be the same if their values differ. - let abcd = Identifier::new("abcd").unwrap(); - let abcd1 = Variable { - name: abcd.clone(), - value: 1, - }; - let abcd2 = Variable { - name: abcd, - value: 2, - }; - assert_ne!(abcd1, abcd2); - - // But `ByName` should treat them the same. - let abcd1_by_name = ByIdentifier::new(abcd1); - let abcd2_by_name = ByIdentifier::new(abcd2); - assert_eq!(abcd1_by_name, abcd2_by_name); - - // And a `HashSet` of `ByName` should also treat them the same. 
- let mut vars: HashSet> = HashSet::new(); - assert!(vars.insert(ByIdentifier::new(abcd1_by_name.0.clone()))); - assert!(!vars.insert(ByIdentifier::new(abcd2_by_name.0.clone()))); - assert_eq!( - vars.get(&UniCase::new(String::from("abcd"))) - .unwrap() - .0 - .value, - 1 - ); - } -} diff --git a/rust/src/encoding.rs b/rust/src/encoding.rs deleted file mode 100644 index aaed5fd4ca..0000000000 --- a/rust/src/encoding.rs +++ /dev/null @@ -1,64 +0,0 @@ -use crate::locale_charset::locale_charset; -use encoding_rs::{Encoding, UTF_8}; - -include!(concat!(env!("OUT_DIR"), "/encodings.rs")); - -pub fn codepage_from_encoding(encoding: &str) -> Option { - CODEPAGE_NAME_TO_NUMBER - .get(encoding.to_ascii_lowercase().as_str()) - .copied() -} - -use thiserror::Error as ThisError; - -#[derive(ThisError, Debug)] -pub enum Error { - #[error("This system file does not indicate its own character encoding. For best results, specify an encoding explicitly. Use SYSFILE INFO with ENCODING=\"DETECT\" to analyze the possible encodings.")] - NoEncoding, - - #[error("This system file encodes text strings with unknown code page {0}.")] - UnknownCodepage(i32), - - #[error("This system file encodes text strings with unknown encoding {0}.")] - UnknownEncoding(String), - - #[error("This system file is encoded in EBCDIC, which is not supported.")] - Ebcdic, -} - -pub fn default_encoding() -> &'static Encoding { - lazy_static! { - static ref DEFAULT_ENCODING: &'static Encoding = - Encoding::for_label(locale_charset().as_bytes()).unwrap_or(UTF_8); - } - &DEFAULT_ENCODING -} - -pub fn get_encoding( - encoding: Option<&str>, - character_code: Option, -) -> Result<&'static Encoding, Error> { - let label = if let Some(encoding) = encoding { - encoding - } else if let Some(codepage) = character_code { - match codepage { - 1 => return Err(Error::Ebcdic), - 2 | 3 => { - // These ostensibly mean "7-bit ASCII" and "8-bit ASCII"[sic] - // respectively. 
However, many files have character code 2 but - // data which are clearly not ASCII. Therefore, ignore these - // values. - return Err(Error::NoEncoding); - } - 4 => "MS_KANJI", - _ => CODEPAGE_NUMBER_TO_NAME - .get(&codepage) - .copied() - .ok_or(Error::UnknownCodepage(codepage))?, - } - } else { - return Err(Error::NoEncoding); - }; - - Encoding::for_label(label.as_bytes()).ok_or(Error::UnknownEncoding(label.into())) -} diff --git a/rust/src/endian.rs b/rust/src/endian.rs deleted file mode 100644 index dd89a6cc1d..0000000000 --- a/rust/src/endian.rs +++ /dev/null @@ -1,168 +0,0 @@ -/// The endianness for integer and floating-point numbers in SPSS system files. -/// -/// SPSS system files can declare IBM 370 and DEC VAX floating-point -/// representations, but no file that uses either of these has ever been found -/// in the wild, so this code does not handle them. -#[derive(Copy, Clone, Debug, PartialEq, Eq)] -pub enum Endian { - /// Big-endian: MSB at lowest address. - Big, - - /// Little-endian: LSB at lowest address. 
- Little, -} - -impl Endian { - #[cfg(target_endian = "big")] - pub const NATIVE: Endian = Endian::Big; - #[cfg(target_endian = "little")] - pub const NATIVE: Endian = Endian::Little; - - pub fn identify_u32(expected_value: u32, bytes: [u8; 4]) -> Option { - let as_big: u32 = Endian::Big.parse(bytes); - let as_little: u32 = Endian::Little.parse(bytes); - match (as_big == expected_value, as_little == expected_value) { - (true, false) => Some(Endian::Big), - (false, true) => Some(Endian::Little), - _ => None, - } - } - - pub fn identify_f64(expected_value: f64, bytes: [u8; 8]) -> Option { - let as_big: f64 = Endian::Big.parse(bytes); - let as_little: f64 = Endian::Little.parse(bytes); - match (as_big == expected_value, as_little == expected_value) { - (true, false) => Some(Endian::Big), - (false, true) => Some(Endian::Little), - _ => None, - } - } -} - -pub trait ToBytes { - fn to_bytes(self, value: T) -> [u8; N]; -} -impl ToBytes for Endian { - fn to_bytes(self, value: i64) -> [u8; 8] { - match self { - Endian::Big => i64::to_be_bytes(value), - Endian::Little => i64::to_le_bytes(value), - } - } -} -impl ToBytes for Endian { - fn to_bytes(self, value: u32) -> [u8; 4] { - match self { - Endian::Big => u32::to_be_bytes(value), - Endian::Little => u32::to_le_bytes(value), - } - } -} -impl ToBytes for Endian { - fn to_bytes(self, value: i32) -> [u8; 4] { - match self { - Endian::Big => i32::to_be_bytes(value), - Endian::Little => i32::to_le_bytes(value), - } - } -} -impl ToBytes for Endian { - fn to_bytes(self, value: u16) -> [u8; 2] { - match self { - Endian::Big => u16::to_be_bytes(value), - Endian::Little => u16::to_le_bytes(value), - } - } -} -impl ToBytes for Endian { - fn to_bytes(self, value: u8) -> [u8; 1] { - [value] - } -} -impl ToBytes for Endian { - fn to_bytes(self, value: f64) -> [u8; 8] { - match self { - Endian::Big => f64::to_be_bytes(value), - Endian::Little => f64::to_le_bytes(value), - } - } -} - -/// Parses an `N`-byte slice in one of the supported 
formats into native format -/// as type `T`. -pub trait Parse { - /// Given 'bytes', returns `T`. - fn parse(self, bytes: [u8; N]) -> T; -} -impl Parse for Endian { - fn parse(self, bytes: [u8; 8]) -> u64 { - match self { - Endian::Big => u64::from_be_bytes(bytes), - Endian::Little => u64::from_le_bytes(bytes), - } - } -} -impl Parse for Endian { - fn parse(self, bytes: [u8; 4]) -> u32 { - match self { - Endian::Big => u32::from_be_bytes(bytes), - Endian::Little => u32::from_le_bytes(bytes), - } - } -} -impl Parse for Endian { - fn parse(self, bytes: [u8; 2]) -> u16 { - match self { - Endian::Big => u16::from_be_bytes(bytes), - Endian::Little => u16::from_le_bytes(bytes), - } - } -} -impl Parse for Endian { - fn parse(self, bytes: [u8; 1]) -> u8 { - match self { - Endian::Big => u8::from_be_bytes(bytes), - Endian::Little => u8::from_le_bytes(bytes), - } - } -} -impl Parse for Endian { - fn parse(self, bytes: [u8; 8]) -> i64 { - match self { - Endian::Big => i64::from_be_bytes(bytes), - Endian::Little => i64::from_le_bytes(bytes), - } - } -} -impl Parse for Endian { - fn parse(self, bytes: [u8; 4]) -> i32 { - match self { - Endian::Big => i32::from_be_bytes(bytes), - Endian::Little => i32::from_le_bytes(bytes), - } - } -} -impl Parse for Endian { - fn parse(self, bytes: [u8; 2]) -> i16 { - match self { - Endian::Big => i16::from_be_bytes(bytes), - Endian::Little => i16::from_le_bytes(bytes), - } - } -} -impl Parse for Endian { - fn parse(self, bytes: [u8; 1]) -> i8 { - match self { - Endian::Big => i8::from_be_bytes(bytes), - Endian::Little => i8::from_le_bytes(bytes), - } - } -} -impl Parse for Endian { - fn parse(self, bytes: [u8; 8]) -> f64 { - match self { - Endian::Big => f64::from_be_bytes(bytes), - Endian::Little => f64::from_le_bytes(bytes), - } - } -} diff --git a/rust/src/engine.rs b/rust/src/engine.rs deleted file mode 100644 index f48c1948c1..0000000000 --- a/rust/src/engine.rs +++ /dev/null @@ -1,51 +0,0 @@ -use crate::{ - command::parse, - 
lex::{lexer::{Lexer, Source}, token::Token}, - message::Diagnostic, -}; - -pub struct Engine { - lexer: Lexer, -} - -impl Engine { - fn new() -> Self { - Self { - lexer: Lexer::new(Box::new(|location, error| println!("{location}: {error}"))), - } - } - fn run(&mut self, source: Source) { - self.lexer.append(source); - self.lexer.get(); - while self.lexer.token() != &Token::End { - let error: Box = Box::new(|diagnostic| { - println!("{diagnostic}"); - }); - parse(&mut self.lexer, &error); - } - } -} - -#[cfg(test)] -mod tests { - use encoding_rs::UTF_8; - - use crate::lex::{ - lexer::{ErrorHandling, Source}, - segment::Mode, - }; - - use super::Engine; - - #[test] - fn test_echo() { - let mut engine = Engine::new(); - engine.run(Source::for_file_contents( - "ECHO 'hi there'.\nECHO 'bye there'.\n".to_string(), - Some("test.sps".to_string()), - UTF_8, - Mode::default(), - ErrorHandling::default(), - )); - } -} diff --git a/rust/src/format.rs b/rust/src/format.rs deleted file mode 100644 index bafdf2726c..0000000000 --- a/rust/src/format.rs +++ /dev/null @@ -1,658 +0,0 @@ -use std::{ - fmt::{Display, Formatter, Result as FmtResult}, - ops::RangeInclusive, -}; - -use enum_map::{Enum, EnumMap}; -use thiserror::Error as ThisError; - -use crate::{ - dictionary::VarWidth, - raw::{self, VarType}, -}; - -#[derive(ThisError, Debug)] -pub enum Error { - #[error("Unknown format type {value}.")] - UnknownFormat { value: u16 }, - - #[error("Output format {0} specifies width {}, but {} requires an even width.", .0.w, .0.type_)] - OddWidthNotAllowed(UncheckedFormat), - - #[error("Output format {0} specifies width {}, but {} requires a width between {} and {}.", .0.w, .0.type_, .0.type_.min_width(), .0.type_.max_width())] - BadWidth(UncheckedFormat), - - #[error("Output format {0} specifies decimal places, but {} format does not allow any decimals.", .0.type_)] - DecimalsNotAllowedForFormat(UncheckedFormat), - - #[error("Output format {0} specifies {} decimal places, but with a width 
of {}, {} does not allow any decimal places.", .0.d, .0.w, .0.type_)] - DecimalsNotAllowedForWidth(UncheckedFormat), - - #[error("Output format {spec} specifies {} decimal places but, with a width of {}, {} allows at most {max_d} decimal places.", .spec.d, .spec.w, .spec.type_)] - TooManyDecimalsForWidth { - spec: UncheckedFormat, - max_d: Decimals, - }, - - #[error("String variable is not compatible with numeric format {0}.")] - UnnamedVariableNotCompatibleWithNumericFormat(Type), - - #[error("Numeric variable is not compatible with string format {0}.")] - UnnamedVariableNotCompatibleWithStringFormat(Type), - - #[error("String variable {variable} with width {width} is not compatible with format {bad_spec}. Use format {good_spec} instead.")] - NamedStringVariableBadSpecWidth { - variable: String, - width: Width, - bad_spec: Format, - good_spec: Format, - }, - - #[error("String variable with width {width} is not compatible with format {bad_spec}. Use format {good_spec} instead.")] - UnnamedStringVariableBadSpecWidth { - width: Width, - bad_spec: Format, - good_spec: Format, - }, -} - -#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] -pub enum Category { - // Numeric formats. - Basic, - Custom, - Legacy, - Binary, - Hex, - Date, - Time, - DateComponent, - - // String formats. 
- String, -} - -impl From for Category { - fn from(source: Type) -> Self { - match source { - Type::F | Type::Comma | Type::Dot | Type::Dollar | Type::Pct | Type::E => Self::Basic, - Type::CC(_) => Self::Custom, - Type::N | Type::Z => Self::Legacy, - Type::P | Type::PK | Type::IB | Type::PIB | Type::RB => Self::Binary, - Type::PIBHex | Type::RBHex => Self::Hex, - Type::Date - | Type::ADate - | Type::EDate - | Type::JDate - | Type::SDate - | Type::QYr - | Type::MoYr - | Type::WkYr - | Type::DateTime - | Type::YMDHMS => Self::Date, - Type::MTime | Type::Time | Type::DTime => Self::Time, - Type::WkDay | Type::Month => Self::DateComponent, - Type::A | Type::AHex => Self::String, - } - } -} - -#[derive(Copy, Clone, Debug, Enum, PartialEq, Eq, Hash)] -pub enum CC { - A, - B, - C, - D, - E, -} - -impl Display for CC { - fn fmt(&self, f: &mut Formatter) -> FmtResult { - let s = match self { - CC::A => "A", - CC::B => "B", - CC::C => "C", - CC::D => "D", - CC::E => "E", - }; - write!(f, "{}", s) - } -} - -#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] -pub enum Type { - // Basic numeric formats. - F, - Comma, - Dot, - Dollar, - Pct, - E, - - // Custom currency formats. - CC(CC), - - // Legacy numeric formats. - N, - Z, - - // Binary and hexadecimal formats. - P, - PK, - IB, - PIB, - PIBHex, - RB, - RBHex, - - // Time and date formats. - Date, - ADate, - EDate, - JDate, - SDate, - QYr, - MoYr, - WkYr, - DateTime, - YMDHMS, - MTime, - Time, - DTime, - - // Date component formats. - WkDay, - Month, - - // String formats. - A, - AHex, -} - -pub type Width = u16; -pub type SignedWidth = i16; - -pub type Decimals = u8; - -impl Type { - pub fn max_width(self) -> Width { - match self { - Self::P | Self::PK | Self::PIBHex | Self::RBHex => 16, - Self::IB | Self::PIB | Self::RB => 8, - Self::A => 32767, - Self::AHex => 32767 * 2, - _ => 40, - } - } - - pub fn min_width(self) -> Width { - match self { - // Basic numeric formats. 
- Self::F => 1, - Self::Comma => 1, - Self::Dot => 1, - Self::Dollar => 2, - Self::Pct => 2, - Self::E => 6, - - // Custom currency formats. - Self::CC(_) => 2, - - // Legacy numeric formats. - Self::N => 1, - Self::Z => 1, - - // Binary and hexadecimal formats. - Self::P => 1, - Self::PK => 1, - Self::IB => 1, - Self::PIB => 1, - Self::PIBHex => 2, - Self::RB => 2, - Self::RBHex => 4, - - // Time and date formats. - Self::Date => 9, - Self::ADate => 8, - Self::EDate => 8, - Self::JDate => 5, - Self::SDate => 8, - Self::QYr => 6, - Self::MoYr => 6, - Self::WkYr => 8, - Self::DateTime => 17, - Self::YMDHMS => 16, - Self::MTime => 5, - Self::Time => 5, - Self::DTime => 8, - - // Date component formats. - Self::WkDay => 2, - Self::Month => 3, - - // String formats. - Self::A => 1, - Self::AHex => 2, - } - } - - pub fn width_range(self) -> RangeInclusive { - self.min_width()..=self.max_width() - } - - pub fn max_decimals(self, width: Width) -> Decimals { - let width = width.clamp(1, 40) as SignedWidth; - let max = match self { - Self::F | Self::Comma | Self::Dot | Self::CC(_) => width - 1, - Self::Dollar | Self::Pct => width - 2, - Self::E => width - 7, - Self::N | Self::Z => width, - Self::P => width * 2 - 1, - Self::PK => width * 2, - Self::IB | Self::PIB => max_digits_for_bytes(width as usize) as SignedWidth, - Self::PIBHex => 0, - Self::RB | Self::RBHex => 16, - Self::Date - | Self::ADate - | Self::EDate - | Self::JDate - | Self::SDate - | Self::QYr - | Self::MoYr - | Self::WkYr => 0, - Self::DateTime => width - 21, - Self::YMDHMS => width - 20, - Self::MTime => width - 6, - Self::Time => width - 9, - Self::DTime => width - 12, - Self::WkDay | Self::Month | Self::A | Self::AHex => 0, - }; - max.clamp(0, 16) as Decimals - } - - pub fn takes_decimals(self) -> bool { - self.max_decimals(Width::MAX) > 0 - } - - pub fn category(self) -> Category { - self.into() - } - - pub fn width_step(self) -> Width { - if self.category() == Category::Hex || self == Self::AHex { - 2 - 
} else { - 1 - } - } - - pub fn clamp_width(self, width: Width) -> Width { - let (min, max) = self.width_range().into_inner(); - let width = width.clamp(min, max); - if self.width_step() == 2 { - width / 2 * 2 - } else { - width - } - } - - pub fn var_type(self) -> VarType { - match self { - Self::A | Self::AHex => VarType::String, - _ => VarType::Numeric, - } - } - - /// Checks whether this format is valid for a variable with the given - /// `var_type`. - pub fn check_type_compatibility(self, var_type: VarType) -> Result<(), Error> { - let my_type = self.var_type(); - match (my_type, var_type) { - (VarType::Numeric, VarType::String) => { - Err(Error::UnnamedVariableNotCompatibleWithNumericFormat(self)) - } - (VarType::String, VarType::Numeric) => { - Err(Error::UnnamedVariableNotCompatibleWithStringFormat(self)) - } - _ => Ok(()), - } - } -} - -impl Display for Type { - fn fmt(&self, f: &mut Formatter) -> FmtResult { - let s = match self { - Self::F => "F", - Self::Comma => "COMMA", - Self::Dot => "DOT", - Self::Dollar => "DOLLAR", - Self::Pct => "PCT", - Self::E => "E", - Self::CC(cc) => return write!(f, "{}", cc), - Self::N => "N", - Self::Z => "Z", - Self::P => "P", - Self::PK => "PK", - Self::IB => "IB", - Self::PIB => "PIB", - Self::PIBHex => "PIBHEX", - Self::RB => "RB", - Self::RBHex => "RBHEX", - Self::Date => "DATE", - Self::ADate => "ADATE", - Self::EDate => "EDATE", - Self::JDate => "JDATE", - Self::SDate => "SDATE", - Self::QYr => "QYR", - Self::MoYr => "MOYR", - Self::WkYr => "WKYR", - Self::DateTime => "DATETIME", - Self::YMDHMS => "YMDHMS", - Self::MTime => "MTIME", - Self::Time => "TIME", - Self::DTime => "DTIME", - Self::WkDay => "WKDAY", - Self::Month => "MONTH", - Self::A => "A", - Self::AHex => "AHEX", - }; - write!(f, "{}", s) - } -} - -fn max_digits_for_bytes(bytes: usize) -> usize { - *[0, 3, 5, 8, 10, 13, 15, 17].get(bytes).unwrap_or(&20) -} - -#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] -pub struct Format { - type_: Type, - w: 
Width, - d: Decimals, -} - -impl Format { - pub const F40: Format = Format { - type_: Type::F, - w: 40, - d: 0, - }; - - pub const F8_2: Format = Format { - type_: Type::F, - w: 8, - d: 2, - }; - - pub fn format(self) -> Type { - self.type_ - } - pub fn w(self) -> Width { - self.w - } - pub fn d(self) -> Decimals { - self.d - } - - pub fn default_for_width(var_width: VarWidth) -> Self { - match var_width { - VarWidth::Numeric => Format { - type_: Type::F, - w: 8, - d: 2, - }, - VarWidth::String(w) => Format { - type_: Type::A, - w, - d: 0, - }, - } - } - - pub fn fixed_from(source: &UncheckedFormat) -> Self { - let UncheckedFormat { - type_: format, - w, - d, - } = *source; - let (min, max) = format.width_range().into_inner(); - let mut w = w.clamp(min, max); - if d <= format.max_decimals(Width::MAX) { - while d > format.max_decimals(w) { - w += 1; - assert!(w <= 40); - } - } - let d = d.clamp(0, format.max_decimals(w)); - Self { - type_: format, - w, - d, - } - } - - pub fn var_width(self) -> VarWidth { - match self.type_ { - Type::A => VarWidth::String(self.w), - Type::AHex => VarWidth::String(self.w / 2), - _ => VarWidth::Numeric, - } - } - - pub fn var_type(self) -> VarType { - self.type_.var_type() - } - - /// Checks whether this format specification is valid for a variable with - /// width `var_width`. - pub fn check_width_compatibility(self, var_width: VarWidth) -> Result { - // Verify that the format is right for the variable's type. 
- self.type_.check_type_compatibility(var_width.into())?; - - if let VarWidth::String(w) = var_width { - if var_width != self.var_width() { - let bad_spec = self; - let good_spec = if self.type_ == Type::A { - Format { w, ..self } - } else { - Format { w: w * 2, ..self } - }; - return Err(Error::UnnamedStringVariableBadSpecWidth { - width: w, - bad_spec, - good_spec, - }); - } - } - - Ok(self) - } -} - -impl Display for Format { - fn fmt(&self, f: &mut Formatter) -> FmtResult { - write!(f, "{}{}", self.type_, self.w)?; - if self.type_.takes_decimals() || self.d > 0 { - write!(f, ".{}", self.d)?; - } - Ok(()) - } -} - -impl TryFrom for Format { - type Error = Error; - - fn try_from(source: UncheckedFormat) -> Result { - let UncheckedFormat { - type_: format, - w, - d, - } = source; - let max_d = format.max_decimals(w); - if w % format.width_step() != 0 { - Err(Error::OddWidthNotAllowed(source)) - } else if !format.width_range().contains(&w) { - Err(Error::BadWidth(source)) - } else if d > max_d { - if format.takes_decimals() { - Err(Error::DecimalsNotAllowedForFormat(source)) - } else if max_d > 0 { - Err(Error::TooManyDecimalsForWidth { - spec: source, - max_d, - }) - } else { - Err(Error::DecimalsNotAllowedForWidth(source)) - } - } else { - Ok(Format { - type_: format, - w, - d, - }) - } - } -} - -impl TryFrom for Type { - type Error = Error; - - fn try_from(source: u16) -> Result { - match source { - 1 => Ok(Self::A), - 2 => Ok(Self::AHex), - 3 => Ok(Self::Comma), - 4 => Ok(Self::Dollar), - 5 => Ok(Self::F), - 6 => Ok(Self::IB), - 7 => Ok(Self::PIBHex), - 8 => Ok(Self::P), - 9 => Ok(Self::PIB), - 10 => Ok(Self::PK), - 11 => Ok(Self::RB), - 12 => Ok(Self::RBHex), - 15 => Ok(Self::Z), - 16 => Ok(Self::N), - 17 => Ok(Self::E), - 20 => Ok(Self::Date), - 21 => Ok(Self::Time), - 22 => Ok(Self::DateTime), - 23 => Ok(Self::ADate), - 24 => Ok(Self::JDate), - 25 => Ok(Self::DTime), - 26 => Ok(Self::WkDay), - 27 => Ok(Self::Month), - 28 => Ok(Self::MoYr), - 29 => 
Ok(Self::QYr), - 30 => Ok(Self::WkYr), - 31 => Ok(Self::Pct), - 32 => Ok(Self::Dot), - 33 => Ok(Self::CC(CC::A)), - 34 => Ok(Self::CC(CC::B)), - 35 => Ok(Self::CC(CC::C)), - 36 => Ok(Self::CC(CC::D)), - 37 => Ok(Self::CC(CC::E)), - 38 => Ok(Self::EDate), - 39 => Ok(Self::SDate), - 40 => Ok(Self::MTime), - 41 => Ok(Self::YMDHMS), - _ => Err(Error::UnknownFormat { value: source }), - } - } -} - -#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] -pub struct UncheckedFormat { - pub type_: Type, - - pub w: Width, - - pub d: Decimals, -} - -impl TryFrom for UncheckedFormat { - type Error = Error; - - fn try_from(raw: raw::Spec) -> Result { - let raw = raw.0; - let raw_format = (raw >> 16) as u16; - let format = raw_format.try_into()?; - let w = ((raw >> 8) & 0xff) as Width; - let d = (raw & 0xff) as Decimals; - Ok(Self { - type_: format, - w, - d, - }) - } -} - -impl Display for UncheckedFormat { - fn fmt(&self, f: &mut Formatter) -> FmtResult { - write!(f, "{}{}", self.type_, self.w)?; - if self.type_.takes_decimals() || self.d > 0 { - write!(f, ".{}", self.d)?; - } - Ok(()) - } -} - -pub struct Settings { - epoch: Option, - - /// Either `'.'` or `','`. - decimal: char, - - /// Format `F`, `E`, `COMMA`, and `DOT` with leading zero (e.g. `0.5` - /// instead of `.5`)? - include_leading_zero: bool, - - /// Custom currency styles. - ccs: EnumMap>, -} - -impl Default for Settings { - fn default() -> Self { - Self { - epoch: None, - decimal: '.', - include_leading_zero: false, - ccs: Default::default(), - } - } -} - -/// A numeric output style. This can express numeric formats in -/// [Category::Basic] and [Category::Custom]. -pub struct NumberStyle { - neg_prefix: Affix, - prefix: Affix, - suffix: Affix, - neg_suffix: Affix, - - /// Decimal point: `'.'` or `','`. - decimal: char, - - /// Grouping character: `'.'` or `','` or `None`. - grouping: Option, - - /// Format as `.5` or `0.5`? 
- include_leading_zero: bool, - - /// An `Affix` may require more bytes than its display width; for example, - /// U+00A5 (Â¥) is 2 bytes in UTF-8 but occupies only one display column. - /// This member is the sum of the number of bytes required by all of the - /// `Affix` members in this struct, minus their display widths. Thus, it - /// can be used to size memory allocations: for example, the formatted - /// result of `CCA20.5` requires no more than `(20 + extra_bytes)` bytes in - /// UTF-8. - extra_bytes: usize, -} - -pub struct Affix { - /// String contents of affix. - s: String, - - /// Display width in columns (see [unicode_width]) - width: usize, -} diff --git a/rust/src/hexfloat.rs b/rust/src/hexfloat.rs deleted file mode 100644 index b885fb2266..0000000000 --- a/rust/src/hexfloat.rs +++ /dev/null @@ -1,52 +0,0 @@ -use num::Float; -use std::{num::FpCategory, fmt::{Display, Formatter, Result}}; - -pub struct HexFloat(pub T); - -impl Display for HexFloat { - fn fmt(&self, f: &mut Formatter<'_>) -> Result { - let sign = if self.0.is_sign_negative() { "-" } else { "" }; - match self.0.classify() { - FpCategory::Nan => return write!(f, "NaN"), - FpCategory::Infinite => return write!(f, "{sign}Infinity"), - FpCategory::Zero => return write!(f, "{sign}0.0"), - _ => (), - }; - let (significand, mut exponent, _) = self.0.integer_decode(); - let mut hex_sig = format!("{:x}", significand); - while hex_sig.ends_with('0') { - hex_sig.pop(); - exponent += 4; - } - match hex_sig.len() { - 0 => write!(f, "{sign}0.0"), - 1 => write!(f, "{sign}0x{hex_sig}.0p{exponent}"), - len => write!( - f, - "{sign}0x{}.{}p{}", - hex_sig.chars().next().unwrap(), - &hex_sig[1..], - exponent + 4 * (len as i16 - 1) - ), - } - } -} - -#[cfg(test)] -mod hex_float_tests { - use crate::HexFloat; - use num::Float; - - #[test] - fn test() { - assert_eq!(format!("{}", HexFloat(1.0)), "0x1.0p0"); - assert_eq!(format!("{}", HexFloat(123.0)), "0x1.ecp6"); - assert_eq!(format!("{}", HexFloat(1.0 / 
16.0)), "0x1.0p-4"); - assert_eq!(format!("{}", HexFloat(f64::infinity())), "Infinity"); - assert_eq!(format!("{}", HexFloat(f64::neg_infinity())), "-Infinity"); - assert_eq!(format!("{}", HexFloat(f64::nan())), "NaN"); - assert_eq!(format!("{}", HexFloat(0.0)), "0.0"); - assert_eq!(format!("{}", HexFloat(f64::neg_zero())), "-0.0"); - } -} - diff --git a/rust/src/identifier.rs b/rust/src/identifier.rs deleted file mode 100644 index 2d5c0317ec..0000000000 --- a/rust/src/identifier.rs +++ /dev/null @@ -1,394 +0,0 @@ -use std::{ - borrow::Borrow, - cmp::Ordering, - fmt::{Debug, Display, Formatter, Result as FmtResult}, - hash::{Hash, Hasher}, - ops::Deref, -}; - -use encoding_rs::{EncoderResult, Encoding, UTF_8}; -use finl_unicode::categories::{CharacterCategories, MajorCategory}; -use thiserror::Error as ThisError; -use unicase::UniCase; - -pub trait IdentifierChar { - /// Returns true if `self` is an ASCII character that may be the first - /// character in an identifier. - fn ascii_may_start_id(self) -> bool; - - /// Returns true if `self` may be the first character in an identifier. - fn may_start_id(self) -> bool; - - /// Returns true if `self` is an ASCII character that may be a second or - /// subsequent character in an identifier. - fn ascii_may_continue_id(self) -> bool; - - /// Returns true if `self` may be a second or subsequent character in an - /// identifier. - fn may_continue_id(self) -> bool; -} - -impl IdentifierChar for char { - fn ascii_may_start_id(self) -> bool { - matches!(self, 'a'..='z' | 'A'..='Z' | '@' | '#' | '$' | '!') - } - - fn may_start_id(self) -> bool { - if self < '\u{0080}' { - self.ascii_may_start_id() - } else { - use MajorCategory::*; - - [L, M, S].contains(&self.get_major_category()) && self != char::REPLACEMENT_CHARACTER - } - } - - fn ascii_may_continue_id(self) -> bool { - matches!(self, 'a'..='z' | 'A'..='Z' | '0'..='9' | '@' | '#' | '$' | '.' 
| '_') - } - - fn may_continue_id(self) -> bool { - if self < '\u{0080}' { - self.ascii_may_continue_id() - } else { - use MajorCategory::*; - - [L, M, S, N].contains(&self.get_major_category()) && self != char::REPLACEMENT_CHARACTER - } - } -} - -#[derive(Clone, Debug, ThisError)] -pub enum Error { - #[error("Identifier cannot be empty string.")] - Empty, - - #[error("\"{0}\" may not be used as an identifier because it is a reserved word.")] - Reserved(String), - - #[error("\"!\" is not a valid identifier.")] - Bang, - - #[error("\"{0}\" may not be used as an identifier because it begins with disallowed character \"{1}\".")] - BadFirstCharacter(String, char), - - #[error("\"{0}\" may not be used as an identifier because it contains disallowed character \"{1}\".")] - BadLaterCharacter(String, char), - - #[error("Identifier \"{id}\" is {length} bytes in the encoding in use ({encoding}), which exceeds the {max}-byte limit.")] - TooLong { - id: String, - length: usize, - encoding: &'static str, - max: usize, - }, - - #[error("\"{id}\" may not be used as an identifier because the encoding in use ({encoding}) cannot represent \"{c}\".")] - NotEncodable { - id: String, - encoding: &'static str, - c: char, - }, -} - -pub enum ReservedWord { - And, - Or, - Not, - Eq, - Ge, - Gt, - Le, - Lt, - Ne, - All, - By, - To, - With, -} - -impl TryFrom<&str> for ReservedWord { - type Error = (); - - fn try_from(source: &str) -> Result { - if !(2..=4).contains(&source.len()) { - Err(()) - } else { - let b = source.as_bytes(); - let c0 = b[0].to_ascii_uppercase(); - let c1 = b[1].to_ascii_uppercase(); - match (source.len(), c0, c1) { - (2, b'B', b'Y') => Ok(Self::By), - (2, b'E', b'Q') => Ok(Self::Eq), - (2, b'G', b'T') => Ok(Self::Gt), - (2, b'G', b'E') => Ok(Self::Ge), - (2, b'L', b'T') => Ok(Self::Lt), - (2, b'L', b'E') => Ok(Self::Le), - (2, b'N', b'E') => Ok(Self::Ne), - (3, b'N', b'O') if b[2].to_ascii_uppercase() == b'T' => Ok(Self::Not), - (2, b'O', b'R') => Ok(Self::Or), - (2, 
b'T', b'O') => Ok(Self::To), - (3, b'A', b'L') if b[2].to_ascii_uppercase() == b'L' => Ok(Self::All), - (3, b'A', b'N') if b[2].to_ascii_uppercase() == b'D' => Ok(Self::And), - (4, b'W', b'I') - if b[2].to_ascii_uppercase() == b'T' && b[3].to_ascii_uppercase() == b'H' => - { - Ok(Self::With) - } - _ => Err(()), - } - } - } -} - -pub fn is_reserved_word(s: &str) -> bool { - ReservedWord::try_from(s).is_ok() -} - -#[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] -pub struct Identifier(pub UniCase); - -impl Identifier { - /// Maximum length of an identifier, in bytes. The limit applies in the - /// encoding used by the dictionary, not in UTF-8. - pub const MAX_LEN: usize = 64; - - pub fn new(s: &str) -> Result { - Self::from_encoding(s, UTF_8) - } - pub fn from_encoding(s: &str, encoding: &'static Encoding) -> Result { - Self::is_plausible(s)?; - let identifier = Identifier(s.into()); - identifier.check_encoding(encoding)?; - Ok(identifier) - } - - /// Checks whether this is a valid identifier in the given `encoding`. An - /// identifier that is valid in one encoding might be invalid in another - /// because some characters are unencodable or because it is too long. 
- pub fn check_encoding(&self, encoding: &'static Encoding) -> Result<(), Error> { - let s = self.0.as_str(); - let (_encoded, _, unencodable) = encoding.encode(s); - if unencodable { - let mut encoder = encoding.new_encoder(); - let mut buf = Vec::with_capacity( - encoder - .max_buffer_length_from_utf8_without_replacement(s.len()) - .unwrap(), - ); - let EncoderResult::Unmappable(c) = encoder - .encode_from_utf8_to_vec_without_replacement(s, &mut buf, true) - .0 - else { - unreachable!(); - }; - return Err(Error::NotEncodable { - id: s.into(), - encoding: encoding.name(), - c, - }); - } - /* - if encoded.len() > Self::MAX_LEN { - return Err(Error::TooLong { - id: s.into(), - length: encoded.len(), - encoding: encoding.name(), - max: Self::MAX_LEN, - }); - }*/ - Ok(()) - } - pub fn is_plausible(s: &str) -> Result<(), Error> { - if s.is_empty() { - return Err(Error::Empty); - } - if is_reserved_word(s) { - return Err(Error::Reserved(s.into())); - } - if s == "!" { - return Err(Error::Bang); - } - - let mut i = s.chars(); - let first = i.next().unwrap(); - if !first.may_start_id() { - return Err(Error::BadFirstCharacter(s.into(), first)); - } - for c in i { - if !c.may_continue_id() { - return Err(Error::BadLaterCharacter(s.into(), c)); - } - } - Ok(()) - } - - /// Returns true if `token` is a case-insensitive match for `keyword`. - /// - /// Keywords match `keyword` and `token` are identical, or `token` is at - /// least 3 characters long and those characters are identical to `keyword` - /// or differ only in case. - /// - /// `keyword` must be ASCII. - pub fn matches_keyword(&self, keyword: &str) -> bool { - id_match_n_nonstatic(keyword, self.0.as_str(), 3) - } - - /// Returns true if `token` is a case-insensitive match for at least the - /// first `n` characters of `keyword`. - /// - /// `keyword` must be ASCII. 
- pub fn matches_keyword_n(&self, keyword: &str, n: usize) -> bool { - id_match_n_nonstatic(keyword, self.0.as_str(), n) - } -} - -impl PartialEq for Identifier { - fn eq(&self, other: &str) -> bool { - self.0.eq(&UniCase::new(other)) - } -} - -/// Returns true if `token` is a case-insensitive match for `keyword`. -/// -/// Keywords match `keyword` and `token` are identical, or `token` is at least 3 -/// characters long and those characters are identical to `keyword` or differ -/// only in case. -/// -/// `keyword` must be ASCII. It's normally a constant string, so it's declared -/// as `&'static str` to make it harder to reverse the argument order. But -/// there's no reason that a non-static string won't work, so use -/// [`id_match_n_nonstatic`] instead if you need it. -pub fn id_match(keyword: &'static str, token: &str) -> bool { - id_match_n(keyword, token, 3) -} - -/// Returns true if `token` is a case-insensitive match for at least the first -/// `n` characters of `keyword`. -/// -/// `keyword` must be ASCII. It's normally a constant string, so it's declared -/// as `&'static str` to make it harder to reverse the argument order. But -/// there's no reason that a non-static string won't work, so use -/// [`id_match_n_nonstatic`] instead if you need it. -pub fn id_match_n(keyword: &'static str, token: &str, n: usize) -> bool { - id_match_n_nonstatic(keyword, token, n) -} - -/// Returns true if `token` is a case-insensitive match for at least the first -/// `n` characters of `keyword`. -/// -/// `keyword` must be ASCII. 
-pub fn id_match_n_nonstatic(keyword: &str, token: &str, n: usize) -> bool { - debug_assert!(keyword.is_ascii()); - let keyword_prefix = if (n..keyword.len()).contains(&token.len()) { - &keyword[..token.len()] - } else { - keyword - }; - keyword_prefix.eq_ignore_ascii_case(token) -} - -impl Display for Identifier { - fn fmt(&self, f: &mut Formatter) -> FmtResult { - write!(f, "{}", self.0) - } -} - -impl Debug for Identifier { - fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult { - write!(f, "{}", self.0) - } -} - -pub trait HasIdentifier { - fn identifier(&self) -> &UniCase; -} - -pub struct ByIdentifier(pub T) -where - T: HasIdentifier; - -impl ByIdentifier -where - T: HasIdentifier, -{ - pub fn new(inner: T) -> Self { - Self(inner) - } -} - -impl PartialEq for ByIdentifier -where - T: HasIdentifier, -{ - fn eq(&self, other: &Self) -> bool { - self.0.identifier().eq(other.0.identifier()) - } -} - -impl Eq for ByIdentifier where T: HasIdentifier {} - -impl PartialOrd for ByIdentifier -where - T: HasIdentifier, -{ - fn partial_cmp(&self, other: &Self) -> Option { - Some(self.cmp(other)) - } -} - -impl Ord for ByIdentifier -where - T: HasIdentifier, -{ - fn cmp(&self, other: &Self) -> Ordering { - self.0.identifier().cmp(other.0.identifier()) - } -} - -impl Hash for ByIdentifier -where - T: HasIdentifier, -{ - fn hash(&self, state: &mut H) { - self.0.identifier().hash(state) - } -} - -impl Borrow> for ByIdentifier -where - T: HasIdentifier, -{ - fn borrow(&self) -> &UniCase { - self.0.identifier() - } -} - -impl Debug for ByIdentifier -where - T: HasIdentifier + Debug, -{ - fn fmt(&self, f: &mut Formatter) -> FmtResult { - self.0.fmt(f) - } -} - -impl Clone for ByIdentifier -where - T: HasIdentifier + Clone, -{ - fn clone(&self) -> Self { - Self(self.0.clone()) - } -} - -impl Deref for ByIdentifier -where - T: HasIdentifier + Clone, -{ - type Target = T; - - fn deref(&self) -> &Self::Target { - &self.0 - } -} diff --git a/rust/src/integer.rs b/rust/src/integer.rs 
deleted file mode 100644 index 6c76839927..0000000000 --- a/rust/src/integer.rs +++ /dev/null @@ -1,86 +0,0 @@ -pub trait ToInteger { - fn to_exact_integer(&self) -> Option - where - T: FromFloat; - fn to_exact_usize(&self) -> Option { - self.to_exact_integer() - } - fn to_exact_u8(&self) -> Option { - self.to_exact_integer() - } - fn to_exact_u16(&self) -> Option { - self.to_exact_integer() - } - fn to_exact_u32(&self) -> Option { - self.to_exact_integer() - } - fn to_exact_u64(&self) -> Option { - self.to_exact_integer() - } - fn to_exact_u128(&self) -> Option { - self.to_exact_integer() - } - fn to_exact_isize(&self) -> Option { - self.to_exact_integer() - } - fn to_exact_i8(&self) -> Option { - self.to_exact_integer() - } - fn to_exact_i16(&self) -> Option { - self.to_exact_integer() - } - fn to_exact_i32(&self) -> Option { - self.to_exact_integer() - } - fn to_exact_i64(&self) -> Option { - self.to_exact_integer() - } - fn to_exact_i128(&self) -> Option { - self.to_exact_integer() - } -} - -impl ToInteger for f64 { - fn to_exact_integer(&self) -> Option - where - T: FromFloat, - { - T::from_float(*self) - } -} - -pub trait FromFloat { - fn from_float(x: f64) -> Option - where - Self: Sized; -} - -macro_rules! 
impl_from_float { - ($T:ident) => { - impl FromFloat for $T { - fn from_float(x: f64) -> Option - where - Self: Sized, - { - if x.trunc() == x && x >= $T::MIN as f64 && x <= $T::MAX as f64 { - Some(x as Self) - } else { - None - } - } - } - }; -} - -impl_from_float!(usize); -impl_from_float!(u8); -impl_from_float!(u16); -impl_from_float!(u32); -impl_from_float!(u64); -impl_from_float!(u128); -impl_from_float!(isize); -impl_from_float!(i8); -impl_from_float!(i16); -impl_from_float!(i32); -impl_from_float!(i64); -impl_from_float!(i128); diff --git a/rust/src/lex/command_name.rs b/rust/src/lex/command_name.rs deleted file mode 100644 index bccea1483b..0000000000 --- a/rust/src/lex/command_name.rs +++ /dev/null @@ -1,359 +0,0 @@ -use crate::identifier::id_match_n_nonstatic; - -pub struct Match { - pub exact: bool, - pub missing_words: isize, -} - -fn count_words(s: &str) -> isize { - s.split_whitespace().count() as isize -} - -/// Compares `string` obtained from the user against the full name of a `command`, -/// using this algorithm: -/// -/// 1. Divide `command` into words `c[0]` through `c[n - 1]`. -/// -/// 2. Divide `string` into words `s[0]` through `s[m - 1]`. -/// -/// 3. Compare word `c[i]` against `s[i]` for `0 <= i < min(n, m)`, using the keyword -/// matching algorithm implemented by lex_id_match(). If any of them fail to -/// match, then `string` does not match `command` and the function returns false. -/// -/// 4. Otherwise, `string` and `command` match. Set *MISSING_WORDS to n - m. Set -/// *EXACT to false if any of the S[i] were found to be abbreviated in the -/// comparisons done in step 3, or to true if they were all exactly equal -/// (modulo case). Return true. 
-pub fn command_match(command: &str, string: &str) -> Option { - let mut command_words = command.split_whitespace(); - let mut string_words = string.split_whitespace(); - let mut exact = true; - loop { - let Some(cw) = command_words.next() else { - return Some(Match { - exact, - missing_words: -(string_words.count() as isize), - }); - }; - let Some(sw) = string_words.next() else { - return Some(Match { - exact, - missing_words: 1 + command_words.count() as isize, - }); - }; - if !id_match_n_nonstatic(cw, sw, 3) { - return None; - } - if sw.len() < cw.len() { - exact = false; - } - } -} - -/// Matches a string against a collection of command names. -pub struct CommandMatcher<'a, T> { - string: &'a str, - extensible: bool, - exact_match: Option, - n_matches: usize, - match_: Option, - match_missing_words: isize, -} - -impl<'a, T> CommandMatcher<'a, T> { - pub fn new(string: &'a str) -> Self { - Self { - string, - extensible: false, - exact_match: None, - n_matches: 0, - match_: None, - match_missing_words: 0, - } - } - - /// Consider `command` as a candidate for the command name being parsed. If - /// `command` is the correct command name, then [Self::get_match] will - /// return `aux` later. 
- pub fn add(&mut self, command: &str, aux: T) { - if let Some(Match { - missing_words, - exact, - }) = command_match(command, self.string) - { - if missing_words > 0 { - self.extensible = true; - } else if exact && missing_words == 0 { - self.exact_match = Some(aux); - } else { - if missing_words > self.match_missing_words { - self.n_matches = 0; - } - if missing_words >= self.match_missing_words || self.n_matches == 0 { - self.n_matches += 1; - self.match_ = Some(aux); - self.match_missing_words = missing_words; - } - } - } - } - - pub fn get_match(self) -> (Option, isize) { - if self.extensible { - (None, 1) - } else if let Some(exact_match) = self.exact_match { - (Some(exact_match), 0) - } else if self.n_matches == 1 { - (self.match_, self.match_missing_words) - } else { - (None, self.match_missing_words) - } - } -} - -pub const COMMAND_NAMES: &'static [&'static str] = &[ - "2SLS", - "ACF", - "ADD DOCUMENT", - "ADD FILES", - "ADD VALUE LABELS", - "AGGREGATE", - "ALSCAL", - "ANACOR", - "ANOVA", - "APPLY DICTIONARY", - "AUTORECODE", - "BEGIN DATA", - "BREAK", - "CACHE", - "CASEPLOT", - "CASESTOVARS", - "CATPCA", - "CATREG", - "CCF", - "CD", - "CLEAR TRANSFORMATIONS", - "CLOSE FILE HANDLE", - "CLUSTER", - "COMPUTE", - "CONJOINT", - "CORRELATIONS", - "CORRESPONDENCE", - "COUNT", - "COXREG", - "CREATE", - "CROSSTABS", - "CSDESCRIPTIVES", - "CSGLM", - "CSLOGISTIC", - "CSPLAN", - "CSSELECT", - "CSTABULATE", - "CTABLES", - "CURVEFIT", - "DATA LIST", - "DATAFILE ATTRIBUTE", - "DATASET ACTIVATE", - "DATASET CLOSE", - "DATASET COPY", - "DATASET DECLARE", - "DATASET DISPLAY", - "DATASET NAME", - "DATE", - "DEBUG EVALUATE", - "DEBUG EXPAND", - "DEBUG FLOAT FORMAT", - "DEBUG FORMAT GUESSER", - "DEBUG MATRIX READ", - "DEBUG MOMENTS", - "DEBUG PAPER SIZE", - "DEBUG POOL", - "DEBUG XFORM FAIL", - "DEFINE", - "DELETE VARIABLES", - "DESCRIPTIVES", - "DETECTANOMALY", - "DISCRIMINANT", - "DISPLAY MACROS", - "DISPLAY VARIABLE SETS", - "DISPLAY", - "DO IF", - "DO REPEAT", - 
"DOCUMENT", - "DROP DOCUMENTS", - "ECHO", - "EDIT", - "ELSE IF", - "ELSE", - "END CASE", - "END FILE TYPE", - "END FILE", - "END IF", - "END LOOP", - "END REPEAT", - "ERASE", - "EXAMINE", - "EXECUTE", - "EXIT", - "EXPORT", - "FACTOR", - "FILE HANDLE", - "FILE LABEL", - "FILE TYPE", - "FILTER", - "FINISH", - "FIT", - "FLIP", - "FORMATS", - "FREQUENCIES", - "GENLOG", - "GET DATA", - "GET TRANSLATE", - "GET", - "GGRAPH", - "GLM", - "GRAPH", - "HILOGLINEAR", - "HOMALS", - "HOST", - "IF", - "IGRAPH", - "IMPORT", - "INCLUDE", - "INFO", - "INPUT PROGRAM", - "INSERT", - "KEYED DATA LIST", - "KM", - "LEAVE", - "LIST", - "LOGISTIC REGRESSION", - "LOGLINEAR", - "LOOP", - "MANOVA", - "MAPS", - "MATCH FILES", - "MATRIX DATA", - "MATRIX", - "MCONVERT", - "MEANS", - "MISSING VALUES", - "MIXED", - "MODEL CLOSE", - "MODEL HANDLE", - "MODEL LIST", - "MODEL NAME", - "MRSETS", - "MULT RESPONSE", - "MULTIPLE CORRESPONDENCE", - "MVA", - "N OF CASES", - "N", - "NAIVEBAYES", - "NEW FILE", - "NLR", - "NOMREG", - "NONPAR CORR", - "NPAR TESTS", - "NUMBERED", - "NUMERIC", - "OLAP CUBES", - "OMS", - "ONEWAY", - "ORTHOPLAN", - "OUTPUT MODIFY", - "OVERALS", - "PACF", - "PARTIAL CORR", - "PEARSON CORRELATIONS", - "PERMISSIONS", - "PLANCARDS", - "PLUM", - "POINT", - "PPLOT", - "PREDICT", - "PREFSCAL", - "PRESERVE", - "PRINCALS", - "PRINT EJECT", - "PRINT FORMATS", - "PRINT SPACE", - "PRINT", - "PROBIT", - "PROCEDURE OUTPUT", - "PROXIMITIES", - "PROXSCAL", - "Q", - "QUICK CLUSTER", - "QUIT", - "RANK", - "RATIO STATISTICS", - "READ MODEL", - "RECODE", - "RECORD TYPE", - "REFORMAT", - "REGRESSION", - "RELIABILITY", - "RENAME VARIABLES", - "REPEATING DATA", - "REPORT", - "REREAD", - "RESTORE", - "RMV", - "ROC", - "SAMPLE", - "SAVE DATA COLLECTION", - "SAVE TRANSLATE", - "SAVE", - "SCRIPT", - "SEASON", - "SELECT IF", - "SELECTPRED", - "SET", - "SHOW", - "SORT CASES", - "SORT VARIABLES", - "SPCHART", - "SPECTRA", - "SPLIT FILE", - "STEMLEAF", - "STRING", - "SUBTITLE", - "SUMMARIZE", - "SURVIVAL", - 
"SYSFILE INFO", - "T-TEST", - "TDISPLAY", - "TEMPORARY", - "TITLE", - "TREE", - "TSAPPLY", - "TSET", - "TSHOW", - "TSMODEL", - "TSPLOT", - "TWOSTEP CLUSTER", - "UNIANOVA", - "UNNUMBERED", - "UPDATE", - "USE", - "VALIDATEDATA", - "VALUE LABELS", - "VARCOMP", - "VARIABLE ALIGNMENT", - "VARIABLE ATTRIBUTE", - "VARIABLE LABELS", - "VARIABLE LEVEL", - "VARIABLE ROLE", - "VARIABLE WIDTH", - "VARSTOCASES", - "VECTOR", - "VERIFY", - "WEIGHT", - "WLS", - "WRITE FORMATS", - "WRITE", - "XEXPORT", - "XGRAPH", - "XSAVE", -]; diff --git a/rust/src/lex/lexer.rs b/rust/src/lex/lexer.rs deleted file mode 100644 index 82ef008aef..0000000000 --- a/rust/src/lex/lexer.rs +++ /dev/null @@ -1,929 +0,0 @@ -use std::{ - borrow::{Borrow, Cow}, - collections::{HashMap, VecDeque}, - fmt::Write, - fs, - io::Result as IoResult, - mem, - ops::{Range, RangeInclusive}, - path::Path, - sync::Arc, -}; - -use chardetng::EncodingDetector; -use encoding_rs::{Encoding, UTF_8}; -use thiserror::Error as ThisError; -use unicode_width::{UnicodeWidthChar, UnicodeWidthStr}; - -use crate::{ - macros::{macro_tokens_to_syntax, MacroSet, ParseStatus, Parser}, - message::{Category, Diagnostic, Location, Point, Severity}, - prompt::PromptStyle, - settings::Settings, -}; - -use super::{ - scan::{MergeResult, ScanError, ScanToken}, - segment::{Mode, Segment, Segmenter}, - token::Token, -}; - -/// Error handling for a [`Reader`]. -#[derive(Copy, Clone, Debug, Default, PartialEq, Eq)] -pub enum ErrorHandling { - /// Discard input line and continue reading. - Terminal, - - /// Continue to next command, except for cascading failures. - #[default] - Continue, - - /// Continue, even for cascading failures. - Ignore, - - /// Stop processing, - Stop, -} - -/// # Token pipeline -/// -/// Tokens pass through a pipeline with the following stages. Each token -/// eventually made available to the parser passes through of these stages. -/// The stages are named after the processing that happens in each one. 
-/// -/// Initially, tokens come from the segmenter and scanner to `pp`: -/// -/// - `pp`: Tokens that need to pass through the macro preprocessor to end up -/// in `merge`. -/// -/// - `merge`: Tokens that need to pass through -/// [`super::scan::ScanToken::merge`] to end up in `parse`. -/// -/// - `parse`: Tokens available to the client for parsing. -/// -/// `pp` and `merge` store tokens only temporarily until they pass into `parse`. -/// Tokens then live in `parse` until the command is fully consumed, at which -/// time they are freed together. -pub struct Source { - /// Error-handling mode. - error_handling: ErrorHandling, - - /// Encoding. - encoding: &'static Encoding, - - /// `None` if this reader is not associated with a file. - file_name: Option>, - - /// True if we've reached EOF already. - eof: bool, - - /// Read some input from the source. If successful, returns the input that - /// was read. At end of file or on error, returns an empty string. - /// - /// `prompt` provides a hint to interactive readers as to what kind of - /// syntax is being read right now. - read: Box String>, - - /// Source file contents. - buffer: String, - - /// 0-based line number of the first line not yet written to the journal. - journal_line: usize, - - /// Byte offset of first character not yet scanned as token. - seg_pos: usize, - - /// Byte offsets into `buffer` of starts of lines. The first element is 0. - lines: Vec, - - /// Tokens that need to pass through the macro preprocessor to end up in - /// `merge`. - pp: VecDeque, - - /// Tokens that need to pass through [`super::scan::ScanToken::merge`] to - /// end up in `parse`. - merge: VecDeque, - - /// Tokens available to the client for parsing. - parse: Vec, - - /// Offset in `parse` of the current token. 
- parse_ofs: usize, - - segmenter: Segmenter, - - suppress_next_newline: bool, -} - -impl Default for Source { - fn default() -> Self { - Self { - error_handling: ErrorHandling::default(), - encoding: UTF_8, - file_name: None, - eof: false, - read: Box::new(|_| String::new()), - buffer: String::new(), - journal_line: 0, - seg_pos: 0, - lines: vec![0], - pp: VecDeque::new(), - merge: VecDeque::new(), - parse: Vec::new(), - parse_ofs: 0, - segmenter: Segmenter::new(Mode::default(), false), - suppress_next_newline: false, - } - } -} - -impl Source { - pub fn for_file

( - path: P, - encoding: Option<&'static Encoding>, - syntax: Mode, - error_handling: ErrorHandling, - ) -> IoResult - where - P: AsRef, - { - let bytes = fs::read(path.as_ref())?; - let encoding = encoding.unwrap_or_else(|| { - let mut encoding_detector = EncodingDetector::new(); - encoding_detector.feed(&bytes, true); - encoding_detector.guess(None, true) - }); - let (contents, _malformed) = encoding.decode_with_bom_removal(&bytes); - Ok(Self::for_file_contents( - contents.to_string(), - Some(path.as_ref().to_string_lossy().to_string()), - encoding, - syntax, - error_handling, - )) - } - - pub fn for_file_contents( - contents: String, - file_name: Option, - encoding: &'static Encoding, - syntax: Mode, - error_handling: ErrorHandling, - ) -> Self { - Self { - buffer: contents, - file_name: file_name.map(Arc::new), - encoding, - error_handling, - segmenter: Segmenter::new(syntax, false), - ..Self::default() - } - } - - pub fn for_string(contents: String, encoding: &'static Encoding) -> Self { - Self { - buffer: contents, - encoding, - ..Self::default() - } - } - - pub fn for_function( - read: Box String>, - file_name: Option, - encoding: &'static Encoding, - syntax: Mode, - error_handling: ErrorHandling, - ) -> Self { - Self { - read, - file_name: file_name.map(Arc::new), - encoding, - segmenter: Segmenter::new(syntax, false), - error_handling, - ..Self::default() - } - } - - fn read(&mut self) { - loop { - let prompt = self.segmenter.prompt(); - let s = (self.read)(prompt); - if s.is_empty() { - self.eof = true; - return; - } - self.buffer.push_str(&s); - if self.buffer[self.seg_pos..].contains('\n') { - return; - } - } - } - fn try_get_pp(&mut self, context: &Context) -> bool { - let (seg_len, seg_type) = loop { - if let Ok(result) = self.segmenter.push(&self.buffer[self.seg_pos..], self.eof) { - break result; - } - - debug_assert!(!self.eof); - self.read(); - }; - - let pos = self.seg_pos..self.seg_pos + seg_len; - self.seg_pos += seg_len; - if seg_type == 
Segment::Newline { - self.lines.push(self.seg_pos); - } - - let scan_token = ScanToken::from_segment(&self.buffer[pos.clone()], seg_type); - - let n_lines = match (seg_type, self.suppress_next_newline) { - (Segment::EndCommand, false) => { - self.suppress_next_newline = true; - 1 - } - (Segment::Newline, true) => { - self.suppress_next_newline = false; - 0 - } - (Segment::Newline, false) => 1, - _ => 0, - }; - for line_num in self.journal_line..self.journal_line + n_lines { - let start_ofs = self.lines[line_num]; - let end_ofs = self - .lines - .get(line_num + 1) - .copied() - .unwrap_or(self.buffer.len()); - let line = &self.buffer[start_ofs..end_ofs]; - let _line = line - .strip_suffix("\r\n") - .unwrap_or(line.strip_suffix('\n').unwrap_or(line)); - // XXX submit the line as syntax - } - self.journal_line += n_lines; - - let pos = pos.start..pos.end; - match scan_token { - None => false, - Some(ScanToken::Token(Token::End)) => { - self.pp.push_back(LexToken { - token: Token::EndCommand, - pos, - macro_rep: None, - }); - self.eof = true; - true - } - Some(ScanToken::Token(token)) => { - self.pp.push_back(LexToken { - token, - pos, - macro_rep: None, - }); - true - } - Some(ScanToken::Error(error)) => { - (context.error)( - Location { - file_name: self.file_name.clone(), - span: Some(self.offset_to_point(pos.start)..self.offset_to_point(pos.end)), - omit_underlines: false, - }, - error.into(), - ); - false - } - } - } - - fn get_pp(&mut self, context: &Context) -> bool { - while !self.eof { - if self.try_get_pp(context) { - return true; - } - } - false - } - - fn try_get_merge(&mut self, context: &Context) -> bool { - if self.pp.is_empty() && !self.get_pp(context) { - return false; - } - - if !Settings::global().macros.expand { - self.merge.append(&mut self.pp); - return true; - } - - // Now pass tokens one-by-one to the macro expander. - let Some(mut parser) = Parser::new(context.macros, &self.pp[0].token) else { - // Common case where there is no macro to expand. 
- self.merge.push_back(self.pp.pop_front().unwrap()); - return true; - }; - for ofs in 1.. { - if self.pp.len() <= ofs && !self.get_pp(context) { - // This should not be reachable because we always get a - // `Token::EndCommand` at the end of an input file, which should - // always terminate macro expansion. - unreachable!(); - } - let token = &self.pp[ofs]; - if parser.push(&token.token, &self.buffer[token.pos.clone()], &|e| { - println!("{e:?}") - }) == ParseStatus::Complete - { - break; - } - } - let call = parser.finish(); - if call.len() == 0 { - // False alarm: no macro to expand after all. - self.merge.push_back(self.pp.pop_front().unwrap()); - return true; - } - - // Expand the tokens. - let c0 = &self.pp[0]; - let c1 = &self.pp[call.len() - 1]; - let mut expansion = Vec::new(); - call.expand( - self.segmenter.mode(), - self.token_location(c0..=c1), - &mut expansion, - |e| println!("{e:?}"), - ); - let retval = !expansion.is_empty(); - - if Settings::global().macros.print_expansions { - // XXX - } - - // Append the macro expansion tokens to the lookahead. - let mut macro_rep = String::new(); - let mut pos = Vec::with_capacity(expansion.len()); - for [prefix, token] in macro_tokens_to_syntax(expansion.as_slice()) { - macro_rep.push_str(prefix); - let len = macro_rep.len(); - pos.push(len..=len + token.len() - 1); - } - let macro_rep = Arc::new(macro_rep); - for (index, token) in expansion.into_iter().enumerate() { - let lt = LexToken { - token: token.token, - pos: c0.pos.start..c1.pos.end, - macro_rep: Some(MacroRepresentation { - expansion: Arc::clone(¯o_rep), - pos: pos[index].clone(), - }), - }; - self.merge.push_back(lt); - } - self.pp.drain(..call.len()); - retval - } - - /// Attempts to obtain at least one new token into `self.merge`. - /// - /// Returns true if successful, false on failure. In the latter case, this source - /// exhausted and 'self.eof' is now true. 
- fn get_merge(&mut self, context: &Context) -> bool { - while !self.eof { - if self.try_get_merge(context) { - return true; - } - } - false - } - - fn get_parse__(&mut self, context: &Context) -> bool { - for i in 0.. { - if self.merge.len() <= i && !self.get_merge(context) { - // We always get a `Token::EndCommand` at the end of an input - // file and the merger should return `Some(...)` for that token. - debug_assert_eq!(self.merge.len(), 0); - return false; - } - - match ScanToken::merge(&self.merge) { - None => (), - Some(MergeResult::Copy) => { - self.parse.push(self.merge.pop_front().unwrap()); - return true; - } - Some(MergeResult::Expand { n, token }) => { - let first = &self.merge[0]; - let last = &self.merge[n - 1]; - self.parse.push(LexToken { - token, - pos: first.pos.start..last.pos.end, - macro_rep: match (&first.macro_rep, &last.macro_rep) { - (Some(a), Some(b)) if Arc::ptr_eq(&a.expansion, &b.expansion) => { - Some(MacroRepresentation { - expansion: a.expansion.clone(), - pos: *a.pos.start()..=*b.pos.end(), - }) - } - _ => None, - }, - }); - self.merge.drain(..n); - return true; - } - } - } - unreachable!(); - } - - fn get_parse(&mut self, context: &Context) -> bool { - // XXX deal with accumulated messages - self.get_parse__(context) - } - - fn offset_to_point(&self, offset: usize) -> Point { - let line = self - .lines - .partition_point(|&line_start| line_start <= offset); - Point { - line: line as i32, - column: Some( - self.buffer - .get(self.lines[line - 1]..offset) - .unwrap_or_default() - .width() as i32 - + 1, - ), - } - } - - /// Returns the syntax for 1-based line-number `line_number`. - fn get_line(&self, line_number: i32) -> &str { - if (1..=self.lines.len() as i32).contains(&line_number) { - let line_number = line_number as usize; - let start = self.lines[line_number - 1]; - let end = self.lines.get(line_number).copied().unwrap_or( - self.buffer[start..] 
- .find('\n') - .map(|ofs| ofs + start) - .unwrap_or(self.buffer.len()), - ); - let line = &self.buffer[start..end]; - line.strip_suffix("\r\n") - .unwrap_or(line.strip_suffix('\n').unwrap_or(line)) - } else { - "" - } - } - - fn token_location(&self, range: RangeInclusive<&LexToken>) -> Location { - Location { - file_name: self.file_name.clone(), - span: Some( - self.offset_to_point(range.start().pos.start) - ..self.offset_to_point(range.end().pos.end), - ), - omit_underlines: false, - } - } - - fn ofs_location(&self, range: RangeInclusive) -> Location { - if *range.start() <= *range.end() && *range.end() < self.parse.len() { - self.token_location(&self.parse[*range.start()]..=&self.parse[*range.end()]) - } else { - Location { - file_name: self.file_name.clone(), - span: None, - omit_underlines: false, - } - } - } - - fn token(&self) -> &Token { - &self.parse[self.parse_ofs].token - } - - fn next(&mut self, offset: isize, context: &Context) -> &Token { - let Some(index) = offset.checked_add(self.parse_ofs as isize) else { - return &Token::EndCommand; - }; - let Ok(index) = usize::try_from(index) else { - return &Token::EndCommand; - }; - - while index >= self.parse.len() { - if let Some(token) = self.parse.last() { - match token.token { - Token::End => return &Token::End, - Token::EndCommand => return &Token::EndCommand, - _ => (), - } - } - self.get_parse(context); - } - &self.parse[index].token - } - - /// If the tokens in `ofs` contains a macro call, this returns the raw - /// syntax for the macro call (not for the expansion) and for any other - /// tokens included in that range. The syntax is encoded in UTF-8 and in - /// the original form supplied to the lexer so that, for example, it may - /// include comments, spaces, and new-lines if it spans multiple tokens. - /// - /// Returns `None` if the token range doesn't include a macro call. 
- fn get_macro_call(&self, ofs: RangeInclusive) -> Option<&str> { - if self - .parse - .get(ofs.clone()) - .unwrap_or_default() - .iter() - .all(|token| token.macro_rep.is_none()) - { - return None; - } - - let token0 = &self.parse[*ofs.start()]; - let token1 = &self.parse[*ofs.end()]; - Some(&self.buffer[token0.pos.start..token1.pos.end]) - } - - fn is_empty(&self) -> bool { - self.buffer.is_empty() && self.eof - } - - fn diagnostic( - &self, - severity: Severity, - ofs: RangeInclusive, - text: String, - ) -> Diagnostic { - let mut s = String::with_capacity(text.len() + 16); - if self.is_empty() { - s.push_str("At end of input: "); - } else if let Some(call) = self.get_macro_call(ofs.clone()) { - write!(&mut s, "In syntax expanded from `{}`: ", ellipsize(call)).unwrap(); - } - - if !text.is_empty() { - s.push_str(&text); - } else { - s.push_str("Syntax error."); - } - - if !s.ends_with('.') { - s.push('.'); - } - - let location = self.ofs_location(ofs); - let mut source = Vec::new(); - if let Some(Range { - start: Point { line: l0, .. }, - end: Point { line: l1, .. }, - }) = location.span - { - let lines = if l1 - l0 > 3 { - vec![l0, l0 + 1, l1] - } else { - (l0..=l1).collect() - }; - for line_number in lines { - source.push((line_number, self.get_line(line_number).to_string())); - } - } - - Diagnostic { - category: Category::Syntax, - severity, - location, - source, - stack: Vec::new(), - command_name: None, // XXX - text: s, - } - } - - fn interactive_reset(&mut self) { - if self.error_handling == ErrorHandling::Terminal { - let Source { - error_handling, - encoding, - read, - .. 
- } = mem::take(self); - *self = Self { - error_handling, - encoding, - read, - ..Source::default() - }; - } - } -} - -fn ellipsize(s: &str) -> Cow { - if s.width() > 64 { - let mut out = String::new(); - let mut width = 0; - for c in s.chars() { - out.push(c); - width += c.width().unwrap_or(0); - if width > 64 { - break; - } - } - out.push_str("..."); - Cow::from(out) - } else { - Cow::from(s) - } -} - -/// A token in a [`Source`]. -struct LexToken { - /// The regular token. - token: Token, - - /// For a token obtained through the lexer in an ordinary way, this is the - /// location of the token in the [`Source`]'s buffer. - /// - /// For a token produced through macro expansion, this is the entire macro - /// call. - pos: Range, - - /// For a token obtained through macro expansion, the part of the macro - /// expansion that represents this token. - /// - /// For a token obtained through the lexer in an ordinary way, this is - /// `None`. - macro_rep: Option, -} - -impl Borrow for LexToken { - fn borrow(&self) -> &Token { - &self.token - } -} - -struct MacroRepresentation { - /// An entire macro expansion. - expansion: Arc, - - /// The substring of `expansion` that represents a single token. 
- pos: RangeInclusive, -} - -pub struct Lexer { - source: Source, - stack: Vec, - macros: MacroSet, - error: Box, -} - -struct Context<'a> { - macros: &'a MacroSet, - error: &'a Box, -} - -impl Lexer { - pub fn new(error: Box) -> Self { - Self { - source: Source::default(), - stack: Vec::new(), - macros: HashMap::new(), - error, - } - } - - pub fn get(&mut self) -> &Token { - if self.source.parse_ofs < self.source.parse.len() { - if let Token::EndCommand = self.source.token() { - self.source.parse.clear(); - self.source.parse_ofs = 0; - } else { - self.source.parse_ofs += 1; - } - } - - while self.source.parse_ofs == self.source.parse.len() { - let context = Context { - macros: &self.macros, - error: &self.error, - }; - if !self.source.get_parse(&context) && !self.pop_stack() { - return &Token::End; - } - } - self.source.token() - } - - fn pop_stack(&mut self) -> bool { - if let Some(new_source) = self.stack.pop() { - self.source = new_source; - true - } else { - self.source = Source::default(); - self.source.parse.push(LexToken { - token: Token::End, - pos: 0..0, - macro_rep: None, - }); - false - } - } - - /// Inserts `source` so that the next token comes from it. This is only - /// permitted when the lexer is either empty or at `Token::EndCommand`. - pub fn include(&mut self, mut source: Source) { - // XXX what's the right assertion? - let context = Context { - macros: &self.macros, - error: &self.error, - }; - source.get_parse(&context); - let old_source = mem::replace(&mut self.source, source); - self.stack.push(old_source); - } - - /// Inserts `source` so that it will be read after all the other sources. 
- pub fn append(&mut self, mut source: Source) { - let context = Context { - macros: &self.macros, - error: &self.error, - }; - source.get_parse(&context); - self.stack.insert(0, source); - } - - pub fn token(&self) -> &Token { - self.source.token() - } - - pub fn next(&mut self, offset: isize) -> &Token { - let context = Context { - macros: &self.macros, - error: &self.error, - }; - self.source.next(offset, &context) - } - - pub fn error(&self, text: S) -> Diagnostic - where - S: ToString, - { - self.diagnostic( - Severity::Error, - self.source.parse_ofs..=self.source.parse_ofs, - text, - ) - } - - pub fn diagnostic( - &self, - severity: Severity, - ofs: RangeInclusive, - text: S, - ) -> Diagnostic - where - S: ToString, - { - self.source.diagnostic(severity, ofs, text.to_string()) - } - - pub fn error_handling(&self) -> ErrorHandling { - self.source.error_handling - } - - /// Discards all lookahead tokens, then discards all input sources - /// until it encounters one with error mode [ErrorHandling::Terminal] or until it - /// runs out of input sources. - pub fn discard_noninteractive(&mut self) { - while self.source.error_handling != ErrorHandling::Ignore { - self.source.pp.clear(); - self.source.merge.clear(); - self.source.parse.clear(); - self.source.parse_ofs = 0; - - if self.source.error_handling == ErrorHandling::Terminal || !self.pop_stack() { - return; - } - } - } - - /// If the source that the lexer is currently reading has error mode - /// [ErrorHandling::Terminal], discards all buffered input and tokens, so - /// that the next token to be read comes directly from whatever is next read - /// from the stream. - /// - /// It makes sense to call this function after encountering an error in a - /// command entered on the console, because usually the user would prefer - /// not to have cascading errors. - pub fn interactive_reset(&mut self) { - self.source.interactive_reset() - } - - /// Advances past any tokens up to [Token::EndCommand] or [Token::End]. 
- pub fn discard_rest_of_command(&mut self) { - while !matches!(self.token(), Token::EndCommand | Token::End) { - self.get(); - } - } -} - -#[derive(ThisError, Clone, Debug, PartialEq, Eq)] -pub enum Error { - /// Error forming tokens from the input. - #[error("{0}")] - TokenError(#[from] ScanError), -} - -#[cfg(test)] -mod tests { - use encoding_rs::UTF_8; - - use crate::lex::{segment::Mode, token::Token}; - - use super::{ErrorHandling, Lexer, Source}; - - #[test] - fn test() { - let mut lexer = Lexer::new(Box::new(|location, error| println!("{location}: {error}"))); - lexer.include(Source::for_string( - String::from( - r#"#! /usr/local/bin/pspp -DATA LIST LIST NOTABLE /a. -BEGIN DATA. -1 -2 -END DATA. -LIST. -"#, - ), - UTF_8, - )); - loop { - lexer.get(); - let token = lexer.token(); - println!("{token:?}"); - if let Token::End = token { - break; - } - } - } - - #[test] - fn test_scan_errors() { - let mut lexer = Lexer::new(Box::new(|location, error| println!("{location}: {error}"))); - lexer.include(Source::for_file_contents( - String::from( - r#"x'123' -x'1x' -u'' -u'012345678' -u'd800' -u'110000' -'foo -'very long unterminated string that be ellipsized in its error message -1e .x -^ -� -"#, - ), - Some(String::from("syntax.sps")), - UTF_8, - Mode::default(), - ErrorHandling::default(), - )); - loop { - lexer.get(); - let token = lexer.token(); - println!("{token:?}"); - if let Token::End = token { - break; - } - } - } - - #[test] - fn test_null_byte() { - let mut lexer = Lexer::new(Box::new(|location, error| println!("{location}: {error}"))); - lexer.include(Source::for_file_contents( - String::from( - "datA dist list notable file='input.txt'/a b c. 
-lis|.\0", - ), - Some(String::from("syntax.sps")), - UTF_8, - Mode::default(), - ErrorHandling::default(), - )); - loop { - lexer.get(); - let token = lexer.token(); - println!("{token:?}"); - if let Token::End = token { - break; - } - } - } -} diff --git a/rust/src/lex/mod.rs b/rust/src/lex/mod.rs deleted file mode 100644 index e87b088cf4..0000000000 --- a/rust/src/lex/mod.rs +++ /dev/null @@ -1,17 +0,0 @@ -//! PSPP syntax scanning. -//! -//! PSPP divides traditional "lexical analysis" or "tokenization" into two -//! phases: a lower-level phase called "segmentation" and a higher-level phase -//! called "scanning". [super::segment] implements the segmentation phase and -//! this module the scanning phase. -//! -//! Scanning accepts as input a stream of segments, which are UTF-8 strings each -//! labeled with a segment type. It outputs a stream of "scan tokens", which -//! are the same as the tokens used by the PSPP parser with a few additional -//! types. - -pub mod segment; -pub mod scan; -pub mod command_name; -pub mod token; -pub mod lexer; diff --git a/rust/src/lex/scan/mod.rs b/rust/src/lex/scan/mod.rs deleted file mode 100644 index 05577a9259..0000000000 --- a/rust/src/lex/scan/mod.rs +++ /dev/null @@ -1,416 +0,0 @@ -//! PSPP lexical analysis. -//! -//! PSPP divides traditional "lexical analysis" or "tokenization" into two -//! phases: a lower-level phase called "segmentation" and a higher-level phase -//! called "scanning". [segment] implements the segmentation phase and [scan] -//! the scanning phase. -//! -//! Scanning accepts as input a stream of segments, which are UTF-8 strings each -//! labeled with a segment type. It outputs a stream of "scan tokens", which -//! are the same as the tokens used by the PSPP parser with a few additional -//! types. 
- -use crate::identifier::{Identifier, ReservedWord}; - -use super::{ - segment::{Mode, Segment, Segmenter}, - token::{Punct, Token}, -}; -use std::{borrow::Borrow, collections::VecDeque}; -use thiserror::Error as ThisError; - -#[derive(ThisError, Clone, Debug, PartialEq, Eq)] -pub enum ScanError { - /// Unterminated string constant. - #[error("Unterminated string constant.")] - ExpectedQuote, - - /// Missing exponent. - #[error("Missing exponent following `{0}`")] - ExpectedExponent(String), - - /// Odd length hex string. - #[error("String of hex digits has {0} characters, which is not a multiple of 2.")] - OddLengthHexString(usize), - - /// Invalid hex digit. - #[error("Invalid hex digit {0:?}.")] - BadHexDigit(char), - - /// Incomplete UTF-8 sequence. - #[error("Incomplete UTF-8 sequence `{substring}` starting {offset} digits into hex string.")] - IncompleteUtf8 { substring: String, offset: usize }, - - /// Bad UTF-8 sequence. - #[error("Invalid UTF-8 sequence `{substring}` starting {offset} digits into hex string.")] - BadUtf8 { substring: String, offset: usize }, - - /// Invalid length Unicode string. - #[error("Unicode string contains {0} bytes, which is not in the valid range of 1 to 8 bytes.")] - BadLengthUnicodeString(usize), - - /// Invalid code point. - #[error("U+{0:04X} is not a valid Unicode code point.")] - BadCodePoint(u32), - - /// Expected hexadecimal Unicode code point - #[error("Expected hexadecimal Unicode code point.")] - ExpectedCodePoint, - - /// `DO REPEAT` nested too deeply. - #[error("`DO REPEAT` nested too deeply.")] - DoRepeatOverflow, - - /// Unexpected character. - #[error("Unexpected character {0:?} in input.")] - UnexpectedChar(char), -} - -/// The input or output to token merging. -#[derive(Clone, Debug, PartialEq)] -pub enum ScanToken { - Token(Token), - Error(ScanError), -} - -/// The result of merging tokens. -#[derive(Clone, Debug)] -pub enum MergeResult { - /// Copy one token literally from input to output. 
- Copy, - - /// Expand `n` tokens from the input into `token` in the output. - Expand { - /// Number of tokens to expand. - n: usize, - - /// Replacement token. - token: Token, - }, -} - -impl ScanToken { - pub fn from_segment(s: &str, segment: Segment) -> Option { - match segment { - Segment::Number => Some(Self::Token(Token::Number(s.parse().unwrap()))), - Segment::QuotedString => { - // Trim quote mark from front and back. - let mut chars = s.chars(); - let quote = chars.next().unwrap(); - let s = chars.as_str().strip_suffix(quote).unwrap(); - - // Replace doubled quotes by single ones. - let (single_quote, double_quote) = match quote { - '\'' => ("'", "''"), - '"' => ("\"", "\"\""), - _ => unreachable!(), - }; - Some(Self::Token(Token::String( - s.replace(double_quote, single_quote), - ))) - } - Segment::HexString => { - // Strip `X"` prefix and `"` suffix (or variations). - let s = &s[2..s.len() - 1]; - for c in s.chars() { - if !c.is_ascii_hexdigit() { - return Some(Self::Error(ScanError::BadHexDigit(c))); - } - } - if s.len() % 2 != 0 { - return Some(Self::Error(ScanError::OddLengthHexString(s.len()))); - } - let bytes = s - .as_bytes() - .chunks_exact(2) - .map(|pair| { - let hi = char::from(pair[0]).to_digit(16).unwrap() as u8; - let lo = char::from(pair[1]).to_digit(16).unwrap() as u8; - hi * 16 + lo - }) - .collect::>(); - match String::from_utf8(bytes) { - Ok(string) => Some(Self::Token(Token::String(string))), - Err(error) => { - let details = error.utf8_error(); - let offset = details.valid_up_to() * 2; - let end = details - .error_len() - .map(|len| offset + len * 2) - .unwrap_or(s.len()); - let substring = String::from(&s[offset..end]); - Some(Self::Error(if details.error_len().is_some() { - ScanError::BadUtf8 { substring, offset } - } else { - ScanError::IncompleteUtf8 { substring, offset } - })) - } - } - } - Segment::UnicodeString => { - // Strip `U"` prefix and `"` suffix (or variations). 
- let s = &s[2..s.len() - 1]; - if !(1..=8).contains(&s.len()) { - return Some(Self::Error(ScanError::BadLengthUnicodeString(s.len()))); - } - let Ok(code_point) = u32::from_str_radix(s, 16) else { - return Some(Self::Error(ScanError::ExpectedCodePoint)); - }; - let Some(c) = char::from_u32(code_point) else { - return Some(Self::Error(ScanError::BadCodePoint(code_point))); - }; - Some(Self::Token(Token::String(String::from(c)))) - } - - Segment::UnquotedString - | Segment::DoRepeatCommand - | Segment::InlineData - | Segment::Document - | Segment::MacroBody - | Segment::MacroName => Some(Self::Token(Token::String(String::from(s)))), - - Segment::Identifier => { - if let Ok(reserved_word) = ReservedWord::try_from(s) { - match reserved_word { - ReservedWord::And => Some(Self::Token(Token::Punct(Punct::And))), - ReservedWord::Or => Some(Self::Token(Token::Punct(Punct::Or))), - ReservedWord::Not => Some(Self::Token(Token::Punct(Punct::Not))), - ReservedWord::Eq => Some(Self::Token(Token::Punct(Punct::Eq))), - ReservedWord::Ge => Some(Self::Token(Token::Punct(Punct::Ge))), - ReservedWord::Gt => Some(Self::Token(Token::Punct(Punct::Gt))), - ReservedWord::Le => Some(Self::Token(Token::Punct(Punct::Le))), - ReservedWord::Lt => Some(Self::Token(Token::Punct(Punct::Lt))), - ReservedWord::Ne => Some(Self::Token(Token::Punct(Punct::Ne))), - ReservedWord::All => Some(Self::Token(Token::Punct(Punct::All))), - ReservedWord::By => Some(Self::Token(Token::Punct(Punct::By))), - ReservedWord::To => Some(Self::Token(Token::Punct(Punct::To))), - ReservedWord::With => Some(Self::Token(Token::Punct(Punct::With))), - } - } else { - Some(Self::Token(Token::Id(Identifier::new(s).unwrap()))) - } - } - Segment::Punct => match s { - "(" => Some(Self::Token(Token::Punct(Punct::LParen))), - ")" => Some(Self::Token(Token::Punct(Punct::RParen))), - "[" => Some(Self::Token(Token::Punct(Punct::LSquare))), - "]" => Some(Self::Token(Token::Punct(Punct::RSquare))), - "{" => 
Some(Self::Token(Token::Punct(Punct::LCurly))), - "}" => Some(Self::Token(Token::Punct(Punct::RCurly))), - "," => Some(Self::Token(Token::Punct(Punct::Comma))), - "=" => Some(Self::Token(Token::Punct(Punct::Equals))), - "-" => Some(Self::Token(Token::Punct(Punct::Dash))), - "&" => Some(Self::Token(Token::Punct(Punct::And))), - "|" => Some(Self::Token(Token::Punct(Punct::Or))), - "+" => Some(Self::Token(Token::Punct(Punct::Plus))), - "/" => Some(Self::Token(Token::Punct(Punct::Slash))), - "*" => Some(Self::Token(Token::Punct(Punct::Asterisk))), - "<" => Some(Self::Token(Token::Punct(Punct::Lt))), - ">" => Some(Self::Token(Token::Punct(Punct::Gt))), - "~" => Some(Self::Token(Token::Punct(Punct::Not))), - ":" => Some(Self::Token(Token::Punct(Punct::Colon))), - ";" => Some(Self::Token(Token::Punct(Punct::Semicolon))), - "**" => Some(Self::Token(Token::Punct(Punct::Exp))), - "<=" => Some(Self::Token(Token::Punct(Punct::Le))), - "<>" => Some(Self::Token(Token::Punct(Punct::Ne))), - "~=" => Some(Self::Token(Token::Punct(Punct::Ne))), - ">=" => Some(Self::Token(Token::Punct(Punct::Ge))), - "!" => Some(Self::Token(Token::Punct(Punct::Bang))), - "%" => Some(Self::Token(Token::Punct(Punct::Percent))), - "?" => Some(Self::Token(Token::Punct(Punct::Question))), - "`" => Some(Self::Token(Token::Punct(Punct::Backtick))), - "_" => Some(Self::Token(Token::Punct(Punct::Underscore))), - "." 
=> Some(Self::Token(Token::Punct(Punct::Dot))), - "!*" => Some(Self::Token(Token::Punct(Punct::BangAsterisk))), - _ => unreachable!("bad punctuator {s:?}"), - }, - Segment::Shbang - | Segment::Spaces - | Segment::Comment - | Segment::Newline - | Segment::CommentCommand => None, - Segment::DoRepeatOverflow => Some(Self::Error(ScanError::DoRepeatOverflow)), - Segment::StartDocument => { - Some(Self::Token(Token::Id(Identifier::new("DOCUMENT").unwrap()))) - } - Segment::StartCommand | Segment::SeparateCommands | Segment::EndCommand => { - Some(Self::Token(Token::EndCommand)) - } - Segment::End => Some(Self::Token(Token::End)), - Segment::ExpectedQuote => Some(Self::Error(ScanError::ExpectedQuote)), - Segment::ExpectedExponent => { - Some(Self::Error(ScanError::ExpectedExponent(String::from(s)))) - } - Segment::UnexpectedChar => Some(Self::Error(ScanError::UnexpectedChar( - s.chars().next().unwrap(), - ))), - } - } - - /// Attempts to merge a sequence of tokens together into a single token. The - /// tokens are taken from the beginning of `input`. If successful, removes one - /// or more token from the beginning of `input` and returnss the merged - /// token. More input tokens might be needed; if so, leaves `input` alone and - /// returns `None`. In the latter case, the caller should add more tokens to the - /// input ([Token::End] or [Token::Punct(Punct::EndCmd)] is always sufficient). - /// - /// This performs two different kinds of token merging: - /// - /// - String concatenation, where syntax like `"a" + "b"` is converted into a - /// single string token. This is definitely needed because the parser relies - /// on it. - /// - /// - Negative number merging, where syntax like `-5` is converted from a pair - /// of tokens (a dash and a positive number) into a single token (a negative - /// number). 
This might not be needed anymore because the segmenter - /// directly treats a dash followed by a number, with optional intervening - /// white space, as a negative number. It's only needed if we want - /// intervening comments to be allowed or for part of the negative number - /// token to be produced by macro expansion. - pub fn merge(tokens: &T) -> Option - where - T: Tokens, - { - match tokens.get(0)? { - Token::Punct(Punct::Dash) => match tokens.get(1)? { - Token::Number(number) if number.is_sign_positive() => { - let number = *number; - return Some(MergeResult::Expand { - n: 2, - token: Token::Number(-number), - }); - } - _ => Some(MergeResult::Copy), - }, - Token::String(_) => { - let mut i = 0; - while matches!(tokens.get(i * 2 + 1)?, Token::Punct(Punct::Plus)) - && matches!(tokens.get(i * 2 + 2)?, Token::String(_)) - { - i += 1; - } - if i == 0 { - Some(MergeResult::Copy) - } else { - let mut output = String::new(); - for i in 0..=i { - let Token::String(s) = tokens.get(i * 2).unwrap() else { - unreachable!() - }; - output.push_str(&s); - } - Some(MergeResult::Expand { - n: i * 2 + 1, - token: Token::String(output), - }) - } - } - _ => Some(MergeResult::Copy), - } - } -} - -pub trait Tokens { - fn get(&self, index: usize) -> Option<&Token>; -} - -impl Tokens for VecDeque -where - T: Borrow, -{ - fn get(&self, index: usize) -> Option<&Token> { - self.get(index).map(|token| token.borrow()) - } -} - -pub struct StringSegmenter<'a> { - input: &'a str, - segmenter: Segmenter, -} - -impl<'a> StringSegmenter<'a> { - pub fn new(input: &'a str, mode: Mode, is_snippet: bool) -> Self { - Self { - input, - segmenter: Segmenter::new(mode, is_snippet), - } - } -} - -impl<'a> Iterator for StringSegmenter<'a> { - type Item = (&'a str, ScanToken); - - fn next(&mut self) -> Option { - loop { - let (seg_len, seg_type) = self.segmenter.push(self.input, true).unwrap(); - if seg_type == Segment::End { - return None; - } - let (s, rest) = self.input.split_at(seg_len); - 
self.input = rest; - - if let Some(token) = ScanToken::from_segment(s, seg_type) { - return Some((s, token)); - } - } - } -} - -pub struct StringScanner<'a> { - input: &'a str, - segmenter: Segmenter, - tokens: VecDeque, -} - -impl<'a> StringScanner<'a> { - pub fn new(input: &'a str, mode: Mode, is_snippet: bool) -> Self { - Self { - input, - segmenter: Segmenter::new(mode, is_snippet), - tokens: VecDeque::with_capacity(1), - } - } - - fn merge(&mut self) -> Option { - let result = ScanToken::merge(&self.tokens)?; - match result { - MergeResult::Copy => Some(ScanToken::Token(self.tokens.pop_front().unwrap())), - MergeResult::Expand { n, token } => { - self.tokens.drain(..n); - Some(ScanToken::Token(token)) - } - } - } -} - -impl<'a> Iterator for StringScanner<'a> { - type Item = ScanToken; - - fn next(&mut self) -> Option { - if let Some(token) = self.merge() { - return Some(token); - } - loop { - let (seg_len, seg_type) = self.segmenter.push(self.input, true).unwrap(); - if seg_type == Segment::End && self.tokens.is_empty() { - return None; - } - let (s, rest) = self.input.split_at(seg_len); - self.input = rest; - - match ScanToken::from_segment(s, seg_type) { - Some(ScanToken::Error(error)) => return Some(ScanToken::Error(error)), - Some(ScanToken::Token(token)) => { - self.tokens.push_back(token); - if let Some(token) = self.merge() { - return Some(token); - } - } - None => (), - } - } - } -} - -#[cfg(test)] -mod test; diff --git a/rust/src/lex/scan/test.rs b/rust/src/lex/scan/test.rs deleted file mode 100644 index 0ed9be6555..0000000000 --- a/rust/src/lex/scan/test.rs +++ /dev/null @@ -1,1017 +0,0 @@ -use crate::{identifier::Identifier, lex::{ - segment::Mode, - token::{Punct, Token}, -}}; - -use super::{ScanError, ScanToken, StringScanner}; - -fn print_token(token: &Token) { - match token { - Token::End => print!("Token::End"), - Token::Id(s) => print!("Token::Id(String::from({s:?}))"), - Token::Number(number) => print!("Token::Number({number:?})"), - 
Token::String(s) => print!("Token::String(String::from({s:?}))"), - Token::EndCommand => print!("Token::EndCommand"), - Token::Punct(punct) => print!("Token::Punct(Punct::{punct:?})"), - } -} - -fn check_scan(input: &str, mode: Mode, expected: &[ScanToken]) { - let tokens = StringScanner::new(input, mode, false).collect::>(); - - if &tokens != expected { - for token in &tokens { - match token { - ScanToken::Token(token) => { - print!("ScanToken::Token("); - print_token(token); - print!(")"); - } - ScanToken::Error(error) => print!("ScanToken::Error(ScanError::{error:?})"), - } - println!(","); - } - - eprintln!("tokens differ from expected:"); - let difference = diff::slice(expected, &tokens); - for result in difference { - match result { - diff::Result::Left(left) => eprintln!("-{left:?}"), - diff::Result::Both(left, _right) => eprintln!(" {left:?}"), - diff::Result::Right(right) => eprintln!("+{right:?}"), - } - } - panic!(); - } -} - -#[test] -fn test_identifiers() { - check_scan( - r#"a aB i5 $x @efg @@. !abcd !* !*a #.# .x _z. -abcd. abcd. -QRSTUV./* end of line comment */ -QrStUv./* end of line comment */ -WXYZ. /* unterminated end of line comment -�. 
/* U+FFFD is not valid in an identifier -"#, - Mode::Auto, - &[ - ScanToken::Token(Token::Id(Identifier::new("a").unwrap())), - ScanToken::Token(Token::Id(Identifier::new("aB").unwrap())), - ScanToken::Token(Token::Id(Identifier::new("i5").unwrap())), - ScanToken::Token(Token::Id(Identifier::new("$x").unwrap())), - ScanToken::Token(Token::Id(Identifier::new("@efg").unwrap())), - ScanToken::Token(Token::Id(Identifier::new("@@.").unwrap())), - ScanToken::Token(Token::Id(Identifier::new("!abcd").unwrap())), - ScanToken::Token(Token::Punct(Punct::BangAsterisk)), - ScanToken::Token(Token::Punct(Punct::BangAsterisk)), - ScanToken::Token(Token::Id(Identifier::new("a").unwrap())), - ScanToken::Token(Token::Id(Identifier::new("#.#").unwrap())), - ScanToken::Token(Token::Punct(Punct::Dot)), - ScanToken::Token(Token::Id(Identifier::new("x").unwrap())), - ScanToken::Token(Token::Punct(Punct::Underscore)), - ScanToken::Token(Token::Id(Identifier::new("z").unwrap())), - ScanToken::Token(Token::EndCommand), - ScanToken::Token(Token::Id(Identifier::new("abcd.").unwrap())), - ScanToken::Token(Token::Id(Identifier::new("abcd").unwrap())), - ScanToken::Token(Token::EndCommand), - ScanToken::Token(Token::Id(Identifier::new("QRSTUV").unwrap())), - ScanToken::Token(Token::EndCommand), - ScanToken::Token(Token::Id(Identifier::new("QrStUv").unwrap())), - ScanToken::Token(Token::EndCommand), - ScanToken::Token(Token::Id(Identifier::new("WXYZ").unwrap())), - ScanToken::Token(Token::EndCommand), - ScanToken::Error(ScanError::UnexpectedChar('�')), - ScanToken::Token(Token::EndCommand), - ], - ); -} - -#[test] -fn test_reserved_words() { - check_scan( - r#"and or not eq ge gt le lt ne all by to with -AND OR NOT EQ GE GT LE LT NE ALL BY TO WITH -andx orx notx eqx gex gtx lex ltx nex allx byx tox withx -and. with. 
-"#, - Mode::Auto, - &[ - ScanToken::Token(Token::Punct(Punct::And)), - ScanToken::Token(Token::Punct(Punct::Or)), - ScanToken::Token(Token::Punct(Punct::Not)), - ScanToken::Token(Token::Punct(Punct::Eq)), - ScanToken::Token(Token::Punct(Punct::Ge)), - ScanToken::Token(Token::Punct(Punct::Gt)), - ScanToken::Token(Token::Punct(Punct::Le)), - ScanToken::Token(Token::Punct(Punct::Lt)), - ScanToken::Token(Token::Punct(Punct::Ne)), - ScanToken::Token(Token::Punct(Punct::All)), - ScanToken::Token(Token::Punct(Punct::By)), - ScanToken::Token(Token::Punct(Punct::To)), - ScanToken::Token(Token::Punct(Punct::With)), - ScanToken::Token(Token::Punct(Punct::And)), - ScanToken::Token(Token::Punct(Punct::Or)), - ScanToken::Token(Token::Punct(Punct::Not)), - ScanToken::Token(Token::Punct(Punct::Eq)), - ScanToken::Token(Token::Punct(Punct::Ge)), - ScanToken::Token(Token::Punct(Punct::Gt)), - ScanToken::Token(Token::Punct(Punct::Le)), - ScanToken::Token(Token::Punct(Punct::Lt)), - ScanToken::Token(Token::Punct(Punct::Ne)), - ScanToken::Token(Token::Punct(Punct::All)), - ScanToken::Token(Token::Punct(Punct::By)), - ScanToken::Token(Token::Punct(Punct::To)), - ScanToken::Token(Token::Punct(Punct::With)), - ScanToken::Token(Token::Id(Identifier::new("andx").unwrap())), - ScanToken::Token(Token::Id(Identifier::new("orx").unwrap())), - ScanToken::Token(Token::Id(Identifier::new("notx").unwrap())), - ScanToken::Token(Token::Id(Identifier::new("eqx").unwrap())), - ScanToken::Token(Token::Id(Identifier::new("gex").unwrap())), - ScanToken::Token(Token::Id(Identifier::new("gtx").unwrap())), - ScanToken::Token(Token::Id(Identifier::new("lex").unwrap())), - ScanToken::Token(Token::Id(Identifier::new("ltx").unwrap())), - ScanToken::Token(Token::Id(Identifier::new("nex").unwrap())), - ScanToken::Token(Token::Id(Identifier::new("allx").unwrap())), - ScanToken::Token(Token::Id(Identifier::new("byx").unwrap())), - ScanToken::Token(Token::Id(Identifier::new("tox").unwrap())), - 
ScanToken::Token(Token::Id(Identifier::new("withx").unwrap())), - ScanToken::Token(Token::Id(Identifier::new("and.").unwrap())), - ScanToken::Token(Token::Punct(Punct::With)), - ScanToken::Token(Token::EndCommand), - ], - ); -} - -#[test] -fn test_punctuation() { - check_scan( - r#"~ & | = >= > <= < ~= <> ( ) , - + * / [ ] ** -~&|=>=><=<~=<>(),-+*/[]** -% : ; ? _ ` { } ~ -"#, - Mode::Auto, - &[ - ScanToken::Token(Token::Punct(Punct::Not)), - ScanToken::Token(Token::Punct(Punct::And)), - ScanToken::Token(Token::Punct(Punct::Or)), - ScanToken::Token(Token::Punct(Punct::Equals)), - ScanToken::Token(Token::Punct(Punct::Ge)), - ScanToken::Token(Token::Punct(Punct::Gt)), - ScanToken::Token(Token::Punct(Punct::Le)), - ScanToken::Token(Token::Punct(Punct::Lt)), - ScanToken::Token(Token::Punct(Punct::Ne)), - ScanToken::Token(Token::Punct(Punct::Ne)), - ScanToken::Token(Token::Punct(Punct::LParen)), - ScanToken::Token(Token::Punct(Punct::RParen)), - ScanToken::Token(Token::Punct(Punct::Comma)), - ScanToken::Token(Token::Punct(Punct::Dash)), - ScanToken::Token(Token::Punct(Punct::Plus)), - ScanToken::Token(Token::Punct(Punct::Asterisk)), - ScanToken::Token(Token::Punct(Punct::Slash)), - ScanToken::Token(Token::Punct(Punct::LSquare)), - ScanToken::Token(Token::Punct(Punct::RSquare)), - ScanToken::Token(Token::Punct(Punct::Exp)), - ScanToken::Token(Token::Punct(Punct::Not)), - ScanToken::Token(Token::Punct(Punct::And)), - ScanToken::Token(Token::Punct(Punct::Or)), - ScanToken::Token(Token::Punct(Punct::Equals)), - ScanToken::Token(Token::Punct(Punct::Ge)), - ScanToken::Token(Token::Punct(Punct::Gt)), - ScanToken::Token(Token::Punct(Punct::Le)), - ScanToken::Token(Token::Punct(Punct::Lt)), - ScanToken::Token(Token::Punct(Punct::Ne)), - ScanToken::Token(Token::Punct(Punct::Ne)), - ScanToken::Token(Token::Punct(Punct::LParen)), - ScanToken::Token(Token::Punct(Punct::RParen)), - ScanToken::Token(Token::Punct(Punct::Comma)), - ScanToken::Token(Token::Punct(Punct::Dash)), - 
ScanToken::Token(Token::Punct(Punct::Plus)), - ScanToken::Token(Token::Punct(Punct::Asterisk)), - ScanToken::Token(Token::Punct(Punct::Slash)), - ScanToken::Token(Token::Punct(Punct::LSquare)), - ScanToken::Token(Token::Punct(Punct::RSquare)), - ScanToken::Token(Token::Punct(Punct::Exp)), - ScanToken::Token(Token::Punct(Punct::Percent)), - ScanToken::Token(Token::Punct(Punct::Colon)), - ScanToken::Token(Token::Punct(Punct::Semicolon)), - ScanToken::Token(Token::Punct(Punct::Question)), - ScanToken::Token(Token::Punct(Punct::Underscore)), - ScanToken::Token(Token::Punct(Punct::Backtick)), - ScanToken::Token(Token::Punct(Punct::LCurly)), - ScanToken::Token(Token::Punct(Punct::RCurly)), - ScanToken::Token(Token::Punct(Punct::Not)), - ], - ); -} - -#[test] -fn test_positive_numbers() { - check_scan( - r#"0 1 01 001. 1. -123. /* comment 1 */ /* comment 2 */ -.1 0.1 00.1 00.10 -5e1 6E-1 7e+1 6E+01 6e-03 -.3E1 .4e-1 .5E+1 .6e+01 .7E-03 -1.23e1 45.6E-1 78.9e+1 99.9E+01 11.2e-03 -. 1e e1 1e+ 1e- -"#, - Mode::Auto, - &[ - ScanToken::Token(Token::Number(0.0)), - ScanToken::Token(Token::Number(1.0)), - ScanToken::Token(Token::Number(1.0)), - ScanToken::Token(Token::Number(1.0)), - ScanToken::Token(Token::Number(1.0)), - ScanToken::Token(Token::EndCommand), - ScanToken::Token(Token::Number(123.0)), - ScanToken::Token(Token::EndCommand), - ScanToken::Token(Token::EndCommand), - ScanToken::Token(Token::Number(1.0)), - ScanToken::Token(Token::Number(0.1)), - ScanToken::Token(Token::Number(0.1)), - ScanToken::Token(Token::Number(0.1)), - ScanToken::Token(Token::Number(50.0)), - ScanToken::Token(Token::Number(0.6)), - ScanToken::Token(Token::Number(70.0)), - ScanToken::Token(Token::Number(60.0)), - ScanToken::Token(Token::Number(0.006)), - ScanToken::Token(Token::EndCommand), - ScanToken::Token(Token::Number(30.0)), - ScanToken::Token(Token::Number(0.04)), - ScanToken::Token(Token::Number(5.0)), - ScanToken::Token(Token::Number(6.0)), - ScanToken::Token(Token::Number(0.0007)), - 
ScanToken::Token(Token::Number(12.3)), - ScanToken::Token(Token::Number(4.56)), - ScanToken::Token(Token::Number(789.0)), - ScanToken::Token(Token::Number(999.0)), - ScanToken::Token(Token::Number(0.0112)), - ScanToken::Token(Token::EndCommand), - ScanToken::Error(ScanError::ExpectedExponent(String::from("1e"))), - ScanToken::Token(Token::Id(Identifier::new("e1").unwrap())), - ScanToken::Error(ScanError::ExpectedExponent(String::from("1e+"))), - ScanToken::Error(ScanError::ExpectedExponent(String::from("1e-"))), - ], - ); -} - -#[test] -fn test_negative_numbers() { - check_scan( - r#" -0 -1 -01 -001. -1. - -123. /* comment 1 */ /* comment 2 */ - -.1 -0.1 -00.1 -00.10 - -5e1 -6E-1 -7e+1 -6E+01 -6e-03 - -.3E1 -.4e-1 -.5E+1 -.6e+01 -.7E-03 - -1.23e1 -45.6E-1 -78.9e+1 -99.9E+01 -11.2e-03 - -/**/1 - -. -1e -e1 -1e+ -1e- -1. -"#, - Mode::Auto, - &[ - ScanToken::Token(Token::Number(-0.0)), - ScanToken::Token(Token::Number(-1.0)), - ScanToken::Token(Token::Number(-1.0)), - ScanToken::Token(Token::Number(-1.0)), - ScanToken::Token(Token::Number(-1.0)), - ScanToken::Token(Token::EndCommand), - ScanToken::Token(Token::Number(-123.0)), - ScanToken::Token(Token::EndCommand), - ScanToken::Token(Token::Number(-0.1)), - ScanToken::Token(Token::Number(-0.1)), - ScanToken::Token(Token::Number(-0.1)), - ScanToken::Token(Token::Number(-0.1)), - ScanToken::Token(Token::Number(-50.0)), - ScanToken::Token(Token::Number(-0.6)), - ScanToken::Token(Token::Number(-70.0)), - ScanToken::Token(Token::Number(-60.0)), - ScanToken::Token(Token::Number(-0.006)), - ScanToken::Token(Token::Number(-3.0)), - ScanToken::Token(Token::Number(-0.04)), - ScanToken::Token(Token::Number(-5.0)), - ScanToken::Token(Token::Number(-6.0)), - ScanToken::Token(Token::Number(-0.0007)), - ScanToken::Token(Token::Number(-12.3)), - ScanToken::Token(Token::Number(-4.56)), - ScanToken::Token(Token::Number(-789.0)), - ScanToken::Token(Token::Number(-999.0)), - ScanToken::Token(Token::Number(-0.0112)), - 
ScanToken::Token(Token::Number(-1.0)), - ScanToken::Token(Token::Punct(Punct::Dash)), - ScanToken::Token(Token::Punct(Punct::Dot)), - ScanToken::Error(ScanError::ExpectedExponent(String::from("-1e"))), - ScanToken::Token(Token::Punct(Punct::Dash)), - ScanToken::Token(Token::Id(Identifier::new("e1").unwrap())), - ScanToken::Error(ScanError::ExpectedExponent(String::from("-1e+"))), - ScanToken::Error(ScanError::ExpectedExponent(String::from("-1e-"))), - ScanToken::Token(Token::Number(-1.0)), - ScanToken::Token(Token::EndCommand), - ], - ); -} - -#[test] -fn test_strings() { - check_scan( - r#"'x' "y" 'abc' -'Don''t' "Can't" 'Won''t' -"""quoted""" '"quoted"' -'' "" '''' """" -'missing end quote -"missing double quote -'x' + "y" -+ 'z' + -'a' /* abc */ + "b" /* -+ 'c' +/* */"d"/* */+'e' -'foo' -+ /* special case: + in column 0 would ordinarily start a new command -'bar' -'foo' - + -'bar' -'foo' -+ - -'bar' - -+ -x"4142"+'5152' -"4142"+ -x'5152' -x"4142" -+u'304a' -"�あいうえお" -"abc"+U"FFFD"+u'3048'+"xyz" -"#, - Mode::Auto, - &[ - ScanToken::Token(Token::String(String::from("x"))), - ScanToken::Token(Token::String(String::from("y"))), - ScanToken::Token(Token::String(String::from("abc"))), - ScanToken::Token(Token::String(String::from("Don't"))), - ScanToken::Token(Token::String(String::from("Can't"))), - ScanToken::Token(Token::String(String::from("Won't"))), - ScanToken::Token(Token::String(String::from("\"quoted\""))), - ScanToken::Token(Token::String(String::from("\"quoted\""))), - ScanToken::Token(Token::String(String::from(""))), - ScanToken::Token(Token::String(String::from(""))), - ScanToken::Token(Token::String(String::from("'"))), - ScanToken::Token(Token::String(String::from("\""))), - ScanToken::Error(ScanError::ExpectedQuote), - ScanToken::Error(ScanError::ExpectedQuote), - ScanToken::Token(Token::String(String::from("xyzabcde"))), - ScanToken::Token(Token::String(String::from("foobar"))), - ScanToken::Token(Token::String(String::from("foobar"))), - 
ScanToken::Token(Token::String(String::from("foo"))), - ScanToken::Token(Token::Punct(Punct::Plus)), - ScanToken::Token(Token::EndCommand), - ScanToken::Token(Token::String(String::from("bar"))), - ScanToken::Token(Token::EndCommand), - ScanToken::Token(Token::Punct(Punct::Plus)), - ScanToken::Token(Token::String(String::from("AB5152"))), - ScanToken::Token(Token::String(String::from("4142QR"))), - ScanToken::Token(Token::String(String::from("ABお"))), - ScanToken::Token(Token::String(String::from("�あいうえお"))), - ScanToken::Token(Token::String(String::from("abc�えxyz"))), - ScanToken::Token(Token::End), - ], - ); -} - -#[test] -fn test_shbang() { - check_scan( - r#"#! /usr/bin/pspp -#! /usr/bin/pspp -"#, - Mode::Auto, - &[ - ScanToken::Token(Token::Id(Identifier::new("#").unwrap())), - ScanToken::Token(Token::Punct(Punct::Bang)), - ScanToken::Token(Token::Punct(Punct::Slash)), - ScanToken::Token(Token::Id(Identifier::new("usr").unwrap())), - ScanToken::Token(Token::Punct(Punct::Slash)), - ScanToken::Token(Token::Id(Identifier::new("bin").unwrap())), - ScanToken::Token(Token::Punct(Punct::Slash)), - ScanToken::Token(Token::Id(Identifier::new("pspp").unwrap())), - ], - ); -} - -#[test] -fn test_comments() { - check_scan( - r#"* Comment commands "don't -have to contain valid tokens. - -** Check ambiguity with ** token. -****************. - -comment keyword works too. -COMM also. -com is ambiguous with COMPUTE. - - * Comment need not start at left margin. - -* Comment ends with blank line - -next command. 
- -"#, - Mode::Auto, - &[ - ScanToken::Token(Token::EndCommand), - ScanToken::Token(Token::EndCommand), - ScanToken::Token(Token::EndCommand), - ScanToken::Token(Token::EndCommand), - ScanToken::Token(Token::EndCommand), - ScanToken::Token(Token::EndCommand), - ScanToken::Token(Token::EndCommand), - ScanToken::Token(Token::Id(Identifier::new("com").unwrap())), - ScanToken::Token(Token::Id(Identifier::new("is").unwrap())), - ScanToken::Token(Token::Id(Identifier::new("ambiguous").unwrap())), - ScanToken::Token(Token::Punct(Punct::With)), - ScanToken::Token(Token::Id(Identifier::new("COMPUTE").unwrap())), - ScanToken::Token(Token::EndCommand), - ScanToken::Token(Token::EndCommand), - ScanToken::Token(Token::EndCommand), - ScanToken::Token(Token::EndCommand), - ScanToken::Token(Token::EndCommand), - ScanToken::Token(Token::Id(Identifier::new("next").unwrap())), - ScanToken::Token(Token::Id(Identifier::new("command").unwrap())), - ScanToken::Token(Token::EndCommand), - ScanToken::Token(Token::EndCommand), - ], - ); -} - -#[test] -fn test_document() { - check_scan( - r#"DOCUMENT one line. -DOC more - than - one - line. -docu -first.paragraph -isn't parsed as tokens - -second paragraph. 
-"#, - Mode::Auto, - &[ - ScanToken::Token(Token::Id(Identifier::new("DOCUMENT").unwrap())), - ScanToken::Token(Token::String(String::from("DOCUMENT one line."))), - ScanToken::Token(Token::EndCommand), - ScanToken::Token(Token::EndCommand), - ScanToken::Token(Token::Id(Identifier::new("DOCUMENT").unwrap())), - ScanToken::Token(Token::String(String::from("DOC more"))), - ScanToken::Token(Token::String(String::from(" than"))), - ScanToken::Token(Token::String(String::from(" one"))), - ScanToken::Token(Token::String(String::from(" line."))), - ScanToken::Token(Token::EndCommand), - ScanToken::Token(Token::EndCommand), - ScanToken::Token(Token::Id(Identifier::new("DOCUMENT").unwrap())), - ScanToken::Token(Token::String(String::from("docu"))), - ScanToken::Token(Token::String(String::from("first.paragraph"))), - ScanToken::Token(Token::String(String::from("isn't parsed as tokens"))), - ScanToken::Token(Token::String(String::from(""))), - ScanToken::Token(Token::String(String::from("second paragraph."))), - ScanToken::Token(Token::EndCommand), - ScanToken::Token(Token::EndCommand), - ], - ); -} - -#[test] -fn test_file_label() { - check_scan( - r#"FIL label isn't quoted. -FILE - lab 'is quoted'. 
-FILE /* -/**/ lab not quoted here either - -"#, - Mode::Auto, - &[ - ScanToken::Token(Token::Id(Identifier::new("FIL").unwrap())), - ScanToken::Token(Token::Id(Identifier::new("label").unwrap())), - ScanToken::Token(Token::String(String::from("isn't quoted"))), - ScanToken::Token(Token::EndCommand), - ScanToken::Token(Token::Id(Identifier::new("FILE").unwrap())), - ScanToken::Token(Token::Id(Identifier::new("lab").unwrap())), - ScanToken::Token(Token::String(String::from("is quoted"))), - ScanToken::Token(Token::EndCommand), - ScanToken::Token(Token::Id(Identifier::new("FILE").unwrap())), - ScanToken::Token(Token::Id(Identifier::new("lab").unwrap())), - ScanToken::Token(Token::String(String::from("not quoted here either"))), - ScanToken::Token(Token::EndCommand), - ], - ); -} - -#[test] -fn test_begin_data() { - check_scan( - r#"begin data. -123 -xxx -end data. - -BEG /**/ DAT /* -5 6 7 /* x - -end data -end data -. -"#, - Mode::Auto, - &[ - ScanToken::Token(Token::Id(Identifier::new("begin").unwrap())), - ScanToken::Token(Token::Id(Identifier::new("data").unwrap())), - ScanToken::Token(Token::EndCommand), - ScanToken::Token(Token::String(String::from("123"))), - ScanToken::Token(Token::String(String::from("xxx"))), - ScanToken::Token(Token::Id(Identifier::new("end").unwrap())), - ScanToken::Token(Token::Id(Identifier::new("data").unwrap())), - ScanToken::Token(Token::EndCommand), - ScanToken::Token(Token::EndCommand), - ScanToken::Token(Token::Id(Identifier::new("BEG").unwrap())), - ScanToken::Token(Token::Id(Identifier::new("DAT").unwrap())), - ScanToken::Token(Token::String(String::from("5 6 7 /* x"))), - ScanToken::Token(Token::String(String::from(""))), - ScanToken::Token(Token::String(String::from("end data"))), - ScanToken::Token(Token::Id(Identifier::new("end").unwrap())), - ScanToken::Token(Token::Id(Identifier::new("data").unwrap())), - ScanToken::Token(Token::EndCommand), - ], - ); -} - -#[test] -fn test_do_repeat() { - check_scan( - r#"do repeat x=a b 
c - y=d e f. - do repeat a=1 thru 5. -another command. -second command -+ third command. -end /* x */ /* y */ repeat print. -end - repeat. -"#, - Mode::Auto, - &[ - ScanToken::Token(Token::Id(Identifier::new("do").unwrap())), - ScanToken::Token(Token::Id(Identifier::new("repeat").unwrap())), - ScanToken::Token(Token::Id(Identifier::new("x").unwrap())), - ScanToken::Token(Token::Punct(Punct::Equals)), - ScanToken::Token(Token::Id(Identifier::new("a").unwrap())), - ScanToken::Token(Token::Id(Identifier::new("b").unwrap())), - ScanToken::Token(Token::Id(Identifier::new("c").unwrap())), - ScanToken::Token(Token::Id(Identifier::new("y").unwrap())), - ScanToken::Token(Token::Punct(Punct::Equals)), - ScanToken::Token(Token::Id(Identifier::new("d").unwrap())), - ScanToken::Token(Token::Id(Identifier::new("e").unwrap())), - ScanToken::Token(Token::Id(Identifier::new("f").unwrap())), - ScanToken::Token(Token::EndCommand), - ScanToken::Token(Token::String(String::from(" do repeat a=1 thru 5."))), - ScanToken::Token(Token::String(String::from("another command."))), - ScanToken::Token(Token::String(String::from("second command"))), - ScanToken::Token(Token::String(String::from("+ third command."))), - ScanToken::Token(Token::String(String::from( - "end /* x */ /* y */ repeat print.", - ))), - ScanToken::Token(Token::Id(Identifier::new("end").unwrap())), - ScanToken::Token(Token::Id(Identifier::new("repeat").unwrap())), - ScanToken::Token(Token::EndCommand), - ], - ); -} - -#[test] -fn test_do_repeat_batch() { - check_scan( - r#"do repeat x=a b c - y=d e f -do repeat a=1 thru 5 -another command -second command -+ third command -end /* x */ /* y */ repeat print -end - repeat -do - repeat #a=1 - - inner command -end repeat -"#, - Mode::Batch, - &[ - ScanToken::Token(Token::Id(Identifier::new("do").unwrap())), - ScanToken::Token(Token::Id(Identifier::new("repeat").unwrap())), - ScanToken::Token(Token::Id(Identifier::new("x").unwrap())), - 
ScanToken::Token(Token::Punct(Punct::Equals)), - ScanToken::Token(Token::Id(Identifier::new("a").unwrap())), - ScanToken::Token(Token::Id(Identifier::new("b").unwrap())), - ScanToken::Token(Token::Id(Identifier::new("c").unwrap())), - ScanToken::Token(Token::Id(Identifier::new("y").unwrap())), - ScanToken::Token(Token::Punct(Punct::Equals)), - ScanToken::Token(Token::Id(Identifier::new("d").unwrap())), - ScanToken::Token(Token::Id(Identifier::new("e").unwrap())), - ScanToken::Token(Token::Id(Identifier::new("f").unwrap())), - ScanToken::Token(Token::EndCommand), - ScanToken::Token(Token::String(String::from("do repeat a=1 thru 5"))), - ScanToken::Token(Token::String(String::from("another command"))), - ScanToken::Token(Token::String(String::from("second command"))), - ScanToken::Token(Token::String(String::from("+ third command"))), - ScanToken::Token(Token::String(String::from( - "end /* x */ /* y */ repeat print", - ))), - ScanToken::Token(Token::Id(Identifier::new("end").unwrap())), - ScanToken::Token(Token::Id(Identifier::new("repeat").unwrap())), - ScanToken::Token(Token::EndCommand), - ScanToken::Token(Token::Id(Identifier::new("do").unwrap())), - ScanToken::Token(Token::Id(Identifier::new("repeat").unwrap())), - ScanToken::Token(Token::Id(Identifier::new("#a").unwrap())), - ScanToken::Token(Token::Punct(Punct::Equals)), - ScanToken::Token(Token::Number(1.0)), - ScanToken::Token(Token::EndCommand), - ScanToken::Token(Token::String(String::from(" inner command"))), - ScanToken::Token(Token::Id(Identifier::new("end").unwrap())), - ScanToken::Token(Token::Id(Identifier::new("repeat").unwrap())), - ], - ); -} - -#[test] -fn test_batch_mode() { - check_scan( - r#"first command - another line of first command -+ second command -third command - -fourth command. - fifth command. 
-"#, - Mode::Batch, - &[ - ScanToken::Token(Token::Id(Identifier::new("first").unwrap())), - ScanToken::Token(Token::Id(Identifier::new("command").unwrap())), - ScanToken::Token(Token::Id(Identifier::new("another").unwrap())), - ScanToken::Token(Token::Id(Identifier::new("line").unwrap())), - ScanToken::Token(Token::Id(Identifier::new("of").unwrap())), - ScanToken::Token(Token::Id(Identifier::new("first").unwrap())), - ScanToken::Token(Token::Id(Identifier::new("command").unwrap())), - ScanToken::Token(Token::EndCommand), - ScanToken::Token(Token::Id(Identifier::new("second").unwrap())), - ScanToken::Token(Token::Id(Identifier::new("command").unwrap())), - ScanToken::Token(Token::EndCommand), - ScanToken::Token(Token::Id(Identifier::new("third").unwrap())), - ScanToken::Token(Token::Id(Identifier::new("command").unwrap())), - ScanToken::Token(Token::EndCommand), - ScanToken::Token(Token::Id(Identifier::new("fourth").unwrap())), - ScanToken::Token(Token::Id(Identifier::new("command").unwrap())), - ScanToken::Token(Token::EndCommand), - ScanToken::Token(Token::Id(Identifier::new("fifth").unwrap())), - ScanToken::Token(Token::Id(Identifier::new("command").unwrap())), - ScanToken::Token(Token::EndCommand), - ], - ); -} - -mod define { - use crate::{identifier::Identifier, lex::{ - scan::ScanToken, - segment::Mode, - token::{Punct, Token}, - }}; - - use super::check_scan; - - #[test] - fn test_simple() { - check_scan( - r#"define !macro1() -var1 var2 var3 -!enddefine. 
-"#, - Mode::Auto, - &[ - ScanToken::Token(Token::Id(Identifier::new("define").unwrap())), - ScanToken::Token(Token::String(String::from("!macro1"))), - ScanToken::Token(Token::Punct(Punct::LParen)), - ScanToken::Token(Token::Punct(Punct::RParen)), - ScanToken::Token(Token::String(String::from("var1 var2 var3"))), - ScanToken::Token(Token::Id(Identifier::new("!enddefine").unwrap())), - ScanToken::Token(Token::EndCommand), - ], - ); - } - - #[test] - fn test_no_newline_after_parentheses() { - check_scan( - r#"define !macro1() var1 var2 var3 -!enddefine. -"#, - Mode::Auto, - &[ - ScanToken::Token(Token::Id(Identifier::new("define").unwrap())), - ScanToken::Token(Token::String(String::from("!macro1"))), - ScanToken::Token(Token::Punct(Punct::LParen)), - ScanToken::Token(Token::Punct(Punct::RParen)), - ScanToken::Token(Token::String(String::from(" var1 var2 var3"))), - ScanToken::Token(Token::Id(Identifier::new("!enddefine").unwrap())), - ScanToken::Token(Token::EndCommand), - ], - ); - } - - #[test] - fn test_no_newline_before_enddefine() { - check_scan( - r#"define !macro1() -var1 var2 var3!enddefine. -"#, - Mode::Auto, - &[ - ScanToken::Token(Token::Id(Identifier::new("define").unwrap())), - ScanToken::Token(Token::String(String::from("!macro1"))), - ScanToken::Token(Token::Punct(Punct::LParen)), - ScanToken::Token(Token::Punct(Punct::RParen)), - ScanToken::Token(Token::String(String::from("var1 var2 var3"))), - ScanToken::Token(Token::Id(Identifier::new("!enddefine").unwrap())), - ScanToken::Token(Token::EndCommand), - ], - ); - } - - #[test] - fn test_all_on_one_line() { - check_scan( - r#"define !macro1()var1 var2 var3!enddefine. 
-"#, - Mode::Auto, - &[ - ScanToken::Token(Token::Id(Identifier::new("define").unwrap())), - ScanToken::Token(Token::String(String::from("!macro1"))), - ScanToken::Token(Token::Punct(Punct::LParen)), - ScanToken::Token(Token::Punct(Punct::RParen)), - ScanToken::Token(Token::String(String::from("var1 var2 var3"))), - ScanToken::Token(Token::Id(Identifier::new("!enddefine").unwrap())), - ScanToken::Token(Token::EndCommand), - ], - ); - } - - #[test] - fn test_empty() { - check_scan( - r#"define !macro1() -!enddefine. -"#, - Mode::Auto, - &[ - ScanToken::Token(Token::Id(Identifier::new("define").unwrap())), - ScanToken::Token(Token::String(String::from("!macro1"))), - ScanToken::Token(Token::Punct(Punct::LParen)), - ScanToken::Token(Token::Punct(Punct::RParen)), - ScanToken::Token(Token::Id(Identifier::new("!enddefine").unwrap())), - ScanToken::Token(Token::EndCommand), - ], - ); - } - - #[test] - fn test_blank_lines() { - check_scan( - r#"define !macro1() - - -!enddefine. -"#, - Mode::Auto, - &[ - ScanToken::Token(Token::Id(Identifier::new("define").unwrap())), - ScanToken::Token(Token::String(String::from("!macro1"))), - ScanToken::Token(Token::Punct(Punct::LParen)), - ScanToken::Token(Token::Punct(Punct::RParen)), - ScanToken::Token(Token::String(String::from(""))), - ScanToken::Token(Token::String(String::from(""))), - ScanToken::Token(Token::Id(Identifier::new("!enddefine").unwrap())), - ScanToken::Token(Token::EndCommand), - ], - ); - } - - #[test] - fn test_arguments() { - check_scan( - r#"define !macro1(a(), b(), c()) -!enddefine. 
-"#, - Mode::Auto, - &[ - ScanToken::Token(Token::Id(Identifier::new("define").unwrap())), - ScanToken::Token(Token::String(String::from("!macro1"))), - ScanToken::Token(Token::Punct(Punct::LParen)), - ScanToken::Token(Token::Id(Identifier::new("a").unwrap())), - ScanToken::Token(Token::Punct(Punct::LParen)), - ScanToken::Token(Token::Punct(Punct::RParen)), - ScanToken::Token(Token::Punct(Punct::Comma)), - ScanToken::Token(Token::Id(Identifier::new("b").unwrap())), - ScanToken::Token(Token::Punct(Punct::LParen)), - ScanToken::Token(Token::Punct(Punct::RParen)), - ScanToken::Token(Token::Punct(Punct::Comma)), - ScanToken::Token(Token::Id(Identifier::new("c").unwrap())), - ScanToken::Token(Token::Punct(Punct::LParen)), - ScanToken::Token(Token::Punct(Punct::RParen)), - ScanToken::Token(Token::Punct(Punct::RParen)), - ScanToken::Token(Token::Id(Identifier::new("!enddefine").unwrap())), - ScanToken::Token(Token::EndCommand), - ], - ); - } - - #[test] - fn test_multiline_arguments() { - check_scan( - r#"define !macro1( - a(), b( - ), - c() -) -!enddefine. 
-"#, - Mode::Auto, - &[ - ScanToken::Token(Token::Id(Identifier::new("define").unwrap())), - ScanToken::Token(Token::String(String::from("!macro1"))), - ScanToken::Token(Token::Punct(Punct::LParen)), - ScanToken::Token(Token::Id(Identifier::new("a").unwrap())), - ScanToken::Token(Token::Punct(Punct::LParen)), - ScanToken::Token(Token::Punct(Punct::RParen)), - ScanToken::Token(Token::Punct(Punct::Comma)), - ScanToken::Token(Token::Id(Identifier::new("b").unwrap())), - ScanToken::Token(Token::Punct(Punct::LParen)), - ScanToken::Token(Token::Punct(Punct::RParen)), - ScanToken::Token(Token::Punct(Punct::Comma)), - ScanToken::Token(Token::Id(Identifier::new("c").unwrap())), - ScanToken::Token(Token::Punct(Punct::LParen)), - ScanToken::Token(Token::Punct(Punct::RParen)), - ScanToken::Token(Token::Punct(Punct::RParen)), - ScanToken::Token(Token::Id(Identifier::new("!enddefine").unwrap())), - ScanToken::Token(Token::EndCommand), - ], - ); - } - - #[test] - fn test_arguments_start_on_second_line() { - check_scan( - r#"define !macro1 -(x,y,z -) -content 1 -content 2 -!enddefine. -"#, - Mode::Auto, - &[ - ScanToken::Token(Token::Id(Identifier::new("define").unwrap())), - ScanToken::Token(Token::String(String::from("!macro1"))), - ScanToken::Token(Token::Punct(Punct::LParen)), - ScanToken::Token(Token::Id(Identifier::new("x").unwrap())), - ScanToken::Token(Token::Punct(Punct::Comma)), - ScanToken::Token(Token::Id(Identifier::new("y").unwrap())), - ScanToken::Token(Token::Punct(Punct::Comma)), - ScanToken::Token(Token::Id(Identifier::new("z").unwrap())), - ScanToken::Token(Token::Punct(Punct::RParen)), - ScanToken::Token(Token::String(String::from("content 1"))), - ScanToken::Token(Token::String(String::from("content 2"))), - ScanToken::Token(Token::Id(Identifier::new("!enddefine").unwrap())), - ScanToken::Token(Token::EndCommand), - ], - ); - } - - #[test] - fn test_early_end_of_command_1() { - check_scan( - r#"define !macro1. -data list /x 1. 
-"#, - Mode::Auto, - &[ - ScanToken::Token(Token::Id(Identifier::new("define").unwrap())), - ScanToken::Token(Token::String(String::from("!macro1"))), - ScanToken::Token(Token::EndCommand), - ScanToken::Token(Token::Id(Identifier::new("data").unwrap())), - ScanToken::Token(Token::Id(Identifier::new("list").unwrap())), - ScanToken::Token(Token::Punct(Punct::Slash)), - ScanToken::Token(Token::Id(Identifier::new("x").unwrap())), - ScanToken::Token(Token::Number(1.0)), - ScanToken::Token(Token::EndCommand), - ], - ); - } - - #[test] - fn test_early_end_of_command_2() { - check_scan( - r#"define !macro1 -x. -data list /x 1. -"#, - Mode::Auto, - &[ - ScanToken::Token(Token::Id(Identifier::new("define").unwrap())), - ScanToken::Token(Token::String(String::from("!macro1"))), - ScanToken::Token(Token::Id(Identifier::new("x").unwrap())), - ScanToken::Token(Token::EndCommand), - ScanToken::Token(Token::Id(Identifier::new("data").unwrap())), - ScanToken::Token(Token::Id(Identifier::new("list").unwrap())), - ScanToken::Token(Token::Punct(Punct::Slash)), - ScanToken::Token(Token::Id(Identifier::new("x").unwrap())), - ScanToken::Token(Token::Number(1.0)), - ScanToken::Token(Token::EndCommand), - ], - ); - } - - #[test] - fn test_early_end_of_command_3() { - check_scan( - r#"define !macro1(. -x. -data list /x 1. 
-"#, - Mode::Auto, - &[ - ScanToken::Token(Token::Id(Identifier::new("define").unwrap())), - ScanToken::Token(Token::String(String::from("!macro1"))), - ScanToken::Token(Token::Punct(Punct::LParen)), - ScanToken::Token(Token::EndCommand), - ScanToken::Token(Token::Id(Identifier::new("x").unwrap())), - ScanToken::Token(Token::EndCommand), - ScanToken::Token(Token::Id(Identifier::new("data").unwrap())), - ScanToken::Token(Token::Id(Identifier::new("list").unwrap())), - ScanToken::Token(Token::Punct(Punct::Slash)), - ScanToken::Token(Token::Id(Identifier::new("x").unwrap())), - ScanToken::Token(Token::Number(1.0)), - ScanToken::Token(Token::EndCommand), - ], - ); - } - - #[test] - fn test_early_end_of_command_4() { - // Notice the command terminator at the end of the DEFINE command, - // which should not be there and ends it early. - check_scan( - r#"define !macro1. -data list /x 1. -"#, - Mode::Auto, - &[ - ScanToken::Token(Token::Id(Identifier::new("define").unwrap())), - ScanToken::Token(Token::String(String::from("!macro1"))), - ScanToken::Token(Token::EndCommand), - ScanToken::Token(Token::Id(Identifier::new("data").unwrap())), - ScanToken::Token(Token::Id(Identifier::new("list").unwrap())), - ScanToken::Token(Token::Punct(Punct::Slash)), - ScanToken::Token(Token::Id(Identifier::new("x").unwrap())), - ScanToken::Token(Token::Number(1.0)), - ScanToken::Token(Token::EndCommand), - ], - ); - } - - #[test] - fn test_missing_enddefine() { - check_scan( - r#"define !macro1() -content line 1 -content line 2 -"#, - Mode::Auto, - &[ - ScanToken::Token(Token::Id(Identifier::new("define").unwrap())), - ScanToken::Token(Token::String(String::from("!macro1"))), - ScanToken::Token(Token::Punct(Punct::LParen)), - ScanToken::Token(Token::Punct(Punct::RParen)), - ScanToken::Token(Token::String(String::from("content line 1"))), - ScanToken::Token(Token::String(String::from("content line 2"))), - ScanToken::Token(Token::End), - ], - ); - } -} diff --git 
a/rust/src/lex/segment/mod.rs b/rust/src/lex/segment/mod.rs deleted file mode 100644 index befe5b0c53..0000000000 --- a/rust/src/lex/segment/mod.rs +++ /dev/null @@ -1,1334 +0,0 @@ -//! Syntax segmentation. -//! -//! PSPP divides traditional "lexical analysis" or "tokenization" into two -//! phases: a lower-level phase called "segmentation" and a higher-level phase -//! called "scanning". This module implements the segmentation phase. -//! [`super::scan`] contains declarations for the scanning phase. -//! -//! Segmentation accepts a stream of UTF-8 bytes as input. It outputs a label -//! (a segment type) for each byte or contiguous sequence of bytes in the input. -//! It also, in a few corner cases, outputs zero-width segments that label the -//! boundary between a pair of bytes in the input. -//! -//! Some segment types correspond directly to tokens; for example, an -//! "identifier" segment (SEG_IDENTIFIER) becomes an identifier token (T_ID) -//! later in lexical analysis. Other segments contribute to tokens but do not -//! correspond directly; for example, multiple quoted string segments -//! (SEG_QUOTED_STRING) separated by spaces (SEG_SPACES) and "+" punctuators -//! (SEG_PUNCT) may be combined to form a single string token (T_STRING). Still -//! other segments are ignored (e.g. SEG_SPACES) or trigger special behavior -//! such as error messages later in tokenization (e.g. SEG_EXPECTED_QUOTE). - -use crate::{ - identifier::{id_match, id_match_n, IdentifierChar}, - prompt::PromptStyle, -}; -use bitflags::bitflags; - -use super::command_name::{command_match, COMMAND_NAMES}; - -/// Segmentation mode. -/// -/// PSPP syntax is written in one of two modes which are broadly defined as -/// follows: -/// -/// - In interactive mode, commands end with a period at the end of the line -/// or with a blank line. -/// -/// - In batch mode, the second and subsequent lines of a command are indented -/// from the left margin. 
-/// -/// The segmenter can also try to automatically detect the mode in use, using a -/// heuristic that is usually correct. -#[derive(Copy, Clone, Debug, PartialEq, Eq, Default)] -pub enum Mode { - /// Try to interpret input correctly regardless of whether it is written - /// for interactive or batch mode. - #[default] - Auto, - - /// Interactive syntax mode. - Interactive, - - /// Batch syntax mode. - Batch, -} - -/// The type of a segment. -#[derive(Copy, Clone, Debug, PartialEq, Eq)] -pub enum Segment { - Number, - QuotedString, - HexString, - UnicodeString, - UnquotedString, - Identifier, - Punct, - Shbang, - Spaces, - Comment, - Newline, - CommentCommand, - DoRepeatCommand, - DoRepeatOverflow, - InlineData, - MacroName, - MacroBody, - StartDocument, - Document, - StartCommand, - SeparateCommands, - EndCommand, - End, - ExpectedQuote, - ExpectedExponent, - UnexpectedChar, -} - -bitflags! { - #[derive(Copy, Clone, Debug)] - pub struct Substate: u8 { - const START_OF_LINE = 1; - const START_OF_COMMAND = 2; - } -} - -#[derive(Copy, Clone)] -pub struct Segmenter { - state: (State, Substate), - nest: u8, - mode: Mode, -} - -#[derive(Copy, Clone, Debug)] -pub struct Incomplete; - -impl Segmenter { - /// Returns a segmenter with the given syntax `mode`. - /// - /// If `is_snippet` is false, then the segmenter will parse as if it's being - /// given a whole file. This means, for example, that it will interpret `-` - /// or `+` at the beginning of the syntax as a separator between commands - /// (since `-` or `+` at the beginning of a line has this meaning). - /// - /// If `is_snippet` is true, then the segmenter will parse as if it's being - /// given an isolated piece of syntax. This means that, for example, that - /// it will interpret `-` or `+` at the beginning of the syntax as an - /// operator token or (if followed by a digit) as part of a number. 
- pub fn new(mode: Mode, is_snippet: bool) -> Self { - Self { - state: if is_snippet { - (State::General, Substate::empty()) - } else { - (State::Shbang, Substate::empty()) - }, - mode, - nest: 0, - } - } - - pub fn mode(&self) -> Mode { - self.mode - } - - fn start_of_line(&self) -> bool { - self.state.1.contains(Substate::START_OF_LINE) - } - - fn start_of_command(&self) -> bool { - self.state.1.contains(Substate::START_OF_COMMAND) - } - - /// Returns the style of command prompt to display to an interactive user - /// for input in the current state.. The return value is most accurate in - /// mode `Mode::Interactive` and at the beginning of a line (that is, if - /// [`Segmenter::push`] consumed as much as possible of the input up to a - /// new-line). - pub fn prompt(&self) -> PromptStyle { - match self.state.0 { - State::Shbang => PromptStyle::First, - State::General => { - if self.start_of_command() { - PromptStyle::First - } else { - PromptStyle::Later - } - } - State::Comment1 | State::Comment2 => PromptStyle::Comment, - State::Document1 | State::Document2 => PromptStyle::Document, - State::Document3 => PromptStyle::First, - State::FileLabel1 => PromptStyle::Later, - State::FileLabel2 | State::FileLabel3 => PromptStyle::First, - State::DoRepeat1 | State::DoRepeat2 => { - if self.start_of_command() { - PromptStyle::First - } else { - PromptStyle::Later - } - } - State::DoRepeat3 => PromptStyle::DoRepeat, - State::DoRepeat4 => PromptStyle::DoRepeat, - State::Define1 | State::Define2 | State::Define3 => { - if self.start_of_command() { - PromptStyle::First - } else { - PromptStyle::Later - } - } - State::Define4 | State::Define5 | State::Define6 => PromptStyle::Define, - State::BeginData1 => PromptStyle::First, - State::BeginData2 => PromptStyle::Later, - State::BeginData3 | State::BeginData4 => PromptStyle::Data, - } - } - - /// Attempts to label a prefix of the remaining input with a segment type. 
- /// The caller supplies a prefix of the remaining input as `input`. If - /// `eof` is true, then `input` is the entire (remainder) of the input; if - /// `eof` is false, then further input is potentially available. - /// - /// The input may contain '\n' or '\r\n' line ends in any combination. - /// - /// If successful, returns `Ok((n, type))`, where `n` is the number of bytes - /// in the segment at the beginning of `input` (a number in - /// `0..=input.len()`) and the type of that segment. The next call should - /// not include those bytes in `input`, because they have (figuratively) - /// been consumed by the segmenter. - /// - /// Segments can have zero length, including segment types `Type::End`, - /// `Type::SeparateCommands`, `Type::StartDocument`, `Type::InlineData`, and - /// `Type::Spaces`. - /// - /// Failure occurs only if the segment type of the bytes in `input` cannot - /// yet be determined. In this case, this function returns `Err(Incomplete)`. If - /// more input is available, the caller should obtain some more, then call - /// again with a longer `input`. If this is not enough, the process might - /// need to repeat again and again. If input is exhausted, then the caller - /// may call again setting `eof` to true. This function will never return - /// `Err(Incomplete)` when `eof` is true. - /// - /// The caller must not, in a sequence of calls, supply contradictory input. - /// That is, bytes provided as part of `input` in one call, but not - /// consumed, must not be provided with *different* values on subsequent - /// calls. This is because the function must often make decisions based on - /// looking ahead beyond the bytes that it consumes. 
- fn push_rest<'a>( - &mut self, - input: &'a str, - eof: bool, - ) -> Result<(&'a str, Segment), Incomplete> { - if input.is_empty() { - if eof { - return Ok((input, Segment::End)); - } else { - return Err(Incomplete); - }; - } - - match self.state.0 { - State::Shbang => return self.parse_shbang(input, eof), - State::General => { - if self.start_of_line() { - self.parse_start_of_line(input, eof) - } else { - self.parse_mid_line(input, eof) - } - } - State::Comment1 => self.parse_comment_1(input, eof), - State::Comment2 => self.parse_comment_2(input, eof), - State::Document1 => self.parse_document_1(input, eof), - State::Document2 => self.parse_document_2(input, eof), - State::Document3 => self.parse_document_3(input, eof), - State::FileLabel1 => self.parse_file_label_1(input, eof), - State::FileLabel2 => self.parse_file_label_2(input, eof), - State::FileLabel3 => self.parse_file_label_3(input, eof), - State::DoRepeat1 => self.parse_do_repeat_1(input, eof), - State::DoRepeat2 => self.parse_do_repeat_2(input, eof), - State::DoRepeat3 => self.parse_do_repeat_3(input, eof), - State::DoRepeat4 => self.parse_do_repeat_4(input), - State::Define1 => self.parse_define_1_2(input, eof), - State::Define2 => self.parse_define_1_2(input, eof), - State::Define3 => self.parse_define_3(input, eof), - State::Define4 => self.parse_define_4_5(input, eof), - State::Define5 => self.parse_define_4_5(input, eof), - State::Define6 => self.parse_define_6(input, eof), - State::BeginData1 => self.parse_begin_data_1(input, eof), - State::BeginData2 => self.parse_begin_data_2(input, eof), - State::BeginData3 => self.parse_begin_data_3(input, eof), - State::BeginData4 => self.parse_begin_data_4(input, eof), - } - } - - pub fn push(&mut self, input: &str, eof: bool) -> Result<(usize, Segment), Incomplete> { - let (rest, seg_type) = self.push_rest(input, eof)?; - Ok((input.len() - rest.len(), seg_type)) - } -} - -#[derive(Copy, Clone, Debug, PartialEq, Eq)] -enum State { - Shbang, - General, - 
Comment1, - Comment2, - Document1, - Document2, - Document3, - FileLabel1, - FileLabel2, - FileLabel3, - DoRepeat1, - DoRepeat2, - DoRepeat3, - DoRepeat4, - Define1, - Define2, - Define3, - Define4, - Define5, - Define6, - BeginData1, - BeginData2, - BeginData3, - BeginData4, -} - -fn take(input: &str, eof: bool) -> Result<(Option, &str), Incomplete> { - let mut iter = input.chars(); - match iter.next() { - None if !eof => Err(Incomplete), - c => Ok((c, iter.as_str())), - } -} - -fn skip_comment(mut input: &str, eof: bool) -> Result<&str, Incomplete> { - loop { - let (Some(c), rest) = take(input, eof)? else { - return Ok(input); - }; - match c { - '\n' | '\r' if is_end_of_line(input, eof)? => return Ok(input), - '*' => { - if let (Some('/'), rest) = take(rest, eof)? { - return Ok(rest); - } - } - _ => (), - }; - input = rest; - } -} - -fn skip_matching(f: F, input: &str, eof: bool) -> Result<&str, Incomplete> -where - F: Fn(char) -> bool, -{ - let input = input.trim_start_matches(f); - if input.is_empty() && !eof { - Err(Incomplete) - } else { - Ok(input) - } -} - -fn match_char(f: F, input: &str, eof: bool) -> Result, Incomplete> -where - F: Fn(char) -> bool, -{ - if let (Some(c), rest) = take(input, eof)? { - if f(c) { - return Ok(Some(rest)); - } - } - Ok(None) -} - -fn skip_spaces(mut input: &str, eof: bool) -> Result<&str, Incomplete> { - loop { - let (Some(c), rest) = take(input, eof)? else { - return Ok(input); - }; - match c { - '\r' | '\n' if is_end_of_line(input, eof)? => return Ok(input), - c if c.is_whitespace() => (), - _ => return Ok(input), - } - input = rest; - } -} - -fn skip_digits(input: &str, eof: bool) -> Result<&str, Incomplete> { - skip_matching(|c| c.is_ascii_digit(), input, eof) -} - -fn skip_spaces_and_comments(mut input: &str, eof: bool) -> Result<&str, Incomplete> { - loop { - let (Some(c), rest) = take(input, eof)? 
else { - return Ok(input); - }; - match c { - '/' => { - let (c, rest2) = take(rest, eof)?; - match c { - Some('*') => input = skip_comment(rest2, eof)?, - Some(_) | None => return Ok(rest), - } - } - '\r' | '\n' if is_end_of_line(input, eof)? => return Ok(input), - c if c.is_whitespace() => input = rest, - _ => return Ok(input), - }; - } -} - -fn is_start_of_string(input: &str, eof: bool) -> Result { - let (Some(c), rest) = take(input, eof)? else { - return Ok(false); - }; - match c { - 'x' | 'X' | 'u' | 'U' => { - let (c, _rest) = take(rest, eof)?; - Ok(c == Some('\'') || c == Some('"')) - } - '\'' | '"' => Ok(true), - '\n' | '\r' if is_end_of_line(input, eof)? => Ok(true), - _ => Ok(false), - } -} - -fn is_end_of_line(input: &str, eof: bool) -> Result { - let (Some(c), rest) = take(input, eof)? else { - return Ok(true); - }; - Ok(match c { - '\n' => true, - '\r' => take(rest, eof)?.0 == Some('\n'), - _ => false, - }) -} - -fn at_end_of_line(input: &str, eof: bool) -> Result { - is_end_of_line(skip_spaces_and_comments(input, eof)?, eof) -} - -fn first(s: &str) -> char { - s.chars().next().unwrap() -} -fn get_command_name_candidates(target: &str) -> &[&'static str] { - if target.is_empty() { - return &[]; - } - let target_first = first(target).to_ascii_uppercase(); - let low = COMMAND_NAMES.partition_point(|s| first(s) < target_first); - let high = COMMAND_NAMES.partition_point(|s| first(s) <= target_first); - &COMMAND_NAMES[low..high] -} - -fn detect_command_name(input: &str, eof: bool) -> Result { - let command_name = input - .split(|c: char| { - !((c.is_whitespace() && c != '\n') || (c.may_continue_id() && c != '.') || c == '-') - }) - .next() - .unwrap(); - if !eof && command_name.len() == input.len() { - return Err(Incomplete); - } - let command_name = command_name.trim_end_matches(|c: char| c.is_whitespace() || c == '.'); - for command in get_command_name_candidates(command_name) { - if let Some(m) = command_match(command, command_name) { - if 
m.missing_words <= 0 { - return Ok(true); - } - } - } - Ok(false) -} - -impl Segmenter { - fn parse_shbang<'a>( - &mut self, - input: &'a str, - eof: bool, - ) -> Result<(&'a str, Segment), Incomplete> { - if let (Some('#'), rest) = take(input, eof)? { - if let (Some('!'), rest) = take(rest, eof)? { - let rest = self.parse_full_line(rest, eof)?; - self.state = (State::General, Substate::START_OF_COMMAND); - return Ok((rest, Segment::Shbang)); - } - } - - self.state = ( - State::General, - Substate::START_OF_COMMAND | Substate::START_OF_LINE, - ); - self.push_rest(input, eof) - } - fn at_command_start(&self, input: &str, eof: bool) -> Result { - match self.mode { - Mode::Auto => detect_command_name(input, eof), - Mode::Interactive => Ok(false), - Mode::Batch => Ok(true), - } - } - fn parse_start_of_line<'a>( - &mut self, - input: &'a str, - eof: bool, - ) -> Result<(&'a str, Segment), Incomplete> { - debug_assert_eq!(self.state.0, State::General); - debug_assert!(self.start_of_line()); - debug_assert!(!input.is_empty()); - - let (Some(c), rest) = take(input, eof).unwrap() else { - unreachable!() - }; - match c { - '+' if is_start_of_string(skip_spaces_and_comments(rest, eof)?, eof)? => { - // This `+` is punctuation that may separate pieces of a string. - self.state = (State::General, Substate::empty()); - return Ok((rest, Segment::Punct)); - } - '+' | '-' | '.' => { - self.state = (State::General, Substate::START_OF_COMMAND); - return Ok((rest, Segment::StartCommand)); - } - _ if c.is_whitespace() => { - if at_end_of_line(input, eof)? { - self.state = (State::General, Substate::START_OF_COMMAND); - return Ok((input, Segment::SeparateCommands)); - } - } - _ => { - if self.at_command_start(input, eof)? 
- && !self.state.1.contains(Substate::START_OF_COMMAND) - { - self.state = (State::General, Substate::START_OF_COMMAND); - return Ok((input, Segment::StartCommand)); - } - } - } - self.state.1 = Substate::START_OF_COMMAND; - self.parse_mid_line(input, eof) - } - fn parse_mid_line<'a>( - &mut self, - input: &'a str, - eof: bool, - ) -> Result<(&'a str, Segment), Incomplete> { - debug_assert!(self.state.0 == State::General); - debug_assert!(!self.state.1.contains(Substate::START_OF_LINE)); - let (Some(c), rest) = take(input, eof)? else { - unreachable!() - }; - match c { - '\r' | '\n' if is_end_of_line(input, eof)? => { - self.state.1 |= Substate::START_OF_LINE; - Ok(( - self.parse_newline(input, eof).unwrap().unwrap(), - Segment::Newline, - )) - } - '/' => { - if let (Some('*'), rest) = take(rest, eof)? { - let rest = skip_comment(rest, eof)?; - return Ok((rest, Segment::Comment)); - } else { - self.state.1 = Substate::empty(); - return Ok((rest, Segment::Punct)); - } - } - '-' => { - let (c, rest2) = take(skip_spaces(rest, eof)?, eof)?; - match c { - Some(c) if c.is_ascii_digit() => { - return self.parse_number(rest, eof); - } - Some('.') => { - if let (Some(c), _rest) = take(rest2, eof)? { - if c.is_ascii_digit() { - return self.parse_number(rest, eof); - } - } - } - None | Some(_) => (), - } - self.state.1 = Substate::empty(); - return Ok((rest, Segment::Punct)); - } - '(' | ')' | '[' | ']' | '{' | '}' | ',' | '=' | ';' | ':' | '&' | '|' | '+' => { - self.state.1 = Substate::empty(); - return Ok((rest, Segment::Punct)); - } - '*' => { - if self.state.1.contains(Substate::START_OF_COMMAND) { - self.state = (State::Comment1, Substate::empty()); - self.parse_comment_1(input, eof) - } else { - self.parse_digraph(&['*'], rest, eof) - } - } - '<' => self.parse_digraph(&['=', '>'], rest, eof), - '>' => self.parse_digraph(&['='], rest, eof), - '~' => self.parse_digraph(&['='], rest, eof), - '.' if at_end_of_line(rest, eof)? 
=> { - self.state.1 = Substate::START_OF_COMMAND; - Ok((rest, Segment::EndCommand)) - } - '.' => match take(rest, eof)? { - (Some(c), _) if c.is_ascii_digit() => self.parse_number(input, eof), - _ => Ok((rest, Segment::Punct)), - }, - '0'..='9' => self.parse_number(input, eof), - 'u' | 'U' => self.maybe_parse_string(Segment::UnicodeString, (input, rest), eof), - 'x' | 'X' => self.maybe_parse_string(Segment::HexString, (input, rest), eof), - '\'' | '"' => self.parse_string(Segment::QuotedString, c, rest, eof), - '!' => { - let (c, rest2) = take(rest, eof)?; - match c { - Some('*') => Ok((rest2, Segment::Punct)), - Some(_) => self.parse_id(input, eof), - None => Ok((rest, Segment::Punct)), - } - } - c if c.is_whitespace() => Ok((skip_spaces(rest, eof)?, Segment::Spaces)), - c if c.may_start_id() => self.parse_id(input, eof), - '#'..='~' if c != '\\' && c != '^' => { - self.state.1 = Substate::empty(); - Ok((rest, Segment::Punct)) - } - _ => { - self.state.1 = Substate::empty(); - Ok((rest, Segment::UnexpectedChar)) - } - } - } - fn parse_string<'a>( - &mut self, - segment: Segment, - quote: char, - mut input: &'a str, - eof: bool, - ) -> Result<(&'a str, Segment), Incomplete> { - while let (Some(c), rest) = take(input, eof)? { - match c { - _ if c == quote => { - let (c, rest2) = take(rest, eof)?; - if c != Some(quote) { - self.state.1 = Substate::empty(); - return Ok((rest, segment)); - } - input = rest2; - } - '\r' | '\n' if is_end_of_line(input, eof)? => break, - _ => input = rest, - } - } - self.state.1 = Substate::empty(); - Ok((input, Segment::ExpectedQuote)) - } - fn maybe_parse_string<'a>( - &mut self, - segment: Segment, - input: (&'a str, &'a str), - eof: bool, - ) -> Result<(&'a str, Segment), Incomplete> { - match take(input.1, eof)? 
{ - (Some(c), rest) if c == '\'' || c == '"' => self.parse_string(segment, c, rest, eof), - _ => self.parse_id(input.0, eof), - } - } - fn next_id_in_command<'a>( - &self, - mut input: &'a str, - eof: bool, - ) -> Result<(&'a str, &'a str), Incomplete> { - let mut sub = Segmenter::new(self.mode, true); - loop { - let (seg_len, seg_type) = sub.push(input, eof)?; - let (segment, rest) = input.split_at(seg_len); - match seg_type { - Segment::Shbang | Segment::Spaces | Segment::Comment | Segment::Newline => (), - - Segment::Identifier => return Ok((segment, rest)), - - Segment::Number - | Segment::QuotedString - | Segment::HexString - | Segment::UnicodeString - | Segment::UnquotedString - | Segment::Punct - | Segment::CommentCommand - | Segment::DoRepeatCommand - | Segment::DoRepeatOverflow - | Segment::InlineData - | Segment::MacroName - | Segment::MacroBody - | Segment::StartDocument - | Segment::Document - | Segment::StartCommand - | Segment::SeparateCommands - | Segment::EndCommand - | Segment::End - | Segment::ExpectedQuote - | Segment::ExpectedExponent - | Segment::UnexpectedChar => return Ok(("", rest)), - } - input = rest; - } - } - fn parse_id<'a>( - &mut self, - input: &'a str, - eof: bool, - ) -> Result<(&'a str, Segment), Incomplete> { - let (Some(_), mut end) = take(input, eof).unwrap() else { - unreachable!() - }; - while let (Some(c), rest) = take(end, eof)? { - if !c.may_continue_id() { - break; - }; - end = rest; - } - let identifier = &input[..input.len() - end.len()]; - let identifier = match identifier.strip_suffix('.') { - Some(without_dot) if at_end_of_line(end, eof)? 
=> without_dot, - _ => identifier, - }; - let rest = &input[identifier.len()..]; - - if self.state.1.contains(Substate::START_OF_COMMAND) { - if id_match_n("COMMENT", identifier, 4) { - self.state = (State::Comment1, Substate::empty()); - return self.parse_comment_1(input, eof); - } else if id_match("DOCUMENT", identifier) { - self.state = (State::Document1, Substate::empty()); - return Ok((input, Segment::StartDocument)); - } else if id_match_n("DEFINE", identifier, 6) { - self.state = (State::Define1, Substate::empty()); - } else if id_match("FILE", identifier) { - if id_match("LABEL", self.next_id_in_command(rest, eof)?.0) { - self.state = (State::FileLabel1, Substate::empty()); - return Ok((rest, Segment::Identifier)); - } - } else if id_match("DO", identifier) { - if id_match("REPEAT", self.next_id_in_command(rest, eof)?.0) { - self.state = (State::DoRepeat1, Substate::empty()); - return Ok((rest, Segment::Identifier)); - } - } else if id_match("BEGIN", identifier) { - let (next_id, rest2) = self.next_id_in_command(rest, eof)?; - if id_match("DATA", next_id) { - let rest2 = skip_spaces_and_comments(rest2, eof)?; - let rest2 = if let Some(s) = rest2.strip_prefix('.') { - skip_spaces_and_comments(s, eof)? - } else { - rest2 - }; - if is_end_of_line(rest2, eof)? { - let s = &input[..input.len() - rest2.len()]; - self.state = ( - if s.contains('\n') { - State::BeginData1 - } else { - State::BeginData2 - }, - Substate::empty(), - ); - return Ok((rest, Segment::Identifier)); - } - } - } - } - - self.state.1 = Substate::empty(); - Ok(( - rest, - if identifier != "!" 
{ - Segment::Identifier - } else { - Segment::Punct - }, - )) - } - fn parse_digraph<'a>( - &mut self, - seconds: &[char], - input: &'a str, - eof: bool, - ) -> Result<(&'a str, Segment), Incomplete> { - let (c, rest) = take(input, eof)?; - self.state.1 = Substate::empty(); - Ok(( - match c { - Some(c) if seconds.contains(&c) => rest, - _ => input, - }, - Segment::Punct, - )) - } - fn parse_number<'a>( - &mut self, - input: &'a str, - eof: bool, - ) -> Result<(&'a str, Segment), Incomplete> { - let mut input = skip_digits(input, eof)?; - if let Some(rest) = match_char(|c| c == '.', input, eof)? { - let rest2 = skip_digits(rest, eof)?; - if rest2.len() < rest.len() || !at_end_of_line(rest2, eof)? { - input = rest2; - } - }; - if let Some(rest) = match_char(|c| c == 'e' || c == 'E', input, eof)? { - let rest = match_char(|c| c == '+' || c == '-', rest, eof)?.unwrap_or(rest); - let rest2 = skip_digits(rest, eof)?; - if rest2.len() == rest.len() { - self.state.1 = Substate::empty(); - return Ok((rest, Segment::ExpectedExponent)); - } - input = rest2; - } - self.state.1 = Substate::empty(); - Ok((input, Segment::Number)) - } - fn parse_comment_1<'a>( - &mut self, - mut input: &'a str, - eof: bool, - ) -> Result<(&'a str, Segment), Incomplete> { - enum CommentState<'a> { - Blank, - NotBlank, - Period(&'a str), - } - let mut state = CommentState::Blank; - loop { - let (Some(c), rest) = take(input, eof)? else { - // End of file. - self.state = (State::General, Substate::START_OF_COMMAND); - return Ok((input, Segment::SeparateCommands)); - }; - match c { - '.' => state = CommentState::Period(input), - '\n' | '\r' if is_end_of_line(input, eof)? => { - match state { - CommentState::Blank => { - // Blank line ends comment command. - self.state = (State::General, Substate::START_OF_COMMAND); - return Ok((input, Segment::SeparateCommands)); - } - CommentState::Period(period) => { - // '.' at end of line ends comment command. 
- self.state = (State::General, Substate::empty()); - return Ok((period, Segment::CommentCommand)); - } - CommentState::NotBlank => { - // Comment continues onto next line. - self.state = (State::Comment2, Substate::empty()); - return Ok((input, Segment::CommentCommand)); - } - } - } - c if c.is_whitespace() => (), - _ => state = CommentState::NotBlank, - } - input = rest; - } - } - fn parse_comment_2<'a>( - &mut self, - input: &'a str, - eof: bool, - ) -> Result<(&'a str, Segment), Incomplete> { - let rest = self.parse_newline(input, eof)?.unwrap(); - - let new_command = match take(rest, eof)?.0 { - Some('+') | Some('-') | Some('.') => true, - Some(c) if !c.is_whitespace() => self.at_command_start(rest, eof)?, - None | Some(_) => false, - }; - if new_command { - self.state = ( - State::General, - Substate::START_OF_LINE | Substate::START_OF_COMMAND, - ); - } else { - self.state = (State::Comment1, Substate::empty()); - } - Ok((rest, Segment::Newline)) - } - fn parse_document_1<'a>( - &mut self, - mut input: &'a str, - eof: bool, - ) -> Result<(&'a str, Segment), Incomplete> { - let mut end_cmd = false; - loop { - let (Some(c), rest) = take(input, eof)? else { - self.state = (State::Document3, Substate::empty()); - return Ok((input, Segment::Document)); - }; - match c { - '.' => end_cmd = true, - '\n' | '\r' if is_end_of_line(input, eof)? 
=> { - self.state.0 = if end_cmd { - State::Document3 - } else { - State::Document2 - }; - return Ok((input, Segment::Document)); - } - c if !c.is_whitespace() => end_cmd = false, - _ => (), - } - input = rest; - } - } - fn parse_document_2<'a>( - &mut self, - input: &'a str, - eof: bool, - ) -> Result<(&'a str, Segment), Incomplete> { - let rest = self.parse_newline(input, eof)?.unwrap(); - self.state = (State::Document1, Substate::empty()); - Ok((rest, Segment::Newline)) - } - fn parse_document_3<'a>( - &mut self, - input: &'a str, - _eof: bool, - ) -> Result<(&'a str, Segment), Incomplete> { - self.state = ( - State::General, - Substate::START_OF_COMMAND | Substate::START_OF_LINE, - ); - Ok((input, Segment::EndCommand)) - } - fn quoted_file_label(input: &str, eof: bool) -> Result { - let input = skip_spaces_and_comments(input, eof)?; - match take(input, eof)?.0 { - Some('\'') | Some('"') | Some('\n') => Ok(true), - _ => Ok(false), - } - } - fn parse_file_label_1<'a>( - &mut self, - input: &'a str, - eof: bool, - ) -> Result<(&'a str, Segment), Incomplete> { - let mut sub = Segmenter { - state: (State::General, self.state.1), - ..*self - }; - let (rest, segment) = sub.push_rest(input, eof)?; - if segment == Segment::Identifier { - let id = &input[..input.len() - rest.len()]; - debug_assert!(id_match("LABEL", id), "{id} should be LABEL"); - if Self::quoted_file_label(rest, eof)? 
{ - *self = sub; - } else { - self.state.0 = State::FileLabel2; - } - } else { - self.state.1 = sub.state.1; - } - Ok((rest, segment)) - } - fn parse_file_label_2<'a>( - &mut self, - input: &'a str, - eof: bool, - ) -> Result<(&'a str, Segment), Incomplete> { - let input = skip_spaces(input, eof)?; - self.state = (State::FileLabel3, Substate::empty()); - Ok((input, Segment::Spaces)) - } - fn parse_file_label_3<'a>( - &mut self, - mut input: &'a str, - eof: bool, - ) -> Result<(&'a str, Segment), Incomplete> { - let mut end_cmd = None; - loop { - let (c, rest) = take(input, eof)?; - match c { - None | Some('\n') | Some('\r') if is_end_of_line(input, eof)? => { - self.state = (State::General, Substate::empty()); - return Ok((end_cmd.unwrap_or(input), Segment::UnquotedString)); - } - None => unreachable!(), - Some('.') => end_cmd = Some(input), - Some(c) if !c.is_whitespace() => end_cmd = None, - Some(_) => (), - } - input = rest; - } - } - fn subparse<'a>( - &mut self, - input: &'a str, - eof: bool, - ) -> Result<(&'a str, Segment), Incomplete> { - let mut sub = Segmenter { - mode: self.mode, - state: (State::General, self.state.1), - nest: 0, - }; - let result = sub.push_rest(input, eof)?; - self.state.1 = sub.state.1; - Ok(result) - } - /// We are segmenting a `DO REPEAT` command, currently reading the syntax - /// that defines the stand-in variables (the head) before the lines of - /// syntax to be repeated (the body). - fn parse_do_repeat_1<'a>( - &mut self, - input: &'a str, - eof: bool, - ) -> Result<(&'a str, Segment), Incomplete> { - let (rest, segment) = self.subparse(input, eof)?; - if segment == Segment::SeparateCommands { - // We reached a blank line that separates the head from the body. - self.state.0 = State::DoRepeat2; - } else if segment == Segment::EndCommand || segment == Segment::StartCommand { - // We reached the body. 
- self.state.0 = State::DoRepeat3; - self.nest = 1; - } - Ok((rest, segment)) - } - /// We are segmenting a `DO REPEAT` command, currently reading a blank line - /// that separates the head from the body. - fn parse_do_repeat_2<'a>( - &mut self, - input: &'a str, - eof: bool, - ) -> Result<(&'a str, Segment), Incomplete> { - let (rest, segment) = self.subparse(input, eof)?; - if segment == Segment::Newline { - // We reached the body. - self.state.0 = State::DoRepeat3; - self.nest = 1; - } - Ok((rest, segment)) - } - fn parse_newline<'a>( - &mut self, - input: &'a str, - eof: bool, - ) -> Result, Incomplete> { - let (Some(c), rest) = take(input, eof)? else { - return Ok(None); - }; - match c { - '\n' => Ok(Some(rest)), - '\r' => { - if let (Some('\n'), rest) = take(rest, eof)? { - Ok(Some(rest)) - } else { - Ok(None) - } - } - _ => Ok(None), - } - } - - fn parse_full_line<'a>( - &mut self, - mut input: &'a str, - eof: bool, - ) -> Result<&'a str, Incomplete> { - loop { - if is_end_of_line(input, eof)? { - return Ok(input); - } - input = take(input, eof).unwrap().1; - } - } - fn check_repeat_command<'a>(&mut self, input: &'a str, eof: bool) -> Result { - let input = input.strip_prefix(&['-', '+']).unwrap_or(input); - let (id1, input) = self.next_id_in_command(input, eof)?; - if id_match("DO", id1) && id_match("REPEAT", self.next_id_in_command(input, eof)?.0) { - Ok(1) - } else if id_match("END", id1) && id_match("REPEAT", self.next_id_in_command(input, eof)?.0) - { - Ok(-1) - } else { - Ok(0) - } - } - /// We are in the body of `DO REPEAT`, segmenting the lines of syntax that - /// are to be repeated. Report each line of syntax as a single - /// [`Type::DoRepeatCommand`]. - /// - /// `DO REPEAT` can be nested, so we look for `DO REPEAT...END REPEAT` - /// blocks inside the lines we're segmenting. `self.nest` counts the - /// nesting level, starting at 1. 
- fn parse_do_repeat_3<'a>( - &mut self, - input: &'a str, - eof: bool, - ) -> Result<(&'a str, Segment), Incomplete> { - if let Some(rest) = self.parse_newline(input, eof)? { - return Ok((rest, Segment::Newline)); - } - let rest = self.parse_full_line(input, eof)?; - let direction = self.check_repeat_command(input, eof)?; - if direction > 0 { - if let Some(nest) = self.nest.checked_add(1) { - self.nest = nest; - } else { - self.state.0 = State::DoRepeat4; - } - } else if direction < 0 { - self.nest -= 1; - if self.nest == 0 { - // Nesting level dropped to 0, so we've finished reading the `DO - // REPEAT` body. - self.state = ( - State::General, - Substate::START_OF_COMMAND | Substate::START_OF_LINE, - ); - return self.push_rest(input, eof); - } - } - return Ok((rest, Segment::DoRepeatCommand)); - } - fn parse_do_repeat_4<'a>(&mut self, input: &'a str) -> Result<(&'a str, Segment), Incomplete> { - self.state.0 = State::DoRepeat3; - Ok((input, Segment::DoRepeatOverflow)) - } - /// We are segmenting a `DEFINE` command, which consists of: - /// - /// - The `DEFINE` keyword. - /// - /// - An identifier. We transform this into `Type::MacroName` instead of - /// `Type::Identifier` because this identifier must never be macro-expanded. - /// - /// - Anything but `(`. - /// - /// - `(` followed by a sequence of tokens possibly including balanced - /// parentheses up to a final `)`. - /// - /// - A sequence of any number of lines, one string per line, ending with - /// `!ENDDEFINE`. The first line is usually blank (that is, a newline - /// follows the `(`). The last line usually just has `!ENDDEFINE.` on - /// it, but it can start with other tokens. The whole - /// DEFINE...!ENDDEFINE can be on a single line, even. 
- fn parse_define_1_2<'a>( - &mut self, - input: &'a str, - eof: bool, - ) -> Result<(&'a str, Segment), Incomplete> { - let (rest, segment) = self.subparse(input, eof)?; - match segment { - Segment::Identifier if self.state.0 == State::Define1 => { - self.state.0 = State::Define2; - return Ok((rest, Segment::MacroName)); - } - Segment::SeparateCommands | Segment::EndCommand | Segment::StartCommand => { - // The DEFINE command is malformed because we reached its end - // without ever hitting a `(` token. Transition back to general - // parsing. - self.state.0 = State::General; - } - Segment::Punct if input.starts_with('(') => { - self.state.0 = State::Define3; - self.nest = 1; - } - _ => (), - } - Ok((rest, segment)) - } - fn parse_define_3<'a>( - &mut self, - input: &'a str, - eof: bool, - ) -> Result<(&'a str, Segment), Incomplete> { - let (rest, segment) = self.subparse(input, eof)?; - match segment { - Segment::SeparateCommands | Segment::EndCommand | Segment::StartCommand => { - // The DEFINE command is malformed because we reached its end - // without ever hitting a `(` token. Transition back to general - // parsing. - self.state.0 = State::General; - } - Segment::Punct if input.starts_with('(') => { - self.nest += 1; - } - Segment::Punct if input.starts_with(')') => { - self.nest -= 1; - if self.nest == 0 { - self.state = (State::Define4, Substate::empty()); - } - } - _ => (), - } - Ok((rest, segment)) - } - fn find_enddefine<'a>(mut input: &'a str) -> Option<&'a str> { - loop { - input = skip_spaces_and_comments(input, true).unwrap(); - let (Some(c), rest) = take(input, true).unwrap() else { - return None; - }; - match c { - '!' if strip_prefix_ignore_ascii_case(input, "!ENDDEFINE").is_some() => { - return Some(input) - } - '\'' | '"' => { - let index = rest.find(c)?; - input = &rest[index + 1..]; - } - _ => input = rest, - } - } - } - - /// We are in the body of a macro definition, looking for additional lines - /// of the body or `!ENDDEFINE`. 
- /// - /// In `State::Define4`, we're parsing the first line of the macro body (the - /// same line as the closing parenthesis in the argument definition). In - /// `State::Define5`, we're on a later line. - fn parse_define_4_5<'a>( - &mut self, - input: &'a str, - eof: bool, - ) -> Result<(&'a str, Segment), Incomplete> { - let rest = self.parse_full_line(input, eof)?; - let line = &input[..input.len() - rest.len()]; - if let Some(end) = Self::find_enddefine(line) { - // Macro ends at the !ENDDEFINE on this line. - self.state = (State::General, Substate::empty()); - let (prefix, rest) = input.split_at(line.len() - end.len()); - if prefix.is_empty() { - // Line starts with `!ENDDEFINE`. - self.push_rest(input, eof) - } else if prefix.trim_start().is_empty() { - // Line starts with spaces followed by `!ENDDEFINE`. - Ok((rest, Segment::Spaces)) - } else { - // Line starts with some content followed by `!ENDDEFINE`. - Ok((rest, Segment::MacroBody)) - } - } else { - // No `!ENDDEFINE`. We have a full line of macro body. - // - // If the first line of the macro body is blank, we just report it - // as spaces, or not at all if there are no spaces, because it's not - // significant. - // - // However, if it's a later line, we need to report it because blank - // lines can have significance. 
- let segment = if self.state.0 == State::Define4 && line.trim_start().is_empty() { - if line.is_empty() { - return self.parse_define_6(input, eof); - } - Segment::Spaces - } else { - Segment::MacroBody - }; - self.state.0 = State::Define6; - Ok((rest, segment)) - } - } - fn parse_define_6<'a>( - &mut self, - input: &'a str, - eof: bool, - ) -> Result<(&'a str, Segment), Incomplete> { - let rest = self.parse_newline(input, eof)?.unwrap(); - self.state.0 = State::Define5; - Ok((rest, Segment::Newline)) - } - fn parse_begin_data_1<'a>( - &mut self, - input: &'a str, - eof: bool, - ) -> Result<(&'a str, Segment), Incomplete> { - let (rest, segment) = self.subparse(input, eof)?; - if segment == Segment::Newline { - self.state.0 = State::BeginData2; - } - Ok((rest, segment)) - } - fn parse_begin_data_2<'a>( - &mut self, - input: &'a str, - eof: bool, - ) -> Result<(&'a str, Segment), Incomplete> { - let (rest, segment) = self.subparse(input, eof)?; - if segment == Segment::Newline { - self.state.0 = State::BeginData3; - } - Ok((rest, segment)) - } - fn is_end_data(line: &str) -> bool { - let Some(rest) = strip_prefix_ignore_ascii_case(line, "END") else { - return false; - }; - let (Some(c), rest) = take(rest, true).unwrap() else { - return false; - }; - if !c.is_whitespace() { - return false; - }; - let Some(rest) = strip_prefix_ignore_ascii_case(rest, "DATA") else { - return false; - }; - - let mut endcmd = false; - for c in rest.chars() { - match c { - '.' if endcmd => return false, - '.' 
=> endcmd = true, - c if c.is_whitespace() => (), - _ => return false, - } - } - true - } - fn parse_begin_data_3<'a>( - &mut self, - input: &'a str, - eof: bool, - ) -> Result<(&'a str, Segment), Incomplete> { - let rest = self.parse_full_line(input, eof)?; - let line = &input[..input.len() - rest.len()]; - if Self::is_end_data(line) { - self.state = ( - State::General, - Substate::START_OF_COMMAND | Substate::START_OF_LINE, - ); - self.push_rest(input, eof) - } else { - self.state.0 = State::BeginData4; - Ok((rest, Segment::InlineData)) - } - } - fn parse_begin_data_4<'a>( - &mut self, - input: &'a str, - eof: bool, - ) -> Result<(&'a str, Segment), Incomplete> { - let rest = self.parse_newline(input, eof)?.unwrap(); - self.state.0 = State::BeginData3; - Ok((rest, Segment::Newline)) - } -} - -fn strip_prefix_ignore_ascii_case<'a>(line: &'a str, pattern: &str) -> Option<&'a str> { - line.get(..pattern.len()) - .map(|prefix| { - prefix - .eq_ignore_ascii_case(pattern) - .then(|| &line[pattern.len()..]) - }) - .flatten() -} - -#[cfg(test)] -mod test; diff --git a/rust/src/lex/segment/test.rs b/rust/src/lex/segment/test.rs deleted file mode 100644 index d8c337dcdf..0000000000 --- a/rust/src/lex/segment/test.rs +++ /dev/null @@ -1,2172 +0,0 @@ -use crate::prompt::PromptStyle; - -use super::{Mode, Segment, Segmenter}; - -fn push_segment<'a>( - segmenter: &mut Segmenter, - input: &'a str, - one_byte: bool, -) -> (usize, Segment) { - if one_byte { - for len in input.char_indices().map(|(pos, _c)| pos) { - if let Ok(result) = segmenter.push(&input[..len], false) { - return result; - } - } - } - segmenter.push(input, true).unwrap() -} - -fn _check_segmentation( - mut input: &str, - mode: Mode, - expect_segments: &[(Segment, &str)], - expect_prompts: &[PromptStyle], - one_byte: bool, -) { - let mut segments = Vec::with_capacity(expect_segments.len()); - let mut prompts = Vec::new(); - let mut segmenter = Segmenter::new(mode, false); - loop { - let (seg_len, seg_type) = 
push_segment(&mut segmenter, input, one_byte); - let (token, rest) = input.split_at(seg_len); - segments.push((seg_type, token)); - match seg_type { - Segment::End => break, - Segment::Newline => prompts.push(segmenter.prompt()), - _ => (), - } - input = rest; - } - - if &segments != expect_segments { - eprintln!("segments differ from expected:"); - let difference = diff::slice(expect_segments, &segments); - for result in difference { - match result { - diff::Result::Left(left) => eprintln!("-{left:?}"), - diff::Result::Both(left, _right) => eprintln!(" {left:?}"), - diff::Result::Right(right) => eprintln!("+{right:?}"), - } - } - panic!(); - } - - if &prompts != expect_prompts { - eprintln!("prompts differ from expected:"); - let difference = diff::slice(expect_prompts, &prompts); - for result in difference { - match result { - diff::Result::Left(left) => eprintln!("-{left:?}"), - diff::Result::Both(left, _right) => eprintln!(" {left:?}"), - diff::Result::Right(right) => eprintln!("+{right:?}"), - } - } - panic!(); - } -} - -fn check_segmentation( - input: &str, - mode: Mode, - expect_segments: &[(Segment, &str)], - expect_prompts: &[PromptStyle], -) { - for (one_byte, one_byte_name) in [(false, "full-string"), (true, "byte-by-byte")] { - println!("running {one_byte_name} segmentation test with LF newlines..."); - _check_segmentation(input, mode, expect_segments, expect_prompts, one_byte); - - println!("running {one_byte_name} segmentation test with CRLF newlines..."); - _check_segmentation( - &input.replace('\n', "\r\n"), - mode, - &expect_segments - .iter() - .map(|(segment, s)| match *segment { - Segment::Newline => (Segment::Newline, "\r\n"), - _ => (*segment, *s), - }) - .collect::>(), - expect_prompts, - one_byte, - ); - - if let Some(input) = input.strip_suffix('\n') { - println!("running {one_byte_name} segmentation test without final newline..."); - let mut expect_segments: Vec<_> = expect_segments.iter().copied().collect(); - 
assert_eq!(expect_segments.pop(), Some((Segment::End, ""))); - assert_eq!(expect_segments.pop(), Some((Segment::Newline, "\n"))); - while let Some((Segment::SeparateCommands | Segment::EndCommand, "")) = - expect_segments.last() - { - expect_segments.pop(); - } - expect_segments.push((Segment::End, "")); - _check_segmentation( - input, - mode, - &expect_segments, - &expect_prompts[..expect_prompts.len() - 1], - one_byte, - ); - } - } -} - -#[allow(dead_code)] -fn print_segmentation(mut input: &str) { - let mut segmenter = Segmenter::new(Mode::Interactive, false); - loop { - let (seg_len, seg_type) = segmenter.push(input, true).unwrap(); - let (token, rest) = input.split_at(seg_len); - print!("{seg_type:?} {token:?}"); - match seg_type { - Segment::Newline => print!(" ({:?})", segmenter.prompt()), - Segment::End => break, - _ => (), - } - println!(); - input = rest; - } -} - -#[test] -fn test_identifiers() { - check_segmentation( - r#"a ab abc abcd !abcd -A AB ABC ABCD !ABCD -aB aBC aBcD !aBcD -$x $y $z !$z -grève Ângstrom poté -#a #b #c ## #d !#d -@efg @ @@. @#@ !@ -## # #12345 #.# -f@#_.#6 -GhIjK -.x 1y _z -!abc abc! 
-"#, - Mode::Auto, - &[ - (Segment::Identifier, "a"), - (Segment::Spaces, " "), - (Segment::Identifier, "ab"), - (Segment::Spaces, " "), - (Segment::Identifier, "abc"), - (Segment::Spaces, " "), - (Segment::Identifier, "abcd"), - (Segment::Spaces, " "), - (Segment::Identifier, "!abcd"), - (Segment::Newline, "\n"), - (Segment::Identifier, "A"), - (Segment::Spaces, " "), - (Segment::Identifier, "AB"), - (Segment::Spaces, " "), - (Segment::Identifier, "ABC"), - (Segment::Spaces, " "), - (Segment::Identifier, "ABCD"), - (Segment::Spaces, " "), - (Segment::Identifier, "!ABCD"), - (Segment::Newline, "\n"), - (Segment::Identifier, "aB"), - (Segment::Spaces, " "), - (Segment::Identifier, "aBC"), - (Segment::Spaces, " "), - (Segment::Identifier, "aBcD"), - (Segment::Spaces, " "), - (Segment::Identifier, "!aBcD"), - (Segment::Newline, "\n"), - (Segment::Identifier, "$x"), - (Segment::Spaces, " "), - (Segment::Identifier, "$y"), - (Segment::Spaces, " "), - (Segment::Identifier, "$z"), - (Segment::Spaces, " "), - (Segment::Identifier, "!$z"), - (Segment::Newline, "\n"), - (Segment::Identifier, "grève"), - (Segment::Spaces, "\u{00a0}"), - (Segment::Identifier, "Ângstrom"), - (Segment::Spaces, "\u{00a0}"), - (Segment::Identifier, "poté"), - (Segment::Newline, "\n"), - (Segment::Identifier, "#a"), - (Segment::Spaces, " "), - (Segment::Identifier, "#b"), - (Segment::Spaces, " "), - (Segment::Identifier, "#c"), - (Segment::Spaces, " "), - (Segment::Identifier, "##"), - (Segment::Spaces, " "), - (Segment::Identifier, "#d"), - (Segment::Spaces, " "), - (Segment::Identifier, "!#d"), - (Segment::Newline, "\n"), - (Segment::Identifier, "@efg"), - (Segment::Spaces, " "), - (Segment::Identifier, "@"), - (Segment::Spaces, " "), - (Segment::Identifier, "@@."), - (Segment::Spaces, " "), - (Segment::Identifier, "@#@"), - (Segment::Spaces, " "), - (Segment::Identifier, "!@"), - (Segment::Spaces, " "), - (Segment::Newline, "\n"), - (Segment::Identifier, "##"), - (Segment::Spaces, " "), - 
(Segment::Identifier, "#"), - (Segment::Spaces, " "), - (Segment::Identifier, "#12345"), - (Segment::Spaces, " "), - (Segment::Identifier, "#.#"), - (Segment::Newline, "\n"), - (Segment::Identifier, "f@#_.#6"), - (Segment::Newline, "\n"), - (Segment::Identifier, "GhIjK"), - (Segment::Newline, "\n"), - (Segment::StartCommand, "."), - (Segment::Identifier, "x"), - (Segment::Spaces, " "), - (Segment::Number, "1"), - (Segment::Identifier, "y"), - (Segment::Spaces, " "), - (Segment::Punct, "_"), - (Segment::Identifier, "z"), - (Segment::Newline, "\n"), - (Segment::Identifier, "!abc"), - (Segment::Spaces, " "), - (Segment::Identifier, "abc"), - (Segment::Punct, "!"), - (Segment::Newline, "\n"), - (Segment::End, ""), - ], - &[ - PromptStyle::Later, - PromptStyle::Later, - PromptStyle::Later, - PromptStyle::Later, - PromptStyle::Later, - PromptStyle::Later, - PromptStyle::Later, - PromptStyle::Later, - PromptStyle::Later, - PromptStyle::Later, - PromptStyle::Later, - PromptStyle::Later, - ], - ); -} - -#[test] -fn test_identifiers_ending_in_dot() { - check_segmentation( - r#"abcd. abcd. -ABCD. ABCD. -aBcD. aBcD. -$y. $z. あいうえお. -#c. #d.. -@@. @@.... -#.#. -#abcd. -. -. -LMNOP. -QRSTUV./* end of line comment */ -qrstuv. /* end of line comment */ -QrStUv./* end of line comment */ -wxyz./* unterminated end of line comment -WXYZ. 
/* unterminated end of line comment -WxYz./* unterminated end of line comment -"#, - Mode::Auto, - &[ - (Segment::Identifier, "abcd."), - (Segment::Spaces, " "), - (Segment::Identifier, "abcd"), - (Segment::EndCommand, "."), - (Segment::Newline, "\n"), - (Segment::Identifier, "ABCD."), - (Segment::Spaces, " "), - (Segment::Identifier, "ABCD"), - (Segment::EndCommand, "."), - (Segment::Newline, "\n"), - (Segment::Identifier, "aBcD."), - (Segment::Spaces, " "), - (Segment::Identifier, "aBcD"), - (Segment::EndCommand, "."), - (Segment::Spaces, " "), - (Segment::Newline, "\n"), - (Segment::Identifier, "$y."), - (Segment::Spaces, " "), - (Segment::Identifier, "$z."), - (Segment::Spaces, " "), - (Segment::Identifier, "あいうえお"), - (Segment::EndCommand, "."), - (Segment::Newline, "\n"), - (Segment::Identifier, "#c."), - (Segment::Spaces, " "), - (Segment::Identifier, "#d."), - (Segment::EndCommand, "."), - (Segment::Newline, "\n"), - (Segment::Identifier, "@@."), - (Segment::Spaces, " "), - (Segment::Identifier, "@@..."), - (Segment::EndCommand, "."), - (Segment::Newline, "\n"), - (Segment::Identifier, "#.#"), - (Segment::EndCommand, "."), - (Segment::Newline, "\n"), - (Segment::Identifier, "#abcd"), - (Segment::EndCommand, "."), - (Segment::Newline, "\n"), - (Segment::StartCommand, "."), - (Segment::Newline, "\n"), - (Segment::StartCommand, "."), - (Segment::Spaces, " "), - (Segment::Newline, "\n"), - (Segment::Identifier, "LMNOP"), - (Segment::EndCommand, "."), - (Segment::Spaces, " "), - (Segment::Newline, "\n"), - (Segment::Identifier, "QRSTUV"), - (Segment::EndCommand, "."), - (Segment::Comment, "/* end of line comment */"), - (Segment::Newline, "\n"), - (Segment::Identifier, "qrstuv"), - (Segment::EndCommand, "."), - (Segment::Spaces, " "), - (Segment::Comment, "/* end of line comment */"), - (Segment::Newline, "\n"), - (Segment::Identifier, "QrStUv"), - (Segment::EndCommand, "."), - (Segment::Comment, "/* end of line comment */"), - (Segment::Spaces, " "), - 
(Segment::Newline, "\n"), - (Segment::Identifier, "wxyz"), - (Segment::EndCommand, "."), - (Segment::Comment, "/* unterminated end of line comment"), - (Segment::Newline, "\n"), - (Segment::Identifier, "WXYZ"), - (Segment::EndCommand, "."), - (Segment::Spaces, " "), - (Segment::Comment, "/* unterminated end of line comment"), - (Segment::Newline, "\n"), - (Segment::Identifier, "WxYz"), - (Segment::EndCommand, "."), - (Segment::Comment, "/* unterminated end of line comment "), - (Segment::Newline, "\n"), - (Segment::End, ""), - ], - &[ - PromptStyle::First, - PromptStyle::First, - PromptStyle::First, - PromptStyle::First, - PromptStyle::First, - PromptStyle::First, - PromptStyle::First, - PromptStyle::First, - PromptStyle::First, - PromptStyle::First, - PromptStyle::First, - PromptStyle::First, - PromptStyle::First, - PromptStyle::First, - PromptStyle::First, - PromptStyle::First, - PromptStyle::First, - ], - ); -} - -#[test] -fn test_reserved_words() { - check_segmentation( - r#"and or not eq ge gt le lt ne all by to with -AND OR NOT EQ GE GT LE LT NE ALL BY TO WITH -andx orx notx eqx gex gtx lex ltx nex allx byx tox withx -and. with. 
-"#, - Mode::Auto, - &[ - (Segment::Identifier, "and"), - (Segment::Spaces, " "), - (Segment::Identifier, "or"), - (Segment::Spaces, " "), - (Segment::Identifier, "not"), - (Segment::Spaces, " "), - (Segment::Identifier, "eq"), - (Segment::Spaces, " "), - (Segment::Identifier, "ge"), - (Segment::Spaces, " "), - (Segment::Identifier, "gt"), - (Segment::Spaces, " "), - (Segment::Identifier, "le"), - (Segment::Spaces, " "), - (Segment::Identifier, "lt"), - (Segment::Spaces, " "), - (Segment::Identifier, "ne"), - (Segment::Spaces, " "), - (Segment::Identifier, "all"), - (Segment::Spaces, " "), - (Segment::Identifier, "by"), - (Segment::Spaces, " "), - (Segment::Identifier, "to"), - (Segment::Spaces, " "), - (Segment::Identifier, "with"), - (Segment::Newline, "\n"), - (Segment::Identifier, "AND"), - (Segment::Spaces, " "), - (Segment::Identifier, "OR"), - (Segment::Spaces, " "), - (Segment::Identifier, "NOT"), - (Segment::Spaces, " "), - (Segment::Identifier, "EQ"), - (Segment::Spaces, " "), - (Segment::Identifier, "GE"), - (Segment::Spaces, " "), - (Segment::Identifier, "GT"), - (Segment::Spaces, " "), - (Segment::Identifier, "LE"), - (Segment::Spaces, " "), - (Segment::Identifier, "LT"), - (Segment::Spaces, " "), - (Segment::Identifier, "NE"), - (Segment::Spaces, " "), - (Segment::Identifier, "ALL"), - (Segment::Spaces, " "), - (Segment::Identifier, "BY"), - (Segment::Spaces, " "), - (Segment::Identifier, "TO"), - (Segment::Spaces, " "), - (Segment::Identifier, "WITH"), - (Segment::Newline, "\n"), - (Segment::Identifier, "andx"), - (Segment::Spaces, " "), - (Segment::Identifier, "orx"), - (Segment::Spaces, " "), - (Segment::Identifier, "notx"), - (Segment::Spaces, " "), - (Segment::Identifier, "eqx"), - (Segment::Spaces, " "), - (Segment::Identifier, "gex"), - (Segment::Spaces, " "), - (Segment::Identifier, "gtx"), - (Segment::Spaces, " "), - (Segment::Identifier, "lex"), - (Segment::Spaces, " "), - (Segment::Identifier, "ltx"), - (Segment::Spaces, " "), - 
(Segment::Identifier, "nex"), - (Segment::Spaces, " "), - (Segment::Identifier, "allx"), - (Segment::Spaces, " "), - (Segment::Identifier, "byx"), - (Segment::Spaces, " "), - (Segment::Identifier, "tox"), - (Segment::Spaces, " "), - (Segment::Identifier, "withx"), - (Segment::Newline, "\n"), - (Segment::Identifier, "and."), - (Segment::Spaces, " "), - (Segment::Identifier, "with"), - (Segment::EndCommand, "."), - (Segment::Newline, "\n"), - (Segment::End, ""), - ], - &[ - PromptStyle::Later, - PromptStyle::Later, - PromptStyle::Later, - PromptStyle::First, - ], - ); -} - -#[test] -fn test_punctuation() { - check_segmentation( - r#"~ & | = >= > <= < ~= <> ( ) , - + * / [ ] ** -~&|=>=><=<~=<>(),-+*/[]**!* -% : ; ? _ ` { } ~ !* -"#, - Mode::Auto, - &[ - (Segment::Punct, "~"), - (Segment::Spaces, " "), - (Segment::Punct, "&"), - (Segment::Spaces, " "), - (Segment::Punct, "|"), - (Segment::Spaces, " "), - (Segment::Punct, "="), - (Segment::Spaces, " "), - (Segment::Punct, ">="), - (Segment::Spaces, " "), - (Segment::Punct, ">"), - (Segment::Spaces, " "), - (Segment::Punct, "<="), - (Segment::Spaces, " "), - (Segment::Punct, "<"), - (Segment::Spaces, " "), - (Segment::Punct, "~="), - (Segment::Spaces, " "), - (Segment::Punct, "<>"), - (Segment::Spaces, " "), - (Segment::Punct, "("), - (Segment::Spaces, " "), - (Segment::Punct, ")"), - (Segment::Spaces, " "), - (Segment::Punct, ","), - (Segment::Spaces, " "), - (Segment::Punct, "-"), - (Segment::Spaces, " "), - (Segment::Punct, "+"), - (Segment::Spaces, " "), - (Segment::Punct, "*"), - (Segment::Spaces, " "), - (Segment::Punct, "/"), - (Segment::Spaces, " "), - (Segment::Punct, "["), - (Segment::Spaces, " "), - (Segment::Punct, "]"), - (Segment::Spaces, " "), - (Segment::Punct, "**"), - (Segment::Newline, "\n"), - (Segment::Punct, "~"), - (Segment::Punct, "&"), - (Segment::Punct, "|"), - (Segment::Punct, "="), - (Segment::Punct, ">="), - (Segment::Punct, ">"), - (Segment::Punct, "<="), - (Segment::Punct, "<"), - 
(Segment::Punct, "~="), - (Segment::Punct, "<>"), - (Segment::Punct, "("), - (Segment::Punct, ")"), - (Segment::Punct, ","), - (Segment::Punct, "-"), - (Segment::Punct, "+"), - (Segment::Punct, "*"), - (Segment::Punct, "/"), - (Segment::Punct, "["), - (Segment::Punct, "]"), - (Segment::Punct, "**"), - (Segment::Punct, "!*"), - (Segment::Newline, "\n"), - (Segment::Punct, "%"), - (Segment::Spaces, " "), - (Segment::Punct, ":"), - (Segment::Spaces, " "), - (Segment::Punct, ";"), - (Segment::Spaces, " "), - (Segment::Punct, "?"), - (Segment::Spaces, " "), - (Segment::Punct, "_"), - (Segment::Spaces, " "), - (Segment::Punct, "`"), - (Segment::Spaces, " "), - (Segment::Punct, "{"), - (Segment::Spaces, " "), - (Segment::Punct, "}"), - (Segment::Spaces, " "), - (Segment::Punct, "~"), - (Segment::Spaces, " "), - (Segment::Punct, "!*"), - (Segment::Newline, "\n"), - (Segment::End, ""), - ], - &[PromptStyle::Later, PromptStyle::Later, PromptStyle::Later], - ); -} - -#[test] -fn test_positive_numbers() { - check_segmentation( - r#"0 1 01 001. 1. -123. /* comment 1 */ /* comment 2 */ -.1 0.1 00.1 00.10 -5e1 6E-1 7e+1 6E+01 6e-03 -.3E1 .4e-1 .5E+1 .6e+01 .7E-03 -1.23e1 45.6E-1 78.9e+1 99.9E+01 11.2e-03 -. 1e e1 1e+ 1e- 1. 
-"#, - Mode::Auto, - &[ - (Segment::Number, "0"), - (Segment::Spaces, " "), - (Segment::Number, "1"), - (Segment::Spaces, " "), - (Segment::Number, "01"), - (Segment::Spaces, " "), - (Segment::Number, "001."), - (Segment::Spaces, " "), - (Segment::Number, "1"), - (Segment::EndCommand, "."), - (Segment::Newline, "\n"), - (Segment::Number, "123"), - (Segment::EndCommand, "."), - (Segment::Spaces, " "), - (Segment::Comment, "/* comment 1 */"), - (Segment::Spaces, " "), - (Segment::Comment, "/* comment 2 */"), - (Segment::Newline, "\n"), - (Segment::StartCommand, "."), - (Segment::Number, "1"), - (Segment::Spaces, " "), - (Segment::Number, "0.1"), - (Segment::Spaces, " "), - (Segment::Number, "00.1"), - (Segment::Spaces, " "), - (Segment::Number, "00.10"), - (Segment::Newline, "\n"), - (Segment::Number, "5e1"), - (Segment::Spaces, " "), - (Segment::Number, "6E-1"), - (Segment::Spaces, " "), - (Segment::Number, "7e+1"), - (Segment::Spaces, " "), - (Segment::Number, "6E+01"), - (Segment::Spaces, " "), - (Segment::Number, "6e-03"), - (Segment::Newline, "\n"), - (Segment::StartCommand, "."), - (Segment::Number, "3E1"), - (Segment::Spaces, " "), - (Segment::Number, ".4e-1"), - (Segment::Spaces, " "), - (Segment::Number, ".5E+1"), - (Segment::Spaces, " "), - (Segment::Number, ".6e+01"), - (Segment::Spaces, " "), - (Segment::Number, ".7E-03"), - (Segment::Newline, "\n"), - (Segment::Number, "1.23e1"), - (Segment::Spaces, " "), - (Segment::Number, "45.6E-1"), - (Segment::Spaces, " "), - (Segment::Number, "78.9e+1"), - (Segment::Spaces, " "), - (Segment::Number, "99.9E+01"), - (Segment::Spaces, " "), - (Segment::Number, "11.2e-03"), - (Segment::Newline, "\n"), - (Segment::StartCommand, "."), - (Segment::Spaces, " "), - (Segment::ExpectedExponent, "1e"), - (Segment::Spaces, " "), - (Segment::Identifier, "e1"), - (Segment::Spaces, " "), - (Segment::ExpectedExponent, "1e+"), - (Segment::Spaces, " "), - (Segment::ExpectedExponent, "1e-"), - (Segment::Spaces, " "), - 
(Segment::Number, "1"), - (Segment::EndCommand, "."), - (Segment::Newline, "\n"), - (Segment::End, ""), - ], - &[ - PromptStyle::First, - PromptStyle::First, - PromptStyle::Later, - PromptStyle::Later, - PromptStyle::Later, - PromptStyle::Later, - PromptStyle::First, - ], - ); -} - -#[test] -fn test_negative_numbers() { - check_segmentation( - r#" -0 -1 -01 -001. -1. - -123. /* comment 1 */ /* comment 2 */ - -.1 -0.1 -00.1 -00.10 - -5e1 -6E-1 -7e+1 -6E+01 -6e-03 - -.3E1 -.4e-1 -.5E+1 -.6e+01 -.7E-03 - -1.23e1 -45.6E-1 -78.9e+1 -99.9E+01 -11.2e-03 - -/**/1 - -. -1e -e1 -1e+ -1e- -1. -"#, - Mode::Auto, - &[ - (Segment::Spaces, " "), - (Segment::Number, "-0"), - (Segment::Spaces, " "), - (Segment::Number, "-1"), - (Segment::Spaces, " "), - (Segment::Number, "-01"), - (Segment::Spaces, " "), - (Segment::Number, "-001."), - (Segment::Spaces, " "), - (Segment::Number, "-1"), - (Segment::EndCommand, "."), - (Segment::Newline, "\n"), - (Segment::Spaces, " "), - (Segment::Number, "-123"), - (Segment::EndCommand, "."), - (Segment::Spaces, " "), - (Segment::Comment, "/* comment 1 */"), - (Segment::Spaces, " "), - (Segment::Comment, "/* comment 2 */"), - (Segment::Newline, "\n"), - (Segment::Spaces, " "), - (Segment::Number, "-.1"), - (Segment::Spaces, " "), - (Segment::Number, "-0.1"), - (Segment::Spaces, " "), - (Segment::Number, "-00.1"), - (Segment::Spaces, " "), - (Segment::Number, "-00.10"), - (Segment::Newline, "\n"), - (Segment::Spaces, " "), - (Segment::Number, "-5e1"), - (Segment::Spaces, " "), - (Segment::Number, "-6E-1"), - (Segment::Spaces, " "), - (Segment::Number, "-7e+1"), - (Segment::Spaces, " "), - (Segment::Number, "-6E+01"), - (Segment::Spaces, " "), - (Segment::Number, "-6e-03"), - (Segment::Newline, "\n"), - (Segment::Spaces, " "), - (Segment::Number, "-.3E1"), - (Segment::Spaces, " "), - (Segment::Number, "-.4e-1"), - (Segment::Spaces, " "), - (Segment::Number, "-.5E+1"), - (Segment::Spaces, " "), - (Segment::Number, "-.6e+01"), - (Segment::Spaces, " "), 
- (Segment::Number, "-.7E-03"), - (Segment::Newline, "\n"), - (Segment::Spaces, " "), - (Segment::Number, "-1.23e1"), - (Segment::Spaces, " "), - (Segment::Number, "-45.6E-1"), - (Segment::Spaces, " "), - (Segment::Number, "-78.9e+1"), - (Segment::Spaces, " "), - (Segment::Number, "-99.9E+01"), - (Segment::Spaces, " "), - (Segment::Number, "-11.2e-03"), - (Segment::Newline, "\n"), - (Segment::Spaces, " "), - (Segment::Punct, "-"), - (Segment::Comment, "/**/"), - (Segment::Number, "1"), - (Segment::Newline, "\n"), - (Segment::Spaces, " "), - (Segment::Punct, "-"), - (Segment::Punct, "."), - (Segment::Spaces, " "), - (Segment::ExpectedExponent, "-1e"), - (Segment::Spaces, " "), - (Segment::Punct, "-"), - (Segment::Identifier, "e1"), - (Segment::Spaces, " "), - (Segment::ExpectedExponent, "-1e+"), - (Segment::Spaces, " "), - (Segment::ExpectedExponent, "-1e-"), - (Segment::Spaces, " "), - (Segment::Number, "-1"), - (Segment::EndCommand, "."), - (Segment::Newline, "\n"), - (Segment::End, ""), - ], - &[ - PromptStyle::First, - PromptStyle::First, - PromptStyle::Later, - PromptStyle::Later, - PromptStyle::Later, - PromptStyle::Later, - PromptStyle::Later, - PromptStyle::First, - ], - ); -} - -#[test] -fn test_strings() { - check_segmentation( - r#"'x' "y" 'abc' -'Don''t' "Can't" 'Won''t' -"""quoted""" '"quoted"' -'' "" -'missing end quote -"missing double quote -x"4142" X'5152' -u'fffd' U"041" -+ new command -+ /* comment */ 'string continuation' -+ /* also a punctuator on blank line -- 'new command' -"#, - Mode::Auto, - &[ - (Segment::QuotedString, "'x'"), - (Segment::Spaces, " "), - (Segment::QuotedString, "\"y\""), - (Segment::Spaces, " "), - (Segment::QuotedString, "'abc'"), - (Segment::Newline, "\n"), - (Segment::QuotedString, "'Don''t'"), - (Segment::Spaces, " "), - (Segment::QuotedString, "\"Can't\""), - (Segment::Spaces, " "), - (Segment::QuotedString, "'Won''t'"), - (Segment::Newline, "\n"), - (Segment::QuotedString, "\"\"\"quoted\"\"\""), - (Segment::Spaces, " 
"), - (Segment::QuotedString, "'\"quoted\"'"), - (Segment::Newline, "\n"), - (Segment::QuotedString, "''"), - (Segment::Spaces, " "), - (Segment::QuotedString, "\"\""), - (Segment::Newline, "\n"), - (Segment::ExpectedQuote, "'missing end quote"), - (Segment::Newline, "\n"), - (Segment::ExpectedQuote, "\"missing double quote"), - (Segment::Newline, "\n"), - (Segment::HexString, "x\"4142\""), - (Segment::Spaces, " "), - (Segment::HexString, "X'5152'"), - (Segment::Newline, "\n"), - (Segment::UnicodeString, "u'fffd'"), - (Segment::Spaces, " "), - (Segment::UnicodeString, "U\"041\""), - (Segment::Newline, "\n"), - (Segment::StartCommand, "+"), - (Segment::Spaces, " "), - (Segment::Identifier, "new"), - (Segment::Spaces, " "), - (Segment::Identifier, "command"), - (Segment::Newline, "\n"), - (Segment::Punct, "+"), - (Segment::Spaces, " "), - (Segment::Comment, "/* comment */"), - (Segment::Spaces, " "), - (Segment::QuotedString, "'string continuation'"), - (Segment::Newline, "\n"), - (Segment::Punct, "+"), - (Segment::Spaces, " "), - (Segment::Comment, "/* also a punctuator on blank line"), - (Segment::Newline, "\n"), - (Segment::StartCommand, "-"), - (Segment::Spaces, " "), - (Segment::QuotedString, "'new command'"), - (Segment::Newline, "\n"), - (Segment::End, ""), - ], - &[ - PromptStyle::Later, - PromptStyle::Later, - PromptStyle::Later, - PromptStyle::Later, - PromptStyle::Later, - PromptStyle::Later, - PromptStyle::Later, - PromptStyle::Later, - PromptStyle::Later, - PromptStyle::Later, - PromptStyle::Later, - PromptStyle::Later, - ], - ); -} - -#[test] -fn test_shbang() { - check_segmentation( - r#"#! /usr/bin/pspp -title my title. -#! /usr/bin/pspp -"#, - Mode::Interactive, - &[ - (Segment::Shbang, "#! 
/usr/bin/pspp"), - (Segment::Newline, "\n"), - (Segment::Identifier, "title"), - (Segment::Spaces, " "), - (Segment::Identifier, "my"), - (Segment::Spaces, " "), - (Segment::Identifier, "title"), - (Segment::EndCommand, "."), - (Segment::Newline, "\n"), - (Segment::Identifier, "#"), - (Segment::Punct, "!"), - (Segment::Spaces, " "), - (Segment::Punct, "/"), - (Segment::Identifier, "usr"), - (Segment::Punct, "/"), - (Segment::Identifier, "bin"), - (Segment::Punct, "/"), - (Segment::Identifier, "pspp"), - (Segment::Newline, "\n"), - (Segment::End, ""), - ], - &[PromptStyle::First, PromptStyle::First, PromptStyle::Later], - ); -} - -#[test] -fn test_comment_command() { - check_segmentation( - r#"* Comment commands "don't -have to contain valid tokens. - -** Check ambiguity with ** token. -****************. - -comment keyword works too. -COMM also. -com is ambiguous with COMPUTE. - - * Comment need not start at left margin. - -* Comment ends with blank line - -next command. - -"#, - Mode::Interactive, - &[ - (Segment::CommentCommand, "* Comment commands \"don't"), - (Segment::Newline, "\n"), - (Segment::CommentCommand, "have to contain valid tokens"), - (Segment::EndCommand, "."), - (Segment::Newline, "\n"), - (Segment::SeparateCommands, ""), - (Segment::Newline, "\n"), - (Segment::CommentCommand, "** Check ambiguity with ** token"), - (Segment::EndCommand, "."), - (Segment::Newline, "\n"), - (Segment::CommentCommand, "****************"), - (Segment::EndCommand, "."), - (Segment::Newline, "\n"), - (Segment::SeparateCommands, ""), - (Segment::Newline, "\n"), - (Segment::CommentCommand, "comment keyword works too"), - (Segment::EndCommand, "."), - (Segment::Newline, "\n"), - (Segment::CommentCommand, "COMM also"), - (Segment::EndCommand, "."), - (Segment::Newline, "\n"), - (Segment::Identifier, "com"), - (Segment::Spaces, " "), - (Segment::Identifier, "is"), - (Segment::Spaces, " "), - (Segment::Identifier, "ambiguous"), - (Segment::Spaces, " "), - (Segment::Identifier, 
"with"), - (Segment::Spaces, " "), - (Segment::Identifier, "COMPUTE"), - (Segment::EndCommand, "."), - (Segment::Newline, "\n"), - (Segment::SeparateCommands, ""), - (Segment::Newline, "\n"), - (Segment::Spaces, " "), - ( - Segment::CommentCommand, - "* Comment need not start at left margin", - ), - (Segment::EndCommand, "."), - (Segment::Newline, "\n"), - (Segment::SeparateCommands, ""), - (Segment::Newline, "\n"), - (Segment::CommentCommand, "* Comment ends with blank line"), - (Segment::Newline, "\n"), - (Segment::SeparateCommands, ""), - (Segment::Newline, "\n"), - (Segment::Identifier, "next"), - (Segment::Spaces, " "), - (Segment::Identifier, "command"), - (Segment::EndCommand, "."), - (Segment::Newline, "\n"), - (Segment::SeparateCommands, ""), - (Segment::Newline, "\n"), - (Segment::End, ""), - ], - &[ - PromptStyle::Comment, - PromptStyle::First, - PromptStyle::First, - PromptStyle::First, - PromptStyle::First, - PromptStyle::First, - PromptStyle::First, - PromptStyle::First, - PromptStyle::First, - PromptStyle::First, - PromptStyle::First, - PromptStyle::First, - PromptStyle::Comment, - PromptStyle::First, - PromptStyle::First, - PromptStyle::First, - ], - ); -} - -#[test] -fn test_document_command() { - check_segmentation( - r#"DOCUMENT one line. -DOC more - than - one - line. -docu -first.paragraph -isn't parsed as tokens - -second paragraph. 
-"#, - Mode::Interactive, - &[ - (Segment::StartDocument, ""), - (Segment::Document, "DOCUMENT one line."), - (Segment::EndCommand, ""), - (Segment::SeparateCommands, ""), - (Segment::Newline, "\n"), - (Segment::StartDocument, ""), - (Segment::Document, "DOC more"), - (Segment::Newline, "\n"), - (Segment::Document, " than"), - (Segment::Newline, "\n"), - (Segment::Document, " one"), - (Segment::Newline, "\n"), - (Segment::Document, " line."), - (Segment::EndCommand, ""), - (Segment::SeparateCommands, ""), - (Segment::Newline, "\n"), - (Segment::StartDocument, ""), - (Segment::Document, "docu"), - (Segment::Newline, "\n"), - (Segment::Document, "first.paragraph"), - (Segment::Newline, "\n"), - (Segment::Document, "isn't parsed as tokens"), - (Segment::Newline, "\n"), - (Segment::Document, ""), - (Segment::Newline, "\n"), - (Segment::Document, "second paragraph."), - (Segment::EndCommand, ""), - (Segment::SeparateCommands, ""), - (Segment::Newline, "\n"), - (Segment::End, ""), - ], - &[ - PromptStyle::First, - PromptStyle::Document, - PromptStyle::Document, - PromptStyle::Document, - PromptStyle::First, - PromptStyle::Document, - PromptStyle::Document, - PromptStyle::Document, - PromptStyle::Document, - PromptStyle::First, - ], - ); -} - -#[test] -fn test_file_label_command() { - check_segmentation( - r#"FIL label isn't quoted. -FILE - lab 'is quoted'. 
-FILE /* -/**/ lab not quoted here either - -"#, - Mode::Interactive, - &[ - (Segment::Identifier, "FIL"), - (Segment::Spaces, " "), - (Segment::Identifier, "label"), - (Segment::Spaces, " "), - (Segment::UnquotedString, "isn't quoted"), - (Segment::EndCommand, "."), - (Segment::Newline, "\n"), - (Segment::Identifier, "FILE"), - (Segment::Newline, "\n"), - (Segment::Spaces, " "), - (Segment::Identifier, "lab"), - (Segment::Spaces, " "), - (Segment::QuotedString, "'is quoted'"), - (Segment::EndCommand, "."), - (Segment::Newline, "\n"), - (Segment::Identifier, "FILE"), - (Segment::Spaces, " "), - (Segment::Comment, "/*"), - (Segment::Newline, "\n"), - (Segment::Comment, "/**/"), - (Segment::Spaces, " "), - (Segment::Identifier, "lab"), - (Segment::Spaces, " "), - (Segment::UnquotedString, "not quoted here either"), - (Segment::Newline, "\n"), - (Segment::SeparateCommands, ""), - (Segment::Newline, "\n"), - (Segment::End, ""), - ], - &[ - PromptStyle::First, - PromptStyle::Later, - PromptStyle::First, - PromptStyle::Later, - PromptStyle::Later, - PromptStyle::First, - ], - ); -} - -#[test] -fn test_begin_data() { - check_segmentation( - r#"begin data. -end data. - -begin data. /* -123 -xxx -end data. - -BEG /**/ DAT /* -5 6 7 /* x - -end data -end data -. - -begin - data. -data -end data. - -begin data "xxx". -begin data 123. 
-not data -"#, - Mode::Interactive, - &[ - (Segment::Identifier, "begin"), - (Segment::Spaces, " "), - (Segment::Identifier, "data"), - (Segment::EndCommand, "."), - (Segment::Newline, "\n"), - (Segment::Identifier, "end"), - (Segment::Spaces, " "), - (Segment::Identifier, "data"), - (Segment::EndCommand, "."), - (Segment::Newline, "\n"), - (Segment::SeparateCommands, ""), - (Segment::Newline, "\n"), - (Segment::Identifier, "begin"), - (Segment::Spaces, " "), - (Segment::Identifier, "data"), - (Segment::EndCommand, "."), - (Segment::Spaces, " "), - (Segment::Comment, "/*"), - (Segment::Newline, "\n"), - (Segment::InlineData, "123"), - (Segment::Newline, "\n"), - (Segment::InlineData, "xxx"), - (Segment::Newline, "\n"), - (Segment::Identifier, "end"), - (Segment::Spaces, " "), - (Segment::Identifier, "data"), - (Segment::EndCommand, "."), - (Segment::Newline, "\n"), - (Segment::SeparateCommands, ""), - (Segment::Newline, "\n"), - (Segment::Identifier, "BEG"), - (Segment::Spaces, " "), - (Segment::Comment, "/**/"), - (Segment::Spaces, " "), - (Segment::Identifier, "DAT"), - (Segment::Spaces, " "), - (Segment::Comment, "/*"), - (Segment::Newline, "\n"), - (Segment::InlineData, "5 6 7 /* x"), - (Segment::Newline, "\n"), - (Segment::InlineData, ""), - (Segment::Newline, "\n"), - (Segment::InlineData, "end data"), - (Segment::Newline, "\n"), - (Segment::Identifier, "end"), - (Segment::Spaces, " "), - (Segment::Identifier, "data"), - (Segment::Newline, "\n"), - (Segment::StartCommand, "."), - (Segment::Newline, "\n"), - (Segment::SeparateCommands, ""), - (Segment::Newline, "\n"), - (Segment::Identifier, "begin"), - (Segment::Newline, "\n"), - (Segment::Spaces, " "), - (Segment::Identifier, "data"), - (Segment::EndCommand, "."), - (Segment::Newline, "\n"), - (Segment::InlineData, "data"), - (Segment::Newline, "\n"), - (Segment::Identifier, "end"), - (Segment::Spaces, " "), - (Segment::Identifier, "data"), - (Segment::EndCommand, "."), - (Segment::Newline, "\n"), - 
(Segment::SeparateCommands, ""), - (Segment::Newline, "\n"), - (Segment::Identifier, "begin"), - (Segment::Spaces, " "), - (Segment::Identifier, "data"), - (Segment::Spaces, " "), - (Segment::QuotedString, "\"xxx\""), - (Segment::EndCommand, "."), - (Segment::Newline, "\n"), - (Segment::Identifier, "begin"), - (Segment::Spaces, " "), - (Segment::Identifier, "data"), - (Segment::Spaces, " "), - (Segment::Number, "123"), - (Segment::EndCommand, "."), - (Segment::Newline, "\n"), - (Segment::Identifier, "not"), - (Segment::Spaces, " "), - (Segment::Identifier, "data"), - (Segment::Newline, "\n"), - (Segment::End, ""), - ], - &[ - PromptStyle::Data, - PromptStyle::First, - PromptStyle::First, - PromptStyle::Data, - PromptStyle::Data, - PromptStyle::Data, - PromptStyle::First, - PromptStyle::First, - PromptStyle::Data, - PromptStyle::Data, - PromptStyle::Data, - PromptStyle::Data, - PromptStyle::Later, - PromptStyle::First, - PromptStyle::First, - PromptStyle::Later, - PromptStyle::Data, - PromptStyle::Data, - PromptStyle::First, - PromptStyle::First, - PromptStyle::First, - PromptStyle::First, - PromptStyle::Later, - ], - ); -} - -#[test] -fn test_do_repeat() { - check_segmentation( - r#"do repeat x=a b c - y=d e f. - do repeat a=1 thru 5. -another command. -second command -+ third command. -end /* x */ /* y */ repeat print. -end - repeat. -do - repeat #a=1. - inner command. -end repeat. 
-"#, - Mode::Interactive, - &[ - (Segment::Identifier, "do"), - (Segment::Spaces, " "), - (Segment::Identifier, "repeat"), - (Segment::Spaces, " "), - (Segment::Identifier, "x"), - (Segment::Punct, "="), - (Segment::Identifier, "a"), - (Segment::Spaces, " "), - (Segment::Identifier, "b"), - (Segment::Spaces, " "), - (Segment::Identifier, "c"), - (Segment::Newline, "\n"), - (Segment::Spaces, " "), - (Segment::Identifier, "y"), - (Segment::Punct, "="), - (Segment::Identifier, "d"), - (Segment::Spaces, " "), - (Segment::Identifier, "e"), - (Segment::Spaces, " "), - (Segment::Identifier, "f"), - (Segment::EndCommand, "."), - (Segment::Newline, "\n"), - (Segment::DoRepeatCommand, " do repeat a=1 thru 5."), - (Segment::Newline, "\n"), - (Segment::DoRepeatCommand, "another command."), - (Segment::Newline, "\n"), - (Segment::DoRepeatCommand, "second command"), - (Segment::Newline, "\n"), - (Segment::DoRepeatCommand, "+ third command."), - (Segment::Newline, "\n"), - ( - Segment::DoRepeatCommand, - "end /* x */ /* y */ repeat print.", - ), - (Segment::Newline, "\n"), - (Segment::Identifier, "end"), - (Segment::Newline, "\n"), - (Segment::Spaces, " "), - (Segment::Identifier, "repeat"), - (Segment::EndCommand, "."), - (Segment::Newline, "\n"), - (Segment::Identifier, "do"), - (Segment::Newline, "\n"), - (Segment::Spaces, " "), - (Segment::Identifier, "repeat"), - (Segment::Spaces, " "), - (Segment::Identifier, "#a"), - (Segment::Punct, "="), - (Segment::Number, "1"), - (Segment::EndCommand, "."), - (Segment::Newline, "\n"), - (Segment::DoRepeatCommand, " inner command."), - (Segment::Newline, "\n"), - (Segment::Identifier, "end"), - (Segment::Spaces, " "), - (Segment::Identifier, "repeat"), - (Segment::EndCommand, "."), - (Segment::Newline, "\n"), - (Segment::End, ""), - ], - &[ - PromptStyle::Later, - PromptStyle::DoRepeat, - PromptStyle::DoRepeat, - PromptStyle::DoRepeat, - PromptStyle::DoRepeat, - PromptStyle::DoRepeat, - PromptStyle::DoRepeat, - PromptStyle::Later, - 
PromptStyle::First, - PromptStyle::Later, - PromptStyle::DoRepeat, - PromptStyle::DoRepeat, - PromptStyle::First, - ], - ); -} - -#[test] -fn test_do_repeat_overflow() { - const N: usize = 257; - let do_repeat: Vec = (0..N) - .map(|i| format!("do repeat v{i}={i} thru {}.\n", i + 5)) - .collect(); - let end_repeat: Vec = (0..N) - .rev() - .map(|i| format!("end repeat. /* {i}\n")) - .collect(); - - let s: String = do_repeat - .iter() - .chain(end_repeat.iter()) - .map(|s| s.as_str()) - .collect(); - let mut expect_output = vec![ - (Segment::Identifier, "do"), - (Segment::Spaces, " "), - (Segment::Identifier, "repeat"), - (Segment::Spaces, " "), - (Segment::Identifier, "v0"), - (Segment::Punct, "="), - (Segment::Number, "0"), - (Segment::Spaces, " "), - (Segment::Identifier, "thru"), - (Segment::Spaces, " "), - (Segment::Number, "5"), - (Segment::EndCommand, "."), - (Segment::Newline, "\n"), - ]; - for i in 1..N { - expect_output.push((Segment::DoRepeatCommand, &do_repeat[i].trim_end())); - if i >= 255 { - expect_output.push((Segment::DoRepeatOverflow, "")); - } - expect_output.push((Segment::Newline, "\n")); - } - for i in 0..254 { - expect_output.push((Segment::DoRepeatCommand, &end_repeat[i].trim_end())); - expect_output.push((Segment::Newline, "\n")); - } - let comments: Vec = (0..(N - 254)).rev().map(|i| format!("/* {i}")).collect(); - for comment in &comments { - expect_output.extend([ - (Segment::Identifier, "end"), - (Segment::Spaces, " "), - (Segment::Identifier, "repeat"), - (Segment::EndCommand, "."), - (Segment::Spaces, " "), - (Segment::Comment, comment), - (Segment::Newline, "\n"), - ]); - } - expect_output.push((Segment::End, "")); - - let expect_prompts: Vec<_> = (0..N * 2 - 3) - .map(|_| PromptStyle::DoRepeat) - .chain([PromptStyle::First, PromptStyle::First, PromptStyle::First]) - .collect(); - check_segmentation(&s, Mode::Interactive, &expect_output, &expect_prompts); -} - -#[test] -fn test_do_repeat_batch() { - check_segmentation( - r#"do repeat 
x=a b c - y=d e f -do repeat a=1 thru 5 -another command -second command -+ third command -end /* x */ /* y */ repeat print -end - repeat -do - repeat #a=1 - - inner command -end repeat -"#, - Mode::Batch, - &[ - (Segment::Identifier, "do"), - (Segment::Spaces, " "), - (Segment::Identifier, "repeat"), - (Segment::Spaces, " "), - (Segment::Identifier, "x"), - (Segment::Punct, "="), - (Segment::Identifier, "a"), - (Segment::Spaces, " "), - (Segment::Identifier, "b"), - (Segment::Spaces, " "), - (Segment::Identifier, "c"), - (Segment::Newline, "\n"), - (Segment::Spaces, " "), - (Segment::Identifier, "y"), - (Segment::Punct, "="), - (Segment::Identifier, "d"), - (Segment::Spaces, " "), - (Segment::Identifier, "e"), - (Segment::Spaces, " "), - (Segment::Identifier, "f"), - (Segment::Newline, "\n"), - (Segment::StartCommand, ""), - (Segment::DoRepeatCommand, "do repeat a=1 thru 5"), - (Segment::Newline, "\n"), - (Segment::DoRepeatCommand, "another command"), - (Segment::Newline, "\n"), - (Segment::DoRepeatCommand, "second command"), - (Segment::Newline, "\n"), - (Segment::DoRepeatCommand, "+ third command"), - (Segment::Newline, "\n"), - (Segment::DoRepeatCommand, "end /* x */ /* y */ repeat print"), - (Segment::Newline, "\n"), - (Segment::Identifier, "end"), - (Segment::Newline, "\n"), - (Segment::Spaces, " "), - (Segment::Identifier, "repeat"), - (Segment::Newline, "\n"), - (Segment::StartCommand, ""), - (Segment::Identifier, "do"), - (Segment::Newline, "\n"), - (Segment::Spaces, " "), - (Segment::Identifier, "repeat"), - (Segment::Spaces, " "), - (Segment::Identifier, "#a"), - (Segment::Punct, "="), - (Segment::Number, "1"), - (Segment::Newline, "\n"), - (Segment::SeparateCommands, ""), - (Segment::Newline, "\n"), - (Segment::DoRepeatCommand, " inner command"), - (Segment::Newline, "\n"), - (Segment::Identifier, "end"), - (Segment::Spaces, " "), - (Segment::Identifier, "repeat"), - (Segment::Newline, "\n"), - (Segment::End, ""), - ], - &[ - PromptStyle::Later, - 
PromptStyle::Later, - PromptStyle::DoRepeat, - PromptStyle::DoRepeat, - PromptStyle::DoRepeat, - PromptStyle::DoRepeat, - PromptStyle::DoRepeat, - PromptStyle::Later, - PromptStyle::Later, - PromptStyle::Later, - PromptStyle::Later, - PromptStyle::DoRepeat, - PromptStyle::DoRepeat, - PromptStyle::Later, - ], - ); -} - -mod define { - use crate::{ - lex::segment::{Mode, Segment}, - prompt::PromptStyle, - }; - - use super::check_segmentation; - - #[test] - fn test_simple() { - check_segmentation( - r#"define !macro1() -var1 var2 var3 "!enddefine" -!enddefine. -"#, - Mode::Interactive, - &[ - (Segment::Identifier, "define"), - (Segment::Spaces, " "), - (Segment::MacroName, "!macro1"), - (Segment::Punct, "("), - (Segment::Punct, ")"), - (Segment::Newline, "\n"), - (Segment::MacroBody, "var1 var2 var3 \"!enddefine\""), - (Segment::Newline, "\n"), - (Segment::Identifier, "!enddefine"), - (Segment::EndCommand, "."), - (Segment::Newline, "\n"), - (Segment::End, ""), - ], - &[PromptStyle::Define, PromptStyle::Define, PromptStyle::First], - ); - } - - #[test] - fn test_no_newline_after_parentheses() { - check_segmentation( - r#"define !macro1() var1 var2 var3 /* !enddefine -!enddefine. -"#, - Mode::Interactive, - &[ - (Segment::Identifier, "define"), - (Segment::Spaces, " "), - (Segment::MacroName, "!macro1"), - (Segment::Punct, "("), - (Segment::Punct, ")"), - (Segment::MacroBody, " var1 var2 var3 /* !enddefine"), - (Segment::Newline, "\n"), - (Segment::Identifier, "!enddefine"), - (Segment::EndCommand, "."), - (Segment::Newline, "\n"), - (Segment::End, ""), - ], - &[PromptStyle::Define, PromptStyle::First], - ); - } - - #[test] - fn test_no_newline_before_enddefine() { - check_segmentation( - r#"define !macro1() -var1 var2 var3!enddefine. 
-"#, - Mode::Interactive, - &[ - (Segment::Identifier, "define"), - (Segment::Spaces, " "), - (Segment::MacroName, "!macro1"), - (Segment::Punct, "("), - (Segment::Punct, ")"), - (Segment::Newline, "\n"), - (Segment::MacroBody, "var1 var2 var3"), - (Segment::Identifier, "!enddefine"), - (Segment::EndCommand, "."), - (Segment::Newline, "\n"), - (Segment::End, ""), - ], - &[PromptStyle::Define, PromptStyle::First], - ); - } - - #[test] - fn test_all_on_one_line() { - check_segmentation( - r#"define !macro1()var1 var2 var3!enddefine. -"#, - Mode::Interactive, - &[ - (Segment::Identifier, "define"), - (Segment::Spaces, " "), - (Segment::MacroName, "!macro1"), - (Segment::Punct, "("), - (Segment::Punct, ")"), - (Segment::MacroBody, "var1 var2 var3"), - (Segment::Identifier, "!enddefine"), - (Segment::EndCommand, "."), - (Segment::Newline, "\n"), - (Segment::End, ""), - ], - &[PromptStyle::First], - ); - } - - #[test] - fn test_empty() { - check_segmentation( - r#"define !macro1() -!enddefine. -"#, - Mode::Interactive, - &[ - (Segment::Identifier, "define"), - (Segment::Spaces, " "), - (Segment::MacroName, "!macro1"), - (Segment::Punct, "("), - (Segment::Punct, ")"), - (Segment::Newline, "\n"), - (Segment::Identifier, "!enddefine"), - (Segment::EndCommand, "."), - (Segment::Newline, "\n"), - (Segment::End, ""), - ], - &[PromptStyle::Define, PromptStyle::First], - ); - } - - #[test] - fn test_blank_lines() { - check_segmentation( - r#"define !macro1() - - -!enddefine. 
-"#, - Mode::Interactive, - &[ - (Segment::Identifier, "define"), - (Segment::Spaces, " "), - (Segment::MacroName, "!macro1"), - (Segment::Punct, "("), - (Segment::Punct, ")"), - (Segment::Newline, "\n"), - (Segment::MacroBody, ""), - (Segment::Newline, "\n"), - (Segment::MacroBody, ""), - (Segment::Newline, "\n"), - (Segment::Identifier, "!enddefine"), - (Segment::EndCommand, "."), - (Segment::Newline, "\n"), - (Segment::End, ""), - ], - &[ - PromptStyle::Define, - PromptStyle::Define, - PromptStyle::Define, - PromptStyle::First, - ], - ); - } - - #[test] - fn test_arguments() { - check_segmentation( - r#"define !macro1(a(), b(), c()) -!enddefine. -"#, - Mode::Interactive, - &[ - (Segment::Identifier, "define"), - (Segment::Spaces, " "), - (Segment::MacroName, "!macro1"), - (Segment::Punct, "("), - (Segment::Identifier, "a"), - (Segment::Punct, "("), - (Segment::Punct, ")"), - (Segment::Punct, ","), - (Segment::Spaces, " "), - (Segment::Identifier, "b"), - (Segment::Punct, "("), - (Segment::Punct, ")"), - (Segment::Punct, ","), - (Segment::Spaces, " "), - (Segment::Identifier, "c"), - (Segment::Punct, "("), - (Segment::Punct, ")"), - (Segment::Punct, ")"), - (Segment::Newline, "\n"), - (Segment::Identifier, "!enddefine"), - (Segment::EndCommand, "."), - (Segment::Newline, "\n"), - (Segment::End, ""), - ], - &[PromptStyle::Define, PromptStyle::First], - ); - } - - #[test] - fn test_multiline_arguments() { - check_segmentation( - r#"define !macro1( - a(), b( - ), - c() -) -!enddefine. 
-"#, - Mode::Interactive, - &[ - (Segment::Identifier, "define"), - (Segment::Spaces, " "), - (Segment::MacroName, "!macro1"), - (Segment::Punct, "("), - (Segment::Newline, "\n"), - (Segment::Spaces, " "), - (Segment::Identifier, "a"), - (Segment::Punct, "("), - (Segment::Punct, ")"), - (Segment::Punct, ","), - (Segment::Spaces, " "), - (Segment::Identifier, "b"), - (Segment::Punct, "("), - (Segment::Newline, "\n"), - (Segment::Spaces, " "), - (Segment::Punct, ")"), - (Segment::Punct, ","), - (Segment::Newline, "\n"), - (Segment::Spaces, " "), - (Segment::Identifier, "c"), - (Segment::Punct, "("), - (Segment::Punct, ")"), - (Segment::Newline, "\n"), - (Segment::Punct, ")"), - (Segment::Newline, "\n"), - (Segment::Identifier, "!enddefine"), - (Segment::EndCommand, "."), - (Segment::Newline, "\n"), - (Segment::End, ""), - ], - &[ - PromptStyle::Later, - PromptStyle::Later, - PromptStyle::Later, - PromptStyle::Later, - PromptStyle::Define, - PromptStyle::First, - ], - ); - } - - #[test] - fn test_arguments_start_on_second_line() { - check_segmentation( - r#"define !macro1 -(x,y,z -) -content 1 -content 2 -!enddefine. 
-"#, - Mode::Interactive, - &[ - (Segment::Identifier, "define"), - (Segment::Spaces, " "), - (Segment::MacroName, "!macro1"), - (Segment::Newline, "\n"), - (Segment::Punct, "("), - (Segment::Identifier, "x"), - (Segment::Punct, ","), - (Segment::Identifier, "y"), - (Segment::Punct, ","), - (Segment::Identifier, "z"), - (Segment::Newline, "\n"), - (Segment::Punct, ")"), - (Segment::Newline, "\n"), - (Segment::MacroBody, "content 1"), - (Segment::Newline, "\n"), - (Segment::MacroBody, "content 2"), - (Segment::Newline, "\n"), - (Segment::Identifier, "!enddefine"), - (Segment::EndCommand, "."), - (Segment::Newline, "\n"), - (Segment::End, ""), - ], - &[ - PromptStyle::Later, - PromptStyle::Later, - PromptStyle::Define, - PromptStyle::Define, - PromptStyle::Define, - PromptStyle::First, - ], - ); - } - - #[test] - fn test_early_end_of_command_1() { - check_segmentation( - r#"define !macro1. -data list /x 1. -"#, - Mode::Interactive, - &[ - (Segment::Identifier, "define"), - (Segment::Spaces, " "), - (Segment::MacroName, "!macro1"), - (Segment::EndCommand, "."), - (Segment::Newline, "\n"), - (Segment::Identifier, "data"), - (Segment::Spaces, " "), - (Segment::Identifier, "list"), - (Segment::Spaces, " "), - (Segment::Punct, "/"), - (Segment::Identifier, "x"), - (Segment::Spaces, " "), - (Segment::Number, "1"), - (Segment::EndCommand, "."), - (Segment::Newline, "\n"), - (Segment::End, ""), - ], - &[PromptStyle::First, PromptStyle::First], - ); - } - - #[test] - fn test_early_end_of_command_2() { - check_segmentation( - r#"define !macro1 -x. -data list /x 1. 
-"#, - Mode::Interactive, - &[ - (Segment::Identifier, "define"), - (Segment::Spaces, " "), - (Segment::MacroName, "!macro1"), - (Segment::Newline, "\n"), - (Segment::Identifier, "x"), - (Segment::EndCommand, "."), - (Segment::Newline, "\n"), - (Segment::Identifier, "data"), - (Segment::Spaces, " "), - (Segment::Identifier, "list"), - (Segment::Spaces, " "), - (Segment::Punct, "/"), - (Segment::Identifier, "x"), - (Segment::Spaces, " "), - (Segment::Number, "1"), - (Segment::EndCommand, "."), - (Segment::Newline, "\n"), - (Segment::End, ""), - ], - &[PromptStyle::Later, PromptStyle::First, PromptStyle::First], - ); - } - - #[test] - fn test_early_end_of_command_3() { - check_segmentation( - r#"define !macro1(. -x. -data list /x 1. -"#, - Mode::Interactive, - &[ - (Segment::Identifier, "define"), - (Segment::Spaces, " "), - (Segment::MacroName, "!macro1"), - (Segment::Punct, "("), - (Segment::EndCommand, "."), - (Segment::Newline, "\n"), - (Segment::Identifier, "x"), - (Segment::EndCommand, "."), - (Segment::Newline, "\n"), - (Segment::Identifier, "data"), - (Segment::Spaces, " "), - (Segment::Identifier, "list"), - (Segment::Spaces, " "), - (Segment::Punct, "/"), - (Segment::Identifier, "x"), - (Segment::Spaces, " "), - (Segment::Number, "1"), - (Segment::EndCommand, "."), - (Segment::Newline, "\n"), - (Segment::End, ""), - ], - &[PromptStyle::First, PromptStyle::First, PromptStyle::First], - ); - } - - #[test] - fn test_early_end_of_command_4() { - // Notice the command terminator at the end of the `DEFINE` command, - // which should not be there and ends it early. - check_segmentation( - r#"define !macro1. -data list /x 1. 
-"#, - Mode::Interactive, - &[ - (Segment::Identifier, "define"), - (Segment::Spaces, " "), - (Segment::MacroName, "!macro1"), - (Segment::EndCommand, "."), - (Segment::Newline, "\n"), - (Segment::Identifier, "data"), - (Segment::Spaces, " "), - (Segment::Identifier, "list"), - (Segment::Spaces, " "), - (Segment::Punct, "/"), - (Segment::Identifier, "x"), - (Segment::Spaces, " "), - (Segment::Number, "1"), - (Segment::EndCommand, "."), - (Segment::Newline, "\n"), - (Segment::End, ""), - ], - &[PromptStyle::First, PromptStyle::First], - ); - } - - #[test] - fn test_missing_enddefine() { - check_segmentation( - r#"define !macro1() -content line 1 -content line 2 -"#, - Mode::Interactive, - &[ - (Segment::Identifier, "define"), - (Segment::Spaces, " "), - (Segment::MacroName, "!macro1"), - (Segment::Punct, "("), - (Segment::Punct, ")"), - (Segment::Newline, "\n"), - (Segment::MacroBody, "content line 1"), - (Segment::Newline, "\n"), - (Segment::MacroBody, "content line 2"), - (Segment::Newline, "\n"), - (Segment::End, ""), - ], - &[ - PromptStyle::Define, - PromptStyle::Define, - PromptStyle::Define, - ], - ); - } - - #[test] - fn test_missing_enddefine_2() { - check_segmentation( - r#"define !macro1() -"#, - Mode::Interactive, - &[ - (Segment::Identifier, "define"), - (Segment::Spaces, " "), - (Segment::MacroName, "!macro1"), - (Segment::Punct, "("), - (Segment::Punct, ")"), - (Segment::Newline, "\n"), - (Segment::End, ""), - ], - &[PromptStyle::Define], - ); - } -} - -#[test] -fn test_batch_mode() { - check_segmentation( - r#"first command - another line of first command -+ second command -third command - -fourth command. - fifth command. 
-"#, - Mode::Batch, - &[ - (Segment::Identifier, "first"), - (Segment::Spaces, " "), - (Segment::Identifier, "command"), - (Segment::Newline, "\n"), - (Segment::Spaces, " "), - (Segment::Identifier, "another"), - (Segment::Spaces, " "), - (Segment::Identifier, "line"), - (Segment::Spaces, " "), - (Segment::Identifier, "of"), - (Segment::Spaces, " "), - (Segment::Identifier, "first"), - (Segment::Spaces, " "), - (Segment::Identifier, "command"), - (Segment::Newline, "\n"), - (Segment::StartCommand, "+"), - (Segment::Spaces, " "), - (Segment::Identifier, "second"), - (Segment::Spaces, " "), - (Segment::Identifier, "command"), - (Segment::Newline, "\n"), - (Segment::StartCommand, ""), - (Segment::Identifier, "third"), - (Segment::Spaces, " "), - (Segment::Identifier, "command"), - (Segment::Newline, "\n"), - (Segment::SeparateCommands, ""), - (Segment::Newline, "\n"), - (Segment::Identifier, "fourth"), - (Segment::Spaces, " "), - (Segment::Identifier, "command"), - (Segment::EndCommand, "."), - (Segment::Newline, "\n"), - (Segment::Spaces, " "), - (Segment::Identifier, "fifth"), - (Segment::Spaces, " "), - (Segment::Identifier, "command"), - (Segment::EndCommand, "."), - (Segment::Newline, "\n"), - (Segment::End, ""), - ], - &[ - PromptStyle::Later, - PromptStyle::Later, - PromptStyle::Later, - PromptStyle::Later, - PromptStyle::First, - PromptStyle::First, - PromptStyle::First, - ], - ); -} - -#[test] -fn test_auto_mode() { - check_segmentation( - r#"command - another line of command -2sls -+ another command -another line of second command -data list /x 1 -aggregate. -print eject. -twostep cluster - - -fourth command. - fifth command. 
-"#, - Mode::Auto, - &[ - (Segment::Identifier, "command"), - (Segment::Newline, "\n"), - (Segment::Spaces, " "), - (Segment::Identifier, "another"), - (Segment::Spaces, " "), - (Segment::Identifier, "line"), - (Segment::Spaces, " "), - (Segment::Identifier, "of"), - (Segment::Spaces, " "), - (Segment::Identifier, "command"), - (Segment::Newline, "\n"), - (Segment::StartCommand, ""), - (Segment::Number, "2"), - (Segment::Identifier, "sls"), - (Segment::Newline, "\n"), - (Segment::StartCommand, "+"), - (Segment::Spaces, " "), - (Segment::Identifier, "another"), - (Segment::Spaces, " "), - (Segment::Identifier, "command"), - (Segment::Newline, "\n"), - (Segment::Identifier, "another"), - (Segment::Spaces, " "), - (Segment::Identifier, "line"), - (Segment::Spaces, " "), - (Segment::Identifier, "of"), - (Segment::Spaces, " "), - (Segment::Identifier, "second"), - (Segment::Spaces, " "), - (Segment::Identifier, "command"), - (Segment::Newline, "\n"), - (Segment::StartCommand, ""), - (Segment::Identifier, "data"), - (Segment::Spaces, " "), - (Segment::Identifier, "list"), - (Segment::Spaces, " "), - (Segment::Punct, "/"), - (Segment::Identifier, "x"), - (Segment::Spaces, " "), - (Segment::Number, "1"), - (Segment::Newline, "\n"), - (Segment::StartCommand, ""), - (Segment::Identifier, "aggregate"), - (Segment::EndCommand, "."), - (Segment::Newline, "\n"), - (Segment::Identifier, "print"), - (Segment::Spaces, " "), - (Segment::Identifier, "eject"), - (Segment::EndCommand, "."), - (Segment::Newline, "\n"), - (Segment::Identifier, "twostep"), - (Segment::Spaces, " "), - (Segment::Identifier, "cluster"), - (Segment::Newline, "\n"), - (Segment::SeparateCommands, ""), - (Segment::Newline, "\n"), - (Segment::SeparateCommands, ""), - (Segment::Newline, "\n"), - (Segment::Identifier, "fourth"), - (Segment::Spaces, " "), - (Segment::Identifier, "command"), - (Segment::EndCommand, "."), - (Segment::Newline, "\n"), - (Segment::Spaces, " "), - (Segment::Identifier, "fifth"), - 
(Segment::Spaces, " "), - (Segment::Identifier, "command"), - (Segment::EndCommand, "."), - (Segment::Newline, "\n"), - (Segment::End, ""), - ], - &[ - PromptStyle::Later, - PromptStyle::Later, - PromptStyle::Later, - PromptStyle::Later, - PromptStyle::Later, - PromptStyle::Later, - PromptStyle::First, - PromptStyle::First, - PromptStyle::Later, - PromptStyle::First, - PromptStyle::First, - PromptStyle::First, - PromptStyle::First, - ], - ); -} diff --git a/rust/src/lex/token.rs b/rust/src/lex/token.rs deleted file mode 100644 index 2b59423b5f..0000000000 --- a/rust/src/lex/token.rs +++ /dev/null @@ -1,272 +0,0 @@ -use std::fmt::{Display, Formatter, Result as FmtResult}; - -use crate::identifier::Identifier; - -#[derive(Clone, Debug, PartialEq)] -pub enum Token { - /// End of input. - End, - - /// Identifier. - Id(Identifier), - - /// Number. - Number(f64), - - /// Quoted string. - String(String), - - /// Command terminator or separator. - /// - /// Usually this is `.`, but a blank line also separates commands, and in - /// batch mode any line that begins with a non-blank starts a new command. - EndCommand, - - /// Operators, punctuators, and reserved words. 
- Punct(Punct), -} - -impl Token { - pub fn id(&self) -> Option<&Identifier> { - match self { - Self::Id(identifier) => Some(identifier), - _ => None, - } - } -} - -fn is_printable(c: char) -> bool { - !c.is_control() || ['\t', '\r', '\n'].contains(&c) -} - -fn string_representation(s: &str, quote: char, f: &mut Formatter<'_>) -> FmtResult { - write!(f, "{quote}")?; - for section in s.split_inclusive(quote) { - if let Some(rest) = section.strip_suffix(quote) { - write!(f, "{rest}{quote}{quote}")?; - } else { - write!(f, "{section}")?; - } - } - write!(f, "{quote}") -} - -impl Display for Token { - fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult { - match self { - Token::End => Ok(()), - Token::Id(s) => write!(f, "{s}"), - Token::Number(number) => { - if number.is_sign_negative() { - write!(f, "-{}", number.abs()) - } else { - write!(f, "{number}") - } - } - Token::String(s) => { - if s.chars().all(|c| is_printable(c)) { - if s.contains('"') { - string_representation(s, '\'', f) - } else { - string_representation(s, '"', f) - } - } else { - write!(f, "X\"")?; - for byte in s.bytes() { - let c1 = char::from_digit((byte >> 4) as u32, 16) - .unwrap() - .to_ascii_uppercase(); - let c2 = char::from_digit((byte & 0xf) as u32, 16) - .unwrap() - .to_ascii_uppercase() - .to_ascii_lowercase(); - write!(f, "{c1}{c2}")?; - } - write!(f, "\"") - } - } - Token::EndCommand => write!(f, "."), - Token::Punct(punct) => punct.fmt(f), - } - } -} - -/// Check that all negative numbers, even -0, get formatted with a leading `-`. -#[cfg(test)] -mod test { - use crate::lex::token::Token; - - #[test] - fn test_string() { - assert_eq!(Token::String(String::from("abc")).to_string(), "\"abc\""); - assert_eq!( - Token::String(String::from("\u{0080}")).to_string(), - "X\"C280\"" - ); - } - - #[test] - fn test_neg0() { - assert_eq!(Token::Number(-0.0).to_string(), "-0"); - } -} - -#[derive(Copy, Clone, Debug, PartialEq, Eq)] -pub enum Punct { - /// `+`. - Plus, - - /// `-`. - Dash, - - /// `*`. 
- Asterisk, - - /// `/`. - Slash, - - /// `=`. - Equals, - - /// `(`. - LParen, - - /// `)`. - RParen, - - /// `[`. - LSquare, - - /// `]`. - RSquare, - - /// `{`. - LCurly, - - /// `}`. - RCurly, - - /// `,`. - Comma, - - /// `;`. - Semicolon, - - /// `:`. - Colon, - - /// `AND` or `&`. - And, - - /// `OR` or `|`. - Or, - - /// `NOT` or `~`. - Not, - - /// `EQ` or `=`. - Eq, - - /// `GE` or '>=` - Ge, - - /// `GT` or `>`. - Gt, - - /// `LE` or `<=`. - Le, - - /// `LT` or `<`. - Lt, - - /// `NE` or `~=` or `<>`. - Ne, - - /// `ALL`. - All, - - /// `BY`. - By, - - /// `TO`. - To, - - /// `WITH`. - With, - - /// `**`. - Exp, - - /// `!` (only appears in macros). - Bang, - - /// `%` (only appears in macros). - Percent, - - /// `?` (only appears in macros). - Question, - - /// ```` (only appears in macros). - Backtick, - - /// `.`. - /// - /// This represents a dot in the middle of a line by itself, where it does not end a command. - Dot, - - /// `_` (only appears in macros). - /// - /// Although underscores may appear within identifiers, they can't be the - /// first character, so this represents an underscore found on its own. - Underscore, - - /// `!*` (only appears in macros). 
- BangAsterisk, -} - -impl Punct { - pub fn as_str(&self) -> &'static str { - match self { - Self::Plus => "+", - Self::Dash => "-", - Self::Asterisk => "*", - Self::Slash => "/", - Self::Equals => "=", - Self::LParen => "(", - Self::RParen => ")", - Self::LSquare => "[", - Self::RSquare => "]", - Self::LCurly => "{", - Self::RCurly => "}", - Self::Comma => ",", - Self::Semicolon => ";", - Self::Colon => ":", - Self::And => "AND", - Self::Or => "OR", - Self::Not => "NOT", - Self::Eq => "EQ", - Self::Ge => ">=", - Self::Gt => ">", - Self::Le => "<=", - Self::Lt => "<", - Self::Ne => "~=", - Self::All => "ALL", - Self::By => "BY", - Self::To => "TO", - Self::With => "WITH", - Self::Exp => "**", - Self::Bang => "!", - Self::Percent => "%", - Self::Question => "?", - Self::Backtick => "`", - Self::Dot => ".", - Self::Underscore => "_", - Self::BangAsterisk => "!*", - } - } -} -impl Display for Punct { - fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult { - write!(f, "{}", self.as_str()) - } -} diff --git a/rust/src/lib.rs b/rust/src/lib.rs deleted file mode 100644 index 3548e020ee..0000000000 --- a/rust/src/lib.rs +++ /dev/null @@ -1,20 +0,0 @@ -#[allow(unused_variables, unused_mut, dead_code)] -pub mod cooked; -pub mod dictionary; -pub mod encoding; -pub mod endian; -pub mod format; -pub mod identifier; -pub mod locale_charset; -pub mod output; -#[allow(unused_variables, unused_mut, dead_code)] -pub mod raw; -pub mod sack; -pub mod lex; -pub mod prompt; -pub mod message; -pub mod macros; -pub mod settings; -pub mod command; -pub mod integer; -pub mod engine; diff --git a/rust/src/locale_charset.rs b/rust/src/locale_charset.rs deleted file mode 100644 index 596fd62406..0000000000 --- a/rust/src/locale_charset.rs +++ /dev/null @@ -1,306 +0,0 @@ -// Determine a canonical name for the current locale's character encoding. -// -// Copyright (C) 2000-2006, 2008-2023 Free Software Foundation, Inc. 
-// -// This file is free software: you can redistribute it and/or modify it under -// the terms of the GNU Lesser General Public License as published by the Free -// Software Foundation; either version 2.1 of the License, or (at your option) -// any later version. -// -// This file is distributed in the hope that it will be useful, but WITHOUT ANY -// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR -// A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more -// details. -// -// You should have received a copy of the GNU Lesser General Public License -// along with this program. If not, see . -// -// Written by Bruno Haible . Translated to Rust by Ben Pfaff -// . - -use lazy_static::lazy_static; - -fn map_aliases(s: &str) -> &'static str { - #[cfg(target_os = "freebsd")] - match s { - "ARMSCII-8" => return "ARMSCII-8", - "Big5" => return "BIG5", - "C" => return "ASCII", - "CP1131" => return "CP1131", - "CP1251" => return "CP1251", - "CP866" => return "CP866", - "GB18030" => return "GB18030", - "GB2312" => return "GB2312", - "GBK" => return "GBK", - "ISCII-DEV" => return "?", - "ISO8859-1" => return "ISO-8859-1", - "ISO8859-13" => return "ISO-8859-13", - "ISO8859-15" => return "ISO-8859-15", - "ISO8859-2" => return "ISO-8859-2", - "ISO8859-5" => return "ISO-8859-5", - "ISO8859-7" => return "ISO-8859-7", - "ISO8859-9" => return "ISO-8859-9", - "KOI8-R" => return "KOI8-R", - "KOI8-U" => return "KOI8-U", - "SJIS" => return "SHIFT_JIS", - "US-ASCII" => return "ASCII", - "eucCN" => return "GB2312", - "eucJP" => return "EUC-JP", - "eucKR" => return "EUC-KR", - _ => (), - }; - - #[cfg(target_os = "netbsd")] - match s { - "646" => return "ASCII", - "ARMSCII-8" => return "ARMSCII-8", - "BIG5" => return "BIG5", - "Big5-HKSCS" => return "BIG5-HKSCS", - "CP1251" => return "CP1251", - "CP866" => return "CP866", - "GB18030" => return "GB18030", - "GB2312" => return "GB2312", - "ISO8859-1" => return "ISO-8859-1", - "ISO8859-13" => 
return "ISO-8859-13", - "ISO8859-15" => return "ISO-8859-15", - "ISO8859-2" => return "ISO-8859-2", - "ISO8859-4" => return "ISO-8859-4", - "ISO8859-5" => return "ISO-8859-5", - "ISO8859-7" => return "ISO-8859-7", - "KOI8-R" => return "KOI8-R", - "KOI8-U" => return "KOI8-U", - "PT154" => return "PT154", - "SJIS" => return "SHIFT_JIS", - "eucCN" => return "GB2312", - "eucJP" => return "EUC-JP", - "eucKR" => return "EUC-KR", - "eucTW" => return "EUC-TW", - _ => (), - }; - - #[cfg(target_os = "openbsd")] - match s { - "646" => return "ASCII", - "ISO8859-1" => return "ISO-8859-1", - "ISO8859-13" => return "ISO-8859-13", - "ISO8859-15" => return "ISO-8859-15", - "ISO8859-2" => return "ISO-8859-2", - "ISO8859-4" => return "ISO-8859-4", - "ISO8859-5" => return "ISO-8859-5", - "ISO8859-7" => return "ISO-8859-7", - "US-ASCII" => return "ASCII", - _ => (), - }; - - /* Darwin 7.5 has nl_langinfo(CODESET), but sometimes its value is - useless: - - It returns the empty string when LANG is set to a locale of the - form ll_CC, although ll_CC/LC_CTYPE is a symlink to an UTF-8 - LC_CTYPE file. - - The environment variables LANG, LC_CTYPE, LC_ALL are not set by - the system; nl_langinfo(CODESET) returns "US-ASCII" in this case. - - The documentation says: - "... all code that calls BSD system routines should ensure - that the const *char parameters of these routines are in UTF-8 - encoding. All BSD system functions expect their string - parameters to be in UTF-8 encoding and nothing else." - It also says - "An additional caveat is that string parameters for files, - paths, and other file-system entities must be in canonical - UTF-8. In a canonical UTF-8 Unicode string, all decomposable - characters are decomposed ..." - but this is not true: You can pass non-decomposed UTF-8 strings - to file system functions, and it is the OS which will convert - them to decomposed UTF-8 before accessing the file system. - - The Apple Terminal application displays UTF-8 by default. 
- - However, other applications are free to use different encodings: - - xterm uses ISO-8859-1 by default. - - TextEdit uses MacRoman by default. - We prefer UTF-8 over decomposed UTF-8-MAC because one should - minimize the use of decomposed Unicode. Unfortunately, through the - Darwin file system, decomposed UTF-8 strings are leaked into user - space nevertheless. - Then there are also the locales with encodings other than US-ASCII - and UTF-8. These locales can be occasionally useful to users (e.g. - when grepping through ISO-8859-1 encoded text files), when all their - file names are in US-ASCII. - */ - - #[cfg(target_os = "macos")] - match s { - "ARMSCII-8" => return "ARMSCII-8", - "Big5" => return "BIG5", - "Big5HKSCS" => return "BIG5-HKSCS", - "CP1131" => return "CP1131", - "CP1251" => return "CP1251", - "CP866" => return "CP866", - "CP949" => return "CP949", - "GB18030" => return "GB18030", - "GB2312" => return "GB2312", - "GBK" => return "GBK", - "ISO8859-1" => return "ISO-8859-1", - "ISO8859-13" => return "ISO-8859-13", - "ISO8859-15" => return "ISO-8859-15", - "ISO8859-2" => return "ISO-8859-2", - "ISO8859-4" => return "ISO-8859-4", - "ISO8859-5" => return "ISO-8859-5", - "ISO8859-7" => return "ISO-8859-7", - "ISO8859-9" => return "ISO-8859-9", - "KOI8-R" => return "KOI8-R", - "KOI8-U" => return "KOI8-U", - "PT154" => return "PT154", - "SJIS" => return "SHIFT_JIS", - "eucCN" => return "GB2312", - "eucJP" => return "EUC-JP", - "eucKR" => return "EUC-KR", - _ => (), - }; - - #[cfg(target_os = "aix")] - match s { - "GBK" => return "GBK", - "IBM-1046" => return "CP1046", - "IBM-1124" => return "CP1124", - "IBM-1129" => return "CP1129", - "IBM-1252" => return "CP1252", - "IBM-850" => return "CP850", - "IBM-856" => return "CP856", - "IBM-921" => return "ISO-8859-13", - "IBM-922" => return "CP922", - "IBM-932" => return "CP932", - "IBM-943" => return "CP943", - "IBM-eucCN" => return "GB2312", - "IBM-eucJP" => return "EUC-JP", - "IBM-eucKR" => return "EUC-KR", - 
"IBM-eucTW" => return "EUC-TW", - "ISO8859-1" => return "ISO-8859-1", - "ISO8859-15" => return "ISO-8859-15", - "ISO8859-2" => return "ISO-8859-2", - "ISO8859-5" => return "ISO-8859-5", - "ISO8859-6" => return "ISO-8859-6", - "ISO8859-7" => return "ISO-8859-7", - "ISO8859-8" => return "ISO-8859-8", - "ISO8859-9" => return "ISO-8859-9", - "TIS-620" => return "TIS-620", - "UTF-8" => return "UTF-8", - "big5" => return "BIG5", - _ => (), - }; - - #[cfg(windows)] - match s { - "CP1361" => return "JOHAB", - "CP20127" => return "ASCII", - "CP20866" => return "KOI8-R", - "CP20936" => return "GB2312", - "CP21866" => return "KOI8-RU", - "CP28591" => return "ISO-8859-1", - "CP28592" => return "ISO-8859-2", - "CP28593" => return "ISO-8859-3", - "CP28594" => return "ISO-8859-4", - "CP28595" => return "ISO-8859-5", - "CP28596" => return "ISO-8859-6", - "CP28597" => return "ISO-8859-7", - "CP28598" => return "ISO-8859-8", - "CP28599" => return "ISO-8859-9", - "CP28605" => return "ISO-8859-15", - "CP38598" => return "ISO-8859-8", - "CP51932" => return "EUC-JP", - "CP51936" => return "GB2312", - "CP51949" => return "EUC-KR", - "CP51950" => return "EUC-TW", - "CP54936" => return "GB18030", - "CP65001" => return "UTF-8", - "CP936" => return "GBK", - _ => (), - }; - - String::from(s).leak() -} - -#[cfg(unix)] -mod inner { - use std::{ - ffi::{c_int, CStr, CString}, - ptr::null, - }; - - use libc::{self, nl_langinfo, setlocale, CODESET, LC_CTYPE}; - - unsafe fn string_from_pointer(s: *const i8) -> Option { - if s.is_null() { - None - } else { - Some(CStr::from_ptr(s).to_string_lossy().into()) - } - } - - fn set_locale(category: c_int, locale: Option<&str>) -> Option { - unsafe { - let locale = locale.map(|s| CString::new(s).unwrap()); - let locale_ptr = locale.as_ref().map_or(null(), |s| s.as_ptr()); - string_from_pointer(setlocale(category, locale_ptr)) - } - } - - pub fn locale_charset() -> Option { - unsafe { - let saved_locale = set_locale(LC_CTYPE, None); - set_locale(LC_CTYPE, 
Some("")); - let codeset = string_from_pointer(nl_langinfo(CODESET)); - set_locale(LC_CTYPE, saved_locale.as_deref()); - codeset - } - } -} - -#[cfg(windows)] -mod inner { - use libc::{setlocale, LC_CTYPE}; - use std::ffi::{CStr, CString}; - use windows_sys::Win32::Globalization::GetACP; - - fn current_locale() -> Option { - unsafe { - let empty_cstr = CString::new("").unwrap(); - let locale = setlocale(LC_CTYPE, empty_cstr.as_ptr()); - if locale.is_null() { - None - } else { - Some(CStr::from_ptr(locale).to_string_lossy().into()) - } - } - } - - pub fn locale_charset() -> Option { - let Some(current_locale) = current_locale() else { - return None; - }; - let codepage = if let Some((_, pdot)) = current_locale.rsplit_once('.') { - format!("CP{pdot}") - } else { - format!("CP{}", unsafe { GetACP() }) - }; - Some(match codepage.as_str() { - "CP65001" | "CPutf8" => String::from("UTF-8"), - _ => codepage, - }) - } -} - -#[cfg(not(any(unix, windows)))] -mod inner { - pub fn locale_charse() -> String { - String::from("UTF-8") - } -} - -/// Returns the character set used by the locale configured in the operating -/// system. -pub fn locale_charset() -> &'static str { - lazy_static! 
{ - static ref LOCALE_CHARSET: &'static str = - map_aliases(&inner::locale_charset().unwrap_or(String::from("UTF-8"))); - } - &LOCALE_CHARSET -} diff --git a/rust/src/macros.rs b/rust/src/macros.rs deleted file mode 100644 index 85671b05a5..0000000000 --- a/rust/src/macros.rs +++ /dev/null @@ -1,1668 +0,0 @@ -use lazy_static::lazy_static; -use num::Integer; -use std::{ - cell::RefCell, - cmp::Ordering, - collections::{BTreeMap, HashMap, HashSet}, - mem::take, - num::NonZeroUsize, - ops::RangeInclusive, -}; -use thiserror::Error as ThisError; -use unicase::UniCase; - -use crate::{ - identifier::Identifier, - lex::{ - scan::{ScanError, ScanToken, StringScanner, StringSegmenter}, - segment::Mode, - token::{Punct, Token}, - }, - message::Location, - settings::Settings, -}; - -#[derive(Clone, Debug, ThisError)] -pub enum MacroError { - /// Expected more tokens. - #[error( - "Reached end of command expecting {n} more tokens in argument {arg} to macro {macro_}." - )] - ExpectedMoreTokens { - n: usize, - arg: Identifier, - macro_: Identifier, - }, - - /// Expected a particular token at end of command. - #[error("Reached end of command expecting {token:?} in argument {arg} to macro {macro_}.")] - ExpectedToken { - token: String, - arg: Identifier, - macro_: Identifier, - }, - - /// Expected a particular token, got a different one. - #[error( - "Found `{actual}` while expecting `{expected}` reading argument {arg} to macro {macro_}." - )] - UnexpectedToken { - actual: String, - expected: String, - arg: Identifier, - macro_: Identifier, - }, - - /// Argument specified multiple times, - #[error("Argument {arg} specified multiple times in call to macro {macro_}.")] - DuplicateArg { arg: Identifier, macro_: Identifier }, - - /// Maximum nesting limit exceeded. - #[error("Maximum nesting level {limit} exceeded. (Use `SET MNEST` to change the limit.)")] - TooDeep { limit: usize }, - - /// Invalid `!*`. 
- #[error("`!*` may only be used within the expansion of a macro.")] - InvalidBangAsterisk, - - /// Error tokenizing during expansion. - #[error(transparent)] - ScanError(ScanError), - - /// Expecting `)` in macro expression. - #[error("Expecting `)` in macro expression.")] - ExpectingRParen, - - /// Expecting literal. - #[error("Expecting literal or function invocation in macro expression.")] - ExpectingLiteral, - - /// Expecting `!THEN`. - #[error("`!THEN` expected in macro `!IF` construct.")] - ExpectingThen, - - /// Expecting `!ELSE` or `!THEN`. - #[error("`!ELSE` or `!THEN` expected in macro `!IF` construct.")] - ExpectingElseOrIfEnd, - - /// Expecting `!IFEND`. - #[error("`!IFEND` expected in macro `!IF` construct.")] - ExpectingIfEnd, - - /// Expecting macro variable name. - #[error("Expecting macro variable name following `{0}`.")] - ExpectingMacroVarName(&'static str), - - /// Invalid macro variable name. - #[error("Cannot use argument name or macro keyword {name} as `{construct}` variable name.")] - BadMacroVarName { - name: Identifier, - construct: &'static str, - }, - - /// Expecting `=` following `!LET`. - #[error("Expecting `=` following `!LET`.")] - ExpectingEquals, - - /// Expecting `=` or `!IN` in `!DO` loop. - #[error("Expecting `=` or `!IN` in `!DO` loop.")] - ExpectingEqualsOrIn, - - /// Missing `!DOEND`. - #[error("Missing `!DOEND`.")] - MissingDoEnd, - - /// Bad numberic macro expression. - #[error("Macro expression must evaluate to a number (not {0:?})")] - BadNumericMacroExpression(String), - - /// Too many iteration for list-based loop. - #[error("`!DO` loop over list exceeded maximum number of iterations {0}. (Use `SET MITERATE` to change the limit.)")] - MiterateList(usize), - - /// Too many iteration for numerical loop. - #[error("Numerical `!DO` loop exceeded maximum number of iterations {0}. (Use `SET MITERATE` to change the limit.)")] - MiterateNumeric(usize), - - /// Expecting `!TO` in numerical `!DO` loop. 
- #[error("Expecting `!TO` in numerical `!DO` loop.")] - ExpectingTo, - - /// `!BY` value cannot be zero. - #[error("`!BY` value cannot be zero.")] - ZeroBy, - - /// `!BREAK` outside `!DO`. - #[error("`!BREAK` outside `!DO`.")] - BreakOutsideDo, - - /// `,` or `)` expected in call to macro function. - #[error("`,` or `)` expected in call to macro function `{0}`.")] - ExpectingCommaOrRParen(Identifier), - - /// Macro function takes one argument. - #[error("Macro function `{name}` takes one argument (not {n_args}).")] - ExpectingOneArg { name: Identifier, n_args: usize }, - - /// Macro function takes two arguments. - #[error("Macro function `{name}` takes two arguments (not {n_args}).")] - ExpectingTwoArgs { name: Identifier, n_args: usize }, - - /// Macro function takes two or three arguments. - #[error("Macro function `{name}` takes two or three arguments (not {n_args}).")] - ExpectingTwoOrThreeArgs { name: Identifier, n_args: usize }, - - /// Macro function needs at least one argument). - #[error("Macro function `{name}` needs at least one argument).")] - ExpectingOneOrMoreArgs { name: Identifier }, - - /// Argument to `!BLANKS` must be non-negative integer (not `{0}`). - #[error("Argument to `!BLANKS` must be non-negative integer (not `{0}`).")] - InvalidBlanks(String), - - /// Second argument of `!SUBSTR` must be positive integer (not `{0}`). - #[error("Second argument of `!SUBSTR` must be positive integer (not `{0}`).")] - InvalidSubstr2(String), - - /// Third argument of `!SUBSTR` must be non-negative integer (not `{0}`). - #[error("Third argument of `!SUBSTR` must be non-negative integer (not `{0}`).")] - InvalidSubstr3(String), -} - -/// A PSPP macro as defined with `!DEFINE`. -pub struct Macro { - /// The macro's name. This is an ordinary identifier except that it is - /// allowed (but not required) to begin with `!`. - pub name: Identifier, - - /// Source code location of macro definition, for error reporting. - pub location: Location, - - /// Parameters. 
- parameters: Vec, - - /// Body. - body: Vec, -} - -impl Macro { - fn initial_state(&self) -> ParserState { - if self.parameters.is_empty() { - ParserState::Finished - } else if self.parameters[0].is_positional() { - ParserState::Keyword - } else if let ValueType::Enclose(_, _) = self.parameters[0].arg { - ParserState::Enclose - } else { - ParserState::Arg - } - } - - fn find_parameter(&self, name: &Identifier) -> Option { - self.parameters.iter().position(|param| ¶m.name == name) - } -} - -struct Parameter { - /// `!name` or `!1`. - name: Identifier, - - /// Default value. - /// - /// The tokens don't include white space, etc. between them. - default: Vec, - - /// Macro-expand the value? - expand_value: bool, - - /// How the argument is specified. - arg: ValueType, -} - -impl Parameter { - /// Returns true if this is a positional parameter. Positional parameters - /// are expanded by index (position) rather than by name. - fn is_positional(&self) -> bool { - self.name.0.as_bytes()[1].is_ascii_digit() - } -} - -enum ValueType { - /// Argument consists of `.0` tokens. - NTokens(usize), - - /// Argument runs until token `.0`. - CharEnd(Token), - - /// Argument starts with token `.0` and ends with token `.1`. - Enclose(Token, Token), - - /// Argument runs until the end of the command. - CmdEnd, -} - -/// A token and the syntax that was tokenized to produce it. The syntax allows -/// the token to be turned back into syntax accurately. -#[derive(Clone)] -pub struct MacroToken { - /// The token. - pub token: Token, - - /// The syntax that produces `token`. 
- pub syntax: String, -} - -fn tokenize_string_into( - s: &str, - mode: Mode, - error: &impl Fn(MacroError), - output: &mut Vec, -) { - for (syntax, token) in StringSegmenter::new(s, mode, true) { - match token { - ScanToken::Token(token) => output.push(MacroToken { - token, - syntax: String::from(syntax), - }), - ScanToken::Error(scan_error) => error(MacroError::ScanError(scan_error)), - } - } -} - -fn tokenize_string(s: &str, mode: Mode, error: &impl Fn(MacroError)) -> Vec { - let mut tokens = Vec::new(); - tokenize_string_into(s, mode, error, &mut tokens); - tokens -} - -fn try_unquote_string(input: &String, mode: Mode) -> Option { - let mut scanner = StringScanner::new(input, mode, true); - let Some(ScanToken::Token(Token::String(unquoted))) = scanner.next() else { - return None; - }; - let None = scanner.next() else { return None }; - return Some(unquoted); -} - -fn unquote_string(input: String, mode: Mode) -> String { - try_unquote_string(&input, mode).unwrap_or(input) -} - -#[derive(Clone)] -struct MacroTokens<'a>(&'a [MacroToken]); - -impl<'a> MacroTokens<'a> { - fn is_empty(&self) -> bool { - self.0.is_empty() - } - fn match_(&mut self, s: &str) -> bool { - if let Some((first, rest)) = self.0.split_first() { - if first.syntax.eq_ignore_ascii_case(s) { - self.0 = rest; - return true; - } - } - false - } - fn take_relop(&mut self) -> Option { - if let Some((first, rest)) = self.0.split_first() { - if let Ok(relop) = first.syntax.as_str().try_into() { - self.0 = rest; - return Some(relop); - } - } - None - } - fn macro_id(&self) -> Option<&Identifier> { - self.0.get(0).map(|mt| mt.token.macro_id()).flatten() - } - fn take_macro_id(&mut self) -> Option<&Identifier> { - let result = self.0.get(0).map(|mt| mt.token.macro_id()).flatten(); - if result.is_some() { - self.advance(); - } - result - } - fn take(&mut self) -> Option<&MacroToken> { - match self.0.split_first() { - Some((first, rest)) => { - self.0 = rest; - Some(first) - } - None => None, - } - } - fn 
advance(&mut self) -> &MacroToken { - let (first, rest) = self.0.split_first().unwrap(); - self.0 = rest; - first - } -} - -#[derive(Copy, Clone, Debug, PartialEq, Eq)] -enum TokenClass { - /// No space before or after (new-line after). - EndCommand, - - /// Space on both sides. - BinaryOperator, - - /// Space afterward. - Comma, - - /// Don't need spaces except sequentially. - Id, - - /// Don't need spaces except sequentially. - Punct, -} - -impl TokenClass { - fn separator(prev: Self, next: Self) -> &'static str { - match (prev, next) { - // Don't need a separator before the end of a command, but we - // need a new-line afterward. - (_, Self::EndCommand) => "", - (Self::EndCommand, _) => "\n", - - // Binary operators always have a space on both sides, and a comma always has a space afterward. - (Self::BinaryOperator, _) | (_, Self::BinaryOperator) | (Self::Comma, _) => " ", - - // Otherwise, `prev` is `Self::Punct`, which only need a space if - // there are two or them in a row. - (Self::Punct, Self::Punct) => " ", - _ => "", - } - } -} - -impl From<&Token> for TokenClass { - fn from(source: &Token) -> Self { - match source { - Token::End => Self::Punct, - Token::Id(_) | Token::Number(_) | Token::String(_) => Self::Id, - Token::EndCommand => Self::EndCommand, - Token::Punct(punct) => match punct { - Punct::LParen - | Punct::RParen - | Punct::LSquare - | Punct::RSquare - | Punct::LCurly - | Punct::RCurly => Self::Punct, - - Punct::Plus - | Punct::Dash - | Punct::Asterisk - | Punct::Slash - | Punct::Equals - | Punct::Colon - | Punct::And - | Punct::Or - | Punct::Not - | Punct::Eq - | Punct::Ge - | Punct::Gt - | Punct::Le - | Punct::Lt - | Punct::Ne - | Punct::All - | Punct::By - | Punct::To - | Punct::With - | Punct::Exp - | Punct::Bang - | Punct::Percent - | Punct::Question - | Punct::Backtick - | Punct::Dot - | Punct::Underscore - | Punct::BangAsterisk => Self::BinaryOperator, - - Punct::Comma | Punct::Semicolon => Self::Comma, - }, - } - } -} - -pub fn 
macro_tokens_to_syntax(input: &[MacroToken]) -> impl Iterator { - input - .iter() - .take(1) - .map(|token| ["", token.syntax.as_str()]) - .chain(input.windows(2).map(|w| { - let c0 = (&w[0].token).into(); - let c1 = (&w[1].token).into(); - [TokenClass::separator(c0, c1), w[1].syntax.as_str()] - })) -} - -trait MacroId { - fn macro_id(&self) -> Option<&Identifier>; -} - -impl MacroId for Token { - fn macro_id(&self) -> Option<&Identifier> { - let id = self.id()?; - id.0.starts_with('!').then_some(id) - } -} - -enum RelOp { - Eq, - Ne, - Lt, - Gt, - Le, - Ge, -} - -impl TryFrom<&str> for RelOp { - type Error = (); - - fn try_from(source: &str) -> Result { - match source { - "=" => Ok(Self::Eq), - "~=" | "<>" => Ok(Self::Ne), - "<" => Ok(Self::Lt), - ">" => Ok(Self::Gt), - "<=" => Ok(Self::Le), - ">=" => Ok(Self::Ge), - _ if source.len() == 3 && source.as_bytes()[0] == b'!' => match ( - source.as_bytes()[0].to_ascii_uppercase(), - source.as_bytes()[1].to_ascii_uppercase(), - ) { - (b'E', b'Q') => Ok(Self::Eq), - (b'N', b'E') => Ok(Self::Ne), - (b'L', b'T') => Ok(Self::Lt), - (b'L', b'E') => Ok(Self::Le), - (b'G', b'T') => Ok(Self::Gt), - (b'G', b'E') => Ok(Self::Ge), - _ => Err(()), - }, - _ => Err(()), - } - } -} - -impl RelOp { - fn evaluate(&self, cmp: Ordering) -> bool { - match self { - RelOp::Eq => cmp == Ordering::Equal, - RelOp::Ne => cmp != Ordering::Equal, - RelOp::Lt => cmp == Ordering::Less, - RelOp::Gt => cmp == Ordering::Greater, - RelOp::Le => cmp != Ordering::Greater, - RelOp::Ge => cmp != Ordering::Less, - } - } -} - -pub type MacroSet = HashMap, Macro>; - -enum ParserState { - /// Accumulating tokens toward the end of any type of argument. - Arg, - - /// Expecting the opening delimiter of an ARG_ENCLOSE argument. - Enclose, - - /// Expecting a keyword for a keyword argument. - Keyword, - - /// Expecting an equal sign for a keyword argument. - Equals, - - /// Macro fully parsed and ready for expansion. - Finished, -} - -/// Macro call parser FSM. 
-pub struct Parser<'a> { - macros: &'a MacroSet, - macro_: &'a Macro, - state: ParserState, - args: Box<[Option>]>, - arg_index: usize, - - /// Length of macro call so far. - n_tokens: usize, -} - -#[derive(Copy, Clone, Debug, PartialEq, Eq)] -pub enum ParseStatus { - Complete, - Incomplete, -} - -impl<'a> Parser<'a> { - pub fn new(macros: &'a MacroSet, token: &Token) -> Option { - let macro_ = macros.get(&token.id()?.0)?; - Some(Self { - macros, - macro_, - state: macro_.initial_state(), - args: (0..macro_.parameters.len()).map(|_| None).collect(), - arg_index: 0, - n_tokens: 1, - }) - } - - fn finished(&mut self) { - self.state = ParserState::Finished; - for (i, arg) in self.args.iter_mut().enumerate() { - if arg.is_none() { - *arg = Some(self.macro_.parameters[i].default.clone()); - } - } - self.state = ParserState::Finished; - } - - fn next_arg(&mut self) { - if self.macro_.parameters.is_empty() { - self.finished() - } else { - let param = &self.macro_.parameters[self.arg_index]; - if param.is_positional() { - self.arg_index += 1; - if self.arg_index >= self.args.len() { - self.finished() - } else { - let param = &self.macro_.parameters[self.arg_index]; - self.state = if !param.is_positional() { - ParserState::Keyword - } else if let ValueType::Enclose(_, _) = param.arg { - ParserState::Enclose - } else { - ParserState::Arg - }; - } - } else { - if self.args.iter().any(|arg| arg.is_none()) { - self.state = ParserState::Keyword; - } else { - self.finished(); - } - } - } - } - - fn push_arg(&mut self, token: &Token, syntax: &str, error: &impl Fn(MacroError)) { - let param = &self.macro_.parameters[self.args.len() - 1]; - if let Token::EndCommand | Token::End = token { - if let Some(arg) = &self.args[self.arg_index] { - let param = &self.macro_.parameters[self.args.len() - 1]; - - match ¶m.arg { - ValueType::NTokens(n) => error(MacroError::ExpectedMoreTokens { - n: n - arg.len(), - arg: param.name.clone(), - macro_: self.macro_.name.clone(), - }), - 
ValueType::CharEnd(end) | ValueType::Enclose(_, end) => { - error(MacroError::ExpectedToken { - token: end.to_string(), - arg: param.name.clone(), - macro_: self.macro_.name.clone(), - }) - } - ValueType::CmdEnd => { - // This is OK, it's the expected way to end the argument. - } - } - } - self.finished(); - } - - self.n_tokens += 1; - let arg = self.args[self.arg_index].get_or_insert(Vec::new()); - let ( - add_token, // Should we add `mt` to the current arg? - next_arg, // Should we advance to the next arg? - ) = match ¶m.arg { - ValueType::NTokens(n) => (arg.len() + 1 >= *n, true), - ValueType::CharEnd(end) | ValueType::Enclose(_, end) => { - let at_end = token == end; - (at_end, !at_end) - } - ValueType::CmdEnd => (false, true), - }; - if add_token { - if true - // !macro_expand_arg (&mt->token, mc->me, *argp) - { - arg.push(MacroToken { - token: token.clone(), - syntax: String::from(syntax), - }); - } - } - if next_arg { - self.next_arg() - } - } - - fn push_enclose(&mut self, token: &Token, syntax: &str, error: &impl Fn(MacroError)) { - let param = &self.macro_.parameters[self.arg_index]; - let ValueType::Enclose(start, _) = ¶m.arg else { - unreachable!() - }; - if token == start { - self.n_tokens += 1; - self.args[self.arg_index].get_or_insert(Vec::new()); - self.state = ParserState::Arg; - } else if param.is_positional() && matches!(token, Token::End | Token::EndCommand) { - self.finished(); - } else { - error(MacroError::UnexpectedToken { - actual: String::from(syntax), - expected: start.to_string(), - arg: param.name.clone(), - macro_: self.macro_.name.clone(), - }); - self.finished(); - } - } - - fn push_keyword(&mut self, token: &Token, _syntax: &str, error: &impl Fn(MacroError)) { - let Some(id) = token.id() else { - return self.finished(); - }; - let Some(arg_index) = self.macro_.find_parameter(id) else { - return self.finished(); - }; - self.arg_index = arg_index; - if self.args[arg_index].is_some() { - error(MacroError::DuplicateArg { - arg: 
id.clone(), - macro_: self.macro_.name.clone(), - }); - } - self.args[arg_index] = Some(Vec::new()); - } - - fn push_equals(&mut self, token: &Token, syntax: &str, error: &impl Fn(MacroError)) { - let param = &self.macro_.parameters[self.arg_index]; - if let Token::Punct(Punct::Eq) = token { - self.n_tokens += 1; - self.state = if let ValueType::Enclose(_, _) = param.arg { - ParserState::Enclose - } else { - ParserState::Arg - }; - } else { - error(MacroError::UnexpectedToken { - actual: syntax.into(), - expected: String::from("="), - arg: param.name.clone(), - macro_: self.macro_.name.clone(), - }); - self.finished() - } - } - - /// Adds `token`, which has the given `syntax`, to the collection of tokens - /// in `self` that potentially need to be macro expanded. - /// - /// Returns [ParseStatus::Incomplete] if the macro expander needs more - /// tokens, for macro arguments or to decide whether this is actually a - /// macro invocation. The caller should call `push` again with the next - /// token. - /// - /// Returns [ParseStatus::Complete] if the macro invocation is now complete. - /// The caller should call [`Self::finish()`] to obtain the expansion. - pub fn push( - &mut self, - token: &Token, - syntax: &str, - error: &impl Fn(MacroError), - ) -> ParseStatus { - match self.state { - ParserState::Arg => self.push_arg(token, syntax, error), - ParserState::Enclose => self.push_enclose(token, syntax, error), - ParserState::Keyword => self.push_keyword(token, syntax, error), - ParserState::Equals => self.push_equals(token, syntax, error), - ParserState::Finished => (), - } - if let ParserState::Finished = self.state { - ParseStatus::Complete - } else { - ParseStatus::Incomplete - } - } - - pub fn finish(self) -> Call<'a> { - let ParserState::Finished = self.state else { - panic!() - }; - Call(self) - } -} - -/// Expansion stack entry. -struct Frame { - /// A macro name or `!IF`, `!DO`, etc. - name: Option, - - /// Source location, if available. 
- location: Option, -} - -struct Expander<'a> { - /// Macros to expand recursively. - macros: &'a MacroSet, - - /// Error reporting callback. - error: &'a Box, - - /// Tokenization mode. - mode: Mode, - - /// Remaining nesting levels. - nesting_countdown: usize, - - /// Stack for error reporting. - stack: Vec, - - // May macro calls be expanded? - expand: &'a RefCell, - - /// Variables from `!DO` and `!LET`. - vars: &'a RefCell>, - - // Only set if inside a `!DO` loop. If true, break out of the loop. - break_: Option<&'a mut bool>, - - /// Only set if expanding a macro (and not, say, a macro argument). - macro_: Option<&'a Macro>, - - /// Only set if expanding a macro (and not, say, a macro argument). - args: Option<&'a [Option>]>, -} - -fn bool_to_string(b: bool) -> String { - if b { - String::from("1") - } else { - String::from("0") - } -} - -#[derive(Copy, Clone, Debug, PartialEq, Eq)] -enum IfEndClause { - Else, - IfEnd, -} - -fn macro_keywords() -> HashSet { - let mut keywords = HashSet::new(); - for kw in [ - "!BREAK", - "!CHAREND", - "!CMDEND", - "!DEFAULT", - "!DO", - "!DOEND", - "!ELSE", - "!ENCLOSE", - "!ENDDEFINE", - "!IF", - "!IFEND", - "!IN", - "!LET", - "!NOEXPAND", - "!OFFEXPAND", - "!ONEXPAND", - "!POSITIONAL", - "!THEN", - "!TOKENS", - ] { - keywords.insert(Identifier::new(kw).unwrap()); - } - keywords -} - -fn is_macro_keyword(s: &Identifier) -> bool { - lazy_static! 
{ - static ref KEYWORDS: HashSet = macro_keywords(); - } - KEYWORDS.contains(s) -} - -enum DoInput { - List(Vec), - Up { first: f64, last: f64, by: f64 }, - Down { first: f64, last: f64, by: f64 }, - Empty, -} - -impl DoInput { - fn from_list(items: Vec) -> Self { - Self::List( - items - .into_iter() - .rev() - .take(Settings::global().macros.max_iterations + 1) - .map(|mt| mt.syntax) - .collect(), - ) - } - - fn from_by(first: f64, last: f64, by: f64) -> Self { - if by > 0.0 && first <= last { - Self::Up { first, last, by } - } else if by > 0.0 && first <= last { - Self::Down { first, last, by } - } else { - Self::Empty - } - } -} - -impl Iterator for DoInput { - type Item = String; - - fn next(&mut self) -> Option { - match self { - DoInput::List(vec) => vec.pop(), - DoInput::Up { first, last, by } => { - if first <= last { - let value = *first; - *first += *by; - Some(format!("{value}")) - } else { - None - } - } - DoInput::Down { first, last, by } => { - if first >= last { - let value = *first; - *first += *by; - Some(format!("{value}")) - } else { - None - } - } - DoInput::Empty => None, - } - } -} - -impl<'a> Expander<'a> { - fn may_expand(&self) -> bool { - *self.expand.borrow() - } - - fn should_break(&self) -> bool { - self.break_.as_ref().map(|b| **b).unwrap_or(false) - } - - fn expand(&mut self, input: &mut MacroTokens, output: &mut Vec) { - if self.nesting_countdown == 0 { - (self.error)(MacroError::TooDeep { - limit: Settings::global().macros.max_nest, - }); - output.extend(take(&mut input.0).iter().cloned()); - } else { - while !input.is_empty() && !self.should_break() { - self.expand__(input, output); - } - } - } - - fn expand_arg(&mut self, param_idx: usize, output: &mut Vec) { - let param = &self.macro_.unwrap().parameters[param_idx]; - let arg = &self.args.unwrap()[param_idx].as_ref().unwrap(); - if self.may_expand() && param.expand_value { - let vars = RefCell::new(BTreeMap::new()); - let mut stack = take(&mut self.stack); - stack.push(Frame { - 
name: Some(param.name.clone()), - location: None, - }); - let mut subexpander = Expander { - stack, - vars: &vars, - break_: None, - macro_: None, - args: None, - ..*self - }; - let mut arg_tokens = MacroTokens(&arg); - subexpander.expand(&mut arg_tokens, output); - self.stack = subexpander.stack; - self.stack.pop(); - } else { - output.extend(arg.iter().cloned()); - } - } - fn parse_function_args( - &mut self, - function: &Identifier, - input: &mut MacroTokens, - ) -> Option> { - input.advance(); - input.advance(); - let mut args = Vec::new(); - if input.match_(")") { - return Some(args); - } - loop { - args.push(self.parse_function_arg(input)?); - match input.take() { - Some(MacroToken { - token: Token::Punct(Punct::Comma), - .. - }) => (), - Some(MacroToken { - token: Token::Punct(Punct::RParen), - .. - }) => return Some(args), - _ => { - (self.error)(MacroError::ExpectingCommaOrRParen(function.clone())); - return None; - } - } - } - } - - fn expand_blanks(e: &mut Expander, args: Vec) -> Option { - let Ok(n) = args[0].trim().parse::() else { - (e.error)(MacroError::InvalidBlanks(args[0].clone())); - return None; - }; - Some(std::iter::repeat(' ').take(n).collect()) - } - - fn expand_concat(e: &mut Expander, args: Vec) -> Option { - Some( - args.into_iter() - .map(|arg| unquote_string(arg, e.mode)) - .collect(), - ) - } - - fn expand_eval(e: &mut Expander, args: Vec) -> Option { - let tokens = tokenize_string(&args[0], e.mode, e.error); - let mut stack = take(&mut e.stack); - stack.push(Frame { - name: Some(Identifier::new("!EVAL").unwrap()), - location: None, - }); - let mut break_ = false; - let mut subexpander = Expander { - break_: Some(&mut break_), - stack, - vars: e.vars, - ..*e - }; - let mut output = Vec::new(); - subexpander.expand(&mut MacroTokens(tokens.as_slice()), &mut output); - subexpander.stack.pop(); - e.stack = subexpander.stack; - Some(macro_tokens_to_syntax(&output).flatten().collect()) - } - - fn expand_head(e: &mut Expander, mut args: Vec) 
-> Option { - let arg = unquote_string(args.remove(0), e.mode); - let mut output = tokenize_string(&arg, e.mode, e.error); - if output.is_empty() { - Some(String::new()) - } else { - Some(output.swap_remove(0).syntax) - } - } - - fn expand_index(_e: &mut Expander, args: Vec) -> Option { - let haystack = &args[0]; - let needle = &args[1]; - let position = haystack.find(needle); - Some(format!( - "{}", - position.map_or(0, |position| &haystack[0..position].chars().count() + 1) - )) - } - - fn expand_length(_e: &mut Expander, args: Vec) -> Option { - Some(format!("{}", args[0].chars().count())) - } - - fn expand_quote(e: &mut Expander, mut args: Vec) -> Option { - let arg = args.remove(0); - if try_unquote_string(&arg, e.mode).is_some() { - Some(arg) - } else { - let mut output = String::with_capacity(arg.len() + 2); - output.push('\''); - for c in arg.chars() { - if c == '"' { - output.push('\''); - } - output.push(c); - } - output.push('\''); - Some(output) - } - } - - fn expand_substr(e: &mut Expander, args: Vec) -> Option { - let Ok(start) = args[1].trim().parse::() else { - (e.error)(MacroError::InvalidSubstr3(args[0].clone())); - return None; - }; - let start = start.get(); - let Ok(count) = args[2].trim().parse::() else { - (e.error)(MacroError::InvalidSubstr2(args[0].clone())); - return None; - }; - - Some(args[0].chars().skip(start - 1).take(count).collect()) - } - - fn expand_tail(e: &mut Expander, mut args: Vec) -> Option { - let arg = unquote_string(args.remove(0), e.mode); - let mut output = tokenize_string(&arg, e.mode, e.error); - Some( - output - .pop() - .map_or_else(|| String::new(), |tail| tail.syntax), - ) - } - - fn expand_unquote(e: &mut Expander, mut args: Vec) -> Option { - Some(unquote_string(args.remove(0), e.mode)) - } - - fn expand_upcase(e: &mut Expander, mut args: Vec) -> Option { - Some(unquote_string(args.remove(0), e.mode).to_uppercase()) - } - - fn expand_macro_function(&mut self, orig_input: &mut MacroTokens) -> Option { - let mut 
input = orig_input.clone(); - let name = input.macro_id()?; - if name == "!NULL" { - return Some(String::new()); - } - if input.0.len() < 2 || !matches!(input.0[1].token, Token::Punct(Punct::LParen)) { - return None; - } - - struct MacroFunction { - name: Identifier, - args: RangeInclusive, - parser: fn(&mut Expander, Vec) -> Option, - } - impl MacroFunction { - fn new( - name: &str, - args: RangeInclusive, - parser: fn(&mut Expander, Vec) -> Option, - ) -> Self { - Self { - name: Identifier::new(name).unwrap(), - args, - parser, - } - } - } - lazy_static! { - static ref MACRO_FUNCTIONS: [MacroFunction; 11] = [ - MacroFunction::new("!BLANKS", 1..=1, Expander::expand_blanks), - MacroFunction::new("!CONCAT", 1..=usize::MAX, Expander::expand_concat), - MacroFunction::new("!HEAD", 1..=1, Expander::expand_head), - MacroFunction::new("!INDEX", 2..=2, Expander::expand_index), - MacroFunction::new("!LENGTH", 1..=1, Expander::expand_length), - MacroFunction::new("!QUOTE", 1..=1, Expander::expand_quote), - MacroFunction::new("!SUBSTR", 2..=3, Expander::expand_substr), - MacroFunction::new("!TAIL", 1..=1, Expander::expand_tail), - MacroFunction::new("!UNQUOTE", 1..=1, Expander::expand_unquote), - MacroFunction::new("!UPCASE", 1..=1, Expander::expand_upcase), - MacroFunction::new("!EVAL", 1..=1, Expander::expand_eval), - ]; - } - - let function = MACRO_FUNCTIONS.iter().find(|mf| &mf.name == name)?; - - let args = self.parse_function_args(&function.name, &mut input)?; - - let n_args = args.len(); - if !function.args.contains(&n_args) { - let name = function.name.clone(); - let error = match &function.args { - x if x == &(1..=1) => MacroError::ExpectingOneArg { name, n_args }, - x if x == &(2..=2) => MacroError::ExpectingTwoArgs { name, n_args }, - x if x == &(2..=3) => MacroError::ExpectingTwoOrThreeArgs { name, n_args }, - x if x == &(1..=usize::MAX) => MacroError::ExpectingOneOrMoreArgs { name }, - _ => unreachable!(), - }; - (self.error)(error); - return None; - } - - 
*orig_input = input; - (function.parser)(self, args) - } - - /// Parses one function argument from `input`. Each argument to a macro - /// function is one of: - /// - /// - A quoted string or other single literal token. - /// - /// - An argument to the macro being expanded, e.g. `!1` or a named - /// argument. - /// - /// - `!*`. - /// - /// - A function invocation. - /// - /// Each function invocation yields a character sequence to be turned into a - /// sequence of tokens. The case where that character sequence is a single - /// quoted string is an important special case. - fn parse_function_arg(&mut self, input: &mut MacroTokens) -> Option { - if let Some(macro_) = self.macro_ { - match &input.0.get(0)?.token { - Token::Id(id) if id.0.starts_with('!') => { - if let Some(param_idx) = macro_.find_parameter(id) { - input.advance(); - return Some( - macro_tokens_to_syntax(self.args.unwrap()[param_idx].as_ref().unwrap()) - .flatten() - .collect(), - ); - } - if let Some(value) = self.vars.borrow().get(id) { - return Some(value.clone()); - } - - if let Some(output) = self.expand_macro_function(input) { - return Some(output); - } - } - Token::Punct(Punct::BangAsterisk) => { - let mut arg = String::new(); - for i in 0..macro_.parameters.len() { - if !macro_.parameters[i].is_positional() { - break; - } - if i > 0 { - arg.push(' ') - } - arg.extend( - macro_tokens_to_syntax(self.args.unwrap()[i].as_ref().unwrap()) - .flatten(), - ); - } - input.advance(); - return Some(arg); - } - _ => (), - } - } - Some(input.advance().syntax.clone()) - } - - fn evaluate_literal(&mut self, input: &mut MacroTokens) -> Option { - if input.match_("(") { - let value = self.evaluate_or(input)?; - if input.match_(")") { - Some(value) - } else { - (self.error)(MacroError::ExpectingRParen); - None - } - } else if input.match_(")") { - (self.error)(MacroError::ExpectingLiteral); - None - } else { - Some(unquote_string(self.parse_function_arg(input)?, self.mode)) - } - } - - fn 
evaluate_relational(&mut self, input: &mut MacroTokens) -> Option { - let lhs = self.evaluate_literal(input)?; - let Some(relop) = input.take_relop() else { - return Some(lhs); - }; - let rhs = self.evaluate_literal(input)?; - let cmp = unquote_string(lhs, self.mode).cmp(&unquote_string(rhs, self.mode)); - Some(bool_to_string(relop.evaluate(cmp))) - } - - fn evaluate_not(&mut self, input: &mut MacroTokens) -> Option { - let mut negations = 0; - while input.match_("!AND") || input.match_("&") { - negations += 1; - } - - let operand = self.evaluate_relational(input)?; - if negations == 0 { - return Some(operand); - } - - let mut b = operand != "0"; - if negations.is_odd() { - b = !b; - } - Some(bool_to_string(b)) - } - - fn evaluate_and(&mut self, input: &mut MacroTokens) -> Option { - let mut lhs = self.evaluate_not(input)?; - while input.match_("!AND") || input.match_("&") { - let rhs = self.evaluate_not(input)?; - lhs = bool_to_string(lhs != "0" && rhs != "0"); - } - Some(lhs) - } - fn evaluate_or(&mut self, input: &mut MacroTokens) -> Option { - let mut lhs = self.evaluate_and(input)?; - while input.match_("!OR") || input.match_("|") { - let rhs = self.evaluate_and(input)?; - lhs = bool_to_string(lhs != "0" || rhs != "0"); - } - Some(lhs) - } - - fn evaluate_expression(&mut self, input: &mut MacroTokens) -> Option { - self.evaluate_or(input) - } - - fn evaluate_number(&mut self, input: &mut MacroTokens) -> Option { - let s = self.evaluate_expression(input)?; - let tokens = tokenize_string(&s, self.mode, self.error); - let ( - Some(MacroToken { - token: Token::Number(number), - .. 
- }), - 1, - ) = (tokens.get(0), tokens.len()) - else { - (self.error)(MacroError::BadNumericMacroExpression(s)); - return None; - }; - - Some(*number) - } - - fn find_ifend_clause<'b>( - input: &mut MacroTokens<'b>, - ) -> Option<(MacroTokens<'b>, IfEndClause)> { - let input_copy = input.clone(); - let mut nesting = 0; - while !input.is_empty() { - if input.match_("!IF") { - nesting += 1; - } else if input.match_("!IFEND") { - if nesting == 0 { - return Some(( - MacroTokens(&input_copy.0[..input_copy.0.len() - input.0.len() - 1]), - IfEndClause::IfEnd, - )); - } - nesting -= 1; - } else if input.match_("!ELSE") && nesting == 0 { - return Some(( - MacroTokens(&input_copy.0[..input_copy.0.len() - input.0.len() - 1]), - IfEndClause::Else, - )); - } else { - input.advance(); - } - } - return None; - } - fn expand_if(&mut self, orig_input: &mut MacroTokens, output: &mut Vec) -> bool { - let mut input = orig_input.clone(); - if !input.match_("!IF") { - return false; - } - let Some(result) = self.evaluate_expression(&mut input) else { - return false; - }; - if !input.match_("!THEN") { - (self.error)(MacroError::ExpectingThen); - return false; - } - - let Some((if_tokens, clause)) = Self::find_ifend_clause(&mut input) else { - (self.error)(MacroError::ExpectingElseOrIfEnd); - return false; - }; - - let else_tokens = match clause { - IfEndClause::Else => { - let Some((else_tokens, IfEndClause::IfEnd)) = Self::find_ifend_clause(&mut input) - else { - (self.error)(MacroError::ExpectingIfEnd); - return false; - }; - Some(else_tokens) - } - IfEndClause::IfEnd => None, - }; - - let subinput = match result.as_str() { - "0" => else_tokens, - _ => Some(if_tokens), - }; - if let Some(mut subinput) = subinput { - self.stack.push(Frame { - name: Some(Identifier::new("!IF").unwrap()), - location: None, - }); - self.expand(&mut subinput, output); - self.stack.pop(); - } - *orig_input = input; - true - } - - fn take_macro_var_name( - &mut self, - input: &mut MacroTokens, - construct: 
&'static str, - ) -> Option { - let Some(var_name) = input.take_macro_id() else { - (self.error)(MacroError::ExpectingMacroVarName(construct)); - return None; - }; - if is_macro_keyword(var_name) - || self - .macro_ - .map(|m| m.find_parameter(var_name)) - .flatten() - .is_some() - { - (self.error)(MacroError::BadMacroVarName { - name: var_name.clone(), - construct, - }); - None - } else { - Some(var_name.clone()) - } - } - - fn expand_let(&mut self, orig_input: &mut MacroTokens) -> bool { - let mut input = orig_input.clone(); - if !input.match_("!LET") { - return false; - } - - let Some(var_name) = self.take_macro_var_name(&mut input, "!LET") else { - return false; - }; - input.advance(); - - if !input.match_("=") { - (self.error)(MacroError::ExpectingEquals); - return false; - } - - let Some(value) = self.evaluate_expression(&mut input) else { - return false; - }; - self.vars.borrow_mut().insert(var_name.clone(), value); - *orig_input = input; - true - } - - fn find_doend<'b>(&mut self, input: &mut MacroTokens<'b>) -> Option> { - let input_copy = input.clone(); - let mut nesting = 0; - while !input.is_empty() { - if input.match_("!DO") { - nesting += 1; - } else if input.match_("!DOEND") { - if nesting == 0 { - return Some(MacroTokens( - &input_copy.0[..input_copy.0.len() - input.0.len() - 1], - )); - } - nesting -= 1; - } else { - input.advance(); - } - } - (self.error)(MacroError::MissingDoEnd); - return None; - } - - fn expand_do(&mut self, orig_input: &mut MacroTokens, output: &mut Vec) -> bool { - let mut input = orig_input.clone(); - if !input.match_("!DO") { - return false; - } - - let Some(var_name) = self.take_macro_var_name(&mut input, "!DO") else { - return false; - }; - - let (items, miterate_error) = if input.match_("!IN") { - let Some(list) = self.evaluate_expression(&mut input) else { - return false; - }; - let items = tokenize_string(list.as_str(), self.mode, &self.error); - ( - DoInput::from_list(items), - 
MacroError::MiterateList(Settings::global().macros.max_iterations), - ) - } else if input.match_("=") { - let Some(first) = self.evaluate_number(&mut input) else { - return false; - }; - if !input.match_("!TO") { - (self.error)(MacroError::ExpectingTo); - return false; - } - let Some(last) = self.evaluate_number(&mut input) else { - return false; - }; - let by = if input.match_("!BY") { - let Some(by) = self.evaluate_number(&mut input) else { - return false; - }; - if by == 0.0 { - (self.error)(MacroError::ZeroBy); - return false; - } - by - } else { - 1.0 - }; - ( - DoInput::from_by(first, last, by), - MacroError::MiterateNumeric(Settings::global().macros.max_iterations), - ) - } else { - (self.error)(MacroError::ExpectingEqualsOrIn); - return false; - }; - - let Some(body) = self.find_doend(&mut input) else { - return false; - }; - - let mut stack = take(&mut self.stack); - stack.push(Frame { - name: Some(Identifier::new("!DO").unwrap()), - location: None, - }); - let mut break_ = false; - let mut subexpander = Expander { - break_: Some(&mut break_), - stack, - vars: self.vars, - ..*self - }; - - for (i, item) in items.enumerate() { - if subexpander.should_break() { - break; - } - if i >= Settings::global().macros.max_iterations { - (self.error)(miterate_error); - break; - } - let mut vars = self.vars.borrow_mut(); - if let Some(value) = vars.get_mut(&var_name) { - *value = item; - } else { - vars.insert(var_name.clone(), item); - } - subexpander.expand(&mut body.clone(), output); - } - *orig_input = input; - true - } - - fn expand__(&mut self, input: &mut MacroTokens, output: &mut Vec) { - // Recursive macro calls. 
- if self.may_expand() { - if let Some(call) = Call::for_tokens(self.macros, &input.0, &self.error) { - let vars = RefCell::new(BTreeMap::new()); - let mut stack = take(&mut self.stack); - stack.push(Frame { - name: Some(call.0.macro_.name.clone()), - location: Some(call.0.macro_.location.clone()), - }); - let mut subexpander = Expander { - break_: None, - vars: &vars, - nesting_countdown: self.nesting_countdown.saturating_sub(1), - stack, - ..*self - }; - let mut body = MacroTokens(call.0.macro_.body.as_slice()); - subexpander.expand(&mut body, output); - self.stack = subexpander.stack; - self.stack.pop(); - input.0 = &input.0[call.len()..]; - return; - } - } - - // Only identifiers beginning with `!` receive further processing. - let id = match &input.0[0].token { - Token::Id(id) if id.0.starts_with('!') => id, - Token::Punct(Punct::BangAsterisk) => { - if let Some(macro_) = self.macro_ { - for i in 0..macro_.parameters.len() { - self.expand_arg(i, output); - } - } else { - (self.error)(MacroError::InvalidBangAsterisk); - } - input.advance(); - return; - } - _ => { - output.push(input.advance().clone()); - return; - } - }; - - // Macro arguments. - if let Some(macro_) = self.macro_ { - if let Some(param_idx) = macro_.find_parameter(id) { - self.expand_arg(param_idx, output); - input.advance(); - return; - } - } - - // Variables set by `!DO` or `!LET`. - if let Some(value) = self.vars.borrow().get(id) { - tokenize_string_into(value.as_str(), self.mode, &self.error, output); - input.advance(); - return; - } - - // Macro functions. 
- if self.expand_if(input, output) { - return; - } - if self.expand_let(input) { - return; - } - if self.expand_do(input, output) { - return; - } - - if input.match_("!BREAK") { - if let Some(ref mut break_) = self.break_ { - **break_ = true; - } else { - (self.error)(MacroError::BreakOutsideDo); - } - return; - } - - if input.match_("!ONEXPAND") { - *self.expand.borrow_mut() = true; - } else if input.match_("!OFFEXPAND") { - *self.expand.borrow_mut() = false; - } else { - output.push(input.advance().clone()); - } - } -} - -pub struct Call<'a>(Parser<'a>); - -impl<'a> Call<'a> { - pub fn for_tokens(macros: &'a MacroSet, tokens: &[MacroToken], error: &F) -> Option - where - F: Fn(MacroError), - { - let mut parser = Parser::new(macros, &tokens.get(0)?.token)?; - for token in tokens[1..].iter().chain(&[MacroToken { - token: Token::EndCommand, - syntax: String::from(""), - }]) { - if parser.push(&token.token, &token.syntax, error) == ParseStatus::Complete { - return Some(parser.finish()); - } - } - return None; - } - - pub fn expand(&self, mode: Mode, call_loc: Location, output: &mut Vec, error: F) - where - F: Fn(MacroError) + 'a, - { - let error: Box = Box::new(error); - let vars = RefCell::new(BTreeMap::new()); - let expand = RefCell::new(true); - let mut me = Expander { - macros: self.0.macros, - error: &error, - macro_: Some(self.0.macro_), - args: Some(&self.0.args), - mode, - nesting_countdown: Settings::global().macros.max_nest, - stack: vec![ - Frame { - name: None, - location: Some(call_loc), - }, - Frame { - name: Some(self.0.macro_.name.clone()), - location: Some(self.0.macro_.location.clone()), - }, - ], - vars: &vars, - break_: None, - expand: &expand, - }; - let mut body = MacroTokens(&self.0.macro_.body); - me.expand(&mut body, output); - } - - /// Returns the number of tokens consumed from the input for the macro - /// invocation. If the result is 0, then there was no macro invocation and - /// the expansion will be empty. 
- pub fn len(&self) -> usize { - self.0.n_tokens - } -} diff --git a/rust/src/main.rs b/rust/src/main.rs deleted file mode 100644 index a3b3145bed..0000000000 --- a/rust/src/main.rs +++ /dev/null @@ -1,155 +0,0 @@ -/* PSPP - a program for statistical analysis. - * Copyright (C) 2023 Free Software Foundation, Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . */ - -use anyhow::Result; -use clap::{Parser, ValueEnum}; -use encoding_rs::Encoding; -use pspp::raw::{encoding_from_headers, Decoder, Magic, Reader, Record}; -use std::fs::File; -use std::io::BufReader; -use std::path::{Path, PathBuf}; -use std::str; -use thiserror::Error as ThisError; - -/// A utility to dissect SPSS system files. -#[derive(Parser, Debug)] -#[command(author, version, about, long_about = None)] -struct Args { - /// Maximum number of cases to print. - #[arg(long = "data", default_value_t = 0)] - max_cases: u64, - - /// Files to dissect. - #[arg(required = true)] - files: Vec, - - /// How to dissect the file. - #[arg(short, long, value_enum, default_value_t)] - mode: Mode, - - /// The encoding to use. 
- #[arg(long, value_parser = parse_encoding)] - encoding: Option<&'static Encoding>, -} - -#[derive(ThisError, Debug)] -#[error("{0}: unknown encoding")] -struct UnknownEncodingError(String); - -fn parse_encoding(arg: &str) -> Result<&'static Encoding, UnknownEncodingError> { - match Encoding::for_label_no_replacement(arg.as_bytes()) { - Some(encoding) => Ok(encoding), - None => Err(UnknownEncodingError(arg.to_string())), - } -} - -#[derive(Clone, Copy, Debug, Default, ValueEnum)] -enum Mode { - Identify, - Raw, - Decoded, - #[default] - Cooked, -} - -fn main() -> Result<()> { - let Args { - max_cases, - files, - mode, - encoding, - } = Args::parse(); - - for file in files { - dissect(&file, max_cases, mode, encoding)?; - } - Ok(()) -} - -fn dissect( - file_name: &Path, - max_cases: u64, - mode: Mode, - encoding: Option<&'static Encoding>, -) -> Result<()> { - let reader = File::open(file_name)?; - let reader = BufReader::new(reader); - let mut reader = Reader::new(reader, |warning| println!("{warning}"))?; - - match mode { - Mode::Identify => { - let Record::Header(header) = reader.next().unwrap()? 
else { - unreachable!() - }; - match header.magic { - Magic::Sav => println!("SPSS System File"), - Magic::Zsav => println!("SPSS System File with Zlib compression"), - Magic::Ebcdic => println!("EBCDIC-encoded SPSS System File"), - } - return Ok(()); - } - Mode::Raw => { - for header in reader { - let header = header?; - println!("{:?}", header); - if let Record::Cases(cases) = header { - let mut cases = cases.borrow_mut(); - for _ in 0..max_cases { - let Some(Ok(record)) = cases.next() else { - break; - }; - println!("{:?}", record); - } - } - } - } - Mode::Decoded => { - let headers: Vec = reader.collect::, _>>()?; - let encoding = match encoding { - Some(encoding) => encoding, - None => encoding_from_headers(&headers, &|e| eprintln!("{e}"))?, - }; - let decoder = Decoder::new(encoding, |e| eprintln!("{e}")); - for header in headers { - let header = header.decode(&decoder); - println!("{:?}", header); - /* - if let Record::Cases(cases) = header { - let mut cases = cases.borrow_mut(); - for _ in 0..max_cases { - let Some(Ok(record)) = cases.next() else { - break; - }; - println!("{:?}", record); - } - } - */ - } - } - Mode::Cooked => { - /* - let headers: Vec = reader.collect::, _>>()?; - let encoding = encoding_from_headers(&headers, &|e| eprintln!("{e}"))?; - let (headers, _) = decode(headers, encoding, &|e| eprintln!("{e}"))?; - for header in headers { - println!("{header:?}"); - } - */ - } - } - - Ok(()) -} diff --git a/rust/src/message.rs b/rust/src/message.rs deleted file mode 100644 index a3ba1d8e9f..0000000000 --- a/rust/src/message.rs +++ /dev/null @@ -1,252 +0,0 @@ -use std::{ - cmp::{max, min}, - fmt::{Display, Formatter, Result as FmtResult}, - ops::Range, - sync::Arc, -}; - -use enum_map::Enum; -use unicode_width::UnicodeWidthStr; - -/// A line number and optional column number within a source file. -#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord)] -pub struct Point { - /// 1-based line number. 
- pub line: i32, - - /// 1-based column number. - /// - /// Column numbers are measured according to the width of characters as - /// shown in a typical fixed-width font, in which CJK characters have width - /// 2 and combining characters have width 0, as measured by the - /// `unicode_width` crate. - pub column: Option, -} - -impl Point { - /// Takes `point`, adds to it the syntax in `syntax`, incrementing the line - /// number for each new-line in `syntax` and the column number for each - /// column, and returns the result. - pub fn advance(&self, syntax: &str) -> Self { - let mut result = *self; - for line in syntax.split_inclusive('\n') { - if line.ends_with('\n') { - result.line += 1; - result.column = Some(1); - } else { - result.column = result.column.map(|column| column + line.width() as i32); - } - } - result - } - - pub fn without_column(&self) -> Self { - Self { - line: self.line, - column: None, - } - } -} - -/// Location relevant to an diagnostic message. -#[derive(Clone, Debug)] -pub struct Location { - /// File name, if any. - pub file_name: Option>, - - /// Starting and ending point, if any. - pub span: Option>, - - /// Normally, if `span` contains column information, then displaying the - /// message will underline the location. Setting this to true disables - /// displaying underlines. 
- pub omit_underlines: bool, -} - -impl Display for Location { - fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult { - if let Some(file_name) = &self.file_name { - write!(f, "{}", file_name)?; - } - - if let Some(span) = &self.span { - if self.file_name.is_some() { - write!(f, ":")?; - } - let l1 = span.start.line; - let l2 = span.end.line; - if let (Some(c1), Some(c2)) = (span.start.column, span.end.column) { - if l2 > l1 { - write!(f, "{l1}.{c1}-{l2}.{}", c2 - 1)?; - } else { - write!(f, "{l1}.{c1}-{}", c2 - 1)?; - } - } else { - if l2 > l1 { - write!(f, "{l1}-{l2}")?; - } else { - write!(f, "{l1}")?; - } - } - } - Ok(()) - } -} - -impl Location { - pub fn without_columns(&self) -> Self { - Self { - file_name: self.file_name.clone(), - span: self - .span - .as_ref() - .map(|span| span.start.without_column()..span.end.without_column()), - omit_underlines: self.omit_underlines, - } - } - pub fn merge(a: Option, b: &Option) -> Option { - let Some(a) = a else { return b.clone() }; - let Some(b) = b else { return Some(a) }; - if a.file_name != b.file_name { - // Failure. 
- return Some(a); - } - let span = match (&a.span, &b.span) { - (None, None) => None, - (Some(r), None) | (None, Some(r)) => Some(r.clone()), - (Some(ar), Some(br)) => { - Some(min(ar.start, br.start).clone()..max(ar.end, br.end).clone()) - } - }; - Some(Self { - file_name: a.file_name, - span, - omit_underlines: a.omit_underlines || b.omit_underlines, - }) - } - pub fn is_empty(&self) -> bool { - self.file_name.is_none() && self.span.is_none() - } -} - -#[derive(Copy, Clone, Debug, PartialEq, Eq, Enum)] -pub enum Severity { - Error, - Warning, - Note, -} - -impl Severity { - fn as_str(&self) -> &'static str { - match self { - Severity::Error => "error", - Severity::Warning => "warning", - Severity::Note => "note", - } - } -} - -impl Display for Severity { - fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult { - write!(f, "{}", self.as_str()) - } -} - -#[derive(Copy, Clone, Debug, PartialEq, Eq)] -pub enum Category { - General, - Syntax, - Data, -} - -pub struct Stack { - location: Location, - description: String, -} - -pub struct Diagnostic { - pub severity: Severity, - pub category: Category, - pub location: Location, - pub source: Vec<(i32, String)>, - pub stack: Vec, - pub command_name: Option<&'static str>, - pub text: String, -} - -impl Display for Diagnostic { - fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult { - for Stack { - location, - description, - } in &self.stack - { - if !!location.is_empty() { - write!(f, "{location}: ")?; - } - writeln!(f, "{description}")?; - } - if self.category != Category::General && !self.location.is_empty() { - write!(f, "{}: ", self.location)?; - } - - write!(f, "{}: ", self.severity)?; - - match self.command_name { - Some(command_name) if self.category == Category::Syntax => { - write!(f, "{command_name}: ")? 
- } - _ => (), - } - - write!(f, "{}", self.text)?; - - if let Some(Range { - start: Point { - line: l0, - column: Some(c0), - }, - end: Point { - line: l1, - column: Some(c1), - }, - }) = self.location.span - { - let mut prev_line_number = None; - for (line_number, line) in &self.source { - if let Some(prev_line_number) = prev_line_number { - if *line_number != prev_line_number + 1 { - write!(f, "\n ... |")?; - } - } - prev_line_number = Some(line_number); - - write!(f, "\n{line_number:5} | {line}")?; - - if !self.location.omit_underlines { - let c0 = if *line_number == l0 { c0 } else { 1 }; - let c1 = if *line_number == l1 { - c1 - } else { - line.width() as i32 - }; - write!(f, "\n |")?; - for _ in 0..c0 { - f.write_str(" ")?; - } - if *line_number == l0 { - f.write_str("^")?; - for _ in c0..c1 { - f.write_str("~")?; - } - } else { - for _ in c0..=c1 { - f.write_str("~")?; - } - } - } - } - } - Ok(()) - } -} diff --git a/rust/src/output/mod.rs b/rust/src/output/mod.rs deleted file mode 100644 index 944cbe75d9..0000000000 --- a/rust/src/output/mod.rs +++ /dev/null @@ -1,58 +0,0 @@ -use std::sync::Arc; - -use self::pivot::Value; - -pub mod pivot; - -/// A single output item. -pub struct Item { - /// The localized label for the item that appears in the outline pane in the - /// output viewer and in PDF outlines. This is `None` if no label has been - /// explicitly set. - label: Option, - - /// A locale-invariant identifier for the command that produced the output, - /// which may be `None` if unknown or if a command did not produce this - /// output. - command_name: Option, - - /// For a group item, this is true if the group's subtree should - /// be expanded in an outline view, false otherwise. - /// - /// For other kinds of output items, this is true to show the item's - /// content, false to hide it. The item's label is always shown in an - /// outline view. - show: bool, - - /// Item details. 
- details: Details, -} - -pub enum Details { - Chart, - Image, - Group(Vec>), - Message, - Table, - Text(Text), -} - -pub struct Text { - type_: TextType, - - content: Value, -} - -pub enum TextType { - /// `TITLE` and `SUBTITLE` commands. - PageTitle, - - /// Title, - Title, - - /// Syntax printback logging. - Syntax, - - /// Other logging. - Log, -} diff --git a/rust/src/output/pivot/mod.rs b/rust/src/output/pivot/mod.rs deleted file mode 100644 index d8f5c9f17f..0000000000 --- a/rust/src/output/pivot/mod.rs +++ /dev/null @@ -1,738 +0,0 @@ -//! Pivot tables. -//! -//! Pivot tables are PSPP's primary form of output. They are analogous to the -//! pivot tables you might be familiar with from spreadsheets and databases. -//! See for a brief introduction to -//! the overall concept of a pivot table. -//! -//! In PSPP, the most important internal pieces of a pivot table are: -//! -//! - Title. Every pivot table has a title that is displayed above it. It also -//! has an optional caption (displayed below it) and corner text (displayed in -//! the upper left corner). -//! -//! - Dimensions. A dimension consists of zero or more categories. A category -//! has a label, such as "df" or "Asymp. Sig." or 123 or a variable name. The -//! categories are the leaves of a tree whose non-leaf nodes form groups of -//! categories. The tree always has a root group whose label is the name of -//! the dimension. -//! -//! - Axes. A table has three axes: column, row, and layer. Each dimension is -//! assigned to an axis, and each axis has zero or more dimensions. When an -//! axis has more than one dimension, they are ordered from innermost to -//! outermost. -//! -//! - Data. A table's data consists of zero or more cells. Each cell maps from -//! a category for each dimension to a value, which is commonly a number but -//! could also be a variable name or an arbitrary text string. -//! -//! Creating a pivot table usually consists of the following steps: -//! -//! 1. 
Create the table with pivot_table_create(), passing in the title. -//! -//! 2. Optionally, set the format to use for "count" values with -//! pivot_table_set_weight_var() or pivot_table_set_weight_format(). -//! -//! 3. Create each dimension with pivot_dimension_create() and populate it with -//! categories and, possibly, with groups that contain the categories. This -//! call also assigns the dimension to an axis. -//! -//! In simple cases, only a call to pivot_dimension_create() is needed. -//! Other functions such as pivot_category_create_group() can be used for -//! hierarchies of categories. -//! -//! Sometimes it's easier to create categories in tandem with inserting data, -//! for example by adding a category for a variable just before inserting the -//! first cell for that variable. In that case, creating categories and -//! inserting data can be interleaved. -//! -//! 4. Insert data. For each cell, supply the category indexes, which are -//! assigned starting from 0 in the order in which the categories were -//! created in step 2, and the value to go in the cell. If the table has a -//! small, fixed number of dimensions, functions like, e.g. -//! pivot_table_put3() for 3 dimensions, can be used. The general function -//! pivot_table_put() works for other cases. -//! -//! 5. Output the table for user consumption. Use pivot_table_submit(). - -use std::{ - collections::HashMap, - ops::Range, - sync::{Arc, OnceLock}, -}; - -use chrono::NaiveDateTime; -use enum_map::{enum_map, Enum, EnumMap}; - -use crate::format::{Format, Settings as FormatSettings}; - -/// Areas of a pivot table for styling purposes. -#[derive(Copy, Clone, Debug, Enum, PartialEq, Eq)] -pub enum Area { - Title, - Caption, - - /// Footnotes, - Footer, - - // Top-left corner. - Corner, - - ColumnLabels, - RowLabels, - Data, - - /// Layer indication. - Layers, -} - -/// Table borders for styling purposes. 
-#[derive(Debug, Enum)] -pub enum Border { - Title, - OuterFrame(BoxBorder), - InnerFrame(BoxBorder), - Dimensions(RowColBorder), - Categories(RowColBorder), - DataLeft, - DataTop, -} - -/// The borders on a box. -#[derive(Debug, Enum)] -pub enum BoxBorder { - Left, - Top, - Right, - Bottom, -} - -/// Borders between rows and columns. -#[derive(Debug, Enum, PartialEq, Eq)] -pub enum RowColBorder { - RowHorz, - RowVert, - ColHorz, - ColVert, -} - -/// Sizing for rows or columns of a rendered table. -/// -/// The comments below talk about columns and their widths but they apply -/// equally to rows and their heights. -#[derive(Default)] -pub struct Sizing { - /// Specific column widths, in 1/96" units. - widths: Vec, - - /// Specific page breaks: 0-based columns after which a page break must - /// occur, e.g. a value of 1 requests a break after the second column. - breaks: Vec, - - /// Keeps: columns to keep together on a page if possible. - keeps: Vec>, -} - -#[derive(Enum)] -pub enum Axis3 { - X, - Y, - Z, -} - -/// An axis within a pivot table. -#[derive(Default)] -pub struct TableAxis { - /// `dimensions[0]` is the innermost dimension. - dimensions: Vec, - - /// The number of rows or columns along the axis, that is, the product of - /// `dimensions[*].n_leaves`. It is 0 if any dimension has 0 leaves. - extent: usize, - - /// Sum of `dimensions[*].label_depth`. - label_depth: usize, -} - -/// Dimensions. -/// -/// A [Dimension] identifies the categories associated with a single dimension -/// within a multidimensional pivot table. -/// -/// A dimension contains a collection of categories, which are the leaves in a -/// tree of groups. -/// -/// (A dimension or a group can contain zero categories, but this is unusual. -/// If a dimension contains no categories, then its table cannot contain any -/// data.) -pub struct Dimension { - axis_type: Axis3, - level: usize, - - top_index: usize, - - /// Hierarchy of categories within the dimension. 
The groups and categories - /// are sorted in the order that should be used for display. This might be - /// different from the original order produced for output if the user - /// adjusted it. - /// - /// The root must always be a group, although it is allowed to have no - /// subcategories. - root: Group, - - /// All of the leaves reachable via the root. - /// - /// The indexing for presentation_leaves is presentation order, thus - /// `presentation_leaves[i]->presentation_index == i`. This order is the - /// same as would be produced by an in-order traversal of the groups. It - /// is the order into which the user reordered or sorted the categories. - /// - /// The indexing for `data_leaves` is that used for `idx` in [Cell], thus - /// `data_leaves[i]->data_index == i`. This might differ from what an - /// in-order traversal of `root` would yield, if the user reordered - /// categories. - data_leaves: Vec>, - presentation_leaves: Vec>, - - /// Display. - hide_all_labels: bool, - - /// Number of rows or columns needed to express the labels. - label_depth: usize, -} - -pub struct Group { - name: Value, - label_depth: usize, - extra_depth: usize, - - /// The child categories. - /// - /// A group usually has multiple children, but it is allowed to have - /// only one or even (pathologically) none. - children: Vec, - - /// Display a label for the group itself? - show_label: bool, - - show_label_in_corner: bool, -} - -pub struct Leaf { - name: Value, - label_depth: usize, - extra_depth: usize, - - group_index: usize, - data_index: usize, - presentation_index: usize, - - /// Default format for values in this category. - format: Format, - - /// Honor [Table]'s `small` setting? - honor_small: bool, -} - -/// A pivot_category is a leaf (a category) or a group. 
-pub enum Category { - Group(Arc), - Leaf(Arc), -} - -trait CategoryTrait { - fn name(&self) -> &Value; - fn label_depth(&self) -> usize; - fn extra_depth(&self) -> usize; -} - -impl CategoryTrait for Group { - fn name(&self) -> &Value { - &self.name - } - - fn label_depth(&self) -> usize { - self.label_depth - } - - fn extra_depth(&self) -> usize { - self.extra_depth - } -} - -impl CategoryTrait for Leaf { - fn name(&self) -> &Value { - &self.name - } - - fn label_depth(&self) -> usize { - self.label_depth - } - - fn extra_depth(&self) -> usize { - self.extra_depth - } -} - -impl CategoryTrait for Category { - fn name(&self) -> &Value { - match self { - Category::Group(group) => group.name(), - Category::Leaf(leaf) => leaf.name(), - } - } - - fn label_depth(&self) -> usize { - match self { - Category::Group(group) => group.label_depth(), - Category::Leaf(leaf) => leaf.label_depth(), - } - } - - fn extra_depth(&self) -> usize { - match self { - Category::Group(group) => group.extra_depth(), - Category::Leaf(leaf) => leaf.extra_depth(), - } - } -} - -/// Styling for a pivot table. -/// -/// The division between this and the style information in [Table] seems fairly -/// arbitrary. The ultimate reason for the division is simply because that's -/// how SPSS documentation and file formats do it. -struct Look { - name: Option, - - omit_empty: bool, - row_labels_in_corner: bool, - - /// Range of column widths for columns in the row headings and corner , in 1/96" - /// units. - row_heading_widths: Range, - - /// Range of column widths for columns in the column headings , in 1/96" - /// units. - col_heading_widths: Range, - - /// Kind of markers to use for footnotes. - footnote_marker_type: FootnoteMarkerType, - - /// Where to put the footnote markers. - footnote_marker_position: FootnoteMarkerPosition, - - /// Styles for areas of the pivot table. - areas: EnumMap, - - /// Styles for borders in the pivot table. 
- borders: EnumMap, - - print_all_layers: bool, - - paginate_layers: bool, - - shrink_to_fit: EnumMap, - - top_continuation: bool, - - bottom_continuation: bool, - - continuation: Option, - - n_orphan_lines: usize, -} - -impl Default for Look { - fn default() -> Self { - Self { - name: None, - omit_empty: true, - row_labels_in_corner: true, - row_heading_widths: 36..72, - col_heading_widths: 36..120, - footnote_marker_type: FootnoteMarkerType::Alphabetic, - footnote_marker_position: FootnoteMarkerPosition::Subscript, - areas: EnumMap::from_fn(|area| { - use HorzAlign::*; - use VertAlign::*; - let (halign, valign, hmargins, vmargins) = match area { - Area::Title => (Center, Middle, [8, 11], [1, 8]), - Area::Caption => (Left, Top, [8, 11], [1, 1]), - Area::Footer => (Left, Top, [11, 8], [2, 3]), - Area::Corner => (Left, Bottom, [8, 11], [1, 1]), - Area::ColumnLabels => (Left, Top, [8, 11], [1, 3]), - Area::RowLabels => (Left, Top, [8, 11], [1, 3]), - Area::Data => (Mixed, Top, [8, 11], [1, 1]), - Area::Layers => (Left, Bottom, [8, 11], [1, 3]), - }; - AreaStyle { - cell_style: CellStyle { - horz_align: halign, - vert_align: valign, - margins: enum_map! 
{ Axis2::X => hmargins, Axis2::Y => vmargins }, - }, - font_style: FontStyle { - bold: area == Area::Title, - italic: false, - underline: false, - markup: false, - font: String::from("Sans Serif"), - fg: [Color::BLACK; 2], - bg: [Color::WHITE; 2], - size: 9, - }, - } - }), - borders: EnumMap::from_fn(|border| { - let stroke = match border { - Border::InnerFrame(_) | Border::DataLeft | Border::DataTop => Stroke::Thick, - Border::Dimensions(side) if side != RowColBorder::RowVert => Stroke::Solid, - Border::Categories(RowColBorder::ColHorz | RowColBorder::ColVert) => { - Stroke::Solid - } - _ => Stroke::None, - }; - BorderStyle { - stroke, - color: Color::BLACK, - } - }), - print_all_layers: false, - paginate_layers: false, - shrink_to_fit: EnumMap::from_fn(|_| false), - top_continuation: false, - bottom_continuation: false, - continuation: None, - n_orphan_lines: 0, - } - } -} - -impl Look { - fn shared_default() -> Arc { - static LOOK: OnceLock> = OnceLock::new(); - LOOK.get_or_init(|| Arc::new(Look::default())).clone() - } -} - -pub struct AreaStyle { - cell_style: CellStyle, - font_style: FontStyle, -} - -pub struct CellStyle { - horz_align: HorzAlign, - vert_align: VertAlign, - - /// Margins in 1/96" units. - /// - /// `margins[Axis2::X][0]` is the left margin. - /// `margins[Axis2::X][1]` is the right margin. - /// `margins[Axis2::Y][0]` is the top margin. - /// `margins[Axis2::Y][1]` is the bottom margin. - margins: EnumMap, -} - -pub enum HorzAlign { - /// Right aligned. - Right, - - /// Left aligned. - Left, - - /// Centered. - Center, - - /// Align strings to the left, other formats to the right. - Mixed, - - /// Align the decimal point at the specified position. - Decimal { - /// Decimal offset from the right side of the cell, in 1/96" units. - offset: f64, - - /// Decimal character: either `b'.'` or `b','`. - c: char, - }, -} - -pub enum VertAlign { - /// Top alignment. - Top, - - /// Centered, - Middle, - - /// Bottom alignment. 
- Bottom, -} - -pub struct FontStyle { - bold: bool, - italic: bool, - underline: bool, - markup: bool, - font: String, - fg: [Color; 2], - bg: [Color; 2], - - /// In 1/72" units. - size: i32, -} - -pub struct Color { - alpha: u8, - r: u8, - g: u8, - b: u8, -} - -impl Color { - const BLACK: Color = Color::new(0, 0, 0); - const WHITE: Color = Color::new(255, 255, 255); - - const fn new(r: u8, g: u8, b: u8) -> Self { - Self { - alpha: 255, - r, - g, - b, - } - } -} - -pub struct BorderStyle { - stroke: Stroke, - color: Color, -} - -pub enum Stroke { - None, - Solid, - Dashed, - Thick, - Thin, - Double, -} - -/// An axis of a flat table. -#[derive(Debug, Enum)] -pub enum Axis2 { - X, - Y, -} - -pub enum FootnoteMarkerType { - /// a, b, c, ... - Alphabetic, - - /// 1, 2, 3, ... - Numeric, -} - -pub enum FootnoteMarkerPosition { - /// Subscripts. - Subscript, - - /// Superscripts. - Superscript, -} - -pub struct Table { - look: Arc, - - rotate_inner_column_labels: bool, - - rotate_outer_row_labels: bool, - - show_grid_lines: bool, - - show_title: bool, - - show_caption: bool, - - show_value: Option, - - show_variables: Option, - - weight_format: Format, - - /// Current layer indexes, with axes[PIVOT_AXIS_LAYER].n_dimensions - /// elements. current_layer[i] is an offset into - /// axes[PIVOT_AXIS_LAYER].dimensions[i]->data_leaves[], EXCEPT that a - /// dimension can have zero leaves, in which case current_layer[i] is zero - /// and there's no corresponding leaf. - current_layer: Vec, - - /// Column and row sizing and page breaks. - sizing: EnumMap, - - /// Format settings. - settings: FormatSettings, - - /// Numeric grouping character (usually `.` or `,`). 
- grouping: Option, - - small: f64, - - command_local: Option, - command_c: Option, - language: Option, - locale: Option, - dataset: Option, - datafile: Option, - date: Option, - footnotes: Vec, - title: Option, - subtype: Option, - corner_text: Option, - caption: Option, - notes: Option, - dimensions: Vec, - axes: EnumMap, - cells: HashMap, -} - -impl Table { - fn new() -> Self { - Self { - look: Look::shared_default(), - rotate_inner_column_labels: false, - rotate_outer_row_labels: false, - show_grid_lines: false, - show_title: true, - show_caption: true, - show_value: None, - show_variables: None, - weight_format: Format::F40, - current_layer: Vec::new(), - sizing: EnumMap::default(), - settings: FormatSettings::default(), // XXX from settings - grouping: None, - small: 0.0001, // XXX from settings. - command_local: None, - command_c: None, // XXX from current command name. - language: None, - locale: None, - dataset: None, - datafile: None, - date: None, - footnotes: Vec::new(), - subtype: None, - title: None, - corner_text: None, - caption: None, - notes: None, - dimensions: Vec::new(), - axes: EnumMap::default(), - cells: HashMap::new(), - } - } -} - -/// Whether to show variable or value labels or the underlying value or variable name. -pub enum ValueShow { - /// Value or variable name only. - Value, - - /// Label only. - Label, - - /// Value and label. - Both, -} - -pub struct Footnote { - content: Value, - marker: Value, - show: bool, -} - -/// The content of a single pivot table cell. -/// -/// A [Value] is also a pivot table's title, caption, footnote marker and -/// contents, and so on. -/// -/// A given [Value] is one of: -/// -/// 1. A number resulting from a calculation. -/// -/// A number has an associated display format (usually [F] or [Pct]). This -/// format can be set directly, but that is not usually the easiest way. -/// Instead, it is usually true that all of the values in a single category -/// should have the same format (e.g. 
all "Significance" values might use -/// format `F40.3`), so PSPP makes it easy to set the default format for a -/// category while creating the category. See pivot_dimension_create() for -/// more details. -/// -/// [F]: crate::format::Format::F -/// [Pct]: crate::format::Format::Pct -/// -/// 2. A numeric or string value obtained from data (PIVOT_VALUE_NUMERIC or -/// PIVOT_VALUE_STRING). If such a value corresponds to a variable, then the -/// variable's name can be attached to the pivot_value. If the value has a -/// value label, then that can also be attached. When a label is present, -/// the user can control whether to show the value or the label or both. -/// -/// 3. A variable name (PIVOT_VALUE_VARIABLE). The variable label, if any, can -/// be attached too, and again the user can control whether to show the value -/// or the label or both. -/// -/// 4. A text string (PIVOT_VALUE_TEXT). The value stores the string in English -/// and translated into the output language (localized). Use -/// pivot_value_new_text() or pivot_value_new_text_format() for those cases. -/// In some cases, only an English or a localized version is available for -/// one reason or another, although this is regrettable; in those cases, use -/// pivot_value_new_user_text() or pivot_value_new_user_text_nocopy(). -/// -/// 5. A template. PSPP doesn't create these itself yet, but it can read and -/// interpret those created by SPSS. -pub struct Value { - styling: Option>, - inner: ValueInner, -} - -pub enum ValueInner { - Number { - show: ValueShow, - format: Format, - honor_small: bool, - value: f64, - var_name: Option, - value_label: Option, - }, - String { - show: ValueShow, - hex: bool, - s: Option, - var_name: Option, - value_label: Option, - }, - Variable { - show: ValueShow, - var_name: Option, - value_label: Option, - }, - Text { - user_provided: bool, - /// Localized. - local: String, - /// English. - c: String, - /// Identifier. 
- id: String, - }, - Template { - args: Vec>, - local: String, - id: String, - }, -} - -pub struct ValueStyle { - font_style: FontStyle, - cell_style: CellStyle, - subscripts: Vec, - footnote_indexes: Vec, -} diff --git a/rust/src/prompt.rs b/rust/src/prompt.rs deleted file mode 100644 index c02ca9b367..0000000000 --- a/rust/src/prompt.rs +++ /dev/null @@ -1,37 +0,0 @@ -#[derive(Copy, Clone, Debug, Hash, PartialEq, Eq)] -pub enum PromptStyle { - /// First line of command. - First, - - /// Second or later line of command. - Later, - - /// Between `BEGIN DATA` and `END DATA`. - Data, - - /// `COMMENT` or `*` command. - Comment, - - /// DOCUMENT command. - Document, - - /// `DO REPEAT` command. - DoRepeat, - - /// `DEFINE` command. - Define, -} - -impl PromptStyle { - pub fn to_string(&self) -> &'static str { - match self { - PromptStyle::First => "first", - PromptStyle::Later => "later", - PromptStyle::Data => "data", - PromptStyle::Comment => "COMMENT", - PromptStyle::Document => "DOCUMENT", - PromptStyle::DoRepeat => "DO REPEAT", - PromptStyle::Define => "DEFINE", - } - } -} diff --git a/rust/src/raw.rs b/rust/src/raw.rs deleted file mode 100644 index c9b04773ff..0000000000 --- a/rust/src/raw.rs +++ /dev/null @@ -1,2888 +0,0 @@ -use crate::{ - dictionary::VarWidth, - encoding::{default_encoding, get_encoding, Error as EncodingError}, - endian::{Endian, Parse, ToBytes}, - identifier::{Error as IdError, Identifier}, -}; - -use encoding_rs::{mem::decode_latin1, DecoderResult, Encoding}; -use flate2::read::ZlibDecoder; -use num::Integer; -use std::{ - borrow::Cow, - cell::RefCell, - cmp::Ordering, - collections::{HashMap, VecDeque}, - fmt::{Debug, Display, Formatter, Result as FmtResult}, - io::{Error as IoError, Read, Seek, SeekFrom}, - iter::repeat, - mem::take, - ops::Range, - rc::Rc, - str::from_utf8, -}; -use thiserror::Error as ThisError; - -#[derive(ThisError, Debug)] -pub enum Error { - #[error("Not an SPSS system file")] - NotASystemFile, - - #[error("Invalid 
magic number {0:?}")] - BadMagic([u8; 4]), - - #[error("I/O error ({0})")] - Io(#[from] IoError), - - #[error("Invalid SAV compression code {0}")] - InvalidSavCompression(u32), - - #[error("Invalid ZSAV compression code {0}")] - InvalidZsavCompression(u32), - - #[error("Document record at offset {offset:#x} has document line count ({n}) greater than the maximum number {max}.")] - BadDocumentLength { offset: u64, n: usize, max: usize }, - - #[error("At offset {offset:#x}, unrecognized record type {rec_type}.")] - BadRecordType { offset: u64, rec_type: u32 }, - - #[error("In variable record starting at offset {start_offset:#x}, variable width is not in the valid range -1 to 255.")] - BadVariableWidth { start_offset: u64, width: i32 }, - - #[error("In variable record starting at offset {start_offset:#x}, variable label code {code} at offset {code_offset:#x} is not 0 or 1.")] - BadVariableLabelCode { - start_offset: u64, - code_offset: u64, - code: u32, - }, - - #[error( - "At offset {offset:#x}, numeric missing value code ({code}) is not -3, -2, 0, 1, 2, or 3." 
- )] - BadNumericMissingValueCode { offset: u64, code: i32 }, - - #[error("At offset {offset:#x}, string missing value code ({code}) is not 0, 1, 2, or 3.")] - BadStringMissingValueCode { offset: u64, code: i32 }, - - #[error("At offset {offset:#x}, number of value labels ({n}) is greater than the maximum number {max}.")] - BadNumberOfValueLabels { offset: u64, n: u32, max: u32 }, - - #[error("At offset {offset:#x}, following value label record, found record type {rec_type} instead of expected type 4 for variable index record")] - ExpectedVarIndexRecord { offset: u64, rec_type: u32 }, - - #[error("At offset {offset:#x}, number of variables indexes for value labels ({n}) is greater than the maximum number ({max}).")] - TooManyVarIndexes { offset: u64, n: u32, max: u32 }, - - #[error("At offset {offset:#x}, record type 7 subtype {subtype} is too large with element size {size} and {count} elements.")] - ExtensionRecordTooLarge { - offset: u64, - subtype: u32, - size: u32, - count: u32, - }, - - #[error("Unexpected end of file at offset {offset:#x}, {case_ofs} bytes into a {case_len}-byte case.")] - EofInCase { - offset: u64, - case_ofs: u64, - case_len: usize, - }, - - #[error( - "Unexpected end of file at offset {offset:#x}, {case_ofs} bytes into a compressed case." 
- )] - EofInCompressedCase { offset: u64, case_ofs: u64 }, - - #[error("Data ends at offset {offset:#x}, {case_ofs} bytes into a compressed case.")] - PartialCompressedCase { offset: u64, case_ofs: u64 }, - - #[error("At {case_ofs} bytes into compressed case starting at offset {offset:#x}, a string was found where a number was expected.")] - CompressedNumberExpected { offset: u64, case_ofs: u64 }, - - #[error("At {case_ofs} bytes into compressed case starting at offset {offset:#x}, a number was found where a string was expected.")] - CompressedStringExpected { offset: u64, case_ofs: u64 }, - - #[error("Block count {n_blocks} in ZLIB trailer at offset {offset:#x} differs from expected block count {expected_n_blocks} calculated from trailer length {ztrailer_len}.")] - BadZlibTrailerNBlocks { - offset: u64, - n_blocks: u32, - expected_n_blocks: u64, - ztrailer_len: u64, - }, - - #[error("{0}")] - EncodingError(EncodingError), -} - -#[derive(ThisError, Debug)] -pub enum Warning { - #[error("Unexpected end of data inside extension record.")] - UnexpectedEndOfData, - - #[error("At offset {offset:#x}, at least one valid variable index for value labels is required but none were specified.")] - NoVarIndexes { offset: u64 }, - - #[error("At offset {offset:#x}, the first variable index is for a {var_type} variable but the following variable indexes are for {} variables: {wrong_types:?}", var_type.opposite())] - MixedVarTypes { - offset: u64, - var_type: VarType, - wrong_types: Vec, - }, - - #[error("At offset {offset:#x}, one or more variable indexes for value labels were not in the valid range [1,{max}]: {invalid:?}")] - InvalidVarIndexes { - offset: u64, - max: usize, - invalid: Vec, - }, - - #[error("At offset {offset:#x}, {record} has bad size {size} bytes instead of the expected {expected_size}.")] - BadRecordSize { - offset: u64, - record: String, - size: u32, - expected_size: u32, - }, - - #[error("At offset {offset:#x}, {record} has bad count {count} instead of the 
expected {expected_count}.")] - BadRecordCount { - offset: u64, - record: String, - count: u32, - expected_count: u32, - }, - - #[error("In long string missing values record starting at offset {record_offset:#x}, value length at offset {offset:#x} is {value_len} instead of the expected 8.")] - BadLongMissingValueLength { - record_offset: u64, - offset: u64, - value_len: u32, - }, - - #[error("The encoding record at offset {offset:#x} contains an encoding name that is not valid UTF-8.")] - BadEncodingName { offset: u64 }, - - // XXX This is risky because `text` might be arbitarily long. - #[error("Text string contains invalid bytes for {encoding} encoding: {text}")] - MalformedString { encoding: String, text: String }, - - #[error("Invalid variable measurement level value {0}")] - InvalidMeasurement(u32), - - #[error("Invalid variable display alignment value {0}")] - InvalidAlignment(u32), - - #[error("Invalid attribute name. {0}")] - InvalidAttributeName(IdError), - - #[error("Invalid variable name in attribute record. {0}")] - InvalidAttributeVariableName(IdError), - - #[error("Invalid short name in long variable name record. {0}")] - InvalidShortName(IdError), - - #[error("Invalid name in long variable name record. {0}")] - InvalidLongName(IdError), - - #[error("Invalid variable name in very long string record. {0}")] - InvalidLongStringName(IdError), - - #[error("Invalid variable name in variable set record. {0}")] - InvalidVariableSetName(IdError), - - #[error("Invalid multiple response set name. {0}")] - InvalidMrSetName(IdError), - - #[error("Invalid multiple response set variable name. {0}")] - InvalidMrSetVariableName(IdError), - - #[error("Invalid variable name in long string missing values record. {0}")] - InvalidLongStringMissingValueVariableName(IdError), - - #[error("Invalid variable name in long string value label record. 
{0}")] - InvalidLongStringValueLabelName(IdError), - - #[error("{0}")] - EncodingError(EncodingError), - - #[error("Details TBD")] - TBD, -} - -impl From for Warning { - fn from(_source: IoError) -> Self { - Self::UnexpectedEndOfData - } -} - -#[derive(Clone, Debug)] -pub enum Record { - Header(HeaderRecord), - Variable(VariableRecord>), - ValueLabel(ValueLabelRecord, RawString>), - Document(DocumentRecord), - IntegerInfo(IntegerInfoRecord), - FloatInfo(FloatInfoRecord), - VarDisplay(VarDisplayRecord), - MultipleResponse(MultipleResponseRecord), - LongStringValueLabels(LongStringValueLabelRecord), - LongStringMissingValues(LongStringMissingValueRecord>), - Encoding(EncodingRecord), - NumberOfCases(NumberOfCasesRecord), - Text(TextRecord), - OtherExtension(Extension), - EndOfHeaders(u32), - ZHeader(ZHeader), - ZTrailer(ZTrailer), - Cases(Rc>), -} - -#[derive(Clone, Debug)] -pub enum DecodedRecord { - Header(HeaderRecord), - Variable(VariableRecord), - ValueLabel(ValueLabelRecord, String>), - Document(DocumentRecord), - IntegerInfo(IntegerInfoRecord), - FloatInfo(FloatInfoRecord), - VarDisplay(VarDisplayRecord), - MultipleResponse(MultipleResponseRecord), - LongStringValueLabels(LongStringValueLabelRecord), - LongStringMissingValues(LongStringMissingValueRecord), - Encoding(EncodingRecord), - NumberOfCases(NumberOfCasesRecord), - VariableSets(VariableSetRecord), - ProductInfo(ProductInfoRecord), - LongNames(LongNamesRecord), - VeryLongStrings(VeryLongStringsRecord), - FileAttributes(FileAttributeRecord), - VariableAttributes(VariableAttributeRecord), - OtherExtension(Extension), - EndOfHeaders(u32), - ZHeader(ZHeader), - ZTrailer(ZTrailer), - Cases(Rc>), -} - -impl Record { - fn read( - reader: &mut R, - endian: Endian, - var_types: &[VarType], - warn: &dyn Fn(Warning), - ) -> Result, Error> - where - R: Read + Seek, - { - let rec_type: u32 = endian.parse(read_bytes(reader)?); - match rec_type { - 2 => Ok(Some(VariableRecord::read(reader, endian)?)), - 3 => 
Ok(ValueLabelRecord::read(reader, endian, var_types, warn)?), - 6 => Ok(Some(DocumentRecord::read(reader, endian)?)), - 7 => Extension::read(reader, endian, var_types.len(), warn), - 999 => Ok(Some(Record::EndOfHeaders( - endian.parse(read_bytes(reader)?), - ))), - _ => Err(Error::BadRecordType { - offset: reader.stream_position()?, - rec_type, - }), - } - } - - pub fn decode(self, decoder: &Decoder) -> Result { - Ok(match self { - Record::Header(record) => record.decode(decoder), - Record::Variable(record) => record.decode(decoder), - Record::ValueLabel(record) => DecodedRecord::ValueLabel(record.decode(decoder)), - Record::Document(record) => record.decode(decoder), - Record::IntegerInfo(record) => DecodedRecord::IntegerInfo(record.clone()), - Record::FloatInfo(record) => DecodedRecord::FloatInfo(record.clone()), - Record::VarDisplay(record) => DecodedRecord::VarDisplay(record.clone()), - Record::MultipleResponse(record) => record.decode(decoder), - Record::LongStringValueLabels(record) => { - DecodedRecord::LongStringValueLabels(record.decode(decoder)) - } - Record::LongStringMissingValues(record) => { - DecodedRecord::LongStringMissingValues(record.decode(decoder)) - } - Record::Encoding(record) => DecodedRecord::Encoding(record.clone()), - Record::NumberOfCases(record) => DecodedRecord::NumberOfCases(record.clone()), - Record::Text(record) => record.decode(decoder), - Record::OtherExtension(record) => DecodedRecord::OtherExtension(record.clone()), - Record::EndOfHeaders(record) => DecodedRecord::EndOfHeaders(record), - Record::ZHeader(record) => DecodedRecord::ZHeader(record.clone()), - Record::ZTrailer(record) => DecodedRecord::ZTrailer(record.clone()), - Record::Cases(record) => DecodedRecord::Cases(record.clone()), - }) - } -} - -pub fn encoding_from_headers( - headers: &Vec, - warn: &impl Fn(Warning), -) -> Result<&'static Encoding, Error> { - let mut encoding_record = None; - let mut integer_info_record = None; - for record in headers { - match record { - 
Record::Encoding(record) => encoding_record = Some(record), - Record::IntegerInfo(record) => integer_info_record = Some(record), - _ => (), - } - } - let encoding = encoding_record.map(|record| record.0.as_str()); - let character_code = integer_info_record.map(|record| record.character_code); - match get_encoding(encoding, character_code) { - Ok(encoding) => Ok(encoding), - Err(err @ EncodingError::Ebcdic) => Err(Error::EncodingError(err)), - Err(err) => { - warn(Warning::EncodingError(err)); - // Warn that we're using the default encoding. - Ok(default_encoding()) - } - } -} - -// If `s` is valid UTF-8, returns it decoded as UTF-8, otherwise returns it -// decoded as Latin-1 (actually bytes interpreted as Unicode code points). -fn default_decode(s: &[u8]) -> Cow { - from_utf8(s).map_or_else(|_| decode_latin1(s), Cow::from) -} - -#[derive(Copy, Clone, Debug, PartialEq, Eq)] -pub enum Compression { - Simple, - ZLib, -} - -trait Header { - fn offsets(&self) -> Range; -} - -#[derive(Clone)] -pub struct HeaderRecord -where - S: Debug, -{ - /// Offset in file. - pub offsets: Range, - - /// Magic number. - pub magic: Magic, - - /// Eye-catcher string, product name, in the file's encoding. Padded - /// on the right with spaces. - pub eye_catcher: S, - - /// Layout code, normally either 2 or 3. - pub layout_code: u32, - - /// Number of variable positions, or `None` if the value in the file is - /// questionably trustworthy. - pub nominal_case_size: Option, - - /// Compression type, if any, - pub compression: Option, - - /// 1-based variable index of the weight variable, or `None` if the file is - /// unweighted. - pub weight_index: Option, - - /// Claimed number of cases, if known. - pub n_cases: Option, - - /// Compression bias, usually 100.0. - pub bias: f64, - - /// `dd mmm yy` in the file's encoding. - pub creation_date: S, - - /// `HH:MM:SS` in the file's encoding. - pub creation_time: S, - - /// File label, in the file's encoding. Padded on the right with spaces. 
- pub file_label: S, - - /// Endianness of the data in the file header. - pub endian: Endian, -} - -impl HeaderRecord -where - S: Debug, -{ - fn debug_field(&self, f: &mut Formatter, name: &str, value: T) -> FmtResult - where - T: Debug, - { - writeln!(f, "{name:>17}: {:?}", value) - } -} - -impl Debug for HeaderRecord -where - S: Debug, -{ - fn fmt(&self, f: &mut Formatter) -> FmtResult { - writeln!(f, "File header record:")?; - self.debug_field(f, "Magic", self.magic)?; - self.debug_field(f, "Product name", &self.eye_catcher)?; - self.debug_field(f, "Layout code", self.layout_code)?; - self.debug_field(f, "Nominal case size", self.nominal_case_size)?; - self.debug_field(f, "Compression", self.compression)?; - self.debug_field(f, "Weight index", self.weight_index)?; - self.debug_field(f, "Number of cases", self.n_cases)?; - self.debug_field(f, "Compression bias", self.bias)?; - self.debug_field(f, "Creation date", &self.creation_date)?; - self.debug_field(f, "Creation time", &self.creation_time)?; - self.debug_field(f, "File label", &self.file_label)?; - self.debug_field(f, "Endianness", self.endian) - } -} - -impl HeaderRecord { - fn read(r: &mut R) -> Result { - let start = r.stream_position()?; - - let magic: [u8; 4] = read_bytes(r)?; - let magic: Magic = magic.try_into().map_err(|_| Error::NotASystemFile)?; - - let eye_catcher = RawString(read_vec(r, 60)?); - let layout_code: [u8; 4] = read_bytes(r)?; - let endian = Endian::identify_u32(2, layout_code) - .or_else(|| Endian::identify_u32(2, layout_code)) - .ok_or_else(|| Error::NotASystemFile)?; - let layout_code = endian.parse(layout_code); - - let nominal_case_size: u32 = endian.parse(read_bytes(r)?); - let nominal_case_size = - (nominal_case_size <= i32::MAX as u32 / 16).then_some(nominal_case_size); - - let compression_code: u32 = endian.parse(read_bytes(r)?); - let compression = match (magic, compression_code) { - (Magic::Zsav, 2) => Some(Compression::ZLib), - (Magic::Zsav, code) => return 
Err(Error::InvalidZsavCompression(code)), - (_, 0) => None, - (_, 1) => Some(Compression::Simple), - (_, code) => return Err(Error::InvalidSavCompression(code)), - }; - - let weight_index: u32 = endian.parse(read_bytes(r)?); - let weight_index = (weight_index > 0).then_some(weight_index); - - let n_cases: u32 = endian.parse(read_bytes(r)?); - let n_cases = (n_cases < i32::MAX as u32 / 2).then_some(n_cases); - - let bias: f64 = endian.parse(read_bytes(r)?); - - let creation_date = RawString(read_vec(r, 9)?); - let creation_time = RawString(read_vec(r, 8)?); - let file_label = RawString(read_vec(r, 64)?); - let _: [u8; 3] = read_bytes(r)?; - - Ok(HeaderRecord { - offsets: start..r.stream_position()?, - magic, - layout_code, - nominal_case_size, - compression, - weight_index, - n_cases, - bias, - creation_date, - creation_time, - eye_catcher, - file_label, - endian, - }) - } - - pub fn decode(self, decoder: &Decoder) -> DecodedRecord { - let eye_catcher = decoder.decode(&self.eye_catcher).to_string(); - let file_label = decoder.decode(&self.file_label).to_string(); - let creation_date = decoder.decode(&self.creation_date).to_string(); - let creation_time = decoder.decode(&self.creation_time).to_string(); - DecodedRecord::Header(HeaderRecord { - eye_catcher, - weight_index: self.weight_index, - n_cases: self.n_cases, - file_label, - offsets: self.offsets.clone(), - magic: self.magic, - layout_code: self.layout_code, - nominal_case_size: self.nominal_case_size, - compression: self.compression, - bias: self.bias, - creation_date, - creation_time, - endian: self.endian, - }) - } -} - -pub struct Decoder { - pub encoding: &'static Encoding, - pub warn: Box, -} - -impl Decoder { - pub fn new(encoding: &'static Encoding, warn: F) -> Self - where - F: Fn(Warning) + 'static, - { - Self { - encoding, - warn: Box::new(warn), - } - } - fn warn(&self, warning: Warning) { - (self.warn)(warning) - } - fn decode_slice<'a>(&self, input: &'a [u8]) -> Cow<'a, str> { - let (output, 
malformed) = self.encoding.decode_without_bom_handling(input); - if malformed { - self.warn(Warning::MalformedString { - encoding: self.encoding.name().into(), - text: output.clone().into(), - }); - } - output - } - - fn decode<'a>(&self, input: &'a RawString) -> Cow<'a, str> { - self.decode_slice(input.0.as_slice()) - } - - /// Returns `input` decoded from `self.encoding` into UTF-8 such that - /// re-encoding the result back into `self.encoding` will have exactly the - /// same length in bytes. - /// - /// XXX warn about errors? - pub fn decode_exact_length<'a>(&self, input: &'a [u8]) -> Cow<'a, str> { - if let (s, false) = self.encoding.decode_without_bom_handling(input) { - // This is the common case. Usually there will be no errors. - s - } else { - // Unusual case. Don't bother to optimize it much. - let mut decoder = self.encoding.new_decoder_without_bom_handling(); - let mut output = String::with_capacity( - decoder - .max_utf8_buffer_length_without_replacement(input.len()) - .unwrap(), - ); - let mut rest = input; - while !rest.is_empty() { - match decoder.decode_to_string_without_replacement(rest, &mut output, true) { - (DecoderResult::InputEmpty, _) => break, - (DecoderResult::OutputFull, _) => unreachable!(), - (DecoderResult::Malformed(a, b), consumed) => { - let skipped = a as usize + b as usize; - output.extend(repeat('?').take(skipped)); - rest = &rest[consumed..]; - } - } - } - assert_eq!(self.encoding.encode(&output).0.len(), input.len()); - output.into() - } - } - - pub fn decode_identifier(&self, input: &RawString) -> Result { - self.new_identifier(&self.decode(input)) - } - - pub fn new_identifier(&self, name: &str) -> Result { - Identifier::from_encoding(name, self.encoding) - } -} - -impl Header for HeaderRecord -where - S: Debug, -{ - fn offsets(&self) -> Range { - self.offsets.clone() - } -} - -#[derive(Copy, Clone, PartialEq, Eq, Hash)] -pub enum Magic { - /// Regular system file. - Sav, - - /// System file with Zlib-compressed data. 
- Zsav, - - /// EBCDIC-encoded system file. - Ebcdic, -} - -impl Magic { - /// Magic number for a regular system file. - pub const SAV: [u8; 4] = *b"$FL2"; - - /// Magic number for a system file that contains zlib-compressed data. - pub const ZSAV: [u8; 4] = *b"$FL3"; - - /// Magic number for an EBCDIC-encoded system file. This is `$FL2` encoded - /// in EBCDIC. - pub const EBCDIC: [u8; 4] = [0x5b, 0xc6, 0xd3, 0xf2]; -} - -impl Debug for Magic { - fn fmt(&self, f: &mut Formatter) -> FmtResult { - let s = match *self { - Magic::Sav => "$FL2", - Magic::Zsav => "$FL3", - Magic::Ebcdic => "($FL2 in EBCDIC)", - }; - write!(f, "{s}") - } -} - -impl TryFrom<[u8; 4]> for Magic { - type Error = Error; - - fn try_from(value: [u8; 4]) -> Result { - match value { - Magic::SAV => Ok(Magic::Sav), - Magic::ZSAV => Ok(Magic::Zsav), - Magic::EBCDIC => Ok(Magic::Ebcdic), - _ => Err(Error::BadMagic(value)), - } - } -} - -#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] -pub enum VarType { - Numeric, - String, -} - -impl VarType { - pub fn from_width(width: VarWidth) -> VarType { - match width { - VarWidth::Numeric => Self::Numeric, - VarWidth::String(_) => Self::String, - } - } - - pub fn opposite(self) -> VarType { - match self { - Self::Numeric => Self::String, - Self::String => Self::Numeric, - } - } -} - -impl Display for VarType { - fn fmt(&self, f: &mut Formatter) -> FmtResult { - match self { - VarType::Numeric => write!(f, "numeric"), - VarType::String => write!(f, "string"), - } - } -} - -#[derive(Copy, Clone)] -pub enum Value -where - S: Debug, -{ - Number(Option), - String(S), -} - -type RawValue = Value>; - -impl Debug for Value -where - S: Debug, -{ - fn fmt(&self, f: &mut Formatter) -> FmtResult { - match self { - Value::Number(Some(number)) => write!(f, "{number:?}"), - Value::Number(None) => write!(f, "SYSMIS"), - Value::String(s) => write!(f, "{:?}", s), - } - } -} - -impl RawValue { - fn read(r: &mut R, var_type: VarType, endian: Endian) -> Result 
{ - Ok(Self::from_raw( - &UntypedValue(read_bytes(r)?), - var_type, - endian, - )) - } - - pub fn from_raw(raw: &UntypedValue, var_type: VarType, endian: Endian) -> Self { - match var_type { - VarType::String => Value::String(RawStr(raw.0)), - VarType::Numeric => { - let number: f64 = endian.parse(raw.0); - Value::Number((number != -f64::MAX).then_some(number)) - } - } - } - - fn read_case( - reader: &mut R, - var_types: &[VarType], - endian: Endian, - ) -> Result>, Error> { - let case_start = reader.stream_position()?; - let mut values = Vec::with_capacity(var_types.len()); - for (i, &var_type) in var_types.iter().enumerate() { - let Some(raw) = try_read_bytes(reader)? else { - if i == 0 { - return Ok(None); - } else { - let offset = reader.stream_position()?; - return Err(Error::EofInCase { - offset, - case_ofs: offset - case_start, - case_len: var_types.len() * 8, - }); - } - }; - values.push(Value::from_raw(&UntypedValue(raw), var_type, endian)); - } - Ok(Some(values)) - } - - fn read_compressed_case( - reader: &mut R, - var_types: &[VarType], - codes: &mut VecDeque, - endian: Endian, - bias: f64, - ) -> Result>, Error> { - let case_start = reader.stream_position()?; - let mut values = Vec::with_capacity(var_types.len()); - for (i, &var_type) in var_types.iter().enumerate() { - let value = loop { - let Some(code) = codes.pop_front() else { - let Some(new_codes): Option<[u8; 8]> = try_read_bytes(reader)? 
else { - if i == 0 { - return Ok(None); - } else { - let offset = reader.stream_position()?; - return Err(Error::EofInCompressedCase { - offset, - case_ofs: offset - case_start, - }); - } - }; - codes.extend(new_codes.into_iter()); - continue; - }; - match code { - 0 => (), - 1..=251 => match var_type { - VarType::Numeric => break Self::Number(Some(code as f64 - bias)), - VarType::String => { - break Self::String(RawStr(endian.to_bytes(code as f64 - bias))) - } - }, - 252 => { - if i == 0 { - return Ok(None); - } else { - let offset = reader.stream_position()?; - return Err(Error::PartialCompressedCase { - offset, - case_ofs: offset - case_start, - }); - } - } - 253 => { - break Self::from_raw(&UntypedValue(read_bytes(reader)?), var_type, endian) - } - 254 => match var_type { - VarType::String => break Self::String(RawStr(*b" ")), // XXX EBCDIC - VarType::Numeric => { - return Err(Error::CompressedStringExpected { - offset: case_start, - case_ofs: reader.stream_position()? - case_start, - }) - } - }, - 255 => match var_type { - VarType::Numeric => break Self::Number(None), - VarType::String => { - return Err(Error::CompressedNumberExpected { - offset: case_start, - case_ofs: reader.stream_position()? - case_start, - }) - } - }, - } - }; - values.push(value); - } - Ok(Some(values)) - } - - pub fn decode(self, decoder: &Decoder) -> Value { - match self { - Self::Number(x) => Value::Number(x), - Self::String(s) => Value::String(decoder.decode_exact_length(&s.0).into()), - } - } -} - -struct ZlibDecodeMultiple -where - R: Read + Seek, -{ - reader: Option>, -} - -impl ZlibDecodeMultiple -where - R: Read + Seek, -{ - fn new(reader: R) -> ZlibDecodeMultiple { - ZlibDecodeMultiple { - reader: Some(ZlibDecoder::new(reader)), - } - } -} - -impl Read for ZlibDecodeMultiple -where - R: Read + Seek, -{ - fn read(&mut self, buf: &mut [u8]) -> Result { - loop { - match self.reader.as_mut().unwrap().read(buf)? 
{ - 0 => { - let inner = self.reader.take().unwrap().into_inner(); - self.reader = Some(ZlibDecoder::new(inner)); - } - n => return Ok(n), - }; - } - } -} - -impl Seek for ZlibDecodeMultiple -where - R: Read + Seek, -{ - fn seek(&mut self, pos: SeekFrom) -> Result { - self.reader.as_mut().unwrap().get_mut().seek(pos) - } -} - -enum ReaderState { - Start, - Headers, - ZlibHeader, - ZlibTrailer { - ztrailer_offset: u64, - ztrailer_len: u64, - }, - Cases, - End, -} - -pub struct Reader -where - R: Read + Seek + 'static, -{ - reader: Option, - warn: Box, - - header: HeaderRecord, - var_types: Vec, - - state: ReaderState, -} - -impl Reader -where - R: Read + Seek + 'static, -{ - pub fn new(mut reader: R, warn: F) -> Result - where - F: Fn(Warning) + 'static, - { - let header = HeaderRecord::read(&mut reader)?; - Ok(Self { - reader: Some(reader), - warn: Box::new(warn), - header, - var_types: Vec::new(), - state: ReaderState::Start, - }) - } - fn cases(&mut self) -> Cases { - self.state = ReaderState::End; - Cases::new( - self.reader.take().unwrap(), - take(&mut self.var_types), - &self.header, - ) - } - fn _next(&mut self) -> Option<::Item> { - match self.state { - ReaderState::Start => { - self.state = ReaderState::Headers; - Some(Ok(Record::Header(self.header.clone()))) - } - ReaderState::Headers => { - let record = loop { - match Record::read( - self.reader.as_mut().unwrap(), - self.header.endian, - self.var_types.as_slice(), - &self.warn, - ) { - Ok(Some(record)) => break record, - Ok(None) => (), - Err(error) => return Some(Err(error)), - } - }; - match record { - Record::Variable(VariableRecord { width, .. 
}) => { - self.var_types.push(if width == 0 { - VarType::Numeric - } else { - VarType::String - }); - } - Record::EndOfHeaders(_) => { - self.state = if let Some(Compression::ZLib) = self.header.compression { - ReaderState::ZlibHeader - } else { - ReaderState::Cases - }; - } - _ => (), - }; - Some(Ok(record)) - } - ReaderState::ZlibHeader => { - let zheader = match ZHeader::read(self.reader.as_mut().unwrap(), self.header.endian) - { - Ok(zheader) => zheader, - Err(error) => return Some(Err(error)), - }; - self.state = ReaderState::ZlibTrailer { - ztrailer_offset: zheader.ztrailer_offset, - ztrailer_len: zheader.ztrailer_len, - }; - Some(Ok(Record::ZHeader(zheader))) - } - ReaderState::ZlibTrailer { - ztrailer_offset, - ztrailer_len, - } => { - match ZTrailer::read( - self.reader.as_mut().unwrap(), - self.header.endian, - ztrailer_offset, - ztrailer_len, - ) { - Ok(None) => Some(Ok(Record::Cases(Rc::new(RefCell::new(self.cases()))))), - Ok(Some(ztrailer)) => Some(Ok(Record::ZTrailer(ztrailer))), - Err(error) => Some(Err(error)), - } - } - ReaderState::Cases => Some(Ok(Record::Cases(Rc::new(RefCell::new(self.cases()))))), - ReaderState::End => None, - } - } -} - -impl Iterator for Reader -where - R: Read + Seek + 'static, -{ - type Item = Result; - - fn next(&mut self) -> Option { - let retval = self._next(); - if matches!(retval, Some(Err(_))) { - self.state = ReaderState::End; - } - retval - } -} - -trait ReadSeek: Read + Seek {} -impl ReadSeek for T where T: Read + Seek {} - -pub struct Cases { - reader: Box, - var_types: Vec, - compression: Option, - bias: f64, - endian: Endian, - codes: VecDeque, - eof: bool, -} - -impl Debug for Cases { - fn fmt(&self, f: &mut Formatter) -> FmtResult { - write!(f, "Cases") - } -} - -impl Cases { - fn new(reader: R, var_types: Vec, header: &HeaderRecord) -> Self - where - R: Read + Seek + 'static, - { - Self { - reader: if header.compression == Some(Compression::ZLib) { - Box::new(ZlibDecodeMultiple::new(reader)) - } else { - 
Box::new(reader) - }, - var_types, - compression: header.compression, - bias: header.bias, - endian: header.endian, - codes: VecDeque::with_capacity(8), - eof: false, - } - } -} - -impl Iterator for Cases { - type Item = Result, Error>; - - fn next(&mut self) -> Option { - if self.eof { - return None; - } - - let retval = if self.compression.is_some() { - Value::read_compressed_case( - &mut self.reader, - &self.var_types, - &mut self.codes, - self.endian, - self.bias, - ) - .transpose() - } else { - Value::read_case(&mut self.reader, &self.var_types, self.endian).transpose() - }; - self.eof = matches!(retval, None | Some(Err(_))); - retval - } -} - -#[derive(Copy, Clone, PartialEq, Eq, Hash)] -pub struct Spec(pub u32); - -impl Debug for Spec { - fn fmt(&self, f: &mut Formatter) -> FmtResult { - let type_ = format_name(self.0 >> 16); - let w = (self.0 >> 8) & 0xff; - let d = self.0 & 0xff; - write!(f, "{:06x} ({type_}{w}.{d})", self.0) - } -} - -fn format_name(type_: u32) -> Cow<'static, str> { - match type_ { - 1 => "A", - 2 => "AHEX", - 3 => "COMMA", - 4 => "DOLLAR", - 5 => "F", - 6 => "IB", - 7 => "PIBHEX", - 8 => "P", - 9 => "PIB", - 10 => "PK", - 11 => "RB", - 12 => "RBHEX", - 15 => "Z", - 16 => "N", - 17 => "E", - 20 => "DATE", - 21 => "TIME", - 22 => "DATETIME", - 23 => "ADATE", - 24 => "JDATE", - 25 => "DTIME", - 26 => "WKDAY", - 27 => "MONTH", - 28 => "MOYR", - 29 => "QYR", - 30 => "WKYR", - 31 => "PCT", - 32 => "DOT", - 33 => "CCA", - 34 => "CCB", - 35 => "CCC", - 36 => "CCD", - 37 => "CCE", - 38 => "EDATE", - 39 => "SDATE", - 40 => "MTIME", - 41 => "YMDHMS", - _ => return format!("").into(), - } - .into() -} - -#[derive(Clone)] -pub struct MissingValues -where - S: Debug, -{ - /// Individual missing values, up to 3 of them. - pub values: Vec>, - - /// Optional range of missing values. 
- pub range: Option<(Value, Value)>, -} - -impl Debug for MissingValues -where - S: Debug, -{ - fn fmt(&self, f: &mut Formatter) -> FmtResult { - for (i, value) in self.values.iter().enumerate() { - if i > 0 { - write!(f, ", ")?; - } - write!(f, "{value:?}")?; - } - - if let Some((low, high)) = &self.range { - if !self.values.is_empty() { - write!(f, ", ")?; - } - write!(f, "{low:?} THRU {high:?}")?; - } - - if self.is_empty() { - write!(f, "none")?; - } - - Ok(()) - } -} - -impl MissingValues -where - S: Debug, -{ - fn is_empty(&self) -> bool { - self.values.is_empty() && self.range.is_none() - } -} - -impl Default for MissingValues -where - S: Debug, -{ - fn default() -> Self { - Self { - values: Vec::new(), - range: None, - } - } -} - -impl MissingValues> { - fn read( - r: &mut R, - offset: u64, - width: i32, - code: i32, - endian: Endian, - ) -> Result { - let (n_values, has_range) = match (width, code) { - (_, 0..=3) => (code, false), - (0, -2) => (0, true), - (0, -3) => (1, true), - (0, _) => return Err(Error::BadNumericMissingValueCode { offset, code }), - (_, _) => return Err(Error::BadStringMissingValueCode { offset, code }), - }; - - let var_type = if width == 0 { - VarType::Numeric - } else { - VarType::String - }; - - let mut values = Vec::new(); - for _ in 0..n_values { - values.push(RawValue::read(r, var_type, endian)?); - } - let range = if has_range { - let low = RawValue::read(r, var_type, endian)?; - let high = RawValue::read(r, var_type, endian)?; - Some((low, high)) - } else { - None - }; - Ok(Self { values, range }) - } - fn decode(&self, decoder: &Decoder) -> MissingValues { - MissingValues { - values: self - .values - .iter() - .map(|value| value.decode(decoder)) - .collect(), - range: self - .range - .as_ref() - .map(|(low, high)| (low.decode(decoder), high.decode(decoder))), - } - } -} - -#[derive(Clone)] -pub struct VariableRecord -where - S: Debug, - V: Debug, -{ - /// Range of offsets in file. 
- pub offsets: Range, - - /// Variable width, in the range -1..=255. - pub width: i32, - - /// Variable name, padded on the right with spaces. - pub name: S, - - /// Print format. - pub print_format: Spec, - - /// Write format. - pub write_format: Spec, - - /// Missing values. - pub missing_values: MissingValues, - - /// Optional variable label. - pub label: Option, -} - -impl Debug for VariableRecord -where - S: Debug, - V: Debug, -{ - fn fmt(&self, f: &mut Formatter) -> FmtResult { - writeln!( - f, - "Width: {} ({})", - self.width, - match self.width.cmp(&0) { - Ordering::Greater => "string", - Ordering::Equal => "numeric", - Ordering::Less => "long string continuation record", - } - )?; - writeln!(f, "Print format: {:?}", self.print_format)?; - writeln!(f, "Write format: {:?}", self.write_format)?; - writeln!(f, "Name: {:?}", &self.name)?; - writeln!(f, "Variable label: {:?}", self.label)?; - writeln!(f, "Missing values: {:?}", self.missing_values) - } -} - -impl VariableRecord> { - fn read(r: &mut R, endian: Endian) -> Result { - let start_offset = r.stream_position()?; - let width: i32 = endian.parse(read_bytes(r)?); - if !(-1..=255).contains(&width) { - return Err(Error::BadVariableWidth { - start_offset, - width, - }); - } - let code_offset = r.stream_position()?; - let has_variable_label: u32 = endian.parse(read_bytes(r)?); - let missing_value_code: i32 = endian.parse(read_bytes(r)?); - let print_format = Spec(endian.parse(read_bytes(r)?)); - let write_format = Spec(endian.parse(read_bytes(r)?)); - let name = RawString(read_vec(r, 8)?); - - let label = match has_variable_label { - 0 => None, - 1 => { - let len: u32 = endian.parse(read_bytes(r)?); - let read_len = len.min(65535) as usize; - let label = RawString(read_vec(r, read_len)?); - - let padding_bytes = Integer::next_multiple_of(&len, &4) - len; - let _ = read_vec(r, padding_bytes as usize)?; - - Some(label) - } - _ => { - return Err(Error::BadVariableLabelCode { - start_offset, - code_offset, - code: 
has_variable_label, - }) - } - }; - - let missing_values = - MissingValues::read(r, start_offset, width, missing_value_code, endian)?; - - let end_offset = r.stream_position()?; - - Ok(Record::Variable(VariableRecord { - offsets: start_offset..end_offset, - width, - name, - print_format, - write_format, - missing_values, - label, - })) - } - - pub fn decode(self, decoder: &Decoder) -> DecodedRecord { - DecodedRecord::Variable(VariableRecord { - offsets: self.offsets.clone(), - width: self.width, - name: decoder.decode(&self.name).to_string(), - print_format: self.print_format, - write_format: self.write_format, - missing_values: self.missing_values.decode(decoder), - label: self - .label - .as_ref() - .map(|label| decoder.decode(label).to_string()), - }) - } -} - -#[derive(Copy, Clone)] -pub struct UntypedValue(pub [u8; 8]); - -impl Debug for UntypedValue { - fn fmt(&self, f: &mut Formatter) -> FmtResult { - let little: f64 = Endian::Little.parse(self.0); - let little = format!("{:?}", little); - let big: f64 = Endian::Big.parse(self.0); - let big = format!("{:?}", big); - let number = if little.len() <= big.len() { - little - } else { - big - }; - write!(f, "{number}")?; - - let string = default_decode(&self.0); - let string = string - .split(|c: char| c == '\0' || c.is_control()) - .next() - .unwrap(); - write!(f, "{string:?}")?; - Ok(()) - } -} - -#[derive(Clone)] -pub struct RawString(pub Vec); - -impl From> for RawString { - fn from(source: Vec) -> Self { - Self(source) - } -} - -impl From<&[u8]> for RawString { - fn from(source: &[u8]) -> Self { - Self(source.into()) - } -} - -impl Debug for RawString { - fn fmt(&self, f: &mut Formatter) -> FmtResult { - write!(f, "{:?}", default_decode(self.0.as_slice())) - } -} - -#[derive(Copy, Clone)] -pub struct RawStr(pub [u8; N]); - -impl From<[u8; N]> for RawStr { - fn from(source: [u8; N]) -> Self { - Self(source) - } -} - -impl Debug for RawStr { - fn fmt(&self, f: &mut Formatter) -> FmtResult { - write!(f, "{:?}", 
default_decode(&self.0)) - } -} - -#[derive(Clone, Debug)] -pub struct ValueLabel -where - V: Debug, - S: Debug, -{ - pub value: Value, - pub label: S, -} - -#[derive(Clone)] -pub struct ValueLabelRecord -where - V: Debug, - S: Debug, -{ - /// Range of offsets in file. - pub offsets: Range, - - /// The labels. - pub labels: Vec>, - - /// The 1-based indexes of the variable indexes. - pub dict_indexes: Vec, - - /// The types of the variables. - pub var_type: VarType, -} - -impl Debug for ValueLabelRecord -where - V: Debug, - S: Debug, -{ - fn fmt(&self, f: &mut Formatter) -> FmtResult { - writeln!(f, "labels: ")?; - for label in self.labels.iter() { - writeln!(f, "{label:?}")?; - } - write!(f, "apply to {} variables", self.var_type)?; - for dict_index in self.dict_indexes.iter() { - write!(f, " #{dict_index}")?; - } - Ok(()) - } -} - -impl Header for ValueLabelRecord -where - V: Debug, - S: Debug, -{ - fn offsets(&self) -> Range { - self.offsets.clone() - } -} - -impl ValueLabelRecord -where - V: Debug, - S: Debug, -{ - /// Maximum number of value labels in a record. - pub const MAX_LABELS: u32 = u32::MAX / 8; - - /// Maximum number of variable indexes in a record. 
- pub const MAX_INDEXES: u32 = u32::MAX / 8; -} - -impl ValueLabelRecord, RawString> { - fn read( - r: &mut R, - endian: Endian, - var_types: &[VarType], - warn: &dyn Fn(Warning), - ) -> Result, Error> { - let label_offset = r.stream_position()?; - let n: u32 = endian.parse(read_bytes(r)?); - if n > Self::MAX_LABELS { - return Err(Error::BadNumberOfValueLabels { - offset: label_offset, - n, - max: Self::MAX_LABELS, - }); - } - - let mut labels = Vec::new(); - for _ in 0..n { - let value = UntypedValue(read_bytes(r)?); - let label_len: u8 = endian.parse(read_bytes(r)?); - let label_len = label_len as usize; - let padded_len = Integer::next_multiple_of(&(label_len + 1), &8); - - let mut label = read_vec(r, padded_len - 1)?; - label.truncate(label_len); - labels.push((value, RawString(label))); - } - - let index_offset = r.stream_position()?; - let rec_type: u32 = endian.parse(read_bytes(r)?); - if rec_type != 4 { - return Err(Error::ExpectedVarIndexRecord { - offset: index_offset, - rec_type, - }); - } - - let n: u32 = endian.parse(read_bytes(r)?); - if n > Self::MAX_INDEXES { - return Err(Error::TooManyVarIndexes { - offset: index_offset, - n, - max: Self::MAX_INDEXES, - }); - } else if n == 0 { - warn(Warning::NoVarIndexes { - offset: index_offset, - }); - return Ok(None); - } - - let index_offset = r.stream_position()?; - let mut dict_indexes = Vec::with_capacity(n as usize); - let mut invalid_indexes = Vec::new(); - for _ in 0..n { - let index: u32 = endian.parse(read_bytes(r)?); - if index == 0 || index as usize > var_types.len() { - dict_indexes.push(index); - } else { - invalid_indexes.push(index); - } - } - if !invalid_indexes.is_empty() { - warn(Warning::InvalidVarIndexes { - offset: index_offset, - max: var_types.len(), - invalid: invalid_indexes, - }); - } - - let Some(&first_index) = dict_indexes.first() else { - return Ok(None); - }; - let var_type = var_types[first_index as usize - 1]; - let mut wrong_type_indexes = Vec::new(); - 
dict_indexes.retain(|&index| { - if var_types[index as usize - 1] != var_type { - wrong_type_indexes.push(index); - false - } else { - true - } - }); - if !wrong_type_indexes.is_empty() { - warn(Warning::MixedVarTypes { - offset: index_offset, - var_type, - wrong_types: wrong_type_indexes, - }); - } - - let labels = labels - .into_iter() - .map(|(value, label)| ValueLabel { - value: Value::from_raw(&value, var_type, endian), - label, - }) - .collect(); - - let end_offset = r.stream_position()?; - Ok(Some(Record::ValueLabel(ValueLabelRecord { - offsets: label_offset..end_offset, - labels, - dict_indexes, - var_type, - }))) - } - - fn decode(self, decoder: &Decoder) -> ValueLabelRecord, String> { - let labels = self - .labels - .iter() - .map(|ValueLabel { value, label }| ValueLabel { - value: *value, - label: decoder.decode(label).to_string(), - }) - .collect(); - ValueLabelRecord { - offsets: self.offsets.clone(), - labels, - dict_indexes: self.dict_indexes.clone(), - var_type: self.var_type, - } - } -} - -#[derive(Clone, Debug)] -pub struct DocumentRecord -where - S: Debug, -{ - pub offsets: Range, - - /// The document, as an array of lines. Raw lines are exactly 80 bytes long - /// and are right-padded with spaces without any new-line termination. - pub lines: Vec, -} - -pub type RawDocumentLine = RawStr; - -/// Length of a line in a document. Document lines are fixed-length and -/// padded on the right with spaces. -pub const DOC_LINE_LEN: usize = 80; - -impl DocumentRecord { - /// Maximum number of lines we will accept in a document. This is simply - /// the maximum number that will fit in a 32-bit space. 
- pub const MAX_LINES: usize = i32::MAX as usize / DOC_LINE_LEN; - - fn read(r: &mut R, endian: Endian) -> Result { - let start_offset = r.stream_position()?; - let n: u32 = endian.parse(read_bytes(r)?); - let n = n as usize; - if n > Self::MAX_LINES { - Err(Error::BadDocumentLength { - offset: start_offset, - n, - max: Self::MAX_LINES, - }) - } else { - let mut lines = Vec::with_capacity(n); - for _ in 0..n { - lines.push(RawStr(read_bytes(r)?)); - } - let end_offset = r.stream_position()?; - Ok(Record::Document(DocumentRecord { - offsets: start_offset..end_offset, - lines, - })) - } - } - - pub fn decode(self, decoder: &Decoder) -> DecodedRecord { - DecodedRecord::Document(DocumentRecord { - offsets: self.offsets.clone(), - lines: self - .lines - .iter() - .map(|s| decoder.decode_slice(&s.0).to_string()) - .collect(), - }) - } -} - -impl Header for DocumentRecord -where - S: Debug, -{ - fn offsets(&self) -> Range { - self.offsets.clone() - } -} - -trait ExtensionRecord { - const SUBTYPE: u32; - const SIZE: Option; - const COUNT: Option; - const NAME: &'static str; - fn parse(ext: &Extension, endian: Endian) -> Result; -} - -#[derive(Clone, Debug)] -pub struct IntegerInfoRecord { - pub offsets: Range, - pub version: (i32, i32, i32), - pub machine_code: i32, - pub floating_point_rep: i32, - pub compression_code: i32, - pub endianness: i32, - pub character_code: i32, -} - -impl ExtensionRecord for IntegerInfoRecord { - const SUBTYPE: u32 = 3; - const SIZE: Option = Some(4); - const COUNT: Option = Some(8); - const NAME: &'static str = "integer record"; - - fn parse(ext: &Extension, endian: Endian) -> Result { - ext.check_size::()?; - - let mut input = &ext.data[..]; - let data: Vec = (0..8) - .map(|_| endian.parse(read_bytes(&mut input).unwrap())) - .collect(); - Ok(Record::IntegerInfo(IntegerInfoRecord { - offsets: ext.offsets.clone(), - version: (data[0], data[1], data[2]), - machine_code: data[3], - floating_point_rep: data[4], - compression_code: data[5], - 
endianness: data[6], - character_code: data[7], - })) - } -} - -#[derive(Clone, Debug)] -pub struct FloatInfoRecord { - pub sysmis: f64, - pub highest: f64, - pub lowest: f64, -} - -impl ExtensionRecord for FloatInfoRecord { - const SUBTYPE: u32 = 4; - const SIZE: Option = Some(8); - const COUNT: Option = Some(3); - const NAME: &'static str = "floating point record"; - - fn parse(ext: &Extension, endian: Endian) -> Result { - ext.check_size::()?; - - let mut input = &ext.data[..]; - let data: Vec = (0..3) - .map(|_| endian.parse(read_bytes(&mut input).unwrap())) - .collect(); - Ok(Record::FloatInfo(FloatInfoRecord { - sysmis: data[0], - highest: data[1], - lowest: data[2], - })) - } -} - -#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] -pub enum CategoryLabels { - VarLabels, - CountedValues, -} - -#[derive(Clone, Debug)] -pub enum MultipleResponseType { - MultipleDichotomy { - value: RawString, - labels: CategoryLabels, - }, - MultipleCategory, -} - -impl MultipleResponseType { - fn parse(input: &[u8]) -> Result<(MultipleResponseType, &[u8]), Warning> { - let (mr_type, input) = match input.split_first() { - Some((b'C', input)) => (MultipleResponseType::MultipleCategory, input), - Some((b'D', input)) => { - let (value, input) = parse_counted_string(input)?; - ( - MultipleResponseType::MultipleDichotomy { - value, - labels: CategoryLabels::VarLabels, - }, - input, - ) - } - Some((b'E', input)) => { - let (labels, input) = if let Some(rest) = input.strip_prefix(b" 1 ") { - (CategoryLabels::CountedValues, rest) - } else if let Some(rest) = input.strip_prefix(b" 11 ") { - (CategoryLabels::VarLabels, rest) - } else { - return Err(Warning::TBD); - }; - let (value, input) = parse_counted_string(input)?; - ( - MultipleResponseType::MultipleDichotomy { value, labels }, - input, - ) - } - _ => return Err(Warning::TBD), - }; - Ok((mr_type, input)) - } -} - -#[derive(Clone, Debug)] -pub struct MultipleResponseSet -where - I: Debug, - S: Debug, -{ - pub name: 
I, - pub label: S, - pub mr_type: MultipleResponseType, - pub short_names: Vec, -} - -impl MultipleResponseSet { - fn parse(input: &[u8]) -> Result<(Self, &[u8]), Warning> { - let Some(equals) = input.iter().position(|&b| b == b'=') else { - return Err(Warning::TBD); - }; - let (name, input) = input.split_at(equals); - let (mr_type, input) = MultipleResponseType::parse(input)?; - let Some(input) = input.strip_prefix(b" ") else { - return Err(Warning::TBD); - }; - let (label, mut input) = parse_counted_string(input)?; - let mut vars = Vec::new(); - while input.first() != Some(&b'\n') { - match input.split_first() { - Some((b' ', rest)) => { - let Some(length) = rest.iter().position(|b| b" \n".contains(b)) else { - return Err(Warning::TBD); - }; - let (var, rest) = rest.split_at(length); - if !var.is_empty() { - vars.push(var.into()); - } - input = rest; - } - _ => return Err(Warning::TBD), - } - } - while input.first() == Some(&b'\n') { - input = &input[1..]; - } - Ok(( - MultipleResponseSet { - name: name.into(), - label, - mr_type, - short_names: vars, - }, - input, - )) - } - - fn decode( - &self, - decoder: &Decoder, - ) -> Result, Warning> { - let mut short_names = Vec::with_capacity(self.short_names.len()); - for short_name in self.short_names.iter() { - if let Some(short_name) = decoder - .decode_identifier(short_name) - .map_err(Warning::InvalidMrSetName) - .issue_warning(&decoder.warn) - { - short_names.push(short_name); - } - } - Ok(MultipleResponseSet { - name: decoder - .decode_identifier(&self.name) - .map_err(Warning::InvalidMrSetVariableName)?, - label: decoder.decode(&self.label).to_string(), - mr_type: self.mr_type.clone(), - short_names, - }) - } -} - -#[derive(Clone, Debug)] -pub struct MultipleResponseRecord(pub Vec>) -where - I: Debug, - S: Debug; - -impl ExtensionRecord for MultipleResponseRecord { - const SUBTYPE: u32 = 7; - const SIZE: Option = Some(1); - const COUNT: Option = None; - const NAME: &'static str = "multiple response set record"; 
- - fn parse(ext: &Extension, _endian: Endian) -> Result { - ext.check_size::()?; - - let mut input = &ext.data[..]; - let mut sets = Vec::new(); - while !input.is_empty() { - let (set, rest) = MultipleResponseSet::parse(input)?; - sets.push(set); - input = rest; - } - Ok(Record::MultipleResponse(MultipleResponseRecord(sets))) - } -} - -impl MultipleResponseRecord { - fn decode(self, decoder: &Decoder) -> DecodedRecord { - let mut sets = Vec::new(); - for set in self.0.iter() { - if let Some(set) = set.decode(decoder).issue_warning(&decoder.warn) { - sets.push(set); - } - } - DecodedRecord::MultipleResponse(MultipleResponseRecord(sets)) - } -} - -fn parse_counted_string(input: &[u8]) -> Result<(RawString, &[u8]), Warning> { - let Some(space) = input.iter().position(|&b| b == b' ') else { - return Err(Warning::TBD); - }; - let Ok(length) = from_utf8(&input[..space]) else { - return Err(Warning::TBD); - }; - let Ok(length): Result = length.parse() else { - return Err(Warning::TBD); - }; - - let input = &input[space + 1..]; - if input.len() < length { - return Err(Warning::TBD); - }; - - let (string, rest) = input.split_at(length); - Ok((string.into(), rest)) -} - -#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] -pub enum Measure { - Nominal, - Ordinal, - Scale, -} - -impl Measure { - pub fn default_for_type(var_type: VarType) -> Option { - match var_type { - VarType::Numeric => None, - VarType::String => Some(Self::Nominal), - } - } - - fn try_decode(source: u32) -> Result, Warning> { - match source { - 0 => Ok(None), - 1 => Ok(Some(Measure::Nominal)), - 2 => Ok(Some(Measure::Ordinal)), - 3 => Ok(Some(Measure::Scale)), - _ => Err(Warning::InvalidMeasurement(source)), - } - } -} - -#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] -pub enum Alignment { - Left, - Right, - Center, -} - -impl Alignment { - fn try_decode(source: u32) -> Result, Warning> { - match source { - 0 => Ok(None), - 1 => Ok(Some(Alignment::Left)), - 2 => 
Ok(Some(Alignment::Right)), - 3 => Ok(Some(Alignment::Center)), - _ => Err(Warning::InvalidAlignment(source)), - } - } - - pub fn default_for_type(var_type: VarType) -> Self { - match var_type { - VarType::Numeric => Self::Right, - VarType::String => Self::Left, - } - } -} - -#[derive(Clone, Debug)] -pub struct VarDisplay { - pub measure: Option, - pub width: Option, - pub alignment: Option, -} - -#[derive(Clone, Debug)] -pub struct VarDisplayRecord(pub Vec); - -impl VarDisplayRecord { - const SUBTYPE: u32 = 11; - - fn parse( - ext: &Extension, - n_vars: usize, - endian: Endian, - warn: &dyn Fn(Warning), - ) -> Result { - if ext.size != 4 { - return Err(Warning::BadRecordSize { - offset: ext.offsets.start, - record: String::from("variable display record"), - size: ext.size, - expected_size: 4, - }); - } - - let has_width = if ext.count as usize == 3 * n_vars { - true - } else if ext.count as usize == 2 * n_vars { - false - } else { - return Err(Warning::TBD); - }; - - let mut var_displays = Vec::new(); - let mut input = &ext.data[..]; - for _ in 0..n_vars { - let measure = Measure::try_decode(endian.parse(read_bytes(&mut input).unwrap())) - .issue_warning(&warn) - .flatten(); - let width = has_width.then(|| endian.parse(read_bytes(&mut input).unwrap())); - let alignment = Alignment::try_decode(endian.parse(read_bytes(&mut input).unwrap())) - .issue_warning(&warn) - .flatten(); - var_displays.push(VarDisplay { - measure, - width, - alignment, - }); - } - Ok(Record::VarDisplay(VarDisplayRecord(var_displays))) - } -} - -#[derive(Clone, Debug)] -pub struct LongStringMissingValues -where - N: Debug, - V: Debug, -{ - /// Variable name. - pub var_name: N, - - /// Missing values. 
- pub missing_values: MissingValues, -} - -impl LongStringMissingValues> { - fn decode( - &self, - decoder: &Decoder, - ) -> Result, IdError> { - Ok(LongStringMissingValues { - var_name: decoder.decode_identifier(&self.var_name)?, - missing_values: self.missing_values.decode(decoder), - }) - } -} - -#[derive(Clone, Debug)] -pub struct LongStringMissingValueRecord(pub Vec>) -where - N: Debug, - V: Debug; - -impl ExtensionRecord for LongStringMissingValueRecord> { - const SUBTYPE: u32 = 22; - const SIZE: Option = Some(1); - const COUNT: Option = None; - const NAME: &'static str = "long string missing values record"; - - fn parse(ext: &Extension, endian: Endian) -> Result { - ext.check_size::()?; - - let mut input = &ext.data[..]; - let mut missing_value_set = Vec::new(); - while !input.is_empty() { - let var_name = read_string(&mut input, endian)?; - let n_missing_values: u8 = endian.parse(read_bytes(&mut input)?); - let value_len: u32 = endian.parse(read_bytes(&mut input)?); - if value_len != 8 { - let offset = (ext.data.len() - input.len() - 8) as u64 + ext.offsets.start; - return Err(Warning::BadLongMissingValueLength { - record_offset: ext.offsets.start, - offset, - value_len, - }); - } - let mut values = Vec::new(); - for i in 0..n_missing_values { - let value: [u8; 8] = read_bytes(&mut input)?; - let numeric_value: u64 = endian.parse(value); - let value = if i > 0 && numeric_value == 8 { - // Tolerate files written by old, buggy versions of PSPP - // where we believed that the value_length was repeated - // before each missing value. - read_bytes(&mut input)? 
- } else { - value - }; - values.push(Value::String(RawStr(value))); - } - let missing_values = MissingValues { - values, - range: None, - }; - missing_value_set.push(LongStringMissingValues { - var_name, - missing_values, - }); - } - Ok(Record::LongStringMissingValues( - LongStringMissingValueRecord(missing_value_set), - )) - } -} - -impl LongStringMissingValueRecord> { - pub fn decode(self, decoder: &Decoder) -> LongStringMissingValueRecord { - let mut mvs = Vec::with_capacity(self.0.len()); - for mv in self.0.iter() { - if let Some(mv) = mv - .decode(decoder) - .map_err(Warning::InvalidLongStringMissingValueVariableName) - .issue_warning(&decoder.warn) - { - mvs.push(mv); - } - } - LongStringMissingValueRecord(mvs) - } -} - -#[derive(Clone, Debug)] -pub struct EncodingRecord(pub String); - -impl ExtensionRecord for EncodingRecord { - const SUBTYPE: u32 = 20; - const SIZE: Option = Some(1); - const COUNT: Option = None; - const NAME: &'static str = "encoding record"; - - fn parse(ext: &Extension, _endian: Endian) -> Result { - ext.check_size::()?; - - Ok(Record::Encoding(EncodingRecord( - String::from_utf8(ext.data.clone()).map_err(|_| Warning::BadEncodingName { - offset: ext.offsets.start, - })?, - ))) - } -} - -#[derive(Clone, Debug)] -pub struct NumberOfCasesRecord { - /// Always observed as 1. - pub one: u64, - - /// Number of cases. - pub n_cases: u64, -} - -impl ExtensionRecord for NumberOfCasesRecord { - const SUBTYPE: u32 = 16; - const SIZE: Option = Some(8); - const COUNT: Option = Some(2); - const NAME: &'static str = "extended number of cases record"; - - fn parse(ext: &Extension, endian: Endian) -> Result { - ext.check_size::()?; - - let mut input = &ext.data[..]; - let one = endian.parse(read_bytes(&mut input)?); - let n_cases = endian.parse(read_bytes(&mut input)?); - - Ok(Record::NumberOfCases(NumberOfCasesRecord { one, n_cases })) - } -} - -#[derive(Clone, Debug)] -pub struct TextRecord { - pub offsets: Range, - - /// Type of record. 
- pub rec_type: TextRecordType, - - /// The text content of the record. - pub text: RawString, -} - -#[derive(Clone, Copy, Debug)] -pub enum TextRecordType { - VariableSets, - ProductInfo, - LongNames, - VeryLongStrings, - FileAttributes, - VariableAttributes, -} - -impl TextRecord { - fn new(extension: Extension, rec_type: TextRecordType) -> Self { - Self { - offsets: extension.offsets, - rec_type, - text: extension.data.into(), - } - } - pub fn decode(self, decoder: &Decoder) -> DecodedRecord { - match self.rec_type { - TextRecordType::VariableSets => { - DecodedRecord::VariableSets(VariableSetRecord::decode(&self, decoder)) - } - TextRecordType::ProductInfo => { - DecodedRecord::ProductInfo(ProductInfoRecord::decode(&self, decoder)) - } - TextRecordType::LongNames => { - DecodedRecord::LongNames(LongNamesRecord::decode(&self, decoder)) - } - TextRecordType::VeryLongStrings => { - DecodedRecord::VeryLongStrings(VeryLongStringsRecord::decode(&self, decoder)) - } - TextRecordType::FileAttributes => { - DecodedRecord::FileAttributes(FileAttributeRecord::decode(&self, decoder)) - } - TextRecordType::VariableAttributes => { - DecodedRecord::VariableAttributes(VariableAttributeRecord::decode(&self, decoder)) - } - } - } -} - -#[derive(Clone, Debug)] -pub struct VeryLongString { - pub short_name: Identifier, - pub length: u16, -} - -impl VeryLongString { - fn parse(decoder: &Decoder, input: &str) -> Result { - let Some((short_name, length)) = input.split_once('=') else { - return Err(Warning::TBD); - }; - let short_name = decoder - .new_identifier(short_name) - .map_err(Warning::InvalidLongStringName)?; - let length = length.parse().map_err(|_| Warning::TBD)?; - Ok(VeryLongString { short_name, length }) - } -} - -#[derive(Clone, Debug)] -pub struct VeryLongStringsRecord(Vec); - -impl VeryLongStringsRecord { - fn decode(source: &TextRecord, decoder: &Decoder) -> Self { - let input = decoder.decode(&source.text); - let mut very_long_strings = Vec::new(); - for tuple in 
input - .split('\0') - .map(|s| s.trim_end_matches('\t')) - .filter(|s| !s.is_empty()) - { - if let Some(vls) = VeryLongString::parse(decoder, tuple).issue_warning(&decoder.warn) { - very_long_strings.push(vls) - } - } - VeryLongStringsRecord(very_long_strings) - } -} - -#[derive(Clone, Debug)] -pub struct Attribute { - pub name: Identifier, - pub values: Vec, -} - -impl Attribute { - fn parse<'a>(decoder: &Decoder, input: &'a str) -> Result<(Attribute, &'a str), Warning> { - let Some((name, mut input)) = input.split_once('(') else { - return Err(Warning::TBD); - }; - let name = decoder - .new_identifier(name) - .map_err(Warning::InvalidAttributeName)?; - let mut values = Vec::new(); - loop { - let Some((value, rest)) = input.split_once('\n') else { - return Err(Warning::TBD); - }; - if let Some(stripped) = value - .strip_prefix('\'') - .and_then(|value| value.strip_suffix('\'')) - { - values.push(stripped.into()); - } else { - decoder.warn(Warning::TBD); - values.push(value.into()); - } - if let Some(rest) = rest.strip_prefix(')') { - let attribute = Attribute { name, values }; - return Ok((attribute, rest)); - }; - input = rest; - } - } -} - -#[derive(Clone, Debug, Default)] -pub struct AttributeSet(pub HashMap>); - -impl AttributeSet { - fn parse<'a>( - decoder: &Decoder, - mut input: &'a str, - sentinel: Option, - ) -> Result<(AttributeSet, &'a str), Warning> { - let mut attributes = HashMap::new(); - let rest = loop { - match input.chars().next() { - None => break input, - c if c == sentinel => break &input[1..], - _ => { - let (attribute, rest) = Attribute::parse(decoder, input)?; - // XXX report duplicate name - attributes.insert(attribute.name, attribute.values); - input = rest; - } - } - }; - Ok((AttributeSet(attributes), rest)) - } -} - -#[derive(Clone, Debug, Default)] -pub struct FileAttributeRecord(pub AttributeSet); - -impl FileAttributeRecord { - fn decode(source: &TextRecord, decoder: &Decoder) -> Self { - let input = decoder.decode(&source.text); - 
match AttributeSet::parse(decoder, &input, None).issue_warning(&decoder.warn) { - Some((set, rest)) => { - if !rest.is_empty() { - decoder.warn(Warning::TBD); - } - FileAttributeRecord(set) - } - None => FileAttributeRecord::default(), - } - } -} - -#[derive(Clone, Debug)] -pub struct VarAttributeSet { - pub long_var_name: Identifier, - pub attributes: AttributeSet, -} - -impl VarAttributeSet { - fn parse<'a>(decoder: &Decoder, input: &'a str) -> Result<(VarAttributeSet, &'a str), Warning> { - let Some((long_var_name, rest)) = input.split_once(':') else { - return Err(Warning::TBD); - }; - let long_var_name = decoder - .new_identifier(long_var_name) - .map_err(Warning::InvalidAttributeVariableName)?; - let (attributes, rest) = AttributeSet::parse(decoder, rest, Some('/'))?; - let var_attribute = VarAttributeSet { - long_var_name, - attributes, - }; - Ok((var_attribute, rest)) - } -} - -#[derive(Clone, Debug)] -pub struct VariableAttributeRecord(Vec); - -impl VariableAttributeRecord { - fn decode(source: &TextRecord, decoder: &Decoder) -> Self { - let decoded = decoder.decode(&source.text); - let mut input = decoded.as_ref(); - let mut var_attribute_sets = Vec::new(); - while !input.is_empty() { - let Some((var_attribute, rest)) = - VarAttributeSet::parse(decoder, input).issue_warning(&decoder.warn) - else { - break; - }; - var_attribute_sets.push(var_attribute); - input = rest; - } - VariableAttributeRecord(var_attribute_sets) - } -} - -#[derive(Clone, Debug)] -pub struct LongName { - pub short_name: Identifier, - pub long_name: Identifier, -} - -impl LongName { - fn parse(input: &str, decoder: &Decoder) -> Result { - let Some((short_name, long_name)) = input.split_once('=') else { - return Err(Warning::TBD); - }; - let short_name = decoder - .new_identifier(short_name) - .map_err(Warning::InvalidShortName)?; - let long_name = decoder - .new_identifier(long_name) - .map_err(Warning::InvalidLongName)?; - Ok(LongName { - short_name, - long_name, - }) - } -} - 
-#[derive(Clone, Debug)] -pub struct LongNamesRecord(Vec); - -impl LongNamesRecord { - fn decode(source: &TextRecord, decoder: &Decoder) -> Self { - let input = decoder.decode(&source.text); - let mut names = Vec::new(); - for pair in input.split('\t').filter(|s| !s.is_empty()) { - if let Some(long_name) = LongName::parse(pair, decoder).issue_warning(&decoder.warn) { - names.push(long_name); - } - } - LongNamesRecord(names) - } -} - -#[derive(Clone, Debug)] -pub struct ProductInfoRecord(pub String); - -impl ProductInfoRecord { - fn decode(source: &TextRecord, decoder: &Decoder) -> Self { - Self(decoder.decode(&source.text).into()) - } -} -#[derive(Clone, Debug)] -pub struct VariableSet { - pub name: String, - pub vars: Vec, -} - -impl VariableSet { - fn parse(input: &str, decoder: &Decoder) -> Result { - let (name, input) = input.split_once('=').ok_or(Warning::TBD)?; - let mut vars = Vec::new(); - for var in input.split_ascii_whitespace() { - if let Some(identifier) = decoder - .new_identifier(var) - .map_err(Warning::InvalidVariableSetName) - .issue_warning(&decoder.warn) - { - vars.push(identifier); - } - } - Ok(VariableSet { - name: name.into(), - vars, - }) - } -} - -#[derive(Clone, Debug)] -pub struct VariableSetRecord { - pub offsets: Range, - pub sets: Vec, -} - -impl VariableSetRecord { - fn decode(source: &TextRecord, decoder: &Decoder) -> VariableSetRecord { - let mut sets = Vec::new(); - let input = decoder.decode(&source.text); - for line in input.lines() { - if let Some(set) = VariableSet::parse(line, decoder).issue_warning(&decoder.warn) { - sets.push(set) - } - } - VariableSetRecord { - offsets: source.offsets.clone(), - sets, - } - } -} - -trait IssueWarning { - fn issue_warning(self, warn: &F) -> Option - where - F: Fn(Warning); -} -impl IssueWarning for Result { - fn issue_warning(self, warn: &F) -> Option - where - F: Fn(Warning), - { - match self { - Ok(result) => Some(result), - Err(error) => { - warn(error); - None - } - } - } -} - 
-#[derive(Clone, Debug)] -pub struct Extension { - pub offsets: Range, - - /// Record subtype. - pub subtype: u32, - - /// Size of each data element. - pub size: u32, - - /// Number of data elements. - pub count: u32, - - /// `size * count` bytes of data. - pub data: Vec, -} - -impl Extension { - fn check_size(&self) -> Result<(), Warning> { - if let Some(expected_size) = E::SIZE { - if self.size != expected_size { - return Err(Warning::BadRecordSize { - offset: self.offsets.start, - record: E::NAME.into(), - size: self.size, - expected_size, - }); - } - } - if let Some(expected_count) = E::COUNT { - if self.count != expected_count { - return Err(Warning::BadRecordCount { - offset: self.offsets.start, - record: E::NAME.into(), - count: self.count, - expected_count, - }); - } - } - Ok(()) - } - - fn read( - r: &mut R, - endian: Endian, - n_vars: usize, - warn: &dyn Fn(Warning), - ) -> Result, Error> { - let subtype = endian.parse(read_bytes(r)?); - let header_offset = r.stream_position()?; - let size: u32 = endian.parse(read_bytes(r)?); - let count = endian.parse(read_bytes(r)?); - let Some(product) = size.checked_mul(count) else { - return Err(Error::ExtensionRecordTooLarge { - offset: header_offset, - subtype, - size, - count, - }); - }; - let start_offset = r.stream_position()?; - let data = read_vec(r, product as usize)?; - let end_offset = start_offset + product as u64; - let extension = Extension { - offsets: start_offset..end_offset, - subtype, - size, - count, - data, - }; - let result = match subtype { - IntegerInfoRecord::SUBTYPE => IntegerInfoRecord::parse(&extension, endian), - FloatInfoRecord::SUBTYPE => FloatInfoRecord::parse(&extension, endian), - VarDisplayRecord::SUBTYPE => VarDisplayRecord::parse(&extension, n_vars, endian, warn), - MultipleResponseRecord::SUBTYPE | 19 => { - MultipleResponseRecord::parse(&extension, endian) - } - LongStringValueLabelRecord::SUBTYPE => { - LongStringValueLabelRecord::parse(&extension, endian) - } - 
EncodingRecord::SUBTYPE => EncodingRecord::parse(&extension, endian), - NumberOfCasesRecord::SUBTYPE => NumberOfCasesRecord::parse(&extension, endian), - 5 => Ok(Record::Text(TextRecord::new( - extension, - TextRecordType::VariableSets, - ))), - 10 => Ok(Record::Text(TextRecord::new( - extension, - TextRecordType::ProductInfo, - ))), - 13 => Ok(Record::Text(TextRecord::new( - extension, - TextRecordType::LongNames, - ))), - 14 => Ok(Record::Text(TextRecord::new( - extension, - TextRecordType::VeryLongStrings, - ))), - 17 => Ok(Record::Text(TextRecord::new( - extension, - TextRecordType::FileAttributes, - ))), - 18 => Ok(Record::Text(TextRecord::new( - extension, - TextRecordType::VariableAttributes, - ))), - _ => Ok(Record::OtherExtension(extension)), - }; - match result { - Ok(result) => Ok(Some(result)), - Err(error) => { - warn(error); - Ok(None) - } - } - } -} - -#[derive(Clone, Debug)] -pub struct ZHeader { - /// File offset to the start of the record. - pub offset: u64, - - /// File offset to the ZLIB data header. - pub zheader_offset: u64, - - /// File offset to the ZLIB trailer. - pub ztrailer_offset: u64, - - /// Length of the ZLIB trailer in bytes. - pub ztrailer_len: u64, -} - -impl ZHeader { - fn read(r: &mut R, endian: Endian) -> Result { - let offset = r.stream_position()?; - let zheader_offset: u64 = endian.parse(read_bytes(r)?); - let ztrailer_offset: u64 = endian.parse(read_bytes(r)?); - let ztrailer_len: u64 = endian.parse(read_bytes(r)?); - - Ok(ZHeader { - offset, - zheader_offset, - ztrailer_offset, - ztrailer_len, - }) - } -} - -#[derive(Clone, Debug)] -pub struct ZTrailer { - /// File offset to the start of the record. - pub offset: u64, - - /// Compression bias as a negative integer, e.g. -100. - pub int_bias: i64, - - /// Always observed as zero. - pub zero: u64, - - /// Uncompressed size of each block, except possibly the last. Only - /// `0x3ff000` has been observed so far. 
- pub block_size: u32, - - /// Block descriptors, always `(ztrailer_len - 24) / 24)` of them. - pub blocks: Vec, -} - -#[derive(Clone, Debug)] -pub struct ZBlock { - /// Offset of block of data if simple compression were used. - pub uncompressed_ofs: u64, - - /// Actual offset within the file of the compressed data block. - pub compressed_ofs: u64, - - /// The number of bytes in this data block after decompression. This is - /// `block_size` in every data block but the last, which may be smaller. - pub uncompressed_size: u32, - - /// The number of bytes in this data block, as stored compressed in this - /// file. - pub compressed_size: u32, -} - -impl ZBlock { - fn read(r: &mut R, endian: Endian) -> Result { - Ok(ZBlock { - uncompressed_ofs: endian.parse(read_bytes(r)?), - compressed_ofs: endian.parse(read_bytes(r)?), - uncompressed_size: endian.parse(read_bytes(r)?), - compressed_size: endian.parse(read_bytes(r)?), - }) - } -} - -impl ZTrailer { - fn read( - reader: &mut R, - endian: Endian, - ztrailer_ofs: u64, - ztrailer_len: u64, - ) -> Result, Error> { - let start_offset = reader.stream_position()?; - if reader.seek(SeekFrom::Start(ztrailer_ofs)).is_err() { - return Ok(None); - } - let int_bias = endian.parse(read_bytes(reader)?); - let zero = endian.parse(read_bytes(reader)?); - let block_size = endian.parse(read_bytes(reader)?); - let n_blocks: u32 = endian.parse(read_bytes(reader)?); - let expected_n_blocks = (ztrailer_len - 24) / 24; - if n_blocks as u64 != expected_n_blocks { - return Err(Error::BadZlibTrailerNBlocks { - offset: ztrailer_ofs, - n_blocks, - expected_n_blocks, - ztrailer_len, - }); - } - let blocks = (0..n_blocks) - .map(|_| ZBlock::read(reader, endian)) - .collect::, _>>()?; - reader.seek(SeekFrom::Start(start_offset))?; - Ok(Some(ZTrailer { - offset: ztrailer_ofs, - int_bias, - zero, - block_size, - blocks, - })) - } -} - -fn try_read_bytes(r: &mut R) -> Result, IoError> { - let mut buf = [0; N]; - let n = r.read(&mut buf)?; - if n > 0 { 
- if n < N { - r.read_exact(&mut buf[n..])?; - } - Ok(Some(buf)) - } else { - Ok(None) - } -} - -fn read_bytes(r: &mut R) -> Result<[u8; N], IoError> { - let mut buf = [0; N]; - r.read_exact(&mut buf)?; - Ok(buf) -} - -fn read_vec(r: &mut R, n: usize) -> Result, IoError> { - let mut vec = vec![0; n]; - r.read_exact(&mut vec)?; - Ok(vec) -} - -fn read_string(r: &mut R, endian: Endian) -> Result { - let length: u32 = endian.parse(read_bytes(r)?); - Ok(read_vec(r, length as usize)?.into()) -} - -#[derive(Clone, Debug)] -pub struct LongStringValueLabels -where - S: Debug, -{ - pub var_name: N, - pub width: u32, - - /// `(value, label)` pairs, where each value is `width` bytes. - pub labels: Vec<(S, S)>, -} - -impl LongStringValueLabels { - fn decode( - &self, - decoder: &Decoder, - ) -> Result, Warning> { - let var_name = decoder.decode(&self.var_name); - let var_name = Identifier::from_encoding(var_name.trim_end(), decoder.encoding) - .map_err(Warning::InvalidLongStringValueLabelName)?; - - let mut labels = Vec::with_capacity(self.labels.len()); - for (value, label) in self.labels.iter() { - let value = decoder.decode_exact_length(&value.0).to_string(); - let label = decoder.decode(label).to_string(); - labels.push((value, label)); - } - - Ok(LongStringValueLabels { - var_name, - width: self.width, - labels, - }) - } -} - -#[derive(Clone, Debug)] -pub struct LongStringValueLabelRecord(pub Vec>) -where - N: Debug, - S: Debug; - -impl ExtensionRecord for LongStringValueLabelRecord { - const SUBTYPE: u32 = 21; - const SIZE: Option = Some(1); - const COUNT: Option = None; - const NAME: &'static str = "long string value labels record"; - - fn parse(ext: &Extension, endian: Endian) -> Result { - ext.check_size::()?; - - let mut input = &ext.data[..]; - let mut label_set = Vec::new(); - while !input.is_empty() { - let var_name = read_string(&mut input, endian)?; - let width: u32 = endian.parse(read_bytes(&mut input)?); - let n_labels: u32 = endian.parse(read_bytes(&mut 
input)?); - let mut labels = Vec::new(); - for _ in 0..n_labels { - let value = read_string(&mut input, endian)?; - let label = read_string(&mut input, endian)?; - labels.push((value, label)); - } - label_set.push(LongStringValueLabels { - var_name, - width, - labels, - }) - } - Ok(Record::LongStringValueLabels(LongStringValueLabelRecord( - label_set, - ))) - } -} - -impl LongStringValueLabelRecord { - fn decode(self, decoder: &Decoder) -> LongStringValueLabelRecord { - let mut labels = Vec::with_capacity(self.0.len()); - for label in &self.0 { - match label.decode(decoder) { - Ok(set) => labels.push(set), - Err(error) => decoder.warn(error), - } - } - LongStringValueLabelRecord(labels) - } -} diff --git a/rust/src/sack.rs b/rust/src/sack.rs deleted file mode 100644 index c6be5d1eef..0000000000 --- a/rust/src/sack.rs +++ /dev/null @@ -1,633 +0,0 @@ -use float_next_after::NextAfter; -use num::{Bounded, Zero}; -use ordered_float::OrderedFloat; -use std::{ - collections::{hash_map::Entry, HashMap}, - error::Error as StdError, - fmt::{Display, Formatter, Result as FmtResult}, - iter::repeat, -}; - -use crate::endian::{Endian, ToBytes}; - -pub type Result = std::result::Result; - -#[derive(Debug)] -pub struct Error { - pub file_name: Option, - pub line_number: Option, - pub token: Option, - pub message: String, -} - -impl Error { - fn new( - file_name: Option<&str>, - line_number: Option, - token: Option<&str>, - message: String, - ) -> Error { - Error { - file_name: file_name.map(String::from), - line_number, - token: token.map(String::from), - message, - } - } -} - -impl StdError for Error {} - -impl Display for Error { - fn fmt(&self, f: &mut Formatter) -> FmtResult { - match (self.file_name.as_ref(), self.line_number) { - (Some(ref file_name), Some(line_number)) => write!(f, "{file_name}:{line_number}: ")?, - (Some(ref file_name), None) => write!(f, "{file_name}: ")?, - (None, Some(line_number)) => write!(f, "line {line_number}: ")?, - (None, None) => (), - } - if 
let Some(ref token) = self.token { - write!(f, "at '{token}': ")?; - } - write!(f, "{}", self.message) - } -} - -pub fn sack(input: &str, input_file_name: Option<&str>, endian: Endian) -> Result> { - let mut symbol_table = HashMap::new(); - let output = _sack(input, input_file_name, endian, &mut symbol_table)?; - let output = if !symbol_table.is_empty() { - for (k, v) in symbol_table.iter() { - println!("{k} => {v:?}"); - } - for (k, v) in symbol_table.iter() { - if v.is_none() { - Err(Error::new( - input_file_name, - None, - None, - format!("label {k} used but never defined"), - ))? - } - } - _sack(input, input_file_name, endian, &mut symbol_table)? - } else { - output - }; - Ok(output) -} - -fn _sack( - input: &str, - input_file_name: Option<&str>, - endian: Endian, - symbol_table: &mut HashMap>, -) -> Result> { - let mut lexer = Lexer::new(input, input_file_name, endian)?; - let mut output = Vec::new(); - while parse_data_item(&mut lexer, &mut output, symbol_table)? {} - Ok(output) -} - -fn parse_data_item( - lexer: &mut Lexer, - output: &mut Vec, - symbol_table: &mut HashMap>, -) -> Result { - if lexer.token.is_none() { - return Ok(false); - }; - - let initial_len = output.len(); - match lexer.take()? 
{ - Token::Integer(integer) => { - if let Ok(integer) = TryInto::::try_into(integer) { - output.extend_from_slice(&lexer.endian.to_bytes(integer)); - } else if let Ok(integer) = TryInto::::try_into(integer) { - output.extend_from_slice(&lexer.endian.to_bytes(integer)); - } else { - Err(lexer.error(format!( - "{integer} is not in the valid range [{},{}]", - i32::min_value(), - u32::max_value() - )))?; - }; - } - Token::Float(float) => output.extend_from_slice(&lexer.endian.to_bytes(float.0)), - Token::PcSysmis => { - output.extend_from_slice(&[0xf5, 0x1e, 0x26, 0x02, 0x8a, 0x8c, 0xed, 0xff]) - } - Token::I8 => put_integers::(lexer, "i8", output)?, - Token::I16 => put_integers::(lexer, "i16", output)?, - Token::I64 => put_integers::(lexer, "i64", output)?, - Token::String(string) => output.extend_from_slice(string.as_bytes()), - Token::S(size) => { - let Some((Token::String(ref string), _)) = lexer.token else { - Err(lexer.error(format!("string expected after 's{size}'")))? - }; - let len = string.len(); - if len > size { - Err(lexer.error(format!( - "{len}-byte string is longer than pad length {size}" - )))? - } - output.extend_from_slice(string.as_bytes()); - output.extend(repeat(b' ').take(size - len)); - lexer.get()?; - } - Token::LParen => { - while !matches!(lexer.token, Some((Token::RParen, _))) { - parse_data_item(lexer, output, symbol_table)?; - } - lexer.get()?; - } - Token::Count => put_counted_items::(lexer, "COUNT", output, symbol_table)?, - Token::Count8 => put_counted_items::(lexer, "COUNT8", output, symbol_table)?, - Token::Hex => { - let Some((Token::String(ref string), _)) = lexer.token else { - Err(lexer.error(String::from("string expected after 'hex'")))? 
- }; - let mut string = &string[..]; - loop { - string = string.trim_start(); - if string.is_empty() { - break; - }; - - let mut i = string.chars(); - let Some(c0) = i.next() else { return Ok(true) }; - let Some(c1) = i.next() else { - Err(lexer.error(String::from("hex string has odd number of characters")))? - }; - - let (Some(digit0), Some(digit1)) = (c0.to_digit(16), c1.to_digit(16)) else { - Err(lexer.error(String::from("invalid digit in hex string")))? - }; - let byte = digit0 * 16 + digit1; - output.push(byte as u8); - - string = i.as_str(); - } - lexer.get()?; - } - Token::Label(name) => { - println!("define {name}"); - let value = output.len() as u32; - match symbol_table.entry(name.clone()) { - Entry::Vacant(v) => { - v.insert(Some(value)); - } - Entry::Occupied(mut o) => { - match o.get() { - Some(v) => { - if *v != value { - Err(lexer.error(format!("{name}: can't redefine label for offset {:#x} with offset {:#x}", *v, value)))? - } - } - None => drop(o.insert(Some(value))), - } - } - }; - return Ok(true); - } - Token::At(name) => { - let mut value = *symbol_table.entry(name.clone()).or_insert(None); - loop { - let plus = match lexer.token { - Some((Token::Plus, _)) => true, - Some((Token::Minus, _)) => false, - _ => break, - }; - lexer.get()?; - - let operand = match lexer.token { - Some((Token::At(ref name), _)) => { - *symbol_table.entry(name.clone()).or_insert(None) - } - Some((Token::Integer(integer), _)) => Some( - integer - .try_into() - .map_err(|msg| lexer.error(format!("bad offset literal ({msg})")))?, - ), - _ => Err(lexer.error(String::from("expecting @label or integer literal")))?, - }; - lexer.get()?; - - value = match (value, operand) { - (Some(a), Some(b)) => Some( - if plus { - a.checked_add(b) - } else { - a.checked_sub(b) - } - .ok_or_else(|| { - lexer.error(String::from("overflow in offset arithmetic")) - })?, - ), - _ => None, - }; - } - let value = value.unwrap_or(0); - output.extend_from_slice(&lexer.endian.to_bytes(value)); - } - _ 
=> (), - }; - if let Some((Token::Asterisk, _)) = lexer.token { - lexer.get()?; - let Token::Integer(count) = lexer.take()? else { - Err(lexer.error(String::from("positive integer expected after '*'")))? - }; - if count < 1 { - Err(lexer.error(String::from("positive integer expected after '*'")))? - }; - let final_len = output.len(); - for _ in 1..count { - output.extend_from_within(initial_len..final_len); - } - } - match lexer.token { - Some((Token::Semicolon, _)) => { - lexer.get()?; - } - Some((Token::RParen, _)) => (), - _ => Err(lexer.error(String::from("';' expected")))?, - } - Ok(true) -} - -fn put_counted_items( - lexer: &mut Lexer, - name: &str, - output: &mut Vec, - symbol_table: &mut HashMap>, -) -> Result<()> -where - T: Zero + TryFrom, - Endian: ToBytes, -{ - let old_size = output.len(); - output.extend_from_slice(&lexer.endian.to_bytes(T::zero())); - let start = output.len(); - if !matches!(lexer.token, Some((Token::LParen, _))) { - Err(lexer.error(format!("'(' expected after '{name}'")))? - } - lexer.get()?; - while !matches!(lexer.token, Some((Token::RParen, _))) { - parse_data_item(lexer, output, symbol_table)?; - } - lexer.get()?; - let delta = output.len() - start; - let Ok(delta): Result = delta.try_into() else { - Err(lexer.error(format!("{delta} bytes is too much for '{name}'")))? - }; - let dest = &mut output[old_size..old_size + N]; - dest.copy_from_slice(&lexer.endian.to_bytes(delta)); - Ok(()) -} - -fn put_integers( - lexer: &mut Lexer, - name: &str, - output: &mut Vec, -) -> Result<()> -where - T: Bounded + Display + TryFrom + Copy, - Endian: ToBytes, -{ - println!("put_integers {:?}", lexer.token); - let mut n = 0; - while let Some(integer) = lexer.take_if(|t| match t { - Token::Integer(integer) => Some(*integer), - _ => None, - })? { - println!("got integer {integer}"); - let Ok(integer) = integer.try_into() else { - Err(lexer.error(format!( - "{integer} is not in the valid range [{},{}]", - T::min_value(), - T::max_value() - )))? 
- }; - output.extend_from_slice(&lexer.endian.to_bytes(integer)); - n += 1; - } - println!("put_integers {:?} {n}", lexer.token); - if n == 0 { - Err(lexer.error(format!("integer expected after '{name}'")))? - } - Ok(()) -} - -#[derive(PartialEq, Eq, Clone, Debug)] -enum Token { - Integer(i64), - Float(OrderedFloat), - PcSysmis, - String(String), - Semicolon, - Asterisk, - LParen, - RParen, - I8, - I16, - I64, - S(usize), - Count, - Count8, - Hex, - Label(String), - At(String), - Minus, - Plus, -} - -struct Lexer<'a> { - input: &'a str, - token: Option<(Token, &'a str)>, - input_file_name: Option<&'a str>, - line_number: usize, - endian: Endian, -} - -fn skip_comments(mut s: &str) -> (&str, usize) { - let mut n_newlines = 0; - let s = loop { - s = s.trim_start_matches([' ', '\t', '\r', '<', '>']); - if let Some(remainder) = s.strip_prefix('#') { - let Some((_, remainder)) = remainder.split_once('\n') else { - break ""; - }; - s = remainder; - n_newlines += 1; - } else if let Some(remainder) = s.strip_prefix('\n') { - s = remainder; - n_newlines += 1; - } else { - break s; - } - }; - (s, n_newlines) -} - -impl<'a> Lexer<'a> { - fn new(input: &'a str, input_file_name: Option<&'a str>, endian: Endian) -> Result> { - let mut lexer = Lexer { - input, - token: None, - input_file_name, - line_number: 1, - endian, - }; - lexer.token = lexer.next()?; - Ok(lexer) - } - fn error(&self, message: String) -> Error { - let repr = self.token.as_ref().map(|(_, repr)| *repr); - Error::new(self.input_file_name, Some(self.line_number), repr, message) - } - fn take(&mut self) -> Result { - let Some(token) = self.token.take() else { - Err(self.error(String::from("unexpected end of input")))? 
- }; - self.token = self.next()?; - Ok(token.0) - } - fn take_if(&mut self, condition: F) -> Result> - where - F: FnOnce(&Token) -> Option, - { - let Some(ref token) = self.token else { - return Ok(None); - }; - match condition(&token.0) { - Some(value) => { - self.token = self.next()?; - Ok(Some(value)) - } - None => Ok(None), - } - } - fn get(&mut self) -> Result> { - if self.token.is_none() { - Err(self.error(String::from("unexpected end of input")))? - } else { - self.token = self.next()?; - match self.token { - Some((ref token, _)) => Ok(Some(token)), - None => Ok(None), - } - } - } - - fn next(&mut self) -> Result> { - // Get the first character of the token, skipping past white space and - // comments. - let (s, n_newlines) = skip_comments(self.input); - self.line_number += n_newlines; - self.input = s; - - let start = s; - let mut iter = s.chars(); - let Some(c) = iter.next() else { - return Ok(None); - }; - let (token, rest) = match c { - c if c.is_ascii_digit() || c == '-' => { - let len = s - .find(|c: char| { - !(c.is_ascii_digit() || c.is_alphabetic() || c == '.' || c == '-') - }) - .unwrap_or(s.len()); - let (number, rest) = s.split_at(len); - let token = if number == "-" { - Token::Minus - } else if let Some(digits) = number.strip_prefix("0x") { - Token::Integer(i64::from_str_radix(digits, 16).map_err(|msg| { - self.error(format!("bad integer literal '{number}' ({msg})")) - })?) - } else if !number.contains('.') { - Token::Integer(number.parse().map_err(|msg| { - self.error(format!("bad integer literal '{number}' ({msg})")) - })?) - } else { - Token::Float(number.parse().map_err(|msg| { - self.error(format!("bad float literal '{number}' ({msg})")) - })?) - }; - (token, rest) - } - '"' => { - let s = iter.as_str(); - let Some(len) = s.find(['\n', '"']) else { - Err(self.error(String::from("end-of-file inside string")))? 
- }; - let (string, rest) = s.split_at(len); - let Some(rest) = rest.strip_prefix('"') else { - Err(self.error(format!("new-line inside string ({string}...{rest})")))? - }; - (Token::String(string.into()), rest) - } - ';' => (Token::Semicolon, iter.as_str()), - '*' => (Token::Asterisk, iter.as_str()), - '+' => (Token::Plus, iter.as_str()), - '(' => (Token::LParen, iter.as_str()), - ')' => (Token::RParen, iter.as_str()), - c if c.is_alphabetic() || c == '@' || c == '_' => { - let len = s - .find(|c: char| { - !(c.is_ascii_digit() - || c.is_alphabetic() - || c == '@' - || c == '.' - || c == '_') - }) - .unwrap_or(s.len()); - let (s, rest) = s.split_at(len); - if let Some(rest) = rest.strip_prefix(':') { - (Token::Label(s.into()), rest) - } else if let Some(name) = s.strip_prefix('@') { - (Token::At(name.into()), rest) - } else if let Some(count) = s.strip_prefix('s') { - let token = - Token::S(count.parse().map_err(|msg| { - self.error(format!("bad counted string '{s}' ({msg})")) - })?); - (token, rest) - } else { - let token = match s { - "i8" => Token::I8, - "i16" => Token::I16, - "i64" => Token::I64, - "SYSMIS" => Token::Float(OrderedFloat(-f64::MAX)), - "PCSYSMIS" => Token::PcSysmis, - "LOWEST" => Token::Float((-f64::MAX).next_after(0.0).into()), - "HIGHEST" => Token::Float(f64::MAX.into()), - "ENDIAN" => Token::Integer(if self.endian == Endian::Big { 1 } else { 2 }), - "COUNT" => Token::Count, - "COUNT8" => Token::Count8, - "hex" => Token::Hex, - _ => Err(self.error(format!("invalid token '{s}'")))?, - }; - (token, rest) - } - } - _ => Err(self.error(format!("invalid input byte '{c}'")))?, - }; - self.input = rest; - let repr = &start[..start.len() - rest.len()]; - println!("{token:?} {repr}"); - Ok(Some((token, repr))) - } -} - -#[cfg(test)] -mod test { - use crate::endian::Endian; - use crate::sack::sack; - use anyhow::Result; - use hexplay::HexView; - - #[test] - fn basic_sack() -> Result<()> { - let input = r#" -"$FL2"; s60 "$(#) SPSS DATA FILE PSPP 
synthetic test file"; -2; # Layout code -28; # Nominal case size -0; # Not compressed -0; # Not weighted -1; # 1 case. -100.0; # Bias. -"01 Jan 11"; "20:53:52"; -"PSPP synthetic test file: "; i8 244; i8 245; i8 246; i8 248; s34 ""; -i8 0 *3; -"#; - let output = sack(input, None, Endian::Big)?; - HexView::new(&output).print()?; - Ok(()) - } - - #[test] - fn pcp_sack() -> Result<()> { - let input = r#" -# File header. -2; 0; -@MAIN; @MAIN_END - @MAIN; -@VARS; @VARS_END - @VARS; -@LABELS; @LABELS_END - @LABELS; -@DATA; @DATA_END - @DATA; -(0; 0) * 11; -i8 0 * 128; - -MAIN: - i16 1; # Fixed. - s62 "PCSPSS PSPP synthetic test product"; - PCSYSMIS; - 0; 0; i16 1; # Fixed. - i16 0; - i16 15; - 1; - i16 0; # Fixed. - 1; - s8 "11/28/14"; - s8 "15:11:00"; - s64 "PSPP synthetic test file"; -MAIN_END: - -VARS: - 0; 0; 0; 0x050800; s8 "$CASENUM"; PCSYSMIS; - 0; 0; 0; 0x010800; s8 "$DATE"; PCSYSMIS; - 0; 0; 0; 0x050802; s8 "$WEIGHT"; PCSYSMIS; - - # Numeric variable, no label or missing values. - 0; 0; 0; 0x050800; s8 "NUM1"; PCSYSMIS; - - # Numeric variable, variable label. - 0; 0; @NUM2_LABEL - @LABELS_OFS; 0x050800; s8 "NUM2"; PCSYSMIS; - - # Numeric variable with missing value. - 0; 0; 0; 0x050800; s8 "NUM3"; 1.0; - - # Numeric variable, variable label and missing value. - 0; 0; @NUM4_LABEL - @LABELS_OFS; 0x050800; s8 "NUM4"; 2.0; - - # String variable, no label or missing values. - 0; 0; 0; 0x010800; s8 "STR1"; PCSYSMIS; - - # String variable, variable label. - 0; 0; @STR2_LABEL - @LABELS_OFS; 0x010400; s8 "STR2"; PCSYSMIS; - - # String variable with missing value. - 0; 0; 0; 0x010500; s8 "STR3"; s8 "MISS"; - - # String variable, variable label and missing value. 
- 0; 0; @STR4_LABEL - @LABELS_OFS; 0x010100; s8 "STR4"; s8 "OTHR"; - - # Long string variable - 0; 0; 0; 0x010b00; s8 "STR5"; PCSYSMIS; - 0 * 8; - - # Long string variable with variable label - 0; 0; @STR6_LABEL - @LABELS_OFS; 0x010b00; s8 "STR6"; PCSYSMIS; - 0 * 8; -VARS_END: - -LABELS: - 3; i8 0 0 0; LABELS_OFS: i8 0; - NUM2_LABEL: COUNT8("Numeric variable 2's label"); - NUM4_LABEL: COUNT8("Another numeric variable label"); - STR2_LABEL: COUNT8("STR2's variable label"); - STR4_LABEL: COUNT8("STR4's variable label"); - STR6_LABEL: COUNT8("Another string variable's label"); -LABELS_END: - -DATA: - 0.0; "11/28/14"; 1.0; - 0.0; 1.0; 2.0; PCSYSMIS; s8 "abcdefgh"; s8 "ijkl"; s8 "mnopq"; s8 "r"; - s16 "stuvwxyzAB"; s16 "CDEFGHIJKLM"; -DATA_END: -"#; - let output = sack(input, None, Endian::Big)?; - HexView::new(&output).print()?; - Ok(()) - } -} diff --git a/rust/src/settings.rs b/rust/src/settings.rs deleted file mode 100644 index de51951202..0000000000 --- a/rust/src/settings.rs +++ /dev/null @@ -1,140 +0,0 @@ -use std::sync::OnceLock; - -use enum_map::EnumMap; - -use crate::{ - endian::Endian, - format::{Format, Settings as FormatSettings}, - message::Severity, -}; - -pub struct Settings { - pub input_integer_format: Endian, - pub input_float_format: Endian, - pub output_integer_format: Endian, - pub output_float_format: Endian, - - /// `MDISPLAY`: how to display matrices in `MATRIX`...`END MATRIX`. 
- pub matrix_display: MatrixDisplay, - - pub view_length: usize, - pub view_width: usize, - pub safer: bool, - pub include: bool, - pub route_errors_to_terminal: bool, - pub route_errors_to_listing: bool, - pub scompress: bool, - pub undefined: bool, - pub blanks: Option, - pub max_messages: EnumMap, - pub printback: bool, - pub macros: MacroSettings, - pub max_loops: usize, - pub workspace: usize, - pub default_format: Format, - pub testing: bool, - pub fuzz_bits: usize, - pub scale_min: usize, - pub commands: Compatibility, - pub global: Compatibility, - pub syntax: Compatibility, - pub formats: FormatSettings, - pub small: f64, -} - -impl Default for Settings { - fn default() -> Self { - Self { - input_integer_format: Endian::NATIVE, - input_float_format: Endian::NATIVE, - output_integer_format: Endian::NATIVE, - output_float_format: Endian::NATIVE, - matrix_display: MatrixDisplay::default(), - view_length: 24, - view_width: 79, - safer: false, - include: true, - route_errors_to_terminal: true, - route_errors_to_listing: true, - scompress: true, - undefined: true, - blanks: None, - max_messages: EnumMap::from_fn(|_| 100), - printback: true, - macros: MacroSettings::default(), - max_loops: 40, - workspace: 64 * 1024 * 1024, - default_format: Format::F8_2, - testing: false, - fuzz_bits: 6, - scale_min: 24, - commands: Compatibility::Enhanced, - global: Compatibility::Enhanced, - syntax: Compatibility::Enhanced, - formats: FormatSettings::default(), - small: 0.0001, - } - } -} - -impl Settings { - pub fn global() -> &'static Settings { - static GLOBAL: OnceLock = OnceLock::new(); - &GLOBAL.get_or_init(|| Settings::default()) - } -} - -pub enum Compatibility { - Compatible, - Enhanced, -} - -pub struct MacroSettings { - /// Expand macros? - pub expand: bool, - - /// Print macro expansions? - pub print_expansions: bool, - - /// Maximum iterations of `!FOR`. - pub max_iterations: usize, - - /// Maximum nested macro expansion levels. 
- pub max_nest: usize, -} - -impl Default for MacroSettings { - fn default() -> Self { - Self { - expand: true, - print_expansions: false, - max_iterations: 1000, - max_nest: 50, - } - } -} - -/// How to display matrices in `MATRIX`...`END MATRIX`. -#[derive(Default)] -pub enum MatrixDisplay { - /// Output matrices as text. - #[default] - Text, - - /// Output matrices as pivot tables. - Tables, -} - -pub enum OutputType { - /// Errors and warnings. - Error, - - /// Notes. - Notes, - - /// Syntax printback. - Syntax, - - /// Everything else. - Other, -} diff --git a/rust/tests/sack.rs b/rust/tests/sack.rs deleted file mode 100644 index 49b10e77ac..0000000000 --- a/rust/tests/sack.rs +++ /dev/null @@ -1,93 +0,0 @@ -use std::fs::read_to_string; -use std::path::PathBuf; - -use anyhow::{anyhow, Result}; -use clap::Parser; -use pspp::endian::Endian; -use pspp::sack::sack; - -/// SAv Construction Kit -/// -/// The input is a sequence of data items, each followed by a semicolon. Each -/// data item is converted to the output format and written on stdout. A data -/// item is one of the following: -/// -/// - An integer in decimal, in hexadecimal prefixed by `0x`, or in octal -/// prefixed by `0`. Output as a 32-bit binary integer. -/// -/// - A floating-point number. Output in 64-bit IEEE 754 format. -/// -/// - A string enclosed in double quotes. Output literally. There is no -/// syntax for "escapes". Strings may not contain new-lines. -/// -/// - A literal of the form `s` followed by a quoted string as above. -/// Output as the string's contents followed by enough spaces to fill up -/// `` bytes. For example, `s8 "foo"` is output as `foo` followed -/// by 5 spaces. -/// -/// - The literal `i8`, `i16`, or `i64` followed by an integer. Output -/// as a binary integer with the specified number of bits. -/// -/// - One of the literals `SYSMIS`, `LOWEST`, or `HIGHEST`. Output as a -/// 64-bit IEEE 754 float of the appropriate PSPP value. -/// -/// - `PCSYSMIS`. 
Output as SPSS/PC+ system-missing value. -/// -/// - The literal `ENDIAN`. Output as a 32-bit binary integer, either with -/// value 1 if `--be` is in effect or 2 if `--le` is in effect. -/// -/// - A pair of parentheses enclosing a sequence of data items, each followed -/// by a semicolon (the last semicolon is optional). Output as the enclosed -/// data items in sequence. -/// -/// - The literal `COUNT` or `COUNT8` followed by a sequence of parenthesized -/// data items, as above. Output as a 32-bit or 8-bit binary integer whose -/// value is the number of bytes enclosed within the parentheses, followed -/// by the enclosed data items themselves. -/// -/// optionally followed by an asterisk and a positive integer, which specifies a -/// repeat count for the data item. -#[derive(Parser, Debug)] -struct Args { - /// Big-endian output format (default) - #[arg(long = "be")] - be: bool, - - /// Little-endian output format - #[arg(long = "le")] - le: bool, - - /// Input file. - #[arg(required = true, name = "input")] - input_file_name: PathBuf, - - /// Output file. - #[arg(required = true, name = "output")] - output_file_name: PathBuf, -} - -fn main() -> Result<()> { - let Args { - be, - le, - input_file_name, - output_file_name, - } = Args::parse(); - let endian = match (be, le) { - (false, false) | (true, false) => Endian::Big, - (false, true) => Endian::Little, - (true, true) => return Err(anyhow!("can't use both `--be` and `--le`")), - }; - - let input_file_str = input_file_name.to_string_lossy(); - let input = read_to_string(&input_file_name) - .map_err(|err| anyhow!("{input_file_str}: read failed ({err})"))?; - - let output = sack(&input, Some(&input_file_str), endian)?; - - let output_file_str = output_file_name.to_string_lossy(); - std::fs::write(&output_file_name, output) - .map_err(|err| anyhow!("{output_file_str}: write failed ({err})"))?; - - Ok(()) -}