--- /dev/null
+#! /usr/bin/python3
+
+# Copyright (C) 2020, 2021 Free Software Foundation
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+import os
+import re
+import sys
+
def process_converter(fields):
    """Record the code page numbers named by one converter entry.

    fields is the list of whitespace-separated tokens that make up one
    converter entry.  Each name may be followed by a brace-delimited list
    of standards tags, e.g. ``name { IANA* MIME }``.

    Names without a tag list are ignored.  Tagged names are collected in
    preference order:

      * names tagged ``IANA*`` first, then names tagged ``IANA``;
      * then names carrying any other starred tag, then all remaining
        tagged names.

    A tagged name of the form ``cpN``, ``windows-N`` or ``ibm-N`` also
    supplies code page number N for the CP, WINDOWS or IBM source.  For
    every such (source, N) pair the global ``codepages`` table gets
    ``codepages[N][source] = <ordered name list>``.

    Nothing is recorded when fields is empty, when it starts with ``{``,
    or when no name in it is tagged.  A tag list that is missing its
    closing ``}`` is treated as extending to the end of fields instead of
    raising IndexError.
    """
    if not fields or fields[0] == '{':
        return

    # Resolved at call time: the source constants are module globals that
    # are assigned after this function is defined.
    source_by_prefix = {'cp': CP, 'windows-': WINDOWS, 'ibm-': IBM}

    cps = {}        # source -> code page number found in this entry
    iana = []       # IANA-tagged names, preferred ("IANA*") ones first
    other = []      # other tagged names, starred ones first

    count = len(fields)
    i = 0
    while i < count:
        name = fields[i]
        i += 1

        if i >= count or fields[i] != '{':
            # Untagged names are completely nonstandard.
            continue
        i += 1

        # Gather tags up to the closing brace, or to the end of the entry
        # if the brace is missing.
        standards = set()
        while i < count and fields[i] != '}':
            standards.add(fields[i])
            i += 1
        i += 1      # Step over the '}'.

        if 'IANA*' in standards:
            iana.insert(0, name)
        elif 'IANA' in standards:
            iana.append(name)
        elif any(tag.endswith('*') for tag in standards):
            other.insert(0, name)
        else:
            other.append(name)

        # A later name for the same source overrides an earlier one.
        m = re.match(r'(cp|windows-|ibm-)([0-9]+)$', name)
        if m:
            cps[source_by_prefix[m.group(1)]] = int(m.group(2))

    # If there are no tagged names then this is completely nonstandard.
    if not iana and not other:
        return

    for source, number in cps.items():
        codepages.setdefault(number, {})[source] = iana + other
+
# Command-line handling: exactly one argument, the path to ICU's
# convrtrs.txt, is expected.  "--help" prints the usage text on stdout and
# exits successfully; any other argument count prints it on stderr and
# exits with status 1.
_usage = """\
%s: generate code page tables from ICU encoding list
usage: %s CONVRTRS-TXT > sys-file-encoding.c

To update the encoding data, get the latest ICU encoding data from:
https://raw.githubusercontent.com/unicode-org/icu/\
main/icu4c/source/data/mappings/convrtrs.txt
""" % (sys.argv[0], sys.argv[0])

if len(sys.argv) == 2 and sys.argv[1] == '--help':
    # Requested help is normal output, so it belongs on stdout.
    sys.stdout.write(_usage)
    sys.exit(0)
elif len(sys.argv) != 2:
    sys.stderr.write(_usage)
    sys.exit(1)
+
# Identifiers for the naming schemes a code page number can come from.
# Their numeric order matters: where several schemes compete, the output
# code below takes the largest value first (WINDOWS, then IBM, then CP).
CP, IBM, WINDOWS = 1, 2, 3
#   CP      - "cpNNN" names (Java code pages?)
#   IBM     - "ibm-NNN" names
#   WINDOWS - "windows-NNN" names

# Printable label for each scheme.
sources = {
    WINDOWS: "windows",
    IBM: "ibm",
    CP: "cp",
}

# Maps code page number -> {scheme: [encoding names]}; filled in by
# process_converter().
codepages = dict()
+
# Read the converter list named on the command line.  '#' starts a comment
# that runs to the end of the line.  A line that does not begin with
# whitespace (including an empty one) ends the current converter entry and
# starts a new one; indented lines continue the current entry.  Each
# completed entry is passed to process_converter() as a flat list of its
# whitespace-separated tokens.
converter = []
with open(sys.argv[1], 'r') as convrtrs:
    for line in convrtrs:
        # Drop trailing whitespace first, then any comment.
        line = line.rstrip().partition('#')[0]
        if line.lstrip() == line:
            process_converter(converter)
            converter = []
        converter.extend(line.split())
# Flush the final entry, which no following line terminates.
process_converter(converter)
+
# Emit the generated-file banner, the license notice, the #include lines,
# and the opening of the table mapping code page numbers to names.
print("""\
/* -*- mode: c; buffer-read-only: t -*-

 Generated by sys-file-encoding.py. Do not modify!
*/

/*
PSPP - a program for statistical analysis.
Copyright (C) 2017 Free Software Foundation, Inc.

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/

#include <config.h>

#include "data/sys-file-private.h"

struct sys_encoding sys_codepage_number_to_name[] = {""")

# One row per code page number, in ascending order.  The name comes from
# the highest-valued source recorded for that number, and is the first
# name in that source's list.
for cpnumber in sorted(codepages):
    by_source = codepages[cpnumber]
    best_source = max(by_source)
    print(' { %s, "%s" },' % (cpnumber, by_source[best_source][0]))

# Sentinel row, end of the array, and a blank separator line.
print(""" { 0, NULL }
};
""")
+
# Invert codepages: names[name][source] is the list of code page numbers
# recorded for that name under that source, in ascending numeric order.
names = {}
for cpnumber in sorted(codepages):
    for source, aliases in codepages[cpnumber].items():
        for alias in aliases:
            per_source = names.setdefault(alias, {})
            per_source.setdefault(source, []).append(cpnumber)

# Sources from the highest identifier down to the lowest.
source_priority = sorted(sources, reverse=True)

print('struct sys_encoding sys_codepage_name_to_number[] = {')
for name in sorted(names):
    per_source = names[name]

    # Only the highest-priority source that knows this name is used.
    chosen = next((s for s in source_priority if s in per_source), None)
    if chosen is None:
        continue
    numbers = per_source[chosen]

    # The only two encodings that currently print this are KSC_5601
    # and KS_C_5601-1987, for code pages 949 and 51949.  It looks to
    # me like the correct code page number is 949, which is the one
    # chosen (because the numbers are in sorted order).
    if len(numbers) > 1:
        listed = ' '.join(str(number) for number in numbers)
        print(' /* %s has multiple numbers for %s: %s */'
              % (name, sources[chosen], listed))
    print(' { %s, "%s" },' % (numbers[0], name))

# Sentinel row and end of the array.
print("""\
 { 0, NULL }
};""")
+