Newer
Older
#+xcvb (module (:depends-on ("pkgdcl")))
Francois-Rene Rideau
committed
;;; http://www.iana.org/assignments/character-sets
Francois-Rene Rideau
committed
(in-package :asdf-encodings)
Francois-Rene Rideau
committed
(defparameter *encodings*
;; Define valid names for an encoding.
;; We prefer the main name in Wikipedia,
;; but use dashes whenever Wikipedia uses underscores.
;; We also accept some common aliases
;; We probably should grab an official list from the IANA or something,
;; and match it against encodings known to any and all Lisp implementation???
'((:utf-8 :utf8 :u8) ; our preferred default, environment-independent.
(:us-ascii :ascii :iso-646-us :ANSI_X3.4-1968) ; in practice the lowest common denominator
Francois-Rene Rideau
committed
(:iso-646 :|646|) ; even lower common denominator for old international encodings
;;; ISO/IEC 8859
(:iso-8859-1 :iso8859-1 :latin1 :latin-1 :l1 ; direct mapping to first 256 unicode characters
:iso_8859-1 :iso-ir-100 :csISOLatin1 :ibm819 :cp819 :windows-28591)
(:iso-8859-2 :iso8859-2 :latin2 :latin-2) ; eastern european; not to be confused with dos-cp852
(:iso-8859-3 :iso8859-3 :latin3 :latin-3) ; esperanto, maltese, (turkish)
(:iso-8859-4 :iso8859-4 :latin4 :latin-4) ; prefer latin6, utf-8.
Francois-Rene Rideau
committed
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
(:iso-8859-5 :iso8859-5) ; cyrillic; prefer koi8-r, or utf-8
(:iso-8859-6 :iso8859-6) ; arabic
(:iso-8859-7 :iso8859-7 :ecma-118) ; greek
(:iso-8859-8 :iso8859-8) ; hebrew
(:iso-8859-9 :iso8859-9 :latin5 :latin-5) ; turkish
(:iso-8859-10 :iso8859-10 :latin6 :latin-6) ; nordic languages
(:iso-8859-11 :iso8859-11) ; almost same as TIS 620 which is preferred for thai
;;(:iso-8859-12 :iso8859-12) ; abandoned. Was meant for devanagari. Use
(:iso-8859-13 :iso8859-13 :latin7 :latin-7) ; baltic rim
(:iso-8859-14 :iso8859-14 :latin8 :latin-8) ; celtic
(:iso-8859-15 :iso8859-15 :latin9 :latin-9) ; formerly also latin0. Tweak of latin1.
(:iso-8859-16 :iso8859-16 :latin10 :latin-10) ; south-eastern european
;;; Windows code pages
(:windows-1250 :windows-cp1250 :cp-1250 :cp1250 :ms-ee) ; eastern european
(:windows-1251 :windows-cp1251 :cp-1251 :cp1251 :ms-cyrl) ; russian
(:windows-1252 :windows-cp1252 :cp-1252 :cp1252 :ms-ansi :windows-latin1) ; superset of latin1
(:windows-1253 :windows-cp1253 :cp-1253 :cp1253 :ms-greek) ; not quite iso-8859-7; prefer UTF-8
(:windows-1254 :windows-cp1254 :cp-1254 :cp1254 :ms-turk) ; superset of iso-8859-9; prefer UTF-8
(:windows-1255 :windows-cp1255 :cp-1255 :cp1255 :ms-hebr) ; mostly iso-8859-8; prefer UTF-8
(:windows-1256 :windows-cp1256 :cp-1256 :cp1256 :ms-arab) ; incompatible w/ iso-8859-6; use UTF-8
(:windows-1257 :windows-cp1257 :cp-1257 :cp1257 :ms-baltic :winbaltrim) ; prefer UTF-8
(:windows-1258 :windows-cp1258 :cp-1258 :cp1258 :ms-viet) ; vietnamese combining. Prefer UTF-8.
(:windows-cp932 :cp-932 :cp932 :windows-31j) ; Microsoft extension of Shift-JIS
(:windows-cp936 :cp-936 :cp936) ; Simplified Chinese. Extends GB2312 with most of GBK. Use 54936.
(:windows-cp949 :cp-949 :cp949) ; variant of EUC-KR. Use UTF-8
(:windows-cp950 :cp-950 :cp950) ; Microsoft variant of Big5
;;; DOS code pages
;;; For some of these CLISP has both "ms" and "ibm" variants
;;; as in cp437 and cp437-ibm. We write CLISP!? next to them.
(:dos-cp437 :cp-437 :cp437) ; Original IBM PC character set. CLISP!?
(:dos-cp737 :cp-737 :cp737) ; Greek.
(:dos-cp775 :cp-775 :cp775) ; Estonian, Lithuanian and Latvian
(:dos-cp850 :cp-850 :cp850) ; Western Europe. Default MS-DOS code page in Windows 95.
(:dos-cp852 :cp-852 :cp852) ; Central Europe. CLISP!?
(:dos-cp855 :cp-855 :cp855) ; Russian. Not used much.
(:dos-cp856 :cp-856 :cp856) ; Yet another russian code page.
(:dos-cp857 :cp-857 :cp857) ; Turkish.
(:dos-cp860 :cp-860 :cp860) ; Portuguese. CLISP!?
(:dos-cp861 :cp-861 :cp861) ; Icelandic. CLISP!?
(:dos-cp862 :cp-862 :cp862) ; Hebrew. CLISP!?
(:dos-cp863 :cp-863 :cp863) ; French (mainly used in Quebec). CLISP!?
(:dos-cp864 :cp-864 :cp864) ; Arabic. CLISP!?
(:dos-cp865 :cp-865 :cp865) ; Nordic except Icelandic. CLISP!?
(:dos-cp866 :cp-866 :cp866) ; Russian. Once popular.
(:dos-cp869 :cp-869 :cp869 :dos-greek-2) ; failed before 737. CLISP!?
(:dos-cp874 :cp-874 :cp874) ; Thai. Extension of TIS-620. CLISP!?
(:cp-1133 :cp1133) ; IBM code page for lao
;;; Mac code pages
(:mac-roman :macintosh :macos-roman) ; the latter name used by lispworks
(:mac-arabic)
(:mac-central-europe :mac-latin2) ; mac-latin2 name used by cmucl
(:mac-croatian)
(:mac-cyrillic :x-mac-cyrillic) ; sbcl calls it x-mac-cyrillic
(:mac-greek)
(:mac-hebrew)
(:mac-icelandic :mac-iceland)
(:mac-romania)
(:mac-thai) ; extension of TIS-620.
(:mac-turkish)
(:mac-ukraine)
(:mac-dingbat)
(:mac-symbol)
;;; Implementation-specific hacks
;;(:beta-gk) ; CMUCL: ASCII encoding of ancient Greek
;;(:final-sigma) ; CMUCL: tweak final sigmas in greek (composable)
;;(:base64) ;; CLISP: base64-encoded latin1? composable?
;;(:cr :mac) (:crlf :dos) ; CMUCL: line ending tweaks (composable)
;; CJK character sets
(:big5)
(:big5-hkscs)
(:euc-cn :euccn)
(:euc-jp :eucjp)
(:euc-kr :euckr)
(:euc-tw :euctw)
(:gbk) ; de facto standard of communist china, extends gb2312
(:gb18030 :cp54936) ; official character set of communist china, extends gb2312 and gbk
;; ECMA-35 is same as ISO-2022, and free.
(:iso-2022-jp) ; rfc 1468
(:iso-2022-jp-1) ; rfc 2237
(:iso-2022-jp-2) ; rfc 1554
(:iso-2022-jp-3)
(:iso-2022-jp-2004)
(:iso-2022-kr) ; rfc 1557
(:iso-2022-cn) (:iso-2022-cn-ext) ; both rfc-1922
(:jis-x0201 :jisx0201 :jis_x0201) ; phonetic japanese katakana
(:jis-x0208 :jisx0208 :jis_x0208)
(:jis-x0212 :jisx0212 :jis_x0212)
(:shift-jis :sjis)
(:johab :ksc-5601 :ksc5601) ; korean
;;; Various National character sets
(:armscii-8) ; armenian
(:georgian-academy)
(:georgian-ps)
(:koi8-r :koi8r :cp-1866 :cp1866) ; russian
(:koi8-u :koi8u :cp-21866 :cp21866) ; ukrainian
(:tis-620) ; thai
(:tcvn) ; viet
(:viscii) ; viet
;;; Other computer-specific sets
(:atascii :atarist) ; still supported by ECL
(:hp-roman8) ; used on HP-UX
;;; EBCDIC
(:cp-037 :cp037 :ibm037 :ibm-037 :ebcdic-us) ; latin1
(:utf-ebcdic) ; utf-8
;;; Unicode encodings beside utf-8
(:utf-7 :utf7) ; seldom used magic format for email
(:cesu-8) ; kind of utf-8 encoded utf-16 (ugh). What oracle miscalls utf-8.
(:java) ; java's modified UTF-8, like CESU-8 but with special encoding of U+0000.
(:ucs-2 :ucs2) ; only BMP, may be either of the below
(:ucs-2-le :ucs-2le :ucs2le)
(:ucs-2-be :ucs-2be :ucs2be)
(:utf-16 :utf16) ; may be either of the below
(:utf-16-be :utf16be :utf16-be :utf-16be :unicode-16-big-endian) ;also unicode-16 in clisp
(:utf-16-le :utf16le :utf16-le :utf-16le :unicode-16-little-endian)
(:utf-32 :utf32 :ucs-4 :ucs4 :unicode-32) ; may be either of the below
(:utf-32-be :utf32be :utf-32be :utf32-be :ucs-4-be :ucs-4be :unicode-32-big-endian)
(:utf-32-le :utf32le :utf-32le :utf32-le :ucs-4-le :ucs-4le :unicode-32-little-endian)))
(defvar *normalized-encodings* (make-hash-table))
(defun find-implementation-encoding (encoding)
(declare (ignorable encoding)) nil ;; default, for unsupported implementations
#+abcl (normalize-encoding encoding) ;; we bootstrap that in initialize-normalized-encodings
#+allegro (excl:find-external-format encoding)
#+clozure (ignore-errors (ccl::normalize-external-format t encoding))
#+clisp (asdf:find-symbol* encoding :charset)
#+cmu (stream::find-external-format encoding)
#+ecl (ignore-errors (ext:make-encoding encoding))
#+lispworks
Francois-Rene Rideau
committed
(or
(case encoding
;; lispworks supports a :jis, but which encoding is it?
((:latin-1 :ascii :utf-8 :sjis :euc-jp :macos-roman) encoding)
Francois-Rene Rideau
committed
((:ucs-2) :unicode)
((:ucs-2-le) '(:unicode :little-endian t))
((:ucs-2-be) '(:unicode :little-endian nil)))
#+windows
(let ((s (string encoding)))
(and (< 3 (length s)) (string-equal "cp-" s :end2 3)
(multiple-value-bind (i l)
(parse-integer s :start 3 :junk-allowed t)
(and i (= l (length s)) `(win32:code-page :id ,i))))))
#+sbcl (and (sb-impl::get-external-format encoding) encoding)
#+scl (and (or (lisp::encoding-character-width encoding) ; only works for fixed-width
(member encoding '(:utf-8 :utf-16 :utf-16le :utf-16be))) ; more may exist
encoding))
Francois-Rene Rideau
committed
(defun initialize-normalized-encodings (&optional warn)
#+abcl
(loop :for name :in (let ((ae (find-symbol* :available-encodings :sys)))
(when ae (funcall ae)))
:for n = (intern (string name) :keyword) ;; is this needed?
:do (setf (gethash n *normalized-encodings*) name))
;; Special case for :default,
;; the meaning of which may vary depending on the environment.
(setf (gethash :default *normalized-encodings*) :default)
Francois-Rene Rideau
committed
(loop :for names :in *encodings*
:for (name encoding) = (loop :for n :in names
:for e = (find-implementation-encoding n)
:when e :return (list n e))
:when name :do
(loop :for n :in names
:for e = (find-implementation-encoding n) :do
(when (and e (not (eq e encoding)) warn)
(warn "Encoding ~S differs from encoding for ~S yet registered as same" e encoding))
(setf (gethash n *normalized-encodings*) name))))
(defun normalize-encoding (encoding)
Francois-Rene Rideau
committed
(values (gethash encoding *normalized-encodings*)))