encodings.lisp

#+xcvb (module (:depends-on ("pkgdcl")))

;;; http://www.iana.org/assignments/character-sets

(in-package :asdf-encodings)

(defparameter *encodings*
  ;; Define valid names for an encoding.
  ;; We prefer the main name in Wikipedia,
  ;; but use dashes whenever Wikipedia uses underscores.
  ;; We also accept some common aliases
  ;; We probably should grab an official list from the IANA or something,
  ;; and match it against encodings known to any and all Lisp implementation???
  '((:utf-8 :utf8 :u8) ; our preferred default, environment-independent.
    (:us-ascii :ascii :iso-646-us :ANSI_X3.4-1968) ; in practice the lowest common denominator
    (:iso-646 :|646|) ; even lower common denominator for old international encodings
    ;;; ISO/IEC 8859
    (:iso-8859-1 :iso8859-1 :latin1 :latin-1 :l1 ; direct mapping to first 256 unicode characters
     :iso_8859-1 :iso-ir-100 :csISOLatin1 :ibm819 :cp819 :windows-28591)
    (:iso-8859-2 :iso8859-2 :latin2 :latin-2) ; eastern european; not to be confused with dos-cp852
    (:iso-8859-3 :iso8859-3 :latin3 :latin-3) ; esperanto, maltese, (turkish)
    (:iso-8859-4 :iso8859-4 :latin4 :latin-4) ; prefer latin6, utf-8.
    (:iso-8859-5 :iso8859-5) ; cyrillic; prefer koi8-r, or utf-8
    (:iso-8859-6 :iso8859-6) ; arabic
    (:iso-8859-7 :iso8859-7 :ecma-118) ; greek
    (:iso-8859-8 :iso8859-8) ; hebrew
    (:iso-8859-9 :iso8859-9 :latin5 :latin-5) ; turkish
    (:iso-8859-10 :iso8859-10 :latin6 :latin-6) ; nordic languages
    (:iso-8859-11 :iso8859-11) ; almost same as TIS 620 which is preferred for thai
    ;;(:iso-8859-12 :iso8859-12) ; abandoned. Was meant for devanagari. Use
    (:iso-8859-13 :iso8859-13 :latin7 :latin-7) ; baltic rim
    (:iso-8859-14 :iso8859-14 :latin8 :latin-8) ; celtic
    (:iso-8859-15 :iso8859-15 :latin9 :latin-9) ; formerly also latin0. Tweak of latin1.
    (:iso-8859-16 :iso8859-16 :latin10 :latin-10) ; south-eastern european
    ;;; Windows code pages
    (:windows-1250 :windows-cp1250 :cp-1250 :cp1250 :ms-ee) ; eastern european
    (:windows-1251 :windows-cp1251 :cp-1251 :cp1251 :ms-cyrl) ; russian
    (:windows-1252 :windows-cp1252 :cp-1252 :cp1252 :ms-ansi :windows-latin1) ; superset of latin1
    (:windows-1253 :windows-cp1253 :cp-1253 :cp1253 :ms-greek) ; not quite iso-8859-7; prefer UTF-8
    (:windows-1254 :windows-cp1254 :cp-1254 :cp1254 :ms-turk) ; superset of iso-8859-9; prefer UTF-8
    (:windows-1255 :windows-cp1255 :cp-1255 :cp1255 :ms-hebr) ; mostly iso-8859-8; prefer UTF-8
    (:windows-1256 :windows-cp1256 :cp-1256 :cp1256 :ms-arab) ; incompatible w/ iso-8859-6; use UTF-8
    (:windows-1257 :windows-cp1257 :cp-1257 :cp1257 :ms-baltic :winbaltrim) ; prefer UTF-8
    (:windows-1258 :windows-cp1258 :cp-1258 :cp1258 :ms-viet) ; vietnamese combining. Prefer UTF-8.
    (:windows-cp932 :cp-932 :cp932 :windows-31j) ; Microsoft extension of Shift-JIS
    (:windows-cp936 :cp-936 :cp936) ; Simplified Chinese. Extends GB2312 with most of GBK. Use 54936.
    (:windows-cp949 :cp-949 :cp949) ; variant of EUC-KR. Use UTF-8
    (:windows-cp950 :cp-950 :cp950) ; Microsoft variant of Big5
    ;;; DOS code pages
    ;;; For some of these CLISP has both "ms" and "ibm" variants
    ;;; as in cp437 and cp437-ibm. We write CLISP!? next to them.
    (:dos-cp437 :cp-437 :cp437) ; Original IBM PC character set. CLISP!?
    (:dos-cp737 :cp-737 :cp737) ; Greek.
    (:dos-cp775 :cp-775 :cp775) ; Estonian, Lithuanian and Latvian
    (:dos-cp850 :cp-850 :cp850) ; Western Europe. Default MS-DOS code page in Windows 95.
    (:dos-cp852 :cp-852 :cp852) ; Central Europe. CLISP!?
    (:dos-cp855 :cp-855 :cp855) ; Russian. Not used much.
    (:dos-cp856 :cp-856 :cp856) ; Yet another russian code page.
    (:dos-cp857 :cp-857 :cp857) ; Turkish.
    (:dos-cp860 :cp-860 :cp860) ; Portuguese. CLISP!?
    (:dos-cp861 :cp-861 :cp861) ; Icelandic. CLISP!?
    (:dos-cp862 :cp-862 :cp862) ; Hebrew. CLISP!?
    (:dos-cp863 :cp-863 :cp863) ; French (mainly used in Quebec). CLISP!?
    (:dos-cp864 :cp-864 :cp864) ; Arabic. CLISP!?
    (:dos-cp865 :cp-865 :cp865) ; Nordic except Icelandic. CLISP!?
    (:dos-cp866 :cp-866 :cp866) ; Russian. Once popular.
    (:dos-cp869 :cp-869 :cp869 :dos-greek-2) ; failed before 737. CLISP!?
    (:dos-cp874 :cp-874 :cp874) ; Thai. Extension of TIS-620. CLISP!?
    (:cp-1133 :cp1133) ; IBM code page for lao
    ;;; Mac code pages
    (:mac-roman :macintosh :macos-roman) ; the latter name used by lispworks
    (:mac-arabic)
    (:mac-central-europe :mac-latin2) ; mac-latin2 name used by cmucl
    (:mac-croatian)
    (:mac-cyrillic :x-mac-cyrillic) ; sbcl calls it x-mac-cyrillic
    (:mac-greek)
    (:mac-hebrew)
    (:mac-icelandic :mac-iceland)
    (:mac-romania)
    (:mac-thai) ; extension of TIS-620.
    (:mac-turkish)
    (:mac-ukraine)
    (:mac-dingbat)
    (:mac-symbol)
    ;;; Implementation-specific hacks
    ;;(:beta-gk) ; CMUCL: ASCII encoding of ancient Greek
    ;;(:final-sigma) ; CMUCL: tweak final sigmas in greek (composable)
    ;;(:base64) ;; CLISP: base64-encoded latin1? composable?
    ;;(:cr :mac) (:crlf :dos) ; CMUCL: line ending tweaks (composable)
    ;; CJK character sets
    (:big5)
    (:big5-hkscs)
    (:euc-cn :euccn)
    (:euc-jp :eucjp)
    (:euc-kr :euckr)
    (:euc-tw :euctw)
    (:gbk) ; de facto standard of communist china, extends gb2312
    (:gb18030 :cp54936) ; official character set of communist china, extends gb2312 and gbk
    ;; ECMA-35 is same as ISO-2022, and free.
    (:iso-2022-jp) ; rfc 1468
    (:iso-2022-jp-1) ; rfc 2237
    (:iso-2022-jp-2) ; rfc 1554
    (:iso-2022-jp-3)
    (:iso-2022-jp-2004)
    (:iso-2022-kr) ; rfc 1557
    (:iso-2022-cn) (:iso-2022-cn-ext) ; both rfc-1922
    (:jis-x0201 :jisx0201 :jis_x0201) ; phonetic japanese katakana
    (:jis-x0208 :jisx0208 :jis_x0208)
    (:jis-x0212 :jisx0212 :jis_x0212)
    (:shift-jis :sjis)
    (:johab :ksc-5601 :ksc5601) ; korean
    ;;; Various National character sets
    (:armscii-8) ; armenian
    (:georgian-academy)
    (:georgian-ps)
    (:koi8-r :koi8r :cp-1866 :cp1866) ; russian
    (:koi8-u :koi8u :cp-21866 :cp21866) ; ukrainian
    (:tis-620) ; thai
    (:tcvn) ; viet
    (:viscii) ; viet
    ;;; Other computer-specific sets
    (:atascii :atarist) ; still supported by ECL
    (:hp-roman8) ; used on HP-UX
    ;;; EBCDIC
    (:cp-037 :cp037 :ibm037 :ibm-037 :ebcdic-us) ; latin1
    (:utf-ebcdic) ; utf-8
    ;;; Unicode encodings beside utf-8
    (:utf-7 :utf7) ; seldom used magic format for email
    (:cesu-8) ; kind of utf-8 encoded utf-16 (ugh). What oracle miscalls utf-8.
    (:java) ; java's modified UTF-8, like CESU-8 but with special encoding of U+0000.
    (:ucs-2 :ucs2) ; only BMP, may be either of the below
    (:ucs-2-le :ucs-2le :ucs2le)
    (:ucs-2-be :ucs-2be :ucs2be)
    (:utf-16 :utf16) ; may be either of the below
    (:utf-16-be :utf16be :utf16-be :utf-16be :unicode-16-big-endian) ;also unicode-16 in clisp
    (:utf-16-le :utf16le :utf16-le :utf-16le :unicode-16-little-endian)
    (:utf-32 :utf32 :ucs-4 :ucs4 :unicode-32) ; may be either of the below
    (:utf-32-be :utf32be :utf-32be :utf32-be :ucs-4-be :ucs-4be :unicode-32-big-endian)
    (:utf-32-le :utf32le :utf-32le :utf32-le :ucs-4-le :ucs-4le :unicode-32-little-endian)))

(defvar *normalized-encodings* (make-hash-table))

(defun find-implementation-encoding (encoding)
  (declare (ignorable encoding)) nil ;; default, for unsupported implementations
  #+abcl (normalize-encoding encoding) ;; we bootstrap that in initialize-normalized-encodings
  #+allegro (excl:find-external-format encoding)
  #+clozure (ignore-errors (ccl::normalize-external-format t encoding))
  #+clisp (asdf:find-symbol* encoding :charset)
  #+cmu (stream::find-external-format encoding)
  #+ecl (ignore-errors (ext:make-encoding encoding))
  #+lispworks
  (or
   (case encoding
     ;; lispworks supports a :jis, but which encoding is it?
     ((:latin-1 :ascii :utf-8 :sjis :euc-jp :macos-roman) encoding)
     ((:ucs-2) :unicode)
     ((:ucs-2-le) '(:unicode :little-endian t))
     ((:ucs-2-be) '(:unicode :little-endian nil)))
   #+windows
   (let ((s (string encoding)))
     (and (< 3 (length s)) (string-equal "cp-" s :end2 3)
          (multiple-value-bind (i l)
              (parse-integer s :start 3 :junk-allowed t)
            (and i (= l (length s)) `(win32:code-page :id ,i))))))
  #+sbcl (and (sb-impl::get-external-format encoding) encoding)
  #+scl (and (or (lisp::encoding-character-width encoding) ; only works for fixed-width
                 (member encoding '(:utf-8 :utf-16 :utf-16le :utf-16be))) ; more may exist
             encoding))

(defun initialize-normalized-encodings (&optional warn)
  #+abcl
  (loop :for name :in (let ((ae (find-symbol* :available-encodings :sys)))
                        (when ae (funcall ae)))
    :for n = (intern (string name) :keyword) ;; is this needed?
    :do (setf (gethash n *normalized-encodings*) name))
  ;; Special case for :default,
  ;; the meaning of which may vary depending on the environment.
  (setf (gethash :default *normalized-encodings*) :default)
  (loop :for names :in *encodings*
    :for (name encoding) = (loop :for n :in names
                             :for e = (find-implementation-encoding n)
                             :when e :return (list n e))
    :when name :do
    (loop :for n :in names
      :for e = (find-implementation-encoding n) :do
      (when (and e (not (eq e encoding)) warn)
        (warn "Encoding ~S differs from encoding for ~S yet registered as same" e encoding))
      (setf (gethash n *normalized-encodings*) name))))

(defun normalize-encoding (encoding)
  (values (gethash encoding *normalized-encodings*)))