(in-package :asdf-encodings)
-;;;; Before we declare this file done, I propose we reimplement
-;;;; an algorithm compatible with the one used by Emacs.
+#|
+;; Before we declare this file done, I propose we reimplement
+;; an algorithm compatible with the one used by Emacs.
;; find-auto-coding is a compiled Lisp function in `mule.el'.
;; (find-auto-coding FILENAME SIZE)
;; Find a coding system for a file FILENAME of which SIZE bytes follow point.
;; hack-local-variables is a compiled Lisp function in `files.el`
;; (hack-local-variables &optional MODE-ONLY)
;; Parse and put into effect this buffer's local variables spec.
+|#
+
+
+;;;;; What's below is based on code by Douglas Crosher
-;;; What's below is based on code by Douglas Crosher
;;; Examine the octect 'stream and returns three values:
;;; 1. True if valid UTF-8.
-;;; 2. True if valid UTF-8 and UTF-8 specific sequences were found.
+;;; 2. True if non-ASCII octets were found.
+;;; (If valid UTF-8, then only UTF-8 specific sequences were found.)
;;; 3. True if valid UTF-8 and the UTF-8 BOM was found at the start.
(defun detect-utf-8 (file)
(with-open-file (stream file :direction :input
- :element-type '(unsigned-byte 8))
+ :element-type '(unsigned-byte 8))
(flet ((extrap (c)
- (= (logand c #xc0) #x80))
- (b2-leading-p (c)
- (= (logand c #xe0) #xc0))
- (b3-leading-p (c)
- (= (logand c #xf0) #xe0))
- (b4-leading-p (c)
- (= (logand c #xf8) #xf0))
- (b5-leading-p (c)
- (= (logand c #xfc) #xf8)))
+ (= (logand c #xc0) #x80))
+ (b2-leading-p (c)
+ (= (logand c #xe0) #xc0))
+ (b3-leading-p (c)
+ (= (logand c #xf0) #xe0))
+ (b4-leading-p (c)
+ (= (logand c #xf8) #xf0))
+ (b5-leading-p (c)
+ (= (logand c #xfc) #xf8)))
(let ((bom-found-p nil)
- (foundp nil))
- (loop
+ (nonasciip nil))
+ (loop
(let ((b (read-byte stream nil nil)))
(cond ((not b)
(return))
(let ((b1 (read-byte stream nil nil)))
(cond ((or (not b1) (not (extrap b1)))
(return-from detect-utf-8
- (values nil nil nil)))
+ (values nil t nil)))
((b2-leading-p b)
- (setf foundp t))
+ (setf nonasciip t))
(t
(let ((b2 (read-byte stream nil nil)))
(cond ((or (not b2) (not (extrap b2)))
(return-from detect-utf-8
- (values nil nil nil)))
+ (values nil t nil)))
((b3-leading-p b)
- (setf foundp t)
- (when (and foundp
+ (setf nonasciip t)
+ (when (and nonasciip
(= b #xef)
(= b1 #xbb)
(= b2 #xbf))
(let ((b3 (read-byte stream nil nil)))
(cond ((or (not b3) (not (extrap b3)))
(return-from detect-utf-8
- (values nil nil nil)))
+ (values nil t nil)))
((b4-leading-p b)
- (setf foundp t))
+ (setf nonasciip t))
(t
(let ((b4 (read-byte stream nil nil)))
(cond ((or (not b4) (not (extrap b4)))
(return-from detect-utf-8
- (values nil nil nil)))
+ (values nil t nil)))
((b5-leading-p b)
- (setf foundp t))
+ (setf nonasciip t))
(t
(return-from detect-utf-8
- (values nil nil nil))))))))))))))))))
- (values t foundp bom-found-p)))))
+ (values nil t nil))))))))))))))))))
+ (values t nonasciip bom-found-p)))))
(defun decode-ascii-encoded-declaration (buffer available start size offset)
(type (integer 1 4) size)
(type (integer 0 3) offset))
;; Convert the buffered chunk to ASCII.
- (let ((ascii (make-string 320 :initial-element #\?))
+ (let ((ascii (make-string 1024 :initial-element #\?))
(ascii-end 0))
(do ()
((< available (+ start size)))
((cdr (assoc "encoding" options :test 'equalp)))
((cdr (assoc "coding" options :test 'equalp)))))))
-#|
-;;; Examine the 'file to determine the encoding. In some cases the
-;;; encoding can be determined from the coding of the file itself,
-;;; otherwise it may be specified in a file options line with the
-;;; 'external-format', 'encoding', or 'coding' options. If the
-;;; encoding is not detected or declared but is valid UTF-8 using then
-;;; UTF-8 specific characters then :utf-8 is returned, otherwise
-;;; :default is returned.
-(defvar *detect-lisp-source-encoding* t)
-;;;
-(defun lisp-source-encoding (file)
- (unless *detect-lisp-source-encoding*
- (return-from lisp-source-encoding :default))
+;;; Examine the 'file to determine the encoding.
+;;; In some cases the encoding can be determined
+;;; from the coding of the file itself,
+;;; otherwise it may be specified in a file options line
+;;; with the 'external-format', 'encoding', or 'coding' options.
+;;; If the encoding is not detected or declared
+;;; but is valid UTF-8 using then UTF-8 specific characters
+;;; then :utf-8 is returned, otherwise :latin1 is returned.
+
+(defun detect-encoding (file)
(let ((initial-encoding nil)
- (declared-encoding nil))
+ (declared-encoding nil))
(with-open-file (s file :element-type '(unsigned-byte 8)
- :direction :input)
+ :direction :input)
;; Buffer a chunk from the start of the file.
- (let* ((buffer (make-array 320 :element-type '(unsigned-byte 8)))
- (available (read-sequence buffer s)))
+ (let* ((buffer (make-array 1024 :element-type '(unsigned-byte 8)))
+ (available (read-sequence buffer s)))
(labels ((decode-declaration (start size offset)
;; Look at the first four bytes to determine the encoding.
- (decode-ascii-encoded-declaration
- buffer available start size offset)))
+ (setf declared-encoding
+ (decode-ascii-encoded-declaration
+ buffer available start size offset))))
(cond ((>= available 4)
(let ((b1 (aref buffer 0))
(b2 (aref buffer 1))
(cond ((and (= b1 #x00) (= b2 #x00) (= b3 #xFE) (= b4 #xFF))
;; UCS-4, big-endian (1234 order).
(setf initial-encoding :ucs-4be)
- (decode-ascii-encoded-declaration buffer available 4 4 3))
+ (decode-declaration 4 4 3))
((and (= b1 #xff) (= b2 #xfe))
- (cond ((and (= b3 #x00) (= b4 #x00))
- ;; UCS-4, little-endian (4321 order).
- (setf initial-encoding :ucs-4le)
- (decode-ascii-encoded-declaration buffer 4 4 0))
- (t
- ;; UTF-16, little-endian
- (setf initial-encoding :utf-16le)
- (decode-ascii-encoded-declaration buffer 2 2 0))))
- ((and (= b1 #x00) (= b2 #x00) (= b3 #xFF) (= b4 #xFE))
- ;; UCS-4, order (2143).
- (decode-ascii-encoded-declaration buffer 4 4 2))
- ((and (= b1 #xfe) (= b2 #xff))
- (cond ((and (= b3 #x00) (= b4 #x00))
- ;; UCS-4, order (3412).
- (decode-ascii-encoded-declaration buffer 4 4 1))
- (t
- ;; UTF-16, big-endian.
- (setf initial-encoding :utf-16be)
- (decode-ascii-encoded-declaration buffer 2 2 1))))
- ((and (= b1 #xEF) (= b2 #xBB) (= b3 #xBF))
- ;; UTF-8 BOM.
- (setf initial-encoding :utf-8)
- (decode-ascii-encoded-declaration buffer 3 1 0))
- ;;
- ;; Without a byte order mark, check for ASCII ';'.
- ((and (= b1 #x3B) (= b2 #x00) (= b3 #x00) (= b4 #x00))
- (setf initial-encoding :ucs-4le)
- (decode-ascii-encoded-declaration buffer 0 4 0))
- ((and (= b1 #x00) (= b2 #x3B) (= b3 #x00) (= b4 #x00))
- (decode-ascii-encoded-declaration buffer 0 4 1))
- ((and (= b1 #x00) (= b2 #x00) (= b3 #x3B) (= b4 #x00))
- (decode-ascii-encoded-declaration buffer 0 4 2))
- ((and (= b1 #x00) (= b2 #x00) (= b3 #x00) (= b4 #x3B))
- (setf initial-encoding :ucs-4be)
- (decode-ascii-encoded-declaration buffer 0 4 3))
- ;;
- ;; Check for ASCII ';;'.
- ((and (= b1 #x3B) (= b2 #x00) (= b3 #x3B) (= b4 #x00))
- (setf initial-encoding :utf-16le)
- (decode-ascii-encoded-declaration buffer 0 2 0))
- ((and (= b1 #x00) (= b2 #x3B) (= b3 #x00) (= b4 #x3B))
- (setf initial-encoding :utf-16be)
- (decode-ascii-encoded-declaration buffer 0 2 1))
- ((and (= b1 #x3B) (= b2 #x3B))
- (setf initial-encoding :utf-8-auto)
- (decode-ascii-encoded-declaration buffer 0 1 0))
- ;;
- ;; Check for UTF-7 ';'.
- ((and (= b1 #x2B) (= b2 #x41) (= b3 #x44))
- (setf initial-encoding :utf-7))
- ;;
- ;; Check for ISO-2022-KR.
- ((and (= b1 #x1B) (= b2 #x24) (= b3 #x29) (= b4 #x43))
- (setf initial-encoding :iso-2022-kr)
- (decode-ascii-encoded-declaration buffer 4 1 0))
- ;;
- ;; Check for EBCDIC ';;'.
- ((and (= b1 #x5e) (= b2 #x5e))
- ;; EBCDIC - TODO read the declaration to determine the code page.
- (setf initial-encoding :ebcdic-us))
- (t
- ;; Not detected and no declaration, detect UTF-8.
- (setf initial-encoding :utf-8-auto)))))
- ((= available 3)
- (let ((b1 (aref buffer 0))
- (b2 (aref buffer 1))
- (b3 (aref buffer 2)))
- (cond ((and (= b1 #xEF) (= b2 #xBB) (= b3 #xBF))
- ;; UTF-8 BOM.
- (setf initial-encoding :utf-8))
- (t
- (setf initial-encoding :utf-8-auto)))))
- ((= available 2)
- (let ((b1 (aref buffer 0))
- (b2 (aref buffer 1)))
- (cond ((and (= b1 #xff) (= b2 #xfe))
- ;; UTF-16, little-endian
- (setf initial-encoding :utf-16le))
- ((and (= b1 #xfe) (= b2 #xff))
- ;; UTF-16, big-endian.
- (setf initial-encoding :utf-16be))
- (t
- (setf initial-encoding :default)))))
- (t
- ;; Empty file - just use the default.
- (setf initial-encoding :default))))))
+ (cond ((and (= b3 #x00) (= b4 #x00))
+ ;; UCS-4, little-endian (4321 order).
+ (setf initial-encoding :ucs-4le)
+ (decode-declaration 4 4 0))
+ (t
+ ;; UTF-16, little-endian
+ (setf initial-encoding :utf-16le)
+ (decode-declaration 2 2 0))))
+ ((and (= b1 #x00) (= b2 #x00) (= b3 #xFF) (= b4 #xFE))
+ ;; UCS-4, order (2143).
+ (decode-declaration 4 4 2))
+ ((and (= b1 #xfe) (= b2 #xff))
+ (cond ((and (= b3 #x00) (= b4 #x00))
+ ;; UCS-4, order (3412).
+ (decode-declaration 4 4 1))
+ (t
+ ;; UTF-16, big-endian.
+ (setf initial-encoding :utf-16be)
+ (decode-declaration 2 2 1))))
+ ((and (= b1 #xEF) (= b2 #xBB) (= b3 #xBF))
+ ;; UTF-8 BOM.
+ (setf initial-encoding :utf-8)
+ (decode-declaration 3 1 0))
+ ;;
+ ;; Without a byte order mark, check for ASCII ';'.
+ ((and (= b1 #x3B) (= b2 #x00) (= b3 #x00) (= b4 #x00))
+ (setf initial-encoding :ucs-4le)
+ (decode-declaration 0 4 0))
+ ((and (= b1 #x00) (= b2 #x3B) (= b3 #x00) (= b4 #x00))
+ (decode-declaration 0 4 1))
+ ((and (= b1 #x00) (= b2 #x00) (= b3 #x3B) (= b4 #x00))
+ (decode-declaration 0 4 2))
+ ((and (= b1 #x00) (= b2 #x00) (= b3 #x00) (= b4 #x3B))
+ (setf initial-encoding :ucs-4be)
+ (decode-declaration 0 4 3))
+ ;;
+ ;; Check for ASCII ';;'.
+ ((and (= b1 #x3B) (= b2 #x00) (= b3 #x3B) (= b4 #x00))
+ (setf initial-encoding :utf-16le)
+ (decode-declaration 0 2 0))
+ ((and (= b1 #x00) (= b2 #x3B) (= b3 #x00) (= b4 #x3B))
+ (setf initial-encoding :utf-16be)
+ (decode-declaration 0 2 1))
+ ((and (= b1 #x3B) (= b2 #x3B))
+ (setf initial-encoding :utf-8-auto)
+ (decode-declaration 0 1 0))
+ ;;
+ ;; Check for UTF-7 ';'.
+ ((and (= b1 #x2B) (= b2 #x41) (= b3 #x44))
+ (setf initial-encoding :utf-7))
+ ;;
+ ;; Check for ISO-2022-KR.
+ ((and (= b1 #x1B) (= b2 #x24) (= b3 #x29) (= b4 #x43))
+ (setf initial-encoding :iso-2022-kr)
+ (decode-declaration 4 1 0))
+ ;;
+ ;; Check for EBCDIC ';;'.
+ ((and (= b1 #x5e) (= b2 #x5e))
+ ;; EBCDIC - TODO read the declaration to determine the code page.
+ (setf initial-encoding :ebcdic-us))
+ (t
+ ;; Not detected and no declaration, detect UTF-8.
+ (setf initial-encoding :utf-8-auto)))))
+ ((= available 3)
+ (let ((b1 (aref buffer 0))
+ (b2 (aref buffer 1))
+ (b3 (aref buffer 2)))
+ (cond ((and (= b1 #xEF) (= b2 #xBB) (= b3 #xBF))
+ ;; UTF-8 BOM.
+ (setf initial-encoding :utf-8))
+ (t
+ (setf initial-encoding :utf-8-auto)))))
+ ((= available 2)
+ (let ((b1 (aref buffer 0))
+ (b2 (aref buffer 1)))
+ (cond ((and (= b1 #xff) (= b2 #xfe))
+ ;; UTF-16, little-endian
+ (setf initial-encoding :utf-16le))
+ ((and (= b1 #xfe) (= b2 #xff))
+ ;; UTF-16, big-endian.
+ (setf initial-encoding :utf-16be))
+ (t
+ (setf initial-encoding :latin1)))))
+ (t
+ ;; Empty file - just use the default.
+ (setf initial-encoding :default))))))
(cond ((and (not initial-encoding) (not declared-encoding))
- :default)
- ((or (and (not declared-encoding)
- (eq initial-encoding :utf-8-auto))
- (equalp declared-encoding :utf-8-auto))
- (multiple-value-bind (valid-utf-8 utf-8-use utf-8-bom)
- (detect-utf-8 file)
- (cond ((and valid-utf-8 (or utf-8-use utf-8-bom))
- :utf-8)
- (t
- :default))))
- ((or (not declared-encoding)
- (member initial-encoding '(:utf-16le :utf-16be
- :ucs-4le :ucs-4be)))
- ;; Use the detected encoding.
- initial-encoding)
- (t
- declared-encoding))))
-
-;;; Based on code by pjb
-(defun parse-emacs-variables (line)
- (when (search "-*-" line)
- (flet ((chunk (text start end) (string-trim " " (subseq text start end))))
- (loop
- :with start = (+ 3 (search "-*-" line))
- :with end = (or (search "-*-" line :start2 start) (length line))
- :with result = '()
- :for colon = (and (< start end) (position #\: line :start start))
- :while (and colon (< colon end))
- :do (let ((vend
- (or (and (< (1+ colon) end)
- (position #\; line :start (1+ colon) :end end))
- end)))
- (push (intern (string-upcase (chunk line start colon)) "KEYWORD")
- result)
- (push (chunk line (1+ colon) vend) result)
- (setf start (1+ vend)))
- :finally (return (nreverse result))))))
-|#
+ :latin1)
+ ((or (and (not declared-encoding)
+ (eq initial-encoding :utf-8-auto))
+ (equalp declared-encoding :utf-8-auto))
+ (multiple-value-bind (valid-utf-8 nonasciip utf-8-bom)
+ (detect-utf-8 file)
+ (cond ((and valid-utf-8 (or nonasciip utf-8-bom))
+ :utf-8)
+ (nonasciip
+ :latin1)
+ (t
+ :ascii))))
+ ((or (not declared-encoding)
+ (member initial-encoding '(:utf-16le :utf-16be
+ :ucs-4le :ucs-4be)))
+ ;; Use the detected encoding.
+ initial-encoding)
+ (t
+ declared-encoding))))