diff --git a/asdf-support.lisp b/asdf-support.lisp index 1893b953d2cd93749062bf3734586c878e9c463a..ecc09cb180a246e95067432b8bcd106a27ca68b8 100644 --- a/asdf-support.lisp +++ b/asdf-support.lisp @@ -12,6 +12,6 @@ :default)) (defun register-asdf-encodings () - (setf asdf:*encoding-external-format-hook* 'encoding-external-format) + (setf asdf:*encoding-external-format-hook* 'encoding-external-format + asdf:*encoding-detection-hook* 'detect-encoding) (values)) - diff --git a/autodetect.lisp b/autodetect.lisp index d6800d8e0b0651d25d86ccf3ffa07cd8338dfd43..8aa0c7651284dde62a29fb239ed0aa0c1a33fb96 100644 --- a/autodetect.lisp +++ b/autodetect.lisp @@ -2,8 +2,9 @@ (in-package :asdf-encodings) -;;;; Before we declare this file done, I propose we reimplement -;;;; an algorithm compatible with the one used by Emacs. +#| +;; Before we declare this file done, I propose we reimplement +;; an algorithm compatible with the one used by Emacs. ;; find-auto-coding is a compiled Lisp function in `mule.el'. ;; (find-auto-coding FILENAME SIZE) ;; Find a coding system for a file FILENAME of which SIZE bytes follow point. @@ -15,28 +16,32 @@ ;; hack-local-variables is a compiled Lisp function in `files.el` ;; (hack-local-variables &optional MODE-ONLY) ;; Parse and put into effect this buffer's local variables spec. +|# + + +;;;;; What's below is based on code by Douglas Crosher -;;; What's below is based on code by Douglas Crosher ;;; Examine the octect 'stream and returns three values: ;;; 1. True if valid UTF-8. -;;; 2. True if valid UTF-8 and UTF-8 specific sequences were found. +;;; 2. True if non-ASCII octets were found. +;;; (If valid UTF-8, then only UTF-8 specific sequences were found.) ;;; 3. True if valid UTF-8 and the UTF-8 BOM was found at the start. (defun detect-utf-8 (file) (with-open-file (stream file :direction :input - :element-type '(unsigned-byte 8)) + :element-type '(unsigned-byte 8)) (flet ((extrap (c) - (= (logand c #xc0) #x80)) - (b2-leading-p (c) - (= (logand c #xe0) #xc0)) - (b3-leading-p (c) - (= (logand c #xf0) #xe0)) - (b4-leading-p (c) - (= (logand c #xf8) #xf0)) - (b5-leading-p (c) - (= (logand c #xfc) #xf8))) + (= (logand c #xc0) #x80)) + (b2-leading-p (c) + (= (logand c #xe0) #xc0)) + (b3-leading-p (c) + (= (logand c #xf0) #xe0)) + (b4-leading-p (c) + (= (logand c #xf8) #xf0)) + (b5-leading-p (c) + (= (logand c #xfc) #xf8))) (let ((bom-found-p nil) - (foundp nil)) - (loop + (nonasciip nil)) + (loop (let ((b (read-byte stream nil nil))) (cond ((not b) (return)) @@ -45,17 +50,17 @@ (let ((b1 (read-byte stream nil nil))) (cond ((or (not b1) (not (extrap b1))) (return-from detect-utf-8 - (values nil nil nil))) + (values nil t nil))) ((b2-leading-p b) - (setf foundp t)) + (setf nonasciip t)) (t (let ((b2 (read-byte stream nil nil))) (cond ((or (not b2) (not (extrap b2))) (return-from detect-utf-8 - (values nil nil nil))) + (values nil t nil))) ((b3-leading-p b) - (setf foundp t) - (when (and foundp + (setf nonasciip t) + (when (and nonasciip (= b #xef) (= b1 #xbb) (= b2 #xbf)) @@ -64,20 +69,20 @@ (let ((b3 (read-byte stream nil nil))) (cond ((or (not b3) (not (extrap b3))) (return-from detect-utf-8 - (values nil nil nil))) + (values nil t nil))) ((b4-leading-p b) - (setf foundp t)) + (setf nonasciip t)) (t (let ((b4 (read-byte stream nil nil))) (cond ((or (not b4) (not (extrap b4))) (return-from detect-utf-8 - (values nil nil nil))) + (values nil t nil))) ((b5-leading-p b) - (setf foundp t)) + (setf nonasciip t)) (t (return-from detect-utf-8 - (values nil nil nil)))))))))))))))))) - (values t foundp bom-found-p))))) + (values nil t nil)))))))))))))))))) + (values t nonasciip bom-found-p))))) (defun decode-ascii-encoded-declaration (buffer available start size offset) @@ -85,7 +90,7 @@ (type (integer 1 4) size) (type (integer 0 3) offset)) ;; Convert the buffered chunk to ASCII. - (let ((ascii (make-string 320 :initial-element #\?)) + (let ((ascii (make-string 1024 :initial-element #\?)) (ascii-end 0)) (do () ((< available (+ start size))) @@ -148,30 +153,28 @@ ((cdr (assoc "encoding" options :test 'equalp))) ((cdr (assoc "coding" options :test 'equalp))))))) -#| -;;; Examine the 'file to determine the encoding. In some cases the -;;; encoding can be determined from the coding of the file itself, -;;; otherwise it may be specified in a file options line with the -;;; 'external-format', 'encoding', or 'coding' options. If the -;;; encoding is not detected or declared but is valid UTF-8 using then -;;; UTF-8 specific characters then :utf-8 is returned, otherwise -;;; :default is returned. -(defvar *detect-lisp-source-encoding* t) -;;; -(defun lisp-source-encoding (file) - (unless *detect-lisp-source-encoding* - (return-from lisp-source-encoding :default)) +;;; Examine the 'file to determine the encoding. +;;; In some cases the encoding can be determined +;;; from the coding of the file itself, +;;; otherwise it may be specified in a file options line +;;; with the 'external-format', 'encoding', or 'coding' options. +;;; If the encoding is not detected or declared +;;; but is valid UTF-8 using then UTF-8 specific characters +;;; then :utf-8 is returned, otherwise :latin1 is returned. + +(defun detect-encoding (file) (let ((initial-encoding nil) - (declared-encoding nil)) + (declared-encoding nil)) (with-open-file (s file :element-type '(unsigned-byte 8) - :direction :input) + :direction :input) ;; Buffer a chunk from the start of the file. - (let* ((buffer (make-array 320 :element-type '(unsigned-byte 8))) - (available (read-sequence buffer s))) + (let* ((buffer (make-array 1024 :element-type '(unsigned-byte 8))) + (available (read-sequence buffer s))) (labels ((decode-declaration (start size offset) ;; Look at the first four bytes to determine the encoding. - (decode-ascii-encoded-declaration - buffer available start size offset))) + (setf declared-encoding + (decode-ascii-encoded-declaration + buffer available start size offset)))) (cond ((>= available 4) (let ((b1 (aref buffer 0)) (b2 (aref buffer 1)) @@ -180,130 +183,111 @@ (cond ((and (= b1 #x00) (= b2 #x00) (= b3 #xFE) (= b4 #xFF)) ;; UCS-4, big-endian (1234 order). (setf initial-encoding :ucs-4be) - (decode-ascii-encoded-declaration buffer available 4 4 3)) + (decode-declaration 4 4 3)) ((and (= b1 #xff) (= b2 #xfe)) - (cond ((and (= b3 #x00) (= b4 #x00)) - ;; UCS-4, little-endian (4321 order). - (setf initial-encoding :ucs-4le) - (decode-ascii-encoded-declaration buffer 4 4 0)) - (t - ;; UTF-16, little-endian - (setf initial-encoding :utf-16le) - (decode-ascii-encoded-declaration buffer 2 2 0)))) - ((and (= b1 #x00) (= b2 #x00) (= b3 #xFF) (= b4 #xFE)) - ;; UCS-4, order (2143). - (decode-ascii-encoded-declaration buffer 4 4 2)) - ((and (= b1 #xfe) (= b2 #xff)) - (cond ((and (= b3 #x00) (= b4 #x00)) - ;; UCS-4, order (3412). - (decode-ascii-encoded-declaration buffer 4 4 1)) - (t - ;; UTF-16, big-endian. - (setf initial-encoding :utf-16be) - (decode-ascii-encoded-declaration buffer 2 2 1)))) - ((and (= b1 #xEF) (= b2 #xBB) (= b3 #xBF)) - ;; UTF-8 BOM. - (setf initial-encoding :utf-8) - (decode-ascii-encoded-declaration buffer 3 1 0)) - ;; - ;; Without a byte order mark, check for ASCII ';'. - ((and (= b1 #x3B) (= b2 #x00) (= b3 #x00) (= b4 #x00)) - (setf initial-encoding :ucs-4le) - (decode-ascii-encoded-declaration buffer 0 4 0)) - ((and (= b1 #x00) (= b2 #x3B) (= b3 #x00) (= b4 #x00)) - (decode-ascii-encoded-declaration buffer 0 4 1)) - ((and (= b1 #x00) (= b2 #x00) (= b3 #x3B) (= b4 #x00)) - (decode-ascii-encoded-declaration buffer 0 4 2)) - ((and (= b1 #x00) (= b2 #x00) (= b3 #x00) (= b4 #x3B)) - (setf initial-encoding :ucs-4be) - (decode-ascii-encoded-declaration buffer 0 4 3)) - ;; - ;; Check for ASCII ';;'. - ((and (= b1 #x3B) (= b2 #x00) (= b3 #x3B) (= b4 #x00)) - (setf initial-encoding :utf-16le) - (decode-ascii-encoded-declaration buffer 0 2 0)) - ((and (= b1 #x00) (= b2 #x3B) (= b3 #x00) (= b4 #x3B)) - (setf initial-encoding :utf-16be) - (decode-ascii-encoded-declaration buffer 0 2 1)) - ((and (= b1 #x3B) (= b2 #x3B)) - (setf initial-encoding :utf-8-auto) - (decode-ascii-encoded-declaration buffer 0 1 0)) - ;; - ;; Check for UTF-7 ';'. - ((and (= b1 #x2B) (= b2 #x41) (= b3 #x44)) - (setf initial-encoding :utf-7)) - ;; - ;; Check for ISO-2022-KR. - ((and (= b1 #x1B) (= b2 #x24) (= b3 #x29) (= b4 #x43)) - (setf initial-encoding :iso-2022-kr) - (decode-ascii-encoded-declaration buffer 4 1 0)) - ;; - ;; Check for EBCDIC ';;'. - ((and (= b1 #x5e) (= b2 #x5e)) - ;; EBCDIC - TODO read the declaration to determine the code page. - (setf initial-encoding :ebcdic-us)) - (t - ;; Not detected and no declaration, detect UTF-8. - (setf initial-encoding :utf-8-auto))))) - ((= available 3) - (let ((b1 (aref buffer 0)) - (b2 (aref buffer 1)) - (b3 (aref buffer 2))) - (cond ((and (= b1 #xEF) (= b2 #xBB) (= b3 #xBF)) - ;; UTF-8 BOM. - (setf initial-encoding :utf-8)) - (t - (setf initial-encoding :utf-8-auto))))) - ((= available 2) - (let ((b1 (aref buffer 0)) - (b2 (aref buffer 1))) - (cond ((and (= b1 #xff) (= b2 #xfe)) - ;; UTF-16, little-endian - (setf initial-encoding :utf-16le)) - ((and (= b1 #xfe) (= b2 #xff)) - ;; UTF-16, big-endian. - (setf initial-encoding :utf-16be)) - (t - (setf initial-encoding :default))))) - (t - ;; Empty file - just use the default. - (setf initial-encoding :default)))))) + (cond ((and (= b3 #x00) (= b4 #x00)) + ;; UCS-4, little-endian (4321 order). + (setf initial-encoding :ucs-4le) + (decode-declaration 4 4 0)) + (t + ;; UTF-16, little-endian + (setf initial-encoding :utf-16le) + (decode-declaration 2 2 0)))) + ((and (= b1 #x00) (= b2 #x00) (= b3 #xFF) (= b4 #xFE)) + ;; UCS-4, order (2143). + (decode-declaration 4 4 2)) + ((and (= b1 #xfe) (= b2 #xff)) + (cond ((and (= b3 #x00) (= b4 #x00)) + ;; UCS-4, order (3412). + (decode-declaration 4 4 1)) + (t + ;; UTF-16, big-endian. + (setf initial-encoding :utf-16be) + (decode-declaration 2 2 1)))) + ((and (= b1 #xEF) (= b2 #xBB) (= b3 #xBF)) + ;; UTF-8 BOM. + (setf initial-encoding :utf-8) + (decode-declaration 3 1 0)) + ;; + ;; Without a byte order mark, check for ASCII ';'. + ((and (= b1 #x3B) (= b2 #x00) (= b3 #x00) (= b4 #x00)) + (setf initial-encoding :ucs-4le) + (decode-declaration 0 4 0)) + ((and (= b1 #x00) (= b2 #x3B) (= b3 #x00) (= b4 #x00)) + (decode-declaration 0 4 1)) + ((and (= b1 #x00) (= b2 #x00) (= b3 #x3B) (= b4 #x00)) + (decode-declaration 0 4 2)) + ((and (= b1 #x00) (= b2 #x00) (= b3 #x00) (= b4 #x3B)) + (setf initial-encoding :ucs-4be) + (decode-declaration 0 4 3)) + ;; + ;; Check for ASCII ';;'. + ((and (= b1 #x3B) (= b2 #x00) (= b3 #x3B) (= b4 #x00)) + (setf initial-encoding :utf-16le) + (decode-declaration 0 2 0)) + ((and (= b1 #x00) (= b2 #x3B) (= b3 #x00) (= b4 #x3B)) + (setf initial-encoding :utf-16be) + (decode-declaration 0 2 1)) + ((and (= b1 #x3B) (= b2 #x3B)) + (setf initial-encoding :utf-8-auto) + (decode-declaration 0 1 0)) + ;; + ;; Check for UTF-7 ';'. + ((and (= b1 #x2B) (= b2 #x41) (= b3 #x44)) + (setf initial-encoding :utf-7)) + ;; + ;; Check for ISO-2022-KR. + ((and (= b1 #x1B) (= b2 #x24) (= b3 #x29) (= b4 #x43)) + (setf initial-encoding :iso-2022-kr) + (decode-declaration 4 1 0)) + ;; + ;; Check for EBCDIC ';;'. + ((and (= b1 #x5e) (= b2 #x5e)) + ;; EBCDIC - TODO read the declaration to determine the code page. + (setf initial-encoding :ebcdic-us)) + (t + ;; Not detected and no declaration, detect UTF-8. + (setf initial-encoding :utf-8-auto))))) + ((= available 3) + (let ((b1 (aref buffer 0)) + (b2 (aref buffer 1)) + (b3 (aref buffer 2))) + (cond ((and (= b1 #xEF) (= b2 #xBB) (= b3 #xBF)) + ;; UTF-8 BOM. + (setf initial-encoding :utf-8)) + (t + (setf initial-encoding :utf-8-auto))))) + ((= available 2) + (let ((b1 (aref buffer 0)) + (b2 (aref buffer 1))) + (cond ((and (= b1 #xff) (= b2 #xfe)) + ;; UTF-16, little-endian + (setf initial-encoding :utf-16le)) + ((and (= b1 #xfe) (= b2 #xff)) + ;; UTF-16, big-endian. + (setf initial-encoding :utf-16be)) + (t + (setf initial-encoding :latin1))))) + (t + ;; Empty file - just use the default. + (setf initial-encoding :default)))))) (cond ((and (not initial-encoding) (not declared-encoding)) - :default) - ((or (and (not declared-encoding) - (eq initial-encoding :utf-8-auto)) - (equalp declared-encoding :utf-8-auto)) - (multiple-value-bind (valid-utf-8 utf-8-use utf-8-bom) - (detect-utf-8 file) - (cond ((and valid-utf-8 (or utf-8-use utf-8-bom)) - :utf-8) - (t - :default)))) - ((or (not declared-encoding) - (member initial-encoding '(:utf-16le :utf-16be - :ucs-4le :ucs-4be))) - ;; Use the detected encoding. - initial-encoding) - (t - declared-encoding)))) - -;;; Based on code by pjb -(defun parse-emacs-variables (line) - (when (search "-*-" line) - (flet ((chunk (text start end) (string-trim " " (subseq text start end)))) - (loop - :with start = (+ 3 (search "-*-" line)) - :with end = (or (search "-*-" line :start2 start) (length line)) - :with result = '() - :for colon = (and (< start end) (position #\: line :start start)) - :while (and colon (< colon end)) - :do (let ((vend - (or (and (< (1+ colon) end) - (position #\; line :start (1+ colon) :end end)) - end))) - (push (intern (string-upcase (chunk line start colon)) "KEYWORD") - result) - (push (chunk line (1+ colon) vend) result) - (setf start (1+ vend))) - :finally (return (nreverse result)))))) -|# + :latin1) + ((or (and (not declared-encoding) + (eq initial-encoding :utf-8-auto)) + (equalp declared-encoding :utf-8-auto)) + (multiple-value-bind (valid-utf-8 nonasciip utf-8-bom) + (detect-utf-8 file) + (cond ((and valid-utf-8 (or nonasciip utf-8-bom)) + :utf-8) + (nonasciip + :latin1) + (t + :ascii)))) + ((or (not declared-encoding) + (member initial-encoding '(:utf-16le :utf-16be + :ucs-4le :ucs-4be))) + ;; Use the detected encoding. + initial-encoding) + (t + declared-encoding))))