check for overlong byte sequences
Thu Sep 16 10:53:48 PDT 2010 marijnh@gmail.com
* check for overlong byte sequences
diff -rN -u old-trivial-utf-8/trivial-utf-8.lisp new-trivial-utf-8/trivial-utf-8.lisp
--- old-trivial-utf-8/trivial-utf-8.lisp 2014-07-30 00:02:03.000000000 -0700
+++ new-trivial-utf-8/trivial-utf-8.lisp 2014-07-30 00:02:03.000000000 -0700
@@ -128,25 +128,34 @@
(declare (type (simple-array (unsigned-byte 8) (*)) bytes)
(type fixnum group-size start)
#.*optimize*)
- (labels ((next-byte ()
- (prog1 (elt bytes start)
- (incf start)))
- (six-bits (byte)
- (unless (= (logand byte #b11000000) #b10000000)
- (error 'utf-8-decoding-error :byte byte
- :message "Invalid byte 0x~X inside a character."))
- (ldb (byte 6 0) byte)))
+ (macrolet ((next-byte ()
+ '(prog1 (elt bytes start)
+ (incf start)))
+ (six-bits (byte)
+ (let ((b (gensym)))
+ `(let ((,b ,byte))
+ (unless (= (logand ,b #b11000000) #b10000000)
+ (error 'utf-8-decoding-error :byte ,b
+ :message "Invalid byte 0x~X inside a character."))
+ (ldb (byte 6 0) ,b))))
+             (test-overlong (byte min-size) ; signal an error when the decoded value BYTE is below MIN-SIZE
+               (let ((b (gensym)))          ; fresh symbol so the BYTE form is evaluated exactly once
+                 `(let ((,b ,byte))
+                    (unless (>= ,b ,min-size) ; >= (not >): a value equal to MIN-SIZE (128, 2048, 65536) is valid
+                      (error 'utf-8-decoding-error :byte ,b
+                             :message "Overlong byte sequence found."))
+                    ,b))))
(case group-size
(1 (next-byte))
- (2 (logior (ash (ldb (byte 5 0) (next-byte)) 6)
- (six-bits (next-byte))))
- (3 (logior (ash (ldb (byte 4 0) (next-byte)) 12)
- (ash (six-bits (next-byte)) 6)
- (six-bits (next-byte))))
- (4 (logior (ash (ldb (byte 3 0) (next-byte)) 18)
- (ash (six-bits (next-byte)) 12)
- (ash (six-bits (next-byte)) 6)
- (six-bits (next-byte)))))))
+ (2 (test-overlong (logior (ash (ldb (byte 5 0) (next-byte)) 6)
+ (six-bits (next-byte))) 128))
+ (3 (test-overlong (logior (ash (ldb (byte 4 0) (next-byte)) 12)
+ (ash (six-bits (next-byte)) 6)
+ (six-bits (next-byte))) 2048))
+ (4 (test-overlong (logior (ash (ldb (byte 3 0) (next-byte)) 18)
+ (ash (six-bits (next-byte)) 12)
+ (ash (six-bits (next-byte)) 6)
+ (six-bits (next-byte))) 65536)))))
(defun utf-8-bytes-to-string (bytes-in &key (start 0) (end (length bytes-in)))
"Convert a byte array containing utf-8 encoded characters into