/[flexi-streams]/branches/edi/length.lisp
ViewVC logotype

Contents of /branches/edi/length.lisp

Parent Directory Parent Directory | Revision Log Revision Log


Revision 61 - (show annotations)
Sun May 25 23:43:22 2008 UTC (5 years, 10 months ago) by eweitz
File size: 17536 byte(s)
Ready for release
1 ;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: FLEXI-STREAMS; Base: 10 -*-
2 ;;; $Header: /usr/local/cvsrep/flexi-streams/length.lisp,v 1.4 2008/05/25 22:23:58 edi Exp $
3
4 ;;; Copyright (c) 2005-2008, Dr. Edmund Weitz. All rights reserved.
5
6 ;;; Redistribution and use in source and binary forms, with or without
7 ;;; modification, are permitted provided that the following conditions
8 ;;; are met:
9
10 ;;; * Redistributions of source code must retain the above copyright
11 ;;; notice, this list of conditions and the following disclaimer.
12
13 ;;; * Redistributions in binary form must reproduce the above
14 ;;; copyright notice, this list of conditions and the following
15 ;;; disclaimer in the documentation and/or other materials
16 ;;; provided with the distribution.
17
18 ;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED
19 ;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
20 ;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 ;;; ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
22 ;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 ;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
24 ;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25 ;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
26 ;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
27 ;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
28 ;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
30 (in-package :flexi-streams)
31
32 (defgeneric encoding-factor (format)
33 (:documentation "Given an external format FORMAT, returns a factor
34 which denotes the octets to characters ratio to expect when
35 encoding/decoding. If the returned value is an integer, the factor is
36 assumed to be exact. If it is a \(double) float, the factor is
37 supposed to be based on heuristics and usually not exact.
38
39 This factor is used in string.lisp.")
40 (declare #.*standard-optimize-settings*))
41
42 (defmethod encoding-factor ((format flexi-8-bit-format))
43 (declare #.*standard-optimize-settings*)
44 ;; 8-bit encodings map octets to characters in an exact one-to-one
45 ;; fashion
46 1)
47
48 (defmethod encoding-factor ((format flexi-utf-8-format))
49 (declare #.*standard-optimize-settings*)
50 ;; UTF-8 characters can be anything from one to six octets, but we
51 ;; assume that the "overhead" is only about 5 percent - this
52 ;; estimate is obviously very much dependant on the content
53 1.05d0)
54
55 (defmethod encoding-factor ((format flexi-utf-16-format))
56 (declare #.*standard-optimize-settings*)
57 ;; usually one character maps to two octets, but characters with
58 ;; code points above #x10000 map to four octets - we assume that we
59 ;; usually don't see these characters but of course have to return a
60 ;; float
61 2.0d0)
62
63 (defmethod encoding-factor ((format flexi-utf-32-format))
64 (declare #.*standard-optimize-settings*)
65 ;; UTF-32 always matches every character to four octets
66 4)
67
68 (defmethod encoding-factor ((format flexi-crlf-mixin))
69 (declare #.*standard-optimize-settings*)
70 ;; if the sequence #\Return #\Linefeed is the line-end marker, this
71 ;; obviously makes encodings potentially longer and definitely makes
72 ;; the estimate unexact
73 (* 1.02d0 (call-next-method)))
74
75 (defgeneric check-end (format start end i)
76 (declare #.*fixnum-optimize-settings*)
77 (:documentation "Helper function used below to determine if we tried
78 to read past the end of the sequence.")
79 (:method (format start end i)
80 (declare #.*fixnum-optimize-settings*)
81 (declare (fixnum start end i))
82 (when (> i end)
83 (signal-encoding-error format "These ~A octet~:P can't be ~
84 decoded using ~A as the sequence is too short. ~A octet~:P missing ~
85 at then end."
86 (- end start)
87 (external-format-name format)
88 (- i end))))
89 (:method ((format flexi-utf-16-format) start end i)
90 (declare #.*fixnum-optimize-settings*)
91 (declare (fixnum start end i))
92 (declare (ignore i))
93 ;; don't warn twice
94 (when (evenp (- end start))
95 (call-next-method))))
96
97 (defgeneric compute-number-of-chars (format sequence start end)
98 (declare #.*standard-optimize-settings*)
99 (:documentation "Computes the exact number of characters required to
100 decode the sequence of octets in SEQUENCE from START to END using the
101 external format FORMAT."))
102
103 (defmethod compute-number-of-chars :around (format (list list) start end)
104 (declare #.*standard-optimize-settings*)
105 (call-next-method format (coerce list 'vector) start end))
106
107 (defmethod compute-number-of-chars ((format flexi-8-bit-format) sequence start end)
108 (declare #.*fixnum-optimize-settings*)
109 (declare (fixnum start end))
110 (declare (ignore sequence))
111 (- end start))
112
113 (defmethod compute-number-of-chars ((format flexi-crlf-mixin) sequence start end)
114 ;; this method only applies to the 8-bit formats as all other
115 ;; formats with CRLF line endings have their own specialized methods
116 ;; below
117 (declare #.*fixnum-optimize-settings*)
118 (declare (fixnum start end) (vector sequence))
119 (let ((i start)
120 (length (- end start)))
121 (declare (fixnum i length))
122 (loop
123 (when (>= i end)
124 (return))
125 (let ((position (search #.(vector +cr+ +lf+) sequence :start2 i :end2 end :test #'=)))
126 (unless position
127 (return))
128 (setq i (1+ position))
129 (decf length)))
130 length))
131
132 (defmethod compute-number-of-chars ((format flexi-utf-8-format) sequence start end)
133 (declare #.*fixnum-optimize-settings*)
134 (declare (fixnum start end) (vector sequence))
135 (let ((sum 0)
136 (i start))
137 (declare (fixnum i sum))
138 (loop
139 (when (>= i end)
140 (return))
141 (let* ((octet (aref sequence i))
142 ;; note that there are no validity checks here
143 (length (cond ((not (logbitp 7 octet)) 1)
144 ((= #b11000000 (logand* octet #b11100000)) 2)
145 ((= #b11100000 (logand* octet #b11110000)) 3)
146 (t 4))))
147 (declare (fixnum length) (type octet octet))
148 (incf sum)
149 (incf i length)))
150 (check-end format start end i)
151 sum))
152
153 (defmethod compute-number-of-chars ((format flexi-crlf-utf-8-format) sequence start end)
154 (declare #.*fixnum-optimize-settings*)
155 (declare (fixnum start end) (vector sequence))
156 (let ((sum 0)
157 (i start)
158 (last-octet 0))
159 (declare (fixnum i sum) (type octet last-octet))
160 (loop
161 (when (>= i end)
162 (return))
163 (let* ((octet (aref sequence i))
164 ;; note that there are no validity checks here
165 (length (cond ((not (logbitp 7 octet)) 1)
166 ((= #b11000000 (logand* octet #b11100000)) 2)
167 ((= #b11100000 (logand* octet #b11110000)) 3)
168 (t 4))))
169 (declare (fixnum length) (type octet octet))
170 (unless (and (= octet +lf+) (= last-octet +cr+))
171 (incf sum))
172 (incf i length)
173 (setq last-octet octet)))
174 (check-end format start end i)
175 sum))
176
177 (defmethod compute-number-of-chars :before ((format flexi-utf-16-format) sequence start end)
178 (declare #.*fixnum-optimize-settings*)
179 (declare (fixnum start end) (vector sequence))
180 (declare (ignore sequence))
181 (when (oddp (- end start))
182 (signal-encoding-error format "~A octet~:P cannot be decoded ~
183 using UTF-16 as ~:*~A is not even."
184 (- end start))))
185
186 (defmethod compute-number-of-chars ((format flexi-utf-16-le-format) sequence start end)
187 (declare #.*fixnum-optimize-settings*)
188 (declare (fixnum start end))
189 (let ((sum 0)
190 (i start))
191 (declare (fixnum i sum))
192 (decf end 2)
193 (loop
194 (when (> i end)
195 (return))
196 (let* ((high-octet (aref sequence (1+ i)))
197 (length (cond ((<= #xd8 high-octet #xdf) 4)
198 (t 2))))
199 (declare (fixnum length) (type octet high-octet))
200 (incf sum)
201 (incf i length)))
202 (check-end format start (+ end 2) i)
203 sum))
204
205 (defmethod compute-number-of-chars ((format flexi-utf-16-be-format) sequence start end)
206 (declare #.*fixnum-optimize-settings*)
207 (declare (fixnum start end) (vector sequence))
208 (let ((sum 0)
209 (i start))
210 (declare (fixnum i sum))
211 (decf end 2)
212 (loop
213 (when (> i end)
214 (return))
215 (let* ((high-octet (aref sequence i))
216 (length (cond ((<= #xd8 high-octet #xdf) 4)
217 (t 2))))
218 (declare (fixnum length) (type octet high-octet))
219 (incf sum)
220 (incf i length)))
221 (check-end format start (+ end 2) i)
222 sum))
223
224 (defmethod compute-number-of-chars ((format flexi-crlf-utf-16-le-format) sequence start end)
225 (declare #.*fixnum-optimize-settings*)
226 (declare (fixnum start end) (vector sequence))
227 (let ((sum 0)
228 (i start)
229 (last-octet 0))
230 (declare (fixnum i sum) (type octet last-octet))
231 (decf end 2)
232 (loop
233 (when (> i end)
234 (return))
235 (let* ((high-octet (aref sequence (1+ i)))
236 (length (cond ((<= #xd8 high-octet #xdf) 4)
237 (t 2))))
238 (declare (fixnum length) (type octet high-octet))
239 (unless (and (zerop high-octet)
240 (= (the octet (aref sequence i)) +lf+)
241 (= last-octet +cr+))
242 (incf sum))
243 (setq last-octet (if (zerop high-octet)
244 (aref sequence i)
245 0))
246 (incf i length)))
247 (check-end format start (+ end 2) i)
248 sum))
249
250 (defmethod compute-number-of-chars ((format flexi-crlf-utf-16-be-format) sequence start end)
251 (declare #.*fixnum-optimize-settings*)
252 (declare (fixnum start end) (vector sequence))
253 (let ((sum 0)
254 (i start)
255 (last-octet 0))
256 (declare (fixnum i sum) (type octet last-octet))
257 (decf end 2)
258 (loop
259 (when (> i end)
260 (return))
261 (let* ((high-octet (aref sequence i))
262 (length (cond ((<= #xd8 high-octet #xdf) 4)
263 (t 2))))
264 (declare (fixnum length) (type octet high-octet))
265 (unless (and (zerop high-octet)
266 (= (the octet (aref sequence (1+ i))) +lf+)
267 (= last-octet +cr+))
268 (incf sum))
269 (setq last-octet (if (zerop high-octet)
270 (aref sequence (1+ i))
271 0))
272 (incf i length)))
273 (check-end format start (+ end 2) i)
274 sum))
275
276 (defmethod compute-number-of-chars :before ((format flexi-utf-32-format) sequence start end)
277 (declare #.*fixnum-optimize-settings*)
278 (declare (fixnum start end))
279 (declare (ignore sequence))
280 (let ((length (- end start)))
281 (when (plusp (mod length 4))
282 (signal-encoding-error format "~A octet~:P cannot be decoded ~
283 using UTF-32 as ~:*~A is not a multiple-value of four."
284 length))))
285
286 (defmethod compute-number-of-chars ((format flexi-utf-32-format) sequence start end)
287 (declare #.*fixnum-optimize-settings*)
288 (declare (fixnum start end))
289 (declare (ignore sequence))
290 (ceiling (- end start) 4))
291
292 (defmethod compute-number-of-chars ((format flexi-crlf-utf-32-le-format) sequence start end)
293 (declare #.*fixnum-optimize-settings*)
294 (declare (fixnum start end) (vector sequence))
295 (let ((i start)
296 (length (ceiling (- end start) 4)))
297 (decf end 8)
298 (loop
299 (when (> i end)
300 (return))
301 (cond ((loop for j of-type fixnum from i
302 for octet across #.(vector +cr+ 0 0 0 +lf+ 0 0 0)
303 always (= octet (aref sequence j)))
304 (decf length)
305 (incf i 8))
306 (t (incf i 4))))
307 length))
308
309 (defmethod compute-number-of-chars ((format flexi-crlf-utf-32-be-format) sequence start end)
310 (declare #.*fixnum-optimize-settings*)
311 (declare (fixnum start end) (vector sequence))
312 (let ((i start)
313 (length (ceiling (- end start) 4)))
314 (decf end 8)
315 (loop
316 (when (> i end)
317 (return))
318 (cond ((loop for j of-type fixnum from i
319 for octet across #.(vector 0 0 0 +cr+ 0 0 0 +lf+)
320 always (= octet (aref sequence j)))
321 (decf length)
322 (incf i 8))
323 (t (incf i 4))))
324 length))
325
326 (defgeneric compute-number-of-octets (format sequence start end)
327 (declare #.*standard-optimize-settings*)
328 (:documentation "Computes the exact number of octets required to
329 encode the sequence of characters in SEQUENCE from START to END using
330 the external format FORMAT."))
331
332 (defmethod compute-number-of-octets :around (format (list list) start end)
333 (declare #.*standard-optimize-settings*)
334 (call-next-method format (coerce list 'string*) start end))
335
336 (defmethod compute-number-of-octets ((format flexi-8-bit-format) string start end)
337 (declare #.*fixnum-optimize-settings*)
338 (declare (fixnum start end))
339 (declare (ignore string))
340 (- end start))
341
342 (defmethod compute-number-of-octets ((format flexi-utf-8-format) string start end)
343 (declare #.*fixnum-optimize-settings*)
344 (declare (fixnum start end) (string string))
345 (let ((sum 0)
346 (i start))
347 (declare (fixnum i sum))
348 (loop
349 (when (>= i end)
350 (return))
351 (let* ((char-code (char-code (char string i)))
352 (char-length (cond ((< char-code #x80) 1)
353 ((< char-code #x800) 2)
354 ((< char-code #x10000) 3)
355 (t 4))))
356 (declare (fixnum char-length) (type char-code-integer char-code))
357 (incf sum char-length)
358 (incf i)))
359 sum))
360
361 (defmethod compute-number-of-octets ((format flexi-crlf-utf-8-format) string start end)
362 (declare #.*fixnum-optimize-settings*)
363 (declare (fixnum start end) (string string))
364 (let ((sum 0)
365 (i start))
366 (declare (fixnum i sum))
367 (loop
368 (when (>= i end)
369 (return))
370 (let* ((char-code (char-code (char string i)))
371 (char-length (cond ((= char-code #.(char-code #\Newline)) 2)
372 ((< char-code #x80) 1)
373 ((< char-code #x800) 2)
374 ((< char-code #x10000) 3)
375 (t 4))))
376 (declare (fixnum char-length) (type char-code-integer char-code))
377 (incf sum char-length)
378 (incf i)))
379 sum))
380
381 (defmethod compute-number-of-octets ((format flexi-utf-16-format) string start end)
382 (declare #.*fixnum-optimize-settings*)
383 (declare (fixnum start end) (string string))
384 (let ((sum 0)
385 (i start))
386 (declare (fixnum i sum))
387 (loop
388 (when (>= i end)
389 (return))
390 (let* ((char-code (char-code (char string i)))
391 (char-length (cond ((< char-code #x10000) 2)
392 (t 4))))
393 (declare (fixnum char-length) (type char-code-integer char-code))
394 (incf sum char-length)
395 (incf i)))
396 sum))
397
398 (defmethod compute-number-of-octets ((format flexi-crlf-utf-16-le-format) string start end)
399 (declare #.*fixnum-optimize-settings*)
400 (declare (fixnum start end) (string string))
401 (let ((sum 0)
402 (i start))
403 (declare (fixnum i sum))
404 (loop
405 (when (>= i end)
406 (return))
407 (let* ((char-code (char-code (char string i)))
408 (char-length (cond ((= char-code #.(char-code #\Newline)) 4)
409 ((< char-code #x10000) 2)
410 (t 4))))
411 (declare (fixnum char-length) (type char-code-integer char-code))
412 (incf sum char-length)
413 (incf i)))
414 sum))
415
416 (defmethod compute-number-of-octets ((format flexi-crlf-utf-16-be-format) string start end)
417 (declare #.*fixnum-optimize-settings*)
418 (declare (fixnum start end) (string string))
419 (let ((sum 0)
420 (i start))
421 (declare (fixnum i sum))
422 (loop
423 (when (>= i end)
424 (return))
425 (let* ((char-code (char-code (char string i)))
426 (char-length (cond ((= char-code #.(char-code #\Newline)) 4)
427 ((< char-code #x10000) 2)
428 (t 4))))
429 (declare (fixnum char-length) (type char-code-integer char-code))
430 (incf sum char-length)
431 (incf i)))
432 sum))
433
434 (defmethod compute-number-of-octets ((format flexi-utf-32-format) string start end)
435 (declare #.*fixnum-optimize-settings*)
436 (declare (fixnum start end))
437 (declare (ignore string))
438 (* 4 (- end start)))
439
440 (defmethod compute-number-of-octets ((format flexi-crlf-mixin) string start end)
441 (declare #.*fixnum-optimize-settings*)
442 (declare (fixnum start end) (string string))
443 (+ (call-next-method)
444 (* (case (external-format-name format)
445 (:utf-32 4)
446 (otherwise 1))
447 (count #\Newline string :start start :end end :test #'char=))))
448
449 (defgeneric character-length (format char)
450 (declare #.*fixnum-optimize-settings*)
451 (:documentation "Returns the number of octets needed to encode the
452 single character CHAR.")
453 (:method (format char)
454 (compute-number-of-octets format (string char) 0 1)))
455
456 (defmethod character-length :around ((format flexi-crlf-mixin) (char (eql #\Newline)))
457 (declare #.*fixnum-optimize-settings*)
458 (+ (call-next-method format +cr+)
459 (call-next-method format +lf+)))
460
461 (defmethod character-length ((format flexi-8-bit-format) char)
462 (declare #.*fixnum-optimize-settings*)
463 (declare (ignore char))
464 1)
465
466 (defmethod character-length ((format flexi-utf-32-format) char)
467 (declare #.*fixnum-optimize-settings*)
468 (declare (ignore char))
469 4)

Properties

Name Value
svn:executable *

  ViewVC Help
Powered by ViewVC 1.1.5