/[cmucl]/src/code/extfmts.lisp
ViewVC logotype

Contents of /src/code/extfmts.lisp

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.20 - (show annotations)
Sun Oct 18 14:21:23 2009 UTC (4 years, 6 months ago) by rtoy
Branch: MAIN
CVS Tags: amd64-dd-start, intl-2-branch-base, snapshot-2010-01, snapshot-2010-02, snapshot-2009-11, snapshot-2009-12, intl-branch-base
Branch point for: amd64-dd-branch, intl-branch, intl-2-branch
Changes since 1.19: +52 -38 lines
Merge changes from unicode-string-buffer-impl-branch which gives
faster reads on external-formats.  This is done by adding an
additional buffer to streams so we can convert the entire in-buffer
into characters all at once.

To build this change, you need to do a cross-compile using
boot-2009-10-1-cross.lisp.  Using that build, do a normal build with
these sources.

For a non-unicode build use boot-2009-10-01.lisp with a 20a
non-unicode build.

code/extfmts.lisp:
o Add another slot to the extfmts for copying the state.
o Modify EF-OCTETS-TO-STRING and OCTETS-TO-STRING to support the
  necesssary changes for fast formats.  This is incompatible with the
  previous version because the string is not grown if needed.

code/fd-stream-extfmt.lisp:
o Set *enable-stream-buffer-p* to T so we have fast streams.

code/fd-stream.lisp:
o Add new slots to support fast strams.
o In SET-ROUTINES, initialize the new slots appropriately.
o Update UNREAD-CHAR to be able to back up in the string buffer to
  unread.
o Add implementation to copy the state of an external format.

code/stream.lisp:
o Change %SET-FD-STREAM-EXTERNAL-FORMAT to be able to change formats
  even if we've already converted the buffer with a different format.
  We reconvert the buffer with the old format until we reach the
  current character.  Then the remaining octets are converted using
  the new format and stored in the string buffer.
o Add FAST-READ-CHAR-STRING-REFILL to refill the string buffer, like
  FAST-READ-CHAR-REFILL does for the octet in-buffer.

code/struct.lisp:
o Add new slots to hold the string buffer, the current index, and
  length.  These are needed for the fast formats.

code/sysmacs.lisp:
o Update PREPARE-FOR-FAST-READ-CHAR, DONE-WITH-FAST-READ-CHAR, and
  FAST-READ-CHAR to support the string buffer.

code/string.lisp:
o Microoptimization of SURROGATEP to reduce the number of branchs.

general-info/release-20b.txt:
o Update with these changes

pcl/simple-streams/external-formats/utf-16-be.lisp:
pcl/simple-streams/external-formats/utf-16-le.lisp:
pcl/simple-streams/external-formats/utf-16.lisp:
o These formats actually have state, so update them to take a handle
  an initial state.  These are needed if the string buffer ends with a
  leading surrogate and the next string buffer starts with a trailing
  surrogate.  The conversion needs to combine the surrogates together.
1 ;;; -*- Mode: LISP; Syntax: ANSI-Common-Lisp; Package: STREAM -*-
2 ;;;
3 ;;; **********************************************************************
4 ;;; This code was written by Paul Foley and has been placed in the public
5 ;;; domain.
6 ;;;
7 (ext:file-comment
8 "$Header: /tiger/var/lib/cvsroots/cmucl/src/code/extfmts.lisp,v 1.20 2009/10/18 14:21:23 rtoy Exp $")
9 ;;;
10 ;;; **********************************************************************
11 ;;;
12 ;;; Implementation of external-formats
13
14 (in-package "STREAM")
15
16 (export '(string-to-octets octets-to-string *default-external-format*
17 string-encode string-decode set-system-external-format
18 +replacement-character-code+))
19
20 (defvar *default-external-format* :iso8859-1)
21
22 (defvar *external-formats* (make-hash-table :test 'equal))
23 (defvar *external-format-aliases* (make-hash-table))
24
25 (vm::defenum (:prefix "+EF-" :suffix "+" :start 1)
26 str ; string length
27 cin ; input a character
28 cout ; output a character
29 sin ; input string
30 sout ; output string
31 os ; octets to string
32 so ; string to octets
33 en ; encode
34 de ; decode
35 flush ; flush state
36 copy-state ; copy state
37 max)
38
39 ;; Unicode replacement character U+FFFD
40 (defconstant +replacement-character-code+ #xFFFD)
41
42 (define-condition external-format-not-implemented (error)
43 ()
44 (:report
45 (lambda (condition stream)
46 (declare (ignore condition))
47 (format stream "Attempting unimplemented external-format I/O."))))
48
49 (defun %efni (a b c d)
50 (declare (ignore a b c d))
51 (error 'external-format-not-implemented))
52
53 (defstruct efx
54 ;;
55 ;; Function to read a sequence of octets from a stream and convert
56 ;; them a code point.
57 (octets-to-code #'%efni :type function :read-only t)
58 ;;
59 ;; Function to convert a codepoint to a sequence of octets and write
60 ;; them to an output stream.
61 (code-to-octets #'%efni :type function :read-only t)
62 ;;
63 ;; Function (or NIL) to force any state in the external format to be
64 ;; flushed to the output stream. A NIL value means the external
65 ;; format does not need to do anything special.
66 (flush-state nil :type (or null function) :read-only t)
67 ;;
68 ;; Function to copy the state of the external-format. If NIL, then
69 ;; there is no state to be copied.
70 (copy-state nil :type (or null function) :read-only t)
71 (cache nil :type (or null simple-vector))
72 ;;
73 ;; Minimum number of octets needed to form a codepoint
74 (min 1 :type kernel:index :read-only t)
75 ;;
76 ;; Maximum number of octets needed to form a codepoint.
77 (max 1 :type kernel:index :read-only t))
78
79 (defstruct (external-format
80 (:conc-name ef-)
81 (:print-function %print-external-format)
82 (:constructor make-external-format (name efx composingp
83 &optional slots slotd)))
84 (name (ext:required-argument) :type (or keyword cons) :read-only t)
85 (efx (ext:required-argument) :type efx :read-only t)
86 (composingp (ext:required-argument) :type boolean :read-only t)
87 (slots #() :type simple-vector :read-only t)
88 (slotd nil :type list :read-only t))
89
90 (defun %print-external-format (ef stream depth)
91 (declare (ignore depth))
92 (print-unreadable-object (ef stream :type t :identity t)
93 (princ (ef-name ef) stream)))
94
95 (defun %intern-ef (ef)
96 (setf (gethash (ef-name ef) *external-formats*) ef))
97
98 (declaim (inline ef-octets-to-code ef-code-to-octets ef-flush-state ef-copy-state
99 ef-cache ef-min-octets ef-max-octets))
100
101 (defun ef-octets-to-code (ef)
102 (efx-octets-to-code (ef-efx ef)))
103
104 (defun ef-code-to-octets (ef)
105 (efx-code-to-octets (ef-efx ef)))
106
107 (defun ef-flush-state (ef)
108 (efx-flush-state (ef-efx ef)))
109
110 (defun ef-copy-state (ef)
111 (efx-copy-state (ef-efx ef)))
112
113 (defun ef-cache (ef)
114 (efx-cache (ef-efx ef)))
115
116 (defun ef-min-octets (ef)
117 (efx-min (ef-efx ef)))
118
119 (defun ef-max-octets (ef)
120 (efx-max (ef-efx ef)))
121
122 (eval-when (:compile-toplevel :load-toplevel :execute)
123 (defun %merge-slots (old new)
124 (let* ((pos (length old))
125 (tmp (mapcar (lambda (x)
126 (let* ((name (if (consp x) (first x) x))
127 (init (if (consp x) (second x) nil))
128 (list (if (consp x) (nthcdr 2 x) nil))
129 (prev (assoc name old))
130 (posn (if prev (second prev) (1- (incf pos)))))
131 (list name posn init (getf list :type t))))
132 new)))
133 (delete-duplicates (stable-sort (append old tmp) #'< :key #'second)
134 :key #'second))))
135
136 ;;; DEFINE-EXTERNAL-FORMAT -- Public
137 ;;;
138 ;;; name (&key min max size) (&rest slots) octets-to-code code-to-octets
139 ;;; flush-state copy-state
140 ;;;
141 ;;; Define a new external format. Min/Max/Size are the minimum and
142 ;;; maximum number of octets that make up a character (:size N is just
143 ;;; shorthand for :min N :max N). Slots is a list of slot descriptions
144 ;;; similar to defstruct.
145 ;;;
146 ;;; name (base) (&rest slots)
147 ;;; Define an external format based on a previously-defined external
148 ;;; format, Base. The slot names used in Slots must match those in Base.
149 ;;;
150 ;;; octets-to-code (state input unput &rest vars)
151 ;;; Defines a form to be used by the external format to convert
152 ;;; octets to a code point. State is a form that can be used by the
153 ;;; body to access the state of the stream. Input is a form that
154 ;;; can be used to read one octet from the input stream. (It can be
155 ;;; called as many times as needed.) Similarly, Unput is a form to
156 ;;; put back one octet to the input stream. Vars is a list of vars
157 ;;; that need to be defined for any symbols used within the form.
158 ;;;
159 ;;; This should return two values: the code and the number of octets
160 ;;; read to form the code.
161 ;;;
162 ;;; code-to-octets (code state output &rest vars)
163 ;;; Defines a form to be used by the external format to convert a
164 ;;; code point to octets for output. Code is the code point to be
165 ;;; converted. State is a form to access the current value of the
166 ;;; stream's state variable. Output is a form that writes one octet
167 ;;; to the output stream.
168 ;;;
169 ;;; flush-state (state output &rest vars)
170 ;;; Defines a form to be used by the external format to flush out
171 ;;; any state when an output stream is closed. Similar to
172 ;;; CODE-TO-OCTETS, but there is no code.
173 ;;;
174 ;;; copy-state (state &rest vars)
175 ;;; Defines a form to copy any state needed by the external format.
176 ;;; This should probably be a deep copy so that if the original
177 ;;; state is modified, the copy is not.
178 ;;;
179 ;;; Note: external-formats work on code-points, not
180 ;;; characters, so that the entire 31 bit ISO-10646 range can be
181 ;;; used internally regardless of the size of a character recognized
182 ;;; by Lisp and external formats can be useful to people who want to
183 ;;; process characters outside the Lisp range (see
184 ;;; CODEPOINT-TO-OCTETS, OCTETS-TO-CODEPOINT)
185 ;;;
186 (defmacro define-external-format (name (&rest args) (&rest slots)
187 &optional octets-to-code code-to-octets
188 flush-state copy-state)
189 (when (and (oddp (length args)) (not (= (length args) 1)))
190 (warn "Nonsensical argument (~S) to DEFINE-EXTERNAL-FORMAT." args))
191 (let* ((tmp (gensym))
192 (min (if (evenp (length args))
193 (or (getf args :min) (getf args :size) 1)
194 1))
195 (max (if (evenp (length args))
196 (or (getf args :max) (getf args :size) 6)
197 6))
198 (base (if (= (length args) 1)
199 (find-external-format (first args))
200 nil))
201 (bslotd (if base (ef-slotd base) nil))
202 (slotd (%merge-slots bslotd slots))
203 (slotb (loop for slot in slotd
204 collect `(,(first slot)
205 `(the ,',(fourth slot)
206 ;; IDENTITY is here to protect against SETF
207 (identity (svref %slots% ,',(second slot))))))))
208 `(macrolet ((octets-to-code ((state input unput &rest vars) body)
209 `(lambda (,state ,input ,unput)
210 (declare (ignorable ,state ,input ,unput)
211 (optimize (ext:inhibit-warnings 3)))
212 (let (,@',slotb
213 (,input `(the (or (unsigned-byte 8) null) ,,input))
214 ,@(loop for var in vars collect `(,var (gensym))))
215 ,body)))
216 (code-to-octets ((code state output &rest vars) body)
217 `(lambda (,',tmp ,state ,output)
218 (declare (ignorable ,state ,output)
219 (optimize (ext:inhibit-warnings 3)))
220 (let (,@',slotb
221 (,code ',code)
222 ,@(loop for var in vars collect `(,var (gensym))))
223 `(let ((,',code (the lisp:codepoint ,,',tmp)))
224 (declare (ignorable ,',code))
225 ,,body))))
226 (flush-state ((state output &rest vars) body)
227 `(lambda (,state ,output)
228 (declare (ignorable ,state ,output))
229 (let (,@',slotb
230 ,@(loop for var in vars collect `(,var (gensym))))
231 ,body)))
232 (copy-state ((state &rest vars) body)
233 `(lambda (,state)
234 (declare (ignorable ,state))
235 (let (,@',slotb
236 ,@(loop for var in vars collect `(,var (gensym))))
237 ,body))))
238 (%intern-ef (make-external-format ,name
239 ,(if base
240 `(ef-efx (find-external-format ,(ef-name base)))
241 `(make-efx :octets-to-code ,octets-to-code
242 :code-to-octets ,code-to-octets
243 :flush-state ,flush-state
244 :copy-state ,copy-state
245 :cache (make-array +ef-max+
246 :initial-element nil)
247 :min ,(min min max) :max ,(max min max)))
248 nil
249 (let* ,(loop for x in slotd
250 collect (list (first x) (third x)))
251 (vector ,@(mapcar #'first slotd)))
252 ',slotd)))))
253
254 ;;; DEFINE-COMPOSING-EXTERNAL-FORMAT -- Public
255 ;;;
256 ;;; A composing-external-format differs from an (ordinary) external-format
257 ;;; in that it translates characters (really codepoints, of course) into
258 ;;; other characters, rather than translating between characters and binary
259 ;;; octets. They have to be composed with a non-composing external-format
260 ;;; to be of any use.
261 ;;;
262 ;;;
263 ;;; name (&key min max size) input output
264 ;;; Defines a new composing external format. The parameters Min,
265 ;;; Max, and Size are the same as for defining an external format.
266 ;;; The parameters input and output are forms to handle input and
267 ;;; output.
268 ;;;
269 ;;; input (state input unput &rest vars)
270 ;;; Defines a form to be used by the composing external format when
271 ;;; reading input to transform a codepoint (or sequence of
272 ;;; codepoints) to another. State is a form that can be used by the
273 ;;; body to access the state of the external format. Input is a
274 ;;; form that can be used to read one code point from the input
275 ;;; stream. (Input returns two values, the codepoint and the number
276 ;;; of octets read.) It may be called as many times as needed.
277 ;;; This returns two values: the codepoint of the character (or NIL)
278 ;;; and the number of octets read. Similarly, Unput is a form to
279 ;;; put back one octet to the input stream. Vars is a list of vars
280 ;;; that need to be defined for any symbols used within the form.
281 ;;;
282 ;;; This should return two values: the code and the number of octets
283 ;;; read to form the code.
284 ;;;
285 ;;; output (code state output &rest vars)
286 ;;; Defines a form to be used by the composing external format to
287 ;;; convert a code point to octets for output. Code is the code
288 ;;; point to be converted. State is a form to access the current
289 ;;; value of the stream's state variable. Output is a form that
290 ;;; writes one octet to the output stream.
291
292 (defmacro define-composing-external-format (name (&key min max size)
293 input output)
294 (let ((tmp (gensym))
295 (min (or min size 1))
296 (max (or max size 1)))
297 `(macrolet ((input ((state input unput &rest vars) body)
298 `(lambda (,state ,input ,unput)
299 (declare (ignorable ,state ,input ,unput)
300 (optimize (ext:inhibit-warnings 3)))
301 (let ((,input `(the (values (or lisp:codepoint null)
302 kernel:index)
303 ,,input))
304 ,@(loop for var in vars collect `(,var (gensym))))
305 ,body)))
306 (output ((code state output &rest vars) body)
307 `(lambda (,',tmp ,state ,output)
308 (declare (ignorable ,state ,output)
309 (optimize (ext:inhibit-warnings 3)))
310 (let ((,code ',code)
311 ,@(loop for var in vars collect `(,var (gensym))))
312 `(let ((,',code (the lisp:codepoint ,,',tmp)))
313 (declare (ignorable ,',code))
314 ,,body)))))
315 (%intern-ef (make-external-format ,name
316 (make-efx :octets-to-code ,input
317 :code-to-octets ,output
318 :min ,(min min max) :max ,(max min max))
319 t
320 #() '())))))
321
322 (defun load-external-format-aliases ()
323 (let ((*package* (find-package "KEYWORD"))
324 (unix::*filename-encoding* :iso8859-1))
325 (with-open-file (stm "ext-formats:aliases" :if-does-not-exist nil
326 :external-format :iso8859-1)
327 (when stm
328 (do ((alias (read stm nil stm) (read stm nil stm))
329 (value (read stm nil stm) (read stm nil stm)))
330 ((or (eq alias stm) (eq value stm))
331 (unless (eq alias stm)
332 (warn "External-format aliases file ends early.")))
333 (if (and (keywordp alias) (or (keywordp value)
334 (and (consp value)
335 (every #'keywordp value))))
336 (setf (gethash alias *external-format-aliases*) value)
337 (warn "Bad entry in external-format aliases file: ~S => ~S."
338 alias value)))))))
339
340 (defun %find-external-format (name)
341 ;; avoid loading files, etc., early in the boot sequence
342 (when (or (eq name :iso8859-1)
343 (and (eq name :default) (eq *default-external-format* :iso8859-1)))
344 (return-from %find-external-format
345 (gethash :iso8859-1 *external-formats*)))
346
347 (when (zerop (hash-table-count *external-format-aliases*))
348 (setf (gethash :latin1 *external-format-aliases*) :iso8859-1)
349 (setf (gethash :latin-1 *external-format-aliases*) :iso8859-1)
350 (setf (gethash :iso-8859-1 *external-format-aliases*) :iso8859-1)
351 (load-external-format-aliases))
352
353 (do ((tmp (gethash name *external-format-aliases*)
354 (gethash tmp *external-format-aliases*))
355 (cnt 0 (1+ cnt)))
356 ((or (null tmp) (= cnt 50))
357 (unless (null tmp)
358 (error "External-format aliasing depth exceeded.")))
359 (setq name tmp))
360
361 (or (gethash name *external-formats*)
362 (and (consp name) (find-external-format name))
363 (and (let ((*package* (find-package "STREAM"))
364 (lisp::*enable-package-locked-errors* nil)
365 (s (open (format nil "ext-formats:~(~A~).lisp" name)
366 :if-does-not-exist nil :external-format :iso8859-1)))
367 (when s
368 (null (nth-value 1 (ext:compile-from-stream s)))))
369 (gethash name *external-formats*))))
370
371 (defun %composed-ef-name (a b)
372 (if (consp a) (append a (list b)) (list a b)))
373
374 (defun %compose-external-formats (a b)
375 (when (ef-composingp a)
376 (error "~S is a Composing-External-Format." (ef-name a)))
377 (unless (ef-composingp b)
378 (error "~S is not a Composing-External-Format." (ef-name b)))
379 (make-external-format
380 (%composed-ef-name (ef-name a) (ef-name b))
381 (make-efx
382 :octets-to-code (lambda (state input unput)
383 (let ((nstate (gensym "STATE-")))
384 `(let ((,nstate ,state))
385 (when (null ,nstate)
386 (setq ,nstate (setf ,state (cons nil nil))))
387 ,(funcall (ef-octets-to-code b) `(car ,nstate)
388 (funcall (ef-octets-to-code a)
389 `(cdr ,nstate) input unput)
390 unput))))
391 :code-to-octets (lambda (code state output)
392 (let ((nstate (gensym "STATE-")))
393 `(let ((,nstate ,state))
394 (when (null ,nstate)
395 (setq ,nstate (setf ,state (cons nil nil))))
396 ,(funcall (ef-code-to-octets b) code `(car ,nstate)
397 `(lambda (x)
398 ,(funcall (ef-code-to-octets a)
399 'x `(cdr ,nstate) output))))))
400 :cache (make-array +ef-max+ :initial-element nil)
401 :min (* (ef-min-octets a) (ef-min-octets b))
402 :max (* (ef-max-octets a) (ef-max-octets b)))
403 nil #() '()))
404
405 (defun find-external-format (name &optional (error-p t))
406 (when (external-format-p name)
407 (return-from find-external-format name))
408
409 (or (if (consp name) (every #'keywordp name) (keywordp name))
410 (error "~S is not a valid external format name." name))
411
412 (when (eq name :default)
413 (setq name *default-external-format*))
414
415 (when (and (consp name) (not (cdr name)))
416 (setq name (car name)))
417
418 (flet ((not-found ()
419 (when (equal *default-external-format* name)
420 (setq *default-external-format* :iso8859-1))
421 (if error-p (error "External format ~S not found." name) nil)))
422 (if (consp name)
423 (let ((efs (mapcar #'%find-external-format name)))
424 (if (member nil efs)
425 (not-found)
426 (let ((name (reduce #'%composed-ef-name (mapcar #'ef-name efs))))
427 (or (gethash name *external-formats*)
428 (%intern-ef (reduce #'%compose-external-formats efs))))))
429 (or (%find-external-format name) (not-found)))))
430
431 (defun flush-external-formats ()
432 (maphash (lambda (name ef)
433 (declare (ignore name))
434 (fill (ef-cache ef) nil))
435 *external-formats*))
436
437 (defvar *.table-inverse.* (make-hash-table :test 'eq :size 7))
438
439 (defun invert-table (table)
440 (declare (type (or (simple-array (unsigned-byte 31) *)
441 (simple-array (unsigned-byte 16) *))
442 table)
443 (optimize (speed 3) (space 0) (safety 0) (debug 0)
444 (ext:inhibit-warnings 3)))
445 (or (gethash table *.table-inverse.*)
446 (let* ((mbits (if (= (array-total-size table) 128) 7 8))
447 (lbits (cond ((> (array-total-size table) 256) 3)
448 ((< (array-total-size table) 100) 6)
449 (t 5)))
450 (hvec (make-array (1+ (ash #x110000 (- 0 mbits lbits)))
451 :element-type '(unsigned-byte 16)
452 :initial-element #xFFFF))
453 (mvec (make-array 0 :element-type '(unsigned-byte 16)))
454 (lvec (make-array 0 :element-type '(unsigned-byte 16)))
455 (width (array-dimension table 0))
456 (power (1- (array-rank table)))
457 (base (if (= width 94) 1 0))
458 hx mx lx)
459 (assert (and (< power 2) (<= width 256)))
460 (dotimes (i (array-total-size table))
461 (declare (type (integer 0 (#.array-dimension-limit)) i))
462 (let ((tmp i) (val (row-major-aref table i)) (z 0))
463 (declare (type (integer 0 (#.array-dimension-limit)) tmp)
464 (type (unsigned-byte 16) z))
465 (unless (= val #xFFFE)
466 (when (plusp power)
467 (multiple-value-bind (x y) (floor tmp width)
468 (setq tmp x)
469 (setq z (logior z (ash (the (integer 0 255) (+ y base))
470 (the (integer 0 24)
471 (* 8 power)))))))
472 (setq hx (ash val (- 0 mbits lbits)))
473 (when (= (aref hvec hx) #xFFFF)
474 (setf (aref hvec hx) (length mvec))
475 (let ((tmp (make-array (+ (length mvec) (ash 1 mbits))
476 :element-type '(unsigned-byte 16)
477 :initial-element #xFFFF)))
478 (replace tmp mvec)
479 (setq mvec tmp)))
480 (setq mx (logand (ash val (- lbits)) (lognot (ash -1 mbits))))
481 (when (= (aref mvec (+ (aref hvec hx) mx)) #xFFFF)
482 (setf (aref mvec (+ (aref hvec hx) mx)) (length lvec))
483 (let ((tmp (make-array (+ (length lvec) (ash 1 lbits))
484 :element-type '(unsigned-byte 16)
485 :initial-element #xFFFF)))
486 (replace tmp lvec)
487 (setq lvec tmp)))
488 (setq lx (logand val (lognot (ash -1 lbits))))
489 (setf (aref lvec (+ (aref mvec (+ (aref hvec hx) mx)) lx))
490 (logior z (+ tmp base))))))
491 (setf (gethash table *.table-inverse.*)
492 (lisp::make-ntrie16 :split (logior (ash (1- mbits) 4) (1- lbits))
493 :hvec hvec :mvec mvec :lvec lvec)))))
494
495 (declaim (inline get-inverse))
496 (defun get-inverse (ntrie code)
497 (declare (type lisp::ntrie16 ntrie) (type (integer 0 #x10FFFF) code))
498 (let ((n (lisp::qref ntrie code)))
499 (and n (let ((m (aref (lisp::ntrie16-lvec ntrie) n)))
500 (if (= m #xFFFF) nil m)))))
501
502
503 (define-condition void-external-format (error)
504 ()
505 (:report
506 (lambda (condition stream)
507 (declare (ignore condition))
508 (format stream "Attempting I/O through void external-format."))))
509
510 (define-external-format :void (:size 0) ()
511 (octets-to-code (state input unput)
512 `(error 'void-external-format))
513 (code-to-octets (code state output)
514 `(error 'void-external-format)))
515
516 (define-external-format :iso8859-1 (:size 1) ()
517 (octets-to-code (state input unput)
518 `(values ,input 1))
519 (code-to-octets (code state output)
520 `(,output (if (> ,code 255) #x3F ,code))))
521
522 ;;; OCTETS-TO-CODEPOINT, CODEPOINT-TO-OCTETS -- Semi-Public
523 ;;;
524 ;;; Normally you'd want to use OCTETS-TO-CHAR and CHAR-TO-OCTETS instead of
525 ;;; these, but that limits you to Lisp's idea of a character - either Latin-1
526 ;;; in 8 bit Lisp images, or the Unicode BMP in 16 bit images. If you want
527 ;;; to read or write texts containing characters not supported by your Lisp,
528 ;;; these macros can be used instead.
529 (defmacro octets-to-codepoint (external-format state count input unput)
530 (let ((tmp1 (gensym)) (tmp2 (gensym))
531 (ef (find-external-format external-format)))
532 `(multiple-value-bind (,tmp1 ,tmp2)
533 ,(funcall (ef-octets-to-code ef) state input unput)
534 (setf ,count (the kernel:index ,tmp2))
535 (the (or lisp:codepoint null) ,tmp1))))
536
537 (defmacro codepoint-to-octets (external-format code state output)
538 (let ((ef (find-external-format external-format)))
539 (funcall (ef-code-to-octets ef) code state output)))
540
541
542
543 (defvar *ef-base* +ef-max+)
544 (defvar *ef-extensions* '())
545
546 (defun ensure-cache (ef id reqd)
547 (let ((base (or (getf *ef-extensions* id)
548 (setf (getf *ef-extensions* id)
549 (prog1 *ef-base* (incf *ef-base* reqd))))))
550 (when (< (length (ef-cache ef)) (+ base reqd))
551 (setf (efx-cache (ef-efx ef))
552 (adjust-array (ef-cache ef) (+ base reqd) :initial-element nil)))
553 base))
554
555 ;;; DEF-EF-MACRO -- Public
556 ;;;
557 ;;;
558 (defmacro def-ef-macro (name (ef id reqd idx) body)
559 (let* ((tmp1 (gensym))
560 (tmp2 (gensym))
561 (blknm (nth-value 1 (lisp::valid-function-name-p name)))
562 (%name (intern (format nil "%~A" name) #|(symbol-package blknm)|#)))
563 `(progn
564 (defun ,%name (,ef)
565 (let* ((,tmp1 (find-external-format ,ef))
566 (,tmp2 ,(if (eq id 'lisp::lisp)
567 idx
568 `(+ (ensure-cache ,tmp1 ',id ,reqd) ,idx))))
569 (funcall (or (aref (ef-cache ,tmp1) ,tmp2)
570 (setf (aref (ef-cache ,tmp1) ,tmp2)
571 (let ((*compile-print* nil)
572 ;; Set default format when we compile so we
573 ;; can see compiler messages. If we don't,
574 ;; we run into a problem that we might be
575 ;; changing the default format while we're
576 ;; compiling, and we don't know how to output
577 ;; the compiler messages.
578 #|(*default-external-format* :iso8859-1)|#)
579 (compile nil `(lambda (%slots%)
580 (declare (ignorable %slots%))
581 (block ,',blknm
582 ,,body))))))
583 (ef-slots ,tmp1))))
584 (declaim (inline ,name))
585 (defun ,name (,tmp1)
586 (let ((,tmp2 (load-time-value (cons nil nil))))
587 (when (eq ,tmp1 :default)
588 (setq ,tmp1 *default-external-format*))
589 (if (eq ,tmp1 (car ,tmp2))
590 (cdr ,tmp2)
591 (setf (car ,tmp2) ,tmp1
592 (cdr ,tmp2) (,%name ,tmp1))))))))
593
594
595
596 ;;; OCTETS-TO-CHAR, CHAR-TO-OCTETS -- Public
597 ;;;
598 ;;; Read and write one character through an external-format
599 ;;;
600 (defmacro octets-to-char (external-format state count input unput)
601 (let ((nstate (gensym)))
602 `(let ((,nstate ,state))
603 (when (null ,nstate) (setq ,nstate (setf ,state (cons nil nil))))
604 (if (car ,nstate)
605 ;; Return the trailing surrgate. Must set count to 0 to
606 ;; tell the stream code we didn't consume any octets!
607 (prog1 (the character (car ,nstate))
608 (setf (car ,nstate) nil ,count 0))
609 (let ((code (octets-to-codepoint ,external-format
610 (cdr ,nstate) ,count ,input ,unput)))
611 (declare (type lisp:codepoint code))
612 ;;@@ on non-Unicode builds, limit to 8-bit chars
613 ;;@@ if unicode-bootstrap, can't use #\u+fffd
614 (cond ((or (lisp::surrogatep code) (> code #x10FFFF))
615 #-(and unicode (not unicode-bootstrap)) #\?
616 #+(and unicode (not unicode-bootstrap)) #\U+FFFD)
617 #+unicode
618 ((> code #xFFFF)
619 (multiple-value-bind (hi lo) (surrogates code)
620 (setf (car ,nstate) lo)
621 hi))
622 (t (code-char code))))))))
623
624 (defmacro char-to-octets (external-format char state output)
625 (let ((nchar (gensym))
626 (nstate (gensym))
627 (wryte (gensym))
628 (ch (gensym)))
629 `(let ((,nchar ,char)
630 (,nstate ,state))
631 (when (null ,nstate) (setq ,nstate (setf ,state (cons nil nil))))
632 (if (lisp::surrogatep (char-code ,nchar) :high)
633 (setf (car ,nstate) ,nchar)
634 (flet ((,wryte (,ch)
635 (codepoint-to-octets ,external-format ,ch (cdr ,nstate)
636 ,output)))
637 (declare (dynamic-extent #',wryte))
638 (if (car ,nstate)
639 (prog1
640 ;; Invalid surrogate sequences get replaced with
641 ;; the replacement character.
642 (,wryte (if (lisp::surrogatep (char-code ,nchar) :low)
643 (surrogates-to-codepoint (car ,nstate) ,nchar)
644 +replacement-character-code+))
645 (setf (car ,nstate) nil))
646 ;; A lone trailing (low) surrogate gets replaced with
647 ;; the replacement character.
648 (,wryte (if (lisp::surrogatep (char-code ,nchar) :low)
649 +replacement-character-code+
650 (char-code ,nchar)))))))))
651
652 (defmacro flush-state (external-format state output)
653 (let* ((ef (find-external-format external-format))
654 (f (ef-flush-state ef)))
655 (when f
656 (funcall f state output))))
657
658 (defmacro copy-state (external-format state)
659 (let* ((ef (find-external-format external-format))
660 (f (ef-copy-state ef)))
661 (when f
662 (funcall f state))))
663
664 (def-ef-macro ef-string-to-octets (extfmt lisp::lisp +ef-max+ +ef-so+)
665 `(lambda (string start end buffer &aux (ptr 0) (state nil))
666 (declare #|(optimize (speed 3) (safety 0) (space 0) (debug 0))|#
667 (type simple-string string)
668 (type kernel:index start end ptr)
669 (type (simple-array (unsigned-byte 8) (*)) buffer)
670 (ignorable state))
671 (dotimes (i (- end start) (values buffer ptr))
672 (declare (type kernel:index i))
673 (char-to-octets ,extfmt (schar string (+ start i)) state
674 (lambda (b)
675 (when (= ptr (length buffer))
676 (setq buffer (adjust-array buffer (* 2 ptr))))
677 (setf (aref buffer (1- (incf ptr))) b))))))
678
679 (defun string-to-octets (string &key (start 0) end (external-format :default)
680 (buffer nil bufferp))
681 "Convert String to octets using the specified External-format. The
682 string is bounded by Start (defaulting to 0) and End (defaulting to
683 the end of the string. If Buffer is given, the octets are stored
684 there. If not, a new buffer is created."
685 (declare (type string string)
686 (type kernel:index start)
687 (type (or kernel:index null) end)
688 (type (or (simple-array (unsigned-byte 8) (*)) null) buffer))
689 (let* ((buffer (or buffer (make-array (length string)
690 :element-type '(unsigned-byte 8)))))
691 (multiple-value-bind (buffer ptr)
692 (lisp::with-array-data ((string string) (start start) (end end))
693 (funcall (ef-string-to-octets external-format)
694 string start end buffer))
695 (values (if bufferp buffer (lisp::shrink-vector buffer ptr)) ptr))))
696
697 (def-ef-macro ef-octets-to-string (extfmt lisp::lisp +ef-max+ +ef-os+)
698 `(lambda (octets ptr end state string s-start s-end &aux (pos s-start) (count 0) (last-octet 0))
699 (declare (optimize (speed 3) (safety 0) #|(space 0) (debug 0)|#)
700 (type (simple-array (unsigned-byte 8) (*)) octets)
701 (type kernel:index pos end count last-octet s-start s-end)
702 (type (integer -1 (#.array-dimension-limit)) ptr)
703 (type simple-string string)
704 (ignorable state))
705 (catch 'end-of-octets
706 (loop while (< pos s-end)
707 do (setf (schar string pos)
708 (octets-to-char ,extfmt state count
709 (if (>= ptr end)
710 (throw 'end-of-octets nil)
711 (aref octets (incf ptr)))
712 (lambda (n) (decf ptr n))))
713 (incf pos)
714 (incf last-octet count)))
715 (values string pos last-octet state)))
716
717 (defun octets-to-string (octets &key (start 0) end (external-format :default)
718 (string nil stringp)
719 (s-start 0) (s-end nil s-end-p)
720 (state nil))
721 "Octets-to-string converts an array of octets in Octets to a string
722 according to the specified External-format. The array of octets is
723 bounded by Start (defaulting ot 0) and End (defaulting to the end of
724 the array. If String is not given, a new string is created. If
725 String is given, the converted octets are stored in String, starting
726 at S-Start (defaulting to the 0) and ending at S-End (defaulting to
727 the length of String). If the string is not large enough to hold
728 all of characters, then some octets will not be converted. A State
729 may also be specified; this is used as the state of the external
730 format.
731
732 Four values are returned: the string, the number of characters read,
733 the number of octets actually consumed and the new state of the
734 external format."
735 (declare (type (simple-array (unsigned-byte 8) (*)) octets)
736 (type kernel:index start s-start)
737 (type (or kernel:index null) end)
738 (type (or simple-string null) string))
739 (let ((s-end (if s-end-p
740 s-end
741 (if stringp
742 (length string)
743 (length octets)))))
744 (multiple-value-bind (string pos last-octet new-state)
745 (funcall (ef-octets-to-string external-format)
746 octets (1- start) (1- (or end (length octets)))
747 state
748 (or string (make-string (length octets)))
749 s-start s-end)
750 (values (if stringp string (lisp::shrink-vector string pos)) pos last-octet new-state))))
751
752
753
754 (def-ef-macro ef-encode (extfmt lisp::lisp +ef-max+ +ef-en+)
755 `(lambda (string start end result &aux (ptr 0) (state nil))
756 (declare #|(optimize (speed 3) (safety 0) (space 0) (debug 0))|#
757 (type simple-string string)
758 (type kernel:index start end ptr)
759 (type simple-base-string result)
760 (ignorable state))
761 (dotimes (i (- end start) (values result ptr))
762 (declare (type kernel:index i))
763 (char-to-octets ,extfmt (schar string (+ start i)) state
764 (lambda (b)
765 (when (= ptr (length result))
766 (setq result (adjust-array result (* 2 ptr))))
767 (setf (aref result (1- (incf ptr)))
768 (code-char b)))))))
769
770 (defun string-encode (string external-format &optional (start 0) end)
771 "Encode the given String using External-Format and return a new
772 string. The characters of the new string are the octets of the
773 encoded result, with each octet converted to a character via
774 code-char. This is the inverse to String-Decode"
775 (when (zerop (length string))
776 (return-from string-encode string))
777 (multiple-value-bind (result ptr)
778 (lisp::with-array-data ((string string) (start start) (end end))
779 (funcall (ef-encode external-format) string start end
780 (make-string (length string) :element-type 'base-char)))
781 (lisp::shrink-vector result ptr)))
782
783 (def-ef-macro ef-decode (extfmt lisp::lisp +ef-max+ +ef-de+)
784 `(lambda (string ptr end result &aux (pos -1) (count 0) (state nil))
785 (declare #|(optimize (speed 3) (safety 0) (space 0) (debug 0))|#
786 (type simple-string string)
787 (type kernel:index end count)
788 (type (integer -1 (#.array-dimension-limit)) ptr pos)
789 (type simple-string result)
790 (ignorable state))
791 (loop until (>= ptr end)
792 ;; increasing size of result shouldn't ever be necessary, unless
793 ;; someone implements an encoding smaller than the source string...
794 do (setf (schar result (incf pos))
795 (octets-to-char ,extfmt state count
796 ;; note the need to return NIL for EOF
797 (if (= (1+ ptr) (length string))
798 nil
799 (char-code (char string (incf ptr))))
800 (lambda (n) (decf ptr n))))
801 finally (return (values result (1+ pos))))))
802
803 (defun string-decode (string external-format &optional (start 0) end)
804 "Decode String using the given External-Format and return the new
805 string. The input string is treated as if it were an array of
806 octets, where the char-code of each character is the octet. This is
807 the inverse of String-Encode."
808 (when (zerop (length string))
809 (return-from string-decode string))
810 (multiple-value-bind (result pos)
811 (lisp::with-array-data ((string string) (start start) (end end))
812 (funcall (ef-decode external-format)
813 string (1- start) (1- end) (make-string (length string))))
814 (lisp::shrink-vector result pos)))
815
816
817 (defun set-system-external-format (terminal &optional filenames)
818 "Change the external format of the standard streams to Terminal.
819 The standard streams are sys::*stdin*, sys::*stdout*, and
820 sys::*stderr*, which are normally the input and/or output streams
821 for *standard-input* and *standard-output*. Also sets sys::*tty*
822 (normally *terminal-io* to the given external format. If the
823 optional argument Filenames is gvien, then the filename encoding is
824 set to the specified format."
825 (unless (find-external-format terminal)
826 (error "Can't find external-format ~S." terminal))
827 (setf (stream-external-format sys:*stdin*) terminal
828 (stream-external-format sys:*stdout*) terminal
829 (stream-external-format sys:*stderr*) terminal)
830 (when (lisp::fd-stream-p sys:*tty*)
831 (setf (stream-external-format sys:*tty*) terminal))
832 (when filenames
833 (unless (find-external-format filenames)
834 (error "Can't find external-format ~S." filenames))
835 (when (and unix::*filename-encoding*
836 (not (eq unix::*filename-encoding* filenames)))
837 (cerror "Change it anyway."
838 "The external-format for encoding filenames is already set.")
839 (setq unix::*filename-encoding* filenames)))
840 t)
841
842
843 ;; Despite its name, this doesn't actually compile anything at all. What it
844 ;; does is expand into a lambda expression that can be compiled by the file
845 ;; compiler.
846 (defmacro precompile-ef-slot (ef slot)
847 (let* ((ef (find-external-format ef)))
848 ;; if there's no lambda expression available, flush it and regenerate
849 (unless (and (aref (ef-cache ef) slot)
850 (function-lambda-expression (aref (ef-cache ef) slot)))
851 (setf (aref (ef-cache ef) slot) nil)
852 (ecase slot
853 (#.+ef-cin+ (lisp::%ef-cin ef))
854 (#.+ef-cout+ (lisp::%ef-cout ef))
855 (#.+ef-sout+ (lisp::%ef-sout ef))
856 (#.+ef-os+ (%ef-octets-to-string ef))
857 (#.+ef-so+ (%ef-string-to-octets ef))
858 (#.+ef-en+ (%ef-encode ef))
859 (#.+ef-de+ (%ef-decode ef))))
860 `(setf (aref (ef-cache (find-external-format ,(ef-name ef))) ,slot)
861 ,(subst (ef-name ef) ef
862 (function-lambda-expression (aref (ef-cache ef) slot))))))

  ViewVC Help
Powered by ViewVC 1.1.5