/[cmucl]/src/code/extfmts.lisp
ViewVC logotype

Contents of /src/code/extfmts.lisp

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.18.4.3 - (hide annotations)
Fri Oct 2 17:24:13 2009 UTC (4 years, 6 months ago) by rtoy
Branch: unicode-string-buffer-impl-branch
Changes since 1.18.4.2: +3 -3 lines
o Use bref instead of aref in EF-OCTETS-TO-STRING.  (Generated code is
  smaller and better.  Probably some issue with register allocation
  and vop selection.)
o Use SAFETY 0 in EF-OCTETS-TO-STRING, depending on OCTETS-TO-STRING
  to do the correct things.

With these changes, utf16 is now the same speed is 20a, and utf8 is
much faster than 20a.
1 rtoy 1.1 ;;; -*- Mode: LISP; Syntax: ANSI-Common-Lisp; Package: STREAM -*-
2     ;;;
3     ;;; **********************************************************************
4     ;;; This code was written by Paul Foley and has been placed in the public
5     ;;; domain.
6     ;;;
7     (ext:file-comment
8 rtoy 1.18.4.3 "$Header: /tiger/var/lib/cvsroots/cmucl/src/code/extfmts.lisp,v 1.18.4.3 2009/10/02 17:24:13 rtoy Exp $")
9 rtoy 1.1 ;;;
10     ;;; **********************************************************************
11     ;;;
12     ;;; Implementation of external-formats
13    
14     (in-package "STREAM")
15    
16     (export '(string-to-octets octets-to-string *default-external-format*
17 rtoy 1.15 string-encode string-decode set-system-external-format
18     +replacement-character-code+))
19 rtoy 1.1
20     (defvar *default-external-format* :iso8859-1)
21    
22 rtoy 1.3 (defvar *external-formats* (make-hash-table :test 'equal))
23 rtoy 1.1 (defvar *external-format-aliases* (make-hash-table))
24    
25 rtoy 1.15 (vm::defenum (:prefix "+EF-" :suffix "+" :start 1)
26     str ; string length
27     cin ; input a character
28     cout ; output a character
29     sin ; input string
30     sout ; output string
31     os ; octets to string
32     so ; string to octets
33     en ; encode
34     de ; decode
35     flush ; flush state
36     max)
37 rtoy 1.6
38     ;; Unicode replacement character U+FFFD
39     (defconstant +replacement-character-code+ #xFFFD)
40 rtoy 1.3
41 rtoy 1.1 (define-condition external-format-not-implemented (error)
42     ()
43     (:report
44     (lambda (condition stream)
45     (declare (ignore condition))
46     (format stream "Attempting unimplemented external-format I/O."))))
47    
48     (defun %efni (a b c d)
49     (declare (ignore a b c d))
50     (error 'external-format-not-implemented))
51    
52 rtoy 1.5 (defstruct efx
53 rtoy 1.15 ;;
54     ;; Function to read a sequence of octets from a stream and convert
55     ;; them a code point.
56 rtoy 1.5 (octets-to-code #'%efni :type function :read-only t)
57 rtoy 1.15 ;;
58     ;; Function to convert a codepoint to a sequence of octets and write
59     ;; them to an output stream.
60 rtoy 1.5 (code-to-octets #'%efni :type function :read-only t)
61 rtoy 1.15 ;;
62     ;; Function (or NIL) to force any state in the external format to be
63     ;; flushed to the output stream. A NIL value means the external
64     ;; format does not need to do anything special.
65     (flush-state nil :type (or null function) :read-only t)
66 rtoy 1.16 ;;
67     ;; Function to copy the state of the external-format. If NIL, then
68     ;; there is no state to be copied.
69     (copy-state nil :type (or null function) :read-only t)
70 rtoy 1.6 (cache nil :type (or null simple-vector))
71 rtoy 1.15 ;;
72     ;; Minimum number of octets needed to form a codepoint
73 rtoy 1.6 (min 1 :type kernel:index :read-only t)
74 rtoy 1.15 ;;
75 rtoy 1.17 ;; Maximum number of octets needed to form a codepoint.
76 rtoy 1.6 (max 1 :type kernel:index :read-only t))
77 rtoy 1.5
78 rtoy 1.1 (defstruct (external-format
79     (:conc-name ef-)
80     (:print-function %print-external-format)
81 rtoy 1.5 (:constructor make-external-format (name efx composingp
82     &optional slots slotd)))
83 rtoy 1.3 (name (ext:required-argument) :type (or keyword cons) :read-only t)
84 rtoy 1.5 (efx (ext:required-argument) :type efx :read-only t)
85 rtoy 1.3 (composingp (ext:required-argument) :type boolean :read-only t)
86 rtoy 1.1 (slots #() :type simple-vector :read-only t)
87 rtoy 1.5 (slotd nil :type list :read-only t))
88 rtoy 1.1
89     (defun %print-external-format (ef stream depth)
90     (declare (ignore depth))
91     (print-unreadable-object (ef stream :type t :identity t)
92     (princ (ef-name ef) stream)))
93    
94 rtoy 1.6 (defun %intern-ef (ef)
95 rtoy 1.3 (setf (gethash (ef-name ef) *external-formats*) ef))
96    
97 rtoy 1.16 (declaim (inline ef-octets-to-code ef-code-to-octets ef-flush-state ef-copy-state
98 rtoy 1.15 ef-cache ef-min-octets ef-max-octets))
99 rtoy 1.5
100     (defun ef-octets-to-code (ef)
101     (efx-octets-to-code (ef-efx ef)))
102    
103     (defun ef-code-to-octets (ef)
104     (efx-code-to-octets (ef-efx ef)))
105    
106 rtoy 1.15 (defun ef-flush-state (ef)
107     (efx-flush-state (ef-efx ef)))
108    
109 rtoy 1.16 (defun ef-copy-state (ef)
110     (efx-copy-state (ef-efx ef)))
111    
112 rtoy 1.5 (defun ef-cache (ef)
113     (efx-cache (ef-efx ef)))
114    
115 rtoy 1.6 (defun ef-min-octets (ef)
116     (efx-min (ef-efx ef)))
117    
118     (defun ef-max-octets (ef)
119     (efx-max (ef-efx ef)))
120    
121     (eval-when (:compile-toplevel :load-toplevel :execute)
122     (defun %merge-slots (old new)
123     (let* ((pos (length old))
124     (tmp (mapcar (lambda (x)
125     (let* ((name (if (consp x) (first x) x))
126     (init (if (consp x) (second x) nil))
127     (list (if (consp x) (nthcdr 2 x) nil))
128     (prev (assoc name old))
129     (posn (if prev (second prev) (1- (incf pos)))))
130     (list name posn init (getf list :type t))))
131     new)))
132     (delete-duplicates (stable-sort (append old tmp) #'< :key #'second)
133     :key #'second))))
134    
135     ;;; DEFINE-EXTERNAL-FORMAT -- Public
136     ;;;
137 rtoy 1.16 ;;; name (&key min max size) (&rest slots) octets-to-code code-to-octets
138     ;;; flush-state copy-state
139     ;;;
140 rtoy 1.6 ;;; Define a new external format. Min/Max/Size are the minimum and
141     ;;; maximum number of octets that make up a character (:size N is just
142     ;;; shorthand for :min N :max N). Slots is a list of slot descriptions
143     ;;; similar to defstruct.
144     ;;;
145     ;;; name (base) (&rest slots)
146     ;;; Define an external format based on a previously-defined external
147     ;;; format, Base. The slot names used in Slots must match those in Base.
148     ;;;
149     ;;; octets-to-code (state input unput &rest vars)
150     ;;; Defines a form to be used by the external format to convert
151     ;;; octets to a code point. State is a form that can be used by the
152 rtoy 1.16 ;;; body to access the state of the stream. Input is a form that
153     ;;; can be used to read one octet from the input stream. (It can be
154     ;;; called as many times as needed.) Similarly, Unput is a form to
155     ;;; put back one octet to the input stream. Vars is a list of vars
156     ;;; that need to be defined for any symbols used within the form.
157 rtoy 1.6 ;;;
158     ;;; This should return two values: the code and the number of octets
159     ;;; read to form the code.
160     ;;;
161     ;;; code-to-octets (code state output &rest vars)
162     ;;; Defines a form to be used by the external format to convert a
163     ;;; code point to octets for output. Code is the code point to be
164     ;;; converted. State is a form to access the current value of the
165     ;;; stream's state variable. Output is a form that writes one octet
166     ;;; to the output stream.
167     ;;;
168 rtoy 1.15 ;;; flush-state (state output &rest vars)
169     ;;; Defines a form to be used by the external format to flush out
170     ;;; any state when an output stream is closed. Similar to
171     ;;; CODE-TO-OCTETS, but there is no code.
172     ;;;
173 rtoy 1.16 ;;; copy-state (state &rest vars)
174     ;;; Defines a form to copy any state needed by the external format.
175     ;;; This should probably be a deep copy so that if the original
176     ;;; state is modified, the copy is not.
177     ;;;
178 rtoy 1.6 ;;; Note: external-formats work on code-points, not
179     ;;; characters, so that the entire 31 bit ISO-10646 range can be
180     ;;; used internally regardless of the size of a character recognized
181     ;;; by Lisp and external formats can be useful to people who want to
182     ;;; process characters outside the Lisp range (see
183     ;;; CODEPOINT-TO-OCTETS, OCTETS-TO-CODEPOINT)
184     ;;;
185     (defmacro define-external-format (name (&rest args) (&rest slots)
186 rtoy 1.16 &optional octets-to-code code-to-octets
187     flush-state copy-state)
188 rtoy 1.6 (when (and (oddp (length args)) (not (= (length args) 1)))
189     (warn "Nonsensical argument (~S) to DEFINE-EXTERNAL-FORMAT." args))
190     (let* ((tmp (gensym))
191     (min (if (evenp (length args))
192     (or (getf args :min) (getf args :size) 1)
193     1))
194     (max (if (evenp (length args))
195     (or (getf args :max) (getf args :size) 6)
196     6))
197     (base (if (= (length args) 1)
198     (find-external-format (first args))
199     nil))
200     (bslotd (if base (ef-slotd base) nil))
201     (slotd (%merge-slots bslotd slots))
202     (slotb (loop for slot in slotd
203     collect `(,(first slot)
204     `(the ,',(fourth slot)
205     ;; IDENTITY is here to protect against SETF
206     (identity (svref %slots% ,',(second slot))))))))
207 rtoy 1.3 `(macrolet ((octets-to-code ((state input unput &rest vars) body)
208 rtoy 1.6 `(lambda (,state ,input ,unput)
209     (declare (ignorable ,state ,input ,unput)
210 rtoy 1.3 (optimize (ext:inhibit-warnings 3)))
211 rtoy 1.6 (let (,@',slotb
212     (,input `(the (or (unsigned-byte 8) null) ,,input))
213 rtoy 1.3 ,@(loop for var in vars collect `(,var (gensym))))
214     ,body)))
215     (code-to-octets ((code state output &rest vars) body)
216 rtoy 1.6 `(lambda (,',tmp ,state ,output)
217     (declare (ignorable ,state ,output)
218 rtoy 1.3 (optimize (ext:inhibit-warnings 3)))
219 rtoy 1.6 (let (,@',slotb
220     (,code ',code)
221 rtoy 1.3 ,@(loop for var in vars collect `(,var (gensym))))
222 rtoy 1.16 `(let ((,',code (the lisp:codepoint ,,',tmp)))
223 rtoy 1.3 (declare (ignorable ,',code))
224 rtoy 1.15 ,,body))))
225     (flush-state ((state output &rest vars) body)
226     `(lambda (,state ,output)
227     (declare (ignorable ,state ,output))
228     (let (,@',slotb
229     ,@(loop for var in vars collect `(,var (gensym))))
230 rtoy 1.16 ,body)))
231     (copy-state ((state &rest vars) body)
232     `(lambda (,state)
233     (declare (ignorable ,state))
234     (let (,@',slotb
235     ,@(loop for var in vars collect `(,var (gensym))))
236 rtoy 1.15 ,body))))
237 rtoy 1.6 (%intern-ef (make-external-format ,name
238     ,(if base
239     `(ef-efx (find-external-format ,(ef-name base)))
240     `(make-efx :octets-to-code ,octets-to-code
241     :code-to-octets ,code-to-octets
242 rtoy 1.16 :flush-state ,flush-state
243     :copy-state ,copy-state
244 rtoy 1.6 :cache (make-array +ef-max+
245     :initial-element nil)
246     :min ,(min min max) :max ,(max min max)))
247     nil
248     (let* ,(loop for x in slotd
249     collect (list (first x) (third x)))
250     (vector ,@(mapcar #'first slotd)))
251     ',slotd)))))
252 rtoy 1.3
253 rtoy 1.6 ;;; DEFINE-COMPOSING-EXTERNAL-FORMAT -- Public
254     ;;;
255     ;;; A composing-external-format differs from an (ordinary) external-format
256     ;;; in that it translates characters (really codepoints, of course) into
257     ;;; other characters, rather than translating between characters and binary
258     ;;; octets. They have to be composed with a non-composing external-format
259     ;;; to be of any use.
260     ;;;
261 rtoy 1.16 ;;;
262     ;;; name (&key min max size) input output
263     ;;; Defines a new composing external format. The parameters Min,
264     ;;; Max, and Size are the same as for defining an external format.
265     ;;; The parameters input and output are forms to handle input and
266     ;;; output.
267     ;;;
268     ;;; input (state input unput &rest vars)
269     ;;; Defines a form to be used by the composing external format when
270     ;;; reading input to transform a codepoint (or sequence of
271     ;;; codepoints) to another. State is a form that can be used by the
272     ;;; body to access the state of the external format. Input is a
273     ;;; form that can be used to read one code point from the input
274     ;;; stream. (Input returns two values, the codepoint and the number
275     ;;; of octets read.) It may be called as many times as needed.
276     ;;; This returns two values: the codepoint of the character (or NIL)
277     ;;; and the number of octets read. Similarly, Unput is a form to
278     ;;; put back one octet to the input stream. Vars is a list of vars
279     ;;; that need to be defined for any symbols used within the form.
280     ;;;
281     ;;; This should return two values: the code and the number of octets
282     ;;; read to form the code.
283     ;;;
284     ;;; output (code state output &rest vars)
285     ;;; Defines a form to be used by the composing external format to
286     ;;; convert a code point to octets for output. Code is the code
287     ;;; point to be converted. State is a form to access the current
288     ;;; value of the stream's state variable. Output is a form that
289     ;;; writes one octet to the output stream.
290    
291 rtoy 1.6 (defmacro define-composing-external-format (name (&key min max size)
292     input output)
293     (let ((tmp (gensym))
294     (min (or min size 1))
295     (max (or max size 1)))
296 rtoy 1.3 `(macrolet ((input ((state input unput &rest vars) body)
297 rtoy 1.6 `(lambda (,state ,input ,unput)
298     (declare (ignorable ,state ,input ,unput)
299 rtoy 1.3 (optimize (ext:inhibit-warnings 3)))
300 rtoy 1.16 (let ((,input `(the (values (or lisp:codepoint null)
301 rtoy 1.6 kernel:index)
302 rtoy 1.3 ,,input))
303     ,@(loop for var in vars collect `(,var (gensym))))
304     ,body)))
305     (output ((code state output &rest vars) body)
306 rtoy 1.6 `(lambda (,',tmp ,state ,output)
307     (declare (ignorable ,state ,output)
308 rtoy 1.3 (optimize (ext:inhibit-warnings 3)))
309     (let ((,code ',code)
310     ,@(loop for var in vars collect `(,var (gensym))))
311 rtoy 1.16 `(let ((,',code (the lisp:codepoint ,,',tmp)))
312 rtoy 1.3 (declare (ignorable ,',code))
313     ,,body)))))
314 rtoy 1.6 (%intern-ef (make-external-format ,name
315     (make-efx :octets-to-code ,input
316     :code-to-octets ,output
317     :min ,(min min max) :max ,(max min max))
318     t
319     #() '())))))
320 rtoy 1.1
321     (defun load-external-format-aliases ()
322 rtoy 1.6 (let ((*package* (find-package "KEYWORD"))
323     (unix::*filename-encoding* :iso8859-1))
324 rtoy 1.11 (with-open-file (stm "ext-formats:aliases" :if-does-not-exist nil
325     :external-format :iso8859-1)
326 rtoy 1.1 (when stm
327 rtoy 1.6 (do ((alias (read stm nil stm) (read stm nil stm))
328     (value (read stm nil stm) (read stm nil stm)))
329     ((or (eq alias stm) (eq value stm))
330     (unless (eq alias stm)
331     (warn "External-format aliases file ends early.")))
332 rtoy 1.11 (if (and (keywordp alias) (or (keywordp value)
333     (and (consp value)
334     (every #'keywordp value))))
335 rtoy 1.6 (setf (gethash alias *external-format-aliases*) value)
336     (warn "Bad entry in external-format aliases file: ~S => ~S."
337     alias value)))))))
338 rtoy 1.1
339 rtoy 1.3 (defun %find-external-format (name)
340 rtoy 1.6 ;; avoid loading files, etc., early in the boot sequence
341     (when (or (eq name :iso8859-1)
342     (and (eq name :default) (eq *default-external-format* :iso8859-1)))
343     (return-from %find-external-format
344     (gethash :iso8859-1 *external-formats*)))
345    
346 rtoy 1.1 (when (zerop (hash-table-count *external-format-aliases*))
347     (setf (gethash :latin1 *external-format-aliases*) :iso8859-1)
348     (setf (gethash :latin-1 *external-format-aliases*) :iso8859-1)
349     (setf (gethash :iso-8859-1 *external-format-aliases*) :iso8859-1)
350     (load-external-format-aliases))
351    
352     (do ((tmp (gethash name *external-format-aliases*)
353     (gethash tmp *external-format-aliases*))
354     (cnt 0 (1+ cnt)))
355     ((or (null tmp) (= cnt 50))
356     (unless (null tmp)
357     (error "External-format aliasing depth exceeded.")))
358     (setq name tmp))
359    
360     (or (gethash name *external-formats*)
361 rtoy 1.11 (and (consp name) (find-external-format name))
362 rtoy 1.2 (and (let ((*package* (find-package "STREAM"))
363 rtoy 1.12 (lisp::*enable-package-locked-errors* nil)
364     (s (open (format nil "ext-formats:~(~A~).lisp" name)
365     :if-does-not-exist nil :external-format :iso8859-1)))
366     (when s
367     (null (nth-value 1 (ext:compile-from-stream s)))))
368 rtoy 1.3 (gethash name *external-formats*))))
369    
370     (defun %composed-ef-name (a b)
371     (if (consp a) (append a (list b)) (list a b)))
372    
373 rtoy 1.6 (defun %compose-external-formats (a b)
374 rtoy 1.3 (when (ef-composingp a)
375     (error "~S is a Composing-External-Format." (ef-name a)))
376     (unless (ef-composingp b)
377     (error "~S is not a Composing-External-Format." (ef-name b)))
378 rtoy 1.5 (make-external-format
379     (%composed-ef-name (ef-name a) (ef-name b))
380     (make-efx
381 rtoy 1.6 :octets-to-code (lambda (state input unput)
382 rtoy 1.11 (let ((nstate (gensym "STATE-")))
383     `(let ((,nstate ,state))
384     (when (null ,nstate)
385     (setq ,nstate (setf ,state (cons nil nil))))
386     ,(funcall (ef-octets-to-code b) `(car ,nstate)
387     (funcall (ef-octets-to-code a)
388     `(cdr ,nstate) input unput)
389     unput))))
390 rtoy 1.6 :code-to-octets (lambda (code state output)
391 rtoy 1.11 (let ((nstate (gensym "STATE-")))
392     `(let ((,nstate ,state))
393     (when (null ,nstate)
394     (setq ,nstate (setf ,state (cons nil nil))))
395     ,(funcall (ef-code-to-octets b) code `(car ,nstate)
396     `(lambda (x)
397     ,(funcall (ef-code-to-octets a)
398     'x `(cdr ,nstate) output))))))
399 rtoy 1.6 :cache (make-array +ef-max+ :initial-element nil)
400     :min (* (ef-min-octets a) (ef-min-octets b))
401     :max (* (ef-max-octets a) (ef-max-octets b)))
402 rtoy 1.5 nil #() '()))
403 rtoy 1.3
404     (defun find-external-format (name &optional (error-p t))
405     (when (external-format-p name)
406     (return-from find-external-format name))
407    
408     (or (if (consp name) (every #'keywordp name) (keywordp name))
409     (error "~S is not a valid external format name." name))
410 rtoy 1.1
411 rtoy 1.3 (when (eq name :default)
412     (setq name *default-external-format*))
413    
414     (when (and (consp name) (not (cdr name)))
415     (setq name (car name)))
416    
417 rtoy 1.6 (flet ((not-found ()
418     (when (equal *default-external-format* name)
419     (setq *default-external-format* :iso8859-1))
420     (if error-p (error "External format ~S not found." name) nil)))
421     (if (consp name)
422     (let ((efs (mapcar #'%find-external-format name)))
423     (if (member nil efs)
424     (not-found)
425     (let ((name (reduce #'%composed-ef-name (mapcar #'ef-name efs))))
426     (or (gethash name *external-formats*)
427     (%intern-ef (reduce #'%compose-external-formats efs))))))
428     (or (%find-external-format name) (not-found)))))
429    
430     (defun flush-external-formats ()
431     (maphash (lambda (name ef)
432     (declare (ignore name))
433     (fill (ef-cache ef) nil))
434     *external-formats*))
435    
436     (defvar *.table-inverse.* (make-hash-table :test 'eq :size 7))
437    
438     (defun invert-table (table)
439     (declare (type (or (simple-array (unsigned-byte 31) *)
440     (simple-array (unsigned-byte 16) *))
441     table)
442     (optimize (speed 3) (space 0) (safety 0) (debug 0)
443     (ext:inhibit-warnings 3)))
444     (or (gethash table *.table-inverse.*)
445     (let* ((mbits (if (= (array-total-size table) 128) 7 8))
446     (lbits (cond ((> (array-total-size table) 256) 3)
447     ((< (array-total-size table) 100) 6)
448     (t 5)))
449     (hvec (make-array (1+ (ash #x110000 (- 0 mbits lbits)))
450     :element-type '(unsigned-byte 16)
451     :initial-element #xFFFF))
452     (mvec (make-array 0 :element-type '(unsigned-byte 16)))
453     (lvec (make-array 0 :element-type '(unsigned-byte 16)))
454     (width (array-dimension table 0))
455     (power (1- (array-rank table)))
456     (base (if (= width 94) 1 0))
457     hx mx lx)
458     (assert (and (< power 2) (<= width 256)))
459     (dotimes (i (array-total-size table))
460     (declare (type (integer 0 (#.array-dimension-limit)) i))
461     (let ((tmp i) (val (row-major-aref table i)) (z 0))
462     (declare (type (integer 0 (#.array-dimension-limit)) tmp)
463     (type (unsigned-byte 16) z))
464     (unless (= val #xFFFE)
465     (when (plusp power)
466     (multiple-value-bind (x y) (floor tmp width)
467     (setq tmp x)
468     (setq z (logior z (ash (the (integer 0 255) (+ y base))
469     (the (integer 0 24)
470     (* 8 power)))))))
471     (setq hx (ash val (- 0 mbits lbits)))
472     (when (= (aref hvec hx) #xFFFF)
473     (setf (aref hvec hx) (length mvec))
474     (let ((tmp (make-array (+ (length mvec) (ash 1 mbits))
475     :element-type '(unsigned-byte 16)
476     :initial-element #xFFFF)))
477     (replace tmp mvec)
478     (setq mvec tmp)))
479     (setq mx (logand (ash val (- lbits)) (lognot (ash -1 mbits))))
480 rtoy 1.7 (when (= (aref mvec (+ (aref hvec hx) mx)) #xFFFF)
481     (setf (aref mvec (+ (aref hvec hx) mx)) (length lvec))
482 rtoy 1.6 (let ((tmp (make-array (+ (length lvec) (ash 1 lbits))
483     :element-type '(unsigned-byte 16)
484     :initial-element #xFFFF)))
485     (replace tmp lvec)
486     (setq lvec tmp)))
487     (setq lx (logand val (lognot (ash -1 lbits))))
488 rtoy 1.7 (setf (aref lvec (+ (aref mvec (+ (aref hvec hx) mx)) lx))
489 rtoy 1.6 (logior z (+ tmp base))))))
490     (setf (gethash table *.table-inverse.*)
491     (lisp::make-ntrie16 :split (logior (ash (1- mbits) 4) (1- lbits))
492     :hvec hvec :mvec mvec :lvec lvec)))))
493    
494     (declaim (inline get-inverse))
495     (defun get-inverse (ntrie code)
496 rtoy 1.11 (declare (type lisp::ntrie16 ntrie) (type (integer 0 #x10FFFF) code))
497 rtoy 1.6 (let ((n (lisp::qref ntrie code)))
498     (and n (let ((m (aref (lisp::ntrie16-lvec ntrie) n)))
499     (if (= m #xFFFF) nil m)))))
500    
501 rtoy 1.1
502     (define-condition void-external-format (error)
503     ()
504     (:report
505     (lambda (condition stream)
506     (declare (ignore condition))
507     (format stream "Attempting I/O through void external-format."))))
508    
509 rtoy 1.6 (define-external-format :void (:size 0) ()
510 rtoy 1.1 (octets-to-code (state input unput)
511 rtoy 1.3 `(error 'void-external-format))
512 rtoy 1.1 (code-to-octets (code state output)
513 rtoy 1.3 `(error 'void-external-format)))
514 rtoy 1.1
515 rtoy 1.6 (define-external-format :iso8859-1 (:size 1) ()
516 rtoy 1.1 (octets-to-code (state input unput)
517 rtoy 1.3 `(values ,input 1))
518 rtoy 1.1 (code-to-octets (code state output)
519 rtoy 1.3 `(,output (if (> ,code 255) #x3F ,code))))
520 rtoy 1.1
521 rtoy 1.6 ;;; OCTETS-TO-CODEPOINT, CODEPOINT-TO-OCTETS -- Semi-Public
522     ;;;
523     ;;; Normally you'd want to use OCTETS-TO-CHAR and CHAR-TO-OCTETS instead of
524     ;;; these, but that limits you to Lisp's idea of a character - either Latin-1
525     ;;; in 8 bit Lisp images, or the Unicode BMP in 16 bit images. If you want
526     ;;; to read or write texts containing characters not supported by your Lisp,
527     ;;; these macros can be used instead.
528 rtoy 1.1 (defmacro octets-to-codepoint (external-format state count input unput)
529 rtoy 1.6 (let ((tmp1 (gensym)) (tmp2 (gensym))
530     (ef (find-external-format external-format)))
531     `(multiple-value-bind (,tmp1 ,tmp2)
532     ,(funcall (ef-octets-to-code ef) state input unput)
533     (setf ,count (the kernel:index ,tmp2))
534 rtoy 1.16 (the (or lisp:codepoint null) ,tmp1))))
535 rtoy 1.1
536     (defmacro codepoint-to-octets (external-format code state output)
537 rtoy 1.6 (let ((ef (find-external-format external-format)))
538     (funcall (ef-code-to-octets ef) code state output)))
539 rtoy 1.3
540    
541    
542     (defvar *ef-base* +ef-max+)
543     (defvar *ef-extensions* '())
544    
545     (defun ensure-cache (ef id reqd)
546     (let ((base (or (getf *ef-extensions* id)
547     (setf (getf *ef-extensions* id)
548     (prog1 *ef-base* (incf *ef-base* reqd))))))
549     (when (< (length (ef-cache ef)) (+ base reqd))
550 rtoy 1.5 (setf (efx-cache (ef-efx ef))
551 rtoy 1.3 (adjust-array (ef-cache ef) (+ base reqd) :initial-element nil)))
552     base))
553    
554 rtoy 1.6 ;;; DEF-EF-MACRO -- Public
555     ;;;
556     ;;;
557 rtoy 1.3 (defmacro def-ef-macro (name (ef id reqd idx) body)
558 rtoy 1.11 (let* ((tmp1 (gensym))
559     (tmp2 (gensym))
560     (blknm (nth-value 1 (lisp::valid-function-name-p name)))
561     (%name (intern (format nil "%~A" name) #|(symbol-package blknm)|#)))
562 rtoy 1.6 `(progn
563     (defun ,%name (,ef)
564     (let* ((,tmp1 (find-external-format ,ef))
565     (,tmp2 ,(if (eq id 'lisp::lisp)
566     idx
567     `(+ (ensure-cache ,tmp1 ',id ,reqd) ,idx))))
568     (funcall (or (aref (ef-cache ,tmp1) ,tmp2)
569     (setf (aref (ef-cache ,tmp1) ,tmp2)
570     (let ((*compile-print* nil)
571     ;; Set default format when we compile so we
572     ;; can see compiler messages. If we don't,
573     ;; we run into a problem that we might be
574     ;; changing the default format while we're
575     ;; compiling, and we don't know how to output
576     ;; the compiler messages.
577 rtoy 1.11 #|(*default-external-format* :iso8859-1)|#)
578 rtoy 1.6 (compile nil `(lambda (%slots%)
579     (declare (ignorable %slots%))
580 rtoy 1.11 (block ,',blknm
581     ,,body))))))
582 rtoy 1.6 (ef-slots ,tmp1))))
583     (declaim (inline ,name))
584     (defun ,name (,tmp1)
585     (let ((,tmp2 (load-time-value (cons nil nil))))
586     (when (eq ,tmp1 :default)
587     (setq ,tmp1 *default-external-format*))
588     (if (eq ,tmp1 (car ,tmp2))
589     (cdr ,tmp2)
590     (setf (car ,tmp2) ,tmp1
591     (cdr ,tmp2) (,%name ,tmp1))))))))
592 rtoy 1.1
593    
594    
595 rtoy 1.6 ;;; OCTETS-TO-CHAR, CHAR-TO-OCTETS -- Public
596     ;;;
597     ;;; Read and write one character through an external-format
598     ;;;
599 rtoy 1.1 (defmacro octets-to-char (external-format state count input unput)
600 rtoy 1.11 (let ((nstate (gensym)))
601     `(let ((,nstate ,state))
602     (when (null ,nstate) (setq ,nstate (setf ,state (cons nil nil))))
603     (if (car ,nstate)
604 rtoy 1.13 ;; Return the trailing surrgate. Must set count to 0 to
605     ;; tell the stream code we didn't consume any octets!
606 rtoy 1.11 (prog1 (the character (car ,nstate))
607     (setf (car ,nstate) nil ,count 0))
608     (let ((code (octets-to-codepoint ,external-format
609     (cdr ,nstate) ,count ,input ,unput)))
610 rtoy 1.16 (declare (type lisp:codepoint code))
611 rtoy 1.11 ;;@@ on non-Unicode builds, limit to 8-bit chars
612     ;;@@ if unicode-bootstrap, can't use #\u+fffd
613 rtoy 1.16 (cond ((or (lisp::surrogatep code) (> code #x10FFFF))
614 rtoy 1.13 #-(and unicode (not unicode-bootstrap)) #\?
615     #+(and unicode (not unicode-bootstrap)) #\U+FFFD)
616     #+unicode
617 rtoy 1.11 ((> code #xFFFF)
618     (multiple-value-bind (hi lo) (surrogates code)
619     (setf (car ,nstate) lo)
620 rtoy 1.6 hi))
621 rtoy 1.11 (t (code-char code))))))))
622 rtoy 1.6
623 rtoy 1.1 (defmacro char-to-octets (external-format char state output)
624 rtoy 1.11 (let ((nchar (gensym))
625     (nstate (gensym))
626     (wryte (gensym))
627     (ch (gensym)))
628     `(let ((,nchar ,char)
629     (,nstate ,state))
630     (when (null ,nstate) (setq ,nstate (setf ,state (cons nil nil))))
631 rtoy 1.14 (if (lisp::surrogatep (char-code ,nchar) :high)
632 rtoy 1.11 (setf (car ,nstate) ,nchar)
633     (flet ((,wryte (,ch)
634     (codepoint-to-octets ,external-format ,ch (cdr ,nstate)
635     ,output)))
636     (declare (dynamic-extent #',wryte))
637     (if (car ,nstate)
638 rtoy 1.14 (prog1
639     ;; Invalid surrogate sequences get replaced with
640     ;; the replacement character.
641     (,wryte (if (lisp::surrogatep (char-code ,nchar) :low)
642     (surrogates-to-codepoint (car ,nstate) ,nchar)
643     +replacement-character-code+))
644 rtoy 1.11 (setf (car ,nstate) nil))
645 rtoy 1.14 ;; A lone trailing (low) surrogate gets replaced with
646     ;; the replacement character.
647     (,wryte (if (lisp::surrogatep (char-code ,nchar) :low)
648     +replacement-character-code+
649     (char-code ,nchar)))))))))
650 rtoy 1.6
651 rtoy 1.15 (defmacro flush-state (external-format state output)
652     (let* ((ef (find-external-format external-format))
653     (f (ef-flush-state ef)))
654     (when f
655     (funcall f state output))))
656    
657 rtoy 1.16 (defmacro copy-state (external-format state)
658     (let* ((ef (find-external-format external-format))
659     (f (ef-copy-state ef)))
660     (when f
661     (funcall f state))))
662    
663 rtoy 1.3 (def-ef-macro ef-string-to-octets (extfmt lisp::lisp +ef-max+ +ef-so+)
664 rtoy 1.11 `(lambda (string start end buffer &aux (ptr 0) (state nil))
665 rtoy 1.3 (declare #|(optimize (speed 3) (safety 0) (space 0) (debug 0))|#
666     (type simple-string string)
667 rtoy 1.6 (type kernel:index start end ptr)
668 rtoy 1.3 (type (simple-array (unsigned-byte 8) (*)) buffer)
669     (ignorable state))
670 rtoy 1.11 (dotimes (i (- end start) (values buffer ptr))
671     (declare (type kernel:index i))
672     (char-to-octets ,extfmt (schar string (+ start i)) state
673     (lambda (b)
674     (when (= ptr (length buffer))
675     (setq buffer (adjust-array buffer (* 2 ptr))))
676     (setf (aref buffer (1- (incf ptr))) b))))))
677 rtoy 1.3
678 rtoy 1.2 (defun string-to-octets (string &key (start 0) end (external-format :default)
679     (buffer nil bufferp))
680 rtoy 1.16 "Convert String to octets using the specified External-format. The
681     string is bounded by Start (defaulting to 0) and End (defaulting to
682     the end of the string. If Buffer is given, the octets are stored
683     there. If not, a new buffer is created."
684 rtoy 1.1 (declare (type string string)
685 rtoy 1.6 (type kernel:index start)
686     (type (or kernel:index null) end)
687 rtoy 1.3 (type (or (simple-array (unsigned-byte 8) (*)) null) buffer))
688 rtoy 1.6 (let* ((buffer (or buffer (make-array (length string)
689 rtoy 1.3 :element-type '(unsigned-byte 8)))))
690 rtoy 1.6 (multiple-value-bind (buffer ptr)
691     (lisp::with-array-data ((string string) (start start) (end end))
692     (funcall (ef-string-to-octets external-format)
693     string start end buffer))
694     (values (if bufferp buffer (lisp::shrink-vector buffer ptr)) ptr))))
695 rtoy 1.3
696     (def-ef-macro ef-octets-to-string (extfmt lisp::lisp +ef-max+ +ef-os+)
697 rtoy 1.18.4.1 `(lambda (octets ptr end string s-start s-end &aux (pos s-start) (count 0) (state nil) (last-octet 0))
698 rtoy 1.18.4.3 (declare (optimize (speed 3) (safety 0) #|(space 0) (debug 0)|#)
699 rtoy 1.3 (type (simple-array (unsigned-byte 8) (*)) octets)
700 rtoy 1.18.4.2 (type kernel:index pos end count last-octet s-start s-end)
701 rtoy 1.18 (type (integer -1 (#.array-dimension-limit)) ptr)
702 rtoy 1.3 (type simple-string string)
703     (ignorable state))
704 rtoy 1.18 (catch 'end-of-octets
705 rtoy 1.18.4.1 (loop while (< pos s-end)
706     do (setf (schar string pos)
707     (octets-to-char ,extfmt state count
708     (if (>= ptr end)
709     (throw 'end-of-octets nil)
710 rtoy 1.18.4.3 (bref octets (incf ptr)))
711 rtoy 1.18.4.1 (lambda (n) (decf ptr n))))
712 rtoy 1.18 (incf pos)
713     (incf last-octet count)))
714     (values string pos last-octet)))
715 rtoy 1.1
716 rtoy 1.2 (defun octets-to-string (octets &key (start 0) end (external-format :default)
717 rtoy 1.18.4.1 (string nil stringp)
718     (s-start 0) (s-end (length string)))
719 rtoy 1.16 "Octets-to-string converts an array of octets in Octets to a string
720     according to the specified External-format. The array of octets is
721     bounded by Start (defaulting ot 0) and End (defaulting to the end of
722 rtoy 1.18 the array. If String is given, the string is stored there. If
723     String is too short to hold all of the characters, it will be
724     adjusted (via adjust-array). If String is not given, a new string
725     is created. Three values are returned: the string, the number of
726     characters read, and the number of octets consumed."
727 rtoy 1.1 (declare (type (simple-array (unsigned-byte 8) (*)) octets)
728 rtoy 1.18.4.1 (type kernel:index start s-start s-end)
729 rtoy 1.6 (type (or kernel:index null) end)
730 rtoy 1.3 (type (or simple-string null) string))
731 rtoy 1.17 (multiple-value-bind (string pos last-octet)
732 rtoy 1.18 (funcall (ef-octets-to-string external-format)
733     octets (1- start) (1- (or end (length octets)))
734 rtoy 1.18.4.1 (or string (make-string (length octets)))
735     s-start s-end)
736 rtoy 1.17 (values (if stringp string (lisp::shrink-vector string pos)) pos last-octet)))
737 rtoy 1.1
738    
739    
740 rtoy 1.3 (def-ef-macro ef-encode (extfmt lisp::lisp +ef-max+ +ef-en+)
741     `(lambda (string start end result &aux (ptr 0) (state nil))
742     (declare #|(optimize (speed 3) (safety 0) (space 0) (debug 0))|#
743     (type simple-string string)
744 rtoy 1.6 (type kernel:index start end ptr)
745 rtoy 1.3 (type simple-base-string result)
746     (ignorable state))
747 rtoy 1.11 (dotimes (i (- end start) (values result ptr))
748     (declare (type kernel:index i))
749     (char-to-octets ,extfmt (schar string (+ start i)) state
750     (lambda (b)
751     (when (= ptr (length result))
752     (setq result (adjust-array result (* 2 ptr))))
753     (setf (aref result (1- (incf ptr)))
754     (code-char b)))))))
755 rtoy 1.3
756     (defun string-encode (string external-format &optional (start 0) end)
757 rtoy 1.6 "Encode the given String using External-Format and return a new
758     string. The characters of the new string are the octets of the
759     encoded result, with each octet converted to a character via
760     code-char. This is the inverse to String-Decode"
761     (when (zerop (length string))
762     (return-from string-encode string))
763 rtoy 1.3 (multiple-value-bind (result ptr)
764     (lisp::with-array-data ((string string) (start start) (end end))
765 rtoy 1.6 (funcall (ef-encode external-format) string start end
766 rtoy 1.3 (make-string (length string) :element-type 'base-char)))
767     (lisp::shrink-vector result ptr)))
768    
769     (def-ef-macro ef-decode (extfmt lisp::lisp +ef-max+ +ef-de+)
770     `(lambda (string ptr end result &aux (pos -1) (count 0) (state nil))
771     (declare #|(optimize (speed 3) (safety 0) (space 0) (debug 0))|#
772     (type simple-string string)
773 rtoy 1.6 (type kernel:index end count)
774 rtoy 1.3 (type (integer -1 (#.array-dimension-limit)) ptr pos)
775     (type simple-string result)
776     (ignorable state))
777     (loop until (>= ptr end)
778     ;; increasing size of result shouldn't ever be necessary, unless
779     ;; someone implements an encoding smaller than the source string...
780 rtoy 1.11 do (setf (schar result (incf pos))
781     (octets-to-char ,extfmt state count
782     ;; note the need to return NIL for EOF
783     (if (= (1+ ptr) (length string))
784     nil
785     (char-code (char string (incf ptr))))
786     (lambda (n) (decf ptr n))))
787 rtoy 1.5 finally (return (values result (1+ pos))))))
788 rtoy 1.3
789     (defun string-decode (string external-format &optional (start 0) end)
790 rtoy 1.6 "Decode String using the given External-Format and return the new
791     string. The input string is treated as if it were an array of
792     octets, where the char-code of each character is the octet. This is
793     the inverse of String-Encode."
794     (when (zerop (length string))
795     (return-from string-decode string))
796 rtoy 1.3 (multiple-value-bind (result pos)
797     (lisp::with-array-data ((string string) (start start) (end end))
798 rtoy 1.6 (funcall (ef-decode external-format)
799 rtoy 1.3 string (1- start) (1- end) (make-string (length string))))
800 rtoy 1.5 (lisp::shrink-vector result pos)))
801 rtoy 1.11
802    
803     (defun set-system-external-format (terminal &optional filenames)
804     (unless (find-external-format terminal)
805     (error "Can't find external-format ~S." terminal))
806     (setf (stream-external-format sys:*stdin*) terminal
807     (stream-external-format sys:*stdout*) terminal
808     (stream-external-format sys:*stderr*) terminal)
809     (when (lisp::fd-stream-p sys:*tty*)
810     (setf (stream-external-format sys:*tty*) terminal))
811     (when filenames
812     (unless (find-external-format filenames)
813     (error "Can't find external-format ~S." filenames))
814     (when (and unix::*filename-encoding*
815     (not (eq unix::*filename-encoding* filenames)))
816     (cerror "Change it anyway."
817     "The external-format for encoding filenames is already set.")
818     (setq unix::*filename-encoding* filenames)))
819     t)
820    
821    
822     ;; Despite its name, this doesn't actually compile anything at all. What it
823     ;; does is expand into a lambda expression that can be compiled by the file
824     ;; compiler.
825     (defmacro precompile-ef-slot (ef slot)
826     (let* ((ef (find-external-format ef)))
827     ;; if there's no lambda expression available, flush it and regenerate
828     (unless (and (aref (ef-cache ef) slot)
829     (function-lambda-expression (aref (ef-cache ef) slot)))
830     (setf (aref (ef-cache ef) slot) nil)
831     (ecase slot
832     (#.+ef-cin+ (lisp::%ef-cin ef))
833     (#.+ef-cout+ (lisp::%ef-cout ef))
834     (#.+ef-sout+ (lisp::%ef-sout ef))
835     (#.+ef-os+ (%ef-octets-to-string ef))
836     (#.+ef-so+ (%ef-string-to-octets ef))
837     (#.+ef-en+ (%ef-encode ef))
838     (#.+ef-de+ (%ef-decode ef))))
839     `(setf (aref (ef-cache (find-external-format ,(ef-name ef))) ,slot)
840     ,(subst (ef-name ef) ef
841     (function-lambda-expression (aref (ef-cache ef) slot))))))

  ViewVC Help
Powered by ViewVC 1.1.5