/[cmucl]/src/code/extfmts.lisp
ViewVC logotype

Contents of /src/code/extfmts.lisp

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.9 - (hide annotations)
Thu Jun 25 02:18:02 2009 UTC (4 years, 9 months ago) by rtoy
Branch: MAIN
CVS Tags: snapshot-2009-07
Changes since 1.8: +24 -16 lines
Update STRING-ENCODE and STRING-DECODE to handle surrogate pairs
correctly.  Previously, each surrogate was converted individually.
This is wrong; they should be treated as a single codepoint that is
converted.
1 rtoy 1.1 ;;; -*- Mode: LISP; Syntax: ANSI-Common-Lisp; Package: STREAM -*-
2     ;;;
3     ;;; **********************************************************************
4     ;;; This code was written by Paul Foley and has been placed in the public
5     ;;; domain.
6     ;;;
7     (ext:file-comment
8 rtoy 1.9 "$Header: /tiger/var/lib/cvsroots/cmucl/src/code/extfmts.lisp,v 1.9 2009/06/25 02:18:02 rtoy Exp $")
9 rtoy 1.1 ;;;
10     ;;; **********************************************************************
11     ;;;
12     ;;; Implementation of external-formats
13    
14     (in-package "STREAM")
15    
16     (export '(string-to-octets octets-to-string *default-external-format*
17 rtoy 1.3 string-encode string-decode))
18 rtoy 1.1
19     (defvar *default-external-format* :iso8859-1)
20    
21 rtoy 1.3 (defvar *external-formats* (make-hash-table :test 'equal))
22 rtoy 1.1 (defvar *external-format-aliases* (make-hash-table))
23    
24 rtoy 1.8 (defconstant +ef-str+ 1)
25 rtoy 1.6 (defconstant +ef-cin+ 2)
26     (defconstant +ef-cout+ 3)
27     (defconstant +ef-sin+ 4)
28     (defconstant +ef-sout+ 5)
29     (defconstant +ef-os+ 6)
30     (defconstant +ef-so+ 7)
31     (defconstant +ef-en+ 8)
32     (defconstant +ef-de+ 9)
33     (defconstant +ef-max+ 10)
34    
35     ;; Unicode replacement character U+FFFD
36     (defconstant +replacement-character-code+ #xFFFD)
37 rtoy 1.3
38 rtoy 1.1 (define-condition external-format-not-implemented (error)
39     ()
40     (:report
41     (lambda (condition stream)
42     (declare (ignore condition))
43     (format stream "Attempting unimplemented external-format I/O."))))
44    
45     (defun %efni (a b c d)
46     (declare (ignore a b c d))
47     (error 'external-format-not-implemented))
48    
49 rtoy 1.5 (defstruct efx
50     (octets-to-code #'%efni :type function :read-only t)
51     (code-to-octets #'%efni :type function :read-only t)
52 rtoy 1.6 (cache nil :type (or null simple-vector))
53     (min 1 :type kernel:index :read-only t)
54     (max 1 :type kernel:index :read-only t))
55 rtoy 1.5
56 rtoy 1.1 (defstruct (external-format
57     (:conc-name ef-)
58     (:print-function %print-external-format)
59 rtoy 1.5 (:constructor make-external-format (name efx composingp
60     &optional slots slotd)))
61 rtoy 1.3 (name (ext:required-argument) :type (or keyword cons) :read-only t)
62 rtoy 1.5 (efx (ext:required-argument) :type efx :read-only t)
63 rtoy 1.3 (composingp (ext:required-argument) :type boolean :read-only t)
64 rtoy 1.1 (slots #() :type simple-vector :read-only t)
65 rtoy 1.5 (slotd nil :type list :read-only t))
66 rtoy 1.1
67     (defun %print-external-format (ef stream depth)
68     (declare (ignore depth))
69     (print-unreadable-object (ef stream :type t :identity t)
70     (princ (ef-name ef) stream)))
71    
72 rtoy 1.6 (defun %intern-ef (ef)
73 rtoy 1.3 (setf (gethash (ef-name ef) *external-formats*) ef))
74    
75 rtoy 1.6 (declaim (inline ef-octets-to-code ef-code-to-octets ef-cache
76     ef-min-octets ef-max-octets))
77 rtoy 1.5
78     (defun ef-octets-to-code (ef)
79     (efx-octets-to-code (ef-efx ef)))
80    
81     (defun ef-code-to-octets (ef)
82     (efx-code-to-octets (ef-efx ef)))
83    
84     (defun ef-cache (ef)
85     (efx-cache (ef-efx ef)))
86    
87 rtoy 1.6 (defun ef-min-octets (ef)
88     (efx-min (ef-efx ef)))
89    
90     (defun ef-max-octets (ef)
91     (efx-max (ef-efx ef)))
92    
93     (eval-when (:compile-toplevel :load-toplevel :execute)
94     (defun %merge-slots (old new)
95     (let* ((pos (length old))
96     (tmp (mapcar (lambda (x)
97     (let* ((name (if (consp x) (first x) x))
98     (init (if (consp x) (second x) nil))
99     (list (if (consp x) (nthcdr 2 x) nil))
100     (prev (assoc name old))
101     (posn (if prev (second prev) (1- (incf pos)))))
102     (list name posn init (getf list :type t))))
103     new)))
104     (delete-duplicates (stable-sort (append old tmp) #'< :key #'second)
105     :key #'second))))
106    
107     ;;; DEFINE-EXTERNAL-FORMAT -- Public
108     ;;;
109     ;;; name (&key min max size) (&rest slots) octets-to-code code-to-octets
110     ;;; Define a new external format. Min/Max/Size are the minimum and
111     ;;; maximum number of octets that make up a character (:size N is just
112     ;;; shorthand for :min N :max N). Slots is a list of slot descriptions
113     ;;; similar to defstruct.
114     ;;;
115     ;;; name (base) (&rest slots)
116     ;;; Define an external format based on a previously-defined external
117     ;;; format, Base. The slot names used in Slots must match those in Base.
118     ;;;
119     ;;; octets-to-code (state input unput &rest vars)
120     ;;; Defines a form to be used by the external format to convert
121     ;;; octets to a code point. State is a form that can be used by the
122     ;;; body to access the state variable of the stream. Input is a
123     ;;; form that can be used to read one more octets from the input
124     ;;; strema. Similarly, Unput is a form to put back one octet to the
125     ;;; input stream. Vars is a list of vars that need to be defined
126     ;;; for any symbols used within the form.
127     ;;;
128     ;;; This should return two values: the code and the number of octets
129     ;;; read to form the code.
130     ;;;
131     ;;; code-to-octets (code state output &rest vars)
132     ;;; Defines a form to be used by the external format to convert a
133     ;;; code point to octets for output. Code is the code point to be
134     ;;; converted. State is a form to access the current value of the
135     ;;; stream's state variable. Output is a form that writes one octet
136     ;;; to the output stream.
137     ;;;
138     ;;; Note: external-formats work on code-points, not
139     ;;; characters, so that the entire 31 bit ISO-10646 range can be
140     ;;; used internally regardless of the size of a character recognized
141     ;;; by Lisp and external formats can be useful to people who want to
142     ;;; process characters outside the Lisp range (see
143     ;;; CODEPOINT-TO-OCTETS, OCTETS-TO-CODEPOINT)
144     ;;;
145     (defmacro define-external-format (name (&rest args) (&rest slots)
146     &optional octets-to-code code-to-octets)
147     (when (and (oddp (length args)) (not (= (length args) 1)))
148     (warn "Nonsensical argument (~S) to DEFINE-EXTERNAL-FORMAT." args))
149     (let* ((tmp (gensym))
150     (min (if (evenp (length args))
151     (or (getf args :min) (getf args :size) 1)
152     1))
153     (max (if (evenp (length args))
154     (or (getf args :max) (getf args :size) 6)
155     6))
156     (base (if (= (length args) 1)
157     (find-external-format (first args))
158     nil))
159     (bslotd (if base (ef-slotd base) nil))
160     (slotd (%merge-slots bslotd slots))
161     (slotb (loop for slot in slotd
162     collect `(,(first slot)
163     `(the ,',(fourth slot)
164     ;; IDENTITY is here to protect against SETF
165     (identity (svref %slots% ,',(second slot))))))))
166 rtoy 1.3 `(macrolet ((octets-to-code ((state input unput &rest vars) body)
167 rtoy 1.6 `(lambda (,state ,input ,unput)
168     (declare (ignorable ,state ,input ,unput)
169 rtoy 1.3 (optimize (ext:inhibit-warnings 3)))
170 rtoy 1.6 (let (,@',slotb
171     (,input `(the (or (unsigned-byte 8) null) ,,input))
172 rtoy 1.3 ,@(loop for var in vars collect `(,var (gensym))))
173     ,body)))
174     (code-to-octets ((code state output &rest vars) body)
175 rtoy 1.6 `(lambda (,',tmp ,state ,output)
176     (declare (ignorable ,state ,output)
177 rtoy 1.3 (optimize (ext:inhibit-warnings 3)))
178 rtoy 1.6 (let (,@',slotb
179     (,code ',code)
180 rtoy 1.3 ,@(loop for var in vars collect `(,var (gensym))))
181 rtoy 1.6 `(let ((,',code (the (unsigned-byte 21) ,,',tmp)))
182 rtoy 1.3 (declare (ignorable ,',code))
183     ,,body)))))
184 rtoy 1.6 (%intern-ef (make-external-format ,name
185     ,(if base
186     `(ef-efx (find-external-format ,(ef-name base)))
187     `(make-efx :octets-to-code ,octets-to-code
188     :code-to-octets ,code-to-octets
189     :cache (make-array +ef-max+
190     :initial-element nil)
191     :min ,(min min max) :max ,(max min max)))
192     nil
193     (let* ,(loop for x in slotd
194     collect (list (first x) (third x)))
195     (vector ,@(mapcar #'first slotd)))
196     ',slotd)))))
197 rtoy 1.3
198 rtoy 1.6 ;;; DEFINE-COMPOSING-EXTERNAL-FORMAT -- Public
199     ;;;
200     ;;; A composing-external-format differs from an (ordinary) external-format
201     ;;; in that it translates characters (really codepoints, of course) into
202     ;;; other characters, rather than translating between characters and binary
203     ;;; octets. They have to be composed with a non-composing external-format
204     ;;; to be of any use.
205     ;;;
206     (defmacro define-composing-external-format (name (&key min max size)
207     input output)
208     (let ((tmp (gensym))
209     (min (or min size 1))
210     (max (or max size 1)))
211 rtoy 1.3 `(macrolet ((input ((state input unput &rest vars) body)
212 rtoy 1.6 `(lambda (,state ,input ,unput)
213     (declare (ignorable ,state ,input ,unput)
214 rtoy 1.3 (optimize (ext:inhibit-warnings 3)))
215 rtoy 1.6 (let ((,input `(the (values (or (unsigned-byte 21) null)
216     kernel:index)
217 rtoy 1.3 ,,input))
218     ,@(loop for var in vars collect `(,var (gensym))))
219     ,body)))
220     (output ((code state output &rest vars) body)
221 rtoy 1.6 `(lambda (,',tmp ,state ,output)
222     (declare (ignorable ,state ,output)
223 rtoy 1.3 (optimize (ext:inhibit-warnings 3)))
224     (let ((,code ',code)
225     ,@(loop for var in vars collect `(,var (gensym))))
226 rtoy 1.6 `(let ((,',code (the (unsigned-byte 21) ,,',tmp)))
227 rtoy 1.3 (declare (ignorable ,',code))
228     ,,body)))))
229 rtoy 1.6 (%intern-ef (make-external-format ,name
230     (make-efx :octets-to-code ,input
231     :code-to-octets ,output
232     :min ,(min min max) :max ,(max min max))
233     t
234     #() '())))))
235 rtoy 1.1
236     (defun load-external-format-aliases ()
237 rtoy 1.6 (let ((*package* (find-package "KEYWORD"))
238     (unix::*filename-encoding* :iso8859-1))
239     (with-open-file (stm "ext-formats:aliases" :if-does-not-exist nil :external-format :iso8859-1)
240 rtoy 1.1 (when stm
241 rtoy 1.6 (do ((alias (read stm nil stm) (read stm nil stm))
242     (value (read stm nil stm) (read stm nil stm)))
243     ((or (eq alias stm) (eq value stm))
244     (unless (eq alias stm)
245     (warn "External-format aliases file ends early.")))
246     (if (and (keywordp alias) (keywordp value))
247     (setf (gethash alias *external-format-aliases*) value)
248     (warn "Bad entry in external-format aliases file: ~S => ~S."
249     alias value)))))))
250 rtoy 1.1
251 rtoy 1.3 (defun %find-external-format (name)
252 rtoy 1.6 ;; avoid loading files, etc., early in the boot sequence
253     (when (or (eq name :iso8859-1)
254     (and (eq name :default) (eq *default-external-format* :iso8859-1)))
255     (return-from %find-external-format
256     (gethash :iso8859-1 *external-formats*)))
257    
258 rtoy 1.1 (when (zerop (hash-table-count *external-format-aliases*))
259     (setf (gethash :latin1 *external-format-aliases*) :iso8859-1)
260     (setf (gethash :latin-1 *external-format-aliases*) :iso8859-1)
261     (setf (gethash :iso-8859-1 *external-format-aliases*) :iso8859-1)
262     (load-external-format-aliases))
263    
264     (do ((tmp (gethash name *external-format-aliases*)
265     (gethash tmp *external-format-aliases*))
266     (cnt 0 (1+ cnt)))
267     ((or (null tmp) (= cnt 50))
268     (unless (null tmp)
269     (error "External-format aliasing depth exceeded.")))
270     (setq name tmp))
271    
272     (or (gethash name *external-formats*)
273 rtoy 1.2 (and (let ((*package* (find-package "STREAM"))
274 rtoy 1.6 (lisp::*enable-package-locked-errors* nil)
275     (*default-external-format* :iso8859-1)
276     (unix::*filename-encoding* :iso8859-1)
277     (s (open (format nil "ext-formats:~(~A~).lisp" name) :if-does-not-exist nil
278     :external-format :iso8859-1)))
279     (when s
280     (null (nth-value 1 (ext:compile-from-stream s)))))
281 rtoy 1.3 (gethash name *external-formats*))))
282    
283     (defun %composed-ef-name (a b)
284     (if (consp a) (append a (list b)) (list a b)))
285    
286 rtoy 1.6 (defun %compose-external-formats (a b)
287 rtoy 1.3 (when (ef-composingp a)
288     (error "~S is a Composing-External-Format." (ef-name a)))
289     (unless (ef-composingp b)
290     (error "~S is not a Composing-External-Format." (ef-name b)))
291 rtoy 1.5 (make-external-format
292     (%composed-ef-name (ef-name a) (ef-name b))
293     (make-efx
294 rtoy 1.6 :octets-to-code (lambda (state input unput)
295     (funcall (ef-octets-to-code b) state
296     (funcall (ef-octets-to-code a)
297     state input unput)
298 rtoy 1.5 unput))
299 rtoy 1.6 :code-to-octets (lambda (code state output)
300     (funcall (ef-code-to-octets b) code state
301 rtoy 1.5 `(lambda (x)
302     ,(funcall (ef-code-to-octets a)
303     'x state output))))
304 rtoy 1.6 :cache (make-array +ef-max+ :initial-element nil)
305     :min (* (ef-min-octets a) (ef-min-octets b))
306     :max (* (ef-max-octets a) (ef-max-octets b)))
307 rtoy 1.5 nil #() '()))
308 rtoy 1.3
309     (defun find-external-format (name &optional (error-p t))
310     (when (external-format-p name)
311     (return-from find-external-format name))
312    
313     (or (if (consp name) (every #'keywordp name) (keywordp name))
314     (error "~S is not a valid external format name." name))
315 rtoy 1.1
316 rtoy 1.3 (when (eq name :default)
317     (setq name *default-external-format*))
318    
319     (when (and (consp name) (not (cdr name)))
320     (setq name (car name)))
321    
322 rtoy 1.6 (flet ((not-found ()
323     (when (equal *default-external-format* name)
324     (setq *default-external-format* :iso8859-1))
325     (if error-p (error "External format ~S not found." name) nil)))
326     (if (consp name)
327     (let ((efs (mapcar #'%find-external-format name)))
328     (if (member nil efs)
329     (not-found)
330     (let ((name (reduce #'%composed-ef-name (mapcar #'ef-name efs))))
331     (or (gethash name *external-formats*)
332     (%intern-ef (reduce #'%compose-external-formats efs))))))
333     (or (%find-external-format name) (not-found)))))
334    
335     (defun flush-external-formats ()
336     (maphash (lambda (name ef)
337     (declare (ignore name))
338     (fill (ef-cache ef) nil))
339     *external-formats*))
340    
341     (defvar *.table-inverse.* (make-hash-table :test 'eq :size 7))
342    
343     (defun invert-table (table)
344     (declare (type (or (simple-array (unsigned-byte 31) *)
345     (simple-array (unsigned-byte 16) *))
346     table)
347     (optimize (speed 3) (space 0) (safety 0) (debug 0)
348     (ext:inhibit-warnings 3)))
349     (or (gethash table *.table-inverse.*)
350     (let* ((mbits (if (= (array-total-size table) 128) 7 8))
351     (lbits (cond ((> (array-total-size table) 256) 3)
352     ((< (array-total-size table) 100) 6)
353     (t 5)))
354     (hvec (make-array (1+ (ash #x110000 (- 0 mbits lbits)))
355     :element-type '(unsigned-byte 16)
356     :initial-element #xFFFF))
357     (mvec (make-array 0 :element-type '(unsigned-byte 16)))
358     (lvec (make-array 0 :element-type '(unsigned-byte 16)))
359     (width (array-dimension table 0))
360     (power (1- (array-rank table)))
361     (base (if (= width 94) 1 0))
362     hx mx lx)
363     (assert (and (< power 2) (<= width 256)))
364     (dotimes (i (array-total-size table))
365     (declare (type (integer 0 (#.array-dimension-limit)) i))
366     (let ((tmp i) (val (row-major-aref table i)) (z 0))
367     (declare (type (integer 0 (#.array-dimension-limit)) tmp)
368     (type (unsigned-byte 16) z))
369     (unless (= val #xFFFE)
370     (when (plusp power)
371     (multiple-value-bind (x y) (floor tmp width)
372     (setq tmp x)
373     (setq z (logior z (ash (the (integer 0 255) (+ y base))
374     (the (integer 0 24)
375     (* 8 power)))))))
376     (setq hx (ash val (- 0 mbits lbits)))
377     (when (= (aref hvec hx) #xFFFF)
378     (setf (aref hvec hx) (length mvec))
379     (let ((tmp (make-array (+ (length mvec) (ash 1 mbits))
380     :element-type '(unsigned-byte 16)
381     :initial-element #xFFFF)))
382     (replace tmp mvec)
383     (setq mvec tmp)))
384     (setq mx (logand (ash val (- lbits)) (lognot (ash -1 mbits))))
385 rtoy 1.7 (when (= (aref mvec (+ (aref hvec hx) mx)) #xFFFF)
386     (setf (aref mvec (+ (aref hvec hx) mx)) (length lvec))
387 rtoy 1.6 (let ((tmp (make-array (+ (length lvec) (ash 1 lbits))
388     :element-type '(unsigned-byte 16)
389     :initial-element #xFFFF)))
390     (replace tmp lvec)
391     (setq lvec tmp)))
392     (setq lx (logand val (lognot (ash -1 lbits))))
393 rtoy 1.7 (setf (aref lvec (+ (aref mvec (+ (aref hvec hx) mx)) lx))
394 rtoy 1.6 (logior z (+ tmp base))))))
395     (setf (gethash table *.table-inverse.*)
396     (lisp::make-ntrie16 :split (logior (ash (1- mbits) 4) (1- lbits))
397     :hvec hvec :mvec mvec :lvec lvec)))))
398    
399     (declaim (inline get-inverse))
400     (defun get-inverse (ntrie code)
401     (declare (type lisp::ntrie16 ntrie) (type lisp:codepoint code))
402     (let ((n (lisp::qref ntrie code)))
403     (and n (let ((m (aref (lisp::ntrie16-lvec ntrie) n)))
404     (if (= m #xFFFF) nil m)))))
405    
406 rtoy 1.1
407     (define-condition void-external-format (error)
408     ()
409     (:report
410     (lambda (condition stream)
411     (declare (ignore condition))
412     (format stream "Attempting I/O through void external-format."))))
413    
414 rtoy 1.6 (define-external-format :void (:size 0) ()
415 rtoy 1.1 (octets-to-code (state input unput)
416 rtoy 1.3 `(error 'void-external-format))
417 rtoy 1.1 (code-to-octets (code state output)
418 rtoy 1.3 `(error 'void-external-format)))
419 rtoy 1.1
420 rtoy 1.6 (define-external-format :iso8859-1 (:size 1) ()
421 rtoy 1.1 (octets-to-code (state input unput)
422 rtoy 1.3 `(values ,input 1))
423 rtoy 1.1 (code-to-octets (code state output)
424 rtoy 1.3 `(,output (if (> ,code 255) #x3F ,code))))
425 rtoy 1.1
426 rtoy 1.6 ;;; OCTETS-TO-CODEPOINT, CODEPOINT-TO-OCTETS -- Semi-Public
427     ;;;
428     ;;; Normally you'd want to use OCTETS-TO-CHAR and CHAR-TO-OCTETS instead of
429     ;;; these, but that limits you to Lisp's idea of a character - either Latin-1
430     ;;; in 8 bit Lisp images, or the Unicode BMP in 16 bit images. If you want
431     ;;; to read or write texts containing characters not supported by your Lisp,
432     ;;; these macros can be used instead.
433 rtoy 1.1 (defmacro octets-to-codepoint (external-format state count input unput)
434 rtoy 1.6 (let ((tmp1 (gensym)) (tmp2 (gensym))
435     (ef (find-external-format external-format)))
436     `(multiple-value-bind (,tmp1 ,tmp2)
437     ,(funcall (ef-octets-to-code ef) state input unput)
438     (setf ,count (the kernel:index ,tmp2))
439     (the (or (unsigned-byte 21) null) ,tmp1))))
440 rtoy 1.1
441     (defmacro codepoint-to-octets (external-format code state output)
442 rtoy 1.6 (let ((ef (find-external-format external-format)))
443     (funcall (ef-code-to-octets ef) code state output)))
444 rtoy 1.3
445    
446    
447     (defvar *ef-base* +ef-max+)
448     (defvar *ef-extensions* '())
449    
450     (defun ensure-cache (ef id reqd)
451     (let ((base (or (getf *ef-extensions* id)
452     (setf (getf *ef-extensions* id)
453     (prog1 *ef-base* (incf *ef-base* reqd))))))
454     (when (< (length (ef-cache ef)) (+ base reqd))
455 rtoy 1.5 (setf (efx-cache (ef-efx ef))
456 rtoy 1.3 (adjust-array (ef-cache ef) (+ base reqd) :initial-element nil)))
457     base))
458    
459 rtoy 1.6 ;;; DEF-EF-MACRO -- Public
460     ;;;
461     ;;;
462 rtoy 1.3 (defmacro def-ef-macro (name (ef id reqd idx) body)
463 rtoy 1.6 (let ((tmp1 (gensym))
464     (tmp2 (gensym))
465     (%name (intern (format nil "%~A" name) (symbol-package name))))
466     `(progn
467     (defun ,%name (,ef)
468     (let* ((,tmp1 (find-external-format ,ef))
469     (,tmp2 ,(if (eq id 'lisp::lisp)
470     idx
471     `(+ (ensure-cache ,tmp1 ',id ,reqd) ,idx))))
472     (funcall (or (aref (ef-cache ,tmp1) ,tmp2)
473     (setf (aref (ef-cache ,tmp1) ,tmp2)
474     (let ((*compile-print* nil)
475     ;; Set default format when we compile so we
476     ;; can see compiler messages. If we don't,
477     ;; we run into a problem that we might be
478     ;; changing the default format while we're
479     ;; compiling, and we don't know how to output
480     ;; the compiler messages.
481     (*default-external-format* :iso8859-1))
482     (compile nil `(lambda (%slots%)
483     (declare (ignorable %slots%))
484     ,,body)))))
485     (ef-slots ,tmp1))))
486     (declaim (inline ,name))
487     (defun ,name (,tmp1)
488     (let ((,tmp2 (load-time-value (cons nil nil))))
489     (when (eq ,tmp1 :default)
490     (setq ,tmp1 *default-external-format*))
491     (if (eq ,tmp1 (car ,tmp2))
492     (cdr ,tmp2)
493     (setf (car ,tmp2) ,tmp1
494     (cdr ,tmp2) (,%name ,tmp1))))))))
495 rtoy 1.1
496    
497    
498 rtoy 1.6 ;;; OCTETS-TO-CHAR, CHAR-TO-OCTETS -- Public
499     ;;;
500     ;;; Read and write one character through an external-format
501     ;;;
502 rtoy 1.1 (defmacro octets-to-char (external-format state count input unput)
503 rtoy 1.6 (let ((s (gensym "STATE-"))
504     (code (gensym "CODE-")))
505     `(let ((,s ,state))
506     (when (null ,s)
507     ;; Need our own state variable to hold our state and the
508     ;; state for the external format.
509     (setq ,s (setf ,state (cons nil nil))))
510     (if (car ,s)
511     ;; Return the trailing surrgate. Must set count to 0 to
512     ;; tell the stream code we didn't consume any octets!
513     (prog1 (the character (car ,s))
514     (setf (car ,s) nil)
515     (setf ,count 0))
516     (let ((,code (octets-to-codepoint ,external-format
517     (cdr ,s) ,count ,input ,unput)))
518     (declare (type (unsigned-byte 31) ,code))
519     (cond ((or (lisp::surrogatep ,code)
520     (> ,code #x10FFFF))
521     #-(and unicode (not unicode-bootstrap)) #\?
522     #+(and unicode (not unicode-bootstrap)) #\U+FFFD)
523     #+unicode
524     ((> ,code #xFFFF)
525     (multiple-value-bind (hi lo)
526     (lisp::surrogates ,code)
527     (setf (car ,state) lo)
528     hi))
529     (t (code-char ,code))))))))
530    
531     ;; This doesn't handle surrogate code units correctly. It just
532     ;; outputs the surrogate value to the external format. External
533     ;; formats almost never allow surrogate code points (except UTF-16).
534 rtoy 1.1 (defmacro char-to-octets (external-format char state output)
535     `(codepoint-to-octets ,external-format (char-code ,char) ,state ,output))
536    
537 rtoy 1.6
538 rtoy 1.3 (def-ef-macro ef-string-to-octets (extfmt lisp::lisp +ef-max+ +ef-so+)
539 rtoy 1.6 `(lambda (string start end buffer &aux (ptr 0) (state nil) (code 0) (c 0) widep)
540 rtoy 1.3 (declare #|(optimize (speed 3) (safety 0) (space 0) (debug 0))|#
541     (type simple-string string)
542 rtoy 1.6 (type kernel:index start end ptr)
543 rtoy 1.3 (type (simple-array (unsigned-byte 8) (*)) buffer)
544 rtoy 1.6 (type lisp:codepoint code c)
545     (type (or null fixnum) widep)
546 rtoy 1.3 (ignorable state))
547 rtoy 1.6 (loop with i of-type kernel:index = start
548     while (< i end)
549     do
550     (multiple-value-bind (c widep)
551     (lisp::codepoint string i end)
552     (incf i (if widep 2 1))
553     (codepoint-to-octets ,extfmt c state
554     (lambda (b)
555     (when (= ptr (length buffer))
556     (setq buffer (adjust-array buffer (* 2 ptr))))
557     (setf (aref buffer (1- (incf ptr))) b)))))))
558 rtoy 1.3
559 rtoy 1.2 (defun string-to-octets (string &key (start 0) end (external-format :default)
560     (buffer nil bufferp))
561 rtoy 1.1 (declare (type string string)
562 rtoy 1.6 (type kernel:index start)
563     (type (or kernel:index null) end)
564 rtoy 1.3 (type (or (simple-array (unsigned-byte 8) (*)) null) buffer))
565 rtoy 1.6 (let* ((buffer (or buffer (make-array (length string)
566 rtoy 1.3 :element-type '(unsigned-byte 8)))))
567 rtoy 1.6 (multiple-value-bind (buffer ptr)
568     (lisp::with-array-data ((string string) (start start) (end end))
569     (funcall (ef-string-to-octets external-format)
570     string start end buffer))
571     (values (if bufferp buffer (lisp::shrink-vector buffer ptr)) ptr))))
572 rtoy 1.3
573     (def-ef-macro ef-octets-to-string (extfmt lisp::lisp +ef-max+ +ef-os+)
574 rtoy 1.6 `(lambda (octets ptr end string &aux (pos -1) (count 0) (state nil) (code 0))
575 rtoy 1.3 (declare #|(optimize (speed 3) (safety 0) (space 0) (debug 0))|#
576     (type (simple-array (unsigned-byte 8) (*)) octets)
577 rtoy 1.6 (type kernel:index end count)
578 rtoy 1.3 (type (integer -1 (#.array-dimension-limit)) ptr pos)
579     (type simple-string string)
580 rtoy 1.6 (type lisp:codepoint code)
581 rtoy 1.3 (ignorable state))
582     (loop until (>= ptr end)
583     do (when (= pos (length string))
584     (setq string (adjust-array string (* 2 pos))))
585 rtoy 1.6 (setf code
586     (octets-to-codepoint ,extfmt state count
587     (aref octets (incf ptr)) ;;@@ EOF??
588     (lambda (n) (decf ptr n))))
589     ;; Convert codepoint to UTF-16 surrogate pairs if needed
590     (multiple-value-bind (high low)
591     (surrogates code)
592     (setf (aref string (incf pos)) high)
593     (when low
594     (setf (aref string (incf pos)) low)))
595 rtoy 1.5 finally (return (values string (1+ pos))))))
596 rtoy 1.1
597 rtoy 1.2 (defun octets-to-string (octets &key (start 0) end (external-format :default)
598     (string nil stringp))
599 rtoy 1.1 (declare (type (simple-array (unsigned-byte 8) (*)) octets)
600 rtoy 1.6 (type kernel:index start)
601     (type (or kernel:index null) end)
602 rtoy 1.3 (type (or simple-string null) string))
603     (multiple-value-bind (string pos)
604 rtoy 1.6 (funcall (ef-octets-to-string external-format)
605 rtoy 1.3 octets (1- start) (1- (or end (length octets)))
606     (or string (make-string (length octets))))
607 rtoy 1.2 (values (if stringp string (lisp::shrink-vector string pos)) pos)))
608 rtoy 1.1
609    
610    
611 rtoy 1.3 (def-ef-macro ef-encode (extfmt lisp::lisp +ef-max+ +ef-en+)
612     `(lambda (string start end result &aux (ptr 0) (state nil))
613     (declare #|(optimize (speed 3) (safety 0) (space 0) (debug 0))|#
614     (type simple-string string)
615 rtoy 1.6 (type kernel:index start end ptr)
616 rtoy 1.3 (type simple-base-string result)
617     (ignorable state))
618 rtoy 1.9 (do ((k start (1+ k)))
619     ((>= k end)
620     (values result ptr))
621     (multiple-value-bind (code widep)
622     (lisp::codepoint string k end)
623     (when widep (incf k))
624     (codepoint-to-octets ,extfmt code state
625     (lambda (b)
626     (when (= ptr (length result))
627     (setq result (adjust-array result (* 2 ptr))))
628     (setf (aref result (1- (incf ptr)))
629     (code-char b))))))))
630 rtoy 1.3
631     (defun string-encode (string external-format &optional (start 0) end)
632 rtoy 1.6 "Encode the given String using External-Format and return a new
633     string. The characters of the new string are the octets of the
634     encoded result, with each octet converted to a character via
635     code-char. This is the inverse to String-Decode"
636     (when (zerop (length string))
637     (return-from string-encode string))
638 rtoy 1.3 (multiple-value-bind (result ptr)
639     (lisp::with-array-data ((string string) (start start) (end end))
640 rtoy 1.6 (funcall (ef-encode external-format) string start end
641 rtoy 1.3 (make-string (length string) :element-type 'base-char)))
642     (lisp::shrink-vector result ptr)))
643    
644     (def-ef-macro ef-decode (extfmt lisp::lisp +ef-max+ +ef-de+)
645     `(lambda (string ptr end result &aux (pos -1) (count 0) (state nil))
646     (declare #|(optimize (speed 3) (safety 0) (space 0) (debug 0))|#
647     (type simple-string string)
648 rtoy 1.6 (type kernel:index end count)
649 rtoy 1.3 (type (integer -1 (#.array-dimension-limit)) ptr pos)
650     (type simple-string result)
651     (ignorable state))
652     (loop until (>= ptr end)
653     ;; increasing size of result shouldn't ever be necessary, unless
654     ;; someone implements an encoding smaller than the source string...
655 rtoy 1.9 do (let ((code (octets-to-codepoint ,extfmt state count
656     ;; note the need to return NIL for EOF
657     (if (= (1+ ptr) (length string))
658     nil
659     (char-code (char string (incf ptr))))
660     (lambda (n) (decf ptr n)))))
661     (multiple-value-bind (hi lo)
662     (lisp::surrogates code)
663     (setf (schar result (incf pos)) hi)
664     (when lo
665     (setf (schar result (incf pos)) lo))))
666 rtoy 1.5 finally (return (values result (1+ pos))))))
667 rtoy 1.3
668     (defun string-decode (string external-format &optional (start 0) end)
669 rtoy 1.6 "Decode String using the given External-Format and return the new
670     string. The input string is treated as if it were an array of
671     octets, where the char-code of each character is the octet. This is
672     the inverse of String-Encode."
673     (when (zerop (length string))
674     (return-from string-decode string))
675 rtoy 1.3 (multiple-value-bind (result pos)
676     (lisp::with-array-data ((string string) (start start) (end end))
677 rtoy 1.6 (funcall (ef-decode external-format)
678 rtoy 1.3 string (1- start) (1- end) (make-string (length string))))
679 rtoy 1.5 (lisp::shrink-vector result pos)))

  ViewVC Help
Powered by ViewVC 1.1.5