/[cmucl]/src/code/extfmts.lisp
ViewVC logotype

Contents of /src/code/extfmts.lisp

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.35.4.4 - (show annotations)
Thu Sep 2 23:47:31 2010 UTC (3 years, 7 months ago) by rtoy
Branch: RELEASE-20B-BRANCH
Changes since 1.35.4.3: +1 -1 lines
Fix yet another bug in the FAST-READ-CHAR-STRING-REFILL.   This shows
up when running the word break test in
i18n/tests/word-break-test.lisp.

extfmts.lisp:
o Return the number of characters that were actually converted instead
  of the position of the starting point of the output string.

stream.lisp:
o In FAST-READ-CHAR-STRING-REFILL, sometimes, we'll only read one
  octet into the octet buffer, and the octet will be the first octet
  of a multi-octet character.  If this happens, we need to try to read
  some more octets in so that the call to FAST-READ-CHAR-STRING-REFILL
  can return a character.  We only retry once.  If this still fails to
  read enough octets to form a character, we're hosed since we don't
  check for this.  (Should we?)

  Need to refactor this code a bit too.
1 ;;; -*- Mode: LISP; Syntax: ANSI-Common-Lisp; Package: STREAM -*-
2 ;;;
3 ;;; **********************************************************************
4 ;;; This code was written by Paul Foley and has been placed in the public
5 ;;; domain.
6 ;;;
7 (ext:file-comment
8 "$Header: /tiger/var/lib/cvsroots/cmucl/src/code/extfmts.lisp,v 1.35.4.4 2010/09/02 23:47:31 rtoy Exp $")
9 ;;;
10 ;;; **********************************************************************
11 ;;;
12 ;;; Implementation of external-formats
13
14 (in-package "STREAM")
15
16 (intl:textdomain "cmucl")
17
18 (export '(string-to-octets octets-to-string *default-external-format*
19 string-encode string-decode set-system-external-format
20 +replacement-character-code+
21 list-all-external-formats
22 describe-external-format))
23
24 (defvar *default-external-format*
25 :iso8859-1
26 "The default external format to use if no other external format is
27 specified")
28
29 (defvar *external-formats*
30 (make-hash-table :test 'equal)
31 "Hash table of all the external formats that have been loaded")
32
33 (defvar *external-format-aliases*
34 (make-hash-table)
35 "Hash table mapping an external format alias to the actual external
36 format implementation")
37
38 (vm::defenum (:prefix "+EF-" :suffix "+" :start 1)
39 str ; string length
40 cin ; input a character
41 cout ; output a character
42 sin ; input string
43 sout ; output string
44 os ; octets to string
45 so ; string to octets
46 en ; encode
47 de ; decode
48 flush ; flush state
49 copy-state ; copy state
50 max)
51
52 ;; Unicode replacement character U+FFFD
53 (defconstant +replacement-character-code+ #xFFFD)
54
55 (define-condition external-format-not-implemented (error)
56 ()
57 (:report
58 (lambda (condition stream)
59 (declare (ignore condition))
60 (format stream (intl:gettext "Attempting unimplemented external-format I/O.")))))
61
62 (define-condition external-format-not-found (error)
63 ((name :reader external-format-not-found-name
64 :initarg :name))
65 (:report
66 (lambda (condition stream)
67 (format stream (intl:gettext "External format ~S not found.")
68 (external-format-not-found-name condition)))))
69
70 (defun %efni (a b c d)
71 (declare (ignore a b c d))
72 (error 'external-format-not-implemented))
73
74 (defstruct efx
75 ;;
76 ;; Function to read a sequence of octets from a stream and convert
77 ;; them a code point.
78 (octets-to-code #'%efni :type function :read-only t)
79 ;;
80 ;; Function to convert a codepoint to a sequence of octets and write
81 ;; them to an output stream.
82 (code-to-octets #'%efni :type function :read-only t)
83 ;;
84 ;; Function (or NIL) to force any state in the external format to be
85 ;; flushed to the output stream. A NIL value means the external
86 ;; format does not need to do anything special.
87 (flush-state nil :type (or null function) :read-only t)
88 ;;
89 ;; Function to copy the state of the external-format. If NIL, then
90 ;; there is no state to be copied.
91 (copy-state nil :type (or null function) :read-only t)
92 (cache nil :type (or null simple-vector))
93 ;;
94 ;; Minimum number of octets needed to form a codepoint
95 (min 1 :type kernel:index :read-only t)
96 ;;
97 ;; Maximum number of octets needed to form a codepoint.
98 (max 1 :type kernel:index :read-only t)
99 ;;
100 ;; Documentation for this external format
101 #+nil(documentation nil :type (or null string) :read-only t))
102
103 (defstruct (external-format
104 (:conc-name ef-)
105 (:print-function %print-external-format)
106 (:constructor make-external-format (name efx composingp documentation
107 &optional slots slotd)))
108 (name (ext:required-argument) :type (or keyword cons) :read-only t)
109 (efx (ext:required-argument) :type efx :read-only t)
110 (composingp (ext:required-argument) :type boolean :read-only t)
111 (slots #() :type simple-vector :read-only t)
112 (slotd nil :type list :read-only t)
113 (documentation nil :type (or null string) :read-only t))
114
115 (defun %print-external-format (ef stream depth)
116 (declare (ignore depth))
117 (print-unreadable-object (ef stream :type t :identity t)
118 (princ (ef-name ef) stream)))
119
120 (defun %intern-ef (ef)
121 (setf (gethash (ef-name ef) *external-formats*) ef))
122
123 (declaim (inline ef-octets-to-code ef-code-to-octets ef-flush-state ef-copy-state
124 ef-cache ef-min-octets ef-max-octets))
125
126 (defun ef-octets-to-code (ef)
127 (efx-octets-to-code (ef-efx ef)))
128
129 (defun ef-code-to-octets (ef)
130 (efx-code-to-octets (ef-efx ef)))
131
132 (defun ef-flush-state (ef)
133 (efx-flush-state (ef-efx ef)))
134
135 (defun ef-copy-state (ef)
136 (efx-copy-state (ef-efx ef)))
137
138 (defun ef-cache (ef)
139 (efx-cache (ef-efx ef)))
140
141 (defun ef-min-octets (ef)
142 (efx-min (ef-efx ef)))
143
144 (defun ef-max-octets (ef)
145 (efx-max (ef-efx ef)))
146
147 (eval-when (:compile-toplevel :load-toplevel :execute)
148 (defun %merge-slots (old new)
149 (let* ((pos (length old))
150 (tmp (mapcar (lambda (x)
151 (let* ((name (if (consp x) (first x) x))
152 (init (if (consp x) (second x) nil))
153 (list (if (consp x) (nthcdr 2 x) nil))
154 (prev (assoc name old))
155 (posn (if prev (second prev) (1- (incf pos)))))
156 (list name posn init (getf list :type t))))
157 new)))
158 (delete-duplicates (stable-sort (append old tmp) #'< :key #'second)
159 :key #'second))))
160
161 ;;; DEFINE-EXTERNAL-FORMAT -- Public
162 ;;;
163 ;;; name (&key base min max size documentation) (&rest slots) octets-to-code
164 ;;; code-to-octets flush-state copy-state
165 ;;;
166 ;;; Define a new external format. If base is specified, then an
167 ;;; external format is defined that is based on a previously defined
168 ;;; external format named Base. The slot names used in Slots must
169 ;;; match those defined in the Base format.
170 ;;;
171 ;;; If Base is not specified, a new external format is defined.
172 ;;; Min/Max/Size are the minimum and maximum number of octets that
173 ;;; make up a character (:size N is just shorthand for :min N :max
174 ;;; N). Slots is a list of slot descriptions similar to defstruct.
175 ;;;
176 ;;; In both cases, Documentation is a string that documents the
177 ;;; external format.
178 ;;;
179 ;;; octets-to-code (state input unput error &rest vars)
180 ;;; Defines a form to be used by the external format to convert
181 ;;; octets to a code point. State is a form that can be used by the
182 ;;; body to access the state of the stream. Input is a form that
183 ;;; can be used to read one octet from the input stream. (It can be
184 ;;; called as many times as needed.) Similarly, Unput is a form to
185 ;;; put back one octet to the input stream. Error is an error
186 ;;; handler. The default is NIL to indicate that the code should do
187 ;;; its default handling. Otherwise, it should be a function or
188 ;;; symbol to indicate how errors are handled. Vars is a list of
189 ;;; vars that need to be defined for any symbols used within the
190 ;;; form.
191 ;;;
192 ;;; The error handler is a function of 3 arguments: a format message
193 ;;; string, the offending octet (or NIL) and the number of octets
194 ;;; read for this encoding. If the function returns, it should
195 ;;; return the codepoint to be used in place of the erroneous
196 ;;; sequence.
197 ;;;
198 ;;; This should return two values: the code and the number of octets
199 ;;; read to form the code.
200 ;;;
201 ;;; code-to-octets (code state output error &rest vars)
202 ;;; Defines a form to be used by the external format to convert a
203 ;;; code point to octets for output. Code is the code point to be
204 ;;; converted. State is a form to access the current value of the
205 ;;; stream's state variable. Output is a form that writes one octet
206 ;;; to the output stream. Error is the error handler. A value of
207 ;;; NIL means the external format should use its default method.
208 ;;; Otherwise, it should be a symbol or function that will e called
209 ;;; to handle the error.
210 ;;;
211 ;;; The error function takes 2 arguments: a format message string
212 ;;; and the offending codepoint. If the function returns, it should
213 ;;; be the desired replacement codepoint.
214 ;;;
215 ;;; flush-state (state output error &rest vars)
216 ;;; Defines a form to be used by the external format to flush out
217 ;;; any state when an output stream is closed. Similar to
218 ;;; CODE-TO-OCTETS, but there is no code. Error is similar to the
219 ;;; error parameter for code-to-octets.
220 ;;;
221 ;;; copy-state (state &rest vars)
222 ;;; Defines a form to copy any state needed by the external format.
223 ;;; This should probably be a deep copy so that if the original
224 ;;; state is modified, the copy is not.
225 ;;;
226 ;;; Note: external-formats work on code-points, not
227 ;;; characters, so that the entire 31 bit ISO-10646 range can be
228 ;;; used internally regardless of the size of a character recognized
229 ;;; by Lisp and external formats can be useful to people who want to
230 ;;; process characters outside the Lisp range (see
231 ;;; CODEPOINT-TO-OCTETS, OCTETS-TO-CODEPOINT)
232 ;;;
233 (defmacro define-external-format (name (&key base min max size (documentation ""))
234 (&rest slots)
235 &optional octets-to-code code-to-octets
236 flush-state copy-state)
237 (let* ((tmp (gensym))
238 (min (or min size 1))
239 (max (or max size 6))
240 (base (when base
241 (find-external-format base)))
242 (bslotd (if base (ef-slotd base) nil))
243 (slotd (%merge-slots bslotd slots))
244 (slotb (loop for slot in slotd
245 collect `(,(first slot)
246 `(the ,',(fourth slot)
247 ;; IDENTITY is here to protect against SETF
248 (identity (svref %slots% ,',(second slot))))))))
249 (when documentation
250 (intl::note-translatable intl::*default-domain* documentation))
251 `(macrolet ((octets-to-code ((state input unput error &rest vars) body)
252 `(lambda (,state ,input ,unput ,error)
253 (declare (ignorable ,state ,input ,unput ,error)
254 (optimize (ext:inhibit-warnings 3)))
255 (let (,@',slotb
256 (,input `(the (or (unsigned-byte 8) null) ,,input))
257 ,@(loop for var in vars collect `(,var (gensym))))
258 ,body)))
259 (code-to-octets ((code state output error &rest vars) body)
260 `(lambda (,',tmp ,state ,output ,error)
261 (declare (ignorable ,state ,output ,error)
262 (optimize (ext:inhibit-warnings 3)))
263 (let (,@',slotb
264 (,code ',code)
265 ,@(loop for var in vars collect `(,var (gensym))))
266 `(let ((,',code (the lisp:codepoint ,,',tmp)))
267 (declare (ignorable ,',code))
268 ,,body))))
269 (flush-state ((state output error &rest vars) body)
270 `(lambda (,state ,output ,error)
271 (declare (ignorable ,state ,output ,error))
272 (let (,@',slotb
273 ,@(loop for var in vars collect `(,var (gensym))))
274 ,body)))
275 (copy-state ((state &rest vars) body)
276 `(lambda (,state)
277 (declare (ignorable ,state))
278 (let (,@',slotb
279 ,@(loop for var in vars collect `(,var (gensym))))
280 ,body))))
281 (%intern-ef (make-external-format ,name
282 ,(if base
283 `(ef-efx (find-external-format ,(ef-name base)))
284 `(make-efx :octets-to-code ,octets-to-code
285 :code-to-octets ,code-to-octets
286 :flush-state ,flush-state
287 :copy-state ,copy-state
288 :cache (make-array +ef-max+
289 :initial-element nil)
290 :min ,(min min max)
291 :max ,(max min max)))
292 nil
293 ,documentation
294 (let* ,(loop for x in slotd
295 collect (list (first x) (third x)))
296 (vector ,@(mapcar #'first slotd)))
297 ',slotd)))))
298
299 ;;; DEFINE-COMPOSING-EXTERNAL-FORMAT -- Public
300 ;;;
301 ;;; A composing-external-format differs from an (ordinary) external-format
302 ;;; in that it translates characters (really codepoints, of course) into
303 ;;; other characters, rather than translating between characters and binary
304 ;;; octets. They have to be composed with a non-composing external-format
305 ;;; to be of any use.
306 ;;;
307 ;;;
308 ;;; name (&key min max size documentation) input output
309 ;;; Defines a new composing external format. The parameters Min,
310 ;;; Max, Size, and Documentation are the same as for defining an
311 ;;; external format. The parameters input and output are forms to
312 ;;; handle input and output.
313 ;;;
314 ;;; input (state input unput &rest vars)
315 ;;; Defines a form to be used by the composing external format when
316 ;;; reading input to transform a codepoint (or sequence of
317 ;;; codepoints) to another. State is a form that can be used by the
318 ;;; body to access the state of the external format. Input is a
319 ;;; form that can be used to read one code point from the input
320 ;;; stream. (Input returns two values, the codepoint and the number
321 ;;; of octets read.) It may be called as many times as needed.
322 ;;; This returns two values: the codepoint of the character (or NIL)
323 ;;; and the number of octets read. Similarly, Unput is a form to
324 ;;; put back one octet to the input stream. Vars is a list of vars
325 ;;; that need to be defined for any symbols used within the form.
326 ;;;
327 ;;; This should return two values: the code and the number of octets
328 ;;; read to form the code.
329 ;;;
330 ;;; output (code state output &rest vars)
331 ;;; Defines a form to be used by the composing external format to
332 ;;; convert a code point to octets for output. Code is the code
333 ;;; point to be converted. State is a form to access the current
334 ;;; value of the stream's state variable. Output is a form that
335 ;;; writes one octet to the output stream.
336
337 (defmacro define-composing-external-format (name (&key min max size documentation)
338 input output)
339 (let ((tmp (gensym))
340 (min (or min size 1))
341 (max (or max size 1)))
342 `(macrolet ((input ((state input unput &rest vars) body)
343 `(lambda (,state ,input ,unput)
344 (declare (ignorable ,state ,input ,unput)
345 (optimize (ext:inhibit-warnings 3)))
346 (let ((,input `(the (values (or lisp:codepoint null)
347 kernel:index)
348 ,,input))
349 ,@(loop for var in vars collect `(,var (gensym))))
350 ,body)))
351 (output ((code state output &rest vars) body)
352 `(lambda (,',tmp ,state ,output)
353 (declare (ignorable ,state ,output)
354 (optimize (ext:inhibit-warnings 3)))
355 (let ((,code ',code)
356 ,@(loop for var in vars collect `(,var (gensym))))
357 `(let ((,',code (the lisp:codepoint ,,',tmp)))
358 (declare (ignorable ,',code))
359 ,,body)))))
360 (%intern-ef (make-external-format ,name
361 (make-efx :octets-to-code ,input
362 :code-to-octets ,output
363 :min ,(min min max) :max ,(max min max))
364 t
365 ,documentation
366 #() '())))))
367
368 (defun load-external-format-aliases ()
369 (let ((*package* (find-package "KEYWORD"))
370 (unix::*filename-encoding* :iso8859-1))
371 (with-open-file (stm "ext-formats:aliases" :if-does-not-exist nil
372 :external-format :iso8859-1)
373 (when stm
374 (do ((alias (read stm nil stm) (read stm nil stm))
375 (value (read stm nil stm) (read stm nil stm)))
376 ((or (eq alias stm) (eq value stm))
377 (unless (eq alias stm)
378 (warn (intl:gettext "External-format aliases file ends early."))))
379 (if (and (keywordp alias) (or (keywordp value)
380 (and (consp value)
381 (every #'keywordp value))))
382 (setf (gethash alias *external-format-aliases*) value)
383 (warn (intl:gettext "Bad entry in external-format aliases file: ~S => ~S.")
384 alias value)))))))
385
386 (defun list-all-external-formats ()
387 "List the available external formats. A list is returned where each
388 element is list of the external format and a list of aliases for the
389 format. No distinction is made between external formats and
390 composing external formats."
391 ;; Look for all lisp files in the ext-formats directory. These are
392 ;; the available formats.
393 (let ((ef (make-hash-table))
394 result)
395 (map nil #'(lambda (p)
396 (setf (gethash (intern (string-upcase (pathname-name p)) :keyword) ef)
397 nil))
398 (directory "ext-formats:*.lisp"))
399
400 ;; Look through aliases and update formats with a list of aliases.
401 (load-external-format-aliases)
402 (maphash #'(lambda (k v)
403 (push k (gethash v ef)))
404 *external-format-aliases*)
405
406 (maphash #'(lambda (k v)
407 (push (if v
408 (list k v)
409 (list k))
410 result))
411 ef)
412 (sort result #'string< :key #'first)))
413
414 (defun describe-external-format (external-format)
415 "Print a description of the given External-Format. This may cause
416 the external format to be loaded (silently), if it is not already
417 loaded."
418 (when (zerop (hash-table-count
419 *external-format-aliases*))
420 (load-external-format-aliases))
421 (let ((alias (gethash external-format *external-format-aliases*)))
422 (cond (alias
423 (format t (intl:gettext "~&~S is an alias for the external format ~S.~2%")
424 external-format alias))
425 ((and (listp external-format)
426 (> (length external-format) 1))
427 ;; Some kind of composed external format
428 (format t (intl:gettext "~&~S is a composed external format.~2%") external-format))
429 (t
430 (let ((ef (handler-case (let ((*compile-print* nil)
431 (ext:*compile-progress* nil)
432 (*compile-verbose* nil))
433 ;; Should we be this silent when
434 ;; loading the external format?
435 ;; We aren't when the normally
436 ;; loading the format.
437 (find-external-format external-format))
438 (external-format-not-found ()
439 (format *error-output*
440 (intl:gettext "~&Could not find external format ~S~%")
441 external-format)))))
442 (when ef
443 (let (aliases)
444 ;; Find any aliases for this external format. Doesn't need to be efficient.
445 (maphash #'(lambda (k v)
446 (when (eq v external-format)
447 (push k aliases)))
448 *external-format-aliases*)
449 (format t (intl:gettext "~S~:[~; - [Aliases: ~{~S~^, ~}~]]~%")
450 external-format aliases aliases))
451 (when (ef-composingp ef)
452 (format t (intl:gettext "~&~S is a composing external format.~2%")
453 external-format))
454 (format t "~&~A~%"
455 (intl:gettext (or (ef-documentation ef) "")))))))))
456
457 (defun %find-external-format (name)
458 ;; avoid loading files, etc., early in the boot sequence
459 (when (or (eq name :iso8859-1)
460 (and (eq name :default) (eq *default-external-format* :iso8859-1)))
461 (return-from %find-external-format
462 (gethash :iso8859-1 *external-formats*)))
463
464 (when (zerop (hash-table-count *external-format-aliases*))
465 (setf (gethash :latin1 *external-format-aliases*) :iso8859-1)
466 (setf (gethash :latin-1 *external-format-aliases*) :iso8859-1)
467 (setf (gethash :iso-8859-1 *external-format-aliases*) :iso8859-1)
468 (load-external-format-aliases))
469
470 (do ((tmp (gethash name *external-format-aliases*)
471 (gethash tmp *external-format-aliases*))
472 (cnt 0 (1+ cnt)))
473 ((or (null tmp) (= cnt 50))
474 (unless (null tmp)
475 (error (intl:gettext "External-format aliasing depth exceeded."))))
476 (setq name tmp))
477
478 (or (gethash name *external-formats*)
479 (and (consp name) (find-external-format name))
480 (and (with-standard-io-syntax
481 ;; Use standard IO syntax so that changes by the user
482 ;; don't mess up compiling the external format.
483 (let ((*package* (find-package "STREAM"))
484 (lisp::*enable-package-locked-errors* nil)
485 (s (open (format nil "ext-formats:~(~A~).lisp" name)
486 :if-does-not-exist nil :external-format :iso8859-1)))
487 (when s
488 (null (nth-value 1 (ext:compile-from-stream s))))))
489 (gethash name *external-formats*))))
490
491 (defun %composed-ef-name (a b)
492 (if (consp a) (append a (list b)) (list a b)))
493
494 (defun %compose-external-formats (a b)
495 (when (ef-composingp a)
496 (error (intl:gettext "~S is a Composing-External-Format.") (ef-name a)))
497 (unless (ef-composingp b)
498 (error (intl:gettext "~S is not a Composing-External-Format.") (ef-name b)))
499 (make-external-format
500 (%composed-ef-name (ef-name a) (ef-name b))
501 (make-efx
502 :octets-to-code (lambda (state input unput error)
503 (let ((nstate (gensym "STATE-")))
504 `(let ((,nstate ,state))
505 (when (null ,nstate)
506 (setq ,nstate (setf ,state (cons nil nil))))
507 ,(funcall (ef-octets-to-code b) `(car ,nstate)
508 (funcall (ef-octets-to-code a)
509 `(cdr ,nstate) input unput error)
510 unput))))
511 :code-to-octets (lambda (code state output error)
512 (let ((nstate (gensym "STATE-")))
513 `(let ((,nstate ,state))
514 (when (null ,nstate)
515 (setq ,nstate (setf ,state (cons nil nil))))
516 ,(funcall (ef-code-to-octets b) code `(car ,nstate)
517 `(lambda (x)
518 ,(funcall (ef-code-to-octets a)
519 'x `(cdr ,nstate) output error))))))
520 :cache (make-array +ef-max+ :initial-element nil)
521 :min (* (ef-min-octets a) (ef-min-octets b))
522 :max (* (ef-max-octets a) (ef-max-octets b)))
523 nil
524 nil #() '()))
525
526 (defun find-external-format (name &optional (error-p t))
527 (when (external-format-p name)
528 (return-from find-external-format name))
529
530 (or (if (consp name) (every #'keywordp name) (keywordp name))
531 (error (intl:gettext "~S is not a valid external format name.") name))
532
533 (when (eq name :default)
534 (setq name *default-external-format*))
535
536 (when (and (consp name) (not (cdr name)))
537 (setq name (car name)))
538
539 (flet ((not-found ()
540 (when (equal *default-external-format* name)
541 (setq *default-external-format* :iso8859-1))
542 (if error-p (error 'external-format-not-found :name name) nil)))
543 (if (consp name)
544 (let ((efs (mapcar #'%find-external-format name)))
545 (if (member nil efs)
546 (not-found)
547 (let ((name (reduce #'%composed-ef-name (mapcar #'ef-name efs))))
548 (or (gethash name *external-formats*)
549 (%intern-ef (reduce #'%compose-external-formats efs))))))
550 (or (%find-external-format name) (not-found)))))
551
552 (defun flush-external-formats ()
553 (maphash (lambda (name ef)
554 (declare (ignore name))
555 (fill (ef-cache ef) nil))
556 *external-formats*))
557
558 (defvar *.table-inverse.* (make-hash-table :test 'eq :size 7))
559
560 (defun invert-table (table)
561 (declare (type (or (simple-array (unsigned-byte 31) *)
562 (simple-array (unsigned-byte 16) *))
563 table)
564 (optimize (speed 3) (space 0) (safety 0) (debug 0)
565 (ext:inhibit-warnings 3)))
566 (or (gethash table *.table-inverse.*)
567 (let* ((mbits (if (= (array-total-size table) 128) 7 8))
568 (lbits (cond ((> (array-total-size table) 256) 3)
569 ((< (array-total-size table) 100) 6)
570 (t 5)))
571 (hvec (make-array (1+ (ash #x110000 (- 0 mbits lbits)))
572 :element-type '(unsigned-byte 16)
573 :initial-element #xFFFF))
574 (mvec (make-array 0 :element-type '(unsigned-byte 16)))
575 (lvec (make-array 0 :element-type '(unsigned-byte 16)))
576 (width (array-dimension table 0))
577 (power (1- (array-rank table)))
578 (base (if (= width 94) 1 0))
579 hx mx lx)
580 (assert (and (< power 2) (<= width 256)))
581 (dotimes (i (array-total-size table))
582 (declare (type (integer 0 (#.array-dimension-limit)) i))
583 (let ((tmp i) (val (row-major-aref table i)) (z 0))
584 (declare (type (integer 0 (#.array-dimension-limit)) tmp)
585 (type (unsigned-byte 16) z))
586 (unless (= val #xFFFE)
587 (when (plusp power)
588 (multiple-value-bind (x y) (floor tmp width)
589 (setq tmp x)
590 (setq z (logior z (ash (the (integer 0 255) (+ y base))
591 (the (integer 0 24)
592 (* 8 power)))))))
593 (setq hx (ash val (- 0 mbits lbits)))
594 (when (= (aref hvec hx) #xFFFF)
595 (setf (aref hvec hx) (length mvec))
596 (let ((tmp (make-array (+ (length mvec) (ash 1 mbits))
597 :element-type '(unsigned-byte 16)
598 :initial-element #xFFFF)))
599 (replace tmp mvec)
600 (setq mvec tmp)))
601 (setq mx (logand (ash val (- lbits)) (lognot (ash -1 mbits))))
602 (when (= (aref mvec (+ (aref hvec hx) mx)) #xFFFF)
603 (setf (aref mvec (+ (aref hvec hx) mx)) (length lvec))
604 (let ((tmp (make-array (+ (length lvec) (ash 1 lbits))
605 :element-type '(unsigned-byte 16)
606 :initial-element #xFFFF)))
607 (replace tmp lvec)
608 (setq lvec tmp)))
609 (setq lx (logand val (lognot (ash -1 lbits))))
610 (setf (aref lvec (+ (aref mvec (+ (aref hvec hx) mx)) lx))
611 (logior z (+ tmp base))))))
612 (setf (gethash table *.table-inverse.*)
613 (lisp::make-ntrie16 :split (logior (ash (1- mbits) 4) (1- lbits))
614 :hvec hvec :mvec mvec :lvec lvec)))))
615
616 (declaim (inline get-inverse))
617 (defun get-inverse (ntrie code)
618 (declare (type lisp::ntrie16 ntrie) (type (integer 0 #x10FFFF) code))
619 (let ((n (lisp::qref ntrie code)))
620 (and n (let ((m (aref (lisp::ntrie16-lvec ntrie) n)))
621 (if (= m #xFFFF) nil m)))))
622
623
624 (define-condition void-external-format (error)
625 ()
626 (:report
627 (lambda (condition stream)
628 (declare (ignore condition))
629 (format stream (intl:gettext "Attempting I/O through void external-format.")))))
630
631 (define-external-format :void (:size 0 :documentation
632 "Void external format that signals an error on any input or output.")
633 ()
634 (octets-to-code (state input unput error)
635 `(error 'void-external-format))
636 (code-to-octets (code state output error)
637 `(error 'void-external-format)))
638
639 (define-external-format :iso8859-1 (:size 1 :documentation
640 "ISO8859-1 is an 8-bit character encoding generally intended for
641 Western European languages including English, German, Italian,
642 Norwegian, Portuguese, Spanish, Swedish and many others.
643
644 By default, illegal inputs are replaced by the Unicode replacement
645 character and illegal outputs are replaced by a question mark.")
646 ()
647 (octets-to-code (state input unput error)
648 `(values ,input 1))
649 (code-to-octets (code state output error)
650 `(,output (if (> ,code 255)
651 (if ,error
652 (locally
653 ;; No warnings about fdefinition
654 (declare (optimize (ext:inhibit-warnings 3)))
655 (funcall ,error
656 (intl:gettext "Cannot output codepoint #x~X to ISO8859-1 stream")
657 ,code 1))
658 #x3F)
659 ,code))))
660
661 ;;; OCTETS-TO-CODEPOINT, CODEPOINT-TO-OCTETS -- Semi-Public
662 ;;;
663 ;;; Normally you'd want to use OCTETS-TO-CHAR and CHAR-TO-OCTETS instead of
664 ;;; these, but that limits you to Lisp's idea of a character - either Latin-1
665 ;;; in 8 bit Lisp images, or the Unicode BMP in 16 bit images. If you want
666 ;;; to read or write texts containing characters not supported by your Lisp,
667 ;;; these macros can be used instead.
668 (defmacro octets-to-codepoint (external-format state count input unput &optional error)
669 (let ((tmp1 (gensym)) (tmp2 (gensym))
670 (ef (find-external-format external-format)))
671 `(multiple-value-bind (,tmp1 ,tmp2)
672 ,(funcall (ef-octets-to-code ef) state input unput error)
673 (setf ,count (the kernel:index ,tmp2))
674 (the (or lisp:codepoint null) ,tmp1))))
675
676 (defmacro codepoint-to-octets (external-format code state output &optional error)
677 (let ((ef (find-external-format external-format)))
678 (funcall (ef-code-to-octets ef) code state output error)))
679
680
681
682 (defvar *ef-base* +ef-max+)
683 (defvar *ef-extensions* '())
684
685 (defun ensure-cache (ef id reqd)
686 (let ((base (or (getf *ef-extensions* id)
687 (setf (getf *ef-extensions* id)
688 (prog1 *ef-base* (incf *ef-base* reqd))))))
689 (when (< (length (ef-cache ef)) (+ base reqd))
690 (setf (efx-cache (ef-efx ef))
691 (adjust-array (ef-cache ef) (+ base reqd) :initial-element nil)))
692 base))
693
694 ;;; DEF-EF-MACRO -- Public
695 ;;;
696 ;;;
697 (defmacro def-ef-macro (name (ef id reqd idx) body)
698 (let* ((tmp1 (gensym))
699 (tmp2 (gensym))
700 (blknm (nth-value 1 (lisp::valid-function-name-p name)))
701 (%name (intern (format nil "%~A" name) #|(symbol-package blknm)|#)))
702 `(progn
703 (defun ,%name (,ef)
704 (let* ((,tmp1 (find-external-format ,ef))
705 (,tmp2 ,(if (eq id 'lisp::lisp)
706 idx
707 `(+ (ensure-cache ,tmp1 ',id ,reqd) ,idx))))
708 (funcall (or (aref (ef-cache ,tmp1) ,tmp2)
709 (setf (aref (ef-cache ,tmp1) ,tmp2)
710 (let ((*compile-print* nil)
711 ;; Set default format when we compile so we
712 ;; can see compiler messages. If we don't,
713 ;; we run into a problem that we might be
714 ;; changing the default format while we're
715 ;; compiling, and we don't know how to output
716 ;; the compiler messages.
717 #|(*default-external-format* :iso8859-1)|#)
718 (compile nil `(lambda (%slots%)
719 (declare (ignorable %slots%))
720 (block ,',blknm
721 ,,body))))))
722 (ef-slots ,tmp1))))
723 (declaim (inline ,name))
724 (defun ,name (,tmp1)
725 (let ((,tmp2 (load-time-value (cons nil nil))))
726 (when (eq ,tmp1 :default)
727 (setq ,tmp1 *default-external-format*))
728 (if (eq ,tmp1 (car ,tmp2))
729 (cdr ,tmp2)
730 (setf (car ,tmp2) ,tmp1
731 (cdr ,tmp2) (,%name ,tmp1))))))))
732
733
734
735 ;;; OCTETS-TO-CHAR, CHAR-TO-OCTETS -- Public
736 ;;;
737 ;;; Read and write one character through an external-format
738 ;;;
739 (defmacro octets-to-char (external-format state count input unput &optional error)
740 (let ((nstate (gensym)))
741 `(let ((,nstate ,state))
742 (when (null ,nstate) (setq ,nstate (setf ,state (cons nil nil))))
743 (if (car ,nstate)
744 ;; Return the trailing surrgate. Must set count to 0 to
745 ;; tell the stream code we didn't consume any octets!
746 (prog1 (the character (car ,nstate))
747 (setf (car ,nstate) nil ,count 0))
748 (let ((code (octets-to-codepoint ,external-format
749 (cdr ,nstate) ,count ,input ,unput ,error)))
750 (declare (type lisp:codepoint code))
751 ;;@@ on non-Unicode builds, limit to 8-bit chars
752 ;;@@ if unicode-bootstrap, can't use #\u+fffd
753 (cond ((or (lisp::surrogatep code) (>= code lisp:codepoint-limit))
754 ;; Surrogate characters (that weren't combined
755 ;; into a codepoint by octets-to-codepoint) are
756 ;; illegal. So are codepoints that are too large.
757 (if ,error
758 (if (lisp::surrogatep code)
759 (locally
760 (declare (optimize (ext:inhibit-warnings 3)))
761 (funcall ,error
762 (format nil (intl:gettext "Surrogate codepoint #x~~4,'0X is illegal for ~A")
763 ,external-format)
764 code nil))
765 (locally
766 (declare (optimize (ext:inhibit-warnings 3)))
767 (funcall ,error (intl:gettext "Illegal codepoint on input: #x~X") code nil)))
768 #-(and unicode (not unicode-bootstrap)) #\?
769 #+(and unicode (not unicode-bootstrap)) #\U+FFFD))
770 #+unicode
771 ((> code #xFFFF)
772 (multiple-value-bind (hi lo) (surrogates code)
773 (setf (car ,nstate) lo)
774 hi))
775 (t (code-char code))))))))
776
777 (defmacro char-to-octets (external-format char state output &optional error)
778 (let ((nchar (gensym))
779 (nstate (gensym))
780 (wryte (gensym))
781 (ch (gensym)))
782 `(let ((,nchar ,char)
783 (,nstate ,state))
784 (when (null ,nstate) (setq ,nstate (setf ,state (cons nil nil))))
785 (if (lisp::surrogatep (char-code ,nchar) :high)
786 (setf (car ,nstate) ,nchar)
787 (flet ((,wryte (,ch)
788 (codepoint-to-octets ,external-format ,ch (cdr ,nstate)
789 ,output ,error)))
790 (declare (dynamic-extent #',wryte))
791 (if (car ,nstate)
792 (prog1
793 ;; Invalid surrogate sequences get replaced with
794 ;; the replacement character.
795 (,wryte (if (lisp::surrogatep (char-code ,nchar) :low)
796 (surrogates-to-codepoint (car ,nstate) ,nchar)
797 (if ,error
798 (locally
799 (declare (optimize (ext:inhibit-warnings 3)))
800 (funcall ,error
801 (intl:gettext "Cannot convert invalid surrogate #x~X to character")
802 ,nchar))
803 +replacement-character-code+)))
804 (setf (car ,nstate) nil))
805 ;; A lone trailing (low) surrogate gets replaced with
806 ;; the replacement character.
807 (,wryte (if (lisp::surrogatep (char-code ,nchar) :low)
808 (if ,error
809 (locally
810 (declare (optimize (ext:inhibit-warnings 3)))
811 (funcall ,error
812 (intl:gettext "Cannot convert lone trailing surrogate #x~X to character")
813 ,nchar))
814 +replacement-character-code+)
815 (char-code ,nchar)))))))))
816
817 (defmacro flush-state (external-format state output &optional error)
818 (let* ((ef (find-external-format external-format))
819 (f (ef-flush-state ef)))
820 (when f
821 (funcall f state output error))))
822
823 (defmacro copy-state (external-format state)
824 (let* ((ef (find-external-format external-format))
825 (f (ef-copy-state ef)))
826 (when f
827 (funcall f state))))
828
829 (def-ef-macro ef-string-to-octets (extfmt lisp::lisp +ef-max+ +ef-so+)
830 `(lambda (string start end buffer error &aux (ptr 0) (state nil))
831 (declare #|(optimize (speed 3) (safety 0) (space 0) (debug 0))|#
832 (type simple-string string)
833 (type kernel:index start end ptr)
834 (type (simple-array (unsigned-byte 8) (*)) buffer)
835 (ignorable state))
836 (dotimes (i (- end start) (values buffer ptr))
837 (declare (type kernel:index i))
838 (char-to-octets ,extfmt (schar string (+ start i)) state
839 (lambda (b)
840 (when (= ptr (length buffer))
841 (setq buffer (adjust-array buffer (* 2 ptr))))
842 (setf (aref buffer (1- (incf ptr))) b))
843 error))))
844
845 (defun string-to-octets (string &key (start 0) end (external-format :default)
846 (buffer nil bufferp)
847 error)
848 "Convert String to octets using the specified External-format. The
849 string is bounded by Start (defaulting to 0) and End (defaulting to
850 the end of the string. If Buffer is given, the octets are stored
851 there. If not, a new buffer is created."
852 (declare (type string string)
853 (type kernel:index start)
854 (type (or kernel:index null) end)
855 (type (or (simple-array (unsigned-byte 8) (*)) null) buffer))
856 (let* ((buffer (or buffer (make-array (length string)
857 :element-type '(unsigned-byte 8)))))
858 (multiple-value-bind (buffer ptr)
859 (lisp::with-array-data ((string string) (start start) (end end))
860 (funcall (ef-string-to-octets external-format)
861 string start end buffer error))
862 (values (if bufferp buffer (lisp::shrink-vector buffer ptr)) ptr))))
863
864 (def-ef-macro ef-octets-to-string (extfmt lisp::lisp +ef-max+ +ef-os+)
865 `(lambda (octets ptr end state string s-start s-end error
866 &aux (pos s-start) (count 0) (last-octet 0))
867 (declare (optimize (speed 3) (safety 0) #|(space 0) (debug 0)|#)
868 (type (simple-array (unsigned-byte 8) (*)) octets)
869 (type kernel:index pos end count last-octet s-start s-end)
870 (type (integer -1 (#.array-dimension-limit)) ptr)
871 (type simple-string string)
872 (ignorable state))
873 (catch 'end-of-octets
874 (loop while (< pos s-end)
875 do (setf (schar string pos)
876 (octets-to-char ,extfmt state count
877 (if (>= ptr end)
878 (throw 'end-of-octets nil)
879 (aref octets (incf ptr)))
880 (lambda (n) (decf ptr n))
881 error))
882 (incf pos)
883 (incf last-octet count)))
884 (values string pos last-octet state)))
885
886 (defun octets-to-string (octets &key (start 0) end (external-format :default)
887 (string nil stringp)
888 (s-start 0) (s-end nil s-end-p)
889 (state nil)
890 error)
891 "Octets-to-string converts an array of octets in Octets to a string
892 according to the specified External-format. The array of octets is
893 bounded by Start (defaulting ot 0) and End (defaulting to the end of
894 the array. If String is not given, a new string is created. If
895 String is given, the converted octets are stored in String, starting
896 at S-Start (defaulting to the 0) and ending at S-End (defaulting to
897 the length of String). If the string is not large enough to hold
898 all of characters, then some octets will not be converted. A State
899 may also be specified; this is used as the state of the external
900 format.
901
902 Four values are returned: the string, the number of characters read,
903 the number of octets actually consumed and the new state of the
904 external format."
905 (declare (type (simple-array (unsigned-byte 8) (*)) octets)
906 (type kernel:index start s-start)
907 (type (or kernel:index null) end)
908 (type (or simple-string null) string))
909 (let ((s-end (if s-end-p
910 s-end
911 (if stringp
912 (length string)
913 (length octets)))))
914 (multiple-value-bind (string pos last-octet new-state)
915 (funcall (ef-octets-to-string external-format)
916 octets (1- start) (1- (or end (length octets)))
917 state
918 (or string (make-string (length octets)))
919 s-start s-end
920 error)
921 (values (if stringp string (lisp::shrink-vector string pos)) pos last-octet new-state))))
922
923
924 (def-ef-macro ef-octets-to-string-counted (extfmt lisp::lisp +ef-max+ +ef-os+)
925 `(lambda (octets ptr end state ocount string s-start s-end error
926 &aux (pos s-start) (last-octet 0))
927 (declare (optimize (speed 3) (safety 0) #|(space 0) (debug 0)|#)
928 (type (simple-array (unsigned-byte 8) (*)) octets ocount)
929 (type kernel:index pos end last-octet s-start s-end)
930 (type (integer -1 (#.array-dimension-limit)) ptr)
931 (type simple-string string)
932 (ignorable state))
933 (catch 'end-of-octets
934 (loop for k of-type fixnum from 0
935 while (< pos s-end)
936 do (setf (schar string pos)
937 (octets-to-char ,extfmt state (aref ocount k)
938 (if (>= ptr end)
939 (throw 'end-of-octets nil)
940 (aref octets (incf ptr)))
941 (lambda (n) (decf ptr n))
942 error))
943 (incf pos)
944 (incf last-octet (aref ocount k))))
945 (values string pos last-octet state)))
946
947 ;; Like OCTETS-TO-STRING, but we take an extra argument which is an
948 ;; array which will contain the number of octets read for each
949 ;; character placed in the output string.
950 (defun octets-to-string-counted (octets ocount
951 &key (start 0) end (external-format :default)
952 (string nil stringp)
953 (s-start 0) (s-end nil s-end-p)
954 (state nil)
955 error)
956 "Octets-to-string converts an array of octets in Octets to a string
957 according to the specified External-format. The array of octets is
958 bounded by Start (defaulting ot 0) and End (defaulting to the end of
959 the array. If String is not given, a new string is created. If
960 String is given, the converted octets are stored in String, starting
961 at S-Start (defaulting to the 0) and ending at S-End (defaulting to
962 the length of String). If the string is not large enough to hold
963 all of characters, then some octets will not be converted. A State
964 may also be specified; this is used as the state of the external
965 format.
966
967 In Ocount, the number of octets read for each character in the
968 string is saved
969
970 Four values are returned: the string, the number of characters read,
971 the number of octets actually consumed and the new state of the
972 external format."
973 (declare (type (simple-array (unsigned-byte 8) (*)) octets ocount)
974 (type kernel:index start s-start)
975 (type (or kernel:index null) end)
976 (type (or simple-string null) string))
977 (let ((s-end (if s-end-p
978 s-end
979 (if stringp
980 (length string)
981 (length octets)))))
982 (multiple-value-bind (string pos last-octet new-state)
983 (funcall (ef-octets-to-string-counted external-format)
984 octets (1- start) (1- (or end (length octets)))
985 state
986 ocount
987 (or string (make-string (length octets)))
988 s-start s-end
989 error)
990 (values (if stringp string (lisp::shrink-vector string pos)) (- pos s-start) last-octet new-state))))
991
992
993
994 (def-ef-macro ef-encode (extfmt lisp::lisp +ef-max+ +ef-en+)
995 `(lambda (string start end result error &aux (ptr 0) (state nil))
996 (declare #|(optimize (speed 3) (safety 0) (space 0) (debug 0))|#
997 (type simple-string string)
998 (type kernel:index start end ptr)
999 (type simple-base-string result)
1000 (ignorable state))
1001 (dotimes (i (- end start) (values result ptr))
1002 (declare (type kernel:index i))
1003 (char-to-octets ,extfmt (schar string (+ start i)) state
1004 (lambda (b)
1005 (when (= ptr (length result))
1006 (setq result (adjust-array result (* 2 ptr))))
1007 (setf (aref result (1- (incf ptr)))
1008 (code-char b)))
1009 error))))
1010
1011 (defun string-encode (string external-format &optional (start 0) end error)
1012 "Encode the given String using External-Format and return a new
1013 string. The characters of the new string are the octets of the
1014 encoded result, with each octet converted to a character via
1015 code-char. This is the inverse to String-Decode"
1016 (when (zerop (length string))
1017 (return-from string-encode string))
1018 (multiple-value-bind (result ptr)
1019 (lisp::with-array-data ((string string) (start start) (end end))
1020 (funcall (ef-encode external-format) string start end
1021 (make-string (length string) :element-type 'base-char)
1022 error))
1023 (lisp::shrink-vector result ptr)))
1024
1025 (def-ef-macro ef-decode (extfmt lisp::lisp +ef-max+ +ef-de+)
1026 `(lambda (string ptr end result error &aux (pos -1) (count 0) (state nil))
1027 (declare #|(optimize (speed 3) (safety 0) (space 0) (debug 0))|#
1028 (type simple-string string)
1029 (type kernel:index end count)
1030 (type (integer -1 (#.array-dimension-limit)) ptr pos)
1031 (type simple-string result)
1032 (ignorable state))
1033 (loop until (>= ptr end)
1034 ;; increasing size of result shouldn't ever be necessary, unless
1035 ;; someone implements an encoding smaller than the source string...
1036 do (setf (schar result (incf pos))
1037 (octets-to-char ,extfmt state count
1038 ;; note the need to return NIL for EOF
1039 (if (= (1+ ptr) (length string))
1040 nil
1041 (char-code (char string (incf ptr))))
1042 (lambda (n) (decf ptr n))
1043 error))
1044 finally (return (values result (1+ pos))))))
1045
1046 (defun string-decode (string external-format &optional (start 0) end error)
1047 "Decode String using the given External-Format and return the new
1048 string. The input string is treated as if it were an array of
1049 octets, where the char-code of each character is the octet. This is
1050 the inverse of String-Encode."
1051 (when (zerop (length string))
1052 (return-from string-decode string))
1053 (multiple-value-bind (result pos)
1054 (lisp::with-array-data ((string string) (start start) (end end))
1055 (funcall (ef-decode external-format)
1056 string (1- start) (1- end) (make-string (length string))
1057 error))
1058 (lisp::shrink-vector result pos)))
1059
1060
1061 (defun set-system-external-format (terminal &optional filenames)
1062 "Change the external format of the standard streams to Terminal.
1063 The standard streams are sys::*stdin*, sys::*stdout*, and
1064 sys::*stderr*, which are normally the input and/or output streams
1065 for *standard-input* and *standard-output*. Also sets sys::*tty*
1066 (normally *terminal-io* to the given external format. If the
1067 optional argument Filenames is gvien, then the filename encoding is
1068 set to the specified format."
1069 (unless (find-external-format terminal)
1070 (error (intl:gettext "Can't find external-format ~S.") terminal))
1071 (setf (stream-external-format sys:*stdin*) terminal
1072 (stream-external-format sys:*stdout*) terminal
1073 (stream-external-format sys:*stderr*) terminal)
1074 (when (lisp::fd-stream-p sys:*tty*)
1075 (setf (stream-external-format sys:*tty*) terminal))
1076 (when filenames
1077 (unless (find-external-format filenames)
1078 (error (intl:gettext "Can't find external-format ~S.") filenames))
1079 (when (and unix::*filename-encoding*
1080 (not (eq unix::*filename-encoding* filenames)))
1081 (cerror (intl:gettext "Change it anyway.")
1082 (intl:gettext "The external-format for encoding filenames is already set."))
1083 (setq unix::*filename-encoding* filenames)))
1084 t)
1085
1086
1087 ;; Despite its name, this doesn't actually compile anything at all. What it
1088 ;; does is expand into a lambda expression that can be compiled by the file
1089 ;; compiler.
1090 (defmacro precompile-ef-slot (ef slot)
1091 (let* ((ef (find-external-format ef)))
1092 ;; if there's no lambda expression available, flush it and regenerate
1093 (unless (and (aref (ef-cache ef) slot)
1094 (function-lambda-expression (aref (ef-cache ef) slot)))
1095 (setf (aref (ef-cache ef) slot) nil)
1096 (ecase slot
1097 (#.+ef-cin+ (lisp::%ef-cin ef))
1098 (#.+ef-cout+ (lisp::%ef-cout ef))
1099 (#.+ef-sout+ (lisp::%ef-sout ef))
1100 (#.+ef-os+ (%ef-octets-to-string ef))
1101 (#.+ef-so+ (%ef-string-to-octets ef))
1102 (#.+ef-en+ (%ef-encode ef))
1103 (#.+ef-de+ (%ef-decode ef))))
1104 `(setf (aref (ef-cache (find-external-format ,(ef-name ef))) ,slot)
1105 ,(subst (ef-name ef) ef
1106 (function-lambda-expression (aref (ef-cache ef) slot))))))

  ViewVC Help
Powered by ViewVC 1.1.5