/[cmucl]/src/code/string.lisp
ViewVC logotype

Contents of /src/code/string.lisp

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.15 - (show annotations)
Tue Jun 16 21:25:02 2009 UTC (4 years, 10 months ago) by rtoy
Branch: MAIN
CVS Tags: portable-clx-import-2009-06-16, portable-clx-base, snapshot-2009-07
Branch point for: portable-clx-branch
Changes since 1.14: +11 -1 lines
Cleanups for non-unicode build.

code/stream.lisp:
o Only define (setf stream-external-format) for Unicode builds.
o In stream-external-format, don't try to look up the external format
  from the fd-stream structure, which doesn't exist in non-unicode
  builds.

code/strings.lisp:
o Conditionalize out things that will only work if unicode is
  available.

tools/worldcom.lisp:
o Only compile fd-stream-extfmt for unicode builds.
1 ;;; -*- Log: code.log; Package: Lisp -*-
2 ;;;
3 ;;; **********************************************************************
4 ;;; This code was written as part of the CMU Common Lisp project at
5 ;;; Carnegie Mellon University, and has been placed in the public domain.
6 ;;;
7 (ext:file-comment
8 "$Header: /tiger/var/lib/cvsroots/cmucl/src/code/string.lisp,v 1.15 2009/06/16 21:25:02 rtoy Exp $")
9 ;;;
10 ;;; **********************************************************************
11 ;;;
12 ;;; Functions to implement strings for CMU Common Lisp
13 ;;; Written by David Dill
14 ;;; Rewritten by Skef Wholey, Bill Chiles and Rob MacLachlan.
15 ;;;
16 ;;; ****************************************************************
17 ;;;
18 (in-package "LISP")
19 (export '(char schar glyph sglyph string
20 string= string-equal string< string> string<= string>= string/=
21 string-lessp string-greaterp string-not-lessp string-not-greaterp
22 string-not-equal
23 string-to-nfc
24 make-string
25 string-trim string-left-trim string-right-trim
26 string-upcase
27 string-downcase string-capitalize nstring-upcase nstring-downcase
28 nstring-capitalize))
29
30 #+unicode
31 (export '(string-to-nfd string-to-nfkd string-to-nfkc))
32
33 (declaim (inline surrogatep surrogates-to-codepoint codepoint surrogates))
34
35 (defun surrogatep (c &optional surrogate-type)
36 "Test if C is a surrogate. C may be either an integer or a
37 character. Surrogate-type indicates what kind of surrogate to test
38 for. :High means to test for the high (leading) surrogate; :Low
39 tests for the low (trailing surrogate). A value of :Any or Nil
40 tests for any surrogate value (high or low)."
41 (declare (type (or character codepoint) c))
42 (let ((code (if (characterp c)
43 (char-code c)
44 c)))
45 (ecase surrogate-type
46 ((:high :leading)
47 ;; Test for high surrogate
48 (<= #xD800 code #xDBFF))
49 ((:low :trailing)
50 ;; Test for low surrogate
51 (<= #xDC00 code #xDFFF))
52 ((:any nil)
53 ;; Test for any surrogate
54 (<= #xD800 code #xDFFF)))))
55
56 (defun surrogates-to-codepoint (hi lo)
57 "Convert the given Hi and Lo surrogate characters to the
58 corresponding codepoint value"
59 (declare (type character hi lo))
60 (+ (ash (- (the (integer #xD800 #xDBFF) (char-code hi)) #xD800) 10)
61 (the (integer #xDC00 #xDFFF) (char-code lo)) #x2400))
62
63 (defun codepoint (string i &optional (end (length string)))
64 "Return the codepoint value from String at position I. If that
65 position is a surrogate, it is combined with either the previous or
66 following character (when possible) to compute the codepoint. The
67 second return value is NIL if the position is not a surrogate pair.
68 Otherwise +1 or -1 is returned if the position is the high or low
69 surrogate value, respectively."
70 (declare (type simple-string string) (type kernel:index i end))
71 (let ((code (char-code (schar string i))))
72 (cond ((and (surrogatep code :high) (< (1+ i) end))
73 (let ((tmp (char-code (schar string (1+ i)))))
74 (if (surrogatep tmp :low)
75 (values (+ (ash (- code #xD800) 10) tmp #x2400) +1)
76 (values code nil))))
77 ((and (surrogatep code :low) (> i 0))
78 (let ((tmp (char-code (schar string (1- i)))))
79 (if (surrogatep tmp :high)
80 (values (+ (ash (- tmp #xD800) 10) code #x2400) -1)
81 (values code nil))))
82 (t (values code nil)))))
83
84 (defun surrogates (codepoint)
85 "Return the high and low surrogate characters for Codepoint. If
86 Codepoint is in the BMP, the first return value is the corresponding
87 character and the second is NIL."
88 (declare (type codepoint codepoint))
89 (if (< codepoint #x10000)
90 (values (code-char codepoint) nil)
91 (let* ((tmp (- codepoint #x10000))
92 (hi (logior (ldb (byte 10 10) tmp) #xD800))
93 (lo (logior (ldb (byte 10 0) tmp) #xDC00)))
94 (values (code-char hi) (code-char lo)))))
95
96 (defun (setf codepoint) (codepoint string i)
97 "Set the codepoint at string position I to the Codepoint. If the
98 codepoint requires a surrogate pair, the high (leading surrogate) is
99 stored at position I and the low (trailing) surrogate is stored at
100 I+1"
101 (declare (type codepoint codepoint)
102 (type simple-string string))
103 (let ((widep nil))
104 (multiple-value-bind (hi lo)
105 (surrogates codepoint)
106 (setf (aref string i) hi)
107 (when lo
108 (setf (aref string (1+ i)) lo)
109 (setf widep t)))
110 (values codepoint widep)))
111
112 #+unicode
113 (defun utf16-string-p (string)
114 "Check if String is a valid UTF-16 string. If the string is valid,
115 T is returned. If the string is not valid, NIL is returned, and the
116 second value is the index into the string of the invalid character.
117 A string is also invalid if it contains any unassigned codepoints."
118 (do ((len (length string))
119 (index 0 (1+ index)))
120 ((>= index len)
121 t)
122 (multiple-value-bind (codepoint wide)
123 (codepoint string index)
124 ;; We step through the string in order. If there are any
125 ;; surrogates pairs, we must reach the lead surrogate first,
126 ;; which means WIDE is +1. Otherwise, we have an invalid
127 ;; surrogate pair. If we get any codepoint that is in the
128 ;; surrogate range, we also have an invalid string. An
129 ;; unassigned codepoint is also considered invalid.
130 (when (or (eq wide -1)
131 (surrogatep codepoint)
132 (lisp::unicode-assigned-codepoint-p codepoint))
133 (return-from utf16-string-p (values nil index)))
134 (when wide (incf index)))))
135
136 (defun string (X)
137 "Coerces X into a string. If X is a string, X is returned. If X is a
138 symbol, X's pname is returned. If X is a character then a one element
139 string containing that character is returned. If X cannot be coerced
140 into a string, an error occurs."
141 (cond ((stringp x) x)
142 ((symbolp x) (symbol-name x))
143 ((characterp x)
144 (let ((res (make-string 1)))
145 (setf (schar res 0) x) res))
146 (t
147 (error 'simple-type-error
148 :datum x
149 :expected-type '(or string symbol character)
150 :format-control "~S cannot be coerced to a string."
151 :format-arguments (list x)))))
152
153 ;;; With-One-String is used to set up some string hacking things. The keywords
154 ;;; are parsed, and the string is hacked into a simple-string.
155
156 (eval-when (compile)
157
158 (defmacro with-one-string (string start end cum-offset &rest forms)
159 `(let ((,string (if (stringp ,string) ,string (string ,string))))
160 ;; Optimizer may prove STRING is one.
161 (declare (optimize (ext:inhibit-warnings 3)))
162 (with-array-data ((,string ,string :offset-var ,cum-offset)
163 (,start ,start)
164 (,end (or ,end (length (the vector ,string)))))
165 ,@forms)))
166
167 )
168
169 ;;; With-String is like With-One-String, but doesn't parse keywords.
170
171 (eval-when (compile)
172
173 (defmacro with-string (string &rest forms)
174 `(let ((,string (if (stringp ,string) ,string (string ,string))))
175 (with-array-data ((,string ,string)
176 (start)
177 (end (length (the vector ,string))))
178 ,@forms)))
179
180 )
181
182 ;;; With-Two-Strings is used to set up string comparison operations. The
183 ;;; keywords are parsed, and the strings are hacked into simple-strings.
184
185 (eval-when (compile)
186
187 (defmacro with-two-strings (string1 string2 start1 end1 cum-offset-1
188 start2 end2 &rest forms)
189 `(let ((,string1 (if (stringp ,string1) ,string1 (string ,string1)))
190 (,string2 (if (stringp ,string2) ,string2 (string ,string2))))
191 (with-array-data ((,string1 ,string1 :offset-var ,cum-offset-1)
192 (,start1 ,start1)
193 (,end1 (or ,end1 (length (the vector ,string1)))))
194 (with-array-data ((,string2 ,string2)
195 (,start2 ,start2)
196 (,end2 (or ,end2 (length (the vector ,string2)))))
197 ,@forms))))
198
199 )
200
201
202 (defun char (string index)
203 "Given a string and a non-negative integer index less than the length of
204 the string, returns the character object representing the character at
205 that position in the string."
206 (declare (optimize (safety 1)))
207 (char string index))
208
209 (defun %charset (string index new-el)
210 (declare (optimize (safety 1)))
211 (setf (char string index) new-el))
212
213 (defun schar (string index)
214 "SCHAR returns the character object at an indexed position in a string
215 just as CHAR does, except the string must be a simple-string."
216 (declare (optimize (safety 1)))
217 (schar string index))
218
219 (defun %scharset (string index new-el)
220 (declare (optimize (safety 1)))
221 (setf (schar string index) new-el))
222
223 (defun string=* (string1 string2 start1 end1 start2 end2)
224 (with-two-strings string1 string2 start1 end1 offset1 start2 end2
225 (not (%sp-string-compare string1 start1 end1 string2 start2 end2))))
226
227
228 (defun string/=* (string1 string2 start1 end1 start2 end2)
229 (with-two-strings string1 string2 start1 end1 offset1 start2 end2
230 (let ((comparison (%sp-string-compare string1 start1 end1
231 string2 start2 end2)))
232 (if comparison (- (the fixnum comparison) offset1)))))
233
234 (eval-when (compile eval)
235
236 ;;; Lessp is true if the desired expansion is for string<* or string<=*.
237 ;;; Equalp is true if the desired expansion is for string<=* or string>=*.
238 (defmacro string<>=*-body (lessp equalp)
239 (let ((offset1 (gensym)))
240 `(with-two-strings string1 string2 start1 end1 ,offset1 start2 end2
241 (let ((index (%sp-string-compare string1 start1 end1
242 string2 start2 end2)))
243 (if index
244 (cond ((= (the fixnum index) (the fixnum end1))
245 ,(if lessp
246 `(- (the fixnum index) ,offset1)
247 `nil))
248 ((= (+ (the fixnum index) (- start2 start1))
249 (the fixnum end2))
250 ,(if lessp
251 `nil
252 `(- (the fixnum index) ,offset1)))
253 #-unicode
254 ((,(if lessp 'char< 'char>)
255 (schar string1 index)
256 (schar string2 (+ (the fixnum index) (- start2 start1))))
257 (- (the fixnum index) ,offset1))
258 #-unicode
259 (t nil)
260 #+unicode
261 (t
262 ;; Compare in code point order. See
263 ;; http://icu-project.org/docs/papers/utf16_code_point_order.html
264 (flet ((fixup (code)
265 (if (>= code #xe000)
266 (- code #x800)
267 (+ code #x2000))))
268 (declare (inline fixup))
269 (let* ((c1 (char-code (schar string1 index)))
270 (c2 (char-code (schar string2
271 (+ (the fixnum index)
272 (- start2 start1))))))
273 (cond ((and (>= c1 #xd800)
274 (>= c2 #xd800))
275 (let ((fix-c1 (fixup c1))
276 (fix-c2 (fixup c2)))
277 (if (,(if lessp '< '>) fix-c1 fix-c2)
278 (- (the fixnum index) ,offset1)
279 nil)))
280 (t
281 (if (,(if lessp '< '>) c1 c2)
282 (- (the fixnum index) ,offset1)
283 nil)))))))
284 ,(if equalp `(- (the fixnum end1) ,offset1) 'nil))))))
285 ) ; eval-when
286
287 (defun string<* (string1 string2 start1 end1 start2 end2)
288 (declare (fixnum start1 start2))
289 (string<>=*-body t nil))
290
291 (defun string>* (string1 string2 start1 end1 start2 end2)
292 (declare (fixnum start1 start2))
293 (string<>=*-body nil nil))
294
295 (defun string<=* (string1 string2 start1 end1 start2 end2)
296 (declare (fixnum start1 start2))
297 (string<>=*-body t t))
298
299 (defun string>=* (string1 string2 start1 end1 start2 end2)
300 (declare (fixnum start1 start2))
301 (string<>=*-body nil t))
302
303
304
305 (defun string< (string1 string2 &key (start1 0) end1 (start2 0) end2)
306 "Given two strings, if the first string is lexicographically less than
307 the second string, returns the longest common prefix (using char=)
308 of the two strings. Otherwise, returns ()."
309 (string<* string1 string2 start1 end1 start2 end2))
310
311 (defun string> (string1 string2 &key (start1 0) end1 (start2 0) end2)
312 "Given two strings, if the first string is lexicographically greater than
313 the second string, returns the longest common prefix (using char=)
314 of the two strings. Otherwise, returns ()."
315 (string>* string1 string2 start1 end1 start2 end2))
316
317
318 (defun string<= (string1 string2 &key (start1 0) end1 (start2 0) end2)
319 "Given two strings, if the first string is lexicographically less than
320 or equal to the second string, returns the longest common prefix
321 (using char=) of the two strings. Otherwise, returns ()."
322 (string<=* string1 string2 start1 end1 start2 end2))
323
324 (defun string>= (string1 string2 &key (start1 0) end1 (start2 0) end2)
325 "Given two strings, if the first string is lexicographically greater
326 than or equal to the second string, returns the longest common prefix
327 (using char=) of the two strings. Otherwise, returns ()."
328 (string>=* string1 string2 start1 end1 start2 end2))
329
330 (defun string= (string1 string2 &key (start1 0) end1 (start2 0) end2)
331 "Given two strings (string1 and string2), and optional integers start1,
332 start2, end1 and end2, compares characters in string1 to characters in
333 string2 (using char=)."
334 (string=* string1 string2 start1 end1 start2 end2))
335
336 (defun string/= (string1 string2 &key (start1 0) end1 (start2 0) end2)
337 "Given two strings, if the first string is not lexicographically equal
338 to the second string, returns the longest common prefix (using char=)
339 of the two strings. Otherwise, returns ()."
340 (string/=* string1 string2 start1 end1 start2 end2))
341
342
343 (eval-when (compile eval)
344
345 ;;; STRING-NOT-EQUAL-LOOP is used to generate character comparison loops for
346 ;;; STRING-EQUAL and STRING-NOT-EQUAL.
347 (defmacro string-not-equal-loop (end end-value
348 &optional (abort-value nil abortp))
349 (declare (fixnum end))
350 (let ((end-test (if (= end 1)
351 `(= index1 (the fixnum end1))
352 `(= index2 (the fixnum end2)))))
353 `(do ((index1 start1 (1+ index1))
354 (index2 start2 (1+ index2)))
355 (,(if abortp
356 end-test
357 `(or ,end-test
358 (not (char-equal (schar string1 index1)
359 (schar string2 index2)))))
360 ,end-value)
361 (declare (fixnum index1 index2))
362 ,@(if abortp
363 `((if (not (char-equal (schar string1 index1)
364 (schar string2 index2)))
365 (return ,abort-value)))))))
366 ) ; eval-when
367
368 #+unicode
369 (defun string-case-fold (string &key (start 0) end (casing :simple))
370 "Return a new string with the case folded according to Casing as follows:
371
372 :SIMPLE Unicode simple case folding (preserving length)
373 :FULL Unicode full case folding (possibly changing length)
374
375 Default Casing is :SIMPLE."
376 (ecase casing
377 (:simple
378 (with-output-to-string (s)
379 (with-one-string string start end offset
380 (do ((index offset (1+ index)))
381 ((>= index end))
382 (multiple-value-bind (code widep)
383 (codepoint string index)
384 (when widep (incf index))
385 (multiple-value-bind (hi lo)
386 (surrogates (unicode-case-fold-simple code))
387 (write-char hi s)
388 (when lo (write-char lo s))))))))
389 (:full
390 (with-output-to-string (s)
391 (with-one-string string start end offset
392 (do ((index offset (1+ index)))
393 ((>= index end))
394 (multiple-value-bind (code widep)
395 (codepoint string index)
396 (when widep (incf index))
397 (write-string (unicode-case-fold-full code) s))))))))
398
399 (defun string-equal (string1 string2 &key (start1 0) end1 (start2 0) end2)
400 "Given two strings (string1 and string2), and optional integers start1,
401 start2, end1 and end2, compares characters in string1 to characters in
402 string2 (using char-equal)."
403 (declare (fixnum start1 start2))
404 (with-two-strings string1 string2 start1 end1 offset1 start2 end2
405 (let ((slen1 (- (the fixnum end1) start1))
406 (slen2 (- (the fixnum end2) start2)))
407 (declare (fixnum slen1 slen2))
408 (if (or (minusp slen1) (minusp slen2))
409 ;;prevent endless looping later.
410 (error "Improper bounds for string comparison."))
411 (if (= slen1 slen2)
412 ;;return () immediately if lengths aren't equal.
413 (string-not-equal-loop 1 t nil)))))
414
415 (defun string-not-equal (string1 string2 &key (start1 0) end1 (start2 0) end2)
416 "Given two strings, if the first string is not lexicographically equal
417 to the second string, returns the longest common prefix (using char-equal)
418 of the two strings. Otherwise, returns ()."
419 (with-two-strings string1 string2 start1 end1 offset1 start2 end2
420 (let ((slen1 (- end1 start1))
421 (slen2 (- end2 start2)))
422 (declare (fixnum slen1 slen2))
423 (if (or (minusp slen1) (minusp slen2))
424 ;;prevent endless looping later.
425 (error "Improper bounds for string comparison."))
426 (cond ((or (minusp slen1) (or (minusp slen2)))
427 (error "Improper substring for comparison."))
428 ((= slen1 slen2)
429 (string-not-equal-loop 1 nil (- index1 offset1)))
430 ((< slen1 slen2)
431 (string-not-equal-loop 1 (- index1 offset1)))
432 (t
433 (string-not-equal-loop 2 (- index1 offset1)))))))
434
435
436
437 (eval-when (compile eval)
438
439 ;;; STRING-LESS-GREATER-EQUAL-TESTS returns a test on the lengths of string1
440 ;;; and string2 and a test on the current characters from string1 and string2
441 ;;; for the following macro.
442 (defun string-less-greater-equal-tests (lessp equalp)
443 (if lessp
444 (if equalp
445 ;; STRING-NOT-GREATERP
446 (values '<=
447 #-unicode `(not (char-greaterp char1 char2))
448 #+unicode `(<= char1 char2))
449 ;; STRING-LESSP
450 (values '<
451 #-unicode `(char-lessp char1 char2)
452 #+unicode `(< char1 char2)))
453 (if equalp
454 ;; STRING-NOT-LESSP
455 (values '>=
456 #-unicode `(not (char-lessp char1 char2))
457 #+unicode `(>= char1 char2))
458 ;; STRING-GREATERP
459 (values '>
460 #-unicode `(char-greaterp char1 char2)
461 #+unicode `(> char1 char2)))))
462
463 #-unicode
464 (defmacro string-less-greater-equal (lessp equalp)
465 (multiple-value-bind (length-test character-test)
466 (string-less-greater-equal-tests lessp equalp)
467 `(with-two-strings string1 string2 start1 end1 offset1 start2 end2
468 (let ((slen1 (- (the fixnum end1) start1))
469 (slen2 (- (the fixnum end2) start2)))
470 (declare (fixnum slen1 slen2))
471 (if (or (minusp slen1) (minusp slen2))
472 ;;prevent endless looping later.
473 (error "Improper bounds for string comparison."))
474 (do ((index1 start1 (1+ index1))
475 (index2 start2 (1+ index2))
476 (char1)
477 (char2))
478 ((or (= index1 (the fixnum end1)) (= index2 (the fixnum end2)))
479 (if (,length-test slen1 slen2) (- index1 offset1)))
480 (declare (fixnum index1 index2))
481 (setq char1 (schar string1 index1))
482 (setq char2 (schar string2 index2))
483 (if (not (char-equal char1 char2))
484 (if ,character-test
485 (return (- index1 offset1))
486 (return ()))))))))
487
488 ;; Convert to lowercase for case folding, to match what Unicode
489 ;; CaseFolding.txt says. An example where this matters: U+1E9E maps
490 ;; to U+00DF. But the uppercase version of U+00DF is U+00DF.
491 #+unicode
492 (defmacro equal-char-codepoint (codepoint)
493 `(let ((ch ,codepoint))
494 ;; Handle ASCII separately for bootstrapping and for unidata missing.
495 (if (< 64 ch 91)
496 (+ ch 32)
497 #-(and unicode (not unicode-bootstrap))
498 ch
499 #+(and unicode (not unicode-bootstrap))
500 (if (> ch 127) (unicode-lower ch) ch))))
501
502 #+unicode
503 (defmacro string-less-greater-equal (lessp equalp)
504 (multiple-value-bind (length-test character-test)
505 (string-less-greater-equal-tests lessp equalp)
506 `(with-two-strings string1 string2 start1 end1 offset1 start2 end2
507 (let ((slen1 (- (the fixnum end1) start1))
508 (slen2 (- (the fixnum end2) start2)))
509 (declare (fixnum slen1 slen2))
510 (if (or (minusp slen1) (minusp slen2))
511 ;;prevent endless looping later.
512 (error "Improper bounds for string comparison."))
513 (do ((index1 start1 (1+ index1))
514 (index2 start2 (1+ index2)))
515 ((or (= index1 (the fixnum end1)) (= index2 (the fixnum end2)))
516 (if (,length-test slen1 slen2) (- index1 offset1)))
517 (declare (fixnum index1 index2))
518 (multiple-value-bind (char1 wide1)
519 (codepoint string1 index1)
520 (declare (type codepoint char1))
521 (multiple-value-bind (char2 wide2)
522 (codepoint string2 index2)
523 (declare (type codepoint char2))
524 (setf char1 (equal-char-codepoint char1))
525 (setf char2 (equal-char-codepoint char2))
526 (if (= char1 char2)
527 (progn
528 (when wide1 (incf index1))
529 (when wide2 (incf index2)))
530 (if ,character-test
531 (return (- index1 offset1))
532 (return ()))))))))))
533
534 ) ; eval-when
535
536 (defun string-lessp* (string1 string2 start1 end1 start2 end2)
537 (declare (fixnum start1 start2))
538 (string-less-greater-equal t nil))
539
540 (defun string-greaterp* (string1 string2 start1 end1 start2 end2)
541 (declare (fixnum start1 start2))
542 (string-less-greater-equal nil nil))
543
544 (defun string-not-lessp* (string1 string2 start1 end1 start2 end2)
545 (declare (fixnum start1 start2))
546 (string-less-greater-equal nil t))
547
548 (defun string-not-greaterp* (string1 string2 start1 end1 start2 end2)
549 (declare (fixnum start1 start2))
550 (string-less-greater-equal t t))
551
552 (defun string-lessp (string1 string2 &key (start1 0) end1 (start2 0) end2)
553 "Given two strings, if the first string is lexicographically less than
554 the second string, returns the longest common prefix (using char-equal)
555 of the two strings. Otherwise, returns ()."
556 (string-lessp* string1 string2 start1 end1 start2 end2))
557
558 (defun string-greaterp (string1 string2 &key (start1 0) end1 (start2 0) end2)
559 "Given two strings, if the first string is lexicographically greater than
560 the second string, returns the longest common prefix (using char-equal)
561 of the two strings. Otherwise, returns ()."
562 (string-greaterp* string1 string2 start1 end1 start2 end2))
563
564 (defun string-not-lessp (string1 string2 &key (start1 0) end1 (start2 0) end2)
565 "Given two strings, if the first string is lexicographically greater
566 than or equal to the second string, returns the longest common prefix
567 (using char-equal) of the two strings. Otherwise, returns ()."
568 (string-not-lessp* string1 string2 start1 end1 start2 end2))
569
570 (defun string-not-greaterp (string1 string2 &key (start1 0) end1 (start2 0)
571 end2)
572 "Given two strings, if the first string is lexicographically less than
573 or equal to the second string, returns the longest common prefix
574 (using char-equal) of the two strings. Otherwise, returns ()."
575 (string-not-greaterp* string1 string2 start1 end1 start2 end2))
576
577
578 (defun make-string (count &key element-type ((:initial-element fill-char)))
579 "Given a character count and an optional fill character, makes and returns
580 a new string Count long filled with the fill character."
581 (declare (type fixnum count))
582 (assert (subtypep element-type 'character))
583 (if fill-char
584 (do ((i 0 (1+ i))
585 (string (make-string count)))
586 ((= i count) string)
587 (declare (fixnum i))
588 (setf (schar string i) fill-char))
589 (make-string count)))
590
591 (defun string-upcase-simple (string &key (start 0) end)
592 (declare (fixnum start))
593 (let* ((string (if (stringp string) string (string string)))
594 (slen (length string)))
595 (declare (fixnum slen))
596 (with-one-string string start end offset
597 (let ((offset-slen (+ slen offset))
598 (newstring (make-string slen)))
599 (declare (fixnum offset-slen))
600 (do ((index offset (1+ index))
601 (new-index 0 (1+ new-index)))
602 ((= index start))
603 (declare (fixnum index new-index))
604 (setf (schar newstring new-index) (schar string index)))
605 (do ((index start (1+ index))
606 (new-index (- start offset) (1+ new-index)))
607 ((= index (the fixnum end)))
608 (declare (fixnum index new-index))
609 (multiple-value-bind (code wide) (codepoint string index)
610 (when wide (incf index))
611 ;; Handle ASCII specially because this is called early in
612 ;; initialization, before unidata is available.
613 (cond ((< 96 code 123) (decf code 32))
614 #+unicode
615 ((> code 127) (setq code (unicode-upper code))))
616 ;;@@ WARNING: this may, in theory, need to extend newstring
617 ;; but that never actually occurs as of Unicode 5.1.0,
618 ;; so I'm just going to ignore it for now...
619 (multiple-value-bind (hi lo) (surrogates code)
620 (setf (schar newstring new-index) hi)
621 (when lo
622 (setf (schar newstring (incf new-index)) lo)))))
623 ;;@@ WARNING: see above
624 (do ((index end (1+ index))
625 (new-index (- (the fixnum end) offset) (1+ new-index)))
626 ((= index offset-slen))
627 (declare (fixnum index new-index))
628 (setf (schar newstring new-index) (schar string index)))
629 newstring))))
630
631 #+unicode
632 (defun string-upcase-full (string &key (start 0) end)
633 (declare (fixnum start))
634 (let* ((string (if (stringp string) string (string string)))
635 (slen (length string)))
636 (declare (fixnum slen))
637 (with-output-to-string (s)
638 (with-one-string string start end offset
639 (let ((offset-slen (+ slen offset)))
640 (declare (fixnum offset-slen))
641 (write-string string s :start offset :end start)
642 (do ((index start (1+ index)))
643 ((= index (the fixnum end)))
644 (declare (fixnum index))
645 (multiple-value-bind (code wide)
646 (codepoint string index)
647 (when wide (incf index))
648 ;; Handle ASCII specially because this is called early in
649 ;; initialization, before unidata is available.
650 (cond ((< 96 code 123)
651 (write-char (code-char (decf code 32)) s))
652 ((> code 127)
653 (write-string (unicode-full-case-upper code) s))
654 (t
655 (multiple-value-bind (hi lo)
656 (surrogates code)
657 (write-char hi s)
658 (when lo
659 (write-char lo s)))))))
660 (write-string string s :start end :end offset-slen))))))
661
662 (defun string-upcase (string &key (start 0) end #+unicode (casing :simple))
663 #-unicode
664 "Given a string, returns a new string that is a copy of it with all
665 lower case alphabetic characters converted to uppercase."
666 #+unicode
667 "Given a string, returns a new string that is a copy of it with all
668 lower case alphabetic characters converted to uppercase. Casing is
669 :simple or :full for simple or full case conversion, respectively."
670 (declare (fixnum start))
671 #-unicode
672 (string-upcase-simple string :start start :end end)
673 #+unicode
674 (if (eq casing :simple)
675 (string-upcase-simple string :start start :end end)
676 (string-upcase-full string :start start :end end)))
677
678 (defun string-downcase-simple (string &key (start 0) end)
679 (declare (fixnum start))
680 (let* ((string (if (stringp string) string (string string)))
681 (slen (length string)))
682 (declare (fixnum slen))
683 (with-one-string string start end offset
684 (let ((offset-slen (+ slen offset))
685 (newstring (make-string slen)))
686 (declare (fixnum offset-slen))
687 (do ((index offset (1+ index))
688 (new-index 0 (1+ new-index)))
689 ((= index start))
690 (declare (fixnum index new-index))
691 (setf (schar newstring new-index) (schar string index)))
692 (do ((index start (1+ index))
693 (new-index (- start offset) (1+ new-index)))
694 ((= index (the fixnum end)))
695 (declare (fixnum index new-index))
696 (multiple-value-bind (code wide) (codepoint string index)
697 (when wide (incf index))
698 ;; Handle ASCII specially because this is called early in
699 ;; initialization, before unidata is available.
700 (cond ((< 64 code 91) (incf code 32))
701 ((> code 127) (setq code (unicode-lower code))))
702 ;;@@ WARNING: this may, in theory, need to extend newstring
703 ;; but that never actually occurs as of Unicode 5.1.0,
704 ;; so I'm just going to ignore it for now...
705 (multiple-value-bind (hi lo) (surrogates code)
706 (setf (schar newstring new-index) hi)
707 (when lo
708 (setf (schar newstring (incf new-index)) lo)))))
709 ;;@@ WARNING: see above
710 (do ((index end (1+ index))
711 (new-index (- (the fixnum end) offset) (1+ new-index)))
712 ((= index offset-slen))
713 (declare (fixnum index new-index))
714 (setf (schar newstring new-index) (schar string index)))
715 newstring))))
716
717 #+unicode
718 (defun string-downcase-full (string &key (start 0) end)
719 (declare (fixnum start))
720 (let* ((string (if (stringp string) string (string string)))
721 (slen (length string)))
722 (declare (fixnum slen))
723 (with-output-to-string (s)
724 (with-one-string string start end offset
725 (let ((offset-slen (+ slen offset)))
726 (declare (fixnum offset-slen))
727 (write-string string s :start offset :end start)
728 (do ((index start (1+ index)))
729 ((= index (the fixnum end)))
730 (declare (fixnum index))
731 (multiple-value-bind (code wide)
732 (codepoint string index)
733 (when wide (incf index))
734 ;; Handle ASCII specially because this is called early in
735 ;; initialization, before unidata is available.
736 (cond ((< 64 code 91)
737 (write-char (code-char (incf code 32)) s))
738 ((> code 127)
739 (write-string (unicode-full-case-lower code) s))
740 (t
741 (multiple-value-bind (hi lo)
742 (surrogates code)
743 (write-char hi s)
744 (when lo
745 (write-char lo s)))))))
746 (write-string string s :start end :end offset-slen))))))
747
748 (defun string-downcase (string &key (start 0) end #+unicode (casing :simple))
749 #-unicode
750 "Given a string, returns a new string that is a copy of it with all
751 upper case alphabetic characters converted to lowercase."
752 #+unicode
753 "Given a string, returns a new string that is a copy of it with all
754 upper case alphabetic characters converted to lowercase. Casing is
755 :simple or :full for simple or full case conversion, respectively."
756 (declare (fixnum start))
757 #-unicode
758 (string-downcase-simple string :start start :end end)
759 #+unicode
760 (if (eq casing :simple)
761 (string-downcase-simple string :start start :end end)
762 (string-downcase-full string :start start :end end)))
763
764 (defun string-capitalize-simple (string &key (start 0) end)
765 (declare (fixnum start))
766 (let* ((string (if (stringp string) string (string string)))
767 (slen (length string)))
768 (declare (fixnum slen))
769 (with-one-string string start end offset
770 (let ((offset-slen (+ slen offset))
771 (newstring (make-string slen)))
772 (declare (fixnum offset-slen))
773 (do ((index offset (1+ index))
774 (new-index 0 (1+ new-index)))
775 ((= index start))
776 (declare (fixnum index new-index))
777 (setf (schar newstring new-index) (schar string index)))
778 (do ((index start (1+ index))
779 (new-index (- start offset) (1+ new-index))
780 (newword t)
781 (char ()))
782 ((= index (the fixnum end)))
783 (declare (fixnum index new-index))
784 (setq char (schar string index))
785 (cond ((not (alphanumericp char))
786 (setq newword t))
787 (newword
788 ;;char is first case-modifiable after non-case-modifiable
789 (setq char (char-titlecase char))
790 (setq newword ()))
791 ;;char is case-modifiable, but not first
792 (t (setq char (char-downcase char))))
793 (setf (schar newstring new-index) char))
794 (do ((index end (1+ index))
795 (new-index (- (the fixnum end) offset) (1+ new-index)))
796 ((= index offset-slen))
797 (declare (fixnum index new-index))
798 (setf (schar newstring new-index) (schar string index)))
799 newstring))))
800
801 #+unicode
802 (defun string-capitalize-full (string &key (start 0) end)
803 (declare (fixnum start))
804 (let* ((string (if (stringp string) string (string string)))
805 (slen (length string)))
806 (declare (fixnum slen))
807 (with-output-to-string (s)
808 (with-one-string string start end offset
809 (let ((offset-slen (+ slen offset)))
810 (declare (fixnum offset-slen))
811 (write-string string s :start offset :end start)
812 (flet ((alphanump (m)
813 (or (< 47 m 58) (< 64 m 91) (< 96 m 123)
814 #+(and unicode (not unicode-bootstrap))
815 (and (> m 127)
816 (<= +unicode-category-letter+ (unicode-category m)
817 (+ +unicode-category-letter+ #x0F))))))
818 (do ((index start (1+ index))
819 (newword t))
820 ((= index (the fixnum end)))
821 (declare (fixnum index))
822 (multiple-value-bind (code wide)
823 (codepoint string index)
824 (when wide (incf index))
825 (cond ((not (alphanump code))
826 (multiple-value-bind (hi lo)
827 (surrogates code)
828 (write-char hi s)
829 (when lo (write-char lo s)))
830 (setq newword t))
831 (newword
832 ;;char is first case-modifiable after non-case-modifiable
833 (write-string (unicode-full-case-title code) s)
834 (setq newword ()))
835 ;;char is case-modifiable, but not first
836 (t
837 (write-string (unicode-full-case-lower code) s))))))
838 (write-string string s :start end :end offset-slen))))))
839
840 (defun string-capitalize (string &key (start 0) end #+unicode (casing :simple))
841 #-unicode
842 "Given a string, returns a copy of the string with the first
843 character of each ``word'' converted to upper-case, and remaining
844 chars in the word converted to lower case. A ``word'' is defined
845 to be a string of case-modifiable characters delimited by
846 non-case-modifiable chars."
847 #+unicode
848 "Given a string, returns a copy of the string with the first
849 character of each ``word'' converted to upper-case, and remaining
850 chars in the word converted to lower case. A ``word'' is defined
851 to be a string of case-modifiable characters delimited by
852 non-case-modifiable chars. Casing is :simple or :full for
853 simple or full case conversion, respectively."
854
855 (declare (fixnum start))
856 #-unicode
857 (string-capitalize-simple string :start start :end end)
858 #+unicode
859 (if (eq casing :simple)
860 (string-capitalize-simple string :start start :end end)
861 (string-capitalize-full string :start start :end end)))
862
863 (defun nstring-upcase (string &key (start 0) end)
864 "Given a string, returns that string with all lower case alphabetic
865 characters converted to uppercase."
866 (declare (fixnum start))
867 (let ((save-header string))
868 (with-one-string string start end offset
869 (do ((index start (1+ index)))
870 ((= index (the fixnum end)))
871 (declare (fixnum index))
872 (multiple-value-bind (code wide) (codepoint string index)
873 (declare (ignore wide))
874 ;; Handle ASCII specially because this is called early in
875 ;; initialization, before unidata is available.
876 (cond ((< 96 code 123) (decf code 32))
877 #+unicode
878 ((> code 127) (setq code (unicode-upper code))))
879 ;;@@ WARNING: this may, in theory, need to extend string
880 ;; (which, obviously, we can't do here. Unless
881 ;; STRING is adjustable, maybe)
882 ;; but that never actually occurs as of Unicode 5.1.0,
883 ;; so I'm just going to ignore it for now...
884 (multiple-value-bind (hi lo) (surrogates code)
885 (setf (schar string index) hi)
886 (when lo
887 (setf (schar string (incf index)) lo))))))
888 save-header))
889
890 (defun nstring-downcase (string &key (start 0) end)
891 "Given a string, returns that string with all upper case alphabetic
892 characters converted to lowercase."
893 (declare (fixnum start))
894 (let ((save-header string))
895 (with-one-string string start end offset
896 (do ((index start (1+ index)))
897 ((= index (the fixnum end)))
898 (declare (fixnum index))
899 (multiple-value-bind (code wide) (codepoint string index)
900 (declare (ignore wide))
901 (cond ((< 64 code 91) (incf code 32))
902 #+unicode
903 ((> code 127) (setq code (unicode-lower code))))
904 ;;@@ WARNING: this may, in theory, need to extend string
905 ;; (which, obviously, we can't do here. Unless
906 ;; STRING is adjustable, maybe)
907 ;; but that never actually occurs as of Unicode 5.1.0,
908 ;; so I'm just going to ignore it for now...
909 (multiple-value-bind (hi lo) (surrogates code)
910 (setf (schar string index) hi)
911 (when lo
912 (setf (schar string (incf index)) lo))))))
913 save-header))
914
915 (defun nstring-capitalize (string &key (start 0) end)
916 "Given a string, returns that string with the first
917 character of each ``word'' converted to upper-case, and remaining
918 chars in the word converted to lower case. A ``word'' is defined
919 to be a string of case-modifiable characters delimited by
920 non-case-modifiable chars."
921 (declare (fixnum start))
922 (let ((save-header string))
923 (with-one-string string start end offset
924 (do ((index start (1+ index))
925 (newword t)
926 (char ()))
927 ((= index (the fixnum end)))
928 (declare (fixnum index))
929 (setq char (schar string index))
930 (cond ((not (alphanumericp char))
931 (setq newword t))
932 (newword
933 ;;char is first case-modifiable after non-case-modifiable
934 (setf (schar string index) (char-titlecase char))
935 (setq newword ()))
936 (t
937 (setf (schar string index) (char-downcase char))))))
938 save-header))
939
940
941 #+unicode
942 (progn
943 ;; Like string-left-trim, but return the index
944 (defun string-left-trim-index (char-bag string)
945 (with-string string
946 (if (stringp char-bag)
947 ;; When char-bag is a string, we try to do the right thing.
948 ;; Convert char-bag to a list of codepoints and compare the
949 ;; codepoints in the string with this.
950 (let ((code-bag (with-string char-bag
951 (do ((index start (1+ index))
952 (result nil))
953 ((= index end)
954 (nreverse result))
955 (multiple-value-bind (c widep)
956 (codepoint char-bag index)
957 (push c result)
958 (when widep (incf index)))))))
959 (do ((index start (1+ index)))
960 ((= index (the fixnum end))
961 end)
962 (declare (fixnum index))
963 (multiple-value-bind (c widep)
964 (codepoint string index)
965 (unless (find c code-bag)
966 (return-from string-left-trim-index index))
967 (when widep (incf index)))))
968 ;; When char-bag is a list, we just look at each codepoint of
969 ;; STRING to see if it's in char-bag. If char-bag contains a
970 ;; surrogate, we could accidentally trim off a surrogate,
971 ;; leaving an invalid UTF16 string.
972 (do ((index start (1+ index)))
973 ((= index (the fixnum end))
974 end)
975 (declare (fixnum index))
976 (multiple-value-bind (c widep)
977 (codepoint string index)
978 (unless (find c char-bag :key #'char-code)
979 (return-from string-left-trim-index index))
980 (when widep (incf index)))))))
981
982 (defun string-left-trim (char-bag string)
983 "Given a set of characters (a list or string) and a string, returns
984 a copy of the string with the characters in the set removed from the
985 left end. If the set of characters is a string, surrogates will be
986 properly handled."
987 (let ((begin (string-left-trim-index char-bag string)))
988 (with-string string
989 (subseq string begin end))))
990
991 (defun string-right-trim-index (char-bag string)
992 (with-string string
993 (if (stringp char-bag)
994 ;; When char-bag is a string, we try to do the right thing
995 ;; with surrogates. Convert char-bag to a list of codepoints
996 ;; and compare the codepoints in the string with this.
997 (let ((code-bag (with-string char-bag
998 (do ((index start (1+ index))
999 (result nil))
1000 ((= index end)
1001 result)
1002 (multiple-value-bind (c widep)
1003 (codepoint char-bag index)
1004 (push c result)
1005 (when widep (incf index)))))))
1006 (do ((index (1- end) (1- index)))
1007 ((< index start)
1008 start)
1009 (declare (fixnum index))
1010 (multiple-value-bind (c widep)
1011 (codepoint string index)
1012 (unless (find c code-bag)
1013 (return-from string-right-trim-index (1+ index)))
1014 (when widep (decf index)))))
1015 ;; When char-bag is a list, we just look at each codepoint of
1016 ;; STRING to see if it's in char-bag. If char-bag contains a
1017 ;; surrogate, we could accidentally trim off a surrogate,
1018 ;; leaving an invalid UTF16 string.
1019 (do ((index (1- end) (1- index)))
1020 ((< index start)
1021 start)
1022 (declare (fixnum index))
1023 (multiple-value-bind (c widep)
1024 (codepoint string index)
1025 (unless (find c char-bag :key #'char-code)
1026 (return-from string-right-trim-index (1+ index)))
1027 (when widep (decf index)))))))
1028
1029 (defun string-right-trim (char-bag string)
1030 "Given a set of characters (a list or string) and a string, returns
1031 a copy of the string with the characters in the set removed from the
1032 right end. If the set of characters is a string, surrogates will be
1033 properly handled."
1034 (let ((stop (string-right-trim-index char-bag string)))
1035 (with-string string
1036 (subseq string start stop))))
1037
1038 (defun string-trim (char-bag string)
1039 "Given a set of characters (a list or string) and a string, returns a
1040 copy of the string with the characters in the set removed from both
1041 ends. If the set of characters is a string, surrogates will be
1042 properly handled."
1043 (let ((left-end (string-left-trim-index char-bag string))
1044 (right-end (string-right-trim-index char-bag string)))
1045 (with-string string
1046 (subseq (the simple-string string) left-end right-end))))
1047 ) ; end unicode version
1048
1049 #-unicode
1050 (progn
1051 (defun string-left-trim (char-bag string)
1052 "Given a set of characters (a list or string) and a string, returns
1053 a copy of the string with the characters in the set removed from the
1054 left end."
1055 (with-string string
1056 (do ((index start (1+ index)))
1057 ((or (= index (the fixnum end))
1058 (not (find (schar string index) char-bag)))
1059 (subseq (the simple-string string) index end))
1060 (declare (fixnum index)))))
1061
1062 (defun string-right-trim (char-bag string)
1063 "Given a set of characters (a list or string) and a string, returns
1064 a copy of the string with the characters in the set removed from the
1065 right end."
1066 (with-string string
1067 (do ((index (1- (the fixnum end)) (1- index)))
1068 ((or (< index start) (not (find (schar string index) char-bag)))
1069 (subseq (the simple-string string) start (1+ index)))
1070 (declare (fixnum index)))))
1071
1072 (defun string-trim (char-bag string)
1073 "Given a set of characters (a list or string) and a string, returns a
1074 copy of the string with the characters in the set removed from both
1075 ends."
1076 (with-string string
1077 (let* ((left-end (do ((index start (1+ index)))
1078 ((or (= index (the fixnum end))
1079 (not (find (schar string index) char-bag)))
1080 index)
1081 (declare (fixnum index))))
1082 (right-end (do ((index (1- (the fixnum end)) (1- index)))
1083 ((or (< index left-end)
1084 (not (find (schar string index) char-bag)))
1085 (1+ index))
1086 (declare (fixnum index)))))
1087 (subseq (the simple-string string) left-end right-end))))
1088 ) ; non-unicode version
1089
1090 #+unicode
1091 (progn
1092 (declaim (inline %glyph-f %glyph-b))
1093 (defun %glyph-f (string index)
1094 (declare (optimize (speed 3) (space 0) (safety 0) (debug 0))
1095 (type simple-string string) (type kernel:index index))
1096 (let* ((prev 0)
1097 (l (length string))
1098 (c (codepoint string index l))
1099 (n (+ index (if (> c #xFFFF) 2 1))))
1100 (declare (type codepoint c) (type kernel:index l n))
1101 (loop while (< n l) do
1102 (let* ((c (codepoint string n l))
1103 (d (the (unsigned-byte 8) (unicode-combining-class c))))
1104 (when (or (zerop d) (< d prev))
1105 (return))
1106 (setq prev d)
1107 (incf n (if (> c #xFFFF) 2 1))))
1108 n))
1109
1110 (defun %glyph-b (string index)
1111 (declare (optimize (speed 3) (space 0) (safety 0) (debug 0))
1112 (type simple-string string) (type kernel:index index))
1113 (let* ((prev 255)
1114 (n (1- index)))
1115 (declare (type kernel:index n))
1116 (loop until (< n 0) do
1117 (let* ((c (codepoint string n 0))
1118 (d (the (unsigned-byte 8) (unicode-combining-class c))))
1119 (cond ((zerop d) (return))
1120 ((> d prev) (incf n (if (> c #xFFFF) 2 1)) (return)))
1121 (setq prev d)
1122 (decf n (if (> c #xFFFF) 2 1))))
1123 n))
1124 ) ; unicode
1125
1126 (defun glyph (string index &key (from-end nil))
1127 "GLYPH returns the glyph at the indexed position in a string, and the
1128 position of the next glyph (or NIL) as a second value. A glyph is
1129 a substring consisting of the character at INDEX followed by all
1130 subsequent combining characters."
1131 (declare (type simple-string string) (type kernel:index index))
1132 #-unicode
1133 (char string index)
1134 #+unicode
1135 (with-array-data ((string string) (start) (end))
1136 (declare (ignore start end))
1137 (let ((n (if from-end (%glyph-b string index) (%glyph-f string index))))
1138 (if from-end
1139 (values (subseq string n index) (and (> n 0) n))
1140 (values (subseq string index n) (and (< n (length string)) n))))))
1141
1142 (defun sglyph (string index &key (from-end nil))
1143 "SGLYPH returns the glyph at the indexed position, the same as GLYPH,
1144 except that the string must be a simple-string"
1145 (declare (type simple-string string) (type kernel:index index))
1146 #-unicode
1147 (schar string index)
1148 #+unicode
1149 (let ((n (if from-end (%glyph-b string index) (%glyph-f string index))))
1150 (if from-end
1151 (values (subseq string n index) (and (> n 0) n))
1152 (values (subseq string index n) (and (< n (length string)) n)))))
1153
1154 #+unicode
1155 (defun string-reverse* (sequence)
1156 (declare (optimize (speed 3) (space 0) (safety 0))
1157 (type string sequence))
1158 (with-string sequence
1159 (let* ((length (- end start))
1160 (string (make-string length))
1161 (j length))
1162 (declare (type kernel:index length j))
1163 (loop for i = start then n as n = (%glyph-f sequence i) do
1164 (replace string sequence :start1 (decf j (- n i)) :start2 i :end2 n)
1165 while (< n end))
1166 string)))
1167
1168 #+unicode
1169 (defun string-nreverse* (sequence)
1170 (declare (optimize (speed 3) (space 0) (safety 0))
1171 (type string sequence))
1172 (with-string sequence
1173 (flet ((rev (start end)
1174 (do ((i start (1+ i))
1175 (j (1- end) (1- j)))
1176 ((>= i j))
1177 (declare (type kernel:index i j))
1178 (rotatef (schar sequence i) (schar sequence j)))))
1179 (let ((len end))
1180 (loop for i = start then n as n = (%glyph-f sequence i) do
1181 (rev i n) while (< n len))
1182 (rev start end))))
1183 sequence)
1184
1185
1186
1187
1188 #+unicode
1189 (progn
1190 (defun decompose (string &optional (compatibility t))
1191 (declare (type string string))
1192 (let ((result (make-string (cond ((< (length string) 40)
1193 (* 5 (length string)))
1194 ((< (length string) 4096)
1195 (* 2 (length string)))
1196 (t (round (length string) 5/6)))))
1197 (fillptr 0))
1198 (declare (type kernel:index fillptr))
1199 (labels ((rec (string start end)
1200 (declare (type simple-string string))
1201 (do ((i start (1+ i)))
1202 ((= i end))
1203 (declare (type kernel:index i))
1204 (multiple-value-bind (code wide) (codepoint string i)
1205 (when wide (incf i))
1206 (let ((decomp (unicode-decomp code compatibility)))
1207 (if decomp (rec decomp 0 (length decomp)) (out code))))))
1208 (out (code)
1209 (multiple-value-bind (hi lo) (surrogates code)
1210 (outch hi)
1211 (when lo
1212 (outch lo))
1213 (let ((cc (unicode-combining-class code)))
1214 (unless (zerop cc)
1215 (order lo cc (- fillptr (if lo 3 2)))))))
1216 (outch (char)
1217 (when (= fillptr (length result))
1218 (let ((tmp (make-string (round (length result) 5/6))))
1219 (replace tmp result)
1220 (setq result tmp)))
1221 (setf (schar result fillptr) char)
1222 (incf fillptr))
1223 (order (wide1 cc last)
1224 (loop until (minusp last) do
1225 (multiple-value-bind (code2 wide2) (codepoint result last)
1226 (let ((cc2 (unicode-combining-class code2)))
1227 (cond ((zerop cc2) (return))
1228 ((> cc2 cc)
1229 (case (+ (if wide2 2 0) (if wide1 1 0))
1230 (0 (rotatef (schar result last)
1231 (schar result (1+ last))))
1232 (1 (rotatef (schar result last)
1233 (schar result (+ last 1))
1234 (schar result (+ last 2))))
1235 (2 (rotatef (schar result last)
1236 (schar result (1- last))
1237 (schar result (1+ last))))
1238 (3 (rotatef (schar result last)
1239 (schar result (+ last 2)))
1240 (rotatef (schar result (1- last))
1241 (schar result (1+ last)))))
1242 (decf last (if wide2 2 1)))
1243 (t (return))))))))
1244 (with-string string
1245 (rec string start end))
1246 (shrink-vector result fillptr))))
1247
1248 (declaim (inline normalized-codepoint-p))
1249 (defun normalized-codepoint-p (cp form)
1250 (ecase form
1251 (:nfc (unicode-nfc-qc cp))
1252 (:nfkc (unicode-nfkc-qc cp))
1253 (:nfd (unicode-nfd-qc cp))
1254 (:nfkd (unicode-nfkd-qc cp))))
1255
1256 ;; Perform check to see if string is already normalized. The Unicode
1257 ;; example can return YES, NO, or MAYBE. For our purposes, only YES
1258 ;; is important, for which we return T. For NO or MAYBE, we return NIL.
1259 (defun normalized-form-p (string &optional (form :nfc))
1260 (declare (type (member :nfc :nfkc :nfd :nfkd) form)
1261 (optimize (speed 3)))
1262 (with-string string
1263 (let ((last-class 0))
1264 (declare (type (integer 0 256) last-class))
1265 (do ((k start (1+ k)))
1266 ((>= k end))
1267 (declare (type kernel:index k))
1268 (multiple-value-bind (ch widep)
1269 (codepoint string k end)
1270 (when widep (incf k))
1271 ;; Handle ASCII specially
1272 (unless (< ch 128)
1273 (let ((class (unicode-combining-class ch)))
1274 (declare (type (unsigned-byte 8) class))
1275 (when (and (> last-class class) (not (zerop class)))
1276 ;; Definitely not normalized
1277 (return-from normalized-form-p nil))
1278 (let ((check (normalized-codepoint-p ch form)))
1279 (unless (eq check :y)
1280 (return-from normalized-form-p nil)))
1281 (setf last-class class)))))
1282 t)))
1283
1284
1285 ;; Compose a string in place. The string must already be in decomposed form.
1286 (defun %compose (target)
1287 (declare (type string target)
1288 (optimize (speed 3)))
1289 (let ((len (length target))
1290 (starter-pos 0))
1291 (declare (type kernel:index starter-pos))
1292 (multiple-value-bind (starter-ch wide)
1293 (codepoint target 0 len)
1294 (let ((comp-pos (if wide 2 1))
1295 (last-class (unicode-combining-class starter-ch)))
1296 (declare (type (integer 0 256) last-class)
1297 (type kernel:index comp-pos))
1298 (unless (zerop last-class)
1299 ;; Fix for strings starting with a combining character
1300 (setf last-class 256))
1301 ;; Loop on decomposed characters, combining where possible
1302 (do ((decomp-pos comp-pos (1+ decomp-pos)))
1303 ((>= decomp-pos len))
1304 (declare (type kernel:index decomp-pos))
1305 (multiple-value-bind (ch wide)
1306 (codepoint target decomp-pos len)
1307 (when wide (incf decomp-pos))
1308 (let ((ch-class (unicode-combining-class ch))
1309 (composite (unicode-pairwise-composition starter-ch ch)))
1310 (declare (type (integer 0 256) ch-class))
1311 (cond ((and composite
1312 (or (< last-class ch-class) (zerop last-class)))
1313 ;; Don't have to worry about surrogate pairs here
1314 ;; because the composite is always in the BMP.
1315 (setf (aref target starter-pos) (code-char composite))
1316 (setf starter-ch composite))
1317 (t
1318 (when (zerop ch-class)
1319 (setf starter-pos comp-pos)
1320 (setf starter-ch ch))
1321 (setf last-class ch-class)
1322 (multiple-value-bind (hi lo)
1323 (surrogates ch)
1324 (setf (aref target comp-pos) hi)
1325 (when lo
1326 (incf comp-pos)
1327 (setf (aref target comp-pos) lo))
1328 (incf comp-pos)))))))
1329 (shrink-vector target comp-pos)))))
1330
1331 (defun string-to-nfd (string)
1332 "Convert String to Unicode Normalization Form D (NFD) using the
1333 canonical decomposition. The NFD string is returned"
1334 (decompose string nil))
1335
1336 (defun string-to-nfkd (string)
1337 "Convert String to Unicode Normalization Form KD (NFKD) uisng the
1338 compatible decomposition form. The NFKD string is returned."
1339 (decompose string t))
1340
1341 (defun string-to-nfc (string)
1342 "Convert String to Unicode Normalization Form C (NFC). If the
1343 string a simple string and is already normalized, the original
1344 string is returned."
1345 (if (normalized-form-p string :nfc)
1346 (if (simple-string-p string) string (coerce string 'simple-string))
1347 (coerce (if (normalized-form-p string :nfd)
1348 (%compose (copy-seq string))
1349 (%compose (string-to-nfd string)))
1350 'simple-string)))
1351
1352 (defun string-to-nfkc (string)
1353 "Convert String to Unicode Normalization Form KC (NFKC). If the
1354 string is a simple string and is already normalized, the original
1355 string is returned."
1356 (if (normalized-form-p string :nfkc)
1357 (if (simple-string-p string) string (coerce string 'simple-string))
1358 (coerce (if (normalized-form-p string :nfkd)
1359 (%compose (copy-seq string))
1360 (%compose (string-to-nfkd string)))
1361 'simple-string)))
1362 ) ; end unicode
1363
1364 #-unicode ;; Needed by package.lisp
1365 (defun string-to-nfc (string)
1366 (if (simple-string-p string) string (coerce string 'simple-string)))
1367

  ViewVC Help
Powered by ViewVC 1.1.5