/[cmucl]/src/code/string.lisp
ViewVC logotype

Contents of /src/code/string.lisp

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.12.30.33 - (show annotations)
Thu Jun 11 13:30:01 2009 UTC (4 years, 10 months ago) by rtoy
Branch: unicode-utf16-extfmt-branch
Changes since 1.12.30.32: +66 -124 lines
Revert previous change that added case folding to string-equal and
friends.  We can't really do that for a couple of reasons:

- Case folding should be done on the NFD form according to the Unicode
  spec
- Full case folding may change the length of the string so it's not
  clear what the return value from string-lessp and friends should be.

Instead, we provide a new function, STRING-CASE-FOLD, to perform case
folding.

code/char.lisp:
o Use lowercase for case insensitve comparisons again.

code/string.lisp:
o Remove :casing option for string-lessp and friends.
o Remove code needed to support :casing option.
o Add STRING-CASE-FOLD to perform case folding operation.

compiler/fndb.lisp:
o Remove :casing option from defknowns.
1 ;;; -*- Log: code.log; Package: Lisp -*-
2 ;;;
3 ;;; **********************************************************************
4 ;;; This code was written as part of the CMU Common Lisp project at
5 ;;; Carnegie Mellon University, and has been placed in the public domain.
6 ;;;
7 (ext:file-comment
8 "$Header: /tiger/var/lib/cvsroots/cmucl/src/code/string.lisp,v 1.12.30.33 2009/06/11 13:30:01 rtoy Exp $")
9 ;;;
10 ;;; **********************************************************************
11 ;;;
12 ;;; Functions to implement strings for CMU Common Lisp
13 ;;; Written by David Dill
14 ;;; Rewritten by Skef Wholey, Bill Chiles and Rob MacLachlan.
15 ;;;
16 ;;; ****************************************************************
17 ;;;
18 (in-package "LISP")
19 (export '(char schar glyph sglyph string
20 string= string-equal string< string> string<= string>= string/=
21 string-lessp string-greaterp string-not-lessp string-not-greaterp
22 string-not-equal
23 string-to-nfd string-to-nfkd string-to-nfc string-to-nfkc
24 make-string
25 string-trim string-left-trim string-right-trim
26 string-upcase
27 string-downcase string-capitalize nstring-upcase nstring-downcase
28 nstring-capitalize))
29
30
31 (declaim (inline surrogatep surrogates-to-codepoint codepoint surrogates))
32
33 (defun surrogatep (c &optional surrogate-type)
34 "Test if C is a surrogate. C may be either an integer or a
35 character. Surrogate-type indicates what kind of surrogate to test
36 for. :High means to test for the high (leading) surrogate; :Low
37 tests for the low (trailing surrogate). A value of :Any or Nil
38 tests for any surrogate value (high or low)."
39 (declare (type (or character codepoint) c))
40 (let ((code (if (characterp c)
41 (char-code c)
42 c)))
43 (ecase surrogate-type
44 ((:high :leading)
45 ;; Test for high surrogate
46 (<= #xD800 code #xDBFF))
47 ((:low :trailing)
48 ;; Test for low surrogate
49 (<= #xDC00 code #xDFFF))
50 ((:any nil)
51 ;; Test for any surrogate
52 (<= #xD800 code #xDFFF)))))
53
54 (defun surrogates-to-codepoint (hi lo)
55 "Convert the given Hi and Lo surrogate characters to the
56 corresponding codepoint value"
57 (declare (type character hi lo))
58 (+ (ash (- (the (integer #xD800 #xDBFF) (char-code hi)) #xD800) 10)
59 (the (integer #xDC00 #xDFFF) (char-code lo)) #x2400))
60
61 (defun codepoint (string i &optional (end (length string)))
62 "Return the codepoint value from String at position I. If that
63 position is a surrogate, it is combined with either the previous or
64 following character (when possible) to compute the codepoint. The
65 second return value is NIL if the position is not a surrogate pair.
66 Otherwise +1 or -1 is returned if the position is the high or low
67 surrogate value, respectively."
68 (declare (type simple-string string) (type kernel:index i end))
69 (let ((code (char-code (schar string i))))
70 (cond ((and (surrogatep code :high) (< (1+ i) end))
71 (let ((tmp (char-code (schar string (1+ i)))))
72 (if (surrogatep tmp :low)
73 (values (+ (ash (- code #xD800) 10) tmp #x2400) +1)
74 (values code nil))))
75 ((and (surrogatep code :low) (> i 0))
76 (let ((tmp (char-code (schar string (1- i)))))
77 (if (surrogatep tmp :high)
78 (values (+ (ash (- tmp #xD800) 10) code #x2400) -1)
79 (values code nil))))
80 (t (values code nil)))))
81
82 (defun surrogates (codepoint)
83 "Return the high and low surrogate characters for Codepoint. If
84 Codepoint is in the BMP, the first return value is the corresponding
85 character and the second is NIL."
86 (declare (type codepoint codepoint))
87 (if (< codepoint #x10000)
88 (values (code-char codepoint) nil)
89 (let* ((tmp (- codepoint #x10000))
90 (hi (logior (ldb (byte 10 10) tmp) #xD800))
91 (lo (logior (ldb (byte 10 0) tmp) #xDC00)))
92 (values (code-char hi) (code-char lo)))))
93
94 (defun (setf codepoint) (codepoint string i)
95 "Set the codepoint at string position I to the Codepoint. If the
96 codepoint requires a surrogate pair, the high (leading surrogate) is
97 stored at position I and the low (trailing) surrogate is stored at
98 I+1"
99 (declare (type codepoint codepoint)
100 (type simple-string string))
101 (let ((widep nil))
102 (multiple-value-bind (hi lo)
103 (surrogates codepoint)
104 (setf (aref string i) hi)
105 (when lo
106 (setf (aref string (1+ i)) lo)
107 (setf widep t)))
108 (values codepoint widep)))
109
110 (defun utf16-string-p (string)
111 "Check if String is a valid UTF-16 string. If the string is valid,
112 T is returned. If the string is not valid, NIL is returned, and the
113 second value is the index into the string of the invalid character.
114 A string is also invalid if it contains any unassigned codepoints."
115 (do ((len (length string))
116 (index 0 (1+ index)))
117 ((>= index len)
118 t)
119 (multiple-value-bind (codepoint wide)
120 (codepoint string index)
121 ;; We step through the string in order. If there are any
122 ;; surrogates pairs, we must reach the lead surrogate first,
123 ;; which means WIDE is +1. Otherwise, we have an invalid
124 ;; surrogate pair. If we get any codepoint that is in the
125 ;; surrogate range, we also have an invalid string. An
126 ;; unassigned codepoint is also considered invalid.
127 (when (or (eq wide -1)
128 (surrogatep codepoint)
129 (lisp::unicode-assigned-codepoint-p codepoint))
130 (return-from utf16-string-p (values nil index)))
131 (when wide (incf index)))))
132
133 (defun string (X)
134 "Coerces X into a string. If X is a string, X is returned. If X is a
135 symbol, X's pname is returned. If X is a character then a one element
136 string containing that character is returned. If X cannot be coerced
137 into a string, an error occurs."
138 (cond ((stringp x) x)
139 ((symbolp x) (symbol-name x))
140 ((characterp x)
141 (let ((res (make-string 1)))
142 (setf (schar res 0) x) res))
143 (t
144 (error 'simple-type-error
145 :datum x
146 :expected-type '(or string symbol character)
147 :format-control "~S cannot be coerced to a string."
148 :format-arguments (list x)))))
149
150 ;;; With-One-String is used to set up some string hacking things. The keywords
151 ;;; are parsed, and the string is hacked into a simple-string.
152
153 (eval-when (compile)
154
155 (defmacro with-one-string (string start end cum-offset &rest forms)
156 `(let ((,string (if (stringp ,string) ,string (string ,string))))
157 ;; Optimizer may prove STRING is one.
158 (declare (optimize (ext:inhibit-warnings 3)))
159 (with-array-data ((,string ,string :offset-var ,cum-offset)
160 (,start ,start)
161 (,end (or ,end (length (the vector ,string)))))
162 ,@forms)))
163
164 )
165
166 ;;; With-String is like With-One-String, but doesn't parse keywords.
167
168 (eval-when (compile)
169
170 (defmacro with-string (string &rest forms)
171 `(let ((,string (if (stringp ,string) ,string (string ,string))))
172 (with-array-data ((,string ,string)
173 (start)
174 (end (length (the vector ,string))))
175 ,@forms)))
176
177 )
178
179 ;;; With-Two-Strings is used to set up string comparison operations. The
180 ;;; keywords are parsed, and the strings are hacked into simple-strings.
181
182 (eval-when (compile)
183
184 (defmacro with-two-strings (string1 string2 start1 end1 cum-offset-1
185 start2 end2 &rest forms)
186 `(let ((,string1 (if (stringp ,string1) ,string1 (string ,string1)))
187 (,string2 (if (stringp ,string2) ,string2 (string ,string2))))
188 (with-array-data ((,string1 ,string1 :offset-var ,cum-offset-1)
189 (,start1 ,start1)
190 (,end1 (or ,end1 (length (the vector ,string1)))))
191 (with-array-data ((,string2 ,string2)
192 (,start2 ,start2)
193 (,end2 (or ,end2 (length (the vector ,string2)))))
194 ,@forms))))
195
196 )
197
198
199 (defun char (string index)
200 "Given a string and a non-negative integer index less than the length of
201 the string, returns the character object representing the character at
202 that position in the string."
203 (declare (optimize (safety 1)))
204 (char string index))
205
206 (defun %charset (string index new-el)
207 (declare (optimize (safety 1)))
208 (setf (char string index) new-el))
209
210 (defun schar (string index)
211 "SCHAR returns the character object at an indexed position in a string
212 just as CHAR does, except the string must be a simple-string."
213 (declare (optimize (safety 1)))
214 (schar string index))
215
216 (defun %scharset (string index new-el)
217 (declare (optimize (safety 1)))
218 (setf (schar string index) new-el))
219
220 (defun string=* (string1 string2 start1 end1 start2 end2)
221 (with-two-strings string1 string2 start1 end1 offset1 start2 end2
222 (not (%sp-string-compare string1 start1 end1 string2 start2 end2))))
223
224
225 (defun string/=* (string1 string2 start1 end1 start2 end2)
226 (with-two-strings string1 string2 start1 end1 offset1 start2 end2
227 (let ((comparison (%sp-string-compare string1 start1 end1
228 string2 start2 end2)))
229 (if comparison (- (the fixnum comparison) offset1)))))
230
231 (eval-when (compile eval)
232
233 ;;; Lessp is true if the desired expansion is for string<* or string<=*.
234 ;;; Equalp is true if the desired expansion is for string<=* or string>=*.
235 (defmacro string<>=*-body (lessp equalp)
236 (let ((offset1 (gensym)))
237 `(with-two-strings string1 string2 start1 end1 ,offset1 start2 end2
238 (let ((index (%sp-string-compare string1 start1 end1
239 string2 start2 end2)))
240 (if index
241 (cond ((= (the fixnum index) (the fixnum end1))
242 ,(if lessp
243 `(- (the fixnum index) ,offset1)
244 `nil))
245 ((= (+ (the fixnum index) (- start2 start1))
246 (the fixnum end2))
247 ,(if lessp
248 `nil
249 `(- (the fixnum index) ,offset1)))
250 #-unicode
251 ((,(if lessp 'char< 'char>)
252 (schar string1 index)
253 (schar string2 (+ (the fixnum index) (- start2 start1))))
254 (- (the fixnum index) ,offset1))
255 #-unicode
256 (t nil)
257 #+unicode
258 (t
259 ;; Compare in code point order. See
260 ;; http://icu-project.org/docs/papers/utf16_code_point_order.html
261 (flet ((fixup (code)
262 (if (>= code #xe000)
263 (- code #x800)
264 (+ code #x2000))))
265 (declare (inline fixup))
266 (let* ((c1 (char-code (schar string1 index)))
267 (c2 (char-code (schar string2
268 (+ (the fixnum index)
269 (- start2 start1))))))
270 (cond ((and (>= c1 #xd800)
271 (>= c2 #xd800))
272 (let ((fix-c1 (fixup c1))
273 (fix-c2 (fixup c2)))
274 (if (,(if lessp '< '>) fix-c1 fix-c2)
275 (- (the fixnum index) ,offset1)
276 nil)))
277 (t
278 (if (,(if lessp '< '>) c1 c2)
279 (- (the fixnum index) ,offset1)
280 nil)))))))
281 ,(if equalp `(- (the fixnum end1) ,offset1) 'nil))))))
282 ) ; eval-when
283
284 (defun string<* (string1 string2 start1 end1 start2 end2)
285 (declare (fixnum start1 start2))
286 (string<>=*-body t nil))
287
288 (defun string>* (string1 string2 start1 end1 start2 end2)
289 (declare (fixnum start1 start2))
290 (string<>=*-body nil nil))
291
292 (defun string<=* (string1 string2 start1 end1 start2 end2)
293 (declare (fixnum start1 start2))
294 (string<>=*-body t t))
295
296 (defun string>=* (string1 string2 start1 end1 start2 end2)
297 (declare (fixnum start1 start2))
298 (string<>=*-body nil t))
299
300
301
302 (defun string< (string1 string2 &key (start1 0) end1 (start2 0) end2)
303 "Given two strings, if the first string is lexicographically less than
304 the second string, returns the longest common prefix (using char=)
305 of the two strings. Otherwise, returns ()."
306 (string<* string1 string2 start1 end1 start2 end2))
307
308 (defun string> (string1 string2 &key (start1 0) end1 (start2 0) end2)
309 "Given two strings, if the first string is lexicographically greater than
310 the second string, returns the longest common prefix (using char=)
311 of the two strings. Otherwise, returns ()."
312 (string>* string1 string2 start1 end1 start2 end2))
313
314
315 (defun string<= (string1 string2 &key (start1 0) end1 (start2 0) end2)
316 "Given two strings, if the first string is lexicographically less than
317 or equal to the second string, returns the longest common prefix
318 (using char=) of the two strings. Otherwise, returns ()."
319 (string<=* string1 string2 start1 end1 start2 end2))
320
321 (defun string>= (string1 string2 &key (start1 0) end1 (start2 0) end2)
322 "Given two strings, if the first string is lexicographically greater
323 than or equal to the second string, returns the longest common prefix
324 (using char=) of the two strings. Otherwise, returns ()."
325 (string>=* string1 string2 start1 end1 start2 end2))
326
327 (defun string= (string1 string2 &key (start1 0) end1 (start2 0) end2)
328 "Given two strings (string1 and string2), and optional integers start1,
329 start2, end1 and end2, compares characters in string1 to characters in
330 string2 (using char=)."
331 (string=* string1 string2 start1 end1 start2 end2))
332
333 (defun string/= (string1 string2 &key (start1 0) end1 (start2 0) end2)
334 "Given two strings, if the first string is not lexicographically equal
335 to the second string, returns the longest common prefix (using char=)
336 of the two strings. Otherwise, returns ()."
337 (string/=* string1 string2 start1 end1 start2 end2))
338
339
340 (eval-when (compile eval)
341
342 ;;; STRING-NOT-EQUAL-LOOP is used to generate character comparison loops for
343 ;;; STRING-EQUAL and STRING-NOT-EQUAL.
344 (defmacro string-not-equal-loop (end end-value
345 &optional (abort-value nil abortp))
346 (declare (fixnum end))
347 (let ((end-test (if (= end 1)
348 `(= index1 (the fixnum end1))
349 `(= index2 (the fixnum end2)))))
350 `(do ((index1 start1 (1+ index1))
351 (index2 start2 (1+ index2)))
352 (,(if abortp
353 end-test
354 `(or ,end-test
355 (not (char-equal (schar string1 index1)
356 (schar string2 index2)))))
357 ,end-value)
358 (declare (fixnum index1 index2))
359 ,@(if abortp
360 `((if (not (char-equal (schar string1 index1)
361 (schar string2 index2)))
362 (return ,abort-value)))))))
363 ) ; eval-when
364
365 #+unicode
366 (defun string-case-fold (string &key (start 0) end (casing :simple))
367 "Return a new string with the case folded according to Casing as follows:
368
369 :SIMPLE Unicode simple case folding (preserving length)
370 :FULL Unicode full case folding (possibly changing length)
371
372 Default Casing is :SIMPLE."
373 (ecase casing
374 (:simple
375 (with-output-to-string (s)
376 (with-one-string string start end offset
377 (do ((index offset (1+ index)))
378 ((>= index end))
379 (multiple-value-bind (code widep)
380 (codepoint string index)
381 (when widep (incf index))
382 (let ((new (,f code)))
383 (multiple-value-bind (hi lo)
384 (surrogates (,f code))
385 (write-char hi s)
386 (when lo (write-char lo s)))))))))
387 (:full
388 (with-output-to-string (s)
389 (with-one-string string start end offset
390 (do ((index offset (1+ index)))
391 ((>= index end))
392 (multiple-value-bind (code widep)
393 (codepoint string index)
394 (when widep (incf index))
395 (write-string (,f code) s))))))))
396
397 (defun string-equal (string1 string2 &key (start1 0) end1 (start2 0) end2)
398 "Given two strings (string1 and string2), and optional integers start1,
399 start2, end1 and end2, compares characters in string1 to characters in
400 string2 (using char-equal)."
401 (declare (fixnum start1 start2))
402 (with-two-strings string1 string2 start1 end1 offset1 start2 end2
403 (let ((slen1 (- (the fixnum end1) start1))
404 (slen2 (- (the fixnum end2) start2)))
405 (declare (fixnum slen1 slen2))
406 (if (or (minusp slen1) (minusp slen2))
407 ;;prevent endless looping later.
408 (error "Improper bounds for string comparison."))
409 (if (= slen1 slen2)
410 ;;return () immediately if lengths aren't equal.
411 (string-not-equal-loop 1 t nil)))))
412
413 (defun string-not-equal (string1 string2 &key (start1 0) end1 (start2 0) end2)
414 "Given two strings, if the first string is not lexicographically equal
415 to the second string, returns the longest common prefix (using char-equal)
416 of the two strings. Otherwise, returns ()."
417 (with-two-strings string1 string2 start1 end1 offset1 start2 end2
418 (let ((slen1 (- end1 start1))
419 (slen2 (- end2 start2)))
420 (declare (fixnum slen1 slen2))
421 (if (or (minusp slen1) (minusp slen2))
422 ;;prevent endless looping later.
423 (error "Improper bounds for string comparison."))
424 (cond ((or (minusp slen1) (or (minusp slen2)))
425 (error "Improper substring for comparison."))
426 ((= slen1 slen2)
427 (string-not-equal-loop 1 nil (- index1 offset1)))
428 ((< slen1 slen2)
429 (string-not-equal-loop 1 (- index1 offset1)))
430 (t
431 (string-not-equal-loop 2 (- index1 offset1)))))))
432
433
434
435 (eval-when (compile eval)
436
437 ;;; STRING-LESS-GREATER-EQUAL-TESTS returns a test on the lengths of string1
438 ;;; and string2 and a test on the current characters from string1 and string2
439 ;;; for the following macro.
440 (defun string-less-greater-equal-tests (lessp equalp)
441 (if lessp
442 (if equalp
443 ;; STRING-NOT-GREATERP
444 (values '<=
445 #-unicode `(not (char-greaterp char1 char2))
446 #+unicode `(<= char1 char2))
447 ;; STRING-LESSP
448 (values '<
449 #-unicode `(char-lessp char1 char2)
450 #+unicode `(< char1 char2)))
451 (if equalp
452 ;; STRING-NOT-LESSP
453 (values '>=
454 #-unicode `(not (char-lessp char1 char2))
455 #+unicode `(>= char1 char2))
456 ;; STRING-GREATERP
457 (values '>
458 #-unicode `(char-greaterp char1 char2)
459 #+unicode `(> char1 char2)))))
460
461 #-unicode
462 (defmacro string-less-greater-equal (lessp equalp)
463 (multiple-value-bind (length-test character-test)
464 (string-less-greater-equal-tests lessp equalp)
465 `(with-two-strings string1 string2 start1 end1 offset1 start2 end2
466 (let ((slen1 (- (the fixnum end1) start1))
467 (slen2 (- (the fixnum end2) start2)))
468 (declare (fixnum slen1 slen2))
469 (if (or (minusp slen1) (minusp slen2))
470 ;;prevent endless looping later.
471 (error "Improper bounds for string comparison."))
472 (do ((index1 start1 (1+ index1))
473 (index2 start2 (1+ index2))
474 (char1)
475 (char2))
476 ((or (= index1 (the fixnum end1)) (= index2 (the fixnum end2)))
477 (if (,length-test slen1 slen2) (- index1 offset1)))
478 (declare (fixnum index1 index2))
479 (setq char1 (schar string1 index1))
480 (setq char2 (schar string2 index2))
481 (if (not (char-equal char1 char2))
482 (if ,character-test
483 (return (- index1 offset1))
484 (return ()))))))))
485
486 ;; Convert to lowercase for case folding, to match what Unicode
487 ;; CaseFolding.txt says. An example where this matters: U+1E9E maps
488 ;; to U+00DF. But the uppercase version of U+00DF is U+00DF.
489 #+unicode
490 (defmacro equal-char-codepoint (codepoint)
491 `(let ((ch ,codepoint))
492 ;; Handle ASCII separately for bootstrapping and for unidata missing.
493 (if (< 64 ch 91)
494 (+ ch 32)
495 #-(and unicode (not unicode-bootstrap))
496 ch
497 #+(and unicode (not unicode-bootstrap))
498 (if (> ch 127) (unicode-lower ch) ch))))
499
500 #+unicode
501 (defmacro string-less-greater-equal (lessp equalp)
502 (multiple-value-bind (length-test character-test)
503 (string-less-greater-equal-tests lessp equalp)
504 `(with-two-strings string1 string2 start1 end1 offset1 start2 end2
505 (let ((slen1 (- (the fixnum end1) start1))
506 (slen2 (- (the fixnum end2) start2)))
507 (declare (fixnum slen1 slen2))
508 (if (or (minusp slen1) (minusp slen2))
509 ;;prevent endless looping later.
510 (error "Improper bounds for string comparison."))
511 (do ((index1 start1 (1+ index1))
512 (index2 start2 (1+ index2)))
513 ((or (= index1 (the fixnum end1)) (= index2 (the fixnum end2)))
514 (if (,length-test slen1 slen2) (- index1 offset1)))
515 (declare (fixnum index1 index2))
516 (multiple-value-bind (char1 wide1)
517 (codepoint string1 index1)
518 (declare (type codepoint char1))
519 (multiple-value-bind (char2 wide2)
520 (codepoint string2 index2)
521 (declare (type codepoint char2))
522 (setf char1 (equal-char-codepoint char1))
523 (setf char2 (equal-char-codepoint char2))
524 (if (= char1 char2)
525 (progn
526 (when wide1 (incf index1))
527 (when wide2 (incf index2)))
528 (if ,character-test
529 (return (- index1 offset1))
530 (return ()))))))))))
531
532 ) ; eval-when
533
534 (defun string-lessp* (string1 string2 start1 end1 start2 end2)
535 (declare (fixnum start1 start2))
536 (string-less-greater-equal t nil))
537
538 (defun string-greaterp* (string1 string2 start1 end1 start2 end2)
539 (declare (fixnum start1 start2))
540 (string-less-greater-equal nil nil))
541
542 (defun string-not-lessp* (string1 string2 start1 end1 start2 end2)
543 (declare (fixnum start1 start2))
544 (string-less-greater-equal nil t))
545
546 (defun string-not-greaterp* (string1 string2 start1 end1 start2 end2)
547 (declare (fixnum start1 start2))
548 (string-less-greater-equal t t))
549
550 (defun string-lessp (string1 string2 &key (start1 0) end1 (start2 0) end2)
551 "Given two strings, if the first string is lexicographically less than
552 the second string, returns the longest common prefix (using char-equal)
553 of the two strings. Otherwise, returns ()."
554 (string-lessp* string1 string2 start1 end1 start2 end2))
555
556 (defun string-greaterp (string1 string2 &key (start1 0) end1 (start2 0) end2)
557 "Given two strings, if the first string is lexicographically greater than
558 the second string, returns the longest common prefix (using char-equal)
559 of the two strings. Otherwise, returns ()."
560 (string-greaterp* string1 string2 start1 end1 start2 end2))
561
562 (defun string-not-lessp (string1 string2 &key (start1 0) end1 (start2 0) end2)
563 "Given two strings, if the first string is lexicographically greater
564 than or equal to the second string, returns the longest common prefix
565 (using char-equal) of the two strings. Otherwise, returns ()."
566 (string-not-lessp* string1 string2 start1 end1 start2 end2))
567
568 (defun string-not-greaterp (string1 string2 &key (start1 0) end1 (start2 0)
569 end2)
570 "Given two strings, if the first string is lexicographically less than
571 or equal to the second string, returns the longest common prefix
572 (using char-equal) of the two strings. Otherwise, returns ()."
573 (string-not-greaterp* string1 string2 start1 end1 start2 end2))
574
575
576 (defun make-string (count &key element-type ((:initial-element fill-char)))
577 "Given a character count and an optional fill character, makes and returns
578 a new string Count long filled with the fill character."
579 (declare (type fixnum count))
580 (assert (subtypep element-type 'character))
581 (if fill-char
582 (do ((i 0 (1+ i))
583 (string (make-string count)))
584 ((= i count) string)
585 (declare (fixnum i))
586 (setf (schar string i) fill-char))
587 (make-string count)))
588
589 (defun string-upcase-simple (string &key (start 0) end)
590 (declare (fixnum start))
591 (let* ((string (if (stringp string) string (string string)))
592 (slen (length string)))
593 (declare (fixnum slen))
594 (with-one-string string start end offset
595 (let ((offset-slen (+ slen offset))
596 (newstring (make-string slen)))
597 (declare (fixnum offset-slen))
598 (do ((index offset (1+ index))
599 (new-index 0 (1+ new-index)))
600 ((= index start))
601 (declare (fixnum index new-index))
602 (setf (schar newstring new-index) (schar string index)))
603 (do ((index start (1+ index))
604 (new-index (- start offset) (1+ new-index)))
605 ((= index (the fixnum end)))
606 (declare (fixnum index new-index))
607 (multiple-value-bind (code wide) (codepoint string index)
608 (when wide (incf index))
609 ;; Handle ASCII specially because this is called early in
610 ;; initialization, before unidata is available.
611 (cond ((< 96 code 123) (decf code 32))
612 ((> code 127) (setq code (unicode-upper code))))
613 ;;@@ WARNING: this may, in theory, need to extend newstring
614 ;; but that never actually occurs as of Unicode 5.1.0,
615 ;; so I'm just going to ignore it for now...
616 (multiple-value-bind (hi lo) (surrogates code)
617 (setf (schar newstring new-index) hi)
618 (when lo
619 (setf (schar newstring (incf new-index)) lo)))))
620 ;;@@ WARNING: see above
621 (do ((index end (1+ index))
622 (new-index (- (the fixnum end) offset) (1+ new-index)))
623 ((= index offset-slen))
624 (declare (fixnum index new-index))
625 (setf (schar newstring new-index) (schar string index)))
626 newstring))))
627
628 (defun string-upcase-full (string &key (start 0) end)
629 (declare (fixnum start))
630 (let* ((string (if (stringp string) string (string string)))
631 (slen (length string)))
632 (declare (fixnum slen))
633 (with-output-to-string (s)
634 (with-one-string string start end offset
635 (let ((offset-slen (+ slen offset)))
636 (declare (fixnum offset-slen))
637 (write-string string s :start offset :end start)
638 (do ((index start (1+ index)))
639 ((= index (the fixnum end)))
640 (declare (fixnum index))
641 (multiple-value-bind (code wide)
642 (codepoint string index)
643 (when wide (incf index))
644 ;; Handle ASCII specially because this is called early in
645 ;; initialization, before unidata is available.
646 (cond ((< 96 code 123)
647 (write-char (code-char (decf code 32)) s))
648 ((> code 127)
649 (write-string (unicode-full-case-upper code) s))
650 (t
651 (multiple-value-bind (hi lo)
652 (surrogates code)
653 (write-char hi s)
654 (when lo
655 (write-char lo s)))))))
656 (write-string string s :start end :end offset-slen))))))
657
658 (defun string-upcase (string &key (start 0) end #+unicode (casing :simple))
659 #-unicode
660 "Given a string, returns a new string that is a copy of it with all
661 lower case alphabetic characters converted to uppercase."
662 #+unicode
663 "Given a string, returns a new string that is a copy of it with all
664 lower case alphabetic characters converted to uppercase. Casing is
665 :simple or :full for simple or full case conversion, respectively."
666 (declare (fixnum start))
667 #-unicode
668 (string-upcase-simple string :start start :end end)
669 #+unicode
670 (if (eq casing :simple)
671 (string-upcase-simple string :start start :end end)
672 (string-upcase-full string :start start :end end)))
673
674 (defun string-downcase-simple (string &key (start 0) end)
675 (declare (fixnum start))
676 (let* ((string (if (stringp string) string (string string)))
677 (slen (length string)))
678 (declare (fixnum slen))
679 (with-one-string string start end offset
680 (let ((offset-slen (+ slen offset))
681 (newstring (make-string slen)))
682 (declare (fixnum offset-slen))
683 (do ((index offset (1+ index))
684 (new-index 0 (1+ new-index)))
685 ((= index start))
686 (declare (fixnum index new-index))
687 (setf (schar newstring new-index) (schar string index)))
688 (do ((index start (1+ index))
689 (new-index (- start offset) (1+ new-index)))
690 ((= index (the fixnum end)))
691 (declare (fixnum index new-index))
692 (multiple-value-bind (code wide) (codepoint string index)
693 (when wide (incf index))
694 ;; Handle ASCII specially because this is called early in
695 ;; initialization, before unidata is available.
696 (cond ((< 64 code 91) (incf code 32))
697 ((> code 127) (setq code (unicode-lower code))))
698 ;;@@ WARNING: this may, in theory, need to extend newstring
699 ;; but that never actually occurs as of Unicode 5.1.0,
700 ;; so I'm just going to ignore it for now...
701 (multiple-value-bind (hi lo) (surrogates code)
702 (setf (schar newstring new-index) hi)
703 (when lo
704 (setf (schar newstring (incf new-index)) lo)))))
705 ;;@@ WARNING: see above
706 (do ((index end (1+ index))
707 (new-index (- (the fixnum end) offset) (1+ new-index)))
708 ((= index offset-slen))
709 (declare (fixnum index new-index))
710 (setf (schar newstring new-index) (schar string index)))
711 newstring))))
712
713 (defun string-downcase-full (string &key (start 0) end)
714 (declare (fixnum start))
715 (let* ((string (if (stringp string) string (string string)))
716 (slen (length string)))
717 (declare (fixnum slen))
718 (with-output-to-string (s)
719 (with-one-string string start end offset
720 (let ((offset-slen (+ slen offset)))
721 (declare (fixnum offset-slen))
722 (write-string string s :start offset :end start)
723 (do ((index start (1+ index)))
724 ((= index (the fixnum end)))
725 (declare (fixnum index))
726 (multiple-value-bind (code wide)
727 (codepoint string index)
728 (when wide (incf index))
729 ;; Handle ASCII specially because this is called early in
730 ;; initialization, before unidata is available.
731 (cond ((< 64 code 91)
732 (write-char (code-char (incf code 32)) s))
733 ((> code 127)
734 (write-string (unicode-full-case-lower code) s))
735 (t
736 (multiple-value-bind (hi lo)
737 (surrogates code)
738 (write-char hi s)
739 (when lo
740 (write-char lo s)))))))
741 (write-string string s :start end :end offset-slen))))))
742
743 (defun string-downcase (string &key (start 0) end #+unicode (casing :simple))
744 #-unicode
745 "Given a string, returns a new string that is a copy of it with all
746 upper case alphabetic characters converted to lowercase."
747 #+unicode
748 "Given a string, returns a new string that is a copy of it with all
749 upper case alphabetic characters converted to lowercase. Casing is
750 :simple or :full for simple or full case conversion, respectively."
751 (declare (fixnum start))
752 #-unicode
753 (string-downcase-simple string :start start :end end)
754 #+unicode
755 (if (eq casing :simple)
756 (string-downcase-simple string :start start :end end)
757 (string-downcase-full string :start start :end end)))
758
759 (defun string-capitalize-simple (string &key (start 0) end)
760 (declare (fixnum start))
761 (let* ((string (if (stringp string) string (string string)))
762 (slen (length string)))
763 (declare (fixnum slen))
764 (with-one-string string start end offset
765 (let ((offset-slen (+ slen offset))
766 (newstring (make-string slen)))
767 (declare (fixnum offset-slen))
768 (do ((index offset (1+ index))
769 (new-index 0 (1+ new-index)))
770 ((= index start))
771 (declare (fixnum index new-index))
772 (setf (schar newstring new-index) (schar string index)))
773 (do ((index start (1+ index))
774 (new-index (- start offset) (1+ new-index))
775 (newword t)
776 (char ()))
777 ((= index (the fixnum end)))
778 (declare (fixnum index new-index))
779 (setq char (schar string index))
780 (cond ((not (alphanumericp char))
781 (setq newword t))
782 (newword
783 ;;char is first case-modifiable after non-case-modifiable
784 (setq char (char-titlecase char))
785 (setq newword ()))
786 ;;char is case-modifiable, but not first
787 (t (setq char (char-downcase char))))
788 (setf (schar newstring new-index) char))
789 (do ((index end (1+ index))
790 (new-index (- (the fixnum end) offset) (1+ new-index)))
791 ((= index offset-slen))
792 (declare (fixnum index new-index))
793 (setf (schar newstring new-index) (schar string index)))
794 newstring))))
795
796 (defun string-capitalize-full (string &key (start 0) end)
797 (declare (fixnum start))
798 (let* ((string (if (stringp string) string (string string)))
799 (slen (length string)))
800 (declare (fixnum slen))
801 (with-output-to-string (s)
802 (with-one-string string start end offset
803 (let ((offset-slen (+ slen offset)))
804 (declare (fixnum offset-slen))
805 (write-string string s :start offset :end start)
806 (flet ((alphanump (m)
807 (or (< 47 m 58) (< 64 m 91) (< 96 m 123)
808 #+(and unicode (not unicode-bootstrap))
809 (and (> m 127)
810 (<= +unicode-category-letter+ (unicode-category m)
811 (+ +unicode-category-letter+ #x0F))))))
812 (do ((index start (1+ index))
813 (newword t))
814 ((= index (the fixnum end)))
815 (declare (fixnum index))
816 (multiple-value-bind (code wide)
817 (codepoint string index)
818 (when wide (incf index))
819 (cond ((not (alphanump code))
820 (multiple-value-bind (hi lo)
821 (surrogates code)
822 (write-char hi s)
823 (when lo (write-char lo s)))
824 (setq newword t))
825 (newword
826 ;;char is first case-modifiable after non-case-modifiable
827 (write-string (unicode-full-case-title code) s)
828 (setq newword ()))
829 ;;char is case-modifiable, but not first
830 (t
831 (write-string (unicode-full-case-lower code) s))))))
832 (write-string string s :start end :end offset-slen))))))
833
834 (defun string-capitalize (string &key (start 0) end #+unicode (casing :simple))
835 #-unicode
836 "Given a string, returns a copy of the string with the first
837 character of each ``word'' converted to upper-case, and remaining
838 chars in the word converted to lower case. A ``word'' is defined
839 to be a string of case-modifiable characters delimited by
840 non-case-modifiable chars."
841 #+unicode
842 "Given a string, returns a copy of the string with the first
843 character of each ``word'' converted to upper-case, and remaining
844 chars in the word converted to lower case. A ``word'' is defined
845 to be a string of case-modifiable characters delimited by
846 non-case-modifiable chars. Casing is :simple or :full for
847 simple or full case conversion, respectively."
848
849 (declare (fixnum start))
850 #-unicode
851 (string-capitalize-simple string :start start :end end)
852 #+unicode
853 (if (eq casing :simple)
854 (string-capitalize-simple string :start start :end end)
855 (string-capitalize-full string :start start :end end)))
856
857 (defun nstring-upcase (string &key (start 0) end)
858 "Given a string, returns that string with all lower case alphabetic
859 characters converted to uppercase."
860 (declare (fixnum start))
861 (let ((save-header string))
862 (with-one-string string start end offset
863 (do ((index start (1+ index)))
864 ((= index (the fixnum end)))
865 (declare (fixnum index))
866 (multiple-value-bind (code wide) (codepoint string index)
867 (declare (ignore wide))
868 ;; Handle ASCII specially because this is called early in
869 ;; initialization, before unidata is available.
870 (cond ((< 96 code 123) (decf code 32))
871 ((> code 127) (setq code (unicode-upper code))))
872 ;;@@ WARNING: this may, in theory, need to extend string
873 ;; (which, obviously, we can't do here. Unless
874 ;; STRING is adjustable, maybe)
875 ;; but that never actually occurs as of Unicode 5.1.0,
876 ;; so I'm just going to ignore it for now...
877 (multiple-value-bind (hi lo) (surrogates code)
878 (setf (schar string index) hi)
879 (when lo
880 (setf (schar string (incf index)) lo))))))
881 save-header))
882
883 (defun nstring-downcase (string &key (start 0) end)
884 "Given a string, returns that string with all upper case alphabetic
885 characters converted to lowercase."
886 (declare (fixnum start))
887 (let ((save-header string))
888 (with-one-string string start end offset
889 (do ((index start (1+ index)))
890 ((= index (the fixnum end)))
891 (declare (fixnum index))
892 (multiple-value-bind (code wide) (codepoint string index)
893 (declare (ignore wide))
894 (cond ((< 64 code 91) (incf code 32))
895 ((> code 127) (setq code (unicode-lower code))))
896 ;;@@ WARNING: this may, in theory, need to extend string
897 ;; (which, obviously, we can't do here. Unless
898 ;; STRING is adjustable, maybe)
899 ;; but that never actually occurs as of Unicode 5.1.0,
900 ;; so I'm just going to ignore it for now...
901 (multiple-value-bind (hi lo) (surrogates code)
902 (setf (schar string index) hi)
903 (when lo
904 (setf (schar string (incf index)) lo))))))
905 save-header))
906
907 (defun nstring-capitalize (string &key (start 0) end)
908 "Given a string, returns that string with the first
909 character of each ``word'' converted to upper-case, and remaining
910 chars in the word converted to lower case. A ``word'' is defined
911 to be a string of case-modifiable characters delimited by
912 non-case-modifiable chars."
913 (declare (fixnum start))
914 (let ((save-header string))
915 (with-one-string string start end offset
916 (do ((index start (1+ index))
917 (newword t)
918 (char ()))
919 ((= index (the fixnum end)))
920 (declare (fixnum index))
921 (setq char (schar string index))
922 (cond ((not (alphanumericp char))
923 (setq newword t))
924 (newword
925 ;;char is first case-modifiable after non-case-modifiable
926 (setf (schar string index) (char-titlecase char))
927 (setq newword ()))
928 (t
929 (setf (schar string index) (char-downcase char))))))
930 save-header))
931
932
933 #+unicode
934 (progn
935 ;; Like string-left-trim, but return the index
936 (defun string-left-trim-index (char-bag string)
937 (with-string string
938 (if (stringp char-bag)
939 ;; When char-bag is a string, we try to do the right thing.
940 ;; Convert char-bag to a list of codepoints and compare the
941 ;; codepoints in the string with this.
942 (let ((code-bag (with-string char-bag
943 (do ((index start (1+ index))
944 (result nil))
945 ((= index end)
946 (nreverse result))
947 (multiple-value-bind (c widep)
948 (codepoint char-bag index)
949 (push c result)
950 (when widep (incf index)))))))
951 (do ((index start (1+ index)))
952 ((= index (the fixnum end))
953 end)
954 (declare (fixnum index))
955 (multiple-value-bind (c widep)
956 (codepoint string index)
957 (unless (find c code-bag)
958 (return-from string-left-trim-index index))
959 (when widep (incf index)))))
960 ;; When char-bag is a list, we just look at each codepoint of
961 ;; STRING to see if it's in char-bag. If char-bag contains a
962 ;; surrogate, we could accidentally trim off a surrogate,
963 ;; leaving an invalid UTF16 string.
964 (do ((index start (1+ index)))
965 ((= index (the fixnum end))
966 end)
967 (declare (fixnum index))
968 (multiple-value-bind (c widep)
969 (codepoint string index)
970 (unless (find c char-bag :key #'char-code)
971 (return-from string-left-trim-index index))
972 (when widep (incf index)))))))
973
974 (defun string-left-trim (char-bag string)
975 "Given a set of characters (a list or string) and a string, returns
976 a copy of the string with the characters in the set removed from the
977 left end. If the set of characters is a string, surrogates will be
978 properly handled."
979 (let ((begin (string-left-trim-index char-bag string)))
980 (with-string string
981 (subseq string begin end))))
982
983 (defun string-right-trim-index (char-bag string)
984 (with-string string
985 (if (stringp char-bag)
986 ;; When char-bag is a string, we try to do the right thing
987 ;; with surrogates. Convert char-bag to a list of codepoints
988 ;; and compare the codepoints in the string with this.
989 (let ((code-bag (with-string char-bag
990 (do ((index start (1+ index))
991 (result nil))
992 ((= index end)
993 result)
994 (multiple-value-bind (c widep)
995 (codepoint char-bag index)
996 (push c result)
997 (when widep (incf index)))))))
998 (do ((index (1- end) (1- index)))
999 ((< index start)
1000 start)
1001 (declare (fixnum index))
1002 (multiple-value-bind (c widep)
1003 (codepoint string index)
1004 (unless (find c code-bag)
1005 (return-from string-right-trim-index (1+ index)))
1006 (when widep (decf index)))))
1007 ;; When char-bag is a list, we just look at each codepoint of
1008 ;; STRING to see if it's in char-bag. If char-bag contains a
1009 ;; surrogate, we could accidentally trim off a surrogate,
1010 ;; leaving an invalid UTF16 string.
1011 (do ((index (1- end) (1- index)))
1012 ((< index start)
1013 start)
1014 (declare (fixnum index))
1015 (multiple-value-bind (c widep)
1016 (codepoint string index)
1017 (unless (find c char-bag :key #'char-code)
1018 (return-from string-right-trim-index (1+ index)))
1019 (when widep (decf index)))))))
1020
1021 (defun string-right-trim (char-bag string)
1022 "Given a set of characters (a list or string) and a string, returns
1023 a copy of the string with the characters in the set removed from the
1024 right end. If the set of characters is a string, surrogates will be
1025 properly handled."
1026 (let ((stop (string-right-trim-index char-bag string)))
1027 (with-string string
1028 (subseq string start stop))))
1029
1030 (defun string-trim (char-bag string)
1031 "Given a set of characters (a list or string) and a string, returns a
1032 copy of the string with the characters in the set removed from both
1033 ends. If the set of characters is a string, surrogates will be
1034 properly handled."
1035 (let ((left-end (string-left-trim-index char-bag string))
1036 (right-end (string-right-trim-index char-bag string)))
1037 (with-string string
1038 (subseq (the simple-string string) left-end right-end))))
1039 ) ; end unicode version
1040
1041 #-unicode
1042 (progn
1043 (defun string-left-trim (char-bag string)
1044 "Given a set of characters (a list or string) and a string, returns
1045 a copy of the string with the characters in the set removed from the
1046 left end."
1047 (with-string string
1048 (do ((index start (1+ index)))
1049 ((or (= index (the fixnum end))
1050 (not (find (schar string index) char-bag)))
1051 (subseq (the simple-string string) index end))
1052 (declare (fixnum index)))))
1053
1054 (defun string-right-trim (char-bag string)
1055 "Given a set of characters (a list or string) and a string, returns
1056 a copy of the string with the characters in the set removed from the
1057 right end."
1058 (with-string string
1059 (do ((index (1- (the fixnum end)) (1- index)))
1060 ((or (< index start) (not (find (schar string index) char-bag)))
1061 (subseq (the simple-string string) start (1+ index)))
1062 (declare (fixnum index)))))
1063
1064 (defun string-trim (char-bag string)
1065 "Given a set of characters (a list or string) and a string, returns a
1066 copy of the string with the characters in the set removed from both
1067 ends."
1068 (with-string string
1069 (let* ((left-end (do ((index start (1+ index)))
1070 ((or (= index (the fixnum end))
1071 (not (find (schar string index) char-bag)))
1072 index)
1073 (declare (fixnum index))))
1074 (right-end (do ((index (1- (the fixnum end)) (1- index)))
1075 ((or (< index left-end)
1076 (not (find (schar string index) char-bag)))
1077 (1+ index))
1078 (declare (fixnum index)))))
1079 (subseq (the simple-string string) left-end right-end))))
1080 ) ; non-unicode version
1081
1082 (declaim (inline %glyph-f %glyph-b))
1083 (defun %glyph-f (string index)
1084 (declare (optimize (speed 3) (space 0) (safety 0) (debug 0))
1085 (type simple-string string) (type kernel:index index))
1086 (let* ((prev 0)
1087 (l (length string))
1088 (c (codepoint string index l))
1089 (n (+ index (if (> c #xFFFF) 2 1))))
1090 (declare (type codepoint c) (type kernel:index l n))
1091 (loop while (< n l) do
1092 (let* ((c (codepoint string n l))
1093 (d (the (unsigned-byte 8) (unicode-combining-class c))))
1094 (when (or (zerop d) (< d prev))
1095 (return))
1096 (setq prev d)
1097 (incf n (if (> c #xFFFF) 2 1))))
1098 n))
1099
1100 (defun %glyph-b (string index)
1101 (declare (optimize (speed 3) (space 0) (safety 0) (debug 0))
1102 (type simple-string string) (type kernel:index index))
1103 (let* ((prev 255)
1104 (n (1- index)))
1105 (declare (type kernel:index n))
1106 (loop until (< n 0) do
1107 (let* ((c (codepoint string n 0))
1108 (d (the (unsigned-byte 8) (unicode-combining-class c))))
1109 (cond ((zerop d) (return))
1110 ((> d prev) (incf n (if (> c #xFFFF) 2 1)) (return)))
1111 (setq prev d)
1112 (decf n (if (> c #xFFFF) 2 1))))
1113 n))
1114
1115 (defun glyph (string index &key (from-end nil))
1116 "GLYPH returns the glyph at the indexed position in a string, and the
1117 position of the next glyph (or NIL) as a second value. A glyph is
1118 a substring consisting of the character at INDEX followed by all
1119 subsequent combining characters."
1120 (declare (type simple-string string) (type kernel:index index))
1121 #-unicode
1122 (char string index)
1123 #+unicode
1124 (with-array-data ((string string) (start) (end))
1125 (declare (ignore start end))
1126 (let ((n (if from-end (%glyph-b string index) (%glyph-f string index))))
1127 (if from-end
1128 (values (subseq string n index) (and (> n 0) n))
1129 (values (subseq string index n) (and (< n (length string)) n))))))
1130
1131 (defun sglyph (string index &key (from-end nil))
1132 "SGLYPH returns the glyph at the indexed position, the same as GLYPH,
1133 except that the string must be a simple-string"
1134 (declare (type simple-string string) (type kernel:index index))
1135 #-unicode
1136 (schar string index)
1137 #+unicode
1138 (let ((n (if from-end (%glyph-b string index) (%glyph-f string index))))
1139 (if from-end
1140 (values (subseq string n index) (and (> n 0) n))
1141 (values (subseq string index n) (and (< n (length string)) n)))))
1142
1143 #+unicode
1144 (defun string-reverse* (sequence)
1145 (declare (optimize (speed 3) (space 0) (safety 0))
1146 (type string sequence))
1147 (with-string sequence
1148 (let* ((length (- end start))
1149 (string (make-string length))
1150 (j length))
1151 (declare (type kernel:index length j))
1152 (loop for i = start then n as n = (%glyph-f sequence i) do
1153 (replace string sequence :start1 (decf j (- n i)) :start2 i :end2 n)
1154 while (< n end))
1155 string)))
1156
1157 #+unicode
1158 (defun string-nreverse* (sequence)
1159 (declare (optimize (speed 3) (space 0) (safety 0))
1160 (type string sequence))
1161 (with-string sequence
1162 (flet ((rev (start end)
1163 (do ((i start (1+ i))
1164 (j (1- end) (1- j)))
1165 ((>= i j))
1166 (declare (type kernel:index i j))
1167 (rotatef (schar sequence i) (schar sequence j)))))
1168 (let ((len end))
1169 (loop for i = start then n as n = (%glyph-f sequence i) do
1170 (rev i n) while (< n len))
1171 (rev start end))))
1172 sequence)
1173
1174
1175
1176
1177 (defun decompose (string &optional (compatibility t))
1178 (declare (type string string))
1179 (let ((result (make-string (cond ((< (length string) 40)
1180 (* 5 (length string)))
1181 ((< (length string) 4096)
1182 (* 2 (length string)))
1183 (t (round (length string) 5/6)))))
1184 (fillptr 0))
1185 (declare (type kernel:index fillptr))
1186 (labels ((rec (string start end)
1187 (declare (type simple-string string))
1188 (do ((i start (1+ i)))
1189 ((= i end))
1190 (declare (type kernel:index i))
1191 (multiple-value-bind (code wide) (codepoint string i)
1192 (when wide (incf i))
1193 (let ((decomp (unicode-decomp code compatibility)))
1194 (if decomp (rec decomp 0 (length decomp)) (out code))))))
1195 (out (code)
1196 (multiple-value-bind (hi lo) (surrogates code)
1197 (outch hi)
1198 (when lo
1199 (outch lo))
1200 (let ((cc (unicode-combining-class code)))
1201 (unless (zerop cc)
1202 (order lo cc (- fillptr (if lo 3 2)))))))
1203 (outch (char)
1204 (when (= fillptr (length result))
1205 (let ((tmp (make-string (round (length result) 5/6))))
1206 (replace tmp result)
1207 (setq result tmp)))
1208 (setf (schar result fillptr) char)
1209 (incf fillptr))
1210 (order (wide1 cc last)
1211 (loop until (minusp last) do
1212 (multiple-value-bind (code2 wide2) (codepoint result last)
1213 (let ((cc2 (unicode-combining-class code2)))
1214 (cond ((zerop cc2) (return))
1215 ((> cc2 cc)
1216 (case (+ (if wide2 2 0) (if wide1 1 0))
1217 (0 (rotatef (schar result last)
1218 (schar result (1+ last))))
1219 (1 (rotatef (schar result last)
1220 (schar result (+ last 1))
1221 (schar result (+ last 2))))
1222 (2 (rotatef (schar result last)
1223 (schar result (1- last))
1224 (schar result (1+ last))))
1225 (3 (rotatef (schar result last)
1226 (schar result (+ last 2)))
1227 (rotatef (schar result (1- last))
1228 (schar result (1+ last)))))
1229 (decf last (if wide2 2 1)))
1230 (t (return))))))))
1231 (with-string string
1232 (rec string start end))
1233 (shrink-vector result fillptr))))
1234
1235 (declaim (inline normalized-codepoint-p))
1236 (defun normalized-codepoint-p (cp form)
1237 (ecase form
1238 (:nfc (unicode-nfc-qc cp))
1239 (:nfkc (unicode-nfkc-qc cp))
1240 (:nfd (unicode-nfd-qc cp))
1241 (:nfkd (unicode-nfkd-qc cp))))
1242
1243 ;; Perform check to see if string is already normalized. The Unicode
1244 ;; example can return YES, NO, or MAYBE. For our purposes, only YES
1245 ;; is important, for which we return T. For NO or MAYBE, we return NIL.
1246 (defun normalized-form-p (string &optional (form :nfc))
1247 (declare (type (member :nfc :nfkc :nfd :nfkd) form)
1248 (optimize (speed 3)))
1249 (with-string string
1250 (let ((last-class 0))
1251 (declare (type (integer 0 256) last-class))
1252 (do ((k start (1+ k)))
1253 ((>= k end))
1254 (declare (type kernel:index k))
1255 (multiple-value-bind (ch widep)
1256 (codepoint string k end)
1257 (when widep (incf k))
1258 ;; Handle ASCII specially
1259 (unless (< ch 128)
1260 (let ((class (unicode-combining-class ch)))
1261 (declare (type (unsigned-byte 8) class))
1262 (when (and (> last-class class) (not (zerop class)))
1263 ;; Definitely not normalized
1264 (return-from normalized-form-p nil))
1265 (let ((check (normalized-codepoint-p ch form)))
1266 (unless (eq check :y)
1267 (return-from normalized-form-p nil)))
1268 (setf last-class class)))))
1269 t)))
1270
1271
1272 ;; Compose a string in place. The string must already be in decomposed form.
1273 (defun %compose (target)
1274 (declare (type string target)
1275 (optimize (speed 3)))
1276 (let ((len (length target))
1277 (starter-pos 0))
1278 (declare (type kernel:index starter-pos))
1279 (multiple-value-bind (starter-ch wide)
1280 (codepoint target 0 len)
1281 (let ((comp-pos (if wide 2 1))
1282 (last-class (unicode-combining-class starter-ch)))
1283 (declare (type (integer 0 256) last-class)
1284 (type kernel:index comp-pos))
1285 (unless (zerop last-class)
1286 ;; Fix for strings starting with a combining character
1287 (setf last-class 256))
1288 ;; Loop on decomposed characters, combining where possible
1289 (do ((decomp-pos comp-pos (1+ decomp-pos)))
1290 ((>= decomp-pos len))
1291 (declare (type kernel:index decomp-pos))
1292 (multiple-value-bind (ch wide)
1293 (codepoint target decomp-pos len)
1294 (when wide (incf decomp-pos))
1295 (let ((ch-class (unicode-combining-class ch))
1296 (composite (get-pairwise-composition starter-ch ch)))
1297 (declare (type (integer 0 256) ch-class))
1298 (cond ((and composite
1299 (or (< last-class ch-class) (zerop last-class)))
1300 ;; Don't have to worry about surrogate pairs here
1301 ;; because the composite is always in the BMP.
1302 (setf (aref target starter-pos) (code-char composite))
1303 (setf starter-ch composite))
1304 (t
1305 (when (zerop ch-class)
1306 (setf starter-pos comp-pos)
1307 (setf starter-ch ch))
1308 (setf last-class ch-class)
1309 (multiple-value-bind (hi lo)
1310 (surrogates ch)
1311 (setf (aref target comp-pos) hi)
1312 (when lo
1313 (incf comp-pos)
1314 (setf (aref target comp-pos) lo))
1315 (incf comp-pos)))))))
1316 (shrink-vector target comp-pos)))))
1317
1318 (defun string-to-nfd (string)
1319 "Convert String to Unicode Normalization Form D (NFD) using the
1320 canonical decomposition. The NFD string is returned"
1321 (decompose string nil))
1322
1323 (defun string-to-nfkd (string)
1324 "Convert String to Unicode Normalization Form KD (NFKD) uisng the
1325 compatible decomposition form. The NFKD string is returned."
1326 (decompose string t))
1327
1328 #+unicode
1329 (defun string-to-nfc (string)
1330 "Convert String to Unicode Normalization Form C (NFC). If the
1331 string a simple string and is already normalized, the original
1332 string is returned."
1333 (if (normalized-form-p string :nfc)
1334 (if (simple-string-p string) string (coerce string 'simple-string))
1335 (coerce (if (normalized-form-p string :nfd)
1336 (%compose (copy-seq string))
1337 (%compose (string-to-nfd string)))
1338 'simple-string)))
1339
1340 #-unicode ;; Needed by package.lisp
1341 (defun string-to-nfc (string)
1342 (if (simple-string-p string) string (coerce string 'simple-string)))
1343
1344 (defun string-to-nfkc (string)
1345 "Convert String to Unicode Normalization Form KC (NFKC). If the
1346 string is a simple string and is already normalized, the original
1347 string is returned."
1348 (if (normalized-form-p string :nfkc)
1349 (if (simple-string-p string) string (coerce string 'simple-string))
1350 (coerce (if (normalized-form-p string :nfkd)
1351 (%compose (copy-seq string))
1352 (%compose (string-to-nfkd string)))
1353 'simple-string)))

  ViewVC Help
Powered by ViewVC 1.1.5