/[cmucl]/src/code/string.lisp
ViewVC logotype

Contents of /src/code/string.lisp

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.12.30.21 - (show annotations)
Tue May 26 02:15:55 2009 UTC (4 years, 10 months ago) by rtoy
Branch: unicode-utf16-extfmt-branch
Changes since 1.12.30.20: +169 -6 lines
Add support for Unicode NFC and NFKC forms.  Implement STRING-TO-NFC
and STRING-TO-NFKC.

This probably needs some more work.  The composition table should
probably be a trie and should be in unidata.bin instead of the hash
table that we use now.  The composition exclusion list should be
probably be in unidata.bin too instead of here.

These functions pass all of the normalization tests.
1 ;;; -*- Log: code.log; Package: Lisp -*-
2 ;;;
3 ;;; **********************************************************************
4 ;;; This code was written as part of the CMU Common Lisp project at
5 ;;; Carnegie Mellon University, and has been placed in the public domain.
6 ;;;
7 (ext:file-comment
8 "$Header: /tiger/var/lib/cvsroots/cmucl/src/code/string.lisp,v 1.12.30.21 2009/05/26 02:15:55 rtoy Exp $")
9 ;;;
10 ;;; **********************************************************************
11 ;;;
12 ;;; Functions to implement strings for CMU Common Lisp
13 ;;; Written by David Dill
14 ;;; Rewritten by Skef Wholey, Bill Chiles and Rob MacLachlan.
15 ;;;
16 ;;; ****************************************************************
17 ;;;
18 (in-package "LISP")
19 (export '(char schar glyph sglyph string
20 string= string-equal string< string> string<= string>= string/=
21 string-lessp string-greaterp string-not-lessp string-not-greaterp
22 string-not-equal
23 string-to-nfd string-to-nfkd string-to-nfc string-to-nfkc
24 make-string
25 string-trim string-left-trim string-right-trim
26 string-upcase
27 string-downcase string-capitalize nstring-upcase nstring-downcase
28 nstring-capitalize))
29
30
31 (declaim (inline surrogatep surrogates-to-codepoint codepoint surrogates))
32
33 (defun surrogatep (c &optional surrogate-type)
34 "Test if C is a surrogate. C may be either an integer or a
35 character. Surrogate-type indicates what kind of surrogate to test
36 for. :High means to test for the high (leading) surrogate; :Low
37 tests for the low (trailing surrogate). A value of :Any or Nil
38 tests for any surrogate value (high or low)."
39 (declare (type (or character (integer 0 #x10ffff)) c))
40 (let ((code (if (characterp c)
41 (char-code c)
42 c)))
43 (ecase surrogate-type
44 ((:high :leading)
45 ;; Test for high surrogate
46 (<= #xD800 code #xDBFF))
47 ((:low :trailing)
48 ;; Test for low surrogate
49 (<= #xDC00 code #xDFFF))
50 ((:any nil)
51 ;; Test for any surrogate
52 (<= #xD800 code #xDFFF)))))
53
54 (defun surrogates-to-codepoint (hi lo)
55 "Convert the given Hi and Lo surrogate characters to the
56 corresponding codepoint value"
57 (declare (type character hi lo))
58 (+ (ash (- (the (integer #xD800 #xDBFF) (char-code hi)) #xD800) 10)
59 (the (integer #xDC00 #xDFFF) (char-code lo)) #x2400))
60
61 (defun codepoint (string i &optional (end (length string)))
62 "Return the codepoint value from String at position I. If that
63 position is a surrogate, it is combined with either the previous or
64 following character (when possible) to compute the codepoint. The
65 second return value is NIL if the position is not a surrogate pair.
66 Otherwise +1 or -1 is returned if the position is the high or low
67 surrogate value, respectively."
68 (declare (type simple-string string) (type kernel:index i end))
69 (let ((code (char-code (schar string i))))
70 (cond ((and (surrogatep code :high) (< (1+ i) end))
71 (let ((tmp (char-code (schar string (1+ i)))))
72 (if (surrogatep tmp :low)
73 (values (+ (ash (- code #xD800) 10) tmp #x2400) +1)
74 (values code nil))))
75 ((and (surrogatep code :low) (> i 0))
76 (let ((tmp (char-code (schar string (1- i)))))
77 (if (surrogatep tmp :high)
78 (values (+ (ash (- tmp #xD800) 10) code #x2400) -1)
79 (values code nil))))
80 (t (values code nil)))))
81
82 (defun surrogates (codepoint)
83 "Return the high and low surrogate characters for Codepoint. If
84 Codepoint is in the BMP, the first return value is the corresponding
85 character and the second is NIL."
86 (declare (type (integer 0 #x10FFFF) codepoint))
87 (if (< codepoint #x10000)
88 (values (code-char codepoint) nil)
89 (let* ((tmp (- codepoint #x10000))
90 (hi (logior (ldb (byte 10 10) tmp) #xD800))
91 (lo (logior (ldb (byte 10 0) tmp) #xDC00)))
92 (values (code-char hi) (code-char lo)))))
93
94 (defun utf16-string-p (string)
95 "Check if String is a valid UTF-16 string. If the string is valid,
96 T is returned. If the string is not valid, NIL is returned, and the
97 second value is the index into the string of the invalid character."
98 (do ((len (length string))
99 (index 0 (1+ index)))
100 ((>= index len)
101 t)
102 (multiple-value-bind (codepoint wide)
103 (codepoint string index)
104 ;; We step through the string in order. If there are any
105 ;; surrogates pairs, we must reach the lead surrogate first,
106 ;; which means WIDE is +1. Otherwise, we have an invalid
107 ;; surrogate pair. If we get any codepoint that is in
108 ;; the surrogate range, we also have an invalid string.
109 (when (or (eq wide -1)
110 (surrogatep codepoint))
111 (return-from utf16-string-p (values nil index)))
112 (when wide (incf index)))))
113
114 (defun string (X)
115 "Coerces X into a string. If X is a string, X is returned. If X is a
116 symbol, X's pname is returned. If X is a character then a one element
117 string containing that character is returned. If X cannot be coerced
118 into a string, an error occurs."
119 (cond ((stringp x) x)
120 ((symbolp x) (symbol-name x))
121 ((characterp x)
122 (let ((res (make-string 1)))
123 (setf (schar res 0) x) res))
124 (t
125 (error 'simple-type-error
126 :datum x
127 :expected-type '(or string symbol character)
128 :format-control "~S cannot be coerced to a string."
129 :format-arguments (list x)))))
130
131 ;;; With-One-String is used to set up some string hacking things. The keywords
132 ;;; are parsed, and the string is hacked into a simple-string.
133
134 (eval-when (compile)
135
136 (defmacro with-one-string (string start end cum-offset &rest forms)
137 `(let ((,string (if (stringp ,string) ,string (string ,string))))
138 ;; Optimizer may prove STRING is one.
139 (declare (optimize (ext:inhibit-warnings 3)))
140 (with-array-data ((,string ,string :offset-var ,cum-offset)
141 (,start ,start)
142 (,end (or ,end (length (the vector ,string)))))
143 ,@forms)))
144
145 )
146
147 ;;; With-String is like With-One-String, but doesn't parse keywords.
148
149 (eval-when (compile)
150
151 (defmacro with-string (string &rest forms)
152 `(let ((,string (if (stringp ,string) ,string (string ,string))))
153 (with-array-data ((,string ,string)
154 (start)
155 (end (length (the vector ,string))))
156 ,@forms)))
157
158 )
159
160 ;;; With-Two-Strings is used to set up string comparison operations. The
161 ;;; keywords are parsed, and the strings are hacked into simple-strings.
162
163 (eval-when (compile)
164
165 (defmacro with-two-strings (string1 string2 start1 end1 cum-offset-1
166 start2 end2 &rest forms)
167 `(let ((,string1 (if (stringp ,string1) ,string1 (string ,string1)))
168 (,string2 (if (stringp ,string2) ,string2 (string ,string2))))
169 (with-array-data ((,string1 ,string1 :offset-var ,cum-offset-1)
170 (,start1 ,start1)
171 (,end1 (or ,end1 (length (the vector ,string1)))))
172 (with-array-data ((,string2 ,string2)
173 (,start2 ,start2)
174 (,end2 (or ,end2 (length (the vector ,string2)))))
175 ,@forms))))
176
177 )
178
179
180 (defun char (string index)
181 "Given a string and a non-negative integer index less than the length of
182 the string, returns the character object representing the character at
183 that position in the string."
184 (declare (optimize (safety 1)))
185 (char string index))
186
187 (defun %charset (string index new-el)
188 (declare (optimize (safety 1)))
189 (setf (char string index) new-el))
190
191 (defun schar (string index)
192 "SCHAR returns the character object at an indexed position in a string
193 just as CHAR does, except the string must be a simple-string."
194 (declare (optimize (safety 1)))
195 (schar string index))
196
197 (defun %scharset (string index new-el)
198 (declare (optimize (safety 1)))
199 (setf (schar string index) new-el))
200
201 (defun string=* (string1 string2 start1 end1 start2 end2)
202 (with-two-strings string1 string2 start1 end1 offset1 start2 end2
203 (not (%sp-string-compare string1 start1 end1 string2 start2 end2))))
204
205
206 (defun string/=* (string1 string2 start1 end1 start2 end2)
207 (with-two-strings string1 string2 start1 end1 offset1 start2 end2
208 (let ((comparison (%sp-string-compare string1 start1 end1
209 string2 start2 end2)))
210 (if comparison (- (the fixnum comparison) offset1)))))
211
212 (eval-when (compile eval)
213
214 ;;; Lessp is true if the desired expansion is for string<* or string<=*.
215 ;;; Equalp is true if the desired expansion is for string<=* or string>=*.
216 (defmacro string<>=*-body (lessp equalp)
217 (let ((offset1 (gensym)))
218 `(with-two-strings string1 string2 start1 end1 ,offset1 start2 end2
219 (let ((index (%sp-string-compare string1 start1 end1
220 string2 start2 end2)))
221 (if index
222 (cond ((= (the fixnum index) (the fixnum end1))
223 ,(if lessp
224 `(- (the fixnum index) ,offset1)
225 `nil))
226 ((= (+ (the fixnum index) (- start2 start1))
227 (the fixnum end2))
228 ,(if lessp
229 `nil
230 `(- (the fixnum index) ,offset1)))
231 #-unicode
232 ((,(if lessp 'char< 'char>)
233 (schar string1 index)
234 (schar string2 (+ (the fixnum index) (- start2 start1))))
235 (- (the fixnum index) ,offset1))
236 #-unicode
237 (t nil)
238 #+unicode
239 (t
240 ;; Compare in code point order. See
241 ;; http://icu-project.org/docs/papers/utf16_code_point_order.html
242 (flet ((fixup (code)
243 (if (>= code #xe000)
244 (- code #x800)
245 (+ code #x2000))))
246 (declare (inline fixup))
247 (let* ((c1 (char-code (schar string1 index)))
248 (c2 (char-code (schar string2
249 (+ (the fixnum index)
250 (- start2 start1))))))
251 (cond ((and (>= c1 #xd800)
252 (>= c2 #xd800))
253 (let ((fix-c1 (fixup c1))
254 (fix-c2 (fixup c2)))
255 (if (,(if lessp '< '>) fix-c1 fix-c2)
256 (- (the fixnum index) ,offset1)
257 nil)))
258 (t
259 (if (,(if lessp '< '>) c1 c2)
260 (- (the fixnum index) ,offset1)
261 nil)))))))
262 ,(if equalp `(- (the fixnum end1) ,offset1) 'nil))))))
263 ) ; eval-when
264
265 (defun string<* (string1 string2 start1 end1 start2 end2)
266 (declare (fixnum start1 start2))
267 (string<>=*-body t nil))
268
269 (defun string>* (string1 string2 start1 end1 start2 end2)
270 (declare (fixnum start1 start2))
271 (string<>=*-body nil nil))
272
273 (defun string<=* (string1 string2 start1 end1 start2 end2)
274 (declare (fixnum start1 start2))
275 (string<>=*-body t t))
276
277 (defun string>=* (string1 string2 start1 end1 start2 end2)
278 (declare (fixnum start1 start2))
279 (string<>=*-body nil t))
280
281
282
283 (defun string< (string1 string2 &key (start1 0) end1 (start2 0) end2)
284 "Given two strings, if the first string is lexicographically less than
285 the second string, returns the longest common prefix (using char=)
286 of the two strings. Otherwise, returns ()."
287 (string<* string1 string2 start1 end1 start2 end2))
288
289 (defun string> (string1 string2 &key (start1 0) end1 (start2 0) end2)
290 "Given two strings, if the first string is lexicographically greater than
291 the second string, returns the longest common prefix (using char=)
292 of the two strings. Otherwise, returns ()."
293 (string>* string1 string2 start1 end1 start2 end2))
294
295
296 (defun string<= (string1 string2 &key (start1 0) end1 (start2 0) end2)
297 "Given two strings, if the first string is lexicographically less than
298 or equal to the second string, returns the longest common prefix
299 (using char=) of the two strings. Otherwise, returns ()."
300 (string<=* string1 string2 start1 end1 start2 end2))
301
302 (defun string>= (string1 string2 &key (start1 0) end1 (start2 0) end2)
303 "Given two strings, if the first string is lexicographically greater
304 than or equal to the second string, returns the longest common prefix
305 (using char=) of the two strings. Otherwise, returns ()."
306 (string>=* string1 string2 start1 end1 start2 end2))
307
308 (defun string= (string1 string2 &key (start1 0) end1 (start2 0) end2)
309 "Given two strings (string1 and string2), and optional integers start1,
310 start2, end1 and end2, compares characters in string1 to characters in
311 string2 (using char=)."
312 (string=* string1 string2 start1 end1 start2 end2))
313
314 (defun string/= (string1 string2 &key (start1 0) end1 (start2 0) end2)
315 "Given two strings, if the first string is not lexicographically equal
316 to the second string, returns the longest common prefix (using char=)
317 of the two strings. Otherwise, returns ()."
318 (string/=* string1 string2 start1 end1 start2 end2))
319
320
321 (eval-when (compile eval)
322
323 ;;; STRING-NOT-EQUAL-LOOP is used to generate character comparison loops for
324 ;;; STRING-EQUAL and STRING-NOT-EQUAL.
325 (defmacro string-not-equal-loop (end end-value
326 &optional (abort-value nil abortp))
327 (declare (fixnum end))
328 (let ((end-test (if (= end 1)
329 `(= index1 (the fixnum end1))
330 `(= index2 (the fixnum end2)))))
331 `(do ((index1 start1 (1+ index1))
332 (index2 start2 (1+ index2)))
333 (,(if abortp
334 end-test
335 `(or ,end-test
336 (not (char-equal (schar string1 index1)
337 (schar string2 index2)))))
338 ,end-value)
339 (declare (fixnum index1 index2))
340 ,@(if abortp
341 `((if (not (char-equal (schar string1 index1)
342 (schar string2 index2)))
343 (return ,abort-value)))))))
344
345 ) ; eval-when
346
347 (defun string-equal (string1 string2 &key (start1 0) end1 (start2 0) end2)
348 "Given two strings (string1 and string2), and optional integers start1,
349 start2, end1 and end2, compares characters in string1 to characters in
350 string2 (using char-equal)."
351 (declare (fixnum start1 start2))
352 (with-two-strings string1 string2 start1 end1 offset1 start2 end2
353 (let ((slen1 (- (the fixnum end1) start1))
354 (slen2 (- (the fixnum end2) start2)))
355 (declare (fixnum slen1 slen2))
356 (if (or (minusp slen1) (minusp slen2))
357 ;;prevent endless looping later.
358 (error "Improper bounds for string comparison."))
359 (if (= slen1 slen2)
360 ;;return () immediately if lengths aren't equal.
361 (string-not-equal-loop 1 t nil)))))
362
363 (defun string-not-equal (string1 string2 &key (start1 0) end1 (start2 0) end2)
364 "Given two strings, if the first string is not lexicographically equal
365 to the second string, returns the longest common prefix (using char-equal)
366 of the two strings. Otherwise, returns ()."
367 (with-two-strings string1 string2 start1 end1 offset1 start2 end2
368 (let ((slen1 (- end1 start1))
369 (slen2 (- end2 start2)))
370 (declare (fixnum slen1 slen2))
371 (if (or (minusp slen1) (minusp slen2))
372 ;;prevent endless looping later.
373 (error "Improper bounds for string comparison."))
374 (cond ((or (minusp slen1) (or (minusp slen2)))
375 (error "Improper substring for comparison."))
376 ((= slen1 slen2)
377 (string-not-equal-loop 1 nil (- index1 offset1)))
378 ((< slen1 slen2)
379 (string-not-equal-loop 1 (- index1 offset1)))
380 (t
381 (string-not-equal-loop 2 (- index1 offset1)))))))
382
383
384
385 (eval-when (compile eval)
386
387 ;;; STRING-LESS-GREATER-EQUAL-TESTS returns a test on the lengths of string1
388 ;;; and string2 and a test on the current characters from string1 and string2
389 ;;; for the following macro.
390 (defun string-less-greater-equal-tests (lessp equalp)
391 (if lessp
392 (if equalp
393 ;; STRING-NOT-GREATERP
394 (values '<=
395 #-unicode `(not (char-greaterp char1 char2))
396 #+unicode `(<= char1 char2))
397 ;; STRING-LESSP
398 (values '<
399 #-unicode `(char-lessp char1 char2)
400 #+unicode `(< char1 char2)))
401 (if equalp
402 ;; STRING-NOT-LESSP
403 (values '>=
404 #-unicode `(not (char-lessp char1 char2))
405 #+unicode `(>= char1 char2))
406 ;; STRING-GREATERP
407 (values '>
408 #-unicode `(char-greaterp char1 char2)
409 #+unicode `(> char1 char2)))))
410
411 #-unicode
412 (defmacro string-less-greater-equal (lessp equalp)
413 (multiple-value-bind (length-test character-test)
414 (string-less-greater-equal-tests lessp equalp)
415 `(with-two-strings string1 string2 start1 end1 offset1 start2 end2
416 (let ((slen1 (- (the fixnum end1) start1))
417 (slen2 (- (the fixnum end2) start2)))
418 (declare (fixnum slen1 slen2))
419 (if (or (minusp slen1) (minusp slen2))
420 ;;prevent endless looping later.
421 (error "Improper bounds for string comparison."))
422 (do ((index1 start1 (1+ index1))
423 (index2 start2 (1+ index2))
424 (char1)
425 (char2))
426 ((or (= index1 (the fixnum end1)) (= index2 (the fixnum end2)))
427 (if (,length-test slen1 slen2) (- index1 offset1)))
428 (declare (fixnum index1 index2))
429 (setq char1 (schar string1 index1))
430 (setq char2 (schar string2 index2))
431 (if (not (char-equal char1 char2))
432 (if ,character-test
433 (return (- index1 offset1))
434 (return ()))))))))
435
436 ;; Convert to lowercase for case folding, to match what Unicode
437 ;; CaseFolding.txt says. An example where this matters: U+1E9E maps
438 ;; to U+00DF. But the uppercase version of U+00DF is U+00DF.
439 #+unicode
440 (defmacro equal-char-codepoint (codepoint)
441 `(let ((ch ,codepoint))
442 ;; Handle ASCII separately for bootstrapping and for unidata missing.
443 (if (< 64 ch 91)
444 (+ ch 32)
445 #-(and unicode (not unicode-bootstrap))
446 ch
447 #+(and unicode (not unicode-bootstrap))
448 (if (> ch 127) (unicode-lower ch) ch))))
449
450 #+unicode
451 (defmacro string-less-greater-equal (lessp equalp)
452 (multiple-value-bind (length-test character-test)
453 (string-less-greater-equal-tests lessp equalp)
454 `(with-two-strings string1 string2 start1 end1 offset1 start2 end2
455 (let ((slen1 (- (the fixnum end1) start1))
456 (slen2 (- (the fixnum end2) start2)))
457 (declare (fixnum slen1 slen2))
458 (if (or (minusp slen1) (minusp slen2))
459 ;;prevent endless looping later.
460 (error "Improper bounds for string comparison."))
461 (do ((index1 start1 (1+ index1))
462 (index2 start2 (1+ index2)))
463 ((or (= index1 (the fixnum end1)) (= index2 (the fixnum end2)))
464 (if (,length-test slen1 slen2) (- index1 offset1)))
465 (declare (fixnum index1 index2))
466 (multiple-value-bind (char1 wide1)
467 (codepoint string1 index1)
468 (declare (type (integer 0 #x10ffff) char1))
469 (multiple-value-bind (char2 wide2)
470 (codepoint string2 index2)
471 (declare (type (integer 0 #x10ffff) char2))
472 (setf char1 (equal-char-codepoint char1))
473 (setf char2 (equal-char-codepoint char2))
474 (if (= char1 char2)
475 (progn
476 (when wide1 (incf index1))
477 (when wide2 (incf index2)))
478 (if ,character-test
479 (return (- index1 offset1))
480 (return ()))))))))))
481
482 ) ; eval-when
483
484 (defun string-lessp* (string1 string2 start1 end1 start2 end2)
485 (declare (fixnum start1 start2))
486 (string-less-greater-equal t nil))
487
488 (defun string-greaterp* (string1 string2 start1 end1 start2 end2)
489 (declare (fixnum start1 start2))
490 (string-less-greater-equal nil nil))
491
492 (defun string-not-lessp* (string1 string2 start1 end1 start2 end2)
493 (declare (fixnum start1 start2))
494 (string-less-greater-equal nil t))
495
496 (defun string-not-greaterp* (string1 string2 start1 end1 start2 end2)
497 (declare (fixnum start1 start2))
498 (string-less-greater-equal t t))
499
500 (defun string-lessp (string1 string2 &key (start1 0) end1 (start2 0) end2)
501 "Given two strings, if the first string is lexicographically less than
502 the second string, returns the longest common prefix (using char-equal)
503 of the two strings. Otherwise, returns ()."
504 (string-lessp* string1 string2 start1 end1 start2 end2))
505
506 (defun string-greaterp (string1 string2 &key (start1 0) end1 (start2 0) end2)
507 "Given two strings, if the first string is lexicographically greater than
508 the second string, returns the longest common prefix (using char-equal)
509 of the two strings. Otherwise, returns ()."
510 (string-greaterp* string1 string2 start1 end1 start2 end2))
511
512 (defun string-not-lessp (string1 string2 &key (start1 0) end1 (start2 0) end2)
513 "Given two strings, if the first string is lexicographically greater
514 than or equal to the second string, returns the longest common prefix
515 (using char-equal) of the two strings. Otherwise, returns ()."
516 (string-not-lessp* string1 string2 start1 end1 start2 end2))
517
518 (defun string-not-greaterp (string1 string2 &key (start1 0) end1 (start2 0)
519 end2)
520 "Given two strings, if the first string is lexicographically less than
521 or equal to the second string, returns the longest common prefix
522 (using char-equal) of the two strings. Otherwise, returns ()."
523 (string-not-greaterp* string1 string2 start1 end1 start2 end2))
524
525
526 (defun make-string (count &key element-type ((:initial-element fill-char)))
527 "Given a character count and an optional fill character, makes and returns
528 a new string Count long filled with the fill character."
529 (declare (type fixnum count))
530 (assert (subtypep element-type 'character))
531 (if fill-char
532 (do ((i 0 (1+ i))
533 (string (make-string count)))
534 ((= i count) string)
535 (declare (fixnum i))
536 (setf (schar string i) fill-char))
537 (make-string count)))
538
539 (defun string-upcase (string &key (start 0) end)
540 "Given a string, returns a new string that is a copy of it with
541 all lower case alphabetic characters converted to uppercase."
542 (declare (fixnum start))
543 (let* ((string (if (stringp string) string (string string)))
544 (slen (length string)))
545 (declare (fixnum slen))
546 (with-one-string string start end offset
547 (let ((offset-slen (+ slen offset))
548 (newstring (make-string slen)))
549 (declare (fixnum offset-slen))
550 (do ((index offset (1+ index))
551 (new-index 0 (1+ new-index)))
552 ((= index start))
553 (declare (fixnum index new-index))
554 (setf (schar newstring new-index) (schar string index)))
555 (do ((index start (1+ index))
556 (new-index (- start offset) (1+ new-index)))
557 ((= index (the fixnum end)))
558 (declare (fixnum index new-index))
559 (multiple-value-bind (code wide) (codepoint string index)
560 (when wide (incf index))
561 ;; Handle ASCII specially because this is called early in
562 ;; initialization, before unidata is available.
563 (cond ((< 96 code 123) (decf code 32))
564 ((> code 127) (setq code (unicode-upper code))))
565 ;;@@ WARNING: this may, in theory, need to extend newstring
566 ;; but that never actually occurs as of Unicode 5.1.0,
567 ;; so I'm just going to ignore it for now...
568 (multiple-value-bind (hi lo) (surrogates code)
569 (setf (schar newstring new-index) hi)
570 (when lo
571 (setf (schar newstring (incf new-index)) lo)))))
572 ;;@@ WARNING: see above
573 (do ((index end (1+ index))
574 (new-index (- (the fixnum end) offset) (1+ new-index)))
575 ((= index offset-slen))
576 (declare (fixnum index new-index))
577 (setf (schar newstring new-index) (schar string index)))
578 newstring))))
579
580 (defun string-downcase (string &key (start 0) end)
581 "Given a string, returns a new string that is a copy of it with
582 all upper case alphabetic characters converted to lowercase."
583 (declare (fixnum start))
584 (let* ((string (if (stringp string) string (string string)))
585 (slen (length string)))
586 (declare (fixnum slen))
587 (with-one-string string start end offset
588 (let ((offset-slen (+ slen offset))
589 (newstring (make-string slen)))
590 (declare (fixnum offset-slen))
591 (do ((index offset (1+ index))
592 (new-index 0 (1+ new-index)))
593 ((= index start))
594 (declare (fixnum index new-index))
595 (setf (schar newstring new-index) (schar string index)))
596 (do ((index start (1+ index))
597 (new-index (- start offset) (1+ new-index)))
598 ((= index (the fixnum end)))
599 (declare (fixnum index new-index))
600 (multiple-value-bind (code wide) (codepoint string index)
601 (when wide (incf index))
602 ;; Handle ASCII specially because this is called early in
603 ;; initialization, before unidata is available.
604 (cond ((< 64 code 91) (incf code 32))
605 ((> code 127) (setq code (unicode-lower code))))
606 ;;@@ WARNING: this may, in theory, need to extend newstring
607 ;; but that never actually occurs as of Unicode 5.1.0,
608 ;; so I'm just going to ignore it for now...
609 (multiple-value-bind (hi lo) (surrogates code)
610 (setf (schar newstring new-index) hi)
611 (when lo
612 (setf (schar newstring (incf new-index)) lo)))))
613 ;;@@ WARNING: see above
614 (do ((index end (1+ index))
615 (new-index (- (the fixnum end) offset) (1+ new-index)))
616 ((= index offset-slen))
617 (declare (fixnum index new-index))
618 (setf (schar newstring new-index) (schar string index)))
619 newstring))))
620
621 (defun string-capitalize (string &key (start 0) end)
622 "Given a string, returns a copy of the string with the first
623 character of each ``word'' converted to upper-case, and remaining
624 chars in the word converted to lower case. A ``word'' is defined
625 to be a string of case-modifiable characters delimited by
626 non-case-modifiable chars."
627 (declare (fixnum start))
628 (let* ((string (if (stringp string) string (string string)))
629 (slen (length string)))
630 (declare (fixnum slen))
631 (with-one-string string start end offset
632 (let ((offset-slen (+ slen offset))
633 (newstring (make-string slen)))
634 (declare (fixnum offset-slen))
635 (do ((index offset (1+ index))
636 (new-index 0 (1+ new-index)))
637 ((= index start))
638 (declare (fixnum index new-index))
639 (setf (schar newstring new-index) (schar string index)))
640 (do ((index start (1+ index))
641 (new-index (- start offset) (1+ new-index))
642 (newword t)
643 (char ()))
644 ((= index (the fixnum end)))
645 (declare (fixnum index new-index))
646 (setq char (schar string index))
647 (cond ((not (alphanumericp char))
648 (setq newword t))
649 (newword
650 ;;char is first case-modifiable after non-case-modifiable
651 (setq char (char-titlecase char))
652 (setq newword ()))
653 ;;char is case-modifiable, but not first
654 (t (setq char (char-downcase char))))
655 (setf (schar newstring new-index) char))
656 (do ((index end (1+ index))
657 (new-index (- (the fixnum end) offset) (1+ new-index)))
658 ((= index offset-slen))
659 (declare (fixnum index new-index))
660 (setf (schar newstring new-index) (schar string index)))
661 newstring))))
662
663 (defun nstring-upcase (string &key (start 0) end)
664 "Given a string, returns that string with all lower case alphabetic
665 characters converted to uppercase."
666 (declare (fixnum start))
667 (let ((save-header string))
668 (with-one-string string start end offset
669 (do ((index start (1+ index)))
670 ((= index (the fixnum end)))
671 (declare (fixnum index))
672 (multiple-value-bind (code wide) (codepoint string index)
673 (declare (ignore wide))
674 ;; Handle ASCII specially because this is called early in
675 ;; initialization, before unidata is available.
676 (cond ((< 96 code 123) (decf code 32))
677 ((> code 127) (setq code (unicode-upper code))))
678 ;;@@ WARNING: this may, in theory, need to extend string
679 ;; (which, obviously, we can't do here. Unless
680 ;; STRING is adjustable, maybe)
681 ;; but that never actually occurs as of Unicode 5.1.0,
682 ;; so I'm just going to ignore it for now...
683 (multiple-value-bind (hi lo) (surrogates code)
684 (setf (schar string index) hi)
685 (when lo
686 (setf (schar string (incf index)) lo))))))
687 save-header))
688
689 (defun nstring-downcase (string &key (start 0) end)
690 "Given a string, returns that string with all upper case alphabetic
691 characters converted to lowercase."
692 (declare (fixnum start))
693 (let ((save-header string))
694 (with-one-string string start end offset
695 (do ((index start (1+ index)))
696 ((= index (the fixnum end)))
697 (declare (fixnum index))
698 (multiple-value-bind (code wide) (codepoint string index)
699 (declare (ignore wide))
700 (cond ((< 64 code 91) (incf code 32))
701 ((> code 127) (setq code (unicode-lower code))))
702 ;;@@ WARNING: this may, in theory, need to extend string
703 ;; (which, obviously, we can't do here. Unless
704 ;; STRING is adjustable, maybe)
705 ;; but that never actually occurs as of Unicode 5.1.0,
706 ;; so I'm just going to ignore it for now...
707 (multiple-value-bind (hi lo) (surrogates code)
708 (setf (schar string index) hi)
709 (when lo
710 (setf (schar string (incf index)) lo))))))
711 save-header))
712
713 (defun nstring-capitalize (string &key (start 0) end)
714 "Given a string, returns that string with the first
715 character of each ``word'' converted to upper-case, and remaining
716 chars in the word converted to lower case. A ``word'' is defined
717 to be a string of case-modifiable characters delimited by
718 non-case-modifiable chars."
719 (declare (fixnum start))
720 (let ((save-header string))
721 (with-one-string string start end offset
722 (do ((index start (1+ index))
723 (newword t)
724 (char ()))
725 ((= index (the fixnum end)))
726 (declare (fixnum index))
727 (setq char (schar string index))
728 (cond ((not (alphanumericp char))
729 (setq newword t))
730 (newword
731 ;;char is first case-modifiable after non-case-modifiable
732 (setf (schar string index) (char-titlecase char))
733 (setq newword ()))
734 (t
735 (setf (schar string index) (char-downcase char))))))
736 save-header))
737
738 (defun string-left-trim (char-bag string)
739 "Given a set of characters (a list or string) and a string, returns
740 a copy of the string with the characters in the set removed from the
741 left end."
742 (with-string string
743 (do ((index start (1+ index)))
744 ((or (= index (the fixnum end))
745 (not (find (schar string index) char-bag)))
746 (subseq (the simple-string string) index end))
747 (declare (fixnum index)))))
748
749 (defun string-right-trim (char-bag string)
750 "Given a set of characters (a list or string) and a string, returns
751 a copy of the string with the characters in the set removed from the
752 right end."
753 (with-string string
754 (do ((index (1- (the fixnum end)) (1- index)))
755 ((or (< index start) (not (find (schar string index) char-bag)))
756 (subseq (the simple-string string) start (1+ index)))
757 (declare (fixnum index)))))
758
759 (defun string-trim (char-bag string)
760 "Given a set of characters (a list or string) and a string, returns a
761 copy of the string with the characters in the set removed from both
762 ends."
763 (with-string string
764 (let* ((left-end (do ((index start (1+ index)))
765 ((or (= index (the fixnum end))
766 (not (find (schar string index) char-bag)))
767 index)
768 (declare (fixnum index))))
769 (right-end (do ((index (1- (the fixnum end)) (1- index)))
770 ((or (< index left-end)
771 (not (find (schar string index) char-bag)))
772 (1+ index))
773 (declare (fixnum index)))))
774 (subseq (the simple-string string) left-end right-end))))
775
776 (declaim (inline %glyph-f %glyph-b))
777 (defun %glyph-f (string index)
778 (declare (optimize (speed 3) (space 0) (safety 0) (debug 0))
779 (type simple-string string) (type kernel:index index))
780 (let* ((prev 0)
781 (l (length string))
782 (c (codepoint string index l))
783 (n (+ index (if (> c #xFFFF) 2 1))))
784 (declare (type (integer 0 #x10FFFF) c) (type kernel:index l n))
785 (loop while (< n l) do
786 (let* ((c (codepoint string n l))
787 (d (the (unsigned-byte 8) (unicode-combining-class c))))
788 (when (or (zerop d) (< d prev))
789 (return))
790 (setq prev d)
791 (incf n (if (> c #xFFFF) 2 1))))
792 n))
793
794 (defun %glyph-b (string index)
795 (declare (optimize (speed 3) (space 0) (safety 0) (debug 0))
796 (type simple-string string) (type kernel:index index))
797 (let* ((prev 255)
798 (n (1- index)))
799 (declare (type kernel:index n))
800 (loop until (< n 0) do
801 (let* ((c (codepoint string n 0))
802 (d (the (unsigned-byte 8) (unicode-combining-class c))))
803 (cond ((zerop d) (return))
804 ((> d prev) (incf n (if (> c #xFFFF) 2 1)) (return)))
805 (setq prev d)
806 (decf n (if (> c #xFFFF) 2 1))))
807 n))
808
809 (defun glyph (string index &key (from-end nil))
810 "GLYPH returns the glyph at the indexed position in a string, and the
811 position of the next glyph (or NIL) as a second value. A glyph is
812 a substring consisting of the character at INDEX followed by all
813 subsequent combining characters."
814 (declare (type simple-string string) (type kernel:index index))
815 #-unicode
816 (char string index)
817 #+unicode
818 (with-array-data ((string string) (start) (end))
819 (declare (ignore start end))
820 (let ((n (if from-end (%glyph-b string index) (%glyph-f string index))))
821 (if from-end
822 (values (subseq string n index) (and (> n 0) n))
823 (values (subseq string index n) (and (< n (length string)) n))))))
824
825 (defun sglyph (string index &key (from-end nil))
826 "SGLYPH returns the glyph at the indexed position, the same as GLYPH,
827 except that the string must be a simple-string"
828 (declare (type simple-string string) (type kernel:index index))
829 #-unicode
830 (schar string index)
831 #+unicode
832 (let ((n (if from-end (%glyph-b string index) (%glyph-f string index))))
833 (if from-end
834 (values (subseq string n index) (and (> n 0) n))
835 (values (subseq string index n) (and (< n (length string)) n)))))
836
837 (defun decompose (string &optional (compatibility t))
838 (declare (type string string))
839 (let ((result (make-string (cond ((< (length string) 40)
840 (* 5 (length string)))
841 ((< (length string) 4096)
842 (* 2 (length string)))
843 (t (round (length string) 5/6)))))
844 (fillptr 0))
845 (declare (type kernel:index fillptr))
846 (labels ((rec (string)
847 (declare (type simple-string string))
848 (do ((i 0 (1+ i)))
849 ((= i (length string)))
850 (declare (type kernel:index i))
851 (multiple-value-bind (code wide) (codepoint string i)
852 (when wide (incf i))
853 (let ((decomp (unicode-decomp code compatibility)))
854 (if decomp (rec decomp) (out code))))))
855 (out (code)
856 (multiple-value-bind (hi lo) (surrogates code)
857 (outch hi)
858 (when lo
859 (outch lo))
860 (let ((cc (unicode-combining-class code)))
861 (unless (zerop cc)
862 (order lo cc (- fillptr (if lo 3 2)))))))
863 (outch (char)
864 (when (= fillptr (length result))
865 (let ((tmp (make-string (round (length result) 5/6))))
866 (replace tmp result)
867 (setq result tmp)))
868 (setf (schar result fillptr) char)
869 (incf fillptr))
870 (order (wide1 cc last)
871 (loop until (minusp last) do
872 (multiple-value-bind (code2 wide2) (codepoint result last)
873 (let ((cc2 (unicode-combining-class code2)))
874 (cond ((zerop cc2) (return))
875 ((> cc2 cc)
876 (case (+ (if wide2 2 0) (if wide1 1 0))
877 (0 (rotatef (schar result last)
878 (schar result (1+ last))))
879 (1 (rotatef (schar result last)
880 (schar result (+ last 1))
881 (schar result (+ last 2))))
882 (2 (rotatef (schar result last)
883 (schar result (1- last))
884 (schar result (1+ last))))
885 (3 (rotatef (schar result last)
886 (schar result (+ last 2)))
887 (rotatef (schar result (1- last))
888 (schar result (1+ last)))))
889 (decf last (if wide2 2 1)))
890 (t (return))))))))
891 (with-array-data ((string string) (start) (end))
892 (declare (ignore start end))
893 (rec string))
894 (shrink-vector result fillptr))))
895
896 (declaim (inline normalized-codepoint-p))
897 (defun normalized-codepoint-p (cp form)
898 (ecase form
899 (:nfc (unicode-nfc-qc cp))
900 (:nfkc (unicode-nfkc-qc cp))
901 (:nfd (unicode-nfd-qc cp))
902 (:nfkd (unicode-nfkd-qc cp))))
903
904 ;; Perform check to see if string is already normalized. The Unicode
905 ;; example can return YES, NO, or MAYBE. For our purposes, only YES
906 ;; is important, for which we return T. For NO or MAYBE, we return NIL.
907 (defun normalized-form-p (string &optional (form :nfc))
908 (declare (type string string)
909 (type (member :nfc :nfkc :nfd :nfkd) form)
910 (optimize (speed 3)))
911 (let ((last-class 0)
912 (len (length string)))
913 (declare (type (integer 0 256) last-class))
914 (do ((k 0 (1+ k)))
915 ((>= k len))
916 (declare (type kernel:index k))
917 (multiple-value-bind (ch widep)
918 (codepoint string k len)
919 (when widep (incf k))
920 ;; Handle ASCII specially
921 (unless (< ch 128)
922 (let ((class (unicode-combining-class ch)))
923 (declare (type (unsigned-byte 8) class))
924 (when (and (> last-class class) (not (zerop class)))
925 ;; Definitely not normalized
926 (return-from normalized-form-p nil))
927 (let ((check (normalized-codepoint-p ch form)))
928 (unless (eq check :y)
929 (return-from normalized-form-p nil)))
930 (setf last-class class)))))
931 t))
932
933 ;; @@ FIXME: This should be read from unidata.bin, but it's not there
934 ;; yet. This is CompositionExclusions.txt, with the four extra code
935 ;; points that could be derived from the decompositions.
936 (defvar *composition-exclusion*
937 '(#x0958 #x0959 #x095A #x095B #x095C #x095D #x095E #x095F #x09DC #x09DD #x09DF
938 #x0A33 #x0A36 #x0A59 #x0A5A #x0A5B #x0A5E #x0B5C #x0B5D #x0F43 #x0F4D #x0F52
939 #x0F57 #x0F5C #x0F69 #x0F76 #x0F78 #x0F93 #x0F9D #x0FA2 #x0FA7 #x0FAC #x0FB9
940 #xFB1D #xFB1F #xFB2A #xFB2B #xFB2C #xFB2D #xFB2E #xFB2F #xFB30 #xFB31 #xFB32
941 #xFB33 #xFB34 #xFB35 #xFB36 #xFB38 #xFB39 #xFB3A #xFB3B #xFB3C #xFB3E #xFB40
942 #xFB41 #xFB43 #xFB44 #xFB46 #xFB47 #xFB48 #xFB49 #xFB4A #xFB4B #xFB4C #xFB4D
943 #xFB4E #x2ADC #x1D15E #x1D15F #x1D160 #x1D161 #x1D162 #x1D163 #x1D164 #x1D1BB
944 #x1D1BC #x1D1BD #x1D1BE #x1D1BF #x1D1C0
945 ;; Non-starters
946 #x0344 #x0F73 #x0F75 #x0F81))
947
948 ;; Build the composition pair table.
949 ;;
950 ;; @@ FIXME:: The composition table should probably be in unidata.bin,
951 ;; but it's not there yet.
952 (defun build-composition-table ()
953 (let ((table (make-hash-table)))
954 (dotimes (cp #x10ffff)
955 ;; Ignore Hangul characters, which can be done algorithmically.
956 (unless (<= #xac00 cp #xd7a3)
957 (let ((decomp (unicode-decomp cp nil)))
958 (when (and decomp (= (length decomp) 2))
959 (let ((c1 (char-code (aref decomp 0)))
960 (c2 (char-code (aref decomp 1))))
961 (setf (gethash (logior (ash c1 16) c2) table) cp))))))
962 ;; Remove any in the exclusion list
963 (dolist (cp *composition-exclusion*)
964 (let ((decomp (unicode-decomp cp nil)))
965 (when (and decomp (= (length decomp) 2))
966 (let ((c1 (char-code (aref decomp 0)))
967 (c2 (char-code (aref decomp 1))))
968 (remhash (logior (ash c1 16) c2) table)))))
969 (values table)))
970
971 (defvar *composition-pair-table* nil)
972
973 (declaim (inline compose-hangul))
974 (defun compose-hangul (c1 c2)
975 (declare (type (integer 0 #x10FFFF) c1 c2)
976 (optimize (speed 3)))
977 (let ((index-l (- c1 #x1100)))
978 (cond ((and (<= 0 index-l)
979 (< index-l 19))
980 (let ((index-v (- c2 #x1161)))
981 (when (and (<= 0 index-v)
982 (< index-v 21))
983 (+ #xac00 (* 28 (+ (* index-l 21) index-v))))))
984 (t
985 (let ((index-s (- c1 #xac00)))
986 (when (and (<= 0 index-s)
987 (< index-s 11172)
988 (zerop (rem index-s 28)))
989 (let ((index-t (- c2 #x11a7)))
990 (when (and (plusp index-t)
991 (< index-t 28))
992 (+ c1 index-t)))))))))
993
994
995 (defun get-pairwise-composition (c1 c2)
996 (declare (type (integer 0 #x10FFFF) c1 c2)
997 (optimize (speed 3)))
998 (unless *composition-pair-table*
999 (setf *composition-pair-table* (build-composition-table)))
1000 (cond ((compose-hangul c1 c2))
1001 (t
1002 (if (and (< c1 #x10000) (< c2 #x10000))
1003 (gethash (logior (ash c1 16) c2) *composition-pair-table*)
1004 nil))))
1005
1006 ;; Compose a string in place. The string must already be in decomposed form.
1007 (defun %compose (target)
1008 (declare (type string target)
1009 (optimize (speed 3)))
1010 (let ((len (length target))
1011 (starter-pos 0))
1012 (declare (type kernel:index starter-pos))
1013 (multiple-value-bind (starter-ch wide)
1014 (codepoint target 0 len)
1015 (let ((comp-pos (if wide 2 1))
1016 (last-class (unicode-combining-class starter-ch)))
1017 (declare (type (integer 0 256) last-class)
1018 (type kernel:index comp-pos))
1019 (unless (zerop last-class)
1020 ;; Fix for strings starting with a combining character
1021 (setf last-class 256))
1022 ;; Loop on decomposed characters, combining where possible
1023 (do ((decomp-pos comp-pos (1+ decomp-pos)))
1024 ((>= decomp-pos len))
1025 (declare (type kernel:index decomp-pos))
1026 (multiple-value-bind (ch wide)
1027 (codepoint target decomp-pos len)
1028 (when wide (incf decomp-pos))
1029 (let ((ch-class (unicode-combining-class ch))
1030 (composite (get-pairwise-composition starter-ch ch)))
1031 (declare (type (integer 0 256) ch-class))
1032 (cond ((and composite
1033 (or (< last-class ch-class) (zerop last-class)))
1034 ;; Don't have to worry about surrogate pairs here
1035 ;; because the composite is always in the BMP.
1036 (setf (aref target starter-pos) (code-char composite))
1037 (setf starter-ch composite))
1038 (t
1039 (when (zerop ch-class)
1040 (setf starter-pos comp-pos)
1041 (setf starter-ch ch))
1042 (setf last-class ch-class)
1043 (multiple-value-bind (hi lo)
1044 (surrogates ch)
1045 (setf (aref target comp-pos) hi)
1046 (when lo
1047 (incf comp-pos)
1048 (setf (aref target comp-pos) lo))
1049 (incf comp-pos)))))))
1050 (shrink-vector target comp-pos)))))
1051
1052 (defun string-to-nfd (string)
1053 "Convert String to Unicode Normalization Form D (NFD) using the
1054 canonical decomposition. The NFD string is returned"
1055 (decompose string nil))
1056
1057 (defun string-to-nfkd (string)
1058 "Convert String to Unicode Normalization Form KD (NFKD) uisng the
1059 compatible decomposition form. The NFKD string is returned."
1060 (decompose string t))
1061
1062 #+unicode
1063 (defun string-to-nfc (string)
1064 (if (normalized-form-p string :nfc)
1065 (if (simple-string-p string) string (coerce string 'simple-string))
1066 (coerce (if (normalized-form-p string :nfd)
1067 (%compose (copy-seq string))
1068 (%compose (string-to-nfd string)))
1069 'simple-string)))
1070
1071 #-unicode ;; Needed by package.lisp
1072 (defun string-to-nfc (string)
1073 (if (simple-string-p string) string (coerce string 'simple-string)))
1074
1075 (defun string-to-nfkc (string)
1076 (if (normalized-form-p string :nfkc)
1077 (if (simple-string-p string) string (coerce string 'simple-string))
1078 (coerce (if (normalized-form-p string :nfkd)
1079 (%compose (copy-seq string))
1080 (%compose (string-to-nfkd string)))
1081 'simple-string)))

  ViewVC Help
Powered by ViewVC 1.1.5