/[cmucl]/src/code/string.lisp
ViewVC logotype

Contents of /src/code/string.lisp

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.12.30.28 - (show annotations)
Thu Jun 4 15:47:40 2009 UTC (4 years, 10 months ago) by rtoy
Branch: unicode-utf16-extfmt-branch
Changes since 1.12.30.27: +8 -5 lines
code/unidata.lisp:
o Add UNICODE-ASSIGNED-CODEPOINT-P

code/string.lisp:
o Make UTF16-STRING-P check for unassigned codepoints in the string.
1 ;;; -*- Log: code.log; Package: Lisp -*-
2 ;;;
3 ;;; **********************************************************************
4 ;;; This code was written as part of the CMU Common Lisp project at
5 ;;; Carnegie Mellon University, and has been placed in the public domain.
6 ;;;
7 (ext:file-comment
8 "$Header: /tiger/var/lib/cvsroots/cmucl/src/code/string.lisp,v 1.12.30.28 2009/06/04 15:47:40 rtoy Exp $")
9 ;;;
10 ;;; **********************************************************************
11 ;;;
12 ;;; Functions to implement strings for CMU Common Lisp
13 ;;; Written by David Dill
14 ;;; Rewritten by Skef Wholey, Bill Chiles and Rob MacLachlan.
15 ;;;
16 ;;; ****************************************************************
17 ;;;
18 (in-package "LISP")
19 (export '(char schar glyph sglyph string
20 string= string-equal string< string> string<= string>= string/=
21 string-lessp string-greaterp string-not-lessp string-not-greaterp
22 string-not-equal
23 string-to-nfd string-to-nfkd string-to-nfc string-to-nfkc
24 make-string
25 string-trim string-left-trim string-right-trim
26 string-upcase
27 string-downcase string-capitalize nstring-upcase nstring-downcase
28 nstring-capitalize))
29
30
(declaim (inline surrogatep surrogates-to-codepoint codepoint surrogates))

(defun surrogatep (c &optional surrogate-type)
  "Return true if C, given either as a character or as an integer
  code, lies in the UTF-16 surrogate range.  Surrogate-type narrows
  the test: :High (or :Leading) accepts only leading surrogates, :Low
  (or :Trailing) accepts only trailing surrogates, and :Any or Nil
  accepts a surrogate of either kind."
  (declare (type (or character codepoint) c))
  (flet ((code-within-p (low high)
           ;; Characters are compared by their char-code; integers as is.
           (<= low (if (characterp c) (char-code c) c) high)))
    (ecase surrogate-type
      ;; Leading (high) surrogates.
      ((:high :leading) (code-within-p #xD800 #xDBFF))
      ;; Trailing (low) surrogates.
      ((:low :trailing) (code-within-p #xDC00 #xDFFF))
      ;; Either kind.
      ((:any nil) (code-within-p #xD800 #xDFFF)))))
53
(defun surrogates-to-codepoint (hi lo)
  "Combine the leading surrogate character Hi and the trailing
  surrogate character Lo into the codepoint value they encode."
  (declare (type character hi lo))
  (let ((lead (the (integer #xD800 #xDBFF) (char-code hi)))
        (trail (the (integer #xDC00 #xDFFF) (char-code lo))))
    ;; #x2400 is #x10000 - #xDC00: it removes the trailing-surrogate
    ;; bias and adds the supplementary-plane base in a single step.
    (+ (ash (- lead #xD800) 10) trail #x2400)))
60
(defun codepoint (string i &optional (end (length string)))
  "Return the codepoint found in String at position I.  When the code
  unit at I is a leading surrogate and the unit at I+1 (which must lie
  below End) is a trailing surrogate, or when it is a trailing
  surrogate preceded by a leading one, the pair is combined.  The
  second value is +1 when I is the leading half of a combined pair, -1
  when it is the trailing half, and NIL when no pair was combined (the
  first value is then the bare code unit)."
  (declare (type simple-string string) (type kernel:index i end))
  (flet ((unit (pos)
           (char-code (schar string pos)))
         (combine (lead trail)
           (+ (ash (- lead #xD800) 10) trail #x2400)))
    (let ((code (unit i)))
      (cond ((and (surrogatep code :high) (< (1+ i) end))
             ;; Look ahead for the trailing half.
             (let ((next (unit (1+ i))))
               (if (surrogatep next :low)
                   (values (combine code next) +1)
                   (values code nil))))
            ((and (surrogatep code :low) (> i 0))
             ;; Look back for the leading half.
             (let ((prev (unit (1- i))))
               (if (surrogatep prev :high)
                   (values (combine prev code) -1)
                   (values code nil))))
            (t
             (values code nil))))))
81
(defun surrogates (codepoint)
  "Split Codepoint into UTF-16 code units.  For a codepoint below
  #x10000 the values are the corresponding character and NIL;
  otherwise they are the leading and trailing surrogate characters."
  (declare (type codepoint codepoint))
  (cond ((< codepoint #x10000)
         (values (code-char codepoint) nil))
        (t
         (let ((offset (- codepoint #x10000)))
           ;; Upper ten bits go into the leading surrogate, lower ten
           ;; bits into the trailing one.
           (values (code-char (logior (ldb (byte 10 10) offset) #xD800))
                   (code-char (logior (ldb (byte 10 0) offset) #xDC00)))))))
93
(defun (setf codepoint) (codepoint string i)
  "Store Codepoint into String starting at position I.  A codepoint
  that needs a surrogate pair puts the leading surrogate at I and the
  trailing surrogate at I+1.  Returns Codepoint and, as a second
  value, T if a pair was written and NIL otherwise."
  (declare (type codepoint codepoint)
           (type simple-string string))
  (multiple-value-bind (lead trail)
      (surrogates codepoint)
    (setf (aref string i) lead)
    (when trail
      (setf (aref string (1+ i)) trail))
    (values codepoint (and trail t))))
109
(defun utf16-string-p (string)
  "Check if String is a valid UTF-16 string.  If the string is valid,
  T is returned.  If the string is not valid, NIL is returned, and the
  second value is the index into the string of the invalid character.
  A string is also invalid if it contains any unassigned codepoints."
  (do ((len (length string))
       (index 0 (1+ index)))
      ((>= index len)
       t)
    (multiple-value-bind (codepoint wide)
        (codepoint string index)
      ;; We step through the string in order.  If there are any
      ;; surrogate pairs, we must reach the lead surrogate first,
      ;; which means WIDE is +1.  Otherwise, we have an invalid
      ;; surrogate pair.  If we get any codepoint that is in the
      ;; surrogate range, we also have an invalid string.  An
      ;; unassigned codepoint is also considered invalid, so the
      ;; assigned-codepoint predicate has to be negated here: the
      ;; string is rejected when the codepoint is NOT assigned.
      (when (or (eql wide -1)
                (surrogatep codepoint)
                (not (lisp::unicode-assigned-codepoint-p codepoint)))
        (return-from utf16-string-p (values nil index)))
      ;; Skip the trailing half of a pair that was just consumed.
      (when wide (incf index)))))
132
(defun string (X)
  "Coerce X to a string.  A string is returned unchanged, a symbol
  yields its print name, and a character yields a fresh one-element
  string holding that character.  Anything else signals an error."
  (typecase x
    (string x)
    (symbol (symbol-name x))
    (character
     (let ((one-char (make-string 1)))
       (setf (schar one-char 0) x)
       one-char))
    (t
     (error 'simple-type-error
            :datum x
            :expected-type '(or string symbol character)
            :format-control "~S cannot be coerced to a string."
            :format-arguments (list x)))))
149
150 ;;; With-One-String is used to set up some string hacking things. The keywords
151 ;;; are parsed, and the string is hacked into a simple-string.
152
153 (eval-when (compile)
154
;;; WITH-ONE-STRING string start end cum-offset {form}*
;;;
;;; STRING, START, END and CUM-OFFSET must be symbols: they are used
;;; both as the values to read and as the variables to rebind.
;;; STRING is first coerced with the STRING function unless it already
;;; satisfies STRINGP.  WITH-ARRAY-DATA then rebinds STRING, START and
;;; END and binds CUM-OFFSET through its :OFFSET-VAR option; an END of
;;; NIL defaults to the length of STRING.  FORMS run inside those
;;; bindings.
;;; NOTE(review): presumably WITH-ARRAY-DATA yields the underlying
;;; simple data vector with START/END adjusted by the displacement
;;; offset -- it is defined outside this file; confirm.
155 (defmacro with-one-string (string start end cum-offset &rest forms)
156 `(let ((,string (if (stringp ,string) ,string (string ,string))))
157 ;; Optimizer may prove STRING is one.
158 (declare (optimize (ext:inhibit-warnings 3)))
159 (with-array-data ((,string ,string :offset-var ,cum-offset)
160 (,start ,start)
161 (,end (or ,end (length (the vector ,string)))))
162 ,@forms)))
163
164 )
165
166 ;;; With-String is like With-One-String, but doesn't parse keywords.
167
168 (eval-when (compile)
169
;;; WITH-STRING string {form}*
;;;
;;; STRING must be a symbol.  It is coerced with the STRING function
;;; unless it already satisfies STRINGP, then rebound by
;;; WITH-ARRAY-DATA.  The macro deliberately captures the variable
;;; names START and END: FORMS refer to them directly, with END
;;; initialised from the length of STRING.
170 (defmacro with-string (string &rest forms)
171 `(let ((,string (if (stringp ,string) ,string (string ,string))))
172 (with-array-data ((,string ,string)
173 (start)
174 (end (length (the vector ,string))))
175 ,@forms)))
176
177 )
178
179 ;;; With-Two-Strings is used to set up string comparison operations. The
180 ;;; keywords are parsed, and the strings are hacked into simple-strings.
181
182 (eval-when (compile)
183
;;; WITH-TWO-STRINGS string1 string2 start1 end1 cum-offset-1
;;;                  start2 end2 {form}*
;;;
;;; All seven arguments must be symbols.  Both strings are coerced with
;;; the STRING function unless they already satisfy STRINGP, and each
;;; is then rebound, together with its START/END, by a nested
;;; WITH-ARRAY-DATA.  A NIL END defaults to the length of the
;;; corresponding string.  Only the first string's offset is exposed
;;; (as CUM-OFFSET-1); no offset variable is bound for the second.
184 (defmacro with-two-strings (string1 string2 start1 end1 cum-offset-1
185 start2 end2 &rest forms)
186 `(let ((,string1 (if (stringp ,string1) ,string1 (string ,string1)))
187 (,string2 (if (stringp ,string2) ,string2 (string ,string2))))
188 (with-array-data ((,string1 ,string1 :offset-var ,cum-offset-1)
189 (,start1 ,start1)
190 (,end1 (or ,end1 (length (the vector ,string1)))))
191 (with-array-data ((,string2 ,string2)
192 (,start2 ,start2)
193 (,end2 (or ,end2 (length (the vector ,string2)))))
194 ,@forms))))
195
196 )
197
198
;;; CHAR -- out-of-line definition of the standard accessor, compiled
;;; at safety 1.
;;; NOTE(review): the body calls CHAR on the same arguments; presumably
;;; the compiler open-codes that inner call rather than recursing --
;;; confirm against the compiler's transforms.
199 (defun char (string index)
200 "Given a string and a non-negative integer index less than the length of
201 the string, returns the character object representing the character at
202 that position in the string."
203 (declare (optimize (safety 1)))
204 (char string index))
205
;;; %CHARSET -- stores NEW-EL at INDEX of STRING through (SETF CHAR),
;;; compiled at safety 1, and returns the value of that SETF form.
;;; NOTE(review): presumably this is the function behind (SETF CHAR);
;;; the SETF definition is not in this file -- confirm.
206 (defun %charset (string index new-el)
207 (declare (optimize (safety 1)))
208 (setf (char string index) new-el))
209
;;; SCHAR -- out-of-line definition of the standard accessor, compiled
;;; at safety 1.
;;; NOTE(review): as with CHAR, the inner SCHAR call is presumably
;;; open-coded by the compiler rather than a recursive call -- confirm.
210 (defun schar (string index)
211 "SCHAR returns the character object at an indexed position in a string
212 just as CHAR does, except the string must be a simple-string."
213 (declare (optimize (safety 1)))
214 (schar string index))
215
;;; %SCHARSET -- stores NEW-EL at INDEX of STRING through (SETF SCHAR),
;;; compiled at safety 1, and returns the value of that SETF form.
;;; NOTE(review): presumably this is the function behind (SETF SCHAR);
;;; the SETF definition is not in this file -- confirm.
216 (defun %scharset (string index new-el)
217 (declare (optimize (safety 1)))
218 (setf (schar string index) new-el))
219
;;; STRING=* -- positional-argument worker called by STRING=.  Returns
;;; true exactly when %SP-STRING-COMPARE returns NIL for the two
;;; bounded substrings.  OFFSET1 is bound by WITH-TWO-STRINGS but is
;;; not used here.
;;; NOTE(review): %SP-STRING-COMPARE presumably returns the index of
;;; the first mismatch, or NIL when the substrings are equal; it is
;;; defined outside this file -- confirm.
220 (defun string=* (string1 string2 start1 end1 start2 end2)
221 (with-two-strings string1 string2 start1 end1 offset1 start2 end2
222 (not (%sp-string-compare string1 start1 end1 string2 start2 end2))))
223
224
;;; STRING/=* -- positional-argument worker called by STRING/=.  When
;;; %SP-STRING-COMPARE returns a non-NIL index, that index is returned
;;; with the first string's array offset subtracted; otherwise the
;;; result is NIL.
(defun string/=* (string1 string2 start1 end1 start2 end2)
  (with-two-strings string1 string2 start1 end1 offset1 start2 end2
    (let ((mismatch (%sp-string-compare string1 start1 end1
                                        string2 start2 end2)))
      (when mismatch
        (- (the fixnum mismatch) offset1)))))
230
231 (eval-when (compile eval)
232
233 ;;; Lessp is true if the desired expansion is for string<* or string<=*.
234 ;;; Equalp is true if the desired expansion is for string<=* or string>=*.
;;;
;;; The expansion refers to the variables STRING1, STRING2, START1,
;;; END1, START2 and END2 of the function it is expanded in.  It calls
;;; %SP-STRING-COMPARE and then classifies the result:
;;;   - a NIL index yields END1 minus the offset when EQUALP, else NIL;
;;;   - an index equal to END1 means string1 ran out first;
;;;   - an index whose string2 counterpart equals END2 means string2
;;;     ran out first;
;;;   - otherwise the two differing elements are compared.
;;; Every non-NIL result is an index into string1 with the offset bound
;;; by WITH-TWO-STRINGS subtracted.
235 (defmacro string<>=*-body (lessp equalp)
236 (let ((offset1 (gensym)))
237 `(with-two-strings string1 string2 start1 end1 ,offset1 start2 end2
238 (let ((index (%sp-string-compare string1 start1 end1
239 string2 start2 end2)))
240 (if index
241 (cond ((= (the fixnum index) (the fixnum end1))
242 ,(if lessp
243 `(- (the fixnum index) ,offset1)
244 `nil))
245 ((= (+ (the fixnum index) (- start2 start1))
246 (the fixnum end2))
247 ,(if lessp
248 `nil
249 `(- (the fixnum index) ,offset1)))
;; Without Unicode support the differing characters are compared
;; directly with CHAR< or CHAR>.
250 #-unicode
251 ((,(if lessp 'char< 'char>)
252 (schar string1 index)
253 (schar string2 (+ (the fixnum index) (- start2 start1))))
254 (- (the fixnum index) ,offset1))
255 #-unicode
256 (t nil)
257 #+unicode
258 (t
259 ;; Compare in code point order. See
260 ;; http://icu-project.org/docs/papers/utf16_code_point_order.html
;; FIXUP is only applied when both code units are >= #xD800.  It moves
;; units >= #xE000 down by #x800 and the rest (the surrogates) up by
;; #x2000, so that surrogates compare above every other code unit.
261 (flet ((fixup (code)
262 (if (>= code #xe000)
263 (- code #x800)
264 (+ code #x2000))))
265 (declare (inline fixup))
266 (let* ((c1 (char-code (schar string1 index)))
267 (c2 (char-code (schar string2
268 (+ (the fixnum index)
269 (- start2 start1))))))
270 (cond ((and (>= c1 #xd800)
271 (>= c2 #xd800))
272 (let ((fix-c1 (fixup c1))
273 (fix-c2 (fixup c2)))
274 (if (,(if lessp '< '>) fix-c1 fix-c2)
275 (- (the fixnum index) ,offset1)
276 nil)))
277 (t
278 (if (,(if lessp '< '>) c1 c2)
279 (- (the fixnum index) ,offset1)
280 nil)))))))
281 ,(if equalp `(- (the fixnum end1) ,offset1) 'nil))))))
282 ) ; eval-when
283
;;; STRING<* -- positional-argument worker called by STRING<.  The body
;;; is the STRING<>=*-BODY expansion with LESSP true and EQUALP false.
284 (defun string<* (string1 string2 start1 end1 start2 end2)
285 (declare (fixnum start1 start2))
286 (string<>=*-body t nil))
287
;;; STRING>* -- positional-argument worker called by STRING>.  The body
;;; is the STRING<>=*-BODY expansion with LESSP and EQUALP both false.
288 (defun string>* (string1 string2 start1 end1 start2 end2)
289 (declare (fixnum start1 start2))
290 (string<>=*-body nil nil))
291
;;; STRING<=* -- positional-argument worker called by STRING<=.  The
;;; body is the STRING<>=*-BODY expansion with LESSP and EQUALP both
;;; true.
292 (defun string<=* (string1 string2 start1 end1 start2 end2)
293 (declare (fixnum start1 start2))
294 (string<>=*-body t t))
295
;;; STRING>=* -- positional-argument worker called by STRING>=.  The
;;; body is the STRING<>=*-BODY expansion with LESSP false and EQUALP
;;; true.
296 (defun string>=* (string1 string2 start1 end1 start2 end2)
297 (declare (fixnum start1 start2))
298 (string<>=*-body nil t))
299
300
301
;;; Keyword-argument entry point: forwards to STRING<* with START1 and
;;; START2 defaulting to 0 and END1 and END2 to NIL.  A non-NIL result
;;; from STRING<* is an index into string1 (see STRING<>=*-BODY), not a
;;; substring.
302 (defun string< (string1 string2 &key (start1 0) end1 (start2 0) end2)
303 "Given two strings, if the first string is lexicographically less than
304 the second string, returns the longest common prefix (using char=)
305 of the two strings. Otherwise, returns ()."
306 (string<* string1 string2 start1 end1 start2 end2))
307
;;; Keyword-argument entry point: forwards to STRING>* with START1 and
;;; START2 defaulting to 0 and END1 and END2 to NIL.  A non-NIL result
;;; from STRING>* is an index into string1 (see STRING<>=*-BODY), not a
;;; substring.
308 (defun string> (string1 string2 &key (start1 0) end1 (start2 0) end2)
309 "Given two strings, if the first string is lexicographically greater than
310 the second string, returns the longest common prefix (using char=)
311 of the two strings. Otherwise, returns ()."
312 (string>* string1 string2 start1 end1 start2 end2))
313
314
;;; Keyword-argument entry point: forwards to STRING<=* with START1 and
;;; START2 defaulting to 0 and END1 and END2 to NIL.  A non-NIL result
;;; from STRING<=* is an index into string1 (see STRING<>=*-BODY), not
;;; a substring.
315 (defun string<= (string1 string2 &key (start1 0) end1 (start2 0) end2)
316 "Given two strings, if the first string is lexicographically less than
317 or equal to the second string, returns the longest common prefix
318 (using char=) of the two strings. Otherwise, returns ()."
319 (string<=* string1 string2 start1 end1 start2 end2))
320
;;; Keyword-argument entry point: forwards to STRING>=* with START1 and
;;; START2 defaulting to 0 and END1 and END2 to NIL.  A non-NIL result
;;; from STRING>=* is an index into string1 (see STRING<>=*-BODY), not
;;; a substring.
321 (defun string>= (string1 string2 &key (start1 0) end1 (start2 0) end2)
322 "Given two strings, if the first string is lexicographically greater
323 than or equal to the second string, returns the longest common prefix
324 (using char=) of the two strings. Otherwise, returns ()."
325 (string>=* string1 string2 start1 end1 start2 end2))
326
;;; Keyword-argument entry point: forwards to STRING=* with START1 and
;;; START2 defaulting to 0 and END1 and END2 to NIL.  The result is the
;;; boolean produced by STRING=*.
327 (defun string= (string1 string2 &key (start1 0) end1 (start2 0) end2)
328 "Given two strings (string1 and string2), and optional integers start1,
329 start2, end1 and end2, compares characters in string1 to characters in
330 string2 (using char=)."
331 (string=* string1 string2 start1 end1 start2 end2))
332
;;; Keyword-argument entry point: forwards to STRING/=* with START1 and
;;; START2 defaulting to 0 and END1 and END2 to NIL.  A non-NIL result
;;; from STRING/=* is an index into string1, not a substring.
333 (defun string/= (string1 string2 &key (start1 0) end1 (start2 0) end2)
334 "Given two strings, if the first string is not lexicographically equal
335 to the second string, returns the longest common prefix (using char=)
336 of the two strings. Otherwise, returns ()."
337 (string/=* string1 string2 start1 end1 start2 end2))
338
339
340 (eval-when (compile eval)
341
342 ;;; STRING-NOT-EQUAL-LOOP is used to generate character comparison loops for
343 ;;; STRING-EQUAL and STRING-NOT-EQUAL.
;;;
;;; END is the literal 1 or 2 and selects which string's end stops the
;;; loop: INDEX1 reaching END1, or INDEX2 reaching END2.  END-VALUE is
;;; the form returned when the loop stops.
;;;
;;; Without ABORT-VALUE, a CHAR-EQUAL mismatch is folded into the end
;;; test, so END-VALUE is also returned on the first mismatch.  With
;;; ABORT-VALUE supplied (even as NIL), a mismatch instead returns
;;; ABORT-VALUE from the loop body and END-VALUE is returned only when
;;; the end is reached.
;;;
;;; The expansion uses the variables STRING1, STRING2, START1, START2,
;;; END1 and END2 of the enclosing code and binds INDEX1 and INDEX2,
;;; which END-VALUE and ABORT-VALUE may refer to.
344 (defmacro string-not-equal-loop (end end-value
345 &optional (abort-value nil abortp))
346 (declare (fixnum end))
347 (let ((end-test (if (= end 1)
348 `(= index1 (the fixnum end1))
349 `(= index2 (the fixnum end2)))))
350 `(do ((index1 start1 (1+ index1))
351 (index2 start2 (1+ index2)))
352 (,(if abortp
353 end-test
354 `(or ,end-test
355 (not (char-equal (schar string1 index1)
356 (schar string2 index2)))))
357 ,end-value)
358 (declare (fixnum index1 index2))
359 ,@(if abortp
360 `((if (not (char-equal (schar string1 index1)
361 (schar string2 index2)))
362 (return ,abort-value)))))))
363
364 ) ; eval-when
365
(defun string-equal (string1 string2 &key (start1 0) end1 (start2 0) end2)
  "Compare the substring of string1 bounded by start1 and end1 with the
  substring of string2 bounded by start2 and end2, element by element,
  using char-equal.  Returns T if they match and () otherwise."
  (declare (fixnum start1 start2))
  (with-two-strings string1 string2 start1 end1 offset1 start2 end2
    (let ((len1 (- (the fixnum end1) start1))
          (len2 (- (the fixnum end2) start2)))
      (declare (fixnum len1 len2))
      ;; Reject inverted bounds up front; the comparison loop would
      ;; otherwise never reach its end test.
      (when (or (minusp len1) (minusp len2))
        (error "Improper bounds for string comparison."))
      ;; Substrings of different length can never be equal, so the loop
      ;; is only entered when the lengths agree.
      (when (= len1 len2)
        (string-not-equal-loop 1 t nil)))))
381
(defun string-not-equal (string1 string2 &key (start1 0) end1 (start2 0) end2)
  "Given two strings, if the first string is not equal to the second
  string (comparing with char-equal), returns the index into string1
  at which they first differ, which is the length of their common
  prefix.  Otherwise, returns ()."
  (with-two-strings string1 string2 start1 end1 offset1 start2 end2
    (let ((slen1 (- end1 start1))
          (slen2 (- end2 start2)))
      (declare (fixnum slen1 slen2))
      ;; Prevent endless looping later.  This is the only bounds check:
      ;; the original repeated the same test as the first COND clause,
      ;; which could never be reached.
      (when (or (minusp slen1) (minusp slen2))
        (error "Improper bounds for string comparison."))
      (cond ((= slen1 slen2)
             ;; Same length: NIL if every element matches, otherwise
             ;; the index of the first mismatch.
             (string-not-equal-loop 1 nil (- index1 offset1)))
            ((< slen1 slen2)
             ;; string1 is shorter: stop at its end or at a mismatch.
             (string-not-equal-loop 1 (- index1 offset1)))
            (t
             ;; string2 is shorter: stop at its end or at a mismatch.
             (string-not-equal-loop 2 (- index1 offset1)))))))
401
402
403
404 (eval-when (compile eval)
405
406 ;;; STRING-LESS-GREATER-EQUAL-TESTS returns a test on the lengths of string1
407 ;;; and string2 and a test on the current characters from string1 and string2
408 ;;; for the following macro.
;;; Returns two values for the STRING-LESS-GREATER-EQUAL macros: the
;;; symbol of the operator to apply to the two substring lengths, and a
;;; form that tests the variables CHAR1 and CHAR2.  Without Unicode
;;; support the character test uses CHAR-LESSP/CHAR-GREATERP; with it,
;;; CHAR1 and CHAR2 are compared numerically.
409 (defun string-less-greater-equal-tests (lessp equalp)
410 (if lessp
411 (if equalp
412 ;; STRING-NOT-GREATERP
413 (values '<=
414 #-unicode `(not (char-greaterp char1 char2))
415 #+unicode `(<= char1 char2))
416 ;; STRING-LESSP
417 (values '<
418 #-unicode `(char-lessp char1 char2)
419 #+unicode `(< char1 char2)))
420 (if equalp
421 ;; STRING-NOT-LESSP
422 (values '>=
423 #-unicode `(not (char-lessp char1 char2))
424 #+unicode `(>= char1 char2))
425 ;; STRING-GREATERP
426 (values '>
427 #-unicode `(char-greaterp char1 char2)
428 #+unicode `(> char1 char2)))))
429
;;; Non-Unicode body generator for STRING-LESSP* and friends.  The
;;; expansion walks both substrings in step.  At the first pair of
;;; elements that are not CHAR-EQUAL it returns the string1 index
;;; (minus OFFSET1) if the character test holds and () otherwise.  If
;;; either substring runs out first, the length test decides between
;;; that index and ().
430 #-unicode
431 (defmacro string-less-greater-equal (lessp equalp)
432 (multiple-value-bind (length-test character-test)
433 (string-less-greater-equal-tests lessp equalp)
434 `(with-two-strings string1 string2 start1 end1 offset1 start2 end2
435 (let ((slen1 (- (the fixnum end1) start1))
436 (slen2 (- (the fixnum end2) start2)))
437 (declare (fixnum slen1 slen2))
438 (if (or (minusp slen1) (minusp slen2))
439 ;;prevent endless looping later.
440 (error "Improper bounds for string comparison."))
441 (do ((index1 start1 (1+ index1))
442 (index2 start2 (1+ index2))
443 (char1)
444 (char2))
445 ((or (= index1 (the fixnum end1)) (= index2 (the fixnum end2)))
446 (if (,length-test slen1 slen2) (- index1 offset1)))
447 (declare (fixnum index1 index2))
448 (setq char1 (schar string1 index1))
449 (setq char2 (schar string2 index2))
450 (if (not (char-equal char1 char2))
451 (if ,character-test
452 (return (- index1 offset1))
453 (return ()))))))))
454
455 ;; Convert to lowercase for case folding, to match what Unicode
456 ;; CaseFolding.txt says. An example where this matters: U+1E9E maps
457 ;; to U+00DF. But the uppercase version of U+00DF is U+00DF.
;;; Expands into code that case-folds the integer CODEPOINT for
;;; comparison.  Codes 65 through 90 (the ASCII upper case letters) get
;;; 32 added.  In a full Unicode build, other codes above 127 go
;;; through UNICODE-LOWER; in a bootstrap build they are returned
;;; unchanged.
458 #+unicode
459 (defmacro equal-char-codepoint (codepoint)
460 `(let ((ch ,codepoint))
461 ;; Handle ASCII separately for bootstrapping and for unidata missing.
462 (if (< 64 ch 91)
463 (+ ch 32)
464 #-(and unicode (not unicode-bootstrap))
465 ch
466 #+(and unicode (not unicode-bootstrap))
467 (if (> ch 127) (unicode-lower ch) ch))))
468
;;; Unicode body generator for STRING-LESSP* and friends.  It has the
;;; same shape as the non-Unicode version, but each step reads a whole
;;; codepoint from each string with CODEPOINT, folds both with
;;; EQUAL-CHAR-CODEPOINT and compares them numerically.  When the
;;; folded codepoints are equal and a read consumed a surrogate pair,
;;; that string's index gets an extra increment so the trailing
;;; surrogate is skipped.
;;; NOTE(review): CODEPOINT is called without an END argument, so a
;;; pair straddling END1/END2 would presumably still be combined --
;;; confirm whether that matters to callers.
469 #+unicode
470 (defmacro string-less-greater-equal (lessp equalp)
471 (multiple-value-bind (length-test character-test)
472 (string-less-greater-equal-tests lessp equalp)
473 `(with-two-strings string1 string2 start1 end1 offset1 start2 end2
474 (let ((slen1 (- (the fixnum end1) start1))
475 (slen2 (- (the fixnum end2) start2)))
476 (declare (fixnum slen1 slen2))
477 (if (or (minusp slen1) (minusp slen2))
478 ;;prevent endless looping later.
479 (error "Improper bounds for string comparison."))
480 (do ((index1 start1 (1+ index1))
481 (index2 start2 (1+ index2)))
482 ((or (= index1 (the fixnum end1)) (= index2 (the fixnum end2)))
483 (if (,length-test slen1 slen2) (- index1 offset1)))
484 (declare (fixnum index1 index2))
485 (multiple-value-bind (char1 wide1)
486 (codepoint string1 index1)
487 (declare (type codepoint char1))
488 (multiple-value-bind (char2 wide2)
489 (codepoint string2 index2)
490 (declare (type codepoint char2))
491 (setf char1 (equal-char-codepoint char1))
492 (setf char2 (equal-char-codepoint char2))
493 (if (= char1 char2)
494 (progn
495 (when wide1 (incf index1))
496 (when wide2 (incf index2)))
497 (if ,character-test
498 (return (- index1 offset1))
499 (return ()))))))))))
500
501 ) ; eval-when
502
;;; STRING-LESSP* -- positional-argument worker called by STRING-LESSP.
;;; The body is the STRING-LESS-GREATER-EQUAL expansion with LESSP true
;;; and EQUALP false.
503 (defun string-lessp* (string1 string2 start1 end1 start2 end2)
504 (declare (fixnum start1 start2))
505 (string-less-greater-equal t nil))
506
;;; STRING-GREATERP* -- positional-argument worker called by
;;; STRING-GREATERP.  The body is the STRING-LESS-GREATER-EQUAL
;;; expansion with LESSP and EQUALP both false.
507 (defun string-greaterp* (string1 string2 start1 end1 start2 end2)
508 (declare (fixnum start1 start2))
509 (string-less-greater-equal nil nil))
510
;;; STRING-NOT-LESSP* -- positional-argument worker called by
;;; STRING-NOT-LESSP.  The body is the STRING-LESS-GREATER-EQUAL
;;; expansion with LESSP false and EQUALP true.
511 (defun string-not-lessp* (string1 string2 start1 end1 start2 end2)
512 (declare (fixnum start1 start2))
513 (string-less-greater-equal nil t))
514
;;; STRING-NOT-GREATERP* -- positional-argument worker called by
;;; STRING-NOT-GREATERP.  The body is the STRING-LESS-GREATER-EQUAL
;;; expansion with LESSP and EQUALP both true.
515 (defun string-not-greaterp* (string1 string2 start1 end1 start2 end2)
516 (declare (fixnum start1 start2))
517 (string-less-greater-equal t t))
518
;;; Keyword-argument entry point: forwards to STRING-LESSP* with START1
;;; and START2 defaulting to 0 and END1 and END2 to NIL.  A non-NIL
;;; result is an index into string1 (see STRING-LESS-GREATER-EQUAL),
;;; not a substring.
519 (defun string-lessp (string1 string2 &key (start1 0) end1 (start2 0) end2)
520 "Given two strings, if the first string is lexicographically less than
521 the second string, returns the longest common prefix (using char-equal)
522 of the two strings. Otherwise, returns ()."
523 (string-lessp* string1 string2 start1 end1 start2 end2))
524
;;; Keyword-argument entry point: forwards to STRING-GREATERP* with
;;; START1 and START2 defaulting to 0 and END1 and END2 to NIL.  A
;;; non-NIL result is an index into string1 (see
;;; STRING-LESS-GREATER-EQUAL), not a substring.
525 (defun string-greaterp (string1 string2 &key (start1 0) end1 (start2 0) end2)
526 "Given two strings, if the first string is lexicographically greater than
527 the second string, returns the longest common prefix (using char-equal)
528 of the two strings. Otherwise, returns ()."
529 (string-greaterp* string1 string2 start1 end1 start2 end2))
530
;;; Keyword-argument entry point: forwards to STRING-NOT-LESSP* with
;;; START1 and START2 defaulting to 0 and END1 and END2 to NIL.  A
;;; non-NIL result is an index into string1 (see
;;; STRING-LESS-GREATER-EQUAL), not a substring.
531 (defun string-not-lessp (string1 string2 &key (start1 0) end1 (start2 0) end2)
532 "Given two strings, if the first string is lexicographically greater
533 than or equal to the second string, returns the longest common prefix
534 (using char-equal) of the two strings. Otherwise, returns ()."
535 (string-not-lessp* string1 string2 start1 end1 start2 end2))
536
;;; Keyword-argument entry point: forwards to STRING-NOT-GREATERP* with
;;; START1 and START2 defaulting to 0 and END1 and END2 to NIL.  A
;;; non-NIL result is an index into string1 (see
;;; STRING-LESS-GREATER-EQUAL), not a substring.
537 (defun string-not-greaterp (string1 string2 &key (start1 0) end1 (start2 0)
538 end2)
539 "Given two strings, if the first string is lexicographically less than
540 or equal to the second string, returns the longest common prefix
541 (using char-equal) of the two strings. Otherwise, returns ()."
542 (string-not-greaterp* string1 string2 start1 end1 start2 end2))
543
544
(defun make-string (count &key element-type ((:initial-element fill-char)))
  "Make and return a new string Count characters long.  When an
  initial element is supplied, every position of the string is set to
  it.  Element-type, if given, must be a subtype of character."
  (declare (type fixnum count))
  (assert (subtypep element-type 'character))
  ;; The inner MAKE-STRING call passes only COUNT, exactly as the
  ;; original did in both of its branches.
  (let ((result (make-string count)))
    (when fill-char
      (dotimes (pos count)
        (declare (fixnum pos))
        (setf (schar result pos) fill-char)))
    result))
557
(defun string-upcase (string &key (start 0) end)
  "Given a string, returns a new string that is a copy of it with
  all lower case alphabetic characters converted to uppercase.  Only
  the characters between Start and End are converted; the rest of the
  string is copied unchanged."
  (declare (fixnum start))
  (let* ((string (if (stringp string) string (string string)))
         (slen (length string)))
    (declare (fixnum slen))
    (with-one-string string start end offset
      (let ((offset-slen (+ slen offset))
            (newstring (make-string slen)))
        (declare (fixnum offset-slen))
        ;; Copy the part before START unchanged.
        (do ((index offset (1+ index))
             (new-index 0 (1+ new-index)))
            ((= index start))
          (declare (fixnum index new-index))
          (setf (schar newstring new-index) (schar string index)))
        ;; Convert [START, END) one codepoint at a time.
        (do ((index start (1+ index))
             (new-index (- start offset) (1+ new-index)))
            ((= index (the fixnum end)))
          (declare (fixnum index new-index))
          ;; END is passed to CODEPOINT so that a leading surrogate in
          ;; the last position is never combined with a code unit at or
          ;; beyond END; combining it would step INDEX past END and the
          ;; loop's = test would never fire.
          (multiple-value-bind (code wide) (codepoint string index end)
            (case wide
              ;; Leading half of a pair: also consume the trailing half.
              (1 (incf index))
              ;; Trailing half whose leading half lies before START:
              ;; treat it as a lone code unit instead of rewriting it
              ;; as a pair.
              (-1 (setq code (char-code (schar string index)))))
            ;; Handle ASCII specially because this is called early in
            ;; initialization, before unidata is available.
            (cond ((< 96 code 123) (decf code 32))
                  ((> code 127) (setq code (unicode-upper code))))
            ;;@@ WARNING: this may, in theory, need to extend newstring
            ;;  but that never actually occurs as of Unicode 5.1.0,
            ;;  so I'm just going to ignore it for now...
            (multiple-value-bind (hi lo) (surrogates code)
              (setf (schar newstring new-index) hi)
              (when lo
                (setf (schar newstring (incf new-index)) lo)))))
        ;;@@ WARNING: see above
        ;; Copy the part from END onwards unchanged.
        (do ((index end (1+ index))
             (new-index (- (the fixnum end) offset) (1+ new-index)))
            ((= index offset-slen))
          (declare (fixnum index new-index))
          (setf (schar newstring new-index) (schar string index)))
        newstring))))
598
(defun string-downcase (string &key (start 0) end)
  "Given a string, returns a new string that is a copy of it with
  all upper case alphabetic characters converted to lowercase.  Only
  the characters between Start and End are converted; the rest of the
  string is copied unchanged."
  (declare (fixnum start))
  (let* ((string (if (stringp string) string (string string)))
         (slen (length string)))
    (declare (fixnum slen))
    (with-one-string string start end offset
      (let ((offset-slen (+ slen offset))
            (newstring (make-string slen)))
        (declare (fixnum offset-slen))
        ;; Copy the part before START unchanged.
        (do ((index offset (1+ index))
             (new-index 0 (1+ new-index)))
            ((= index start))
          (declare (fixnum index new-index))
          (setf (schar newstring new-index) (schar string index)))
        ;; Convert [START, END) one codepoint at a time.
        (do ((index start (1+ index))
             (new-index (- start offset) (1+ new-index)))
            ((= index (the fixnum end)))
          (declare (fixnum index new-index))
          ;; END is passed to CODEPOINT so that a leading surrogate in
          ;; the last position is never combined with a code unit at or
          ;; beyond END; combining it would step INDEX past END and the
          ;; loop's = test would never fire.
          (multiple-value-bind (code wide) (codepoint string index end)
            (case wide
              ;; Leading half of a pair: also consume the trailing half.
              (1 (incf index))
              ;; Trailing half whose leading half lies before START:
              ;; treat it as a lone code unit instead of rewriting it
              ;; as a pair.
              (-1 (setq code (char-code (schar string index)))))
            ;; Handle ASCII specially because this is called early in
            ;; initialization, before unidata is available.
            (cond ((< 64 code 91) (incf code 32))
                  ((> code 127) (setq code (unicode-lower code))))
            ;;@@ WARNING: this may, in theory, need to extend newstring
            ;;  but that never actually occurs as of Unicode 5.1.0,
            ;;  so I'm just going to ignore it for now...
            (multiple-value-bind (hi lo) (surrogates code)
              (setf (schar newstring new-index) hi)
              (when lo
                (setf (schar newstring (incf new-index)) lo)))))
        ;;@@ WARNING: see above
        ;; Copy the part from END onwards unchanged.
        (do ((index end (1+ index))
             (new-index (- (the fixnum end) offset) (1+ new-index)))
            ((= index offset-slen))
          (declare (fixnum index new-index))
          (setf (schar newstring new-index) (schar string index)))
        newstring))))
639
(defun string-capitalize (string &key (start 0) end)
  "Return a copy of the string in which, between Start and End, the
  first character of each ``word'' is converted with char-titlecase
  and the remaining characters of the word are converted to lower
  case.  A ``word'' is a run of alphanumeric characters; any
  non-alphanumeric character ends it.  Characters outside Start and
  End are copied unchanged."
  (declare (fixnum start))
  (let* ((string (if (stringp string) string (string string)))
         (slen (length string)))
    (declare (fixnum slen))
    (with-one-string string start end offset
      (let ((offset-slen (+ slen offset))
            (newstring (make-string slen)))
        (declare (fixnum offset-slen))
        (flet ((copy-verbatim (from to dest)
                 ;; Copy STRING[FROM, TO) to NEWSTRING starting at DEST.
                 (declare (fixnum from to dest))
                 (do ((src from (1+ src))
                      (dst dest (1+ dst)))
                     ((= src to))
                   (declare (fixnum src dst))
                   (setf (schar newstring dst) (schar string src)))))
          ;; Part before START.
          (copy-verbatim offset start 0)
          ;; Part to capitalize.
          (do ((src start (1+ src))
               (dst (- start offset) (1+ dst))
               (word-start-p t))
              ((= src (the fixnum end)))
            (declare (fixnum src dst))
            (let ((ch (schar string src)))
              (setf (schar newstring dst)
                    (cond ((not (alphanumericp ch))
                           ;; A word separator: copy it and arm the
                           ;; flag for the next word.
                           (setq word-start-p t)
                           ch)
                          (word-start-p
                           ;; First character of a word.
                           (setq word-start-p nil)
                           (char-titlecase ch))
                          (t
                           ;; Inside a word.
                           (char-downcase ch))))))
          ;; Part from END onwards.
          (copy-verbatim end offset-slen (- (the fixnum end) offset)))
        newstring))))
681
(defun nstring-upcase (string &key (start 0) end)
  "Given a string, returns that string with all lower case alphabetic
  characters between Start and End converted to uppercase.  The string
  is modified in place."
  (declare (fixnum start))
  (let ((save-header string))
    (with-one-string string start end offset
      (do ((index start (1+ index)))
          ((= index (the fixnum end)))
        (declare (fixnum index))
        ;; END is passed to CODEPOINT so that a leading surrogate in
        ;; the last position is never combined with a code unit at or
        ;; beyond END; combining it would write past END and step
        ;; INDEX over the loop's = test.
        (multiple-value-bind (code wide) (codepoint string index end)
          ;; A trailing surrogate whose leading half lies before START
          ;; is treated as a lone code unit, so nothing outside
          ;; [START, END) is read as part of the character and no pair
          ;; is written back over it.
          (when (eql wide -1)
            (setq code (char-code (schar string index))))
          ;; Handle ASCII specially because this is called early in
          ;; initialization, before unidata is available.
          (cond ((< 96 code 123) (decf code 32))
                ((> code 127) (setq code (unicode-upper code))))
          ;;@@ WARNING: this may, in theory, need to extend string
          ;;      (which, obviously, we can't do here.  Unless
          ;;       STRING is adjustable, maybe)
          ;;  but that never actually occurs as of Unicode 5.1.0,
          ;;  so I'm just going to ignore it for now...
          (multiple-value-bind (hi lo) (surrogates code)
            (setf (schar string index) hi)
            (when lo
              ;; Writing the trailing half also advances INDEX past it.
              (setf (schar string (incf index)) lo))))))
    save-header))
707
(defun nstring-downcase (string &key (start 0) end)
  "Given a string, returns that string with all upper case alphabetic
  characters between Start and End converted to lowercase.  The string
  is modified in place."
  (declare (fixnum start))
  (let ((save-header string))
    (with-one-string string start end offset
      (do ((index start (1+ index)))
          ((= index (the fixnum end)))
        (declare (fixnum index))
        ;; END is passed to CODEPOINT so that a leading surrogate in
        ;; the last position is never combined with a code unit at or
        ;; beyond END; combining it would write past END and step
        ;; INDEX over the loop's = test.
        (multiple-value-bind (code wide) (codepoint string index end)
          ;; A trailing surrogate whose leading half lies before START
          ;; is treated as a lone code unit, so nothing outside
          ;; [START, END) is read as part of the character and no pair
          ;; is written back over it.
          (when (eql wide -1)
            (setq code (char-code (schar string index))))
          ;; ASCII upper case letters are handled without unidata.
          (cond ((< 64 code 91) (incf code 32))
                ((> code 127) (setq code (unicode-lower code))))
          ;;@@ WARNING: this may, in theory, need to extend string
          ;;      (which, obviously, we can't do here.  Unless
          ;;       STRING is adjustable, maybe)
          ;;  but that never actually occurs as of Unicode 5.1.0,
          ;;  so I'm just going to ignore it for now...
          (multiple-value-bind (hi lo) (surrogates code)
            (setf (schar string index) hi)
            (when lo
              ;; Writing the trailing half also advances INDEX past it.
              (setf (schar string (incf index)) lo))))))
    save-header))
731
(defun nstring-capitalize (string &key (start 0) end)
  "Modify the string in place so that, between Start and End, the
  first character of each ``word'' is converted with char-titlecase
  and the remaining characters of the word are converted to lower
  case.  A ``word'' is a run of alphanumeric characters; any
  non-alphanumeric character ends it.  Returns the string."
  (declare (fixnum start))
  (let ((save-header string))
    (with-one-string string start end offset
      (do ((pos start (1+ pos))
           (word-start-p t))
          ((= pos (the fixnum end)))
        (declare (fixnum pos))
        (let ((ch (schar string pos)))
          (cond ((not (alphanumericp ch))
                 ;; Separator: leave it alone and arm the flag.
                 (setq word-start-p t))
                (word-start-p
                 ;; First character of a word.
                 (setf (schar string pos) (char-titlecase ch))
                 (setq word-start-p nil))
                (t
                 ;; Inside a word.
                 (setf (schar string pos) (char-downcase ch)))))))
    save-header))
756
(defun string-left-trim (char-bag string)
  "Return a copy of the string with every leading character that is
  a member of Char-bag (a list or string of characters) removed."
  (with-string string
    (let ((first start))
      (declare (fixnum first))
      ;; Advance FIRST until the end of the string or a character that
      ;; is not in CHAR-BAG.
      (loop until (or (= first (the fixnum end))
                      (not (find (schar string first) char-bag)))
            do (incf first))
      (subseq (the simple-string string) first end))))
767
(defun string-right-trim (char-bag string)
  "Return a copy of the string with every trailing character that is
  a member of Char-bag (a list or string of characters) removed."
  (with-string string
    (let ((last (1- (the fixnum end))))
      (declare (fixnum last))
      ;; Step LAST backwards until it falls below START or lands on a
      ;; character that is not in CHAR-BAG.
      (loop until (or (< last start)
                      (not (find (schar string last) char-bag)))
            do (decf last))
      (subseq (the simple-string string) start (1+ last)))))
777
(defun string-trim (char-bag string)
  "Return a copy of the string with every leading and trailing
  character that is a member of Char-bag (a list or string of
  characters) removed."
  (with-string string
    (let ((left start)
          (right (1- (the fixnum end))))
      (declare (fixnum left right))
      ;; Advance LEFT past the leading members of CHAR-BAG.
      (loop until (or (= left (the fixnum end))
                      (not (find (schar string left) char-bag)))
            do (incf left))
      ;; Step RIGHT backwards past the trailing members, never
      ;; crossing LEFT.
      (loop until (or (< right left)
                      (not (find (schar string right) char-bag)))
            do (decf right))
      (subseq (the simple-string string) left (1+ right)))))
794
795 (declaim (inline %glyph-f %glyph-b))
;;; %GLYPH-F -- forward scan used by GLYPH and SGLYPH.  Returns the
;;; index just past the glyph that starts at INDEX.  N first steps over
;;; the codepoint at INDEX (two code units if it is above #xFFFF, one
;;; otherwise).  It then keeps stepping over following codepoints while
;;; their UNICODE-COMBINING-CLASS value is non-zero and not smaller
;;; than the previous one.  Compiled with safety 0, so the declared
;;; types are trusted rather than checked.
796 (defun %glyph-f (string index)
797 (declare (optimize (speed 3) (space 0) (safety 0) (debug 0))
798 (type simple-string string) (type kernel:index index))
799 (let* ((prev 0)
800 (l (length string))
801 (c (codepoint string index l))
802 (n (+ index (if (> c #xFFFF) 2 1))))
803 (declare (type codepoint c) (type kernel:index l n))
804 (loop while (< n l) do
805 (let* ((c (codepoint string n l))
806 (d (the (unsigned-byte 8) (unicode-combining-class c))))
;; Stop at a codepoint of class 0 or at a drop in class value.
807 (when (or (zerop d) (< d prev))
808 (return))
809 (setq prev d)
810 (incf n (if (> c #xFFFF) 2 1))))
811 n))
812
(defun %glyph-b (string index)
  "Scan backward from just before INDEX in the simple string STRING and
  return the index at which the glyph ending at INDEX starts.  The scan
  stops at a codepoint whose combining class is zero, at one whose
  combining class is larger than that of the codepoint after it, or at
  the front of STRING.  The result is never negative."
  (declare (optimize (speed 3) (space 0) (safety 0) (debug 0))
           (type simple-string string) (type kernel:index index))
  (let* ((prev 255)
         (n (1- index)))
    ;; N starts at INDEX - 1 and is decremented until it drops below
    ;; zero, so it must be allowed to be negative.  Declaring it as
    ;; KERNEL:INDEX (as before) contradicts the (< n 0) exit test, which
    ;; is unsafe with SAFETY 0.
    (declare (type fixnum n))
    (loop until (< n 0) do
      (let* ((c (codepoint string n 0))
             (d (the (unsigned-byte 8) (unicode-combining-class c))))
        (cond ((zerop d) (return))
              ;; NOTE(review): N presumably sits on the last code unit of
              ;; C here, so stepping forward by 2 for a wide C may
              ;; overshoot by one -- confirm against CODEPOINT.
              ((> d prev) (incf n (if (> c #xFFFF) 2 1)) (return)))
        (setq prev d)
        (decf n (if (> c #xFFFF) 2 1))))
    ;; Walking off the front of the string leaves N negative; clamp it so
    ;; callers always receive a usable start position for SUBSEQ.
    (max n 0)))
827
(defun glyph (string index &key (from-end nil))
  "Return the glyph at position INDEX of STRING as a fresh string, and as
  a second value the position of the neighbouring glyph, or NIL when
  there is none.  Going forward a glyph is the character at INDEX plus
  all the combining characters that follow it.  With FROM-END true the
  glyph is the one that ends just before INDEX, and the second value is
  the position where it starts (NIL when that is the start of the
  string)."
  (declare (type simple-string string) (type kernel:index index))
  #-unicode
  (char string index)
  #+unicode
  (with-array-data ((string string) (start) (end))
    (declare (ignore start end))
    (cond (from-end
           (let ((boundary (%glyph-b string index)))
             (values (subseq string boundary index)
                     (if (> boundary 0) boundary nil))))
          (t
           (let ((boundary (%glyph-f string index)))
             (values (subseq string index boundary)
                     (if (< boundary (length string)) boundary nil)))))))
843
(defun sglyph (string index &key (from-end nil))
  "Like GLYPH, but STRING must be a simple-string: return the glyph at
  position INDEX and, as a second value, the position of the
  neighbouring glyph or NIL.  With FROM-END true the glyph ending just
  before INDEX is returned instead."
  (declare (type simple-string string) (type kernel:index index))
  #-unicode
  (schar string index)
  #+unicode
  (cond (from-end
         (let ((boundary (%glyph-b string index)))
           (values (subseq string boundary index)
                   (if (> boundary 0) boundary nil))))
        (t
         (let ((boundary (%glyph-f string index)))
           (values (subseq string index boundary)
                   (if (< boundary (length string)) boundary nil))))))
855
#+unicode
(defun string-reverse* (sequence)
  "Return a new string holding the glyphs of SEQUENCE in reverse order.
  The code units inside each glyph (as delimited by %GLYPH-F) keep their
  original order.  An empty SEQUENCE yields an empty string."
  (declare (optimize (speed 3) (space 0) (safety 0))
           (type string sequence))
  (with-string sequence
    (let* ((length (- end start))
           (string (make-string length))
           (j length))
      (declare (type kernel:index length j))
      ;; %GLYPH-F fetches the codepoint at its index before doing any
      ;; bounds test, so it must not be called when there are no
      ;; characters at all.
      (when (plusp length)
        ;; Copy each glyph [I, N) in front of the previously copied one,
        ;; filling the result from its right end toward the left.
        (loop for i = start then n as n = (%glyph-f sequence i) do
          (replace string sequence :start1 (decf j (- n i)) :start2 i :end2 n)
          while (< n end)))
      string)))
869
#+unicode
(defun string-nreverse* (sequence)
  "Reverse the order of the glyphs of SEQUENCE in place and return
  SEQUENCE.  Each glyph (as delimited by %GLYPH-F) is first reversed on
  its own and then the whole range is reversed, which leaves the code
  units inside every glyph in their original order.  An empty SEQUENCE
  is returned unchanged."
  (declare (optimize (speed 3) (space 0) (safety 0))
           (type string sequence))
  (with-string sequence
    (flet ((rev (start end)
             ;; Reverse the code units of SEQUENCE in [START, END).
             (do ((i start (1+ i))
                  (j (1- end) (1- j)))
                 ((>= i j))
               (declare (type kernel:index i j))
               (rotatef (schar sequence i) (schar sequence j)))))
      ;; %GLYPH-F fetches the codepoint at its index before doing any
      ;; bounds test, so skip the whole scan when there is nothing to
      ;; reverse.
      (when (< start end)
        (loop for i = start then n as n = (%glyph-f sequence i) do
          (rev i n) while (< n end))
        (rev start end))))
  sequence)
886
887
888
889
(defun decompose (string &optional (compatibility t))
  "Return a new string holding the decomposition of STRING.  Each
  codepoint is replaced, recursively, by whatever UNICODE-DECOMP returns
  for it and COMPATIBILITY; codepoints with no decomposition are copied.
  A codepoint with a non-zero combining class is moved in front of any
  directly preceding codepoints whose combining class is larger."
  (declare (type string string))
  ;; RESULT is a guess at the needed size: generous for short inputs and
  ;; 6/5 of the input for long ones.  OUTCH enlarges it on demand.
  (let ((result (make-string (cond ((< (length string) 40)
                                    (* 5 (length string)))
                                   ((< (length string) 4096)
                                    (* 2 (length string)))
                                   (t (round (length string) 5/6)))))
        (fillptr 0))
    (declare (type kernel:index fillptr))
    (labels ((rec (string start end)
               ;; Decompose the codepoints of STRING in [START, END).
               (declare (type simple-string string))
               (do ((i start (1+ i)))
                   ;; >= rather than =, so an extra step for a wide
                   ;; character can never carry I past the exit test.
                   ((>= i end))
                 (declare (type kernel:index i))
                 ;; Pass END so a surrogate sitting at the last position
                 ;; is not paired with data lying beyond the range being
                 ;; processed (same convention as the other scanners in
                 ;; this file).
                 (multiple-value-bind (code wide) (codepoint string i end)
                   (when wide (incf i))
                   (let ((decomp (unicode-decomp code compatibility)))
                     (if decomp (rec decomp 0 (length decomp)) (out code))))))
             (out (code)
               ;; Append CODE as one or two code units, then sort it into
               ;; place if it has a non-zero combining class.
               (multiple-value-bind (hi lo) (surrogates code)
                 (outch hi)
                 (when lo
                   (outch lo))
                 (let ((cc (unicode-combining-class code)))
                   (unless (zerop cc)
                     ;; Third argument: index of the last code unit in
                     ;; front of the character just written.
                     (order lo cc (- fillptr (if lo 3 2)))))))
             (outch (char)
               ;; Append one code unit, enlarging RESULT by 6/5 when full.
               (when (= fillptr (length result))
                 (let ((tmp (make-string (round (length result) 5/6))))
                   (replace tmp result)
                   (setq result tmp)))
               (setf (schar result fillptr) char)
               (incf fillptr))
             (order (wide1 cc last)
               ;; The character just written (two code units when WIDE1
               ;; is true, combining class CC) directly follows position
               ;; LAST.  Swap it backward over each preceding character
               ;; whose combining class is larger than CC; stop at a
               ;; class of zero or at one that is not larger.
               (loop until (minusp last) do
                 (multiple-value-bind (code2 wide2) (codepoint result last)
                   (let ((cc2 (unicode-combining-class code2)))
                     (cond ((zerop cc2) (return))
                           ((> cc2 cc)
                            ;; Pick the rotation by the widths of the two
                            ;; characters: bit 1 = previous character is
                            ;; wide, bit 0 = new character is wide.
                            (case (+ (if wide2 2 0) (if wide1 1 0))
                              (0 (rotatef (schar result last)
                                          (schar result (1+ last))))
                              (1 (rotatef (schar result last)
                                          (schar result (+ last 1))
                                          (schar result (+ last 2))))
                              (2 (rotatef (schar result last)
                                          (schar result (1- last))
                                          (schar result (1+ last))))
                              (3 (rotatef (schar result last)
                                          (schar result (+ last 2)))
                                 (rotatef (schar result (1- last))
                                          (schar result (1+ last)))))
                            (decf last (if wide2 2 1)))
                           (t (return))))))))
      (with-string string
        (rec string start end))
      (shrink-vector result fillptr))))
947
(declaim (inline normalized-codepoint-p))
(defun normalized-codepoint-p (cp form)
  "Return the quick-check value of codepoint CP for normalization form
  FORM, which must be one of :NFC, :NFKC, :NFD or :NFKD.  The value comes
  from the UNICODE-xxx-QC function that matches FORM."
  ;; ECASE signals an error for any other FORM.
  (ecase form
    (:nfc
     (unicode-nfc-qc cp))
    (:nfkc
     (unicode-nfkc-qc cp))
    (:nfd
     (unicode-nfd-qc cp))
    (:nfkd
     (unicode-nfkd-qc cp))))
955
;; Quick check for an already-normalized string.  The Unicode example
;; algorithm answers YES, NO or MAYBE; here only YES matters, so T is
;; returned for YES and NIL for both NO and MAYBE.
(defun normalized-form-p (string &optional (form :nfc))
  "Return T when STRING is known to be in normalization form FORM
  already, and NIL when it is not or when that cannot be decided by this
  quick check."
  (declare (type (member :nfc :nfkc :nfd :nfkd) form)
           (optimize (speed 3)))
  (with-string string
    (let ((previous-class 0)
          (pos start))
      (declare (type (integer 0 256) previous-class)
               (type kernel:index pos))
      (loop
        (when (>= pos end)
          ;; Every codepoint passed both tests.
          (return t))
        (multiple-value-bind (code widep)
            (codepoint string pos end)
          ;; Codepoints below 128 (ASCII) are skipped entirely.
          (unless (< code 128)
            (let ((class (unicode-combining-class code)))
              (declare (type (unsigned-byte 8) class))
              (cond ((and (> previous-class class) (not (zerop class)))
                     ;; Combining classes out of order: definitely not
                     ;; normalized.
                     (return nil))
                    ((not (eq (normalized-codepoint-p code form) :y))
                     ;; Anything but a clear YES counts as NIL.
                     (return nil)))
              (setf previous-class class)))
          ;; A wide codepoint takes two positions, any other one.
          (incf pos (if widep 2 1)))))))
983
984
;; Compose a string in place.  The string must already be in decomposed form.
(defun %compose (target)
  "Compose TARGET in place and return the value of SHRINK-VECTOR applied
  to it and the composed length.  TARGET must already be in decomposed
  form.  An empty TARGET is returned unchanged."
  (declare (type string target)
           (optimize (speed 3)))
  (let ((len (length target))
        (starter-pos 0))
    (declare (type kernel:index starter-pos))
    ;; The code below asks for the codepoint at position 0
    ;; unconditionally, which is not a valid position in an empty
    ;; string, so there is nothing to do in that case.
    (when (zerop len)
      (return-from %compose target))
    (multiple-value-bind (starter-ch wide)
        (codepoint target 0 len)
      ;; COMP-POS is where the next output code unit is written; it never
      ;; runs ahead of DECOMP-POS, the read position.
      (let ((comp-pos (if wide 2 1))
            (last-class (unicode-combining-class starter-ch)))
        (declare (type (integer 0 256) last-class)
                 (type kernel:index comp-pos))
        (unless (zerop last-class)
          ;; Fix for strings starting with a combining character
          (setf last-class 256))
        ;; Loop on decomposed characters, combining where possible
        (do ((decomp-pos comp-pos (1+ decomp-pos)))
            ((>= decomp-pos len))
          (declare (type kernel:index decomp-pos))
          (multiple-value-bind (ch wide)
              (codepoint target decomp-pos len)
            (when wide (incf decomp-pos))
            (let ((ch-class (unicode-combining-class ch))
                  (composite (get-pairwise-composition starter-ch ch)))
              (declare (type (integer 0 256) ch-class))
              (cond ((and composite
                          (or (< last-class ch-class) (zerop last-class)))
                     ;; Don't have to worry about surrogate pairs here
                     ;; because the composite is always in the BMP.
                     (setf (aref target starter-pos) (code-char composite))
                     (setf starter-ch composite))
                    (t
                     ;; No composition: CH is copied to the output.  A
                     ;; class of zero makes it the new starter.
                     (when (zerop ch-class)
                       (setf starter-pos comp-pos)
                       (setf starter-ch ch))
                     (setf last-class ch-class)
                     (multiple-value-bind (hi lo)
                         (surrogates ch)
                       (setf (aref target comp-pos) hi)
                       (when lo
                         (incf comp-pos)
                         (setf (aref target comp-pos) lo))
                       (incf comp-pos)))))))
        (shrink-vector target comp-pos)))))
1030
(defun string-to-nfd (string)
  "Return String converted to Unicode Normalization Form D (NFD), i.e.
  decomposed with the canonical decomposition."
  ;; NIL selects the non-compatibility decomposition in DECOMPOSE.
  (let ((compatibility nil))
    (decompose string compatibility)))
1035
(defun string-to-nfkd (string)
  "Return String converted to Unicode Normalization Form KD (NFKD), i.e.
  decomposed using the compatibility decomposition."
  ;; T selects the compatibility decomposition in DECOMPOSE.
  (let ((compatibility t))
    (decompose string compatibility)))
1040
#+unicode
(defun string-to-nfc (string)
  "Return String in Unicode Normalization Form C (NFC) as a simple
  string.  A simple string that already passes the NFC quick check is
  returned as is."
  (cond ((normalized-form-p string :nfc)
         ;; Nothing to normalize; only make sure the result is simple.
         (if (simple-string-p string)
             string
             (coerce string 'simple-string)))
        (t
         ;; %COMPOSE works in place, so hand it a copy when String is
         ;; already decomposed, and a fresh NFD string otherwise.
         (let ((decomposed (if (normalized-form-p string :nfd)
                               (copy-seq string)
                               (string-to-nfd string))))
           (coerce (%compose decomposed) 'simple-string)))))
1050
#-unicode ;; Needed by package.lisp
(defun string-to-nfc (string)
  "Without Unicode support there is nothing to normalize: return String
  itself when it is a simple string, otherwise coerce it to one."
  (cond ((simple-string-p string) string)
        (t (coerce string 'simple-string))))
1054
(defun string-to-nfkc (string)
  "Return String in Unicode Normalization Form KC (NFKC) as a simple
  string.  A simple string that already passes the NFKC quick check is
  returned as is."
  (cond ((normalized-form-p string :nfkc)
         ;; Nothing to normalize; only make sure the result is simple.
         (if (simple-string-p string)
             string
             (coerce string 'simple-string)))
        (t
         ;; %COMPOSE works in place, so hand it a copy when String is
         ;; already decomposed, and a fresh NFKD string otherwise.
         (let ((decomposed (if (normalized-form-p string :nfkd)
                               (copy-seq string)
                               (string-to-nfkd string))))
           (coerce (%compose decomposed) 'simple-string)))))

  ViewVC Help
Powered by ViewVC 1.1.5