/[cmucl]/src/code/string.lisp
ViewVC logotype

Contents of /src/code/string.lisp

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.12.30.17 - (show annotations)
Tue May 19 20:36:28 2009 UTC (4 years, 11 months ago) by rtoy
Branch: unicode-utf16-extfmt-branch
Changes since 1.12.30.16: +6 -5 lines
Fix grammar.
1 ;;; -*- Log: code.log; Package: Lisp -*-
2 ;;;
3 ;;; **********************************************************************
4 ;;; This code was written as part of the CMU Common Lisp project at
5 ;;; Carnegie Mellon University, and has been placed in the public domain.
6 ;;;
7 (ext:file-comment
8 "$Header: /tiger/var/lib/cvsroots/cmucl/src/code/string.lisp,v 1.12.30.17 2009/05/19 20:36:28 rtoy Exp $")
9 ;;;
10 ;;; **********************************************************************
11 ;;;
12 ;;; Functions to implement strings for CMU Common Lisp
13 ;;; Written by David Dill
14 ;;; Rewritten by Skef Wholey, Bill Chiles and Rob MacLachlan.
15 ;;;
16 ;;; ****************************************************************
17 ;;;
18 (in-package "LISP")
19 (export '(char schar glyph sglyph string
20 string= string-equal string< string> string<= string>= string/=
21 string-lessp string-greaterp string-not-lessp string-not-greaterp
22 string-not-equal
23 string-to-nfd string-to-nfkd string-to-nfc string-to-nfkc
24 make-string
25 string-trim string-left-trim string-right-trim
26 string-upcase
27 string-downcase string-capitalize nstring-upcase nstring-downcase
28 nstring-capitalize))
29
30
31 (declaim (inline surrogates-to-codepoint codepoint surrogates))
32
33 (defun surrogates-to-codepoint (hi lo)
34 "Convert the given Hi and Lo surrogate characters to the
35 corresponding codepoint value"
36 (declare (type character hi lo))
37 (+ (ash (- (the (integer #xD800 #xDBFF) (char-code hi)) #xD800) 10)
38 (the (integer #xDC00 #xDFFF) (char-code lo)) #x2400))
39
40 (defun codepoint (string i &optional (end (length string)))
41 "Return the codepoint value from String at position I. If that
42 position is a surrogate, it is combined with either the previous or
43 following character (when possible) to compute the codepoint. The
44 second return value is NIL if the position is not a surrogate pair.
45 Otherwise +1 or -1 is returned if the position is the high or low
46 surrogate value, respectively."
47 (declare (type simple-string string) (type kernel:index i end))
48 (let ((code (char-code (schar string i))))
49 (cond ((and (<= #xD800 code #xDBFF) (< (1+ i) end))
50 (let ((tmp (char-code (schar string (1+ i)))))
51 (if (<= #xDC00 tmp #xDFFF)
52 (values (+ (ash (- code #xD800) 10) tmp #x2400) +1)
53 (values code nil))))
54 ((and (<= #xDC00 code #xDFFF) (> i 0))
55 (let ((tmp (char-code (schar string (1- i)))))
56 (if (<= #xD800 tmp #xDBFF)
57 (values (+ (ash (- tmp #xD800) 10) code #x2400) -1)
58 (values code nil))))
59 (t (values code nil)))))
60
61 (defun surrogates (codepoint)
62 "Return the high and low surrogate characters for Codepoint. If
63 Codepoint is in the BMP, the first return value is the corresponding
64 character and the second is NIL."
65 (declare (type (integer 0 #x10FFFF) codepoint))
66 (if (< codepoint #x10000)
67 (values (code-char codepoint) nil)
68 (let* ((tmp (- codepoint #x10000))
69 (hi (logior (ldb (byte 10 10) tmp) #xD800))
70 (lo (logior (ldb (byte 10 0) tmp) #xDC00)))
71 (values (code-char hi) (code-char lo)))))
72
73 (defun utf16-string-p (string)
74 "Check if String is a valid UTF-16 string. If the string is valid,
75 T is returned. If the string is not valid, NIL is returned, and the
76 second value is the index into the string of the invalid character."
77 (do ((len (length string))
78 (index 0 (1+ index)))
79 ((>= index len)
80 t)
81 (multiple-value-bind (codepoint wide)
82 (codepoint string index)
83 ;; We step through the string in order. If there are any
84 ;; surrogates pairs, we must reach the lead surrogate first,
85 ;; which means WIDE is +1. Otherwise, we have an invalid
86 ;; surrogate pair. If we get any codepoint that is in
87 ;; the surrogate range, we also have an invalid string.
88 (when (or (eq wide -1)
89 (<= #xD800 codepoint #xDFFF))
90 (return-from utf16-string-p (values nil index)))
91 (when wide (incf index)))))
92
93 (defun string (X)
94 "Coerces X into a string. If X is a string, X is returned. If X is a
95 symbol, X's pname is returned. If X is a character then a one element
96 string containing that character is returned. If X cannot be coerced
97 into a string, an error occurs."
98 (cond ((stringp x) x)
99 ((symbolp x) (symbol-name x))
100 ((characterp x)
101 (let ((res (make-string 1)))
102 (setf (schar res 0) x) res))
103 (t
104 (error 'simple-type-error
105 :datum x
106 :expected-type '(or string symbol character)
107 :format-control "~S cannot be coerced to a string."
108 :format-arguments (list x)))))
109
110 ;;; With-One-String is used to set up some string hacking things. The keywords
111 ;;; are parsed, and the string is hacked into a simple-string.
112
113 (eval-when (compile)
114
115 (defmacro with-one-string (string start end cum-offset &rest forms)
116 `(let ((,string (if (stringp ,string) ,string (string ,string))))
117 ;; Optimizer may prove STRING is one.
118 (declare (optimize (ext:inhibit-warnings 3)))
119 (with-array-data ((,string ,string :offset-var ,cum-offset)
120 (,start ,start)
121 (,end (or ,end (length (the vector ,string)))))
122 ,@forms)))
123
124 )
125
126 ;;; With-String is like With-One-String, but doesn't parse keywords.
127
128 (eval-when (compile)
129
130 (defmacro with-string (string &rest forms)
131 `(let ((,string (if (stringp ,string) ,string (string ,string))))
132 (with-array-data ((,string ,string)
133 (start)
134 (end (length (the vector ,string))))
135 ,@forms)))
136
137 )
138
139 ;;; With-Two-Strings is used to set up string comparison operations. The
140 ;;; keywords are parsed, and the strings are hacked into simple-strings.
141
142 (eval-when (compile)
143
144 (defmacro with-two-strings (string1 string2 start1 end1 cum-offset-1
145 start2 end2 &rest forms)
146 `(let ((,string1 (if (stringp ,string1) ,string1 (string ,string1)))
147 (,string2 (if (stringp ,string2) ,string2 (string ,string2))))
148 (with-array-data ((,string1 ,string1 :offset-var ,cum-offset-1)
149 (,start1 ,start1)
150 (,end1 (or ,end1 (length (the vector ,string1)))))
151 (with-array-data ((,string2 ,string2)
152 (,start2 ,start2)
153 (,end2 (or ,end2 (length (the vector ,string2)))))
154 ,@forms))))
155
156 )
157
158
159 (defun char (string index)
160 "Given a string and a non-negative integer index less than the length of
161 the string, returns the character object representing the character at
162 that position in the string."
163 (declare (optimize (safety 1)))
164 (char string index))
165
166 (defun %charset (string index new-el)
167 (declare (optimize (safety 1)))
168 (setf (char string index) new-el))
169
170 (defun schar (string index)
171 "SCHAR returns the character object at an indexed position in a string
172 just as CHAR does, except the string must be a simple-string."
173 (declare (optimize (safety 1)))
174 (schar string index))
175
176 (defun %scharset (string index new-el)
177 (declare (optimize (safety 1)))
178 (setf (schar string index) new-el))
179
180 (defun string=* (string1 string2 start1 end1 start2 end2)
181 (with-two-strings string1 string2 start1 end1 offset1 start2 end2
182 (not (%sp-string-compare string1 start1 end1 string2 start2 end2))))
183
184
185 (defun string/=* (string1 string2 start1 end1 start2 end2)
186 (with-two-strings string1 string2 start1 end1 offset1 start2 end2
187 (let ((comparison (%sp-string-compare string1 start1 end1
188 string2 start2 end2)))
189 (if comparison (- (the fixnum comparison) offset1)))))
190
191 (eval-when (compile eval)
192
193 ;;; Lessp is true if the desired expansion is for string<* or string<=*.
194 ;;; Equalp is true if the desired expansion is for string<=* or string>=*.
195 (defmacro string<>=*-body (lessp equalp)
196 (let ((offset1 (gensym)))
197 `(with-two-strings string1 string2 start1 end1 ,offset1 start2 end2
198 (let ((index (%sp-string-compare string1 start1 end1
199 string2 start2 end2)))
200 (if index
201 (cond ((= (the fixnum index) (the fixnum end1))
202 ,(if lessp
203 `(- (the fixnum index) ,offset1)
204 `nil))
205 ((= (+ (the fixnum index) (- start2 start1))
206 (the fixnum end2))
207 ,(if lessp
208 `nil
209 `(- (the fixnum index) ,offset1)))
210 #-unicode
211 ((,(if lessp 'char< 'char>)
212 (schar string1 index)
213 (schar string2 (+ (the fixnum index) (- start2 start1))))
214 (- (the fixnum index) ,offset1))
215 #-unicode
216 (t nil)
217 #+unicode
218 (t
219 ;; Compare in code point order. See
220 ;; http://icu-project.org/docs/papers/utf16_code_point_order.html
221 (flet ((fixup (code)
222 (if (>= code #xe000)
223 (- code #x800)
224 (+ code #x2000))))
225 (declare (inline fixup))
226 (let* ((c1 (char-code (schar string1 index)))
227 (c2 (char-code (schar string2
228 (+ (the fixnum index)
229 (- start2 start1))))))
230 (cond ((and (>= c1 #xd800)
231 (>= c2 #xd800))
232 (let ((fix-c1 (fixup c1))
233 (fix-c2 (fixup c2)))
234 (if (,(if lessp '< '>) fix-c1 fix-c2)
235 (- (the fixnum index) ,offset1)
236 nil)))
237 (t
238 (if (,(if lessp '< '>) c1 c2)
239 (- (the fixnum index) ,offset1)
240 nil)))))))
241 ,(if equalp `(- (the fixnum end1) ,offset1) 'nil))))))
242 ) ; eval-when
243
244 (defun string<* (string1 string2 start1 end1 start2 end2)
245 (declare (fixnum start1 start2))
246 (string<>=*-body t nil))
247
248 (defun string>* (string1 string2 start1 end1 start2 end2)
249 (declare (fixnum start1 start2))
250 (string<>=*-body nil nil))
251
252 (defun string<=* (string1 string2 start1 end1 start2 end2)
253 (declare (fixnum start1 start2))
254 (string<>=*-body t t))
255
256 (defun string>=* (string1 string2 start1 end1 start2 end2)
257 (declare (fixnum start1 start2))
258 (string<>=*-body nil t))
259
260
261
262 (defun string< (string1 string2 &key (start1 0) end1 (start2 0) end2)
263 "Given two strings, if the first string is lexicographically less than
264 the second string, returns the longest common prefix (using char=)
265 of the two strings. Otherwise, returns ()."
266 (string<* string1 string2 start1 end1 start2 end2))
267
268 (defun string> (string1 string2 &key (start1 0) end1 (start2 0) end2)
269 "Given two strings, if the first string is lexicographically greater than
270 the second string, returns the longest common prefix (using char=)
271 of the two strings. Otherwise, returns ()."
272 (string>* string1 string2 start1 end1 start2 end2))
273
274
275 (defun string<= (string1 string2 &key (start1 0) end1 (start2 0) end2)
276 "Given two strings, if the first string is lexicographically less than
277 or equal to the second string, returns the longest common prefix
278 (using char=) of the two strings. Otherwise, returns ()."
279 (string<=* string1 string2 start1 end1 start2 end2))
280
281 (defun string>= (string1 string2 &key (start1 0) end1 (start2 0) end2)
282 "Given two strings, if the first string is lexicographically greater
283 than or equal to the second string, returns the longest common prefix
284 (using char=) of the two strings. Otherwise, returns ()."
285 (string>=* string1 string2 start1 end1 start2 end2))
286
287 (defun string= (string1 string2 &key (start1 0) end1 (start2 0) end2)
288 "Given two strings (string1 and string2), and optional integers start1,
289 start2, end1 and end2, compares characters in string1 to characters in
290 string2 (using char=)."
291 (string=* string1 string2 start1 end1 start2 end2))
292
293 (defun string/= (string1 string2 &key (start1 0) end1 (start2 0) end2)
294 "Given two strings, if the first string is not lexicographically equal
295 to the second string, returns the longest common prefix (using char=)
296 of the two strings. Otherwise, returns ()."
297 (string/=* string1 string2 start1 end1 start2 end2))
298
299
300 (eval-when (compile eval)
301
302 ;;; STRING-NOT-EQUAL-LOOP is used to generate character comparison loops for
303 ;;; STRING-EQUAL and STRING-NOT-EQUAL.
304 (defmacro string-not-equal-loop (end end-value
305 &optional (abort-value nil abortp))
306 (declare (fixnum end))
307 (let ((end-test (if (= end 1)
308 `(= index1 (the fixnum end1))
309 `(= index2 (the fixnum end2)))))
310 `(do ((index1 start1 (1+ index1))
311 (index2 start2 (1+ index2)))
312 (,(if abortp
313 end-test
314 `(or ,end-test
315 (not (char-equal (schar string1 index1)
316 (schar string2 index2)))))
317 ,end-value)
318 (declare (fixnum index1 index2))
319 ,@(if abortp
320 `((if (not (char-equal (schar string1 index1)
321 (schar string2 index2)))
322 (return ,abort-value)))))))
323
324 ) ; eval-when
325
326 (defun string-equal (string1 string2 &key (start1 0) end1 (start2 0) end2)
327 "Given two strings (string1 and string2), and optional integers start1,
328 start2, end1 and end2, compares characters in string1 to characters in
329 string2 (using char-equal)."
330 (declare (fixnum start1 start2))
331 (with-two-strings string1 string2 start1 end1 offset1 start2 end2
332 (let ((slen1 (- (the fixnum end1) start1))
333 (slen2 (- (the fixnum end2) start2)))
334 (declare (fixnum slen1 slen2))
335 (if (or (minusp slen1) (minusp slen2))
336 ;;prevent endless looping later.
337 (error "Improper bounds for string comparison."))
338 (if (= slen1 slen2)
339 ;;return () immediately if lengths aren't equal.
340 (string-not-equal-loop 1 t nil)))))
341
342 (defun string-not-equal (string1 string2 &key (start1 0) end1 (start2 0) end2)
343 "Given two strings, if the first string is not lexicographically equal
344 to the second string, returns the longest common prefix (using char-equal)
345 of the two strings. Otherwise, returns ()."
346 (with-two-strings string1 string2 start1 end1 offset1 start2 end2
347 (let ((slen1 (- end1 start1))
348 (slen2 (- end2 start2)))
349 (declare (fixnum slen1 slen2))
350 (if (or (minusp slen1) (minusp slen2))
351 ;;prevent endless looping later.
352 (error "Improper bounds for string comparison."))
353 (cond ((or (minusp slen1) (or (minusp slen2)))
354 (error "Improper substring for comparison."))
355 ((= slen1 slen2)
356 (string-not-equal-loop 1 nil (- index1 offset1)))
357 ((< slen1 slen2)
358 (string-not-equal-loop 1 (- index1 offset1)))
359 (t
360 (string-not-equal-loop 2 (- index1 offset1)))))))
361
362
363
364 (eval-when (compile eval)
365
366 ;;; STRING-LESS-GREATER-EQUAL-TESTS returns a test on the lengths of string1
367 ;;; and string2 and a test on the current characters from string1 and string2
368 ;;; for the following macro.
369 (defun string-less-greater-equal-tests (lessp equalp)
370 (if lessp
371 (if equalp
372 ;; STRING-NOT-GREATERP
373 (values '<=
374 #-unicode `(not (char-greaterp char1 char2))
375 #+unicode `(<= char1 char2))
376 ;; STRING-LESSP
377 (values '<
378 #-unicode `(char-lessp char1 char2)
379 #+unicode `(< char1 char2)))
380 (if equalp
381 ;; STRING-NOT-LESSP
382 (values '>=
383 #-unicode `(not (char-lessp char1 char2))
384 #+unicode `(>= char1 char2))
385 ;; STRING-GREATERP
386 (values '>
387 #-unicode `(char-greaterp char1 char2)
388 #+unicode `(> char1 char2)))))
389
390 #-unicode
391 (defmacro string-less-greater-equal (lessp equalp)
392 (multiple-value-bind (length-test character-test)
393 (string-less-greater-equal-tests lessp equalp)
394 `(with-two-strings string1 string2 start1 end1 offset1 start2 end2
395 (let ((slen1 (- (the fixnum end1) start1))
396 (slen2 (- (the fixnum end2) start2)))
397 (declare (fixnum slen1 slen2))
398 (if (or (minusp slen1) (minusp slen2))
399 ;;prevent endless looping later.
400 (error "Improper bounds for string comparison."))
401 (do ((index1 start1 (1+ index1))
402 (index2 start2 (1+ index2))
403 (char1)
404 (char2))
405 ((or (= index1 (the fixnum end1)) (= index2 (the fixnum end2)))
406 (if (,length-test slen1 slen2) (- index1 offset1)))
407 (declare (fixnum index1 index2))
408 (setq char1 (schar string1 index1))
409 (setq char2 (schar string2 index2))
410 (if (not (char-equal char1 char2))
411 (if ,character-test
412 (return (- index1 offset1))
413 (return ()))))))))
414
415 #+unicode
416 (defmacro equal-char-codepoint (codepoint)
417 `(let ((ch ,codepoint))
418 (if (< 96 ch 123)
419 (- ch 32)
420 #-(and unicode (not unicode-bootstrap))
421 ch
422 #+(and unicode (not unicode-bootstrap))
423 (if (> ch 127) (unicode-upper ch) ch))))
424
425 #+unicode
426 (defmacro string-less-greater-equal (lessp equalp)
427 (multiple-value-bind (length-test character-test)
428 (string-less-greater-equal-tests lessp equalp)
429 `(with-two-strings string1 string2 start1 end1 offset1 start2 end2
430 (let ((slen1 (- (the fixnum end1) start1))
431 (slen2 (- (the fixnum end2) start2)))
432 (declare (fixnum slen1 slen2))
433 (if (or (minusp slen1) (minusp slen2))
434 ;;prevent endless looping later.
435 (error "Improper bounds for string comparison."))
436 (do ((index1 start1 (1+ index1))
437 (index2 start2 (1+ index2)))
438 ((or (= index1 (the fixnum end1)) (= index2 (the fixnum end2)))
439 (if (,length-test slen1 slen2) (- index1 offset1)))
440 (declare (fixnum index1 index2))
441 (multiple-value-bind (char1 wide1)
442 (codepoint string1 index1)
443 (multiple-value-bind (char2 wide2)
444 (codepoint string2 index2)
445 (if (= (equal-char-codepoint char1)
446 (equal-char-codepoint char2))
447 (progn
448 (when wide1 (incf index1))
449 (when wide2 (incf index2)))
450 (if ,character-test
451 (return (- index1 offset1))
452 (return ()))))))))))
453
454 ) ; eval-when
455
456 (defun string-lessp* (string1 string2 start1 end1 start2 end2)
457 (declare (fixnum start1 start2))
458 (string-less-greater-equal t nil))
459
460 (defun string-greaterp* (string1 string2 start1 end1 start2 end2)
461 (declare (fixnum start1 start2))
462 (string-less-greater-equal nil nil))
463
464 (defun string-not-lessp* (string1 string2 start1 end1 start2 end2)
465 (declare (fixnum start1 start2))
466 (string-less-greater-equal nil t))
467
468 (defun string-not-greaterp* (string1 string2 start1 end1 start2 end2)
469 (declare (fixnum start1 start2))
470 (string-less-greater-equal t t))
471
472 (defun string-lessp (string1 string2 &key (start1 0) end1 (start2 0) end2)
473 "Given two strings, if the first string is lexicographically less than
474 the second string, returns the longest common prefix (using char-equal)
475 of the two strings. Otherwise, returns ()."
476 (string-lessp* string1 string2 start1 end1 start2 end2))
477
478 (defun string-greaterp (string1 string2 &key (start1 0) end1 (start2 0) end2)
479 "Given two strings, if the first string is lexicographically greater than
480 the second string, returns the longest common prefix (using char-equal)
481 of the two strings. Otherwise, returns ()."
482 (string-greaterp* string1 string2 start1 end1 start2 end2))
483
484 (defun string-not-lessp (string1 string2 &key (start1 0) end1 (start2 0) end2)
485 "Given two strings, if the first string is lexicographically greater
486 than or equal to the second string, returns the longest common prefix
487 (using char-equal) of the two strings. Otherwise, returns ()."
488 (string-not-lessp* string1 string2 start1 end1 start2 end2))
489
490 (defun string-not-greaterp (string1 string2 &key (start1 0) end1 (start2 0)
491 end2)
492 "Given two strings, if the first string is lexicographically less than
493 or equal to the second string, returns the longest common prefix
494 (using char-equal) of the two strings. Otherwise, returns ()."
495 (string-not-greaterp* string1 string2 start1 end1 start2 end2))
496
497
498 (defun make-string (count &key element-type ((:initial-element fill-char)))
499 "Given a character count and an optional fill character, makes and returns
500 a new string Count long filled with the fill character."
501 (declare (type fixnum count))
502 (assert (subtypep element-type 'character))
503 (if fill-char
504 (do ((i 0 (1+ i))
505 (string (make-string count)))
506 ((= i count) string)
507 (declare (fixnum i))
508 (setf (schar string i) fill-char))
509 (make-string count)))
510
511 (defun string-upcase (string &key (start 0) end)
512 "Given a string, returns a new string that is a copy of it with
513 all lower case alphabetic characters converted to uppercase."
514 (declare (fixnum start))
515 (let* ((string (if (stringp string) string (string string)))
516 (slen (length string)))
517 (declare (fixnum slen))
518 (with-one-string string start end offset
519 (let ((offset-slen (+ slen offset))
520 (newstring (make-string slen)))
521 (declare (fixnum offset-slen))
522 (do ((index offset (1+ index))
523 (new-index 0 (1+ new-index)))
524 ((= index start))
525 (declare (fixnum index new-index))
526 (setf (schar newstring new-index) (schar string index)))
527 (do ((index start (1+ index))
528 (new-index (- start offset) (1+ new-index)))
529 ((= index (the fixnum end)))
530 (declare (fixnum index new-index))
531 (multiple-value-bind (code wide) (codepoint string index)
532 (when wide (incf index))
533 ;; Handle ASCII specially because this is called early in
534 ;; initialization, before unidata is available.
535 (cond ((< 96 code 123) (decf code 32))
536 ((> code 127) (setq code (unicode-upper code))))
537 ;;@@ WARNING: this may, in theory, need to extend newstring
538 ;; but that never actually occurs as of Unicode 5.1.0,
539 ;; so I'm just going to ignore it for now...
540 (multiple-value-bind (hi lo) (surrogates code)
541 (setf (schar newstring new-index) hi)
542 (when lo
543 (setf (schar newstring (incf new-index)) lo)))))
544 ;;@@ WARNING: see above
545 (do ((index end (1+ index))
546 (new-index (- (the fixnum end) offset) (1+ new-index)))
547 ((= index offset-slen))
548 (declare (fixnum index new-index))
549 (setf (schar newstring new-index) (schar string index)))
550 newstring))))
551
552 (defun string-downcase (string &key (start 0) end)
553 "Given a string, returns a new string that is a copy of it with
554 all upper case alphabetic characters converted to lowercase."
555 (declare (fixnum start))
556 (let* ((string (if (stringp string) string (string string)))
557 (slen (length string)))
558 (declare (fixnum slen))
559 (with-one-string string start end offset
560 (let ((offset-slen (+ slen offset))
561 (newstring (make-string slen)))
562 (declare (fixnum offset-slen))
563 (do ((index offset (1+ index))
564 (new-index 0 (1+ new-index)))
565 ((= index start))
566 (declare (fixnum index new-index))
567 (setf (schar newstring new-index) (schar string index)))
568 (do ((index start (1+ index))
569 (new-index (- start offset) (1+ new-index)))
570 ((= index (the fixnum end)))
571 (declare (fixnum index new-index))
572 (multiple-value-bind (code wide) (codepoint string index)
573 (when wide (incf index))
574 ;; Handle ASCII specially because this is called early in
575 ;; initialization, before unidata is available.
576 (cond ((< 64 code 91) (incf code 32))
577 ((> code 127) (setq code (unicode-lower code))))
578 ;;@@ WARNING: this may, in theory, need to extend newstring
579 ;; but that never actually occurs as of Unicode 5.1.0,
580 ;; so I'm just going to ignore it for now...
581 (multiple-value-bind (hi lo) (surrogates code)
582 (setf (schar newstring new-index) hi)
583 (when lo
584 (setf (schar newstring (incf new-index)) lo)))))
585 ;;@@ WARNING: see above
586 (do ((index end (1+ index))
587 (new-index (- (the fixnum end) offset) (1+ new-index)))
588 ((= index offset-slen))
589 (declare (fixnum index new-index))
590 (setf (schar newstring new-index) (schar string index)))
591 newstring))))
592
593 (defun string-capitalize (string &key (start 0) end)
594 "Given a string, returns a copy of the string with the first
595 character of each ``word'' converted to upper-case, and remaining
596 chars in the word converted to lower case. A ``word'' is defined
597 to be a string of case-modifiable characters delimited by
598 non-case-modifiable chars."
599 (declare (fixnum start))
600 (let* ((string (if (stringp string) string (string string)))
601 (slen (length string)))
602 (declare (fixnum slen))
603 (with-one-string string start end offset
604 (let ((offset-slen (+ slen offset))
605 (newstring (make-string slen)))
606 (declare (fixnum offset-slen))
607 (do ((index offset (1+ index))
608 (new-index 0 (1+ new-index)))
609 ((= index start))
610 (declare (fixnum index new-index))
611 (setf (schar newstring new-index) (schar string index)))
612 (do ((index start (1+ index))
613 (new-index (- start offset) (1+ new-index))
614 (newword t)
615 (char ()))
616 ((= index (the fixnum end)))
617 (declare (fixnum index new-index))
618 (setq char (schar string index))
619 (cond ((not (alphanumericp char))
620 (setq newword t))
621 (newword
622 ;;char is first case-modifiable after non-case-modifiable
623 (setq char (char-titlecase char))
624 (setq newword ()))
625 ;;char is case-modifiable, but not first
626 (t (setq char (char-downcase char))))
627 (setf (schar newstring new-index) char))
628 (do ((index end (1+ index))
629 (new-index (- (the fixnum end) offset) (1+ new-index)))
630 ((= index offset-slen))
631 (declare (fixnum index new-index))
632 (setf (schar newstring new-index) (schar string index)))
633 newstring))))
634
635 (defun nstring-upcase (string &key (start 0) end)
636 "Given a string, returns that string with all lower case alphabetic
637 characters converted to uppercase."
638 (declare (fixnum start))
639 (let ((save-header string))
640 (with-one-string string start end offset
641 (do ((index start (1+ index)))
642 ((= index (the fixnum end)))
643 (declare (fixnum index))
644 (multiple-value-bind (code wide) (codepoint string index)
645 (declare (ignore wide))
646 ;; Handle ASCII specially because this is called early in
647 ;; initialization, before unidata is available.
648 (cond ((< 96 code 123) (decf code 32))
649 ((> code 127) (setq code (unicode-upper code))))
650 ;;@@ WARNING: this may, in theory, need to extend string
651 ;; (which, obviously, we can't do here. Unless
652 ;; STRING is adjustable, maybe)
653 ;; but that never actually occurs as of Unicode 5.1.0,
654 ;; so I'm just going to ignore it for now...
655 (multiple-value-bind (hi lo) (surrogates code)
656 (setf (schar string index) hi)
657 (when lo
658 (setf (schar string (incf index)) lo))))))
659 save-header))
660
661 (defun nstring-downcase (string &key (start 0) end)
662 "Given a string, returns that string with all upper case alphabetic
663 characters converted to lowercase."
664 (declare (fixnum start))
665 (let ((save-header string))
666 (with-one-string string start end offset
667 (do ((index start (1+ index)))
668 ((= index (the fixnum end)))
669 (declare (fixnum index))
670 (multiple-value-bind (code wide) (codepoint string index)
671 (declare (ignore wide))
672 (cond ((< 64 code 91) (incf code 32))
673 ((> code 127) (setq code (unicode-lower code))))
674 ;;@@ WARNING: this may, in theory, need to extend string
675 ;; (which, obviously, we can't do here. Unless
676 ;; STRING is adjustable, maybe)
677 ;; but that never actually occurs as of Unicode 5.1.0,
678 ;; so I'm just going to ignore it for now...
679 (multiple-value-bind (hi lo) (surrogates code)
680 (setf (schar string index) hi)
681 (when lo
682 (setf (schar string (incf index)) lo))))))
683 save-header))
684
685 (defun nstring-capitalize (string &key (start 0) end)
686 "Given a string, returns that string with the first
687 character of each ``word'' converted to upper-case, and remaining
688 chars in the word converted to lower case. A ``word'' is defined
689 to be a string of case-modifiable characters delimited by
690 non-case-modifiable chars."
691 (declare (fixnum start))
692 (let ((save-header string))
693 (with-one-string string start end offset
694 (do ((index start (1+ index))
695 (newword t)
696 (char ()))
697 ((= index (the fixnum end)))
698 (declare (fixnum index))
699 (setq char (schar string index))
700 (cond ((not (alphanumericp char))
701 (setq newword t))
702 (newword
703 ;;char is first case-modifiable after non-case-modifiable
704 (setf (schar string index) (char-titlecase char))
705 (setq newword ()))
706 (t
707 (setf (schar string index) (char-downcase char))))))
708 save-header))
709
710 (defun string-left-trim (char-bag string)
711 "Given a set of characters (a list or string) and a string, returns
712 a copy of the string with the characters in the set removed from the
713 left end."
714 (with-string string
715 (do ((index start (1+ index)))
716 ((or (= index (the fixnum end))
717 (not (find (schar string index) char-bag)))
718 (subseq (the simple-string string) index end))
719 (declare (fixnum index)))))
720
721 (defun string-right-trim (char-bag string)
722 "Given a set of characters (a list or string) and a string, returns
723 a copy of the string with the characters in the set removed from the
724 right end."
725 (with-string string
726 (do ((index (1- (the fixnum end)) (1- index)))
727 ((or (< index start) (not (find (schar string index) char-bag)))
728 (subseq (the simple-string string) start (1+ index)))
729 (declare (fixnum index)))))
730
731 (defun string-trim (char-bag string)
732 "Given a set of characters (a list or string) and a string, returns a
733 copy of the string with the characters in the set removed from both
734 ends."
735 (with-string string
736 (let* ((left-end (do ((index start (1+ index)))
737 ((or (= index (the fixnum end))
738 (not (find (schar string index) char-bag)))
739 index)
740 (declare (fixnum index))))
741 (right-end (do ((index (1- (the fixnum end)) (1- index)))
742 ((or (< index left-end)
743 (not (find (schar string index) char-bag)))
744 (1+ index))
745 (declare (fixnum index)))))
746 (subseq (the simple-string string) left-end right-end))))
747
748 (declaim (inline %glyph-f %glyph-b))
749 (defun %glyph-f (string index)
750 (declare (optimize (speed 3) (space 0) (safety 0) (debug 0))
751 (type simple-string string) (type kernel:index index))
752 (let* ((prev 0)
753 (l (length string))
754 (c (codepoint string index l))
755 (n (+ index (if (> c #xFFFF) 2 1))))
756 (declare (type (integer 0 #x10FFFF) c) (type kernel:index l n))
757 (loop while (< n l) do
758 (let* ((c (codepoint string n l))
759 (d (the (unsigned-byte 8) (unicode-combining-class c))))
760 (when (or (zerop d) (< d prev))
761 (return))
762 (setq prev d)
763 (incf n (if (> c #xFFFF) 2 1))))
764 n))
765
766 (defun %glyph-b (string index)
767 (declare (optimize (speed 3) (space 0) (safety 0) (debug 0))
768 (type simple-string string) (type kernel:index index))
769 (let* ((prev 255)
770 (n (1- index)))
771 (declare (type kernel:index n))
772 (loop until (< n 0) do
773 (let* ((c (codepoint string n 0))
774 (d (the (unsigned-byte 8) (unicode-combining-class c))))
775 (cond ((zerop d) (return))
776 ((> d prev) (incf n (if (> c #xFFFF) 2 1)) (return)))
777 (setq prev d)
778 (decf n (if (> c #xFFFF) 2 1))))
779 n))
780
781 (defun glyph (string index &key (from-end nil))
782 "GLYPH returns the glyph at the indexed position in a string, and the
783 position of the next glyph (or NIL) as a second value. A glyph is
784 a substring consisting of the character at INDEX followed by all
785 subsequent combining characters."
786 (declare (type simple-string string) (type kernel:index index))
787 #-unicode
788 (char string index)
789 #+unicode
790 (with-array-data ((string string) (start) (end))
791 (declare (ignore start end))
792 (let ((n (if from-end (%glyph-b string index) (%glyph-f string index))))
793 (if from-end
794 (values (subseq string n index) (and (> n 0) n))
795 (values (subseq string index n) (and (< n (length string)) n))))))
796
797 (defun sglyph (string index &key (from-end nil))
798 "SGLYPH returns the glyph at the indexed position, the same as GLYPH,
799 except that the string must be a simple-string"
800 (declare (type simple-string string) (type kernel:index index))
801 #-unicode
802 (schar string index)
803 #+unicode
804 (let ((n (if from-end (%glyph-b string index) (%glyph-f string index))))
805 (if from-end
806 (values (subseq string n index) (and (> n 0) n))
807 (values (subseq string index n) (and (< n (length string)) n)))))
808
809 (defun decompose (string &optional (compatibility t))
810 (declare (type string string))
811 (let ((result (make-string (cond ((< (length string) 40)
812 (* 5 (length string)))
813 ((< (length string) 4096)
814 (* 2 (length string)))
815 (t (round (length string) 5/6)))))
816 (fillptr 0))
817 (declare (type kernel:index fillptr))
818 (labels ((rec (string)
819 (declare (type simple-string string))
820 (do ((i 0 (1+ i)))
821 ((= i (length string)))
822 (declare (type kernel:index i))
823 (multiple-value-bind (code wide) (codepoint string i)
824 (when wide (incf i))
825 (let ((decomp (unicode-decomp code compatibility)))
826 (if decomp (rec decomp) (out code))))))
827 (out (code)
828 (multiple-value-bind (hi lo) (surrogates code)
829 (outch hi)
830 (when lo
831 (outch lo))
832 (let ((cc (unicode-combining-class code)))
833 (unless (zerop cc)
834 (order lo cc (- fillptr (if lo 2 1)))))))
835 (outch (char)
836 (when (= fillptr (length result))
837 (let ((tmp (make-string (round (length result) 5/6))))
838 (replace tmp result)
839 (setq result tmp)))
840 (setf (schar result fillptr) char)
841 (incf fillptr))
842 (order (wide1 cc last)
843 (loop until (minusp last) do
844 (multiple-value-bind (code2 wide2) (codepoint result last)
845 (let ((cc2 (unicode-combining-class code2)))
846 (cond ((zerop cc2) (return))
847 ((> cc2 cc)
848 (case (+ (if wide2 2 0) (if wide1 1 0))
849 (0 (rotatef (schar result last)
850 (schar result (1+ last))))
851 (1 (rotatef (schar result last)
852 (schar result (+ last 1))
853 (schar result (+ last 2))))
854 (2 (rotatef (schar result last)
855 (schar result (1- last))
856 (schar result (1+ last))))
857 (3 (rotatef (schar result last)
858 (schar result (+ last 2)))
859 (rotatef (schar result (1- last))
860 (schar result (1+ last)))))
861 (decf last (if wide2 2 1)))
862 (t (return))))))))
863 (with-array-data ((string string) (start) (end))
864 (declare (ignore start end))
865 (rec string))
866 (shrink-vector result fillptr))))
867
868 (defun string-to-nfd (string)
869 "Convert String to Unicode Normalization Form D (NFD) using the
870 canonical decomposition. The NFD string is returned"
871 (decompose string nil))
872
873 (defun string-to-nfkd (string)
874 "Convert String to Unicode Normalization Form KD (NFKD) uisng the
875 compatible decomposition form. The NFKD string is returned."
876 (decompose string t))
877
878 #+unicode
879 (defun string-to-nfc (string)
880 ;;@@ Implement me
881 ;; must return a simple-string for the package machinery
882 (if (simple-string-p string) string (coerce string 'simple-string)))
883
884 #-unicode ;; Needed by package.lisp
885 (defun string-to-nfc (string)
886 (if (simple-string-p string) string (coerce string 'simple-string)))
887
888 (defun string-to-nfkc (string)
889 ;;@@ Implement me
890 (if (simple-string-p string) string (coerce string 'simple-string)))

  ViewVC Help
Powered by ViewVC 1.1.5