/[cmucl]/src/code/string.lisp
ViewVC logotype

Contents of /src/code/string.lisp

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.12.30.16 - (show annotations)
Tue May 19 20:24:19 2009 UTC (4 years, 11 months ago) by rtoy
Branch: unicode-utf16-extfmt-branch
Changes since 1.12.30.15: +19 -1 lines
Add UTF16-STRING-P to determine if a string is a valid UTF-16 encoded
string.
1 ;;; -*- Log: code.log; Package: Lisp -*-
2 ;;;
3 ;;; **********************************************************************
4 ;;; This code was written as part of the CMU Common Lisp project at
5 ;;; Carnegie Mellon University, and has been placed in the public domain.
6 ;;;
7 (ext:file-comment
8 "$Header: /tiger/var/lib/cvsroots/cmucl/src/code/string.lisp,v 1.12.30.16 2009/05/19 20:24:19 rtoy Exp $")
9 ;;;
10 ;;; **********************************************************************
11 ;;;
12 ;;; Functions to implement strings for CMU Common Lisp
13 ;;; Written by David Dill
14 ;;; Rewritten by Skef Wholey, Bill Chiles and Rob MacLachlan.
15 ;;;
16 ;;; ****************************************************************
17 ;;;
18 (in-package "LISP")
19 (export '(char schar glyph sglyph string
20 string= string-equal string< string> string<= string>= string/=
21 string-lessp string-greaterp string-not-lessp string-not-greaterp
22 string-not-equal
23 string-to-nfd string-to-nfkd string-to-nfc string-to-nfkc
24 make-string
25 string-trim string-left-trim string-right-trim
26 string-upcase
27 string-downcase string-capitalize nstring-upcase nstring-downcase
28 nstring-capitalize))
29
30
31 (declaim (inline surrogates-to-codepoint codepoint surrogates))
32
33 (defun surrogates-to-codepoint (hi lo)
34 "Convert the given Hi and Lo surrogate characters to the
35 corresponding codepoint value"
36 (declare (type character hi lo))
37 (+ (ash (- (the (integer #xD800 #xDBFF) (char-code hi)) #xD800) 10)
38 (the (integer #xDC00 #xDFFF) (char-code lo)) #x2400))
39
40 (defun codepoint (string i &optional (end (length string)))
41 "Return the codepoint value from String at position I. If that
42 position is a surrogate, it is combined with either the previous or
43 following character (when possible) to compute the codepoint. The
44 second return value is NIL if the position is not a surrogate pair.
45 Otherwise +1 or -1 is returned if the position is the high or low
46 surrogate value, respectively."
47 (declare (type simple-string string) (type kernel:index i end))
48 (let ((code (char-code (schar string i))))
49 (cond ((and (<= #xD800 code #xDBFF) (< (1+ i) end))
50 (let ((tmp (char-code (schar string (1+ i)))))
51 (if (<= #xDC00 tmp #xDFFF)
52 (values (+ (ash (- code #xD800) 10) tmp #x2400) +1)
53 (values code nil))))
54 ((and (<= #xDC00 code #xDFFF) (> i 0))
55 (let ((tmp (char-code (schar string (1- i)))))
56 (if (<= #xD800 tmp #xDBFF)
57 (values (+ (ash (- tmp #xD800) 10) code #x2400) -1)
58 (values code nil))))
59 (t (values code nil)))))
60
61 (defun surrogates (codepoint)
62 "Return the high and low surrogate characters for Codepoint. If
63 Codepoint is in the BMP, the first return value is the corresponding
64 character and the second is NIL."
65 (declare (type (integer 0 #x10FFFF) codepoint))
66 (if (< codepoint #x10000)
67 (values (code-char codepoint) nil)
68 (let* ((tmp (- codepoint #x10000))
69 (hi (logior (ldb (byte 10 10) tmp) #xD800))
70 (lo (logior (ldb (byte 10 0) tmp) #xDC00)))
71 (values (code-char hi) (code-char lo)))))
72
73 (defun utf16-string-p (string)
74 "Check if String is a valid UTF-16 string. If the string is valid,
75 T is returned. If the string is not valid, NIL is returned, and the
76 second value is the index into the string of the invalid character."
77 (do ((len (length string))
78 (index 0 (1+ index)))
79 ((>= index len)
80 t)
81 (multiple-value-bind (codepoint wide)
82 (codepoint string index)
83 ;; We stepping through the string in order. If there are any
84 ;; surrogates, we must reach the lead surrogate first, which
85 ;; means WIDE is +1. If we get any surrogate codepoint that
86 ;; is in the surrogate range, we have an invalid string.
87 (when (or (eq wide -1)
88 (<= #xD800 codepoint #xDFFF))
89 (return-from utf16-string-p (values nil index)))
90 (when wide (incf index)))))
91
92 (defun string (X)
93 "Coerces X into a string. If X is a string, X is returned. If X is a
94 symbol, X's pname is returned. If X is a character then a one element
95 string containing that character is returned. If X cannot be coerced
96 into a string, an error occurs."
97 (cond ((stringp x) x)
98 ((symbolp x) (symbol-name x))
99 ((characterp x)
100 (let ((res (make-string 1)))
101 (setf (schar res 0) x) res))
102 (t
103 (error 'simple-type-error
104 :datum x
105 :expected-type '(or string symbol character)
106 :format-control "~S cannot be coerced to a string."
107 :format-arguments (list x)))))
108
109 ;;; With-One-String is used to set up some string hacking things. The keywords
110 ;;; are parsed, and the string is hacked into a simple-string.
111
112 (eval-when (compile)
113
114 (defmacro with-one-string (string start end cum-offset &rest forms)
115 `(let ((,string (if (stringp ,string) ,string (string ,string))))
116 ;; Optimizer may prove STRING is one.
117 (declare (optimize (ext:inhibit-warnings 3)))
118 (with-array-data ((,string ,string :offset-var ,cum-offset)
119 (,start ,start)
120 (,end (or ,end (length (the vector ,string)))))
121 ,@forms)))
122
123 )
124
125 ;;; With-String is like With-One-String, but doesn't parse keywords.
126
127 (eval-when (compile)
128
129 (defmacro with-string (string &rest forms)
130 `(let ((,string (if (stringp ,string) ,string (string ,string))))
131 (with-array-data ((,string ,string)
132 (start)
133 (end (length (the vector ,string))))
134 ,@forms)))
135
136 )
137
138 ;;; With-Two-Strings is used to set up string comparison operations. The
139 ;;; keywords are parsed, and the strings are hacked into simple-strings.
140
141 (eval-when (compile)
142
143 (defmacro with-two-strings (string1 string2 start1 end1 cum-offset-1
144 start2 end2 &rest forms)
145 `(let ((,string1 (if (stringp ,string1) ,string1 (string ,string1)))
146 (,string2 (if (stringp ,string2) ,string2 (string ,string2))))
147 (with-array-data ((,string1 ,string1 :offset-var ,cum-offset-1)
148 (,start1 ,start1)
149 (,end1 (or ,end1 (length (the vector ,string1)))))
150 (with-array-data ((,string2 ,string2)
151 (,start2 ,start2)
152 (,end2 (or ,end2 (length (the vector ,string2)))))
153 ,@forms))))
154
155 )
156
157
158 (defun char (string index)
159 "Given a string and a non-negative integer index less than the length of
160 the string, returns the character object representing the character at
161 that position in the string."
162 (declare (optimize (safety 1)))
163 (char string index))
164
165 (defun %charset (string index new-el)
166 (declare (optimize (safety 1)))
167 (setf (char string index) new-el))
168
169 (defun schar (string index)
170 "SCHAR returns the character object at an indexed position in a string
171 just as CHAR does, except the string must be a simple-string."
172 (declare (optimize (safety 1)))
173 (schar string index))
174
175 (defun %scharset (string index new-el)
176 (declare (optimize (safety 1)))
177 (setf (schar string index) new-el))
178
179 (defun string=* (string1 string2 start1 end1 start2 end2)
180 (with-two-strings string1 string2 start1 end1 offset1 start2 end2
181 (not (%sp-string-compare string1 start1 end1 string2 start2 end2))))
182
183
184 (defun string/=* (string1 string2 start1 end1 start2 end2)
185 (with-two-strings string1 string2 start1 end1 offset1 start2 end2
186 (let ((comparison (%sp-string-compare string1 start1 end1
187 string2 start2 end2)))
188 (if comparison (- (the fixnum comparison) offset1)))))
189
190 (eval-when (compile eval)
191
192 ;;; Lessp is true if the desired expansion is for string<* or string<=*.
193 ;;; Equalp is true if the desired expansion is for string<=* or string>=*.
194 (defmacro string<>=*-body (lessp equalp)
195 (let ((offset1 (gensym)))
196 `(with-two-strings string1 string2 start1 end1 ,offset1 start2 end2
197 (let ((index (%sp-string-compare string1 start1 end1
198 string2 start2 end2)))
199 (if index
200 (cond ((= (the fixnum index) (the fixnum end1))
201 ,(if lessp
202 `(- (the fixnum index) ,offset1)
203 `nil))
204 ((= (+ (the fixnum index) (- start2 start1))
205 (the fixnum end2))
206 ,(if lessp
207 `nil
208 `(- (the fixnum index) ,offset1)))
209 #-unicode
210 ((,(if lessp 'char< 'char>)
211 (schar string1 index)
212 (schar string2 (+ (the fixnum index) (- start2 start1))))
213 (- (the fixnum index) ,offset1))
214 #-unicode
215 (t nil)
216 #+unicode
217 (t
218 ;; Compare in code point order. See
219 ;; http://icu-project.org/docs/papers/utf16_code_point_order.html
220 (flet ((fixup (code)
221 (if (>= code #xe000)
222 (- code #x800)
223 (+ code #x2000))))
224 (declare (inline fixup))
225 (let* ((c1 (char-code (schar string1 index)))
226 (c2 (char-code (schar string2
227 (+ (the fixnum index)
228 (- start2 start1))))))
229 (cond ((and (>= c1 #xd800)
230 (>= c2 #xd800))
231 (let ((fix-c1 (fixup c1))
232 (fix-c2 (fixup c2)))
233 (if (,(if lessp '< '>) fix-c1 fix-c2)
234 (- (the fixnum index) ,offset1)
235 nil)))
236 (t
237 (if (,(if lessp '< '>) c1 c2)
238 (- (the fixnum index) ,offset1)
239 nil)))))))
240 ,(if equalp `(- (the fixnum end1) ,offset1) 'nil))))))
241 ) ; eval-when
242
243 (defun string<* (string1 string2 start1 end1 start2 end2)
244 (declare (fixnum start1 start2))
245 (string<>=*-body t nil))
246
247 (defun string>* (string1 string2 start1 end1 start2 end2)
248 (declare (fixnum start1 start2))
249 (string<>=*-body nil nil))
250
251 (defun string<=* (string1 string2 start1 end1 start2 end2)
252 (declare (fixnum start1 start2))
253 (string<>=*-body t t))
254
255 (defun string>=* (string1 string2 start1 end1 start2 end2)
256 (declare (fixnum start1 start2))
257 (string<>=*-body nil t))
258
259
260
261 (defun string< (string1 string2 &key (start1 0) end1 (start2 0) end2)
262 "Given two strings, if the first string is lexicographically less than
263 the second string, returns the longest common prefix (using char=)
264 of the two strings. Otherwise, returns ()."
265 (string<* string1 string2 start1 end1 start2 end2))
266
267 (defun string> (string1 string2 &key (start1 0) end1 (start2 0) end2)
268 "Given two strings, if the first string is lexicographically greater than
269 the second string, returns the longest common prefix (using char=)
270 of the two strings. Otherwise, returns ()."
271 (string>* string1 string2 start1 end1 start2 end2))
272
273
274 (defun string<= (string1 string2 &key (start1 0) end1 (start2 0) end2)
275 "Given two strings, if the first string is lexicographically less than
276 or equal to the second string, returns the longest common prefix
277 (using char=) of the two strings. Otherwise, returns ()."
278 (string<=* string1 string2 start1 end1 start2 end2))
279
280 (defun string>= (string1 string2 &key (start1 0) end1 (start2 0) end2)
281 "Given two strings, if the first string is lexicographically greater
282 than or equal to the second string, returns the longest common prefix
283 (using char=) of the two strings. Otherwise, returns ()."
284 (string>=* string1 string2 start1 end1 start2 end2))
285
286 (defun string= (string1 string2 &key (start1 0) end1 (start2 0) end2)
287 "Given two strings (string1 and string2), and optional integers start1,
288 start2, end1 and end2, compares characters in string1 to characters in
289 string2 (using char=)."
290 (string=* string1 string2 start1 end1 start2 end2))
291
292 (defun string/= (string1 string2 &key (start1 0) end1 (start2 0) end2)
293 "Given two strings, if the first string is not lexicographically equal
294 to the second string, returns the longest common prefix (using char=)
295 of the two strings. Otherwise, returns ()."
296 (string/=* string1 string2 start1 end1 start2 end2))
297
298
299 (eval-when (compile eval)
300
301 ;;; STRING-NOT-EQUAL-LOOP is used to generate character comparison loops for
302 ;;; STRING-EQUAL and STRING-NOT-EQUAL.
303 (defmacro string-not-equal-loop (end end-value
304 &optional (abort-value nil abortp))
305 (declare (fixnum end))
306 (let ((end-test (if (= end 1)
307 `(= index1 (the fixnum end1))
308 `(= index2 (the fixnum end2)))))
309 `(do ((index1 start1 (1+ index1))
310 (index2 start2 (1+ index2)))
311 (,(if abortp
312 end-test
313 `(or ,end-test
314 (not (char-equal (schar string1 index1)
315 (schar string2 index2)))))
316 ,end-value)
317 (declare (fixnum index1 index2))
318 ,@(if abortp
319 `((if (not (char-equal (schar string1 index1)
320 (schar string2 index2)))
321 (return ,abort-value)))))))
322
323 ) ; eval-when
324
325 (defun string-equal (string1 string2 &key (start1 0) end1 (start2 0) end2)
326 "Given two strings (string1 and string2), and optional integers start1,
327 start2, end1 and end2, compares characters in string1 to characters in
328 string2 (using char-equal)."
329 (declare (fixnum start1 start2))
330 (with-two-strings string1 string2 start1 end1 offset1 start2 end2
331 (let ((slen1 (- (the fixnum end1) start1))
332 (slen2 (- (the fixnum end2) start2)))
333 (declare (fixnum slen1 slen2))
334 (if (or (minusp slen1) (minusp slen2))
335 ;;prevent endless looping later.
336 (error "Improper bounds for string comparison."))
337 (if (= slen1 slen2)
338 ;;return () immediately if lengths aren't equal.
339 (string-not-equal-loop 1 t nil)))))
340
341 (defun string-not-equal (string1 string2 &key (start1 0) end1 (start2 0) end2)
342 "Given two strings, if the first string is not lexicographically equal
343 to the second string, returns the longest common prefix (using char-equal)
344 of the two strings. Otherwise, returns ()."
345 (with-two-strings string1 string2 start1 end1 offset1 start2 end2
346 (let ((slen1 (- end1 start1))
347 (slen2 (- end2 start2)))
348 (declare (fixnum slen1 slen2))
349 (if (or (minusp slen1) (minusp slen2))
350 ;;prevent endless looping later.
351 (error "Improper bounds for string comparison."))
352 (cond ((or (minusp slen1) (or (minusp slen2)))
353 (error "Improper substring for comparison."))
354 ((= slen1 slen2)
355 (string-not-equal-loop 1 nil (- index1 offset1)))
356 ((< slen1 slen2)
357 (string-not-equal-loop 1 (- index1 offset1)))
358 (t
359 (string-not-equal-loop 2 (- index1 offset1)))))))
360
361
362
363 (eval-when (compile eval)
364
365 ;;; STRING-LESS-GREATER-EQUAL-TESTS returns a test on the lengths of string1
366 ;;; and string2 and a test on the current characters from string1 and string2
367 ;;; for the following macro.
368 (defun string-less-greater-equal-tests (lessp equalp)
369 (if lessp
370 (if equalp
371 ;; STRING-NOT-GREATERP
372 (values '<=
373 #-unicode `(not (char-greaterp char1 char2))
374 #+unicode `(<= char1 char2))
375 ;; STRING-LESSP
376 (values '<
377 #-unicode `(char-lessp char1 char2)
378 #+unicode `(< char1 char2)))
379 (if equalp
380 ;; STRING-NOT-LESSP
381 (values '>=
382 #-unicode `(not (char-lessp char1 char2))
383 #+unicode `(>= char1 char2))
384 ;; STRING-GREATERP
385 (values '>
386 #-unicode `(char-greaterp char1 char2)
387 #+unicode `(> char1 char2)))))
388
389 #-unicode
390 (defmacro string-less-greater-equal (lessp equalp)
391 (multiple-value-bind (length-test character-test)
392 (string-less-greater-equal-tests lessp equalp)
393 `(with-two-strings string1 string2 start1 end1 offset1 start2 end2
394 (let ((slen1 (- (the fixnum end1) start1))
395 (slen2 (- (the fixnum end2) start2)))
396 (declare (fixnum slen1 slen2))
397 (if (or (minusp slen1) (minusp slen2))
398 ;;prevent endless looping later.
399 (error "Improper bounds for string comparison."))
400 (do ((index1 start1 (1+ index1))
401 (index2 start2 (1+ index2))
402 (char1)
403 (char2))
404 ((or (= index1 (the fixnum end1)) (= index2 (the fixnum end2)))
405 (if (,length-test slen1 slen2) (- index1 offset1)))
406 (declare (fixnum index1 index2))
407 (setq char1 (schar string1 index1))
408 (setq char2 (schar string2 index2))
409 (if (not (char-equal char1 char2))
410 (if ,character-test
411 (return (- index1 offset1))
412 (return ()))))))))
413
414 #+unicode
415 (defmacro equal-char-codepoint (codepoint)
416 `(let ((ch ,codepoint))
417 (if (< 96 ch 123)
418 (- ch 32)
419 #-(and unicode (not unicode-bootstrap))
420 ch
421 #+(and unicode (not unicode-bootstrap))
422 (if (> ch 127) (unicode-upper ch) ch))))
423
424 #+unicode
425 (defmacro string-less-greater-equal (lessp equalp)
426 (multiple-value-bind (length-test character-test)
427 (string-less-greater-equal-tests lessp equalp)
428 `(with-two-strings string1 string2 start1 end1 offset1 start2 end2
429 (let ((slen1 (- (the fixnum end1) start1))
430 (slen2 (- (the fixnum end2) start2)))
431 (declare (fixnum slen1 slen2))
432 (if (or (minusp slen1) (minusp slen2))
433 ;;prevent endless looping later.
434 (error "Improper bounds for string comparison."))
435 (do ((index1 start1 (1+ index1))
436 (index2 start2 (1+ index2)))
437 ((or (= index1 (the fixnum end1)) (= index2 (the fixnum end2)))
438 (if (,length-test slen1 slen2) (- index1 offset1)))
439 (declare (fixnum index1 index2))
440 (multiple-value-bind (char1 wide1)
441 (codepoint string1 index1)
442 (multiple-value-bind (char2 wide2)
443 (codepoint string2 index2)
444 (if (= (equal-char-codepoint char1)
445 (equal-char-codepoint char2))
446 (progn
447 (when wide1 (incf index1))
448 (when wide2 (incf index2)))
449 (if ,character-test
450 (return (- index1 offset1))
451 (return ()))))))))))
452
453 ) ; eval-when
454
455 (defun string-lessp* (string1 string2 start1 end1 start2 end2)
456 (declare (fixnum start1 start2))
457 (string-less-greater-equal t nil))
458
459 (defun string-greaterp* (string1 string2 start1 end1 start2 end2)
460 (declare (fixnum start1 start2))
461 (string-less-greater-equal nil nil))
462
463 (defun string-not-lessp* (string1 string2 start1 end1 start2 end2)
464 (declare (fixnum start1 start2))
465 (string-less-greater-equal nil t))
466
467 (defun string-not-greaterp* (string1 string2 start1 end1 start2 end2)
468 (declare (fixnum start1 start2))
469 (string-less-greater-equal t t))
470
471 (defun string-lessp (string1 string2 &key (start1 0) end1 (start2 0) end2)
472 "Given two strings, if the first string is lexicographically less than
473 the second string, returns the longest common prefix (using char-equal)
474 of the two strings. Otherwise, returns ()."
475 (string-lessp* string1 string2 start1 end1 start2 end2))
476
477 (defun string-greaterp (string1 string2 &key (start1 0) end1 (start2 0) end2)
478 "Given two strings, if the first string is lexicographically greater than
479 the second string, returns the longest common prefix (using char-equal)
480 of the two strings. Otherwise, returns ()."
481 (string-greaterp* string1 string2 start1 end1 start2 end2))
482
483 (defun string-not-lessp (string1 string2 &key (start1 0) end1 (start2 0) end2)
484 "Given two strings, if the first string is lexicographically greater
485 than or equal to the second string, returns the longest common prefix
486 (using char-equal) of the two strings. Otherwise, returns ()."
487 (string-not-lessp* string1 string2 start1 end1 start2 end2))
488
489 (defun string-not-greaterp (string1 string2 &key (start1 0) end1 (start2 0)
490 end2)
491 "Given two strings, if the first string is lexicographically less than
492 or equal to the second string, returns the longest common prefix
493 (using char-equal) of the two strings. Otherwise, returns ()."
494 (string-not-greaterp* string1 string2 start1 end1 start2 end2))
495
496
497 (defun make-string (count &key element-type ((:initial-element fill-char)))
498 "Given a character count and an optional fill character, makes and returns
499 a new string Count long filled with the fill character."
500 (declare (type fixnum count))
501 (assert (subtypep element-type 'character))
502 (if fill-char
503 (do ((i 0 (1+ i))
504 (string (make-string count)))
505 ((= i count) string)
506 (declare (fixnum i))
507 (setf (schar string i) fill-char))
508 (make-string count)))
509
510 (defun string-upcase (string &key (start 0) end)
511 "Given a string, returns a new string that is a copy of it with
512 all lower case alphabetic characters converted to uppercase."
513 (declare (fixnum start))
514 (let* ((string (if (stringp string) string (string string)))
515 (slen (length string)))
516 (declare (fixnum slen))
517 (with-one-string string start end offset
518 (let ((offset-slen (+ slen offset))
519 (newstring (make-string slen)))
520 (declare (fixnum offset-slen))
521 (do ((index offset (1+ index))
522 (new-index 0 (1+ new-index)))
523 ((= index start))
524 (declare (fixnum index new-index))
525 (setf (schar newstring new-index) (schar string index)))
526 (do ((index start (1+ index))
527 (new-index (- start offset) (1+ new-index)))
528 ((= index (the fixnum end)))
529 (declare (fixnum index new-index))
530 (multiple-value-bind (code wide) (codepoint string index)
531 (when wide (incf index))
532 ;; Handle ASCII specially because this is called early in
533 ;; initialization, before unidata is available.
534 (cond ((< 96 code 123) (decf code 32))
535 ((> code 127) (setq code (unicode-upper code))))
536 ;;@@ WARNING: this may, in theory, need to extend newstring
537 ;; but that never actually occurs as of Unicode 5.1.0,
538 ;; so I'm just going to ignore it for now...
539 (multiple-value-bind (hi lo) (surrogates code)
540 (setf (schar newstring new-index) hi)
541 (when lo
542 (setf (schar newstring (incf new-index)) lo)))))
543 ;;@@ WARNING: see above
544 (do ((index end (1+ index))
545 (new-index (- (the fixnum end) offset) (1+ new-index)))
546 ((= index offset-slen))
547 (declare (fixnum index new-index))
548 (setf (schar newstring new-index) (schar string index)))
549 newstring))))
550
551 (defun string-downcase (string &key (start 0) end)
552 "Given a string, returns a new string that is a copy of it with
553 all upper case alphabetic characters converted to lowercase."
554 (declare (fixnum start))
555 (let* ((string (if (stringp string) string (string string)))
556 (slen (length string)))
557 (declare (fixnum slen))
558 (with-one-string string start end offset
559 (let ((offset-slen (+ slen offset))
560 (newstring (make-string slen)))
561 (declare (fixnum offset-slen))
562 (do ((index offset (1+ index))
563 (new-index 0 (1+ new-index)))
564 ((= index start))
565 (declare (fixnum index new-index))
566 (setf (schar newstring new-index) (schar string index)))
567 (do ((index start (1+ index))
568 (new-index (- start offset) (1+ new-index)))
569 ((= index (the fixnum end)))
570 (declare (fixnum index new-index))
571 (multiple-value-bind (code wide) (codepoint string index)
572 (when wide (incf index))
573 ;; Handle ASCII specially because this is called early in
574 ;; initialization, before unidata is available.
575 (cond ((< 64 code 91) (incf code 32))
576 ((> code 127) (setq code (unicode-lower code))))
577 ;;@@ WARNING: this may, in theory, need to extend newstring
578 ;; but that never actually occurs as of Unicode 5.1.0,
579 ;; so I'm just going to ignore it for now...
580 (multiple-value-bind (hi lo) (surrogates code)
581 (setf (schar newstring new-index) hi)
582 (when lo
583 (setf (schar newstring (incf new-index)) lo)))))
584 ;;@@ WARNING: see above
585 (do ((index end (1+ index))
586 (new-index (- (the fixnum end) offset) (1+ new-index)))
587 ((= index offset-slen))
588 (declare (fixnum index new-index))
589 (setf (schar newstring new-index) (schar string index)))
590 newstring))))
591
592 (defun string-capitalize (string &key (start 0) end)
593 "Given a string, returns a copy of the string with the first
594 character of each ``word'' converted to upper-case, and remaining
595 chars in the word converted to lower case. A ``word'' is defined
596 to be a string of case-modifiable characters delimited by
597 non-case-modifiable chars."
598 (declare (fixnum start))
599 (let* ((string (if (stringp string) string (string string)))
600 (slen (length string)))
601 (declare (fixnum slen))
602 (with-one-string string start end offset
603 (let ((offset-slen (+ slen offset))
604 (newstring (make-string slen)))
605 (declare (fixnum offset-slen))
606 (do ((index offset (1+ index))
607 (new-index 0 (1+ new-index)))
608 ((= index start))
609 (declare (fixnum index new-index))
610 (setf (schar newstring new-index) (schar string index)))
611 (do ((index start (1+ index))
612 (new-index (- start offset) (1+ new-index))
613 (newword t)
614 (char ()))
615 ((= index (the fixnum end)))
616 (declare (fixnum index new-index))
617 (setq char (schar string index))
618 (cond ((not (alphanumericp char))
619 (setq newword t))
620 (newword
621 ;;char is first case-modifiable after non-case-modifiable
622 (setq char (char-titlecase char))
623 (setq newword ()))
624 ;;char is case-modifiable, but not first
625 (t (setq char (char-downcase char))))
626 (setf (schar newstring new-index) char))
627 (do ((index end (1+ index))
628 (new-index (- (the fixnum end) offset) (1+ new-index)))
629 ((= index offset-slen))
630 (declare (fixnum index new-index))
631 (setf (schar newstring new-index) (schar string index)))
632 newstring))))
633
634 (defun nstring-upcase (string &key (start 0) end)
635 "Given a string, returns that string with all lower case alphabetic
636 characters converted to uppercase."
637 (declare (fixnum start))
638 (let ((save-header string))
639 (with-one-string string start end offset
640 (do ((index start (1+ index)))
641 ((= index (the fixnum end)))
642 (declare (fixnum index))
643 (multiple-value-bind (code wide) (codepoint string index)
644 (declare (ignore wide))
645 ;; Handle ASCII specially because this is called early in
646 ;; initialization, before unidata is available.
647 (cond ((< 96 code 123) (decf code 32))
648 ((> code 127) (setq code (unicode-upper code))))
649 ;;@@ WARNING: this may, in theory, need to extend string
650 ;; (which, obviously, we can't do here. Unless
651 ;; STRING is adjustable, maybe)
652 ;; but that never actually occurs as of Unicode 5.1.0,
653 ;; so I'm just going to ignore it for now...
654 (multiple-value-bind (hi lo) (surrogates code)
655 (setf (schar string index) hi)
656 (when lo
657 (setf (schar string (incf index)) lo))))))
658 save-header))
659
660 (defun nstring-downcase (string &key (start 0) end)
661 "Given a string, returns that string with all upper case alphabetic
662 characters converted to lowercase."
663 (declare (fixnum start))
664 (let ((save-header string))
665 (with-one-string string start end offset
666 (do ((index start (1+ index)))
667 ((= index (the fixnum end)))
668 (declare (fixnum index))
669 (multiple-value-bind (code wide) (codepoint string index)
670 (declare (ignore wide))
671 (cond ((< 64 code 91) (incf code 32))
672 ((> code 127) (setq code (unicode-lower code))))
673 ;;@@ WARNING: this may, in theory, need to extend string
674 ;; (which, obviously, we can't do here. Unless
675 ;; STRING is adjustable, maybe)
676 ;; but that never actually occurs as of Unicode 5.1.0,
677 ;; so I'm just going to ignore it for now...
678 (multiple-value-bind (hi lo) (surrogates code)
679 (setf (schar string index) hi)
680 (when lo
681 (setf (schar string (incf index)) lo))))))
682 save-header))
683
684 (defun nstring-capitalize (string &key (start 0) end)
685 "Given a string, returns that string with the first
686 character of each ``word'' converted to upper-case, and remaining
687 chars in the word converted to lower case. A ``word'' is defined
688 to be a string of case-modifiable characters delimited by
689 non-case-modifiable chars."
690 (declare (fixnum start))
691 (let ((save-header string))
692 (with-one-string string start end offset
693 (do ((index start (1+ index))
694 (newword t)
695 (char ()))
696 ((= index (the fixnum end)))
697 (declare (fixnum index))
698 (setq char (schar string index))
699 (cond ((not (alphanumericp char))
700 (setq newword t))
701 (newword
702 ;;char is first case-modifiable after non-case-modifiable
703 (setf (schar string index) (char-titlecase char))
704 (setq newword ()))
705 (t
706 (setf (schar string index) (char-downcase char))))))
707 save-header))
708
709 (defun string-left-trim (char-bag string)
710 "Given a set of characters (a list or string) and a string, returns
711 a copy of the string with the characters in the set removed from the
712 left end."
713 (with-string string
714 (do ((index start (1+ index)))
715 ((or (= index (the fixnum end))
716 (not (find (schar string index) char-bag)))
717 (subseq (the simple-string string) index end))
718 (declare (fixnum index)))))
719
720 (defun string-right-trim (char-bag string)
721 "Given a set of characters (a list or string) and a string, returns
722 a copy of the string with the characters in the set removed from the
723 right end."
724 (with-string string
725 (do ((index (1- (the fixnum end)) (1- index)))
726 ((or (< index start) (not (find (schar string index) char-bag)))
727 (subseq (the simple-string string) start (1+ index)))
728 (declare (fixnum index)))))
729
730 (defun string-trim (char-bag string)
731 "Given a set of characters (a list or string) and a string, returns a
732 copy of the string with the characters in the set removed from both
733 ends."
734 (with-string string
735 (let* ((left-end (do ((index start (1+ index)))
736 ((or (= index (the fixnum end))
737 (not (find (schar string index) char-bag)))
738 index)
739 (declare (fixnum index))))
740 (right-end (do ((index (1- (the fixnum end)) (1- index)))
741 ((or (< index left-end)
742 (not (find (schar string index) char-bag)))
743 (1+ index))
744 (declare (fixnum index)))))
745 (subseq (the simple-string string) left-end right-end))))
746
747 (declaim (inline %glyph-f %glyph-b))
748 (defun %glyph-f (string index)
749 (declare (optimize (speed 3) (space 0) (safety 0) (debug 0))
750 (type simple-string string) (type kernel:index index))
751 (let* ((prev 0)
752 (l (length string))
753 (c (codepoint string index l))
754 (n (+ index (if (> c #xFFFF) 2 1))))
755 (declare (type (integer 0 #x10FFFF) c) (type kernel:index l n))
756 (loop while (< n l) do
757 (let* ((c (codepoint string n l))
758 (d (the (unsigned-byte 8) (unicode-combining-class c))))
759 (when (or (zerop d) (< d prev))
760 (return))
761 (setq prev d)
762 (incf n (if (> c #xFFFF) 2 1))))
763 n))
764
765 (defun %glyph-b (string index)
766 (declare (optimize (speed 3) (space 0) (safety 0) (debug 0))
767 (type simple-string string) (type kernel:index index))
768 (let* ((prev 255)
769 (n (1- index)))
770 (declare (type kernel:index n))
771 (loop until (< n 0) do
772 (let* ((c (codepoint string n 0))
773 (d (the (unsigned-byte 8) (unicode-combining-class c))))
774 (cond ((zerop d) (return))
775 ((> d prev) (incf n (if (> c #xFFFF) 2 1)) (return)))
776 (setq prev d)
777 (decf n (if (> c #xFFFF) 2 1))))
778 n))
779
780 (defun glyph (string index &key (from-end nil))
781 "GLYPH returns the glyph at the indexed position in a string, and the
782 position of the next glyph (or NIL) as a second value. A glyph is
783 a substring consisting of the character at INDEX followed by all
784 subsequent combining characters."
785 (declare (type simple-string string) (type kernel:index index))
786 #-unicode
787 (char string index)
788 #+unicode
789 (with-array-data ((string string) (start) (end))
790 (declare (ignore start end))
791 (let ((n (if from-end (%glyph-b string index) (%glyph-f string index))))
792 (if from-end
793 (values (subseq string n index) (and (> n 0) n))
794 (values (subseq string index n) (and (< n (length string)) n))))))
795
796 (defun sglyph (string index &key (from-end nil))
797 "SGLYPH returns the glyph at the indexed position, the same as GLYPH,
798 except that the string must be a simple-string"
799 (declare (type simple-string string) (type kernel:index index))
800 #-unicode
801 (schar string index)
802 #+unicode
803 (let ((n (if from-end (%glyph-b string index) (%glyph-f string index))))
804 (if from-end
805 (values (subseq string n index) (and (> n 0) n))
806 (values (subseq string index n) (and (< n (length string)) n)))))
807
808 (defun decompose (string &optional (compatibility t))
809 (declare (type string string))
810 (let ((result (make-string (cond ((< (length string) 40)
811 (* 5 (length string)))
812 ((< (length string) 4096)
813 (* 2 (length string)))
814 (t (round (length string) 5/6)))))
815 (fillptr 0))
816 (declare (type kernel:index fillptr))
817 (labels ((rec (string)
818 (declare (type simple-string string))
819 (do ((i 0 (1+ i)))
820 ((= i (length string)))
821 (declare (type kernel:index i))
822 (multiple-value-bind (code wide) (codepoint string i)
823 (when wide (incf i))
824 (let ((decomp (unicode-decomp code compatibility)))
825 (if decomp (rec decomp) (out code))))))
826 (out (code)
827 (multiple-value-bind (hi lo) (surrogates code)
828 (outch hi)
829 (when lo
830 (outch lo))
831 (let ((cc (unicode-combining-class code)))
832 (unless (zerop cc)
833 (order lo cc (- fillptr (if lo 2 1)))))))
834 (outch (char)
835 (when (= fillptr (length result))
836 (let ((tmp (make-string (round (length result) 5/6))))
837 (replace tmp result)
838 (setq result tmp)))
839 (setf (schar result fillptr) char)
840 (incf fillptr))
841 (order (wide1 cc last)
842 (loop until (minusp last) do
843 (multiple-value-bind (code2 wide2) (codepoint result last)
844 (let ((cc2 (unicode-combining-class code2)))
845 (cond ((zerop cc2) (return))
846 ((> cc2 cc)
847 (case (+ (if wide2 2 0) (if wide1 1 0))
848 (0 (rotatef (schar result last)
849 (schar result (1+ last))))
850 (1 (rotatef (schar result last)
851 (schar result (+ last 1))
852 (schar result (+ last 2))))
853 (2 (rotatef (schar result last)
854 (schar result (1- last))
855 (schar result (1+ last))))
856 (3 (rotatef (schar result last)
857 (schar result (+ last 2)))
858 (rotatef (schar result (1- last))
859 (schar result (1+ last)))))
860 (decf last (if wide2 2 1)))
861 (t (return))))))))
862 (with-array-data ((string string) (start) (end))
863 (declare (ignore start end))
864 (rec string))
865 (shrink-vector result fillptr))))
866
867 (defun string-to-nfd (string)
868 "Convert String to Unicode Normalization Form D (NFD) using the
869 canonical decomposition. The NFD string is returned"
870 (decompose string nil))
871
872 (defun string-to-nfkd (string)
873 "Convert String to Unicode Normalization Form KD (NFKD) uisng the
874 compatible decomposition form. The NFKD string is returned."
875 (decompose string t))
876
877 #+unicode
878 (defun string-to-nfc (string)
879 ;;@@ Implement me
880 ;; must return a simple-string for the package machinery
881 (if (simple-string-p string) string (coerce string 'simple-string)))
882
883 #-unicode ;; Needed by package.lisp
884 (defun string-to-nfc (string)
885 (if (simple-string-p string) string (coerce string 'simple-string)))
886
887 (defun string-to-nfkc (string)
888 ;;@@ Implement me
889 (if (simple-string-p string) string (coerce string 'simple-string)))

  ViewVC Help
Powered by ViewVC 1.1.5