/[cmucl]/src/code/string.lisp
ViewVC logotype

Contents of /src/code/string.lisp

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.12.30.7 - (show annotations)
Wed Apr 22 17:05:51 2009 UTC (4 years, 11 months ago) by rtoy
Branch: unicode-utf16-extfmt-branch
Changes since 1.12.30.6: +21 -12 lines
o Add new function CODEPOINT-FROM-SURROGATES to compute the codepoint
  from two surrogate values.  (Should we use a better name?)
o Use the new function in CODEPOINT.
o Add docstrings to the functions.
1 ;;; -*- Log: code.log; Package: Lisp -*-
2 ;;;
3 ;;; **********************************************************************
4 ;;; This code was written as part of the CMU Common Lisp project at
5 ;;; Carnegie Mellon University, and has been placed in the public domain.
6 ;;;
7 (ext:file-comment
8 "$Header: /tiger/var/lib/cvsroots/cmucl/src/code/string.lisp,v 1.12.30.7 2009/04/22 17:05:51 rtoy Exp $")
9 ;;;
10 ;;; **********************************************************************
11 ;;;
12 ;;; Functions to implement strings for CMU Common Lisp
13 ;;; Written by David Dill
14 ;;; Rewritten by Skef Wholey, Bill Chiles and Rob MacLachlan.
15 ;;;
16 ;;; ****************************************************************
17 ;;;
18 (in-package "LISP")
19 (export '(char schar glyph sglyph string
20 string= string-equal string< string> string<= string>= string/=
21 string-lessp string-greaterp string-not-lessp string-not-greaterp
22 string-not-equal
23 string-to-nfd string-to-nfkd string-to-nfc string-to-nfkc
24 make-string
25 string-trim string-left-trim string-right-trim
26 string-upcase
27 string-downcase string-capitalize nstring-upcase nstring-downcase
28 nstring-capitalize))
29
30
31 (declaim (inline codepoint-from-surrogates codepoint surrogates))
32
33 (defun codepoint-from-surrogates (high low)
34 "Return the codepoint given the high and low surrogate values"
35 (declare (type (integer #xD800 #xDBFF) high)
36 (type (integer #xDC00 #xDFFF) low))
37 (+ (ash (- high #xD800) 10)
38 low
39 #x2400))
40
41 (defun codepoint (string i &optional (end (length string)))
42 "Return the codepoint value from String at position I. If that
43 position is a surrogate, it is combined with either the previous or
44 following character (when possible) to compute the codepoint. The
45 second return value is NIL if the position is not a surrogate pair.
46 Otherwise +1 or -1 is returned if the position is the high or low
47 surrogate value, respectively."
48 (declare (type simple-string string) (type kernel:index i end))
49 (let ((code (char-code (schar string i))))
50 (cond ((and (<= #xD800 code #xDBFF) (< (1+ i) end))
51 (let ((tmp (char-code (schar string (1+ i)))))
52 (if (<= #xDC00 tmp #xDFFF)
53 (values (codepoint-from-surrogates code tmp) +1)
54 (values code nil))))
55 ((and (<= #xDC00 code #xDFFF) (> i 0))
56 (let ((tmp (char-code (schar string (1- i)))))
57 (if (<= #xD800 tmp #xDBFF)
58 (values (codepoint-from-surrogates tmp code) -1)
59 (values code nil))))
60 (t (values code nil)))))
61
62 (defun surrogates (codepoint)
63 "Return the high and low surrogate values for Codepoint. If
64 Codepoint is in the BMP, just return the Codepoint itself"
65 (declare (type (integer 0 #x10FFFF) codepoint))
66 (if (< codepoint #x10000)
67 (values codepoint nil)
68 (let* ((tmp (- codepoint #x10000))
69 (hi (logior (ldb (byte 10 10) tmp) #xD800))
70 (lo (logior (ldb (byte 10 0) tmp) #xDC00)))
71 (values hi lo))))
72
73
74 (defun string (X)
75 "Coerces X into a string. If X is a string, X is returned. If X is a
76 symbol, X's pname is returned. If X is a character then a one element
77 string containing that character is returned. If X cannot be coerced
78 into a string, an error occurs."
79 (cond ((stringp x) x)
80 ((symbolp x) (symbol-name x))
81 ((characterp x)
82 (let ((res (make-string 1)))
83 (setf (schar res 0) x) res))
84 (t
85 (error 'simple-type-error
86 :datum x
87 :expected-type '(or string symbol character)
88 :format-control "~S cannot be coerced to a string."
89 :format-arguments (list x)))))
90
91 ;;; With-One-String is used to set up some string hacking things. The keywords
92 ;;; are parsed, and the string is hacked into a simple-string.
93
94 (eval-when (compile)
95
96 (defmacro with-one-string (string start end cum-offset &rest forms)
97 `(let ((,string (if (stringp ,string) ,string (string ,string))))
98 ;; Optimizer may prove STRING is one.
99 (declare (optimize (ext:inhibit-warnings 3)))
100 (with-array-data ((,string ,string :offset-var ,cum-offset)
101 (,start ,start)
102 (,end (or ,end (length (the vector ,string)))))
103 ,@forms)))
104
105 )
106
107 ;;; With-String is like With-One-String, but doesn't parse keywords.
108
109 (eval-when (compile)
110
111 (defmacro with-string (string &rest forms)
112 `(let ((,string (if (stringp ,string) ,string (string ,string))))
113 (with-array-data ((,string ,string)
114 (start)
115 (end (length (the vector ,string))))
116 ,@forms)))
117
118 )
119
120 ;;; With-Two-Strings is used to set up string comparison operations. The
121 ;;; keywords are parsed, and the strings are hacked into simple-strings.
122
123 (eval-when (compile)
124
125 (defmacro with-two-strings (string1 string2 start1 end1 cum-offset-1
126 start2 end2 &rest forms)
127 `(let ((,string1 (if (stringp ,string1) ,string1 (string ,string1)))
128 (,string2 (if (stringp ,string2) ,string2 (string ,string2))))
129 (with-array-data ((,string1 ,string1 :offset-var ,cum-offset-1)
130 (,start1 ,start1)
131 (,end1 (or ,end1 (length (the vector ,string1)))))
132 (with-array-data ((,string2 ,string2)
133 (,start2 ,start2)
134 (,end2 (or ,end2 (length (the vector ,string2)))))
135 ,@forms))))
136
137 )
138
139
140 (defun char (string index)
141 "Given a string and a non-negative integer index less than the length of
142 the string, returns the character object representing the character at
143 that position in the string."
144 (declare (optimize (safety 1)))
145 (char string index))
146
147 (defun %charset (string index new-el)
148 (declare (optimize (safety 1)))
149 (setf (char string index) new-el))
150
151 (defun schar (string index)
152 "SCHAR returns the character object at an indexed position in a string
153 just as CHAR does, except the string must be a simple-string."
154 (declare (optimize (safety 1)))
155 (schar string index))
156
157 (defun %scharset (string index new-el)
158 (declare (optimize (safety 1)))
159 (setf (schar string index) new-el))
160
161 (defun string=* (string1 string2 start1 end1 start2 end2)
162 (with-two-strings string1 string2 start1 end1 offset1 start2 end2
163 (not (%sp-string-compare string1 start1 end1 string2 start2 end2))))
164
165
166 (defun string/=* (string1 string2 start1 end1 start2 end2)
167 (with-two-strings string1 string2 start1 end1 offset1 start2 end2
168 (let ((comparison (%sp-string-compare string1 start1 end1
169 string2 start2 end2)))
170 (if comparison (- (the fixnum comparison) offset1)))))
171
172 (eval-when (compile eval)
173
174 ;;; Lessp is true if the desired expansion is for string<* or string<=*.
175 ;;; Equalp is true if the desired expansion is for string<=* or string>=*.
176 (defmacro string<>=*-body (lessp equalp)
177 (let ((offset1 (gensym)))
178 `(with-two-strings string1 string2 start1 end1 ,offset1 start2 end2
179 (let ((index (%sp-string-compare string1 start1 end1
180 string2 start2 end2)))
181 (if index
182 (cond ((= (the fixnum index) (the fixnum end1))
183 ,(if lessp
184 `(- (the fixnum index) ,offset1)
185 `nil))
186 ((= (+ (the fixnum index) (- start2 start1))
187 (the fixnum end2))
188 ,(if lessp
189 `nil
190 `(- (the fixnum index) ,offset1)))
191 #-unicode
192 ((,(if lessp 'char< 'char>)
193 (schar string1 index)
194 (schar string2 (+ (the fixnum index) (- start2 start1))))
195 (- (the fixnum index) ,offset1))
196 #-unicode
197 (t nil)
198 #+unicode
199 (t
200 ;; Compare in code point order. See
201 ;; http://icu-project.org/docs/papers/utf16_code_point_order.html
202 (flet ((fixup (code)
203 (if (>= code #xe000)
204 (- code #x800)
205 (+ code #x2000))))
206 (declare (inline fixup))
207 (let* ((c1 (char-code (schar string1 index)))
208 (c2 (char-code (schar string2 (+ (the fixnum index) (- start2 start1))))))
209 (cond ((and (>= c1 #xd800)
210 (>= c2 #xd800))
211 (let ((fix-c1 (fixup c1))
212 (fix-c2 (fixup c2)))
213 (if (,(if lessp '< '>) fix-c1 fix-c2)
214 (- (the fixnum index) ,offset1)
215 nil)))
216 (t
217 (if (,(if lessp '< '>) c1 c2)
218 (- (the fixnum index) ,offset1)
219 nil)))))))
220 ,(if equalp `(- (the fixnum end1) ,offset1) 'nil))))))
221 ) ; eval-when
222
223 (defun string<* (string1 string2 start1 end1 start2 end2)
224 (declare (fixnum start1 start2))
225 (string<>=*-body t nil))
226
227 (defun string>* (string1 string2 start1 end1 start2 end2)
228 (declare (fixnum start1 start2))
229 (string<>=*-body nil nil))
230
231 (defun string<=* (string1 string2 start1 end1 start2 end2)
232 (declare (fixnum start1 start2))
233 (string<>=*-body t t))
234
235 (defun string>=* (string1 string2 start1 end1 start2 end2)
236 (declare (fixnum start1 start2))
237 (string<>=*-body nil t))
238
239
240
241 (defun string< (string1 string2 &key (start1 0) end1 (start2 0) end2)
242 "Given two strings, if the first string is lexicographically less than
243 the second string, returns the longest common prefix (using char=)
244 of the two strings. Otherwise, returns ()."
245 (string<* string1 string2 start1 end1 start2 end2))
246
247 (defun string> (string1 string2 &key (start1 0) end1 (start2 0) end2)
248 "Given two strings, if the first string is lexicographically greater than
249 the second string, returns the longest common prefix (using char=)
250 of the two strings. Otherwise, returns ()."
251 (string>* string1 string2 start1 end1 start2 end2))
252
253
254 (defun string<= (string1 string2 &key (start1 0) end1 (start2 0) end2)
255 "Given two strings, if the first string is lexicographically less than
256 or equal to the second string, returns the longest common prefix
257 (using char=) of the two strings. Otherwise, returns ()."
258 (string<=* string1 string2 start1 end1 start2 end2))
259
260 (defun string>= (string1 string2 &key (start1 0) end1 (start2 0) end2)
261 "Given two strings, if the first string is lexicographically greater
262 than or equal to the second string, returns the longest common prefix
263 (using char=) of the two strings. Otherwise, returns ()."
264 (string>=* string1 string2 start1 end1 start2 end2))
265
266 (defun string= (string1 string2 &key (start1 0) end1 (start2 0) end2)
267 "Given two strings (string1 and string2), and optional integers start1,
268 start2, end1 and end2, compares characters in string1 to characters in
269 string2 (using char=)."
270 (string=* string1 string2 start1 end1 start2 end2))
271
272 (defun string/= (string1 string2 &key (start1 0) end1 (start2 0) end2)
273 "Given two strings, if the first string is not lexicographically equal
274 to the second string, returns the longest common prefix (using char=)
275 of the two strings. Otherwise, returns ()."
276 (string/=* string1 string2 start1 end1 start2 end2))
277
278
279 (eval-when (compile eval)
280
281 ;;; STRING-NOT-EQUAL-LOOP is used to generate character comparison loops for
282 ;;; STRING-EQUAL and STRING-NOT-EQUAL.
283 (defmacro string-not-equal-loop (end end-value
284 &optional (abort-value nil abortp))
285 (declare (fixnum end))
286 (let ((end-test (if (= end 1)
287 `(= index1 (the fixnum end1))
288 `(= index2 (the fixnum end2)))))
289 `(do ((index1 start1 (1+ index1))
290 (index2 start2 (1+ index2)))
291 (,(if abortp
292 end-test
293 `(or ,end-test
294 (not (char-equal (schar string1 index1)
295 (schar string2 index2)))))
296 ,end-value)
297 (declare (fixnum index1 index2))
298 ,@(if abortp
299 `((if (not (char-equal (schar string1 index1)
300 (schar string2 index2)))
301 (return ,abort-value)))))))
302
303 ) ; eval-when
304
305 (defun string-equal (string1 string2 &key (start1 0) end1 (start2 0) end2)
306 "Given two strings (string1 and string2), and optional integers start1,
307 start2, end1 and end2, compares characters in string1 to characters in
308 string2 (using char-equal)."
309 (declare (fixnum start1 start2))
310 (with-two-strings string1 string2 start1 end1 offset1 start2 end2
311 (let ((slen1 (- (the fixnum end1) start1))
312 (slen2 (- (the fixnum end2) start2)))
313 (declare (fixnum slen1 slen2))
314 (if (or (minusp slen1) (minusp slen2))
315 ;;prevent endless looping later.
316 (error "Improper bounds for string comparison."))
317 (if (= slen1 slen2)
318 ;;return () immediately if lengths aren't equal.
319 (string-not-equal-loop 1 t nil)))))
320
321 (defun string-not-equal (string1 string2 &key (start1 0) end1 (start2 0) end2)
322 "Given two strings, if the first string is not lexicographically equal
323 to the second string, returns the longest common prefix (using char-equal)
324 of the two strings. Otherwise, returns ()."
325 (with-two-strings string1 string2 start1 end1 offset1 start2 end2
326 (let ((slen1 (- end1 start1))
327 (slen2 (- end2 start2)))
328 (declare (fixnum slen1 slen2))
329 (if (or (minusp slen1) (minusp slen2))
330 ;;prevent endless looping later.
331 (error "Improper bounds for string comparison."))
332 (cond ((or (minusp slen1) (or (minusp slen2)))
333 (error "Improper substring for comparison."))
334 ((= slen1 slen2)
335 (string-not-equal-loop 1 nil (- index1 offset1)))
336 ((< slen1 slen2)
337 (string-not-equal-loop 1 (- index1 offset1)))
338 (t
339 (string-not-equal-loop 2 (- index1 offset1)))))))
340
341
342
343 (eval-when (compile eval)
344
345 ;;; STRING-LESS-GREATER-EQUAL-TESTS returns a test on the lengths of string1
346 ;;; and string2 and a test on the current characters from string1 and string2
347 ;;; for the following macro.
348 (defun string-less-greater-equal-tests (lessp equalp)
349 (if lessp
350 (if equalp
351 ;; STRING-NOT-GREATERP
352 (values '<= `(not (char-greaterp char1 char2)))
353 ;; STRING-LESSP
354 (values '< `(char-lessp char1 char2)))
355 (if equalp
356 ;; STRING-NOT-LESSP
357 (values '>= `(not (char-lessp char1 char2)))
358 ;; STRING-GREATERP
359 (values '> `(char-greaterp char1 char2)))))
360
361 (defmacro string-less-greater-equal (lessp equalp)
362 (multiple-value-bind (length-test character-test)
363 (string-less-greater-equal-tests lessp equalp)
364 `(with-two-strings string1 string2 start1 end1 offset1 start2 end2
365 (let ((slen1 (- (the fixnum end1) start1))
366 (slen2 (- (the fixnum end2) start2)))
367 (declare (fixnum slen1 slen2))
368 (if (or (minusp slen1) (minusp slen2))
369 ;;prevent endless looping later.
370 (error "Improper bounds for string comparison."))
371 (do ((index1 start1 (1+ index1))
372 (index2 start2 (1+ index2))
373 (char1)
374 (char2))
375 ((or (= index1 (the fixnum end1)) (= index2 (the fixnum end2)))
376 (if (,length-test slen1 slen2) (- index1 offset1)))
377 (declare (fixnum index1 index2))
378 (setq char1 (schar string1 index1))
379 (setq char2 (schar string2 index2))
380 (if (not (char-equal char1 char2))
381 (if ,character-test
382 (return (- index1 offset1))
383 (return ()))))))))
384
385 ) ; eval-when
386
387 (defun string-lessp* (string1 string2 start1 end1 start2 end2)
388 (declare (fixnum start1 start2))
389 (string-less-greater-equal t nil))
390
391 (defun string-greaterp* (string1 string2 start1 end1 start2 end2)
392 (declare (fixnum start1 start2))
393 (string-less-greater-equal nil nil))
394
395 (defun string-not-lessp* (string1 string2 start1 end1 start2 end2)
396 (declare (fixnum start1 start2))
397 (string-less-greater-equal nil t))
398
399 (defun string-not-greaterp* (string1 string2 start1 end1 start2 end2)
400 (declare (fixnum start1 start2))
401 (string-less-greater-equal t t))
402
403 (defun string-lessp (string1 string2 &key (start1 0) end1 (start2 0) end2)
404 "Given two strings, if the first string is lexicographically less than
405 the second string, returns the longest common prefix (using char-equal)
406 of the two strings. Otherwise, returns ()."
407 (string-lessp* string1 string2 start1 end1 start2 end2))
408
409 (defun string-greaterp (string1 string2 &key (start1 0) end1 (start2 0) end2)
410 "Given two strings, if the first string is lexicographically greater than
411 the second string, returns the longest common prefix (using char-equal)
412 of the two strings. Otherwise, returns ()."
413 (string-greaterp* string1 string2 start1 end1 start2 end2))
414
415 (defun string-not-lessp (string1 string2 &key (start1 0) end1 (start2 0) end2)
416 "Given two strings, if the first string is lexicographically greater
417 than or equal to the second string, returns the longest common prefix
418 (using char-equal) of the two strings. Otherwise, returns ()."
419 (string-not-lessp* string1 string2 start1 end1 start2 end2))
420
421 (defun string-not-greaterp (string1 string2 &key (start1 0) end1 (start2 0)
422 end2)
423 "Given two strings, if the first string is lexicographically less than
424 or equal to the second string, returns the longest common prefix
425 (using char-equal) of the two strings. Otherwise, returns ()."
426 (string-not-greaterp* string1 string2 start1 end1 start2 end2))
427
428
429 (defun make-string (count &key element-type ((:initial-element fill-char)))
430 "Given a character count and an optional fill character, makes and returns
431 a new string Count long filled with the fill character."
432 (declare (type fixnum count))
433 (assert (subtypep element-type 'character))
434 (if fill-char
435 (do ((i 0 (1+ i))
436 (string (make-string count)))
437 ((= i count) string)
438 (declare (fixnum i))
439 (setf (schar string i) fill-char))
440 (make-string count)))
441
442 (defun string-upcase (string &key (start 0) end)
443 "Given a string, returns a new string that is a copy of it with
444 all lower case alphabetic characters converted to uppercase."
445 (declare (fixnum start))
446 (let* ((string (if (stringp string) string (string string)))
447 (slen (length string)))
448 (declare (fixnum slen))
449 (with-one-string string start end offset
450 (let ((offset-slen (+ slen offset))
451 (newstring (make-string slen)))
452 (declare (fixnum offset-slen))
453 (do ((index offset (1+ index))
454 (new-index 0 (1+ new-index)))
455 ((= index start))
456 (declare (fixnum index new-index))
457 (setf (schar newstring new-index) (schar string index)))
458 (do ((index start (1+ index))
459 (new-index (- start offset) (1+ new-index)))
460 ((= index (the fixnum end)))
461 (declare (fixnum index new-index))
462 (multiple-value-bind (code wide) (codepoint string index)
463 (when wide (incf index))
464 ;; Handle ASCII specially because this is called early in
465 ;; initialization, before unidata is available.
466 (cond ((< 96 code 123) (decf code 32))
467 ((> code 127) (setq code (unicode-upper code))))
468 ;;@@ WARNING: this may, in theory, need to extend newstring
469 ;; but that never actually occurs as of Unicode 5.1.0,
470 ;; so I'm just going to ignore it for now...
471 (multiple-value-bind (hi lo) (surrogates code)
472 (setf (schar newstring new-index) (code-char hi))
473 (when lo
474 (setf (schar newstring (incf new-index)) (code-char lo))))))
475 ;;@@ WARNING: see above
476 (do ((index end (1+ index))
477 (new-index (- (the fixnum end) offset) (1+ new-index)))
478 ((= index offset-slen))
479 (declare (fixnum index new-index))
480 (setf (schar newstring new-index) (schar string index)))
481 newstring))))
482
483 (defun string-downcase (string &key (start 0) end)
484 "Given a string, returns a new string that is a copy of it with
485 all upper case alphabetic characters converted to lowercase."
486 (declare (fixnum start))
487 (let* ((string (if (stringp string) string (string string)))
488 (slen (length string)))
489 (declare (fixnum slen))
490 (with-one-string string start end offset
491 (let ((offset-slen (+ slen offset))
492 (newstring (make-string slen)))
493 (declare (fixnum offset-slen))
494 (do ((index offset (1+ index))
495 (new-index 0 (1+ new-index)))
496 ((= index start))
497 (declare (fixnum index new-index))
498 (setf (schar newstring new-index) (schar string index)))
499 (do ((index start (1+ index))
500 (new-index (- start offset) (1+ new-index)))
501 ((= index (the fixnum end)))
502 (declare (fixnum index new-index))
503 (multiple-value-bind (code wide) (codepoint string index)
504 (when wide (incf index))
505 ;; Handle ASCII specially because this is called early in
506 ;; initialization, before unidata is available.
507 (cond ((< 64 code 91) (incf code 32))
508 ((> code 127) (setq code (unicode-lower code))))
509 ;;@@ WARNING: this may, in theory, need to extend newstring
510 ;; but that never actually occurs as of Unicode 5.1.0,
511 ;; so I'm just going to ignore it for now...
512 (multiple-value-bind (hi lo) (surrogates code)
513 (setf (schar newstring new-index) (code-char hi))
514 (when lo
515 (setf (schar newstring (incf new-index)) (code-char lo))))))
516 ;;@@ WARNING: see above
517 (do ((index end (1+ index))
518 (new-index (- (the fixnum end) offset) (1+ new-index)))
519 ((= index offset-slen))
520 (declare (fixnum index new-index))
521 (setf (schar newstring new-index) (schar string index)))
522 newstring))))
523
524 (defun string-capitalize (string &key (start 0) end)
525 "Given a string, returns a copy of the string with the first
526 character of each ``word'' converted to upper-case, and remaining
527 chars in the word converted to lower case. A ``word'' is defined
528 to be a string of case-modifiable characters delimited by
529 non-case-modifiable chars."
530 (declare (fixnum start))
531 (let* ((string (if (stringp string) string (string string)))
532 (slen (length string)))
533 (declare (fixnum slen))
534 (with-one-string string start end offset
535 (let ((offset-slen (+ slen offset))
536 (newstring (make-string slen)))
537 (declare (fixnum offset-slen))
538 (do ((index offset (1+ index))
539 (new-index 0 (1+ new-index)))
540 ((= index start))
541 (declare (fixnum index new-index))
542 (setf (schar newstring new-index) (schar string index)))
543 (do ((index start (1+ index))
544 (new-index (- start offset) (1+ new-index))
545 (newword t)
546 (char ()))
547 ((= index (the fixnum end)))
548 (declare (fixnum index new-index))
549 (setq char (schar string index))
550 (cond ((not (alphanumericp char))
551 (setq newword t))
552 (newword
553 ;;char is first case-modifiable after non-case-modifiable
554 (setq char (char-titlecase char))
555 (setq newword ()))
556 ;;char is case-modifiable, but not first
557 (t (setq char (char-downcase char))))
558 (setf (schar newstring new-index) char))
559 (do ((index end (1+ index))
560 (new-index (- (the fixnum end) offset) (1+ new-index)))
561 ((= index offset-slen))
562 (declare (fixnum index new-index))
563 (setf (schar newstring new-index) (schar string index)))
564 newstring))))
565
566 (defun nstring-upcase (string &key (start 0) end)
567 "Given a string, returns that string with all lower case alphabetic
568 characters converted to uppercase."
569 (declare (fixnum start))
570 (let ((save-header string))
571 (with-one-string string start end offset
572 (do ((index start (1+ index)))
573 ((= index (the fixnum end)))
574 (declare (fixnum index))
575 (multiple-value-bind (code wide) (codepoint string index)
576 (declare (ignore wide))
577 ;; Handle ASCII specially because this is called early in
578 ;; initialization, before unidata is available.
579 (cond ((< 96 code 123) (decf code 32))
580 ((> code 127) (setq code (unicode-upper code))))
581 ;;@@ WARNING: this may, in theory, need to extend string
582 ;; (which, obviously, we can't do here. Unless
583 ;; STRING is adjustable, maybe)
584 ;; but that never actually occurs as of Unicode 5.1.0,
585 ;; so I'm just going to ignore it for now...
586 (multiple-value-bind (hi lo) (surrogates code)
587 (setf (schar string index) (code-char hi))
588 (when lo
589 (setf (schar string (incf index)) (code-char lo)))))))
590 save-header))
591
592 (defun nstring-downcase (string &key (start 0) end)
593 "Given a string, returns that string with all upper case alphabetic
594 characters converted to lowercase."
595 (declare (fixnum start))
596 (let ((save-header string))
597 (with-one-string string start end offset
598 (do ((index start (1+ index)))
599 ((= index (the fixnum end)))
600 (declare (fixnum index))
601 (multiple-value-bind (code wide) (codepoint string index)
602 (declare (ignore wide))
603 (cond ((< 64 code 91) (incf code 32))
604 ((> code 127) (setq code (unicode-lower code))))
605 ;;@@ WARNING: this may, in theory, need to extend string
606 ;; (which, obviously, we can't do here. Unless
607 ;; STRING is adjustable, maybe)
608 ;; but that never actually occurs as of Unicode 5.1.0,
609 ;; so I'm just going to ignore it for now...
610 (multiple-value-bind (hi lo) (surrogates code)
611 (setf (schar string index) (code-char hi))
612 (when lo
613 (setf (schar string (incf index)) (code-char lo)))))))
614 save-header))
615
616 (defun nstring-capitalize (string &key (start 0) end)
617 "Given a string, returns that string with the first
618 character of each ``word'' converted to upper-case, and remaining
619 chars in the word converted to lower case. A ``word'' is defined
620 to be a string of case-modifiable characters delimited by
621 non-case-modifiable chars."
622 (declare (fixnum start))
623 (let ((save-header string))
624 (with-one-string string start end offset
625 (do ((index start (1+ index))
626 (newword t)
627 (char ()))
628 ((= index (the fixnum end)))
629 (declare (fixnum index))
630 (setq char (schar string index))
631 (cond ((not (alphanumericp char))
632 (setq newword t))
633 (newword
634 ;;char is first case-modifiable after non-case-modifiable
635 (setf (schar string index) (char-titlecase char))
636 (setq newword ()))
637 (t
638 (setf (schar string index) (char-downcase char))))))
639 save-header))
640
641 (defun string-left-trim (char-bag string)
642 "Given a set of characters (a list or string) and a string, returns
643 a copy of the string with the characters in the set removed from the
644 left end."
645 (with-string string
646 (do ((index start (1+ index)))
647 ((or (= index (the fixnum end))
648 (not (find (schar string index) char-bag)))
649 (subseq (the simple-string string) index end))
650 (declare (fixnum index)))))
651
652 (defun string-right-trim (char-bag string)
653 "Given a set of characters (a list or string) and a string, returns
654 a copy of the string with the characters in the set removed from the
655 right end."
656 (with-string string
657 (do ((index (1- (the fixnum end)) (1- index)))
658 ((or (< index start) (not (find (schar string index) char-bag)))
659 (subseq (the simple-string string) start (1+ index)))
660 (declare (fixnum index)))))
661
662 (defun string-trim (char-bag string)
663 "Given a set of characters (a list or string) and a string, returns a
664 copy of the string with the characters in the set removed from both
665 ends."
666 (with-string string
667 (let* ((left-end (do ((index start (1+ index)))
668 ((or (= index (the fixnum end))
669 (not (find (schar string index) char-bag)))
670 index)
671 (declare (fixnum index))))
672 (right-end (do ((index (1- (the fixnum end)) (1- index)))
673 ((or (< index left-end)
674 (not (find (schar string index) char-bag)))
675 (1+ index))
676 (declare (fixnum index)))))
677 (subseq (the simple-string string) left-end right-end))))
678
679 (declaim (inline %glyph-f %glyph-b))
680 (defun %glyph-f (string index)
681 (declare (optimize (speed 3) (space 0) (safety 0) (debug 0))
682 (type simple-string string) (type kernel:index index))
683 (flet ((xchar (string index)
684 (let ((c (char-code (schar string index))))
685 (declare (type (integer 0 #x10FFFF) c))
686 (cond ((<= #xD800 c #xDBFF)
687 (if (= (1+ index) (length string))
688 (error "String ends with an unpaired surrogate.")
689 (let ((c2 (char-code (schar string (1+ index)))))
690 (if (<= #xDC00 c2 #xDFFF)
691 (+ (ash (- c #xD800) 10) c2 #x2400)
692 (error "Naked high surrogate in string.")))))
693 ((<= #xDC00 c #xDFFF)
694 (error "Naked low surrogate in string."))
695 (t c)))))
696 (let* ((prev 0)
697 (l (length string))
698 (c (xchar string index))
699 (n (+ index (if (> c #xFFFF) 2 1))))
700 (declare (type (integer 0 #x10FFFF) c) (type kernel:index l n))
701 (loop while (< n l) do
702 (let* ((c (xchar string n))
703 (d (the (unsigned-byte 8) (unicode-combining-class c))))
704 (when (or (zerop d) (< d prev))
705 (return))
706 (setq prev d)
707 (incf n (if (> c #xFFFF) 2 1))))
708 n)))
709
710 (defun %glyph-b (string index)
711 (declare (optimize (speed 3) (space 0) (safety 0) (debug 0))
712 (type simple-string string) (type kernel:index index))
713 (flet ((xchar (string index)
714 (let ((c (char-code (schar string index))))
715 (declare (type (integer 0 #x10FFFF) c))
716 (cond ((<= #xDC00 c #xDFFF)
717 (let ((c2 (char-code (schar string (1- index)))))
718 (if (<= #xD800 c2 #xDBFF)
719 (+ (ash (- c2 #xD800) 10) c #x2400)
720 (error "Naked low surrogate in string."))))
721 ((<= #xD800 c #xDBFF)
722 (error "Naked high surrogate in string."))
723 (t c)))))
724 (let ((prev 255)
725 (n (1- index)))
726 (declare (type kernel:index n))
727 (loop while (> n 0) do
728 (let* ((c (xchar string n))
729 (d (the (unsigned-byte 8) (unicode-combining-class c))))
730 (cond ((zerop d) (return))
731 ((> d prev) (incf n (if (> c #xFFFF) 2 1)) (return)))
732 (setq prev d)
733 (decf n (if (> c #xFFFF) 2 1))))
734 n)))
735
736 (defun glyph (string index &key (from-end nil))
737 "GLYPH returns the glyph at the indexed position in a string, and the
738 position of the next glyph (or NIL) as a second value. A glyph is
739 a substring consisting of the character at INDEX followed by all
740 subsequent combining characters."
741 (declare (type simple-string string) (type kernel:index index))
742 #-unicode
743 (char string index)
744 #+unicode
745 (with-array-data ((string string) (start) (end))
746 (declare (ignore start end))
747 (let ((n (if from-end (%glyph-b string index) (%glyph-f string index))))
748 (if from-end
749 (values (subseq string n index) (and (> n 0) n))
750 (values (subseq string index n) (and (< n (length string)) n))))))
751
752 (defun sglyph (string index &key (from-end nil))
753 "SGLYPH returns the glyph at the indexed position, the same as GLYPH,
754 except that the string must be a simple-string"
755 (declare (type simple-string string) (type kernel:index index))
756 #-unicode
757 (schar string index)
758 #+unicode
759 (let ((n (if from-end (%glyph-b string index) (%glyph-f string index))))
760 (if from-end
761 (values (subseq string n index) (and (> n 0) n))
762 (values (subseq string index n) (and (< n (length string)) n)))))
763
764 (defun string-to-nfd (string)
765 ;;@@ Implement me
766 string)
767
768 (defun string-to-nfkd (string)
769 ;;@@ Implement me
770 string)
771
772 (defun string-to-nfc (string)
773 ;;@@ Implement me
774 string)
775
776 (defun string-to-nfkc (string)
777 ;;@@ Implement me
778 string)

  ViewVC Help
Powered by ViewVC 1.1.5