/[cmucl]/src/code/string.lisp
ViewVC logotype

Contents of /src/code/string.lisp

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.26 - (show annotations)
Wed Sep 15 21:06:38 2010 UTC (3 years, 7 months ago) by rtoy
Branch: MAIN
Changes since 1.25: +15 -5 lines
Add support for Unicode 5.2.  The normalization and wordbreak tests pass.

code/string.lisp:
o In %compose, handle the case where the composite character is
  outside the BMP and thus needs special handling for our UTF-16
  strings.

code/unidata.lisp
o CKJ Ideograph range has changed in 5.2.
o Fix bug in build-composition-table.  We were not correctly handling
  the case where the decomposition of a codepoint was outside the
  BMP.  Special care is needed to handle the UTF-16 strings that we
  use.
o The key for the pairwise composition table are the full codepoints,
  so we need to shift one by 21 bits instead of 16.

tools/build-unidata.lisp
o Update minor version to 2.

i18n/BidiMirroring.txt
i18n/CaseFolding.txt
i18n/CompositionExclusions.txt
i18n/DerivedNormalizationProps.txt
i18n/NameAliases.txt
i18n/NormalizationCorrections.txt
i18n/SpecialCasing.txt
i18n/UnicodeData.txt
i18n/WordBreakProperty.txt
i18n/tests/NormalizationTest.txt
i18n/tests/WordBreakTest.txt
o Updated from Unicode 5.2.

i18n/unidata.bin
o Regenerated from new Unicode 5.2 files.
1 ;;; -*- Log: code.log; Package: Lisp -*-
2 ;;;
3 ;;; **********************************************************************
4 ;;; This code was written as part of the CMU Common Lisp project at
5 ;;; Carnegie Mellon University, and has been placed in the public domain.
6 ;;;
7 (ext:file-comment
8 "$Header: /tiger/var/lib/cvsroots/cmucl/src/code/string.lisp,v 1.26 2010/09/15 21:06:38 rtoy Exp $")
9 ;;;
10 ;;; **********************************************************************
11 ;;;
12 ;;; Functions to implement strings for CMU Common Lisp
13 ;;; Written by David Dill
14 ;;; Rewritten by Skef Wholey, Bill Chiles and Rob MacLachlan.
15 ;;;
16 ;;; ****************************************************************
17 ;;;
18 (in-package "LISP")
19 (intl:textdomain "cmucl")
20
21 (export '(char schar glyph sglyph string
22 string= string-equal string< string> string<= string>= string/=
23 string-lessp string-greaterp string-not-lessp string-not-greaterp
24 string-not-equal
25 string-to-nfc
26 make-string
27 string-trim string-left-trim string-right-trim
28 string-upcase
29 string-downcase string-capitalize nstring-upcase nstring-downcase
30 nstring-capitalize))
31
32 #+unicode
33 (export '(string-to-nfd string-to-nfkd string-to-nfkc))
34
35 (declaim (inline surrogatep surrogates-to-codepoint codepoint surrogates))
36
37 (defun surrogatep (char-or-code &optional surrogate-type)
38 "Test if C is a surrogate. C may be either an integer or a
39 character. Surrogate-type indicates what kind of surrogate to test
40 for. :High means to test for the high (leading) surrogate; :Low
41 tests for the low (trailing surrogate). A value of :Any or Nil
42 tests for any surrogate value (high or low)."
43 (declare (type (or character codepoint) char-or-code))
44 (let ((code (if (characterp char-or-code)
45 (char-code char-or-code)
46 char-or-code)))
47 (ecase surrogate-type
48 ((:high :leading)
49 ;; Test for high surrogate (#xD800 to #xDBFF)
50 (= #b110110 (ash code -10)))
51 ((:low :trailing)
52 ;; Test for low surrogate (#xDC00 to #xDFFF)
53 (= #b110111 (ash code -10)))
54 ((:any nil)
55 ;; Test for any surrogate (#xD800 to #xDFFF)
56 (= #b11011 (ash code -11))))))
57
58 (defun surrogates-to-codepoint (hi-surrogate-char lo-surrogate-char)
59 "Convert the given Hi and Lo surrogate characters to the
60 corresponding codepoint value"
61 (declare (type character hi-surrogate-char lo-surrogate-char))
62 (+ (ash (- (the (integer #xD800 #xDBFF) (char-code hi-surrogate-char)) #xD800) 10)
63 (the (integer #xDC00 #xDFFF) (char-code lo-surrogate-char)) #x2400))
64
65 (defun codepoint (string i &optional (end (length string)))
66 "Return the codepoint value from String at position I. If that
67 position is a surrogate, it is combined with either the previous or
68 following character (when possible) to compute the codepoint. The
69 second return value is NIL if the position is not a surrogate pair.
70 Otherwise +1 or -1 is returned if the position is the high or low
71 surrogate value, respectively."
72 (declare (type simple-string string) (type kernel:index i end))
73 (let ((code (char-code (schar string i))))
74 (cond ((and (surrogatep code :high) (< (1+ i) end))
75 (let ((tmp (char-code (schar string (1+ i)))))
76 (if (surrogatep tmp :low)
77 (values (+ (ash (- code #xD800) 10) tmp #x2400) +1)
78 (values code nil))))
79 ((and (surrogatep code :low) (> i 0))
80 (let ((tmp (char-code (schar string (1- i)))))
81 (if (surrogatep tmp :high)
82 (values (+ (ash (- tmp #xD800) 10) code #x2400) -1)
83 (values code nil))))
84 (t (values code nil)))))
85
86 (defun surrogates (codepoint)
87 "Return the high and low surrogate characters for Codepoint. If
88 Codepoint is in the BMP, the first return value is the corresponding
89 character and the second is NIL."
90 (declare (type codepoint codepoint))
91 (if (< codepoint #x10000)
92 (values (code-char codepoint) nil)
93 (let* ((tmp (- codepoint #x10000))
94 (hi (logior (ldb (byte 10 10) tmp) #xD800))
95 (lo (logior (ldb (byte 10 0) tmp) #xDC00)))
96 (values (code-char hi) (code-char lo)))))
97
98 (defun (setf codepoint) (codepoint string i)
99 "Set the codepoint at string position I to the Codepoint. If the
100 codepoint requires a surrogate pair, the high (leading surrogate) is
101 stored at position I and the low (trailing) surrogate is stored at
102 I+1"
103 (declare (type codepoint codepoint)
104 (type simple-string string))
105 (let ((widep nil))
106 (multiple-value-bind (hi lo)
107 (surrogates codepoint)
108 (setf (aref string i) hi)
109 (when lo
110 (setf (aref string (1+ i)) lo)
111 (setf widep t)))
112 (values codepoint widep)))
113
114 #+unicode
115 (defun utf16-string-p (string)
116 _N"Check if String is a valid UTF-16 string. If the string is valid,
117 T is returned. If the string is not valid, NIL is returned, and the
118 second value is the index into the string of the invalid character.
119 A string is also invalid if it contains any unassigned codepoints."
120 (do ((len (length string))
121 (index 0 (1+ index)))
122 ((>= index len)
123 t)
124 (multiple-value-bind (codepoint wide)
125 (codepoint string index)
126 ;; We step through the string in order. If there are any
127 ;; surrogates pairs, we must reach the lead surrogate first,
128 ;; which means WIDE is +1. Otherwise, we have an invalid
129 ;; surrogate pair. If we get any codepoint that is in the
130 ;; surrogate range, we also have an invalid string. An
131 ;; unassigned codepoint is also considered invalid.
132 (when (or (eq wide -1)
133 (surrogatep codepoint)
134 (not (unicode-assigned-codepoint-p codepoint)))
135 (return-from utf16-string-p (values nil index)))
136 (when wide (incf index)))))
137
138 (defun string (X)
139 "Coerces X into a string. If X is a string, X is returned. If X is a
140 symbol, X's pname is returned. If X is a character then a one element
141 string containing that character is returned. If X cannot be coerced
142 into a string, an error occurs."
143 (cond ((stringp x) x)
144 ((symbolp x) (symbol-name x))
145 ((characterp x)
146 (let ((res (make-string 1)))
147 (setf (schar res 0) x) res))
148 (t
149 (error 'simple-type-error
150 :datum x
151 :expected-type '(or string symbol character)
152 :format-control (intl:gettext "~S cannot be coerced to a string.")
153 :format-arguments (list x)))))
154
155 ;;; With-One-String is used to set up some string hacking things. The keywords
156 ;;; are parsed, and the string is hacked into a simple-string.
157
158 (eval-when (compile)
159
160 (defmacro with-one-string (string start end cum-offset &rest forms)
161 `(let ((,string (if (stringp ,string) ,string (string ,string))))
162 ;; Optimizer may prove STRING is one.
163 (declare (optimize (ext:inhibit-warnings 3)))
164 (with-array-data ((,string ,string :offset-var ,cum-offset)
165 (,start ,start)
166 (,end (or ,end (length (the vector ,string)))))
167 ,@forms)))
168
169 )
170
171 ;;; With-String is like With-One-String, but doesn't parse keywords.
172
173 (eval-when (compile)
174
175 (defmacro with-string (string &rest forms)
176 `(let ((,string (if (stringp ,string) ,string (string ,string))))
177 (with-array-data ((,string ,string)
178 (start)
179 (end (length (the vector ,string))))
180 ,@forms)))
181
182 )
183
184 ;;; With-Two-Strings is used to set up string comparison operations. The
185 ;;; keywords are parsed, and the strings are hacked into simple-strings.
186
187 (eval-when (compile)
188
189 (defmacro with-two-strings (string1 string2 start1 end1 cum-offset-1
190 start2 end2 &rest forms)
191 `(let ((,string1 (if (stringp ,string1) ,string1 (string ,string1)))
192 (,string2 (if (stringp ,string2) ,string2 (string ,string2))))
193 (with-array-data ((,string1 ,string1 :offset-var ,cum-offset-1)
194 (,start1 ,start1)
195 (,end1 (or ,end1 (length (the vector ,string1)))))
196 (with-array-data ((,string2 ,string2)
197 (,start2 ,start2)
198 (,end2 (or ,end2 (length (the vector ,string2)))))
199 ,@forms))))
200
201 )
202
203
204 (defun char (string index)
205 "Given a string and a non-negative integer index less than the length of
206 the string, returns the character object representing the character at
207 that position in the string."
208 (declare (optimize (safety 1)))
209 (char string index))
210
211 (defun %charset (string index new-el)
212 (declare (optimize (safety 1)))
213 (setf (char string index) new-el))
214
215 (defun schar (string index)
216 "SCHAR returns the character object at an indexed position in a string
217 just as CHAR does, except the string must be a simple-string."
218 (declare (optimize (safety 1)))
219 (schar string index))
220
221 (defun %scharset (string index new-el)
222 (declare (optimize (safety 1)))
223 (setf (schar string index) new-el))
224
225 (defun string=* (string1 string2 start1 end1 start2 end2)
226 (with-two-strings string1 string2 start1 end1 offset1 start2 end2
227 (not (%sp-string-compare string1 start1 end1 string2 start2 end2))))
228
229
230 (defun string/=* (string1 string2 start1 end1 start2 end2)
231 (with-two-strings string1 string2 start1 end1 offset1 start2 end2
232 (let ((comparison (%sp-string-compare string1 start1 end1
233 string2 start2 end2)))
234 (if comparison (- (the fixnum comparison) offset1)))))
235
236 (eval-when (compile eval)
237
238 ;;; Lessp is true if the desired expansion is for string<* or string<=*.
239 ;;; Equalp is true if the desired expansion is for string<=* or string>=*.
240 (defmacro string<>=*-body (lessp equalp)
241 (let ((offset1 (gensym)))
242 `(with-two-strings string1 string2 start1 end1 ,offset1 start2 end2
243 (let ((index (%sp-string-compare string1 start1 end1
244 string2 start2 end2)))
245 (if index
246 (cond ((= (the fixnum index) (the fixnum end1))
247 ,(if lessp
248 `(- (the fixnum index) ,offset1)
249 `nil))
250 ((= (+ (the fixnum index) (- start2 start1))
251 (the fixnum end2))
252 ,(if lessp
253 `nil
254 `(- (the fixnum index) ,offset1)))
255 #-unicode
256 ((,(if lessp 'char< 'char>)
257 (schar string1 index)
258 (schar string2 (+ (the fixnum index) (- start2 start1))))
259 (- (the fixnum index) ,offset1))
260 #-unicode
261 (t nil)
262 #+unicode
263 (t
264 ;; Compare in code point order. See
265 ;; http://icu-project.org/docs/papers/utf16_code_point_order.html
266 (flet ((fixup (code)
267 (if (>= code #xe000)
268 (- code #x800)
269 (+ code #x2000))))
270 (declare (inline fixup))
271 (let* ((c1 (char-code (schar string1 index)))
272 (c2 (char-code (schar string2
273 (+ (the fixnum index)
274 (- start2 start1))))))
275 (cond ((and (>= c1 #xd800)
276 (>= c2 #xd800))
277 (let ((fix-c1 (fixup c1))
278 (fix-c2 (fixup c2)))
279 (if (,(if lessp '< '>) fix-c1 fix-c2)
280 (- (the fixnum index) ,offset1)
281 nil)))
282 (t
283 (if (,(if lessp '< '>) c1 c2)
284 (- (the fixnum index) ,offset1)
285 nil)))))))
286 ,(if equalp `(- (the fixnum end1) ,offset1) 'nil))))))
287 ) ; eval-when
288
289 (defun string<* (string1 string2 start1 end1 start2 end2)
290 (declare (fixnum start1 start2))
291 (string<>=*-body t nil))
292
293 (defun string>* (string1 string2 start1 end1 start2 end2)
294 (declare (fixnum start1 start2))
295 (string<>=*-body nil nil))
296
297 (defun string<=* (string1 string2 start1 end1 start2 end2)
298 (declare (fixnum start1 start2))
299 (string<>=*-body t t))
300
301 (defun string>=* (string1 string2 start1 end1 start2 end2)
302 (declare (fixnum start1 start2))
303 (string<>=*-body nil t))
304
305
306
307 (defun string< (string1 string2 &key (start1 0) end1 (start2 0) end2)
308 "Given two strings, if the first string is lexicographically less than
309 the second string, returns the longest common prefix (using char=)
310 of the two strings. Otherwise, returns ()."
311 (string<* string1 string2 start1 end1 start2 end2))
312
313 (defun string> (string1 string2 &key (start1 0) end1 (start2 0) end2)
314 "Given two strings, if the first string is lexicographically greater than
315 the second string, returns the longest common prefix (using char=)
316 of the two strings. Otherwise, returns ()."
317 (string>* string1 string2 start1 end1 start2 end2))
318
319
320 (defun string<= (string1 string2 &key (start1 0) end1 (start2 0) end2)
321 "Given two strings, if the first string is lexicographically less than
322 or equal to the second string, returns the longest common prefix
323 (using char=) of the two strings. Otherwise, returns ()."
324 (string<=* string1 string2 start1 end1 start2 end2))
325
326 (defun string>= (string1 string2 &key (start1 0) end1 (start2 0) end2)
327 "Given two strings, if the first string is lexicographically greater
328 than or equal to the second string, returns the longest common prefix
329 (using char=) of the two strings. Otherwise, returns ()."
330 (string>=* string1 string2 start1 end1 start2 end2))
331
332 (defun string= (string1 string2 &key (start1 0) end1 (start2 0) end2)
333 "Given two strings (string1 and string2), and optional integers start1,
334 start2, end1 and end2, compares characters in string1 to characters in
335 string2 (using char=)."
336 (string=* string1 string2 start1 end1 start2 end2))
337
338 (defun string/= (string1 string2 &key (start1 0) end1 (start2 0) end2)
339 "Given two strings, if the first string is not lexicographically equal
340 to the second string, returns the longest common prefix (using char=)
341 of the two strings. Otherwise, returns ()."
342 (string/=* string1 string2 start1 end1 start2 end2))
343
344
345 (eval-when (compile eval)
346
347 ;;; STRING-NOT-EQUAL-LOOP is used to generate character comparison loops for
348 ;;; STRING-EQUAL and STRING-NOT-EQUAL.
349 (defmacro string-not-equal-loop (end end-value
350 &optional (abort-value nil abortp))
351 (declare (fixnum end))
352 (let ((end-test (if (= end 1)
353 `(= index1 (the fixnum end1))
354 `(= index2 (the fixnum end2)))))
355 `(do ((index1 start1 (1+ index1))
356 (index2 start2 (1+ index2)))
357 (,(if abortp
358 end-test
359 `(or ,end-test
360 (not (char-equal (schar string1 index1)
361 (schar string2 index2)))))
362 ,end-value)
363 (declare (fixnum index1 index2))
364 ,@(if abortp
365 `((if (not (char-equal (schar string1 index1)
366 (schar string2 index2)))
367 (return ,abort-value)))))))
368 ) ; eval-when
369
370 #+unicode
371 (defun string-case-fold (string &key (start 0) end (casing :simple))
372 _N"Return a new string with the case folded according to Casing as follows:
373
374 :SIMPLE Unicode simple case folding (preserving length)
375 :FULL Unicode full case folding (possibly changing length)
376
377 Default Casing is :SIMPLE."
378 (ecase casing
379 (:simple
380 (with-output-to-string (s)
381 (with-one-string string start end offset
382 (do ((index offset (1+ index)))
383 ((>= index end))
384 (multiple-value-bind (code widep)
385 (codepoint string index)
386 (when widep (incf index))
387 (multiple-value-bind (hi lo)
388 (surrogates (unicode-case-fold-simple code))
389 (write-char hi s)
390 (when lo (write-char lo s))))))))
391 (:full
392 (with-output-to-string (s)
393 (with-one-string string start end offset
394 (do ((index offset (1+ index)))
395 ((>= index end))
396 (multiple-value-bind (code widep)
397 (codepoint string index)
398 (when widep (incf index))
399 (write-string (unicode-case-fold-full code) s))))))))
400
401 (defun string-equal (string1 string2 &key (start1 0) end1 (start2 0) end2)
402 "Given two strings (string1 and string2), and optional integers start1,
403 start2, end1 and end2, compares characters in string1 to characters in
404 string2 (using char-equal)."
405 (declare (fixnum start1 start2))
406 (with-two-strings string1 string2 start1 end1 offset1 start2 end2
407 (let ((slen1 (- (the fixnum end1) start1))
408 (slen2 (- (the fixnum end2) start2)))
409 (declare (fixnum slen1 slen2))
410 (if (or (minusp slen1) (minusp slen2))
411 ;;prevent endless looping later.
412 (error (intl:gettext "Improper bounds for string comparison.")))
413 (if (= slen1 slen2)
414 ;;return () immediately if lengths aren't equal.
415 (string-not-equal-loop 1 t nil)))))
416
417 (defun string-not-equal (string1 string2 &key (start1 0) end1 (start2 0) end2)
418 "Given two strings, if the first string is not lexicographically equal
419 to the second string, returns the longest common prefix (using char-equal)
420 of the two strings. Otherwise, returns ()."
421 (with-two-strings string1 string2 start1 end1 offset1 start2 end2
422 (let ((slen1 (- end1 start1))
423 (slen2 (- end2 start2)))
424 (declare (fixnum slen1 slen2))
425 (if (or (minusp slen1) (minusp slen2))
426 ;;prevent endless looping later.
427 (error (intl:gettext "Improper bounds for string comparison.")))
428 (cond ((or (minusp slen1) (or (minusp slen2)))
429 (error (intl:gettext "Improper substring for comparison.")))
430 ((= slen1 slen2)
431 (string-not-equal-loop 1 nil (- index1 offset1)))
432 ((< slen1 slen2)
433 (string-not-equal-loop 1 (- index1 offset1)))
434 (t
435 (string-not-equal-loop 2 (- index1 offset1)))))))
436
437
438
439 (eval-when (compile eval)
440
441 ;;; STRING-LESS-GREATER-EQUAL-TESTS returns a test on the lengths of string1
442 ;;; and string2 and a test on the current characters from string1 and string2
443 ;;; for the following macro.
444 (defun string-less-greater-equal-tests (lessp equalp)
445 (if lessp
446 (if equalp
447 ;; STRING-NOT-GREATERP
448 (values '<=
449 #-unicode `(not (char-greaterp char1 char2))
450 #+unicode `(<= char1 char2))
451 ;; STRING-LESSP
452 (values '<
453 #-unicode `(char-lessp char1 char2)
454 #+unicode `(< char1 char2)))
455 (if equalp
456 ;; STRING-NOT-LESSP
457 (values '>=
458 #-unicode `(not (char-lessp char1 char2))
459 #+unicode `(>= char1 char2))
460 ;; STRING-GREATERP
461 (values '>
462 #-unicode `(char-greaterp char1 char2)
463 #+unicode `(> char1 char2)))))
464
465 #-unicode
466 (defmacro string-less-greater-equal (lessp equalp)
467 (multiple-value-bind (length-test character-test)
468 (string-less-greater-equal-tests lessp equalp)
469 `(with-two-strings string1 string2 start1 end1 offset1 start2 end2
470 (let ((slen1 (- (the fixnum end1) start1))
471 (slen2 (- (the fixnum end2) start2)))
472 (declare (fixnum slen1 slen2))
473 (if (or (minusp slen1) (minusp slen2))
474 ;;prevent endless looping later.
475 (error (intl:gettext "Improper bounds for string comparison.")))
476 (do ((index1 start1 (1+ index1))
477 (index2 start2 (1+ index2))
478 (char1)
479 (char2))
480 ((or (= index1 (the fixnum end1)) (= index2 (the fixnum end2)))
481 (if (,length-test slen1 slen2) (- index1 offset1)))
482 (declare (fixnum index1 index2))
483 (setq char1 (schar string1 index1))
484 (setq char2 (schar string2 index2))
485 (if (not (char-equal char1 char2))
486 (if ,character-test
487 (return (- index1 offset1))
488 (return ()))))))))
489
490 ;; Convert to lowercase for case folding, to match what Unicode
491 ;; CaseFolding.txt says. An example where this matters: U+1E9E maps
492 ;; to U+00DF. But the uppercase version of U+00DF is U+00DF.
493 #+unicode
494 (defmacro equal-char-codepoint (codepoint)
495 `(let ((ch ,codepoint))
496 ;; Handle ASCII separately for bootstrapping and for unidata missing.
497 (if (< 64 ch 91)
498 (+ ch 32)
499 #-(and unicode (not unicode-bootstrap))
500 ch
501 #+(and unicode (not unicode-bootstrap))
502 (if (> ch 127) (unicode-lower ch) ch))))
503
504 #+unicode
505 (defmacro string-less-greater-equal (lessp equalp)
506 (multiple-value-bind (length-test character-test)
507 (string-less-greater-equal-tests lessp equalp)
508 `(with-two-strings string1 string2 start1 end1 offset1 start2 end2
509 (let ((slen1 (- (the fixnum end1) start1))
510 (slen2 (- (the fixnum end2) start2)))
511 (declare (fixnum slen1 slen2))
512 (if (or (minusp slen1) (minusp slen2))
513 ;;prevent endless looping later.
514 (error (intl:gettext "Improper bounds for string comparison.")))
515 (do ((index1 start1 (1+ index1))
516 (index2 start2 (1+ index2)))
517 ((or (= index1 (the fixnum end1)) (= index2 (the fixnum end2)))
518 (if (,length-test slen1 slen2) (- index1 offset1)))
519 (declare (fixnum index1 index2))
520 (multiple-value-bind (char1 wide1)
521 (codepoint string1 index1)
522 (declare (type codepoint char1))
523 (multiple-value-bind (char2 wide2)
524 (codepoint string2 index2)
525 (declare (type codepoint char2))
526 (setf char1 (equal-char-codepoint char1))
527 (setf char2 (equal-char-codepoint char2))
528 (if (= char1 char2)
529 (progn
530 (when wide1 (incf index1))
531 (when wide2 (incf index2)))
532 (if ,character-test
533 (return (- index1 offset1))
534 (return ()))))))))))
535
536 ) ; eval-when
537
538 (defun string-lessp* (string1 string2 start1 end1 start2 end2)
539 (declare (fixnum start1 start2))
540 (string-less-greater-equal t nil))
541
542 (defun string-greaterp* (string1 string2 start1 end1 start2 end2)
543 (declare (fixnum start1 start2))
544 (string-less-greater-equal nil nil))
545
546 (defun string-not-lessp* (string1 string2 start1 end1 start2 end2)
547 (declare (fixnum start1 start2))
548 (string-less-greater-equal nil t))
549
550 (defun string-not-greaterp* (string1 string2 start1 end1 start2 end2)
551 (declare (fixnum start1 start2))
552 (string-less-greater-equal t t))
553
554 (defun string-lessp (string1 string2 &key (start1 0) end1 (start2 0) end2)
555 "Given two strings, if the first string is lexicographically less than
556 the second string, returns the longest common prefix (using char-equal)
557 of the two strings. Otherwise, returns ()."
558 (string-lessp* string1 string2 start1 end1 start2 end2))
559
560 (defun string-greaterp (string1 string2 &key (start1 0) end1 (start2 0) end2)
561 "Given two strings, if the first string is lexicographically greater than
562 the second string, returns the longest common prefix (using char-equal)
563 of the two strings. Otherwise, returns ()."
564 (string-greaterp* string1 string2 start1 end1 start2 end2))
565
566 (defun string-not-lessp (string1 string2 &key (start1 0) end1 (start2 0) end2)
567 "Given two strings, if the first string is lexicographically greater
568 than or equal to the second string, returns the longest common prefix
569 (using char-equal) of the two strings. Otherwise, returns ()."
570 (string-not-lessp* string1 string2 start1 end1 start2 end2))
571
572 (defun string-not-greaterp (string1 string2 &key (start1 0) end1 (start2 0)
573 end2)
574 "Given two strings, if the first string is lexicographically less than
575 or equal to the second string, returns the longest common prefix
576 (using char-equal) of the two strings. Otherwise, returns ()."
577 (string-not-greaterp* string1 string2 start1 end1 start2 end2))
578
579
580 (defun make-string (count &key element-type ((:initial-element fill-char)))
581 "Given a character count and an optional fill character, makes and returns
582 a new string Count long filled with the fill character."
583 (declare (type fixnum count))
584 (assert (subtypep element-type 'character))
585 (if fill-char
586 (do ((i 0 (1+ i))
587 (string (make-string count)))
588 ((= i count) string)
589 (declare (fixnum i))
590 (setf (schar string i) fill-char))
591 (make-string count)))
592
593 (defun string-upcase-simple (string &key (start 0) end)
594 (declare (fixnum start))
595 (let* ((string (if (stringp string) string (string string)))
596 (slen (length string)))
597 (declare (fixnum slen))
598 (with-one-string string start end offset
599 (let ((offset-slen (+ slen offset))
600 (newstring (make-string slen)))
601 (declare (fixnum offset-slen))
602 (do ((index offset (1+ index))
603 (new-index 0 (1+ new-index)))
604 ((= index start))
605 (declare (fixnum index new-index))
606 (setf (schar newstring new-index) (schar string index)))
607 (do ((index start (1+ index))
608 (new-index (- start offset) (1+ new-index)))
609 ((= index (the fixnum end)))
610 (declare (fixnum index new-index))
611 (multiple-value-bind (code wide) (codepoint string index)
612 (when wide (incf index))
613 ;; Handle ASCII specially because this is called early in
614 ;; initialization, before unidata is available.
615 (cond ((< 96 code 123) (decf code 32))
616 #+unicode
617 ((> code 127) (setq code (unicode-upper code))))
618 ;;@@ WARNING: this may, in theory, need to extend newstring
619 ;; but that never actually occurs as of Unicode 5.1.0,
620 ;; so I'm just going to ignore it for now...
621 (multiple-value-bind (hi lo) (surrogates code)
622 (setf (schar newstring new-index) hi)
623 (when lo
624 (setf (schar newstring (incf new-index)) lo)))))
625 ;;@@ WARNING: see above
626 (do ((index end (1+ index))
627 (new-index (- (the fixnum end) offset) (1+ new-index)))
628 ((= index offset-slen))
629 (declare (fixnum index new-index))
630 (setf (schar newstring new-index) (schar string index)))
631 newstring))))
632
633 #+unicode
634 (defun string-upcase-full (string &key (start 0) end)
635 (declare (fixnum start))
636 (let* ((string (if (stringp string) string (string string)))
637 (slen (length string)))
638 (declare (fixnum slen))
639 (with-output-to-string (s)
640 (with-one-string string start end offset
641 (let ((offset-slen (+ slen offset)))
642 (declare (fixnum offset-slen))
643 (write-string string s :start offset :end start)
644 (do ((index start (1+ index)))
645 ((= index (the fixnum end)))
646 (declare (fixnum index))
647 (multiple-value-bind (code wide)
648 (codepoint string index)
649 (when wide (incf index))
650 ;; Handle ASCII specially because this is called early in
651 ;; initialization, before unidata is available.
652 (cond ((< 96 code 123)
653 (write-char (code-char (decf code 32)) s))
654 ((> code 127)
655 (write-string (unicode-full-case-upper code) s))
656 (t
657 (multiple-value-bind (hi lo)
658 (surrogates code)
659 (write-char hi s)
660 (when lo
661 (write-char lo s)))))))
662 (write-string string s :start end :end offset-slen))))))
663
664 (defun string-upcase (string &key (start 0) end #+unicode (casing :simple))
665 #-unicode
666 _N"Given a string, returns a new string that is a copy of it with all
667 lower case alphabetic characters converted to uppercase."
668 #+unicode
669 _N"Given a string, returns a new string that is a copy of it with all
670 lower case alphabetic characters converted to uppercase. Casing is
671 :simple or :full for simple or full case conversion, respectively."
672 (declare (fixnum start))
673 #-unicode
674 (string-upcase-simple string :start start :end end)
675 #+unicode
676 (if (eq casing :simple)
677 (string-upcase-simple string :start start :end end)
678 (string-upcase-full string :start start :end end)))
679
680 (defun string-downcase-simple (string &key (start 0) end)
681 (declare (fixnum start))
682 (let* ((string (if (stringp string) string (string string)))
683 (slen (length string)))
684 (declare (fixnum slen))
685 (with-one-string string start end offset
686 (let ((offset-slen (+ slen offset))
687 (newstring (make-string slen)))
688 (declare (fixnum offset-slen))
689 (do ((index offset (1+ index))
690 (new-index 0 (1+ new-index)))
691 ((= index start))
692 (declare (fixnum index new-index))
693 (setf (schar newstring new-index) (schar string index)))
694 (do ((index start (1+ index))
695 (new-index (- start offset) (1+ new-index)))
696 ((= index (the fixnum end)))
697 (declare (fixnum index new-index))
698 (multiple-value-bind (code wide) (codepoint string index)
699 (when wide (incf index))
700 ;; Handle ASCII specially because this is called early in
701 ;; initialization, before unidata is available.
702 (cond ((< 64 code 91) (incf code 32))
703 ((> code 127) (setq code (unicode-lower code))))
704 ;;@@ WARNING: this may, in theory, need to extend newstring
705 ;; but that never actually occurs as of Unicode 5.1.0,
706 ;; so I'm just going to ignore it for now...
707 (multiple-value-bind (hi lo) (surrogates code)
708 (setf (schar newstring new-index) hi)
709 (when lo
710 (setf (schar newstring (incf new-index)) lo)))))
711 ;;@@ WARNING: see above
712 (do ((index end (1+ index))
713 (new-index (- (the fixnum end) offset) (1+ new-index)))
714 ((= index offset-slen))
715 (declare (fixnum index new-index))
716 (setf (schar newstring new-index) (schar string index)))
717 newstring))))
718
719 #+unicode
720 (defun string-downcase-full (string &key (start 0) end)
721 (declare (fixnum start))
722 (let* ((string (if (stringp string) string (string string)))
723 (slen (length string)))
724 (declare (fixnum slen))
725 (with-output-to-string (s)
726 (with-one-string string start end offset
727 (let ((offset-slen (+ slen offset)))
728 (declare (fixnum offset-slen))
729 (write-string string s :start offset :end start)
730 (do ((index start (1+ index)))
731 ((= index (the fixnum end)))
732 (declare (fixnum index))
733 (multiple-value-bind (code wide)
734 (codepoint string index)
735 (when wide (incf index))
736 ;; Handle ASCII specially because this is called early in
737 ;; initialization, before unidata is available.
738 (cond ((< 64 code 91)
739 (write-char (code-char (incf code 32)) s))
740 ((> code 127)
741 (write-string (unicode-full-case-lower code) s))
742 (t
743 (multiple-value-bind (hi lo)
744 (surrogates code)
745 (write-char hi s)
746 (when lo
747 (write-char lo s)))))))
748 (write-string string s :start end :end offset-slen))))))
749
750 (defun string-downcase (string &key (start 0) end #+unicode (casing :simple))
751 #-unicode
752 _N"Given a string, returns a new string that is a copy of it with all
753 upper case alphabetic characters converted to lowercase."
754 #+unicode
755 _N"Given a string, returns a new string that is a copy of it with all
756 upper case alphabetic characters converted to lowercase. Casing is
757 :simple or :full for simple or full case conversion, respectively."
758 (declare (fixnum start))
759 #-unicode
760 (string-downcase-simple string :start start :end end)
761 #+unicode
762 (if (eq casing :simple)
763 (string-downcase-simple string :start start :end end)
764 (string-downcase-full string :start start :end end)))
765
766 (defun string-capitalize-simple (string &key (start 0) end)
767 (declare (fixnum start))
768 (let* ((string (if (stringp string) string (string string)))
769 (slen (length string)))
770 (declare (fixnum slen))
771 (with-one-string string start end offset
772 (let ((offset-slen (+ slen offset))
773 (newstring (make-string slen)))
774 (declare (fixnum offset-slen))
775 (do ((index offset (1+ index))
776 (new-index 0 (1+ new-index)))
777 ((= index start))
778 (declare (fixnum index new-index))
779 (setf (schar newstring new-index) (schar string index)))
780 (do ((index start (1+ index))
781 (new-index (- start offset) (1+ new-index))
782 (newword t)
783 (char ()))
784 ((= index (the fixnum end)))
785 (declare (fixnum index new-index))
786 (setq char (schar string index))
787 (cond ((not (alphanumericp char))
788 (setq newword t))
789 (newword
790 ;;char is first case-modifiable after non-case-modifiable
791 (setq char (char-titlecase char))
792 (setq newword ()))
793 ;;char is case-modifiable, but not first
794 (t (setq char (char-downcase char))))
795 (setf (schar newstring new-index) char))
796 (do ((index end (1+ index))
797 (new-index (- (the fixnum end) offset) (1+ new-index)))
798 ((= index offset-slen))
799 (declare (fixnum index new-index))
800 (setf (schar newstring new-index) (schar string index)))
801 newstring))))
802
803 #+unicode
804 (defun string-capitalize-full (string &key (start 0) end)
805 (declare (fixnum start))
806 (let* ((string (if (stringp string) string (string string)))
807 (slen (length string)))
808 (declare (fixnum slen))
809 (with-output-to-string (s)
810 (with-one-string string start end offset
811 (let ((offset-slen (+ slen offset)))
812 (declare (fixnum offset-slen))
813 (write-string string s :start offset :end start)
814 (flet ((alphanump (m)
815 (or (< 47 m 58) (< 64 m 91) (< 96 m 123)
816 #+(and unicode (not unicode-bootstrap))
817 (and (> m 127)
818 (<= +unicode-category-letter+ (unicode-category m)
819 (+ +unicode-category-letter+ #x0F))))))
820 (do ((index start (1+ index))
821 (newword t))
822 ((= index (the fixnum end)))
823 (declare (fixnum index))
824 (multiple-value-bind (code wide)
825 (codepoint string index)
826 (when wide (incf index))
827 (cond ((not (alphanump code))
828 (multiple-value-bind (hi lo)
829 (surrogates code)
830 (write-char hi s)
831 (when lo (write-char lo s)))
832 (setq newword t))
833 (newword
834 ;;char is first case-modifiable after non-case-modifiable
835 (write-string (unicode-full-case-title code) s)
836 (setq newword ()))
837 ;;char is case-modifiable, but not first
838 (t
839 (write-string (unicode-full-case-lower code) s))))))
840 (write-string string s :start end :end offset-slen))))))
841
842 (defun string-capitalize (string &key (start 0) end
843 #+unicode (casing :simple)
844 #+unicode unicode-word-break)
845 #-unicode
846 _N"Given a string, returns a copy of the string with the first
847 character of each ``word'' converted to upper-case, and remaining
848 chars in the word converted to lower case. A ``word'' is defined
849 to be a string of case-modifiable characters delimited by
850 non-case-modifiable chars."
851 #+unicode
852 _N"Given a string, returns a copy of the string with the first
853 character of each ``word'' converted to upper-case, and remaining
854 chars in the word converted to lower case. A ``word'' is defined
855 to be a string of case-modifiable characters delimited by
856 non-case-modifiable chars. Casing is :simple or :full for
857 simple or full case conversion, respectively."
858
859 (declare (fixnum start))
860 #-unicode
861 (string-capitalize-simple string :start start :end end)
862 #+unicode
863 (if unicode-word-break
864 (string-capitalize-unicode string :start start :end end :casing casing)
865 (if (eq casing :simple)
866 (string-capitalize-simple string :start start :end end)
867 (string-capitalize-full string :start start :end end))))
868
869 (defun nstring-upcase (string &key (start 0) end)
870 "Given a string, returns that string with all lower case alphabetic
871 characters converted to uppercase."
872 (declare (fixnum start))
873 (let ((save-header string))
874 (with-one-string string start end offset
875 (do ((index start (1+ index)))
876 ((= index (the fixnum end)))
877 (declare (fixnum index))
878 (multiple-value-bind (code wide) (codepoint string index)
879 (declare (ignore wide))
880 ;; Handle ASCII specially because this is called early in
881 ;; initialization, before unidata is available.
882 (cond ((< 96 code 123) (decf code 32))
883 #+unicode
884 ((> code 127) (setq code (unicode-upper code))))
885 ;;@@ WARNING: this may, in theory, need to extend string
886 ;; (which, obviously, we can't do here. Unless
887 ;; STRING is adjustable, maybe)
888 ;; but that never actually occurs as of Unicode 5.1.0,
889 ;; so I'm just going to ignore it for now...
890 (multiple-value-bind (hi lo) (surrogates code)
891 (setf (schar string index) hi)
892 (when lo
893 (setf (schar string (incf index)) lo))))))
894 save-header))
895
896 (defun nstring-downcase (string &key (start 0) end)
897 "Given a string, returns that string with all upper case alphabetic
898 characters converted to lowercase."
899 (declare (fixnum start))
900 (let ((save-header string))
901 (with-one-string string start end offset
902 (do ((index start (1+ index)))
903 ((= index (the fixnum end)))
904 (declare (fixnum index))
905 (multiple-value-bind (code wide) (codepoint string index)
906 (declare (ignore wide))
907 (cond ((< 64 code 91) (incf code 32))
908 #+unicode
909 ((> code 127) (setq code (unicode-lower code))))
910 ;;@@ WARNING: this may, in theory, need to extend string
911 ;; (which, obviously, we can't do here. Unless
912 ;; STRING is adjustable, maybe)
913 ;; but that never actually occurs as of Unicode 5.1.0,
914 ;; so I'm just going to ignore it for now...
915 (multiple-value-bind (hi lo) (surrogates code)
916 (setf (schar string index) hi)
917 (when lo
918 (setf (schar string (incf index)) lo))))))
919 save-header))
920
921 (defun nstring-capitalize (string &key (start 0) end)
922 "Given a string, returns that string with the first
923 character of each ``word'' converted to upper-case, and remaining
924 chars in the word converted to lower case. A ``word'' is defined
925 to be a string of case-modifiable characters delimited by
926 non-case-modifiable chars."
927 (declare (fixnum start))
928 (let ((save-header string))
929 (with-one-string string start end offset
930 (do ((index start (1+ index))
931 (newword t)
932 (char ()))
933 ((= index (the fixnum end)))
934 (declare (fixnum index))
935 (setq char (schar string index))
936 (cond ((not (alphanumericp char))
937 (setq newword t))
938 (newword
939 ;;char is first case-modifiable after non-case-modifiable
940 (setf (schar string index) (char-titlecase char))
941 (setq newword ()))
942 (t
943 (setf (schar string index) (char-downcase char))))))
944 save-header))
945
946
947 #+unicode
948 (progn
949 ;; Like string-left-trim, but return the index
950 (defun string-left-trim-index (char-bag string)
951 (with-string string
952 (if (stringp char-bag)
953 ;; When char-bag is a string, we try to do the right thing.
954 ;; Convert char-bag to a list of codepoints and compare the
955 ;; codepoints in the string with this.
956 (let ((code-bag (with-string char-bag
957 (do ((index start (1+ index))
958 (result nil))
959 ((= index end)
960 (nreverse result))
961 (multiple-value-bind (c widep)
962 (codepoint char-bag index)
963 (push c result)
964 (when widep (incf index)))))))
965 (do ((index start (1+ index)))
966 ((= index (the fixnum end))
967 end)
968 (declare (fixnum index))
969 (multiple-value-bind (c widep)
970 (codepoint string index)
971 (unless (find c code-bag)
972 (return-from string-left-trim-index index))
973 (when widep (incf index)))))
974 ;; When char-bag is a list, we just look at each codepoint of
975 ;; STRING to see if it's in char-bag. If char-bag contains a
976 ;; surrogate, we could accidentally trim off a surrogate,
977 ;; leaving an invalid UTF16 string.
978 (do ((index start (1+ index)))
979 ((= index (the fixnum end))
980 end)
981 (declare (fixnum index))
982 (multiple-value-bind (c widep)
983 (codepoint string index)
984 (unless (find c char-bag :key #'char-code)
985 (return-from string-left-trim-index index))
986 (when widep (incf index)))))))
987
988 (defun string-left-trim (char-bag string)
989 "Given a set of characters (a list or string) and a string, returns
990 a copy of the string with the characters in the set removed from the
991 left end. If the set of characters is a string, surrogates will be
992 properly handled."
993 (let ((begin (string-left-trim-index char-bag string)))
994 (with-string string
995 (declare (ignore start))
996 (subseq string begin end))))
997
998 (defun string-right-trim-index (char-bag string)
999 (with-string string
1000 (if (stringp char-bag)
1001 ;; When char-bag is a string, we try to do the right thing
1002 ;; with surrogates. Convert char-bag to a list of codepoints
1003 ;; and compare the codepoints in the string with this.
1004 (let ((code-bag (with-string char-bag
1005 (do ((index start (1+ index))
1006 (result nil))
1007 ((= index end)
1008 result)
1009 (multiple-value-bind (c widep)
1010 (codepoint char-bag index)
1011 (push c result)
1012 (when widep (incf index)))))))
1013 (do ((index (1- end) (1- index)))
1014 ((< index start)
1015 start)
1016 (declare (fixnum index))
1017 (multiple-value-bind (c widep)
1018 (codepoint string index)
1019 (unless (find c code-bag)
1020 (return-from string-right-trim-index (1+ index)))
1021 (when widep (decf index)))))
1022 ;; When char-bag is a list, we just look at each codepoint of
1023 ;; STRING to see if it's in char-bag. If char-bag contains a
1024 ;; surrogate, we could accidentally trim off a surrogate,
1025 ;; leaving an invalid UTF16 string.
1026 (do ((index (1- end) (1- index)))
1027 ((< index start)
1028 start)
1029 (declare (fixnum index))
1030 (multiple-value-bind (c widep)
1031 (codepoint string index)
1032 (unless (find c char-bag :key #'char-code)
1033 (return-from string-right-trim-index (1+ index)))
1034 (when widep (decf index)))))))
1035
1036 (defun string-right-trim (char-bag string)
1037 "Given a set of characters (a list or string) and a string, returns
1038 a copy of the string with the characters in the set removed from the
1039 right end. If the set of characters is a string, surrogates will be
1040 properly handled."
1041 (let ((stop (string-right-trim-index char-bag string)))
1042 (with-string string
1043 (declare (ignore end))
1044 (subseq string start stop))))
1045
1046 (defun string-trim (char-bag string)
1047 "Given a set of characters (a list or string) and a string, returns a
1048 copy of the string with the characters in the set removed from both
1049 ends. If the set of characters is a string, surrogates will be
1050 properly handled."
1051 (let ((left-end (string-left-trim-index char-bag string))
1052 (right-end (string-right-trim-index char-bag string)))
1053 (with-string string
1054 (declare (ignore start end))
1055 (subseq (the simple-string string) left-end right-end))))
1056 ) ; end unicode version
1057
1058 #-unicode
1059 (progn
1060 (defun string-left-trim (char-bag string)
1061 _N"Given a set of characters (a list or string) and a string, returns
1062 a copy of the string with the characters in the set removed from the
1063 left end."
1064 (with-string string
1065 (do ((index start (1+ index)))
1066 ((or (= index (the fixnum end))
1067 (not (find (schar string index) char-bag)))
1068 (subseq (the simple-string string) index end))
1069 (declare (fixnum index)))))
1070
1071 (defun string-right-trim (char-bag string)
1072 _N"Given a set of characters (a list or string) and a string, returns
1073 a copy of the string with the characters in the set removed from the
1074 right end."
1075 (with-string string
1076 (do ((index (1- (the fixnum end)) (1- index)))
1077 ((or (< index start) (not (find (schar string index) char-bag)))
1078 (subseq (the simple-string string) start (1+ index)))
1079 (declare (fixnum index)))))
1080
1081 (defun string-trim (char-bag string)
1082 _N"Given a set of characters (a list or string) and a string, returns a
1083 copy of the string with the characters in the set removed from both
1084 ends."
1085 (with-string string
1086 (let* ((left-end (do ((index start (1+ index)))
1087 ((or (= index (the fixnum end))
1088 (not (find (schar string index) char-bag)))
1089 index)
1090 (declare (fixnum index))))
1091 (right-end (do ((index (1- (the fixnum end)) (1- index)))
1092 ((or (< index left-end)
1093 (not (find (schar string index) char-bag)))
1094 (1+ index))
1095 (declare (fixnum index)))))
1096 (subseq (the simple-string string) left-end right-end))))
1097 ) ; non-unicode version
1098
1099 #+unicode
1100 (progn
1101 (declaim (inline %glyph-f %glyph-b))
1102 (defun %glyph-f (string index)
1103 (declare (optimize (speed 3) (space 0) (safety 0) (debug 0))
1104 (type simple-string string) (type kernel:index index))
1105 (let* ((prev 0)
1106 (l (length string))
1107 (c (codepoint string index l))
1108 (n (+ index (if (> c #xFFFF) 2 1))))
1109 (declare (type codepoint c) (type kernel:index l n))
1110 (loop while (< n l) do
1111 (let* ((c (codepoint string n l))
1112 (d (the (unsigned-byte 8) (unicode-combining-class c))))
1113 (when (or (zerop d) (< d prev))
1114 (return))
1115 (setq prev d)
1116 (incf n (if (> c #xFFFF) 2 1))))
1117 n))
1118
1119 (defun %glyph-b (string index)
1120 (declare (optimize (speed 3) (space 0) (safety 0) (debug 0))
1121 (type simple-string string) (type kernel:index index))
1122 (let* ((prev 255)
1123 (n (1- index)))
1124 (declare (type kernel:index n))
1125 (loop until (< n 0) do
1126 (let* ((c (codepoint string n 0))
1127 (d (the (unsigned-byte 8) (unicode-combining-class c))))
1128 (cond ((zerop d) (return))
1129 ((> d prev) (incf n (if (> c #xFFFF) 2 1)) (return)))
1130 (setq prev d)
1131 (decf n (if (> c #xFFFF) 2 1))))
1132 n))
1133 ) ; unicode
1134
1135 (defun glyph (string index &key (from-end nil))
1136 "GLYPH returns the glyph at the indexed position in a string, and the
1137 position of the next glyph (or NIL) as a second value. A glyph is
1138 a substring consisting of the character at INDEX followed by all
1139 subsequent combining characters."
1140 (declare (type simple-string string) (type kernel:index index))
1141 #-unicode
1142 (char string index)
1143 #+unicode
1144 (with-array-data ((string string) (start) (end))
1145 (declare (ignore start end))
1146 (let ((n (if from-end (%glyph-b string index) (%glyph-f string index))))
1147 (if from-end
1148 (values (subseq string n index) (and (> n 0) n))
1149 (values (subseq string index n) (and (< n (length string)) n))))))
1150
1151 (defun sglyph (string index &key (from-end nil))
1152 "SGLYPH returns the glyph at the indexed position, the same as GLYPH,
1153 except that the string must be a simple-string"
1154 (declare (type simple-string string) (type kernel:index index))
1155 #-unicode
1156 (schar string index)
1157 #+unicode
1158 (let ((n (if from-end (%glyph-b string index) (%glyph-f string index))))
1159 (if from-end
1160 (values (subseq string n index) (and (> n 0) n))
1161 (values (subseq string index n) (and (< n (length string)) n)))))
1162
1163 #+unicode
1164 (defun string-reverse* (sequence)
1165 (declare (optimize (speed 3) (space 0) (safety 0))
1166 (type string sequence))
1167 (with-string sequence
1168 (let* ((length (- end start))
1169 (string (make-string length))
1170 (j length))
1171 (declare (type kernel:index length j))
1172 (loop for i = start then n as n = (%glyph-f sequence i) do
1173 (replace string sequence :start1 (decf j (- n i)) :start2 i :end2 n)
1174 while (< n end))
1175 string)))
1176
1177 #+unicode
1178 (defun string-nreverse* (sequence)
1179 (declare (optimize (speed 3) (space 0) (safety 0))
1180 (type string sequence))
1181 (with-string sequence
1182 (flet ((rev (start end)
1183 (do ((i start (1+ i))
1184 (j (1- end) (1- j)))
1185 ((>= i j))
1186 (declare (type kernel:index i j))
1187 (rotatef (schar sequence i) (schar sequence j)))))
1188 (let ((len end))
1189 (loop for i = start then n as n = (%glyph-f sequence i) do
1190 (rev i n) while (< n len))
1191 (rev start end))))
1192 sequence)
1193
1194
1195
1196
1197 #+unicode
1198 (progn
1199 (defun decompose (string &optional (compatibility t))
1200 (declare (type string string))
1201 (let ((result (make-string (cond ((< (length string) 40)
1202 (* 5 (length string)))
1203 ((< (length string) 4096)
1204 (* 2 (length string)))
1205 (t (round (length string) 5/6)))))
1206 (fillptr 0))
1207 (declare (type kernel:index fillptr))
1208 (labels ((rec (string start end)
1209 (declare (type simple-string string))
1210 (do ((i start (1+ i)))
1211 ((= i end))
1212 (declare (type kernel:index i))
1213 (multiple-value-bind (code wide) (codepoint string i)
1214 (when wide (incf i))
1215 (let ((decomp (unicode-decomp code compatibility)))
1216 (if decomp (rec decomp 0 (length decomp)) (out code))))))
1217 (out (code)
1218 (multiple-value-bind (hi lo) (surrogates code)
1219 (outch hi)
1220 (when lo
1221 (outch lo))
1222 (let ((cc (unicode-combining-class code)))
1223 (unless (zerop cc)
1224 (order lo cc (- fillptr (if lo 3 2)))))))
1225 (outch (char)
1226 (when (= fillptr (length result))
1227 (let ((tmp (make-string (round (length result) 5/6))))
1228 (replace tmp result)
1229 (setq result tmp)))
1230 (setf (schar result fillptr) char)
1231 (incf fillptr))
1232 (order (wide1 cc last)
1233 (loop until (minusp last) do
1234 (multiple-value-bind (code2 wide2) (codepoint result last)
1235 (let ((cc2 (unicode-combining-class code2)))
1236 (cond ((zerop cc2) (return))
1237 ((> cc2 cc)
1238 (case (+ (if wide2 2 0) (if wide1 1 0))
1239 (0 (rotatef (schar result last)
1240 (schar result (1+ last))))
1241 (1 (rotatef (schar result last)
1242 (schar result (+ last 1))
1243 (schar result (+ last 2))))
1244 (2 (rotatef (schar result last)
1245 (schar result (1- last))
1246 (schar result (1+ last))))
1247 (3 (rotatef (schar result last)
1248 (schar result (+ last 2)))
1249 (rotatef (schar result (1- last))
1250 (schar result (1+ last)))))
1251 (decf last (if wide2 2 1)))
1252 (t (return))))))))
1253 (with-string string
1254 (rec string start end))
1255 (shrink-vector result fillptr))))
1256
1257 (declaim (inline normalized-codepoint-p))
1258 (defun normalized-codepoint-p (cp form)
1259 (ecase form
1260 (:nfc (unicode-nfc-qc cp))
1261 (:nfkc (unicode-nfkc-qc cp))
1262 (:nfd (unicode-nfd-qc cp))
1263 (:nfkd (unicode-nfkd-qc cp))))
1264
1265 ;; Perform check to see if string is already normalized. The Unicode
1266 ;; example can return YES, NO, or MAYBE. For our purposes, only YES
1267 ;; is important, for which we return T. For NO or MAYBE, we return NIL.
1268 (defun normalized-form-p (string &optional (form :nfc))
1269 (declare (type (member :nfc :nfkc :nfd :nfkd) form)
1270 (optimize (speed 3)))
1271 (with-string string
1272 (let ((last-class 0))
1273 (declare (type (integer 0 256) last-class))
1274 (do ((k start (1+ k)))
1275 ((>= k end))
1276 (declare (type kernel:index k))
1277 (multiple-value-bind (ch widep)
1278 (codepoint string k end)
1279 (when widep (incf k))
1280 ;; Handle ASCII specially
1281 (unless (< ch 128)
1282 (let ((class (unicode-combining-class ch)))
1283 (declare (type (unsigned-byte 8) class))
1284 (when (and (> last-class class) (not (zerop class)))
1285 ;; Definitely not normalized
1286 (return-from normalized-form-p nil))
1287 (let ((check (normalized-codepoint-p ch form)))
1288 (unless (eq check :y)
1289 (return-from normalized-form-p nil)))
1290 (setf last-class class)))))
1291 t)))
1292
1293
1294 ;; Compose a string in place. The string must already be in decomposed form.
1295 (defun %compose (target)
1296 (declare (type string target)
1297 (optimize (speed 3)))
1298 (let ((len (length target))
1299 (starter-pos 0))
1300 (declare (type kernel:index starter-pos))
1301 (multiple-value-bind (starter-ch wide)
1302 (codepoint target 0 len)
1303 (let ((comp-pos (if wide 2 1))
1304 (last-class (unicode-combining-class starter-ch)))
1305 (declare (type (integer 0 256) last-class)
1306 (type kernel:index comp-pos))
1307 (unless (zerop last-class)
1308 ;; Fix for strings starting with a combining character
1309 (setf last-class 256))
1310 ;; Loop on decomposed characters, combining where possible
1311 (do ((decomp-pos comp-pos (1+ decomp-pos)))
1312 ((>= decomp-pos len))
1313 (declare (type kernel:index decomp-pos))
1314 (multiple-value-bind (ch wide)
1315 (codepoint target decomp-pos len)
1316 (when wide (incf decomp-pos))
1317 (let ((ch-class (unicode-combining-class ch))
1318 (composite (unicode-pairwise-composition starter-ch ch)))
1319 (declare (type (integer 0 256) ch-class))
1320 (cond ((and composite
1321 (or (< last-class ch-class) (zerop last-class)))
1322 ;; Note: As far as I know, there is no pairwise
1323 ;; composition such that the composite character
1324 ;; is outside the BMP but the starter-ch is
1325 ;; inside the BMP. Hence, it is always safe to
1326 ;; replace the possible surrogate at starter-pos
1327 ;; with another. We won't accidentally replace
1328 ;; the next character with our trailing surrogate
1329 ;; character.
1330 (multiple-value-bind (hi lo)
1331 (surrogates composite)
1332 (setf (aref target starter-pos) hi)
1333 (when lo
1334 (setf (aref target (1+ starter-pos)) lo))
1335 (setf starter-ch composite)))
1336 (t
1337 (when (zerop ch-class)
1338 (setf starter-pos comp-pos)
1339 (setf starter-ch ch))
1340 (setf last-class ch-class)
1341 (multiple-value-bind (hi lo)
1342 (surrogates ch)
1343 (setf (aref target comp-pos) hi)
1344 (when lo
1345 (incf comp-pos)
1346 (setf (aref target comp-pos) lo))
1347 (incf comp-pos)))))))
1348 (shrink-vector target comp-pos)))))
1349
1350 (defun string-to-nfd (string)
1351 "Convert String to Unicode Normalization Form D (NFD) using the
1352 canonical decomposition. The NFD string is returned"
1353 (decompose string nil))
1354
1355 (defun string-to-nfkd (string)
1356 "Convert String to Unicode Normalization Form KD (NFKD) uisng the
1357 compatible decomposition form. The NFKD string is returned."
1358 (decompose string t))
1359
1360 (defun string-to-nfc (string)
1361 "Convert String to Unicode Normalization Form C (NFC). If the
1362 string a simple string and is already normalized, the original
1363 string is returned."
1364 (if (normalized-form-p string :nfc)
1365 (if (simple-string-p string) string (coerce string 'simple-string))
1366 (coerce (if (normalized-form-p string :nfd)
1367 (%compose (copy-seq string))
1368 (%compose (string-to-nfd string)))
1369 'simple-string)))
1370
1371 (defun string-to-nfkc (string)
1372 "Convert String to Unicode Normalization Form KC (NFKC). If the
1373 string is a simple string and is already normalized, the original
1374 string is returned."
1375 (if (normalized-form-p string :nfkc)
1376 (if (simple-string-p string) string (coerce string 'simple-string))
1377 (coerce (if (normalized-form-p string :nfkd)
1378 (%compose (copy-seq string))
1379 (%compose (string-to-nfkd string)))
1380 'simple-string)))
1381 ) ; end unicode
1382
1383 #-unicode ;; Needed by package.lisp
1384 (defun string-to-nfc (string)
1385 (if (simple-string-p string) string (coerce string 'simple-string)))
1386
1387
1388 ;;;
1389 ;;; This is a Lisp translation of the Scheme code from William
1390 ;;; D. Clinger that implements the word-breaking algorithm. This is
1391 ;;; used with permission.
1392 ;;;
1393 ;;; This version is modified from the original at
1394 ;;; http://www.ccs.neu.edu/home/will/R6RS/ to conform to CMUCL's
1395 ;;; implementation of the word break properties.
1396 ;;;
1397 ;;;
1398 ;;; Copyright statement and original comments:
1399 ;;;
1400 ;;;--------------------------------------------------------------------------------
1401
1402 ;; Copyright 2006 William D Clinger.
1403 ;;
1404 ;; Permission to copy this software, in whole or in part, to use this
1405 ;; software for any lawful purpose, and to redistribute this software
1406 ;; is granted subject to the restriction that all copies made of this
1407 ;; software must include this copyright and permission notice in full.
1408 ;;
1409 ;; I also request that you send me a copy of any improvements that you
1410 ;; make to this software so that they may be incorporated within it to
1411 ;; the benefit of the Scheme community.
1412
1413 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1414 ;;
1415 ;; Word-breaking as defined by Unicode Standard Annex #29.
1416 ;;
1417 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1418
1419 ;; Implementation notes.
1420 ;;
1421 ;; The string-foldcase, string-downcase, and string-titlecase
1422 ;; procedures rely on the notion of a word, which is defined
1423 ;; by Unicode Standard Annex 29.
1424 ;;
1425 ;; The string-foldcase and string-downcase procedures rely on
1426 ;; word boundaries only when they encounter a Greek sigma, so
1427 ;; their performance should not be greatly affected by the
1428 ;; performance of the word-breaking algorithm.
1429 ;;
1430 ;; The string-titlecase procedure must find all word boundaries,
1431 ;; but it is typically used on short strings (titles).
1432 ;;
1433 ;; Hence the performance of the word-breaking algorithm should
1434 ;; not matter too much for this reference implementation.
1435 ;; Word-breaking is more generally useful, however, so I tried
1436 ;; to make this implementation reasonably efficient.
1437 ;;
1438 ;; Word boundaries are defined by 14 different rules in
1439 ;; Unicode Standard Annex #29, and by GraphemeBreakProperty.txt
1440 ;; and WordBreakProperty.txt. See also WordBreakTest.html.
1441 ;;
1442 ;; My original implementation of those specifications failed
1443 ;; 6 of the 494 tests in auxiliary/WordBreakTest.txt, but it
1444 ;; appeared to me that those tests were inconsistent with the
1445 ;; word-breaking rules in UAX #29. John Cowan forwarded my
1446 ;; bug report to the Unicode experts, and Mark Davis responded
1447 ;; on 29 May 2007:
1448 ;;
1449 ;; Thanks for following up on this. I think you have found a problem in the
1450 ;; formulation of word break, not the test. The intention was to break after a
1451 ;; Sep character, as is done in Sentence break. So my previous suggestion was
1452 ;; incorrect; instead, what we need is a new rule:
1453 ;;
1454 ;; *Break after paragraph separators.*
1455 ;; WB3a. Sep �
1456 ;; I'll make a propose to the UTC for this.
1457 ;;
1458 ;; Here is Will's translation of those rules (including WB3a)
1459 ;; into a finite state machine that searches forward within a
1460 ;; string, looking for the next position at which a word break
1461 ;; is allowed. The current state consists of an index i into
1462 ;; the string and a summary of the left context whose rightmost
1463 ;; character is at index i. The left context is usually
1464 ;; determined by the character at index i, but there are three
1465 ;; complications:
1466 ;;
1467 ;; Extend and Format characters are ignored unless they
1468 ;; follow a separator or the beginning of the text.
1469 ;; ALetter followed by MidLetter is treated specially.
1470 ;; Numeric followed by MidNum is treated specially.
1471 ;;
1472 ;; In the implementation below, the left context ending at i
1473 ;; is encoded by the following symbols:
1474 ;;
1475 ;; CR
1476 ;; Sep (excluding CR)
1477 ;; ALetter
1478 ;; MidLetter
1479 ;; ALetterMidLetter (ALetter followed by MidLetter)
1480 ;; Numeric
1481 ;; MidNum
1482 ;; NumericMidNum (Numeric followed by MidNum)
1483 ;; Katakana
1484 ;; ExtendNumLet
1485 ;; other (none of the above)
1486 ;;
1487 ;; Given a string s and an exact integer i (which need not be
1488 ;; a valid index into s), returns the index of the next character
1489 ;; that is not part of the word containing the character at i,
1490 ;; or the length of s if the word containing the character at i
1491 ;; extends through the end of s. If i is negative or a valid
1492 ;; index into s, then the returned value will be greater than i.
1493 ;;
1494 ;;;--------------------------------------------------------------------------------
1495
1496 (defun string-next-word-break (s i)
1497 (let ((n (length s)))
1498 (labels
1499 ((char-word-break-category (c)
1500 ;; Map our unicode word break property into what this
1501 ;; algorithm wants.
1502 (let ((cat (lisp::unicode-word-break c)))
1503 (case cat
1504 ((:lf :cr :newline)
1505 :sep)
1506 ((:extend :format)
1507 :extend-or-format)
1508 (otherwise cat))))
1509 (left-context (i)
1510 ;; Given a valid index i into s, returns the left context
1511 ;; at i.
1512 (multiple-value-bind (c widep)
1513 (lisp::codepoint s i n)
1514 (let* ((back
1515 ;; If we're at a regular character or a leading
1516 ;; surrogate, decrementing by 1 gets us the to
1517 ;; previous character. But for a trailing
1518 ;; surrogate, we need to decrement by 2!
1519 (if (eql widep -1)
1520 2
1521 1))
1522 (cat (char-word-break-category c)))
1523 (case cat
1524 ((:sep)
1525 (if (= c (char-code #\return)) :cr cat))
1526 ((:midletter :midnumlet)
1527 (let ((i-1 (- i back)))
1528 (if (and (<= 0 i-1)
1529 (eq (left-context i-1) :aletter))
1530 :aletter-midletter
1531 cat)))
1532 ((:midnum :midnumlet)
1533 (let ((i-1 (- i back)))
1534 (if (and (<= 0 i-1)
1535 (eq (left-context i-1) :numeric))
1536 :numeric-midnum
1537 cat)))
1538 ((:extendorformat)
1539 (if (< 0 i)
1540 (left-context (- i back))
1541 :other))
1542 (otherwise cat)))))
1543
1544 (index-of-previous-non-ignored (j)
1545 ;; Returns the index of the last non-Extend, non-Format
1546 ;; character within (substring s 0 j). Should not be
1547 ;; called unless such a character exists.
1548
1549 (let* ((j1 (- j 1)))
1550 (multiple-value-bind (c widep)
1551 (lisp::codepoint s j1)
1552 (when (eql widep -1)
1553 ;; Back up one more if we're at the trailing
1554 ;; surrogate.
1555 (decf j1))
1556 (let ((cat (char-word-break-category c)))
1557 (case cat
1558 ((:extend-or-format)
1559 (index-of-previous-non-ignored j1))
1560 (otherwise j1))))))
1561
1562 (lookup (j context)
1563 ;; Given j and the context to the left of (not including) j,
1564 ;; returns the index at the start of the next word
1565 ;; (or before which a word break is permitted).
1566
1567 (if (>= j n)
1568 (case context
1569 ((:aletter-midletter :numeric-midnum)
1570 (let ((j (index-of-previous-non-ignored n)))
1571 (if (< i j) j n)))
1572 (otherwise n))
1573 (multiple-value-bind (c widep)
1574 (lisp::codepoint s j)
1575 (let* ((next-j
1576 ;; The next character is either 1 or 2 code
1577 ;; units away. For a leading surrogate, it's
1578 ;; 2; Otherwise just 1.
1579 (if (eql widep 1)
1580 2
1581 1))
1582 (cat (char-word-break-category c)))
1583 (case cat
1584 ((:extend-or-format)
1585 (case context
1586 ((:cr :sep) j)
1587 (otherwise (lookup (+ j next-j) context))))
1588 (otherwise
1589 (case context
1590 ((:cr)
1591 (if (= c (char-code #\linefeed))
1592 ;; Rule WB3: Don't break CRLF, continue looking
1593 (lookup (+ j next-j) cat)
1594 j))
1595 ((:aletter)
1596 (case cat
1597 ((:aletter :numeric :extendnumlet)
1598 ;; Rules WB5, WB9, ?
1599 (lookup (+ j next-j) cat))
1600 ((:midletter :midnumlet)
1601 ;; Rule WB6, need to keep looking
1602 (lookup (+ j next-j) :aletter-midletter))
1603 (otherwise j)))
1604 ((:aletter-midletter)
1605 (case cat
1606 ((:aletter)
1607 ;; Rule WB7
1608 (lookup (+ j next-j) cat))
1609 (otherwise
1610 ;; Rule WB6 and WB7 were extended, but the
1611 ;; region didn't end with :aletter. So
1612 ;; backup and break at that point.
1613 (let ((j2 (index-of-previous-non-ignored j)))
1614 (if (< i j2) j2 j)))))
1615 ((:numeric)
1616 (case cat
1617 ((:numeric :aletter :extendnumlet)
1618 ;; Rules WB8, WB10, ?
1619 (lookup (+ j next-j) cat))
1620 ((:midnum :midnumlet)
1621 ;; Rules WB11, need to keep looking
1622 (lookup (+ j next-j) :numeric-midnum))
1623 (otherwise j)))
1624 ((:numeric-midnum)
1625 (case cat
1626 ((:numeric)
1627 ;; Rule WB11, keep looking
1628 (lookup (+ j next-j) cat))
1629 (otherwise
1630 ;; Rule WB11, WB12 were extended, but the
1631 ;; region didn't end with :numeric, so
1632 ;; backup and break at that point.
1633 (let ((j2 (index-of-previous-non-ignored j)))
1634 (if (< i j2) j2 j)))))
1635 ((:midletter :midnum :midnumlet)
1636 ;; Rule WB14
1637 j)
1638 ((:katakana)
1639 (case cat
1640 ((:katakana :extendnumlet)
1641 ;; Rule WB13, WB13a
1642 (lookup (+ j next-j) cat))
1643 (otherwise j)))
1644 ((:extendnumlet)
1645 (case cat
1646 ((:extendnumlet :aletter :numeric :katakana)
1647 ;; Rule WB13a, WB13b
1648 (lookup (+ j next-j) cat))
1649 (otherwise j)))
1650 (otherwise j)))))))))
1651 (declare (notinline lookup left-context))
1652 (cond ((< i 0)
1653 ;; Rule WB1
1654 0)
1655 ((<= n i)
1656 ;; Rule WB2
1657 n)
1658 (t
1659 (multiple-value-bind (c widep)
1660 (lisp::codepoint s i)
1661 (declare (ignore c))
1662 (lookup (+ i (if (eql widep 1) 2 1)) (left-context i))))))))
1663
1664 (defun string-capitalize-unicode (string &key (start 0) end (casing :simple))
1665 (declare (type (member :simple :full) casing))
1666 (let* ((string (if (stringp string) string (string string)))
1667 (slen (length string)))
1668 (declare (fixnum slen))
1669 (with-output-to-string (result)
1670 (lisp::with-one-string string start end offset
1671 (let ((offset-slen (+ slen offset)))
1672 (declare (fixnum offset-slen))
1673
1674 (write-string string result :start 0 :end start)
1675 (let ((upper (ecase casing
1676 (:simple
1677 #'(lambda (ch)
1678 (multiple-value-bind (hi lo)
1679 (lisp::surrogates (lisp::unicode-upper ch))
1680 (write-char hi result)
1681 (when lo (write-char lo result)))))
1682 (:full
1683 #'(lambda (ch)
1684 (write-string (lisp::unicode-full-case-title ch) result))))))
1685 (do ((start start next)
1686 (next (string-next-word-break string start)
1687 (string-next-word-break string next)))
1688 ((or (= start next)
1689 (>= start end)))
1690 ;; Convert the first character of the word to upper
1691 ;; case, and then make the rest of the word lowercase.
1692 (funcall upper (lisp::codepoint string start))
1693 (write-string (string-downcase string :start (1+ start) :end next :casing casing)
1694 result :start (1+ start) :end next)))
1695 (write-string string result :start end :end offset-slen))))))
1696
1697
1698 ;; Some utilities
1699 (defun codepoints-string (seq)
1700 "Convert a sequence of codepoints to a string. Codepoints outside
1701 the basic multilingual plane (BMP) are converted into the
1702 corresponding surrogate pairs."
1703 (with-output-to-string (s)
1704 (map nil #'(lambda (c)
1705 (multiple-value-bind (hi lo)
1706 (surrogates c)
1707 (write-char hi s)
1708 (when lo (write-char lo s))))
1709 seq)))
1710
1711 (defun string-codepoints (s)
1712 "Convert a string to a list of corresponding code points. Surrogate
1713 pairs in the string are converted into the correspoinding
1714 codepoint."
1715 (declare (type simple-string s))
1716 (let ((len (length s))
1717 cp)
1718 (do ((idx 0))
1719 ((>= idx len))
1720 (multiple-value-bind (c widep)
1721 (codepoint s idx)
1722 (if widep
1723 (incf idx 2)
1724 (incf idx))
1725 (push c cp)))
1726 (nreverse cp)))

  ViewVC Help
Powered by ViewVC 1.1.5