/[cmucl]/src/code/string.lisp
ViewVC logotype

Contents of /src/code/string.lisp

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.28 - (hide annotations)
Wed Oct 13 18:00:44 2010 UTC (3 years, 6 months ago) by rtoy
Branch: MAIN
Changes since 1.27: +8 -8 lines
Some changes to replace calls to gettext with _"" or _N"" for things
compiled with and without Unicode.  This is needed so that the pot
files have the same content for both unicode and non-unicode builds.
(The _"" and _N"" are handled by the reader, so things that are
conditionalized out still get processed, unlike using gettext.)
1 ram 1.1 ;;; -*- Log: code.log; Package: Lisp -*-
2     ;;;
3     ;;; **********************************************************************
4 ram 1.3 ;;; This code was written as part of the CMU Common Lisp project at
5     ;;; Carnegie Mellon University, and has been placed in the public domain.
6     ;;;
7     (ext:file-comment
8 rtoy 1.28 "$Header: /tiger/var/lib/cvsroots/cmucl/src/code/string.lisp,v 1.28 2010/10/13 18:00:44 rtoy Exp $")
9 ram 1.3 ;;;
10 ram 1.1 ;;; **********************************************************************
11     ;;;
12 pw 1.11 ;;; Functions to implement strings for CMU Common Lisp
13 ram 1.1 ;;; Written by David Dill
14 ram 1.4 ;;; Rewritten by Skef Wholey, Bill Chiles and Rob MacLachlan.
15 ram 1.1 ;;;
16     ;;; ****************************************************************
17     ;;;
18 ram 1.4 (in-package "LISP")
19 rtoy 1.22 (intl:textdomain "cmucl")
20    
21 rtoy 1.13 (export '(char schar glyph sglyph string
22 ram 1.1 string= string-equal string< string> string<= string>= string/=
23     string-lessp string-greaterp string-not-lessp string-not-greaterp
24     string-not-equal
25 rtoy 1.14 string-to-nfc
26 ram 1.1 make-string
27     string-trim string-left-trim string-right-trim
28     string-upcase
29     string-downcase string-capitalize nstring-upcase nstring-downcase
30     nstring-capitalize))
31    
32 rtoy 1.14 #+unicode
33     (export '(string-to-nfd string-to-nfkd string-to-nfkc))
34 ram 1.1
35 rtoy 1.13 (declaim (inline surrogatep surrogates-to-codepoint codepoint surrogates))
36    
37 rtoy 1.18 (defun surrogatep (char-or-code &optional surrogate-type)
38 rtoy 1.23 "Test if C is a surrogate. C may be either an integer or a
39 rtoy 1.13 character. Surrogate-type indicates what kind of surrogate to test
40     for. :High means to test for the high (leading) surrogate; :Low
41     tests for the low (trailing surrogate). A value of :Any or Nil
42     tests for any surrogate value (high or low)."
43 rtoy 1.27 (declare (type (or character codepoint) char-or-code)
44     (type (or null (member :high :leading :low :trailing :any)) surrogate-type)
45     (optimize (inhibit-warnings 3)))
46 rtoy 1.18 (let ((code (if (characterp char-or-code)
47     (char-code char-or-code)
48     char-or-code)))
49 rtoy 1.13 (ecase surrogate-type
50     ((:high :leading)
51 rtoy 1.21 ;; Test for high surrogate (#xD800 to #xDBFF)
52     (= #b110110 (ash code -10)))
53 rtoy 1.13 ((:low :trailing)
54 rtoy 1.21 ;; Test for low surrogate (#xDC00 to #xDFFF)
55     (= #b110111 (ash code -10)))
56 rtoy 1.13 ((:any nil)
57 rtoy 1.21 ;; Test for any surrogate (#xD800 to #xDFFF)
58     (= #b11011 (ash code -11))))))
59 rtoy 1.13
60 rtoy 1.18 (defun surrogates-to-codepoint (hi-surrogate-char lo-surrogate-char)
61 rtoy 1.23 "Convert the given Hi and Lo surrogate characters to the
62 rtoy 1.13 corresponding codepoint value"
63 rtoy 1.18 (declare (type character hi-surrogate-char lo-surrogate-char))
64     (+ (ash (- (the (integer #xD800 #xDBFF) (char-code hi-surrogate-char)) #xD800) 10)
65     (the (integer #xDC00 #xDFFF) (char-code lo-surrogate-char)) #x2400))
66 rtoy 1.13
67     (defun codepoint (string i &optional (end (length string)))
68 rtoy 1.23 "Return the codepoint value from String at position I. If that
69 rtoy 1.13 position is a surrogate, it is combined with either the previous or
70     following character (when possible) to compute the codepoint. The
71     second return value is NIL if the position is not a surrogate pair.
72     Otherwise +1 or -1 is returned if the position is the high or low
73     surrogate value, respectively."
74     (declare (type simple-string string) (type kernel:index i end))
75     (let ((code (char-code (schar string i))))
76     (cond ((and (surrogatep code :high) (< (1+ i) end))
77     (let ((tmp (char-code (schar string (1+ i)))))
78     (if (surrogatep tmp :low)
79 rtoy 1.27 (values (truly-the codepoint (+ (ash (- code #xD800) 10) tmp #x2400))
80     +1)
81     (values (truly-the codepoint code) nil))))
82 rtoy 1.13 ((and (surrogatep code :low) (> i 0))
83     (let ((tmp (char-code (schar string (1- i)))))
84     (if (surrogatep tmp :high)
85 rtoy 1.27 (values (truly-the codepoint (+ (ash (- tmp #xD800) 10) code #x2400))
86     -1)
87     (values (truly-the codepoint code) nil))))
88     (t (values (truly-the codepoint code) nil)))))
89 rtoy 1.13
90     (defun surrogates (codepoint)
91 rtoy 1.23 "Return the high and low surrogate characters for Codepoint. If
92 rtoy 1.13 Codepoint is in the BMP, the first return value is the corresponding
93     character and the second is NIL."
94     (declare (type codepoint codepoint))
95     (if (< codepoint #x10000)
96     (values (code-char codepoint) nil)
97     (let* ((tmp (- codepoint #x10000))
98     (hi (logior (ldb (byte 10 10) tmp) #xD800))
99     (lo (logior (ldb (byte 10 0) tmp) #xDC00)))
100     (values (code-char hi) (code-char lo)))))
101    
102     (defun (setf codepoint) (codepoint string i)
103 rtoy 1.23 "Set the codepoint at string position I to the Codepoint. If the
104 rtoy 1.13 codepoint requires a surrogate pair, the high (leading surrogate) is
105     stored at position I and the low (trailing) surrogate is stored at
106     I+1"
107     (declare (type codepoint codepoint)
108     (type simple-string string))
109     (let ((widep nil))
110     (multiple-value-bind (hi lo)
111     (surrogates codepoint)
112     (setf (aref string i) hi)
113     (when lo
114     (setf (aref string (1+ i)) lo)
115     (setf widep t)))
116     (values codepoint widep)))
117    
118 rtoy 1.15 #+unicode
119 rtoy 1.13 (defun utf16-string-p (string)
120 rtoy 1.22 _N"Check if String is a valid UTF-16 string. If the string is valid,
121 rtoy 1.13 T is returned. If the string is not valid, NIL is returned, and the
122     second value is the index into the string of the invalid character.
123     A string is also invalid if it contains any unassigned codepoints."
124     (do ((len (length string))
125     (index 0 (1+ index)))
126     ((>= index len)
127     t)
128     (multiple-value-bind (codepoint wide)
129     (codepoint string index)
130     ;; We step through the string in order. If there are any
131     ;; surrogates pairs, we must reach the lead surrogate first,
132     ;; which means WIDE is +1. Otherwise, we have an invalid
133     ;; surrogate pair. If we get any codepoint that is in the
134     ;; surrogate range, we also have an invalid string. An
135     ;; unassigned codepoint is also considered invalid.
136     (when (or (eq wide -1)
137     (surrogatep codepoint)
138 rtoy 1.17 (not (unicode-assigned-codepoint-p codepoint)))
139 rtoy 1.13 (return-from utf16-string-p (values nil index)))
140     (when wide (incf index)))))
141    
142 ram 1.1 (defun string (X)
143 rtoy 1.23 "Coerces X into a string. If X is a string, X is returned. If X is a
144 rtoy 1.13 symbol, X's pname is returned. If X is a character then a one element
145     string containing that character is returned. If X cannot be coerced
146     into a string, an error occurs."
147 ram 1.1 (cond ((stringp x) x)
148     ((symbolp x) (symbol-name x))
149     ((characterp x)
150     (let ((res (make-string 1)))
151     (setf (schar res 0) x) res))
152     (t
153 pw 1.11 (error 'simple-type-error
154     :datum x
155     :expected-type '(or string symbol character)
156 rtoy 1.24 :format-control (intl:gettext "~S cannot be coerced to a string.")
157 pw 1.11 :format-arguments (list x)))))
158 ram 1.1
159     ;;; With-One-String is used to set up some string hacking things. The keywords
160     ;;; are parsed, and the string is hacked into a simple-string.
161    
162     (eval-when (compile)
163    
164     (defmacro with-one-string (string start end cum-offset &rest forms)
165 ram 1.4 `(let ((,string (if (stringp ,string) ,string (string ,string))))
166 pw 1.10 ;; Optimizer may prove STRING is one.
167     (declare (optimize (ext:inhibit-warnings 3)))
168 ram 1.4 (with-array-data ((,string ,string :offset-var ,cum-offset)
169     (,start ,start)
170     (,end (or ,end (length (the vector ,string)))))
171     ,@forms)))
172 ram 1.1
173     )
174    
175     ;;; With-String is like With-One-String, but doesn't parse keywords.
176    
177     (eval-when (compile)
178    
179     (defmacro with-string (string &rest forms)
180 ram 1.4 `(let ((,string (if (stringp ,string) ,string (string ,string))))
181     (with-array-data ((,string ,string)
182     (start)
183     (end (length (the vector ,string))))
184     ,@forms)))
185 ram 1.1
186     )
187    
188     ;;; With-Two-Strings is used to set up string comparison operations. The
189     ;;; keywords are parsed, and the strings are hacked into simple-strings.
190    
191     (eval-when (compile)
192    
193     (defmacro with-two-strings (string1 string2 start1 end1 cum-offset-1
194     start2 end2 &rest forms)
195 ram 1.4 `(let ((,string1 (if (stringp ,string1) ,string1 (string ,string1)))
196     (,string2 (if (stringp ,string2) ,string2 (string ,string2))))
197     (with-array-data ((,string1 ,string1 :offset-var ,cum-offset-1)
198     (,start1 ,start1)
199     (,end1 (or ,end1 (length (the vector ,string1)))))
200     (with-array-data ((,string2 ,string2)
201     (,start2 ,start2)
202     (,end2 (or ,end2 (length (the vector ,string2)))))
203     ,@forms))))
204 ram 1.1
205     )
206 wlott 1.2
207 ram 1.1
208     (defun char (string index)
209 rtoy 1.23 "Given a string and a non-negative integer index less than the length of
210 ram 1.1 the string, returns the character object representing the character at
211     that position in the string."
212 ram 1.4 (declare (optimize (safety 1)))
213 ram 1.1 (char string index))
214    
215     (defun %charset (string index new-el)
216 ram 1.4 (declare (optimize (safety 1)))
217 ram 1.1 (setf (char string index) new-el))
218    
219     (defun schar (string index)
220 rtoy 1.23 "SCHAR returns the character object at an indexed position in a string
221 rtoy 1.13 just as CHAR does, except the string must be a simple-string."
222 ram 1.4 (declare (optimize (safety 1)))
223 ram 1.1 (schar string index))
224    
225     (defun %scharset (string index new-el)
226 ram 1.4 (declare (optimize (safety 1)))
227 ram 1.1 (setf (schar string index) new-el))
228    
229     (defun string=* (string1 string2 start1 end1 start2 end2)
230 ram 1.4 (with-two-strings string1 string2 start1 end1 offset1 start2 end2
231     (not (%sp-string-compare string1 start1 end1 string2 start2 end2))))
232 ram 1.1
233    
234     (defun string/=* (string1 string2 start1 end1 start2 end2)
235 ram 1.4 (with-two-strings string1 string2 start1 end1 offset1 start2 end2
236     (let ((comparison (%sp-string-compare string1 start1 end1
237     string2 start2 end2)))
238     (if comparison (- (the fixnum comparison) offset1)))))
239 ram 1.1
240     (eval-when (compile eval)
241    
242     ;;; Lessp is true if the desired expansion is for string<* or string<=*.
243     ;;; Equalp is true if the desired expansion is for string<=* or string>=*.
244     (defmacro string<>=*-body (lessp equalp)
245     (let ((offset1 (gensym)))
246 ram 1.4 `(with-two-strings string1 string2 start1 end1 ,offset1 start2 end2
247     (let ((index (%sp-string-compare string1 start1 end1
248     string2 start2 end2)))
249     (if index
250 ram 1.8 (cond ((= (the fixnum index) (the fixnum end1))
251     ,(if lessp
252     `(- (the fixnum index) ,offset1)
253     `nil))
254     ((= (+ (the fixnum index) (- start2 start1))
255     (the fixnum end2))
256     ,(if lessp
257     `nil
258 rtoy 1.13 `(- (the fixnum index) ,offset1)))
259     #-unicode
260 ram 1.4 ((,(if lessp 'char< 'char>)
261     (schar string1 index)
262     (schar string2 (+ (the fixnum index) (- start2 start1))))
263     (- (the fixnum index) ,offset1))
264 rtoy 1.13 #-unicode
265     (t nil)
266     #+unicode
267     (t
268     ;; Compare in code point order. See
269     ;; http://icu-project.org/docs/papers/utf16_code_point_order.html
270     (flet ((fixup (code)
271     (if (>= code #xe000)
272     (- code #x800)
273     (+ code #x2000))))
274     (declare (inline fixup))
275     (let* ((c1 (char-code (schar string1 index)))
276     (c2 (char-code (schar string2
277     (+ (the fixnum index)
278     (- start2 start1))))))
279     (cond ((and (>= c1 #xd800)
280     (>= c2 #xd800))
281     (let ((fix-c1 (fixup c1))
282     (fix-c2 (fixup c2)))
283     (if (,(if lessp '< '>) fix-c1 fix-c2)
284     (- (the fixnum index) ,offset1)
285     nil)))
286     (t
287     (if (,(if lessp '< '>) c1 c2)
288     (- (the fixnum index) ,offset1)
289     nil)))))))
290 ram 1.4 ,(if equalp `(- (the fixnum end1) ,offset1) 'nil))))))
291 ram 1.1 ) ; eval-when
292    
293     (defun string<* (string1 string2 start1 end1 start2 end2)
294     (declare (fixnum start1 start2))
295     (string<>=*-body t nil))
296    
297     (defun string>* (string1 string2 start1 end1 start2 end2)
298     (declare (fixnum start1 start2))
299     (string<>=*-body nil nil))
300    
301     (defun string<=* (string1 string2 start1 end1 start2 end2)
302     (declare (fixnum start1 start2))
303     (string<>=*-body t t))
304    
305     (defun string>=* (string1 string2 start1 end1 start2 end2)
306     (declare (fixnum start1 start2))
307     (string<>=*-body nil t))
308    
309    
310    
311     (defun string< (string1 string2 &key (start1 0) end1 (start2 0) end2)
312 rtoy 1.23 "Given two strings, if the first string is lexicographically less than
313 ram 1.1 the second string, returns the longest common prefix (using char=)
314     of the two strings. Otherwise, returns ()."
315     (string<* string1 string2 start1 end1 start2 end2))
316    
317     (defun string> (string1 string2 &key (start1 0) end1 (start2 0) end2)
318 rtoy 1.23 "Given two strings, if the first string is lexicographically greater than
319 ram 1.1 the second string, returns the longest common prefix (using char=)
320     of the two strings. Otherwise, returns ()."
321     (string>* string1 string2 start1 end1 start2 end2))
322    
323    
324     (defun string<= (string1 string2 &key (start1 0) end1 (start2 0) end2)
325 rtoy 1.23 "Given two strings, if the first string is lexicographically less than
326 ram 1.1 or equal to the second string, returns the longest common prefix
327     (using char=) of the two strings. Otherwise, returns ()."
328     (string<=* string1 string2 start1 end1 start2 end2))
329    
330     (defun string>= (string1 string2 &key (start1 0) end1 (start2 0) end2)
331 rtoy 1.23 "Given two strings, if the first string is lexicographically greater
332 ram 1.1 than or equal to the second string, returns the longest common prefix
333     (using char=) of the two strings. Otherwise, returns ()."
334     (string>=* string1 string2 start1 end1 start2 end2))
335    
336     (defun string= (string1 string2 &key (start1 0) end1 (start2 0) end2)
337 rtoy 1.23 "Given two strings (string1 and string2), and optional integers start1,
338 ram 1.1 start2, end1 and end2, compares characters in string1 to characters in
339     string2 (using char=)."
340     (string=* string1 string2 start1 end1 start2 end2))
341    
342     (defun string/= (string1 string2 &key (start1 0) end1 (start2 0) end2)
343 rtoy 1.23 "Given two strings, if the first string is not lexicographically equal
344 ram 1.1 to the second string, returns the longest common prefix (using char=)
345     of the two strings. Otherwise, returns ()."
346     (string/=* string1 string2 start1 end1 start2 end2))
347    
348    
349     (eval-when (compile eval)
350    
351     ;;; STRING-NOT-EQUAL-LOOP is used to generate character comparison loops for
352     ;;; STRING-EQUAL and STRING-NOT-EQUAL.
353     (defmacro string-not-equal-loop (end end-value
354     &optional (abort-value nil abortp))
355     (declare (fixnum end))
356     (let ((end-test (if (= end 1)
357     `(= index1 (the fixnum end1))
358     `(= index2 (the fixnum end2)))))
359     `(do ((index1 start1 (1+ index1))
360     (index2 start2 (1+ index2)))
361     (,(if abortp
362     end-test
363     `(or ,end-test
364     (not (char-equal (schar string1 index1)
365     (schar string2 index2)))))
366     ,end-value)
367     (declare (fixnum index1 index2))
368     ,@(if abortp
369     `((if (not (char-equal (schar string1 index1)
370     (schar string2 index2)))
371     (return ,abort-value)))))))
372 rtoy 1.13 ) ; eval-when
373 ram 1.1
374 rtoy 1.13 #+unicode
375     (defun string-case-fold (string &key (start 0) end (casing :simple))
376 rtoy 1.22 _N"Return a new string with the case folded according to Casing as follows:
377 rtoy 1.13
378     :SIMPLE Unicode simple case folding (preserving length)
379     :FULL Unicode full case folding (possibly changing length)
380    
381     Default Casing is :SIMPLE."
382     (ecase casing
383     (:simple
384     (with-output-to-string (s)
385     (with-one-string string start end offset
386     (do ((index offset (1+ index)))
387     ((>= index end))
388     (multiple-value-bind (code widep)
389     (codepoint string index)
390     (when widep (incf index))
391     (multiple-value-bind (hi lo)
392     (surrogates (unicode-case-fold-simple code))
393     (write-char hi s)
394     (when lo (write-char lo s))))))))
395     (:full
396     (with-output-to-string (s)
397     (with-one-string string start end offset
398     (do ((index offset (1+ index)))
399     ((>= index end))
400     (multiple-value-bind (code widep)
401     (codepoint string index)
402     (when widep (incf index))
403     (write-string (unicode-case-fold-full code) s))))))))
404 ram 1.1
405     (defun string-equal (string1 string2 &key (start1 0) end1 (start2 0) end2)
406 rtoy 1.23 "Given two strings (string1 and string2), and optional integers start1,
407 ram 1.1 start2, end1 and end2, compares characters in string1 to characters in
408     string2 (using char-equal)."
409     (declare (fixnum start1 start2))
410 ram 1.4 (with-two-strings string1 string2 start1 end1 offset1 start2 end2
411     (let ((slen1 (- (the fixnum end1) start1))
412     (slen2 (- (the fixnum end2) start2)))
413     (declare (fixnum slen1 slen2))
414     (if (or (minusp slen1) (minusp slen2))
415     ;;prevent endless looping later.
416 rtoy 1.24 (error (intl:gettext "Improper bounds for string comparison.")))
417 ram 1.4 (if (= slen1 slen2)
418     ;;return () immediately if lengths aren't equal.
419 rtoy 1.13 (string-not-equal-loop 1 t nil)))))
420 ram 1.1
421     (defun string-not-equal (string1 string2 &key (start1 0) end1 (start2 0) end2)
422 rtoy 1.23 "Given two strings, if the first string is not lexicographically equal
423 ram 1.1 to the second string, returns the longest common prefix (using char-equal)
424     of the two strings. Otherwise, returns ()."
425 ram 1.4 (with-two-strings string1 string2 start1 end1 offset1 start2 end2
426     (let ((slen1 (- end1 start1))
427     (slen2 (- end2 start2)))
428     (declare (fixnum slen1 slen2))
429     (if (or (minusp slen1) (minusp slen2))
430     ;;prevent endless looping later.
431 rtoy 1.24 (error (intl:gettext "Improper bounds for string comparison.")))
432 ram 1.4 (cond ((or (minusp slen1) (or (minusp slen2)))
433 rtoy 1.24 (error (intl:gettext "Improper substring for comparison.")))
434 ram 1.4 ((= slen1 slen2)
435     (string-not-equal-loop 1 nil (- index1 offset1)))
436     ((< slen1 slen2)
437     (string-not-equal-loop 1 (- index1 offset1)))
438     (t
439     (string-not-equal-loop 2 (- index1 offset1)))))))
440 ram 1.1
441    
442    
443     (eval-when (compile eval)
444    
445     ;;; STRING-LESS-GREATER-EQUAL-TESTS returns a test on the lengths of string1
446     ;;; and string2 and a test on the current characters from string1 and string2
447     ;;; for the following macro.
448     (defun string-less-greater-equal-tests (lessp equalp)
449     (if lessp
450     (if equalp
451     ;; STRING-NOT-GREATERP
452 rtoy 1.13 (values '<=
453     #-unicode `(not (char-greaterp char1 char2))
454     #+unicode `(<= char1 char2))
455 ram 1.1 ;; STRING-LESSP
456 rtoy 1.13 (values '<
457     #-unicode `(char-lessp char1 char2)
458     #+unicode `(< char1 char2)))
459 ram 1.1 (if equalp
460     ;; STRING-NOT-LESSP
461 rtoy 1.13 (values '>=
462     #-unicode `(not (char-lessp char1 char2))
463     #+unicode `(>= char1 char2))
464 ram 1.1 ;; STRING-GREATERP
465 rtoy 1.13 (values '>
466     #-unicode `(char-greaterp char1 char2)
467     #+unicode `(> char1 char2)))))
468 ram 1.1
469 rtoy 1.13 #-unicode
470 ram 1.1 (defmacro string-less-greater-equal (lessp equalp)
471     (multiple-value-bind (length-test character-test)
472     (string-less-greater-equal-tests lessp equalp)
473 ram 1.4 `(with-two-strings string1 string2 start1 end1 offset1 start2 end2
474     (let ((slen1 (- (the fixnum end1) start1))
475     (slen2 (- (the fixnum end2) start2)))
476     (declare (fixnum slen1 slen2))
477     (if (or (minusp slen1) (minusp slen2))
478     ;;prevent endless looping later.
479 rtoy 1.24 (error (intl:gettext "Improper bounds for string comparison.")))
480 ram 1.4 (do ((index1 start1 (1+ index1))
481     (index2 start2 (1+ index2))
482     (char1)
483     (char2))
484     ((or (= index1 (the fixnum end1)) (= index2 (the fixnum end2)))
485     (if (,length-test slen1 slen2) (- index1 offset1)))
486     (declare (fixnum index1 index2))
487     (setq char1 (schar string1 index1))
488     (setq char2 (schar string2 index2))
489     (if (not (char-equal char1 char2))
490     (if ,character-test
491     (return (- index1 offset1))
492     (return ()))))))))
493 ram 1.1
494 rtoy 1.13 ;; Convert to lowercase for case folding, to match what Unicode
495     ;; CaseFolding.txt says. An example where this matters: U+1E9E maps
496     ;; to U+00DF. But the uppercase version of U+00DF is U+00DF.
497     #+unicode
498     (defmacro equal-char-codepoint (codepoint)
499     `(let ((ch ,codepoint))
500     ;; Handle ASCII separately for bootstrapping and for unidata missing.
501     (if (< 64 ch 91)
502     (+ ch 32)
503     #-(and unicode (not unicode-bootstrap))
504     ch
505     #+(and unicode (not unicode-bootstrap))
506     (if (> ch 127) (unicode-lower ch) ch))))
507    
508     #+unicode
509     (defmacro string-less-greater-equal (lessp equalp)
510     (multiple-value-bind (length-test character-test)
511     (string-less-greater-equal-tests lessp equalp)
512     `(with-two-strings string1 string2 start1 end1 offset1 start2 end2
513     (let ((slen1 (- (the fixnum end1) start1))
514     (slen2 (- (the fixnum end2) start2)))
515     (declare (fixnum slen1 slen2))
516     (if (or (minusp slen1) (minusp slen2))
517     ;;prevent endless looping later.
518 rtoy 1.24 (error (intl:gettext "Improper bounds for string comparison.")))
519 rtoy 1.13 (do ((index1 start1 (1+ index1))
520     (index2 start2 (1+ index2)))
521     ((or (= index1 (the fixnum end1)) (= index2 (the fixnum end2)))
522     (if (,length-test slen1 slen2) (- index1 offset1)))
523     (declare (fixnum index1 index2))
524     (multiple-value-bind (char1 wide1)
525     (codepoint string1 index1)
526     (declare (type codepoint char1))
527     (multiple-value-bind (char2 wide2)
528     (codepoint string2 index2)
529     (declare (type codepoint char2))
530     (setf char1 (equal-char-codepoint char1))
531     (setf char2 (equal-char-codepoint char2))
532     (if (= char1 char2)
533     (progn
534     (when wide1 (incf index1))
535     (when wide2 (incf index2)))
536     (if ,character-test
537     (return (- index1 offset1))
538     (return ()))))))))))
539    
540 ram 1.1 ) ; eval-when
541    
542     (defun string-lessp* (string1 string2 start1 end1 start2 end2)
543     (declare (fixnum start1 start2))
544     (string-less-greater-equal t nil))
545    
546     (defun string-greaterp* (string1 string2 start1 end1 start2 end2)
547     (declare (fixnum start1 start2))
548     (string-less-greater-equal nil nil))
549    
550     (defun string-not-lessp* (string1 string2 start1 end1 start2 end2)
551     (declare (fixnum start1 start2))
552     (string-less-greater-equal nil t))
553    
554     (defun string-not-greaterp* (string1 string2 start1 end1 start2 end2)
555     (declare (fixnum start1 start2))
556     (string-less-greater-equal t t))
557    
558     (defun string-lessp (string1 string2 &key (start1 0) end1 (start2 0) end2)
559 rtoy 1.23 "Given two strings, if the first string is lexicographically less than
560 ram 1.1 the second string, returns the longest common prefix (using char-equal)
561     of the two strings. Otherwise, returns ()."
562     (string-lessp* string1 string2 start1 end1 start2 end2))
563    
564     (defun string-greaterp (string1 string2 &key (start1 0) end1 (start2 0) end2)
565 rtoy 1.23 "Given two strings, if the first string is lexicographically greater than
566 ram 1.1 the second string, returns the longest common prefix (using char-equal)
567     of the two strings. Otherwise, returns ()."
568     (string-greaterp* string1 string2 start1 end1 start2 end2))
569    
570     (defun string-not-lessp (string1 string2 &key (start1 0) end1 (start2 0) end2)
571 rtoy 1.23 "Given two strings, if the first string is lexicographically greater
572 ram 1.1 than or equal to the second string, returns the longest common prefix
573     (using char-equal) of the two strings. Otherwise, returns ()."
574     (string-not-lessp* string1 string2 start1 end1 start2 end2))
575    
576     (defun string-not-greaterp (string1 string2 &key (start1 0) end1 (start2 0)
577     end2)
578 rtoy 1.23 "Given two strings, if the first string is lexicographically less than
579 ram 1.1 or equal to the second string, returns the longest common prefix
580     (using char-equal) of the two strings. Otherwise, returns ()."
581     (string-not-greaterp* string1 string2 start1 end1 start2 end2))
582    
583    
584 dtc 1.9 (defun make-string (count &key element-type ((:initial-element fill-char)))
585 rtoy 1.23 "Given a character count and an optional fill character, makes and returns
586 rtoy 1.13 a new string Count long filled with the fill character."
587 emarsden 1.12 (declare (type fixnum count))
588     (assert (subtypep element-type 'character))
589 ram 1.1 (if fill-char
590     (do ((i 0 (1+ i))
591     (string (make-string count)))
592     ((= i count) string)
593     (declare (fixnum i))
594     (setf (schar string i) fill-char))
595     (make-string count)))
596    
597 rtoy 1.13 (defun string-upcase-simple (string &key (start 0) end)
598 ram 1.1 (declare (fixnum start))
599 ram 1.5 (let* ((string (if (stringp string) string (string string)))
600     (slen (length string)))
601 ram 1.4 (declare (fixnum slen))
602 ram 1.1 (with-one-string string start end offset
603     (let ((offset-slen (+ slen offset))
604     (newstring (make-string slen)))
605     (declare (fixnum offset-slen))
606     (do ((index offset (1+ index))
607     (new-index 0 (1+ new-index)))
608     ((= index start))
609     (declare (fixnum index new-index))
610     (setf (schar newstring new-index) (schar string index)))
611     (do ((index start (1+ index))
612     (new-index (- start offset) (1+ new-index)))
613     ((= index (the fixnum end)))
614     (declare (fixnum index new-index))
615 rtoy 1.13 (multiple-value-bind (code wide) (codepoint string index)
616     (when wide (incf index))
617     ;; Handle ASCII specially because this is called early in
618     ;; initialization, before unidata is available.
619     (cond ((< 96 code 123) (decf code 32))
620 rtoy 1.15 #+unicode
621 rtoy 1.13 ((> code 127) (setq code (unicode-upper code))))
622     ;;@@ WARNING: this may, in theory, need to extend newstring
623     ;; but that never actually occurs as of Unicode 5.1.0,
624     ;; so I'm just going to ignore it for now...
625     (multiple-value-bind (hi lo) (surrogates code)
626     (setf (schar newstring new-index) hi)
627     (when lo
628     (setf (schar newstring (incf new-index)) lo)))))
629     ;;@@ WARNING: see above
630 ram 1.1 (do ((index end (1+ index))
631     (new-index (- (the fixnum end) offset) (1+ new-index)))
632     ((= index offset-slen))
633     (declare (fixnum index new-index))
634     (setf (schar newstring new-index) (schar string index)))
635     newstring))))
636    
637 rtoy 1.15 #+unicode
638 rtoy 1.13 (defun string-upcase-full (string &key (start 0) end)
639     (declare (fixnum start))
640     (let* ((string (if (stringp string) string (string string)))
641     (slen (length string)))
642     (declare (fixnum slen))
643     (with-output-to-string (s)
644     (with-one-string string start end offset
645     (let ((offset-slen (+ slen offset)))
646     (declare (fixnum offset-slen))
647     (write-string string s :start offset :end start)
648     (do ((index start (1+ index)))
649     ((= index (the fixnum end)))
650     (declare (fixnum index))
651     (multiple-value-bind (code wide)
652     (codepoint string index)
653     (when wide (incf index))
654     ;; Handle ASCII specially because this is called early in
655     ;; initialization, before unidata is available.
656     (cond ((< 96 code 123)
657     (write-char (code-char (decf code 32)) s))
658     ((> code 127)
659     (write-string (unicode-full-case-upper code) s))
660     (t
661     (multiple-value-bind (hi lo)
662     (surrogates code)
663     (write-char hi s)
664     (when lo
665     (write-char lo s)))))))
666     (write-string string s :start end :end offset-slen))))))
667    
668     (defun string-upcase (string &key (start 0) end #+unicode (casing :simple))
669     #-unicode
670 rtoy 1.22 _N"Given a string, returns a new string that is a copy of it with all
671 rtoy 1.13 lower case alphabetic characters converted to uppercase."
672     #+unicode
673 rtoy 1.22 _N"Given a string, returns a new string that is a copy of it with all
674 rtoy 1.13 lower case alphabetic characters converted to uppercase. Casing is
675     :simple or :full for simple or full case conversion, respectively."
676     (declare (fixnum start))
677     #-unicode
678     (string-upcase-simple string :start start :end end)
679     #+unicode
680     (if (eq casing :simple)
681     (string-upcase-simple string :start start :end end)
682     (string-upcase-full string :start start :end end)))
683    
684     (defun string-downcase-simple (string &key (start 0) end)
685 ram 1.1 (declare (fixnum start))
686 ram 1.5 (let* ((string (if (stringp string) string (string string)))
687     (slen (length string)))
688 ram 1.4 (declare (fixnum slen))
689 ram 1.1 (with-one-string string start end offset
690     (let ((offset-slen (+ slen offset))
691     (newstring (make-string slen)))
692     (declare (fixnum offset-slen))
693     (do ((index offset (1+ index))
694     (new-index 0 (1+ new-index)))
695     ((= index start))
696     (declare (fixnum index new-index))
697     (setf (schar newstring new-index) (schar string index)))
698     (do ((index start (1+ index))
699     (new-index (- start offset) (1+ new-index)))
700     ((= index (the fixnum end)))
701     (declare (fixnum index new-index))
702 rtoy 1.13 (multiple-value-bind (code wide) (codepoint string index)
703     (when wide (incf index))
704     ;; Handle ASCII specially because this is called early in
705     ;; initialization, before unidata is available.
706     (cond ((< 64 code 91) (incf code 32))
707     ((> code 127) (setq code (unicode-lower code))))
708     ;;@@ WARNING: this may, in theory, need to extend newstring
709     ;; but that never actually occurs as of Unicode 5.1.0,
710     ;; so I'm just going to ignore it for now...
711     (multiple-value-bind (hi lo) (surrogates code)
712     (setf (schar newstring new-index) hi)
713     (when lo
714     (setf (schar newstring (incf new-index)) lo)))))
715     ;;@@ WARNING: see above
716 ram 1.1 (do ((index end (1+ index))
717     (new-index (- (the fixnum end) offset) (1+ new-index)))
718     ((= index offset-slen))
719     (declare (fixnum index new-index))
720     (setf (schar newstring new-index) (schar string index)))
721     newstring))))
722    
723 rtoy 1.15 #+unicode
724 rtoy 1.13 (defun string-downcase-full (string &key (start 0) end)
725     (declare (fixnum start))
726     (let* ((string (if (stringp string) string (string string)))
727     (slen (length string)))
728     (declare (fixnum slen))
729     (with-output-to-string (s)
730     (with-one-string string start end offset
731     (let ((offset-slen (+ slen offset)))
732     (declare (fixnum offset-slen))
733     (write-string string s :start offset :end start)
734     (do ((index start (1+ index)))
735     ((= index (the fixnum end)))
736     (declare (fixnum index))
737     (multiple-value-bind (code wide)
738     (codepoint string index)
739     (when wide (incf index))
740     ;; Handle ASCII specially because this is called early in
741     ;; initialization, before unidata is available.
742     (cond ((< 64 code 91)
743     (write-char (code-char (incf code 32)) s))
744     ((> code 127)
745     (write-string (unicode-full-case-lower code) s))
746     (t
747     (multiple-value-bind (hi lo)
748     (surrogates code)
749     (write-char hi s)
750     (when lo
751     (write-char lo s)))))))
752     (write-string string s :start end :end offset-slen))))))
753    
754     (defun string-downcase (string &key (start 0) end #+unicode (casing :simple))
755     #-unicode
756 rtoy 1.22 _N"Given a string, returns a new string that is a copy of it with all
757 rtoy 1.13 upper case alphabetic characters converted to lowercase."
758     #+unicode
759 rtoy 1.22 _N"Given a string, returns a new string that is a copy of it with all
760 rtoy 1.13 upper case alphabetic characters converted to lowercase. Casing is
761     :simple or :full for simple or full case conversion, respectively."
762     (declare (fixnum start))
763     #-unicode
764     (string-downcase-simple string :start start :end end)
765     #+unicode
766     (if (eq casing :simple)
767     (string-downcase-simple string :start start :end end)
768     (string-downcase-full string :start start :end end)))
769    
770     (defun string-capitalize-simple (string &key (start 0) end)
771 ram 1.1 (declare (fixnum start))
772 ram 1.5 (let* ((string (if (stringp string) string (string string)))
773     (slen (length string)))
774 ram 1.4 (declare (fixnum slen))
775 ram 1.1 (with-one-string string start end offset
776     (let ((offset-slen (+ slen offset))
777     (newstring (make-string slen)))
778     (declare (fixnum offset-slen))
779     (do ((index offset (1+ index))
780     (new-index 0 (1+ new-index)))
781     ((= index start))
782     (declare (fixnum index new-index))
783     (setf (schar newstring new-index) (schar string index)))
784     (do ((index start (1+ index))
785     (new-index (- start offset) (1+ new-index))
786     (newword t)
787     (char ()))
788     ((= index (the fixnum end)))
789     (declare (fixnum index new-index))
790     (setq char (schar string index))
791     (cond ((not (alphanumericp char))
792     (setq newword t))
793     (newword
794     ;;char is first case-modifiable after non-case-modifiable
795 rtoy 1.13 (setq char (char-titlecase char))
796 ram 1.1 (setq newword ()))
797     ;;char is case-modifiable, but not first
798     (t (setq char (char-downcase char))))
799     (setf (schar newstring new-index) char))
800     (do ((index end (1+ index))
801     (new-index (- (the fixnum end) offset) (1+ new-index)))
802     ((= index offset-slen))
803     (declare (fixnum index new-index))
804     (setf (schar newstring new-index) (schar string index)))
805     newstring))))
806    
807 rtoy 1.15 #+unicode
808 rtoy 1.13 (defun string-capitalize-full (string &key (start 0) end)
809     (declare (fixnum start))
810     (let* ((string (if (stringp string) string (string string)))
811     (slen (length string)))
812     (declare (fixnum slen))
813     (with-output-to-string (s)
814     (with-one-string string start end offset
815     (let ((offset-slen (+ slen offset)))
816     (declare (fixnum offset-slen))
817     (write-string string s :start offset :end start)
818     (flet ((alphanump (m)
819     (or (< 47 m 58) (< 64 m 91) (< 96 m 123)
820     #+(and unicode (not unicode-bootstrap))
821     (and (> m 127)
822     (<= +unicode-category-letter+ (unicode-category m)
823     (+ +unicode-category-letter+ #x0F))))))
824     (do ((index start (1+ index))
825     (newword t))
826     ((= index (the fixnum end)))
827     (declare (fixnum index))
828     (multiple-value-bind (code wide)
829     (codepoint string index)
830     (when wide (incf index))
831     (cond ((not (alphanump code))
832     (multiple-value-bind (hi lo)
833     (surrogates code)
834     (write-char hi s)
835     (when lo (write-char lo s)))
836     (setq newword t))
837     (newword
838     ;;char is first case-modifiable after non-case-modifiable
839     (write-string (unicode-full-case-title code) s)
840     (setq newword ()))
841     ;;char is case-modifiable, but not first
842     (t
843     (write-string (unicode-full-case-lower code) s))))))
844     (write-string string s :start end :end offset-slen))))))
845    
846 rtoy 1.19 (defun string-capitalize (string &key (start 0) end
847     #+unicode (casing :simple)
848     #+unicode unicode-word-break)
849 rtoy 1.13 #-unicode
850 rtoy 1.22 _N"Given a string, returns a copy of the string with the first
851 rtoy 1.13 character of each ``word'' converted to upper-case, and remaining
852     chars in the word converted to lower case. A ``word'' is defined
853     to be a string of case-modifiable characters delimited by
854     non-case-modifiable chars."
855     #+unicode
856 rtoy 1.22 _N"Given a string, returns a copy of the string with the first
857 rtoy 1.13 character of each ``word'' converted to upper-case, and remaining
858     chars in the word converted to lower case. A ``word'' is defined
859     to be a string of case-modifiable characters delimited by
860     non-case-modifiable chars. Casing is :simple or :full for
861     simple or full case conversion, respectively."
862    
863     (declare (fixnum start))
864     #-unicode
865     (string-capitalize-simple string :start start :end end)
866     #+unicode
867 rtoy 1.19 (if unicode-word-break
868     (string-capitalize-unicode string :start start :end end :casing casing)
869     (if (eq casing :simple)
870     (string-capitalize-simple string :start start :end end)
871     (string-capitalize-full string :start start :end end))))
872 rtoy 1.13
873 ram 1.1 (defun nstring-upcase (string &key (start 0) end)
874 rtoy 1.23 "Given a string, returns that string with all lower case alphabetic
875 ram 1.1 characters converted to uppercase."
876     (declare (fixnum start))
877 ram 1.4 (let ((save-header string))
878 ram 1.1 (with-one-string string start end offset
879     (do ((index start (1+ index)))
880     ((= index (the fixnum end)))
881     (declare (fixnum index))
882 rtoy 1.13 (multiple-value-bind (code wide) (codepoint string index)
883     (declare (ignore wide))
884     ;; Handle ASCII specially because this is called early in
885     ;; initialization, before unidata is available.
886     (cond ((< 96 code 123) (decf code 32))
887 rtoy 1.15 #+unicode
888 rtoy 1.13 ((> code 127) (setq code (unicode-upper code))))
889     ;;@@ WARNING: this may, in theory, need to extend string
890     ;; (which, obviously, we can't do here. Unless
891     ;; STRING is adjustable, maybe)
892     ;; but that never actually occurs as of Unicode 5.1.0,
893     ;; so I'm just going to ignore it for now...
894     (multiple-value-bind (hi lo) (surrogates code)
895     (setf (schar string index) hi)
896     (when lo
897     (setf (schar string (incf index)) lo))))))
898 ram 1.1 save-header))
899    
900     (defun nstring-downcase (string &key (start 0) end)
901 rtoy 1.23 "Given a string, returns that string with all upper case alphabetic
902 ram 1.1 characters converted to lowercase."
903     (declare (fixnum start))
904 ram 1.4 (let ((save-header string))
905 ram 1.1 (with-one-string string start end offset
906     (do ((index start (1+ index)))
907     ((= index (the fixnum end)))
908     (declare (fixnum index))
909 rtoy 1.13 (multiple-value-bind (code wide) (codepoint string index)
910     (declare (ignore wide))
911     (cond ((< 64 code 91) (incf code 32))
912 rtoy 1.15 #+unicode
913 rtoy 1.13 ((> code 127) (setq code (unicode-lower code))))
914     ;;@@ WARNING: this may, in theory, need to extend string
915     ;; (which, obviously, we can't do here. Unless
916     ;; STRING is adjustable, maybe)
917     ;; but that never actually occurs as of Unicode 5.1.0,
918     ;; so I'm just going to ignore it for now...
919     (multiple-value-bind (hi lo) (surrogates code)
920     (setf (schar string index) hi)
921     (when lo
922     (setf (schar string (incf index)) lo))))))
923 wlott 1.6 save-header))
924 ram 1.1
925     (defun nstring-capitalize (string &key (start 0) end)
926 rtoy 1.23 "Given a string, returns that string with the first
927 ram 1.1 character of each ``word'' converted to upper-case, and remaining
928     chars in the word converted to lower case. A ``word'' is defined
929     to be a string of case-modifiable characters delimited by
930     non-case-modifiable chars."
931     (declare (fixnum start))
932 ram 1.4 (let ((save-header string))
933 ram 1.1 (with-one-string string start end offset
934     (do ((index start (1+ index))
935     (newword t)
936     (char ()))
937     ((= index (the fixnum end)))
938     (declare (fixnum index))
939     (setq char (schar string index))
940     (cond ((not (alphanumericp char))
941     (setq newword t))
942     (newword
943     ;;char is first case-modifiable after non-case-modifiable
944 rtoy 1.13 (setf (schar string index) (char-titlecase char))
945 ram 1.1 (setq newword ()))
946     (t
947     (setf (schar string index) (char-downcase char))))))
948     save-header))
949    
950 rtoy 1.13
951     #+unicode
952     (progn
953     ;; Like string-left-trim, but return the index
954     (defun string-left-trim-index (char-bag string)
955     (with-string string
956     (if (stringp char-bag)
957     ;; When char-bag is a string, we try to do the right thing.
958     ;; Convert char-bag to a list of codepoints and compare the
959     ;; codepoints in the string with this.
960     (let ((code-bag (with-string char-bag
961     (do ((index start (1+ index))
962     (result nil))
963     ((= index end)
964     (nreverse result))
965     (multiple-value-bind (c widep)
966     (codepoint char-bag index)
967     (push c result)
968     (when widep (incf index)))))))
969     (do ((index start (1+ index)))
970     ((= index (the fixnum end))
971     end)
972     (declare (fixnum index))
973     (multiple-value-bind (c widep)
974     (codepoint string index)
975     (unless (find c code-bag)
976     (return-from string-left-trim-index index))
977     (when widep (incf index)))))
978     ;; When char-bag is a list, we just look at each codepoint of
979     ;; STRING to see if it's in char-bag. If char-bag contains a
980     ;; surrogate, we could accidentally trim off a surrogate,
981     ;; leaving an invalid UTF16 string.
982     (do ((index start (1+ index)))
983     ((= index (the fixnum end))
984     end)
985     (declare (fixnum index))
986     (multiple-value-bind (c widep)
987     (codepoint string index)
988     (unless (find c char-bag :key #'char-code)
989     (return-from string-left-trim-index index))
990     (when widep (incf index)))))))
991    
992     (defun string-left-trim (char-bag string)
993 rtoy 1.28 _N"Given a set of characters (a list or string) and a string, returns
994 rtoy 1.13 a copy of the string with the characters in the set removed from the
995     left end. If the set of characters is a string, surrogates will be
996     properly handled."
997     (let ((begin (string-left-trim-index char-bag string)))
998     (with-string string
999 rtoy 1.16 (declare (ignore start))
1000 rtoy 1.13 (subseq string begin end))))
1001    
1002     (defun string-right-trim-index (char-bag string)
1003     (with-string string
1004     (if (stringp char-bag)
1005     ;; When char-bag is a string, we try to do the right thing
1006     ;; with surrogates. Convert char-bag to a list of codepoints
1007     ;; and compare the codepoints in the string with this.
1008     (let ((code-bag (with-string char-bag
1009     (do ((index start (1+ index))
1010     (result nil))
1011     ((= index end)
1012     result)
1013     (multiple-value-bind (c widep)
1014     (codepoint char-bag index)
1015     (push c result)
1016     (when widep (incf index)))))))
1017     (do ((index (1- end) (1- index)))
1018     ((< index start)
1019     start)
1020     (declare (fixnum index))
1021     (multiple-value-bind (c widep)
1022     (codepoint string index)
1023     (unless (find c code-bag)
1024     (return-from string-right-trim-index (1+ index)))
1025     (when widep (decf index)))))
1026     ;; When char-bag is a list, we just look at each codepoint of
1027     ;; STRING to see if it's in char-bag. If char-bag contains a
1028     ;; surrogate, we could accidentally trim off a surrogate,
1029     ;; leaving an invalid UTF16 string.
1030     (do ((index (1- end) (1- index)))
1031     ((< index start)
1032     start)
1033     (declare (fixnum index))
1034     (multiple-value-bind (c widep)
1035     (codepoint string index)
1036     (unless (find c char-bag :key #'char-code)
1037     (return-from string-right-trim-index (1+ index)))
1038     (when widep (decf index)))))))
1039    
1040     (defun string-right-trim (char-bag string)
1041 rtoy 1.28 _N"Given a set of characters (a list or string) and a string, returns
1042 rtoy 1.13 a copy of the string with the characters in the set removed from the
1043     right end. If the set of characters is a string, surrogates will be
1044     properly handled."
1045     (let ((stop (string-right-trim-index char-bag string)))
1046     (with-string string
1047 rtoy 1.16 (declare (ignore end))
1048 rtoy 1.13 (subseq string start stop))))
1049    
1050     (defun string-trim (char-bag string)
1051 rtoy 1.28 _N"Given a set of characters (a list or string) and a string, returns a
1052 rtoy 1.13 copy of the string with the characters in the set removed from both
1053     ends. If the set of characters is a string, surrogates will be
1054     properly handled."
1055     (let ((left-end (string-left-trim-index char-bag string))
1056     (right-end (string-right-trim-index char-bag string)))
1057     (with-string string
1058 rtoy 1.16 (declare (ignore start end))
1059 rtoy 1.13 (subseq (the simple-string string) left-end right-end))))
1060     ) ; end unicode version
1061    
1062     #-unicode
1063     (progn
1064 ram 1.1 (defun string-left-trim (char-bag string)
1065 rtoy 1.22 _N"Given a set of characters (a list or string) and a string, returns
1066 ram 1.1 a copy of the string with the characters in the set removed from the
1067     left end."
1068     (with-string string
1069     (do ((index start (1+ index)))
1070     ((or (= index (the fixnum end))
1071     (not (find (schar string index) char-bag)))
1072     (subseq (the simple-string string) index end))
1073     (declare (fixnum index)))))
1074    
1075     (defun string-right-trim (char-bag string)
1076 rtoy 1.22 _N"Given a set of characters (a list or string) and a string, returns
1077 ram 1.1 a copy of the string with the characters in the set removed from the
1078     right end."
1079     (with-string string
1080     (do ((index (1- (the fixnum end)) (1- index)))
1081     ((or (< index start) (not (find (schar string index) char-bag)))
1082     (subseq (the simple-string string) start (1+ index)))
1083     (declare (fixnum index)))))
1084    
1085     (defun string-trim (char-bag string)
1086 rtoy 1.22 _N"Given a set of characters (a list or string) and a string, returns a
1087 ram 1.1 copy of the string with the characters in the set removed from both
1088     ends."
1089     (with-string string
1090     (let* ((left-end (do ((index start (1+ index)))
1091     ((or (= index (the fixnum end))
1092     (not (find (schar string index) char-bag)))
1093     index)
1094     (declare (fixnum index))))
1095     (right-end (do ((index (1- (the fixnum end)) (1- index)))
1096     ((or (< index left-end)
1097     (not (find (schar string index) char-bag)))
1098     (1+ index))
1099     (declare (fixnum index)))))
1100     (subseq (the simple-string string) left-end right-end))))
1101 rtoy 1.13 ) ; non-unicode version
1102    
1103 rtoy 1.15 #+unicode
1104     (progn
1105 rtoy 1.13 (declaim (inline %glyph-f %glyph-b))
1106     (defun %glyph-f (string index)
1107     (declare (optimize (speed 3) (space 0) (safety 0) (debug 0))
1108     (type simple-string string) (type kernel:index index))
1109     (let* ((prev 0)
1110     (l (length string))
1111     (c (codepoint string index l))
1112     (n (+ index (if (> c #xFFFF) 2 1))))
1113     (declare (type codepoint c) (type kernel:index l n))
1114     (loop while (< n l) do
1115     (let* ((c (codepoint string n l))
1116     (d (the (unsigned-byte 8) (unicode-combining-class c))))
1117     (when (or (zerop d) (< d prev))
1118     (return))
1119     (setq prev d)
1120     (incf n (if (> c #xFFFF) 2 1))))
1121     n))
1122    
1123     (defun %glyph-b (string index)
1124     (declare (optimize (speed 3) (space 0) (safety 0) (debug 0))
1125     (type simple-string string) (type kernel:index index))
1126     (let* ((prev 255)
1127     (n (1- index)))
1128     (declare (type kernel:index n))
1129     (loop until (< n 0) do
1130     (let* ((c (codepoint string n 0))
1131     (d (the (unsigned-byte 8) (unicode-combining-class c))))
1132     (cond ((zerop d) (return))
1133     ((> d prev) (incf n (if (> c #xFFFF) 2 1)) (return)))
1134     (setq prev d)
1135     (decf n (if (> c #xFFFF) 2 1))))
1136     n))
1137 rtoy 1.15 ) ; unicode
1138 rtoy 1.13
1139     (defun glyph (string index &key (from-end nil))
1140 rtoy 1.23 "GLYPH returns the glyph at the indexed position in a string, and the
1141 rtoy 1.13 position of the next glyph (or NIL) as a second value. A glyph is
1142     a substring consisting of the character at INDEX followed by all
1143     subsequent combining characters."
1144     (declare (type simple-string string) (type kernel:index index))
1145     #-unicode
1146     (char string index)
1147     #+unicode
1148     (with-array-data ((string string) (start) (end))
1149     (declare (ignore start end))
1150     (let ((n (if from-end (%glyph-b string index) (%glyph-f string index))))
1151     (if from-end
1152     (values (subseq string n index) (and (> n 0) n))
1153     (values (subseq string index n) (and (< n (length string)) n))))))
1154    
1155     (defun sglyph (string index &key (from-end nil))
1156 rtoy 1.23 "SGLYPH returns the glyph at the indexed position, the same as GLYPH,
1157 rtoy 1.13 except that the string must be a simple-string"
1158     (declare (type simple-string string) (type kernel:index index))
1159     #-unicode
1160     (schar string index)
1161     #+unicode
1162     (let ((n (if from-end (%glyph-b string index) (%glyph-f string index))))
1163     (if from-end
1164     (values (subseq string n index) (and (> n 0) n))
1165     (values (subseq string index n) (and (< n (length string)) n)))))
1166    
1167     #+unicode
1168     (defun string-reverse* (sequence)
1169     (declare (optimize (speed 3) (space 0) (safety 0))
1170     (type string sequence))
1171     (with-string sequence
1172     (let* ((length (- end start))
1173     (string (make-string length))
1174     (j length))
1175     (declare (type kernel:index length j))
1176     (loop for i = start then n as n = (%glyph-f sequence i) do
1177     (replace string sequence :start1 (decf j (- n i)) :start2 i :end2 n)
1178     while (< n end))
1179     string)))
1180    
1181     #+unicode
1182     (defun string-nreverse* (sequence)
1183     (declare (optimize (speed 3) (space 0) (safety 0))
1184     (type string sequence))
1185     (with-string sequence
1186     (flet ((rev (start end)
1187     (do ((i start (1+ i))
1188     (j (1- end) (1- j)))
1189     ((>= i j))
1190     (declare (type kernel:index i j))
1191     (rotatef (schar sequence i) (schar sequence j)))))
1192     (let ((len end))
1193     (loop for i = start then n as n = (%glyph-f sequence i) do
1194     (rev i n) while (< n len))
1195     (rev start end))))
1196     sequence)
1197    
1198    
1199    
1200    
1201 rtoy 1.14 #+unicode
1202     (progn
1203 rtoy 1.13 (defun decompose (string &optional (compatibility t))
1204     (declare (type string string))
1205     (let ((result (make-string (cond ((< (length string) 40)
1206     (* 5 (length string)))
1207     ((< (length string) 4096)
1208     (* 2 (length string)))
1209     (t (round (length string) 5/6)))))
1210     (fillptr 0))
1211     (declare (type kernel:index fillptr))
1212     (labels ((rec (string start end)
1213     (declare (type simple-string string))
1214     (do ((i start (1+ i)))
1215     ((= i end))
1216     (declare (type kernel:index i))
1217     (multiple-value-bind (code wide) (codepoint string i)
1218     (when wide (incf i))
1219     (let ((decomp (unicode-decomp code compatibility)))
1220     (if decomp (rec decomp 0 (length decomp)) (out code))))))
1221     (out (code)
1222     (multiple-value-bind (hi lo) (surrogates code)
1223     (outch hi)
1224     (when lo
1225     (outch lo))
1226     (let ((cc (unicode-combining-class code)))
1227     (unless (zerop cc)
1228     (order lo cc (- fillptr (if lo 3 2)))))))
1229     (outch (char)
1230     (when (= fillptr (length result))
1231     (let ((tmp (make-string (round (length result) 5/6))))
1232     (replace tmp result)
1233     (setq result tmp)))
1234     (setf (schar result fillptr) char)
1235     (incf fillptr))
1236     (order (wide1 cc last)
1237     (loop until (minusp last) do
1238     (multiple-value-bind (code2 wide2) (codepoint result last)
1239     (let ((cc2 (unicode-combining-class code2)))
1240     (cond ((zerop cc2) (return))
1241     ((> cc2 cc)
1242     (case (+ (if wide2 2 0) (if wide1 1 0))
1243     (0 (rotatef (schar result last)
1244     (schar result (1+ last))))
1245     (1 (rotatef (schar result last)
1246     (schar result (+ last 1))
1247     (schar result (+ last 2))))
1248     (2 (rotatef (schar result last)
1249     (schar result (1- last))
1250     (schar result (1+ last))))
1251     (3 (rotatef (schar result last)
1252     (schar result (+ last 2)))
1253     (rotatef (schar result (1- last))
1254     (schar result (1+ last)))))
1255     (decf last (if wide2 2 1)))
1256     (t (return))))))))
1257     (with-string string
1258     (rec string start end))
1259     (shrink-vector result fillptr))))
1260    
1261     (declaim (inline normalized-codepoint-p))
1262     (defun normalized-codepoint-p (cp form)
1263     (ecase form
1264     (:nfc (unicode-nfc-qc cp))
1265     (:nfkc (unicode-nfkc-qc cp))
1266     (:nfd (unicode-nfd-qc cp))
1267     (:nfkd (unicode-nfkd-qc cp))))
1268    
1269     ;; Perform check to see if string is already normalized. The Unicode
1270     ;; example can return YES, NO, or MAYBE. For our purposes, only YES
1271     ;; is important, for which we return T. For NO or MAYBE, we return NIL.
1272     (defun normalized-form-p (string &optional (form :nfc))
1273     (declare (type (member :nfc :nfkc :nfd :nfkd) form)
1274     (optimize (speed 3)))
1275     (with-string string
1276     (let ((last-class 0))
1277     (declare (type (integer 0 256) last-class))
1278     (do ((k start (1+ k)))
1279     ((>= k end))
1280     (declare (type kernel:index k))
1281     (multiple-value-bind (ch widep)
1282     (codepoint string k end)
1283     (when widep (incf k))
1284     ;; Handle ASCII specially
1285     (unless (< ch 128)
1286     (let ((class (unicode-combining-class ch)))
1287     (declare (type (unsigned-byte 8) class))
1288     (when (and (> last-class class) (not (zerop class)))
1289     ;; Definitely not normalized
1290     (return-from normalized-form-p nil))
1291     (let ((check (normalized-codepoint-p ch form)))
1292     (unless (eq check :y)
1293     (return-from normalized-form-p nil)))
1294     (setf last-class class)))))
1295     t)))
1296    
1297    
1298     ;; Compose a string in place. The string must already be in decomposed form.
1299     (defun %compose (target)
1300     (declare (type string target)
1301     (optimize (speed 3)))
1302     (let ((len (length target))
1303     (starter-pos 0))
1304     (declare (type kernel:index starter-pos))
1305     (multiple-value-bind (starter-ch wide)
1306     (codepoint target 0 len)
1307     (let ((comp-pos (if wide 2 1))
1308     (last-class (unicode-combining-class starter-ch)))
1309     (declare (type (integer 0 256) last-class)
1310     (type kernel:index comp-pos))
1311     (unless (zerop last-class)
1312     ;; Fix for strings starting with a combining character
1313     (setf last-class 256))
1314     ;; Loop on decomposed characters, combining where possible
1315     (do ((decomp-pos comp-pos (1+ decomp-pos)))
1316     ((>= decomp-pos len))
1317     (declare (type kernel:index decomp-pos))
1318     (multiple-value-bind (ch wide)
1319     (codepoint target decomp-pos len)
1320     (when wide (incf decomp-pos))
1321     (let ((ch-class (unicode-combining-class ch))
1322 rtoy 1.14 (composite (unicode-pairwise-composition starter-ch ch)))
1323 rtoy 1.13 (declare (type (integer 0 256) ch-class))
1324     (cond ((and composite
1325     (or (< last-class ch-class) (zerop last-class)))
1326 rtoy 1.26 ;; Note: As far as I know, there is no pairwise
1327     ;; composition such that the composite character
1328     ;; is outside the BMP but the starter-ch is
1329     ;; inside the BMP. Hence, it is always safe to
1330     ;; replace the possible surrogate at starter-pos
1331     ;; with another. We won't accidentally replace
1332     ;; the next character with our trailing surrogate
1333     ;; character.
1334     (multiple-value-bind (hi lo)
1335     (surrogates composite)
1336     (setf (aref target starter-pos) hi)
1337     (when lo
1338     (setf (aref target (1+ starter-pos)) lo))
1339     (setf starter-ch composite)))
1340 rtoy 1.13 (t
1341     (when (zerop ch-class)
1342     (setf starter-pos comp-pos)
1343     (setf starter-ch ch))
1344     (setf last-class ch-class)
1345     (multiple-value-bind (hi lo)
1346     (surrogates ch)
1347     (setf (aref target comp-pos) hi)
1348     (when lo
1349     (incf comp-pos)
1350     (setf (aref target comp-pos) lo))
1351     (incf comp-pos)))))))
1352     (shrink-vector target comp-pos)))))
1353    
1354     (defun string-to-nfd (string)
1355 rtoy 1.28 _N"Convert String to Unicode Normalization Form D (NFD) using the
1356 rtoy 1.13 canonical decomposition. The NFD string is returned"
1357     (decompose string nil))
1358    
1359     (defun string-to-nfkd (string)
1360 rtoy 1.28 _N"Convert String to Unicode Normalization Form KD (NFKD) uisng the
1361 rtoy 1.13 compatible decomposition form. The NFKD string is returned."
1362     (decompose string t))
1363    
1364     (defun string-to-nfc (string)
1365 rtoy 1.28 _N"Convert String to Unicode Normalization Form C (NFC). If the
1366 rtoy 1.13 string a simple string and is already normalized, the original
1367     string is returned."
1368     (if (normalized-form-p string :nfc)
1369     (if (simple-string-p string) string (coerce string 'simple-string))
1370     (coerce (if (normalized-form-p string :nfd)
1371     (%compose (copy-seq string))
1372     (%compose (string-to-nfd string)))
1373     'simple-string)))
1374    
1375     (defun string-to-nfkc (string)
1376 rtoy 1.28 _N"Convert String to Unicode Normalization Form KC (NFKC). If the
1377 rtoy 1.13 string is a simple string and is already normalized, the original
1378     string is returned."
1379     (if (normalized-form-p string :nfkc)
1380     (if (simple-string-p string) string (coerce string 'simple-string))
1381     (coerce (if (normalized-form-p string :nfkd)
1382     (%compose (copy-seq string))
1383     (%compose (string-to-nfkd string)))
1384     'simple-string)))
1385 rtoy 1.14 ) ; end unicode
1386    
1387     #-unicode ;; Needed by package.lisp
1388     (defun string-to-nfc (string)
1389     (if (simple-string-p string) string (coerce string 'simple-string)))
1390    
1391 rtoy 1.19
1392     ;;;
1393     ;;; This is a Lisp translation of the Scheme code from William
1394     ;;; D. Clinger that implements the word-breaking algorithm. This is
1395     ;;; used with permission.
1396     ;;;
1397     ;;; This version is modified from the original at
1398     ;;; http://www.ccs.neu.edu/home/will/R6RS/ to conform to CMUCL's
1399     ;;; implementation of the word break properties.
1400     ;;;
1401     ;;;
1402     ;;; Copyright statement and original comments:
1403     ;;;
1404     ;;;--------------------------------------------------------------------------------
1405    
1406     ;; Copyright 2006 William D Clinger.
1407     ;;
1408     ;; Permission to copy this software, in whole or in part, to use this
1409     ;; software for any lawful purpose, and to redistribute this software
1410     ;; is granted subject to the restriction that all copies made of this
1411     ;; software must include this copyright and permission notice in full.
1412     ;;
1413     ;; I also request that you send me a copy of any improvements that you
1414     ;; make to this software so that they may be incorporated within it to
1415     ;; the benefit of the Scheme community.
1416    
1417     ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1418     ;;
1419     ;; Word-breaking as defined by Unicode Standard Annex #29.
1420     ;;
1421     ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1422    
1423     ;; Implementation notes.
1424     ;;
1425     ;; The string-foldcase, string-downcase, and string-titlecase
1426     ;; procedures rely on the notion of a word, which is defined
1427     ;; by Unicode Standard Annex 29.
1428     ;;
1429     ;; The string-foldcase and string-downcase procedures rely on
1430     ;; word boundaries only when they encounter a Greek sigma, so
1431     ;; their performance should not be greatly affected by the
1432     ;; performance of the word-breaking algorithm.
1433     ;;
1434     ;; The string-titlecase procedure must find all word boundaries,
1435     ;; but it is typically used on short strings (titles).
1436     ;;
1437     ;; Hence the performance of the word-breaking algorithm should
1438     ;; not matter too much for this reference implementation.
1439     ;; Word-breaking is more generally useful, however, so I tried
1440     ;; to make this implementation reasonably efficient.
1441     ;;
1442     ;; Word boundaries are defined by 14 different rules in
1443     ;; Unicode Standard Annex #29, and by GraphemeBreakProperty.txt
1444     ;; and WordBreakProperty.txt. See also WordBreakTest.html.
1445     ;;
1446     ;; My original implementation of those specifications failed
1447     ;; 6 of the 494 tests in auxiliary/WordBreakTest.txt, but it
1448     ;; appeared to me that those tests were inconsistent with the
1449     ;; word-breaking rules in UAX #29. John Cowan forwarded my
1450     ;; bug report to the Unicode experts, and Mark Davis responded
1451     ;; on 29 May 2007:
1452     ;;
1453     ;; Thanks for following up on this. I think you have found a problem in the
1454     ;; formulation of word break, not the test. The intention was to break after a
1455     ;; Sep character, as is done in Sentence break. So my previous suggestion was
1456     ;; incorrect; instead, what we need is a new rule:
1457     ;;
1458     ;; *Break after paragraph separators.*
1459 rtoy 1.24 ;; WB3a. Sep �
1460 rtoy 1.19 ;; I'll make a propose to the UTC for this.
1461     ;;
1462     ;; Here is Will's translation of those rules (including WB3a)
1463     ;; into a finite state machine that searches forward within a
1464     ;; string, looking for the next position at which a word break
1465     ;; is allowed. The current state consists of an index i into
1466     ;; the string and a summary of the left context whose rightmost
1467     ;; character is at index i. The left context is usually
1468     ;; determined by the character at index i, but there are three
1469     ;; complications:
1470     ;;
1471     ;; Extend and Format characters are ignored unless they
1472     ;; follow a separator or the beginning of the text.
1473     ;; ALetter followed by MidLetter is treated specially.
1474     ;; Numeric followed by MidNum is treated specially.
1475     ;;
1476     ;; In the implementation below, the left context ending at i
1477     ;; is encoded by the following symbols:
1478     ;;
1479     ;; CR
1480     ;; Sep (excluding CR)
1481     ;; ALetter
1482     ;; MidLetter
1483     ;; ALetterMidLetter (ALetter followed by MidLetter)
1484     ;; Numeric
1485     ;; MidNum
1486     ;; NumericMidNum (Numeric followed by MidNum)
1487     ;; Katakana
1488     ;; ExtendNumLet
1489     ;; other (none of the above)
1490     ;;
1491     ;; Given a string s and an exact integer i (which need not be
1492     ;; a valid index into s), returns the index of the next character
1493     ;; that is not part of the word containing the character at i,
1494     ;; or the length of s if the word containing the character at i
1495     ;; extends through the end of s. If i is negative or a valid
1496     ;; index into s, then the returned value will be greater than i.
1497     ;;
1498     ;;;--------------------------------------------------------------------------------
1499    
1500     (defun string-next-word-break (s i)
1501     (let ((n (length s)))
1502     (labels
1503     ((char-word-break-category (c)
1504     ;; Map our unicode word break property into what this
1505     ;; algorithm wants.
1506     (let ((cat (lisp::unicode-word-break c)))
1507     (case cat
1508     ((:lf :cr :newline)
1509     :sep)
1510     ((:extend :format)
1511     :extend-or-format)
1512     (otherwise cat))))
1513     (left-context (i)
1514     ;; Given a valid index i into s, returns the left context
1515     ;; at i.
1516     (multiple-value-bind (c widep)
1517     (lisp::codepoint s i n)
1518     (let* ((back
1519     ;; If we're at a regular character or a leading
1520     ;; surrogate, decrementing by 1 gets us the to
1521     ;; previous character. But for a trailing
1522     ;; surrogate, we need to decrement by 2!
1523     (if (eql widep -1)
1524     2
1525     1))
1526     (cat (char-word-break-category c)))
1527     (case cat
1528     ((:sep)
1529     (if (= c (char-code #\return)) :cr cat))
1530     ((:midletter :midnumlet)
1531     (let ((i-1 (- i back)))
1532     (if (and (<= 0 i-1)
1533     (eq (left-context i-1) :aletter))
1534     :aletter-midletter
1535     cat)))
1536     ((:midnum :midnumlet)
1537     (let ((i-1 (- i back)))
1538     (if (and (<= 0 i-1)
1539     (eq (left-context i-1) :numeric))
1540     :numeric-midnum
1541     cat)))
1542     ((:extendorformat)
1543     (if (< 0 i)
1544     (left-context (- i back))
1545     :other))
1546     (otherwise cat)))))
1547    
1548     (index-of-previous-non-ignored (j)
1549     ;; Returns the index of the last non-Extend, non-Format
1550     ;; character within (substring s 0 j). Should not be
1551     ;; called unless such a character exists.
1552    
1553     (let* ((j1 (- j 1)))
1554     (multiple-value-bind (c widep)
1555     (lisp::codepoint s j1)
1556     (when (eql widep -1)
1557     ;; Back up one more if we're at the trailing
1558     ;; surrogate.
1559     (decf j1))
1560     (let ((cat (char-word-break-category c)))
1561     (case cat
1562     ((:extend-or-format)
1563     (index-of-previous-non-ignored j1))
1564     (otherwise j1))))))
1565    
1566     (lookup (j context)
1567     ;; Given j and the context to the left of (not including) j,
1568     ;; returns the index at the start of the next word
1569     ;; (or before which a word break is permitted).
1570    
1571     (if (>= j n)
1572     (case context
1573     ((:aletter-midletter :numeric-midnum)
1574     (let ((j (index-of-previous-non-ignored n)))
1575     (if (< i j) j n)))
1576     (otherwise n))
1577     (multiple-value-bind (c widep)
1578     (lisp::codepoint s j)
1579     (let* ((next-j
1580     ;; The next character is either 1 or 2 code
1581     ;; units away. For a leading surrogate, it's
1582     ;; 2; Otherwise just 1.
1583     (if (eql widep 1)
1584     2
1585     1))
1586     (cat (char-word-break-category c)))
1587     (case cat
1588     ((:extend-or-format)
1589     (case context
1590     ((:cr :sep) j)
1591     (otherwise (lookup (+ j next-j) context))))
1592     (otherwise
1593     (case context
1594     ((:cr)
1595     (if (= c (char-code #\linefeed))
1596     ;; Rule WB3: Don't break CRLF, continue looking
1597     (lookup (+ j next-j) cat)
1598     j))
1599     ((:aletter)
1600     (case cat
1601     ((:aletter :numeric :extendnumlet)
1602     ;; Rules WB5, WB9, ?
1603     (lookup (+ j next-j) cat))
1604     ((:midletter :midnumlet)
1605     ;; Rule WB6, need to keep looking
1606     (lookup (+ j next-j) :aletter-midletter))
1607     (otherwise j)))
1608     ((:aletter-midletter)
1609     (case cat
1610     ((:aletter)
1611     ;; Rule WB7
1612     (lookup (+ j next-j) cat))
1613     (otherwise
1614     ;; Rule WB6 and WB7 were extended, but the
1615     ;; region didn't end with :aletter. So
1616     ;; backup and break at that point.
1617     (let ((j2 (index-of-previous-non-ignored j)))
1618     (if (< i j2) j2 j)))))
1619     ((:numeric)
1620     (case cat
1621     ((:numeric :aletter :extendnumlet)
1622     ;; Rules WB8, WB10, ?
1623     (lookup (+ j next-j) cat))
1624     ((:midnum :midnumlet)
1625     ;; Rules WB11, need to keep looking
1626     (lookup (+ j next-j) :numeric-midnum))
1627     (otherwise j)))
1628     ((:numeric-midnum)
1629     (case cat
1630     ((:numeric)
1631     ;; Rule WB11, keep looking
1632     (lookup (+ j next-j) cat))
1633     (otherwise
1634     ;; Rule WB11, WB12 were extended, but the
1635     ;; region didn't end with :numeric, so
1636     ;; backup and break at that point.
1637     (let ((j2 (index-of-previous-non-ignored j)))
1638     (if (< i j2) j2 j)))))
1639     ((:midletter :midnum :midnumlet)
1640     ;; Rule WB14
1641     j)
1642     ((:katakana)
1643     (case cat
1644     ((:katakana :extendnumlet)
1645     ;; Rule WB13, WB13a
1646     (lookup (+ j next-j) cat))
1647     (otherwise j)))
1648     ((:extendnumlet)
1649     (case cat
1650     ((:extendnumlet :aletter :numeric :katakana)
1651     ;; Rule WB13a, WB13b
1652     (lookup (+ j next-j) cat))
1653     (otherwise j)))
1654     (otherwise j)))))))))
1655     (declare (notinline lookup left-context))
1656     (cond ((< i 0)
1657     ;; Rule WB1
1658     0)
1659     ((<= n i)
1660     ;; Rule WB2
1661     n)
1662     (t
1663     (multiple-value-bind (c widep)
1664     (lisp::codepoint s i)
1665     (declare (ignore c))
1666     (lookup (+ i (if (eql widep 1) 2 1)) (left-context i))))))))
1667    
1668     (defun string-capitalize-unicode (string &key (start 0) end (casing :simple))
1669     (declare (type (member :simple :full) casing))
1670     (let* ((string (if (stringp string) string (string string)))
1671     (slen (length string)))
1672     (declare (fixnum slen))
1673     (with-output-to-string (result)
1674     (lisp::with-one-string string start end offset
1675     (let ((offset-slen (+ slen offset)))
1676     (declare (fixnum offset-slen))
1677    
1678     (write-string string result :start 0 :end start)
1679     (let ((upper (ecase casing
1680     (:simple
1681     #'(lambda (ch)
1682     (multiple-value-bind (hi lo)
1683     (lisp::surrogates (lisp::unicode-upper ch))
1684     (write-char hi result)
1685     (when lo (write-char lo result)))))
1686     (:full
1687     #'(lambda (ch)
1688     (write-string (lisp::unicode-full-case-title ch) result))))))
1689     (do ((start start next)
1690     (next (string-next-word-break string start)
1691     (string-next-word-break string next)))
1692     ((or (= start next)
1693     (>= start end)))
1694     ;; Convert the first character of the word to upper
1695     ;; case, and then make the rest of the word lowercase.
1696     (funcall upper (lisp::codepoint string start))
1697     (write-string (string-downcase string :start (1+ start) :end next :casing casing)
1698     result :start (1+ start) :end next)))
1699     (write-string string result :start end :end offset-slen))))))
1700    
1701 rtoy 1.25
1702     ;; Some utilities
1703     (defun codepoints-string (seq)
1704     "Convert a sequence of codepoints to a string. Codepoints outside
1705     the basic multilingual plane (BMP) are converted into the
1706     corresponding surrogate pairs."
1707     (with-output-to-string (s)
1708     (map nil #'(lambda (c)
1709     (multiple-value-bind (hi lo)
1710     (surrogates c)
1711     (write-char hi s)
1712     (when lo (write-char lo s))))
1713     seq)))
1714    
1715     (defun string-codepoints (s)
1716     "Convert a string to a list of corresponding code points. Surrogate
1717     pairs in the string are converted into the correspoinding
1718     codepoint."
1719     (declare (type simple-string s))
1720     (let ((len (length s))
1721     cp)
1722     (do ((idx 0))
1723     ((>= idx len))
1724     (multiple-value-bind (c widep)
1725     (codepoint s idx)
1726     (if widep
1727     (incf idx 2)
1728     (incf idx))
1729     (push c cp)))
1730     (nreverse cp)))

  ViewVC Help
Powered by ViewVC 1.1.5