/[cmucl]/src/code/string.lisp
ViewVC logotype

Contents of /src/code/string.lisp

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.12.30.32 - (hide annotations)
Tue Jun 9 18:16:17 2009 UTC (4 years, 10 months ago) by rtoy
Branch: unicode-utf16-extfmt-branch
Changes since 1.12.30.31: +73 -13 lines
o Only recognize :simple and :full for the casing parameter.
o Update docstrings to mention the casing parameter.
1 ram 1.1 ;;; -*- Log: code.log; Package: Lisp -*-
2     ;;;
3     ;;; **********************************************************************
4 ram 1.3 ;;; This code was written as part of the CMU Common Lisp project at
5     ;;; Carnegie Mellon University, and has been placed in the public domain.
6     ;;;
7     (ext:file-comment
8 rtoy 1.12.30.32 "$Header: /tiger/var/lib/cvsroots/cmucl/src/code/string.lisp,v 1.12.30.32 2009/06/09 18:16:17 rtoy Exp $")
9 ram 1.3 ;;;
10 ram 1.1 ;;; **********************************************************************
11     ;;;
12 pw 1.11 ;;; Functions to implement strings for CMU Common Lisp
13 ram 1.1 ;;; Written by David Dill
14 ram 1.4 ;;; Rewritten by Skef Wholey, Bill Chiles and Rob MacLachlan.
15 ram 1.1 ;;;
16     ;;; ****************************************************************
17     ;;;
18 ram 1.4 (in-package "LISP")
19 rtoy 1.12.30.3 (export '(char schar glyph sglyph string
20 ram 1.1 string= string-equal string< string> string<= string>= string/=
21     string-lessp string-greaterp string-not-lessp string-not-greaterp
22     string-not-equal
23 rtoy 1.12.30.4 string-to-nfd string-to-nfkd string-to-nfc string-to-nfkc
24 ram 1.1 make-string
25     string-trim string-left-trim string-right-trim
26     string-upcase
27     string-downcase string-capitalize nstring-upcase nstring-downcase
28     nstring-capitalize))
29    
30    
31 rtoy 1.12.30.19 (declaim (inline surrogatep surrogates-to-codepoint codepoint surrogates))
32    
33     (defun surrogatep (c &optional surrogate-type)
34     "Test if C is a surrogate. C may be either an integer or a
35     character. Surrogate-type indicates what kind of surrogate to test
36     for. :High means to test for the high (leading) surrogate; :Low
37     tests for the low (trailing surrogate). A value of :Any or Nil
38     tests for any surrogate value (high or low)."
39 rtoy 1.12.30.26 (declare (type (or character codepoint) c))
40 rtoy 1.12.30.19 (let ((code (if (characterp c)
41     (char-code c)
42     c)))
43     (ecase surrogate-type
44     ((:high :leading)
45     ;; Test for high surrogate
46     (<= #xD800 code #xDBFF))
47     ((:low :trailing)
48     ;; Test for low surrogate
49     (<= #xDC00 code #xDFFF))
50     ((:any nil)
51     ;; Test for any surrogate
52     (<= #xD800 code #xDFFF)))))
53 rtoy 1.12.30.7
54 rtoy 1.12.30.8 (defun surrogates-to-codepoint (hi lo)
55     "Convert the given Hi and Lo surrogate characters to the
56     corresponding codepoint value"
57     (declare (type character hi lo))
58     (+ (ash (- (the (integer #xD800 #xDBFF) (char-code hi)) #xD800) 10)
59     (the (integer #xDC00 #xDFFF) (char-code lo)) #x2400))
60 rtoy 1.12.30.7
61 rtoy 1.12.30.6 (defun codepoint (string i &optional (end (length string)))
62 rtoy 1.12.30.7 "Return the codepoint value from String at position I. If that
63     position is a surrogate, it is combined with either the previous or
64     following character (when possible) to compute the codepoint. The
65     second return value is NIL if the position is not a surrogate pair.
66     Otherwise +1 or -1 is returned if the position is the high or low
67     surrogate value, respectively."
68 rtoy 1.12.30.6 (declare (type simple-string string) (type kernel:index i end))
69     (let ((code (char-code (schar string i))))
70 rtoy 1.12.30.19 (cond ((and (surrogatep code :high) (< (1+ i) end))
71 rtoy 1.12.30.6 (let ((tmp (char-code (schar string (1+ i)))))
72 rtoy 1.12.30.19 (if (surrogatep tmp :low)
73 rtoy 1.12.30.10 (values (+ (ash (- code #xD800) 10) tmp #x2400) +1)
74 rtoy 1.12.30.6 (values code nil))))
75 rtoy 1.12.30.19 ((and (surrogatep code :low) (> i 0))
76 rtoy 1.12.30.6 (let ((tmp (char-code (schar string (1- i)))))
77 rtoy 1.12.30.19 (if (surrogatep tmp :high)
78 rtoy 1.12.30.10 (values (+ (ash (- tmp #xD800) 10) code #x2400) -1)
79 rtoy 1.12.30.6 (values code nil))))
80     (t (values code nil)))))
81    
82     (defun surrogates (codepoint)
83 rtoy 1.12.30.8 "Return the high and low surrogate characters for Codepoint. If
84     Codepoint is in the BMP, the first return value is the corresponding
85     character and the second is NIL."
86 rtoy 1.12.30.26 (declare (type codepoint codepoint))
87 rtoy 1.12.30.6 (if (< codepoint #x10000)
88 rtoy 1.12.30.8 (values (code-char codepoint) nil)
89 rtoy 1.12.30.6 (let* ((tmp (- codepoint #x10000))
90     (hi (logior (ldb (byte 10 10) tmp) #xD800))
91     (lo (logior (ldb (byte 10 0) tmp) #xDC00)))
92 rtoy 1.12.30.8 (values (code-char hi) (code-char lo)))))
93 rtoy 1.12.30.6
94 rtoy 1.12.30.22 (defun (setf codepoint) (codepoint string i)
95     "Set the codepoint at string position I to the Codepoint. If the
96     codepoint requires a surrogate pair, the high (leading surrogate) is
97     stored at position I and the low (trailing) surrogate is stored at
98     I+1"
99 rtoy 1.12.30.26 (declare (type codepoint codepoint)
100 rtoy 1.12.30.22 (type simple-string string))
101     (let ((widep nil))
102     (multiple-value-bind (hi lo)
103     (surrogates codepoint)
104     (setf (aref string i) hi)
105     (when lo
106     (setf (aref string (1+ i)) lo)
107     (setf widep t)))
108     (values codepoint widep)))
109    
110 rtoy 1.12.30.16 (defun utf16-string-p (string)
111     "Check if String is a valid UTF-16 string. If the string is valid,
112     T is returned. If the string is not valid, NIL is returned, and the
113 rtoy 1.12.30.28 second value is the index into the string of the invalid character.
114     A string is also invalid if it contains any unassigned codepoints."
115 rtoy 1.12.30.16 (do ((len (length string))
116     (index 0 (1+ index)))
117     ((>= index len)
118     t)
119     (multiple-value-bind (codepoint wide)
120     (codepoint string index)
121 rtoy 1.12.30.17 ;; We step through the string in order. If there are any
122     ;; surrogates pairs, we must reach the lead surrogate first,
123     ;; which means WIDE is +1. Otherwise, we have an invalid
124 rtoy 1.12.30.28 ;; surrogate pair. If we get any codepoint that is in the
125     ;; surrogate range, we also have an invalid string. An
126     ;; unassigned codepoint is also considered invalid.
127 rtoy 1.12.30.16 (when (or (eq wide -1)
128 rtoy 1.12.30.28 (surrogatep codepoint)
129     (lisp::unicode-assigned-codepoint-p codepoint))
130 rtoy 1.12.30.16 (return-from utf16-string-p (values nil index)))
131     (when wide (incf index)))))
132 rtoy 1.12.30.6
133 ram 1.1 (defun string (X)
134     "Coerces X into a string. If X is a string, X is returned. If X is a
135 rtoy 1.12.30.14 symbol, X's pname is returned. If X is a character then a one element
136     string containing that character is returned. If X cannot be coerced
137     into a string, an error occurs."
138 ram 1.1 (cond ((stringp x) x)
139     ((symbolp x) (symbol-name x))
140     ((characterp x)
141     (let ((res (make-string 1)))
142     (setf (schar res 0) x) res))
143     (t
144 pw 1.11 (error 'simple-type-error
145     :datum x
146     :expected-type '(or string symbol character)
147     :format-control "~S cannot be coerced to a string."
148     :format-arguments (list x)))))
149 ram 1.1
150     ;;; With-One-String is used to set up some string hacking things. The keywords
151     ;;; are parsed, and the string is hacked into a simple-string.
152    
153     (eval-when (compile)
154    
155     (defmacro with-one-string (string start end cum-offset &rest forms)
156 ram 1.4 `(let ((,string (if (stringp ,string) ,string (string ,string))))
157 pw 1.10 ;; Optimizer may prove STRING is one.
158     (declare (optimize (ext:inhibit-warnings 3)))
159 ram 1.4 (with-array-data ((,string ,string :offset-var ,cum-offset)
160     (,start ,start)
161     (,end (or ,end (length (the vector ,string)))))
162     ,@forms)))
163 ram 1.1
164     )
165    
166     ;;; With-String is like With-One-String, but doesn't parse keywords.
167    
168     (eval-when (compile)
169    
170     (defmacro with-string (string &rest forms)
171 ram 1.4 `(let ((,string (if (stringp ,string) ,string (string ,string))))
172     (with-array-data ((,string ,string)
173     (start)
174     (end (length (the vector ,string))))
175     ,@forms)))
176 ram 1.1
177     )
178    
179     ;;; With-Two-Strings is used to set up string comparison operations. The
180     ;;; keywords are parsed, and the strings are hacked into simple-strings.
181    
182     (eval-when (compile)
183    
184     (defmacro with-two-strings (string1 string2 start1 end1 cum-offset-1
185     start2 end2 &rest forms)
186 ram 1.4 `(let ((,string1 (if (stringp ,string1) ,string1 (string ,string1)))
187     (,string2 (if (stringp ,string2) ,string2 (string ,string2))))
188     (with-array-data ((,string1 ,string1 :offset-var ,cum-offset-1)
189     (,start1 ,start1)
190     (,end1 (or ,end1 (length (the vector ,string1)))))
191     (with-array-data ((,string2 ,string2)
192     (,start2 ,start2)
193     (,end2 (or ,end2 (length (the vector ,string2)))))
194     ,@forms))))
195 ram 1.1
196     )
197 wlott 1.2
198 ram 1.1
199     (defun char (string index)
200     "Given a string and a non-negative integer index less than the length of
201     the string, returns the character object representing the character at
202     that position in the string."
203 ram 1.4 (declare (optimize (safety 1)))
204 ram 1.1 (char string index))
205    
206     (defun %charset (string index new-el)
207 ram 1.4 (declare (optimize (safety 1)))
208 ram 1.1 (setf (char string index) new-el))
209    
210     (defun schar (string index)
211     "SCHAR returns the character object at an indexed position in a string
212 rtoy 1.12.30.14 just as CHAR does, except the string must be a simple-string."
213 ram 1.4 (declare (optimize (safety 1)))
214 ram 1.1 (schar string index))
215    
216     (defun %scharset (string index new-el)
217 ram 1.4 (declare (optimize (safety 1)))
218 ram 1.1 (setf (schar string index) new-el))
219    
220     (defun string=* (string1 string2 start1 end1 start2 end2)
221 ram 1.4 (with-two-strings string1 string2 start1 end1 offset1 start2 end2
222     (not (%sp-string-compare string1 start1 end1 string2 start2 end2))))
223 ram 1.1
224    
225     (defun string/=* (string1 string2 start1 end1 start2 end2)
226 ram 1.4 (with-two-strings string1 string2 start1 end1 offset1 start2 end2
227     (let ((comparison (%sp-string-compare string1 start1 end1
228     string2 start2 end2)))
229     (if comparison (- (the fixnum comparison) offset1)))))
230 ram 1.1
231     (eval-when (compile eval)
232    
233     ;;; Lessp is true if the desired expansion is for string<* or string<=*.
234     ;;; Equalp is true if the desired expansion is for string<=* or string>=*.
235     (defmacro string<>=*-body (lessp equalp)
236     (let ((offset1 (gensym)))
237 ram 1.4 `(with-two-strings string1 string2 start1 end1 ,offset1 start2 end2
238     (let ((index (%sp-string-compare string1 start1 end1
239     string2 start2 end2)))
240     (if index
241 ram 1.8 (cond ((= (the fixnum index) (the fixnum end1))
242     ,(if lessp
243     `(- (the fixnum index) ,offset1)
244     `nil))
245     ((= (+ (the fixnum index) (- start2 start1))
246     (the fixnum end2))
247     ,(if lessp
248     `nil
249 rtoy 1.12.30.1 `(- (the fixnum index) ,offset1)))
250     #-unicode
251 ram 1.4 ((,(if lessp 'char< 'char>)
252     (schar string1 index)
253     (schar string2 (+ (the fixnum index) (- start2 start1))))
254     (- (the fixnum index) ,offset1))
255 rtoy 1.12.30.1 #-unicode
256     (t nil)
257     #+unicode
258     (t
259     ;; Compare in code point order. See
260     ;; http://icu-project.org/docs/papers/utf16_code_point_order.html
261     (flet ((fixup (code)
262     (if (>= code #xe000)
263     (- code #x800)
264     (+ code #x2000))))
265     (declare (inline fixup))
266     (let* ((c1 (char-code (schar string1 index)))
267 rtoy 1.12.30.10 (c2 (char-code (schar string2
268     (+ (the fixnum index)
269     (- start2 start1))))))
270 rtoy 1.12.30.1 (cond ((and (>= c1 #xd800)
271     (>= c2 #xd800))
272     (let ((fix-c1 (fixup c1))
273     (fix-c2 (fixup c2)))
274     (if (,(if lessp '< '>) fix-c1 fix-c2)
275     (- (the fixnum index) ,offset1)
276     nil)))
277     (t
278     (if (,(if lessp '< '>) c1 c2)
279 rtoy 1.12.30.10 (- (the fixnum index) ,offset1)
280     nil)))))))
281 ram 1.4 ,(if equalp `(- (the fixnum end1) ,offset1) 'nil))))))
282 ram 1.1 ) ; eval-when
283    
284     (defun string<* (string1 string2 start1 end1 start2 end2)
285     (declare (fixnum start1 start2))
286     (string<>=*-body t nil))
287    
288     (defun string>* (string1 string2 start1 end1 start2 end2)
289     (declare (fixnum start1 start2))
290     (string<>=*-body nil nil))
291    
292     (defun string<=* (string1 string2 start1 end1 start2 end2)
293     (declare (fixnum start1 start2))
294     (string<>=*-body t t))
295    
296     (defun string>=* (string1 string2 start1 end1 start2 end2)
297     (declare (fixnum start1 start2))
298     (string<>=*-body nil t))
299    
300    
301    
302     (defun string< (string1 string2 &key (start1 0) end1 (start2 0) end2)
303     "Given two strings, if the first string is lexicographically less than
304     the second string, returns the longest common prefix (using char=)
305     of the two strings. Otherwise, returns ()."
306     (string<* string1 string2 start1 end1 start2 end2))
307    
308     (defun string> (string1 string2 &key (start1 0) end1 (start2 0) end2)
309     "Given two strings, if the first string is lexicographically greater than
310     the second string, returns the longest common prefix (using char=)
311     of the two strings. Otherwise, returns ()."
312     (string>* string1 string2 start1 end1 start2 end2))
313    
314    
315     (defun string<= (string1 string2 &key (start1 0) end1 (start2 0) end2)
316     "Given two strings, if the first string is lexicographically less than
317     or equal to the second string, returns the longest common prefix
318     (using char=) of the two strings. Otherwise, returns ()."
319     (string<=* string1 string2 start1 end1 start2 end2))
320    
321     (defun string>= (string1 string2 &key (start1 0) end1 (start2 0) end2)
322     "Given two strings, if the first string is lexicographically greater
323     than or equal to the second string, returns the longest common prefix
324     (using char=) of the two strings. Otherwise, returns ()."
325     (string>=* string1 string2 start1 end1 start2 end2))
326    
327     (defun string= (string1 string2 &key (start1 0) end1 (start2 0) end2)
328     "Given two strings (string1 and string2), and optional integers start1,
329     start2, end1 and end2, compares characters in string1 to characters in
330     string2 (using char=)."
331     (string=* string1 string2 start1 end1 start2 end2))
332    
333     (defun string/= (string1 string2 &key (start1 0) end1 (start2 0) end2)
334     "Given two strings, if the first string is not lexicographically equal
335     to the second string, returns the longest common prefix (using char=)
336     of the two strings. Otherwise, returns ()."
337     (string/=* string1 string2 start1 end1 start2 end2))
338    
339    
340     (eval-when (compile eval)
341    
342     ;;; STRING-NOT-EQUAL-LOOP is used to generate character comparison loops for
343     ;;; STRING-EQUAL and STRING-NOT-EQUAL.
344     (defmacro string-not-equal-loop (end end-value
345     &optional (abort-value nil abortp))
346     (declare (fixnum end))
347     (let ((end-test (if (= end 1)
348     `(= index1 (the fixnum end1))
349     `(= index2 (the fixnum end2)))))
350     `(do ((index1 start1 (1+ index1))
351     (index2 start2 (1+ index2)))
352     (,(if abortp
353     end-test
354     `(or ,end-test
355     (not (char-equal (schar string1 index1)
356     (schar string2 index2)))))
357     ,end-value)
358     (declare (fixnum index1 index2))
359     ,@(if abortp
360     `((if (not (char-equal (schar string1 index1)
361     (schar string2 index2)))
362     (return ,abort-value)))))))
363    
364 rtoy 1.12.30.31 #+unicode
365     (defmacro handle-case-fold-equal (f &body body)
366 rtoy 1.12.30.32 `(ecase casing
367     (:simple
368     ,@body)
369     (:full
370     ;; We should probably do this in a different way with less
371     ;; consing, but this is easy.
372     (let* ((s1 (case-fold string1 start1 end1))
373     (s2 (case-fold string1 start2 end2)))
374     (,f s1 s2)))))
375 rtoy 1.12.30.31
376     #-unicode
377     (defmacro handle-case-fold-equal (f &body body)
378     (declare (ignore f))
379     `(progn ,@body))
380    
381 ram 1.1 ) ; eval-when
382    
383 rtoy 1.12.30.31 #+unicode
384     (defun case-fold (string start end)
385     ;; Create a new string performing full case folding of String
386     (with-output-to-string (s)
387     (with-one-string string start end offset
388     (do ((index offset (1+ index)))
389     ((>= index end))
390     (multiple-value-bind (code widep)
391     (codepoint string index)
392     (when widep (incf index))
393     (write-string (unicode-case-fold-full code) s))))))
394    
395     (defun string-equal (string1 string2 &key (start1 0) end1 (start2 0) end2 #+unicode (casing :simple))
396 rtoy 1.12.30.32 #-unicode
397 ram 1.1 "Given two strings (string1 and string2), and optional integers start1,
398     start2, end1 and end2, compares characters in string1 to characters in
399     string2 (using char-equal)."
400 rtoy 1.12.30.32 #+unicode
401     "Given two strings (string1 and string2), and optional integers
402     start1, start2, end1 and end2, compares characters in string1 to
403     characters in string2. Casing is :simple or :full for simple or full
404     case folding, respectively."
405 ram 1.1 (declare (fixnum start1 start2))
406 rtoy 1.12.30.31 (handle-case-fold-equal string-equal
407     (with-two-strings string1 string2 start1 end1 offset1 start2 end2
408     (let ((slen1 (- (the fixnum end1) start1))
409     (slen2 (- (the fixnum end2) start2)))
410     (declare (fixnum slen1 slen2))
411     (if (or (minusp slen1) (minusp slen2))
412     ;;prevent endless looping later.
413     (error "Improper bounds for string comparison."))
414     (if (= slen1 slen2)
415     ;;return () immediately if lengths aren't equal.
416     (string-not-equal-loop 1 t nil))))))
417 ram 1.1
418 rtoy 1.12.30.31 (defun string-not-equal (string1 string2 &key (start1 0) end1 (start2 0) end2 #+unicode (casing :simple))
419 rtoy 1.12.30.32 #-unicode
420 ram 1.1 "Given two strings, if the first string is not lexicographically equal
421     to the second string, returns the longest common prefix (using char-equal)
422     of the two strings. Otherwise, returns ()."
423 rtoy 1.12.30.32 #-unicode
424     "Given two strings, if the first string is not lexicographically
425     equal to the second string, returns the longest common prefix of the
426     two strings. Otherwise, returns (). Casing is :simple or :full for
427     simple or full case folding."
428 rtoy 1.12.30.31 (handle-case-fold-equal string-not-equal
429     (with-two-strings string1 string2 start1 end1 offset1 start2 end2
430     (let ((slen1 (- end1 start1))
431     (slen2 (- end2 start2)))
432     (declare (fixnum slen1 slen2))
433     (if (or (minusp slen1) (minusp slen2))
434     ;;prevent endless looping later.
435     (error "Improper bounds for string comparison."))
436     (cond ((or (minusp slen1) (or (minusp slen2)))
437     (error "Improper substring for comparison."))
438     ((= slen1 slen2)
439     (string-not-equal-loop 1 nil (- index1 offset1)))
440     ((< slen1 slen2)
441     (string-not-equal-loop 1 (- index1 offset1)))
442     (t
443     (string-not-equal-loop 2 (- index1 offset1))))))))
444 ram 1.1
445    
446    
447     (eval-when (compile eval)
448    
449     ;;; STRING-LESS-GREATER-EQUAL-TESTS returns a test on the lengths of string1
450     ;;; and string2 and a test on the current characters from string1 and string2
451     ;;; for the following macro.
452     (defun string-less-greater-equal-tests (lessp equalp)
453     (if lessp
454     (if equalp
455     ;; STRING-NOT-GREATERP
456 rtoy 1.12.30.15 (values '<=
457     #-unicode `(not (char-greaterp char1 char2))
458     #+unicode `(<= char1 char2))
459 ram 1.1 ;; STRING-LESSP
460 rtoy 1.12.30.15 (values '<
461     #-unicode `(char-lessp char1 char2)
462     #+unicode `(< char1 char2)))
463 ram 1.1 (if equalp
464     ;; STRING-NOT-LESSP
465 rtoy 1.12.30.15 (values '>=
466     #-unicode `(not (char-lessp char1 char2))
467     #+unicode `(>= char1 char2))
468 ram 1.1 ;; STRING-GREATERP
469 rtoy 1.12.30.15 (values '>
470     #-unicode `(char-greaterp char1 char2)
471     #+unicode `(> char1 char2)))))
472 ram 1.1
473 rtoy 1.12.30.15 #-unicode
474 ram 1.1 (defmacro string-less-greater-equal (lessp equalp)
475     (multiple-value-bind (length-test character-test)
476     (string-less-greater-equal-tests lessp equalp)
477 ram 1.4 `(with-two-strings string1 string2 start1 end1 offset1 start2 end2
478     (let ((slen1 (- (the fixnum end1) start1))
479     (slen2 (- (the fixnum end2) start2)))
480     (declare (fixnum slen1 slen2))
481     (if (or (minusp slen1) (minusp slen2))
482     ;;prevent endless looping later.
483     (error "Improper bounds for string comparison."))
484     (do ((index1 start1 (1+ index1))
485     (index2 start2 (1+ index2))
486     (char1)
487     (char2))
488     ((or (= index1 (the fixnum end1)) (= index2 (the fixnum end2)))
489     (if (,length-test slen1 slen2) (- index1 offset1)))
490     (declare (fixnum index1 index2))
491     (setq char1 (schar string1 index1))
492     (setq char2 (schar string2 index2))
493     (if (not (char-equal char1 char2))
494     (if ,character-test
495     (return (- index1 offset1))
496     (return ()))))))))
497 ram 1.1
498 rtoy 1.12.30.18 ;; Convert to lowercase for case folding, to match what Unicode
499     ;; CaseFolding.txt says. An example where this matters: U+1E9E maps
500     ;; to U+00DF. But the uppercase version of U+00DF is U+00DF.
501 rtoy 1.12.30.15 #+unicode
502     (defmacro equal-char-codepoint (codepoint)
503     `(let ((ch ,codepoint))
504 rtoy 1.12.30.18 ;; Handle ASCII separately for bootstrapping and for unidata missing.
505     (if (< 64 ch 91)
506     (+ ch 32)
507 rtoy 1.12.30.15 #-(and unicode (not unicode-bootstrap))
508     ch
509     #+(and unicode (not unicode-bootstrap))
510 rtoy 1.12.30.31 (if (> ch 127) (unicode-case-fold-simple ch) ch))))
511 rtoy 1.12.30.15
512     #+unicode
513     (defmacro string-less-greater-equal (lessp equalp)
514     (multiple-value-bind (length-test character-test)
515     (string-less-greater-equal-tests lessp equalp)
516     `(with-two-strings string1 string2 start1 end1 offset1 start2 end2
517     (let ((slen1 (- (the fixnum end1) start1))
518     (slen2 (- (the fixnum end2) start2)))
519     (declare (fixnum slen1 slen2))
520     (if (or (minusp slen1) (minusp slen2))
521     ;;prevent endless looping later.
522     (error "Improper bounds for string comparison."))
523     (do ((index1 start1 (1+ index1))
524     (index2 start2 (1+ index2)))
525     ((or (= index1 (the fixnum end1)) (= index2 (the fixnum end2)))
526     (if (,length-test slen1 slen2) (- index1 offset1)))
527     (declare (fixnum index1 index2))
528     (multiple-value-bind (char1 wide1)
529     (codepoint string1 index1)
530 rtoy 1.12.30.26 (declare (type codepoint char1))
531 rtoy 1.12.30.15 (multiple-value-bind (char2 wide2)
532     (codepoint string2 index2)
533 rtoy 1.12.30.26 (declare (type codepoint char2))
534 rtoy 1.12.30.18 (setf char1 (equal-char-codepoint char1))
535     (setf char2 (equal-char-codepoint char2))
536     (if (= char1 char2)
537 rtoy 1.12.30.15 (progn
538     (when wide1 (incf index1))
539     (when wide2 (incf index2)))
540     (if ,character-test
541     (return (- index1 offset1))
542     (return ()))))))))))
543    
544 ram 1.1 ) ; eval-when
545    
546     (defun string-lessp* (string1 string2 start1 end1 start2 end2)
547     (declare (fixnum start1 start2))
548     (string-less-greater-equal t nil))
549    
550     (defun string-greaterp* (string1 string2 start1 end1 start2 end2)
551     (declare (fixnum start1 start2))
552     (string-less-greater-equal nil nil))
553    
554     (defun string-not-lessp* (string1 string2 start1 end1 start2 end2)
555     (declare (fixnum start1 start2))
556     (string-less-greater-equal nil t))
557    
558     (defun string-not-greaterp* (string1 string2 start1 end1 start2 end2)
559     (declare (fixnum start1 start2))
560     (string-less-greater-equal t t))
561    
562 rtoy 1.12.30.31
563     (eval-when (compile)
564    
565     #+unicode
566     (defmacro handle-case-folding (f)
567 rtoy 1.12.30.32 `(ecase casing
568     (:simple
569     (,f string1 string2 start1 end1 start2 end2))
570     (:full
571 rtoy 1.12.30.31 (let* ((s1 (case-fold string1 start1 end1))
572     (s2 (case-fold string2 start2 end2))
573     (result (,f s1 s2 0 (length s1) 0 (length s2))))
574     (when result
575 rtoy 1.12.30.32 (+ result start1))))))
576 rtoy 1.12.30.31
577     #-unicode
578     (defmacro handle-case-folding (f)
579     `(,f string1 string2 start1 end1 start2 end2))
580    
581     ) ; compile
582    
583     (defun string-lessp (string1 string2 &key (start1 0) end1 (start2 0) end2 #+unicode (casing :simple))
584 rtoy 1.12.30.32 #-unicode
585 ram 1.1 "Given two strings, if the first string is lexicographically less than
586     the second string, returns the longest common prefix (using char-equal)
587     of the two strings. Otherwise, returns ()."
588 rtoy 1.12.30.32 #+unicode
589     "Given two strings, if the first string is lexicographically less
590     than the second string, returns the longest common prefix of the two
591     strings. Otherwise, returns (). Casing is :simple or :full for
592     simple or full case folding, respectively."
593    
594 rtoy 1.12.30.31 (handle-case-folding string-lessp*))
595 ram 1.1
596 rtoy 1.12.30.31 (defun string-greaterp (string1 string2 &key (start1 0) end1 (start2 0) end2 #+unicode (casing :simple))
597 rtoy 1.12.30.32 #-unicode
598 ram 1.1 "Given two strings, if the first string is lexicographically greater than
599     the second string, returns the longest common prefix (using char-equal)
600     of the two strings. Otherwise, returns ()."
601 rtoy 1.12.30.32 #+unicode
602     "Given two strings, if the first string is lexicographically greater
603     than the second string, returns the longest common prefix of the two
604     strings. Otherwise, returns (). Casing is :simple or :full for
605     simple or full case folding, respectively."
606 rtoy 1.12.30.31 (handle-case-folding string-greaterp*))
607 ram 1.1
608 rtoy 1.12.30.31 (defun string-not-lessp (string1 string2 &key (start1 0) end1 (start2 0) end2 #+unicode (casing :simple))
609 rtoy 1.12.30.32 #-unicode
610 ram 1.1 "Given two strings, if the first string is lexicographically greater
611     than or equal to the second string, returns the longest common prefix
612     (using char-equal) of the two strings. Otherwise, returns ()."
613 rtoy 1.12.30.32 #+unicode
614     "Given two strings, if the first string is lexicographically greater
615     than or equal to the second string, returns the longest common
616     prefix of the two strings. Otherwise, returns (). Casing is :simple
617     or :full for simple or full case folding, respectively."
618 rtoy 1.12.30.31 (handle-case-folding string-not-lessp*))
619 ram 1.1
620     (defun string-not-greaterp (string1 string2 &key (start1 0) end1 (start2 0)
621 rtoy 1.12.30.31 end2 #+unicode (casing :simple))
622 rtoy 1.12.30.32 #-unicode
623 ram 1.1 "Given two strings, if the first string is lexicographically less than
624     or equal to the second string, returns the longest common prefix
625     (using char-equal) of the two strings. Otherwise, returns ()."
626 rtoy 1.12.30.32 #+unicode
627     "Given two strings, if the first string is lexicographically less
628     than or equal to the second string, returns the longest common
629     prefix of the two strings. Otherwise, returns (). Casing is :simple
630     or :full for simple or full case folding, respectively."
631 rtoy 1.12.30.31 (handle-case-folding string-not-greaterp*))
632 ram 1.1
633    
634 dtc 1.9 (defun make-string (count &key element-type ((:initial-element fill-char)))
635 ram 1.1 "Given a character count and an optional fill character, makes and returns
636 rtoy 1.12.30.14 a new string Count long filled with the fill character."
637 emarsden 1.12 (declare (type fixnum count))
638     (assert (subtypep element-type 'character))
639 ram 1.1 (if fill-char
640     (do ((i 0 (1+ i))
641     (string (make-string count)))
642     ((= i count) string)
643     (declare (fixnum i))
644     (setf (schar string i) fill-char))
645     (make-string count)))
646    
647 rtoy 1.12.30.29 (defun string-upcase-simple (string &key (start 0) end)
648 ram 1.1 (declare (fixnum start))
649 ram 1.5 (let* ((string (if (stringp string) string (string string)))
650     (slen (length string)))
651 ram 1.4 (declare (fixnum slen))
652 ram 1.1 (with-one-string string start end offset
653     (let ((offset-slen (+ slen offset))
654     (newstring (make-string slen)))
655     (declare (fixnum offset-slen))
656     (do ((index offset (1+ index))
657     (new-index 0 (1+ new-index)))
658     ((= index start))
659     (declare (fixnum index new-index))
660     (setf (schar newstring new-index) (schar string index)))
661     (do ((index start (1+ index))
662     (new-index (- start offset) (1+ new-index)))
663     ((= index (the fixnum end)))
664     (declare (fixnum index new-index))
665 rtoy 1.12.30.6 (multiple-value-bind (code wide) (codepoint string index)
666     (when wide (incf index))
667     ;; Handle ASCII specially because this is called early in
668     ;; initialization, before unidata is available.
669     (cond ((< 96 code 123) (decf code 32))
670     ((> code 127) (setq code (unicode-upper code))))
671     ;;@@ WARNING: this may, in theory, need to extend newstring
672     ;; but that never actually occurs as of Unicode 5.1.0,
673     ;; so I'm just going to ignore it for now...
674     (multiple-value-bind (hi lo) (surrogates code)
675 rtoy 1.12.30.8 (setf (schar newstring new-index) hi)
676 rtoy 1.12.30.6 (when lo
677 rtoy 1.12.30.8 (setf (schar newstring (incf new-index)) lo)))))
678 rtoy 1.12.30.4 ;;@@ WARNING: see above
679 ram 1.1 (do ((index end (1+ index))
680     (new-index (- (the fixnum end) offset) (1+ new-index)))
681     ((= index offset-slen))
682     (declare (fixnum index new-index))
683     (setf (schar newstring new-index) (schar string index)))
684     newstring))))
685    
686 rtoy 1.12.30.29 (defun string-upcase-full (string &key (start 0) end)
687     (declare (fixnum start))
688     (let* ((string (if (stringp string) string (string string)))
689     (slen (length string)))
690     (declare (fixnum slen))
691     (with-output-to-string (s)
692     (with-one-string string start end offset
693     (let ((offset-slen (+ slen offset)))
694     (declare (fixnum offset-slen))
695     (write-string string s :start offset :end start)
696     (do ((index start (1+ index)))
697     ((= index (the fixnum end)))
698     (declare (fixnum index))
699     (multiple-value-bind (code wide)
700     (codepoint string index)
701     (when wide (incf index))
702     ;; Handle ASCII specially because this is called early in
703     ;; initialization, before unidata is available.
704     (cond ((< 96 code 123)
705     (write-char (code-char (decf code 32)) s))
706     ((> code 127)
707     (write-string (unicode-full-case-upper code) s))
708     (t
709     (multiple-value-bind (hi lo)
710     (surrogates code)
711     (write-char hi s)
712     (when lo
713     (write-char lo s)))))))
714     (write-string string s :start end :end offset-slen))))))
715    
716     (defun string-upcase (string &key (start 0) end #+unicode (casing :simple))
717 rtoy 1.12.30.32 #-unicode
718     "Given a string, returns a new string that is a copy of it with all
719     lower case alphabetic characters converted to uppercase."
720     #+unicode
721 rtoy 1.12.30.29 "Given a string, returns a new string that is a copy of it with all
722 rtoy 1.12.30.32 lower case alphabetic characters converted to uppercase. Casing is
723     :simple or :full for simple or full case conversion, respectively."
724 rtoy 1.12.30.29 (declare (fixnum start))
725     #-unicode
726     (string-upcase-simple string :start start :end end)
727     #+unicode
728     (if (eq casing :simple)
729     (string-upcase-simple string :start start :end end)
730     (string-upcase-full string :start start :end end)))
731    
732     (defun string-downcase-simple (string &key (start 0) end)
733 ram 1.1 (declare (fixnum start))
734 ram 1.5 (let* ((string (if (stringp string) string (string string)))
735     (slen (length string)))
736 ram 1.4 (declare (fixnum slen))
737 ram 1.1 (with-one-string string start end offset
738     (let ((offset-slen (+ slen offset))
739     (newstring (make-string slen)))
740     (declare (fixnum offset-slen))
741     (do ((index offset (1+ index))
742     (new-index 0 (1+ new-index)))
743     ((= index start))
744     (declare (fixnum index new-index))
745     (setf (schar newstring new-index) (schar string index)))
746     (do ((index start (1+ index))
747     (new-index (- start offset) (1+ new-index)))
748     ((= index (the fixnum end)))
749     (declare (fixnum index new-index))
750 rtoy 1.12.30.6 (multiple-value-bind (code wide) (codepoint string index)
751     (when wide (incf index))
752     ;; Handle ASCII specially because this is called early in
753     ;; initialization, before unidata is available.
754     (cond ((< 64 code 91) (incf code 32))
755     ((> code 127) (setq code (unicode-lower code))))
756     ;;@@ WARNING: this may, in theory, need to extend newstring
757     ;; but that never actually occurs as of Unicode 5.1.0,
758     ;; so I'm just going to ignore it for now...
759     (multiple-value-bind (hi lo) (surrogates code)
760 rtoy 1.12.30.8 (setf (schar newstring new-index) hi)
761 rtoy 1.12.30.6 (when lo
762 rtoy 1.12.30.8 (setf (schar newstring (incf new-index)) lo)))))
763 rtoy 1.12.30.4 ;;@@ WARNING: see above
764 ram 1.1 (do ((index end (1+ index))
765     (new-index (- (the fixnum end) offset) (1+ new-index)))
766     ((= index offset-slen))
767     (declare (fixnum index new-index))
768     (setf (schar newstring new-index) (schar string index)))
769     newstring))))
770    
771 rtoy 1.12.30.29 (defun string-downcase-full (string &key (start 0) end)
772     (declare (fixnum start))
773     (let* ((string (if (stringp string) string (string string)))
774     (slen (length string)))
775     (declare (fixnum slen))
776     (with-output-to-string (s)
777     (with-one-string string start end offset
778     (let ((offset-slen (+ slen offset)))
779     (declare (fixnum offset-slen))
780     (write-string string s :start offset :end start)
781     (do ((index start (1+ index)))
782     ((= index (the fixnum end)))
783     (declare (fixnum index))
784     (multiple-value-bind (code wide)
785     (codepoint string index)
786     (when wide (incf index))
787     ;; Handle ASCII specially because this is called early in
788     ;; initialization, before unidata is available.
789     (cond ((< 64 code 91)
790     (write-char (code-char (incf code 32)) s))
791     ((> code 127)
792     (write-string (unicode-full-case-lower code) s))
793     (t
794     (multiple-value-bind (hi lo)
795     (surrogates code)
796     (write-char hi s)
797     (when lo
798     (write-char lo s)))))))
799     (write-string string s :start end :end offset-slen))))))
800    
801     (defun string-downcase (string &key (start 0) end #+unicode (casing :simple))
802 rtoy 1.12.30.32 #-unicode
803     "Given a string, returns a new string that is a copy of it with all
804     upper case alphabetic characters converted to lowercase."
805     #+unicode
806 rtoy 1.12.30.29 "Given a string, returns a new string that is a copy of it with all
807 rtoy 1.12.30.32 upper case alphabetic characters converted to lowercase. Casing is
808     :simple or :full for simple or full case conversion, respectively."
809 rtoy 1.12.30.29 (declare (fixnum start))
810     #-unicode
811     (string-downcase-simple string :start start :end end)
812     #+unicode
813     (if (eq casing :simple)
814     (string-downcase-simple string :start start :end end)
815     (string-downcase-full string :start start :end end)))
816    
817     (defun string-capitalize-simple (string &key (start 0) end)
818 ram 1.1 (declare (fixnum start))
819 ram 1.5 (let* ((string (if (stringp string) string (string string)))
820     (slen (length string)))
821 ram 1.4 (declare (fixnum slen))
822 ram 1.1 (with-one-string string start end offset
823     (let ((offset-slen (+ slen offset))
824     (newstring (make-string slen)))
825     (declare (fixnum offset-slen))
826     (do ((index offset (1+ index))
827     (new-index 0 (1+ new-index)))
828     ((= index start))
829     (declare (fixnum index new-index))
830     (setf (schar newstring new-index) (schar string index)))
831     (do ((index start (1+ index))
832     (new-index (- start offset) (1+ new-index))
833     (newword t)
834     (char ()))
835     ((= index (the fixnum end)))
836     (declare (fixnum index new-index))
837     (setq char (schar string index))
838     (cond ((not (alphanumericp char))
839     (setq newword t))
840     (newword
841     ;;char is first case-modifiable after non-case-modifiable
842 rtoy 1.12.30.2 (setq char (char-titlecase char))
843 ram 1.1 (setq newword ()))
844     ;;char is case-modifiable, but not first
845     (t (setq char (char-downcase char))))
846     (setf (schar newstring new-index) char))
847     (do ((index end (1+ index))
848     (new-index (- (the fixnum end) offset) (1+ new-index)))
849     ((= index offset-slen))
850     (declare (fixnum index new-index))
851     (setf (schar newstring new-index) (schar string index)))
852     newstring))))
853    
854 rtoy 1.12.30.29 (defun string-capitalize-full (string &key (start 0) end)
855     (declare (fixnum start))
856     (let* ((string (if (stringp string) string (string string)))
857     (slen (length string)))
858     (declare (fixnum slen))
859     (with-output-to-string (s)
860     (with-one-string string start end offset
861     (let ((offset-slen (+ slen offset)))
862     (declare (fixnum offset-slen))
863     (write-string string s :start offset :end start)
864     (flet ((alphanump (m)
865     (or (< 47 m 58) (< 64 m 91) (< 96 m 123)
866     #+(and unicode (not unicode-bootstrap))
867     (and (> m 127)
868     (<= +unicode-category-letter+ (unicode-category m)
869     (+ +unicode-category-letter+ #x0F))))))
870     (do ((index start (1+ index))
871     (newword t))
872     ((= index (the fixnum end)))
873     (declare (fixnum index))
874     (multiple-value-bind (code wide)
875     (codepoint string index)
876     (when wide (incf index))
877     (cond ((not (alphanump code))
878     (multiple-value-bind (hi lo)
879     (surrogates code)
880     (write-char hi s)
881     (when lo (write-char lo s)))
882     (setq newword t))
883     (newword
884     ;;char is first case-modifiable after non-case-modifiable
885     (write-string (unicode-full-case-title code) s)
886     (setq newword ()))
887     ;;char is case-modifiable, but not first
888     (t
889     (write-string (unicode-full-case-lower code) s))))))
890     (write-string string s :start end :end offset-slen))))))
891    
892     (defun string-capitalize (string &key (start 0) end #+unicode (casing :simple))
893 rtoy 1.12.30.32 #-unicode
894 rtoy 1.12.30.29 "Given a string, returns a copy of the string with the first
895     character of each ``word'' converted to upper-case, and remaining
896     chars in the word converted to lower case. A ``word'' is defined
897     to be a string of case-modifiable characters delimited by
898     non-case-modifiable chars."
899 rtoy 1.12.30.32 #+unicode
900     "Given a string, returns a copy of the string with the first
901     character of each ``word'' converted to upper-case, and remaining
902     chars in the word converted to lower case. A ``word'' is defined
903     to be a string of case-modifiable characters delimited by
904     non-case-modifiable chars. Casing is :simple or :full for
905     simple or full case conversion, respectively."
906    
907 rtoy 1.12.30.29 (declare (fixnum start))
908     #-unicode
909     (string-capitalize-simple string :start start :end end)
910     #+unicode
911     (if (eq casing :simple)
912     (string-capitalize-simple string :start start :end end)
913     (string-capitalize-full string :start start :end end)))
914    
915 ram 1.1 (defun nstring-upcase (string &key (start 0) end)
916     "Given a string, returns that string with all lower case alphabetic
917     characters converted to uppercase."
918     (declare (fixnum start))
919 ram 1.4 (let ((save-header string))
920 ram 1.1 (with-one-string string start end offset
921     (do ((index start (1+ index)))
922     ((= index (the fixnum end)))
923     (declare (fixnum index))
924 rtoy 1.12.30.6 (multiple-value-bind (code wide) (codepoint string index)
925     (declare (ignore wide))
926     ;; Handle ASCII specially because this is called early in
927     ;; initialization, before unidata is available.
928     (cond ((< 96 code 123) (decf code 32))
929     ((> code 127) (setq code (unicode-upper code))))
930     ;;@@ WARNING: this may, in theory, need to extend string
931     ;; (which, obviously, we can't do here. Unless
932     ;; STRING is adjustable, maybe)
933     ;; but that never actually occurs as of Unicode 5.1.0,
934     ;; so I'm just going to ignore it for now...
935     (multiple-value-bind (hi lo) (surrogates code)
936 rtoy 1.12.30.8 (setf (schar string index) hi)
937 rtoy 1.12.30.6 (when lo
938 rtoy 1.12.30.8 (setf (schar string (incf index)) lo))))))
939 ram 1.1 save-header))
940    
941     (defun nstring-downcase (string &key (start 0) end)
942     "Given a string, returns that string with all upper case alphabetic
943     characters converted to lowercase."
944     (declare (fixnum start))
945 ram 1.4 (let ((save-header string))
946 ram 1.1 (with-one-string string start end offset
947     (do ((index start (1+ index)))
948     ((= index (the fixnum end)))
949     (declare (fixnum index))
950 rtoy 1.12.30.6 (multiple-value-bind (code wide) (codepoint string index)
951     (declare (ignore wide))
952     (cond ((< 64 code 91) (incf code 32))
953     ((> code 127) (setq code (unicode-lower code))))
954     ;;@@ WARNING: this may, in theory, need to extend string
955     ;; (which, obviously, we can't do here. Unless
956     ;; STRING is adjustable, maybe)
957     ;; but that never actually occurs as of Unicode 5.1.0,
958     ;; so I'm just going to ignore it for now...
959     (multiple-value-bind (hi lo) (surrogates code)
960 rtoy 1.12.30.8 (setf (schar string index) hi)
961 rtoy 1.12.30.6 (when lo
962 rtoy 1.12.30.8 (setf (schar string (incf index)) lo))))))
963 wlott 1.6 save-header))
964 ram 1.1
965     (defun nstring-capitalize (string &key (start 0) end)
966     "Given a string, returns that string with the first
967     character of each ``word'' converted to upper-case, and remaining
968     chars in the word converted to lower case. A ``word'' is defined
969     to be a string of case-modifiable characters delimited by
970     non-case-modifiable chars."
971     (declare (fixnum start))
972 ram 1.4 (let ((save-header string))
973 ram 1.1 (with-one-string string start end offset
974     (do ((index start (1+ index))
975     (newword t)
976     (char ()))
977     ((= index (the fixnum end)))
978     (declare (fixnum index))
979     (setq char (schar string index))
980     (cond ((not (alphanumericp char))
981     (setq newword t))
982     (newword
983     ;;char is first case-modifiable after non-case-modifiable
984 rtoy 1.12.30.2 (setf (schar string index) (char-titlecase char))
985 ram 1.1 (setq newword ()))
986     (t
987     (setf (schar string index) (char-downcase char))))))
988     save-header))
989    
990 rtoy 1.12.30.30
991     #+unicode
992     (progn
993     ;; Like string-left-trim, but return the index
994     (defun string-left-trim-index (char-bag string)
995     (with-string string
996     (if (stringp char-bag)
997     ;; When char-bag is a string, we try to do the right thing.
998     ;; Convert char-bag to a list of codepoints and compare the
999     ;; codepoints in the string with this.
1000     (let ((code-bag (with-string char-bag
1001     (do ((index start (1+ index))
1002     (result nil))
1003     ((= index end)
1004     (nreverse result))
1005     (multiple-value-bind (c widep)
1006     (codepoint char-bag index)
1007     (push c result)
1008     (when widep (incf index)))))))
1009     (do ((index start (1+ index)))
1010     ((= index (the fixnum end))
1011     end)
1012     (declare (fixnum index))
1013     (multiple-value-bind (c widep)
1014     (codepoint string index)
1015     (unless (find c code-bag)
1016     (return-from string-left-trim-index index))
1017     (when widep (incf index)))))
1018     ;; When char-bag is a list, we just look at each codepoint of
1019     ;; STRING to see if it's in char-bag. If char-bag contains a
1020     ;; surrogate, we could accidentally trim off a surrogate,
1021     ;; leaving an invalid UTF16 string.
1022     (do ((index start (1+ index)))
1023     ((= index (the fixnum end))
1024     end)
1025     (declare (fixnum index))
1026     (multiple-value-bind (c widep)
1027     (codepoint string index)
1028     (unless (find c char-bag :key #'char-code)
1029     (return-from string-left-trim-index index))
1030     (when widep (incf index)))))))
1031    
1032     (defun string-left-trim (char-bag string)
1033     "Given a set of characters (a list or string) and a string, returns
1034     a copy of the string with the characters in the set removed from the
1035     left end. If the set of characters is a string, surrogates will be
1036     properly handled."
1037     (let ((begin (string-left-trim-index char-bag string)))
1038     (with-string string
1039     (subseq string begin end))))
1040    
1041     (defun string-right-trim-index (char-bag string)
1042     (with-string string
1043     (if (stringp char-bag)
1044     ;; When char-bag is a string, we try to do the right thing
1045     ;; with surrogates. Convert char-bag to a list of codepoints
1046     ;; and compare the codepoints in the string with this.
1047     (let ((code-bag (with-string char-bag
1048     (do ((index start (1+ index))
1049     (result nil))
1050     ((= index end)
1051     result)
1052     (multiple-value-bind (c widep)
1053     (codepoint char-bag index)
1054     (push c result)
1055     (when widep (incf index)))))))
1056     (do ((index (1- end) (1- index)))
1057     ((< index start)
1058     start)
1059     (declare (fixnum index))
1060     (multiple-value-bind (c widep)
1061     (codepoint string index)
1062     (unless (find c code-bag)
1063     (return-from string-right-trim-index (1+ index)))
1064     (when widep (decf index)))))
1065     ;; When char-bag is a list, we just look at each codepoint of
1066     ;; STRING to see if it's in char-bag. If char-bag contains a
1067     ;; surrogate, we could accidentally trim off a surrogate,
1068     ;; leaving an invalid UTF16 string.
1069     (do ((index (1- end) (1- index)))
1070     ((< index start)
1071     start)
1072     (declare (fixnum index))
1073     (multiple-value-bind (c widep)
1074     (codepoint string index)
1075     (unless (find c char-bag :key #'char-code)
1076     (return-from string-right-trim-index (1+ index)))
1077     (when widep (decf index)))))))
1078    
1079     (defun string-right-trim (char-bag string)
1080     "Given a set of characters (a list or string) and a string, returns
1081     a copy of the string with the characters in the set removed from the
1082     right end. If the set of characters is a string, surrogates will be
1083     properly handled."
1084     (let ((stop (string-right-trim-index char-bag string)))
1085     (with-string string
1086     (subseq string start stop))))
1087    
1088     (defun string-trim (char-bag string)
1089     "Given a set of characters (a list or string) and a string, returns a
1090     copy of the string with the characters in the set removed from both
1091     ends. If the set of characters is a string, surrogates will be
1092     properly handled."
1093     (let ((left-end (string-left-trim-index char-bag string))
1094     (right-end (string-right-trim-index char-bag string)))
1095     (with-string string
1096     (subseq (the simple-string string) left-end right-end))))
1097     ) ; end unicode version
1098    
1099     #-unicode
1100     (progn
1101 ram 1.1 (defun string-left-trim (char-bag string)
1102     "Given a set of characters (a list or string) and a string, returns
1103     a copy of the string with the characters in the set removed from the
1104     left end."
1105     (with-string string
1106     (do ((index start (1+ index)))
1107     ((or (= index (the fixnum end))
1108     (not (find (schar string index) char-bag)))
1109     (subseq (the simple-string string) index end))
1110     (declare (fixnum index)))))
1111    
1112     (defun string-right-trim (char-bag string)
1113     "Given a set of characters (a list or string) and a string, returns
1114     a copy of the string with the characters in the set removed from the
1115     right end."
1116     (with-string string
1117     (do ((index (1- (the fixnum end)) (1- index)))
1118     ((or (< index start) (not (find (schar string index) char-bag)))
1119     (subseq (the simple-string string) start (1+ index)))
1120     (declare (fixnum index)))))
1121    
1122     (defun string-trim (char-bag string)
1123     "Given a set of characters (a list or string) and a string, returns a
1124     copy of the string with the characters in the set removed from both
1125     ends."
1126     (with-string string
1127     (let* ((left-end (do ((index start (1+ index)))
1128     ((or (= index (the fixnum end))
1129     (not (find (schar string index) char-bag)))
1130     index)
1131     (declare (fixnum index))))
1132     (right-end (do ((index (1- (the fixnum end)) (1- index)))
1133     ((or (< index left-end)
1134     (not (find (schar string index) char-bag)))
1135     (1+ index))
1136     (declare (fixnum index)))))
1137     (subseq (the simple-string string) left-end right-end))))
1138 rtoy 1.12.30.30 ) ; non-unicode version
1139 rtoy 1.12.30.3
1140 rtoy 1.12.30.4 (declaim (inline %glyph-f %glyph-b))
1141     (defun %glyph-f (string index)
1142     (declare (optimize (speed 3) (space 0) (safety 0) (debug 0))
1143     (type simple-string string) (type kernel:index index))
1144 rtoy 1.12.30.10 (let* ((prev 0)
1145     (l (length string))
1146     (c (codepoint string index l))
1147     (n (+ index (if (> c #xFFFF) 2 1))))
1148 rtoy 1.12.30.26 (declare (type codepoint c) (type kernel:index l n))
1149 rtoy 1.12.30.10 (loop while (< n l) do
1150     (let* ((c (codepoint string n l))
1151     (d (the (unsigned-byte 8) (unicode-combining-class c))))
1152     (when (or (zerop d) (< d prev))
1153     (return))
1154     (setq prev d)
1155     (incf n (if (> c #xFFFF) 2 1))))
1156     n))
1157 rtoy 1.12.30.4
1158     (defun %glyph-b (string index)
1159     (declare (optimize (speed 3) (space 0) (safety 0) (debug 0))
1160     (type simple-string string) (type kernel:index index))
1161 rtoy 1.12.30.11 (let* ((prev 255)
1162     (n (1- index)))
1163     (declare (type kernel:index n))
1164     (loop until (< n 0) do
1165     (let* ((c (codepoint string n 0))
1166     (d (the (unsigned-byte 8) (unicode-combining-class c))))
1167     (cond ((zerop d) (return))
1168     ((> d prev) (incf n (if (> c #xFFFF) 2 1)) (return)))
1169     (setq prev d)
1170     (decf n (if (> c #xFFFF) 2 1))))
1171     n))
1172 rtoy 1.12.30.4
1173     (defun glyph (string index &key (from-end nil))
1174 rtoy 1.12.30.3 "GLYPH returns the glyph at the indexed position in a string, and the
1175 rtoy 1.12.30.4 position of the next glyph (or NIL) as a second value. A glyph is
1176     a substring consisting of the character at INDEX followed by all
1177     subsequent combining characters."
1178 rtoy 1.12.30.3 (declare (type simple-string string) (type kernel:index index))
1179     #-unicode
1180     (char string index)
1181     #+unicode
1182     (with-array-data ((string string) (start) (end))
1183     (declare (ignore start end))
1184 rtoy 1.12.30.4 (let ((n (if from-end (%glyph-b string index) (%glyph-f string index))))
1185     (if from-end
1186     (values (subseq string n index) (and (> n 0) n))
1187     (values (subseq string index n) (and (< n (length string)) n))))))
1188 rtoy 1.12.30.3
1189 rtoy 1.12.30.4 (defun sglyph (string index &key (from-end nil))
1190 rtoy 1.12.30.3 "SGLYPH returns the glyph at the indexed position, the same as GLYPH,
1191 rtoy 1.12.30.4 except that the string must be a simple-string"
1192 rtoy 1.12.30.3 (declare (type simple-string string) (type kernel:index index))
1193     #-unicode
1194     (schar string index)
1195     #+unicode
1196 rtoy 1.12.30.4 (let ((n (if from-end (%glyph-b string index) (%glyph-f string index))))
1197     (if from-end
1198     (values (subseq string n index) (and (> n 0) n))
1199     (values (subseq string index n) (and (< n (length string)) n)))))
1200    
1201 rtoy 1.12.30.25 #+unicode
1202     (defun string-reverse* (sequence)
1203 rtoy 1.12.30.26 (declare (optimize (speed 3) (space 0) (safety 0))
1204 rtoy 1.12.30.25 (type string sequence))
1205     (with-string sequence
1206     (let* ((length (- end start))
1207     (string (make-string length))
1208     (j length))
1209     (declare (type kernel:index length j))
1210     (loop for i = start then n as n = (%glyph-f sequence i) do
1211     (replace string sequence :start1 (decf j (- n i)) :start2 i :end2 n)
1212     while (< n end))
1213     string)))
1214    
1215     #+unicode
1216     (defun string-nreverse* (sequence)
1217 rtoy 1.12.30.26 (declare (optimize (speed 3) (space 0) (safety 0))
1218 rtoy 1.12.30.25 (type string sequence))
1219     (with-string sequence
1220     (flet ((rev (start end)
1221     (do ((i start (1+ i))
1222     (j (1- end) (1- j)))
1223     ((>= i j))
1224     (declare (type kernel:index i j))
1225     (rotatef (schar sequence i) (schar sequence j)))))
1226     (let ((len end))
1227     (loop for i = start then n as n = (%glyph-f sequence i) do
1228     (rev i n) while (< n len))
1229     (rev start end))))
1230     sequence)
1231    
1232    
1233    
1234    
1235 rtoy 1.12.30.9 (defun decompose (string &optional (compatibility t))
1236     (declare (type string string))
1237     (let ((result (make-string (cond ((< (length string) 40)
1238     (* 5 (length string)))
1239     ((< (length string) 4096)
1240     (* 2 (length string)))
1241     (t (round (length string) 5/6)))))
1242 rtoy 1.12.30.10 (fillptr 0))
1243     (declare (type kernel:index fillptr))
1244 rtoy 1.12.30.27 (labels ((rec (string start end)
1245 rtoy 1.12.30.9 (declare (type simple-string string))
1246 rtoy 1.12.30.27 (do ((i start (1+ i)))
1247     ((= i end))
1248 rtoy 1.12.30.9 (declare (type kernel:index i))
1249     (multiple-value-bind (code wide) (codepoint string i)
1250     (when wide (incf i))
1251     (let ((decomp (unicode-decomp code compatibility)))
1252 rtoy 1.12.30.27 (if decomp (rec decomp 0 (length decomp)) (out code))))))
1253 rtoy 1.12.30.9 (out (code)
1254     (multiple-value-bind (hi lo) (surrogates code)
1255     (outch hi)
1256     (when lo
1257 rtoy 1.12.30.10 (outch lo))
1258     (let ((cc (unicode-combining-class code)))
1259     (unless (zerop cc)
1260 rtoy 1.12.30.20 (order lo cc (- fillptr (if lo 3 2)))))))
1261 rtoy 1.12.30.9 (outch (char)
1262     (when (= fillptr (length result))
1263     (let ((tmp (make-string (round (length result) 5/6))))
1264     (replace tmp result)
1265     (setq result tmp)))
1266     (setf (schar result fillptr) char)
1267     (incf fillptr))
1268 rtoy 1.12.30.10 (order (wide1 cc last)
1269     (loop until (minusp last) do
1270     (multiple-value-bind (code2 wide2) (codepoint result last)
1271     (let ((cc2 (unicode-combining-class code2)))
1272     (cond ((zerop cc2) (return))
1273     ((> cc2 cc)
1274     (case (+ (if wide2 2 0) (if wide1 1 0))
1275     (0 (rotatef (schar result last)
1276     (schar result (1+ last))))
1277     (1 (rotatef (schar result last)
1278     (schar result (+ last 1))
1279     (schar result (+ last 2))))
1280     (2 (rotatef (schar result last)
1281     (schar result (1- last))
1282     (schar result (1+ last))))
1283     (3 (rotatef (schar result last)
1284     (schar result (+ last 2)))
1285     (rotatef (schar result (1- last))
1286     (schar result (1+ last)))))
1287     (decf last (if wide2 2 1)))
1288     (t (return))))))))
1289 rtoy 1.12.30.27 (with-string string
1290     (rec string start end))
1291 rtoy 1.12.30.9 (shrink-vector result fillptr))))
1292    
1293 rtoy 1.12.30.21 (declaim (inline normalized-codepoint-p))
1294     (defun normalized-codepoint-p (cp form)
1295     (ecase form
1296     (:nfc (unicode-nfc-qc cp))
1297     (:nfkc (unicode-nfkc-qc cp))
1298     (:nfd (unicode-nfd-qc cp))
1299     (:nfkd (unicode-nfkd-qc cp))))
1300    
1301     ;; Perform check to see if string is already normalized. The Unicode
1302     ;; example can return YES, NO, or MAYBE. For our purposes, only YES
1303     ;; is important, for which we return T. For NO or MAYBE, we return NIL.
1304     (defun normalized-form-p (string &optional (form :nfc))
1305 rtoy 1.12.30.24 (declare (type (member :nfc :nfkc :nfd :nfkd) form)
1306 rtoy 1.12.30.21 (optimize (speed 3)))
1307 rtoy 1.12.30.24 (with-string string
1308     (let ((last-class 0))
1309     (declare (type (integer 0 256) last-class))
1310     (do ((k start (1+ k)))
1311     ((>= k end))
1312     (declare (type kernel:index k))
1313     (multiple-value-bind (ch widep)
1314     (codepoint string k end)
1315     (when widep (incf k))
1316     ;; Handle ASCII specially
1317     (unless (< ch 128)
1318     (let ((class (unicode-combining-class ch)))
1319     (declare (type (unsigned-byte 8) class))
1320     (when (and (> last-class class) (not (zerop class)))
1321     ;; Definitely not normalized
1322     (return-from normalized-form-p nil))
1323     (let ((check (normalized-codepoint-p ch form)))
1324     (unless (eq check :y)
1325     (return-from normalized-form-p nil)))
1326     (setf last-class class)))))
1327     t)))
1328 rtoy 1.12.30.21
1329    
1330     ;; Compose a string in place. The string must already be in decomposed form.
1331     (defun %compose (target)
1332     (declare (type string target)
1333     (optimize (speed 3)))
1334     (let ((len (length target))
1335     (starter-pos 0))
1336     (declare (type kernel:index starter-pos))
1337     (multiple-value-bind (starter-ch wide)
1338     (codepoint target 0 len)
1339     (let ((comp-pos (if wide 2 1))
1340     (last-class (unicode-combining-class starter-ch)))
1341     (declare (type (integer 0 256) last-class)
1342     (type kernel:index comp-pos))
1343     (unless (zerop last-class)
1344     ;; Fix for strings starting with a combining character
1345     (setf last-class 256))
1346     ;; Loop on decomposed characters, combining where possible
1347     (do ((decomp-pos comp-pos (1+ decomp-pos)))
1348     ((>= decomp-pos len))
1349     (declare (type kernel:index decomp-pos))
1350     (multiple-value-bind (ch wide)
1351     (codepoint target decomp-pos len)
1352     (when wide (incf decomp-pos))
1353     (let ((ch-class (unicode-combining-class ch))
1354     (composite (get-pairwise-composition starter-ch ch)))
1355     (declare (type (integer 0 256) ch-class))
1356     (cond ((and composite
1357     (or (< last-class ch-class) (zerop last-class)))
1358     ;; Don't have to worry about surrogate pairs here
1359     ;; because the composite is always in the BMP.
1360     (setf (aref target starter-pos) (code-char composite))
1361     (setf starter-ch composite))
1362     (t
1363     (when (zerop ch-class)
1364     (setf starter-pos comp-pos)
1365     (setf starter-ch ch))
1366     (setf last-class ch-class)
1367     (multiple-value-bind (hi lo)
1368     (surrogates ch)
1369     (setf (aref target comp-pos) hi)
1370     (when lo
1371     (incf comp-pos)
1372     (setf (aref target comp-pos) lo))
1373     (incf comp-pos)))))))
1374     (shrink-vector target comp-pos)))))
1375    
1376 rtoy 1.12.30.4 (defun string-to-nfd (string)
1377 rtoy 1.12.30.13 "Convert String to Unicode Normalization Form D (NFD) using the
1378     canonical decomposition. The NFD string is returned"
1379 rtoy 1.12.30.9 (decompose string nil))
1380 rtoy 1.12.30.4
1381     (defun string-to-nfkd (string)
1382 rtoy 1.12.30.13 "Convert String to Unicode Normalization Form KD (NFKD) uisng the
1383     compatible decomposition form. The NFKD string is returned."
1384 rtoy 1.12.30.9 (decompose string t))
1385 rtoy 1.12.30.4
1386 rtoy 1.12.30.12 #+unicode
1387 rtoy 1.12.30.4 (defun string-to-nfc (string)
1388 rtoy 1.12.30.30 "Convert String to Unicode Normalization Form C (NFC). If the
1389     string a simple string and is already normalized, the original
1390     string is returned."
1391 rtoy 1.12.30.24 (if (normalized-form-p string :nfc)
1392     (if (simple-string-p string) string (coerce string 'simple-string))
1393     (coerce (if (normalized-form-p string :nfd)
1394     (%compose (copy-seq string))
1395     (%compose (string-to-nfd string)))
1396     'simple-string)))
1397 rtoy 1.12.30.12
1398     #-unicode ;; Needed by package.lisp
1399     (defun string-to-nfc (string)
1400     (if (simple-string-p string) string (coerce string 'simple-string)))
1401 rtoy 1.12.30.4
1402     (defun string-to-nfkc (string)
1403 rtoy 1.12.30.30 "Convert String to Unicode Normalization Form KC (NFKC). If the
1404     string is a simple string and is already normalized, the original
1405     string is returned."
1406 rtoy 1.12.30.24 (if (normalized-form-p string :nfkc)
1407     (if (simple-string-p string) string (coerce string 'simple-string))
1408     (coerce (if (normalized-form-p string :nfkd)
1409     (%compose (copy-seq string))
1410     (%compose (string-to-nfkd string)))
1411     'simple-string)))

  ViewVC Help
Powered by ViewVC 1.1.5