/[cmucl]/src/code/string.lisp
ViewVC logotype

Contents of /src/code/string.lisp

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.12.30.18 - (hide annotations)
Wed May 20 16:30:08 2009 UTC (4 years, 11 months ago) by rtoy
Branch: unicode-utf16-extfmt-branch
Changes since 1.12.30.17: +13 -6 lines
Do case-insensitive comparison by converting to lower case instead of
upper case.  This is what Unicode CaseFolding.txt does.  One example
of where it matters is U+1E9E is mapped to a lower case U+DF.  But the
upper case version of U+DF is U+DF.

char.lisp:
o Change EQUAL-CHAR-CODE to convert to lowercase.

string.lisp:
o Change EQUAL-CHAR-CODEPOINT to convert to lowercase.
o Fix mistake in STRING-LESS-GREATER-EQUAL which was incorrectly
  comparing the codepoints instead of the equal-char-codepoint values.
1 ram 1.1 ;;; -*- Log: code.log; Package: Lisp -*-
2     ;;;
3     ;;; **********************************************************************
4 ram 1.3 ;;; This code was written as part of the CMU Common Lisp project at
5     ;;; Carnegie Mellon University, and has been placed in the public domain.
6     ;;;
7     (ext:file-comment
8 rtoy 1.12.30.18 "$Header: /tiger/var/lib/cvsroots/cmucl/src/code/string.lisp,v 1.12.30.18 2009/05/20 16:30:08 rtoy Exp $")
9 ram 1.3 ;;;
10 ram 1.1 ;;; **********************************************************************
11     ;;;
12 pw 1.11 ;;; Functions to implement strings for CMU Common Lisp
13 ram 1.1 ;;; Written by David Dill
14 ram 1.4 ;;; Rewritten by Skef Wholey, Bill Chiles and Rob MacLachlan.
15 ram 1.1 ;;;
16     ;;; ****************************************************************
17     ;;;
18 ram 1.4 (in-package "LISP")
19 rtoy 1.12.30.3 (export '(char schar glyph sglyph string
20 ram 1.1 string= string-equal string< string> string<= string>= string/=
21     string-lessp string-greaterp string-not-lessp string-not-greaterp
22     string-not-equal
23 rtoy 1.12.30.4 string-to-nfd string-to-nfkd string-to-nfc string-to-nfkc
24 ram 1.1 make-string
25     string-trim string-left-trim string-right-trim
26     string-upcase
27     string-downcase string-capitalize nstring-upcase nstring-downcase
28     nstring-capitalize))
29    
30    
31 rtoy 1.12.30.8 (declaim (inline surrogates-to-codepoint codepoint surrogates))
32 rtoy 1.12.30.7
33 rtoy 1.12.30.8 (defun surrogates-to-codepoint (hi lo)
34     "Convert the given Hi and Lo surrogate characters to the
35     corresponding codepoint value"
36     (declare (type character hi lo))
37     (+ (ash (- (the (integer #xD800 #xDBFF) (char-code hi)) #xD800) 10)
38     (the (integer #xDC00 #xDFFF) (char-code lo)) #x2400))
39 rtoy 1.12.30.7
40 rtoy 1.12.30.6 (defun codepoint (string i &optional (end (length string)))
41 rtoy 1.12.30.7 "Return the codepoint value from String at position I. If that
42     position is a surrogate, it is combined with either the previous or
43     following character (when possible) to compute the codepoint. The
44     second return value is NIL if the position is not a surrogate pair.
45     Otherwise +1 or -1 is returned if the position is the high or low
46     surrogate value, respectively."
47 rtoy 1.12.30.6 (declare (type simple-string string) (type kernel:index i end))
48     (let ((code (char-code (schar string i))))
49     (cond ((and (<= #xD800 code #xDBFF) (< (1+ i) end))
50     (let ((tmp (char-code (schar string (1+ i)))))
51     (if (<= #xDC00 tmp #xDFFF)
52 rtoy 1.12.30.10 (values (+ (ash (- code #xD800) 10) tmp #x2400) +1)
53 rtoy 1.12.30.6 (values code nil))))
54     ((and (<= #xDC00 code #xDFFF) (> i 0))
55     (let ((tmp (char-code (schar string (1- i)))))
56     (if (<= #xD800 tmp #xDBFF)
57 rtoy 1.12.30.10 (values (+ (ash (- tmp #xD800) 10) code #x2400) -1)
58 rtoy 1.12.30.6 (values code nil))))
59     (t (values code nil)))))
60    
61     (defun surrogates (codepoint)
62 rtoy 1.12.30.8 "Return the high and low surrogate characters for Codepoint. If
63     Codepoint is in the BMP, the first return value is the corresponding
64     character and the second is NIL."
65 rtoy 1.12.30.6 (declare (type (integer 0 #x10FFFF) codepoint))
66     (if (< codepoint #x10000)
67 rtoy 1.12.30.8 (values (code-char codepoint) nil)
68 rtoy 1.12.30.6 (let* ((tmp (- codepoint #x10000))
69     (hi (logior (ldb (byte 10 10) tmp) #xD800))
70     (lo (logior (ldb (byte 10 0) tmp) #xDC00)))
71 rtoy 1.12.30.8 (values (code-char hi) (code-char lo)))))
72 rtoy 1.12.30.6
73 rtoy 1.12.30.16 (defun utf16-string-p (string)
74     "Check if String is a valid UTF-16 string. If the string is valid,
75     T is returned. If the string is not valid, NIL is returned, and the
76     second value is the index into the string of the invalid character."
77     (do ((len (length string))
78     (index 0 (1+ index)))
79     ((>= index len)
80     t)
81     (multiple-value-bind (codepoint wide)
82     (codepoint string index)
83 rtoy 1.12.30.17 ;; We step through the string in order. If there are any
84     ;; surrogates pairs, we must reach the lead surrogate first,
85     ;; which means WIDE is +1. Otherwise, we have an invalid
86     ;; surrogate pair. If we get any codepoint that is in
87     ;; the surrogate range, we also have an invalid string.
88 rtoy 1.12.30.16 (when (or (eq wide -1)
89     (<= #xD800 codepoint #xDFFF))
90     (return-from utf16-string-p (values nil index)))
91     (when wide (incf index)))))
92 rtoy 1.12.30.6
93 ram 1.1 (defun string (X)
94     "Coerces X into a string. If X is a string, X is returned. If X is a
95 rtoy 1.12.30.14 symbol, X's pname is returned. If X is a character then a one element
96     string containing that character is returned. If X cannot be coerced
97     into a string, an error occurs."
98 ram 1.1 (cond ((stringp x) x)
99     ((symbolp x) (symbol-name x))
100     ((characterp x)
101     (let ((res (make-string 1)))
102     (setf (schar res 0) x) res))
103     (t
104 pw 1.11 (error 'simple-type-error
105     :datum x
106     :expected-type '(or string symbol character)
107     :format-control "~S cannot be coerced to a string."
108     :format-arguments (list x)))))
109 ram 1.1
110     ;;; With-One-String is used to set up some string hacking things. The keywords
111     ;;; are parsed, and the string is hacked into a simple-string.
112    
113     (eval-when (compile)
114    
115     (defmacro with-one-string (string start end cum-offset &rest forms)
116 ram 1.4 `(let ((,string (if (stringp ,string) ,string (string ,string))))
117 pw 1.10 ;; Optimizer may prove STRING is one.
118     (declare (optimize (ext:inhibit-warnings 3)))
119 ram 1.4 (with-array-data ((,string ,string :offset-var ,cum-offset)
120     (,start ,start)
121     (,end (or ,end (length (the vector ,string)))))
122     ,@forms)))
123 ram 1.1
124     )
125    
126     ;;; With-String is like With-One-String, but doesn't parse keywords.
127    
128     (eval-when (compile)
129    
130     (defmacro with-string (string &rest forms)
131 ram 1.4 `(let ((,string (if (stringp ,string) ,string (string ,string))))
132     (with-array-data ((,string ,string)
133     (start)
134     (end (length (the vector ,string))))
135     ,@forms)))
136 ram 1.1
137     )
138    
139     ;;; With-Two-Strings is used to set up string comparison operations. The
140     ;;; keywords are parsed, and the strings are hacked into simple-strings.
141    
142     (eval-when (compile)
143    
144     (defmacro with-two-strings (string1 string2 start1 end1 cum-offset-1
145     start2 end2 &rest forms)
146 ram 1.4 `(let ((,string1 (if (stringp ,string1) ,string1 (string ,string1)))
147     (,string2 (if (stringp ,string2) ,string2 (string ,string2))))
148     (with-array-data ((,string1 ,string1 :offset-var ,cum-offset-1)
149     (,start1 ,start1)
150     (,end1 (or ,end1 (length (the vector ,string1)))))
151     (with-array-data ((,string2 ,string2)
152     (,start2 ,start2)
153     (,end2 (or ,end2 (length (the vector ,string2)))))
154     ,@forms))))
155 ram 1.1
156     )
157 wlott 1.2
158 ram 1.1
159     (defun char (string index)
160     "Given a string and a non-negative integer index less than the length of
161     the string, returns the character object representing the character at
162     that position in the string."
163 ram 1.4 (declare (optimize (safety 1)))
164 ram 1.1 (char string index))
165    
166     (defun %charset (string index new-el)
167 ram 1.4 (declare (optimize (safety 1)))
168 ram 1.1 (setf (char string index) new-el))
169    
170     (defun schar (string index)
171     "SCHAR returns the character object at an indexed position in a string
172 rtoy 1.12.30.14 just as CHAR does, except the string must be a simple-string."
173 ram 1.4 (declare (optimize (safety 1)))
174 ram 1.1 (schar string index))
175    
176     (defun %scharset (string index new-el)
177 ram 1.4 (declare (optimize (safety 1)))
178 ram 1.1 (setf (schar string index) new-el))
179    
180     (defun string=* (string1 string2 start1 end1 start2 end2)
181 ram 1.4 (with-two-strings string1 string2 start1 end1 offset1 start2 end2
182     (not (%sp-string-compare string1 start1 end1 string2 start2 end2))))
183 ram 1.1
184    
185     (defun string/=* (string1 string2 start1 end1 start2 end2)
186 ram 1.4 (with-two-strings string1 string2 start1 end1 offset1 start2 end2
187     (let ((comparison (%sp-string-compare string1 start1 end1
188     string2 start2 end2)))
189     (if comparison (- (the fixnum comparison) offset1)))))
190 ram 1.1
191     (eval-when (compile eval)
192    
193     ;;; Lessp is true if the desired expansion is for string<* or string<=*.
194     ;;; Equalp is true if the desired expansion is for string<=* or string>=*.
195     (defmacro string<>=*-body (lessp equalp)
196     (let ((offset1 (gensym)))
197 ram 1.4 `(with-two-strings string1 string2 start1 end1 ,offset1 start2 end2
198     (let ((index (%sp-string-compare string1 start1 end1
199     string2 start2 end2)))
200     (if index
201 ram 1.8 (cond ((= (the fixnum index) (the fixnum end1))
202     ,(if lessp
203     `(- (the fixnum index) ,offset1)
204     `nil))
205     ((= (+ (the fixnum index) (- start2 start1))
206     (the fixnum end2))
207     ,(if lessp
208     `nil
209 rtoy 1.12.30.1 `(- (the fixnum index) ,offset1)))
210     #-unicode
211 ram 1.4 ((,(if lessp 'char< 'char>)
212     (schar string1 index)
213     (schar string2 (+ (the fixnum index) (- start2 start1))))
214     (- (the fixnum index) ,offset1))
215 rtoy 1.12.30.1 #-unicode
216     (t nil)
217     #+unicode
218     (t
219     ;; Compare in code point order. See
220     ;; http://icu-project.org/docs/papers/utf16_code_point_order.html
221     (flet ((fixup (code)
222     (if (>= code #xe000)
223     (- code #x800)
224     (+ code #x2000))))
225     (declare (inline fixup))
226     (let* ((c1 (char-code (schar string1 index)))
227 rtoy 1.12.30.10 (c2 (char-code (schar string2
228     (+ (the fixnum index)
229     (- start2 start1))))))
230 rtoy 1.12.30.1 (cond ((and (>= c1 #xd800)
231     (>= c2 #xd800))
232     (let ((fix-c1 (fixup c1))
233     (fix-c2 (fixup c2)))
234     (if (,(if lessp '< '>) fix-c1 fix-c2)
235     (- (the fixnum index) ,offset1)
236     nil)))
237     (t
238     (if (,(if lessp '< '>) c1 c2)
239 rtoy 1.12.30.10 (- (the fixnum index) ,offset1)
240     nil)))))))
241 ram 1.4 ,(if equalp `(- (the fixnum end1) ,offset1) 'nil))))))
242 ram 1.1 ) ; eval-when
243    
244     (defun string<* (string1 string2 start1 end1 start2 end2)
245     (declare (fixnum start1 start2))
246     (string<>=*-body t nil))
247    
248     (defun string>* (string1 string2 start1 end1 start2 end2)
249     (declare (fixnum start1 start2))
250     (string<>=*-body nil nil))
251    
252     (defun string<=* (string1 string2 start1 end1 start2 end2)
253     (declare (fixnum start1 start2))
254     (string<>=*-body t t))
255    
256     (defun string>=* (string1 string2 start1 end1 start2 end2)
257     (declare (fixnum start1 start2))
258     (string<>=*-body nil t))
259    
260    
261    
262     (defun string< (string1 string2 &key (start1 0) end1 (start2 0) end2)
263     "Given two strings, if the first string is lexicographically less than
264     the second string, returns the longest common prefix (using char=)
265     of the two strings. Otherwise, returns ()."
266     (string<* string1 string2 start1 end1 start2 end2))
267    
268     (defun string> (string1 string2 &key (start1 0) end1 (start2 0) end2)
269     "Given two strings, if the first string is lexicographically greater than
270     the second string, returns the longest common prefix (using char=)
271     of the two strings. Otherwise, returns ()."
272     (string>* string1 string2 start1 end1 start2 end2))
273    
274    
275     (defun string<= (string1 string2 &key (start1 0) end1 (start2 0) end2)
276     "Given two strings, if the first string is lexicographically less than
277     or equal to the second string, returns the longest common prefix
278     (using char=) of the two strings. Otherwise, returns ()."
279     (string<=* string1 string2 start1 end1 start2 end2))
280    
281     (defun string>= (string1 string2 &key (start1 0) end1 (start2 0) end2)
282     "Given two strings, if the first string is lexicographically greater
283     than or equal to the second string, returns the longest common prefix
284     (using char=) of the two strings. Otherwise, returns ()."
285     (string>=* string1 string2 start1 end1 start2 end2))
286    
287     (defun string= (string1 string2 &key (start1 0) end1 (start2 0) end2)
288     "Given two strings (string1 and string2), and optional integers start1,
289     start2, end1 and end2, compares characters in string1 to characters in
290     string2 (using char=)."
291     (string=* string1 string2 start1 end1 start2 end2))
292    
293     (defun string/= (string1 string2 &key (start1 0) end1 (start2 0) end2)
294     "Given two strings, if the first string is not lexicographically equal
295     to the second string, returns the longest common prefix (using char=)
296     of the two strings. Otherwise, returns ()."
297     (string/=* string1 string2 start1 end1 start2 end2))
298    
299    
300     (eval-when (compile eval)
301    
302     ;;; STRING-NOT-EQUAL-LOOP is used to generate character comparison loops for
303     ;;; STRING-EQUAL and STRING-NOT-EQUAL.
304     (defmacro string-not-equal-loop (end end-value
305     &optional (abort-value nil abortp))
306     (declare (fixnum end))
307     (let ((end-test (if (= end 1)
308     `(= index1 (the fixnum end1))
309     `(= index2 (the fixnum end2)))))
310     `(do ((index1 start1 (1+ index1))
311     (index2 start2 (1+ index2)))
312     (,(if abortp
313     end-test
314     `(or ,end-test
315     (not (char-equal (schar string1 index1)
316     (schar string2 index2)))))
317     ,end-value)
318     (declare (fixnum index1 index2))
319     ,@(if abortp
320     `((if (not (char-equal (schar string1 index1)
321     (schar string2 index2)))
322     (return ,abort-value)))))))
323    
324     ) ; eval-when
325    
326     (defun string-equal (string1 string2 &key (start1 0) end1 (start2 0) end2)
327     "Given two strings (string1 and string2), and optional integers start1,
328     start2, end1 and end2, compares characters in string1 to characters in
329     string2 (using char-equal)."
330     (declare (fixnum start1 start2))
331 ram 1.4 (with-two-strings string1 string2 start1 end1 offset1 start2 end2
332     (let ((slen1 (- (the fixnum end1) start1))
333     (slen2 (- (the fixnum end2) start2)))
334     (declare (fixnum slen1 slen2))
335     (if (or (minusp slen1) (minusp slen2))
336     ;;prevent endless looping later.
337     (error "Improper bounds for string comparison."))
338     (if (= slen1 slen2)
339     ;;return () immediately if lengths aren't equal.
340     (string-not-equal-loop 1 t nil)))))
341 ram 1.1
342     (defun string-not-equal (string1 string2 &key (start1 0) end1 (start2 0) end2)
343     "Given two strings, if the first string is not lexicographically equal
344     to the second string, returns the longest common prefix (using char-equal)
345     of the two strings. Otherwise, returns ()."
346 ram 1.4 (with-two-strings string1 string2 start1 end1 offset1 start2 end2
347     (let ((slen1 (- end1 start1))
348     (slen2 (- end2 start2)))
349     (declare (fixnum slen1 slen2))
350     (if (or (minusp slen1) (minusp slen2))
351     ;;prevent endless looping later.
352     (error "Improper bounds for string comparison."))
353     (cond ((or (minusp slen1) (or (minusp slen2)))
354     (error "Improper substring for comparison."))
355     ((= slen1 slen2)
356     (string-not-equal-loop 1 nil (- index1 offset1)))
357     ((< slen1 slen2)
358     (string-not-equal-loop 1 (- index1 offset1)))
359     (t
360     (string-not-equal-loop 2 (- index1 offset1)))))))
361 ram 1.1
362    
363    
364     (eval-when (compile eval)
365    
366     ;;; STRING-LESS-GREATER-EQUAL-TESTS returns a test on the lengths of string1
367     ;;; and string2 and a test on the current characters from string1 and string2
368     ;;; for the following macro.
369     (defun string-less-greater-equal-tests (lessp equalp)
370     (if lessp
371     (if equalp
372     ;; STRING-NOT-GREATERP
373 rtoy 1.12.30.15 (values '<=
374     #-unicode `(not (char-greaterp char1 char2))
375     #+unicode `(<= char1 char2))
376 ram 1.1 ;; STRING-LESSP
377 rtoy 1.12.30.15 (values '<
378     #-unicode `(char-lessp char1 char2)
379     #+unicode `(< char1 char2)))
380 ram 1.1 (if equalp
381     ;; STRING-NOT-LESSP
382 rtoy 1.12.30.15 (values '>=
383     #-unicode `(not (char-lessp char1 char2))
384     #+unicode `(>= char1 char2))
385 ram 1.1 ;; STRING-GREATERP
386 rtoy 1.12.30.15 (values '>
387     #-unicode `(char-greaterp char1 char2)
388     #+unicode `(> char1 char2)))))
389 ram 1.1
390 rtoy 1.12.30.15 #-unicode
391 ram 1.1 (defmacro string-less-greater-equal (lessp equalp)
392     (multiple-value-bind (length-test character-test)
393     (string-less-greater-equal-tests lessp equalp)
394 ram 1.4 `(with-two-strings string1 string2 start1 end1 offset1 start2 end2
395     (let ((slen1 (- (the fixnum end1) start1))
396     (slen2 (- (the fixnum end2) start2)))
397     (declare (fixnum slen1 slen2))
398     (if (or (minusp slen1) (minusp slen2))
399     ;;prevent endless looping later.
400     (error "Improper bounds for string comparison."))
401     (do ((index1 start1 (1+ index1))
402     (index2 start2 (1+ index2))
403     (char1)
404     (char2))
405     ((or (= index1 (the fixnum end1)) (= index2 (the fixnum end2)))
406     (if (,length-test slen1 slen2) (- index1 offset1)))
407     (declare (fixnum index1 index2))
408     (setq char1 (schar string1 index1))
409     (setq char2 (schar string2 index2))
410     (if (not (char-equal char1 char2))
411     (if ,character-test
412     (return (- index1 offset1))
413     (return ()))))))))
414 ram 1.1
415 rtoy 1.12.30.18 ;; Convert to lowercase for case folding, to match what Unicode
416     ;; CaseFolding.txt says. An example where this matters: U+1E9E maps
417     ;; to U+00DF. But the uppercase version of U+00DF is U+00DF.
418 rtoy 1.12.30.15 #+unicode
419     (defmacro equal-char-codepoint (codepoint)
420     `(let ((ch ,codepoint))
421 rtoy 1.12.30.18 ;; Handle ASCII separately for bootstrapping and for unidata missing.
422     (if (< 64 ch 91)
423     (+ ch 32)
424 rtoy 1.12.30.15 #-(and unicode (not unicode-bootstrap))
425     ch
426     #+(and unicode (not unicode-bootstrap))
427 rtoy 1.12.30.18 (if (> ch 127) (unicode-lower ch) ch))))
428 rtoy 1.12.30.15
429     #+unicode
430     (defmacro string-less-greater-equal (lessp equalp)
431     (multiple-value-bind (length-test character-test)
432     (string-less-greater-equal-tests lessp equalp)
433     `(with-two-strings string1 string2 start1 end1 offset1 start2 end2
434     (let ((slen1 (- (the fixnum end1) start1))
435     (slen2 (- (the fixnum end2) start2)))
436     (declare (fixnum slen1 slen2))
437     (if (or (minusp slen1) (minusp slen2))
438     ;;prevent endless looping later.
439     (error "Improper bounds for string comparison."))
440     (do ((index1 start1 (1+ index1))
441     (index2 start2 (1+ index2)))
442     ((or (= index1 (the fixnum end1)) (= index2 (the fixnum end2)))
443     (if (,length-test slen1 slen2) (- index1 offset1)))
444     (declare (fixnum index1 index2))
445     (multiple-value-bind (char1 wide1)
446     (codepoint string1 index1)
447 rtoy 1.12.30.18 (declare (type (integer 0 #x10ffff) char1))
448 rtoy 1.12.30.15 (multiple-value-bind (char2 wide2)
449     (codepoint string2 index2)
450 rtoy 1.12.30.18 (declare (type (integer 0 #x10ffff) char2))
451     (setf char1 (equal-char-codepoint char1))
452     (setf char2 (equal-char-codepoint char2))
453     (if (= char1 char2)
454 rtoy 1.12.30.15 (progn
455     (when wide1 (incf index1))
456     (when wide2 (incf index2)))
457     (if ,character-test
458     (return (- index1 offset1))
459     (return ()))))))))))
460    
461 ram 1.1 ) ; eval-when
462    
463     (defun string-lessp* (string1 string2 start1 end1 start2 end2)
464     (declare (fixnum start1 start2))
465     (string-less-greater-equal t nil))
466    
467     (defun string-greaterp* (string1 string2 start1 end1 start2 end2)
468     (declare (fixnum start1 start2))
469     (string-less-greater-equal nil nil))
470    
471     (defun string-not-lessp* (string1 string2 start1 end1 start2 end2)
472     (declare (fixnum start1 start2))
473     (string-less-greater-equal nil t))
474    
475     (defun string-not-greaterp* (string1 string2 start1 end1 start2 end2)
476     (declare (fixnum start1 start2))
477     (string-less-greater-equal t t))
478    
479     (defun string-lessp (string1 string2 &key (start1 0) end1 (start2 0) end2)
480     "Given two strings, if the first string is lexicographically less than
481     the second string, returns the longest common prefix (using char-equal)
482     of the two strings. Otherwise, returns ()."
483     (string-lessp* string1 string2 start1 end1 start2 end2))
484    
485     (defun string-greaterp (string1 string2 &key (start1 0) end1 (start2 0) end2)
486     "Given two strings, if the first string is lexicographically greater than
487     the second string, returns the longest common prefix (using char-equal)
488     of the two strings. Otherwise, returns ()."
489     (string-greaterp* string1 string2 start1 end1 start2 end2))
490    
491     (defun string-not-lessp (string1 string2 &key (start1 0) end1 (start2 0) end2)
492     "Given two strings, if the first string is lexicographically greater
493     than or equal to the second string, returns the longest common prefix
494     (using char-equal) of the two strings. Otherwise, returns ()."
495     (string-not-lessp* string1 string2 start1 end1 start2 end2))
496    
497     (defun string-not-greaterp (string1 string2 &key (start1 0) end1 (start2 0)
498     end2)
499     "Given two strings, if the first string is lexicographically less than
500     or equal to the second string, returns the longest common prefix
501     (using char-equal) of the two strings. Otherwise, returns ()."
502     (string-not-greaterp* string1 string2 start1 end1 start2 end2))
503    
504    
505 dtc 1.9 (defun make-string (count &key element-type ((:initial-element fill-char)))
506 ram 1.1 "Given a character count and an optional fill character, makes and returns
507 rtoy 1.12.30.14 a new string Count long filled with the fill character."
508 emarsden 1.12 (declare (type fixnum count))
509     (assert (subtypep element-type 'character))
510 ram 1.1 (if fill-char
511     (do ((i 0 (1+ i))
512     (string (make-string count)))
513     ((= i count) string)
514     (declare (fixnum i))
515     (setf (schar string i) fill-char))
516     (make-string count)))
517    
518     (defun string-upcase (string &key (start 0) end)
519     "Given a string, returns a new string that is a copy of it with
520     all lower case alphabetic characters converted to uppercase."
521     (declare (fixnum start))
522 ram 1.5 (let* ((string (if (stringp string) string (string string)))
523     (slen (length string)))
524 ram 1.4 (declare (fixnum slen))
525 ram 1.1 (with-one-string string start end offset
526     (let ((offset-slen (+ slen offset))
527     (newstring (make-string slen)))
528     (declare (fixnum offset-slen))
529     (do ((index offset (1+ index))
530     (new-index 0 (1+ new-index)))
531     ((= index start))
532     (declare (fixnum index new-index))
533     (setf (schar newstring new-index) (schar string index)))
534     (do ((index start (1+ index))
535     (new-index (- start offset) (1+ new-index)))
536     ((= index (the fixnum end)))
537     (declare (fixnum index new-index))
538 rtoy 1.12.30.6 (multiple-value-bind (code wide) (codepoint string index)
539     (when wide (incf index))
540     ;; Handle ASCII specially because this is called early in
541     ;; initialization, before unidata is available.
542     (cond ((< 96 code 123) (decf code 32))
543     ((> code 127) (setq code (unicode-upper code))))
544     ;;@@ WARNING: this may, in theory, need to extend newstring
545     ;; but that never actually occurs as of Unicode 5.1.0,
546     ;; so I'm just going to ignore it for now...
547     (multiple-value-bind (hi lo) (surrogates code)
548 rtoy 1.12.30.8 (setf (schar newstring new-index) hi)
549 rtoy 1.12.30.6 (when lo
550 rtoy 1.12.30.8 (setf (schar newstring (incf new-index)) lo)))))
551 rtoy 1.12.30.4 ;;@@ WARNING: see above
552 ram 1.1 (do ((index end (1+ index))
553     (new-index (- (the fixnum end) offset) (1+ new-index)))
554     ((= index offset-slen))
555     (declare (fixnum index new-index))
556     (setf (schar newstring new-index) (schar string index)))
557     newstring))))
558    
559     (defun string-downcase (string &key (start 0) end)
560     "Given a string, returns a new string that is a copy of it with
561     all upper case alphabetic characters converted to lowercase."
562     (declare (fixnum start))
563 ram 1.5 (let* ((string (if (stringp string) string (string string)))
564     (slen (length string)))
565 ram 1.4 (declare (fixnum slen))
566 ram 1.1 (with-one-string string start end offset
567     (let ((offset-slen (+ slen offset))
568     (newstring (make-string slen)))
569     (declare (fixnum offset-slen))
570     (do ((index offset (1+ index))
571     (new-index 0 (1+ new-index)))
572     ((= index start))
573     (declare (fixnum index new-index))
574     (setf (schar newstring new-index) (schar string index)))
575     (do ((index start (1+ index))
576     (new-index (- start offset) (1+ new-index)))
577     ((= index (the fixnum end)))
578     (declare (fixnum index new-index))
579 rtoy 1.12.30.6 (multiple-value-bind (code wide) (codepoint string index)
580     (when wide (incf index))
581     ;; Handle ASCII specially because this is called early in
582     ;; initialization, before unidata is available.
583     (cond ((< 64 code 91) (incf code 32))
584     ((> code 127) (setq code (unicode-lower code))))
585     ;;@@ WARNING: this may, in theory, need to extend newstring
586     ;; but that never actually occurs as of Unicode 5.1.0,
587     ;; so I'm just going to ignore it for now...
588     (multiple-value-bind (hi lo) (surrogates code)
589 rtoy 1.12.30.8 (setf (schar newstring new-index) hi)
590 rtoy 1.12.30.6 (when lo
591 rtoy 1.12.30.8 (setf (schar newstring (incf new-index)) lo)))))
592 rtoy 1.12.30.4 ;;@@ WARNING: see above
593 ram 1.1 (do ((index end (1+ index))
594     (new-index (- (the fixnum end) offset) (1+ new-index)))
595     ((= index offset-slen))
596     (declare (fixnum index new-index))
597     (setf (schar newstring new-index) (schar string index)))
598     newstring))))
599    
600     (defun string-capitalize (string &key (start 0) end)
601     "Given a string, returns a copy of the string with the first
602     character of each ``word'' converted to upper-case, and remaining
603     chars in the word converted to lower case. A ``word'' is defined
604     to be a string of case-modifiable characters delimited by
605     non-case-modifiable chars."
606     (declare (fixnum start))
607 ram 1.5 (let* ((string (if (stringp string) string (string string)))
608     (slen (length string)))
609 ram 1.4 (declare (fixnum slen))
610 ram 1.1 (with-one-string string start end offset
611     (let ((offset-slen (+ slen offset))
612     (newstring (make-string slen)))
613     (declare (fixnum offset-slen))
614     (do ((index offset (1+ index))
615     (new-index 0 (1+ new-index)))
616     ((= index start))
617     (declare (fixnum index new-index))
618     (setf (schar newstring new-index) (schar string index)))
619     (do ((index start (1+ index))
620     (new-index (- start offset) (1+ new-index))
621     (newword t)
622     (char ()))
623     ((= index (the fixnum end)))
624     (declare (fixnum index new-index))
625     (setq char (schar string index))
626     (cond ((not (alphanumericp char))
627     (setq newword t))
628     (newword
629     ;;char is first case-modifiable after non-case-modifiable
630 rtoy 1.12.30.2 (setq char (char-titlecase char))
631 ram 1.1 (setq newword ()))
632     ;;char is case-modifiable, but not first
633     (t (setq char (char-downcase char))))
634     (setf (schar newstring new-index) char))
635     (do ((index end (1+ index))
636     (new-index (- (the fixnum end) offset) (1+ new-index)))
637     ((= index offset-slen))
638     (declare (fixnum index new-index))
639     (setf (schar newstring new-index) (schar string index)))
640     newstring))))
641    
642     (defun nstring-upcase (string &key (start 0) end)
643     "Given a string, returns that string with all lower case alphabetic
644     characters converted to uppercase."
645     (declare (fixnum start))
646 ram 1.4 (let ((save-header string))
647 ram 1.1 (with-one-string string start end offset
648     (do ((index start (1+ index)))
649     ((= index (the fixnum end)))
650     (declare (fixnum index))
651 rtoy 1.12.30.6 (multiple-value-bind (code wide) (codepoint string index)
652     (declare (ignore wide))
653     ;; Handle ASCII specially because this is called early in
654     ;; initialization, before unidata is available.
655     (cond ((< 96 code 123) (decf code 32))
656     ((> code 127) (setq code (unicode-upper code))))
657     ;;@@ WARNING: this may, in theory, need to extend string
658     ;; (which, obviously, we can't do here. Unless
659     ;; STRING is adjustable, maybe)
660     ;; but that never actually occurs as of Unicode 5.1.0,
661     ;; so I'm just going to ignore it for now...
662     (multiple-value-bind (hi lo) (surrogates code)
663 rtoy 1.12.30.8 (setf (schar string index) hi)
664 rtoy 1.12.30.6 (when lo
665 rtoy 1.12.30.8 (setf (schar string (incf index)) lo))))))
666 ram 1.1 save-header))
667    
668     (defun nstring-downcase (string &key (start 0) end)
669     "Given a string, returns that string with all upper case alphabetic
670     characters converted to lowercase."
671     (declare (fixnum start))
672 ram 1.4 (let ((save-header string))
673 ram 1.1 (with-one-string string start end offset
674     (do ((index start (1+ index)))
675     ((= index (the fixnum end)))
676     (declare (fixnum index))
677 rtoy 1.12.30.6 (multiple-value-bind (code wide) (codepoint string index)
678     (declare (ignore wide))
679     (cond ((< 64 code 91) (incf code 32))
680     ((> code 127) (setq code (unicode-lower code))))
681     ;;@@ WARNING: this may, in theory, need to extend string
682     ;; (which, obviously, we can't do here. Unless
683     ;; STRING is adjustable, maybe)
684     ;; but that never actually occurs as of Unicode 5.1.0,
685     ;; so I'm just going to ignore it for now...
686     (multiple-value-bind (hi lo) (surrogates code)
687 rtoy 1.12.30.8 (setf (schar string index) hi)
688 rtoy 1.12.30.6 (when lo
689 rtoy 1.12.30.8 (setf (schar string (incf index)) lo))))))
690 wlott 1.6 save-header))
691 ram 1.1
692     (defun nstring-capitalize (string &key (start 0) end)
693     "Given a string, returns that string with the first
694     character of each ``word'' converted to upper-case, and remaining
695     chars in the word converted to lower case. A ``word'' is defined
696     to be a string of case-modifiable characters delimited by
697     non-case-modifiable chars."
698     (declare (fixnum start))
699 ram 1.4 (let ((save-header string))
700 ram 1.1 (with-one-string string start end offset
701     (do ((index start (1+ index))
702     (newword t)
703     (char ()))
704     ((= index (the fixnum end)))
705     (declare (fixnum index))
706     (setq char (schar string index))
707     (cond ((not (alphanumericp char))
708     (setq newword t))
709     (newword
710     ;;char is first case-modifiable after non-case-modifiable
711 rtoy 1.12.30.2 (setf (schar string index) (char-titlecase char))
712 ram 1.1 (setq newword ()))
713     (t
714     (setf (schar string index) (char-downcase char))))))
715     save-header))
716    
717     (defun string-left-trim (char-bag string)
718     "Given a set of characters (a list or string) and a string, returns
719     a copy of the string with the characters in the set removed from the
720     left end."
721     (with-string string
722     (do ((index start (1+ index)))
723     ((or (= index (the fixnum end))
724     (not (find (schar string index) char-bag)))
725     (subseq (the simple-string string) index end))
726     (declare (fixnum index)))))
727    
728     (defun string-right-trim (char-bag string)
729     "Given a set of characters (a list or string) and a string, returns
730     a copy of the string with the characters in the set removed from the
731     right end."
732     (with-string string
733     (do ((index (1- (the fixnum end)) (1- index)))
734     ((or (< index start) (not (find (schar string index) char-bag)))
735     (subseq (the simple-string string) start (1+ index)))
736     (declare (fixnum index)))))
737    
738     (defun string-trim (char-bag string)
739     "Given a set of characters (a list or string) and a string, returns a
740     copy of the string with the characters in the set removed from both
741     ends."
742     (with-string string
743     (let* ((left-end (do ((index start (1+ index)))
744     ((or (= index (the fixnum end))
745     (not (find (schar string index) char-bag)))
746     index)
747     (declare (fixnum index))))
748     (right-end (do ((index (1- (the fixnum end)) (1- index)))
749     ((or (< index left-end)
750     (not (find (schar string index) char-bag)))
751     (1+ index))
752     (declare (fixnum index)))))
753     (subseq (the simple-string string) left-end right-end))))
754 rtoy 1.12.30.3
755 rtoy 1.12.30.4 (declaim (inline %glyph-f %glyph-b))
756     (defun %glyph-f (string index)
757     (declare (optimize (speed 3) (space 0) (safety 0) (debug 0))
758     (type simple-string string) (type kernel:index index))
759 rtoy 1.12.30.10 (let* ((prev 0)
760     (l (length string))
761     (c (codepoint string index l))
762     (n (+ index (if (> c #xFFFF) 2 1))))
763     (declare (type (integer 0 #x10FFFF) c) (type kernel:index l n))
764     (loop while (< n l) do
765     (let* ((c (codepoint string n l))
766     (d (the (unsigned-byte 8) (unicode-combining-class c))))
767     (when (or (zerop d) (< d prev))
768     (return))
769     (setq prev d)
770     (incf n (if (> c #xFFFF) 2 1))))
771     n))
772 rtoy 1.12.30.4
773     (defun %glyph-b (string index)
774     (declare (optimize (speed 3) (space 0) (safety 0) (debug 0))
775     (type simple-string string) (type kernel:index index))
776 rtoy 1.12.30.11 (let* ((prev 255)
777     (n (1- index)))
778     (declare (type kernel:index n))
779     (loop until (< n 0) do
780     (let* ((c (codepoint string n 0))
781     (d (the (unsigned-byte 8) (unicode-combining-class c))))
782     (cond ((zerop d) (return))
783     ((> d prev) (incf n (if (> c #xFFFF) 2 1)) (return)))
784     (setq prev d)
785     (decf n (if (> c #xFFFF) 2 1))))
786     n))
787 rtoy 1.12.30.4
788     (defun glyph (string index &key (from-end nil))
789 rtoy 1.12.30.3 "GLYPH returns the glyph at the indexed position in a string, and the
790 rtoy 1.12.30.4 position of the next glyph (or NIL) as a second value. A glyph is
791     a substring consisting of the character at INDEX followed by all
792     subsequent combining characters."
793 rtoy 1.12.30.3 (declare (type simple-string string) (type kernel:index index))
794     #-unicode
795     (char string index)
796     #+unicode
797     (with-array-data ((string string) (start) (end))
798     (declare (ignore start end))
799 rtoy 1.12.30.4 (let ((n (if from-end (%glyph-b string index) (%glyph-f string index))))
800     (if from-end
801     (values (subseq string n index) (and (> n 0) n))
802     (values (subseq string index n) (and (< n (length string)) n))))))
803 rtoy 1.12.30.3
804 rtoy 1.12.30.4 (defun sglyph (string index &key (from-end nil))
805 rtoy 1.12.30.3 "SGLYPH returns the glyph at the indexed position, the same as GLYPH,
806 rtoy 1.12.30.4 except that the string must be a simple-string"
807 rtoy 1.12.30.3 (declare (type simple-string string) (type kernel:index index))
808     #-unicode
809     (schar string index)
810     #+unicode
811 rtoy 1.12.30.4 (let ((n (if from-end (%glyph-b string index) (%glyph-f string index))))
812     (if from-end
813     (values (subseq string n index) (and (> n 0) n))
814     (values (subseq string index n) (and (< n (length string)) n)))))
815    
816 rtoy 1.12.30.9 (defun decompose (string &optional (compatibility t))
817     (declare (type string string))
818     (let ((result (make-string (cond ((< (length string) 40)
819     (* 5 (length string)))
820     ((< (length string) 4096)
821     (* 2 (length string)))
822     (t (round (length string) 5/6)))))
823 rtoy 1.12.30.10 (fillptr 0))
824     (declare (type kernel:index fillptr))
825 rtoy 1.12.30.9 (labels ((rec (string)
826     (declare (type simple-string string))
827     (do ((i 0 (1+ i)))
828     ((= i (length string)))
829     (declare (type kernel:index i))
830     (multiple-value-bind (code wide) (codepoint string i)
831     (when wide (incf i))
832     (let ((decomp (unicode-decomp code compatibility)))
833     (if decomp (rec decomp) (out code))))))
834     (out (code)
835     (multiple-value-bind (hi lo) (surrogates code)
836     (outch hi)
837     (when lo
838 rtoy 1.12.30.10 (outch lo))
839     (let ((cc (unicode-combining-class code)))
840     (unless (zerop cc)
841     (order lo cc (- fillptr (if lo 2 1)))))))
842 rtoy 1.12.30.9 (outch (char)
843     (when (= fillptr (length result))
844     (let ((tmp (make-string (round (length result) 5/6))))
845     (replace tmp result)
846     (setq result tmp)))
847     (setf (schar result fillptr) char)
848     (incf fillptr))
849 rtoy 1.12.30.10 (order (wide1 cc last)
850     (loop until (minusp last) do
851     (multiple-value-bind (code2 wide2) (codepoint result last)
852     (let ((cc2 (unicode-combining-class code2)))
853     (cond ((zerop cc2) (return))
854     ((> cc2 cc)
855     (case (+ (if wide2 2 0) (if wide1 1 0))
856     (0 (rotatef (schar result last)
857     (schar result (1+ last))))
858     (1 (rotatef (schar result last)
859     (schar result (+ last 1))
860     (schar result (+ last 2))))
861     (2 (rotatef (schar result last)
862     (schar result (1- last))
863     (schar result (1+ last))))
864     (3 (rotatef (schar result last)
865     (schar result (+ last 2)))
866     (rotatef (schar result (1- last))
867     (schar result (1+ last)))))
868     (decf last (if wide2 2 1)))
869     (t (return))))))))
870 rtoy 1.12.30.9 (with-array-data ((string string) (start) (end))
871     (declare (ignore start end))
872     (rec string))
873     (shrink-vector result fillptr))))
874    
875 rtoy 1.12.30.4 (defun string-to-nfd (string)
876 rtoy 1.12.30.13 "Convert String to Unicode Normalization Form D (NFD) using the
877     canonical decomposition. The NFD string is returned"
878 rtoy 1.12.30.9 (decompose string nil))
879 rtoy 1.12.30.4
880     (defun string-to-nfkd (string)
881 rtoy 1.12.30.13 "Convert String to Unicode Normalization Form KD (NFKD) uisng the
882     compatible decomposition form. The NFKD string is returned."
883 rtoy 1.12.30.9 (decompose string t))
884 rtoy 1.12.30.4
885 rtoy 1.12.30.12 #+unicode
886 rtoy 1.12.30.4 (defun string-to-nfc (string)
887     ;;@@ Implement me
888 rtoy 1.12.30.12 ;; must return a simple-string for the package machinery
889     (if (simple-string-p string) string (coerce string 'simple-string)))
890    
891     #-unicode ;; Needed by package.lisp
892     (defun string-to-nfc (string)
893     (if (simple-string-p string) string (coerce string 'simple-string)))
894 rtoy 1.12.30.4
895     (defun string-to-nfkc (string)
896     ;;@@ Implement me
897 rtoy 1.12.30.12 (if (simple-string-p string) string (coerce string 'simple-string)))

  ViewVC Help
Powered by ViewVC 1.1.5