/[cmucl]/src/code/string.lisp
ViewVC logotype

Contents of /src/code/string.lisp

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.12.30.23 - (hide annotations)
Wed May 27 01:06:19 2009 UTC (4 years, 10 months ago) by rtoy
Branch: unicode-utf16-extfmt-branch
Changes since 1.12.30.22: +15 -13 lines
NORMALIZED-FORM-P needs simple-strings.  We should to this in a
different way, but this will do for now.
1 ram 1.1 ;;; -*- Log: code.log; Package: Lisp -*-
2     ;;;
3     ;;; **********************************************************************
4 ram 1.3 ;;; This code was written as part of the CMU Common Lisp project at
5     ;;; Carnegie Mellon University, and has been placed in the public domain.
6     ;;;
7     (ext:file-comment
8 rtoy 1.12.30.23 "$Header: /tiger/var/lib/cvsroots/cmucl/src/code/string.lisp,v 1.12.30.23 2009/05/27 01:06:19 rtoy Exp $")
9 ram 1.3 ;;;
10 ram 1.1 ;;; **********************************************************************
11     ;;;
12 pw 1.11 ;;; Functions to implement strings for CMU Common Lisp
13 ram 1.1 ;;; Written by David Dill
14 ram 1.4 ;;; Rewritten by Skef Wholey, Bill Chiles and Rob MacLachlan.
15 ram 1.1 ;;;
16     ;;; ****************************************************************
17     ;;;
18 ram 1.4 (in-package "LISP")
19 rtoy 1.12.30.3 (export '(char schar glyph sglyph string
20 ram 1.1 string= string-equal string< string> string<= string>= string/=
21     string-lessp string-greaterp string-not-lessp string-not-greaterp
22     string-not-equal
23 rtoy 1.12.30.4 string-to-nfd string-to-nfkd string-to-nfc string-to-nfkc
24 ram 1.1 make-string
25     string-trim string-left-trim string-right-trim
26     string-upcase
27     string-downcase string-capitalize nstring-upcase nstring-downcase
28     nstring-capitalize))
29    
30    
31 rtoy 1.12.30.19 (declaim (inline surrogatep surrogates-to-codepoint codepoint surrogates))
32    
33     (defun surrogatep (c &optional surrogate-type)
34     "Test if C is a surrogate. C may be either an integer or a
35     character. Surrogate-type indicates what kind of surrogate to test
36     for. :High means to test for the high (leading) surrogate; :Low
37     tests for the low (trailing surrogate). A value of :Any or Nil
38     tests for any surrogate value (high or low)."
39     (declare (type (or character (integer 0 #x10ffff)) c))
40     (let ((code (if (characterp c)
41     (char-code c)
42     c)))
43     (ecase surrogate-type
44     ((:high :leading)
45     ;; Test for high surrogate
46     (<= #xD800 code #xDBFF))
47     ((:low :trailing)
48     ;; Test for low surrogate
49     (<= #xDC00 code #xDFFF))
50     ((:any nil)
51     ;; Test for any surrogate
52     (<= #xD800 code #xDFFF)))))
53 rtoy 1.12.30.7
54 rtoy 1.12.30.8 (defun surrogates-to-codepoint (hi lo)
55     "Convert the given Hi and Lo surrogate characters to the
56     corresponding codepoint value"
57     (declare (type character hi lo))
58     (+ (ash (- (the (integer #xD800 #xDBFF) (char-code hi)) #xD800) 10)
59     (the (integer #xDC00 #xDFFF) (char-code lo)) #x2400))
60 rtoy 1.12.30.7
61 rtoy 1.12.30.6 (defun codepoint (string i &optional (end (length string)))
62 rtoy 1.12.30.7 "Return the codepoint value from String at position I. If that
63     position is a surrogate, it is combined with either the previous or
64     following character (when possible) to compute the codepoint. The
65     second return value is NIL if the position is not a surrogate pair.
66     Otherwise +1 or -1 is returned if the position is the high or low
67     surrogate value, respectively."
68 rtoy 1.12.30.6 (declare (type simple-string string) (type kernel:index i end))
69     (let ((code (char-code (schar string i))))
70 rtoy 1.12.30.19 (cond ((and (surrogatep code :high) (< (1+ i) end))
71 rtoy 1.12.30.6 (let ((tmp (char-code (schar string (1+ i)))))
72 rtoy 1.12.30.19 (if (surrogatep tmp :low)
73 rtoy 1.12.30.10 (values (+ (ash (- code #xD800) 10) tmp #x2400) +1)
74 rtoy 1.12.30.6 (values code nil))))
75 rtoy 1.12.30.19 ((and (surrogatep code :low) (> i 0))
76 rtoy 1.12.30.6 (let ((tmp (char-code (schar string (1- i)))))
77 rtoy 1.12.30.19 (if (surrogatep tmp :high)
78 rtoy 1.12.30.10 (values (+ (ash (- tmp #xD800) 10) code #x2400) -1)
79 rtoy 1.12.30.6 (values code nil))))
80     (t (values code nil)))))
81    
82     (defun surrogates (codepoint)
83 rtoy 1.12.30.8 "Return the high and low surrogate characters for Codepoint. If
84     Codepoint is in the BMP, the first return value is the corresponding
85     character and the second is NIL."
86 rtoy 1.12.30.6 (declare (type (integer 0 #x10FFFF) codepoint))
87     (if (< codepoint #x10000)
88 rtoy 1.12.30.8 (values (code-char codepoint) nil)
89 rtoy 1.12.30.6 (let* ((tmp (- codepoint #x10000))
90     (hi (logior (ldb (byte 10 10) tmp) #xD800))
91     (lo (logior (ldb (byte 10 0) tmp) #xDC00)))
92 rtoy 1.12.30.8 (values (code-char hi) (code-char lo)))))
93 rtoy 1.12.30.6
94 rtoy 1.12.30.22 (defun (setf codepoint) (codepoint string i)
95     "Set the codepoint at string position I to the Codepoint. If the
96     codepoint requires a surrogate pair, the high (leading surrogate) is
97     stored at position I and the low (trailing) surrogate is stored at
98     I+1"
99     (declare (type (integer 0 #x10FFFF) codepoint)
100     (type simple-string string))
101     (let ((widep nil))
102     (multiple-value-bind (hi lo)
103     (surrogates codepoint)
104     (setf (aref string i) hi)
105     (when lo
106     (setf (aref string (1+ i)) lo)
107     (setf widep t)))
108     (values codepoint widep)))
109    
110 rtoy 1.12.30.16 (defun utf16-string-p (string)
111     "Check if String is a valid UTF-16 string. If the string is valid,
112     T is returned. If the string is not valid, NIL is returned, and the
113     second value is the index into the string of the invalid character."
114     (do ((len (length string))
115     (index 0 (1+ index)))
116     ((>= index len)
117     t)
118     (multiple-value-bind (codepoint wide)
119     (codepoint string index)
120 rtoy 1.12.30.17 ;; We step through the string in order. If there are any
121     ;; surrogates pairs, we must reach the lead surrogate first,
122     ;; which means WIDE is +1. Otherwise, we have an invalid
123     ;; surrogate pair. If we get any codepoint that is in
124     ;; the surrogate range, we also have an invalid string.
125 rtoy 1.12.30.16 (when (or (eq wide -1)
126 rtoy 1.12.30.19 (surrogatep codepoint))
127 rtoy 1.12.30.16 (return-from utf16-string-p (values nil index)))
128     (when wide (incf index)))))
129 rtoy 1.12.30.6
130 ram 1.1 (defun string (X)
131     "Coerces X into a string. If X is a string, X is returned. If X is a
132 rtoy 1.12.30.14 symbol, X's pname is returned. If X is a character then a one element
133     string containing that character is returned. If X cannot be coerced
134     into a string, an error occurs."
135 ram 1.1 (cond ((stringp x) x)
136     ((symbolp x) (symbol-name x))
137     ((characterp x)
138     (let ((res (make-string 1)))
139     (setf (schar res 0) x) res))
140     (t
141 pw 1.11 (error 'simple-type-error
142     :datum x
143     :expected-type '(or string symbol character)
144     :format-control "~S cannot be coerced to a string."
145     :format-arguments (list x)))))
146 ram 1.1
147     ;;; With-One-String is used to set up some string hacking things. The keywords
148     ;;; are parsed, and the string is hacked into a simple-string.
149    
150     (eval-when (compile)
151    
152     (defmacro with-one-string (string start end cum-offset &rest forms)
153 ram 1.4 `(let ((,string (if (stringp ,string) ,string (string ,string))))
154 pw 1.10 ;; Optimizer may prove STRING is one.
155     (declare (optimize (ext:inhibit-warnings 3)))
156 ram 1.4 (with-array-data ((,string ,string :offset-var ,cum-offset)
157     (,start ,start)
158     (,end (or ,end (length (the vector ,string)))))
159     ,@forms)))
160 ram 1.1
161     )
162    
163     ;;; With-String is like With-One-String, but doesn't parse keywords.
164    
165     (eval-when (compile)
166    
167     (defmacro with-string (string &rest forms)
168 ram 1.4 `(let ((,string (if (stringp ,string) ,string (string ,string))))
169     (with-array-data ((,string ,string)
170     (start)
171     (end (length (the vector ,string))))
172     ,@forms)))
173 ram 1.1
174     )
175    
176     ;;; With-Two-Strings is used to set up string comparison operations. The
177     ;;; keywords are parsed, and the strings are hacked into simple-strings.
178    
179     (eval-when (compile)
180    
181     (defmacro with-two-strings (string1 string2 start1 end1 cum-offset-1
182     start2 end2 &rest forms)
183 ram 1.4 `(let ((,string1 (if (stringp ,string1) ,string1 (string ,string1)))
184     (,string2 (if (stringp ,string2) ,string2 (string ,string2))))
185     (with-array-data ((,string1 ,string1 :offset-var ,cum-offset-1)
186     (,start1 ,start1)
187     (,end1 (or ,end1 (length (the vector ,string1)))))
188     (with-array-data ((,string2 ,string2)
189     (,start2 ,start2)
190     (,end2 (or ,end2 (length (the vector ,string2)))))
191     ,@forms))))
192 ram 1.1
193     )
194 wlott 1.2
195 ram 1.1
196     (defun char (string index)
197     "Given a string and a non-negative integer index less than the length of
198     the string, returns the character object representing the character at
199     that position in the string."
200 ram 1.4 (declare (optimize (safety 1)))
201 ram 1.1 (char string index))
202    
203     (defun %charset (string index new-el)
204 ram 1.4 (declare (optimize (safety 1)))
205 ram 1.1 (setf (char string index) new-el))
206    
207     (defun schar (string index)
208     "SCHAR returns the character object at an indexed position in a string
209 rtoy 1.12.30.14 just as CHAR does, except the string must be a simple-string."
210 ram 1.4 (declare (optimize (safety 1)))
211 ram 1.1 (schar string index))
212    
213     (defun %scharset (string index new-el)
214 ram 1.4 (declare (optimize (safety 1)))
215 ram 1.1 (setf (schar string index) new-el))
216    
217     (defun string=* (string1 string2 start1 end1 start2 end2)
218 ram 1.4 (with-two-strings string1 string2 start1 end1 offset1 start2 end2
219     (not (%sp-string-compare string1 start1 end1 string2 start2 end2))))
220 ram 1.1
221    
222     (defun string/=* (string1 string2 start1 end1 start2 end2)
223 ram 1.4 (with-two-strings string1 string2 start1 end1 offset1 start2 end2
224     (let ((comparison (%sp-string-compare string1 start1 end1
225     string2 start2 end2)))
226     (if comparison (- (the fixnum comparison) offset1)))))
227 ram 1.1
228     (eval-when (compile eval)
229    
230     ;;; Lessp is true if the desired expansion is for string<* or string<=*.
231     ;;; Equalp is true if the desired expansion is for string<=* or string>=*.
232     (defmacro string<>=*-body (lessp equalp)
233     (let ((offset1 (gensym)))
234 ram 1.4 `(with-two-strings string1 string2 start1 end1 ,offset1 start2 end2
235     (let ((index (%sp-string-compare string1 start1 end1
236     string2 start2 end2)))
237     (if index
238 ram 1.8 (cond ((= (the fixnum index) (the fixnum end1))
239     ,(if lessp
240     `(- (the fixnum index) ,offset1)
241     `nil))
242     ((= (+ (the fixnum index) (- start2 start1))
243     (the fixnum end2))
244     ,(if lessp
245     `nil
246 rtoy 1.12.30.1 `(- (the fixnum index) ,offset1)))
247     #-unicode
248 ram 1.4 ((,(if lessp 'char< 'char>)
249     (schar string1 index)
250     (schar string2 (+ (the fixnum index) (- start2 start1))))
251     (- (the fixnum index) ,offset1))
252 rtoy 1.12.30.1 #-unicode
253     (t nil)
254     #+unicode
255     (t
256     ;; Compare in code point order. See
257     ;; http://icu-project.org/docs/papers/utf16_code_point_order.html
258     (flet ((fixup (code)
259     (if (>= code #xe000)
260     (- code #x800)
261     (+ code #x2000))))
262     (declare (inline fixup))
263     (let* ((c1 (char-code (schar string1 index)))
264 rtoy 1.12.30.10 (c2 (char-code (schar string2
265     (+ (the fixnum index)
266     (- start2 start1))))))
267 rtoy 1.12.30.1 (cond ((and (>= c1 #xd800)
268     (>= c2 #xd800))
269     (let ((fix-c1 (fixup c1))
270     (fix-c2 (fixup c2)))
271     (if (,(if lessp '< '>) fix-c1 fix-c2)
272     (- (the fixnum index) ,offset1)
273     nil)))
274     (t
275     (if (,(if lessp '< '>) c1 c2)
276 rtoy 1.12.30.10 (- (the fixnum index) ,offset1)
277     nil)))))))
278 ram 1.4 ,(if equalp `(- (the fixnum end1) ,offset1) 'nil))))))
279 ram 1.1 ) ; eval-when
280    
281     (defun string<* (string1 string2 start1 end1 start2 end2)
282     (declare (fixnum start1 start2))
283     (string<>=*-body t nil))
284    
285     (defun string>* (string1 string2 start1 end1 start2 end2)
286     (declare (fixnum start1 start2))
287     (string<>=*-body nil nil))
288    
289     (defun string<=* (string1 string2 start1 end1 start2 end2)
290     (declare (fixnum start1 start2))
291     (string<>=*-body t t))
292    
293     (defun string>=* (string1 string2 start1 end1 start2 end2)
294     (declare (fixnum start1 start2))
295     (string<>=*-body nil t))
296    
297    
298    
299     (defun string< (string1 string2 &key (start1 0) end1 (start2 0) end2)
300     "Given two strings, if the first string is lexicographically less than
301     the second string, returns the longest common prefix (using char=)
302     of the two strings. Otherwise, returns ()."
303     (string<* string1 string2 start1 end1 start2 end2))
304    
305     (defun string> (string1 string2 &key (start1 0) end1 (start2 0) end2)
306     "Given two strings, if the first string is lexicographically greater than
307     the second string, returns the longest common prefix (using char=)
308     of the two strings. Otherwise, returns ()."
309     (string>* string1 string2 start1 end1 start2 end2))
310    
311    
312     (defun string<= (string1 string2 &key (start1 0) end1 (start2 0) end2)
313     "Given two strings, if the first string is lexicographically less than
314     or equal to the second string, returns the longest common prefix
315     (using char=) of the two strings. Otherwise, returns ()."
316     (string<=* string1 string2 start1 end1 start2 end2))
317    
318     (defun string>= (string1 string2 &key (start1 0) end1 (start2 0) end2)
319     "Given two strings, if the first string is lexicographically greater
320     than or equal to the second string, returns the longest common prefix
321     (using char=) of the two strings. Otherwise, returns ()."
322     (string>=* string1 string2 start1 end1 start2 end2))
323    
324     (defun string= (string1 string2 &key (start1 0) end1 (start2 0) end2)
325     "Given two strings (string1 and string2), and optional integers start1,
326     start2, end1 and end2, compares characters in string1 to characters in
327     string2 (using char=)."
328     (string=* string1 string2 start1 end1 start2 end2))
329    
330     (defun string/= (string1 string2 &key (start1 0) end1 (start2 0) end2)
331     "Given two strings, if the first string is not lexicographically equal
332     to the second string, returns the longest common prefix (using char=)
333     of the two strings. Otherwise, returns ()."
334     (string/=* string1 string2 start1 end1 start2 end2))
335    
336    
337     (eval-when (compile eval)
338    
339     ;;; STRING-NOT-EQUAL-LOOP is used to generate character comparison loops for
340     ;;; STRING-EQUAL and STRING-NOT-EQUAL.
341     (defmacro string-not-equal-loop (end end-value
342     &optional (abort-value nil abortp))
343     (declare (fixnum end))
344     (let ((end-test (if (= end 1)
345     `(= index1 (the fixnum end1))
346     `(= index2 (the fixnum end2)))))
347     `(do ((index1 start1 (1+ index1))
348     (index2 start2 (1+ index2)))
349     (,(if abortp
350     end-test
351     `(or ,end-test
352     (not (char-equal (schar string1 index1)
353     (schar string2 index2)))))
354     ,end-value)
355     (declare (fixnum index1 index2))
356     ,@(if abortp
357     `((if (not (char-equal (schar string1 index1)
358     (schar string2 index2)))
359     (return ,abort-value)))))))
360    
361     ) ; eval-when
362    
363     (defun string-equal (string1 string2 &key (start1 0) end1 (start2 0) end2)
364     "Given two strings (string1 and string2), and optional integers start1,
365     start2, end1 and end2, compares characters in string1 to characters in
366     string2 (using char-equal)."
367     (declare (fixnum start1 start2))
368 ram 1.4 (with-two-strings string1 string2 start1 end1 offset1 start2 end2
369     (let ((slen1 (- (the fixnum end1) start1))
370     (slen2 (- (the fixnum end2) start2)))
371     (declare (fixnum slen1 slen2))
372     (if (or (minusp slen1) (minusp slen2))
373     ;;prevent endless looping later.
374     (error "Improper bounds for string comparison."))
375     (if (= slen1 slen2)
376     ;;return () immediately if lengths aren't equal.
377     (string-not-equal-loop 1 t nil)))))
378 ram 1.1
379     (defun string-not-equal (string1 string2 &key (start1 0) end1 (start2 0) end2)
380     "Given two strings, if the first string is not lexicographically equal
381     to the second string, returns the longest common prefix (using char-equal)
382     of the two strings. Otherwise, returns ()."
383 ram 1.4 (with-two-strings string1 string2 start1 end1 offset1 start2 end2
384     (let ((slen1 (- end1 start1))
385     (slen2 (- end2 start2)))
386     (declare (fixnum slen1 slen2))
387     (if (or (minusp slen1) (minusp slen2))
388     ;;prevent endless looping later.
389     (error "Improper bounds for string comparison."))
390     (cond ((or (minusp slen1) (or (minusp slen2)))
391     (error "Improper substring for comparison."))
392     ((= slen1 slen2)
393     (string-not-equal-loop 1 nil (- index1 offset1)))
394     ((< slen1 slen2)
395     (string-not-equal-loop 1 (- index1 offset1)))
396     (t
397     (string-not-equal-loop 2 (- index1 offset1)))))))
398 ram 1.1
399    
400    
401     (eval-when (compile eval)
402    
403     ;;; STRING-LESS-GREATER-EQUAL-TESTS returns a test on the lengths of string1
404     ;;; and string2 and a test on the current characters from string1 and string2
405     ;;; for the following macro.
406     (defun string-less-greater-equal-tests (lessp equalp)
407     (if lessp
408     (if equalp
409     ;; STRING-NOT-GREATERP
410 rtoy 1.12.30.15 (values '<=
411     #-unicode `(not (char-greaterp char1 char2))
412     #+unicode `(<= char1 char2))
413 ram 1.1 ;; STRING-LESSP
414 rtoy 1.12.30.15 (values '<
415     #-unicode `(char-lessp char1 char2)
416     #+unicode `(< char1 char2)))
417 ram 1.1 (if equalp
418     ;; STRING-NOT-LESSP
419 rtoy 1.12.30.15 (values '>=
420     #-unicode `(not (char-lessp char1 char2))
421     #+unicode `(>= char1 char2))
422 ram 1.1 ;; STRING-GREATERP
423 rtoy 1.12.30.15 (values '>
424     #-unicode `(char-greaterp char1 char2)
425     #+unicode `(> char1 char2)))))
426 ram 1.1
427 rtoy 1.12.30.15 #-unicode
428 ram 1.1 (defmacro string-less-greater-equal (lessp equalp)
429     (multiple-value-bind (length-test character-test)
430     (string-less-greater-equal-tests lessp equalp)
431 ram 1.4 `(with-two-strings string1 string2 start1 end1 offset1 start2 end2
432     (let ((slen1 (- (the fixnum end1) start1))
433     (slen2 (- (the fixnum end2) start2)))
434     (declare (fixnum slen1 slen2))
435     (if (or (minusp slen1) (minusp slen2))
436     ;;prevent endless looping later.
437     (error "Improper bounds for string comparison."))
438     (do ((index1 start1 (1+ index1))
439     (index2 start2 (1+ index2))
440     (char1)
441     (char2))
442     ((or (= index1 (the fixnum end1)) (= index2 (the fixnum end2)))
443     (if (,length-test slen1 slen2) (- index1 offset1)))
444     (declare (fixnum index1 index2))
445     (setq char1 (schar string1 index1))
446     (setq char2 (schar string2 index2))
447     (if (not (char-equal char1 char2))
448     (if ,character-test
449     (return (- index1 offset1))
450     (return ()))))))))
451 ram 1.1
452 rtoy 1.12.30.18 ;; Convert to lowercase for case folding, to match what Unicode
453     ;; CaseFolding.txt says. An example where this matters: U+1E9E maps
454     ;; to U+00DF. But the uppercase version of U+00DF is U+00DF.
455 rtoy 1.12.30.15 #+unicode
456     (defmacro equal-char-codepoint (codepoint)
457     `(let ((ch ,codepoint))
458 rtoy 1.12.30.18 ;; Handle ASCII separately for bootstrapping and for unidata missing.
459     (if (< 64 ch 91)
460     (+ ch 32)
461 rtoy 1.12.30.15 #-(and unicode (not unicode-bootstrap))
462     ch
463     #+(and unicode (not unicode-bootstrap))
464 rtoy 1.12.30.18 (if (> ch 127) (unicode-lower ch) ch))))
465 rtoy 1.12.30.15
466     #+unicode
467     (defmacro string-less-greater-equal (lessp equalp)
468     (multiple-value-bind (length-test character-test)
469     (string-less-greater-equal-tests lessp equalp)
470     `(with-two-strings string1 string2 start1 end1 offset1 start2 end2
471     (let ((slen1 (- (the fixnum end1) start1))
472     (slen2 (- (the fixnum end2) start2)))
473     (declare (fixnum slen1 slen2))
474     (if (or (minusp slen1) (minusp slen2))
475     ;;prevent endless looping later.
476     (error "Improper bounds for string comparison."))
477     (do ((index1 start1 (1+ index1))
478     (index2 start2 (1+ index2)))
479     ((or (= index1 (the fixnum end1)) (= index2 (the fixnum end2)))
480     (if (,length-test slen1 slen2) (- index1 offset1)))
481     (declare (fixnum index1 index2))
482     (multiple-value-bind (char1 wide1)
483     (codepoint string1 index1)
484 rtoy 1.12.30.18 (declare (type (integer 0 #x10ffff) char1))
485 rtoy 1.12.30.15 (multiple-value-bind (char2 wide2)
486     (codepoint string2 index2)
487 rtoy 1.12.30.18 (declare (type (integer 0 #x10ffff) char2))
488     (setf char1 (equal-char-codepoint char1))
489     (setf char2 (equal-char-codepoint char2))
490     (if (= char1 char2)
491 rtoy 1.12.30.15 (progn
492     (when wide1 (incf index1))
493     (when wide2 (incf index2)))
494     (if ,character-test
495     (return (- index1 offset1))
496     (return ()))))))))))
497    
498 ram 1.1 ) ; eval-when
499    
500     (defun string-lessp* (string1 string2 start1 end1 start2 end2)
501     (declare (fixnum start1 start2))
502     (string-less-greater-equal t nil))
503    
504     (defun string-greaterp* (string1 string2 start1 end1 start2 end2)
505     (declare (fixnum start1 start2))
506     (string-less-greater-equal nil nil))
507    
508     (defun string-not-lessp* (string1 string2 start1 end1 start2 end2)
509     (declare (fixnum start1 start2))
510     (string-less-greater-equal nil t))
511    
512     (defun string-not-greaterp* (string1 string2 start1 end1 start2 end2)
513     (declare (fixnum start1 start2))
514     (string-less-greater-equal t t))
515    
516     (defun string-lessp (string1 string2 &key (start1 0) end1 (start2 0) end2)
517     "Given two strings, if the first string is lexicographically less than
518     the second string, returns the longest common prefix (using char-equal)
519     of the two strings. Otherwise, returns ()."
520     (string-lessp* string1 string2 start1 end1 start2 end2))
521    
522     (defun string-greaterp (string1 string2 &key (start1 0) end1 (start2 0) end2)
523     "Given two strings, if the first string is lexicographically greater than
524     the second string, returns the longest common prefix (using char-equal)
525     of the two strings. Otherwise, returns ()."
526     (string-greaterp* string1 string2 start1 end1 start2 end2))
527    
528     (defun string-not-lessp (string1 string2 &key (start1 0) end1 (start2 0) end2)
529     "Given two strings, if the first string is lexicographically greater
530     than or equal to the second string, returns the longest common prefix
531     (using char-equal) of the two strings. Otherwise, returns ()."
532     (string-not-lessp* string1 string2 start1 end1 start2 end2))
533    
534     (defun string-not-greaterp (string1 string2 &key (start1 0) end1 (start2 0)
535     end2)
536     "Given two strings, if the first string is lexicographically less than
537     or equal to the second string, returns the longest common prefix
538     (using char-equal) of the two strings. Otherwise, returns ()."
539     (string-not-greaterp* string1 string2 start1 end1 start2 end2))
540    
541    
542 dtc 1.9 (defun make-string (count &key element-type ((:initial-element fill-char)))
543 ram 1.1 "Given a character count and an optional fill character, makes and returns
544 rtoy 1.12.30.14 a new string Count long filled with the fill character."
545 emarsden 1.12 (declare (type fixnum count))
546     (assert (subtypep element-type 'character))
547 ram 1.1 (if fill-char
548     (do ((i 0 (1+ i))
549     (string (make-string count)))
550     ((= i count) string)
551     (declare (fixnum i))
552     (setf (schar string i) fill-char))
553     (make-string count)))
554    
555     (defun string-upcase (string &key (start 0) end)
556     "Given a string, returns a new string that is a copy of it with
557     all lower case alphabetic characters converted to uppercase."
558     (declare (fixnum start))
559 ram 1.5 (let* ((string (if (stringp string) string (string string)))
560     (slen (length string)))
561 ram 1.4 (declare (fixnum slen))
562 ram 1.1 (with-one-string string start end offset
563     (let ((offset-slen (+ slen offset))
564     (newstring (make-string slen)))
565     (declare (fixnum offset-slen))
566     (do ((index offset (1+ index))
567     (new-index 0 (1+ new-index)))
568     ((= index start))
569     (declare (fixnum index new-index))
570     (setf (schar newstring new-index) (schar string index)))
571     (do ((index start (1+ index))
572     (new-index (- start offset) (1+ new-index)))
573     ((= index (the fixnum end)))
574     (declare (fixnum index new-index))
575 rtoy 1.12.30.6 (multiple-value-bind (code wide) (codepoint string index)
576     (when wide (incf index))
577     ;; Handle ASCII specially because this is called early in
578     ;; initialization, before unidata is available.
579     (cond ((< 96 code 123) (decf code 32))
580     ((> code 127) (setq code (unicode-upper code))))
581     ;;@@ WARNING: this may, in theory, need to extend newstring
582     ;; but that never actually occurs as of Unicode 5.1.0,
583     ;; so I'm just going to ignore it for now...
584     (multiple-value-bind (hi lo) (surrogates code)
585 rtoy 1.12.30.8 (setf (schar newstring new-index) hi)
586 rtoy 1.12.30.6 (when lo
587 rtoy 1.12.30.8 (setf (schar newstring (incf new-index)) lo)))))
588 rtoy 1.12.30.4 ;;@@ WARNING: see above
589 ram 1.1 (do ((index end (1+ index))
590     (new-index (- (the fixnum end) offset) (1+ new-index)))
591     ((= index offset-slen))
592     (declare (fixnum index new-index))
593     (setf (schar newstring new-index) (schar string index)))
594     newstring))))
595    
596     (defun string-downcase (string &key (start 0) end)
597     "Given a string, returns a new string that is a copy of it with
598     all upper case alphabetic characters converted to lowercase."
599     (declare (fixnum start))
600 ram 1.5 (let* ((string (if (stringp string) string (string string)))
601     (slen (length string)))
602 ram 1.4 (declare (fixnum slen))
603 ram 1.1 (with-one-string string start end offset
604     (let ((offset-slen (+ slen offset))
605     (newstring (make-string slen)))
606     (declare (fixnum offset-slen))
607     (do ((index offset (1+ index))
608     (new-index 0 (1+ new-index)))
609     ((= index start))
610     (declare (fixnum index new-index))
611     (setf (schar newstring new-index) (schar string index)))
612     (do ((index start (1+ index))
613     (new-index (- start offset) (1+ new-index)))
614     ((= index (the fixnum end)))
615     (declare (fixnum index new-index))
616 rtoy 1.12.30.6 (multiple-value-bind (code wide) (codepoint string index)
617     (when wide (incf index))
618     ;; Handle ASCII specially because this is called early in
619     ;; initialization, before unidata is available.
620     (cond ((< 64 code 91) (incf code 32))
621     ((> code 127) (setq code (unicode-lower code))))
622     ;;@@ WARNING: this may, in theory, need to extend newstring
623     ;; but that never actually occurs as of Unicode 5.1.0,
624     ;; so I'm just going to ignore it for now...
625     (multiple-value-bind (hi lo) (surrogates code)
626 rtoy 1.12.30.8 (setf (schar newstring new-index) hi)
627 rtoy 1.12.30.6 (when lo
628 rtoy 1.12.30.8 (setf (schar newstring (incf new-index)) lo)))))
629 rtoy 1.12.30.4 ;;@@ WARNING: see above
630 ram 1.1 (do ((index end (1+ index))
631     (new-index (- (the fixnum end) offset) (1+ new-index)))
632     ((= index offset-slen))
633     (declare (fixnum index new-index))
634     (setf (schar newstring new-index) (schar string index)))
635     newstring))))
636    
637     (defun string-capitalize (string &key (start 0) end)
638     "Given a string, returns a copy of the string with the first
639     character of each ``word'' converted to upper-case, and remaining
640     chars in the word converted to lower case. A ``word'' is defined
641     to be a string of case-modifiable characters delimited by
642     non-case-modifiable chars."
643     (declare (fixnum start))
644 ram 1.5 (let* ((string (if (stringp string) string (string string)))
645     (slen (length string)))
646 ram 1.4 (declare (fixnum slen))
647 ram 1.1 (with-one-string string start end offset
648     (let ((offset-slen (+ slen offset))
649     (newstring (make-string slen)))
650     (declare (fixnum offset-slen))
651     (do ((index offset (1+ index))
652     (new-index 0 (1+ new-index)))
653     ((= index start))
654     (declare (fixnum index new-index))
655     (setf (schar newstring new-index) (schar string index)))
656     (do ((index start (1+ index))
657     (new-index (- start offset) (1+ new-index))
658     (newword t)
659     (char ()))
660     ((= index (the fixnum end)))
661     (declare (fixnum index new-index))
662     (setq char (schar string index))
663     (cond ((not (alphanumericp char))
664     (setq newword t))
665     (newword
666     ;;char is first case-modifiable after non-case-modifiable
667 rtoy 1.12.30.2 (setq char (char-titlecase char))
668 ram 1.1 (setq newword ()))
669     ;;char is case-modifiable, but not first
670     (t (setq char (char-downcase char))))
671     (setf (schar newstring new-index) char))
672     (do ((index end (1+ index))
673     (new-index (- (the fixnum end) offset) (1+ new-index)))
674     ((= index offset-slen))
675     (declare (fixnum index new-index))
676     (setf (schar newstring new-index) (schar string index)))
677     newstring))))
678    
679     (defun nstring-upcase (string &key (start 0) end)
680     "Given a string, returns that string with all lower case alphabetic
681     characters converted to uppercase."
682     (declare (fixnum start))
683 ram 1.4 (let ((save-header string))
684 ram 1.1 (with-one-string string start end offset
685     (do ((index start (1+ index)))
686     ((= index (the fixnum end)))
687     (declare (fixnum index))
688 rtoy 1.12.30.6 (multiple-value-bind (code wide) (codepoint string index)
689     (declare (ignore wide))
690     ;; Handle ASCII specially because this is called early in
691     ;; initialization, before unidata is available.
692     (cond ((< 96 code 123) (decf code 32))
693     ((> code 127) (setq code (unicode-upper code))))
694     ;;@@ WARNING: this may, in theory, need to extend string
695     ;; (which, obviously, we can't do here. Unless
696     ;; STRING is adjustable, maybe)
697     ;; but that never actually occurs as of Unicode 5.1.0,
698     ;; so I'm just going to ignore it for now...
699     (multiple-value-bind (hi lo) (surrogates code)
700 rtoy 1.12.30.8 (setf (schar string index) hi)
701 rtoy 1.12.30.6 (when lo
702 rtoy 1.12.30.8 (setf (schar string (incf index)) lo))))))
703 ram 1.1 save-header))
704    
705     (defun nstring-downcase (string &key (start 0) end)
706     "Given a string, returns that string with all upper case alphabetic
707     characters converted to lowercase."
708     (declare (fixnum start))
709 ram 1.4 (let ((save-header string))
710 ram 1.1 (with-one-string string start end offset
711     (do ((index start (1+ index)))
712     ((= index (the fixnum end)))
713     (declare (fixnum index))
714 rtoy 1.12.30.6 (multiple-value-bind (code wide) (codepoint string index)
715     (declare (ignore wide))
716     (cond ((< 64 code 91) (incf code 32))
717     ((> code 127) (setq code (unicode-lower code))))
718     ;;@@ WARNING: this may, in theory, need to extend string
719     ;; (which, obviously, we can't do here. Unless
720     ;; STRING is adjustable, maybe)
721     ;; but that never actually occurs as of Unicode 5.1.0,
722     ;; so I'm just going to ignore it for now...
723     (multiple-value-bind (hi lo) (surrogates code)
724 rtoy 1.12.30.8 (setf (schar string index) hi)
725 rtoy 1.12.30.6 (when lo
726 rtoy 1.12.30.8 (setf (schar string (incf index)) lo))))))
727 wlott 1.6 save-header))
728 ram 1.1
729     (defun nstring-capitalize (string &key (start 0) end)
730     "Given a string, returns that string with the first
731     character of each ``word'' converted to upper-case, and remaining
732     chars in the word converted to lower case. A ``word'' is defined
733     to be a string of case-modifiable characters delimited by
734     non-case-modifiable chars."
735     (declare (fixnum start))
736 ram 1.4 (let ((save-header string))
737 ram 1.1 (with-one-string string start end offset
738     (do ((index start (1+ index))
739     (newword t)
740     (char ()))
741     ((= index (the fixnum end)))
742     (declare (fixnum index))
743     (setq char (schar string index))
744     (cond ((not (alphanumericp char))
745     (setq newword t))
746     (newword
747     ;;char is first case-modifiable after non-case-modifiable
748 rtoy 1.12.30.2 (setf (schar string index) (char-titlecase char))
749 ram 1.1 (setq newword ()))
750     (t
751     (setf (schar string index) (char-downcase char))))))
752     save-header))
753    
754     (defun string-left-trim (char-bag string)
755     "Given a set of characters (a list or string) and a string, returns
756     a copy of the string with the characters in the set removed from the
757     left end."
758     (with-string string
759     (do ((index start (1+ index)))
760     ((or (= index (the fixnum end))
761     (not (find (schar string index) char-bag)))
762     (subseq (the simple-string string) index end))
763     (declare (fixnum index)))))
764    
765     (defun string-right-trim (char-bag string)
766     "Given a set of characters (a list or string) and a string, returns
767     a copy of the string with the characters in the set removed from the
768     right end."
769     (with-string string
770     (do ((index (1- (the fixnum end)) (1- index)))
771     ((or (< index start) (not (find (schar string index) char-bag)))
772     (subseq (the simple-string string) start (1+ index)))
773     (declare (fixnum index)))))
774    
775     (defun string-trim (char-bag string)
776     "Given a set of characters (a list or string) and a string, returns a
777     copy of the string with the characters in the set removed from both
778     ends."
779     (with-string string
780     (let* ((left-end (do ((index start (1+ index)))
781     ((or (= index (the fixnum end))
782     (not (find (schar string index) char-bag)))
783     index)
784     (declare (fixnum index))))
785     (right-end (do ((index (1- (the fixnum end)) (1- index)))
786     ((or (< index left-end)
787     (not (find (schar string index) char-bag)))
788     (1+ index))
789     (declare (fixnum index)))))
790     (subseq (the simple-string string) left-end right-end))))
791 rtoy 1.12.30.3
792 rtoy 1.12.30.4 (declaim (inline %glyph-f %glyph-b))
793     (defun %glyph-f (string index)
794     (declare (optimize (speed 3) (space 0) (safety 0) (debug 0))
795     (type simple-string string) (type kernel:index index))
796 rtoy 1.12.30.10 (let* ((prev 0)
797     (l (length string))
798     (c (codepoint string index l))
799     (n (+ index (if (> c #xFFFF) 2 1))))
800     (declare (type (integer 0 #x10FFFF) c) (type kernel:index l n))
801     (loop while (< n l) do
802     (let* ((c (codepoint string n l))
803     (d (the (unsigned-byte 8) (unicode-combining-class c))))
804     (when (or (zerop d) (< d prev))
805     (return))
806     (setq prev d)
807     (incf n (if (> c #xFFFF) 2 1))))
808     n))
809 rtoy 1.12.30.4
810     (defun %glyph-b (string index)
811     (declare (optimize (speed 3) (space 0) (safety 0) (debug 0))
812     (type simple-string string) (type kernel:index index))
813 rtoy 1.12.30.11 (let* ((prev 255)
814     (n (1- index)))
815     (declare (type kernel:index n))
816     (loop until (< n 0) do
817     (let* ((c (codepoint string n 0))
818     (d (the (unsigned-byte 8) (unicode-combining-class c))))
819     (cond ((zerop d) (return))
820     ((> d prev) (incf n (if (> c #xFFFF) 2 1)) (return)))
821     (setq prev d)
822     (decf n (if (> c #xFFFF) 2 1))))
823     n))
824 rtoy 1.12.30.4
825     (defun glyph (string index &key (from-end nil))
826 rtoy 1.12.30.3 "GLYPH returns the glyph at the indexed position in a string, and the
827 rtoy 1.12.30.4 position of the next glyph (or NIL) as a second value. A glyph is
828     a substring consisting of the character at INDEX followed by all
829     subsequent combining characters."
830 rtoy 1.12.30.3 (declare (type simple-string string) (type kernel:index index))
831     #-unicode
832     (char string index)
833     #+unicode
834     (with-array-data ((string string) (start) (end))
835     (declare (ignore start end))
836 rtoy 1.12.30.4 (let ((n (if from-end (%glyph-b string index) (%glyph-f string index))))
837     (if from-end
838     (values (subseq string n index) (and (> n 0) n))
839     (values (subseq string index n) (and (< n (length string)) n))))))
840 rtoy 1.12.30.3
841 rtoy 1.12.30.4 (defun sglyph (string index &key (from-end nil))
842 rtoy 1.12.30.3 "SGLYPH returns the glyph at the indexed position, the same as GLYPH,
843 rtoy 1.12.30.4 except that the string must be a simple-string"
844 rtoy 1.12.30.3 (declare (type simple-string string) (type kernel:index index))
845     #-unicode
846     (schar string index)
847     #+unicode
848 rtoy 1.12.30.4 (let ((n (if from-end (%glyph-b string index) (%glyph-f string index))))
849     (if from-end
850     (values (subseq string n index) (and (> n 0) n))
851     (values (subseq string index n) (and (< n (length string)) n)))))
852    
853 rtoy 1.12.30.9 (defun decompose (string &optional (compatibility t))
854     (declare (type string string))
855     (let ((result (make-string (cond ((< (length string) 40)
856     (* 5 (length string)))
857     ((< (length string) 4096)
858     (* 2 (length string)))
859     (t (round (length string) 5/6)))))
860 rtoy 1.12.30.10 (fillptr 0))
861     (declare (type kernel:index fillptr))
862 rtoy 1.12.30.9 (labels ((rec (string)
863     (declare (type simple-string string))
864     (do ((i 0 (1+ i)))
865     ((= i (length string)))
866     (declare (type kernel:index i))
867     (multiple-value-bind (code wide) (codepoint string i)
868     (when wide (incf i))
869     (let ((decomp (unicode-decomp code compatibility)))
870     (if decomp (rec decomp) (out code))))))
871     (out (code)
872     (multiple-value-bind (hi lo) (surrogates code)
873     (outch hi)
874     (when lo
875 rtoy 1.12.30.10 (outch lo))
876     (let ((cc (unicode-combining-class code)))
877     (unless (zerop cc)
878 rtoy 1.12.30.20 (order lo cc (- fillptr (if lo 3 2)))))))
879 rtoy 1.12.30.9 (outch (char)
880     (when (= fillptr (length result))
881     (let ((tmp (make-string (round (length result) 5/6))))
882     (replace tmp result)
883     (setq result tmp)))
884     (setf (schar result fillptr) char)
885     (incf fillptr))
886 rtoy 1.12.30.10 (order (wide1 cc last)
887     (loop until (minusp last) do
888     (multiple-value-bind (code2 wide2) (codepoint result last)
889     (let ((cc2 (unicode-combining-class code2)))
890     (cond ((zerop cc2) (return))
891     ((> cc2 cc)
892     (case (+ (if wide2 2 0) (if wide1 1 0))
893     (0 (rotatef (schar result last)
894     (schar result (1+ last))))
895     (1 (rotatef (schar result last)
896     (schar result (+ last 1))
897     (schar result (+ last 2))))
898     (2 (rotatef (schar result last)
899     (schar result (1- last))
900     (schar result (1+ last))))
901     (3 (rotatef (schar result last)
902     (schar result (+ last 2)))
903     (rotatef (schar result (1- last))
904     (schar result (1+ last)))))
905     (decf last (if wide2 2 1)))
906     (t (return))))))))
907 rtoy 1.12.30.9 (with-array-data ((string string) (start) (end))
908     (declare (ignore start end))
909     (rec string))
910     (shrink-vector result fillptr))))
911    
912 rtoy 1.12.30.21 (declaim (inline normalized-codepoint-p))
913     (defun normalized-codepoint-p (cp form)
914     (ecase form
915     (:nfc (unicode-nfc-qc cp))
916     (:nfkc (unicode-nfkc-qc cp))
917     (:nfd (unicode-nfd-qc cp))
918     (:nfkd (unicode-nfkd-qc cp))))
919    
920     ;; Perform check to see if string is already normalized. The Unicode
921     ;; example can return YES, NO, or MAYBE. For our purposes, only YES
922     ;; is important, for which we return T. For NO or MAYBE, we return NIL.
923     (defun normalized-form-p (string &optional (form :nfc))
924     (declare (type string string)
925     (type (member :nfc :nfkc :nfd :nfkd) form)
926     (optimize (speed 3)))
927     (let ((last-class 0)
928     (len (length string)))
929     (declare (type (integer 0 256) last-class))
930     (do ((k 0 (1+ k)))
931     ((>= k len))
932     (declare (type kernel:index k))
933     (multiple-value-bind (ch widep)
934     (codepoint string k len)
935     (when widep (incf k))
936     ;; Handle ASCII specially
937     (unless (< ch 128)
938     (let ((class (unicode-combining-class ch)))
939     (declare (type (unsigned-byte 8) class))
940     (when (and (> last-class class) (not (zerop class)))
941     ;; Definitely not normalized
942     (return-from normalized-form-p nil))
943     (let ((check (normalized-codepoint-p ch form)))
944     (unless (eq check :y)
945     (return-from normalized-form-p nil)))
946     (setf last-class class)))))
947     t))
948    
949    
950     ;; Compose a string in place. The string must already be in decomposed form.
951     (defun %compose (target)
952     (declare (type string target)
953     (optimize (speed 3)))
954     (let ((len (length target))
955     (starter-pos 0))
956     (declare (type kernel:index starter-pos))
957     (multiple-value-bind (starter-ch wide)
958     (codepoint target 0 len)
959     (let ((comp-pos (if wide 2 1))
960     (last-class (unicode-combining-class starter-ch)))
961     (declare (type (integer 0 256) last-class)
962     (type kernel:index comp-pos))
963     (unless (zerop last-class)
964     ;; Fix for strings starting with a combining character
965     (setf last-class 256))
966     ;; Loop on decomposed characters, combining where possible
967     (do ((decomp-pos comp-pos (1+ decomp-pos)))
968     ((>= decomp-pos len))
969     (declare (type kernel:index decomp-pos))
970     (multiple-value-bind (ch wide)
971     (codepoint target decomp-pos len)
972     (when wide (incf decomp-pos))
973     (let ((ch-class (unicode-combining-class ch))
974     (composite (get-pairwise-composition starter-ch ch)))
975     (declare (type (integer 0 256) ch-class))
976     (cond ((and composite
977     (or (< last-class ch-class) (zerop last-class)))
978     ;; Don't have to worry about surrogate pairs here
979     ;; because the composite is always in the BMP.
980     (setf (aref target starter-pos) (code-char composite))
981     (setf starter-ch composite))
982     (t
983     (when (zerop ch-class)
984     (setf starter-pos comp-pos)
985     (setf starter-ch ch))
986     (setf last-class ch-class)
987     (multiple-value-bind (hi lo)
988     (surrogates ch)
989     (setf (aref target comp-pos) hi)
990     (when lo
991     (incf comp-pos)
992     (setf (aref target comp-pos) lo))
993     (incf comp-pos)))))))
994     (shrink-vector target comp-pos)))))
995    
996 rtoy 1.12.30.4 (defun string-to-nfd (string)
997 rtoy 1.12.30.13 "Convert String to Unicode Normalization Form D (NFD) using the
998     canonical decomposition. The NFD string is returned"
999 rtoy 1.12.30.9 (decompose string nil))
1000 rtoy 1.12.30.4
1001     (defun string-to-nfkd (string)
1002 rtoy 1.12.30.13 "Convert String to Unicode Normalization Form KD (NFKD) uisng the
1003     compatible decomposition form. The NFKD string is returned."
1004 rtoy 1.12.30.9 (decompose string t))
1005 rtoy 1.12.30.4
1006 rtoy 1.12.30.12 #+unicode
1007 rtoy 1.12.30.4 (defun string-to-nfc (string)
1008 rtoy 1.12.30.22 "Convert String to Unicode Normalization Form C (NFC)."
1009 rtoy 1.12.30.23 (let ((s (copy-seq string)))
1010     (if (normalized-form-p s :nfc)
1011     s
1012     (coerce (if (normalized-form-p s :nfd)
1013     (%compose s)
1014     (%compose (string-to-nfd s)))
1015     'simple-string))))
1016 rtoy 1.12.30.12
1017     #-unicode ;; Needed by package.lisp
1018     (defun string-to-nfc (string)
1019     (if (simple-string-p string) string (coerce string 'simple-string)))
1020 rtoy 1.12.30.4
1021     (defun string-to-nfkc (string)
1022 rtoy 1.12.30.22 "Convert String to Unicode Normalization Form KC (NFKC)."
1023 rtoy 1.12.30.23 (let ((s (copy-seq string)))
1024     (if (normalized-form-p s :nfkc)
1025     s
1026     (coerce (if (normalized-form-p s :nfkd)
1027     (%compose s)
1028     (%compose (string-to-nfkd s)))
1029     'simple-string))))

  ViewVC Help
Powered by ViewVC 1.1.5