/[cmucl]/src/code/string.lisp
ViewVC logotype

Contents of /src/code/string.lisp

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.12.30.19 - (hide annotations)
Wed May 20 21:47:36 2009 UTC (4 years, 10 months ago) by rtoy
Branch: unicode-utf16-extfmt-branch
Changes since 1.12.30.18: +28 -7 lines
string.lisp:
o Add SURROGATEP function to test if something is a surrogate value.

extfmts.lisp:
utf-16-be.lisp:
utf-16-le.lisp:
utf-16.lisp:
utf-32-be.lisp:
utf-32-le.lisp:
utf-32.lisp:
utf-8.lisp:
o Use SURROGATEP.
1 ram 1.1 ;;; -*- Log: code.log; Package: Lisp -*-
2     ;;;
3     ;;; **********************************************************************
4 ram 1.3 ;;; This code was written as part of the CMU Common Lisp project at
5     ;;; Carnegie Mellon University, and has been placed in the public domain.
6     ;;;
7     (ext:file-comment
8 rtoy 1.12.30.19 "$Header: /tiger/var/lib/cvsroots/cmucl/src/code/string.lisp,v 1.12.30.19 2009/05/20 21:47:36 rtoy Exp $")
9 ram 1.3 ;;;
10 ram 1.1 ;;; **********************************************************************
11     ;;;
12 pw 1.11 ;;; Functions to implement strings for CMU Common Lisp
13 ram 1.1 ;;; Written by David Dill
14 ram 1.4 ;;; Rewritten by Skef Wholey, Bill Chiles and Rob MacLachlan.
15 ram 1.1 ;;;
16     ;;; ****************************************************************
17     ;;;
18 ram 1.4 (in-package "LISP")
19 rtoy 1.12.30.3 (export '(char schar glyph sglyph string
20 ram 1.1 string= string-equal string< string> string<= string>= string/=
21     string-lessp string-greaterp string-not-lessp string-not-greaterp
22     string-not-equal
23 rtoy 1.12.30.4 string-to-nfd string-to-nfkd string-to-nfc string-to-nfkc
24 ram 1.1 make-string
25     string-trim string-left-trim string-right-trim
26     string-upcase
27     string-downcase string-capitalize nstring-upcase nstring-downcase
28     nstring-capitalize))
29    
30    
31 rtoy 1.12.30.19 (declaim (inline surrogatep surrogates-to-codepoint codepoint surrogates))
32    
33     (defun surrogatep (c &optional surrogate-type)
34     "Test if C is a surrogate. C may be either an integer or a
35     character. Surrogate-type indicates what kind of surrogate to test
36     for. :High means to test for the high (leading) surrogate; :Low
37     tests for the low (trailing surrogate). A value of :Any or Nil
38     tests for any surrogate value (high or low)."
39     (declare (type (or character (integer 0 #x10ffff)) c))
40     (let ((code (if (characterp c)
41     (char-code c)
42     c)))
43     (ecase surrogate-type
44     ((:high :leading)
45     ;; Test for high surrogate
46     (<= #xD800 code #xDBFF))
47     ((:low :trailing)
48     ;; Test for low surrogate
49     (<= #xDC00 code #xDFFF))
50     ((:any nil)
51     ;; Test for any surrogate
52     (<= #xD800 code #xDFFF)))))
53 rtoy 1.12.30.7
54 rtoy 1.12.30.8 (defun surrogates-to-codepoint (hi lo)
55     "Convert the given Hi and Lo surrogate characters to the
56     corresponding codepoint value"
57     (declare (type character hi lo))
58     (+ (ash (- (the (integer #xD800 #xDBFF) (char-code hi)) #xD800) 10)
59     (the (integer #xDC00 #xDFFF) (char-code lo)) #x2400))
60 rtoy 1.12.30.7
61 rtoy 1.12.30.6 (defun codepoint (string i &optional (end (length string)))
62 rtoy 1.12.30.7 "Return the codepoint value from String at position I. If that
63     position is a surrogate, it is combined with either the previous or
64     following character (when possible) to compute the codepoint. The
65     second return value is NIL if the position is not a surrogate pair.
66     Otherwise +1 or -1 is returned if the position is the high or low
67     surrogate value, respectively."
68 rtoy 1.12.30.6 (declare (type simple-string string) (type kernel:index i end))
69     (let ((code (char-code (schar string i))))
70 rtoy 1.12.30.19 (cond ((and (surrogatep code :high) (< (1+ i) end))
71 rtoy 1.12.30.6 (let ((tmp (char-code (schar string (1+ i)))))
72 rtoy 1.12.30.19 (if (surrogatep tmp :low)
73 rtoy 1.12.30.10 (values (+ (ash (- code #xD800) 10) tmp #x2400) +1)
74 rtoy 1.12.30.6 (values code nil))))
75 rtoy 1.12.30.19 ((and (surrogatep code :low) (> i 0))
76 rtoy 1.12.30.6 (let ((tmp (char-code (schar string (1- i)))))
77 rtoy 1.12.30.19 (if (surrogatep tmp :high)
78 rtoy 1.12.30.10 (values (+ (ash (- tmp #xD800) 10) code #x2400) -1)
79 rtoy 1.12.30.6 (values code nil))))
80     (t (values code nil)))))
81    
82     (defun surrogates (codepoint)
83 rtoy 1.12.30.8 "Return the high and low surrogate characters for Codepoint. If
84     Codepoint is in the BMP, the first return value is the corresponding
85     character and the second is NIL."
86 rtoy 1.12.30.6 (declare (type (integer 0 #x10FFFF) codepoint))
87     (if (< codepoint #x10000)
88 rtoy 1.12.30.8 (values (code-char codepoint) nil)
89 rtoy 1.12.30.6 (let* ((tmp (- codepoint #x10000))
90     (hi (logior (ldb (byte 10 10) tmp) #xD800))
91     (lo (logior (ldb (byte 10 0) tmp) #xDC00)))
92 rtoy 1.12.30.8 (values (code-char hi) (code-char lo)))))
93 rtoy 1.12.30.6
94 rtoy 1.12.30.16 (defun utf16-string-p (string)
95     "Check if String is a valid UTF-16 string. If the string is valid,
96     T is returned. If the string is not valid, NIL is returned, and the
97     second value is the index into the string of the invalid character."
98     (do ((len (length string))
99     (index 0 (1+ index)))
100     ((>= index len)
101     t)
102     (multiple-value-bind (codepoint wide)
103     (codepoint string index)
104 rtoy 1.12.30.17 ;; We step through the string in order. If there are any
105     ;; surrogates pairs, we must reach the lead surrogate first,
106     ;; which means WIDE is +1. Otherwise, we have an invalid
107     ;; surrogate pair. If we get any codepoint that is in
108     ;; the surrogate range, we also have an invalid string.
109 rtoy 1.12.30.16 (when (or (eq wide -1)
110 rtoy 1.12.30.19 (surrogatep codepoint))
111 rtoy 1.12.30.16 (return-from utf16-string-p (values nil index)))
112     (when wide (incf index)))))
113 rtoy 1.12.30.6
114 ram 1.1 (defun string (X)
115     "Coerces X into a string. If X is a string, X is returned. If X is a
116 rtoy 1.12.30.14 symbol, X's pname is returned. If X is a character then a one element
117     string containing that character is returned. If X cannot be coerced
118     into a string, an error occurs."
119 ram 1.1 (cond ((stringp x) x)
120     ((symbolp x) (symbol-name x))
121     ((characterp x)
122     (let ((res (make-string 1)))
123     (setf (schar res 0) x) res))
124     (t
125 pw 1.11 (error 'simple-type-error
126     :datum x
127     :expected-type '(or string symbol character)
128     :format-control "~S cannot be coerced to a string."
129     :format-arguments (list x)))))
130 ram 1.1
131     ;;; With-One-String is used to set up some string hacking things. The keywords
132     ;;; are parsed, and the string is hacked into a simple-string.
133    
134     (eval-when (compile)
135    
136     (defmacro with-one-string (string start end cum-offset &rest forms)
137 ram 1.4 `(let ((,string (if (stringp ,string) ,string (string ,string))))
138 pw 1.10 ;; Optimizer may prove STRING is one.
139     (declare (optimize (ext:inhibit-warnings 3)))
140 ram 1.4 (with-array-data ((,string ,string :offset-var ,cum-offset)
141     (,start ,start)
142     (,end (or ,end (length (the vector ,string)))))
143     ,@forms)))
144 ram 1.1
145     )
146    
147     ;;; With-String is like With-One-String, but doesn't parse keywords.
148    
149     (eval-when (compile)
150    
151     (defmacro with-string (string &rest forms)
152 ram 1.4 `(let ((,string (if (stringp ,string) ,string (string ,string))))
153     (with-array-data ((,string ,string)
154     (start)
155     (end (length (the vector ,string))))
156     ,@forms)))
157 ram 1.1
158     )
159    
160     ;;; With-Two-Strings is used to set up string comparison operations. The
161     ;;; keywords are parsed, and the strings are hacked into simple-strings.
162    
163     (eval-when (compile)
164    
165     (defmacro with-two-strings (string1 string2 start1 end1 cum-offset-1
166     start2 end2 &rest forms)
167 ram 1.4 `(let ((,string1 (if (stringp ,string1) ,string1 (string ,string1)))
168     (,string2 (if (stringp ,string2) ,string2 (string ,string2))))
169     (with-array-data ((,string1 ,string1 :offset-var ,cum-offset-1)
170     (,start1 ,start1)
171     (,end1 (or ,end1 (length (the vector ,string1)))))
172     (with-array-data ((,string2 ,string2)
173     (,start2 ,start2)
174     (,end2 (or ,end2 (length (the vector ,string2)))))
175     ,@forms))))
176 ram 1.1
177     )
178 wlott 1.2
179 ram 1.1
180     (defun char (string index)
181     "Given a string and a non-negative integer index less than the length of
182     the string, returns the character object representing the character at
183     that position in the string."
184 ram 1.4 (declare (optimize (safety 1)))
185 ram 1.1 (char string index))
186    
187     (defun %charset (string index new-el)
188 ram 1.4 (declare (optimize (safety 1)))
189 ram 1.1 (setf (char string index) new-el))
190    
191     (defun schar (string index)
192     "SCHAR returns the character object at an indexed position in a string
193 rtoy 1.12.30.14 just as CHAR does, except the string must be a simple-string."
194 ram 1.4 (declare (optimize (safety 1)))
195 ram 1.1 (schar string index))
196    
197     (defun %scharset (string index new-el)
198 ram 1.4 (declare (optimize (safety 1)))
199 ram 1.1 (setf (schar string index) new-el))
200    
201     (defun string=* (string1 string2 start1 end1 start2 end2)
202 ram 1.4 (with-two-strings string1 string2 start1 end1 offset1 start2 end2
203     (not (%sp-string-compare string1 start1 end1 string2 start2 end2))))
204 ram 1.1
205    
206     (defun string/=* (string1 string2 start1 end1 start2 end2)
207 ram 1.4 (with-two-strings string1 string2 start1 end1 offset1 start2 end2
208     (let ((comparison (%sp-string-compare string1 start1 end1
209     string2 start2 end2)))
210     (if comparison (- (the fixnum comparison) offset1)))))
211 ram 1.1
212     (eval-when (compile eval)
213    
214     ;;; Lessp is true if the desired expansion is for string<* or string<=*.
215     ;;; Equalp is true if the desired expansion is for string<=* or string>=*.
216     (defmacro string<>=*-body (lessp equalp)
217     (let ((offset1 (gensym)))
218 ram 1.4 `(with-two-strings string1 string2 start1 end1 ,offset1 start2 end2
219     (let ((index (%sp-string-compare string1 start1 end1
220     string2 start2 end2)))
221     (if index
222 ram 1.8 (cond ((= (the fixnum index) (the fixnum end1))
223     ,(if lessp
224     `(- (the fixnum index) ,offset1)
225     `nil))
226     ((= (+ (the fixnum index) (- start2 start1))
227     (the fixnum end2))
228     ,(if lessp
229     `nil
230 rtoy 1.12.30.1 `(- (the fixnum index) ,offset1)))
231     #-unicode
232 ram 1.4 ((,(if lessp 'char< 'char>)
233     (schar string1 index)
234     (schar string2 (+ (the fixnum index) (- start2 start1))))
235     (- (the fixnum index) ,offset1))
236 rtoy 1.12.30.1 #-unicode
237     (t nil)
238     #+unicode
239     (t
240     ;; Compare in code point order. See
241     ;; http://icu-project.org/docs/papers/utf16_code_point_order.html
242     (flet ((fixup (code)
243     (if (>= code #xe000)
244     (- code #x800)
245     (+ code #x2000))))
246     (declare (inline fixup))
247     (let* ((c1 (char-code (schar string1 index)))
248 rtoy 1.12.30.10 (c2 (char-code (schar string2
249     (+ (the fixnum index)
250     (- start2 start1))))))
251 rtoy 1.12.30.1 (cond ((and (>= c1 #xd800)
252     (>= c2 #xd800))
253     (let ((fix-c1 (fixup c1))
254     (fix-c2 (fixup c2)))
255     (if (,(if lessp '< '>) fix-c1 fix-c2)
256     (- (the fixnum index) ,offset1)
257     nil)))
258     (t
259     (if (,(if lessp '< '>) c1 c2)
260 rtoy 1.12.30.10 (- (the fixnum index) ,offset1)
261     nil)))))))
262 ram 1.4 ,(if equalp `(- (the fixnum end1) ,offset1) 'nil))))))
263 ram 1.1 ) ; eval-when
264    
265     (defun string<* (string1 string2 start1 end1 start2 end2)
266     (declare (fixnum start1 start2))
267     (string<>=*-body t nil))
268    
269     (defun string>* (string1 string2 start1 end1 start2 end2)
270     (declare (fixnum start1 start2))
271     (string<>=*-body nil nil))
272    
273     (defun string<=* (string1 string2 start1 end1 start2 end2)
274     (declare (fixnum start1 start2))
275     (string<>=*-body t t))
276    
277     (defun string>=* (string1 string2 start1 end1 start2 end2)
278     (declare (fixnum start1 start2))
279     (string<>=*-body nil t))
280    
281    
282    
283     (defun string< (string1 string2 &key (start1 0) end1 (start2 0) end2)
284     "Given two strings, if the first string is lexicographically less than
285     the second string, returns the longest common prefix (using char=)
286     of the two strings. Otherwise, returns ()."
287     (string<* string1 string2 start1 end1 start2 end2))
288    
289     (defun string> (string1 string2 &key (start1 0) end1 (start2 0) end2)
290     "Given two strings, if the first string is lexicographically greater than
291     the second string, returns the longest common prefix (using char=)
292     of the two strings. Otherwise, returns ()."
293     (string>* string1 string2 start1 end1 start2 end2))
294    
295    
296     (defun string<= (string1 string2 &key (start1 0) end1 (start2 0) end2)
297     "Given two strings, if the first string is lexicographically less than
298     or equal to the second string, returns the longest common prefix
299     (using char=) of the two strings. Otherwise, returns ()."
300     (string<=* string1 string2 start1 end1 start2 end2))
301    
302     (defun string>= (string1 string2 &key (start1 0) end1 (start2 0) end2)
303     "Given two strings, if the first string is lexicographically greater
304     than or equal to the second string, returns the longest common prefix
305     (using char=) of the two strings. Otherwise, returns ()."
306     (string>=* string1 string2 start1 end1 start2 end2))
307    
308     (defun string= (string1 string2 &key (start1 0) end1 (start2 0) end2)
309     "Given two strings (string1 and string2), and optional integers start1,
310     start2, end1 and end2, compares characters in string1 to characters in
311     string2 (using char=)."
312     (string=* string1 string2 start1 end1 start2 end2))
313    
314     (defun string/= (string1 string2 &key (start1 0) end1 (start2 0) end2)
315     "Given two strings, if the first string is not lexicographically equal
316     to the second string, returns the longest common prefix (using char=)
317     of the two strings. Otherwise, returns ()."
318     (string/=* string1 string2 start1 end1 start2 end2))
319    
320    
321     (eval-when (compile eval)
322    
323     ;;; STRING-NOT-EQUAL-LOOP is used to generate character comparison loops for
324     ;;; STRING-EQUAL and STRING-NOT-EQUAL.
325     (defmacro string-not-equal-loop (end end-value
326     &optional (abort-value nil abortp))
327     (declare (fixnum end))
328     (let ((end-test (if (= end 1)
329     `(= index1 (the fixnum end1))
330     `(= index2 (the fixnum end2)))))
331     `(do ((index1 start1 (1+ index1))
332     (index2 start2 (1+ index2)))
333     (,(if abortp
334     end-test
335     `(or ,end-test
336     (not (char-equal (schar string1 index1)
337     (schar string2 index2)))))
338     ,end-value)
339     (declare (fixnum index1 index2))
340     ,@(if abortp
341     `((if (not (char-equal (schar string1 index1)
342     (schar string2 index2)))
343     (return ,abort-value)))))))
344    
345     ) ; eval-when
346    
347     (defun string-equal (string1 string2 &key (start1 0) end1 (start2 0) end2)
348     "Given two strings (string1 and string2), and optional integers start1,
349     start2, end1 and end2, compares characters in string1 to characters in
350     string2 (using char-equal)."
351     (declare (fixnum start1 start2))
352 ram 1.4 (with-two-strings string1 string2 start1 end1 offset1 start2 end2
353     (let ((slen1 (- (the fixnum end1) start1))
354     (slen2 (- (the fixnum end2) start2)))
355     (declare (fixnum slen1 slen2))
356     (if (or (minusp slen1) (minusp slen2))
357     ;;prevent endless looping later.
358     (error "Improper bounds for string comparison."))
359     (if (= slen1 slen2)
360     ;;return () immediately if lengths aren't equal.
361     (string-not-equal-loop 1 t nil)))))
362 ram 1.1
363     (defun string-not-equal (string1 string2 &key (start1 0) end1 (start2 0) end2)
364     "Given two strings, if the first string is not lexicographically equal
365     to the second string, returns the longest common prefix (using char-equal)
366     of the two strings. Otherwise, returns ()."
367 ram 1.4 (with-two-strings string1 string2 start1 end1 offset1 start2 end2
368     (let ((slen1 (- end1 start1))
369     (slen2 (- end2 start2)))
370     (declare (fixnum slen1 slen2))
371     (if (or (minusp slen1) (minusp slen2))
372     ;;prevent endless looping later.
373     (error "Improper bounds for string comparison."))
374     (cond ((or (minusp slen1) (or (minusp slen2)))
375     (error "Improper substring for comparison."))
376     ((= slen1 slen2)
377     (string-not-equal-loop 1 nil (- index1 offset1)))
378     ((< slen1 slen2)
379     (string-not-equal-loop 1 (- index1 offset1)))
380     (t
381     (string-not-equal-loop 2 (- index1 offset1)))))))
382 ram 1.1
383    
384    
385     (eval-when (compile eval)
386    
387     ;;; STRING-LESS-GREATER-EQUAL-TESTS returns a test on the lengths of string1
388     ;;; and string2 and a test on the current characters from string1 and string2
389     ;;; for the following macro.
390     (defun string-less-greater-equal-tests (lessp equalp)
391     (if lessp
392     (if equalp
393     ;; STRING-NOT-GREATERP
394 rtoy 1.12.30.15 (values '<=
395     #-unicode `(not (char-greaterp char1 char2))
396     #+unicode `(<= char1 char2))
397 ram 1.1 ;; STRING-LESSP
398 rtoy 1.12.30.15 (values '<
399     #-unicode `(char-lessp char1 char2)
400     #+unicode `(< char1 char2)))
401 ram 1.1 (if equalp
402     ;; STRING-NOT-LESSP
403 rtoy 1.12.30.15 (values '>=
404     #-unicode `(not (char-lessp char1 char2))
405     #+unicode `(>= char1 char2))
406 ram 1.1 ;; STRING-GREATERP
407 rtoy 1.12.30.15 (values '>
408     #-unicode `(char-greaterp char1 char2)
409     #+unicode `(> char1 char2)))))
410 ram 1.1
411 rtoy 1.12.30.15 #-unicode
412 ram 1.1 (defmacro string-less-greater-equal (lessp equalp)
413     (multiple-value-bind (length-test character-test)
414     (string-less-greater-equal-tests lessp equalp)
415 ram 1.4 `(with-two-strings string1 string2 start1 end1 offset1 start2 end2
416     (let ((slen1 (- (the fixnum end1) start1))
417     (slen2 (- (the fixnum end2) start2)))
418     (declare (fixnum slen1 slen2))
419     (if (or (minusp slen1) (minusp slen2))
420     ;;prevent endless looping later.
421     (error "Improper bounds for string comparison."))
422     (do ((index1 start1 (1+ index1))
423     (index2 start2 (1+ index2))
424     (char1)
425     (char2))
426     ((or (= index1 (the fixnum end1)) (= index2 (the fixnum end2)))
427     (if (,length-test slen1 slen2) (- index1 offset1)))
428     (declare (fixnum index1 index2))
429     (setq char1 (schar string1 index1))
430     (setq char2 (schar string2 index2))
431     (if (not (char-equal char1 char2))
432     (if ,character-test
433     (return (- index1 offset1))
434     (return ()))))))))
435 ram 1.1
436 rtoy 1.12.30.18 ;; Convert to lowercase for case folding, to match what Unicode
437     ;; CaseFolding.txt says. An example where this matters: U+1E9E maps
438     ;; to U+00DF. But the uppercase version of U+00DF is U+00DF.
439 rtoy 1.12.30.15 #+unicode
440     (defmacro equal-char-codepoint (codepoint)
441     `(let ((ch ,codepoint))
442 rtoy 1.12.30.18 ;; Handle ASCII separately for bootstrapping and for unidata missing.
443     (if (< 64 ch 91)
444     (+ ch 32)
445 rtoy 1.12.30.15 #-(and unicode (not unicode-bootstrap))
446     ch
447     #+(and unicode (not unicode-bootstrap))
448 rtoy 1.12.30.18 (if (> ch 127) (unicode-lower ch) ch))))
449 rtoy 1.12.30.15
450     #+unicode
451     (defmacro string-less-greater-equal (lessp equalp)
452     (multiple-value-bind (length-test character-test)
453     (string-less-greater-equal-tests lessp equalp)
454     `(with-two-strings string1 string2 start1 end1 offset1 start2 end2
455     (let ((slen1 (- (the fixnum end1) start1))
456     (slen2 (- (the fixnum end2) start2)))
457     (declare (fixnum slen1 slen2))
458     (if (or (minusp slen1) (minusp slen2))
459     ;;prevent endless looping later.
460     (error "Improper bounds for string comparison."))
461     (do ((index1 start1 (1+ index1))
462     (index2 start2 (1+ index2)))
463     ((or (= index1 (the fixnum end1)) (= index2 (the fixnum end2)))
464     (if (,length-test slen1 slen2) (- index1 offset1)))
465     (declare (fixnum index1 index2))
466     (multiple-value-bind (char1 wide1)
467     (codepoint string1 index1)
468 rtoy 1.12.30.18 (declare (type (integer 0 #x10ffff) char1))
469 rtoy 1.12.30.15 (multiple-value-bind (char2 wide2)
470     (codepoint string2 index2)
471 rtoy 1.12.30.18 (declare (type (integer 0 #x10ffff) char2))
472     (setf char1 (equal-char-codepoint char1))
473     (setf char2 (equal-char-codepoint char2))
474     (if (= char1 char2)
475 rtoy 1.12.30.15 (progn
476     (when wide1 (incf index1))
477     (when wide2 (incf index2)))
478     (if ,character-test
479     (return (- index1 offset1))
480     (return ()))))))))))
481    
482 ram 1.1 ) ; eval-when
483    
484     (defun string-lessp* (string1 string2 start1 end1 start2 end2)
485     (declare (fixnum start1 start2))
486     (string-less-greater-equal t nil))
487    
488     (defun string-greaterp* (string1 string2 start1 end1 start2 end2)
489     (declare (fixnum start1 start2))
490     (string-less-greater-equal nil nil))
491    
492     (defun string-not-lessp* (string1 string2 start1 end1 start2 end2)
493     (declare (fixnum start1 start2))
494     (string-less-greater-equal nil t))
495    
496     (defun string-not-greaterp* (string1 string2 start1 end1 start2 end2)
497     (declare (fixnum start1 start2))
498     (string-less-greater-equal t t))
499    
500     (defun string-lessp (string1 string2 &key (start1 0) end1 (start2 0) end2)
501     "Given two strings, if the first string is lexicographically less than
502     the second string, returns the longest common prefix (using char-equal)
503     of the two strings. Otherwise, returns ()."
504     (string-lessp* string1 string2 start1 end1 start2 end2))
505    
506     (defun string-greaterp (string1 string2 &key (start1 0) end1 (start2 0) end2)
507     "Given two strings, if the first string is lexicographically greater than
508     the second string, returns the longest common prefix (using char-equal)
509     of the two strings. Otherwise, returns ()."
510     (string-greaterp* string1 string2 start1 end1 start2 end2))
511    
512     (defun string-not-lessp (string1 string2 &key (start1 0) end1 (start2 0) end2)
513     "Given two strings, if the first string is lexicographically greater
514     than or equal to the second string, returns the longest common prefix
515     (using char-equal) of the two strings. Otherwise, returns ()."
516     (string-not-lessp* string1 string2 start1 end1 start2 end2))
517    
518     (defun string-not-greaterp (string1 string2 &key (start1 0) end1 (start2 0)
519     end2)
520     "Given two strings, if the first string is lexicographically less than
521     or equal to the second string, returns the longest common prefix
522     (using char-equal) of the two strings. Otherwise, returns ()."
523     (string-not-greaterp* string1 string2 start1 end1 start2 end2))
524    
525    
526 dtc 1.9 (defun make-string (count &key element-type ((:initial-element fill-char)))
527 ram 1.1 "Given a character count and an optional fill character, makes and returns
528 rtoy 1.12.30.14 a new string Count long filled with the fill character."
529 emarsden 1.12 (declare (type fixnum count))
530     (assert (subtypep element-type 'character))
531 ram 1.1 (if fill-char
532     (do ((i 0 (1+ i))
533     (string (make-string count)))
534     ((= i count) string)
535     (declare (fixnum i))
536     (setf (schar string i) fill-char))
537     (make-string count)))
538    
539     (defun string-upcase (string &key (start 0) end)
540     "Given a string, returns a new string that is a copy of it with
541     all lower case alphabetic characters converted to uppercase."
542     (declare (fixnum start))
543 ram 1.5 (let* ((string (if (stringp string) string (string string)))
544     (slen (length string)))
545 ram 1.4 (declare (fixnum slen))
546 ram 1.1 (with-one-string string start end offset
547     (let ((offset-slen (+ slen offset))
548     (newstring (make-string slen)))
549     (declare (fixnum offset-slen))
550     (do ((index offset (1+ index))
551     (new-index 0 (1+ new-index)))
552     ((= index start))
553     (declare (fixnum index new-index))
554     (setf (schar newstring new-index) (schar string index)))
555     (do ((index start (1+ index))
556     (new-index (- start offset) (1+ new-index)))
557     ((= index (the fixnum end)))
558     (declare (fixnum index new-index))
559 rtoy 1.12.30.6 (multiple-value-bind (code wide) (codepoint string index)
560     (when wide (incf index))
561     ;; Handle ASCII specially because this is called early in
562     ;; initialization, before unidata is available.
563     (cond ((< 96 code 123) (decf code 32))
564     ((> code 127) (setq code (unicode-upper code))))
565     ;;@@ WARNING: this may, in theory, need to extend newstring
566     ;; but that never actually occurs as of Unicode 5.1.0,
567     ;; so I'm just going to ignore it for now...
568     (multiple-value-bind (hi lo) (surrogates code)
569 rtoy 1.12.30.8 (setf (schar newstring new-index) hi)
570 rtoy 1.12.30.6 (when lo
571 rtoy 1.12.30.8 (setf (schar newstring (incf new-index)) lo)))))
572 rtoy 1.12.30.4 ;;@@ WARNING: see above
573 ram 1.1 (do ((index end (1+ index))
574     (new-index (- (the fixnum end) offset) (1+ new-index)))
575     ((= index offset-slen))
576     (declare (fixnum index new-index))
577     (setf (schar newstring new-index) (schar string index)))
578     newstring))))
579    
580     (defun string-downcase (string &key (start 0) end)
581     "Given a string, returns a new string that is a copy of it with
582     all upper case alphabetic characters converted to lowercase."
583     (declare (fixnum start))
584 ram 1.5 (let* ((string (if (stringp string) string (string string)))
585     (slen (length string)))
586 ram 1.4 (declare (fixnum slen))
587 ram 1.1 (with-one-string string start end offset
588     (let ((offset-slen (+ slen offset))
589     (newstring (make-string slen)))
590     (declare (fixnum offset-slen))
591     (do ((index offset (1+ index))
592     (new-index 0 (1+ new-index)))
593     ((= index start))
594     (declare (fixnum index new-index))
595     (setf (schar newstring new-index) (schar string index)))
596     (do ((index start (1+ index))
597     (new-index (- start offset) (1+ new-index)))
598     ((= index (the fixnum end)))
599     (declare (fixnum index new-index))
600 rtoy 1.12.30.6 (multiple-value-bind (code wide) (codepoint string index)
601     (when wide (incf index))
602     ;; Handle ASCII specially because this is called early in
603     ;; initialization, before unidata is available.
604     (cond ((< 64 code 91) (incf code 32))
605     ((> code 127) (setq code (unicode-lower code))))
606     ;;@@ WARNING: this may, in theory, need to extend newstring
607     ;; but that never actually occurs as of Unicode 5.1.0,
608     ;; so I'm just going to ignore it for now...
609     (multiple-value-bind (hi lo) (surrogates code)
610 rtoy 1.12.30.8 (setf (schar newstring new-index) hi)
611 rtoy 1.12.30.6 (when lo
612 rtoy 1.12.30.8 (setf (schar newstring (incf new-index)) lo)))))
613 rtoy 1.12.30.4 ;;@@ WARNING: see above
614 ram 1.1 (do ((index end (1+ index))
615     (new-index (- (the fixnum end) offset) (1+ new-index)))
616     ((= index offset-slen))
617     (declare (fixnum index new-index))
618     (setf (schar newstring new-index) (schar string index)))
619     newstring))))
620    
621     (defun string-capitalize (string &key (start 0) end)
622     "Given a string, returns a copy of the string with the first
623     character of each ``word'' converted to upper-case, and remaining
624     chars in the word converted to lower case. A ``word'' is defined
625     to be a string of case-modifiable characters delimited by
626     non-case-modifiable chars."
627     (declare (fixnum start))
628 ram 1.5 (let* ((string (if (stringp string) string (string string)))
629     (slen (length string)))
630 ram 1.4 (declare (fixnum slen))
631 ram 1.1 (with-one-string string start end offset
632     (let ((offset-slen (+ slen offset))
633     (newstring (make-string slen)))
634     (declare (fixnum offset-slen))
635     (do ((index offset (1+ index))
636     (new-index 0 (1+ new-index)))
637     ((= index start))
638     (declare (fixnum index new-index))
639     (setf (schar newstring new-index) (schar string index)))
640     (do ((index start (1+ index))
641     (new-index (- start offset) (1+ new-index))
642     (newword t)
643     (char ()))
644     ((= index (the fixnum end)))
645     (declare (fixnum index new-index))
646     (setq char (schar string index))
647     (cond ((not (alphanumericp char))
648     (setq newword t))
649     (newword
650     ;;char is first case-modifiable after non-case-modifiable
651 rtoy 1.12.30.2 (setq char (char-titlecase char))
652 ram 1.1 (setq newword ()))
653     ;;char is case-modifiable, but not first
654     (t (setq char (char-downcase char))))
655     (setf (schar newstring new-index) char))
656     (do ((index end (1+ index))
657     (new-index (- (the fixnum end) offset) (1+ new-index)))
658     ((= index offset-slen))
659     (declare (fixnum index new-index))
660     (setf (schar newstring new-index) (schar string index)))
661     newstring))))
662    
663     (defun nstring-upcase (string &key (start 0) end)
664     "Given a string, returns that string with all lower case alphabetic
665     characters converted to uppercase."
666     (declare (fixnum start))
667 ram 1.4 (let ((save-header string))
668 ram 1.1 (with-one-string string start end offset
669     (do ((index start (1+ index)))
670     ((= index (the fixnum end)))
671     (declare (fixnum index))
672 rtoy 1.12.30.6 (multiple-value-bind (code wide) (codepoint string index)
673     (declare (ignore wide))
674     ;; Handle ASCII specially because this is called early in
675     ;; initialization, before unidata is available.
676     (cond ((< 96 code 123) (decf code 32))
677     ((> code 127) (setq code (unicode-upper code))))
678     ;;@@ WARNING: this may, in theory, need to extend string
679     ;; (which, obviously, we can't do here. Unless
680     ;; STRING is adjustable, maybe)
681     ;; but that never actually occurs as of Unicode 5.1.0,
682     ;; so I'm just going to ignore it for now...
683     (multiple-value-bind (hi lo) (surrogates code)
684 rtoy 1.12.30.8 (setf (schar string index) hi)
685 rtoy 1.12.30.6 (when lo
686 rtoy 1.12.30.8 (setf (schar string (incf index)) lo))))))
687 ram 1.1 save-header))
688    
689     (defun nstring-downcase (string &key (start 0) end)
690     "Given a string, returns that string with all upper case alphabetic
691     characters converted to lowercase."
692     (declare (fixnum start))
693 ram 1.4 (let ((save-header string))
694 ram 1.1 (with-one-string string start end offset
695     (do ((index start (1+ index)))
696     ((= index (the fixnum end)))
697     (declare (fixnum index))
698 rtoy 1.12.30.6 (multiple-value-bind (code wide) (codepoint string index)
699     (declare (ignore wide))
700     (cond ((< 64 code 91) (incf code 32))
701     ((> code 127) (setq code (unicode-lower code))))
702     ;;@@ WARNING: this may, in theory, need to extend string
703     ;; (which, obviously, we can't do here. Unless
704     ;; STRING is adjustable, maybe)
705     ;; but that never actually occurs as of Unicode 5.1.0,
706     ;; so I'm just going to ignore it for now...
707     (multiple-value-bind (hi lo) (surrogates code)
708 rtoy 1.12.30.8 (setf (schar string index) hi)
709 rtoy 1.12.30.6 (when lo
710 rtoy 1.12.30.8 (setf (schar string (incf index)) lo))))))
711 wlott 1.6 save-header))
712 ram 1.1
713     (defun nstring-capitalize (string &key (start 0) end)
714     "Given a string, returns that string with the first
715     character of each ``word'' converted to upper-case, and remaining
716     chars in the word converted to lower case. A ``word'' is defined
717     to be a string of case-modifiable characters delimited by
718     non-case-modifiable chars."
719     (declare (fixnum start))
720 ram 1.4 (let ((save-header string))
721 ram 1.1 (with-one-string string start end offset
722     (do ((index start (1+ index))
723     (newword t)
724     (char ()))
725     ((= index (the fixnum end)))
726     (declare (fixnum index))
727     (setq char (schar string index))
728     (cond ((not (alphanumericp char))
729     (setq newword t))
730     (newword
731     ;;char is first case-modifiable after non-case-modifiable
732 rtoy 1.12.30.2 (setf (schar string index) (char-titlecase char))
733 ram 1.1 (setq newword ()))
734     (t
735     (setf (schar string index) (char-downcase char))))))
736     save-header))
737    
738     (defun string-left-trim (char-bag string)
739     "Given a set of characters (a list or string) and a string, returns
740     a copy of the string with the characters in the set removed from the
741     left end."
742     (with-string string
743     (do ((index start (1+ index)))
744     ((or (= index (the fixnum end))
745     (not (find (schar string index) char-bag)))
746     (subseq (the simple-string string) index end))
747     (declare (fixnum index)))))
748    
749     (defun string-right-trim (char-bag string)
750     "Given a set of characters (a list or string) and a string, returns
751     a copy of the string with the characters in the set removed from the
752     right end."
753     (with-string string
754     (do ((index (1- (the fixnum end)) (1- index)))
755     ((or (< index start) (not (find (schar string index) char-bag)))
756     (subseq (the simple-string string) start (1+ index)))
757     (declare (fixnum index)))))
758    
759     (defun string-trim (char-bag string)
760     "Given a set of characters (a list or string) and a string, returns a
761     copy of the string with the characters in the set removed from both
762     ends."
763     (with-string string
764     (let* ((left-end (do ((index start (1+ index)))
765     ((or (= index (the fixnum end))
766     (not (find (schar string index) char-bag)))
767     index)
768     (declare (fixnum index))))
769     (right-end (do ((index (1- (the fixnum end)) (1- index)))
770     ((or (< index left-end)
771     (not (find (schar string index) char-bag)))
772     (1+ index))
773     (declare (fixnum index)))))
774     (subseq (the simple-string string) left-end right-end))))
775 rtoy 1.12.30.3
776 rtoy 1.12.30.4 (declaim (inline %glyph-f %glyph-b))
777     (defun %glyph-f (string index)
778     (declare (optimize (speed 3) (space 0) (safety 0) (debug 0))
779     (type simple-string string) (type kernel:index index))
780 rtoy 1.12.30.10 (let* ((prev 0)
781     (l (length string))
782     (c (codepoint string index l))
783     (n (+ index (if (> c #xFFFF) 2 1))))
784     (declare (type (integer 0 #x10FFFF) c) (type kernel:index l n))
785     (loop while (< n l) do
786     (let* ((c (codepoint string n l))
787     (d (the (unsigned-byte 8) (unicode-combining-class c))))
788     (when (or (zerop d) (< d prev))
789     (return))
790     (setq prev d)
791     (incf n (if (> c #xFFFF) 2 1))))
792     n))
793 rtoy 1.12.30.4
794     (defun %glyph-b (string index)
795     (declare (optimize (speed 3) (space 0) (safety 0) (debug 0))
796     (type simple-string string) (type kernel:index index))
797 rtoy 1.12.30.11 (let* ((prev 255)
798     (n (1- index)))
799     (declare (type kernel:index n))
800     (loop until (< n 0) do
801     (let* ((c (codepoint string n 0))
802     (d (the (unsigned-byte 8) (unicode-combining-class c))))
803     (cond ((zerop d) (return))
804     ((> d prev) (incf n (if (> c #xFFFF) 2 1)) (return)))
805     (setq prev d)
806     (decf n (if (> c #xFFFF) 2 1))))
807     n))
808 rtoy 1.12.30.4
809     (defun glyph (string index &key (from-end nil))
810 rtoy 1.12.30.3 "GLYPH returns the glyph at the indexed position in a string, and the
811 rtoy 1.12.30.4 position of the next glyph (or NIL) as a second value. A glyph is
812     a substring consisting of the character at INDEX followed by all
813     subsequent combining characters."
814 rtoy 1.12.30.3 (declare (type simple-string string) (type kernel:index index))
815     #-unicode
816     (char string index)
817     #+unicode
818     (with-array-data ((string string) (start) (end))
819     (declare (ignore start end))
820 rtoy 1.12.30.4 (let ((n (if from-end (%glyph-b string index) (%glyph-f string index))))
821     (if from-end
822     (values (subseq string n index) (and (> n 0) n))
823     (values (subseq string index n) (and (< n (length string)) n))))))
824 rtoy 1.12.30.3
825 rtoy 1.12.30.4 (defun sglyph (string index &key (from-end nil))
826 rtoy 1.12.30.3 "SGLYPH returns the glyph at the indexed position, the same as GLYPH,
827 rtoy 1.12.30.4 except that the string must be a simple-string"
828 rtoy 1.12.30.3 (declare (type simple-string string) (type kernel:index index))
829     #-unicode
830     (schar string index)
831     #+unicode
832 rtoy 1.12.30.4 (let ((n (if from-end (%glyph-b string index) (%glyph-f string index))))
833     (if from-end
834     (values (subseq string n index) (and (> n 0) n))
835     (values (subseq string index n) (and (< n (length string)) n)))))
836    
837 rtoy 1.12.30.9 (defun decompose (string &optional (compatibility t))
838     (declare (type string string))
839     (let ((result (make-string (cond ((< (length string) 40)
840     (* 5 (length string)))
841     ((< (length string) 4096)
842     (* 2 (length string)))
843     (t (round (length string) 5/6)))))
844 rtoy 1.12.30.10 (fillptr 0))
845     (declare (type kernel:index fillptr))
846 rtoy 1.12.30.9 (labels ((rec (string)
847     (declare (type simple-string string))
848     (do ((i 0 (1+ i)))
849     ((= i (length string)))
850     (declare (type kernel:index i))
851     (multiple-value-bind (code wide) (codepoint string i)
852     (when wide (incf i))
853     (let ((decomp (unicode-decomp code compatibility)))
854     (if decomp (rec decomp) (out code))))))
855     (out (code)
856     (multiple-value-bind (hi lo) (surrogates code)
857     (outch hi)
858     (when lo
859 rtoy 1.12.30.10 (outch lo))
860     (let ((cc (unicode-combining-class code)))
861     (unless (zerop cc)
862     (order lo cc (- fillptr (if lo 2 1)))))))
863 rtoy 1.12.30.9 (outch (char)
864     (when (= fillptr (length result))
865     (let ((tmp (make-string (round (length result) 5/6))))
866     (replace tmp result)
867     (setq result tmp)))
868     (setf (schar result fillptr) char)
869     (incf fillptr))
870 rtoy 1.12.30.10 (order (wide1 cc last)
871     (loop until (minusp last) do
872     (multiple-value-bind (code2 wide2) (codepoint result last)
873     (let ((cc2 (unicode-combining-class code2)))
874     (cond ((zerop cc2) (return))
875     ((> cc2 cc)
876     (case (+ (if wide2 2 0) (if wide1 1 0))
877     (0 (rotatef (schar result last)
878     (schar result (1+ last))))
879     (1 (rotatef (schar result last)
880     (schar result (+ last 1))
881     (schar result (+ last 2))))
882     (2 (rotatef (schar result last)
883     (schar result (1- last))
884     (schar result (1+ last))))
885     (3 (rotatef (schar result last)
886     (schar result (+ last 2)))
887     (rotatef (schar result (1- last))
888     (schar result (1+ last)))))
889     (decf last (if wide2 2 1)))
890     (t (return))))))))
891 rtoy 1.12.30.9 (with-array-data ((string string) (start) (end))
892     (declare (ignore start end))
893     (rec string))
894     (shrink-vector result fillptr))))
895    
896 rtoy 1.12.30.4 (defun string-to-nfd (string)
897 rtoy 1.12.30.13 "Convert String to Unicode Normalization Form D (NFD) using the
898     canonical decomposition. The NFD string is returned"
899 rtoy 1.12.30.9 (decompose string nil))
900 rtoy 1.12.30.4
901     (defun string-to-nfkd (string)
902 rtoy 1.12.30.13 "Convert String to Unicode Normalization Form KD (NFKD) uisng the
903     compatible decomposition form. The NFKD string is returned."
904 rtoy 1.12.30.9 (decompose string t))
905 rtoy 1.12.30.4
906 rtoy 1.12.30.12 #+unicode
907 rtoy 1.12.30.4 (defun string-to-nfc (string)
908     ;;@@ Implement me
909 rtoy 1.12.30.12 ;; must return a simple-string for the package machinery
910     (if (simple-string-p string) string (coerce string 'simple-string)))
911    
912     #-unicode ;; Needed by package.lisp
913     (defun string-to-nfc (string)
914     (if (simple-string-p string) string (coerce string 'simple-string)))
915 rtoy 1.12.30.4
916     (defun string-to-nfkc (string)
917     ;;@@ Implement me
918 rtoy 1.12.30.12 (if (simple-string-p string) string (coerce string 'simple-string)))

  ViewVC Help
Powered by ViewVC 1.1.5