/[cmucl]/src/code/string.lisp
ViewVC logotype

Contents of /src/code/string.lisp

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.12.30.16 - (hide annotations)
Tue May 19 20:24:19 2009 UTC (4 years, 10 months ago) by rtoy
Branch: unicode-utf16-extfmt-branch
Changes since 1.12.30.15: +19 -1 lines
Add UTF16-STRING-P to determine if a string is a valid UTF-16 encoded
string.
1 ram 1.1 ;;; -*- Log: code.log; Package: Lisp -*-
2     ;;;
3     ;;; **********************************************************************
4 ram 1.3 ;;; This code was written as part of the CMU Common Lisp project at
5     ;;; Carnegie Mellon University, and has been placed in the public domain.
6     ;;;
7     (ext:file-comment
8 rtoy 1.12.30.16 "$Header: /tiger/var/lib/cvsroots/cmucl/src/code/string.lisp,v 1.12.30.16 2009/05/19 20:24:19 rtoy Exp $")
9 ram 1.3 ;;;
10 ram 1.1 ;;; **********************************************************************
11     ;;;
12 pw 1.11 ;;; Functions to implement strings for CMU Common Lisp
13 ram 1.1 ;;; Written by David Dill
14 ram 1.4 ;;; Rewritten by Skef Wholey, Bill Chiles and Rob MacLachlan.
15 ram 1.1 ;;;
16     ;;; ****************************************************************
17     ;;;
18 ram 1.4 (in-package "LISP")
19 rtoy 1.12.30.3 (export '(char schar glyph sglyph string
20 ram 1.1 string= string-equal string< string> string<= string>= string/=
21     string-lessp string-greaterp string-not-lessp string-not-greaterp
22     string-not-equal
23 rtoy 1.12.30.4 string-to-nfd string-to-nfkd string-to-nfc string-to-nfkc
24 ram 1.1 make-string
25     string-trim string-left-trim string-right-trim
26     string-upcase
27     string-downcase string-capitalize nstring-upcase nstring-downcase
28     nstring-capitalize))
29    
30    
31 rtoy 1.12.30.8 (declaim (inline surrogates-to-codepoint codepoint surrogates))
32 rtoy 1.12.30.7
33 rtoy 1.12.30.8 (defun surrogates-to-codepoint (hi lo)
34     "Convert the given Hi and Lo surrogate characters to the
35     corresponding codepoint value"
36     (declare (type character hi lo))
37     (+ (ash (- (the (integer #xD800 #xDBFF) (char-code hi)) #xD800) 10)
38     (the (integer #xDC00 #xDFFF) (char-code lo)) #x2400))
39 rtoy 1.12.30.7
40 rtoy 1.12.30.6 (defun codepoint (string i &optional (end (length string)))
41 rtoy 1.12.30.7 "Return the codepoint value from String at position I. If that
42     position is a surrogate, it is combined with either the previous or
43     following character (when possible) to compute the codepoint. The
44     second return value is NIL if the position is not a surrogate pair.
45     Otherwise +1 or -1 is returned if the position is the high or low
46     surrogate value, respectively."
47 rtoy 1.12.30.6 (declare (type simple-string string) (type kernel:index i end))
48     (let ((code (char-code (schar string i))))
49     (cond ((and (<= #xD800 code #xDBFF) (< (1+ i) end))
50     (let ((tmp (char-code (schar string (1+ i)))))
51     (if (<= #xDC00 tmp #xDFFF)
52 rtoy 1.12.30.10 (values (+ (ash (- code #xD800) 10) tmp #x2400) +1)
53 rtoy 1.12.30.6 (values code nil))))
54     ((and (<= #xDC00 code #xDFFF) (> i 0))
55     (let ((tmp (char-code (schar string (1- i)))))
56     (if (<= #xD800 tmp #xDBFF)
57 rtoy 1.12.30.10 (values (+ (ash (- tmp #xD800) 10) code #x2400) -1)
58 rtoy 1.12.30.6 (values code nil))))
59     (t (values code nil)))))
60    
61     (defun surrogates (codepoint)
62 rtoy 1.12.30.8 "Return the high and low surrogate characters for Codepoint. If
63     Codepoint is in the BMP, the first return value is the corresponding
64     character and the second is NIL."
65 rtoy 1.12.30.6 (declare (type (integer 0 #x10FFFF) codepoint))
66     (if (< codepoint #x10000)
67 rtoy 1.12.30.8 (values (code-char codepoint) nil)
68 rtoy 1.12.30.6 (let* ((tmp (- codepoint #x10000))
69     (hi (logior (ldb (byte 10 10) tmp) #xD800))
70     (lo (logior (ldb (byte 10 0) tmp) #xDC00)))
71 rtoy 1.12.30.8 (values (code-char hi) (code-char lo)))))
72 rtoy 1.12.30.6
73 rtoy 1.12.30.16 (defun utf16-string-p (string)
74     "Check if String is a valid UTF-16 string. If the string is valid,
75     T is returned. If the string is not valid, NIL is returned, and the
76     second value is the index into the string of the invalid character."
77     (do ((len (length string))
78     (index 0 (1+ index)))
79     ((>= index len)
80     t)
81     (multiple-value-bind (codepoint wide)
82     (codepoint string index)
83     ;; We stepping through the string in order. If there are any
84     ;; surrogates, we must reach the lead surrogate first, which
85     ;; means WIDE is +1. If we get any surrogate codepoint that
86     ;; is in the surrogate range, we have an invalid string.
87     (when (or (eq wide -1)
88     (<= #xD800 codepoint #xDFFF))
89     (return-from utf16-string-p (values nil index)))
90     (when wide (incf index)))))
91 rtoy 1.12.30.6
92 ram 1.1 (defun string (X)
93     "Coerces X into a string. If X is a string, X is returned. If X is a
94 rtoy 1.12.30.14 symbol, X's pname is returned. If X is a character then a one element
95     string containing that character is returned. If X cannot be coerced
96     into a string, an error occurs."
97 ram 1.1 (cond ((stringp x) x)
98     ((symbolp x) (symbol-name x))
99     ((characterp x)
100     (let ((res (make-string 1)))
101     (setf (schar res 0) x) res))
102     (t
103 pw 1.11 (error 'simple-type-error
104     :datum x
105     :expected-type '(or string symbol character)
106     :format-control "~S cannot be coerced to a string."
107     :format-arguments (list x)))))
108 ram 1.1
109     ;;; With-One-String is used to set up some string hacking things. The keywords
110     ;;; are parsed, and the string is hacked into a simple-string.
111    
112     (eval-when (compile)
113    
114     (defmacro with-one-string (string start end cum-offset &rest forms)
115 ram 1.4 `(let ((,string (if (stringp ,string) ,string (string ,string))))
116 pw 1.10 ;; Optimizer may prove STRING is one.
117     (declare (optimize (ext:inhibit-warnings 3)))
118 ram 1.4 (with-array-data ((,string ,string :offset-var ,cum-offset)
119     (,start ,start)
120     (,end (or ,end (length (the vector ,string)))))
121     ,@forms)))
122 ram 1.1
123     )
124    
125     ;;; With-String is like With-One-String, but doesn't parse keywords.
126    
127     (eval-when (compile)
128    
129     (defmacro with-string (string &rest forms)
130 ram 1.4 `(let ((,string (if (stringp ,string) ,string (string ,string))))
131     (with-array-data ((,string ,string)
132     (start)
133     (end (length (the vector ,string))))
134     ,@forms)))
135 ram 1.1
136     )
137    
138     ;;; With-Two-Strings is used to set up string comparison operations. The
139     ;;; keywords are parsed, and the strings are hacked into simple-strings.
140    
141     (eval-when (compile)
142    
143     (defmacro with-two-strings (string1 string2 start1 end1 cum-offset-1
144     start2 end2 &rest forms)
145 ram 1.4 `(let ((,string1 (if (stringp ,string1) ,string1 (string ,string1)))
146     (,string2 (if (stringp ,string2) ,string2 (string ,string2))))
147     (with-array-data ((,string1 ,string1 :offset-var ,cum-offset-1)
148     (,start1 ,start1)
149     (,end1 (or ,end1 (length (the vector ,string1)))))
150     (with-array-data ((,string2 ,string2)
151     (,start2 ,start2)
152     (,end2 (or ,end2 (length (the vector ,string2)))))
153     ,@forms))))
154 ram 1.1
155     )
156 wlott 1.2
157 ram 1.1
158     (defun char (string index)
159     "Given a string and a non-negative integer index less than the length of
160     the string, returns the character object representing the character at
161     that position in the string."
162 ram 1.4 (declare (optimize (safety 1)))
163 ram 1.1 (char string index))
164    
165     (defun %charset (string index new-el)
166 ram 1.4 (declare (optimize (safety 1)))
167 ram 1.1 (setf (char string index) new-el))
168    
169     (defun schar (string index)
170     "SCHAR returns the character object at an indexed position in a string
171 rtoy 1.12.30.14 just as CHAR does, except the string must be a simple-string."
172 ram 1.4 (declare (optimize (safety 1)))
173 ram 1.1 (schar string index))
174    
175     (defun %scharset (string index new-el)
176 ram 1.4 (declare (optimize (safety 1)))
177 ram 1.1 (setf (schar string index) new-el))
178    
179     (defun string=* (string1 string2 start1 end1 start2 end2)
180 ram 1.4 (with-two-strings string1 string2 start1 end1 offset1 start2 end2
181     (not (%sp-string-compare string1 start1 end1 string2 start2 end2))))
182 ram 1.1
183    
184     (defun string/=* (string1 string2 start1 end1 start2 end2)
185 ram 1.4 (with-two-strings string1 string2 start1 end1 offset1 start2 end2
186     (let ((comparison (%sp-string-compare string1 start1 end1
187     string2 start2 end2)))
188     (if comparison (- (the fixnum comparison) offset1)))))
189 ram 1.1
190     (eval-when (compile eval)
191    
192     ;;; Lessp is true if the desired expansion is for string<* or string<=*.
193     ;;; Equalp is true if the desired expansion is for string<=* or string>=*.
194     (defmacro string<>=*-body (lessp equalp)
195     (let ((offset1 (gensym)))
196 ram 1.4 `(with-two-strings string1 string2 start1 end1 ,offset1 start2 end2
197     (let ((index (%sp-string-compare string1 start1 end1
198     string2 start2 end2)))
199     (if index
200 ram 1.8 (cond ((= (the fixnum index) (the fixnum end1))
201     ,(if lessp
202     `(- (the fixnum index) ,offset1)
203     `nil))
204     ((= (+ (the fixnum index) (- start2 start1))
205     (the fixnum end2))
206     ,(if lessp
207     `nil
208 rtoy 1.12.30.1 `(- (the fixnum index) ,offset1)))
209     #-unicode
210 ram 1.4 ((,(if lessp 'char< 'char>)
211     (schar string1 index)
212     (schar string2 (+ (the fixnum index) (- start2 start1))))
213     (- (the fixnum index) ,offset1))
214 rtoy 1.12.30.1 #-unicode
215     (t nil)
216     #+unicode
217     (t
218     ;; Compare in code point order. See
219     ;; http://icu-project.org/docs/papers/utf16_code_point_order.html
220     (flet ((fixup (code)
221     (if (>= code #xe000)
222     (- code #x800)
223     (+ code #x2000))))
224     (declare (inline fixup))
225     (let* ((c1 (char-code (schar string1 index)))
226 rtoy 1.12.30.10 (c2 (char-code (schar string2
227     (+ (the fixnum index)
228     (- start2 start1))))))
229 rtoy 1.12.30.1 (cond ((and (>= c1 #xd800)
230     (>= c2 #xd800))
231     (let ((fix-c1 (fixup c1))
232     (fix-c2 (fixup c2)))
233     (if (,(if lessp '< '>) fix-c1 fix-c2)
234     (- (the fixnum index) ,offset1)
235     nil)))
236     (t
237     (if (,(if lessp '< '>) c1 c2)
238 rtoy 1.12.30.10 (- (the fixnum index) ,offset1)
239     nil)))))))
240 ram 1.4 ,(if equalp `(- (the fixnum end1) ,offset1) 'nil))))))
241 ram 1.1 ) ; eval-when
242    
243     (defun string<* (string1 string2 start1 end1 start2 end2)
244     (declare (fixnum start1 start2))
245     (string<>=*-body t nil))
246    
247     (defun string>* (string1 string2 start1 end1 start2 end2)
248     (declare (fixnum start1 start2))
249     (string<>=*-body nil nil))
250    
251     (defun string<=* (string1 string2 start1 end1 start2 end2)
252     (declare (fixnum start1 start2))
253     (string<>=*-body t t))
254    
255     (defun string>=* (string1 string2 start1 end1 start2 end2)
256     (declare (fixnum start1 start2))
257     (string<>=*-body nil t))
258    
259    
260    
261     (defun string< (string1 string2 &key (start1 0) end1 (start2 0) end2)
262     "Given two strings, if the first string is lexicographically less than
263     the second string, returns the longest common prefix (using char=)
264     of the two strings. Otherwise, returns ()."
265     (string<* string1 string2 start1 end1 start2 end2))
266    
267     (defun string> (string1 string2 &key (start1 0) end1 (start2 0) end2)
268     "Given two strings, if the first string is lexicographically greater than
269     the second string, returns the longest common prefix (using char=)
270     of the two strings. Otherwise, returns ()."
271     (string>* string1 string2 start1 end1 start2 end2))
272    
273    
274     (defun string<= (string1 string2 &key (start1 0) end1 (start2 0) end2)
275     "Given two strings, if the first string is lexicographically less than
276     or equal to the second string, returns the longest common prefix
277     (using char=) of the two strings. Otherwise, returns ()."
278     (string<=* string1 string2 start1 end1 start2 end2))
279    
280     (defun string>= (string1 string2 &key (start1 0) end1 (start2 0) end2)
281     "Given two strings, if the first string is lexicographically greater
282     than or equal to the second string, returns the longest common prefix
283     (using char=) of the two strings. Otherwise, returns ()."
284     (string>=* string1 string2 start1 end1 start2 end2))
285    
286     (defun string= (string1 string2 &key (start1 0) end1 (start2 0) end2)
287     "Given two strings (string1 and string2), and optional integers start1,
288     start2, end1 and end2, compares characters in string1 to characters in
289     string2 (using char=)."
290     (string=* string1 string2 start1 end1 start2 end2))
291    
292     (defun string/= (string1 string2 &key (start1 0) end1 (start2 0) end2)
293     "Given two strings, if the first string is not lexicographically equal
294     to the second string, returns the longest common prefix (using char=)
295     of the two strings. Otherwise, returns ()."
296     (string/=* string1 string2 start1 end1 start2 end2))
297    
298    
299     (eval-when (compile eval)
300    
301     ;;; STRING-NOT-EQUAL-LOOP is used to generate character comparison loops for
302     ;;; STRING-EQUAL and STRING-NOT-EQUAL.
303     (defmacro string-not-equal-loop (end end-value
304     &optional (abort-value nil abortp))
305     (declare (fixnum end))
306     (let ((end-test (if (= end 1)
307     `(= index1 (the fixnum end1))
308     `(= index2 (the fixnum end2)))))
309     `(do ((index1 start1 (1+ index1))
310     (index2 start2 (1+ index2)))
311     (,(if abortp
312     end-test
313     `(or ,end-test
314     (not (char-equal (schar string1 index1)
315     (schar string2 index2)))))
316     ,end-value)
317     (declare (fixnum index1 index2))
318     ,@(if abortp
319     `((if (not (char-equal (schar string1 index1)
320     (schar string2 index2)))
321     (return ,abort-value)))))))
322    
323     ) ; eval-when
324    
325     (defun string-equal (string1 string2 &key (start1 0) end1 (start2 0) end2)
326     "Given two strings (string1 and string2), and optional integers start1,
327     start2, end1 and end2, compares characters in string1 to characters in
328     string2 (using char-equal)."
329     (declare (fixnum start1 start2))
330 ram 1.4 (with-two-strings string1 string2 start1 end1 offset1 start2 end2
331     (let ((slen1 (- (the fixnum end1) start1))
332     (slen2 (- (the fixnum end2) start2)))
333     (declare (fixnum slen1 slen2))
334     (if (or (minusp slen1) (minusp slen2))
335     ;;prevent endless looping later.
336     (error "Improper bounds for string comparison."))
337     (if (= slen1 slen2)
338     ;;return () immediately if lengths aren't equal.
339     (string-not-equal-loop 1 t nil)))))
340 ram 1.1
341     (defun string-not-equal (string1 string2 &key (start1 0) end1 (start2 0) end2)
342     "Given two strings, if the first string is not lexicographically equal
343     to the second string, returns the longest common prefix (using char-equal)
344     of the two strings. Otherwise, returns ()."
345 ram 1.4 (with-two-strings string1 string2 start1 end1 offset1 start2 end2
346     (let ((slen1 (- end1 start1))
347     (slen2 (- end2 start2)))
348     (declare (fixnum slen1 slen2))
349     (if (or (minusp slen1) (minusp slen2))
350     ;;prevent endless looping later.
351     (error "Improper bounds for string comparison."))
352     (cond ((or (minusp slen1) (or (minusp slen2)))
353     (error "Improper substring for comparison."))
354     ((= slen1 slen2)
355     (string-not-equal-loop 1 nil (- index1 offset1)))
356     ((< slen1 slen2)
357     (string-not-equal-loop 1 (- index1 offset1)))
358     (t
359     (string-not-equal-loop 2 (- index1 offset1)))))))
360 ram 1.1
361    
362    
363     (eval-when (compile eval)
364    
365     ;;; STRING-LESS-GREATER-EQUAL-TESTS returns a test on the lengths of string1
366     ;;; and string2 and a test on the current characters from string1 and string2
367     ;;; for the following macro.
368     (defun string-less-greater-equal-tests (lessp equalp)
369     (if lessp
370     (if equalp
371     ;; STRING-NOT-GREATERP
372 rtoy 1.12.30.15 (values '<=
373     #-unicode `(not (char-greaterp char1 char2))
374     #+unicode `(<= char1 char2))
375 ram 1.1 ;; STRING-LESSP
376 rtoy 1.12.30.15 (values '<
377     #-unicode `(char-lessp char1 char2)
378     #+unicode `(< char1 char2)))
379 ram 1.1 (if equalp
380     ;; STRING-NOT-LESSP
381 rtoy 1.12.30.15 (values '>=
382     #-unicode `(not (char-lessp char1 char2))
383     #+unicode `(>= char1 char2))
384 ram 1.1 ;; STRING-GREATERP
385 rtoy 1.12.30.15 (values '>
386     #-unicode `(char-greaterp char1 char2)
387     #+unicode `(> char1 char2)))))
388 ram 1.1
389 rtoy 1.12.30.15 #-unicode
390 ram 1.1 (defmacro string-less-greater-equal (lessp equalp)
391     (multiple-value-bind (length-test character-test)
392     (string-less-greater-equal-tests lessp equalp)
393 ram 1.4 `(with-two-strings string1 string2 start1 end1 offset1 start2 end2
394     (let ((slen1 (- (the fixnum end1) start1))
395     (slen2 (- (the fixnum end2) start2)))
396     (declare (fixnum slen1 slen2))
397     (if (or (minusp slen1) (minusp slen2))
398     ;;prevent endless looping later.
399     (error "Improper bounds for string comparison."))
400     (do ((index1 start1 (1+ index1))
401     (index2 start2 (1+ index2))
402     (char1)
403     (char2))
404     ((or (= index1 (the fixnum end1)) (= index2 (the fixnum end2)))
405     (if (,length-test slen1 slen2) (- index1 offset1)))
406     (declare (fixnum index1 index2))
407     (setq char1 (schar string1 index1))
408     (setq char2 (schar string2 index2))
409     (if (not (char-equal char1 char2))
410     (if ,character-test
411     (return (- index1 offset1))
412     (return ()))))))))
413 ram 1.1
414 rtoy 1.12.30.15 #+unicode
415     (defmacro equal-char-codepoint (codepoint)
416     `(let ((ch ,codepoint))
417     (if (< 96 ch 123)
418     (- ch 32)
419     #-(and unicode (not unicode-bootstrap))
420     ch
421     #+(and unicode (not unicode-bootstrap))
422     (if (> ch 127) (unicode-upper ch) ch))))
423    
424     #+unicode
425     (defmacro string-less-greater-equal (lessp equalp)
426     (multiple-value-bind (length-test character-test)
427     (string-less-greater-equal-tests lessp equalp)
428     `(with-two-strings string1 string2 start1 end1 offset1 start2 end2
429     (let ((slen1 (- (the fixnum end1) start1))
430     (slen2 (- (the fixnum end2) start2)))
431     (declare (fixnum slen1 slen2))
432     (if (or (minusp slen1) (minusp slen2))
433     ;;prevent endless looping later.
434     (error "Improper bounds for string comparison."))
435     (do ((index1 start1 (1+ index1))
436     (index2 start2 (1+ index2)))
437     ((or (= index1 (the fixnum end1)) (= index2 (the fixnum end2)))
438     (if (,length-test slen1 slen2) (- index1 offset1)))
439     (declare (fixnum index1 index2))
440     (multiple-value-bind (char1 wide1)
441     (codepoint string1 index1)
442     (multiple-value-bind (char2 wide2)
443     (codepoint string2 index2)
444     (if (= (equal-char-codepoint char1)
445     (equal-char-codepoint char2))
446     (progn
447     (when wide1 (incf index1))
448     (when wide2 (incf index2)))
449     (if ,character-test
450     (return (- index1 offset1))
451     (return ()))))))))))
452    
453 ram 1.1 ) ; eval-when
454    
455     (defun string-lessp* (string1 string2 start1 end1 start2 end2)
456     (declare (fixnum start1 start2))
457     (string-less-greater-equal t nil))
458    
459     (defun string-greaterp* (string1 string2 start1 end1 start2 end2)
460     (declare (fixnum start1 start2))
461     (string-less-greater-equal nil nil))
462    
463     (defun string-not-lessp* (string1 string2 start1 end1 start2 end2)
464     (declare (fixnum start1 start2))
465     (string-less-greater-equal nil t))
466    
467     (defun string-not-greaterp* (string1 string2 start1 end1 start2 end2)
468     (declare (fixnum start1 start2))
469     (string-less-greater-equal t t))
470    
471     (defun string-lessp (string1 string2 &key (start1 0) end1 (start2 0) end2)
472     "Given two strings, if the first string is lexicographically less than
473     the second string, returns the longest common prefix (using char-equal)
474     of the two strings. Otherwise, returns ()."
475     (string-lessp* string1 string2 start1 end1 start2 end2))
476    
477     (defun string-greaterp (string1 string2 &key (start1 0) end1 (start2 0) end2)
478     "Given two strings, if the first string is lexicographically greater than
479     the second string, returns the longest common prefix (using char-equal)
480     of the two strings. Otherwise, returns ()."
481     (string-greaterp* string1 string2 start1 end1 start2 end2))
482    
483     (defun string-not-lessp (string1 string2 &key (start1 0) end1 (start2 0) end2)
484     "Given two strings, if the first string is lexicographically greater
485     than or equal to the second string, returns the longest common prefix
486     (using char-equal) of the two strings. Otherwise, returns ()."
487     (string-not-lessp* string1 string2 start1 end1 start2 end2))
488    
489     (defun string-not-greaterp (string1 string2 &key (start1 0) end1 (start2 0)
490     end2)
491     "Given two strings, if the first string is lexicographically less than
492     or equal to the second string, returns the longest common prefix
493     (using char-equal) of the two strings. Otherwise, returns ()."
494     (string-not-greaterp* string1 string2 start1 end1 start2 end2))
495    
496    
497 dtc 1.9 (defun make-string (count &key element-type ((:initial-element fill-char)))
498 ram 1.1 "Given a character count and an optional fill character, makes and returns
499 rtoy 1.12.30.14 a new string Count long filled with the fill character."
500 emarsden 1.12 (declare (type fixnum count))
501     (assert (subtypep element-type 'character))
502 ram 1.1 (if fill-char
503     (do ((i 0 (1+ i))
504     (string (make-string count)))
505     ((= i count) string)
506     (declare (fixnum i))
507     (setf (schar string i) fill-char))
508     (make-string count)))
509    
510     (defun string-upcase (string &key (start 0) end)
511     "Given a string, returns a new string that is a copy of it with
512     all lower case alphabetic characters converted to uppercase."
513     (declare (fixnum start))
514 ram 1.5 (let* ((string (if (stringp string) string (string string)))
515     (slen (length string)))
516 ram 1.4 (declare (fixnum slen))
517 ram 1.1 (with-one-string string start end offset
518     (let ((offset-slen (+ slen offset))
519     (newstring (make-string slen)))
520     (declare (fixnum offset-slen))
521     (do ((index offset (1+ index))
522     (new-index 0 (1+ new-index)))
523     ((= index start))
524     (declare (fixnum index new-index))
525     (setf (schar newstring new-index) (schar string index)))
526     (do ((index start (1+ index))
527     (new-index (- start offset) (1+ new-index)))
528     ((= index (the fixnum end)))
529     (declare (fixnum index new-index))
530 rtoy 1.12.30.6 (multiple-value-bind (code wide) (codepoint string index)
531     (when wide (incf index))
532     ;; Handle ASCII specially because this is called early in
533     ;; initialization, before unidata is available.
534     (cond ((< 96 code 123) (decf code 32))
535     ((> code 127) (setq code (unicode-upper code))))
536     ;;@@ WARNING: this may, in theory, need to extend newstring
537     ;; but that never actually occurs as of Unicode 5.1.0,
538     ;; so I'm just going to ignore it for now...
539     (multiple-value-bind (hi lo) (surrogates code)
540 rtoy 1.12.30.8 (setf (schar newstring new-index) hi)
541 rtoy 1.12.30.6 (when lo
542 rtoy 1.12.30.8 (setf (schar newstring (incf new-index)) lo)))))
543 rtoy 1.12.30.4 ;;@@ WARNING: see above
544 ram 1.1 (do ((index end (1+ index))
545     (new-index (- (the fixnum end) offset) (1+ new-index)))
546     ((= index offset-slen))
547     (declare (fixnum index new-index))
548     (setf (schar newstring new-index) (schar string index)))
549     newstring))))
550    
551     (defun string-downcase (string &key (start 0) end)
552     "Given a string, returns a new string that is a copy of it with
553     all upper case alphabetic characters converted to lowercase."
554     (declare (fixnum start))
555 ram 1.5 (let* ((string (if (stringp string) string (string string)))
556     (slen (length string)))
557 ram 1.4 (declare (fixnum slen))
558 ram 1.1 (with-one-string string start end offset
559     (let ((offset-slen (+ slen offset))
560     (newstring (make-string slen)))
561     (declare (fixnum offset-slen))
562     (do ((index offset (1+ index))
563     (new-index 0 (1+ new-index)))
564     ((= index start))
565     (declare (fixnum index new-index))
566     (setf (schar newstring new-index) (schar string index)))
567     (do ((index start (1+ index))
568     (new-index (- start offset) (1+ new-index)))
569     ((= index (the fixnum end)))
570     (declare (fixnum index new-index))
571 rtoy 1.12.30.6 (multiple-value-bind (code wide) (codepoint string index)
572     (when wide (incf index))
573     ;; Handle ASCII specially because this is called early in
574     ;; initialization, before unidata is available.
575     (cond ((< 64 code 91) (incf code 32))
576     ((> code 127) (setq code (unicode-lower code))))
577     ;;@@ WARNING: this may, in theory, need to extend newstring
578     ;; but that never actually occurs as of Unicode 5.1.0,
579     ;; so I'm just going to ignore it for now...
580     (multiple-value-bind (hi lo) (surrogates code)
581 rtoy 1.12.30.8 (setf (schar newstring new-index) hi)
582 rtoy 1.12.30.6 (when lo
583 rtoy 1.12.30.8 (setf (schar newstring (incf new-index)) lo)))))
584 rtoy 1.12.30.4 ;;@@ WARNING: see above
585 ram 1.1 (do ((index end (1+ index))
586     (new-index (- (the fixnum end) offset) (1+ new-index)))
587     ((= index offset-slen))
588     (declare (fixnum index new-index))
589     (setf (schar newstring new-index) (schar string index)))
590     newstring))))
591    
592     (defun string-capitalize (string &key (start 0) end)
593     "Given a string, returns a copy of the string with the first
594     character of each ``word'' converted to upper-case, and remaining
595     chars in the word converted to lower case. A ``word'' is defined
596     to be a string of case-modifiable characters delimited by
597     non-case-modifiable chars."
598     (declare (fixnum start))
599 ram 1.5 (let* ((string (if (stringp string) string (string string)))
600     (slen (length string)))
601 ram 1.4 (declare (fixnum slen))
602 ram 1.1 (with-one-string string start end offset
603     (let ((offset-slen (+ slen offset))
604     (newstring (make-string slen)))
605     (declare (fixnum offset-slen))
606     (do ((index offset (1+ index))
607     (new-index 0 (1+ new-index)))
608     ((= index start))
609     (declare (fixnum index new-index))
610     (setf (schar newstring new-index) (schar string index)))
611     (do ((index start (1+ index))
612     (new-index (- start offset) (1+ new-index))
613     (newword t)
614     (char ()))
615     ((= index (the fixnum end)))
616     (declare (fixnum index new-index))
617     (setq char (schar string index))
618     (cond ((not (alphanumericp char))
619     (setq newword t))
620     (newword
621     ;;char is first case-modifiable after non-case-modifiable
622 rtoy 1.12.30.2 (setq char (char-titlecase char))
623 ram 1.1 (setq newword ()))
624     ;;char is case-modifiable, but not first
625     (t (setq char (char-downcase char))))
626     (setf (schar newstring new-index) char))
627     (do ((index end (1+ index))
628     (new-index (- (the fixnum end) offset) (1+ new-index)))
629     ((= index offset-slen))
630     (declare (fixnum index new-index))
631     (setf (schar newstring new-index) (schar string index)))
632     newstring))))
633    
634     (defun nstring-upcase (string &key (start 0) end)
635     "Given a string, returns that string with all lower case alphabetic
636     characters converted to uppercase."
637     (declare (fixnum start))
638 ram 1.4 (let ((save-header string))
639 ram 1.1 (with-one-string string start end offset
640     (do ((index start (1+ index)))
641     ((= index (the fixnum end)))
642     (declare (fixnum index))
643 rtoy 1.12.30.6 (multiple-value-bind (code wide) (codepoint string index)
644     (declare (ignore wide))
645     ;; Handle ASCII specially because this is called early in
646     ;; initialization, before unidata is available.
647     (cond ((< 96 code 123) (decf code 32))
648     ((> code 127) (setq code (unicode-upper code))))
649     ;;@@ WARNING: this may, in theory, need to extend string
650     ;; (which, obviously, we can't do here. Unless
651     ;; STRING is adjustable, maybe)
652     ;; but that never actually occurs as of Unicode 5.1.0,
653     ;; so I'm just going to ignore it for now...
654     (multiple-value-bind (hi lo) (surrogates code)
655 rtoy 1.12.30.8 (setf (schar string index) hi)
656 rtoy 1.12.30.6 (when lo
657 rtoy 1.12.30.8 (setf (schar string (incf index)) lo))))))
658 ram 1.1 save-header))
659    
660     (defun nstring-downcase (string &key (start 0) end)
661     "Given a string, returns that string with all upper case alphabetic
662     characters converted to lowercase."
663     (declare (fixnum start))
664 ram 1.4 (let ((save-header string))
665 ram 1.1 (with-one-string string start end offset
666     (do ((index start (1+ index)))
667     ((= index (the fixnum end)))
668     (declare (fixnum index))
669 rtoy 1.12.30.6 (multiple-value-bind (code wide) (codepoint string index)
670     (declare (ignore wide))
671     (cond ((< 64 code 91) (incf code 32))
672     ((> code 127) (setq code (unicode-lower code))))
673     ;;@@ WARNING: this may, in theory, need to extend string
674     ;; (which, obviously, we can't do here. Unless
675     ;; STRING is adjustable, maybe)
676     ;; but that never actually occurs as of Unicode 5.1.0,
677     ;; so I'm just going to ignore it for now...
678     (multiple-value-bind (hi lo) (surrogates code)
679 rtoy 1.12.30.8 (setf (schar string index) hi)
680 rtoy 1.12.30.6 (when lo
681 rtoy 1.12.30.8 (setf (schar string (incf index)) lo))))))
682 wlott 1.6 save-header))
683 ram 1.1
684     (defun nstring-capitalize (string &key (start 0) end)
685     "Given a string, returns that string with the first
686     character of each ``word'' converted to upper-case, and remaining
687     chars in the word converted to lower case. A ``word'' is defined
688     to be a string of case-modifiable characters delimited by
689     non-case-modifiable chars."
690     (declare (fixnum start))
691 ram 1.4 (let ((save-header string))
692 ram 1.1 (with-one-string string start end offset
693     (do ((index start (1+ index))
694     (newword t)
695     (char ()))
696     ((= index (the fixnum end)))
697     (declare (fixnum index))
698     (setq char (schar string index))
699     (cond ((not (alphanumericp char))
700     (setq newword t))
701     (newword
702     ;;char is first case-modifiable after non-case-modifiable
703 rtoy 1.12.30.2 (setf (schar string index) (char-titlecase char))
704 ram 1.1 (setq newword ()))
705     (t
706     (setf (schar string index) (char-downcase char))))))
707     save-header))
708    
709     (defun string-left-trim (char-bag string)
710     "Given a set of characters (a list or string) and a string, returns
711     a copy of the string with the characters in the set removed from the
712     left end."
713     (with-string string
714     (do ((index start (1+ index)))
715     ((or (= index (the fixnum end))
716     (not (find (schar string index) char-bag)))
717     (subseq (the simple-string string) index end))
718     (declare (fixnum index)))))
719    
720     (defun string-right-trim (char-bag string)
721     "Given a set of characters (a list or string) and a string, returns
722     a copy of the string with the characters in the set removed from the
723     right end."
724     (with-string string
725     (do ((index (1- (the fixnum end)) (1- index)))
726     ((or (< index start) (not (find (schar string index) char-bag)))
727     (subseq (the simple-string string) start (1+ index)))
728     (declare (fixnum index)))))
729    
730     (defun string-trim (char-bag string)
731     "Given a set of characters (a list or string) and a string, returns a
732     copy of the string with the characters in the set removed from both
733     ends."
734     (with-string string
735     (let* ((left-end (do ((index start (1+ index)))
736     ((or (= index (the fixnum end))
737     (not (find (schar string index) char-bag)))
738     index)
739     (declare (fixnum index))))
740     (right-end (do ((index (1- (the fixnum end)) (1- index)))
741     ((or (< index left-end)
742     (not (find (schar string index) char-bag)))
743     (1+ index))
744     (declare (fixnum index)))))
745     (subseq (the simple-string string) left-end right-end))))
746 rtoy 1.12.30.3
747 rtoy 1.12.30.4 (declaim (inline %glyph-f %glyph-b))
748     (defun %glyph-f (string index)
749     (declare (optimize (speed 3) (space 0) (safety 0) (debug 0))
750     (type simple-string string) (type kernel:index index))
751 rtoy 1.12.30.10 (let* ((prev 0)
752     (l (length string))
753     (c (codepoint string index l))
754     (n (+ index (if (> c #xFFFF) 2 1))))
755     (declare (type (integer 0 #x10FFFF) c) (type kernel:index l n))
756     (loop while (< n l) do
757     (let* ((c (codepoint string n l))
758     (d (the (unsigned-byte 8) (unicode-combining-class c))))
759     (when (or (zerop d) (< d prev))
760     (return))
761     (setq prev d)
762     (incf n (if (> c #xFFFF) 2 1))))
763     n))
764 rtoy 1.12.30.4
765     (defun %glyph-b (string index)
766     (declare (optimize (speed 3) (space 0) (safety 0) (debug 0))
767     (type simple-string string) (type kernel:index index))
768 rtoy 1.12.30.11 (let* ((prev 255)
769     (n (1- index)))
770     (declare (type kernel:index n))
771     (loop until (< n 0) do
772     (let* ((c (codepoint string n 0))
773     (d (the (unsigned-byte 8) (unicode-combining-class c))))
774     (cond ((zerop d) (return))
775     ((> d prev) (incf n (if (> c #xFFFF) 2 1)) (return)))
776     (setq prev d)
777     (decf n (if (> c #xFFFF) 2 1))))
778     n))
779 rtoy 1.12.30.4
780     (defun glyph (string index &key (from-end nil))
781 rtoy 1.12.30.3 "GLYPH returns the glyph at the indexed position in a string, and the
782 rtoy 1.12.30.4 position of the next glyph (or NIL) as a second value. A glyph is
783     a substring consisting of the character at INDEX followed by all
784     subsequent combining characters."
785 rtoy 1.12.30.3 (declare (type simple-string string) (type kernel:index index))
786     #-unicode
787     (char string index)
788     #+unicode
789     (with-array-data ((string string) (start) (end))
790     (declare (ignore start end))
791 rtoy 1.12.30.4 (let ((n (if from-end (%glyph-b string index) (%glyph-f string index))))
792     (if from-end
793     (values (subseq string n index) (and (> n 0) n))
794     (values (subseq string index n) (and (< n (length string)) n))))))
795 rtoy 1.12.30.3
796 rtoy 1.12.30.4 (defun sglyph (string index &key (from-end nil))
797 rtoy 1.12.30.3 "SGLYPH returns the glyph at the indexed position, the same as GLYPH,
798 rtoy 1.12.30.4 except that the string must be a simple-string"
799 rtoy 1.12.30.3 (declare (type simple-string string) (type kernel:index index))
800     #-unicode
801     (schar string index)
802     #+unicode
803 rtoy 1.12.30.4 (let ((n (if from-end (%glyph-b string index) (%glyph-f string index))))
804     (if from-end
805     (values (subseq string n index) (and (> n 0) n))
806     (values (subseq string index n) (and (< n (length string)) n)))))
807    
808 rtoy 1.12.30.9 (defun decompose (string &optional (compatibility t))
809     (declare (type string string))
810     (let ((result (make-string (cond ((< (length string) 40)
811     (* 5 (length string)))
812     ((< (length string) 4096)
813     (* 2 (length string)))
814     (t (round (length string) 5/6)))))
815 rtoy 1.12.30.10 (fillptr 0))
816     (declare (type kernel:index fillptr))
817 rtoy 1.12.30.9 (labels ((rec (string)
818     (declare (type simple-string string))
819     (do ((i 0 (1+ i)))
820     ((= i (length string)))
821     (declare (type kernel:index i))
822     (multiple-value-bind (code wide) (codepoint string i)
823     (when wide (incf i))
824     (let ((decomp (unicode-decomp code compatibility)))
825     (if decomp (rec decomp) (out code))))))
826     (out (code)
827     (multiple-value-bind (hi lo) (surrogates code)
828     (outch hi)
829     (when lo
830 rtoy 1.12.30.10 (outch lo))
831     (let ((cc (unicode-combining-class code)))
832     (unless (zerop cc)
833     (order lo cc (- fillptr (if lo 2 1)))))))
834 rtoy 1.12.30.9 (outch (char)
835     (when (= fillptr (length result))
836     (let ((tmp (make-string (round (length result) 5/6))))
837     (replace tmp result)
838     (setq result tmp)))
839     (setf (schar result fillptr) char)
840     (incf fillptr))
841 rtoy 1.12.30.10 (order (wide1 cc last)
842     (loop until (minusp last) do
843     (multiple-value-bind (code2 wide2) (codepoint result last)
844     (let ((cc2 (unicode-combining-class code2)))
845     (cond ((zerop cc2) (return))
846     ((> cc2 cc)
847     (case (+ (if wide2 2 0) (if wide1 1 0))
848     (0 (rotatef (schar result last)
849     (schar result (1+ last))))
850     (1 (rotatef (schar result last)
851     (schar result (+ last 1))
852     (schar result (+ last 2))))
853     (2 (rotatef (schar result last)
854     (schar result (1- last))
855     (schar result (1+ last))))
856     (3 (rotatef (schar result last)
857     (schar result (+ last 2)))
858     (rotatef (schar result (1- last))
859     (schar result (1+ last)))))
860     (decf last (if wide2 2 1)))
861     (t (return))))))))
862 rtoy 1.12.30.9 (with-array-data ((string string) (start) (end))
863     (declare (ignore start end))
864     (rec string))
865     (shrink-vector result fillptr))))
866    
867 rtoy 1.12.30.4 (defun string-to-nfd (string)
868 rtoy 1.12.30.13 "Convert String to Unicode Normalization Form D (NFD) using the
869     canonical decomposition. The NFD string is returned"
870 rtoy 1.12.30.9 (decompose string nil))
871 rtoy 1.12.30.4
872     (defun string-to-nfkd (string)
873 rtoy 1.12.30.13 "Convert String to Unicode Normalization Form KD (NFKD) uisng the
874     compatible decomposition form. The NFKD string is returned."
875 rtoy 1.12.30.9 (decompose string t))
876 rtoy 1.12.30.4
877 rtoy 1.12.30.12 #+unicode
878 rtoy 1.12.30.4 (defun string-to-nfc (string)
879     ;;@@ Implement me
880 rtoy 1.12.30.12 ;; must return a simple-string for the package machinery
881     (if (simple-string-p string) string (coerce string 'simple-string)))
882    
883     #-unicode ;; Needed by package.lisp
884     (defun string-to-nfc (string)
885     (if (simple-string-p string) string (coerce string 'simple-string)))
886 rtoy 1.12.30.4
887     (defun string-to-nfkc (string)
888     ;;@@ Implement me
889 rtoy 1.12.30.12 (if (simple-string-p string) string (coerce string 'simple-string)))

  ViewVC Help
Powered by ViewVC 1.1.5