/[cmucl]/src/code/string.lisp
ViewVC logotype

Contents of /src/code/string.lisp

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.12.30.12 - (hide annotations)
Mon May 4 14:13:32 2009 UTC (4 years, 11 months ago) by rtoy
Branch: unicode-utf16-extfmt-branch
Changes since 1.12.30.11: +9 -3 lines
From Paul: Package and symbols names in Unicode need to be in a
canonical normalization form (eventually...when NFC is implemented)
1 ram 1.1 ;;; -*- Log: code.log; Package: Lisp -*-
2     ;;;
3     ;;; **********************************************************************
4 ram 1.3 ;;; This code was written as part of the CMU Common Lisp project at
5     ;;; Carnegie Mellon University, and has been placed in the public domain.
6     ;;;
7     (ext:file-comment
8 rtoy 1.12.30.12 "$Header: /tiger/var/lib/cvsroots/cmucl/src/code/string.lisp,v 1.12.30.12 2009/05/04 14:13:32 rtoy Exp $")
9 ram 1.3 ;;;
10 ram 1.1 ;;; **********************************************************************
11     ;;;
12 pw 1.11 ;;; Functions to implement strings for CMU Common Lisp
13 ram 1.1 ;;; Written by David Dill
14 ram 1.4 ;;; Rewritten by Skef Wholey, Bill Chiles and Rob MacLachlan.
15 ram 1.1 ;;;
16     ;;; ****************************************************************
17     ;;;
18 ram 1.4 (in-package "LISP")
19 rtoy 1.12.30.3 (export '(char schar glyph sglyph string
20 ram 1.1 string= string-equal string< string> string<= string>= string/=
21     string-lessp string-greaterp string-not-lessp string-not-greaterp
22     string-not-equal
23 rtoy 1.12.30.4 string-to-nfd string-to-nfkd string-to-nfc string-to-nfkc
24 ram 1.1 make-string
25     string-trim string-left-trim string-right-trim
26     string-upcase
27     string-downcase string-capitalize nstring-upcase nstring-downcase
28     nstring-capitalize))
29    
30    
31 rtoy 1.12.30.8 (declaim (inline surrogates-to-codepoint codepoint surrogates))
32 rtoy 1.12.30.7
33 rtoy 1.12.30.8 (defun surrogates-to-codepoint (hi lo)
34     "Convert the given Hi and Lo surrogate characters to the
35     corresponding codepoint value"
36     (declare (type character hi lo))
37     (+ (ash (- (the (integer #xD800 #xDBFF) (char-code hi)) #xD800) 10)
38     (the (integer #xDC00 #xDFFF) (char-code lo)) #x2400))
39 rtoy 1.12.30.7
40 rtoy 1.12.30.6 (defun codepoint (string i &optional (end (length string)))
41 rtoy 1.12.30.7 "Return the codepoint value from String at position I. If that
42     position is a surrogate, it is combined with either the previous or
43     following character (when possible) to compute the codepoint. The
44     second return value is NIL if the position is not a surrogate pair.
45     Otherwise +1 or -1 is returned if the position is the high or low
46     surrogate value, respectively."
47 rtoy 1.12.30.6 (declare (type simple-string string) (type kernel:index i end))
48     (let ((code (char-code (schar string i))))
49     (cond ((and (<= #xD800 code #xDBFF) (< (1+ i) end))
50     (let ((tmp (char-code (schar string (1+ i)))))
51     (if (<= #xDC00 tmp #xDFFF)
52 rtoy 1.12.30.10 (values (+ (ash (- code #xD800) 10) tmp #x2400) +1)
53 rtoy 1.12.30.6 (values code nil))))
54     ((and (<= #xDC00 code #xDFFF) (> i 0))
55     (let ((tmp (char-code (schar string (1- i)))))
56     (if (<= #xD800 tmp #xDBFF)
57 rtoy 1.12.30.10 (values (+ (ash (- tmp #xD800) 10) code #x2400) -1)
58 rtoy 1.12.30.6 (values code nil))))
59     (t (values code nil)))))
60    
61     (defun surrogates (codepoint)
62 rtoy 1.12.30.8 "Return the high and low surrogate characters for Codepoint. If
63     Codepoint is in the BMP, the first return value is the corresponding
64     character and the second is NIL."
65 rtoy 1.12.30.6 (declare (type (integer 0 #x10FFFF) codepoint))
66     (if (< codepoint #x10000)
67 rtoy 1.12.30.8 (values (code-char codepoint) nil)
68 rtoy 1.12.30.6 (let* ((tmp (- codepoint #x10000))
69     (hi (logior (ldb (byte 10 10) tmp) #xD800))
70     (lo (logior (ldb (byte 10 0) tmp) #xDC00)))
71 rtoy 1.12.30.8 (values (code-char hi) (code-char lo)))))
72 rtoy 1.12.30.6
73    
74 ram 1.1 (defun string (X)
75     "Coerces X into a string. If X is a string, X is returned. If X is a
76     symbol, X's pname is returned. If X is a character then a one element
77     string containing that character is returned. If X cannot be coerced
78     into a string, an error occurs."
79     (cond ((stringp x) x)
80     ((symbolp x) (symbol-name x))
81     ((characterp x)
82     (let ((res (make-string 1)))
83     (setf (schar res 0) x) res))
84     (t
85 pw 1.11 (error 'simple-type-error
86     :datum x
87     :expected-type '(or string symbol character)
88     :format-control "~S cannot be coerced to a string."
89     :format-arguments (list x)))))
90 ram 1.1
91     ;;; With-One-String is used to set up some string hacking things. The keywords
92     ;;; are parsed, and the string is hacked into a simple-string.
93    
94     (eval-when (compile)
95    
96     (defmacro with-one-string (string start end cum-offset &rest forms)
97 ram 1.4 `(let ((,string (if (stringp ,string) ,string (string ,string))))
98 pw 1.10 ;; Optimizer may prove STRING is one.
99     (declare (optimize (ext:inhibit-warnings 3)))
100 ram 1.4 (with-array-data ((,string ,string :offset-var ,cum-offset)
101     (,start ,start)
102     (,end (or ,end (length (the vector ,string)))))
103     ,@forms)))
104 ram 1.1
105     )
106    
107     ;;; With-String is like With-One-String, but doesn't parse keywords.
108    
109     (eval-when (compile)
110    
111     (defmacro with-string (string &rest forms)
112 ram 1.4 `(let ((,string (if (stringp ,string) ,string (string ,string))))
113     (with-array-data ((,string ,string)
114     (start)
115     (end (length (the vector ,string))))
116     ,@forms)))
117 ram 1.1
118     )
119    
120     ;;; With-Two-Strings is used to set up string comparison operations. The
121     ;;; keywords are parsed, and the strings are hacked into simple-strings.
122    
123     (eval-when (compile)
124    
125     (defmacro with-two-strings (string1 string2 start1 end1 cum-offset-1
126     start2 end2 &rest forms)
127 ram 1.4 `(let ((,string1 (if (stringp ,string1) ,string1 (string ,string1)))
128     (,string2 (if (stringp ,string2) ,string2 (string ,string2))))
129     (with-array-data ((,string1 ,string1 :offset-var ,cum-offset-1)
130     (,start1 ,start1)
131     (,end1 (or ,end1 (length (the vector ,string1)))))
132     (with-array-data ((,string2 ,string2)
133     (,start2 ,start2)
134     (,end2 (or ,end2 (length (the vector ,string2)))))
135     ,@forms))))
136 ram 1.1
137     )
138 wlott 1.2
139 ram 1.1
140     (defun char (string index)
141     "Given a string and a non-negative integer index less than the length of
142     the string, returns the character object representing the character at
143     that position in the string."
144 ram 1.4 (declare (optimize (safety 1)))
145 ram 1.1 (char string index))
146    
147     (defun %charset (string index new-el)
148 ram 1.4 (declare (optimize (safety 1)))
149 ram 1.1 (setf (char string index) new-el))
150    
151     (defun schar (string index)
152     "SCHAR returns the character object at an indexed position in a string
153     just as CHAR does, except the string must be a simple-string."
154 ram 1.4 (declare (optimize (safety 1)))
155 ram 1.1 (schar string index))
156    
157     (defun %scharset (string index new-el)
158 ram 1.4 (declare (optimize (safety 1)))
159 ram 1.1 (setf (schar string index) new-el))
160    
161     (defun string=* (string1 string2 start1 end1 start2 end2)
162 ram 1.4 (with-two-strings string1 string2 start1 end1 offset1 start2 end2
163     (not (%sp-string-compare string1 start1 end1 string2 start2 end2))))
164 ram 1.1
165    
166     (defun string/=* (string1 string2 start1 end1 start2 end2)
167 ram 1.4 (with-two-strings string1 string2 start1 end1 offset1 start2 end2
168     (let ((comparison (%sp-string-compare string1 start1 end1
169     string2 start2 end2)))
170     (if comparison (- (the fixnum comparison) offset1)))))
171 ram 1.1
172     (eval-when (compile eval)
173    
174     ;;; Lessp is true if the desired expansion is for string<* or string<=*.
175     ;;; Equalp is true if the desired expansion is for string<=* or string>=*.
176     (defmacro string<>=*-body (lessp equalp)
177     (let ((offset1 (gensym)))
178 ram 1.4 `(with-two-strings string1 string2 start1 end1 ,offset1 start2 end2
179     (let ((index (%sp-string-compare string1 start1 end1
180     string2 start2 end2)))
181     (if index
182 ram 1.8 (cond ((= (the fixnum index) (the fixnum end1))
183     ,(if lessp
184     `(- (the fixnum index) ,offset1)
185     `nil))
186     ((= (+ (the fixnum index) (- start2 start1))
187     (the fixnum end2))
188     ,(if lessp
189     `nil
190 rtoy 1.12.30.1 `(- (the fixnum index) ,offset1)))
191     #-unicode
192 ram 1.4 ((,(if lessp 'char< 'char>)
193     (schar string1 index)
194     (schar string2 (+ (the fixnum index) (- start2 start1))))
195     (- (the fixnum index) ,offset1))
196 rtoy 1.12.30.1 #-unicode
197     (t nil)
198     #+unicode
199     (t
200     ;; Compare in code point order. See
201     ;; http://icu-project.org/docs/papers/utf16_code_point_order.html
202     (flet ((fixup (code)
203     (if (>= code #xe000)
204     (- code #x800)
205     (+ code #x2000))))
206     (declare (inline fixup))
207     (let* ((c1 (char-code (schar string1 index)))
208 rtoy 1.12.30.10 (c2 (char-code (schar string2
209     (+ (the fixnum index)
210     (- start2 start1))))))
211 rtoy 1.12.30.1 (cond ((and (>= c1 #xd800)
212     (>= c2 #xd800))
213     (let ((fix-c1 (fixup c1))
214     (fix-c2 (fixup c2)))
215     (if (,(if lessp '< '>) fix-c1 fix-c2)
216     (- (the fixnum index) ,offset1)
217     nil)))
218     (t
219     (if (,(if lessp '< '>) c1 c2)
220 rtoy 1.12.30.10 (- (the fixnum index) ,offset1)
221     nil)))))))
222 ram 1.4 ,(if equalp `(- (the fixnum end1) ,offset1) 'nil))))))
223 ram 1.1 ) ; eval-when
224    
225     (defun string<* (string1 string2 start1 end1 start2 end2)
226     (declare (fixnum start1 start2))
227     (string<>=*-body t nil))
228    
229     (defun string>* (string1 string2 start1 end1 start2 end2)
230     (declare (fixnum start1 start2))
231     (string<>=*-body nil nil))
232    
233     (defun string<=* (string1 string2 start1 end1 start2 end2)
234     (declare (fixnum start1 start2))
235     (string<>=*-body t t))
236    
237     (defun string>=* (string1 string2 start1 end1 start2 end2)
238     (declare (fixnum start1 start2))
239     (string<>=*-body nil t))
240    
241    
242    
243     (defun string< (string1 string2 &key (start1 0) end1 (start2 0) end2)
244     "Given two strings, if the first string is lexicographically less than
245     the second string, returns the longest common prefix (using char=)
246     of the two strings. Otherwise, returns ()."
247     (string<* string1 string2 start1 end1 start2 end2))
248    
249     (defun string> (string1 string2 &key (start1 0) end1 (start2 0) end2)
250     "Given two strings, if the first string is lexicographically greater than
251     the second string, returns the longest common prefix (using char=)
252     of the two strings. Otherwise, returns ()."
253     (string>* string1 string2 start1 end1 start2 end2))
254    
255    
256     (defun string<= (string1 string2 &key (start1 0) end1 (start2 0) end2)
257     "Given two strings, if the first string is lexicographically less than
258     or equal to the second string, returns the longest common prefix
259     (using char=) of the two strings. Otherwise, returns ()."
260     (string<=* string1 string2 start1 end1 start2 end2))
261    
262     (defun string>= (string1 string2 &key (start1 0) end1 (start2 0) end2)
263     "Given two strings, if the first string is lexicographically greater
264     than or equal to the second string, returns the longest common prefix
265     (using char=) of the two strings. Otherwise, returns ()."
266     (string>=* string1 string2 start1 end1 start2 end2))
267    
268     (defun string= (string1 string2 &key (start1 0) end1 (start2 0) end2)
269     "Given two strings (string1 and string2), and optional integers start1,
270     start2, end1 and end2, compares characters in string1 to characters in
271     string2 (using char=)."
272     (string=* string1 string2 start1 end1 start2 end2))
273    
274     (defun string/= (string1 string2 &key (start1 0) end1 (start2 0) end2)
275     "Given two strings, if the first string is not lexicographically equal
276     to the second string, returns the longest common prefix (using char=)
277     of the two strings. Otherwise, returns ()."
278     (string/=* string1 string2 start1 end1 start2 end2))
279    
280    
281     (eval-when (compile eval)
282    
283     ;;; STRING-NOT-EQUAL-LOOP is used to generate character comparison loops for
284     ;;; STRING-EQUAL and STRING-NOT-EQUAL.
285     (defmacro string-not-equal-loop (end end-value
286     &optional (abort-value nil abortp))
287     (declare (fixnum end))
288     (let ((end-test (if (= end 1)
289     `(= index1 (the fixnum end1))
290     `(= index2 (the fixnum end2)))))
291     `(do ((index1 start1 (1+ index1))
292     (index2 start2 (1+ index2)))
293     (,(if abortp
294     end-test
295     `(or ,end-test
296     (not (char-equal (schar string1 index1)
297     (schar string2 index2)))))
298     ,end-value)
299     (declare (fixnum index1 index2))
300     ,@(if abortp
301     `((if (not (char-equal (schar string1 index1)
302     (schar string2 index2)))
303     (return ,abort-value)))))))
304    
305     ) ; eval-when
306    
307     (defun string-equal (string1 string2 &key (start1 0) end1 (start2 0) end2)
308     "Given two strings (string1 and string2), and optional integers start1,
309     start2, end1 and end2, compares characters in string1 to characters in
310     string2 (using char-equal)."
311     (declare (fixnum start1 start2))
312 ram 1.4 (with-two-strings string1 string2 start1 end1 offset1 start2 end2
313     (let ((slen1 (- (the fixnum end1) start1))
314     (slen2 (- (the fixnum end2) start2)))
315     (declare (fixnum slen1 slen2))
316     (if (or (minusp slen1) (minusp slen2))
317     ;;prevent endless looping later.
318     (error "Improper bounds for string comparison."))
319     (if (= slen1 slen2)
320     ;;return () immediately if lengths aren't equal.
321     (string-not-equal-loop 1 t nil)))))
322 ram 1.1
323     (defun string-not-equal (string1 string2 &key (start1 0) end1 (start2 0) end2)
324     "Given two strings, if the first string is not lexicographically equal
325     to the second string, returns the longest common prefix (using char-equal)
326     of the two strings. Otherwise, returns ()."
327 ram 1.4 (with-two-strings string1 string2 start1 end1 offset1 start2 end2
328     (let ((slen1 (- end1 start1))
329     (slen2 (- end2 start2)))
330     (declare (fixnum slen1 slen2))
331     (if (or (minusp slen1) (minusp slen2))
332     ;;prevent endless looping later.
333     (error "Improper bounds for string comparison."))
334     (cond ((or (minusp slen1) (or (minusp slen2)))
335     (error "Improper substring for comparison."))
336     ((= slen1 slen2)
337     (string-not-equal-loop 1 nil (- index1 offset1)))
338     ((< slen1 slen2)
339     (string-not-equal-loop 1 (- index1 offset1)))
340     (t
341     (string-not-equal-loop 2 (- index1 offset1)))))))
342 ram 1.1
343    
344    
345     (eval-when (compile eval)
346    
347     ;;; STRING-LESS-GREATER-EQUAL-TESTS returns a test on the lengths of string1
348     ;;; and string2 and a test on the current characters from string1 and string2
349     ;;; for the following macro.
350     (defun string-less-greater-equal-tests (lessp equalp)
351     (if lessp
352     (if equalp
353     ;; STRING-NOT-GREATERP
354     (values '<= `(not (char-greaterp char1 char2)))
355     ;; STRING-LESSP
356     (values '< `(char-lessp char1 char2)))
357     (if equalp
358     ;; STRING-NOT-LESSP
359     (values '>= `(not (char-lessp char1 char2)))
360     ;; STRING-GREATERP
361     (values '> `(char-greaterp char1 char2)))))
362    
363     (defmacro string-less-greater-equal (lessp equalp)
364     (multiple-value-bind (length-test character-test)
365     (string-less-greater-equal-tests lessp equalp)
366 ram 1.4 `(with-two-strings string1 string2 start1 end1 offset1 start2 end2
367     (let ((slen1 (- (the fixnum end1) start1))
368     (slen2 (- (the fixnum end2) start2)))
369     (declare (fixnum slen1 slen2))
370     (if (or (minusp slen1) (minusp slen2))
371     ;;prevent endless looping later.
372     (error "Improper bounds for string comparison."))
373     (do ((index1 start1 (1+ index1))
374     (index2 start2 (1+ index2))
375     (char1)
376     (char2))
377     ((or (= index1 (the fixnum end1)) (= index2 (the fixnum end2)))
378     (if (,length-test slen1 slen2) (- index1 offset1)))
379     (declare (fixnum index1 index2))
380     (setq char1 (schar string1 index1))
381     (setq char2 (schar string2 index2))
382     (if (not (char-equal char1 char2))
383     (if ,character-test
384     (return (- index1 offset1))
385     (return ()))))))))
386 ram 1.1
387     ) ; eval-when
388    
389     (defun string-lessp* (string1 string2 start1 end1 start2 end2)
390     (declare (fixnum start1 start2))
391     (string-less-greater-equal t nil))
392    
393     (defun string-greaterp* (string1 string2 start1 end1 start2 end2)
394     (declare (fixnum start1 start2))
395     (string-less-greater-equal nil nil))
396    
397     (defun string-not-lessp* (string1 string2 start1 end1 start2 end2)
398     (declare (fixnum start1 start2))
399     (string-less-greater-equal nil t))
400    
401     (defun string-not-greaterp* (string1 string2 start1 end1 start2 end2)
402     (declare (fixnum start1 start2))
403     (string-less-greater-equal t t))
404    
405     (defun string-lessp (string1 string2 &key (start1 0) end1 (start2 0) end2)
406     "Given two strings, if the first string is lexicographically less than
407     the second string, returns the longest common prefix (using char-equal)
408     of the two strings. Otherwise, returns ()."
409     (string-lessp* string1 string2 start1 end1 start2 end2))
410    
411     (defun string-greaterp (string1 string2 &key (start1 0) end1 (start2 0) end2)
412     "Given two strings, if the first string is lexicographically greater than
413     the second string, returns the longest common prefix (using char-equal)
414     of the two strings. Otherwise, returns ()."
415     (string-greaterp* string1 string2 start1 end1 start2 end2))
416    
417     (defun string-not-lessp (string1 string2 &key (start1 0) end1 (start2 0) end2)
418     "Given two strings, if the first string is lexicographically greater
419     than or equal to the second string, returns the longest common prefix
420     (using char-equal) of the two strings. Otherwise, returns ()."
421     (string-not-lessp* string1 string2 start1 end1 start2 end2))
422    
423     (defun string-not-greaterp (string1 string2 &key (start1 0) end1 (start2 0)
424     end2)
425     "Given two strings, if the first string is lexicographically less than
426     or equal to the second string, returns the longest common prefix
427     (using char-equal) of the two strings. Otherwise, returns ()."
428     (string-not-greaterp* string1 string2 start1 end1 start2 end2))
429    
430    
431 dtc 1.9 (defun make-string (count &key element-type ((:initial-element fill-char)))
432 ram 1.1 "Given a character count and an optional fill character, makes and returns
433     a new string Count long filled with the fill character."
434 emarsden 1.12 (declare (type fixnum count))
435     (assert (subtypep element-type 'character))
436 ram 1.1 (if fill-char
437     (do ((i 0 (1+ i))
438     (string (make-string count)))
439     ((= i count) string)
440     (declare (fixnum i))
441     (setf (schar string i) fill-char))
442     (make-string count)))
443    
444     (defun string-upcase (string &key (start 0) end)
445     "Given a string, returns a new string that is a copy of it with
446     all lower case alphabetic characters converted to uppercase."
447     (declare (fixnum start))
448 ram 1.5 (let* ((string (if (stringp string) string (string string)))
449     (slen (length string)))
450 ram 1.4 (declare (fixnum slen))
451 ram 1.1 (with-one-string string start end offset
452     (let ((offset-slen (+ slen offset))
453     (newstring (make-string slen)))
454     (declare (fixnum offset-slen))
455     (do ((index offset (1+ index))
456     (new-index 0 (1+ new-index)))
457     ((= index start))
458     (declare (fixnum index new-index))
459     (setf (schar newstring new-index) (schar string index)))
460     (do ((index start (1+ index))
461     (new-index (- start offset) (1+ new-index)))
462     ((= index (the fixnum end)))
463     (declare (fixnum index new-index))
464 rtoy 1.12.30.6 (multiple-value-bind (code wide) (codepoint string index)
465     (when wide (incf index))
466     ;; Handle ASCII specially because this is called early in
467     ;; initialization, before unidata is available.
468     (cond ((< 96 code 123) (decf code 32))
469     ((> code 127) (setq code (unicode-upper code))))
470     ;;@@ WARNING: this may, in theory, need to extend newstring
471     ;; but that never actually occurs as of Unicode 5.1.0,
472     ;; so I'm just going to ignore it for now...
473     (multiple-value-bind (hi lo) (surrogates code)
474 rtoy 1.12.30.8 (setf (schar newstring new-index) hi)
475 rtoy 1.12.30.6 (when lo
476 rtoy 1.12.30.8 (setf (schar newstring (incf new-index)) lo)))))
477 rtoy 1.12.30.4 ;;@@ WARNING: see above
478 ram 1.1 (do ((index end (1+ index))
479     (new-index (- (the fixnum end) offset) (1+ new-index)))
480     ((= index offset-slen))
481     (declare (fixnum index new-index))
482     (setf (schar newstring new-index) (schar string index)))
483     newstring))))
484    
485     (defun string-downcase (string &key (start 0) end)
486     "Given a string, returns a new string that is a copy of it with
487     all upper case alphabetic characters converted to lowercase."
488     (declare (fixnum start))
489 ram 1.5 (let* ((string (if (stringp string) string (string string)))
490     (slen (length string)))
491 ram 1.4 (declare (fixnum slen))
492 ram 1.1 (with-one-string string start end offset
493     (let ((offset-slen (+ slen offset))
494     (newstring (make-string slen)))
495     (declare (fixnum offset-slen))
496     (do ((index offset (1+ index))
497     (new-index 0 (1+ new-index)))
498     ((= index start))
499     (declare (fixnum index new-index))
500     (setf (schar newstring new-index) (schar string index)))
501     (do ((index start (1+ index))
502     (new-index (- start offset) (1+ new-index)))
503     ((= index (the fixnum end)))
504     (declare (fixnum index new-index))
505 rtoy 1.12.30.6 (multiple-value-bind (code wide) (codepoint string index)
506     (when wide (incf index))
507     ;; Handle ASCII specially because this is called early in
508     ;; initialization, before unidata is available.
509     (cond ((< 64 code 91) (incf code 32))
510     ((> code 127) (setq code (unicode-lower code))))
511     ;;@@ WARNING: this may, in theory, need to extend newstring
512     ;; but that never actually occurs as of Unicode 5.1.0,
513     ;; so I'm just going to ignore it for now...
514     (multiple-value-bind (hi lo) (surrogates code)
515 rtoy 1.12.30.8 (setf (schar newstring new-index) hi)
516 rtoy 1.12.30.6 (when lo
517 rtoy 1.12.30.8 (setf (schar newstring (incf new-index)) lo)))))
518 rtoy 1.12.30.4 ;;@@ WARNING: see above
519 ram 1.1 (do ((index end (1+ index))
520     (new-index (- (the fixnum end) offset) (1+ new-index)))
521     ((= index offset-slen))
522     (declare (fixnum index new-index))
523     (setf (schar newstring new-index) (schar string index)))
524     newstring))))
525    
526     (defun string-capitalize (string &key (start 0) end)
527     "Given a string, returns a copy of the string with the first
528     character of each ``word'' converted to upper-case, and remaining
529     chars in the word converted to lower case. A ``word'' is defined
530     to be a string of case-modifiable characters delimited by
531     non-case-modifiable chars."
532     (declare (fixnum start))
533 ram 1.5 (let* ((string (if (stringp string) string (string string)))
534     (slen (length string)))
535 ram 1.4 (declare (fixnum slen))
536 ram 1.1 (with-one-string string start end offset
537     (let ((offset-slen (+ slen offset))
538     (newstring (make-string slen)))
539     (declare (fixnum offset-slen))
540     (do ((index offset (1+ index))
541     (new-index 0 (1+ new-index)))
542     ((= index start))
543     (declare (fixnum index new-index))
544     (setf (schar newstring new-index) (schar string index)))
545     (do ((index start (1+ index))
546     (new-index (- start offset) (1+ new-index))
547     (newword t)
548     (char ()))
549     ((= index (the fixnum end)))
550     (declare (fixnum index new-index))
551     (setq char (schar string index))
552     (cond ((not (alphanumericp char))
553     (setq newword t))
554     (newword
555     ;;char is first case-modifiable after non-case-modifiable
556 rtoy 1.12.30.2 (setq char (char-titlecase char))
557 ram 1.1 (setq newword ()))
558     ;;char is case-modifiable, but not first
559     (t (setq char (char-downcase char))))
560     (setf (schar newstring new-index) char))
561     (do ((index end (1+ index))
562     (new-index (- (the fixnum end) offset) (1+ new-index)))
563     ((= index offset-slen))
564     (declare (fixnum index new-index))
565     (setf (schar newstring new-index) (schar string index)))
566     newstring))))
567    
568     (defun nstring-upcase (string &key (start 0) end)
569     "Given a string, returns that string with all lower case alphabetic
570     characters converted to uppercase."
571     (declare (fixnum start))
572 ram 1.4 (let ((save-header string))
573 ram 1.1 (with-one-string string start end offset
574     (do ((index start (1+ index)))
575     ((= index (the fixnum end)))
576     (declare (fixnum index))
577 rtoy 1.12.30.6 (multiple-value-bind (code wide) (codepoint string index)
578     (declare (ignore wide))
579     ;; Handle ASCII specially because this is called early in
580     ;; initialization, before unidata is available.
581     (cond ((< 96 code 123) (decf code 32))
582     ((> code 127) (setq code (unicode-upper code))))
583     ;;@@ WARNING: this may, in theory, need to extend string
584     ;; (which, obviously, we can't do here. Unless
585     ;; STRING is adjustable, maybe)
586     ;; but that never actually occurs as of Unicode 5.1.0,
587     ;; so I'm just going to ignore it for now...
588     (multiple-value-bind (hi lo) (surrogates code)
589 rtoy 1.12.30.8 (setf (schar string index) hi)
590 rtoy 1.12.30.6 (when lo
591 rtoy 1.12.30.8 (setf (schar string (incf index)) lo))))))
592 ram 1.1 save-header))
593    
594     (defun nstring-downcase (string &key (start 0) end)
595     "Given a string, returns that string with all upper case alphabetic
596     characters converted to lowercase."
597     (declare (fixnum start))
598 ram 1.4 (let ((save-header string))
599 ram 1.1 (with-one-string string start end offset
600     (do ((index start (1+ index)))
601     ((= index (the fixnum end)))
602     (declare (fixnum index))
603 rtoy 1.12.30.6 (multiple-value-bind (code wide) (codepoint string index)
604     (declare (ignore wide))
605     (cond ((< 64 code 91) (incf code 32))
606     ((> code 127) (setq code (unicode-lower code))))
607     ;;@@ WARNING: this may, in theory, need to extend string
608     ;; (which, obviously, we can't do here. Unless
609     ;; STRING is adjustable, maybe)
610     ;; but that never actually occurs as of Unicode 5.1.0,
611     ;; so I'm just going to ignore it for now...
612     (multiple-value-bind (hi lo) (surrogates code)
613 rtoy 1.12.30.8 (setf (schar string index) hi)
614 rtoy 1.12.30.6 (when lo
615 rtoy 1.12.30.8 (setf (schar string (incf index)) lo))))))
616 wlott 1.6 save-header))
617 ram 1.1
618     (defun nstring-capitalize (string &key (start 0) end)
619     "Given a string, returns that string with the first
620     character of each ``word'' converted to upper-case, and remaining
621     chars in the word converted to lower case. A ``word'' is defined
622     to be a string of case-modifiable characters delimited by
623     non-case-modifiable chars."
624     (declare (fixnum start))
625 ram 1.4 (let ((save-header string))
626 ram 1.1 (with-one-string string start end offset
627     (do ((index start (1+ index))
628     (newword t)
629     (char ()))
630     ((= index (the fixnum end)))
631     (declare (fixnum index))
632     (setq char (schar string index))
633     (cond ((not (alphanumericp char))
634     (setq newword t))
635     (newword
636     ;;char is first case-modifiable after non-case-modifiable
637 rtoy 1.12.30.2 (setf (schar string index) (char-titlecase char))
638 ram 1.1 (setq newword ()))
639     (t
640     (setf (schar string index) (char-downcase char))))))
641     save-header))
642    
643     (defun string-left-trim (char-bag string)
644     "Given a set of characters (a list or string) and a string, returns
645     a copy of the string with the characters in the set removed from the
646     left end."
647     (with-string string
648     (do ((index start (1+ index)))
649     ((or (= index (the fixnum end))
650     (not (find (schar string index) char-bag)))
651     (subseq (the simple-string string) index end))
652     (declare (fixnum index)))))
653    
654     (defun string-right-trim (char-bag string)
655     "Given a set of characters (a list or string) and a string, returns
656     a copy of the string with the characters in the set removed from the
657     right end."
658     (with-string string
659     (do ((index (1- (the fixnum end)) (1- index)))
660     ((or (< index start) (not (find (schar string index) char-bag)))
661     (subseq (the simple-string string) start (1+ index)))
662     (declare (fixnum index)))))
663    
664     (defun string-trim (char-bag string)
665     "Given a set of characters (a list or string) and a string, returns a
666     copy of the string with the characters in the set removed from both
667     ends."
668     (with-string string
669     (let* ((left-end (do ((index start (1+ index)))
670     ((or (= index (the fixnum end))
671     (not (find (schar string index) char-bag)))
672     index)
673     (declare (fixnum index))))
674     (right-end (do ((index (1- (the fixnum end)) (1- index)))
675     ((or (< index left-end)
676     (not (find (schar string index) char-bag)))
677     (1+ index))
678     (declare (fixnum index)))))
679     (subseq (the simple-string string) left-end right-end))))
680 rtoy 1.12.30.3
681 rtoy 1.12.30.4 (declaim (inline %glyph-f %glyph-b))
682     (defun %glyph-f (string index)
683     (declare (optimize (speed 3) (space 0) (safety 0) (debug 0))
684     (type simple-string string) (type kernel:index index))
685 rtoy 1.12.30.10 (let* ((prev 0)
686     (l (length string))
687     (c (codepoint string index l))
688     (n (+ index (if (> c #xFFFF) 2 1))))
689     (declare (type (integer 0 #x10FFFF) c) (type kernel:index l n))
690     (loop while (< n l) do
691     (let* ((c (codepoint string n l))
692     (d (the (unsigned-byte 8) (unicode-combining-class c))))
693     (when (or (zerop d) (< d prev))
694     (return))
695     (setq prev d)
696     (incf n (if (> c #xFFFF) 2 1))))
697     n))
698 rtoy 1.12.30.4
699     (defun %glyph-b (string index)
700     (declare (optimize (speed 3) (space 0) (safety 0) (debug 0))
701     (type simple-string string) (type kernel:index index))
702 rtoy 1.12.30.11 (let* ((prev 255)
703     (n (1- index)))
704     (declare (type kernel:index n))
705     (loop until (< n 0) do
706     (let* ((c (codepoint string n 0))
707     (d (the (unsigned-byte 8) (unicode-combining-class c))))
708     (cond ((zerop d) (return))
709     ((> d prev) (incf n (if (> c #xFFFF) 2 1)) (return)))
710     (setq prev d)
711     (decf n (if (> c #xFFFF) 2 1))))
712     n))
713 rtoy 1.12.30.4
714     (defun glyph (string index &key (from-end nil))
715 rtoy 1.12.30.3 "GLYPH returns the glyph at the indexed position in a string, and the
716 rtoy 1.12.30.4 position of the next glyph (or NIL) as a second value. A glyph is
717     a substring consisting of the character at INDEX followed by all
718     subsequent combining characters."
719 rtoy 1.12.30.3 (declare (type simple-string string) (type kernel:index index))
720     #-unicode
721     (char string index)
722     #+unicode
723     (with-array-data ((string string) (start) (end))
724     (declare (ignore start end))
725 rtoy 1.12.30.4 (let ((n (if from-end (%glyph-b string index) (%glyph-f string index))))
726     (if from-end
727     (values (subseq string n index) (and (> n 0) n))
728     (values (subseq string index n) (and (< n (length string)) n))))))
729 rtoy 1.12.30.3
730 rtoy 1.12.30.4 (defun sglyph (string index &key (from-end nil))
731 rtoy 1.12.30.3 "SGLYPH returns the glyph at the indexed position, the same as GLYPH,
732 rtoy 1.12.30.4 except that the string must be a simple-string"
733 rtoy 1.12.30.3 (declare (type simple-string string) (type kernel:index index))
734     #-unicode
735     (schar string index)
736     #+unicode
737 rtoy 1.12.30.4 (let ((n (if from-end (%glyph-b string index) (%glyph-f string index))))
738     (if from-end
739     (values (subseq string n index) (and (> n 0) n))
740     (values (subseq string index n) (and (< n (length string)) n)))))
741    
742 rtoy 1.12.30.9 (defun decompose (string &optional (compatibility t))
743     (declare (type string string))
744     (let ((result (make-string (cond ((< (length string) 40)
745     (* 5 (length string)))
746     ((< (length string) 4096)
747     (* 2 (length string)))
748     (t (round (length string) 5/6)))))
749 rtoy 1.12.30.10 (fillptr 0))
750     (declare (type kernel:index fillptr))
751 rtoy 1.12.30.9 (labels ((rec (string)
752     (declare (type simple-string string))
753     (do ((i 0 (1+ i)))
754     ((= i (length string)))
755     (declare (type kernel:index i))
756     (multiple-value-bind (code wide) (codepoint string i)
757     (when wide (incf i))
758     (let ((decomp (unicode-decomp code compatibility)))
759     (if decomp (rec decomp) (out code))))))
760     (out (code)
761     (multiple-value-bind (hi lo) (surrogates code)
762     (outch hi)
763     (when lo
764 rtoy 1.12.30.10 (outch lo))
765     (let ((cc (unicode-combining-class code)))
766     (unless (zerop cc)
767     (order lo cc (- fillptr (if lo 2 1)))))))
768 rtoy 1.12.30.9 (outch (char)
769     (when (= fillptr (length result))
770     (let ((tmp (make-string (round (length result) 5/6))))
771     (replace tmp result)
772     (setq result tmp)))
773     (setf (schar result fillptr) char)
774     (incf fillptr))
775 rtoy 1.12.30.10 (order (wide1 cc last)
776     (loop until (minusp last) do
777     (multiple-value-bind (code2 wide2) (codepoint result last)
778     (let ((cc2 (unicode-combining-class code2)))
779     (cond ((zerop cc2) (return))
780     ((> cc2 cc)
781     (case (+ (if wide2 2 0) (if wide1 1 0))
782     (0 (rotatef (schar result last)
783     (schar result (1+ last))))
784     (1 (rotatef (schar result last)
785     (schar result (+ last 1))
786     (schar result (+ last 2))))
787     (2 (rotatef (schar result last)
788     (schar result (1- last))
789     (schar result (1+ last))))
790     (3 (rotatef (schar result last)
791     (schar result (+ last 2)))
792     (rotatef (schar result (1- last))
793     (schar result (1+ last)))))
794     (decf last (if wide2 2 1)))
795     (t (return))))))))
796 rtoy 1.12.30.9 (with-array-data ((string string) (start) (end))
797     (declare (ignore start end))
798     (rec string))
799     (shrink-vector result fillptr))))
800    
801 rtoy 1.12.30.4 (defun string-to-nfd (string)
802 rtoy 1.12.30.9 (decompose string nil))
803 rtoy 1.12.30.4
804     (defun string-to-nfkd (string)
805 rtoy 1.12.30.9 (decompose string t))
806 rtoy 1.12.30.4
807 rtoy 1.12.30.12 #+unicode
808 rtoy 1.12.30.4 (defun string-to-nfc (string)
809     ;;@@ Implement me
810 rtoy 1.12.30.12 ;; must return a simple-string for the package machinery
811     (if (simple-string-p string) string (coerce string 'simple-string)))
812    
813     #-unicode ;; Needed by package.lisp
814     (defun string-to-nfc (string)
815     (if (simple-string-p string) string (coerce string 'simple-string)))
816 rtoy 1.12.30.4
817     (defun string-to-nfkc (string)
818     ;;@@ Implement me
819 rtoy 1.12.30.12 (if (simple-string-p string) string (coerce string 'simple-string)))

  ViewVC Help
Powered by ViewVC 1.1.5