/[cmucl]/src/code/string.lisp
ViewVC logotype

Contents of /src/code/string.lisp

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.12.30.9 - (hide annotations)
Sat May 2 11:54:37 2009 UTC (4 years, 11 months ago) by rtoy
Branch: unicode-utf16-extfmt-branch
Changes since 1.12.30.8: +73 -5 lines
Updates from Paul.  With these changes, we pass the Unicode
normalization test suite successfully for NFD and NFKD.

unidata.lisp:
o Implement algorithmic decomposition of Hangul.

string.lisp:
o Implement Unicode normalization forms NFD and NFKD.
1 ram 1.1 ;;; -*- Log: code.log; Package: Lisp -*-
2     ;;;
3     ;;; **********************************************************************
4 ram 1.3 ;;; This code was written as part of the CMU Common Lisp project at
5     ;;; Carnegie Mellon University, and has been placed in the public domain.
6     ;;;
7     (ext:file-comment
8 rtoy 1.12.30.9 "$Header: /tiger/var/lib/cvsroots/cmucl/src/code/string.lisp,v 1.12.30.9 2009/05/02 11:54:37 rtoy Exp $")
9 ram 1.3 ;;;
10 ram 1.1 ;;; **********************************************************************
11     ;;;
12 pw 1.11 ;;; Functions to implement strings for CMU Common Lisp
13 ram 1.1 ;;; Written by David Dill
14 ram 1.4 ;;; Rewritten by Skef Wholey, Bill Chiles and Rob MacLachlan.
15 ram 1.1 ;;;
16     ;;; ****************************************************************
17     ;;;
18 ram 1.4 (in-package "LISP")
19 rtoy 1.12.30.3 (export '(char schar glyph sglyph string
20 ram 1.1 string= string-equal string< string> string<= string>= string/=
21     string-lessp string-greaterp string-not-lessp string-not-greaterp
22     string-not-equal
23 rtoy 1.12.30.4 string-to-nfd string-to-nfkd string-to-nfc string-to-nfkc
24 ram 1.1 make-string
25     string-trim string-left-trim string-right-trim
26     string-upcase
27     string-downcase string-capitalize nstring-upcase nstring-downcase
28     nstring-capitalize))
29    
30    
31 rtoy 1.12.30.8 (declaim (inline surrogates-to-codepoint codepoint surrogates))
32 rtoy 1.12.30.7
33 rtoy 1.12.30.8 (defun surrogates-to-codepoint (hi lo)
34     "Convert the given Hi and Lo surrogate characters to the
35     corresponding codepoint value"
36     (declare (type character hi lo))
37     (+ (ash (- (the (integer #xD800 #xDBFF) (char-code hi)) #xD800) 10)
38     (the (integer #xDC00 #xDFFF) (char-code lo)) #x2400))
39 rtoy 1.12.30.7
40 rtoy 1.12.30.6 (defun codepoint (string i &optional (end (length string)))
41 rtoy 1.12.30.7 "Return the codepoint value from String at position I. If that
42     position is a surrogate, it is combined with either the previous or
43     following character (when possible) to compute the codepoint. The
44     second return value is NIL if the position is not a surrogate pair.
45     Otherwise +1 or -1 is returned if the position is the high or low
46     surrogate value, respectively."
47 rtoy 1.12.30.6 (declare (type simple-string string) (type kernel:index i end))
48     (let ((code (char-code (schar string i))))
49     (cond ((and (<= #xD800 code #xDBFF) (< (1+ i) end))
50     (let ((tmp (char-code (schar string (1+ i)))))
51     (if (<= #xDC00 tmp #xDFFF)
52 rtoy 1.12.30.8 (values (surrogates-to-codepoint (code-char code) (code-char tmp)) +1)
53 rtoy 1.12.30.6 (values code nil))))
54     ((and (<= #xDC00 code #xDFFF) (> i 0))
55     (let ((tmp (char-code (schar string (1- i)))))
56     (if (<= #xD800 tmp #xDBFF)
57 rtoy 1.12.30.8 (values (surrogates-to-codepoint (code-char tmp) (code-char code)) -1)
58 rtoy 1.12.30.6 (values code nil))))
59     (t (values code nil)))))
60    
61     (defun surrogates (codepoint)
62 rtoy 1.12.30.8 "Return the high and low surrogate characters for Codepoint. If
63     Codepoint is in the BMP, the first return value is the corresponding
64     character and the second is NIL."
65 rtoy 1.12.30.6 (declare (type (integer 0 #x10FFFF) codepoint))
66     (if (< codepoint #x10000)
67 rtoy 1.12.30.8 (values (code-char codepoint) nil)
68 rtoy 1.12.30.6 (let* ((tmp (- codepoint #x10000))
69     (hi (logior (ldb (byte 10 10) tmp) #xD800))
70     (lo (logior (ldb (byte 10 0) tmp) #xDC00)))
71 rtoy 1.12.30.8 (values (code-char hi) (code-char lo)))))
72 rtoy 1.12.30.6
73    
74 ram 1.1 (defun string (X)
75     "Coerces X into a string. If X is a string, X is returned. If X is a
76     symbol, X's pname is returned. If X is a character then a one element
77     string containing that character is returned. If X cannot be coerced
78     into a string, an error occurs."
79     (cond ((stringp x) x)
80     ((symbolp x) (symbol-name x))
81     ((characterp x)
82     (let ((res (make-string 1)))
83     (setf (schar res 0) x) res))
84     (t
85 pw 1.11 (error 'simple-type-error
86     :datum x
87     :expected-type '(or string symbol character)
88     :format-control "~S cannot be coerced to a string."
89     :format-arguments (list x)))))
90 ram 1.1
91     ;;; With-One-String is used to set up some string hacking things. The keywords
92     ;;; are parsed, and the string is hacked into a simple-string.
93    
94     (eval-when (compile)
95    
96     (defmacro with-one-string (string start end cum-offset &rest forms)
97 ram 1.4 `(let ((,string (if (stringp ,string) ,string (string ,string))))
98 pw 1.10 ;; Optimizer may prove STRING is one.
99     (declare (optimize (ext:inhibit-warnings 3)))
100 ram 1.4 (with-array-data ((,string ,string :offset-var ,cum-offset)
101     (,start ,start)
102     (,end (or ,end (length (the vector ,string)))))
103     ,@forms)))
104 ram 1.1
105     )
106    
107     ;;; With-String is like With-One-String, but doesn't parse keywords.
108    
109     (eval-when (compile)
110    
111     (defmacro with-string (string &rest forms)
112 ram 1.4 `(let ((,string (if (stringp ,string) ,string (string ,string))))
113     (with-array-data ((,string ,string)
114     (start)
115     (end (length (the vector ,string))))
116     ,@forms)))
117 ram 1.1
118     )
119    
120     ;;; With-Two-Strings is used to set up string comparison operations. The
121     ;;; keywords are parsed, and the strings are hacked into simple-strings.
122    
123     (eval-when (compile)
124    
125     (defmacro with-two-strings (string1 string2 start1 end1 cum-offset-1
126     start2 end2 &rest forms)
127 ram 1.4 `(let ((,string1 (if (stringp ,string1) ,string1 (string ,string1)))
128     (,string2 (if (stringp ,string2) ,string2 (string ,string2))))
129     (with-array-data ((,string1 ,string1 :offset-var ,cum-offset-1)
130     (,start1 ,start1)
131     (,end1 (or ,end1 (length (the vector ,string1)))))
132     (with-array-data ((,string2 ,string2)
133     (,start2 ,start2)
134     (,end2 (or ,end2 (length (the vector ,string2)))))
135     ,@forms))))
136 ram 1.1
137     )
138 wlott 1.2
139 ram 1.1
140     (defun char (string index)
141     "Given a string and a non-negative integer index less than the length of
142     the string, returns the character object representing the character at
143     that position in the string."
144 ram 1.4 (declare (optimize (safety 1)))
145 ram 1.1 (char string index))
146    
147     (defun %charset (string index new-el)
148 ram 1.4 (declare (optimize (safety 1)))
149 ram 1.1 (setf (char string index) new-el))
150    
151     (defun schar (string index)
152     "SCHAR returns the character object at an indexed position in a string
153     just as CHAR does, except the string must be a simple-string."
154 ram 1.4 (declare (optimize (safety 1)))
155 ram 1.1 (schar string index))
156    
157     (defun %scharset (string index new-el)
158 ram 1.4 (declare (optimize (safety 1)))
159 ram 1.1 (setf (schar string index) new-el))
160    
161     (defun string=* (string1 string2 start1 end1 start2 end2)
162 ram 1.4 (with-two-strings string1 string2 start1 end1 offset1 start2 end2
163     (not (%sp-string-compare string1 start1 end1 string2 start2 end2))))
164 ram 1.1
165    
166     (defun string/=* (string1 string2 start1 end1 start2 end2)
167 ram 1.4 (with-two-strings string1 string2 start1 end1 offset1 start2 end2
168     (let ((comparison (%sp-string-compare string1 start1 end1
169     string2 start2 end2)))
170     (if comparison (- (the fixnum comparison) offset1)))))
171 ram 1.1
172     (eval-when (compile eval)
173    
174     ;;; Lessp is true if the desired expansion is for string<* or string<=*.
175     ;;; Equalp is true if the desired expansion is for string<=* or string>=*.
176     (defmacro string<>=*-body (lessp equalp)
177     (let ((offset1 (gensym)))
178 ram 1.4 `(with-two-strings string1 string2 start1 end1 ,offset1 start2 end2
179     (let ((index (%sp-string-compare string1 start1 end1
180     string2 start2 end2)))
181     (if index
182 ram 1.8 (cond ((= (the fixnum index) (the fixnum end1))
183     ,(if lessp
184     `(- (the fixnum index) ,offset1)
185     `nil))
186     ((= (+ (the fixnum index) (- start2 start1))
187     (the fixnum end2))
188     ,(if lessp
189     `nil
190 rtoy 1.12.30.1 `(- (the fixnum index) ,offset1)))
191     #-unicode
192 ram 1.4 ((,(if lessp 'char< 'char>)
193     (schar string1 index)
194     (schar string2 (+ (the fixnum index) (- start2 start1))))
195     (- (the fixnum index) ,offset1))
196 rtoy 1.12.30.1 #-unicode
197     (t nil)
198     #+unicode
199     (t
200     ;; Compare in code point order. See
201     ;; http://icu-project.org/docs/papers/utf16_code_point_order.html
202     (flet ((fixup (code)
203     (if (>= code #xe000)
204     (- code #x800)
205     (+ code #x2000))))
206     (declare (inline fixup))
207     (let* ((c1 (char-code (schar string1 index)))
208     (c2 (char-code (schar string2 (+ (the fixnum index) (- start2 start1))))))
209     (cond ((and (>= c1 #xd800)
210     (>= c2 #xd800))
211     (let ((fix-c1 (fixup c1))
212     (fix-c2 (fixup c2)))
213     (if (,(if lessp '< '>) fix-c1 fix-c2)
214     (- (the fixnum index) ,offset1)
215     nil)))
216     (t
217     (if (,(if lessp '< '>) c1 c2)
218     (- (the fixnum index) ,offset1)
219     nil)))))))
220 ram 1.4 ,(if equalp `(- (the fixnum end1) ,offset1) 'nil))))))
221 ram 1.1 ) ; eval-when
222    
223     (defun string<* (string1 string2 start1 end1 start2 end2)
224     (declare (fixnum start1 start2))
225     (string<>=*-body t nil))
226    
227     (defun string>* (string1 string2 start1 end1 start2 end2)
228     (declare (fixnum start1 start2))
229     (string<>=*-body nil nil))
230    
231     (defun string<=* (string1 string2 start1 end1 start2 end2)
232     (declare (fixnum start1 start2))
233     (string<>=*-body t t))
234    
235     (defun string>=* (string1 string2 start1 end1 start2 end2)
236     (declare (fixnum start1 start2))
237     (string<>=*-body nil t))
238    
239    
240    
241     (defun string< (string1 string2 &key (start1 0) end1 (start2 0) end2)
242     "Given two strings, if the first string is lexicographically less than
243     the second string, returns the longest common prefix (using char=)
244     of the two strings. Otherwise, returns ()."
245     (string<* string1 string2 start1 end1 start2 end2))
246    
247     (defun string> (string1 string2 &key (start1 0) end1 (start2 0) end2)
248     "Given two strings, if the first string is lexicographically greater than
249     the second string, returns the longest common prefix (using char=)
250     of the two strings. Otherwise, returns ()."
251     (string>* string1 string2 start1 end1 start2 end2))
252    
253    
254     (defun string<= (string1 string2 &key (start1 0) end1 (start2 0) end2)
255     "Given two strings, if the first string is lexicographically less than
256     or equal to the second string, returns the longest common prefix
257     (using char=) of the two strings. Otherwise, returns ()."
258     (string<=* string1 string2 start1 end1 start2 end2))
259    
260     (defun string>= (string1 string2 &key (start1 0) end1 (start2 0) end2)
261     "Given two strings, if the first string is lexicographically greater
262     than or equal to the second string, returns the longest common prefix
263     (using char=) of the two strings. Otherwise, returns ()."
264     (string>=* string1 string2 start1 end1 start2 end2))
265    
266     (defun string= (string1 string2 &key (start1 0) end1 (start2 0) end2)
267     "Given two strings (string1 and string2), and optional integers start1,
268     start2, end1 and end2, compares characters in string1 to characters in
269     string2 (using char=)."
270     (string=* string1 string2 start1 end1 start2 end2))
271    
272     (defun string/= (string1 string2 &key (start1 0) end1 (start2 0) end2)
273     "Given two strings, if the first string is not lexicographically equal
274     to the second string, returns the longest common prefix (using char=)
275     of the two strings. Otherwise, returns ()."
276     (string/=* string1 string2 start1 end1 start2 end2))
277    
278    
279     (eval-when (compile eval)
280    
281     ;;; STRING-NOT-EQUAL-LOOP is used to generate character comparison loops for
282     ;;; STRING-EQUAL and STRING-NOT-EQUAL.
283     (defmacro string-not-equal-loop (end end-value
284     &optional (abort-value nil abortp))
285     (declare (fixnum end))
286     (let ((end-test (if (= end 1)
287     `(= index1 (the fixnum end1))
288     `(= index2 (the fixnum end2)))))
289     `(do ((index1 start1 (1+ index1))
290     (index2 start2 (1+ index2)))
291     (,(if abortp
292     end-test
293     `(or ,end-test
294     (not (char-equal (schar string1 index1)
295     (schar string2 index2)))))
296     ,end-value)
297     (declare (fixnum index1 index2))
298     ,@(if abortp
299     `((if (not (char-equal (schar string1 index1)
300     (schar string2 index2)))
301     (return ,abort-value)))))))
302    
303     ) ; eval-when
304    
305     (defun string-equal (string1 string2 &key (start1 0) end1 (start2 0) end2)
306     "Given two strings (string1 and string2), and optional integers start1,
307     start2, end1 and end2, compares characters in string1 to characters in
308     string2 (using char-equal)."
309     (declare (fixnum start1 start2))
310 ram 1.4 (with-two-strings string1 string2 start1 end1 offset1 start2 end2
311     (let ((slen1 (- (the fixnum end1) start1))
312     (slen2 (- (the fixnum end2) start2)))
313     (declare (fixnum slen1 slen2))
314     (if (or (minusp slen1) (minusp slen2))
315     ;;prevent endless looping later.
316     (error "Improper bounds for string comparison."))
317     (if (= slen1 slen2)
318     ;;return () immediately if lengths aren't equal.
319     (string-not-equal-loop 1 t nil)))))
320 ram 1.1
321     (defun string-not-equal (string1 string2 &key (start1 0) end1 (start2 0) end2)
322     "Given two strings, if the first string is not lexicographically equal
323     to the second string, returns the longest common prefix (using char-equal)
324     of the two strings. Otherwise, returns ()."
325 ram 1.4 (with-two-strings string1 string2 start1 end1 offset1 start2 end2
326     (let ((slen1 (- end1 start1))
327     (slen2 (- end2 start2)))
328     (declare (fixnum slen1 slen2))
329     (if (or (minusp slen1) (minusp slen2))
330     ;;prevent endless looping later.
331     (error "Improper bounds for string comparison."))
332     (cond ((or (minusp slen1) (or (minusp slen2)))
333     (error "Improper substring for comparison."))
334     ((= slen1 slen2)
335     (string-not-equal-loop 1 nil (- index1 offset1)))
336     ((< slen1 slen2)
337     (string-not-equal-loop 1 (- index1 offset1)))
338     (t
339     (string-not-equal-loop 2 (- index1 offset1)))))))
340 ram 1.1
341    
342    
343     (eval-when (compile eval)
344    
345     ;;; STRING-LESS-GREATER-EQUAL-TESTS returns a test on the lengths of string1
346     ;;; and string2 and a test on the current characters from string1 and string2
347     ;;; for the following macro.
348     (defun string-less-greater-equal-tests (lessp equalp)
349     (if lessp
350     (if equalp
351     ;; STRING-NOT-GREATERP
352     (values '<= `(not (char-greaterp char1 char2)))
353     ;; STRING-LESSP
354     (values '< `(char-lessp char1 char2)))
355     (if equalp
356     ;; STRING-NOT-LESSP
357     (values '>= `(not (char-lessp char1 char2)))
358     ;; STRING-GREATERP
359     (values '> `(char-greaterp char1 char2)))))
360    
361     (defmacro string-less-greater-equal (lessp equalp)
362     (multiple-value-bind (length-test character-test)
363     (string-less-greater-equal-tests lessp equalp)
364 ram 1.4 `(with-two-strings string1 string2 start1 end1 offset1 start2 end2
365     (let ((slen1 (- (the fixnum end1) start1))
366     (slen2 (- (the fixnum end2) start2)))
367     (declare (fixnum slen1 slen2))
368     (if (or (minusp slen1) (minusp slen2))
369     ;;prevent endless looping later.
370     (error "Improper bounds for string comparison."))
371     (do ((index1 start1 (1+ index1))
372     (index2 start2 (1+ index2))
373     (char1)
374     (char2))
375     ((or (= index1 (the fixnum end1)) (= index2 (the fixnum end2)))
376     (if (,length-test slen1 slen2) (- index1 offset1)))
377     (declare (fixnum index1 index2))
378     (setq char1 (schar string1 index1))
379     (setq char2 (schar string2 index2))
380     (if (not (char-equal char1 char2))
381     (if ,character-test
382     (return (- index1 offset1))
383     (return ()))))))))
384 ram 1.1
385     ) ; eval-when
386    
387     (defun string-lessp* (string1 string2 start1 end1 start2 end2)
388     (declare (fixnum start1 start2))
389     (string-less-greater-equal t nil))
390    
391     (defun string-greaterp* (string1 string2 start1 end1 start2 end2)
392     (declare (fixnum start1 start2))
393     (string-less-greater-equal nil nil))
394    
395     (defun string-not-lessp* (string1 string2 start1 end1 start2 end2)
396     (declare (fixnum start1 start2))
397     (string-less-greater-equal nil t))
398    
399     (defun string-not-greaterp* (string1 string2 start1 end1 start2 end2)
400     (declare (fixnum start1 start2))
401     (string-less-greater-equal t t))
402    
403     (defun string-lessp (string1 string2 &key (start1 0) end1 (start2 0) end2)
404     "Given two strings, if the first string is lexicographically less than
405     the second string, returns the longest common prefix (using char-equal)
406     of the two strings. Otherwise, returns ()."
407     (string-lessp* string1 string2 start1 end1 start2 end2))
408    
409     (defun string-greaterp (string1 string2 &key (start1 0) end1 (start2 0) end2)
410     "Given two strings, if the first string is lexicographically greater than
411     the second string, returns the longest common prefix (using char-equal)
412     of the two strings. Otherwise, returns ()."
413     (string-greaterp* string1 string2 start1 end1 start2 end2))
414    
415     (defun string-not-lessp (string1 string2 &key (start1 0) end1 (start2 0) end2)
416     "Given two strings, if the first string is lexicographically greater
417     than or equal to the second string, returns the longest common prefix
418     (using char-equal) of the two strings. Otherwise, returns ()."
419     (string-not-lessp* string1 string2 start1 end1 start2 end2))
420    
421     (defun string-not-greaterp (string1 string2 &key (start1 0) end1 (start2 0)
422     end2)
423     "Given two strings, if the first string is lexicographically less than
424     or equal to the second string, returns the longest common prefix
425     (using char-equal) of the two strings. Otherwise, returns ()."
426     (string-not-greaterp* string1 string2 start1 end1 start2 end2))
427    
428    
429 dtc 1.9 (defun make-string (count &key element-type ((:initial-element fill-char)))
430 ram 1.1 "Given a character count and an optional fill character, makes and returns
431     a new string Count long filled with the fill character."
432 emarsden 1.12 (declare (type fixnum count))
433     (assert (subtypep element-type 'character))
434 ram 1.1 (if fill-char
435     (do ((i 0 (1+ i))
436     (string (make-string count)))
437     ((= i count) string)
438     (declare (fixnum i))
439     (setf (schar string i) fill-char))
440     (make-string count)))
441    
442     (defun string-upcase (string &key (start 0) end)
443     "Given a string, returns a new string that is a copy of it with
444     all lower case alphabetic characters converted to uppercase."
445     (declare (fixnum start))
446 ram 1.5 (let* ((string (if (stringp string) string (string string)))
447     (slen (length string)))
448 ram 1.4 (declare (fixnum slen))
449 ram 1.1 (with-one-string string start end offset
450     (let ((offset-slen (+ slen offset))
451     (newstring (make-string slen)))
452     (declare (fixnum offset-slen))
453     (do ((index offset (1+ index))
454     (new-index 0 (1+ new-index)))
455     ((= index start))
456     (declare (fixnum index new-index))
457     (setf (schar newstring new-index) (schar string index)))
458     (do ((index start (1+ index))
459     (new-index (- start offset) (1+ new-index)))
460     ((= index (the fixnum end)))
461     (declare (fixnum index new-index))
462 rtoy 1.12.30.6 (multiple-value-bind (code wide) (codepoint string index)
463     (when wide (incf index))
464     ;; Handle ASCII specially because this is called early in
465     ;; initialization, before unidata is available.
466     (cond ((< 96 code 123) (decf code 32))
467     ((> code 127) (setq code (unicode-upper code))))
468     ;;@@ WARNING: this may, in theory, need to extend newstring
469     ;; but that never actually occurs as of Unicode 5.1.0,
470     ;; so I'm just going to ignore it for now...
471     (multiple-value-bind (hi lo) (surrogates code)
472 rtoy 1.12.30.8 (setf (schar newstring new-index) hi)
473 rtoy 1.12.30.6 (when lo
474 rtoy 1.12.30.8 (setf (schar newstring (incf new-index)) lo)))))
475 rtoy 1.12.30.4 ;;@@ WARNING: see above
476 ram 1.1 (do ((index end (1+ index))
477     (new-index (- (the fixnum end) offset) (1+ new-index)))
478     ((= index offset-slen))
479     (declare (fixnum index new-index))
480     (setf (schar newstring new-index) (schar string index)))
481     newstring))))
482    
483     (defun string-downcase (string &key (start 0) end)
484     "Given a string, returns a new string that is a copy of it with
485     all upper case alphabetic characters converted to lowercase."
486     (declare (fixnum start))
487 ram 1.5 (let* ((string (if (stringp string) string (string string)))
488     (slen (length string)))
489 ram 1.4 (declare (fixnum slen))
490 ram 1.1 (with-one-string string start end offset
491     (let ((offset-slen (+ slen offset))
492     (newstring (make-string slen)))
493     (declare (fixnum offset-slen))
494     (do ((index offset (1+ index))
495     (new-index 0 (1+ new-index)))
496     ((= index start))
497     (declare (fixnum index new-index))
498     (setf (schar newstring new-index) (schar string index)))
499     (do ((index start (1+ index))
500     (new-index (- start offset) (1+ new-index)))
501     ((= index (the fixnum end)))
502     (declare (fixnum index new-index))
503 rtoy 1.12.30.6 (multiple-value-bind (code wide) (codepoint string index)
504     (when wide (incf index))
505     ;; Handle ASCII specially because this is called early in
506     ;; initialization, before unidata is available.
507     (cond ((< 64 code 91) (incf code 32))
508     ((> code 127) (setq code (unicode-lower code))))
509     ;;@@ WARNING: this may, in theory, need to extend newstring
510     ;; but that never actually occurs as of Unicode 5.1.0,
511     ;; so I'm just going to ignore it for now...
512     (multiple-value-bind (hi lo) (surrogates code)
513 rtoy 1.12.30.8 (setf (schar newstring new-index) hi)
514 rtoy 1.12.30.6 (when lo
515 rtoy 1.12.30.8 (setf (schar newstring (incf new-index)) lo)))))
516 rtoy 1.12.30.4 ;;@@ WARNING: see above
517 ram 1.1 (do ((index end (1+ index))
518     (new-index (- (the fixnum end) offset) (1+ new-index)))
519     ((= index offset-slen))
520     (declare (fixnum index new-index))
521     (setf (schar newstring new-index) (schar string index)))
522     newstring))))
523    
524     (defun string-capitalize (string &key (start 0) end)
525     "Given a string, returns a copy of the string with the first
526     character of each ``word'' converted to upper-case, and remaining
527     chars in the word converted to lower case. A ``word'' is defined
528     to be a string of case-modifiable characters delimited by
529     non-case-modifiable chars."
530     (declare (fixnum start))
531 ram 1.5 (let* ((string (if (stringp string) string (string string)))
532     (slen (length string)))
533 ram 1.4 (declare (fixnum slen))
534 ram 1.1 (with-one-string string start end offset
535     (let ((offset-slen (+ slen offset))
536     (newstring (make-string slen)))
537     (declare (fixnum offset-slen))
538     (do ((index offset (1+ index))
539     (new-index 0 (1+ new-index)))
540     ((= index start))
541     (declare (fixnum index new-index))
542     (setf (schar newstring new-index) (schar string index)))
543     (do ((index start (1+ index))
544     (new-index (- start offset) (1+ new-index))
545     (newword t)
546     (char ()))
547     ((= index (the fixnum end)))
548     (declare (fixnum index new-index))
549     (setq char (schar string index))
550     (cond ((not (alphanumericp char))
551     (setq newword t))
552     (newword
553     ;;char is first case-modifiable after non-case-modifiable
554 rtoy 1.12.30.2 (setq char (char-titlecase char))
555 ram 1.1 (setq newword ()))
556     ;;char is case-modifiable, but not first
557     (t (setq char (char-downcase char))))
558     (setf (schar newstring new-index) char))
559     (do ((index end (1+ index))
560     (new-index (- (the fixnum end) offset) (1+ new-index)))
561     ((= index offset-slen))
562     (declare (fixnum index new-index))
563     (setf (schar newstring new-index) (schar string index)))
564     newstring))))
565    
566     (defun nstring-upcase (string &key (start 0) end)
567     "Given a string, returns that string with all lower case alphabetic
568     characters converted to uppercase."
569     (declare (fixnum start))
570 ram 1.4 (let ((save-header string))
571 ram 1.1 (with-one-string string start end offset
572     (do ((index start (1+ index)))
573     ((= index (the fixnum end)))
574     (declare (fixnum index))
575 rtoy 1.12.30.6 (multiple-value-bind (code wide) (codepoint string index)
576     (declare (ignore wide))
577     ;; Handle ASCII specially because this is called early in
578     ;; initialization, before unidata is available.
579     (cond ((< 96 code 123) (decf code 32))
580     ((> code 127) (setq code (unicode-upper code))))
581     ;;@@ WARNING: this may, in theory, need to extend string
582     ;; (which, obviously, we can't do here. Unless
583     ;; STRING is adjustable, maybe)
584     ;; but that never actually occurs as of Unicode 5.1.0,
585     ;; so I'm just going to ignore it for now...
586     (multiple-value-bind (hi lo) (surrogates code)
587 rtoy 1.12.30.8 (setf (schar string index) hi)
588 rtoy 1.12.30.6 (when lo
589 rtoy 1.12.30.8 (setf (schar string (incf index)) lo))))))
590 ram 1.1 save-header))
591    
592     (defun nstring-downcase (string &key (start 0) end)
593     "Given a string, returns that string with all upper case alphabetic
594     characters converted to lowercase."
595     (declare (fixnum start))
596 ram 1.4 (let ((save-header string))
597 ram 1.1 (with-one-string string start end offset
598     (do ((index start (1+ index)))
599     ((= index (the fixnum end)))
600     (declare (fixnum index))
601 rtoy 1.12.30.6 (multiple-value-bind (code wide) (codepoint string index)
602     (declare (ignore wide))
603     (cond ((< 64 code 91) (incf code 32))
604     ((> code 127) (setq code (unicode-lower code))))
605     ;;@@ WARNING: this may, in theory, need to extend string
606     ;; (which, obviously, we can't do here. Unless
607     ;; STRING is adjustable, maybe)
608     ;; but that never actually occurs as of Unicode 5.1.0,
609     ;; so I'm just going to ignore it for now...
610     (multiple-value-bind (hi lo) (surrogates code)
611 rtoy 1.12.30.8 (setf (schar string index) hi)
612 rtoy 1.12.30.6 (when lo
613 rtoy 1.12.30.8 (setf (schar string (incf index)) lo))))))
614 wlott 1.6 save-header))
615 ram 1.1
616     (defun nstring-capitalize (string &key (start 0) end)
617     "Given a string, returns that string with the first
618     character of each ``word'' converted to upper-case, and remaining
619     chars in the word converted to lower case. A ``word'' is defined
620     to be a string of case-modifiable characters delimited by
621     non-case-modifiable chars."
622     (declare (fixnum start))
623 ram 1.4 (let ((save-header string))
624 ram 1.1 (with-one-string string start end offset
625     (do ((index start (1+ index))
626     (newword t)
627     (char ()))
628     ((= index (the fixnum end)))
629     (declare (fixnum index))
630     (setq char (schar string index))
631     (cond ((not (alphanumericp char))
632     (setq newword t))
633     (newword
634     ;;char is first case-modifiable after non-case-modifiable
635 rtoy 1.12.30.2 (setf (schar string index) (char-titlecase char))
636 ram 1.1 (setq newword ()))
637     (t
638     (setf (schar string index) (char-downcase char))))))
639     save-header))
640    
641     (defun string-left-trim (char-bag string)
642     "Given a set of characters (a list or string) and a string, returns
643     a copy of the string with the characters in the set removed from the
644     left end."
645     (with-string string
646     (do ((index start (1+ index)))
647     ((or (= index (the fixnum end))
648     (not (find (schar string index) char-bag)))
649     (subseq (the simple-string string) index end))
650     (declare (fixnum index)))))
651    
652     (defun string-right-trim (char-bag string)
653     "Given a set of characters (a list or string) and a string, returns
654     a copy of the string with the characters in the set removed from the
655     right end."
656     (with-string string
657     (do ((index (1- (the fixnum end)) (1- index)))
658     ((or (< index start) (not (find (schar string index) char-bag)))
659     (subseq (the simple-string string) start (1+ index)))
660     (declare (fixnum index)))))
661    
662     (defun string-trim (char-bag string)
663     "Given a set of characters (a list or string) and a string, returns a
664     copy of the string with the characters in the set removed from both
665     ends."
666     (with-string string
667     (let* ((left-end (do ((index start (1+ index)))
668     ((or (= index (the fixnum end))
669     (not (find (schar string index) char-bag)))
670     index)
671     (declare (fixnum index))))
672     (right-end (do ((index (1- (the fixnum end)) (1- index)))
673     ((or (< index left-end)
674     (not (find (schar string index) char-bag)))
675     (1+ index))
676     (declare (fixnum index)))))
677     (subseq (the simple-string string) left-end right-end))))
678 rtoy 1.12.30.3
679 rtoy 1.12.30.4 (declaim (inline %glyph-f %glyph-b))
680     (defun %glyph-f (string index)
681     (declare (optimize (speed 3) (space 0) (safety 0) (debug 0))
682     (type simple-string string) (type kernel:index index))
683     (flet ((xchar (string index)
684     (let ((c (char-code (schar string index))))
685     (declare (type (integer 0 #x10FFFF) c))
686     (cond ((<= #xD800 c #xDBFF)
687     (if (= (1+ index) (length string))
688     (error "String ends with an unpaired surrogate.")
689     (let ((c2 (char-code (schar string (1+ index)))))
690     (if (<= #xDC00 c2 #xDFFF)
691     (+ (ash (- c #xD800) 10) c2 #x2400)
692     (error "Naked high surrogate in string.")))))
693     ((<= #xDC00 c #xDFFF)
694     (error "Naked low surrogate in string."))
695     (t c)))))
696     (let* ((prev 0)
697     (l (length string))
698     (c (xchar string index))
699     (n (+ index (if (> c #xFFFF) 2 1))))
700     (declare (type (integer 0 #x10FFFF) c) (type kernel:index l n))
701     (loop while (< n l) do
702     (let* ((c (xchar string n))
703     (d (the (unsigned-byte 8) (unicode-combining-class c))))
704     (when (or (zerop d) (< d prev))
705     (return))
706     (setq prev d)
707     (incf n (if (> c #xFFFF) 2 1))))
708     n)))
709    
710     (defun %glyph-b (string index)
711     (declare (optimize (speed 3) (space 0) (safety 0) (debug 0))
712     (type simple-string string) (type kernel:index index))
713     (flet ((xchar (string index)
714     (let ((c (char-code (schar string index))))
715     (declare (type (integer 0 #x10FFFF) c))
716     (cond ((<= #xDC00 c #xDFFF)
717     (let ((c2 (char-code (schar string (1- index)))))
718     (if (<= #xD800 c2 #xDBFF)
719     (+ (ash (- c2 #xD800) 10) c #x2400)
720     (error "Naked low surrogate in string."))))
721     ((<= #xD800 c #xDBFF)
722     (error "Naked high surrogate in string."))
723     (t c)))))
724     (let ((prev 255)
725     (n (1- index)))
726     (declare (type kernel:index n))
727     (loop while (> n 0) do
728     (let* ((c (xchar string n))
729     (d (the (unsigned-byte 8) (unicode-combining-class c))))
730     (cond ((zerop d) (return))
731     ((> d prev) (incf n (if (> c #xFFFF) 2 1)) (return)))
732     (setq prev d)
733     (decf n (if (> c #xFFFF) 2 1))))
734     n)))
735    
736     (defun glyph (string index &key (from-end nil))
737 rtoy 1.12.30.3 "GLYPH returns the glyph at the indexed position in a string, and the
738 rtoy 1.12.30.4 position of the next glyph (or NIL) as a second value. A glyph is
739     a substring consisting of the character at INDEX followed by all
740     subsequent combining characters."
741 rtoy 1.12.30.3 (declare (type simple-string string) (type kernel:index index))
742     #-unicode
743     (char string index)
744     #+unicode
745     (with-array-data ((string string) (start) (end))
746     (declare (ignore start end))
747 rtoy 1.12.30.4 (let ((n (if from-end (%glyph-b string index) (%glyph-f string index))))
748     (if from-end
749     (values (subseq string n index) (and (> n 0) n))
750     (values (subseq string index n) (and (< n (length string)) n))))))
751 rtoy 1.12.30.3
752 rtoy 1.12.30.4 (defun sglyph (string index &key (from-end nil))
753 rtoy 1.12.30.3 "SGLYPH returns the glyph at the indexed position, the same as GLYPH,
754 rtoy 1.12.30.4 except that the string must be a simple-string"
755 rtoy 1.12.30.3 (declare (type simple-string string) (type kernel:index index))
756     #-unicode
757     (schar string index)
758     #+unicode
759 rtoy 1.12.30.4 (let ((n (if from-end (%glyph-b string index) (%glyph-f string index))))
760     (if from-end
761     (values (subseq string n index) (and (> n 0) n))
762     (values (subseq string index n) (and (< n (length string)) n)))))
763    
764 rtoy 1.12.30.9 (defun decompose (string &optional (compatibility t))
765     (declare (type string string))
766     (let ((result (make-string (cond ((< (length string) 40)
767     (* 5 (length string)))
768     ((< (length string) 4096)
769     (* 2 (length string)))
770     (t (round (length string) 5/6)))))
771     (fillptr 0)
772     (start 0))
773     (declare (type kernel:index fillptr start))
774     (labels ((rec (string)
775     (declare (type simple-string string))
776     (do ((i 0 (1+ i)))
777     ((= i (length string)))
778     (declare (type kernel:index i))
779     (multiple-value-bind (code wide) (codepoint string i)
780     (when wide (incf i))
781     (let ((decomp (unicode-decomp code compatibility)))
782     (if decomp (rec decomp) (out code))))))
783     (out (code)
784     (when (zerop (unicode-combining-class code))
785     (order result start fillptr)
786     (setq start fillptr))
787     (multiple-value-bind (hi lo) (surrogates code)
788     (outch hi)
789     (when lo
790     (outch lo))))
791     (outch (char)
792     (when (= fillptr (length result))
793     (let ((tmp (make-string (round (length result) 5/6))))
794     (replace tmp result)
795     (setq result tmp)))
796     (setf (schar result fillptr) char)
797     (incf fillptr))
798     (order (string start end)
799     (when (> (- end start) (if (zerop start) 1 2))
800     (loop for done = t do
801     (do ((x start (1+ x)))
802     ((>= x (1- end)))
803     (declare (type kernel:index x))
804     (multiple-value-bind (code1 wide1)
805     (codepoint string x)
806     (when (and wide1 (= (+ x 2) end)) (return))
807     (multiple-value-bind (code2 wide2)
808     (codepoint string (+ x (if wide1 2 1)))
809     (when (> (unicode-combining-class code1)
810     (unicode-combining-class code2))
811     (rotatef wide1 wide2)
812     (case (+ (if wide1 2 0) (if wide2 1 0))
813     (0 (rotatef (schar string x)
814     (schar string (+ x 1))))
815     (1 (rotatef (schar string x)
816     (schar string (+ x 2))
817     (schar string (+ x 1))))
818     (2 (rotatef (schar string x)
819     (schar string (+ x 1))
820     (schar string (+ x 2))))
821     (3 (rotatef (schar string x)
822     (schar string (+ x 2)))
823     (rotatef (schar string (+ x 1))
824     (schar string (+ x 3)))))
825     (setq done nil))
826     (when wide1 (incf x)))))
827     until done))))
828     (with-array-data ((string string) (start) (end))
829     (declare (ignore start end))
830     (rec string))
831     (order result start fillptr)
832     (shrink-vector result fillptr))))
833    
834 rtoy 1.12.30.4 (defun string-to-nfd (string)
835 rtoy 1.12.30.9 (decompose string nil))
836 rtoy 1.12.30.4
837     (defun string-to-nfkd (string)
838 rtoy 1.12.30.9 (decompose string t))
839 rtoy 1.12.30.4
840     (defun string-to-nfc (string)
841     ;;@@ Implement me
842     string)
843    
844     (defun string-to-nfkc (string)
845     ;;@@ Implement me
846     string)

  ViewVC Help
Powered by ViewVC 1.1.5