/[cmucl]/src/code/string.lisp
ViewVC logotype

Contents of /src/code/string.lisp

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.13 - (hide annotations)
Thu Jun 11 16:03:59 2009 UTC (4 years, 10 months ago) by rtoy
Branch: MAIN
CVS Tags: merged-unicode-utf16-extfmt-2009-06-11
Changes since 1.12: +828 -35 lines
Merge Unicode work to trunk.  From label
unicode-utf16-extfmt-2009-06-11.
1 ram 1.1 ;;; -*- Log: code.log; Package: Lisp -*-
2     ;;;
3     ;;; **********************************************************************
4 ram 1.3 ;;; This code was written as part of the CMU Common Lisp project at
5     ;;; Carnegie Mellon University, and has been placed in the public domain.
6     ;;;
7     (ext:file-comment
8 rtoy 1.13 "$Header: /tiger/var/lib/cvsroots/cmucl/src/code/string.lisp,v 1.13 2009/06/11 16:03:59 rtoy Exp $")
9 ram 1.3 ;;;
10 ram 1.1 ;;; **********************************************************************
11     ;;;
12 pw 1.11 ;;; Functions to implement strings for CMU Common Lisp
13 ram 1.1 ;;; Written by David Dill
14 ram 1.4 ;;; Rewritten by Skef Wholey, Bill Chiles and Rob MacLachlan.
15 ram 1.1 ;;;
16     ;;; ****************************************************************
17     ;;;
18 ram 1.4 (in-package "LISP")
19 rtoy 1.13 (export '(char schar glyph sglyph string
20 ram 1.1 string= string-equal string< string> string<= string>= string/=
21     string-lessp string-greaterp string-not-lessp string-not-greaterp
22     string-not-equal
23 rtoy 1.13 string-to-nfd string-to-nfkd string-to-nfc string-to-nfkc
24 ram 1.1 make-string
25     string-trim string-left-trim string-right-trim
26     string-upcase
27     string-downcase string-capitalize nstring-upcase nstring-downcase
28     nstring-capitalize))
29    
30    
31 rtoy 1.13 (declaim (inline surrogatep surrogates-to-codepoint codepoint surrogates))
32    
33     (defun surrogatep (c &optional surrogate-type)
34     "Test if C is a surrogate. C may be either an integer or a
35     character. Surrogate-type indicates what kind of surrogate to test
36     for. :High means to test for the high (leading) surrogate; :Low
37     tests for the low (trailing surrogate). A value of :Any or Nil
38     tests for any surrogate value (high or low)."
39     (declare (type (or character codepoint) c))
40     (let ((code (if (characterp c)
41     (char-code c)
42     c)))
43     (ecase surrogate-type
44     ((:high :leading)
45     ;; Test for high surrogate
46     (<= #xD800 code #xDBFF))
47     ((:low :trailing)
48     ;; Test for low surrogate
49     (<= #xDC00 code #xDFFF))
50     ((:any nil)
51     ;; Test for any surrogate
52     (<= #xD800 code #xDFFF)))))
53    
54     (defun surrogates-to-codepoint (hi lo)
55     "Convert the given Hi and Lo surrogate characters to the
56     corresponding codepoint value"
57     (declare (type character hi lo))
58     (+ (ash (- (the (integer #xD800 #xDBFF) (char-code hi)) #xD800) 10)
59     (the (integer #xDC00 #xDFFF) (char-code lo)) #x2400))
60    
61     (defun codepoint (string i &optional (end (length string)))
62     "Return the codepoint value from String at position I. If that
63     position is a surrogate, it is combined with either the previous or
64     following character (when possible) to compute the codepoint. The
65     second return value is NIL if the position is not a surrogate pair.
66     Otherwise +1 or -1 is returned if the position is the high or low
67     surrogate value, respectively."
68     (declare (type simple-string string) (type kernel:index i end))
69     (let ((code (char-code (schar string i))))
70     (cond ((and (surrogatep code :high) (< (1+ i) end))
71     (let ((tmp (char-code (schar string (1+ i)))))
72     (if (surrogatep tmp :low)
73     (values (+ (ash (- code #xD800) 10) tmp #x2400) +1)
74     (values code nil))))
75     ((and (surrogatep code :low) (> i 0))
76     (let ((tmp (char-code (schar string (1- i)))))
77     (if (surrogatep tmp :high)
78     (values (+ (ash (- tmp #xD800) 10) code #x2400) -1)
79     (values code nil))))
80     (t (values code nil)))))
81    
82     (defun surrogates (codepoint)
83     "Return the high and low surrogate characters for Codepoint. If
84     Codepoint is in the BMP, the first return value is the corresponding
85     character and the second is NIL."
86     (declare (type codepoint codepoint))
87     (if (< codepoint #x10000)
88     (values (code-char codepoint) nil)
89     (let* ((tmp (- codepoint #x10000))
90     (hi (logior (ldb (byte 10 10) tmp) #xD800))
91     (lo (logior (ldb (byte 10 0) tmp) #xDC00)))
92     (values (code-char hi) (code-char lo)))))
93    
94     (defun (setf codepoint) (codepoint string i)
95     "Set the codepoint at string position I to the Codepoint. If the
96     codepoint requires a surrogate pair, the high (leading surrogate) is
97     stored at position I and the low (trailing) surrogate is stored at
98     I+1"
99     (declare (type codepoint codepoint)
100     (type simple-string string))
101     (let ((widep nil))
102     (multiple-value-bind (hi lo)
103     (surrogates codepoint)
104     (setf (aref string i) hi)
105     (when lo
106     (setf (aref string (1+ i)) lo)
107     (setf widep t)))
108     (values codepoint widep)))
109    
110     (defun utf16-string-p (string)
111     "Check if String is a valid UTF-16 string. If the string is valid,
112     T is returned. If the string is not valid, NIL is returned, and the
113     second value is the index into the string of the invalid character.
114     A string is also invalid if it contains any unassigned codepoints."
115     (do ((len (length string))
116     (index 0 (1+ index)))
117     ((>= index len)
118     t)
119     (multiple-value-bind (codepoint wide)
120     (codepoint string index)
121     ;; We step through the string in order. If there are any
122     ;; surrogates pairs, we must reach the lead surrogate first,
123     ;; which means WIDE is +1. Otherwise, we have an invalid
124     ;; surrogate pair. If we get any codepoint that is in the
125     ;; surrogate range, we also have an invalid string. An
126     ;; unassigned codepoint is also considered invalid.
127     (when (or (eq wide -1)
128     (surrogatep codepoint)
129     (lisp::unicode-assigned-codepoint-p codepoint))
130     (return-from utf16-string-p (values nil index)))
131     (when wide (incf index)))))
132    
133 ram 1.1 (defun string (X)
134     "Coerces X into a string. If X is a string, X is returned. If X is a
135 rtoy 1.13 symbol, X's pname is returned. If X is a character then a one element
136     string containing that character is returned. If X cannot be coerced
137     into a string, an error occurs."
138 ram 1.1 (cond ((stringp x) x)
139     ((symbolp x) (symbol-name x))
140     ((characterp x)
141     (let ((res (make-string 1)))
142     (setf (schar res 0) x) res))
143     (t
144 pw 1.11 (error 'simple-type-error
145     :datum x
146     :expected-type '(or string symbol character)
147     :format-control "~S cannot be coerced to a string."
148     :format-arguments (list x)))))
149 ram 1.1
150     ;;; With-One-String is used to set up some string hacking things. The keywords
151     ;;; are parsed, and the string is hacked into a simple-string.
152    
153     (eval-when (compile)
154    
155     (defmacro with-one-string (string start end cum-offset &rest forms)
156 ram 1.4 `(let ((,string (if (stringp ,string) ,string (string ,string))))
157 pw 1.10 ;; Optimizer may prove STRING is one.
158     (declare (optimize (ext:inhibit-warnings 3)))
159 ram 1.4 (with-array-data ((,string ,string :offset-var ,cum-offset)
160     (,start ,start)
161     (,end (or ,end (length (the vector ,string)))))
162     ,@forms)))
163 ram 1.1
164     )
165    
166     ;;; With-String is like With-One-String, but doesn't parse keywords.
167    
168     (eval-when (compile)
169    
170     (defmacro with-string (string &rest forms)
171 ram 1.4 `(let ((,string (if (stringp ,string) ,string (string ,string))))
172     (with-array-data ((,string ,string)
173     (start)
174     (end (length (the vector ,string))))
175     ,@forms)))
176 ram 1.1
177     )
178    
179     ;;; With-Two-Strings is used to set up string comparison operations. The
180     ;;; keywords are parsed, and the strings are hacked into simple-strings.
181    
182     (eval-when (compile)
183    
184     (defmacro with-two-strings (string1 string2 start1 end1 cum-offset-1
185     start2 end2 &rest forms)
186 ram 1.4 `(let ((,string1 (if (stringp ,string1) ,string1 (string ,string1)))
187     (,string2 (if (stringp ,string2) ,string2 (string ,string2))))
188     (with-array-data ((,string1 ,string1 :offset-var ,cum-offset-1)
189     (,start1 ,start1)
190     (,end1 (or ,end1 (length (the vector ,string1)))))
191     (with-array-data ((,string2 ,string2)
192     (,start2 ,start2)
193     (,end2 (or ,end2 (length (the vector ,string2)))))
194     ,@forms))))
195 ram 1.1
196     )
197 wlott 1.2
198 ram 1.1
199     (defun char (string index)
200     "Given a string and a non-negative integer index less than the length of
201     the string, returns the character object representing the character at
202     that position in the string."
203 ram 1.4 (declare (optimize (safety 1)))
204 ram 1.1 (char string index))
205    
206     (defun %charset (string index new-el)
207 ram 1.4 (declare (optimize (safety 1)))
208 ram 1.1 (setf (char string index) new-el))
209    
210     (defun schar (string index)
211     "SCHAR returns the character object at an indexed position in a string
212 rtoy 1.13 just as CHAR does, except the string must be a simple-string."
213 ram 1.4 (declare (optimize (safety 1)))
214 ram 1.1 (schar string index))
215    
216     (defun %scharset (string index new-el)
217 ram 1.4 (declare (optimize (safety 1)))
218 ram 1.1 (setf (schar string index) new-el))
219    
220     (defun string=* (string1 string2 start1 end1 start2 end2)
221 ram 1.4 (with-two-strings string1 string2 start1 end1 offset1 start2 end2
222     (not (%sp-string-compare string1 start1 end1 string2 start2 end2))))
223 ram 1.1
224    
225     (defun string/=* (string1 string2 start1 end1 start2 end2)
226 ram 1.4 (with-two-strings string1 string2 start1 end1 offset1 start2 end2
227     (let ((comparison (%sp-string-compare string1 start1 end1
228     string2 start2 end2)))
229     (if comparison (- (the fixnum comparison) offset1)))))
230 ram 1.1
231     (eval-when (compile eval)
232    
233     ;;; Lessp is true if the desired expansion is for string<* or string<=*.
234     ;;; Equalp is true if the desired expansion is for string<=* or string>=*.
235     (defmacro string<>=*-body (lessp equalp)
236     (let ((offset1 (gensym)))
237 ram 1.4 `(with-two-strings string1 string2 start1 end1 ,offset1 start2 end2
238     (let ((index (%sp-string-compare string1 start1 end1
239     string2 start2 end2)))
240     (if index
241 ram 1.8 (cond ((= (the fixnum index) (the fixnum end1))
242     ,(if lessp
243     `(- (the fixnum index) ,offset1)
244     `nil))
245     ((= (+ (the fixnum index) (- start2 start1))
246     (the fixnum end2))
247     ,(if lessp
248     `nil
249 rtoy 1.13 `(- (the fixnum index) ,offset1)))
250     #-unicode
251 ram 1.4 ((,(if lessp 'char< 'char>)
252     (schar string1 index)
253     (schar string2 (+ (the fixnum index) (- start2 start1))))
254     (- (the fixnum index) ,offset1))
255 rtoy 1.13 #-unicode
256     (t nil)
257     #+unicode
258     (t
259     ;; Compare in code point order. See
260     ;; http://icu-project.org/docs/papers/utf16_code_point_order.html
261     (flet ((fixup (code)
262     (if (>= code #xe000)
263     (- code #x800)
264     (+ code #x2000))))
265     (declare (inline fixup))
266     (let* ((c1 (char-code (schar string1 index)))
267     (c2 (char-code (schar string2
268     (+ (the fixnum index)
269     (- start2 start1))))))
270     (cond ((and (>= c1 #xd800)
271     (>= c2 #xd800))
272     (let ((fix-c1 (fixup c1))
273     (fix-c2 (fixup c2)))
274     (if (,(if lessp '< '>) fix-c1 fix-c2)
275     (- (the fixnum index) ,offset1)
276     nil)))
277     (t
278     (if (,(if lessp '< '>) c1 c2)
279     (- (the fixnum index) ,offset1)
280     nil)))))))
281 ram 1.4 ,(if equalp `(- (the fixnum end1) ,offset1) 'nil))))))
282 ram 1.1 ) ; eval-when
283    
284     (defun string<* (string1 string2 start1 end1 start2 end2)
285     (declare (fixnum start1 start2))
286     (string<>=*-body t nil))
287    
288     (defun string>* (string1 string2 start1 end1 start2 end2)
289     (declare (fixnum start1 start2))
290     (string<>=*-body nil nil))
291    
292     (defun string<=* (string1 string2 start1 end1 start2 end2)
293     (declare (fixnum start1 start2))
294     (string<>=*-body t t))
295    
296     (defun string>=* (string1 string2 start1 end1 start2 end2)
297     (declare (fixnum start1 start2))
298     (string<>=*-body nil t))
299    
300    
301    
302     (defun string< (string1 string2 &key (start1 0) end1 (start2 0) end2)
303     "Given two strings, if the first string is lexicographically less than
304     the second string, returns the longest common prefix (using char=)
305     of the two strings. Otherwise, returns ()."
306     (string<* string1 string2 start1 end1 start2 end2))
307    
308     (defun string> (string1 string2 &key (start1 0) end1 (start2 0) end2)
309     "Given two strings, if the first string is lexicographically greater than
310     the second string, returns the longest common prefix (using char=)
311     of the two strings. Otherwise, returns ()."
312     (string>* string1 string2 start1 end1 start2 end2))
313    
314    
315     (defun string<= (string1 string2 &key (start1 0) end1 (start2 0) end2)
316     "Given two strings, if the first string is lexicographically less than
317     or equal to the second string, returns the longest common prefix
318     (using char=) of the two strings. Otherwise, returns ()."
319     (string<=* string1 string2 start1 end1 start2 end2))
320    
321     (defun string>= (string1 string2 &key (start1 0) end1 (start2 0) end2)
322     "Given two strings, if the first string is lexicographically greater
323     than or equal to the second string, returns the longest common prefix
324     (using char=) of the two strings. Otherwise, returns ()."
325     (string>=* string1 string2 start1 end1 start2 end2))
326    
327     (defun string= (string1 string2 &key (start1 0) end1 (start2 0) end2)
328     "Given two strings (string1 and string2), and optional integers start1,
329     start2, end1 and end2, compares characters in string1 to characters in
330     string2 (using char=)."
331     (string=* string1 string2 start1 end1 start2 end2))
332    
333     (defun string/= (string1 string2 &key (start1 0) end1 (start2 0) end2)
334     "Given two strings, if the first string is not lexicographically equal
335     to the second string, returns the longest common prefix (using char=)
336     of the two strings. Otherwise, returns ()."
337     (string/=* string1 string2 start1 end1 start2 end2))
338    
339    
340     (eval-when (compile eval)
341    
342     ;;; STRING-NOT-EQUAL-LOOP is used to generate character comparison loops for
343     ;;; STRING-EQUAL and STRING-NOT-EQUAL.
344     (defmacro string-not-equal-loop (end end-value
345     &optional (abort-value nil abortp))
346     (declare (fixnum end))
347     (let ((end-test (if (= end 1)
348     `(= index1 (the fixnum end1))
349     `(= index2 (the fixnum end2)))))
350     `(do ((index1 start1 (1+ index1))
351     (index2 start2 (1+ index2)))
352     (,(if abortp
353     end-test
354     `(or ,end-test
355     (not (char-equal (schar string1 index1)
356     (schar string2 index2)))))
357     ,end-value)
358     (declare (fixnum index1 index2))
359     ,@(if abortp
360     `((if (not (char-equal (schar string1 index1)
361     (schar string2 index2)))
362     (return ,abort-value)))))))
363 rtoy 1.13 ) ; eval-when
364 ram 1.1
365 rtoy 1.13 #+unicode
366     (defun string-case-fold (string &key (start 0) end (casing :simple))
367     "Return a new string with the case folded according to Casing as follows:
368    
369     :SIMPLE Unicode simple case folding (preserving length)
370     :FULL Unicode full case folding (possibly changing length)
371    
372     Default Casing is :SIMPLE."
373     (ecase casing
374     (:simple
375     (with-output-to-string (s)
376     (with-one-string string start end offset
377     (do ((index offset (1+ index)))
378     ((>= index end))
379     (multiple-value-bind (code widep)
380     (codepoint string index)
381     (when widep (incf index))
382     (multiple-value-bind (hi lo)
383     (surrogates (unicode-case-fold-simple code))
384     (write-char hi s)
385     (when lo (write-char lo s))))))))
386     (:full
387     (with-output-to-string (s)
388     (with-one-string string start end offset
389     (do ((index offset (1+ index)))
390     ((>= index end))
391     (multiple-value-bind (code widep)
392     (codepoint string index)
393     (when widep (incf index))
394     (write-string (unicode-case-fold-full code) s))))))))
395 ram 1.1
396     (defun string-equal (string1 string2 &key (start1 0) end1 (start2 0) end2)
397     "Given two strings (string1 and string2), and optional integers start1,
398     start2, end1 and end2, compares characters in string1 to characters in
399     string2 (using char-equal)."
400     (declare (fixnum start1 start2))
401 ram 1.4 (with-two-strings string1 string2 start1 end1 offset1 start2 end2
402     (let ((slen1 (- (the fixnum end1) start1))
403     (slen2 (- (the fixnum end2) start2)))
404     (declare (fixnum slen1 slen2))
405     (if (or (minusp slen1) (minusp slen2))
406     ;;prevent endless looping later.
407     (error "Improper bounds for string comparison."))
408     (if (= slen1 slen2)
409     ;;return () immediately if lengths aren't equal.
410 rtoy 1.13 (string-not-equal-loop 1 t nil)))))
411 ram 1.1
412     (defun string-not-equal (string1 string2 &key (start1 0) end1 (start2 0) end2)
413     "Given two strings, if the first string is not lexicographically equal
414     to the second string, returns the longest common prefix (using char-equal)
415     of the two strings. Otherwise, returns ()."
416 ram 1.4 (with-two-strings string1 string2 start1 end1 offset1 start2 end2
417     (let ((slen1 (- end1 start1))
418     (slen2 (- end2 start2)))
419     (declare (fixnum slen1 slen2))
420     (if (or (minusp slen1) (minusp slen2))
421     ;;prevent endless looping later.
422     (error "Improper bounds for string comparison."))
423     (cond ((or (minusp slen1) (or (minusp slen2)))
424     (error "Improper substring for comparison."))
425     ((= slen1 slen2)
426     (string-not-equal-loop 1 nil (- index1 offset1)))
427     ((< slen1 slen2)
428     (string-not-equal-loop 1 (- index1 offset1)))
429     (t
430     (string-not-equal-loop 2 (- index1 offset1)))))))
431 ram 1.1
432    
433    
434     (eval-when (compile eval)
435    
436     ;;; STRING-LESS-GREATER-EQUAL-TESTS returns a test on the lengths of string1
437     ;;; and string2 and a test on the current characters from string1 and string2
438     ;;; for the following macro.
439     (defun string-less-greater-equal-tests (lessp equalp)
440     (if lessp
441     (if equalp
442     ;; STRING-NOT-GREATERP
443 rtoy 1.13 (values '<=
444     #-unicode `(not (char-greaterp char1 char2))
445     #+unicode `(<= char1 char2))
446 ram 1.1 ;; STRING-LESSP
447 rtoy 1.13 (values '<
448     #-unicode `(char-lessp char1 char2)
449     #+unicode `(< char1 char2)))
450 ram 1.1 (if equalp
451     ;; STRING-NOT-LESSP
452 rtoy 1.13 (values '>=
453     #-unicode `(not (char-lessp char1 char2))
454     #+unicode `(>= char1 char2))
455 ram 1.1 ;; STRING-GREATERP
456 rtoy 1.13 (values '>
457     #-unicode `(char-greaterp char1 char2)
458     #+unicode `(> char1 char2)))))
459 ram 1.1
460 rtoy 1.13 #-unicode
461 ram 1.1 (defmacro string-less-greater-equal (lessp equalp)
462     (multiple-value-bind (length-test character-test)
463     (string-less-greater-equal-tests lessp equalp)
464 ram 1.4 `(with-two-strings string1 string2 start1 end1 offset1 start2 end2
465     (let ((slen1 (- (the fixnum end1) start1))
466     (slen2 (- (the fixnum end2) start2)))
467     (declare (fixnum slen1 slen2))
468     (if (or (minusp slen1) (minusp slen2))
469     ;;prevent endless looping later.
470     (error "Improper bounds for string comparison."))
471     (do ((index1 start1 (1+ index1))
472     (index2 start2 (1+ index2))
473     (char1)
474     (char2))
475     ((or (= index1 (the fixnum end1)) (= index2 (the fixnum end2)))
476     (if (,length-test slen1 slen2) (- index1 offset1)))
477     (declare (fixnum index1 index2))
478     (setq char1 (schar string1 index1))
479     (setq char2 (schar string2 index2))
480     (if (not (char-equal char1 char2))
481     (if ,character-test
482     (return (- index1 offset1))
483     (return ()))))))))
484 ram 1.1
485 rtoy 1.13 ;; Convert to lowercase for case folding, to match what Unicode
486     ;; CaseFolding.txt says. An example where this matters: U+1E9E maps
487     ;; to U+00DF. But the uppercase version of U+00DF is U+00DF.
488     #+unicode
489     (defmacro equal-char-codepoint (codepoint)
490     `(let ((ch ,codepoint))
491     ;; Handle ASCII separately for bootstrapping and for unidata missing.
492     (if (< 64 ch 91)
493     (+ ch 32)
494     #-(and unicode (not unicode-bootstrap))
495     ch
496     #+(and unicode (not unicode-bootstrap))
497     (if (> ch 127) (unicode-lower ch) ch))))
498    
499     #+unicode
500     (defmacro string-less-greater-equal (lessp equalp)
501     (multiple-value-bind (length-test character-test)
502     (string-less-greater-equal-tests lessp equalp)
503     `(with-two-strings string1 string2 start1 end1 offset1 start2 end2
504     (let ((slen1 (- (the fixnum end1) start1))
505     (slen2 (- (the fixnum end2) start2)))
506     (declare (fixnum slen1 slen2))
507     (if (or (minusp slen1) (minusp slen2))
508     ;;prevent endless looping later.
509     (error "Improper bounds for string comparison."))
510     (do ((index1 start1 (1+ index1))
511     (index2 start2 (1+ index2)))
512     ((or (= index1 (the fixnum end1)) (= index2 (the fixnum end2)))
513     (if (,length-test slen1 slen2) (- index1 offset1)))
514     (declare (fixnum index1 index2))
515     (multiple-value-bind (char1 wide1)
516     (codepoint string1 index1)
517     (declare (type codepoint char1))
518     (multiple-value-bind (char2 wide2)
519     (codepoint string2 index2)
520     (declare (type codepoint char2))
521     (setf char1 (equal-char-codepoint char1))
522     (setf char2 (equal-char-codepoint char2))
523     (if (= char1 char2)
524     (progn
525     (when wide1 (incf index1))
526     (when wide2 (incf index2)))
527     (if ,character-test
528     (return (- index1 offset1))
529     (return ()))))))))))
530    
531 ram 1.1 ) ; eval-when
532    
533     (defun string-lessp* (string1 string2 start1 end1 start2 end2)
534     (declare (fixnum start1 start2))
535     (string-less-greater-equal t nil))
536    
537     (defun string-greaterp* (string1 string2 start1 end1 start2 end2)
538     (declare (fixnum start1 start2))
539     (string-less-greater-equal nil nil))
540    
541     (defun string-not-lessp* (string1 string2 start1 end1 start2 end2)
542     (declare (fixnum start1 start2))
543     (string-less-greater-equal nil t))
544    
545     (defun string-not-greaterp* (string1 string2 start1 end1 start2 end2)
546     (declare (fixnum start1 start2))
547     (string-less-greater-equal t t))
548    
549     (defun string-lessp (string1 string2 &key (start1 0) end1 (start2 0) end2)
550     "Given two strings, if the first string is lexicographically less than
551     the second string, returns the longest common prefix (using char-equal)
552     of the two strings. Otherwise, returns ()."
553     (string-lessp* string1 string2 start1 end1 start2 end2))
554    
555     (defun string-greaterp (string1 string2 &key (start1 0) end1 (start2 0) end2)
556     "Given two strings, if the first string is lexicographically greater than
557     the second string, returns the longest common prefix (using char-equal)
558     of the two strings. Otherwise, returns ()."
559     (string-greaterp* string1 string2 start1 end1 start2 end2))
560    
561     (defun string-not-lessp (string1 string2 &key (start1 0) end1 (start2 0) end2)
562     "Given two strings, if the first string is lexicographically greater
563     than or equal to the second string, returns the longest common prefix
564     (using char-equal) of the two strings. Otherwise, returns ()."
565     (string-not-lessp* string1 string2 start1 end1 start2 end2))
566    
567     (defun string-not-greaterp (string1 string2 &key (start1 0) end1 (start2 0)
568     end2)
569     "Given two strings, if the first string is lexicographically less than
570     or equal to the second string, returns the longest common prefix
571     (using char-equal) of the two strings. Otherwise, returns ()."
572     (string-not-greaterp* string1 string2 start1 end1 start2 end2))
573    
574    
575 dtc 1.9 (defun make-string (count &key element-type ((:initial-element fill-char)))
576 ram 1.1 "Given a character count and an optional fill character, makes and returns
577 rtoy 1.13 a new string Count long filled with the fill character."
578 emarsden 1.12 (declare (type fixnum count))
579     (assert (subtypep element-type 'character))
580 ram 1.1 (if fill-char
581     (do ((i 0 (1+ i))
582     (string (make-string count)))
583     ((= i count) string)
584     (declare (fixnum i))
585     (setf (schar string i) fill-char))
586     (make-string count)))
587    
588 rtoy 1.13 (defun string-upcase-simple (string &key (start 0) end)
589 ram 1.1 (declare (fixnum start))
590 ram 1.5 (let* ((string (if (stringp string) string (string string)))
591     (slen (length string)))
592 ram 1.4 (declare (fixnum slen))
593 ram 1.1 (with-one-string string start end offset
594     (let ((offset-slen (+ slen offset))
595     (newstring (make-string slen)))
596     (declare (fixnum offset-slen))
597     (do ((index offset (1+ index))
598     (new-index 0 (1+ new-index)))
599     ((= index start))
600     (declare (fixnum index new-index))
601     (setf (schar newstring new-index) (schar string index)))
602     (do ((index start (1+ index))
603     (new-index (- start offset) (1+ new-index)))
604     ((= index (the fixnum end)))
605     (declare (fixnum index new-index))
606 rtoy 1.13 (multiple-value-bind (code wide) (codepoint string index)
607     (when wide (incf index))
608     ;; Handle ASCII specially because this is called early in
609     ;; initialization, before unidata is available.
610     (cond ((< 96 code 123) (decf code 32))
611     ((> code 127) (setq code (unicode-upper code))))
612     ;;@@ WARNING: this may, in theory, need to extend newstring
613     ;; but that never actually occurs as of Unicode 5.1.0,
614     ;; so I'm just going to ignore it for now...
615     (multiple-value-bind (hi lo) (surrogates code)
616     (setf (schar newstring new-index) hi)
617     (when lo
618     (setf (schar newstring (incf new-index)) lo)))))
619     ;;@@ WARNING: see above
620 ram 1.1 (do ((index end (1+ index))
621     (new-index (- (the fixnum end) offset) (1+ new-index)))
622     ((= index offset-slen))
623     (declare (fixnum index new-index))
624     (setf (schar newstring new-index) (schar string index)))
625     newstring))))
626    
627 rtoy 1.13 (defun string-upcase-full (string &key (start 0) end)
628     (declare (fixnum start))
629     (let* ((string (if (stringp string) string (string string)))
630     (slen (length string)))
631     (declare (fixnum slen))
632     (with-output-to-string (s)
633     (with-one-string string start end offset
634     (let ((offset-slen (+ slen offset)))
635     (declare (fixnum offset-slen))
636     (write-string string s :start offset :end start)
637     (do ((index start (1+ index)))
638     ((= index (the fixnum end)))
639     (declare (fixnum index))
640     (multiple-value-bind (code wide)
641     (codepoint string index)
642     (when wide (incf index))
643     ;; Handle ASCII specially because this is called early in
644     ;; initialization, before unidata is available.
645     (cond ((< 96 code 123)
646     (write-char (code-char (decf code 32)) s))
647     ((> code 127)
648     (write-string (unicode-full-case-upper code) s))
649     (t
650     (multiple-value-bind (hi lo)
651     (surrogates code)
652     (write-char hi s)
653     (when lo
654     (write-char lo s)))))))
655     (write-string string s :start end :end offset-slen))))))
656    
657     (defun string-upcase (string &key (start 0) end #+unicode (casing :simple))
658     #-unicode
659     "Given a string, returns a new string that is a copy of it with all
660     lower case alphabetic characters converted to uppercase."
661     #+unicode
662     "Given a string, returns a new string that is a copy of it with all
663     lower case alphabetic characters converted to uppercase. Casing is
664     :simple or :full for simple or full case conversion, respectively."
665     (declare (fixnum start))
666     #-unicode
667     (string-upcase-simple string :start start :end end)
668     #+unicode
669     (if (eq casing :simple)
670     (string-upcase-simple string :start start :end end)
671     (string-upcase-full string :start start :end end)))
672    
673     (defun string-downcase-simple (string &key (start 0) end)
674 ram 1.1 (declare (fixnum start))
675 ram 1.5 (let* ((string (if (stringp string) string (string string)))
676     (slen (length string)))
677 ram 1.4 (declare (fixnum slen))
678 ram 1.1 (with-one-string string start end offset
679     (let ((offset-slen (+ slen offset))
680     (newstring (make-string slen)))
681     (declare (fixnum offset-slen))
682     (do ((index offset (1+ index))
683     (new-index 0 (1+ new-index)))
684     ((= index start))
685     (declare (fixnum index new-index))
686     (setf (schar newstring new-index) (schar string index)))
687     (do ((index start (1+ index))
688     (new-index (- start offset) (1+ new-index)))
689     ((= index (the fixnum end)))
690     (declare (fixnum index new-index))
691 rtoy 1.13 (multiple-value-bind (code wide) (codepoint string index)
692     (when wide (incf index))
693     ;; Handle ASCII specially because this is called early in
694     ;; initialization, before unidata is available.
695     (cond ((< 64 code 91) (incf code 32))
696     ((> code 127) (setq code (unicode-lower code))))
697     ;;@@ WARNING: this may, in theory, need to extend newstring
698     ;; but that never actually occurs as of Unicode 5.1.0,
699     ;; so I'm just going to ignore it for now...
700     (multiple-value-bind (hi lo) (surrogates code)
701     (setf (schar newstring new-index) hi)
702     (when lo
703     (setf (schar newstring (incf new-index)) lo)))))
704     ;;@@ WARNING: see above
705 ram 1.1 (do ((index end (1+ index))
706     (new-index (- (the fixnum end) offset) (1+ new-index)))
707     ((= index offset-slen))
708     (declare (fixnum index new-index))
709     (setf (schar newstring new-index) (schar string index)))
710     newstring))))
711    
712 rtoy 1.13 (defun string-downcase-full (string &key (start 0) end)
713     (declare (fixnum start))
714     (let* ((string (if (stringp string) string (string string)))
715     (slen (length string)))
716     (declare (fixnum slen))
717     (with-output-to-string (s)
718     (with-one-string string start end offset
719     (let ((offset-slen (+ slen offset)))
720     (declare (fixnum offset-slen))
721     (write-string string s :start offset :end start)
722     (do ((index start (1+ index)))
723     ((= index (the fixnum end)))
724     (declare (fixnum index))
725     (multiple-value-bind (code wide)
726     (codepoint string index)
727     (when wide (incf index))
728     ;; Handle ASCII specially because this is called early in
729     ;; initialization, before unidata is available.
730     (cond ((< 64 code 91)
731     (write-char (code-char (incf code 32)) s))
732     ((> code 127)
733     (write-string (unicode-full-case-lower code) s))
734     (t
735     (multiple-value-bind (hi lo)
736     (surrogates code)
737     (write-char hi s)
738     (when lo
739     (write-char lo s)))))))
740     (write-string string s :start end :end offset-slen))))))
741    
742     (defun string-downcase (string &key (start 0) end #+unicode (casing :simple))
743     #-unicode
744     "Given a string, returns a new string that is a copy of it with all
745     upper case alphabetic characters converted to lowercase."
746     #+unicode
747     "Given a string, returns a new string that is a copy of it with all
748     upper case alphabetic characters converted to lowercase. Casing is
749     :simple or :full for simple or full case conversion, respectively."
750     (declare (fixnum start))
751     #-unicode
752     (string-downcase-simple string :start start :end end)
753     #+unicode
754     (if (eq casing :simple)
755     (string-downcase-simple string :start start :end end)
756     (string-downcase-full string :start start :end end)))
757    
758     (defun string-capitalize-simple (string &key (start 0) end)
759 ram 1.1 (declare (fixnum start))
760 ram 1.5 (let* ((string (if (stringp string) string (string string)))
761     (slen (length string)))
762 ram 1.4 (declare (fixnum slen))
763 ram 1.1 (with-one-string string start end offset
764     (let ((offset-slen (+ slen offset))
765     (newstring (make-string slen)))
766     (declare (fixnum offset-slen))
767     (do ((index offset (1+ index))
768     (new-index 0 (1+ new-index)))
769     ((= index start))
770     (declare (fixnum index new-index))
771     (setf (schar newstring new-index) (schar string index)))
772     (do ((index start (1+ index))
773     (new-index (- start offset) (1+ new-index))
774     (newword t)
775     (char ()))
776     ((= index (the fixnum end)))
777     (declare (fixnum index new-index))
778     (setq char (schar string index))
779     (cond ((not (alphanumericp char))
780     (setq newword t))
781     (newword
782     ;;char is first case-modifiable after non-case-modifiable
783 rtoy 1.13 (setq char (char-titlecase char))
784 ram 1.1 (setq newword ()))
785     ;;char is case-modifiable, but not first
786     (t (setq char (char-downcase char))))
787     (setf (schar newstring new-index) char))
788     (do ((index end (1+ index))
789     (new-index (- (the fixnum end) offset) (1+ new-index)))
790     ((= index offset-slen))
791     (declare (fixnum index new-index))
792     (setf (schar newstring new-index) (schar string index)))
793     newstring))))
794    
795 rtoy 1.13 (defun string-capitalize-full (string &key (start 0) end)
796     (declare (fixnum start))
797     (let* ((string (if (stringp string) string (string string)))
798     (slen (length string)))
799     (declare (fixnum slen))
800     (with-output-to-string (s)
801     (with-one-string string start end offset
802     (let ((offset-slen (+ slen offset)))
803     (declare (fixnum offset-slen))
804     (write-string string s :start offset :end start)
805     (flet ((alphanump (m)
806     (or (< 47 m 58) (< 64 m 91) (< 96 m 123)
807     #+(and unicode (not unicode-bootstrap))
808     (and (> m 127)
809     (<= +unicode-category-letter+ (unicode-category m)
810     (+ +unicode-category-letter+ #x0F))))))
811     (do ((index start (1+ index))
812     (newword t))
813     ((= index (the fixnum end)))
814     (declare (fixnum index))
815     (multiple-value-bind (code wide)
816     (codepoint string index)
817     (when wide (incf index))
818     (cond ((not (alphanump code))
819     (multiple-value-bind (hi lo)
820     (surrogates code)
821     (write-char hi s)
822     (when lo (write-char lo s)))
823     (setq newword t))
824     (newword
825     ;;char is first case-modifiable after non-case-modifiable
826     (write-string (unicode-full-case-title code) s)
827     (setq newword ()))
828     ;;char is case-modifiable, but not first
829     (t
830     (write-string (unicode-full-case-lower code) s))))))
831     (write-string string s :start end :end offset-slen))))))
832    
833     (defun string-capitalize (string &key (start 0) end #+unicode (casing :simple))
834     #-unicode
835     "Given a string, returns a copy of the string with the first
836     character of each ``word'' converted to upper-case, and remaining
837     chars in the word converted to lower case. A ``word'' is defined
838     to be a string of case-modifiable characters delimited by
839     non-case-modifiable chars."
840     #+unicode
841     "Given a string, returns a copy of the string with the first
842     character of each ``word'' converted to upper-case, and remaining
843     chars in the word converted to lower case. A ``word'' is defined
844     to be a string of case-modifiable characters delimited by
845     non-case-modifiable chars. Casing is :simple or :full for
846     simple or full case conversion, respectively."
847    
848     (declare (fixnum start))
849     #-unicode
850     (string-capitalize-simple string :start start :end end)
851     #+unicode
852     (if (eq casing :simple)
853     (string-capitalize-simple string :start start :end end)
854     (string-capitalize-full string :start start :end end)))
855    
856 ram 1.1 (defun nstring-upcase (string &key (start 0) end)
857     "Given a string, returns that string with all lower case alphabetic
858     characters converted to uppercase."
859     (declare (fixnum start))
860 ram 1.4 (let ((save-header string))
861 ram 1.1 (with-one-string string start end offset
862     (do ((index start (1+ index)))
863     ((= index (the fixnum end)))
864     (declare (fixnum index))
865 rtoy 1.13 (multiple-value-bind (code wide) (codepoint string index)
866     (declare (ignore wide))
867     ;; Handle ASCII specially because this is called early in
868     ;; initialization, before unidata is available.
869     (cond ((< 96 code 123) (decf code 32))
870     ((> code 127) (setq code (unicode-upper code))))
871     ;;@@ WARNING: this may, in theory, need to extend string
872     ;; (which, obviously, we can't do here. Unless
873     ;; STRING is adjustable, maybe)
874     ;; but that never actually occurs as of Unicode 5.1.0,
875     ;; so I'm just going to ignore it for now...
876     (multiple-value-bind (hi lo) (surrogates code)
877     (setf (schar string index) hi)
878     (when lo
879     (setf (schar string (incf index)) lo))))))
880 ram 1.1 save-header))
881    
882     (defun nstring-downcase (string &key (start 0) end)
883     "Given a string, returns that string with all upper case alphabetic
884     characters converted to lowercase."
885     (declare (fixnum start))
886 ram 1.4 (let ((save-header string))
887 ram 1.1 (with-one-string string start end offset
888     (do ((index start (1+ index)))
889     ((= index (the fixnum end)))
890     (declare (fixnum index))
891 rtoy 1.13 (multiple-value-bind (code wide) (codepoint string index)
892     (declare (ignore wide))
893     (cond ((< 64 code 91) (incf code 32))
894     ((> code 127) (setq code (unicode-lower code))))
895     ;;@@ WARNING: this may, in theory, need to extend string
896     ;; (which, obviously, we can't do here. Unless
897     ;; STRING is adjustable, maybe)
898     ;; but that never actually occurs as of Unicode 5.1.0,
899     ;; so I'm just going to ignore it for now...
900     (multiple-value-bind (hi lo) (surrogates code)
901     (setf (schar string index) hi)
902     (when lo
903     (setf (schar string (incf index)) lo))))))
904 wlott 1.6 save-header))
905 ram 1.1
906     (defun nstring-capitalize (string &key (start 0) end)
907     "Given a string, returns that string with the first
908     character of each ``word'' converted to upper-case, and remaining
909     chars in the word converted to lower case. A ``word'' is defined
910     to be a string of case-modifiable characters delimited by
911     non-case-modifiable chars."
912     (declare (fixnum start))
913 ram 1.4 (let ((save-header string))
914 ram 1.1 (with-one-string string start end offset
915     (do ((index start (1+ index))
916     (newword t)
917     (char ()))
918     ((= index (the fixnum end)))
919     (declare (fixnum index))
920     (setq char (schar string index))
921     (cond ((not (alphanumericp char))
922     (setq newword t))
923     (newword
924     ;;char is first case-modifiable after non-case-modifiable
925 rtoy 1.13 (setf (schar string index) (char-titlecase char))
926 ram 1.1 (setq newword ()))
927     (t
928     (setf (schar string index) (char-downcase char))))))
929     save-header))
930    
931 rtoy 1.13
932     #+unicode
933     (progn
934     ;; Like string-left-trim, but return the index
935     (defun string-left-trim-index (char-bag string)
936     (with-string string
937     (if (stringp char-bag)
938     ;; When char-bag is a string, we try to do the right thing.
939     ;; Convert char-bag to a list of codepoints and compare the
940     ;; codepoints in the string with this.
941     (let ((code-bag (with-string char-bag
942     (do ((index start (1+ index))
943     (result nil))
944     ((= index end)
945     (nreverse result))
946     (multiple-value-bind (c widep)
947     (codepoint char-bag index)
948     (push c result)
949     (when widep (incf index)))))))
950     (do ((index start (1+ index)))
951     ((= index (the fixnum end))
952     end)
953     (declare (fixnum index))
954     (multiple-value-bind (c widep)
955     (codepoint string index)
956     (unless (find c code-bag)
957     (return-from string-left-trim-index index))
958     (when widep (incf index)))))
959     ;; When char-bag is a list, we just look at each codepoint of
960     ;; STRING to see if it's in char-bag. If char-bag contains a
961     ;; surrogate, we could accidentally trim off a surrogate,
962     ;; leaving an invalid UTF16 string.
963     (do ((index start (1+ index)))
964     ((= index (the fixnum end))
965     end)
966     (declare (fixnum index))
967     (multiple-value-bind (c widep)
968     (codepoint string index)
969     (unless (find c char-bag :key #'char-code)
970     (return-from string-left-trim-index index))
971     (when widep (incf index)))))))
972    
973     (defun string-left-trim (char-bag string)
974     "Given a set of characters (a list or string) and a string, returns
975     a copy of the string with the characters in the set removed from the
976     left end. If the set of characters is a string, surrogates will be
977     properly handled."
978     (let ((begin (string-left-trim-index char-bag string)))
979     (with-string string
980     (subseq string begin end))))
981    
982     (defun string-right-trim-index (char-bag string)
983     (with-string string
984     (if (stringp char-bag)
985     ;; When char-bag is a string, we try to do the right thing
986     ;; with surrogates. Convert char-bag to a list of codepoints
987     ;; and compare the codepoints in the string with this.
988     (let ((code-bag (with-string char-bag
989     (do ((index start (1+ index))
990     (result nil))
991     ((= index end)
992     result)
993     (multiple-value-bind (c widep)
994     (codepoint char-bag index)
995     (push c result)
996     (when widep (incf index)))))))
997     (do ((index (1- end) (1- index)))
998     ((< index start)
999     start)
1000     (declare (fixnum index))
1001     (multiple-value-bind (c widep)
1002     (codepoint string index)
1003     (unless (find c code-bag)
1004     (return-from string-right-trim-index (1+ index)))
1005     (when widep (decf index)))))
1006     ;; When char-bag is a list, we just look at each codepoint of
1007     ;; STRING to see if it's in char-bag. If char-bag contains a
1008     ;; surrogate, we could accidentally trim off a surrogate,
1009     ;; leaving an invalid UTF16 string.
1010     (do ((index (1- end) (1- index)))
1011     ((< index start)
1012     start)
1013     (declare (fixnum index))
1014     (multiple-value-bind (c widep)
1015     (codepoint string index)
1016     (unless (find c char-bag :key #'char-code)
1017     (return-from string-right-trim-index (1+ index)))
1018     (when widep (decf index)))))))
1019    
1020     (defun string-right-trim (char-bag string)
1021     "Given a set of characters (a list or string) and a string, returns
1022     a copy of the string with the characters in the set removed from the
1023     right end. If the set of characters is a string, surrogates will be
1024     properly handled."
1025     (let ((stop (string-right-trim-index char-bag string)))
1026     (with-string string
1027     (subseq string start stop))))
1028    
1029     (defun string-trim (char-bag string)
1030     "Given a set of characters (a list or string) and a string, returns a
1031     copy of the string with the characters in the set removed from both
1032     ends. If the set of characters is a string, surrogates will be
1033     properly handled."
1034     (let ((left-end (string-left-trim-index char-bag string))
1035     (right-end (string-right-trim-index char-bag string)))
1036     (with-string string
1037     (subseq (the simple-string string) left-end right-end))))
1038     ) ; end unicode version
1039    
1040     #-unicode
1041     (progn
1042 ram 1.1 (defun string-left-trim (char-bag string)
1043     "Given a set of characters (a list or string) and a string, returns
1044     a copy of the string with the characters in the set removed from the
1045     left end."
1046     (with-string string
1047     (do ((index start (1+ index)))
1048     ((or (= index (the fixnum end))
1049     (not (find (schar string index) char-bag)))
1050     (subseq (the simple-string string) index end))
1051     (declare (fixnum index)))))
1052    
1053     (defun string-right-trim (char-bag string)
1054     "Given a set of characters (a list or string) and a string, returns
1055     a copy of the string with the characters in the set removed from the
1056     right end."
1057     (with-string string
1058     (do ((index (1- (the fixnum end)) (1- index)))
1059     ((or (< index start) (not (find (schar string index) char-bag)))
1060     (subseq (the simple-string string) start (1+ index)))
1061     (declare (fixnum index)))))
1062    
1063     (defun string-trim (char-bag string)
1064     "Given a set of characters (a list or string) and a string, returns a
1065     copy of the string with the characters in the set removed from both
1066     ends."
1067     (with-string string
1068     (let* ((left-end (do ((index start (1+ index)))
1069     ((or (= index (the fixnum end))
1070     (not (find (schar string index) char-bag)))
1071     index)
1072     (declare (fixnum index))))
1073     (right-end (do ((index (1- (the fixnum end)) (1- index)))
1074     ((or (< index left-end)
1075     (not (find (schar string index) char-bag)))
1076     (1+ index))
1077     (declare (fixnum index)))))
1078     (subseq (the simple-string string) left-end right-end))))
1079 rtoy 1.13 ) ; non-unicode version
1080    
1081     (declaim (inline %glyph-f %glyph-b))
1082     (defun %glyph-f (string index)
1083     (declare (optimize (speed 3) (space 0) (safety 0) (debug 0))
1084     (type simple-string string) (type kernel:index index))
1085     (let* ((prev 0)
1086     (l (length string))
1087     (c (codepoint string index l))
1088     (n (+ index (if (> c #xFFFF) 2 1))))
1089     (declare (type codepoint c) (type kernel:index l n))
1090     (loop while (< n l) do
1091     (let* ((c (codepoint string n l))
1092     (d (the (unsigned-byte 8) (unicode-combining-class c))))
1093     (when (or (zerop d) (< d prev))
1094     (return))
1095     (setq prev d)
1096     (incf n (if (> c #xFFFF) 2 1))))
1097     n))
1098    
1099     (defun %glyph-b (string index)
1100     (declare (optimize (speed 3) (space 0) (safety 0) (debug 0))
1101     (type simple-string string) (type kernel:index index))
1102     (let* ((prev 255)
1103     (n (1- index)))
1104     (declare (type kernel:index n))
1105     (loop until (< n 0) do
1106     (let* ((c (codepoint string n 0))
1107     (d (the (unsigned-byte 8) (unicode-combining-class c))))
1108     (cond ((zerop d) (return))
1109     ((> d prev) (incf n (if (> c #xFFFF) 2 1)) (return)))
1110     (setq prev d)
1111     (decf n (if (> c #xFFFF) 2 1))))
1112     n))
1113    
1114     (defun glyph (string index &key (from-end nil))
1115     "GLYPH returns the glyph at the indexed position in a string, and the
1116     position of the next glyph (or NIL) as a second value. A glyph is
1117     a substring consisting of the character at INDEX followed by all
1118     subsequent combining characters."
1119     (declare (type simple-string string) (type kernel:index index))
1120     #-unicode
1121     (char string index)
1122     #+unicode
1123     (with-array-data ((string string) (start) (end))
1124     (declare (ignore start end))
1125     (let ((n (if from-end (%glyph-b string index) (%glyph-f string index))))
1126     (if from-end
1127     (values (subseq string n index) (and (> n 0) n))
1128     (values (subseq string index n) (and (< n (length string)) n))))))
1129    
1130     (defun sglyph (string index &key (from-end nil))
1131     "SGLYPH returns the glyph at the indexed position, the same as GLYPH,
1132     except that the string must be a simple-string"
1133     (declare (type simple-string string) (type kernel:index index))
1134     #-unicode
1135     (schar string index)
1136     #+unicode
1137     (let ((n (if from-end (%glyph-b string index) (%glyph-f string index))))
1138     (if from-end
1139     (values (subseq string n index) (and (> n 0) n))
1140     (values (subseq string index n) (and (< n (length string)) n)))))
1141    
1142     #+unicode
1143     (defun string-reverse* (sequence)
1144     (declare (optimize (speed 3) (space 0) (safety 0))
1145     (type string sequence))
1146     (with-string sequence
1147     (let* ((length (- end start))
1148     (string (make-string length))
1149     (j length))
1150     (declare (type kernel:index length j))
1151     (loop for i = start then n as n = (%glyph-f sequence i) do
1152     (replace string sequence :start1 (decf j (- n i)) :start2 i :end2 n)
1153     while (< n end))
1154     string)))
1155    
1156     #+unicode
1157     (defun string-nreverse* (sequence)
1158     (declare (optimize (speed 3) (space 0) (safety 0))
1159     (type string sequence))
1160     (with-string sequence
1161     (flet ((rev (start end)
1162     (do ((i start (1+ i))
1163     (j (1- end) (1- j)))
1164     ((>= i j))
1165     (declare (type kernel:index i j))
1166     (rotatef (schar sequence i) (schar sequence j)))))
1167     (let ((len end))
1168     (loop for i = start then n as n = (%glyph-f sequence i) do
1169     (rev i n) while (< n len))
1170     (rev start end))))
1171     sequence)
1172    
1173    
1174    
1175    
1176     (defun decompose (string &optional (compatibility t))
1177     (declare (type string string))
1178     (let ((result (make-string (cond ((< (length string) 40)
1179     (* 5 (length string)))
1180     ((< (length string) 4096)
1181     (* 2 (length string)))
1182     (t (round (length string) 5/6)))))
1183     (fillptr 0))
1184     (declare (type kernel:index fillptr))
1185     (labels ((rec (string start end)
1186     (declare (type simple-string string))
1187     (do ((i start (1+ i)))
1188     ((= i end))
1189     (declare (type kernel:index i))
1190     (multiple-value-bind (code wide) (codepoint string i)
1191     (when wide (incf i))
1192     (let ((decomp (unicode-decomp code compatibility)))
1193     (if decomp (rec decomp 0 (length decomp)) (out code))))))
1194     (out (code)
1195     (multiple-value-bind (hi lo) (surrogates code)
1196     (outch hi)
1197     (when lo
1198     (outch lo))
1199     (let ((cc (unicode-combining-class code)))
1200     (unless (zerop cc)
1201     (order lo cc (- fillptr (if lo 3 2)))))))
1202     (outch (char)
1203     (when (= fillptr (length result))
1204     (let ((tmp (make-string (round (length result) 5/6))))
1205     (replace tmp result)
1206     (setq result tmp)))
1207     (setf (schar result fillptr) char)
1208     (incf fillptr))
1209     (order (wide1 cc last)
1210     (loop until (minusp last) do
1211     (multiple-value-bind (code2 wide2) (codepoint result last)
1212     (let ((cc2 (unicode-combining-class code2)))
1213     (cond ((zerop cc2) (return))
1214     ((> cc2 cc)
1215     (case (+ (if wide2 2 0) (if wide1 1 0))
1216     (0 (rotatef (schar result last)
1217     (schar result (1+ last))))
1218     (1 (rotatef (schar result last)
1219     (schar result (+ last 1))
1220     (schar result (+ last 2))))
1221     (2 (rotatef (schar result last)
1222     (schar result (1- last))
1223     (schar result (1+ last))))
1224     (3 (rotatef (schar result last)
1225     (schar result (+ last 2)))
1226     (rotatef (schar result (1- last))
1227     (schar result (1+ last)))))
1228     (decf last (if wide2 2 1)))
1229     (t (return))))))))
1230     (with-string string
1231     (rec string start end))
1232     (shrink-vector result fillptr))))
1233    
1234     (declaim (inline normalized-codepoint-p))
1235     (defun normalized-codepoint-p (cp form)
1236     (ecase form
1237     (:nfc (unicode-nfc-qc cp))
1238     (:nfkc (unicode-nfkc-qc cp))
1239     (:nfd (unicode-nfd-qc cp))
1240     (:nfkd (unicode-nfkd-qc cp))))
1241    
1242     ;; Perform check to see if string is already normalized. The Unicode
1243     ;; example can return YES, NO, or MAYBE. For our purposes, only YES
1244     ;; is important, for which we return T. For NO or MAYBE, we return NIL.
1245     (defun normalized-form-p (string &optional (form :nfc))
1246     (declare (type (member :nfc :nfkc :nfd :nfkd) form)
1247     (optimize (speed 3)))
1248     (with-string string
1249     (let ((last-class 0))
1250     (declare (type (integer 0 256) last-class))
1251     (do ((k start (1+ k)))
1252     ((>= k end))
1253     (declare (type kernel:index k))
1254     (multiple-value-bind (ch widep)
1255     (codepoint string k end)
1256     (when widep (incf k))
1257     ;; Handle ASCII specially
1258     (unless (< ch 128)
1259     (let ((class (unicode-combining-class ch)))
1260     (declare (type (unsigned-byte 8) class))
1261     (when (and (> last-class class) (not (zerop class)))
1262     ;; Definitely not normalized
1263     (return-from normalized-form-p nil))
1264     (let ((check (normalized-codepoint-p ch form)))
1265     (unless (eq check :y)
1266     (return-from normalized-form-p nil)))
1267     (setf last-class class)))))
1268     t)))
1269    
1270    
1271     ;; Compose a string in place. The string must already be in decomposed form.
1272     (defun %compose (target)
1273     (declare (type string target)
1274     (optimize (speed 3)))
1275     (let ((len (length target))
1276     (starter-pos 0))
1277     (declare (type kernel:index starter-pos))
1278     (multiple-value-bind (starter-ch wide)
1279     (codepoint target 0 len)
1280     (let ((comp-pos (if wide 2 1))
1281     (last-class (unicode-combining-class starter-ch)))
1282     (declare (type (integer 0 256) last-class)
1283     (type kernel:index comp-pos))
1284     (unless (zerop last-class)
1285     ;; Fix for strings starting with a combining character
1286     (setf last-class 256))
1287     ;; Loop on decomposed characters, combining where possible
1288     (do ((decomp-pos comp-pos (1+ decomp-pos)))
1289     ((>= decomp-pos len))
1290     (declare (type kernel:index decomp-pos))
1291     (multiple-value-bind (ch wide)
1292     (codepoint target decomp-pos len)
1293     (when wide (incf decomp-pos))
1294     (let ((ch-class (unicode-combining-class ch))
1295     (composite (get-pairwise-composition starter-ch ch)))
1296     (declare (type (integer 0 256) ch-class))
1297     (cond ((and composite
1298     (or (< last-class ch-class) (zerop last-class)))
1299     ;; Don't have to worry about surrogate pairs here
1300     ;; because the composite is always in the BMP.
1301     (setf (aref target starter-pos) (code-char composite))
1302     (setf starter-ch composite))
1303     (t
1304     (when (zerop ch-class)
1305     (setf starter-pos comp-pos)
1306     (setf starter-ch ch))
1307     (setf last-class ch-class)
1308     (multiple-value-bind (hi lo)
1309     (surrogates ch)
1310     (setf (aref target comp-pos) hi)
1311     (when lo
1312     (incf comp-pos)
1313     (setf (aref target comp-pos) lo))
1314     (incf comp-pos)))))))
1315     (shrink-vector target comp-pos)))))
1316    
1317     (defun string-to-nfd (string)
1318     "Convert String to Unicode Normalization Form D (NFD) using the
1319     canonical decomposition. The NFD string is returned"
1320     (decompose string nil))
1321    
1322     (defun string-to-nfkd (string)
1323     "Convert String to Unicode Normalization Form KD (NFKD) uisng the
1324     compatible decomposition form. The NFKD string is returned."
1325     (decompose string t))
1326    
1327     #+unicode
1328     (defun string-to-nfc (string)
1329     "Convert String to Unicode Normalization Form C (NFC). If the
1330     string a simple string and is already normalized, the original
1331     string is returned."
1332     (if (normalized-form-p string :nfc)
1333     (if (simple-string-p string) string (coerce string 'simple-string))
1334     (coerce (if (normalized-form-p string :nfd)
1335     (%compose (copy-seq string))
1336     (%compose (string-to-nfd string)))
1337     'simple-string)))
1338    
1339     #-unicode ;; Needed by package.lisp
1340     (defun string-to-nfc (string)
1341     (if (simple-string-p string) string (coerce string 'simple-string)))
1342    
1343     (defun string-to-nfkc (string)
1344     "Convert String to Unicode Normalization Form KC (NFKC). If the
1345     string is a simple string and is already normalized, the original
1346     string is returned."
1347     (if (normalized-form-p string :nfkc)
1348     (if (simple-string-p string) string (coerce string 'simple-string))
1349     (coerce (if (normalized-form-p string :nfkd)
1350     (%compose (copy-seq string))
1351     (%compose (string-to-nfkd string)))
1352     'simple-string)))

  ViewVC Help
Powered by ViewVC 1.1.5