/[cmucl]/src/code/string.lisp
ViewVC logotype

Diff of /src/code/string.lisp

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.12 by emarsden, Fri Apr 11 15:41:59 2003 UTC revision 1.12.30.31 by rtoy, Tue Jun 9 14:53:13 2009 UTC
# Line 16  Line 16 
16  ;;; ****************************************************************  ;;; ****************************************************************
17  ;;;  ;;;
18  (in-package "LISP")  (in-package "LISP")
19  (export '(char schar string  (export '(char schar glyph sglyph string
20            string= string-equal string< string> string<= string>= string/=            string= string-equal string< string> string<= string>= string/=
21            string-lessp string-greaterp string-not-lessp string-not-greaterp            string-lessp string-greaterp string-not-lessp string-not-greaterp
22            string-not-equal            string-not-equal
23              string-to-nfd string-to-nfkd string-to-nfc string-to-nfkc
24            make-string            make-string
25            string-trim string-left-trim string-right-trim            string-trim string-left-trim string-right-trim
26            string-upcase            string-upcase
# Line 27  Line 28 
28            nstring-capitalize))            nstring-capitalize))
29    
30    
31    (declaim (inline surrogatep surrogates-to-codepoint codepoint surrogates))
32    
33    (defun surrogatep (c &optional surrogate-type)
34      "Test if C is a surrogate.  C may be either an integer or a
35      character. Surrogate-type indicates what kind of surrogate to test
36      for.  :High means to test for the high (leading) surrogate; :Low
37      tests for the low (trailing surrogate).  A value of :Any or Nil
38      tests for any surrogate value (high or low)."
39      (declare (type (or character codepoint) c))
40      (let ((code (if (characterp c)
41                      (char-code c)
42                      c)))
43        (ecase surrogate-type
44          ((:high :leading)
45           ;; Test for high surrogate
46           (<= #xD800 code #xDBFF))
47          ((:low :trailing)
48           ;; Test for low surrogate
49           (<= #xDC00 code #xDFFF))
50          ((:any nil)
51           ;; Test for any surrogate
52           (<= #xD800 code #xDFFF)))))
53    
54    (defun surrogates-to-codepoint (hi lo)
55      "Convert the given Hi and Lo surrogate characters to the
56      corresponding codepoint value"
57      (declare (type character hi lo))
58      (+ (ash (- (the (integer #xD800 #xDBFF) (char-code hi)) #xD800) 10)
59         (the (integer #xDC00 #xDFFF) (char-code lo)) #x2400))
60    
61    (defun codepoint (string i &optional (end (length string)))
62      "Return the codepoint value from String at position I.  If that
63      position is a surrogate, it is combined with either the previous or
64      following character (when possible) to compute the codepoint.  The
65      second return value is NIL if the position is not a surrogate pair.
66      Otherwise +1 or -1 is returned if the position is the high or low
67      surrogate value, respectively."
68      (declare (type simple-string string) (type kernel:index i end))
69      (let ((code (char-code (schar string i))))
70        (cond ((and (surrogatep code :high) (< (1+ i) end))
71               (let ((tmp (char-code (schar string (1+ i)))))
72                 (if (surrogatep tmp :low)
73                     (values (+ (ash (- code #xD800) 10) tmp #x2400) +1)
74                     (values code nil))))
75              ((and (surrogatep code :low) (> i 0))
76               (let ((tmp (char-code (schar string (1- i)))))
77                 (if (surrogatep tmp :high)
78                     (values (+ (ash (- tmp #xD800) 10) code #x2400) -1)
79                     (values code nil))))
80              (t (values code nil)))))
81    
82    (defun surrogates (codepoint)
83      "Return the high and low surrogate characters for Codepoint.  If
84      Codepoint is in the BMP, the first return value is the corresponding
85      character and the second is NIL."
86      (declare (type codepoint codepoint))
87      (if (< codepoint #x10000)
88          (values (code-char codepoint) nil)
89          (let* ((tmp (- codepoint #x10000))
90                 (hi (logior (ldb (byte 10 10) tmp) #xD800))
91                 (lo (logior (ldb (byte 10 0) tmp) #xDC00)))
92            (values (code-char hi) (code-char lo)))))
93    
94    (defun (setf codepoint) (codepoint string i)
95      "Set the codepoint at string position I to the Codepoint.  If the
96      codepoint requires a surrogate pair, the high (leading surrogate) is
97      stored at position I and the low (trailing) surrogate is stored at
98      I+1"
99      (declare (type codepoint codepoint)
100               (type simple-string string))
101      (let ((widep nil))
102        (multiple-value-bind (hi lo)
103            (surrogates codepoint)
104          (setf (aref string i) hi)
105          (when lo
106            (setf (aref string (1+ i)) lo)
107            (setf widep t)))
108        (values codepoint widep)))
109    
110    (defun utf16-string-p (string)
111      "Check if String is a valid UTF-16 string.  If the string is valid,
112      T is returned.  If the string is not valid, NIL is returned, and the
113      second value is the index into the string of the invalid character.
114      A string is also invalid if it contains any unassigned codepoints."
115      (do ((len (length string))
116           (index 0 (1+ index)))
117          ((>= index len)
118           t)
119        (multiple-value-bind (codepoint wide)
120            (codepoint string index)
121          ;; We step through the string in order.  If there are any
122          ;; surrogates pairs, we must reach the lead surrogate first,
123          ;; which means WIDE is +1.  Otherwise, we have an invalid
124          ;; surrogate pair.  If we get any codepoint that is in the
125          ;; surrogate range, we also have an invalid string.  An
126          ;; unassigned codepoint is also considered invalid.
127          (when (or (eq wide -1)
128                    (surrogatep codepoint)
129                    (lisp::unicode-assigned-codepoint-p codepoint))
130            (return-from utf16-string-p (values nil index)))
131          (when wide (incf index)))))
132    
133  (defun string (X)  (defun string (X)
134    "Coerces X into a string.  If X is a string, X is returned.  If X is a    "Coerces X into a string.  If X is a string, X is returned.  If X is a
135     symbol, X's pname is returned.  If X is a character then a one element    symbol, X's pname is returned.  If X is a character then a one element
136     string containing that character is returned.  If X cannot be coerced    string containing that character is returned.  If X cannot be coerced
137     into a string, an error occurs."    into a string, an error occurs."
138    (cond ((stringp x) x)    (cond ((stringp x) x)
139          ((symbolp x) (symbol-name x))          ((symbolp x) (symbol-name x))
140          ((characterp x)          ((characterp x)
# Line 106  Line 209 
209    
210  (defun schar (string index)  (defun schar (string index)
211    "SCHAR returns the character object at an indexed position in a string    "SCHAR returns the character object at an indexed position in a string
212     just as CHAR does, except the string must be a simple-string."    just as CHAR does, except the string must be a simple-string."
213    (declare (optimize (safety 1)))    (declare (optimize (safety 1)))
214    (schar string index))    (schar string index))
215    
# Line 143  Line 246 
246                         (the fixnum end2))                         (the fixnum end2))
247                      ,(if lessp                      ,(if lessp
248                           `nil                           `nil
249                         `(- (the fixnum index) ,offset1)))                           `(- (the fixnum index) ,offset1)))
250                       #-unicode
251                     ((,(if lessp 'char< 'char>)                     ((,(if lessp 'char< 'char>)
252                       (schar string1 index)                       (schar string1 index)
253                       (schar string2 (+ (the fixnum index) (- start2 start1))))                       (schar string2 (+ (the fixnum index) (- start2 start1))))
254                      (- (the fixnum index) ,offset1))                      (- (the fixnum index) ,offset1))
255                     (t nil))                     #-unicode
256                       (t nil)
257                       #+unicode
258                       (t
259                        ;; Compare in code point order.  See
260                        ;; http://icu-project.org/docs/papers/utf16_code_point_order.html
261                        (flet ((fixup (code)
262                                 (if (>= code #xe000)
263                                     (- code #x800)
264                                     (+ code #x2000))))
265                          (declare (inline fixup))
266                          (let* ((c1 (char-code (schar string1 index)))
267                                 (c2 (char-code (schar string2
268                                                       (+ (the fixnum index)
269                                                          (- start2 start1))))))
270                            (cond ((and (>= c1 #xd800)
271                                        (>= c2 #xd800))
272                                   (let ((fix-c1 (fixup c1))
273                                         (fix-c2 (fixup c2)))
274                                     (if (,(if lessp '< '>) fix-c1 fix-c2)
275                                         (- (the fixnum index) ,offset1)
276                                         nil)))
277                                  (t
278                                   (if (,(if lessp '< '>) c1 c2)
279                                       (- (the fixnum index) ,offset1)
280                                       nil)))))))
281               ,(if equalp `(- (the fixnum end1) ,offset1) 'nil))))))               ,(if equalp `(- (the fixnum end1) ,offset1) 'nil))))))
282  ) ; eval-when  ) ; eval-when
283    
# Line 232  Line 361 
361                                      (schar string2 index2)))                                      (schar string2 index2)))
362                     (return ,abort-value)))))))                     (return ,abort-value)))))))
363    
364    #+unicode
365    (defmacro handle-case-fold-equal (f &body body)
366      `(if (eq casing :simple)
367           ,@body
368           (let* ((s1 (case-fold string1 start1 end1))
369                  (s2 (case-fold string1 start2 end2)))
370             (,f s1 s2))))
371    
372    #-unicode
373    (defmacro handle-case-fold-equal (f &body body)
374      (declare (ignore f))
375      `(progn ,@body))
376    
377  ) ; eval-when  ) ; eval-when
378    
379  (defun string-equal (string1 string2 &key (start1 0) end1 (start2 0) end2)  #+unicode
380    (defun case-fold (string start end)
381      ;; Create a new string performing full case folding of String
382      (with-output-to-string (s)
383        (with-one-string string start end offset
384          (do ((index offset (1+ index)))
385              ((>= index end))
386            (multiple-value-bind (code widep)
387                (codepoint string index)
388              (when widep (incf index))
389              (write-string (unicode-case-fold-full code) s))))))
390    
391    (defun string-equal (string1 string2 &key (start1 0) end1 (start2 0) end2 #+unicode (casing :simple))
392    "Given two strings (string1 and string2), and optional integers start1,    "Given two strings (string1 and string2), and optional integers start1,
393    start2, end1 and end2, compares characters in string1 to characters in    start2, end1 and end2, compares characters in string1 to characters in
394    string2 (using char-equal)."    string2 (using char-equal)."
395    (declare (fixnum start1 start2))    (declare (fixnum start1 start2))
396    (with-two-strings string1 string2 start1 end1 offset1 start2 end2    (handle-case-fold-equal string-equal
397      (let ((slen1 (- (the fixnum end1) start1))      (with-two-strings string1 string2 start1 end1 offset1 start2 end2
398            (slen2 (- (the fixnum end2) start2)))         (let ((slen1 (- (the fixnum end1) start1))
399        (declare (fixnum slen1 slen2))               (slen2 (- (the fixnum end2) start2)))
400        (if (or (minusp slen1) (minusp slen2))           (declare (fixnum slen1 slen2))
401            ;;prevent endless looping later.           (if (or (minusp slen1) (minusp slen2))
402            (error "Improper bounds for string comparison."))               ;;prevent endless looping later.
403        (if (= slen1 slen2)               (error "Improper bounds for string comparison."))
404            ;;return () immediately if lengths aren't equal.           (if (= slen1 slen2)
405            (string-not-equal-loop 1 t nil)))))               ;;return () immediately if lengths aren't equal.
406                 (string-not-equal-loop 1 t nil))))))
407    
408  (defun string-not-equal (string1 string2 &key (start1 0) end1 (start2 0) end2)  (defun string-not-equal (string1 string2 &key (start1 0) end1 (start2 0) end2 #+unicode (casing :simple))
409    "Given two strings, if the first string is not lexicographically equal    "Given two strings, if the first string is not lexicographically equal
410    to the second string, returns the longest common prefix (using char-equal)    to the second string, returns the longest common prefix (using char-equal)
411    of the two strings. Otherwise, returns ()."    of the two strings. Otherwise, returns ()."
412    (with-two-strings string1 string2 start1 end1 offset1 start2 end2    (handle-case-fold-equal string-not-equal
413      (let ((slen1 (- end1 start1))      (with-two-strings string1 string2 start1 end1 offset1 start2 end2
414            (slen2 (- end2 start2)))        (let ((slen1 (- end1 start1))
415        (declare (fixnum slen1 slen2))              (slen2 (- end2 start2)))
416        (if (or (minusp slen1) (minusp slen2))          (declare (fixnum slen1 slen2))
417            ;;prevent endless looping later.          (if (or (minusp slen1) (minusp slen2))
418            (error "Improper bounds for string comparison."))              ;;prevent endless looping later.
419        (cond ((or (minusp slen1) (or (minusp slen2)))              (error "Improper bounds for string comparison."))
420               (error "Improper substring for comparison."))          (cond ((or (minusp slen1) (or (minusp slen2)))
421              ((= slen1 slen2)                 (error "Improper substring for comparison."))
422               (string-not-equal-loop 1 nil (- index1 offset1)))                ((= slen1 slen2)
423              ((< slen1 slen2)                 (string-not-equal-loop 1 nil (- index1 offset1)))
424               (string-not-equal-loop 1 (- index1 offset1)))                ((< slen1 slen2)
425              (t                 (string-not-equal-loop 1 (- index1 offset1)))
426               (string-not-equal-loop 2 (- index1 offset1)))))))                (t
427                   (string-not-equal-loop 2 (- index1 offset1))))))))
428    
429    
430    
# Line 281  Line 437 
437    (if lessp    (if lessp
438        (if equalp        (if equalp
439            ;; STRING-NOT-GREATERP            ;; STRING-NOT-GREATERP
440            (values '<= `(not (char-greaterp char1 char2)))            (values '<=
441                      #-unicode `(not (char-greaterp char1 char2))
442                      #+unicode `(<= char1 char2))
443            ;; STRING-LESSP            ;; STRING-LESSP
444            (values '< `(char-lessp char1 char2)))            (values '<
445                      #-unicode `(char-lessp char1 char2)
446                      #+unicode `(< char1 char2)))
447        (if equalp        (if equalp
448            ;; STRING-NOT-LESSP            ;; STRING-NOT-LESSP
449            (values '>= `(not (char-lessp char1 char2)))            (values '>=
450                      #-unicode `(not (char-lessp char1 char2))
451                      #+unicode `(>= char1 char2))
452            ;; STRING-GREATERP            ;; STRING-GREATERP
453            (values '> `(char-greaterp char1 char2)))))            (values '>
454                      #-unicode `(char-greaterp char1 char2)
455                      #+unicode `(> char1 char2)))))
456    
457    #-unicode
458  (defmacro string-less-greater-equal (lessp equalp)  (defmacro string-less-greater-equal (lessp equalp)
459    (multiple-value-bind (length-test character-test)    (multiple-value-bind (length-test character-test)
460                         (string-less-greater-equal-tests lessp equalp)                         (string-less-greater-equal-tests lessp equalp)
# Line 314  Line 479 
479                     (return (- index1 offset1))                     (return (- index1 offset1))
480                     (return ()))))))))                     (return ()))))))))
481    
482    ;; Convert to lowercase for case folding, to match what Unicode
483    ;; CaseFolding.txt says.  An example where this matters: U+1E9E maps
484    ;; to U+00DF.  But the uppercase version of U+00DF is U+00DF.
485    #+unicode
486    (defmacro equal-char-codepoint (codepoint)
487      `(let ((ch ,codepoint))
488         ;; Handle ASCII separately for bootstrapping and for unidata missing.
489         (if (< 64 ch 91)
490             (+ ch 32)
491             #-(and unicode (not unicode-bootstrap))
492             ch
493             #+(and unicode (not unicode-bootstrap))
494             (if (> ch 127) (unicode-case-fold-simple ch) ch))))
495    
496    #+unicode
497    (defmacro string-less-greater-equal (lessp equalp)
498      (multiple-value-bind (length-test character-test)
499          (string-less-greater-equal-tests lessp equalp)
500        `(with-two-strings string1 string2 start1 end1 offset1 start2 end2
501           (let ((slen1 (- (the fixnum end1) start1))
502                 (slen2 (- (the fixnum end2) start2)))
503             (declare (fixnum slen1 slen2))
504             (if (or (minusp slen1) (minusp slen2))
505                 ;;prevent endless looping later.
506                 (error "Improper bounds for string comparison."))
507             (do ((index1 start1 (1+ index1))
508                  (index2 start2 (1+ index2)))
509                 ((or (= index1 (the fixnum end1)) (= index2 (the fixnum end2)))
510                  (if (,length-test slen1 slen2) (- index1 offset1)))
511               (declare (fixnum index1 index2))
512               (multiple-value-bind (char1 wide1)
513                   (codepoint string1 index1)
514                 (declare (type codepoint char1))
515                 (multiple-value-bind (char2 wide2)
516                     (codepoint string2 index2)
517                   (declare (type codepoint char2))
518                   (setf char1 (equal-char-codepoint char1))
519                   (setf char2 (equal-char-codepoint char2))
520                   (if (= char1 char2)
521                       (progn
522                         (when wide1 (incf index1))
523                         (when wide2 (incf index2)))
524                       (if ,character-test
525                           (return (- index1 offset1))
526                           (return ()))))))))))
527    
528  ) ; eval-when  ) ; eval-when
529    
530  (defun string-lessp* (string1 string2 start1 end1 start2 end2)  (defun string-lessp* (string1 string2 start1 end1 start2 end2)
# Line 332  Line 543 
543    (declare (fixnum start1 start2))    (declare (fixnum start1 start2))
544    (string-less-greater-equal t t))    (string-less-greater-equal t t))
545    
546  (defun string-lessp (string1 string2 &key (start1 0) end1 (start2 0) end2)  
547    (eval-when (compile)
548    
549    #+unicode
550    (defmacro handle-case-folding (f)
551      `(if (eq casing :simple)
552          (,f string1 string2 start1 end1 start2 end2)
553          (let* ((s1 (case-fold string1 start1 end1))
554                 (s2 (case-fold string2 start2 end2))
555                 (result (,f s1 s2 0 (length s1) 0 (length s2))))
556            (when result
557              (+ result start1)))))
558    
559    #-unicode
560    (defmacro handle-case-folding (f)
561      `(,f string1 string2 start1 end1 start2 end2))
562    
563    ) ; compile
564    
565    (defun string-lessp (string1 string2 &key (start1 0) end1 (start2 0) end2 #+unicode (casing :simple))
566    "Given two strings, if the first string is lexicographically less than    "Given two strings, if the first string is lexicographically less than
567    the second string, returns the longest common prefix (using char-equal)    the second string, returns the longest common prefix (using char-equal)
568    of the two strings. Otherwise, returns ()."    of the two strings. Otherwise, returns ()."
569    (string-lessp* string1 string2 start1 end1 start2 end2))    (handle-case-folding string-lessp*))
570    
571  (defun string-greaterp (string1 string2 &key (start1 0) end1 (start2 0) end2)  (defun string-greaterp (string1 string2 &key (start1 0) end1 (start2 0) end2 #+unicode (casing :simple))
572    "Given two strings, if the first string is lexicographically greater than    "Given two strings, if the first string is lexicographically greater than
573    the second string, returns the longest common prefix (using char-equal)    the second string, returns the longest common prefix (using char-equal)
574    of the two strings. Otherwise, returns ()."    of the two strings. Otherwise, returns ()."
575    (string-greaterp* string1 string2 start1 end1 start2 end2))    (handle-case-folding string-greaterp*))
576    
577  (defun string-not-lessp (string1 string2 &key (start1 0) end1 (start2 0) end2)  (defun string-not-lessp (string1 string2 &key (start1 0) end1 (start2 0) end2 #+unicode (casing :simple))
578    "Given two strings, if the first string is lexicographically greater    "Given two strings, if the first string is lexicographically greater
579    than or equal to the second string, returns the longest common prefix    than or equal to the second string, returns the longest common prefix
580    (using char-equal) of the two strings. Otherwise, returns ()."    (using char-equal) of the two strings. Otherwise, returns ()."
581    (string-not-lessp* string1 string2 start1 end1 start2 end2))    (handle-case-folding string-not-lessp*))
582    
583  (defun string-not-greaterp (string1 string2 &key (start1 0) end1 (start2 0)  (defun string-not-greaterp (string1 string2 &key (start1 0) end1 (start2 0)
584                                      end2)                                      end2 #+unicode (casing :simple))
585    "Given two strings, if the first string is lexicographically less than    "Given two strings, if the first string is lexicographically less than
586    or equal to the second string, returns the longest common prefix    or equal to the second string, returns the longest common prefix
587    (using char-equal) of the two strings. Otherwise, returns ()."    (using char-equal) of the two strings. Otherwise, returns ()."
588    (string-not-greaterp* string1 string2 start1 end1 start2 end2))    (handle-case-folding string-not-greaterp*))
589    
590    
591  (defun make-string (count &key element-type ((:initial-element fill-char)))  (defun make-string (count &key element-type ((:initial-element fill-char)))
592    "Given a character count and an optional fill character, makes and returns    "Given a character count and an optional fill character, makes and returns
593     a new string Count long filled with the fill character."    a new string Count long filled with the fill character."
594    (declare (type fixnum count))    (declare (type fixnum count))
595    (assert (subtypep element-type 'character))    (assert (subtypep element-type 'character))
596    (if fill-char    (if fill-char
# Line 371  Line 601 
601          (setf (schar string i) fill-char))          (setf (schar string i) fill-char))
602        (make-string count)))        (make-string count)))
603    
604  (defun string-upcase (string &key (start 0) end)  (defun string-upcase-simple (string &key (start 0) end)
   "Given a string, returns a new string that is a copy of it with  
   all lower case alphabetic characters converted to uppercase."  
605    (declare (fixnum start))    (declare (fixnum start))
606    (let* ((string (if (stringp string) string (string string)))    (let* ((string (if (stringp string) string (string string)))
607           (slen (length string)))           (slen (length string)))
# Line 391  Line 619 
619               (new-index (- start offset) (1+ new-index)))               (new-index (- start offset) (1+ new-index)))
620              ((= index (the fixnum end)))              ((= index (the fixnum end)))
621            (declare (fixnum index new-index))            (declare (fixnum index new-index))
622            (setf (schar newstring new-index)            (multiple-value-bind (code wide) (codepoint string index)
623                  (char-upcase (schar string index))))              (when wide (incf index))
624                ;; Handle ASCII specially because this is called early in
625                ;; initialization, before unidata is available.
626                (cond ((< 96 code 123) (decf code 32))
627                      ((> code 127) (setq code (unicode-upper code))))
628                ;;@@ WARNING: this may, in theory, need to extend newstring
629                ;;  but that never actually occurs as of Unicode 5.1.0,
630                ;;  so I'm just going to ignore it for now...
631                (multiple-value-bind (hi lo) (surrogates code)
632                  (setf (schar newstring new-index) hi)
633                  (when lo
634                    (setf (schar newstring (incf new-index)) lo)))))
635            ;;@@ WARNING: see above
636          (do ((index end (1+ index))          (do ((index end (1+ index))
637               (new-index (- (the fixnum end) offset) (1+ new-index)))               (new-index (- (the fixnum end) offset) (1+ new-index)))
638              ((= index offset-slen))              ((= index offset-slen))
# Line 400  Line 640 
640            (setf (schar newstring new-index) (schar string index)))            (setf (schar newstring new-index) (schar string index)))
641          newstring))))          newstring))))
642    
643  (defun string-downcase (string &key (start 0) end)  (defun string-upcase-full (string &key (start 0) end)
644    "Given a string, returns a new string that is a copy of it with    (declare (fixnum start))
645    all upper case alphabetic characters converted to lowercase."    (let* ((string (if (stringp string) string (string string)))
646             (slen (length string)))
647        (declare (fixnum slen))
648        (with-output-to-string (s)
649          (with-one-string string start end offset
650            (let ((offset-slen (+ slen offset)))
651              (declare (fixnum offset-slen))
652              (write-string string s :start offset :end start)
653              (do ((index start (1+ index)))
654                  ((= index (the fixnum end)))
655                (declare (fixnum index))
656                (multiple-value-bind (code wide)
657                    (codepoint string index)
658                  (when wide (incf index))
659                  ;; Handle ASCII specially because this is called early in
660                  ;; initialization, before unidata is available.
661                  (cond ((< 96 code 123)
662                         (write-char (code-char (decf code 32)) s))
663                        ((> code 127)
664                         (write-string (unicode-full-case-upper code) s))
665                        (t
666                         (multiple-value-bind (hi lo)
667                             (surrogates code)
668                           (write-char hi s)
669                           (when lo
670                             (write-char lo s)))))))
671              (write-string string s :start end :end offset-slen))))))
672    
673    (defun string-upcase (string &key (start 0) end #+unicode (casing :simple))
674      "Given a string, returns a new string that is a copy of it with all
675      lower case alphabetic characters converted to uppercase.  If Casing
676      is :full, then Unicode full-casing operation is done."
677      (declare (fixnum start))
678      #-unicode
679      (string-upcase-simple string :start start :end end)
680      #+unicode
681      (if (eq casing :simple)
682          (string-upcase-simple string :start start :end end)
683          (string-upcase-full string :start start :end end)))
684    
685    (defun string-downcase-simple (string &key (start 0) end)
686    (declare (fixnum start))    (declare (fixnum start))
687    (let* ((string (if (stringp string) string (string string)))    (let* ((string (if (stringp string) string (string string)))
688           (slen (length string)))           (slen (length string)))
# Line 420  Line 700 
700               (new-index (- start offset) (1+ new-index)))               (new-index (- start offset) (1+ new-index)))
701              ((= index (the fixnum end)))              ((= index (the fixnum end)))
702            (declare (fixnum index new-index))            (declare (fixnum index new-index))
703            (setf (schar newstring new-index)            (multiple-value-bind (code wide) (codepoint string index)
704                  (char-downcase (schar string index))))              (when wide (incf index))
705                ;; Handle ASCII specially because this is called early in
706                ;; initialization, before unidata is available.
707                (cond ((< 64 code 91) (incf code 32))
708                      ((> code 127) (setq code (unicode-lower code))))
709                ;;@@ WARNING: this may, in theory, need to extend newstring
710                ;;  but that never actually occurs as of Unicode 5.1.0,
711                ;;  so I'm just going to ignore it for now...
712                (multiple-value-bind (hi lo) (surrogates code)
713                  (setf (schar newstring new-index) hi)
714                  (when lo
715                    (setf (schar newstring (incf new-index)) lo)))))
716            ;;@@ WARNING: see above
717          (do ((index end (1+ index))          (do ((index end (1+ index))
718               (new-index (- (the fixnum end) offset) (1+ new-index)))               (new-index (- (the fixnum end) offset) (1+ new-index)))
719              ((= index offset-slen))              ((= index offset-slen))
# Line 429  Line 721 
721            (setf (schar newstring new-index) (schar string index)))            (setf (schar newstring new-index) (schar string index)))
722          newstring))))          newstring))))
723    
724  (defun string-capitalize (string &key (start 0) end)  (defun string-downcase-full (string &key (start 0) end)
725    "Given a string, returns a copy of the string with the first    (declare (fixnum start))
726    character of each ``word'' converted to upper-case, and remaining    (let* ((string (if (stringp string) string (string string)))
727    chars in the word converted to lower case. A ``word'' is defined           (slen (length string)))
728    to be a string of case-modifiable characters delimited by      (declare (fixnum slen))
729    non-case-modifiable chars."      (with-output-to-string (s)
730          (with-one-string string start end offset
731            (let ((offset-slen (+ slen offset)))
732              (declare (fixnum offset-slen))
733              (write-string string s :start offset :end start)
734              (do ((index start (1+ index)))
735                  ((= index (the fixnum end)))
736                (declare (fixnum index))
737                (multiple-value-bind (code wide)
738                    (codepoint string index)
739                  (when wide (incf index))
740                  ;; Handle ASCII specially because this is called early in
741                  ;; initialization, before unidata is available.
742                  (cond ((< 64 code 91)
743                         (write-char (code-char (incf code 32)) s))
744                        ((> code 127)
745                         (write-string (unicode-full-case-lower code) s))
746                        (t
747                         (multiple-value-bind (hi lo)
748                             (surrogates code)
749                           (write-char hi s)
750                           (when lo
751                             (write-char lo s)))))))
752              (write-string string s :start end :end offset-slen))))))
753    
754    (defun string-downcase (string &key (start 0) end #+unicode (casing :simple))
755      "Given a string, returns a new string that is a copy of it with all
756      upper case alphabetic characters converted to lowercase.  If Casing
757      is :full, then Unicode full-casing is done"
758      (declare (fixnum start))
759      #-unicode
760      (string-downcase-simple string :start start :end end)
761      #+unicode
762      (if (eq casing :simple)
763          (string-downcase-simple string :start start :end end)
764          (string-downcase-full string :start start :end end)))
765    
766    (defun string-capitalize-simple (string &key (start 0) end)
767    (declare (fixnum start))    (declare (fixnum start))
768    (let* ((string (if (stringp string) string (string string)))    (let* ((string (if (stringp string) string (string string)))
769           (slen (length string)))           (slen (length string)))
# Line 459  Line 788 
788                   (setq newword t))                   (setq newword t))
789                  (newword                  (newword
790                   ;;char is first case-modifiable after non-case-modifiable                   ;;char is first case-modifiable after non-case-modifiable
791                   (setq char (char-upcase char))                   (setq char (char-titlecase char))
792                   (setq newword ()))                   (setq newword ()))
793                  ;;char is case-modifiable, but not first                  ;;char is case-modifiable, but not first
794                  (t (setq char (char-downcase char))))                  (t (setq char (char-downcase char))))
# Line 471  Line 800 
800            (setf (schar newstring new-index) (schar string index)))            (setf (schar newstring new-index) (schar string index)))
801          newstring))))          newstring))))
802    
803    (defun string-capitalize-full (string &key (start 0) end)
804      (declare (fixnum start))
805      (let* ((string (if (stringp string) string (string string)))
806             (slen (length string)))
807        (declare (fixnum slen))
808        (with-output-to-string (s)
809          (with-one-string string start end offset
810            (let ((offset-slen (+ slen offset)))
811              (declare (fixnum offset-slen))
812              (write-string string s :start offset :end start)
813              (flet ((alphanump (m)
814                       (or (< 47 m 58) (< 64 m 91) (< 96 m 123)
815                           #+(and unicode (not unicode-bootstrap))
816                           (and (> m 127)
817                                (<= +unicode-category-letter+ (unicode-category m)
818                                    (+ +unicode-category-letter+ #x0F))))))
819                (do ((index start (1+ index))
820                     (newword t))
821                    ((= index (the fixnum end)))
822                  (declare (fixnum index))
823                  (multiple-value-bind (code wide)
824                      (codepoint string index)
825                    (when wide (incf index))
826                    (cond ((not (alphanump code))
827                           (multiple-value-bind (hi lo)
828                               (surrogates code)
829                             (write-char hi s)
830                             (when lo (write-char lo s)))
831                           (setq newword t))
832                          (newword
833                           ;;char is first case-modifiable after non-case-modifiable
834                           (write-string (unicode-full-case-title code) s)
835                           (setq newword ()))
836                          ;;char is case-modifiable, but not first
837                          (t
838                           (write-string (unicode-full-case-lower code) s))))))
839              (write-string string s :start end :end offset-slen))))))
840    
841    (defun string-capitalize (string &key (start 0) end #+unicode (casing :simple))
842      "Given a string, returns a copy of the string with the first
843      character of each ``word'' converted to upper-case, and remaining
844      chars in the word converted to lower case. A ``word'' is defined
845      to be a string of case-modifiable characters delimited by
846      non-case-modifiable chars."
847      (declare (fixnum start))
848      #-unicode
849      (string-capitalize-simple string :start start :end end)
850      #+unicode
851      (if (eq casing :simple)
852          (string-capitalize-simple string :start start :end end)
853          (string-capitalize-full string :start start :end end)))
854    
855  (defun nstring-upcase (string &key (start 0) end)  (defun nstring-upcase (string &key (start 0) end)
856    "Given a string, returns that string with all lower case alphabetic    "Given a string, returns that string with all lower case alphabetic
857    characters converted to uppercase."    characters converted to uppercase."
# Line 480  Line 861 
861        (do ((index start (1+ index)))        (do ((index start (1+ index)))
862            ((= index (the fixnum end)))            ((= index (the fixnum end)))
863          (declare (fixnum index))          (declare (fixnum index))
864          (setf (schar string index) (char-upcase (schar string index)))))          (multiple-value-bind (code wide) (codepoint string index)
865              (declare (ignore wide))
866              ;; Handle ASCII specially because this is called early in
867              ;; initialization, before unidata is available.
868              (cond ((< 96 code 123) (decf code 32))
869                    ((> code 127) (setq code (unicode-upper code))))
870              ;;@@ WARNING: this may, in theory, need to extend string
871              ;;      (which, obviously, we can't do here.  Unless
872              ;;       STRING is adjustable, maybe)
873              ;;  but that never actually occurs as of Unicode 5.1.0,
874              ;;  so I'm just going to ignore it for now...
875              (multiple-value-bind (hi lo) (surrogates code)
876                (setf (schar string index) hi)
877                (when lo
878                  (setf (schar string (incf index)) lo))))))
879      save-header))      save-header))
880    
881  (defun nstring-downcase (string &key (start 0) end)  (defun nstring-downcase (string &key (start 0) end)
# Line 492  Line 887 
887        (do ((index start (1+ index)))        (do ((index start (1+ index)))
888            ((= index (the fixnum end)))            ((= index (the fixnum end)))
889          (declare (fixnum index))          (declare (fixnum index))
890          (setf (schar string index) (char-downcase (schar string index)))))          (multiple-value-bind (code wide) (codepoint string index)
891              (declare (ignore wide))
892              (cond ((< 64 code 91) (incf code 32))
893                    ((> code 127) (setq code (unicode-lower code))))
894              ;;@@ WARNING: this may, in theory, need to extend string
895              ;;      (which, obviously, we can't do here.  Unless
896              ;;       STRING is adjustable, maybe)
897              ;;  but that never actually occurs as of Unicode 5.1.0,
898              ;;  so I'm just going to ignore it for now...
899              (multiple-value-bind (hi lo) (surrogates code)
900                (setf (schar string index) hi)
901                (when lo
902                  (setf (schar string (incf index)) lo))))))
903      save-header))      save-header))
904    
905  (defun nstring-capitalize (string &key (start 0) end)  (defun nstring-capitalize (string &key (start 0) end)
# Line 514  Line 921 
921                 (setq newword t))                 (setq newword t))
922                (newword                (newword
923                 ;;char is first case-modifiable after non-case-modifiable                 ;;char is first case-modifiable after non-case-modifiable
924                 (setf (schar string index) (char-upcase char))                 (setf (schar string index) (char-titlecase char))
925                 (setq newword ()))                 (setq newword ()))
926                (t                (t
927                 (setf (schar string index) (char-downcase char))))))                 (setf (schar string index) (char-downcase char))))))
928      save-header))      save-header))
929    
930    
931    #+unicode
932    (progn
933    ;; Like string-left-trim, but return the index
934    (defun string-left-trim-index (char-bag string)
935      (with-string string
936        (if (stringp char-bag)
937            ;; When char-bag is a string, we try to do the right thing.
938            ;; Convert char-bag to a list of codepoints and compare the
939            ;; codepoints in the string with this.
940            (let ((code-bag (with-string char-bag
941                              (do ((index start (1+ index))
942                                   (result nil))
943                                  ((= index end)
944                                   (nreverse result))
945                                (multiple-value-bind (c widep)
946                                    (codepoint char-bag index)
947                                  (push c result)
948                                  (when widep (incf index)))))))
949              (do ((index start (1+ index)))
950                  ((= index (the fixnum end))
951                   end)
952                (declare (fixnum index))
953                (multiple-value-bind (c widep)
954                    (codepoint string index)
955                  (unless (find c code-bag)
956                    (return-from string-left-trim-index index))
957                  (when widep (incf index)))))
958            ;; When char-bag is a list, we just look at each codepoint of
959            ;; STRING to see if it's in char-bag.  If char-bag contains a
960            ;; surrogate, we could accidentally trim off a surrogate,
961            ;; leaving an invalid UTF16 string.
962            (do ((index start (1+ index)))
963                ((= index (the fixnum end))
964                 end)
965              (declare (fixnum index))
966              (multiple-value-bind (c widep)
967                  (codepoint string index)
968                (unless (find c char-bag :key #'char-code)
969                  (return-from string-left-trim-index index))
970                (when widep (incf index)))))))
971    
972    (defun string-left-trim (char-bag string)
973      "Given a set of characters (a list or string) and a string, returns
974      a copy of the string with the characters in the set removed from the
975      left end.  If the set of characters is a string, surrogates will be
976      properly handled."
977      (let ((begin (string-left-trim-index char-bag string)))
978        (with-string string
979          (subseq string begin end))))
980    
981    (defun string-right-trim-index (char-bag string)
982      (with-string string
983        (if (stringp char-bag)
984            ;; When char-bag is a string, we try to do the right thing
985            ;; with surrogates.  Convert char-bag to a list of codepoints
986            ;; and compare the codepoints in the string with this.
987            (let ((code-bag (with-string char-bag
988                              (do ((index start (1+ index))
989                                   (result nil))
990                                  ((= index end)
991                                   result)
992                                (multiple-value-bind (c widep)
993                                    (codepoint char-bag index)
994                                  (push c result)
995                                  (when widep (incf index)))))))
996              (do ((index (1- end) (1- index)))
997                  ((< index start)
998                   start)
999                (declare (fixnum index))
1000                (multiple-value-bind (c widep)
1001                    (codepoint string index)
1002                  (unless (find c code-bag)
1003                    (return-from string-right-trim-index (1+ index)))
1004                  (when widep (decf index)))))
1005            ;; When char-bag is a list, we just look at each codepoint of
1006            ;; STRING to see if it's in char-bag.  If char-bag contains a
1007            ;; surrogate, we could accidentally trim off a surrogate,
1008            ;; leaving an invalid UTF16 string.
1009            (do ((index (1- end) (1- index)))
1010                ((< index start)
1011                 start)
1012              (declare (fixnum index))
1013              (multiple-value-bind (c widep)
1014                  (codepoint string index)
1015                (unless (find c char-bag :key #'char-code)
1016                  (return-from string-right-trim-index (1+ index)))
1017                (when widep (decf index)))))))
1018    
1019    (defun string-right-trim (char-bag string)
1020      "Given a set of characters (a list or string) and a string, returns
1021      a copy of the string with the characters in the set removed from the
1022      right end.  If the set of characters is a string, surrogates will be
1023      properly handled."
1024      (let ((stop (string-right-trim-index char-bag string)))
1025        (with-string string
1026          (subseq string start stop))))
1027    
1028    (defun string-trim (char-bag string)
1029      "Given a set of characters (a list or string) and a string, returns a
1030      copy of the string with the characters in the set removed from both
1031      ends.  If the set of characters is a string, surrogates will be
1032      properly handled."
1033      (let ((left-end (string-left-trim-index char-bag string))
1034            (right-end (string-right-trim-index char-bag string)))
1035        (with-string string
1036          (subseq (the simple-string string) left-end right-end))))
1037    ) ; end unicode version
1038    
1039    #-unicode
1040    (progn
1041  (defun string-left-trim (char-bag string)  (defun string-left-trim (char-bag string)
1042    "Given a set of characters (a list or string) and a string, returns    "Given a set of characters (a list or string) and a string, returns
1043    a copy of the string with the characters in the set removed from the    a copy of the string with the characters in the set removed from the
# Line 557  Line 1075 
1075                             (1+ index))                             (1+ index))
1076                          (declare (fixnum index)))))                          (declare (fixnum index)))))
1077        (subseq (the simple-string string) left-end right-end))))        (subseq (the simple-string string) left-end right-end))))
1078    ) ; non-unicode version
1079    
1080    (declaim (inline %glyph-f %glyph-b))
1081    (defun %glyph-f (string index)
1082      (declare (optimize (speed 3) (space 0) (safety 0) (debug 0))
1083               (type simple-string string) (type kernel:index index))
1084      (let* ((prev 0)
1085             (l (length string))
1086             (c (codepoint string index l))
1087             (n (+ index (if (> c #xFFFF) 2 1))))
1088        (declare (type codepoint c) (type kernel:index l n))
1089        (loop while (< n l) do
1090          (let* ((c (codepoint string n l))
1091                 (d (the (unsigned-byte 8) (unicode-combining-class c))))
1092            (when (or (zerop d) (< d prev))
1093              (return))
1094            (setq prev d)
1095            (incf n (if (> c #xFFFF) 2 1))))
1096        n))
1097    
1098    (defun %glyph-b (string index)
1099      (declare (optimize (speed 3) (space 0) (safety 0) (debug 0))
1100               (type simple-string string) (type kernel:index index))
1101      (let* ((prev 255)
1102             (n (1- index)))
1103        (declare (type kernel:index n))
1104        (loop until (< n 0) do
1105          (let* ((c (codepoint string n 0))
1106                 (d (the (unsigned-byte 8) (unicode-combining-class c))))
1107            (cond ((zerop d) (return))
1108                  ((> d prev) (incf n (if (> c #xFFFF) 2 1)) (return)))
1109            (setq prev d)
1110            (decf n (if (> c #xFFFF) 2 1))))
1111        n))
1112    
1113    (defun glyph (string index &key (from-end nil))
1114      "GLYPH returns the glyph at the indexed position in a string, and the
1115      position of the next glyph (or NIL) as a second value.  A glyph is
1116      a substring consisting of the character at INDEX followed by all
1117      subsequent combining characters."
1118      (declare (type simple-string string) (type kernel:index index))
1119      #-unicode
1120      (char string index)
1121      #+unicode
1122      (with-array-data ((string string) (start) (end))
1123        (declare (ignore start end))
1124        (let ((n (if from-end (%glyph-b string index) (%glyph-f string index))))
1125          (if from-end
1126              (values (subseq string n index) (and (> n 0) n))
1127              (values (subseq string index n) (and (< n (length string)) n))))))
1128    
1129    (defun sglyph (string index &key (from-end nil))
1130      "SGLYPH returns the glyph at the indexed position, the same as GLYPH,
1131      except that the string must be a simple-string"
1132      (declare (type simple-string string) (type kernel:index index))
1133      #-unicode
1134      (schar string index)
1135      #+unicode
1136      (let ((n (if from-end (%glyph-b string index) (%glyph-f string index))))
1137        (if from-end
1138            (values (subseq string n index) (and (> n 0) n))
1139            (values (subseq string index n) (and (< n (length string)) n)))))
1140    
1141    #+unicode
1142    (defun string-reverse* (sequence)
1143      (declare (optimize (speed 3) (space 0) (safety 0))
1144               (type string sequence))
1145      (with-string sequence
1146        (let* ((length (- end start))
1147               (string (make-string length))
1148               (j length))
1149          (declare (type kernel:index length j))
1150          (loop for i = start then n as n = (%glyph-f sequence i) do
1151                (replace string sequence :start1 (decf j (- n i)) :start2 i :end2 n)
1152                while (< n end))
1153          string)))
1154    
1155    #+unicode
1156    (defun string-nreverse* (sequence)
1157      (declare (optimize (speed 3) (space 0) (safety 0))
1158               (type string sequence))
1159      (with-string sequence
1160        (flet ((rev (start end)
1161                 (do ((i start (1+ i))
1162                      (j (1- end) (1- j)))
1163                     ((>= i j))
1164                   (declare (type kernel:index i j))
1165                   (rotatef (schar sequence i) (schar sequence j)))))
1166          (let ((len end))
1167            (loop for i = start then n as n = (%glyph-f sequence i) do
1168              (rev i n) while (< n len))
1169            (rev start end))))
1170      sequence)
1171    
1172    
1173    
1174    
1175    (defun decompose (string &optional (compatibility t))
1176      (declare (type string string))
1177      (let ((result (make-string (cond ((< (length string) 40)
1178                                        (* 5 (length string)))
1179                                       ((< (length string) 4096)
1180                                        (* 2 (length string)))
1181                                       (t (round (length string) 5/6)))))
1182            (fillptr 0))
1183        (declare (type kernel:index fillptr))
1184        (labels ((rec (string start end)
1185                   (declare (type simple-string string))
1186                   (do ((i start (1+ i)))
1187                       ((= i end))
1188                     (declare (type kernel:index i))
1189                     (multiple-value-bind (code wide) (codepoint string i)
1190                       (when wide (incf i))
1191                       (let ((decomp (unicode-decomp code compatibility)))
1192                         (if decomp (rec decomp 0 (length decomp)) (out code))))))
1193                 (out (code)
1194                   (multiple-value-bind (hi lo) (surrogates code)
1195                     (outch hi)
1196                     (when lo
1197                       (outch lo))
1198                     (let ((cc (unicode-combining-class code)))
1199                       (unless (zerop cc)
1200                         (order lo cc (- fillptr (if lo 3 2)))))))
1201                 (outch (char)
1202                   (when (= fillptr (length result))
1203                     (let ((tmp (make-string (round (length result) 5/6))))
1204                       (replace tmp result)
1205                       (setq result tmp)))
1206                   (setf (schar result fillptr) char)
1207                   (incf fillptr))
1208                 (order (wide1 cc last)
1209                   (loop until (minusp last) do
1210                     (multiple-value-bind (code2 wide2) (codepoint result last)
1211                       (let ((cc2 (unicode-combining-class code2)))
1212                         (cond ((zerop cc2) (return))
1213                               ((> cc2 cc)
1214                                (case (+ (if wide2 2 0) (if wide1 1 0))
1215                                  (0 (rotatef (schar result last)
1216                                              (schar result (1+ last))))
1217                                  (1 (rotatef (schar result last)
1218                                              (schar result (+ last 1))
1219                                              (schar result (+ last 2))))
1220                                  (2 (rotatef (schar result last)
1221                                              (schar result (1- last))
1222                                              (schar result (1+ last))))
1223                                  (3 (rotatef (schar result last)
1224                                              (schar result (+ last 2)))
1225                                     (rotatef (schar result (1- last))
1226                                              (schar result (1+ last)))))
1227                                (decf last (if wide2 2 1)))
1228                               (t (return))))))))
1229          (with-string string
1230            (rec string start end))
1231          (shrink-vector result fillptr))))
1232    
1233    (declaim (inline normalized-codepoint-p))
1234    (defun normalized-codepoint-p (cp form)
1235      (ecase form
1236        (:nfc (unicode-nfc-qc cp))
1237        (:nfkc (unicode-nfkc-qc cp))
1238        (:nfd (unicode-nfd-qc cp))
1239        (:nfkd (unicode-nfkd-qc cp))))
1240    
1241    ;; Perform check to see if string is already normalized.  The Unicode
1242    ;; example can return YES, NO, or MAYBE.  For our purposes, only YES
1243    ;; is important, for which we return T.   For NO or MAYBE, we return NIL.
1244    (defun normalized-form-p (string &optional (form :nfc))
1245      (declare (type (member :nfc :nfkc :nfd :nfkd) form)
1246               (optimize (speed 3)))
1247      (with-string string
1248        (let ((last-class 0))
1249          (declare (type (integer 0 256) last-class))
1250          (do ((k start (1+ k)))
1251              ((>= k end))
1252            (declare (type kernel:index k))
1253            (multiple-value-bind (ch widep)
1254                (codepoint string k end)
1255              (when widep (incf k))
1256              ;; Handle ASCII specially
1257              (unless (< ch 128)
1258                (let ((class (unicode-combining-class ch)))
1259                  (declare (type (unsigned-byte 8) class))
1260                  (when (and (> last-class class) (not (zerop class)))
1261                    ;; Definitely not normalized
1262                    (return-from normalized-form-p nil))
1263                  (let ((check (normalized-codepoint-p ch form)))
1264                    (unless (eq check :y)
1265                      (return-from normalized-form-p nil)))
1266                  (setf last-class class)))))
1267          t)))
1268    
1269    
1270    ;; Compose a string in place.  The string must already be in decomposed form.
1271    (defun %compose (target)
1272      (declare (type string target)
1273               (optimize (speed 3)))
1274      (let ((len (length target))
1275            (starter-pos 0))
1276        (declare (type kernel:index starter-pos))
1277        (multiple-value-bind (starter-ch wide)
1278            (codepoint target 0 len)
1279          (let ((comp-pos (if wide 2 1))
1280                (last-class (unicode-combining-class starter-ch)))
1281            (declare (type (integer 0 256) last-class)
1282                     (type kernel:index comp-pos))
1283            (unless (zerop last-class)
1284              ;; Fix for strings starting with a combining character
1285              (setf last-class 256))
1286            ;; Loop on decomposed characters, combining where possible
1287            (do ((decomp-pos comp-pos (1+ decomp-pos)))
1288                ((>= decomp-pos len))
1289              (declare (type kernel:index decomp-pos))
1290              (multiple-value-bind (ch wide)
1291                  (codepoint target decomp-pos len)
1292                (when wide (incf decomp-pos))
1293                (let ((ch-class (unicode-combining-class ch))
1294                      (composite (get-pairwise-composition starter-ch ch)))
1295                  (declare (type (integer 0 256) ch-class))
1296                  (cond ((and composite
1297                              (or (< last-class ch-class) (zerop last-class)))
1298                         ;; Don't have to worry about surrogate pairs here
1299                         ;; because the composite is always in the BMP.
1300                         (setf (aref target starter-pos) (code-char composite))
1301                         (setf starter-ch composite))
1302                        (t
1303                         (when (zerop ch-class)
1304                           (setf starter-pos comp-pos)
1305                           (setf starter-ch ch))
1306                         (setf last-class ch-class)
1307                         (multiple-value-bind (hi lo)
1308                             (surrogates ch)
1309                           (setf (aref target comp-pos) hi)
1310                           (when lo
1311                             (incf comp-pos)
1312                             (setf (aref target comp-pos) lo))
1313                           (incf comp-pos)))))))
1314            (shrink-vector target comp-pos)))))
1315    
1316    (defun string-to-nfd (string)
1317      "Convert String to Unicode Normalization Form D (NFD) using the
1318      canonical decomposition.  The NFD string is returned"
1319      (decompose string nil))
1320    
1321    (defun string-to-nfkd (string)
1322      "Convert String to Unicode Normalization Form KD (NFKD) uisng the
1323      compatible decomposition form.  The NFKD string is returned."
1324      (decompose string t))
1325    
1326    #+unicode
1327    (defun string-to-nfc (string)
1328      "Convert String to Unicode Normalization Form C (NFC).  If the
1329      string a simple string and is already normalized, the original
1330      string is returned."
1331      (if (normalized-form-p string :nfc)
1332          (if (simple-string-p string) string (coerce string 'simple-string))
1333          (coerce (if (normalized-form-p string :nfd)
1334                      (%compose (copy-seq string))
1335                      (%compose (string-to-nfd string)))
1336                  'simple-string)))
1337    
1338    #-unicode  ;; Needed by package.lisp
1339    (defun string-to-nfc (string)
1340      (if (simple-string-p string) string (coerce string 'simple-string)))
1341    
1342    (defun string-to-nfkc (string)
1343      "Convert String to Unicode Normalization Form KC (NFKC).  If the
1344      string is a simple string and is already normalized, the original
1345      string is returned."
1346      (if (normalized-form-p string :nfkc)
1347          (if (simple-string-p string) string (coerce string 'simple-string))
1348          (coerce (if (normalized-form-p string :nfkd)
1349                      (%compose (copy-seq string))
1350                      (%compose (string-to-nfkd string)))
1351                  'simple-string)))

Legend:
Removed from v.1.12  
changed lines
  Added in v.1.12.30.31

  ViewVC Help
Powered by ViewVC 1.1.5