/[cmucl]/src/code/string.lisp
ViewVC logotype

Diff of /src/code/string.lisp

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.12 by emarsden, Fri Apr 11 15:41:59 2003 UTC revision 1.12.30.30 by rtoy, Sat Jun 6 20:53:46 2009 UTC
# Line 16  Line 16 
16  ;;; ****************************************************************  ;;; ****************************************************************
17  ;;;  ;;;
18  (in-package "LISP")  (in-package "LISP")
19  (export '(char schar string  (export '(char schar glyph sglyph string
20            string= string-equal string< string> string<= string>= string/=            string= string-equal string< string> string<= string>= string/=
21            string-lessp string-greaterp string-not-lessp string-not-greaterp            string-lessp string-greaterp string-not-lessp string-not-greaterp
22            string-not-equal            string-not-equal
23              string-to-nfd string-to-nfkd string-to-nfc string-to-nfkc
24            make-string            make-string
25            string-trim string-left-trim string-right-trim            string-trim string-left-trim string-right-trim
26            string-upcase            string-upcase
# Line 27  Line 28 
28            nstring-capitalize))            nstring-capitalize))
29    
30    
31    (declaim (inline surrogatep surrogates-to-codepoint codepoint surrogates))
32    
33    (defun surrogatep (c &optional surrogate-type)
34      "Test if C is a surrogate.  C may be either an integer or a
35      character. Surrogate-type indicates what kind of surrogate to test
36      for.  :High means to test for the high (leading) surrogate; :Low
37      tests for the low (trailing surrogate).  A value of :Any or Nil
38      tests for any surrogate value (high or low)."
39      (declare (type (or character codepoint) c))
40      (let ((code (if (characterp c)
41                      (char-code c)
42                      c)))
43        (ecase surrogate-type
44          ((:high :leading)
45           ;; Test for high surrogate
46           (<= #xD800 code #xDBFF))
47          ((:low :trailing)
48           ;; Test for low surrogate
49           (<= #xDC00 code #xDFFF))
50          ((:any nil)
51           ;; Test for any surrogate
52           (<= #xD800 code #xDFFF)))))
53    
54    (defun surrogates-to-codepoint (hi lo)
55      "Convert the given Hi and Lo surrogate characters to the
56      corresponding codepoint value"
57      (declare (type character hi lo))
58      (+ (ash (- (the (integer #xD800 #xDBFF) (char-code hi)) #xD800) 10)
59         (the (integer #xDC00 #xDFFF) (char-code lo)) #x2400))
60    
61    (defun codepoint (string i &optional (end (length string)))
62      "Return the codepoint value from String at position I.  If that
63      position is a surrogate, it is combined with either the previous or
64      following character (when possible) to compute the codepoint.  The
65      second return value is NIL if the position is not a surrogate pair.
66      Otherwise +1 or -1 is returned if the position is the high or low
67      surrogate value, respectively."
68      (declare (type simple-string string) (type kernel:index i end))
69      (let ((code (char-code (schar string i))))
70        (cond ((and (surrogatep code :high) (< (1+ i) end))
71               (let ((tmp (char-code (schar string (1+ i)))))
72                 (if (surrogatep tmp :low)
73                     (values (+ (ash (- code #xD800) 10) tmp #x2400) +1)
74                     (values code nil))))
75              ((and (surrogatep code :low) (> i 0))
76               (let ((tmp (char-code (schar string (1- i)))))
77                 (if (surrogatep tmp :high)
78                     (values (+ (ash (- tmp #xD800) 10) code #x2400) -1)
79                     (values code nil))))
80              (t (values code nil)))))
81    
82    (defun surrogates (codepoint)
83      "Return the high and low surrogate characters for Codepoint.  If
84      Codepoint is in the BMP, the first return value is the corresponding
85      character and the second is NIL."
86      (declare (type codepoint codepoint))
87      (if (< codepoint #x10000)
88          (values (code-char codepoint) nil)
89          (let* ((tmp (- codepoint #x10000))
90                 (hi (logior (ldb (byte 10 10) tmp) #xD800))
91                 (lo (logior (ldb (byte 10 0) tmp) #xDC00)))
92            (values (code-char hi) (code-char lo)))))
93    
94    (defun (setf codepoint) (codepoint string i)
95      "Set the codepoint at string position I to the Codepoint.  If the
96      codepoint requires a surrogate pair, the high (leading surrogate) is
97      stored at position I and the low (trailing) surrogate is stored at
98      I+1"
99      (declare (type codepoint codepoint)
100               (type simple-string string))
101      (let ((widep nil))
102        (multiple-value-bind (hi lo)
103            (surrogates codepoint)
104          (setf (aref string i) hi)
105          (when lo
106            (setf (aref string (1+ i)) lo)
107            (setf widep t)))
108        (values codepoint widep)))
109    
110    (defun utf16-string-p (string)
111      "Check if String is a valid UTF-16 string.  If the string is valid,
112      T is returned.  If the string is not valid, NIL is returned, and the
113      second value is the index into the string of the invalid character.
114      A string is also invalid if it contains any unassigned codepoints."
115      (do ((len (length string))
116           (index 0 (1+ index)))
117          ((>= index len)
118           t)
119        (multiple-value-bind (codepoint wide)
120            (codepoint string index)
121          ;; We step through the string in order.  If there are any
122          ;; surrogates pairs, we must reach the lead surrogate first,
123          ;; which means WIDE is +1.  Otherwise, we have an invalid
124          ;; surrogate pair.  If we get any codepoint that is in the
125          ;; surrogate range, we also have an invalid string.  An
126          ;; unassigned codepoint is also considered invalid.
127          (when (or (eq wide -1)
128                    (surrogatep codepoint)
129                    (lisp::unicode-assigned-codepoint-p codepoint))
130            (return-from utf16-string-p (values nil index)))
131          (when wide (incf index)))))
132    
133  (defun string (X)  (defun string (X)
134    "Coerces X into a string.  If X is a string, X is returned.  If X is a    "Coerces X into a string.  If X is a string, X is returned.  If X is a
135     symbol, X's pname is returned.  If X is a character then a one element    symbol, X's pname is returned.  If X is a character then a one element
136     string containing that character is returned.  If X cannot be coerced    string containing that character is returned.  If X cannot be coerced
137     into a string, an error occurs."    into a string, an error occurs."
138    (cond ((stringp x) x)    (cond ((stringp x) x)
139          ((symbolp x) (symbol-name x))          ((symbolp x) (symbol-name x))
140          ((characterp x)          ((characterp x)
# Line 106  Line 209 
209    
210  (defun schar (string index)  (defun schar (string index)
211    "SCHAR returns the character object at an indexed position in a string    "SCHAR returns the character object at an indexed position in a string
212     just as CHAR does, except the string must be a simple-string."    just as CHAR does, except the string must be a simple-string."
213    (declare (optimize (safety 1)))    (declare (optimize (safety 1)))
214    (schar string index))    (schar string index))
215    
# Line 143  Line 246 
246                         (the fixnum end2))                         (the fixnum end2))
247                      ,(if lessp                      ,(if lessp
248                           `nil                           `nil
249                         `(- (the fixnum index) ,offset1)))                           `(- (the fixnum index) ,offset1)))
250                       #-unicode
251                     ((,(if lessp 'char< 'char>)                     ((,(if lessp 'char< 'char>)
252                       (schar string1 index)                       (schar string1 index)
253                       (schar string2 (+ (the fixnum index) (- start2 start1))))                       (schar string2 (+ (the fixnum index) (- start2 start1))))
254                      (- (the fixnum index) ,offset1))                      (- (the fixnum index) ,offset1))
255                     (t nil))                     #-unicode
256                       (t nil)
257                       #+unicode
258                       (t
259                        ;; Compare in code point order.  See
260                        ;; http://icu-project.org/docs/papers/utf16_code_point_order.html
261                        (flet ((fixup (code)
262                                 (if (>= code #xe000)
263                                     (- code #x800)
264                                     (+ code #x2000))))
265                          (declare (inline fixup))
266                          (let* ((c1 (char-code (schar string1 index)))
267                                 (c2 (char-code (schar string2
268                                                       (+ (the fixnum index)
269                                                          (- start2 start1))))))
270                            (cond ((and (>= c1 #xd800)
271                                        (>= c2 #xd800))
272                                   (let ((fix-c1 (fixup c1))
273                                         (fix-c2 (fixup c2)))
274                                     (if (,(if lessp '< '>) fix-c1 fix-c2)
275                                         (- (the fixnum index) ,offset1)
276                                         nil)))
277                                  (t
278                                   (if (,(if lessp '< '>) c1 c2)
279                                       (- (the fixnum index) ,offset1)
280                                       nil)))))))
281               ,(if equalp `(- (the fixnum end1) ,offset1) 'nil))))))               ,(if equalp `(- (the fixnum end1) ,offset1) 'nil))))))
282  ) ; eval-when  ) ; eval-when
283    
# Line 281  Line 410 
410    (if lessp    (if lessp
411        (if equalp        (if equalp
412            ;; STRING-NOT-GREATERP            ;; STRING-NOT-GREATERP
413            (values '<= `(not (char-greaterp char1 char2)))            (values '<=
414                      #-unicode `(not (char-greaterp char1 char2))
415                      #+unicode `(<= char1 char2))
416            ;; STRING-LESSP            ;; STRING-LESSP
417            (values '< `(char-lessp char1 char2)))            (values '<
418                      #-unicode `(char-lessp char1 char2)
419                      #+unicode `(< char1 char2)))
420        (if equalp        (if equalp
421            ;; STRING-NOT-LESSP            ;; STRING-NOT-LESSP
422            (values '>= `(not (char-lessp char1 char2)))            (values '>=
423                      #-unicode `(not (char-lessp char1 char2))
424                      #+unicode `(>= char1 char2))
425            ;; STRING-GREATERP            ;; STRING-GREATERP
426            (values '> `(char-greaterp char1 char2)))))            (values '>
427                      #-unicode `(char-greaterp char1 char2)
428                      #+unicode `(> char1 char2)))))
429    
430    #-unicode
431  (defmacro string-less-greater-equal (lessp equalp)  (defmacro string-less-greater-equal (lessp equalp)
432    (multiple-value-bind (length-test character-test)    (multiple-value-bind (length-test character-test)
433                         (string-less-greater-equal-tests lessp equalp)                         (string-less-greater-equal-tests lessp equalp)
# Line 314  Line 452 
452                     (return (- index1 offset1))                     (return (- index1 offset1))
453                     (return ()))))))))                     (return ()))))))))
454    
455    ;; Convert to lowercase for case folding, to match what Unicode
456    ;; CaseFolding.txt says.  An example where this matters: U+1E9E maps
457    ;; to U+00DF.  But the uppercase version of U+00DF is U+00DF.
458    #+unicode
459    (defmacro equal-char-codepoint (codepoint)
460      `(let ((ch ,codepoint))
461         ;; Handle ASCII separately for bootstrapping and for unidata missing.
462         (if (< 64 ch 91)
463             (+ ch 32)
464             #-(and unicode (not unicode-bootstrap))
465             ch
466             #+(and unicode (not unicode-bootstrap))
467             (if (> ch 127) (unicode-lower ch) ch))))
468    
469    #+unicode
470    (defmacro string-less-greater-equal (lessp equalp)
471      (multiple-value-bind (length-test character-test)
472          (string-less-greater-equal-tests lessp equalp)
473        `(with-two-strings string1 string2 start1 end1 offset1 start2 end2
474           (let ((slen1 (- (the fixnum end1) start1))
475                 (slen2 (- (the fixnum end2) start2)))
476             (declare (fixnum slen1 slen2))
477             (if (or (minusp slen1) (minusp slen2))
478                 ;;prevent endless looping later.
479                 (error "Improper bounds for string comparison."))
480             (do ((index1 start1 (1+ index1))
481                  (index2 start2 (1+ index2)))
482                 ((or (= index1 (the fixnum end1)) (= index2 (the fixnum end2)))
483                  (if (,length-test slen1 slen2) (- index1 offset1)))
484               (declare (fixnum index1 index2))
485               (multiple-value-bind (char1 wide1)
486                   (codepoint string1 index1)
487                 (declare (type codepoint char1))
488                 (multiple-value-bind (char2 wide2)
489                     (codepoint string2 index2)
490                   (declare (type codepoint char2))
491                   (setf char1 (equal-char-codepoint char1))
492                   (setf char2 (equal-char-codepoint char2))
493                   (if (= char1 char2)
494                       (progn
495                         (when wide1 (incf index1))
496                         (when wide2 (incf index2)))
497                       (if ,character-test
498                           (return (- index1 offset1))
499                           (return ()))))))))))
500    
501  ) ; eval-when  ) ; eval-when
502    
503  (defun string-lessp* (string1 string2 start1 end1 start2 end2)  (defun string-lessp* (string1 string2 start1 end1 start2 end2)
# Line 360  Line 544 
544    
545  (defun make-string (count &key element-type ((:initial-element fill-char)))  (defun make-string (count &key element-type ((:initial-element fill-char)))
546    "Given a character count and an optional fill character, makes and returns    "Given a character count and an optional fill character, makes and returns
547     a new string Count long filled with the fill character."    a new string Count long filled with the fill character."
548    (declare (type fixnum count))    (declare (type fixnum count))
549    (assert (subtypep element-type 'character))    (assert (subtypep element-type 'character))
550    (if fill-char    (if fill-char
# Line 371  Line 555 
555          (setf (schar string i) fill-char))          (setf (schar string i) fill-char))
556        (make-string count)))        (make-string count)))
557    
558  (defun string-upcase (string &key (start 0) end)  (defun string-upcase-simple (string &key (start 0) end)
   "Given a string, returns a new string that is a copy of it with  
   all lower case alphabetic characters converted to uppercase."  
559    (declare (fixnum start))    (declare (fixnum start))
560    (let* ((string (if (stringp string) string (string string)))    (let* ((string (if (stringp string) string (string string)))
561           (slen (length string)))           (slen (length string)))
# Line 391  Line 573 
573               (new-index (- start offset) (1+ new-index)))               (new-index (- start offset) (1+ new-index)))
574              ((= index (the fixnum end)))              ((= index (the fixnum end)))
575            (declare (fixnum index new-index))            (declare (fixnum index new-index))
576            (setf (schar newstring new-index)            (multiple-value-bind (code wide) (codepoint string index)
577                  (char-upcase (schar string index))))              (when wide (incf index))
578                ;; Handle ASCII specially because this is called early in
579                ;; initialization, before unidata is available.
580                (cond ((< 96 code 123) (decf code 32))
581                      ((> code 127) (setq code (unicode-upper code))))
582                ;;@@ WARNING: this may, in theory, need to extend newstring
583                ;;  but that never actually occurs as of Unicode 5.1.0,
584                ;;  so I'm just going to ignore it for now...
585                (multiple-value-bind (hi lo) (surrogates code)
586                  (setf (schar newstring new-index) hi)
587                  (when lo
588                    (setf (schar newstring (incf new-index)) lo)))))
589            ;;@@ WARNING: see above
590          (do ((index end (1+ index))          (do ((index end (1+ index))
591               (new-index (- (the fixnum end) offset) (1+ new-index)))               (new-index (- (the fixnum end) offset) (1+ new-index)))
592              ((= index offset-slen))              ((= index offset-slen))
# Line 400  Line 594 
594            (setf (schar newstring new-index) (schar string index)))            (setf (schar newstring new-index) (schar string index)))
595          newstring))))          newstring))))
596    
597  (defun string-downcase (string &key (start 0) end)  (defun string-upcase-full (string &key (start 0) end)
598    "Given a string, returns a new string that is a copy of it with    (declare (fixnum start))
599    all upper case alphabetic characters converted to lowercase."    (let* ((string (if (stringp string) string (string string)))
600             (slen (length string)))
601        (declare (fixnum slen))
602        (with-output-to-string (s)
603          (with-one-string string start end offset
604            (let ((offset-slen (+ slen offset)))
605              (declare (fixnum offset-slen))
606              (write-string string s :start offset :end start)
607              (do ((index start (1+ index)))
608                  ((= index (the fixnum end)))
609                (declare (fixnum index))
610                (multiple-value-bind (code wide)
611                    (codepoint string index)
612                  (when wide (incf index))
613                  ;; Handle ASCII specially because this is called early in
614                  ;; initialization, before unidata is available.
615                  (cond ((< 96 code 123)
616                         (write-char (code-char (decf code 32)) s))
617                        ((> code 127)
618                         (write-string (unicode-full-case-upper code) s))
619                        (t
620                         (multiple-value-bind (hi lo)
621                             (surrogates code)
622                           (write-char hi s)
623                           (when lo
624                             (write-char lo s)))))))
625              (write-string string s :start end :end offset-slen))))))
626    
627    (defun string-upcase (string &key (start 0) end #+unicode (casing :simple))
628      "Given a string, returns a new string that is a copy of it with all
629      lower case alphabetic characters converted to uppercase.  If Casing
630      is :full, then Unicode full-casing operation is done."
631      (declare (fixnum start))
632      #-unicode
633      (string-upcase-simple string :start start :end end)
634      #+unicode
635      (if (eq casing :simple)
636          (string-upcase-simple string :start start :end end)
637          (string-upcase-full string :start start :end end)))
638    
639    (defun string-downcase-simple (string &key (start 0) end)
640    (declare (fixnum start))    (declare (fixnum start))
641    (let* ((string (if (stringp string) string (string string)))    (let* ((string (if (stringp string) string (string string)))
642           (slen (length string)))           (slen (length string)))
# Line 420  Line 654 
654               (new-index (- start offset) (1+ new-index)))               (new-index (- start offset) (1+ new-index)))
655              ((= index (the fixnum end)))              ((= index (the fixnum end)))
656            (declare (fixnum index new-index))            (declare (fixnum index new-index))
657            (setf (schar newstring new-index)            (multiple-value-bind (code wide) (codepoint string index)
658                  (char-downcase (schar string index))))              (when wide (incf index))
659                ;; Handle ASCII specially because this is called early in
660                ;; initialization, before unidata is available.
661                (cond ((< 64 code 91) (incf code 32))
662                      ((> code 127) (setq code (unicode-lower code))))
663                ;;@@ WARNING: this may, in theory, need to extend newstring
664                ;;  but that never actually occurs as of Unicode 5.1.0,
665                ;;  so I'm just going to ignore it for now...
666                (multiple-value-bind (hi lo) (surrogates code)
667                  (setf (schar newstring new-index) hi)
668                  (when lo
669                    (setf (schar newstring (incf new-index)) lo)))))
670            ;;@@ WARNING: see above
671          (do ((index end (1+ index))          (do ((index end (1+ index))
672               (new-index (- (the fixnum end) offset) (1+ new-index)))               (new-index (- (the fixnum end) offset) (1+ new-index)))
673              ((= index offset-slen))              ((= index offset-slen))
# Line 429  Line 675 
675            (setf (schar newstring new-index) (schar string index)))            (setf (schar newstring new-index) (schar string index)))
676          newstring))))          newstring))))
677    
678  (defun string-capitalize (string &key (start 0) end)  (defun string-downcase-full (string &key (start 0) end)
679    "Given a string, returns a copy of the string with the first    (declare (fixnum start))
680    character of each ``word'' converted to upper-case, and remaining    (let* ((string (if (stringp string) string (string string)))
681    chars in the word converted to lower case. A ``word'' is defined           (slen (length string)))
682    to be a string of case-modifiable characters delimited by      (declare (fixnum slen))
683    non-case-modifiable chars."      (with-output-to-string (s)
684          (with-one-string string start end offset
685            (let ((offset-slen (+ slen offset)))
686              (declare (fixnum offset-slen))
687              (write-string string s :start offset :end start)
688              (do ((index start (1+ index)))
689                  ((= index (the fixnum end)))
690                (declare (fixnum index))
691                (multiple-value-bind (code wide)
692                    (codepoint string index)
693                  (when wide (incf index))
694                  ;; Handle ASCII specially because this is called early in
695                  ;; initialization, before unidata is available.
696                  (cond ((< 64 code 91)
697                         (write-char (code-char (incf code 32)) s))
698                        ((> code 127)
699                         (write-string (unicode-full-case-lower code) s))
700                        (t
701                         (multiple-value-bind (hi lo)
702                             (surrogates code)
703                           (write-char hi s)
704                           (when lo
705                             (write-char lo s)))))))
706              (write-string string s :start end :end offset-slen))))))
707    
708    (defun string-downcase (string &key (start 0) end #+unicode (casing :simple))
709      "Given a string, returns a new string that is a copy of it with all
710      upper case alphabetic characters converted to lowercase.  If Casing
711      is :full, then Unicode full-casing is done"
712      (declare (fixnum start))
713      #-unicode
714      (string-downcase-simple string :start start :end end)
715      #+unicode
716      (if (eq casing :simple)
717          (string-downcase-simple string :start start :end end)
718          (string-downcase-full string :start start :end end)))
719    
720    (defun string-capitalize-simple (string &key (start 0) end)
721    (declare (fixnum start))    (declare (fixnum start))
722    (let* ((string (if (stringp string) string (string string)))    (let* ((string (if (stringp string) string (string string)))
723           (slen (length string)))           (slen (length string)))
# Line 459  Line 742 
742                   (setq newword t))                   (setq newword t))
743                  (newword                  (newword
744                   ;;char is first case-modifiable after non-case-modifiable                   ;;char is first case-modifiable after non-case-modifiable
745                   (setq char (char-upcase char))                   (setq char (char-titlecase char))
746                   (setq newword ()))                   (setq newword ()))
747                  ;;char is case-modifiable, but not first                  ;;char is case-modifiable, but not first
748                  (t (setq char (char-downcase char))))                  (t (setq char (char-downcase char))))
# Line 471  Line 754 
754            (setf (schar newstring new-index) (schar string index)))            (setf (schar newstring new-index) (schar string index)))
755          newstring))))          newstring))))
756    
757    (defun string-capitalize-full (string &key (start 0) end)
758      (declare (fixnum start))
759      (let* ((string (if (stringp string) string (string string)))
760             (slen (length string)))
761        (declare (fixnum slen))
762        (with-output-to-string (s)
763          (with-one-string string start end offset
764            (let ((offset-slen (+ slen offset)))
765              (declare (fixnum offset-slen))
766              (write-string string s :start offset :end start)
767              (flet ((alphanump (m)
768                       (or (< 47 m 58) (< 64 m 91) (< 96 m 123)
769                           #+(and unicode (not unicode-bootstrap))
770                           (and (> m 127)
771                                (<= +unicode-category-letter+ (unicode-category m)
772                                    (+ +unicode-category-letter+ #x0F))))))
773                (do ((index start (1+ index))
774                     (newword t))
775                    ((= index (the fixnum end)))
776                  (declare (fixnum index))
777                  (multiple-value-bind (code wide)
778                      (codepoint string index)
779                    (when wide (incf index))
780                    (cond ((not (alphanump code))
781                           (multiple-value-bind (hi lo)
782                               (surrogates code)
783                             (write-char hi s)
784                             (when lo (write-char lo s)))
785                           (setq newword t))
786                          (newword
787                           ;;char is first case-modifiable after non-case-modifiable
788                           (write-string (unicode-full-case-title code) s)
789                           (setq newword ()))
790                          ;;char is case-modifiable, but not first
791                          (t
792                           (write-string (unicode-full-case-lower code) s))))))
793              (write-string string s :start end :end offset-slen))))))
794    
795    (defun string-capitalize (string &key (start 0) end #+unicode (casing :simple))
796      "Given a string, returns a copy of the string with the first
797      character of each ``word'' converted to upper-case, and remaining
798      chars in the word converted to lower case. A ``word'' is defined
799      to be a string of case-modifiable characters delimited by
800      non-case-modifiable chars."
801      (declare (fixnum start))
802      #-unicode
803      (string-capitalize-simple string :start start :end end)
804      #+unicode
805      (if (eq casing :simple)
806          (string-capitalize-simple string :start start :end end)
807          (string-capitalize-full string :start start :end end)))
808    
809  (defun nstring-upcase (string &key (start 0) end)  (defun nstring-upcase (string &key (start 0) end)
810    "Given a string, returns that string with all lower case alphabetic    "Given a string, returns that string with all lower case alphabetic
811    characters converted to uppercase."    characters converted to uppercase."
# Line 480  Line 815 
815        (do ((index start (1+ index)))        (do ((index start (1+ index)))
816            ((= index (the fixnum end)))            ((= index (the fixnum end)))
817          (declare (fixnum index))          (declare (fixnum index))
818          (setf (schar string index) (char-upcase (schar string index)))))          (multiple-value-bind (code wide) (codepoint string index)
819              (declare (ignore wide))
820              ;; Handle ASCII specially because this is called early in
821              ;; initialization, before unidata is available.
822              (cond ((< 96 code 123) (decf code 32))
823                    ((> code 127) (setq code (unicode-upper code))))
824              ;;@@ WARNING: this may, in theory, need to extend string
825              ;;      (which, obviously, we can't do here.  Unless
826              ;;       STRING is adjustable, maybe)
827              ;;  but that never actually occurs as of Unicode 5.1.0,
828              ;;  so I'm just going to ignore it for now...
829              (multiple-value-bind (hi lo) (surrogates code)
830                (setf (schar string index) hi)
831                (when lo
832                  (setf (schar string (incf index)) lo))))))
833      save-header))      save-header))
834    
835  (defun nstring-downcase (string &key (start 0) end)  (defun nstring-downcase (string &key (start 0) end)
# Line 492  Line 841 
841        (do ((index start (1+ index)))        (do ((index start (1+ index)))
842            ((= index (the fixnum end)))            ((= index (the fixnum end)))
843          (declare (fixnum index))          (declare (fixnum index))
844          (setf (schar string index) (char-downcase (schar string index)))))          (multiple-value-bind (code wide) (codepoint string index)
845              (declare (ignore wide))
846              (cond ((< 64 code 91) (incf code 32))
847                    ((> code 127) (setq code (unicode-lower code))))
848              ;;@@ WARNING: this may, in theory, need to extend string
849              ;;      (which, obviously, we can't do here.  Unless
850              ;;       STRING is adjustable, maybe)
851              ;;  but that never actually occurs as of Unicode 5.1.0,
852              ;;  so I'm just going to ignore it for now...
853              (multiple-value-bind (hi lo) (surrogates code)
854                (setf (schar string index) hi)
855                (when lo
856                  (setf (schar string (incf index)) lo))))))
857      save-header))      save-header))
858    
859  (defun nstring-capitalize (string &key (start 0) end)  (defun nstring-capitalize (string &key (start 0) end)
# Line 514  Line 875 
875                 (setq newword t))                 (setq newword t))
876                (newword                (newword
877                 ;;char is first case-modifiable after non-case-modifiable                 ;;char is first case-modifiable after non-case-modifiable
878                 (setf (schar string index) (char-upcase char))                 (setf (schar string index) (char-titlecase char))
879                 (setq newword ()))                 (setq newword ()))
880                (t                (t
881                 (setf (schar string index) (char-downcase char))))))                 (setf (schar string index) (char-downcase char))))))
882      save-header))      save-header))
883    
884    
885    #+unicode
886    (progn
887    ;; Like string-left-trim, but return the index
888    (defun string-left-trim-index (char-bag string)
889      (with-string string
890        (if (stringp char-bag)
891            ;; When char-bag is a string, we try to do the right thing.
892            ;; Convert char-bag to a list of codepoints and compare the
893            ;; codepoints in the string with this.
894            (let ((code-bag (with-string char-bag
895                              (do ((index start (1+ index))
896                                   (result nil))
897                                  ((= index end)
898                                   (nreverse result))
899                                (multiple-value-bind (c widep)
900                                    (codepoint char-bag index)
901                                  (push c result)
902                                  (when widep (incf index)))))))
903              (do ((index start (1+ index)))
904                  ((= index (the fixnum end))
905                   end)
906                (declare (fixnum index))
907                (multiple-value-bind (c widep)
908                    (codepoint string index)
909                  (unless (find c code-bag)
910                    (return-from string-left-trim-index index))
911                  (when widep (incf index)))))
912            ;; When char-bag is a list, we just look at each codepoint of
913            ;; STRING to see if it's in char-bag.  If char-bag contains a
914            ;; surrogate, we could accidentally trim off a surrogate,
915            ;; leaving an invalid UTF16 string.
916            (do ((index start (1+ index)))
917                ((= index (the fixnum end))
918                 end)
919              (declare (fixnum index))
920              (multiple-value-bind (c widep)
921                  (codepoint string index)
922                (unless (find c char-bag :key #'char-code)
923                  (return-from string-left-trim-index index))
924                (when widep (incf index)))))))
925    
926    (defun string-left-trim (char-bag string)
927      "Given a set of characters (a list or string) and a string, returns
928      a copy of the string with the characters in the set removed from the
929      left end.  If the set of characters is a string, surrogates will be
930      properly handled."
931      (let ((begin (string-left-trim-index char-bag string)))
932        (with-string string
933          (subseq string begin end))))
934    
935    (defun string-right-trim-index (char-bag string)
936      (with-string string
937        (if (stringp char-bag)
938            ;; When char-bag is a string, we try to do the right thing
939            ;; with surrogates.  Convert char-bag to a list of codepoints
940            ;; and compare the codepoints in the string with this.
941            (let ((code-bag (with-string char-bag
942                              (do ((index start (1+ index))
943                                   (result nil))
944                                  ((= index end)
945                                   result)
946                                (multiple-value-bind (c widep)
947                                    (codepoint char-bag index)
948                                  (push c result)
949                                  (when widep (incf index)))))))
950              (do ((index (1- end) (1- index)))
951                  ((< index start)
952                   start)
953                (declare (fixnum index))
954                (multiple-value-bind (c widep)
955                    (codepoint string index)
956                  (unless (find c code-bag)
957                    (return-from string-right-trim-index (1+ index)))
958                  (when widep (decf index)))))
959            ;; When char-bag is a list, we just look at each codepoint of
960            ;; STRING to see if it's in char-bag.  If char-bag contains a
961            ;; surrogate, we could accidentally trim off a surrogate,
962            ;; leaving an invalid UTF16 string.
963            (do ((index (1- end) (1- index)))
964                ((< index start)
965                 start)
966              (declare (fixnum index))
967              (multiple-value-bind (c widep)
968                  (codepoint string index)
969                (unless (find c char-bag :key #'char-code)
970                  (return-from string-right-trim-index (1+ index)))
971                (when widep (decf index)))))))
972    
973    (defun string-right-trim (char-bag string)
974      "Given a set of characters (a list or string) and a string, returns
975      a copy of the string with the characters in the set removed from the
976      right end.  If the set of characters is a string, surrogates will be
977      properly handled."
978      (let ((stop (string-right-trim-index char-bag string)))
979        (with-string string
980          (subseq string start stop))))
981    
982    (defun string-trim (char-bag string)
983      "Given a set of characters (a list or string) and a string, returns a
984      copy of the string with the characters in the set removed from both
985      ends.  If the set of characters is a string, surrogates will be
986      properly handled."
987      (let ((left-end (string-left-trim-index char-bag string))
988            (right-end (string-right-trim-index char-bag string)))
989        (with-string string
990          (subseq (the simple-string string) left-end right-end))))
991    ) ; end unicode version
992    
993    #-unicode
994    (progn
995  (defun string-left-trim (char-bag string)  (defun string-left-trim (char-bag string)
996    "Given a set of characters (a list or string) and a string, returns    "Given a set of characters (a list or string) and a string, returns
997    a copy of the string with the characters in the set removed from the    a copy of the string with the characters in the set removed from the
# Line 557  Line 1029 
1029                             (1+ index))                             (1+ index))
1030                          (declare (fixnum index)))))                          (declare (fixnum index)))))
1031        (subseq (the simple-string string) left-end right-end))))        (subseq (the simple-string string) left-end right-end))))
1032    ) ; non-unicode version
1033    
1034    (declaim (inline %glyph-f %glyph-b))
1035    (defun %glyph-f (string index)
1036      (declare (optimize (speed 3) (space 0) (safety 0) (debug 0))
1037               (type simple-string string) (type kernel:index index))
1038      (let* ((prev 0)
1039             (l (length string))
1040             (c (codepoint string index l))
1041             (n (+ index (if (> c #xFFFF) 2 1))))
1042        (declare (type codepoint c) (type kernel:index l n))
1043        (loop while (< n l) do
1044          (let* ((c (codepoint string n l))
1045                 (d (the (unsigned-byte 8) (unicode-combining-class c))))
1046            (when (or (zerop d) (< d prev))
1047              (return))
1048            (setq prev d)
1049            (incf n (if (> c #xFFFF) 2 1))))
1050        n))
1051    
1052    (defun %glyph-b (string index)
1053      (declare (optimize (speed 3) (space 0) (safety 0) (debug 0))
1054               (type simple-string string) (type kernel:index index))
1055      (let* ((prev 255)
1056             (n (1- index)))
1057        (declare (type kernel:index n))
1058        (loop until (< n 0) do
1059          (let* ((c (codepoint string n 0))
1060                 (d (the (unsigned-byte 8) (unicode-combining-class c))))
1061            (cond ((zerop d) (return))
1062                  ((> d prev) (incf n (if (> c #xFFFF) 2 1)) (return)))
1063            (setq prev d)
1064            (decf n (if (> c #xFFFF) 2 1))))
1065        n))
1066    
1067    (defun glyph (string index &key (from-end nil))
1068      "GLYPH returns the glyph at the indexed position in a string, and the
1069      position of the next glyph (or NIL) as a second value.  A glyph is
1070      a substring consisting of the character at INDEX followed by all
1071      subsequent combining characters."
1072      (declare (type simple-string string) (type kernel:index index))
1073      #-unicode
1074      (char string index)
1075      #+unicode
1076      (with-array-data ((string string) (start) (end))
1077        (declare (ignore start end))
1078        (let ((n (if from-end (%glyph-b string index) (%glyph-f string index))))
1079          (if from-end
1080              (values (subseq string n index) (and (> n 0) n))
1081              (values (subseq string index n) (and (< n (length string)) n))))))
1082    
1083    (defun sglyph (string index &key (from-end nil))
1084      "SGLYPH returns the glyph at the indexed position, the same as GLYPH,
1085      except that the string must be a simple-string"
1086      (declare (type simple-string string) (type kernel:index index))
1087      #-unicode
1088      (schar string index)
1089      #+unicode
1090      (let ((n (if from-end (%glyph-b string index) (%glyph-f string index))))
1091        (if from-end
1092            (values (subseq string n index) (and (> n 0) n))
1093            (values (subseq string index n) (and (< n (length string)) n)))))
1094    
1095    #+unicode
1096    (defun string-reverse* (sequence)
1097      (declare (optimize (speed 3) (space 0) (safety 0))
1098               (type string sequence))
1099      (with-string sequence
1100        (let* ((length (- end start))
1101               (string (make-string length))
1102               (j length))
1103          (declare (type kernel:index length j))
1104          (loop for i = start then n as n = (%glyph-f sequence i) do
1105                (replace string sequence :start1 (decf j (- n i)) :start2 i :end2 n)
1106                while (< n end))
1107          string)))
1108    
1109    #+unicode
1110    (defun string-nreverse* (sequence)
1111      (declare (optimize (speed 3) (space 0) (safety 0))
1112               (type string sequence))
1113      (with-string sequence
1114        (flet ((rev (start end)
1115                 (do ((i start (1+ i))
1116                      (j (1- end) (1- j)))
1117                     ((>= i j))
1118                   (declare (type kernel:index i j))
1119                   (rotatef (schar sequence i) (schar sequence j)))))
1120          (let ((len end))
1121            (loop for i = start then n as n = (%glyph-f sequence i) do
1122              (rev i n) while (< n len))
1123            (rev start end))))
1124      sequence)
1125    
1126    
1127    
1128    
1129    (defun decompose (string &optional (compatibility t))
1130      (declare (type string string))
1131      (let ((result (make-string (cond ((< (length string) 40)
1132                                        (* 5 (length string)))
1133                                       ((< (length string) 4096)
1134                                        (* 2 (length string)))
1135                                       (t (round (length string) 5/6)))))
1136            (fillptr 0))
1137        (declare (type kernel:index fillptr))
1138        (labels ((rec (string start end)
1139                   (declare (type simple-string string))
1140                   (do ((i start (1+ i)))
1141                       ((= i end))
1142                     (declare (type kernel:index i))
1143                     (multiple-value-bind (code wide) (codepoint string i)
1144                       (when wide (incf i))
1145                       (let ((decomp (unicode-decomp code compatibility)))
1146                         (if decomp (rec decomp 0 (length decomp)) (out code))))))
1147                 (out (code)
1148                   (multiple-value-bind (hi lo) (surrogates code)
1149                     (outch hi)
1150                     (when lo
1151                       (outch lo))
1152                     (let ((cc (unicode-combining-class code)))
1153                       (unless (zerop cc)
1154                         (order lo cc (- fillptr (if lo 3 2)))))))
1155                 (outch (char)
1156                   (when (= fillptr (length result))
1157                     (let ((tmp (make-string (round (length result) 5/6))))
1158                       (replace tmp result)
1159                       (setq result tmp)))
1160                   (setf (schar result fillptr) char)
1161                   (incf fillptr))
1162                 (order (wide1 cc last)
1163                   (loop until (minusp last) do
1164                     (multiple-value-bind (code2 wide2) (codepoint result last)
1165                       (let ((cc2 (unicode-combining-class code2)))
1166                         (cond ((zerop cc2) (return))
1167                               ((> cc2 cc)
1168                                (case (+ (if wide2 2 0) (if wide1 1 0))
1169                                  (0 (rotatef (schar result last)
1170                                              (schar result (1+ last))))
1171                                  (1 (rotatef (schar result last)
1172                                              (schar result (+ last 1))
1173                                              (schar result (+ last 2))))
1174                                  (2 (rotatef (schar result last)
1175                                              (schar result (1- last))
1176                                              (schar result (1+ last))))
1177                                  (3 (rotatef (schar result last)
1178                                              (schar result (+ last 2)))
1179                                     (rotatef (schar result (1- last))
1180                                              (schar result (1+ last)))))
1181                                (decf last (if wide2 2 1)))
1182                               (t (return))))))))
1183          (with-string string
1184            (rec string start end))
1185          (shrink-vector result fillptr))))
1186    
1187    (declaim (inline normalized-codepoint-p))
1188    (defun normalized-codepoint-p (cp form)
1189      (ecase form
1190        (:nfc (unicode-nfc-qc cp))
1191        (:nfkc (unicode-nfkc-qc cp))
1192        (:nfd (unicode-nfd-qc cp))
1193        (:nfkd (unicode-nfkd-qc cp))))
1194    
1195    ;; Perform check to see if string is already normalized.  The Unicode
1196    ;; example can return YES, NO, or MAYBE.  For our purposes, only YES
1197    ;; is important, for which we return T.   For NO or MAYBE, we return NIL.
1198    (defun normalized-form-p (string &optional (form :nfc))
1199      (declare (type (member :nfc :nfkc :nfd :nfkd) form)
1200               (optimize (speed 3)))
1201      (with-string string
1202        (let ((last-class 0))
1203          (declare (type (integer 0 256) last-class))
1204          (do ((k start (1+ k)))
1205              ((>= k end))
1206            (declare (type kernel:index k))
1207            (multiple-value-bind (ch widep)
1208                (codepoint string k end)
1209              (when widep (incf k))
1210              ;; Handle ASCII specially
1211              (unless (< ch 128)
1212                (let ((class (unicode-combining-class ch)))
1213                  (declare (type (unsigned-byte 8) class))
1214                  (when (and (> last-class class) (not (zerop class)))
1215                    ;; Definitely not normalized
1216                    (return-from normalized-form-p nil))
1217                  (let ((check (normalized-codepoint-p ch form)))
1218                    (unless (eq check :y)
1219                      (return-from normalized-form-p nil)))
1220                  (setf last-class class)))))
1221          t)))
1222    
1223    
1224    ;; Compose a string in place.  The string must already be in decomposed form.
1225    (defun %compose (target)
1226      (declare (type string target)
1227               (optimize (speed 3)))
1228      (let ((len (length target))
1229            (starter-pos 0))
1230        (declare (type kernel:index starter-pos))
1231        (multiple-value-bind (starter-ch wide)
1232            (codepoint target 0 len)
1233          (let ((comp-pos (if wide 2 1))
1234                (last-class (unicode-combining-class starter-ch)))
1235            (declare (type (integer 0 256) last-class)
1236                     (type kernel:index comp-pos))
1237            (unless (zerop last-class)
1238              ;; Fix for strings starting with a combining character
1239              (setf last-class 256))
1240            ;; Loop on decomposed characters, combining where possible
1241            (do ((decomp-pos comp-pos (1+ decomp-pos)))
1242                ((>= decomp-pos len))
1243              (declare (type kernel:index decomp-pos))
1244              (multiple-value-bind (ch wide)
1245                  (codepoint target decomp-pos len)
1246                (when wide (incf decomp-pos))
1247                (let ((ch-class (unicode-combining-class ch))
1248                      (composite (get-pairwise-composition starter-ch ch)))
1249                  (declare (type (integer 0 256) ch-class))
1250                  (cond ((and composite
1251                              (or (< last-class ch-class) (zerop last-class)))
1252                         ;; Don't have to worry about surrogate pairs here
1253                         ;; because the composite is always in the BMP.
1254                         (setf (aref target starter-pos) (code-char composite))
1255                         (setf starter-ch composite))
1256                        (t
1257                         (when (zerop ch-class)
1258                           (setf starter-pos comp-pos)
1259                           (setf starter-ch ch))
1260                         (setf last-class ch-class)
1261                         (multiple-value-bind (hi lo)
1262                             (surrogates ch)
1263                           (setf (aref target comp-pos) hi)
1264                           (when lo
1265                             (incf comp-pos)
1266                             (setf (aref target comp-pos) lo))
1267                           (incf comp-pos)))))))
1268            (shrink-vector target comp-pos)))))
1269    
1270    (defun string-to-nfd (string)
1271      "Convert String to Unicode Normalization Form D (NFD) using the
1272      canonical decomposition.  The NFD string is returned"
1273      (decompose string nil))
1274    
1275    (defun string-to-nfkd (string)
1276      "Convert String to Unicode Normalization Form KD (NFKD) uisng the
1277      compatible decomposition form.  The NFKD string is returned."
1278      (decompose string t))
1279    
1280    #+unicode
1281    (defun string-to-nfc (string)
1282      "Convert String to Unicode Normalization Form C (NFC).  If the
1283      string a simple string and is already normalized, the original
1284      string is returned."
1285      (if (normalized-form-p string :nfc)
1286          (if (simple-string-p string) string (coerce string 'simple-string))
1287          (coerce (if (normalized-form-p string :nfd)
1288                      (%compose (copy-seq string))
1289                      (%compose (string-to-nfd string)))
1290                  'simple-string)))
1291    
1292    #-unicode  ;; Needed by package.lisp
1293    (defun string-to-nfc (string)
1294      (if (simple-string-p string) string (coerce string 'simple-string)))
1295    
1296    (defun string-to-nfkc (string)
1297      "Convert String to Unicode Normalization Form KC (NFKC).  If the
1298      string is a simple string and is already normalized, the original
1299      string is returned."
1300      (if (normalized-form-p string :nfkc)
1301          (if (simple-string-p string) string (coerce string 'simple-string))
1302          (coerce (if (normalized-form-p string :nfkd)
1303                      (%compose (copy-seq string))
1304                      (%compose (string-to-nfkd string)))
1305                  'simple-string)))

Legend:
Removed from v.1.12  
changed lines
  Added in v.1.12.30.30

  ViewVC Help
Powered by ViewVC 1.1.5