/[cmucl]/src/code/string.lisp
ViewVC logotype

Diff of /src/code/string.lisp

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.12 by emarsden, Fri Apr 11 15:41:59 2003 UTC revision 1.12.30.26 by rtoy, Wed May 27 20:34:19 2009 UTC
# Line 16  Line 16 
16  ;;; ****************************************************************  ;;; ****************************************************************
17  ;;;  ;;;
18  (in-package "LISP")  (in-package "LISP")
19  (export '(char schar string  (export '(char schar glyph sglyph string
20            string= string-equal string< string> string<= string>= string/=            string= string-equal string< string> string<= string>= string/=
21            string-lessp string-greaterp string-not-lessp string-not-greaterp            string-lessp string-greaterp string-not-lessp string-not-greaterp
22            string-not-equal            string-not-equal
23              string-to-nfd string-to-nfkd string-to-nfc string-to-nfkc
24            make-string            make-string
25            string-trim string-left-trim string-right-trim            string-trim string-left-trim string-right-trim
26            string-upcase            string-upcase
# Line 27  Line 28 
28            nstring-capitalize))            nstring-capitalize))
29    
30    
31    (declaim (inline surrogatep surrogates-to-codepoint codepoint surrogates))
32    
33    (defun surrogatep (c &optional surrogate-type)
34      "Test if C is a surrogate.  C may be either an integer or a
35      character. Surrogate-type indicates what kind of surrogate to test
36      for.  :High means to test for the high (leading) surrogate; :Low
37      tests for the low (trailing surrogate).  A value of :Any or Nil
38      tests for any surrogate value (high or low)."
39      (declare (type (or character codepoint) c))
40      (let ((code (if (characterp c)
41                      (char-code c)
42                      c)))
43        (ecase surrogate-type
44          ((:high :leading)
45           ;; Test for high surrogate
46           (<= #xD800 code #xDBFF))
47          ((:low :trailing)
48           ;; Test for low surrogate
49           (<= #xDC00 code #xDFFF))
50          ((:any nil)
51           ;; Test for any surrogate
52           (<= #xD800 code #xDFFF)))))
53    
54    (defun surrogates-to-codepoint (hi lo)
55      "Convert the given Hi and Lo surrogate characters to the
56      corresponding codepoint value"
57      (declare (type character hi lo))
58      (+ (ash (- (the (integer #xD800 #xDBFF) (char-code hi)) #xD800) 10)
59         (the (integer #xDC00 #xDFFF) (char-code lo)) #x2400))
60    
61    (defun codepoint (string i &optional (end (length string)))
62      "Return the codepoint value from String at position I.  If that
63      position is a surrogate, it is combined with either the previous or
64      following character (when possible) to compute the codepoint.  The
65      second return value is NIL if the position is not a surrogate pair.
66      Otherwise +1 or -1 is returned if the position is the high or low
67      surrogate value, respectively."
68      (declare (type simple-string string) (type kernel:index i end))
69      (let ((code (char-code (schar string i))))
70        (cond ((and (surrogatep code :high) (< (1+ i) end))
71               (let ((tmp (char-code (schar string (1+ i)))))
72                 (if (surrogatep tmp :low)
73                     (values (+ (ash (- code #xD800) 10) tmp #x2400) +1)
74                     (values code nil))))
75              ((and (surrogatep code :low) (> i 0))
76               (let ((tmp (char-code (schar string (1- i)))))
77                 (if (surrogatep tmp :high)
78                     (values (+ (ash (- tmp #xD800) 10) code #x2400) -1)
79                     (values code nil))))
80              (t (values code nil)))))
81    
82    (defun surrogates (codepoint)
83      "Return the high and low surrogate characters for Codepoint.  If
84      Codepoint is in the BMP, the first return value is the corresponding
85      character and the second is NIL."
86      (declare (type codepoint codepoint))
87      (if (< codepoint #x10000)
88          (values (code-char codepoint) nil)
89          (let* ((tmp (- codepoint #x10000))
90                 (hi (logior (ldb (byte 10 10) tmp) #xD800))
91                 (lo (logior (ldb (byte 10 0) tmp) #xDC00)))
92            (values (code-char hi) (code-char lo)))))
93    
94    (defun (setf codepoint) (codepoint string i)
95      "Set the codepoint at string position I to the Codepoint.  If the
96      codepoint requires a surrogate pair, the high (leading surrogate) is
97      stored at position I and the low (trailing) surrogate is stored at
98      I+1"
99      (declare (type codepoint codepoint)
100               (type simple-string string))
101      (let ((widep nil))
102        (multiple-value-bind (hi lo)
103            (surrogates codepoint)
104          (setf (aref string i) hi)
105          (when lo
106            (setf (aref string (1+ i)) lo)
107            (setf widep t)))
108        (values codepoint widep)))
109    
110    (defun utf16-string-p (string)
111      "Check if String is a valid UTF-16 string.  If the string is valid,
112      T is returned.  If the string is not valid, NIL is returned, and the
113      second value is the index into the string of the invalid character."
114      (do ((len (length string))
115           (index 0 (1+ index)))
116          ((>= index len)
117           t)
118        (multiple-value-bind (codepoint wide)
119            (codepoint string index)
120          ;; We step through the string in order.  If there are any
121          ;; surrogates pairs, we must reach the lead surrogate first,
122          ;; which means WIDE is +1.  Otherwise, we have an invalid
123          ;; surrogate pair.  If we get any codepoint that is in
124          ;; the surrogate range, we also have an invalid string.
125          (when (or (eq wide -1)
126                    (surrogatep codepoint))
127            (return-from utf16-string-p (values nil index)))
128          (when wide (incf index)))))
129    
130  (defun string (X)  (defun string (X)
131    "Coerces X into a string.  If X is a string, X is returned.  If X is a    "Coerces X into a string.  If X is a string, X is returned.  If X is a
132     symbol, X's pname is returned.  If X is a character then a one element    symbol, X's pname is returned.  If X is a character then a one element
133     string containing that character is returned.  If X cannot be coerced    string containing that character is returned.  If X cannot be coerced
134     into a string, an error occurs."    into a string, an error occurs."
135    (cond ((stringp x) x)    (cond ((stringp x) x)
136          ((symbolp x) (symbol-name x))          ((symbolp x) (symbol-name x))
137          ((characterp x)          ((characterp x)
# Line 106  Line 206 
206    
207  (defun schar (string index)  (defun schar (string index)
208    "SCHAR returns the character object at an indexed position in a string    "SCHAR returns the character object at an indexed position in a string
209     just as CHAR does, except the string must be a simple-string."    just as CHAR does, except the string must be a simple-string."
210    (declare (optimize (safety 1)))    (declare (optimize (safety 1)))
211    (schar string index))    (schar string index))
212    
# Line 143  Line 243 
243                         (the fixnum end2))                         (the fixnum end2))
244                      ,(if lessp                      ,(if lessp
245                           `nil                           `nil
246                         `(- (the fixnum index) ,offset1)))                           `(- (the fixnum index) ,offset1)))
247                       #-unicode
248                     ((,(if lessp 'char< 'char>)                     ((,(if lessp 'char< 'char>)
249                       (schar string1 index)                       (schar string1 index)
250                       (schar string2 (+ (the fixnum index) (- start2 start1))))                       (schar string2 (+ (the fixnum index) (- start2 start1))))
251                      (- (the fixnum index) ,offset1))                      (- (the fixnum index) ,offset1))
252                     (t nil))                     #-unicode
253                       (t nil)
254                       #+unicode
255                       (t
256                        ;; Compare in code point order.  See
257                        ;; http://icu-project.org/docs/papers/utf16_code_point_order.html
258                        (flet ((fixup (code)
259                                 (if (>= code #xe000)
260                                     (- code #x800)
261                                     (+ code #x2000))))
262                          (declare (inline fixup))
263                          (let* ((c1 (char-code (schar string1 index)))
264                                 (c2 (char-code (schar string2
265                                                       (+ (the fixnum index)
266                                                          (- start2 start1))))))
267                            (cond ((and (>= c1 #xd800)
268                                        (>= c2 #xd800))
269                                   (let ((fix-c1 (fixup c1))
270                                         (fix-c2 (fixup c2)))
271                                     (if (,(if lessp '< '>) fix-c1 fix-c2)
272                                         (- (the fixnum index) ,offset1)
273                                         nil)))
274                                  (t
275                                   (if (,(if lessp '< '>) c1 c2)
276                                       (- (the fixnum index) ,offset1)
277                                       nil)))))))
278               ,(if equalp `(- (the fixnum end1) ,offset1) 'nil))))))               ,(if equalp `(- (the fixnum end1) ,offset1) 'nil))))))
279  ) ; eval-when  ) ; eval-when
280    
# Line 281  Line 407 
407    (if lessp    (if lessp
408        (if equalp        (if equalp
409            ;; STRING-NOT-GREATERP            ;; STRING-NOT-GREATERP
410            (values '<= `(not (char-greaterp char1 char2)))            (values '<=
411                      #-unicode `(not (char-greaterp char1 char2))
412                      #+unicode `(<= char1 char2))
413            ;; STRING-LESSP            ;; STRING-LESSP
414            (values '< `(char-lessp char1 char2)))            (values '<
415                      #-unicode `(char-lessp char1 char2)
416                      #+unicode `(< char1 char2)))
417        (if equalp        (if equalp
418            ;; STRING-NOT-LESSP            ;; STRING-NOT-LESSP
419            (values '>= `(not (char-lessp char1 char2)))            (values '>=
420                      #-unicode `(not (char-lessp char1 char2))
421                      #+unicode `(>= char1 char2))
422            ;; STRING-GREATERP            ;; STRING-GREATERP
423            (values '> `(char-greaterp char1 char2)))))            (values '>
424                      #-unicode `(char-greaterp char1 char2)
425                      #+unicode `(> char1 char2)))))
426    
427    #-unicode
428  (defmacro string-less-greater-equal (lessp equalp)  (defmacro string-less-greater-equal (lessp equalp)
429    (multiple-value-bind (length-test character-test)    (multiple-value-bind (length-test character-test)
430                         (string-less-greater-equal-tests lessp equalp)                         (string-less-greater-equal-tests lessp equalp)
# Line 314  Line 449 
449                     (return (- index1 offset1))                     (return (- index1 offset1))
450                     (return ()))))))))                     (return ()))))))))
451    
452    ;; Convert to lowercase for case folding, to match what Unicode
453    ;; CaseFolding.txt says.  An example where this matters: U+1E9E maps
454    ;; to U+00DF.  But the uppercase version of U+00DF is U+00DF.
455    #+unicode
456    (defmacro equal-char-codepoint (codepoint)
457      `(let ((ch ,codepoint))
458         ;; Handle ASCII separately for bootstrapping and for unidata missing.
459         (if (< 64 ch 91)
460             (+ ch 32)
461             #-(and unicode (not unicode-bootstrap))
462             ch
463             #+(and unicode (not unicode-bootstrap))
464             (if (> ch 127) (unicode-lower ch) ch))))
465    
466    #+unicode
467    (defmacro string-less-greater-equal (lessp equalp)
468      (multiple-value-bind (length-test character-test)
469          (string-less-greater-equal-tests lessp equalp)
470        `(with-two-strings string1 string2 start1 end1 offset1 start2 end2
471           (let ((slen1 (- (the fixnum end1) start1))
472                 (slen2 (- (the fixnum end2) start2)))
473             (declare (fixnum slen1 slen2))
474             (if (or (minusp slen1) (minusp slen2))
475                 ;;prevent endless looping later.
476                 (error "Improper bounds for string comparison."))
477             (do ((index1 start1 (1+ index1))
478                  (index2 start2 (1+ index2)))
479                 ((or (= index1 (the fixnum end1)) (= index2 (the fixnum end2)))
480                  (if (,length-test slen1 slen2) (- index1 offset1)))
481               (declare (fixnum index1 index2))
482               (multiple-value-bind (char1 wide1)
483                   (codepoint string1 index1)
484                 (declare (type codepoint char1))
485                 (multiple-value-bind (char2 wide2)
486                     (codepoint string2 index2)
487                   (declare (type codepoint char2))
488                   (setf char1 (equal-char-codepoint char1))
489                   (setf char2 (equal-char-codepoint char2))
490                   (if (= char1 char2)
491                       (progn
492                         (when wide1 (incf index1))
493                         (when wide2 (incf index2)))
494                       (if ,character-test
495                           (return (- index1 offset1))
496                           (return ()))))))))))
497    
498  ) ; eval-when  ) ; eval-when
499    
500  (defun string-lessp* (string1 string2 start1 end1 start2 end2)  (defun string-lessp* (string1 string2 start1 end1 start2 end2)
# Line 360  Line 541 
541    
542  (defun make-string (count &key element-type ((:initial-element fill-char)))  (defun make-string (count &key element-type ((:initial-element fill-char)))
543    "Given a character count and an optional fill character, makes and returns    "Given a character count and an optional fill character, makes and returns
544     a new string Count long filled with the fill character."    a new string Count long filled with the fill character."
545    (declare (type fixnum count))    (declare (type fixnum count))
546    (assert (subtypep element-type 'character))    (assert (subtypep element-type 'character))
547    (if fill-char    (if fill-char
# Line 391  Line 572 
572               (new-index (- start offset) (1+ new-index)))               (new-index (- start offset) (1+ new-index)))
573              ((= index (the fixnum end)))              ((= index (the fixnum end)))
574            (declare (fixnum index new-index))            (declare (fixnum index new-index))
575            (setf (schar newstring new-index)            (multiple-value-bind (code wide) (codepoint string index)
576                  (char-upcase (schar string index))))              (when wide (incf index))
577                ;; Handle ASCII specially because this is called early in
578                ;; initialization, before unidata is available.
579                (cond ((< 96 code 123) (decf code 32))
580                      ((> code 127) (setq code (unicode-upper code))))
581                ;;@@ WARNING: this may, in theory, need to extend newstring
582                ;;  but that never actually occurs as of Unicode 5.1.0,
583                ;;  so I'm just going to ignore it for now...
584                (multiple-value-bind (hi lo) (surrogates code)
585                  (setf (schar newstring new-index) hi)
586                  (when lo
587                    (setf (schar newstring (incf new-index)) lo)))))
588            ;;@@ WARNING: see above
589          (do ((index end (1+ index))          (do ((index end (1+ index))
590               (new-index (- (the fixnum end) offset) (1+ new-index)))               (new-index (- (the fixnum end) offset) (1+ new-index)))
591              ((= index offset-slen))              ((= index offset-slen))
# Line 420  Line 613 
613               (new-index (- start offset) (1+ new-index)))               (new-index (- start offset) (1+ new-index)))
614              ((= index (the fixnum end)))              ((= index (the fixnum end)))
615            (declare (fixnum index new-index))            (declare (fixnum index new-index))
616            (setf (schar newstring new-index)            (multiple-value-bind (code wide) (codepoint string index)
617                  (char-downcase (schar string index))))              (when wide (incf index))
618                ;; Handle ASCII specially because this is called early in
619                ;; initialization, before unidata is available.
620                (cond ((< 64 code 91) (incf code 32))
621                      ((> code 127) (setq code (unicode-lower code))))
622                ;;@@ WARNING: this may, in theory, need to extend newstring
623                ;;  but that never actually occurs as of Unicode 5.1.0,
624                ;;  so I'm just going to ignore it for now...
625                (multiple-value-bind (hi lo) (surrogates code)
626                  (setf (schar newstring new-index) hi)
627                  (when lo
628                    (setf (schar newstring (incf new-index)) lo)))))
629            ;;@@ WARNING: see above
630          (do ((index end (1+ index))          (do ((index end (1+ index))
631               (new-index (- (the fixnum end) offset) (1+ new-index)))               (new-index (- (the fixnum end) offset) (1+ new-index)))
632              ((= index offset-slen))              ((= index offset-slen))
# Line 459  Line 664 
664                   (setq newword t))                   (setq newword t))
665                  (newword                  (newword
666                   ;;char is first case-modifiable after non-case-modifiable                   ;;char is first case-modifiable after non-case-modifiable
667                   (setq char (char-upcase char))                   (setq char (char-titlecase char))
668                   (setq newword ()))                   (setq newword ()))
669                  ;;char is case-modifiable, but not first                  ;;char is case-modifiable, but not first
670                  (t (setq char (char-downcase char))))                  (t (setq char (char-downcase char))))
# Line 480  Line 685 
685        (do ((index start (1+ index)))        (do ((index start (1+ index)))
686            ((= index (the fixnum end)))            ((= index (the fixnum end)))
687          (declare (fixnum index))          (declare (fixnum index))
688          (setf (schar string index) (char-upcase (schar string index)))))          (multiple-value-bind (code wide) (codepoint string index)
689              (declare (ignore wide))
690              ;; Handle ASCII specially because this is called early in
691              ;; initialization, before unidata is available.
692              (cond ((< 96 code 123) (decf code 32))
693                    ((> code 127) (setq code (unicode-upper code))))
694              ;;@@ WARNING: this may, in theory, need to extend string
695              ;;      (which, obviously, we can't do here.  Unless
696              ;;       STRING is adjustable, maybe)
697              ;;  but that never actually occurs as of Unicode 5.1.0,
698              ;;  so I'm just going to ignore it for now...
699              (multiple-value-bind (hi lo) (surrogates code)
700                (setf (schar string index) hi)
701                (when lo
702                  (setf (schar string (incf index)) lo))))))
703      save-header))      save-header))
704    
705  (defun nstring-downcase (string &key (start 0) end)  (defun nstring-downcase (string &key (start 0) end)
# Line 492  Line 711 
711        (do ((index start (1+ index)))        (do ((index start (1+ index)))
712            ((= index (the fixnum end)))            ((= index (the fixnum end)))
713          (declare (fixnum index))          (declare (fixnum index))
714          (setf (schar string index) (char-downcase (schar string index)))))          (multiple-value-bind (code wide) (codepoint string index)
715              (declare (ignore wide))
716              (cond ((< 64 code 91) (incf code 32))
717                    ((> code 127) (setq code (unicode-lower code))))
718              ;;@@ WARNING: this may, in theory, need to extend string
719              ;;      (which, obviously, we can't do here.  Unless
720              ;;       STRING is adjustable, maybe)
721              ;;  but that never actually occurs as of Unicode 5.1.0,
722              ;;  so I'm just going to ignore it for now...
723              (multiple-value-bind (hi lo) (surrogates code)
724                (setf (schar string index) hi)
725                (when lo
726                  (setf (schar string (incf index)) lo))))))
727      save-header))      save-header))
728    
729  (defun nstring-capitalize (string &key (start 0) end)  (defun nstring-capitalize (string &key (start 0) end)
# Line 514  Line 745 
745                 (setq newword t))                 (setq newword t))
746                (newword                (newword
747                 ;;char is first case-modifiable after non-case-modifiable                 ;;char is first case-modifiable after non-case-modifiable
748                 (setf (schar string index) (char-upcase char))                 (setf (schar string index) (char-titlecase char))
749                 (setq newword ()))                 (setq newword ()))
750                (t                (t
751                 (setf (schar string index) (char-downcase char))))))                 (setf (schar string index) (char-downcase char))))))
# Line 557  Line 788 
788                             (1+ index))                             (1+ index))
789                          (declare (fixnum index)))))                          (declare (fixnum index)))))
790        (subseq (the simple-string string) left-end right-end))))        (subseq (the simple-string string) left-end right-end))))
791    
792    (declaim (inline %glyph-f %glyph-b))
793    (defun %glyph-f (string index)
794      (declare (optimize (speed 3) (space 0) (safety 0) (debug 0))
795               (type simple-string string) (type kernel:index index))
796      (let* ((prev 0)
797             (l (length string))
798             (c (codepoint string index l))
799             (n (+ index (if (> c #xFFFF) 2 1))))
800        (declare (type codepoint c) (type kernel:index l n))
801        (loop while (< n l) do
802          (let* ((c (codepoint string n l))
803                 (d (the (unsigned-byte 8) (unicode-combining-class c))))
804            (when (or (zerop d) (< d prev))
805              (return))
806            (setq prev d)
807            (incf n (if (> c #xFFFF) 2 1))))
808        n))
809    
810    (defun %glyph-b (string index)
811      (declare (optimize (speed 3) (space 0) (safety 0) (debug 0))
812               (type simple-string string) (type kernel:index index))
813      (let* ((prev 255)
814             (n (1- index)))
815        (declare (type kernel:index n))
816        (loop until (< n 0) do
817          (let* ((c (codepoint string n 0))
818                 (d (the (unsigned-byte 8) (unicode-combining-class c))))
819            (cond ((zerop d) (return))
820                  ((> d prev) (incf n (if (> c #xFFFF) 2 1)) (return)))
821            (setq prev d)
822            (decf n (if (> c #xFFFF) 2 1))))
823        n))
824    
825    (defun glyph (string index &key (from-end nil))
826      "GLYPH returns the glyph at the indexed position in a string, and the
827      position of the next glyph (or NIL) as a second value.  A glyph is
828      a substring consisting of the character at INDEX followed by all
829      subsequent combining characters."
830      (declare (type simple-string string) (type kernel:index index))
831      #-unicode
832      (char string index)
833      #+unicode
834      (with-array-data ((string string) (start) (end))
835        (declare (ignore start end))
836        (let ((n (if from-end (%glyph-b string index) (%glyph-f string index))))
837          (if from-end
838              (values (subseq string n index) (and (> n 0) n))
839              (values (subseq string index n) (and (< n (length string)) n))))))
840    
841    (defun sglyph (string index &key (from-end nil))
842      "SGLYPH returns the glyph at the indexed position, the same as GLYPH,
843      except that the string must be a simple-string"
844      (declare (type simple-string string) (type kernel:index index))
845      #-unicode
846      (schar string index)
847      #+unicode
848      (let ((n (if from-end (%glyph-b string index) (%glyph-f string index))))
849        (if from-end
850            (values (subseq string n index) (and (> n 0) n))
851            (values (subseq string index n) (and (< n (length string)) n)))))
852    
853    #+unicode
854    (defun string-reverse* (sequence)
855      (declare (optimize (speed 3) (space 0) (safety 0))
856               (type string sequence))
857      (with-string sequence
858        (let* ((length (- end start))
859               (string (make-string length))
860               (j length))
861          (declare (type kernel:index length j))
862          (loop for i = start then n as n = (%glyph-f sequence i) do
863                (replace string sequence :start1 (decf j (- n i)) :start2 i :end2 n)
864                while (< n end))
865          string)))
866    
867    #+unicode
868    (defun string-nreverse* (sequence)
869      (declare (optimize (speed 3) (space 0) (safety 0))
870               (type string sequence))
871      (with-string sequence
872        (flet ((rev (start end)
873                 (do ((i start (1+ i))
874                      (j (1- end) (1- j)))
875                     ((>= i j))
876                   (declare (type kernel:index i j))
877                   (rotatef (schar sequence i) (schar sequence j)))))
878          (let ((len end))
879            (loop for i = start then n as n = (%glyph-f sequence i) do
880              (rev i n) while (< n len))
881            (rev start end))))
882      sequence)
883    
884    
885    
886    
887    (defun decompose (string &optional (compatibility t))
888      (declare (type string string))
889      (let ((result (make-string (cond ((< (length string) 40)
890                                        (* 5 (length string)))
891                                       ((< (length string) 4096)
892                                        (* 2 (length string)))
893                                       (t (round (length string) 5/6)))))
894            (fillptr 0))
895        (declare (type kernel:index fillptr))
896        (labels ((rec (string)
897                   (declare (type simple-string string))
898                   (do ((i 0 (1+ i)))
899                       ((= i (length string)))
900                     (declare (type kernel:index i))
901                     (multiple-value-bind (code wide) (codepoint string i)
902                       (when wide (incf i))
903                       (let ((decomp (unicode-decomp code compatibility)))
904                         (if decomp (rec decomp) (out code))))))
905                 (out (code)
906                   (multiple-value-bind (hi lo) (surrogates code)
907                     (outch hi)
908                     (when lo
909                       (outch lo))
910                     (let ((cc (unicode-combining-class code)))
911                       (unless (zerop cc)
912                         (order lo cc (- fillptr (if lo 3 2)))))))
913                 (outch (char)
914                   (when (= fillptr (length result))
915                     (let ((tmp (make-string (round (length result) 5/6))))
916                       (replace tmp result)
917                       (setq result tmp)))
918                   (setf (schar result fillptr) char)
919                   (incf fillptr))
920                 (order (wide1 cc last)
921                   (loop until (minusp last) do
922                     (multiple-value-bind (code2 wide2) (codepoint result last)
923                       (let ((cc2 (unicode-combining-class code2)))
924                         (cond ((zerop cc2) (return))
925                               ((> cc2 cc)
926                                (case (+ (if wide2 2 0) (if wide1 1 0))
927                                  (0 (rotatef (schar result last)
928                                              (schar result (1+ last))))
929                                  (1 (rotatef (schar result last)
930                                              (schar result (+ last 1))
931                                              (schar result (+ last 2))))
932                                  (2 (rotatef (schar result last)
933                                              (schar result (1- last))
934                                              (schar result (1+ last))))
935                                  (3 (rotatef (schar result last)
936                                              (schar result (+ last 2)))
937                                     (rotatef (schar result (1- last))
938                                              (schar result (1+ last)))))
939                                (decf last (if wide2 2 1)))
940                               (t (return))))))))
941          (with-array-data ((string string) (start) (end))
942            (declare (ignore start end))
943            (rec string))
944          (shrink-vector result fillptr))))
945    
946    (declaim (inline normalized-codepoint-p))
947    (defun normalized-codepoint-p (cp form)
948      (ecase form
949        (:nfc (unicode-nfc-qc cp))
950        (:nfkc (unicode-nfkc-qc cp))
951        (:nfd (unicode-nfd-qc cp))
952        (:nfkd (unicode-nfkd-qc cp))))
953    
954    ;; Perform check to see if string is already normalized.  The Unicode
955    ;; example can return YES, NO, or MAYBE.  For our purposes, only YES
956    ;; is important, for which we return T.   For NO or MAYBE, we return NIL.
957    (defun normalized-form-p (string &optional (form :nfc))
958      (declare (type (member :nfc :nfkc :nfd :nfkd) form)
959               (optimize (speed 3)))
960      (with-string string
961        (let ((last-class 0))
962          (declare (type (integer 0 256) last-class))
963          (do ((k start (1+ k)))
964              ((>= k end))
965            (declare (type kernel:index k))
966            (multiple-value-bind (ch widep)
967                (codepoint string k end)
968              (when widep (incf k))
969              ;; Handle ASCII specially
970              (unless (< ch 128)
971                (let ((class (unicode-combining-class ch)))
972                  (declare (type (unsigned-byte 8) class))
973                  (when (and (> last-class class) (not (zerop class)))
974                    ;; Definitely not normalized
975                    (return-from normalized-form-p nil))
976                  (let ((check (normalized-codepoint-p ch form)))
977                    (unless (eq check :y)
978                      (return-from normalized-form-p nil)))
979                  (setf last-class class)))))
980          t)))
981    
982    
983    ;; Compose a string in place.  The string must already be in decomposed form.
984    (defun %compose (target)
985      (declare (type string target)
986               (optimize (speed 3)))
987      (let ((len (length target))
988            (starter-pos 0))
989        (declare (type kernel:index starter-pos))
990        (multiple-value-bind (starter-ch wide)
991            (codepoint target 0 len)
992          (let ((comp-pos (if wide 2 1))
993                (last-class (unicode-combining-class starter-ch)))
994            (declare (type (integer 0 256) last-class)
995                     (type kernel:index comp-pos))
996            (unless (zerop last-class)
997              ;; Fix for strings starting with a combining character
998              (setf last-class 256))
999            ;; Loop on decomposed characters, combining where possible
1000            (do ((decomp-pos comp-pos (1+ decomp-pos)))
1001                ((>= decomp-pos len))
1002              (declare (type kernel:index decomp-pos))
1003              (multiple-value-bind (ch wide)
1004                  (codepoint target decomp-pos len)
1005                (when wide (incf decomp-pos))
1006                (let ((ch-class (unicode-combining-class ch))
1007                      (composite (get-pairwise-composition starter-ch ch)))
1008                  (declare (type (integer 0 256) ch-class))
1009                  (cond ((and composite
1010                              (or (< last-class ch-class) (zerop last-class)))
1011                         ;; Don't have to worry about surrogate pairs here
1012                         ;; because the composite is always in the BMP.
1013                         (setf (aref target starter-pos) (code-char composite))
1014                         (setf starter-ch composite))
1015                        (t
1016                         (when (zerop ch-class)
1017                           (setf starter-pos comp-pos)
1018                           (setf starter-ch ch))
1019                         (setf last-class ch-class)
1020                         (multiple-value-bind (hi lo)
1021                             (surrogates ch)
1022                           (setf (aref target comp-pos) hi)
1023                           (when lo
1024                             (incf comp-pos)
1025                             (setf (aref target comp-pos) lo))
1026                           (incf comp-pos)))))))
1027            (shrink-vector target comp-pos)))))
1028    
1029    (defun string-to-nfd (string)
1030      "Convert String to Unicode Normalization Form D (NFD) using the
1031      canonical decomposition.  The NFD string is returned"
1032      (decompose string nil))
1033    
1034    (defun string-to-nfkd (string)
1035      "Convert String to Unicode Normalization Form KD (NFKD) uisng the
1036      compatible decomposition form.  The NFKD string is returned."
1037      (decompose string t))
1038    
1039    #+unicode
1040    (defun string-to-nfc (string)
1041      "Convert String to Unicode Normalization Form C (NFC)."
1042      (if (normalized-form-p string :nfc)
1043          (if (simple-string-p string) string (coerce string 'simple-string))
1044          (coerce (if (normalized-form-p string :nfd)
1045                      (%compose (copy-seq string))
1046                      (%compose (string-to-nfd string)))
1047                  'simple-string)))
1048    
1049    #-unicode  ;; Needed by package.lisp
1050    (defun string-to-nfc (string)
1051      (if (simple-string-p string) string (coerce string 'simple-string)))
1052    
1053    (defun string-to-nfkc (string)
1054      "Convert String to Unicode Normalization Form KC (NFKC)."
1055      (if (normalized-form-p string :nfkc)
1056          (if (simple-string-p string) string (coerce string 'simple-string))
1057          (coerce (if (normalized-form-p string :nfkd)
1058                      (%compose (copy-seq string))
1059                      (%compose (string-to-nfkd string)))
1060                  'simple-string)))

Legend:
Removed from v.1.12  
changed lines
  Added in v.1.12.30.26

  ViewVC Help
Powered by ViewVC 1.1.5