/[cmucl]/src/code/string.lisp
ViewVC logotype

Diff of /src/code/string.lisp

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.20.4.1 by rtoy, Wed Oct 7 14:46:38 2009 UTC revision 1.29 by rtoy, Tue Oct 26 13:56:08 2010 UTC
# Line 16  Line 16 
16  ;;; ****************************************************************  ;;; ****************************************************************
17  ;;;  ;;;
18  (in-package "LISP")  (in-package "LISP")
19    (intl:textdomain "cmucl")
20    
21  (export '(char schar glyph sglyph string  (export '(char schar glyph sglyph string
22            string= string-equal string< string> string<= string>= string/=            string= string-equal string< string> string<= string>= string/=
23            string-lessp string-greaterp string-not-lessp string-not-greaterp            string-lessp string-greaterp string-not-lessp string-not-greaterp
# Line 38  Line 40 
40    for.  :High means to test for the high (leading) surrogate; :Low    for.  :High means to test for the high (leading) surrogate; :Low
41    tests for the low (trailing surrogate).  A value of :Any or Nil    tests for the low (trailing surrogate).  A value of :Any or Nil
42    tests for any surrogate value (high or low)."    tests for any surrogate value (high or low)."
43    (declare (type (or character codepoint) char-or-code))    (declare (type (or character codepoint) char-or-code)
44               (type (or null (member :high :leading :low :trailing :any)) surrogate-type)
45               (optimize (inhibit-warnings 3)))
46    (let ((code (if (characterp char-or-code)    (let ((code (if (characterp char-or-code)
47                    (char-code char-or-code)                    (char-code char-or-code)
48                    char-or-code)))                    char-or-code)))
# Line 72  Line 76 
76      (cond ((and (surrogatep code :high) (< (1+ i) end))      (cond ((and (surrogatep code :high) (< (1+ i) end))
77             (let ((tmp (char-code (schar string (1+ i)))))             (let ((tmp (char-code (schar string (1+ i)))))
78               (if (surrogatep tmp :low)               (if (surrogatep tmp :low)
79                   (values (+ (ash (- code #xD800) 10) tmp #x2400) +1)                   (values (truly-the codepoint (+ (ash (- code #xD800) 10) tmp #x2400))
80                   (values code nil))))                           +1)
81                     (values (truly-the codepoint code) nil))))
82            ((and (surrogatep code :low) (> i 0))            ((and (surrogatep code :low) (> i 0))
83             (let ((tmp (char-code (schar string (1- i)))))             (let ((tmp (char-code (schar string (1- i)))))
84               (if (surrogatep tmp :high)               (if (surrogatep tmp :high)
85                   (values (+ (ash (- tmp #xD800) 10) code #x2400) -1)                   (values (truly-the codepoint (+ (ash (- tmp #xD800) 10) code #x2400))
86                   (values code nil))))                           -1)
87            (t (values code nil)))))                   (values (truly-the codepoint code) nil))))
88              (t (values (truly-the codepoint code) nil)))))
89    
90  (defun surrogates (codepoint)  (defun surrogates (codepoint)
91    "Return the high and low surrogate characters for Codepoint.  If    "Return the high and low surrogate characters for Codepoint.  If
# Line 111  Line 117 
117    
118  #+unicode  #+unicode
119  (defun utf16-string-p (string)  (defun utf16-string-p (string)
120    "Check if String is a valid UTF-16 string.  If the string is valid,    _N"Check if String is a valid UTF-16 string.  If the string is valid,
121    T is returned.  If the string is not valid, NIL is returned, and the    T is returned.  If the string is not valid, NIL is returned, and the
122    second value is the index into the string of the invalid character.    second value is the index into the string of the invalid character.
123    A string is also invalid if it contains any unassigned codepoints."    A string is also invalid if it contains any unassigned codepoints."
# Line 147  Line 153 
153           (error 'simple-type-error           (error 'simple-type-error
154                  :datum x                  :datum x
155                  :expected-type '(or string symbol character)                  :expected-type '(or string symbol character)
156                  :format-control "~S cannot be coerced to a string."                  :format-control (intl:gettext "~S cannot be coerced to a string.")
157                  :format-arguments (list x)))))                  :format-arguments (list x)))))
158    
159  ;;; With-One-String is used to set up some string hacking things.  The keywords  ;;; With-One-String is used to set up some string hacking things.  The keywords
# Line 367  Line 373 
373    
374  #+unicode  #+unicode
375  (defun string-case-fold (string &key (start 0) end (casing :simple))  (defun string-case-fold (string &key (start 0) end (casing :simple))
376    "Return a new string with the case folded according to Casing as follows:    _N"Return a new string with the case folded according to Casing as follows:
377    
378    :SIMPLE  Unicode simple case folding (preserving length)    :SIMPLE  Unicode simple case folding (preserving length)
379    :FULL    Unicode full case folding (possibly changing length)    :FULL    Unicode full case folding (possibly changing length)
# Line 407  Line 413 
413        (declare (fixnum slen1 slen2))        (declare (fixnum slen1 slen2))
414        (if (or (minusp slen1) (minusp slen2))        (if (or (minusp slen1) (minusp slen2))
415            ;;prevent endless looping later.            ;;prevent endless looping later.
416            (error "Improper bounds for string comparison."))            (error (intl:gettext "Improper bounds for string comparison.")))
417        (if (= slen1 slen2)        (if (= slen1 slen2)
418            ;;return () immediately if lengths aren't equal.            ;;return () immediately if lengths aren't equal.
419            (string-not-equal-loop 1 t nil)))))            (string-not-equal-loop 1 t nil)))))
# Line 422  Line 428 
428        (declare (fixnum slen1 slen2))        (declare (fixnum slen1 slen2))
429        (if (or (minusp slen1) (minusp slen2))        (if (or (minusp slen1) (minusp slen2))
430            ;;prevent endless looping later.            ;;prevent endless looping later.
431            (error "Improper bounds for string comparison."))            (error (intl:gettext "Improper bounds for string comparison.")))
432        (cond ((or (minusp slen1) (or (minusp slen2)))        (cond ((or (minusp slen1) (or (minusp slen2)))
433               (error "Improper substring for comparison."))               (error (intl:gettext "Improper substring for comparison.")))
434              ((= slen1 slen2)              ((= slen1 slen2)
435               (string-not-equal-loop 1 nil (- index1 offset1)))               (string-not-equal-loop 1 nil (- index1 offset1)))
436              ((< slen1 slen2)              ((< slen1 slen2)
# Line 470  Line 476 
476           (declare (fixnum slen1 slen2))           (declare (fixnum slen1 slen2))
477           (if (or (minusp slen1) (minusp slen2))           (if (or (minusp slen1) (minusp slen2))
478               ;;prevent endless looping later.               ;;prevent endless looping later.
479               (error "Improper bounds for string comparison."))               (error (intl:gettext "Improper bounds for string comparison.")))
480           (do ((index1 start1 (1+ index1))           (do ((index1 start1 (1+ index1))
481                (index2 start2 (1+ index2))                (index2 start2 (1+ index2))
482                (char1)                (char1)
# Line 509  Line 515 
515           (declare (fixnum slen1 slen2))           (declare (fixnum slen1 slen2))
516           (if (or (minusp slen1) (minusp slen2))           (if (or (minusp slen1) (minusp slen2))
517               ;;prevent endless looping later.               ;;prevent endless looping later.
518               (error "Improper bounds for string comparison."))               (error (intl:gettext "Improper bounds for string comparison.")))
519           (do ((index1 start1 (1+ index1))           (do ((index1 start1 (1+ index1))
520                (index2 start2 (1+ index2)))                (index2 start2 (1+ index2)))
521               ((or (= index1 (the fixnum end1)) (= index2 (the fixnum end2)))               ((or (= index1 (the fixnum end1)) (= index2 (the fixnum end2)))
# Line 661  Line 667 
667    
668  (defun string-upcase (string &key (start 0) end #+unicode (casing :simple))  (defun string-upcase (string &key (start 0) end #+unicode (casing :simple))
669    #-unicode    #-unicode
670    "Given a string, returns a new string that is a copy of it with all    _N"Given a string, returns a new string that is a copy of it with all
671    lower case alphabetic characters converted to uppercase."    lower case alphabetic characters converted to uppercase."
672    #+unicode    #+unicode
673    "Given a string, returns a new string that is a copy of it with all    _N"Given a string, returns a new string that is a copy of it with all
674    lower case alphabetic characters converted to uppercase.  Casing is    lower case alphabetic characters converted to uppercase.  Casing is
675    :simple or :full for simple or full case conversion, respectively."    :simple or :full for simple or full case conversion, respectively."
676    (declare (fixnum start))    (declare (fixnum start))
# Line 747  Line 753 
753    
754  (defun string-downcase (string &key (start 0) end #+unicode (casing :simple))  (defun string-downcase (string &key (start 0) end #+unicode (casing :simple))
755    #-unicode    #-unicode
756    "Given a string, returns a new string that is a copy of it with all    _N"Given a string, returns a new string that is a copy of it with all
757    upper case alphabetic characters converted to lowercase."    upper case alphabetic characters converted to lowercase."
758    #+unicode    #+unicode
759    "Given a string, returns a new string that is a copy of it with all    _N"Given a string, returns a new string that is a copy of it with all
760    upper case alphabetic characters converted to lowercase.  Casing is    upper case alphabetic characters converted to lowercase.  Casing is
761    :simple or :full for simple or full case conversion, respectively."    :simple or :full for simple or full case conversion, respectively."
762    (declare (fixnum start))    (declare (fixnum start))
# Line 841  Line 847 
847                                   #+unicode (casing :simple)                                   #+unicode (casing :simple)
848                                   #+unicode unicode-word-break)                                   #+unicode unicode-word-break)
849    #-unicode    #-unicode
850    "Given a string, returns a copy of the string with the first    _N"Given a string, returns a copy of the string with the first
851    character of each ``word'' converted to upper-case, and remaining    character of each ``word'' converted to upper-case, and remaining
852    chars in the word converted to lower case. A ``word'' is defined    chars in the word converted to lower case. A ``word'' is defined
853    to be a string of case-modifiable characters delimited by    to be a string of case-modifiable characters delimited by
854    non-case-modifiable chars."    non-case-modifiable chars."
855    #+unicode    #+unicode
856    "Given a string, returns a copy of the string with the first    _N"Given a string, returns a copy of the string with the first
857    character of each ``word'' converted to upper-case, and remaining    character of each ``word'' converted to upper-case, and remaining
858    chars in the word converted to lower case. A ``word'' is defined    chars in the word converted to lower case. A ``word'' is defined
859    to be a string of case-modifiable characters delimited by    to be a string of case-modifiable characters delimited by
# Line 984  Line 990 
990              (when widep (incf index)))))))              (when widep (incf index)))))))
991    
992  (defun string-left-trim (char-bag string)  (defun string-left-trim (char-bag string)
993    "Given a set of characters (a list or string) and a string, returns    _N"Given a set of characters (a list or string) and a string, returns
994    a copy of the string with the characters in the set removed from the    a copy of the string with the characters in the set removed from the
995    left end.  If the set of characters is a string, surrogates will be    left end.  If the set of characters is a string, surrogates will be
996    properly handled."    properly handled."
# Line 1032  Line 1038 
1038              (when widep (decf index)))))))              (when widep (decf index)))))))
1039    
1040  (defun string-right-trim (char-bag string)  (defun string-right-trim (char-bag string)
1041    "Given a set of characters (a list or string) and a string, returns    _N"Given a set of characters (a list or string) and a string, returns
1042    a copy of the string with the characters in the set removed from the    a copy of the string with the characters in the set removed from the
1043    right end.  If the set of characters is a string, surrogates will be    right end.  If the set of characters is a string, surrogates will be
1044    properly handled."    properly handled."
# Line 1042  Line 1048 
1048        (subseq string start stop))))        (subseq string start stop))))
1049    
1050  (defun string-trim (char-bag string)  (defun string-trim (char-bag string)
1051    "Given a set of characters (a list or string) and a string, returns a    _N"Given a set of characters (a list or string) and a string, returns a
1052    copy of the string with the characters in the set removed from both    copy of the string with the characters in the set removed from both
1053    ends.  If the set of characters is a string, surrogates will be    ends.  If the set of characters is a string, surrogates will be
1054    properly handled."    properly handled."
# Line 1056  Line 1062 
1062  #-unicode  #-unicode
1063  (progn  (progn
1064  (defun string-left-trim (char-bag string)  (defun string-left-trim (char-bag string)
1065    "Given a set of characters (a list or string) and a string, returns    _N"Given a set of characters (a list or string) and a string, returns
1066    a copy of the string with the characters in the set removed from the    a copy of the string with the characters in the set removed from the
1067    left end."    left end."
1068    (with-string string    (with-string string
# Line 1067  Line 1073 
1073        (declare (fixnum index)))))        (declare (fixnum index)))))
1074    
1075  (defun string-right-trim (char-bag string)  (defun string-right-trim (char-bag string)
1076    "Given a set of characters (a list or string) and a string, returns    _N"Given a set of characters (a list or string) and a string, returns
1077    a copy of the string with the characters in the set removed from the    a copy of the string with the characters in the set removed from the
1078    right end."    right end."
1079    (with-string string    (with-string string
# Line 1077  Line 1083 
1083        (declare (fixnum index)))))        (declare (fixnum index)))))
1084    
1085  (defun string-trim (char-bag string)  (defun string-trim (char-bag string)
1086    "Given a set of characters (a list or string) and a string, returns a    _N"Given a set of characters (a list or string) and a string, returns a
1087    copy of the string with the characters in the set removed from both    copy of the string with the characters in the set removed from both
1088    ends."    ends."
1089    (with-string string    (with-string string
# Line 1317  Line 1323 
1323                (declare (type (integer 0 256) ch-class))                (declare (type (integer 0 256) ch-class))
1324                (cond ((and composite                (cond ((and composite
1325                            (or (< last-class ch-class) (zerop last-class)))                            (or (< last-class ch-class) (zerop last-class)))
1326                       ;; Don't have to worry about surrogate pairs here                       ;; Note: As far as I know, there is no pairwise
1327                       ;; because the composite is always in the BMP.                       ;; composition such that the composite character
1328                       (setf (aref target starter-pos) (code-char composite))                       ;; is outside the BMP but the starter-ch is
1329                       (setf starter-ch composite))                       ;; inside the BMP.  Hence, it is always safe to
1330                         ;; replace the possible surrogate at starter-pos
1331                         ;; with another.  We won't accidentally replace
1332                         ;; the next character with our trailing surrogate
1333                         ;; character.
1334                         (multiple-value-bind (hi lo)
1335                             (surrogates composite)
1336                           (setf (aref target starter-pos) hi)
1337                           (when lo
1338                             (setf (aref target (1+ starter-pos)) lo))
1339                         (setf starter-ch composite)))
1340                      (t                      (t
1341                       (when (zerop ch-class)                       (when (zerop ch-class)
1342                         (setf starter-pos comp-pos)                         (setf starter-pos comp-pos)
# Line 1336  Line 1352 
1352          (shrink-vector target comp-pos)))))          (shrink-vector target comp-pos)))))
1353    
1354  (defun string-to-nfd (string)  (defun string-to-nfd (string)
1355    "Convert String to Unicode Normalization Form D (NFD) using the    _N"Convert String to Unicode Normalization Form D (NFD) using the
1356    canonical decomposition.  The NFD string is returned"    canonical decomposition.  The NFD string is returned"
1357    (decompose string nil))    (decompose string nil))
1358    
1359  (defun string-to-nfkd (string)  (defun string-to-nfkd (string)
1360    "Convert String to Unicode Normalization Form KD (NFKD) uisng the    _N"Convert String to Unicode Normalization Form KD (NFKD) uisng the
1361    compatible decomposition form.  The NFKD string is returned."    compatible decomposition form.  The NFKD string is returned."
1362    (decompose string t))    (decompose string t))
1363    
1364  (defun string-to-nfc (string)  (defun string-to-nfc (string)
1365    "Convert String to Unicode Normalization Form C (NFC).  If the    _N"Convert String to Unicode Normalization Form C (NFC).  If the
1366    string a simple string and is already normalized, the original    string a simple string and is already normalized, the original
1367    string is returned."    string is returned."
1368    (if (normalized-form-p string :nfc)    (if (normalized-form-p string :nfc)
# Line 1357  Line 1373 
1373                'simple-string)))                'simple-string)))
1374    
1375  (defun string-to-nfkc (string)  (defun string-to-nfkc (string)
1376    "Convert String to Unicode Normalization Form KC (NFKC).  If the    _N"Convert String to Unicode Normalization Form KC (NFKC).  If the
1377    string is a simple string and is already normalized, the original    string is a simple string and is already normalized, the original
1378    string is returned."    string is returned."
1379    (if (normalized-form-p string :nfkc)    (if (normalized-form-p string :nfkc)
# Line 1440  Line 1456 
1456  ;;   incorrect; instead, what we need is a new rule:  ;;   incorrect; instead, what we need is a new rule:
1457  ;;  ;;
1458  ;;   *Break after paragraph separators.*  ;;   *Break after paragraph separators.*
1459  ;;    WB3a. Sep  ;;    WB3a. Sep �
1460  ;;   I'll make a propose to the UTC for this.  ;;   I'll make a propose to the UTC for this.
1461  ;;  ;;
1462  ;; Here is Will's translation of those rules (including WB3a)  ;; Here is Will's translation of those rules (including WB3a)
# Line 1682  Line 1698 
1698                              result :start (1+ start) :end next)))                              result :start (1+ start) :end next)))
1699            (write-string string result :start end :end offset-slen))))))            (write-string string result :start end :end offset-slen))))))
1700    
1701    
1702    ;; Some utilities
1703    (defun codepoints-string (seq)
1704      "Convert a sequence of codepoints to a string.  Codepoints outside
1705      the basic multilingual plane (BMP) are converted into the
1706      corresponding surrogate pairs."
1707      (with-output-to-string (s)
1708        (map nil #'(lambda (c)
1709                     (multiple-value-bind (hi lo)
1710                         (surrogates c)
1711                       (write-char hi s)
1712                       (when lo (write-char lo s))))
1713             seq)))
1714    
1715    (defun string-codepoints (s)
1716      "Convert a string to a list of corresponding code points.  Surrogate
1717      pairs in the string are converted into the correspoinding
1718      codepoint."
1719      (declare (type simple-string s))
1720      (let ((len (length s))
1721            cp)
1722        (do ((idx 0))
1723            ((>= idx len))
1724          (multiple-value-bind (c widep)
1725              (codepoint s idx)
1726            (if widep
1727                (incf idx 2)
1728                (incf idx))
1729            (push c cp)))
1730        (nreverse cp)))

Legend:
Removed from v.1.20.4.1  
changed lines
  Added in v.1.29

  ViewVC Help
Powered by ViewVC 1.1.5