/[cmucl]/src/code/string.lisp
ViewVC logotype

Diff of /src/code/string.lisp

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.21.6.1 by rtoy, Thu Feb 25 20:34:52 2010 UTC revision 1.29 by rtoy, Tue Oct 26 13:56:08 2010 UTC
# Line 35  Line 35 
35  (declaim (inline surrogatep surrogates-to-codepoint codepoint surrogates))  (declaim (inline surrogatep surrogates-to-codepoint codepoint surrogates))
36    
37  (defun surrogatep (char-or-code &optional surrogate-type)  (defun surrogatep (char-or-code &optional surrogate-type)
38    _N"Test if C is a surrogate.  C may be either an integer or a    "Test if C is a surrogate.  C may be either an integer or a
39    character. Surrogate-type indicates what kind of surrogate to test    character. Surrogate-type indicates what kind of surrogate to test
40    for.  :High means to test for the high (leading) surrogate; :Low    for.  :High means to test for the high (leading) surrogate; :Low
41    tests for the low (trailing surrogate).  A value of :Any or Nil    tests for the low (trailing surrogate).  A value of :Any or Nil
42    tests for any surrogate value (high or low)."    tests for any surrogate value (high or low)."
43    (declare (type (or character codepoint) char-or-code))    (declare (type (or character codepoint) char-or-code)
44               (type (or null (member :high :leading :low :trailing :any)) surrogate-type)
45               (optimize (inhibit-warnings 3)))
46    (let ((code (if (characterp char-or-code)    (let ((code (if (characterp char-or-code)
47                    (char-code char-or-code)                    (char-code char-or-code)
48                    char-or-code)))                    char-or-code)))
# Line 56  Line 58 
58         (= #b11011 (ash code -11))))))         (= #b11011 (ash code -11))))))
59    
60  (defun surrogates-to-codepoint (hi-surrogate-char lo-surrogate-char)  (defun surrogates-to-codepoint (hi-surrogate-char lo-surrogate-char)
61    _N"Convert the given Hi and Lo surrogate characters to the    "Convert the given Hi and Lo surrogate characters to the
62    corresponding codepoint value"    corresponding codepoint value"
63    (declare (type character hi-surrogate-char lo-surrogate-char))    (declare (type character hi-surrogate-char lo-surrogate-char))
64    (+ (ash (- (the (integer #xD800 #xDBFF) (char-code hi-surrogate-char)) #xD800) 10)    (+ (ash (- (the (integer #xD800 #xDBFF) (char-code hi-surrogate-char)) #xD800) 10)
65       (the (integer #xDC00 #xDFFF) (char-code lo-surrogate-char)) #x2400))       (the (integer #xDC00 #xDFFF) (char-code lo-surrogate-char)) #x2400))
66    
67  (defun codepoint (string i &optional (end (length string)))  (defun codepoint (string i &optional (end (length string)))
68    _N"Return the codepoint value from String at position I.  If that    "Return the codepoint value from String at position I.  If that
69    position is a surrogate, it is combined with either the previous or    position is a surrogate, it is combined with either the previous or
70    following character (when possible) to compute the codepoint.  The    following character (when possible) to compute the codepoint.  The
71    second return value is NIL if the position is not a surrogate pair.    second return value is NIL if the position is not a surrogate pair.
# Line 74  Line 76 
76      (cond ((and (surrogatep code :high) (< (1+ i) end))      (cond ((and (surrogatep code :high) (< (1+ i) end))
77             (let ((tmp (char-code (schar string (1+ i)))))             (let ((tmp (char-code (schar string (1+ i)))))
78               (if (surrogatep tmp :low)               (if (surrogatep tmp :low)
79                   (values (+ (ash (- code #xD800) 10) tmp #x2400) +1)                   (values (truly-the codepoint (+ (ash (- code #xD800) 10) tmp #x2400))
80                   (values code nil))))                           +1)
81                     (values (truly-the codepoint code) nil))))
82            ((and (surrogatep code :low) (> i 0))            ((and (surrogatep code :low) (> i 0))
83             (let ((tmp (char-code (schar string (1- i)))))             (let ((tmp (char-code (schar string (1- i)))))
84               (if (surrogatep tmp :high)               (if (surrogatep tmp :high)
85                   (values (+ (ash (- tmp #xD800) 10) code #x2400) -1)                   (values (truly-the codepoint (+ (ash (- tmp #xD800) 10) code #x2400))
86                   (values code nil))))                           -1)
87            (t (values code nil)))))                   (values (truly-the codepoint code) nil))))
88              (t (values (truly-the codepoint code) nil)))))
89    
90  (defun surrogates (codepoint)  (defun surrogates (codepoint)
91    _N"Return the high and low surrogate characters for Codepoint.  If    "Return the high and low surrogate characters for Codepoint.  If
92    Codepoint is in the BMP, the first return value is the corresponding    Codepoint is in the BMP, the first return value is the corresponding
93    character and the second is NIL."    character and the second is NIL."
94    (declare (type codepoint codepoint))    (declare (type codepoint codepoint))
# Line 96  Line 100 
100          (values (code-char hi) (code-char lo)))))          (values (code-char hi) (code-char lo)))))
101    
102  (defun (setf codepoint) (codepoint string i)  (defun (setf codepoint) (codepoint string i)
103    _N"Set the codepoint at string position I to the Codepoint.  If the    "Set the codepoint at string position I to the Codepoint.  If the
104    codepoint requires a surrogate pair, the high (leading surrogate) is    codepoint requires a surrogate pair, the high (leading surrogate) is
105    stored at position I and the low (trailing) surrogate is stored at    stored at position I and the low (trailing) surrogate is stored at
106    I+1"    I+1"
# Line 136  Line 140 
140        (when wide (incf index)))))        (when wide (incf index)))))
141    
142  (defun string (X)  (defun string (X)
143    _N"Coerces X into a string.  If X is a string, X is returned.  If X is a    "Coerces X into a string.  If X is a string, X is returned.  If X is a
144    symbol, X's pname is returned.  If X is a character then a one element    symbol, X's pname is returned.  If X is a character then a one element
145    string containing that character is returned.  If X cannot be coerced    string containing that character is returned.  If X cannot be coerced
146    into a string, an error occurs."    into a string, an error occurs."
# Line 149  Line 153 
153           (error 'simple-type-error           (error 'simple-type-error
154                  :datum x                  :datum x
155                  :expected-type '(or string symbol character)                  :expected-type '(or string symbol character)
156                  :format-control _"~S cannot be coerced to a string."                  :format-control (intl:gettext "~S cannot be coerced to a string.")
157                  :format-arguments (list x)))))                  :format-arguments (list x)))))
158    
159  ;;; With-One-String is used to set up some string hacking things.  The keywords  ;;; With-One-String is used to set up some string hacking things.  The keywords
# Line 202  Line 206 
206    
207    
208  (defun char (string index)  (defun char (string index)
209    _N"Given a string and a non-negative integer index less than the length of    "Given a string and a non-negative integer index less than the length of
210    the string, returns the character object representing the character at    the string, returns the character object representing the character at
211    that position in the string."    that position in the string."
212    (declare (optimize (safety 1)))    (declare (optimize (safety 1)))
# Line 213  Line 217 
217    (setf (char string index) new-el))    (setf (char string index) new-el))
218    
219  (defun schar (string index)  (defun schar (string index)
220    _N"SCHAR returns the character object at an indexed position in a string    "SCHAR returns the character object at an indexed position in a string
221    just as CHAR does, except the string must be a simple-string."    just as CHAR does, except the string must be a simple-string."
222    (declare (optimize (safety 1)))    (declare (optimize (safety 1)))
223    (schar string index))    (schar string index))
# Line 305  Line 309 
309    
310    
311  (defun string< (string1 string2 &key (start1 0) end1 (start2 0) end2)  (defun string< (string1 string2 &key (start1 0) end1 (start2 0) end2)
312    _N"Given two strings, if the first string is lexicographically less than    "Given two strings, if the first string is lexicographically less than
313    the second string, returns the longest common prefix (using char=)    the second string, returns the longest common prefix (using char=)
314    of the two strings. Otherwise, returns ()."    of the two strings. Otherwise, returns ()."
315    (string<* string1 string2 start1 end1 start2 end2))    (string<* string1 string2 start1 end1 start2 end2))
316    
317  (defun string> (string1 string2 &key (start1 0) end1 (start2 0) end2)  (defun string> (string1 string2 &key (start1 0) end1 (start2 0) end2)
318    _N"Given two strings, if the first string is lexicographically greater than    "Given two strings, if the first string is lexicographically greater than
319    the second string, returns the longest common prefix (using char=)    the second string, returns the longest common prefix (using char=)
320    of the two strings. Otherwise, returns ()."    of the two strings. Otherwise, returns ()."
321    (string>* string1 string2 start1 end1 start2 end2))    (string>* string1 string2 start1 end1 start2 end2))
322    
323    
324  (defun string<= (string1 string2 &key (start1 0) end1 (start2 0) end2)  (defun string<= (string1 string2 &key (start1 0) end1 (start2 0) end2)
325    _N"Given two strings, if the first string is lexicographically less than    "Given two strings, if the first string is lexicographically less than
326    or equal to the second string, returns the longest common prefix    or equal to the second string, returns the longest common prefix
327    (using char=) of the two strings. Otherwise, returns ()."    (using char=) of the two strings. Otherwise, returns ()."
328    (string<=* string1 string2 start1 end1 start2 end2))    (string<=* string1 string2 start1 end1 start2 end2))
329    
330  (defun string>= (string1 string2 &key (start1 0) end1 (start2 0) end2)  (defun string>= (string1 string2 &key (start1 0) end1 (start2 0) end2)
331    _N"Given two strings, if the first string is lexicographically greater    "Given two strings, if the first string is lexicographically greater
332    than or equal to the second string, returns the longest common prefix    than or equal to the second string, returns the longest common prefix
333    (using char=) of the two strings. Otherwise, returns ()."    (using char=) of the two strings. Otherwise, returns ()."
334    (string>=* string1 string2 start1 end1 start2 end2))    (string>=* string1 string2 start1 end1 start2 end2))
335    
336  (defun string= (string1 string2 &key (start1 0) end1 (start2 0) end2)  (defun string= (string1 string2 &key (start1 0) end1 (start2 0) end2)
337    _N"Given two strings (string1 and string2), and optional integers start1,    "Given two strings (string1 and string2), and optional integers start1,
338    start2, end1 and end2, compares characters in string1 to characters in    start2, end1 and end2, compares characters in string1 to characters in
339    string2 (using char=)."    string2 (using char=)."
340    (string=* string1 string2 start1 end1 start2 end2))    (string=* string1 string2 start1 end1 start2 end2))
341    
342  (defun string/= (string1 string2 &key (start1 0) end1 (start2 0) end2)  (defun string/= (string1 string2 &key (start1 0) end1 (start2 0) end2)
343    _N"Given two strings, if the first string is not lexicographically equal    "Given two strings, if the first string is not lexicographically equal
344    to the second string, returns the longest common prefix (using char=)    to the second string, returns the longest common prefix (using char=)
345    of the two strings. Otherwise, returns ()."    of the two strings. Otherwise, returns ()."
346    (string/=* string1 string2 start1 end1 start2 end2))    (string/=* string1 string2 start1 end1 start2 end2))
# Line 399  Line 403 
403               (write-string (unicode-case-fold-full code) s))))))))               (write-string (unicode-case-fold-full code) s))))))))
404    
405  (defun string-equal (string1 string2 &key (start1 0) end1 (start2 0) end2)  (defun string-equal (string1 string2 &key (start1 0) end1 (start2 0) end2)
406    _N"Given two strings (string1 and string2), and optional integers start1,    "Given two strings (string1 and string2), and optional integers start1,
407    start2, end1 and end2, compares characters in string1 to characters in    start2, end1 and end2, compares characters in string1 to characters in
408    string2 (using char-equal)."    string2 (using char-equal)."
409    (declare (fixnum start1 start2))    (declare (fixnum start1 start2))
# Line 409  Line 413 
413        (declare (fixnum slen1 slen2))        (declare (fixnum slen1 slen2))
414        (if (or (minusp slen1) (minusp slen2))        (if (or (minusp slen1) (minusp slen2))
415            ;;prevent endless looping later.            ;;prevent endless looping later.
416            (error _"Improper bounds for string comparison."))            (error (intl:gettext "Improper bounds for string comparison.")))
417        (if (= slen1 slen2)        (if (= slen1 slen2)
418            ;;return () immediately if lengths aren't equal.            ;;return () immediately if lengths aren't equal.
419            (string-not-equal-loop 1 t nil)))))            (string-not-equal-loop 1 t nil)))))
420    
421  (defun string-not-equal (string1 string2 &key (start1 0) end1 (start2 0) end2)  (defun string-not-equal (string1 string2 &key (start1 0) end1 (start2 0) end2)
422    _N"Given two strings, if the first string is not lexicographically equal    "Given two strings, if the first string is not lexicographically equal
423    to the second string, returns the longest common prefix (using char-equal)    to the second string, returns the longest common prefix (using char-equal)
424    of the two strings. Otherwise, returns ()."    of the two strings. Otherwise, returns ()."
425    (with-two-strings string1 string2 start1 end1 offset1 start2 end2    (with-two-strings string1 string2 start1 end1 offset1 start2 end2
# Line 424  Line 428 
428        (declare (fixnum slen1 slen2))        (declare (fixnum slen1 slen2))
429        (if (or (minusp slen1) (minusp slen2))        (if (or (minusp slen1) (minusp slen2))
430            ;;prevent endless looping later.            ;;prevent endless looping later.
431            (error _"Improper bounds for string comparison."))            (error (intl:gettext "Improper bounds for string comparison.")))
432        (cond ((or (minusp slen1) (or (minusp slen2)))        (cond ((or (minusp slen1) (or (minusp slen2)))
433               (error _"Improper substring for comparison."))               (error (intl:gettext "Improper substring for comparison.")))
434              ((= slen1 slen2)              ((= slen1 slen2)
435               (string-not-equal-loop 1 nil (- index1 offset1)))               (string-not-equal-loop 1 nil (- index1 offset1)))
436              ((< slen1 slen2)              ((< slen1 slen2)
# Line 472  Line 476 
476           (declare (fixnum slen1 slen2))           (declare (fixnum slen1 slen2))
477           (if (or (minusp slen1) (minusp slen2))           (if (or (minusp slen1) (minusp slen2))
478               ;;prevent endless looping later.               ;;prevent endless looping later.
479               (error _"Improper bounds for string comparison."))               (error (intl:gettext "Improper bounds for string comparison.")))
480           (do ((index1 start1 (1+ index1))           (do ((index1 start1 (1+ index1))
481                (index2 start2 (1+ index2))                (index2 start2 (1+ index2))
482                (char1)                (char1)
# Line 511  Line 515 
515           (declare (fixnum slen1 slen2))           (declare (fixnum slen1 slen2))
516           (if (or (minusp slen1) (minusp slen2))           (if (or (minusp slen1) (minusp slen2))
517               ;;prevent endless looping later.               ;;prevent endless looping later.
518               (error _"Improper bounds for string comparison."))               (error (intl:gettext "Improper bounds for string comparison.")))
519           (do ((index1 start1 (1+ index1))           (do ((index1 start1 (1+ index1))
520                (index2 start2 (1+ index2)))                (index2 start2 (1+ index2)))
521               ((or (= index1 (the fixnum end1)) (= index2 (the fixnum end2)))               ((or (= index1 (the fixnum end1)) (= index2 (the fixnum end2)))
# Line 552  Line 556 
556    (string-less-greater-equal t t))    (string-less-greater-equal t t))
557    
558  (defun string-lessp (string1 string2 &key (start1 0) end1 (start2 0) end2)  (defun string-lessp (string1 string2 &key (start1 0) end1 (start2 0) end2)
559    _N"Given two strings, if the first string is lexicographically less than    "Given two strings, if the first string is lexicographically less than
560    the second string, returns the longest common prefix (using char-equal)    the second string, returns the longest common prefix (using char-equal)
561    of the two strings. Otherwise, returns ()."    of the two strings. Otherwise, returns ()."
562    (string-lessp* string1 string2 start1 end1 start2 end2))    (string-lessp* string1 string2 start1 end1 start2 end2))
563    
564  (defun string-greaterp (string1 string2 &key (start1 0) end1 (start2 0) end2)  (defun string-greaterp (string1 string2 &key (start1 0) end1 (start2 0) end2)
565    _N"Given two strings, if the first string is lexicographically greater than    "Given two strings, if the first string is lexicographically greater than
566    the second string, returns the longest common prefix (using char-equal)    the second string, returns the longest common prefix (using char-equal)
567    of the two strings. Otherwise, returns ()."    of the two strings. Otherwise, returns ()."
568    (string-greaterp* string1 string2 start1 end1 start2 end2))    (string-greaterp* string1 string2 start1 end1 start2 end2))
569    
570  (defun string-not-lessp (string1 string2 &key (start1 0) end1 (start2 0) end2)  (defun string-not-lessp (string1 string2 &key (start1 0) end1 (start2 0) end2)
571    _N"Given two strings, if the first string is lexicographically greater    "Given two strings, if the first string is lexicographically greater
572    than or equal to the second string, returns the longest common prefix    than or equal to the second string, returns the longest common prefix
573    (using char-equal) of the two strings. Otherwise, returns ()."    (using char-equal) of the two strings. Otherwise, returns ()."
574    (string-not-lessp* string1 string2 start1 end1 start2 end2))    (string-not-lessp* string1 string2 start1 end1 start2 end2))
575    
576  (defun string-not-greaterp (string1 string2 &key (start1 0) end1 (start2 0)  (defun string-not-greaterp (string1 string2 &key (start1 0) end1 (start2 0)
577                                      end2)                                      end2)
578    _N"Given two strings, if the first string is lexicographically less than    "Given two strings, if the first string is lexicographically less than
579    or equal to the second string, returns the longest common prefix    or equal to the second string, returns the longest common prefix
580    (using char-equal) of the two strings. Otherwise, returns ()."    (using char-equal) of the two strings. Otherwise, returns ()."
581    (string-not-greaterp* string1 string2 start1 end1 start2 end2))    (string-not-greaterp* string1 string2 start1 end1 start2 end2))
582    
583    
584  (defun make-string (count &key element-type ((:initial-element fill-char)))  (defun make-string (count &key element-type ((:initial-element fill-char)))
585    _N"Given a character count and an optional fill character, makes and returns    "Given a character count and an optional fill character, makes and returns
586    a new string Count long filled with the fill character."    a new string Count long filled with the fill character."
587    (declare (type fixnum count))    (declare (type fixnum count))
588    (assert (subtypep element-type 'character))    (assert (subtypep element-type 'character))
# Line 867  Line 871 
871            (string-capitalize-full string :start start :end end))))            (string-capitalize-full string :start start :end end))))
872    
873  (defun nstring-upcase (string &key (start 0) end)  (defun nstring-upcase (string &key (start 0) end)
874    _N"Given a string, returns that string with all lower case alphabetic    "Given a string, returns that string with all lower case alphabetic
875    characters converted to uppercase."    characters converted to uppercase."
876    (declare (fixnum start))    (declare (fixnum start))
877    (let ((save-header string))    (let ((save-header string))
# Line 894  Line 898 
898      save-header))      save-header))
899    
900  (defun nstring-downcase (string &key (start 0) end)  (defun nstring-downcase (string &key (start 0) end)
901    _N"Given a string, returns that string with all upper case alphabetic    "Given a string, returns that string with all upper case alphabetic
902    characters converted to lowercase."    characters converted to lowercase."
903    (declare (fixnum start))    (declare (fixnum start))
904    (let ((save-header string))    (let ((save-header string))
# Line 919  Line 923 
923      save-header))      save-header))
924    
925  (defun nstring-capitalize (string &key (start 0) end)  (defun nstring-capitalize (string &key (start 0) end)
926    _N"Given a string, returns that string with the first    "Given a string, returns that string with the first
927    character of each ``word'' converted to upper-case, and remaining    character of each ``word'' converted to upper-case, and remaining
928    chars in the word converted to lower case. A ``word'' is defined    chars in the word converted to lower case. A ``word'' is defined
929    to be a string of case-modifiable characters delimited by    to be a string of case-modifiable characters delimited by
# Line 1133  Line 1137 
1137  ) ; unicode  ) ; unicode
1138    
1139  (defun glyph (string index &key (from-end nil))  (defun glyph (string index &key (from-end nil))
1140    _N"GLYPH returns the glyph at the indexed position in a string, and the    "GLYPH returns the glyph at the indexed position in a string, and the
1141    position of the next glyph (or NIL) as a second value.  A glyph is    position of the next glyph (or NIL) as a second value.  A glyph is
1142    a substring consisting of the character at INDEX followed by all    a substring consisting of the character at INDEX followed by all
1143    subsequent combining characters."    subsequent combining characters."
# Line 1149  Line 1153 
1153            (values (subseq string index n) (and (< n (length string)) n))))))            (values (subseq string index n) (and (< n (length string)) n))))))
1154    
1155  (defun sglyph (string index &key (from-end nil))  (defun sglyph (string index &key (from-end nil))
1156    _N"SGLYPH returns the glyph at the indexed position, the same as GLYPH,    "SGLYPH returns the glyph at the indexed position, the same as GLYPH,
1157    except that the string must be a simple-string"    except that the string must be a simple-string"
1158    (declare (type simple-string string) (type kernel:index index))    (declare (type simple-string string) (type kernel:index index))
1159    #-unicode    #-unicode
# Line 1319  Line 1323 
1323                (declare (type (integer 0 256) ch-class))                (declare (type (integer 0 256) ch-class))
1324                (cond ((and composite                (cond ((and composite
1325                            (or (< last-class ch-class) (zerop last-class)))                            (or (< last-class ch-class) (zerop last-class)))
1326                       ;; Don't have to worry about surrogate pairs here                       ;; Note: As far as I know, there is no pairwise
1327                       ;; because the composite is always in the BMP.                       ;; composition such that the composite character
1328                       (setf (aref target starter-pos) (code-char composite))                       ;; is outside the BMP but the starter-ch is
1329                       (setf starter-ch composite))                       ;; inside the BMP.  Hence, it is always safe to
1330                         ;; replace the possible surrogate at starter-pos
1331                         ;; with another.  We won't accidentally replace
1332                         ;; the next character with our trailing surrogate
1333                         ;; character.
1334                         (multiple-value-bind (hi lo)
1335                             (surrogates composite)
1336                           (setf (aref target starter-pos) hi)
1337                           (when lo
1338                             (setf (aref target (1+ starter-pos)) lo))
1339                         (setf starter-ch composite)))
1340                      (t                      (t
1341                       (when (zerop ch-class)                       (when (zerop ch-class)
1342                         (setf starter-pos comp-pos)                         (setf starter-pos comp-pos)
# Line 1442  Line 1456 
1456  ;;   incorrect; instead, what we need is a new rule:  ;;   incorrect; instead, what we need is a new rule:
1457  ;;  ;;
1458  ;;   *Break after paragraph separators.*  ;;   *Break after paragraph separators.*
1459  ;;    WB3a. Sep  ;;    WB3a. Sep �
1460  ;;   I'll make a propose to the UTC for this.  ;;   I'll make a propose to the UTC for this.
1461  ;;  ;;
1462  ;; Here is Will's translation of those rules (including WB3a)  ;; Here is Will's translation of those rules (including WB3a)
# Line 1684  Line 1698 
1698                              result :start (1+ start) :end next)))                              result :start (1+ start) :end next)))
1699            (write-string string result :start end :end offset-slen))))))            (write-string string result :start end :end offset-slen))))))
1700    
1701    
1702    ;; Some utilities
1703    (defun codepoints-string (seq)
1704      "Convert a sequence of codepoints to a string.  Codepoints outside
1705      the basic multilingual plane (BMP) are converted into the
1706      corresponding surrogate pairs."
1707      (with-output-to-string (s)
1708        (map nil #'(lambda (c)
1709                     (multiple-value-bind (hi lo)
1710                         (surrogates c)
1711                       (write-char hi s)
1712                       (when lo (write-char lo s))))
1713             seq)))
1714    
1715    (defun string-codepoints (s)
1716      "Convert a string to a list of corresponding code points.  Surrogate
1717      pairs in the string are converted into the correspoinding
1718      codepoint."
1719      (declare (type simple-string s))
1720      (let ((len (length s))
1721            cp)
1722        (do ((idx 0))
1723            ((>= idx len))
1724          (multiple-value-bind (c widep)
1725              (codepoint s idx)
1726            (if widep
1727                (incf idx 2)
1728                (incf idx))
1729            (push c cp)))
1730        (nreverse cp)))

Legend:
Removed from v.1.21.6.1  
changed lines
  Added in v.1.29

  ViewVC Help
Powered by ViewVC 1.1.5