diff --git a/src/code/exports.lisp b/src/code/exports.lisp index 6c7bbedc5f61666fc30eb1058e29e59192415369..249bc32abe9d94f54ca7eb5fa96e666188ecacf6 100644 --- a/src/code/exports.lisp +++ b/src/code/exports.lisp @@ -913,7 +913,17 @@ (:export "STRING-TO-NFC" "STRING-TO-NFD" "STRING-TO-NFKC" "STRING-TO-NFKD" "UNICODE-COMPLETE" "UNICODE-COMPLETE-NAME" - "LOAD-ALL-UNICODE-DATA")) + "UNICODE-FULL-CASE-LOWER" + "UNICODE-FULL-CASE-UPPER" + "UNICODE-FULL-CASE-TITLE" + "UNICODE-CATEGORY" + "+UNICODE-CATEGORY-LOWER+" + "+UNICODE-CATEGORY-OTHER+" + "+UNICODE-CATEGORY-GRAPHIC+" + "+UNICODE-CATEGORY-UPPER+" + "+UNICODE-CATEGORY-TITLE+" + "LOAD-ALL-UNICODE-DATA" + "SURROGATES")) (defpackage "EVAL" (:export "*EVAL-STACK-TRACE*" "*INTERNAL-APPLY-NODE-TRACE*" @@ -2432,4 +2442,14 @@ "LAST-FWRAPPER" "DO-FWRAPPERS")) +(defpackage "UNICODE" + (:use "COMMON-LISP") + (:shadow "STRING-CAPITALIZE" + "STRING-DOWNCASE" + "STRING-UPCASE") + (:export "STRING-CAPITALIZE" + "STRING-DOWNCASE" + "STRING-UPCASE" + "STRING-NEXT-WORD-BREAK")) + diff --git a/src/code/string.lisp b/src/code/string.lisp index 5176edf58f170cd09475cce93f2171ec813d6b70..8005cda87102a2120c676ad6580f3cb20802b4d9 100644 --- a/src/code/string.lisp +++ b/src/code/string.lisp @@ -30,7 +30,7 @@ nstring-capitalize)) #+unicode -(export '(string-to-nfd string-to-nfkd string-to-nfkc)) +(export '(string-to-nfd string-to-nfkd string-to-nfkc surrogates)) (declaim (inline surrogatep surrogates-to-codepoint codepoint surrogates)) @@ -159,7 +159,7 @@ ;;; With-One-String is used to set up some string hacking things. The keywords ;;; are parsed, and the string is hacked into a simple-string. -(eval-when (compile) +(eval-when (compile load eval) (defmacro with-one-string (string start end cum-offset &rest forms) `(let ((,string (if (stringp ,string) ,string (string ,string)))) @@ -594,7 +594,9 @@ (setf (schar string i) fill-char)) (make-string count))) -(defun string-upcase-simple (string &key (start 0) end) +(defun string-upcase (string &key (start 0) end) + _N"Given a string, returns a new string that is a copy of it with all + lower case alphabetic characters converted to uppercase." (declare (fixnum start)) (let* ((string (if (stringp string) string (string string))) (slen (length string))) @@ -634,54 +636,9 @@ (setf (schar newstring new-index) (schar string index))) newstring)))) -#+unicode -(defun string-upcase-full (string &key (start 0) end) - (declare (fixnum start)) - (let* ((string (if (stringp string) string (string string))) - (slen (length string))) - (declare (fixnum slen)) - (with-output-to-string (s) - (with-one-string string start end offset - (let ((offset-slen (+ slen offset))) - (declare (fixnum offset-slen)) - (write-string string s :start offset :end start) - (do ((index start (1+ index))) - ((= index (the fixnum end))) - (declare (fixnum index)) - (multiple-value-bind (code wide) - (codepoint string index) - (when wide (incf index)) - ;; Handle ASCII specially because this is called early in - ;; initialization, before unidata is available. - (cond ((< 96 code 123) - (write-char (code-char (decf code 32)) s)) - ((> code 127) - (write-string (unicode-full-case-upper code) s)) - (t - (multiple-value-bind (hi lo) - (surrogates code) - (write-char hi s) - (when lo - (write-char lo s))))))) - (write-string string s :start end :end offset-slen)))))) - -(defun string-upcase (string &key (start 0) end #+unicode (casing :simple)) - #-unicode +(defun string-downcase (string &key (start 0) end) _N"Given a string, returns a new string that is a copy of it with all - lower case alphabetic characters converted to uppercase." - #+unicode - _N"Given a string, returns a new string that is a copy of it with all - lower case alphabetic characters converted to uppercase. Casing is - :simple or :full for simple or full case conversion, respectively." - (declare (fixnum start)) - #-unicode - (string-upcase-simple string :start start :end end) - #+unicode - (if (eq casing :simple) - (string-upcase-simple string :start start :end end) - (string-upcase-full string :start start :end end))) - -(defun string-downcase-simple (string &key (start 0) end) + upper case alphabetic characters converted to lowercase." (declare (fixnum start)) (let* ((string (if (stringp string) string (string string))) (slen (length string))) @@ -720,54 +677,12 @@ (setf (schar newstring new-index) (schar string index))) newstring)))) -#+unicode -(defun string-downcase-full (string &key (start 0) end) - (declare (fixnum start)) - (let* ((string (if (stringp string) string (string string))) - (slen (length string))) - (declare (fixnum slen)) - (with-output-to-string (s) - (with-one-string string start end offset - (let ((offset-slen (+ slen offset))) - (declare (fixnum offset-slen)) - (write-string string s :start offset :end start) - (do ((index start (1+ index))) - ((= index (the fixnum end))) - (declare (fixnum index)) - (multiple-value-bind (code wide) - (codepoint string index) - (when wide (incf index)) - ;; Handle ASCII specially because this is called early in - ;; initialization, before unidata is available. - (cond ((< 64 code 91) - (write-char (code-char (incf code 32)) s)) - ((> code 127) - (write-string (unicode-full-case-lower code) s)) - (t - (multiple-value-bind (hi lo) - (surrogates code) - (write-char hi s) - (when lo - (write-char lo s))))))) - (write-string string s :start end :end offset-slen)))))) - -(defun string-downcase (string &key (start 0) end #+unicode (casing :simple)) - #-unicode - _N"Given a string, returns a new string that is a copy of it with all - upper case alphabetic characters converted to lowercase." - #+unicode - _N"Given a string, returns a new string that is a copy of it with all - upper case alphabetic characters converted to lowercase. Casing is - :simple or :full for simple or full case conversion, respectively." - (declare (fixnum start)) - #-unicode - (string-downcase-simple string :start start :end end) - #+unicode - (if (eq casing :simple) - (string-downcase-simple string :start start :end end) - (string-downcase-full string :start start :end end))) - -(defun string-capitalize-simple (string &key (start 0) end) +(defun string-capitalize (string &key (start 0) end) + _N"Given a string, returns a copy of the string with the first + character of each ``word'' converted to upper-case, and remaining + chars in the word converted to lower case. A ``word'' is defined + to be a string of case-modifiable characters delimited by + non-case-modifiable chars." (declare (fixnum start)) (let* ((string (if (stringp string) string (string string))) (slen (length string))) @@ -804,74 +719,6 @@ (setf (schar newstring new-index) (schar string index))) newstring)))) -#+unicode -(defun string-capitalize-full (string &key (start 0) end) - (declare (fixnum start)) - (let* ((string (if (stringp string) string (string string))) - (slen (length string))) - (declare (fixnum slen)) - (with-output-to-string (s) - (with-one-string string start end offset - (let ((offset-slen (+ slen offset))) - (declare (fixnum offset-slen)) - (write-string string s :start offset :end start) - (flet ((alphanump (m) - (or (< 47 m 58) (< 64 m 91) (< 96 m 123) - #+(and unicode (not unicode-bootstrap)) - (and (> m 127) - (<= +unicode-category-letter+ (unicode-category m) - (+ +unicode-category-letter+ #x0F)))))) - (do ((index start (1+ index)) - (newword t)) - ((= index (the fixnum end))) - (declare (fixnum index)) - (multiple-value-bind (code wide) - (codepoint string index) - (when wide (incf index)) - (cond ((not (alphanump code)) - (multiple-value-bind (hi lo) - (surrogates code) - (write-char hi s) - (when lo (write-char lo s))) - (setq newword t)) - (newword - ;;char is first case-modifiable after non-case-modifiable - (write-string (unicode-full-case-title code) s) - (setq newword ())) - ;;char is case-modifiable, but not first - (t - (write-string (unicode-full-case-lower code) s)))))) - (write-string string s :start end :end offset-slen)))))) - -(defun string-capitalize (string &key (start 0) end - #+unicode (casing :simple) - #+unicode unicode-word-break) - #-unicode - _N"Given a string, returns a copy of the string with the first - character of each ``word'' converted to upper-case, and remaining - chars in the word converted to lower case. A ``word'' is defined - to be a string of case-modifiable characters delimited by - non-case-modifiable chars." - #+unicode - _N"Given a string, returns a copy of the string with the first - character of each ``word'' converted to upper-case, and remaining - chars in the word converted to lower case. Casing is :simple or - :full for simple or full case conversion, respectively. If - Unicode-Word-Break is non-Nil, then the Unicode word-breaking - algorithm is used to determine the word boundaries. Otherwise, A - ``word'' is defined to be a string of case-modifiable characters - delimited by non-case-modifiable chars. " - - (declare (fixnum start)) - #-unicode - (string-capitalize-simple string :start start :end end) - #+unicode - (if unicode-word-break - (string-capitalize-unicode string :start start :end end :casing casing) - (if (eq casing :simple) - (string-capitalize-simple string :start start :end end) - (string-capitalize-full string :start start :end end)))) - (defun nstring-upcase (string &key (start 0) end) "Given a string, returns that string with all lower case alphabetic characters converted to uppercase." @@ -1389,322 +1236,6 @@ (defun string-to-nfc (string) (if (simple-string-p string) string (coerce string 'simple-string))) - -;;; -;;; This is a Lisp translation of the Scheme code from William -;;; D. Clinger that implements the word-breaking algorithm. This is -;;; used with permission. -;;; -;;; This version is modified from the original at -;;; http://www.ccs.neu.edu/home/will/R6RS/ to conform to CMUCL's -;;; implementation of the word break properties. -;;; -;;; -;;; Copyright statement and original comments: -;;; -;;;-------------------------------------------------------------------------------- - -;; Copyright 2006 William D Clinger. -;; -;; Permission to copy this software, in whole or in part, to use this -;; software for any lawful purpose, and to redistribute this software -;; is granted subject to the restriction that all copies made of this -;; software must include this copyright and permission notice in full. -;; -;; I also request that you send me a copy of any improvements that you -;; make to this software so that they may be incorporated within it to -;; the benefit of the Scheme community. - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; -;; Word-breaking as defined by Unicode Standard Annex #29. -;; -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - -;; Implementation notes. -;; -;; The string-foldcase, string-downcase, and string-titlecase -;; procedures rely on the notion of a word, which is defined -;; by Unicode Standard Annex 29. -;; -;; The string-foldcase and string-downcase procedures rely on -;; word boundaries only when they encounter a Greek sigma, so -;; their performance should not be greatly affected by the -;; performance of the word-breaking algorithm. -;; -;; The string-titlecase procedure must find all word boundaries, -;; but it is typically used on short strings (titles). -;; -;; Hence the performance of the word-breaking algorithm should -;; not matter too much for this reference implementation. -;; Word-breaking is more generally useful, however, so I tried -;; to make this implementation reasonably efficient. -;; -;; Word boundaries are defined by 14 different rules in -;; Unicode Standard Annex #29, and by GraphemeBreakProperty.txt -;; and WordBreakProperty.txt. See also WordBreakTest.html. -;; -;; My original implementation of those specifications failed -;; 6 of the 494 tests in auxiliary/WordBreakTest.txt, but it -;; appeared to me that those tests were inconsistent with the -;; word-breaking rules in UAX #29. John Cowan forwarded my -;; bug report to the Unicode experts, and Mark Davis responded -;; on 29 May 2007: -;; -;; Thanks for following up on this. I think you have found a problem in the -;; formulation of word break, not the test. The intention was to break after a -;; Sep character, as is done in Sentence break. So my previous suggestion was -;; incorrect; instead, what we need is a new rule: -;; -;; *Break after paragraph separators.* -;; WB3a. Sep � -;; I'll make a propose to the UTC for this. -;; -;; Here is Will's translation of those rules (including WB3a) -;; into a finite state machine that searches forward within a -;; string, looking for the next position at which a word break -;; is allowed. The current state consists of an index i into -;; the string and a summary of the left context whose rightmost -;; character is at index i. The left context is usually -;; determined by the character at index i, but there are three -;; complications: -;; -;; Extend and Format characters are ignored unless they -;; follow a separator or the beginning of the text. -;; ALetter followed by MidLetter is treated specially. -;; Numeric followed by MidNum is treated specially. -;; -;; In the implementation below, the left context ending at i -;; is encoded by the following symbols: -;; -;; CR -;; Sep (excluding CR) -;; ALetter -;; MidLetter -;; ALetterMidLetter (ALetter followed by MidLetter) -;; Numeric -;; MidNum -;; NumericMidNum (Numeric followed by MidNum) -;; Katakana -;; ExtendNumLet -;; other (none of the above) -;; -;; Given a string s and an exact integer i (which need not be -;; a valid index into s), returns the index of the next character -;; that is not part of the word containing the character at i, -;; or the length of s if the word containing the character at i -;; extends through the end of s. If i is negative or a valid -;; index into s, then the returned value will be greater than i. -;; -;;;-------------------------------------------------------------------------------- - -(defun string-next-word-break (s i) - (let ((n (length s))) - (labels - ((char-word-break-category (c) - ;; Map our unicode word break property into what this - ;; algorithm wants. - (let ((cat (lisp::unicode-word-break c))) - (case cat - ((:lf :cr :newline) - :sep) - ((:extend :format) - :extend-or-format) - (otherwise cat)))) - (left-context (i) - ;; Given a valid index i into s, returns the left context - ;; at i. - (multiple-value-bind (c widep) - (lisp::codepoint s i n) - (let* ((back - ;; If we're at a regular character or a leading - ;; surrogate, decrementing by 1 gets us the to - ;; previous character. But for a trailing - ;; surrogate, we need to decrement by 2! - (if (eql widep -1) - 2 - 1)) - (cat (char-word-break-category c))) - (case cat - ((:sep) - (if (= c (char-code #\return)) :cr cat)) - ((:midletter :midnumlet) - (let ((i-1 (- i back))) - (if (and (<= 0 i-1) - (eq (left-context i-1) :aletter)) - :aletter-midletter - cat))) - ((:midnum :midnumlet) - (let ((i-1 (- i back))) - (if (and (<= 0 i-1) - (eq (left-context i-1) :numeric)) - :numeric-midnum - cat))) - ((:extendorformat) - (if (< 0 i) - (left-context (- i back)) - :other)) - (otherwise cat))))) - - (index-of-previous-non-ignored (j) - ;; Returns the index of the last non-Extend, non-Format - ;; character within (substring s 0 j). Should not be - ;; called unless such a character exists. - - (let* ((j1 (- j 1))) - (multiple-value-bind (c widep) - (lisp::codepoint s j1) - (when (eql widep -1) - ;; Back up one more if we're at the trailing - ;; surrogate. - (decf j1)) - (let ((cat (char-word-break-category c))) - (case cat - ((:extend-or-format) - (index-of-previous-non-ignored j1)) - (otherwise j1)))))) - - (lookup (j context) - ;; Given j and the context to the left of (not including) j, - ;; returns the index at the start of the next word - ;; (or before which a word break is permitted). - - (if (>= j n) - (case context - ((:aletter-midletter :numeric-midnum) - (let ((j (index-of-previous-non-ignored n))) - (if (< i j) j n))) - (otherwise n)) - (multiple-value-bind (c widep) - (lisp::codepoint s j) - (let* ((next-j - ;; The next character is either 1 or 2 code - ;; units away. For a leading surrogate, it's - ;; 2; Otherwise just 1. - (if (eql widep 1) - 2 - 1)) - (cat (char-word-break-category c))) - (case cat - ((:extend-or-format) - (case context - ((:cr :sep) j) - (otherwise (lookup (+ j next-j) context)))) - (otherwise - (case context - ((:cr) - (if (= c (char-code #\linefeed)) - ;; Rule WB3: Don't break CRLF, continue looking - (lookup (+ j next-j) cat) - j)) - ((:aletter) - (case cat - ((:aletter :numeric :extendnumlet) - ;; Rules WB5, WB9, ? - (lookup (+ j next-j) cat)) - ((:midletter :midnumlet) - ;; Rule WB6, need to keep looking - (lookup (+ j next-j) :aletter-midletter)) - (otherwise j))) - ((:aletter-midletter) - (case cat - ((:aletter) - ;; Rule WB7 - (lookup (+ j next-j) cat)) - (otherwise - ;; Rule WB6 and WB7 were extended, but the - ;; region didn't end with :aletter. So - ;; backup and break at that point. - (let ((j2 (index-of-previous-non-ignored j))) - (if (< i j2) j2 j))))) - ((:numeric) - (case cat - ((:numeric :aletter :extendnumlet) - ;; Rules WB8, WB10, ? - (lookup (+ j next-j) cat)) - ((:midnum :midnumlet) - ;; Rules WB11, need to keep looking - (lookup (+ j next-j) :numeric-midnum)) - (otherwise j))) - ((:numeric-midnum) - (case cat - ((:numeric) - ;; Rule WB11, keep looking - (lookup (+ j next-j) cat)) - (otherwise - ;; Rule WB11, WB12 were extended, but the - ;; region didn't end with :numeric, so - ;; backup and break at that point. - (let ((j2 (index-of-previous-non-ignored j))) - (if (< i j2) j2 j))))) - ((:midletter :midnum :midnumlet) - ;; Rule WB14 - j) - ((:katakana) - (case cat - ((:katakana :extendnumlet) - ;; Rule WB13, WB13a - (lookup (+ j next-j) cat)) - (otherwise j))) - ((:extendnumlet) - (case cat - ((:extendnumlet :aletter :numeric :katakana) - ;; Rule WB13a, WB13b - (lookup (+ j next-j) cat)) - (otherwise j))) - ((:regional_indicator) - (case cat - ((:regional_indicator) - ;; Rule WB13c - (lookup (+ j next-j) cat)) - (otherwise j))) - (otherwise j))))))))) - (declare (notinline lookup left-context)) - (cond ((< i 0) - ;; Rule WB1 - 0) - ((<= n i) - ;; Rule WB2 - n) - (t - (multiple-value-bind (c widep) - (lisp::codepoint s i) - (declare (ignore c)) - (lookup (+ i (if (eql widep 1) 2 1)) (left-context i)))))))) - -(defun string-capitalize-unicode (string &key (start 0) end (casing :simple)) - (declare (type (member :simple :full) casing)) - (let* ((string (if (stringp string) string (string string))) - (slen (length string))) - (declare (fixnum slen)) - (with-output-to-string (result) - (lisp::with-one-string string start end offset - (let ((offset-slen (+ slen offset))) - (declare (fixnum offset-slen)) - - (write-string string result :start 0 :end start) - (let ((upper (ecase casing - (:simple - #'(lambda (ch) - (multiple-value-bind (hi lo) - (lisp::surrogates (lisp::unicode-upper ch)) - (write-char hi result) - (when lo (write-char lo result))))) - (:full - #'(lambda (ch) - (write-string (lisp::unicode-full-case-title ch) result)))))) - (do ((start start next) - (next (string-next-word-break string start) - (string-next-word-break string next))) - ((or (= start next) - (>= start end))) - ;; Convert the first character of the word to upper - ;; case, and then make the rest of the word lowercase. - (funcall upper (lisp::codepoint string start)) - (write-string (string-downcase string :start (1+ start) :end next :casing casing) - result :start (1+ start) :end next))) - (write-string string result :start end :end offset-slen)))))) - ;; Some utilities (defun codepoints-string (seq) diff --git a/src/code/unicode.lisp b/src/code/unicode.lisp new file mode 100644 index 0000000000000000000000000000000000000000..98fe8805bbf17a0acfca464bed6b1e417ad8c5b3 --- /dev/null +++ b/src/code/unicode.lisp @@ -0,0 +1,519 @@ +;;; -*- Log: code.log; Package: Unicode -*- +;;; +;;; ********************************************************************** +;;; This code was written as part of the CMU Common Lisp project at +;;; Carnegie Mellon University, and has been placed in the public domain. +;;; +(ext:file-comment + "$Header: src/code/unicode.lisp $") +;;; +;;; ********************************************************************** +;;; +;;; Functions to process Unicode strings for CMU Common Lisp +;;; Written by Paul Foley and Raymond Toy. +;;; +;;; **************************************************************** +;;; +(in-package "UNICODE") +(intl:textdomain "cmucl") + +;; An example where this differs from cl:string-upcase differ: +;; #\Latin_Small_Letter_Sharp_S +(defun string-upcase-full (string &key (start 0) end) + _N"Given a string, returns a new string that is a copy of it with + all lower case alphabetic characters converted to uppercase using + full case conversion." + (declare (fixnum start)) (let* ((string (if + (stringp string) string (string string))) + (slen (length string))) + (declare (fixnum slen)) + (with-output-to-string (s) + (lisp::with-one-string string start end offset + (let ((offset-slen (+ slen offset))) + (declare (fixnum offset-slen)) + (write-string string s :start offset :end start) + (do ((index start (1+ index))) + ((= index (the fixnum end))) + (declare (fixnum index)) + (multiple-value-bind (code wide) + (lisp:codepoint string index) + (when wide (incf index)) + ;; Handle ASCII specially because this is called early in + ;; initialization, before unidata is available. + (cond ((< 96 code 123) + (write-char (code-char (decf code 32)) s)) + ((> code 127) + (write-string (lisp:unicode-full-case-upper code) s)) + (t + (multiple-value-bind (hi lo) + (surrogates code) + (write-char hi s) + (when lo + (write-char lo s))))))) + (write-string string s :start end :end offset-slen)))))) + +(defun string-upcase (string &key (start 0) end (casing :full)) + _N"Given a string, returns a new string that is a copy of it with + all lower case alphabetic characters converted to uppercase. Casing + is :simple or :full for simple or full case conversion, + respectively." + (declare (fixnum start)) + (if (eq casing :simple) + (cl:string-upcase string :start start :end end) + (string-upcase-full string :start start :end end))) + + +;; An example this differs from cl:string-downcase: +;; #\Latin_Capital_Letter_I_With_Dot_Above. +(defun string-downcase-full (string &key (start 0) end) + _N"Given a string, returns a new string that is a copy of it with + all uppercase alphabetic characters converted to lowercase using + full case conversion.." + (declare (fixnum start)) + (let* ((string (if (stringp string) string (string string))) + (slen (length string))) + (declare (fixnum slen)) + (with-output-to-string (s) + (lisp::with-one-string string start end offset + (let ((offset-slen (+ slen offset))) + (declare (fixnum offset-slen)) + (write-string string s :start offset :end start) + (do ((index start (1+ index))) + ((= index (the fixnum end))) + (declare (fixnum index)) + (multiple-value-bind (code wide) + (lisp:codepoint string index) + (when wide (incf index)) + ;; Handle ASCII specially because this is called early in + ;; initialization, before unidata is available. + (cond ((< 64 code 91) + (write-char (code-char (incf code 32)) s)) + ((> code 127) + (write-string (lisp:unicode-full-case-lower code) s)) + (t + ;; Handle codes below 64 + (multiple-value-bind (hi lo) + (surrogates code) + (write-char hi s) + (when lo + (write-char lo s))))))) + (write-string string s :start end :end offset-slen)))))) + +(defun string-downcase (string &key (start 0) end (casing :full)) + _N"Given a string, returns a new string that is a copy of it with all + uppercase alphabetic characters converted to lowercase. Casing is + :simple or :full for simple or full case conversion, respectively." + + (declare (fixnum start)) + (if (eq casing :simple) + (cl:string-downcase string :start start :end end) + (string-downcase-full string :start start :end end))) + + +;;; +;;; This is a Lisp translation of the Scheme code from William +;;; D. Clinger that implements the word-breaking algorithm. This is +;;; used with permission. +;;; +;;; This version is modified from the original at +;;; http://www.ccs.neu.edu/home/will/R6RS/ to conform to CMUCL's +;;; implementation of the word break properties. +;;; +;;; +;;; Copyright statement and original comments: +;;; +;;;-------------------------------------------------------------------------------- + +;; Copyright 2006 William D Clinger. +;; +;; Permission to copy this software, in whole or in part, to use this +;; software for any lawful purpose, and to redistribute this software +;; is granted subject to the restriction that all copies made of this +;; software must include this copyright and permission notice in full. +;; +;; I also request that you send me a copy of any improvements that you +;; make to this software so that they may be incorporated within it to +;; the benefit of the Scheme community. + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; +;; Word-breaking as defined by Unicode Standard Annex #29. +;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; Implementation notes. +;; +;; The string-foldcase, string-downcase, and string-titlecase +;; procedures rely on the notion of a word, which is defined +;; by Unicode Standard Annex 29. +;; +;; The string-foldcase and string-downcase procedures rely on +;; word boundaries only when they encounter a Greek sigma, so +;; their performance should not be greatly affected by the +;; performance of the word-breaking algorithm. +;; +;; The string-titlecase procedure must find all word boundaries, +;; but it is typically used on short strings (titles). +;; +;; Hence the performance of the word-breaking algorithm should +;; not matter too much for this reference implementation. +;; Word-breaking is more generally useful, however, so I tried +;; to make this implementation reasonably efficient. +;; +;; Word boundaries are defined by 14 different rules in +;; Unicode Standard Annex #29, and by GraphemeBreakProperty.txt +;; and WordBreakProperty.txt. See also WordBreakTest.html. +;; +;; My original implementation of those specifications failed +;; 6 of the 494 tests in auxiliary/WordBreakTest.txt, but it +;; appeared to me that those tests were inconsistent with the +;; word-breaking rules in UAX #29. John Cowan forwarded my +;; bug report to the Unicode experts, and Mark Davis responded +;; on 29 May 2007: +;; +;; Thanks for following up on this. I think you have found a problem in the +;; formulation of word break, not the test. The intention was to break after a +;; Sep character, as is done in Sentence break. So my previous suggestion was +;; incorrect; instead, what we need is a new rule: +;; +;; *Break after paragraph separators.* +;; WB3a. Sep � +;; I'll make a propose to the UTC for this. +;; +;; Here is Will's translation of those rules (including WB3a) +;; into a finite state machine that searches forward within a +;; string, looking for the next position at which a word break +;; is allowed. The current state consists of an index i into +;; the string and a summary of the left context whose rightmost +;; character is at index i. The left context is usually +;; determined by the character at index i, but there are three +;; complications: +;; +;; Extend and Format characters are ignored unless they +;; follow a separator or the beginning of the text. +;; ALetter followed by MidLetter is treated specially. +;; Numeric followed by MidNum is treated specially. +;; +;; In the implementation below, the left context ending at i +;; is encoded by the following symbols: +;; +;; CR +;; Sep (excluding CR) +;; ALetter +;; MidLetter +;; ALetterMidLetter (ALetter followed by MidLetter) +;; Numeric +;; MidNum +;; NumericMidNum (Numeric followed by MidNum) +;; Katakana +;; ExtendNumLet +;; other (none of the above) +;; +;; Given a string s and an exact integer i (which need not be +;; a valid index into s), returns the index of the next character +;; that is not part of the word containing the character at i, +;; or the length of s if the word containing the character at i +;; extends through the end of s. If i is negative or a valid +;; index into s, then the returned value will be greater than i. +;; +;;;-------------------------------------------------------------------------------- + +(defun string-next-word-break (s i) + _N"Given a string, S, and a starting index, return the index of the + next character that is not part of the word containing the character + at the index, or the length of S if the word containing the + character extends to the end of S. If the index is negative or + valid index into S, the returned value will be strictly greater than + the index." + (let ((n (length s))) + (labels + ((char-word-break-category (c) + ;; Map our unicode word break property into what this + ;; algorithm wants. + (let ((cat (lisp::unicode-word-break c))) + (case cat + ((:lf :cr :newline) + :sep) + ((:extend :format) + :extend-or-format) + (otherwise cat)))) + (left-context (i) + ;; Given a valid index i into s, returns the left context + ;; at i. + (multiple-value-bind (c widep) + (lisp:codepoint s i n) + (let* ((back + ;; If we're at a regular character or a leading + ;; surrogate, decrementing by 1 gets us the to + ;; previous character. But for a trailing + ;; surrogate, we need to decrement by 2! + (if (eql widep -1) + 2 + 1)) + (cat (char-word-break-category c))) + (case cat + ((:sep) + (if (= c (char-code #\return)) :cr cat)) + ((:midletter :midnumlet) + (let ((i-1 (- i back))) + (if (and (<= 0 i-1) + (eq (left-context i-1) :aletter)) + :aletter-midletter + cat))) + ((:midnum :midnumlet) + (let ((i-1 (- i back))) + (if (and (<= 0 i-1) + (eq (left-context i-1) :numeric)) + :numeric-midnum + cat))) + ((:extendorformat) + (if (< 0 i) + (left-context (- i back)) + :other)) + (otherwise cat))))) + + (index-of-previous-non-ignored (j) + ;; Returns the index of the last non-Extend, non-Format + ;; character within (substring s 0 j). Should not be + ;; called unless such a character exists. + + (let* ((j1 (- j 1))) + (multiple-value-bind (c widep) + (lisp:codepoint s j1) + (when (eql widep -1) + ;; Back up one more if we're at the trailing + ;; surrogate. + (decf j1)) + (let ((cat (char-word-break-category c))) + (case cat + ((:extend-or-format) + (index-of-previous-non-ignored j1)) + (otherwise j1)))))) + + (lookup (j context) + ;; Given j and the context to the left of (not including) j, + ;; returns the index at the start of the next word + ;; (or before which a word break is permitted). + + (if (>= j n) + (case context + ((:aletter-midletter :numeric-midnum) + (let ((j (index-of-previous-non-ignored n))) + (if (< i j) j n))) + (otherwise n)) + (multiple-value-bind (c widep) + (lisp:codepoint s j) + (let* ((next-j + ;; The next character is either 1 or 2 code + ;; units away. For a leading surrogate, it's + ;; 2; Otherwise just 1. + (if (eql widep 1) + 2 + 1)) + (cat (char-word-break-category c))) + (case cat + ((:extend-or-format) + (case context + ((:cr :sep) j) + (otherwise (lookup (+ j next-j) context)))) + (otherwise + (case context + ((:cr) + (if (= c (char-code #\linefeed)) + ;; Rule WB3: Don't break CRLF, continue looking + (lookup (+ j next-j) cat) + j)) + ((:aletter) + (case cat + ((:aletter :numeric :extendnumlet) + ;; Rules WB5, WB9, ? + (lookup (+ j next-j) cat)) + ((:midletter :midnumlet) + ;; Rule WB6, need to keep looking + (lookup (+ j next-j) :aletter-midletter)) + (otherwise j))) + ((:aletter-midletter) + (case cat + ((:aletter) + ;; Rule WB7 + (lookup (+ j next-j) cat)) + (otherwise + ;; Rule WB6 and WB7 were extended, but the + ;; region didn't end with :aletter. So + ;; backup and break at that point. + (let ((j2 (index-of-previous-non-ignored j))) + (if (< i j2) j2 j))))) + ((:numeric) + (case cat + ((:numeric :aletter :extendnumlet) + ;; Rules WB8, WB10, ? + (lookup (+ j next-j) cat)) + ((:midnum :midnumlet) + ;; Rules WB11, need to keep looking + (lookup (+ j next-j) :numeric-midnum)) + (otherwise j))) + ((:numeric-midnum) + (case cat + ((:numeric) + ;; Rule WB11, keep looking + (lookup (+ j next-j) cat)) + (otherwise + ;; Rule WB11, WB12 were extended, but the + ;; region didn't end with :numeric, so + ;; backup and break at that point. + (let ((j2 (index-of-previous-non-ignored j))) + (if (< i j2) j2 j))))) + ((:midletter :midnum :midnumlet) + ;; Rule WB14 + j) + ((:katakana) + (case cat + ((:katakana :extendnumlet) + ;; Rule WB13, WB13a + (lookup (+ j next-j) cat)) + (otherwise j))) + ((:extendnumlet) + (case cat + ((:extendnumlet :aletter :numeric :katakana) + ;; Rule WB13a, WB13b + (lookup (+ j next-j) cat)) + (otherwise j))) + ((:regional_indicator) + (case cat + ((:regional_indicator) + ;; Rule WB13c + (lookup (+ j next-j) cat)) + (otherwise j))) + (otherwise j))))))))) + (declare (notinline lookup left-context)) + (cond ((< i 0) + ;; Rule WB1 + 0) + ((<= n i) + ;; Rule WB2 + n) + (t + (multiple-value-bind (c widep) + (lisp:codepoint s i) + (declare (ignore c)) + (lookup (+ i (if (eql widep 1) 2 1)) (left-context i)))))))) + +(defun string-capitalize-unicode (string &key (start 0) end (casing :simple)) + "Capitalize String using the Unicode word-break algorithm to find + the words in String. The beginning is capitalized depending on the + value of Casing" + (declare (type (member :simple :full :title) casing)) + (let* ((string (if (stringp string) string (string string))) + (slen (length string))) + (declare (fixnum slen)) + (with-output-to-string (result) + (lisp::with-one-string string start end offset + (let ((offset-slen (+ slen offset))) + (declare (fixnum offset-slen)) + + (write-string string result :start 0 :end start) + (let ((upper (ecase casing + (:simple + #'(lambda (ch) + (multiple-value-bind (hi lo) + (lisp::surrogates (lisp::unicode-upper ch)) + (write-char hi result) + (when lo (write-char lo result))))) + (:full + #'(lambda (ch) + (write-string (lisp::unicode-full-case-upper ch) result))) + (:title + #'(lambda (ch) + (write-string (lisp::unicode-full-case-title ch) result)))))) + (do ((start start next) + (next (string-next-word-break string start) + (string-next-word-break string next))) + ((or (= start next) + (>= start end))) + ;; Convert the first character of the word to upper + ;; case, and then make the rest of the word lowercase. + (funcall upper (lisp:codepoint string start)) + (write-string (string-downcase string :start (1+ start) + :end next + :casing casing) + result + :start (1+ start) + :end next))) + (write-string string result :start end :end offset-slen)))))) + +(defun string-capitalize-full (string &key (start 0) end (casing :full)) + "Capitalize String using the Common Lisp word-break algorithm to find + the words in String. The beginning is capitalized depending on the + value of Casing" + (declare (fixnum start) + (type (member :simple :full :title) casing)) + (let* ((string (if (stringp string) string (string string))) + (slen (length string))) + (declare (fixnum slen)) + (with-output-to-string (s) + (lisp::with-one-string string start end offset + (let ((offset-slen (+ slen offset))) + (declare (fixnum offset-slen)) + (write-string string s :start offset :end start) + (flet ((alphanump (m) + (or (< 47 m 58) (< 64 m 91) (< 96 m 123) + #+(and unicode (not unicode-bootstrap)) + (and (> m 127) + (<= lisp::+unicode-category-letter+ + (lisp::unicode-category m) + (+ lisp::+unicode-category-letter+ #x0F))))) + (upper (ch) + (ecase casing + (:simple + #'(lambda (ch) + (multiple-value-bind (hi lo) + (lisp::surrogates (lisp::unicode-upper ch)) + (write-char hi s) + (when lo (write-char lo s))))) + (:full + #'(lambda (ch) + (write-string (lisp::unicode-full-case-upper ch) s))) + (:title + #'(lambda (ch) + (write-string (lisp::unicode-full-case-title ch) s)))))) + (do ((index start (1+ index)) + (newword t)) + ((= index (the fixnum end))) + (declare (fixnum index)) + (multiple-value-bind (code wide) + (lisp:codepoint string index) + (when wide (incf index)) + (cond ((not (alphanump code)) + (multiple-value-bind (hi lo) + (surrogates code) + (write-char hi s) + (when lo (write-char lo s))) + (setq newword t)) + (newword + ;; Char is first case-modifiable after non-case-modifiable + (funcall upper code) + (setq newword ())) + (t + ;; char is case-modifiable, but not first + (write-string (lisp:unicode-full-case-lower code) s)))))) + (write-string string s :start end :end offset-slen)))))) + +(defun string-capitalize (string &key (start 0) end + (casing :title) + (unicode-word-break t)) + _N"Given a string, returns a copy of the string with the first + character of each ``word'' converted to upper-case, and remaining + chars in the word converted to lower case. Casing is :simple, :full + or :title for simple, full or title case conversion, respectively. If + Unicode-Word-Break is non-Nil, then the Unicode word-breaking + algorithm is used to determine the word boundaries. Otherwise, A + ``word'' is defined to be a string of case-modifiable characters + delimited by non-case-modifiable chars. " + + (declare (fixnum start) + (type (member :simple :full :title) casing)) + (if unicode-word-break + (string-capitalize-unicode string :start start :end end :casing casing) + (if (eq casing :simple) + (cl:string-capitalize string :start start :end end) + (string-capitalize-full string :start start :end end :casing casing)))) diff --git a/src/code/unidata.lisp b/src/code/unidata.lisp index 55e3a28b73ce5de1e4a794ff6d9afbe7dc5b51ee..351810075c8b5c0eb8503082f7723ec15444fff2 100644 --- a/src/code/unidata.lisp +++ b/src/code/unidata.lisp @@ -18,6 +18,15 @@ (export '(string-to-nfd string-to-nfkc string-to-nfkd string-to-nfc unicode-complete unicode-complete-name + unicode-full-case-lower + unicode-full-case-upper + unicode-full-case-title + unicode-category + +unicode-category-lower+ + +unicode-category-other+ + +unicode-category-graphic+ + +unicode-category-upper+ + +unicode-category-title+ load-all-unicode-data)) (defvar *unidata-path* #p"ext-formats:unidata.bin") diff --git a/src/compiler/fndb.lisp b/src/compiler/fndb.lisp index 52299153e1b9927248a724b797313369798eccd8..052b4c1a2581e0ca8b2bdef9e07ba928cf3a6c57 100644 --- a/src/compiler/fndb.lisp +++ b/src/compiler/fndb.lisp @@ -851,13 +851,11 @@ (sequence stringable) simple-string (flushable)) (defknown (string-upcase string-downcase) - (stringable &key (:start index) (:end sequence-end) #+unicode (:casing case-conversion-type)) + (stringable &key (:start index) (:end sequence-end)) simple-string (flushable)) (defknown (string-capitalize) - (stringable &key (:start index) (:end sequence-end) - #+unicode (:casing case-conversion-type) - #+unicode (:unicode-word-break boolean)) + (stringable &key (:start index) (:end sequence-end)) simple-string (flushable)) (defknown (nstring-upcase nstring-downcase nstring-capitalize) diff --git a/src/tools/worldbuild.lisp b/src/tools/worldbuild.lisp index 9d0dbb68418c15691b4d0eca1b8803978633e1c0..4b5f36439fd07312314c86308c8c752b1c242b3f 100644 --- a/src/tools/worldbuild.lisp +++ b/src/tools/worldbuild.lisp @@ -186,6 +186,8 @@ ,@(when (c:backend-featurep :mp) '("target:code/multi-proc")) "target:code/intl-tramp" + ,@(when (c::backend-featurep :unicode) + '("target:code/unicode")) )) (setf *genesis-core-name* "target:lisp/kernel.core") diff --git a/src/tools/worldcom.lisp b/src/tools/worldcom.lisp index f8ac65ac06ad3b7d00d20ee778af33997a4d7fcd..150d33cc1d71c26e82236eb92f232bd55d197fa5 100644 --- a/src/tools/worldcom.lisp +++ b/src/tools/worldcom.lisp @@ -324,6 +324,8 @@ (comf "target:code/intl-tramp") (comf "target:code/intl") +(when (c:backend-featurep :unicode) + (comf "target:code/unicode")) ); let *byte-compile-top-level* ); with-compiler-log-file