diff --git a/contrib/unicode/receipe b/contrib/unicode/receipe new file mode 100644 index 0000000000000000000000000000000000000000..58232418f250869cbf55fd86cc8b23f370853ed6 --- /dev/null +++ b/contrib/unicode/receipe @@ -0,0 +1 @@ +mkcl -norc -q -eval "(progn (load \"ucd.lisp\") (slurp-ucd) (output-c-code) (quit))" diff --git a/contrib/unicode/ucd.lisp b/contrib/unicode/ucd.lisp index 49fcd4d074138b9670f0eedce249b1bb119605f0..7f36f698d8695f897bb51f8235089665ac99c3fe 100644 --- a/contrib/unicode/ucd.lisp +++ b/contrib/unicode/ucd.lisp @@ -1,10 +1,46 @@ +;;; +;;; Note about the origin of this file. +;;; +;;; This file was inherited by MKCL from ECL 9.6.2 when MKCL was +;;; initially forked from it. But being in the contrib directory, +;;; it is likely that it originated from some other unidentified source. +;;; In fact, this file content can be found verbatim in a file of +;;; the same name (ucd.lisp) in the tools-for-build directory of SBCL. +;;; It is suspected that this SBCL version is in fact the original +;;; source. (sampled in SBCL 1.0.58). As inherited from ECL 9.6.2, +;;; this file did not include any copyright notice, nor do any +;;; version included in SBCL up to this day (2012/11/14). +;;; +;;; +;;; This file was later modified for MKCL's purposes, mainly to +;;; improve its robustness, reliability and maintainability. +;;; For these modifications the same conditions apply as for the +;;; rest of MKCL: +;;; +;;; Copyright (c) 2012, Jean-Claude Beaudoin. +;;; + +;;; In its original form, this code was barely of kleenex quality, +;;; full of unchecked limitations, good to be used at most once... +;;; Some of that state may persist despite efforts to redress it. JCB + + ;;; Common (defparameter *extension-directory* (make-pathname :directory (pathname-directory *load-truename*))) -(defparameter *page-size-exponent* 8) +(defconstant *page-size-exponent* 8) +(defconstant *page-size* (ash 1 *page-size-exponent*)) + +(defconstant *unichar-codepoint-limit* #x110000) +(defconstant *ucd-version* "6.2.0") + +;; #'cp-high and #'cp-low split a code-point in two parts. +;; The high part (from codepoints bits 21 to 8) becomes an index in the page table. +;; The low part (from codepoints bits 7 to 0) becomes an index to a specific +;; character info struct inside a given page. (defun cp-high (cp) (ash cp (- *page-size-exponent*))) @@ -13,53 +49,63 @@ ;;; Generator -(defstruct ucd misc transform) +(defstruct unichar-info + properties-sig-index + transform + ) -(defparameter *unicode-character-database* +#| +(defparameter *unicode-character-database* ;; yet another misnomer. (make-pathname :directory (pathname-directory *load-truename*))) +|# + +(defparameter *unichar-info-pages* nil) ;; a vector of pages. +;; There is (cp-high *unichar-codepoint-limit*) [4352] pages in this vector. +;; Each page is itself a vector of *page-size* [256] unichar-info structs. -(defparameter *ucd-base* nil) -(defparameter *unicode-names* (make-hash-table)) +(defparameter *unicode-names* (make-hash-table)) ;; indexed on code-point. +(defparameter *unicode-names-total-size* 0) (defparameter *last-uppercase* nil) (defparameter *uppercase-transition-count* 0) (defparameter *different-titlecases* nil) (defparameter *different-numerics* nil) -(defparameter *name-size* 0) -(defparameter *misc-hash* (make-hash-table :test #'equal)) -(defparameter *misc-index* -1) -(defparameter *misc-table* nil) -(defparameter *misc-mapping* nil) + +(defparameter *unichar-properties-signature-indices* nil) +(defparameter *unichar-properties-signature-table* nil) +(defparameter *unichar-properties-signature-sorted-table* nil) +(defparameter *unichar-properties-signature-to-sorted-mapping* nil) + (defparameter *both-cases* nil) + (defparameter *decompositions* nil) (defparameter *decomposition-length-max* nil) (defparameter *decomposition-types* nil) (defparameter *decomposition-base* nil) -(defun hash-misc (gc-index bidi-index ccc-index decimal-digit digit - bidi-mirrored cl-both-case-p) - (let* ((list (list gc-index bidi-index ccc-index decimal-digit digit - bidi-mirrored cl-both-case-p)) - (index (gethash list *misc-hash*))) + + +(defun find-unichar-properties-signature-index (general-category-index bidi-index ccc-index + decimal-digit digit bidi-mirrored cl-both-case-p) + (let* ((list (list general-category-index bidi-index ccc-index decimal-digit digit bidi-mirrored cl-both-case-p)) + (index (gethash list *unichar-properties-signature-indices*))) (or index - (progn - (vector-push list *misc-table*) - (setf (gethash list *misc-hash*) - (incf *misc-index*)))))) + (let ((index (vector-push list *unichar-properties-signature-table*))) + (setf (gethash list *unichar-properties-signature-indices*) index))))) -(defun compare-misc-entry (left right) - (destructuring-bind (left-gc-index left-bidi-index left-ccc-index +(defun compare-unichar-properties-signatures (left right) + (destructuring-bind (left-general-category-index left-bidi-index left-ccc-index left-decimal-digit left-digit left-bidi-mirrored left-cl-both-case-p) left - (destructuring-bind (right-gc-index right-bidi-index right-ccc-index + (destructuring-bind (right-general-category-index right-bidi-index right-ccc-index right-decimal-digit right-digit right-bidi-mirrored right-cl-both-case-p) right (or (and left-cl-both-case-p (not right-cl-both-case-p)) (and (or left-cl-both-case-p (not right-cl-both-case-p)) - (or (< left-gc-index right-gc-index) - (and (= left-gc-index right-gc-index) + (or (< left-general-category-index right-general-category-index) + (and (= left-general-category-index right-general-category-index) (or (< left-bidi-index right-bidi-index) (and (= left-bidi-index right-bidi-index) (or (< left-ccc-index right-ccc-index) @@ -74,41 +120,40 @@ (string< left-bidi-mirrored right-bidi-mirrored)))))))))))))))) -(defun build-misc-table () - (sort *misc-table* #'compare-misc-entry) - (setq *misc-mapping* (make-array (1+ *misc-index*))) - (loop for i from 0 to *misc-index* - do (setf (aref *misc-mapping* - (gethash (aref *misc-table* i) *misc-hash*)) - i))) +(defun build-unichar-properties-signature-sorted-table () + (let ((sig-max-index (fill-pointer *unichar-properties-signature-table*))) + (setq *unichar-properties-signature-sorted-table* (copy-seq *unichar-properties-signature-table*)) + (sort *unichar-properties-signature-sorted-table* #'compare-unichar-properties-signatures) + (setq *unichar-properties-signature-to-sorted-mapping* (make-array sig-max-index)) + (loop for i from 0 below sig-max-index + do (setf (aref *unichar-properties-signature-to-sorted-mapping* + (gethash (aref *unichar-properties-signature-sorted-table* i) *unichar-properties-signature-indices*)) + i)))) (defun slurp-ucd () (setq *last-uppercase* nil) (setq *uppercase-transition-count* 0) (setq *different-titlecases* nil) (setq *different-numerics* nil) - (setq *name-size* 0) - (setq *misc-hash* (make-hash-table :test #'equal)) - (setq *misc-index* -1) - (setq *misc-table* (make-array 256 :fill-pointer 0)) + (setq *unicode-names-total-size* 0) + (setq *unichar-properties-signature-indices* (make-hash-table :test #'equal)) + (setq *unichar-properties-signature-table* (make-array 256 :fill-pointer 0)) (setq *both-cases* nil) (setq *decompositions* 0) (setq *decomposition-types* (make-hash-table :test #'equal)) (setq *decomposition-length-max* 0) - (setq *decomposition-base* (make-array (ash #x110000 - (- *page-size-exponent*)) - :initial-element nil)) - (setq *ucd-base* (make-array (ash #x110000 (- *page-size-exponent*)) - :initial-element nil)) + (setq *decomposition-base* (make-array (cp-high *unichar-codepoint-limit*) :initial-element nil)) + (setq *unichar-info-pages* (make-array (cp-high *unichar-codepoint-limit*) :initial-element nil)) + (with-open-file (*standard-input* (make-pathname :name "UnicodeData" :type "txt" :defaults *extension-directory*) - :direction :input :external-format '(:utf-8 :crlf)) + :direction :input :external-format '(:ascii :lf)) (loop for line = (read-line nil nil) while line do (slurp-ucd-line line))) (second-pass) - (build-misc-table) + (build-unichar-properties-signature-sorted-table) *decompositions*) (defun split-string (line character) @@ -118,6 +163,7 @@ do (unless position (loop-finish)))) +#| (defun init-indices (strings) (let ((hash (make-hash-table :test #'equal))) (loop for string in strings @@ -125,13 +171,84 @@ do (setf (gethash string hash) index)) hash)) -(defparameter *general-categories* +(defparameter *general-categories-indices* (init-indices '("Lu" "Ll" "Lt" "Lm" "Lo" "Cc" "Cf" "Co" "Cs" "Mc" "Me" "Mn" "Nd" "Nl" "No" "Pc" "Pd" "Pe" "Pf" "Pi" "Po" "Ps" "Sc" "Sk" "Sm" "So" "Zl" "Zp" "Zs"))) -(defparameter *bidi-classes* +|# + +(defun init-indices (strings) + (let ((hash (make-hash-table :test #'equal))) + (loop for (string c-string) across strings + for index from 0 + do (setf (gethash string hash) index)) + hash)) + +(defconstant *general-categories* + #(("Lu" "mkcl_ucd_Uppercase_Letter") + ("Ll" "mkcl_ucd_Lowercase_Letter") + ("Lt" "mkcl_ucd_Titlecase_Letter") + ("Lm" "mkcl_ucd_Modified_Letter") + ("Lo" "mkcl_ucd_Other_Letter") + ("Cc" "mkcl_ucd_Control") + ("Cf" "mkcl_ucd_Format") + ("Co" "mkcl_ucd_Private_Use") + ("Cs" "mkcl_ucd_Surrogate") + ("Mc" "mkcl_ucd_Spacing_Mark") + ("Me" "mkcl_ucd_Enclosing_Mark") + ("Mn" "mkcl_ucd_Nonspacing_Mark") + ("Nd" "mkcl_ucd_Decimal_Number") + ("Nl" "mkcl_ucd_Letter_Number") + ("No" "mkcl_ucd_Other_Number") + ("Pc" "mkcl_ucd_Connector_Punctuation") + ("Pd" "mkcl_ucd_Dash_Punctuation") + ("Pe" "mkcl_ucd_Close_Punctuation") + ("Pf" "mkcl_ucd_Final_Punctuation") + ("Pi" "mkcl_ucd_Initial_Punctuation") + ("Po" "mkcl_ucd_Other_Punctuation") + ("Ps" "mkcl_ucd_Open_Punctuation") + ("Sc" "mkcl_ucd_Currency_Symbol") + ("Sk" "mkcl_ucd_Modifier_Symbol") + ("Sm" "mkcl_ucd_Math_Symbol") + ("So" "mkcl_ucd_Other_Symbol") + ("Zl" "mkcl_ucd_Line_Separator") + ("Zp" "mkcl_ucd_Paragraph_Separator") + ("Zs" "mkcl_ucd_Space_Separator") + )) + +(defparameter *general-categories-indices* + (init-indices *general-categories*)) + +#| +(defparameter *bidi-classes-indices* (init-indices '("AL" "AN" "B" "BN" "CS" "EN" "ES" "ET" "L" "LRE" "LRO" "NSM" "ON" "PDF" "R" "RLE" "RLO" "S" "WS"))) +|# + +(defconstant *bidi-classes* + #(("AL" "mkcl_ucd_bidi_Arabic_Letter") + ("AN" "mkcl_ucd_bidi_Arabic_Number") + ("B" "mkcl_ucd_bidi_Paragraph_Separator") + ("BN" "mkcl_ucd_bidi_Boundary_Neutral") + ("CS" "mkcl_ucd_bidi_Common_Separator") + ("EN" "mkcl_ucd_bidi_European_Number") + ("ES" "mkcl_ucd_bidi_European_Separator") + ("ET" "mkcl_ucd_bidi_European_Terminator") + ("L" "mkcl_ucd_bidi_Left_To_Right") + ("LRE" "mkcl_ucd_bidi_Left_To_Right_Embedding") + ("LRO" "mkcl_ucd_bidi_Left_To_Right_Override") + ("NSM" "mkcl_ucd_bidi_Nonspacing_Mark") + ("ON" "mkcl_ucd_bidi_Other_Neutral") + ("PDF" "mkcl_ucd_bidi_Pop_Directional_Format") + ("R" "mkcl_ucd_bidi_Right_To_Left") + ("RLE" "mkcl_ucd_bidi_Right_To_Left_Embedding") + ("RLO" "mkcl_ucd_bidi_Right_To_Left_Override") + ("S" "mkcl_ucd_bidi_Segment_Separator") + ("WS" "mkcl_ucd_bidi_White_Space") + )) + +(defparameter *bidi-classes-indices* + (init-indices *bidi-classes*)) (defparameter *block-first* nil) @@ -148,7 +265,7 @@ ;;; 20000 -- 2A6D6 : cjk ideograph extension b ;Lo;0;L;;;;;N;;;;; ;;; F0000 -- FFFFD : private use ;;; 100000 -- 10FFFD: private use -(defun encode-ucd-line (line code-point) +(defun parse-ucd-line (line code-point) (destructuring-bind (name general-category canonical-combining-class bidi-class decomposition-type-and-mapping decimal-digit digit numeric bidi-mirrored @@ -161,10 +278,10 @@ (progn (setq *block-first* code-point) nil) - (let* ((gc-index (or (gethash general-category *general-categories*) + (let* ((general-category-index (or (gethash general-category *general-categories-indices*) (error "unknown general category ~A" general-category))) - (bidi-index (or (gethash bidi-class *bidi-classes*) + (bidi-index (or (gethash bidi-class *bidi-classes-indices*) (error "unknown bidirectional class ~A" bidi-class))) (ccc-index (parse-integer canonical-combining-class)) @@ -177,13 +294,13 @@ (title-index (unless (string= "" simple-titlecase) (parse-integer simple-titlecase :radix 16))) (cl-both-case-p - (not (null (or (and (= gc-index 0) lower-index) - (and (= gc-index 1) upper-index))))) - (misc-index (hash-misc gc-index bidi-index ccc-index - decimal-digit digit bidi-mirrored - cl-both-case-p))) + (not (null (or (and (= general-category-index 0) lower-index) + (and (= general-category-index 1) upper-index))))) + (unichar-properties-signature-index + (find-unichar-properties-signature-index general-category-index bidi-index ccc-index + decimal-digit digit bidi-mirrored cl-both-case-p))) (declare (ignore digit-index)) - (incf *name-size* (length name)) + (incf *unicode-names-total-size* (length name)) (when (string/= "" decomposition-type-and-mapping) (let ((split (split-string decomposition-type-and-mapping #\Space))) @@ -191,7 +308,7 @@ (setf (gethash (pop split) *decomposition-types*) t)) (unless (aref *decomposition-base* (cp-high code-point)) (setf (aref *decomposition-base* (cp-high code-point)) - (make-array (ash 1 *page-size-exponent*) + (make-array *page-size* :initial-element nil))) (setf (aref (aref *decomposition-base* (cp-high code-point)) (cp-low code-point)) @@ -209,7 +326,7 @@ (when (string/= digit numeric) (push (cons code-point numeric) *different-numerics*)) (cond - ((= gc-index 8) + ((= general-category-index 8) ;; Is this really testing for category "Cs"? JCB (unless *last-uppercase* (incf *uppercase-transition-count*)) (setq *last-uppercase* t)) @@ -219,27 +336,26 @@ (setq *last-uppercase* nil))) (when (> ccc-index 255) (error "canonical combining class too large ~A" ccc-index)) - (let ((result (make-ucd :misc misc-index - :transform (or upper-index lower-index 0)))) + (let ((result (make-unichar-info :properties-sig-index unichar-properties-signature-index + :transform (or upper-index lower-index 0)))) (when (and (> (length name) 7) (string= ", Last>" name :start2 (- (length name) 7))) - (let ((page-start (ash (+ *block-first* - (ash 1 *page-size-exponent*) - -1) - (- *page-size-exponent*))) - (page-end (ash code-point (- *page-size-exponent*)))) + (let ((page-start (cp-high (+ *block-first* + *page-size* + -1))) + (page-end (cp-high code-point))) (loop for point from *block-first* below (ash page-start *page-size-exponent*) - do (setf (aref (aref *ucd-base* (cp-high point)) + do (setf (aref (aref *unichar-info-pages* (cp-high point)) (cp-low point)) result)) (loop for page from page-start below page-end - do (setf (aref *ucd-base* page) - (make-array (ash 1 *page-size-exponent*) + do (setf (aref *unichar-info-pages* page) + (make-array *page-size* :initial-element result))) (loop for point from (ash page-end *page-size-exponent*) below code-point - do (setf (aref (aref *ucd-base* (cp-high point)) + do (setf (aref (aref *unichar-info-pages* (cp-high point)) (cp-low point)) result)))) (values result (normalize-character-name name))))))) @@ -247,43 +363,40 @@ (defun slurp-ucd-line (line) (let* ((split-line (split-string line #\;)) (code-point (parse-integer (first split-line) :radix 16)) - (code-high (ash code-point (- *page-size-exponent*))) - (code-low (ldb (byte *page-size-exponent* 0) code-point))) - (unless (aref *ucd-base* code-high) - (setf (aref *ucd-base* code-high) - (make-array (ash 1 *page-size-exponent*) - :initial-element nil))) - (multiple-value-bind (encoding name) - (encode-ucd-line (cdr split-line) code-point) - (setf (aref (aref *ucd-base* code-high) code-low) encoding - (gethash code-point *unicode-names*) name)))) + (code-high (cp-high code-point)) + (code-low (cp-low code-point))) + (unless (aref *unichar-info-pages* code-high) + (setf (aref *unichar-info-pages* code-high) + (make-array *page-size* :initial-element nil))) + (multiple-value-bind (char-info char-name) + (parse-ucd-line (cdr split-line) code-point) + (setf (aref (aref *unichar-info-pages* code-high) code-low) char-info + (gethash code-point *unicode-names*) char-name)))) (defun second-pass () - (loop for i from 0 below (length *ucd-base*) - when (aref *ucd-base* i) - do (loop for j from 0 below (length (aref *ucd-base* i)) - for result = (aref (aref *ucd-base* i) j) + (loop for i from 0 below (length *unichar-info-pages*) + when (aref *unichar-info-pages* i) + do (loop for j from 0 below (length (aref *unichar-info-pages* i)) + for result = (aref (aref *unichar-info-pages* i) j) when result - when (let* ((transform-point (ucd-transform result)) - (transform-high (ash transform-point - (- *page-size-exponent*))) - (transform-low (ldb (byte *page-size-exponent* 0) - transform-point))) + when (let* ((transform-point (unichar-info-transform result)) + (transform-high (cp-high transform-point)) + (transform-low (cp-low transform-point))) (and (plusp transform-point) - (/= (ucd-transform - (aref (aref *ucd-base* transform-high) + (/= (unichar-info-transform + (aref (aref *unichar-info-pages* transform-high) transform-low)) (+ (ash i *page-size-exponent*) j)))) - do (destructuring-bind (gc-index bidi-index ccc-index + do (destructuring-bind (general-category-index bidi-index ccc-index decimal-digit digit bidi-mirrored cl-both-case-p) - (aref *misc-table* (ucd-misc result)) + (aref *unichar-properties-signature-table* (unichar-info-properties-sig-index result)) (declare (ignore cl-both-case-p)) - (format t "~A~%" (+ (ash i *page-size-exponent*) j)) - (setf (ucd-misc result) - (hash-misc gc-index bidi-index ccc-index - decimal-digit digit bidi-mirrored - nil)))))) + ;;(format t "~A~%" (+ (ash i *page-size-exponent*) j)) + (setf (unichar-info-properties-sig-index result) + (find-unichar-properties-signature-index general-category-index bidi-index ccc-index + decimal-digit digit bidi-mirrored + nil)))))) (defun write-3-byte (triplet stream) (write-byte (ldb (byte 8 0) triplet) stream) @@ -296,19 +409,20 @@ (parse-integer digit))) (defun output () - (let ((hash (make-hash-table :test #'equalp)) - (index 0)) - (loop for page across *ucd-base* + (let ((page-folding-bucket (make-hash-table :test #'equalp)) + (folded-page-top-index 0)) + (loop for page across *unichar-info-pages* do (when page - (unless (gethash page hash) - (setf (gethash page hash) - (incf index))))) - (let ((array (make-array (1+ index)))) + (unless (gethash page page-folding-bucket) + (setf (gethash page page-folding-bucket) + (incf folded-page-top-index))))) + (when (<= 255 folded-page-top-index) (error "~&In output: UCD page folding bucket has overflowed: ~S.~%" folded-page-top-index)) + (let ((unichar-info-folded-pages (make-array (1+ folded-page-top-index)))) (maphash #'(lambda (key value) - (setf (aref array value) key)) - hash) - (setf (aref array 0) - (make-array (ash 1 *page-size-exponent*) :initial-element nil)) + (setf (aref unichar-info-folded-pages value) key)) + page-folding-bucket) + (setf (aref unichar-info-folded-pages 0) ;; Is this a blank page simply because we do not know how to count from 0? Yep! JCB + (make-array *page-size* :initial-element nil)) (with-open-file (stream (make-pathname :name "ucd" :type "dat" :defaults *extension-directory*) @@ -316,13 +430,13 @@ :element-type '(unsigned-byte 8) :if-exists :supersede :if-does-not-exist :create) - (let ((offset (* (length *misc-table*) 8))) + (let ((offset (* (length *unichar-properties-signature-sorted-table*) 8))) + ;; "offset" is the external size of the properties signatures table in bytes. (write-byte (mod offset 256) stream) (write-byte (floor offset 256) stream)) - (loop for (gc-index bidi-index ccc-index decimal-digit digit - bidi-mirrored) - across *misc-table* - do (write-byte gc-index stream) + (loop for (general-category-index bidi-index ccc-index decimal-digit digit bidi-mirrored) + across *unichar-properties-signature-sorted-table* + do (write-byte general-category-index stream) do (write-byte bidi-index stream) do (write-byte ccc-index stream) do (write-byte (digit-to-byte decimal-digit) stream) @@ -330,34 +444,55 @@ do (write-byte (if (string= "N" bidi-mirrored) 0 1) stream) do (write-byte 0 stream) do (write-byte 0 stream)) - (loop for page across *ucd-base* - do (write-byte (if page (gethash page hash) 0) stream)) - (loop for page across array + (loop for page across *unichar-info-pages* + do (write-byte (if page (gethash page page-folding-bucket) 0) stream)) + (loop for page across unichar-info-folded-pages do (loop for entry across page do (write-byte (if entry - (aref *misc-mapping* (ucd-misc entry)) - 255) + (aref *unichar-properties-signature-to-sorted-mapping* + (unichar-info-properties-sig-index entry)) + 255) ;; 255 is an index that does/should not exist. JCB stream) - do (write-3-byte (if entry (ucd-transform entry) 0) + do (write-3-byte (if entry (unichar-info-transform entry) 0) stream)))))) - #+(or) - (with-open-file (f (make-pathname :name "ucd-names" :type "lisp-expr" + ;;#+(or) + (with-open-file (f (make-pathname :name "ucd-code-names" :type "lisp-expr" :defaults *extension-directory*) + :external-format '(:ascii :lf) :direction :output :if-exists :supersede :if-does-not-exist :create) (with-standard-io-syntax - (write-string ";;; Do not edit by hand: generated by ucd.lisp" f) + ;;(write-string ";;; Do not edit by hand: generated by ucd.lisp" f) (maphash (lambda (code name) (when name (print code f) (prin1 name f))) - *unicode-names*)) - (setf *unicode-names* nil)) + *unicode-names*) + (terpri f)) + ;;(setf *unicode-names* nil) ;; why should we destroy this? JCB + ) + (with-open-file (f (make-pathname :name "ucd-name-codes" :type "lisp-expr" + :defaults *extension-directory*) + :external-format '(:ascii :lf) + :direction :output + :if-exists :supersede + :if-does-not-exist :create) + (with-standard-io-syntax + ;;(write-string ";;; Do not edit by hand: generated by ucd.lisp" f) + (maphash (lambda (code name) + (when name + (print name f) + (prin1 code f))) + *unicode-names*) + (terpri f)) + ;;(setf *unicode-names* nil) ;; why should we destroy this? JCB + ) (with-open-file (*standard-output* (make-pathname :name "numerics" :type "lisp-expr" :defaults *extension-directory*) + :external-format '(:ascii :lf) :direction :output :if-exists :supersede :if-does-not-exist :create) @@ -368,6 +503,7 @@ (make-pathname :name "titlecases" :type "lisp-expr" :defaults *extension-directory*) + :external-format '(:ascii :lf) :direction :output :if-exists :supersede :if-does-not-exist :create) @@ -377,22 +513,23 @@ (make-pathname :name "misc" :type "lisp-expr" :defaults *extension-directory*) + :external-format '(:ascii :lf) :direction :output :if-exists :supersede :if-does-not-exist :create) (let ((*print-pretty* t)) - (prin1 `(:length ,(length *misc-table*) - :uppercase ,(loop for (gc-index) across *misc-table* + (prin1 `(:length ,(length *unichar-properties-signature-sorted-table*) + :uppercase ,(loop for (general-category-index) across *unichar-properties-signature-sorted-table* for i from 0 - when (= gc-index 0) + when (= general-category-index 0) collect i) - :lowercase ,(loop for (gc-index) across *misc-table* + :lowercase ,(loop for (general-category-index) across *unichar-properties-signature-sorted-table* for i from 0 - when (= gc-index 1) + when (= general-category-index 1) collect i) - :titlecase ,(loop for (gc-index) across *misc-table* + :titlecase ,(loop for (general-category-index) across *unichar-properties-signature-sorted-table* for i from 0 - when (= gc-index 2) + when (= general-category-index 2) collect i))))) (values)) @@ -408,5 +545,165 @@ (read-sequence *compiled-ucd* stream))) (values)) -(slurp-ucd) -(output) +;;(slurp-ucd) +;;(output) + +(defconstant *unichar-invalid-digit-value* 255) + +(defun output-h-file (h-file) + (format h-file " +/* + This file was automatically generated by utility contrib/unicode/ucd.lisp + for Unicode ~S + + Copyright (c) 2012, Jean-Claude Beaudoin. + + MKCL is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 3 of the License, or (at your option) any later version. + + See file '../../Copyright' for full details. +*/~%" *ucd-version*) + (terpri h-file) + (format h-file "#ifndef MKCL_UNICODE_H~%") + (format h-file "#define MKCL_UNICODE_H~2%") + + (format h-file "#define MKCL_UNICHAR_INVALID_DIGIT ((mkcl_uint8_t) ~D)~2%" *unichar-invalid-digit-value*) + + (format h-file "~%enum mkcl_ucd_general_category {~%") + (loop for (string c-string) across *general-categories* + do (progn (format h-file " ~A,~%" c-string)) + ) + (format h-file "};~%") + (format h-file "~%enum mkcl_ucd_bidi_class {~%") + (loop for (string c-string) across *bidi-classes* + do (progn (format h-file " ~A,~%" c-string)) + ) + (format h-file "};~%") + (terpri h-file) + (format h-file "struct mkcl_unichar_properties_signature~%") + (format h-file "{~%") + (format h-file " enum mkcl_ucd_general_category general_category;~%") + (format h-file " enum mkcl_ucd_bidi_class bidi;~%") + (format h-file " mkcl_uint8_t canonical_combining_class;~%") + (format h-file " mkcl_uint8_t decimal_digit;~%") + (format h-file " mkcl_uint8_t digit;~%") + (format h-file " bool bidi_mirrored;~%") + (format h-file "};~%") + (terpri h-file) + (format h-file "extern const struct mkcl_unichar_properties_signature~%") + (format h-file " _mkcl_unichar_properties_signatures[];~%") + (terpri h-file) + (format h-file "const mkcl_uint8_t _mkcl_unichar_info_pages[~D];~2%" (cp-high *unichar-codepoint-limit*)) + (format h-file "struct mkcl_unichar_info~%") + (format h-file "{~%") + (format h-file " mkcl_uint8_t properties_signature_index;~%") + (format h-file " unsigned int transform:24;~%") + (format h-file "};~2%") + (format h-file "extern const struct mkcl_unichar_info _mkcl_unichar_info[][~D];~2%" *page-size*) + + (format h-file "#endif /* MKCL_UNICODE_H */~2%") + ) + +(defun output-c-file (c-file) + (format c-file " +/* + This file was automatically generated by utility contrib/unicode/ucd.lisp + for Unicode ~S + + Copyright (c) 2012, Jean-Claude Beaudoin. + + MKCL is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 3 of the License, or (at your option) any later version. + + See file '../../Copyright' for full details. +*/~%" *ucd-version*) + (terpri c-file) + (format c-file "#include ~%") + (terpri c-file) + (format c-file "const struct mkcl_unichar_properties_signature~%") + (format c-file " _mkcl_unichar_properties_signatures[] =~%") + (format c-file "{~%") + (loop for (general-category-index bidi-index ccc-index decimal-digit digit bidi-mirrored) + across *unichar-properties-signature-sorted-table* + do (format c-file " { ~A, " (cadr (aref *general-categories* general-category-index))) + do (format c-file "~A, " (cadr (aref *bidi-classes* bidi-index))) + do (format c-file "~D, " ccc-index) + do (format c-file "~A, " (if (and decimal-digit (string/= "" decimal-digit)) decimal-digit *unichar-invalid-digit-value*)) + do (format c-file "~A, " (if (and digit (string/= "" digit)) digit *unichar-invalid-digit-value*)) + do (format c-file "~A },~%" (if (string= "N" bidi-mirrored) "false" "true")) + ) + (format c-file "};~2%") + + (let ((page-folding-bucket (make-hash-table :test #'equalp)) + (folded-page-top-index 0)) + (loop for page across *unichar-info-pages* + do (when page + (unless (gethash page page-folding-bucket) + (setf (gethash page page-folding-bucket) folded-page-top-index) + (incf folded-page-top-index)))) + (when (<= 255 folded-page-top-index) (error "~&In output: UCD page folding bucket has overflowed: ~S.~%" folded-page-top-index)) + ;;(format t "~&In output: folded-page-top-index = ~S.~%" folded-page-top-index) ;; debug JCB + (let ((unichar-info-folded-pages (make-array folded-page-top-index))) + (maphash #'(lambda (key value) + (setf (aref unichar-info-folded-pages value) key)) + page-folding-bucket) + + (format c-file "const mkcl_uint8_t _mkcl_unichar_info_pages[~D] =~%" (cp-high *unichar-codepoint-limit*)) + (format c-file "{") + (loop for page across *unichar-info-pages* + for count from 0 + do (when (eql 0 (mod count 10)) (format c-file "~% /* ~4D */ " count)) + do (format c-file " ~D," (if page (gethash page page-folding-bucket) 0)) + ) + (format c-file "~%};~2%") + + + (format c-file "const struct mkcl_unichar_info~%") + (format c-file " _mkcl_unichar_info[][~D] =~%" *page-size*) + (format c-file "{~%") + (loop for page across unichar-info-folded-pages + for page-index from 0 + do (format c-file " { /* page ~D */" page-index) + do (loop for entry across page + for count from 0 + do (when (eql 0 (mod count 8)) (format c-file "~% /* ~2X */ " count)) + do (format c-file " { ~D, 0x~X }," + (if entry + (aref *unichar-properties-signature-to-sorted-mapping* + (unichar-info-properties-sig-index entry)) + 255) ;; 255 is an index that does/should not exist. JCB + (if entry (unichar-info-transform entry) 0) + ) + ) + do (format c-file "~% },~%") + ) + (format c-file "};~2%") + ) + ) + ) + + +(defun output-c-code () + (with-open-file (h-file (make-pathname :name "mkcl-unicode" + :type "h" + :defaults *extension-directory*) + :direction :output + :external-format '(:ascii :lf) + :if-exists :supersede + :if-does-not-exist :create) + (output-h-file h-file)) + (with-open-file (c-file (make-pathname :name "unicode" + :type "c" + :defaults *extension-directory*) + :direction :output + :external-format '(:ascii :lf) + :if-exists :supersede + :if-does-not-exist :create) + (output-c-file c-file))) + +;;;;;;;;;;;;;;;;;;; +