/[cmucl]/src/tools/build-unidata.lisp
ViewVC logotype

Contents of /src/tools/build-unidata.lisp

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.9 - (show annotations)
Mon Jun 27 15:11:30 2011 UTC (2 years, 9 months ago) by rtoy
Branch: MAIN
CVS Tags: GIT-CONVERSION, snapshot-2011-09, snapshot-2011-07, HEAD
Changes since 1.8: +4 -4 lines
Update to Unicode 6.0.0.


code/unidata.lisp:
o Update unicode version to 6.0.0
o Add pointer to build-unidata.lisp.
tools/build-unidata.lisp:
o Update unicode version to 6.0.0
o Print out directory path so we can see where we're getting the data
  from.


i18n/CaseFolding.txt
i18n/CompositionExclusions.txt
i18n/DerivedNormalizationProps.txt
i18n/NameAliases.txt
i18n/NormalizationCorrections.txt
i18n/SpecialCasing.txt
i18n/UnicodeData.txt
i18n/WordBreakProperty.txt
i18n/tests/NormalizationTest.txt
i18n/tests/WordBreakTest.txt:
o Update with new files from unicode.org.
1 ;;; -*- Mode: LISP; Syntax: ANSI-Common-Lisp; Package: COMMON-LISP-USER -*-
2 ;;;
3 ;;; **********************************************************************
4 ;;; This code was written by Paul Foley and has been placed in the public
5 ;;; domain.
6 ;;;
7 (ext:file-comment "$Header: /tiger/var/lib/cvsroots/cmucl/src/tools/build-unidata.lisp,v 1.9 2011/06/27 15:11:30 rtoy Exp $")
8 ;;;
9 ;;; **********************************************************************
10 ;;;
11 ;;; Build the Unicode data file
12 ;;;
13 ;;; This can be run from an 8-bit build just fine. The Unicode data
14 ;;; file is read and written in portable way so it can be used on any
15 ;;; platform.
16 ;;;
17 ;;; What to do: Run (build-unidata) then (write-unidata <path>)
18 ;;;
19 ;;; This only needs to be run if the Unicode database changes and we
20 ;;; need a new Unicode data file.
21
22 (in-package "COMMON-LISP-USER")
23
24 (defstruct unidata
25 range
26 name+
27 name
28 category
29 scase
30 numeric
31 decomp
32 combining
33 bidi
34 name1+
35 name1
36 qc-nfd
37 qc-nfkd
38 qc-nfc
39 qc-nfkc
40 comp-exclusions
41 full-case-lower
42 full-case-title
43 full-case-upper
44 case-fold-full
45 case-fold-simple
46 word-break
47 )
48
49 (defvar *unicode-data* (make-unidata))
50
51 ;; The magic number for the unidata.bin file. (It's "*UCD", in
52 ;; big-endian order.)
53 (defconstant +unicode-magic-number+ #x2A554344)
54
55 ;; The expected Unicode version
56 (defconstant +unicode-major-version+ 6)
57 (defconstant +unicode-minor-version+ 0)
58 (defconstant +unicode-update-version+ 0)
59
60 ;;; These need to be synched with code/unidata.lisp
61
62 (defstruct dictionary
63 (cdbk (ext:required-argument) :read-only t :type simple-vector)
64 (keyv (ext:required-argument) :read-only t
65 :type (simple-array (unsigned-byte 8) (*)))
66 (keyl (ext:required-argument) :read-only t
67 :type (simple-array (unsigned-byte 8) (*)))
68 (codev (ext:required-argument) :read-only t
69 :type (simple-array (signed-byte 32) (*)))
70 (nextv (ext:required-argument) :read-only t
71 :type (simple-array (unsigned-byte 32) (*)))
72 (namev (ext:required-argument) :read-only t
73 :type (simple-array (unsigned-byte 32) (*))))
74
75 (defstruct range
76 (codes (ext:required-argument) :read-only t
77 :type (simple-array (unsigned-byte 32) (*))))
78
79 (defstruct ntrie
80 (split (ext:required-argument) :read-only t
81 :type (unsigned-byte 8))
82 (hvec (ext:required-argument) :read-only t
83 :type (simple-array (unsigned-byte 16) (*)))
84 (mvec (ext:required-argument) :read-only t
85 :type (simple-array (unsigned-byte 16) (*))))
86
87 (defstruct (ntrie1 (:include ntrie))
88 (lvec (ext:required-argument) :read-only t
89 :type (simple-array bit (*))))
90
91 (defstruct (ntrie2 (:include ntrie))
92 (lvec (ext:required-argument) :read-only t
93 :type (simple-array (unsigned-byte 2) (*))))
94
95 (defstruct (ntrie4 (:include ntrie))
96 (lvec (ext:required-argument) :read-only t
97 :type (simple-array (unsigned-byte 4) (*))))
98
99 (defstruct (ntrie8 (:include ntrie))
100 (lvec (ext:required-argument) :read-only t
101 :type (simple-array (unsigned-byte 8) (*))))
102
103 (defstruct (ntrie16 (:include ntrie))
104 (lvec (ext:required-argument) :read-only t
105 :type (simple-array (unsigned-byte 16) (*))))
106
107 (defstruct (ntrie32 (:include ntrie))
108 (lvec (ext:required-argument) :read-only t
109 :type (simple-array (unsigned-byte 32) (*))))
110
111
112 (defstruct (scase (:include ntrie32))
113 (svec (ext:required-argument) :read-only t
114 :type (simple-array (unsigned-byte 16) (*))))
115
116 (defstruct (decomp (:include ntrie32))
117 (tabl (ext:required-argument) :read-only t
118 ;; This has type simple-string in unidata.lisp, but we can treat
119 ;; it as a vector of integers here
120 :type (simple-array (unsigned-byte 16) (*))))
121
122 (defstruct (full-case (:include ntrie32))
123 (tabl (ext:required-argument) :read-only t
124 :type (simple-array (unsigned-byte 16) (*))))
125
126 (defstruct (case-folding (:include ntrie32))
127 (tabl (ext:required-argument) :read-only t
128 :type (simple-array (unsigned-byte 16) (*))))
129
130 (defstruct (bidi (:include ntrie16))
131 (tabl (ext:required-argument) :read-only t
132 :type (simple-array (unsigned-byte 16) (*))))
133
134 (defconstant +decomposition-type+
135 '(nil "<compat>" "<initial>" "<medial>" "<final>" "<isolated>"
136 "<super>" "<sub>" "<fraction>" "<font>" "<noBreak>"
137 "<vertical>" "<wide>" "<narrow>" "<small>" "<square>" "<circle>"))
138
139 (defconstant +bidi-class+
140 '("L" "LRE" "LRO" "R" "AL" "RLE" "RLO" "PDF" "EN" "ES" "ET" "AN" "CS"
141 "NSM" "BN" "B" "S" "WS" "ON"))
142
143
144 ;; Codebooks for encoding names. These are fairly arbitrary. Order isn't
145 ;; important, except that if A is an initial substring of B, B must come
146 ;; first (or A will never be used), but since they're searched in order it's
147 ;; nice if more-common strings come before less-common ones. The entries
148 ;; don't have to be complete words; but may not contain spaces or hyphens,
149 ;; and there must be a single-character string for every valid character in
150 ;; the set of names you want to look up (unless some character only ever
151 ;; occurs in some other entry).
152 ;;
153 ;; *CODEBOOK* is for the modern character names, *CODEBOOK-1* is for
154 ;; the Unicode 1.0 names.
155 (defvar *codebook*
156 #(" " "LETTER" "NA" "SMALL" "CAPITAL" "MATHEMATICAL" "RADICAL" "ARABIC"
157 "HANGUL" "LATIN" "SYLLABLE" "-" "WITH" "RA" "YI" "CJK" "FORM" "LIGATURE"
158 "FOR" "PATTERN" "SIGN" "MARK" "GREEK" "AND" "YA" "BOLD" "COMPATIBILITY"
159 "OLD" "KATAKANA" "SYMBOL" "ITALIC" "CUNEIFORM" "TAI" "UP" "U" "LINEAR"
160 "SYLLABICS" "ETHIOPIC" "ABOVE" "2" "Z" "CANADIAN" "DOTS" "LINE"
161 "HALFWIDTH" "X" "KANGXI" "MUSICAL" "VOWEL" "Q" "IDEOGRAPHIC" "DOT"
162 "RIGHTWARDS" "PARENTHESIZED" "LAM" "TIMES" "HALF" "FINAL" "ORIYA" "SERIF"
163 "DIGIT" "ARROW" "IDEOGRAPH" "RIGHT" "PA" "TAG" "CIRCLED" "MALAYALAM" "9"
164 "HAH" "SANS" "CIRCLE" "YEH" "Y" "OR" "DASIA" "WHITE" "KANNADA" "VARIATION"
165 "CYRILLIC" "1" "8" "SQUARE" "VARIA" "NOTATION" "TONE" "0" "3" "5" "LIGHT"
166 "BRAILLE" "DA" "ONE" "KA" "COMBINING" "4" "ISOLATED" "EQUAL" "NOT"
167 "SELECTOR" "VAI" "TO" "DOUBLE" "GEORGIAN" "OF" "BYZANTINE" "MODIFIER"
168 "HAMZA" "6" "ANGLE" "LEFTWARDS" "FOUR" "CARRIER" "VERTICAL" "TIBETAN"
169 "STROKE" "BELOW" "INITIAL" "MEEM" "PLUS" "LEFT" "7" "DOWN" "NUMBER" "TWO"
170 "KHMER" "OVER" "JEEM" "BARB" "SCRIPT" "MACRON" "LAO" "WEST" "W" "HOOK"
171 "TAMIL" "EIGHT" "COPTIC" "ALEF" "BAR" "E" "IDEOGRAM" "TA" "I" "RUNIC"
172 "DRAWINGS" "BLACK" "FIVE" "HORIZONTAL" "MONGOLIAN" "TILE" "GLAGOLITIC"
173 "LOWER" "NUMERIC" "SIX" "CREE" "BOX" "MYANMAR" "THREE" "PSILI" "LOW"
174 "ACUTE" "HARPOON" "BRACKET" "DEVANAGARI" "MAKSURA" "LIMBU" "FULLWIDTH"
175 "VOCALIC" "V" "JONGSEONG" "THAN" "HA" "CHARACTER" "BALINESE" "LI" "MA"
176 "NINE" "SEVEN" "DOMINO" "FRAKTUR" "HEBREW" "TILDE" "CHOSEONG" "LAGAB"
177 "BENGALI" "MEDIAL" "ARMENIAN" "HEAVY" "GUJARATI" "NEW" "TELUGU" "LA"
178 "CHAM" "OMEGA" "DESERET" "KHAROSHTHI" "SIOS" "HIRAGANA" "POINTING" "P"
179 "FUNCTIONAL" "APL" "THAI" "CHEROKEE" "LUE" "GURMUKHI" "F" "DIAERESIS"
180 "D" "SAURASHTRA" "BOPOMOFO" "B" "JUNGSEONG" "TETRAGRAM" "HIGH" "LONG"
181 "CIRCUMFLEX" "C" "GRAVE" "SINHALA" "ALPHA" "KHAH" "K" "TELEGRAPH" "LESS"
182 "OPEN" "HEXAGRAM" "H" "SYRIAC" "ACCENT" "LEPCHA" "L" "TRIANGLE" "GREATER"
183 "STRUCK" "OXIA" "O" "J" "REVERSED" "AEGEAN" "A" "SHORT" "TURNED" "T" "GA2"
184 "G" "MONOSPACE" "STOP" "S" "M" "NKO" "NUMERAL" "RIEUL" "R" "N"))
185
186 (defvar *codebook-1*
187 #(" " "LETTER" "RA" "OR" "ARABIC" "SMALL" "CAPITAL" "LATIN" "GLYPH" "HANGUL"
188 "RIGHT" "IE" "LEFT" "CYRILLIC" "ONE" "SQUARED" "HACEK" "SQUARE" "DOUBLE"
189 "END" "FORMS" "WITH" "AND" "ON" "FOR" "LIGHT" "SPACING" "GEORGIAN" "EN"
190 "NON" "GE" "CIRCLED" "UPPER" "DOWN" "WHITE" "SO" "THAI" "ABOVE" "BRACKET"
191 "HALFWIDTH" "GREEK" "VERTICAL" "ISOLATE" "POINTING" "DIGIT" "HALF" "TEN"
192 "FINAL" "SCRIPT" "BELOW" "ACUTE" "POINT" "HEAVY" "-" "OPENING" "TE"
193 "GRAPHIC" "CLOSING" "HAMZAH" "OPEN" "KAERITEN" "MACRON" "BLACK" "TONOS"
194 "YA" "DOTS" "RING" "ANGLE" "SINGLE" "HAA" "LESS" "PERIOD" "KA" "UPSILON"
195 "CIRCUMFLEX" "BARB" "INITIAL" "ALEF" "FOUR" "DIAERESIS" "THAN" "SIGN"
196 "HOOK" "MEDIAL" "GRAVE" "PARENTHESIZED" "LIEUL" "OF" "REVERSED" "TWO" "UP"
197 "HORIZONTAL" "CEDILLA" "INVERSE" "SANS" "MU" "VOWEL" "ARROWS" "THREE" "FA"
198 "HARPOON" "BIEUB" "OVERLAY" "V" "X" "LAM" "EQUAL" "SERIF" "TIBETAN" "HA"
199 "DASHED" "COMMA" "GREATER" "MARK" "BARRED" "ARROW" "UNDERSCORE" "TRIANGLE"
200 "NUMBER" "SIOS" "FULLWIDTH" "DASH" "WAW" "LINE" "OVERSCORE" "UNDER" "U"
201 "TORTOISE" "BAR" "QUOTATION" "PARENTHESIS" "Q" "GIYEOG" "G" "EUM" "SARA"
202 "DOTLESS" "FRACTION" "OVER" "BREVE" "NOT" "YOGH" "LOWER" "DOT" "Y" "II"
203 "TAA" "EIGHT" "OGONEK" "SSANG" "CURLY" "JIEUJ" "MAI" "TILDE" "J" "KHO"
204 "DESCENDER" "HEADED" "CURL" "RETROFLEX" "R" "BAA" "SUPERSCRIPT" "TONE"
205 "NIEUN" "FIVE" "ZHE" "MODIFIER" "DIGEUD" "CORNER" "SUBSCRIPT" "TO" "INDIC"
206 "EASTERN" "LIGATURE" "CENTER" "PIEUP" "Z" "TAH" "SLASH" "WAVY" "NINE" "W"
207 "HIEUH" "DEVICE" "CAF" "ARMENIAN" "EPSILON" "FEATHERED" "SIX" "THO" "A"
208 "IOTA" "KEY" "NOR" "CONTROL" "BAN" "LENTICULAR" "L" "SEVEN" "B" "DAL"
209 "TAIL" "CHE" "OMEGA" "O" "HEBREW" "EQUIVALENT" "E" "SHORT" "TURNED"
210 "NEITHER" "CHI" "INVERTED" "MIEUM" "SEPARATOR" "TIEUT" "DHAH" "NOON" "N"
211 "PAIRED" "FEED" "KHAA" "SHELL" "HIGH" "TABULATION" "CHARACTER"
212 "IDEOGRAPHS" "I" "DAMMAH" "START" "MADDAH" "C" "TRIPLE" "STRUCK" "HORN"
213 "H" "K" "DELETE" "D" "TATWEEL" "STROKE" "MARBUTAH" "SHIFT" "TIP" "FROM"
214 "SHADOWED" "S" "THAA" "F" "M" "TRANSMISSION" "P" "TRANSFINITE" "T" "2"))
215
216
217
218 (defun encode-name (string codebook)
219 (let ((p 0)
220 (res '()))
221 (loop while (< p (length string)) do
222 (dotimes (i (length codebook)
223 (error "\"~C\" is not in the codebook." (char string p)))
224 (let ((code (aref codebook i)))
225 (when (and (<= (length code) (- (length string) p))
226 (string= string code :start1 p :end1 (+ p (length code))))
227 (push i res)
228 (incf p (length code))
229 (return)))))
230 (nreverse (coerce res 'vector))))
231
232 (defun name-lookup (name codebook keyv keyl nextv)
233 (let* ((current 0)
234 (posn 0))
235 (loop
236 (let ((keyp (ash (aref nextv current) -18)))
237 (dotimes (i (aref keyl keyp)
238 (return-from name-lookup nil)) ; shouldn't happen
239 (let* ((str (aref codebook (aref keyv (+ keyp i))))
240 (len (length str)))
241 (when (and (>= (length name) (+ posn len))
242 (string= name str :start1 posn :end1 (+ posn len)))
243 (setq current
244 (+ (logand (aref nextv current) #x3FFFF) i))
245 (if (= (incf posn len) (length name))
246 (return-from name-lookup current)
247 (return))))))))) ; from DOTIMES - do outer LOOP again
248
249 (defun build-dictionary (codebook entries)
250 (let ((khash (make-hash-table :test 'equalp))
251 (thash (make-hash-table))
252 (top 0)
253 (keyl (make-array 0 :element-type '(unsigned-byte 8)))
254 (keyv (make-array 0 :element-type '(unsigned-byte 8)))
255 vec1 vec2 vec3)
256 (labels ((add-to-trie (trie name codepoint)
257 (loop for ch across (encode-name name codebook) do
258 (let ((sub (cdr (assoc ch (rest trie)))))
259 (if sub
260 (setq trie sub)
261 (setq trie (cdar (push (cons ch (cons nil nil))
262 (rest trie)))))))
263 (unless (or (null (car trie)) (= (car trie) codepoint))
264 (error "Codepoints #x~4,'0X and #x~4,'0X are both named ~S."
265 (car trie) codepoint name))
266 (setf (car trie) codepoint))
267 (key (trie)
268 (map '(simple-array (unsigned-byte 8) (*)) #'car (rest trie)))
269 (pass1 (trie depth)
270 (setf (rest trie) (sort (rest trie) #'< :key #'car))
271 (setf (gethash trie thash)
272 (list depth (1- (incf top)) (length (rest trie))))
273 (setf (gethash (key trie) khash) t)
274 (mapc (lambda (x) (pass1 (cdr x) (1+ depth))) (rest trie)))
275 (pass2 (trie)
276 (let* ((x (gethash (gethash trie thash) thash))
277 (n (car x)))
278 (setf (aref vec1 n) (if (first trie) (first trie) -1)
279 (aref vec2 n) (logior (ash (gethash (key trie) khash)
280 18)
281 (cdr x))))
282 (mapc (lambda (x) (pass2 (cdr x))) (rest trie))))
283 (format t "~& Initializing...~%")
284 (let ((trie (cons nil nil)))
285 (loop for (name . code) in entries do (add-to-trie trie name code))
286 (format t "~& Pass 1...~%")
287 (pass1 trie 0)
288 (format t "~& Sorting...~%")
289 (dolist (key (sort (loop for k being the hash-keys of khash
290 collect k)
291 #'> :key #'length))
292 (let ((pos -1))
293 (loop
294 (setq pos (search key keyv :start2 (1+ pos)))
295 (when (and pos (zerop (aref keyl pos)))
296 (setf (aref keyl pos) (length key)))
297 (when (and pos (= (aref keyl pos) (length key)))
298 (setf (gethash key khash) pos)
299 (return))
300 (when (null pos)
301 (setf (gethash key khash) (length keyv))
302 (setf keyl (adjust-array keyl (+ (length keyv) (length key))))
303 (setf (aref keyl (length keyv)) (length key))
304 (setf keyv (concatenate '(simple-array (unsigned-byte 8) (*))
305 keyv key))
306 (return)))))
307 (loop with off = 1
308 for key in (sort (loop for x being the hash-values of thash
309 collect x)
310 (lambda (a b) (if (= (first a) (first b))
311 (< (second a) (second b))
312 (< (first a) (first b)))))
313 as i upfrom 0
314 do (setf (gethash key thash) (cons i off) off (+ off (third key))))
315 (setq vec1 (make-array top :element-type '(signed-byte 32))
316 vec2 (make-array top :element-type '(unsigned-byte 32))
317 vec3 (make-array top :element-type '(unsigned-byte 32)))
318 (format t "~& Pass 2...~%")
319 (pass2 trie)
320 (format t "~& Finalizing~%")
321 (dotimes (i top)
322 (let ((xxx (aref vec2 i)))
323 (dotimes (j (aref keyl (ash xxx -18)))
324 (setf (aref vec3 (+ (logand xxx #x3FFFF) j)) i))))
325 (loop for (name . code) in entries do
326 (let ((n (name-lookup name codebook keyv keyl vec2)))
327 (unless n (error "Codepoint not found for ~S." name))
328 (setf (ldb (byte 14 18) (aref vec3 n)) (length name))))))
329 (make-dictionary :cdbk codebook
330 :keyv keyv :keyl keyl
331 :codev vec1 :nextv vec2 :namev vec3)))
332
333 (defun pack (ucd range fn default bits split)
334 (let* ((lbits (1+ (logand split 15)))
335 (mbits (1+ (ash split -4)))
336 (upos 0)
337 (base 0)
338 (len (ash 1 mbits))
339 (posn len)
340 (hvec (make-array (1+ (ash #x110000 (- (+ mbits lbits))))
341 :element-type '(unsigned-byte 16)))
342 (ttop (1+ (ash #x110000 (- lbits))))
343 (tvec (make-array ttop :element-type '(unsigned-byte 16)))
344 (lvec (make-array 0 :element-type (list 'unsigned-byte bits))))
345 ;; pass 1 - build lvec and temporary tvec
346 (loop
347 ;;(format t "~&Pass 1: ~6,'0X~%" (ash base lbits))
348 (let ((vec (make-array (ash 1 lbits) :initial-element default
349 :element-type (list 'unsigned-byte bits)))
350 (empty t))
351 (loop while (and (< upos (length ucd))
352 (< (ucdent-code (aref ucd upos))
353 (ash (1+ base) lbits)))
354 do
355 (let* ((min (ucdent-code (aref ucd upos)))
356 (max (let ((x (position min (range-codes range))))
357 (if (and x (evenp x))
358 (aref (range-codes range) (1+ x))
359 min)))
360 (i (max (- min (ash base lbits)) 0))
361 (j (min (- max (ash base lbits)) (1- (ash 1 lbits)))))
362 (loop for i from i to j do
363 (setf (aref vec i) (funcall fn (aref ucd upos)))
364 (unless (= (aref vec i) default)
365 (setq empty nil)))
366 (if (< max (ash (1+ base) lbits))
367 (incf upos)
368 (return))))
369 (if empty
370 (setf (aref tvec base) #xFFFF)
371 (let ((idx (search vec lvec)))
372 (when (null idx)
373 (setq idx (length lvec))
374 (setf lvec (adjust-array lvec (+ idx (ash 1 lbits))))
375 (replace lvec vec :start1 idx))
376 (setf (aref tvec base) idx))))
377 (incf base)
378 (when (> (ash base lbits) #x10FFFF)
379 (return)))
380 ;; pass 2 - build hvec and transform tvec into mvec
381 (setf base 1 (aref hvec 0) 0)
382 (loop
383 ;;(format t "~&Pass 2: ~6,'0X~%" (ash base (+ mbits lbits)))
384 (let ((empty t))
385 (loop for i from posn below (+ posn len)
386 when (/= (aref tvec i) #xFFFF) do (setq empty nil)
387 while empty)
388 (if empty
389 (progn
390 (setf (aref hvec base) #xFFFF)
391 (replace tvec tvec :start1 posn :start2 (+ posn len))
392 (decf ttop len))
393 (let ((idx (search tvec tvec
394 :start1 posn :end1 (+ posn len)
395 :end2 ttop))
396 (dl len) (ds posn) (pinc len))
397 (when (< idx posn)
398 (when (> (+ idx len) posn)
399 (setq dl (- posn idx) ds (+ idx len)))
400 (decf pinc dl)
401 (replace tvec tvec :start1 ds :start2 (+ ds dl))
402 (decf ttop dl))
403 (incf posn pinc)
404 (setf (aref hvec base) idx))))
405 (incf base)
406 (when (> (ash base (+ mbits lbits)) #x10FFFF)
407 (return)))
408 (values hvec (lisp::shrink-vector tvec ttop) lvec)))
409
410
411
412 (defun write-unidata (path)
413 (labels ((write16 (n stm)
414 (write-byte (ldb (byte 8 8) n) stm)
415 (write-byte (ldb (byte 8 0) n) stm))
416 (write32 (n stm)
417 (write16 (ldb (byte 16 16) n) stm)
418 (write16 (ldb (byte 16 0) n) stm))
419 (write-ntrie (split hvec mvec lvec stm)
420 (write-byte split stm)
421 (write16 (length hvec) stm)
422 (write16 (length mvec) stm)
423 (write16 (length lvec) stm)
424 (write-vector hvec stm :endian-swap :network-order)
425 (write-vector mvec stm :endian-swap :network-order)
426 (write-vector lvec stm :endian-swap :network-order))
427 (write-ntrie32 (data stm)
428 (write-ntrie (ntrie32-split data)
429 (ntrie32-hvec data)
430 (ntrie32-mvec data)
431 (ntrie32-lvec data)
432 stm))
433 (write-ntrie16 (data stm)
434 (write-ntrie (bidi-split data)
435 (bidi-hvec data)
436 (bidi-mvec data)
437 (bidi-lvec data)
438 stm))
439 (write-ntrie8 (data stm)
440 (write-ntrie (ntrie8-split data)
441 (ntrie8-hvec data)
442 (ntrie8-mvec data)
443 (ntrie8-lvec data)
444 stm))
445 (write-ntrie4 (data stm)
446 (write-ntrie (ntrie4-split data)
447 (ntrie4-hvec data)
448 (ntrie4-mvec data)
449 (ntrie4-lvec data)
450 stm))
451 (write-ntrie2 (data stm)
452 (write-ntrie (ntrie2-split data)
453 (ntrie2-hvec data)
454 (ntrie2-mvec data)
455 (ntrie2-lvec data)
456 stm))
457 (write-ntrie1 (data stm)
458 (write-ntrie (ntrie1-split data)
459 (ntrie1-hvec data)
460 (ntrie1-mvec data)
461 (ntrie1-lvec data)
462 stm))
463 (write-dict (data stm)
464 (write-byte (1- (length (dictionary-cdbk data))) stm)
465 (write16 (length (dictionary-keyv data)) stm)
466 (write32 (length (dictionary-codev data)) stm)
467 (let ((codebook (dictionary-cdbk data)))
468 (dotimes (i (length codebook))
469 (write-byte (length (aref codebook i)) stm)
470 (dotimes (j (length (aref codebook i)))
471 (write-byte (char-code (char (aref codebook i) j)) stm))))
472 (write-vector (dictionary-keyv data) stm :endian-swap :network-order)
473 (write-vector (dictionary-keyl data) stm :endian-swap :network-order)
474 (write-vector (dictionary-codev data) stm :endian-swap :network-order)
475 (write-vector (dictionary-nextv data) stm :endian-swap :network-order)
476 (write-vector (dictionary-namev data) stm :endian-swap :network-order))
477 (update-index (val array)
478 (let ((result (vector-push val array)))
479 (unless result
480 (error "Index array too short for the data being written")))))
481 (with-open-file (stm path :direction :io :if-exists :rename-and-delete
482 :element-type '(unsigned-byte 8))
483 ;; The length of the index array is the number of sections to be
484 ;; written. See below for each section.
485 (let ((index (make-array 19 :fill-pointer 0)))
486 ;; File header
487 (write32 +unicode-magic-number+ stm) ; identification "magic"
488 ;; File format version
489 (write-byte 0 stm)
490 ;; Unicode version
491 (write-byte +unicode-major-version+ stm)
492 (write-byte +unicode-minor-version+ stm)
493 (write-byte +unicode-update-version+ stm)
494 (dotimes (i (array-dimension index 0))
495 (write32 0 stm)) ; space for indices
496 (write32 0 stm) ; end marker
497 ;; 0. Range data
498 (let ((data (unidata-range *unicode-data*)))
499 (update-index (file-position stm) index)
500 (write32 (length (range-codes data)) stm)
501 (write-vector (range-codes data) stm :endian-swap :network-order))
502 ;; 1. Character name data
503 (update-index (file-position stm) index)
504 (write-dict (unidata-name+ *unicode-data*) stm)
505 ;; 2. Codepoint-to-name mapping
506 (let ((data (unidata-name *unicode-data*)))
507 (update-index (file-position stm) index)
508 (write-ntrie32 data stm))
509 ;; 3. Codepoint-to-category table
510 (let ((data (unidata-category *unicode-data*)))
511 (update-index (file-position stm) index)
512 (write-ntrie8 data stm))
513 ;; 4. Simple case mapping table
514 (let ((data (unidata-scase *unicode-data*)))
515 (update-index (file-position stm) index)
516 (write-ntrie32 data stm)
517 (write-byte (length (scase-svec data)) stm)
518 (write-vector (scase-svec data) stm :endian-swap :network-order))
519 ;; 5. Numeric data
520 (let ((data (unidata-numeric *unicode-data*)))
521 (update-index (file-position stm) index)
522 (write-ntrie32 data stm))
523 ;; 6. Decomposition data
524 (let ((data (unidata-decomp *unicode-data*)))
525 (update-index (file-position stm) index)
526 (write-ntrie32 data stm)
527 (write16 (length (decomp-tabl data)) stm)
528 (write-vector (decomp-tabl data) stm :endian-swap :network-order))
529 ;; 7. Combining classes
530 (let ((data (unidata-combining *unicode-data*)))
531 (update-index (file-position stm) index)
532 (write-ntrie8 data stm))
533 ;; 8. Bidi data
534 (let ((data (unidata-bidi *unicode-data*)))
535 (update-index (file-position stm) index)
536 (write-ntrie16 data stm)
537 (write-byte (length (bidi-tabl data)) stm)
538 (write-vector (bidi-tabl data) stm :endian-swap :network-order))
539 ;; 9. Unicode 1.0 names
540 (update-index (file-position stm) index)
541 (write-dict (unidata-name1+ *unicode-data*) stm)
542 ;; 10. Codepoint to unicode-1.0 name
543 (let ((data (unidata-name1 *unicode-data*)))
544 (update-index (file-position stm) index)
545 (write-ntrie32 data stm))
546 ;; 11. Normalization quick-check data
547 (update-index (file-position stm) index)
548 (let ((data (unidata-qc-nfd *unicode-data*)))
549 (write-ntrie1 data stm))
550 (let ((data (unidata-qc-nfkd *unicode-data*)))
551 (write-ntrie1 data stm))
552 (let ((data (unidata-qc-nfc *unicode-data*)))
553 (write-ntrie2 data stm))
554 (let ((data (unidata-qc-nfkc *unicode-data*)))
555 (write-ntrie2 data stm))
556 ;; 12. Write composition exclusion table
557 (let ((data (unidata-comp-exclusions *unicode-data*)))
558 (update-index (file-position stm) index)
559 (write16 (length data) stm)
560 (write-vector data stm :endian-swap :network-order))
561 (flet ((dump-full-case (data)
562 (update-index (file-position stm) index)
563 (write-ntrie32 data stm)
564 (write16 (length (full-case-tabl data)) stm)
565 (write-vector (full-case-tabl data) stm :endian-swap :network-order)))
566 ;; 13. Write full-case lower data
567 (dump-full-case (unidata-full-case-lower *unicode-data*))
568 ;; 14. Write full-case title data
569 (dump-full-case (unidata-full-case-title *unicode-data*))
570 ;; 15. Write full-case upper data
571 (dump-full-case (unidata-full-case-upper *unicode-data*)))
572 ;; 16. Write case-folding simple data
573 (let ((data (unidata-case-fold-simple *unicode-data*)))
574 (update-index (file-position stm) index)
575 (write-ntrie32 data stm))
576 ;; 17. Write case-folding full data
577 (let ((data (unidata-case-fold-full *unicode-data*)))
578 (update-index (file-position stm) index)
579 (write-ntrie32 data stm)
580 (write16 (length (case-folding-tabl data)) stm)
581 (write-vector (case-folding-tabl data) stm :endian-swap :network-order))
582 ;; 18. Word-break
583 (let ((data (unidata-word-break *unicode-data*)))
584 (update-index (file-position stm) index)
585 (write-ntrie4 data stm))
586 ;; All components saved. Patch up index table now.
587 (file-position stm 8)
588 (dotimes (i (length index))
589 (write32 (aref index i) stm)))))
590 nil)
591
592
593
594 (defstruct (ucdent
595 (:constructor make-ucdent (code name cat comb bidi decomp
596 num1 num2 num3 mirror
597 name1 comment
598 upper lower title)))
599 code name cat comb bidi decomp num1 num2 num3 mirror name1 comment
600 upper lower title
601 aliases
602 mcode
603 norm-qc
604 comp-exclusion
605 full-case-lower
606 full-case-title
607 full-case-upper
608 case-fold-full
609 case-fold-simple
610 word-break
611 ;; ...
612 )
613
614 ;; ucd-directory should be the directory where UnicodeData.txt is
615 ;; located.
616 (defun foreach-ucd (name ucd-directory fn)
617 (format t "~& ~A~%" name)
618 (with-open-file (s (make-pathname :name name :type "txt"
619 :defaults ucd-directory))
620 (cond
621 ((string= name "Unihan")
622 (loop for line = (read-line s nil) while line do
623 (when (char= (char line 0) #\U)
624 (let* ((tab1 (position #\Tab line))
625 (tab2 (position #\Tab line :start (1+ tab1))))
626 (funcall fn
627 (parse-integer line :radix 16 :start 2 :end tab1)
628 (subseq line (1+ tab1) tab2)
629 (subseq line (1+ tab2)))))))
630 ((string= name "CompositionExclusions")
631 (loop for line = (read-line s nil) while line do
632 (let ((code (parse-integer line :radix 16 :junk-allowed t)))
633 (when code
634 (funcall fn code)))))
635 (t
636 (loop with save = 0
637 for line = (read-line s nil) as end = (position #\# line)
638 while line do
639 (loop while (and end (plusp end)
640 (char= (char line (1- end)) #\Space))
641 do (decf end))
642 (when (position #\; line :end end)
643 (let* ((split (loop for i = 0 then (1+ j)
644 as j = (position #\; line :start i :end end)
645 collect (string-trim '(#\Space #\Tab)
646 (subseq line i (or j end)))
647 while j))
648 (first (first split))
649 (second (second split))
650 (length (length second))
651 (brk (search ".." first))
652 hi lo)
653 (if brk
654 (setq lo (parse-integer first :radix 16 :junk-allowed t)
655 hi (parse-integer first :radix 16 :start (+ brk 2)))
656 (setq lo (parse-integer first :radix 16)
657 hi lo))
658 (cond ((and (> length 8)
659 (string= second ", First>" :start1 (- length 8)))
660 (setq save lo))
661 ((and (> length 7)
662 (string= second ", Last>" :start1 (- length 7)))
663 (apply fn save hi
664 (concatenate 'string
665 (subseq second 0 (- length 7)) ">")
666 (rest (rest split))))
667 (t
668 (apply fn lo hi (rest split)))))))))))
669
670 (defun parse-decomposition (string)
671 (let ((type nil) (n 0))
672 (when (char= (char string 0) #\<)
673 (setq n (1+ (position #\Space string)))
674 (setq type (subseq string 0 (1- n))))
675 (list* type
676 (loop while (< n (length string))
677 collect (multiple-value-bind (a b)
678 (parse-integer string :radix 16
679 :start n :end (position #\Space string :start n))
680 (setq n (1+ b))
681 a)))))
682
683 (declaim (ftype (function (&optional t) (values simple-vector range)) read-data))
684 (defun read-data (&optional (ucd-directory #p"target:i18n/"))
685 (let ((vec (make-array 50000))
686 (vec-hash (make-hash-table :size 15000))
687 (pos 0)
688 (range (make-array 50 :element-type '(unsigned-byte 32)))
689 (rpos 0))
690 (flet ((cat (x) (dpb (position (char x 0) "CZMPNLS") (byte 3 4)
691 (position (char x 1) "nsifepkcdmulto")))
692 (num (x) (if (string= x "") nil (read-from-string x)))
693 (chr (x) (if (string= x "") nil (parse-integer x :radix 16)))
694 (str (x) (if (string= x "") nil x))
695 (bool (x) (string= x "Y"))
696 (decomp (x) (if (string= x "") nil (parse-decomposition x))))
697 (foreach-ucd "UnicodeData" ucd-directory
698 (lambda (min max name cat comb bidi decomp num1 num2 num3
699 mirror name1 comment upper lower title)
700 (when (> max min)
701 (setf (aref range rpos) min (aref range (1+ rpos)) max)
702 (incf rpos 2))
703 (when (position #\( name1)
704 (setq name1 (subseq name1 0 (1- (position #\( name1)))))
705 (setf (aref vec pos)
706 (make-ucdent min name (cat cat) (num comb) (str bidi)
707 (decomp decomp)
708 (num num1) (num num2) (num num3)
709 (bool mirror) (str name1) (str comment)
710 (chr upper) (chr lower) (chr title)))
711 (incf pos))))
712 (setf vec (lisp::shrink-vector vec pos))
713 (setf range (lisp::shrink-vector range rpos))
714 (flet ((find-ucd (key)
715 (let ((index (gethash key vec-hash)))
716 (if index
717 (aref vec index)
718 nil))))
719 ;; Using FIND is rather slow, especially for the derived
720 ;; normalization props. We create a hash table that maps a
721 ;; codepoint to the index to VEC that contains the desired entry.
722 ;; This is much faster. We do it this way to keep the structure
723 ;; as close as possible to the original.
724 (loop for k from 0 below pos
725 do (setf (gethash (ucdent-code (aref vec k)) vec-hash) k))
726
727 (foreach-ucd "NameAliases"
728 ucd-directory
729 (lambda (min max alias)
730 (declare (ignore max))
731 (push alias (ucdent-aliases (find-ucd min)))))
732
733 (foreach-ucd "NormalizationCorrections"
734 ucd-directory
735 (lambda (min max bad good &rest junk)
736 (declare (ignore max bad junk))
737 (setf (ucdent-decomp (find-ucd min))
738 (parse-decomposition good))))
739
740 (foreach-ucd "BidiMirroring"
741 ucd-directory
742 (lambda (min max mirror)
743 (declare (ignore max))
744 (setf (ucdent-mcode (find-ucd min))
745 (parse-integer mirror :radix 16 :junk-allowed t))))
746
747 (foreach-ucd "DerivedNormalizationProps"
748 ucd-directory
749 (lambda (min max prop &optional value)
750 (cond ((string= prop "NFD_QC")
751 (loop for i from min to max
752 as ent = (find-ucd i) do
753 (when ent
754 (setf (getf (ucdent-norm-qc ent) :nfd)
755 (intern value "KEYWORD")))))
756 ((string= prop "NFKD_QC")
757 (loop for i from min to max
758 as ent = (find-ucd i) do
759 (when ent
760 (setf (getf (ucdent-norm-qc ent) :nfkd)
761 (intern value "KEYWORD")))))
762 ((string= prop "NFC_QC")
763 (loop for i from min to max
764 as ent = (find-ucd i) do
765 (when ent
766 (setf (getf (ucdent-norm-qc ent) :nfc)
767 (intern value "KEYWORD")))))
768 ((string= prop "NFKC_QC")
769 (loop for i from min to max
770 as ent = (find-ucd i) do
771 (when ent
772 (setf (getf (ucdent-norm-qc ent) :nfkc)
773 (intern value "KEYWORD"))))))))
774
775 (foreach-ucd "CompositionExclusions"
776 ucd-directory
777 (lambda (min)
778 (let ((entry (find-ucd min)))
779 (setf (ucdent-comp-exclusion entry) t))))
780
781 (foreach-ucd "SpecialCasing"
782 ucd-directory
783 (lambda (min max lower title upper &rest condition)
784 (declare (ignore max))
785 (when (string= (car condition) "")
786 (flet ((parse-casing (string)
787 (rest (parse-decomposition string))))
788 (let ((ent (find-ucd min)))
789 (setf (ucdent-full-case-lower ent) (parse-casing lower))
790 (setf (ucdent-full-case-title ent) (parse-casing title))
791 (setf (ucdent-full-case-upper ent) (parse-casing upper)))))))
792
793 (foreach-ucd "CaseFolding"
794 ucd-directory
795 (lambda (min max mode expansion &rest info)
796 (declare (ignore max info))
797 (flet ((parse-folding (string)
798 (rest (parse-decomposition string))))
799 (let ((ent (find-ucd min)))
800 (cond ((string= mode "T")
801 ;; We ignore these language-specific foldings.
802 )
803 ((string= mode "F")
804 ;; Full case folding
805 (setf (ucdent-case-fold-full ent) (parse-folding expansion)))
806 (t
807 ;; Simple case folding (C or S)
808 (setf (ucdent-case-fold-simple ent) (car (parse-folding expansion)))))))))
809
810 (foreach-ucd "WordBreakProperty"
811 ucd-directory
812 (lambda (min max prop)
813 (let ((code (intern (string-upcase prop) "KEYWORD")))
814 (loop for i from min to max
815 as ent = (find-ucd i) do
816 (when ent
817 (setf (ucdent-word-break ent) code))))))
818 (values vec (make-range :codes range)))))
819
820
821
822 (defun pack-name (ucdent name1 dictionary)
823 (let* ((cdbk (dictionary-cdbk dictionary))
824 (keyv (dictionary-keyv dictionary))
825 (keyl (dictionary-keyl dictionary))
826 (nextv (dictionary-nextv dictionary))
827 (name (if name1 (ucdent-name1 ucdent) (ucdent-name ucdent))))
828 (or (and name (name-lookup name cdbk keyv keyl nextv)) 0)))
829
830 (defun pack-case (ucdent svec)
831 (let* ((uo (if (ucdent-upper ucdent)
832 (- (ucdent-code ucdent) (ucdent-upper ucdent))
833 0))
834 (lo (if (ucdent-lower ucdent)
835 (- (ucdent-code ucdent) (ucdent-lower ucdent))
836 0))
837 (to (if (ucdent-title ucdent)
838 (- (ucdent-code ucdent) (ucdent-title ucdent))
839 0))
840 pu pl pt)
841 (unless (setq pu (position (abs uo) svec))
842 (setq pu (fill-pointer svec))
843 (vector-push-extend (abs uo) svec))
844 (unless (setq pl (position (abs lo) svec))
845 (setq pl (fill-pointer svec))
846 (vector-push-extend (abs lo) svec))
847 (unless (setq pt (position (abs to) svec))
848 (setq pt (fill-pointer svec))
849 (vector-push-extend (abs to) svec))
850 (logior (ash (if (minusp to) (logior 128 pt) pt) 16)
851 (ash (if (minusp lo) (logior 128 pl) pl) 8)
852 (if (minusp uo) (logior 128 pu) pu))))
853
854 (defun pack-numeric (ucdent)
855 (let* ((n3 (ucdent-num3 ucdent))
856 (fl (cond ((ucdent-num1 ucdent) #x3800000)
857 ((ucdent-num2 ucdent) #x1800000)
858 (n3 #x800000)
859 (t 0)))
860 (neg (if (and n3 (minusp n3)) #x900000 0))
861 (num (if n3 (ash (abs (numerator n3)) 3) 0))
862 (den (if n3 (1- (denominator n3)) 0)))
863 (logior fl neg num den)))
864
865 (defun pack-decomp (ucdent tabl)
866 (if (not (ucdent-decomp ucdent))
867 0
868 (let* ((d (loop for i in (rest (ucdent-decomp ucdent))
869 if (<= i #xFFFF) collect i
870 else collect (logior (ldb (byte 10 10) (- i #x10000)) #xD800)
871 and collect (logior (ldb (byte 10 0) (- i #x10000)) #xDC00)))
872 (l (length d))
873 (n (search d tabl)))
874 (unless n
875 (setq n (fill-pointer tabl))
876 (dolist (x d) (vector-push-extend x tabl)))
877 ;; high 5 bits: decomp type
878 ;; next 5 bits: unused
879 ;; next 6 bits: length, in code units
880 ;; low 16 bits: index into tabl
881 (logior n (ash l 16)
882 (ash (position (first (ucdent-decomp ucdent))
883 +decomposition-type+ :test #'equalp)
884 27)))))
885
886 (defun pack-full-case (ucdent tabl entry)
887 (if (not (funcall entry ucdent))
888 0
889 (let* ((d (loop for i in (funcall entry ucdent)
890 if (<= i #xFFFF) collect i
891 else collect (logior (ldb (byte 10 10) (- i #x10000)) #xD800)
892 and collect (logior (ldb (byte 10 0) (- i #x10000)) #xDC00)))
893 (l (length d))
894 (n (search d tabl)))
895 (unless n
896 (setq n (fill-pointer tabl))
897 (dolist (x d) (vector-push-extend x tabl)))
898 ;; next 6 bits: length, in code units
899 ;; low 16 bits: index into tabl
900 (logior n (ash l 16)))))
901
902 (defun pack-bidi (ucdent tabl)
903 (logior (position (ucdent-bidi ucdent) +bidi-class+ :test #'string=)
904 (if (ucdent-mirror ucdent) #x20 #x00)
905 (if (ucdent-mcode ucdent)
906 (let* ((n (- (ucdent-mcode ucdent) (ucdent-code ucdent)))
907 (x (abs n)))
908 (logior
909 (ash (if (< x #x10)
910 x
911 (let ((k (position x tabl)))
912 (if k k (prog1 (fill-pointer tabl) (vector-push-extend x tabl)))))
913 6)
914 (if (< x #x10) #x000 #x800)
915 (if (minusp n) #x400 #x000)))
916 0)))
917
918 (defun pack-case-folding-simple (ucdent)
919 (or (ucdent-case-fold-simple ucdent)
920 0))
921
922 (defun pack-case-folding-full (entry tabl)
923 (if (not (ucdent-case-fold-full entry))
924 0
925 (let* ((d (loop for i in (ucdent-case-fold-full entry)
926 if (<= i #xFFFF) collect i
927 else collect (logior (ldb (byte 10 10) (- i #x10000)) #xD800)
928 and collect (logior (ldb (byte 10 0) (- i #x10000)) #xDC00)))
929 (l (length d))
930 (n (search d tabl)))
931 (unless n
932 (setq n (fill-pointer tabl))
933 (dolist (x d) (vector-push-extend x tabl)))
934 ;; next 6 bits: length, in code units
935 ;; low 16 bits: index into tabl
936 (logior n (ash l 16)))))
937
938 (defun pack-word-break (ucdent)
939 ;; The code is the index in the list. :OTHER is a dummy value and
940 ;; used to represent the default case.
941 (or (position (ucdent-word-break ucdent)
942 '(:other :cr :lf :newline :extend :format
943 :katakana :aletter :midnumlet :midletter :midnum
944 :numeric :extendnumlet))
945 0))
946
947 ;; ucd-directory should be the directory where UnicodeData.txt is
948 ;; located.
949 (defun build-unidata (&optional (ucd-directory "target:i18n/"))
950 (format t "~&Reading data from ~S~%" (probe-file ucd-directory))
951 (multiple-value-bind (ucd range) (read-data ucd-directory)
952 (setf (unidata-range *unicode-data*) range)
953 (format t "~&Building character name tables~%")
954 (let* ((data (loop for ent across ucd
955 when (char/= (char (ucdent-name ent) 0) #\<)
956 collect (cons (ucdent-name ent) (ucdent-code ent))
957 when (ucdent-aliases ent)
958 nconc (loop for name in (ucdent-aliases ent)
959 collect (cons name (ucdent-code ent)))))
960 (dict (build-dictionary *codebook* data)))
961 (setf (unidata-name+ *unicode-data*) dict)
962 (multiple-value-bind (hvec mvec lvec)
963 (pack ucd range (lambda (x) (pack-name x nil dict)) 0 32 #x54)
964 (setf (unidata-name *unicode-data*)
965 (make-ntrie32 :split #x54 :hvec hvec :mvec mvec :lvec lvec))))
966
967 (format t "~&Building Unicode 1.0 character name tables~%")
968 (let* ((data (loop for ent across ucd
969 when (plusp (length (ucdent-name1 ent)))
970 collect (cons (ucdent-name1 ent) (ucdent-code ent))))
971 (dict (build-dictionary *codebook-1* data)))
972 (setf (unidata-name1+ *unicode-data*) dict)
973 (multiple-value-bind (hvec mvec lvec)
974 (pack ucd range (lambda (x) (pack-name x t dict)) 0 32 #x54)
975 (setf (unidata-name1 *unicode-data*)
976 (make-ntrie32 :split #x54 :hvec hvec :mvec mvec :lvec lvec))))
977
978 (format t "~&Building general category table~%")
979 (multiple-value-bind (hvec mvec lvec)
980 (pack ucd range #'ucdent-cat 0 8 #x53)
981 (setf (unidata-category *unicode-data*)
982 (make-ntrie8 :split #x53 :hvec hvec :mvec mvec :lvec lvec)))
983
984 (format t "~&Building simple case-conversion table~%")
985 (let ((svec (make-array 100 :element-type '(unsigned-byte 16)
986 :fill-pointer 0 :adjustable t)))
987 (vector-push-extend 0 svec)
988 (multiple-value-bind (hvec mvec lvec)
989 (pack ucd range (lambda (x) (pack-case x svec))
990 0 32 #x62)
991 (setf (unidata-scase *unicode-data*)
992 (make-scase :split #x62 :hvec hvec :mvec mvec :lvec lvec
993 :svec (copy-seq svec)))))
994
995 (format t "~&Building numeric-values table~%")
996 (multiple-value-bind (hvec mvec lvec)
997 (pack ucd range #'pack-numeric 0 32 #x63)
998 (setf (unidata-numeric *unicode-data*)
999 (make-ntrie32 :split #x63 :hvec hvec :mvec mvec :lvec lvec)))
1000
1001 (format t "~&Building decomposition table~%")
1002 (let ((tabl (make-array 6000 :element-type '(unsigned-byte 16)
1003 :fill-pointer 0 :adjustable t)))
1004 (multiple-value-bind (hvec mvec lvec)
1005 (pack ucd range (lambda (x) (pack-decomp x tabl))
1006 0 32 #x62)
1007 (setf (unidata-decomp *unicode-data*)
1008 (make-decomp :split #x62 :hvec hvec :mvec mvec :lvec lvec
1009 :tabl (copy-seq tabl)))))
1010
1011 (format t "~&Building combining-class table~%")
1012 (multiple-value-bind (hvec mvec lvec)
1013 (pack ucd range #'ucdent-comb 0 8 #x64)
1014 (setf (unidata-combining *unicode-data*)
1015 (make-ntrie8 :split #x64 :hvec hvec :mvec mvec :lvec lvec)))
1016
1017 (format t "~&Building bidi information table~%")
1018 (let ((tabl (make-array 10 :element-type '(unsigned-byte 16)
1019 :fill-pointer 0 :adjustable t)))
1020 (multiple-value-bind (hvec mvec lvec)
1021 (pack ucd range (lambda (x) (pack-bidi x tabl))
1022 0 16 #x62)
1023 (setf (unidata-bidi *unicode-data*)
1024 (make-bidi :split #x62 :hvec hvec :mvec mvec :lvec lvec
1025 :tabl (copy-seq tabl)))))
1026
1027 (format t "~&Building normalization quick-check tables~%")
1028 (progn
1029 (multiple-value-bind (hvec mvec lvec)
1030 (pack ucd range (lambda (x)
1031 (ecase (getf (ucdent-norm-qc x) :nfd :y)
1032 (:y 0) (:n 1)))
1033 0 1 #x47)
1034 (setf (unidata-qc-nfd *unicode-data*)
1035 (make-ntrie1 :split #x47 :hvec hvec :mvec mvec :lvec lvec)))
1036 (multiple-value-bind (hvec mvec lvec)
1037 (pack ucd range (lambda (x)
1038 (ecase (getf (ucdent-norm-qc x) :nfkd :y)
1039 (:y 0) (:n 1)))
1040 0 1 #x47)
1041 (setf (unidata-qc-nfkd *unicode-data*)
1042 (make-ntrie1 :split #x47 :hvec hvec :mvec mvec :lvec lvec)))
1043 (multiple-value-bind (hvec mvec lvec)
1044 (pack ucd range (lambda (x)
1045 (ecase (getf (ucdent-norm-qc x) :nfc :y)
1046 (:y 0) (:m 1) (:n 2)))
1047 0 2 #x56)
1048 (setf (unidata-qc-nfc *unicode-data*)
1049 (make-ntrie2 :split #x56 :hvec hvec :mvec mvec :lvec lvec)))
1050 (multiple-value-bind (hvec mvec lvec)
1051 (pack ucd range (lambda (x)
1052 (ecase (getf (ucdent-norm-qc x) :nfkc :y)
1053 (:y 0) (:m 1) (:n 2)))
1054 0 2 #x55)
1055 (setf (unidata-qc-nfkc *unicode-data*)
1056 (make-ntrie2 :split #x55 :hvec hvec :mvec mvec :lvec lvec))))
1057
1058 (format t "~&Building composition exclusion table~%")
1059 (let ((exclusions (make-array 1 :element-type '(unsigned-byte 32)
1060 :adjustable t
1061 :fill-pointer 0)))
1062 (loop for ent across ucd do
1063 (when (ucdent-comp-exclusion ent)
1064 (vector-push-extend (ucdent-code ent) exclusions)))
1065 (setf (unidata-comp-exclusions *unicode-data*) (copy-seq exclusions)))
1066
1067 (format t "~&Building full case mapping tables~%")
1068 (progn
1069 (format t "~& Lower...~%")
1070 (let ((tabl (make-array 100 :element-type '(unsigned-byte 16)
1071 :fill-pointer 0 :adjustable t))
1072 (split #x65))
1073 (multiple-value-bind (hvec mvec lvec)
1074 (pack ucd range (lambda (x) (pack-full-case x tabl #'ucdent-full-case-lower))
1075 0 32 split)
1076 (setf (unidata-full-case-lower *unicode-data*)
1077 (make-full-case :split split :hvec hvec :mvec mvec :lvec lvec
1078 :tabl (copy-seq tabl)))))
1079 (format t "~& Title...~%")
1080 (let ((tabl (make-array 100 :element-type '(unsigned-byte 16)
1081 :fill-pointer 0 :adjustable t))
1082 (split #x65))
1083 (multiple-value-bind (hvec mvec lvec)
1084 (pack ucd range (lambda (x) (pack-full-case x tabl #'ucdent-full-case-title))
1085 0 32 split)
1086 (setf (unidata-full-case-title *unicode-data*)
1087 (make-full-case :split split :hvec hvec :mvec mvec :lvec lvec
1088 :tabl (copy-seq tabl)))))
1089 (format t "~& Upper...~%")
1090 (let ((tabl (make-array 100 :element-type '(unsigned-byte 16)
1091 :fill-pointer 0 :adjustable t))
1092 (split #x65))
1093 (multiple-value-bind (hvec mvec lvec)
1094 (pack ucd range (lambda (x) (pack-full-case x tabl #'ucdent-full-case-upper))
1095 0 32 split)
1096 (setf (unidata-full-case-upper *unicode-data*)
1097 (make-full-case :split split :hvec hvec :mvec mvec :lvec lvec
1098 :tabl (copy-seq tabl))))))
1099
1100 (format t "~&Building case-folding tables~%")
1101 (progn
1102 (format t "~& Simple...~%")
1103 (let ((split #x54))
1104 (multiple-value-bind (hvec mvec lvec)
1105 (pack ucd range (lambda (x) (pack-case-folding-simple x))
1106 0 32 split)
1107 (setf (unidata-case-fold-simple *unicode-data*)
1108 (make-ntrie32 :split split :hvec hvec :mvec mvec :lvec lvec))))
1109 (format t "~& Full...~%")
1110 (let ((tabl (make-array 100 :element-type '(unsigned-byte 16)
1111 :fill-pointer 0 :adjustable t))
1112 (split #x65))
1113 (multiple-value-bind (hvec mvec lvec)
1114 (pack ucd range (lambda (x) (pack-case-folding-full x tabl))
1115 0 32 split)
1116 (setf (unidata-case-fold-full *unicode-data*)
1117 (make-case-folding :split split :hvec hvec :mvec mvec :lvec lvec
1118 :tabl (copy-seq tabl))))))
1119
1120 (format t "~&Building word-break table~%")
1121 (let ((split #x66))
1122 (multiple-value-bind (hvec mvec lvec)
1123 (pack ucd range (lambda (x) (pack-word-break x))
1124 0 4 split)
1125 (setf (unidata-word-break *unicode-data*)
1126 (make-ntrie4 :split split :hvec hvec :mvec mvec :lvec lvec))))
1127 nil))

  ViewVC Help
Powered by ViewVC 1.1.5