/[cmucl]/src/code/unidata.lisp
ViewVC logotype

Contents of /src/code/unidata.lisp

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.28 - (show annotations)
Mon Jun 27 15:11:29 2011 UTC (2 years, 9 months ago) by rtoy
Branch: MAIN
CVS Tags: GIT-CONVERSION, snapshot-2011-09, snapshot-2011-07, HEAD
Changes since 1.27: +7 -4 lines
Update to Unicode 6.0.0.


code/unidata.lisp:
o Update unicode version to 6.0.0
o Add pointer to build-unidata.lisp.
tools/build-unidata.lisp:
o Update unicode version to 6.0.0
o Print out directory path so we can see where we're getting the data
  from.


i18n/CaseFolding.txt
i18n/CompositionExclusions.txt
i18n/DerivedNormalizationProps.txt
i18n/NameAliases.txt
i18n/NormalizationCorrections.txt
i18n/SpecialCasing.txt
i18n/UnicodeData.txt
i18n/WordBreakProperty.txt
i18n/tests/NormalizationTest.txt
i18n/tests/WordBreakTest.txt:
o Update with new files from unicode.org.
1 ;;; -*- Mode: LISP; Syntax: ANSI-Common-Lisp; Package: LISP -*-
2 ;;;
3 ;;; **********************************************************************
4 ;;; This code was written by Paul Foley and has been placed in the public
5 ;;; domain.
6 ;;;
7 (ext:file-comment "$Header: /tiger/var/lib/cvsroots/cmucl/src/code/unidata.lisp,v 1.28 2011/06/27 15:11:29 rtoy Exp $")
8 ;;;
9 ;;; **********************************************************************
10 ;;;
11 ;;; Unicode Database access
12 ;;;
13 ;;; See tools/build-unidata.lisp for the instructions and code to
14 ;;; create the unicode database.
15
16 (in-package "LISP")
17 (intl:textdomain "cmucl")
18
19 (export '(string-to-nfd string-to-nfkc string-to-nfkd string-to-nfc
20 unicode-complete unicode-complete-name
21 load-all-unicode-data))
22
23 (defvar *unidata-path* "ext-formats:unidata.bin")
24
25 (defvar *unidata-version* "$Revision: 1.28 $")
26
27 (defstruct unidata
28 range
29 name+
30 name
31 category
32 scase
33 numeric
34 decomp
35 combining
36 bidi
37 name1+
38 name1
39 qc-nfd
40 qc-nfkd
41 qc-nfc
42 qc-nfkc
43 comp-exclusions
44 full-case-lower
45 full-case-title
46 full-case-upper
47 case-fold-simple
48 case-fold-full
49 word-break
50 )
51
52 (defvar *unicode-data* (make-unidata))
53
54 ;; The magic number for the unidata.bin file. (It's "*UCD", in
55 ;; big-endian order).
56 (defconstant +unicode-magic-number+ #x2A554344)
57
58 ;; The format version for the unidata.bin file.
59 (defconstant +unicode-format-version+ 0)
60
61 ;; The expected Unicode version. This needs to be synced with
62 ;; build-unidata.lisp.
63 (defconstant +unicode-major-version+ 6)
64 (defconstant +unicode-minor-version+ 0)
65 (defconstant +unicode-update-version+ 0)
66
67 ;;; These need to be synched with tools/build-unidata.lisp
68
69 ;;; ==== DICTIONARY ====
70 ;;
71 ;; A Dictionary is a mapping from names to codepoints.
72 ;;
73 ;; Imagine we have the following structure:
74 ;;
75 ;; node = (value . ((char1 . node1) (char2 . node2) ...))
76 ;;
77 ;; To look up a string, we start at the root node (representing the
78 ;; empty string), and look up the first character in the alist that is
79 ;; (cdr node); assuming the character is found, repeat using the
80 ;; associated node and the next character of the string. After the
81 ;; last character, the current node's value is the value associated
82 ;; with that string (i.e., the codepoint). If at any point the next
83 ;; character is not present in the node, or you reach the end and the
84 ;; value in the current node is NIL, the string is not in the
85 ;; dictionary.
86 ;;
87 ;; As an example, let's have a dictionary containing three words:
88 ;; "cat" = 1, "dog" = 2, "cow" = 3
89 ;;
90 ;; then we have
91 ;;
92 ;; (nil . (#\c . (nil . (#\a . (nil . (#\t . (1 . nil))))
93 ;; (#\o . (nil . (#\w . (3 . nil))))))
94 ;; (#\d . (nil . (#\o . (nil . (#\g . (2 . nil)))))))
95 ;;
96 ;; or, broken down to make it easier to read,
97 ;;
98 ;; root = (nil . (#\c . node1) (#\d . node2))
99 ;; node1 = (nil . (#\a . node3) (#\o . node4))
100 ;; node2 = (nil . (#\o . node5))
101 ;; node3 = (nil . (#\t . node6))
102 ;; node4 = (nil . (#\w . node7))
103 ;; node5 = (nil . (#\g . node8))
104 ;; node6 = (1 . nil)
105 ;; node7 = (3 . nil)
106 ;; node8 = (2 . nil)
107 ;;
108 ;; To look up "dog", start with the root node;
109 ;; look for #\d in the alist at (cdr root) - it maps to node2
110 ;; look for #\o in the alist at (cdr node2) - it maps to node5
111 ;; look for #\g in the alist at (cdr node5) - it maps to node8
112 ;; no more characters, so return the value in node8, which is 2
113 ;;
114 ;; A Dictionary is just a more compact version of that:
115 ;;
116 ;; There are two pairs of vectors: NEXTV and CODEV is one pair, KEYV
117 ;; and KEYL are the second (ignore NAMEV for now).
118 ;;
119 ;; The "current node" is represented by an index into the NEXTV/CODEV
120 ;; pair. Given node i (initially 0, the root), NEXTV[i] is split into
121 ;; two numbers: x (14 bits) and y (18 bits). n = KEYL[x], and
122 ;; KEYV[x:x+n] is the set of characters that occur in the alist of
123 ;; node i. In the example above, when i=0, KEYL[x] will be 2 and
124 ;; KEYV[x:x+2] will be "cd". If the character you're searching for
125 ;; occurs at x+m, then y+m is the index of the corresponding node.
126 ;; CODEV[i] is the value associated with each node (or -1 for none).
127 ;;
128 ;; The above example could be rendered as:
129 ;;
130 ;; keyv = c d a o t w g
131 ;; keyl = 2 0 2 1 1 1 1
132 ;;
133 ;; nextv = [0:1][2:3][3:5][4:6][5:7][6:8][1:0][1:0][1:0]
134 ;; codev = -1 -1 -1 -1 -1 -1 1 3 2
135 ;;
136 ;; To look up "dog", start with i=0
137 ;; nextv[i] = [0:1] i.e., x=0, y=1
138 ;; n = keyl[x] = 2
139 ;; keyv[x:x+n] = "cd"
140 ;; m = (position #\d "cd") = 1
141 ;; i = y+m = 2
142 ;;
143 ;; nextv[i] = [3:5] i.e., x=3, y=5
144 ;; n = keyl[x] = 1
145 ;; keyv[x:x+n] = "o"
146 ;; m = (position #\o "o") = 0
147 ;; i = y+m = 5
148 ;;
149 ;; nextv[i] = [6:8] i.e., x=6, y=8
150 ;; n = keyl[x] = 1
151 ;; keyv[x:x+n] = "g"
152 ;; m = (position #\g "g") = 0
153 ;; i = y+m = 8
154 ;;
155 ;; codev[i] = 2
156 ;;
157 ;;
158 ;; But rather than using only the restricted set of characters that
159 ;; occur in Unicode character names, we encode the names using a
160 ;; codebook of up to 256 strings, turning, for example, "LATIN SMALL
161 ;; LETTER A" into "ABCBDBE" (where "A" encodes "LATIN", "B" encodes
162 ;; space, and so on), so we only have to go through 7 nodes to reach
163 ;; the end, instead of 20, reducing the size of the tables.
164 ;;
165 ;; The NAMEV vector is used to walk backwards through the trie,
166 ;; reconstructing the string from the final node index. Again, each
167 ;; entry consists of a 14 bit and an 18 bit number. In the example
168 ;; above, we would have
169 ;;
170 ;; namev = [?:0][?:0][?:0][?:1][?:1][?:2][3:3][3:4][3:5] ? = don't care
171 ;;
172 ;; The high 14 bits give the length of the string; the low 18 bits
173 ;; give the previous index. Having looked up "dog", the final node
174 ;; was 8:
175 ;;
176 ;; i = 8
177 ;; namev[i] = [3:5] -- the string is 3 characters long
178 ;; nextv[5] = [6:8] i.e., x=6, y=8
179 ;; z = i-y = 0
180 ;; keyv[x+z] = "g" -- the string is "__g"
181 ;; i = 5
182 ;;
183 ;; namev[i] = [?:2]
184 ;; nextv[2] = [3:5] i.e., x=3, y=5
185 ;; z = i-y = 0
186 ;; keyv[x+z] = "o" -- the string is "_og"
187 ;; i = 2
188 ;;
189 ;; namev[i] = [?:0]
190 ;; nextv[0] = [0:1] i.e., x=0, y=1
191 ;; z = i-y = 1
192 ;; keyv[x+z] = "d" -- the string is "dog"
193 ;; i = 0 (finished)
194
195 (defstruct dictionary
196 (cdbk (ext:required-argument) :read-only t :type simple-vector)
197 (keyv (ext:required-argument) :read-only t
198 :type (simple-array (unsigned-byte 8) (*)))
199 (keyl (ext:required-argument) :read-only t
200 :type (simple-array (unsigned-byte 8) (*)))
201 (codev (ext:required-argument) :read-only t
202 :type (simple-array (signed-byte 32) (*)))
203 (nextv (ext:required-argument) :read-only t
204 :type (simple-array (unsigned-byte 32) (*)))
205 (namev (ext:required-argument) :read-only t
206 :type (simple-array (unsigned-byte 32) (*))))
207
208 (defstruct range
209 (codes (ext:required-argument) :read-only t
210 :type (simple-array (unsigned-byte 32) (*))))
211
212
213 ;;; ==== NTRIE ====
214 ;;
215 ;; An NTrie is a mapping from codepoints to integers.
216 ;;
217 ;; The codepoint is split into three pieces, x:y:z where x+y+z = 21
218 ;; bits. The value of the SPLIT slot encodes y-1 in the upper 4 bits
219 ;; and z-1 in the lower 4 bits. I.e., if SPLIT is #x63, the split is
220 ;; 10:7:4
221 ;;
222 ;; p = HVEC[x] is either an index into MVEC or #xFFFF representing no
223 ;; value. q = MVEC[y+p] is either an index into LVEC or #xFFFF
224 ;; representing no value. LVEC[z+q] is the value associated with the
225 ;; codepoint.
226 ;;
227 (defstruct ntrie
228 (split (ext:required-argument) :read-only t
229 :type (unsigned-byte 8))
230 (hvec (ext:required-argument) :read-only t
231 :type (simple-array (unsigned-byte 16) (*)))
232 (mvec (ext:required-argument) :read-only t
233 :type (simple-array (unsigned-byte 16) (*))))
234
235 (defstruct (ntrie1 (:include ntrie))
236 (lvec (ext:required-argument) :read-only t
237 :type (simple-array bit (*))))
238
239 (defstruct (ntrie2 (:include ntrie))
240 (lvec (ext:required-argument) :read-only t
241 :type (simple-array (unsigned-byte 2) (*))))
242
243 (defstruct (ntrie4 (:include ntrie))
244 (lvec (ext:required-argument) :read-only t
245 :type (simple-array (unsigned-byte 4) (*))))
246
247 (defstruct (ntrie8 (:include ntrie))
248 (lvec (ext:required-argument) :read-only t
249 :type (simple-array (unsigned-byte 8) (*))))
250
251 (defstruct (ntrie16 (:include ntrie))
252 (lvec (ext:required-argument) :read-only t
253 :type (simple-array (unsigned-byte 16) (*))))
254
255 (defstruct (ntrie32 (:include ntrie))
256 (lvec (ext:required-argument) :read-only t
257 :type (simple-array (unsigned-byte 32) (*))))
258
259
260 (defstruct (scase (:include ntrie32))
261 (svec (ext:required-argument) :read-only t
262 :type (simple-array (unsigned-byte 16) (*))))
263
264 (defstruct (decomp (:include ntrie32))
265 (tabl (ext:required-argument) :read-only t :type simple-string))
266
267 (defstruct (full-case (:include ntrie32))
268 (tabl (ext:required-argument) :read-only t :type simple-string))
269
270 (defstruct (case-fold-full (:include decomp)))
271
272 (defstruct (bidi (:include ntrie16))
273 (tabl (ext:required-argument) :read-only t
274 :type (simple-array (unsigned-byte 16) (*))))
275
276 (defconstant +decomposition-type+
277 '(nil "<compat>" "<initial>" "<medial>" "<final>" "<isolated>"
278 "<super>" "<sub>" "<fraction>" "<font>" "<noBreak>"
279 "<vertical>" "<wide>" "<narrow>" "<small>" "<square>" "<circle>"))
280
281 (defconstant +bidi-class+
282 #("L" "LRE" "LRO" "R" "AL" "RLE" "RLO" "PDF" "EN" "ES" "ET" "AN" "CS"
283 "NSM" "BN" "B" "S" "WS" "ON"))
284
285
286 (defconstant +hangul-choseong+ ; U+1100..U+1112
287 #("G" "GG" "N" "D" "DD" "R" "M" "B" "BB" "S" "SS" "" "J" "JJ" "C" "K" "T" "P"
288 "H"))
289
290 (defconstant +hangul-jungseong+ ; U+1161..U+1175
291 #("A" "AE" "YA" "YAE" "EO" "E" "YEO" "YE" "O" "WA" "WAE" "OE" "YO" "U" "WEO"
292 "WE" "WI" "YU" "EU" "YI" "I"))
293
294 (defconstant +hangul-jongseong+ ; U+11A8..U+11C2
295 #("G" "GG" "GS" "N" "NJ" "NH" "D" "L" "LG" "LM" "LB" "LS" "LT" "LP" "LH" "M"
296 "B" "BS" "S" "SS" "NG" "J" "C" "K" "T" "P" "H"))
297
298
299
300 (defun search-dictionary (string dictionary &optional (current 0) (posn 0))
301 "Search the Unicode name dictionary for the longest entry that
302 matches STRING. STRING must be in Unicode name format. That is, it
303 must be upper case with spaces separating each word.
304
305 Two values are returned. The first value is index into the codebook
306 that continues the string.. The second value is the length of the
307 substring of string that matches the codebook. "
308
309 (declare (optimize (speed 3) (space 0) (safety 0)
310 (ext:inhibit-warnings 3))
311 (type string string) (type dictionary dictionary)
312 (type (unsigned-byte 32) current) (type lisp::index posn))
313 (let* ((codebook (dictionary-cdbk dictionary))
314 (stack '()))
315 (loop
316 (let ((keyv (ash (aref (dictionary-nextv dictionary) current) -18)))
317 (dotimes (i (aref (dictionary-keyl dictionary) keyv)
318 (if stack
319 (let ((next (pop stack)))
320 (setq posn (car next) current (cdr next)))
321 (return-from search-dictionary nil)))
322 (let* ((str (aref codebook (aref (dictionary-keyv dictionary)
323 (+ keyv i))))
324 (len (length str)))
325 (declare (type simple-base-string str))
326 (cond ((and (>= (length string) (+ posn len))
327 (string= string str :start1 posn :end1 (+ posn len)))
328 (setq current
329 (+ (logand (aref (dictionary-nextv dictionary) current)
330 #x3FFFF)
331 i))
332 (when (= (incf posn len) (length string))
333 (return-from search-dictionary (values current posn)))
334 (return)) ; from DOTIMES - loop again
335 ((and (< (length string) (+ posn len))
336 (string= string str :start1 posn :end2 (- (length string) posn)))
337 (return-from search-dictionary (values current posn))))
338 (when (or (string= str " ") (string= str "-"))
339 (push (cons posn
340 (+ (logand (aref (dictionary-nextv dictionary)
341 current)
342 #x3FFFF)
343 i))
344 stack))))))))
345
346 ;; Like SEARCH-DICTIONARY, but we don't try to do partial matches. We
347 ;; do an exact match on the given string.
348 (defun exact-match-dictionary (string dictionary)
349 (declare (optimize (speed 3) (space 0) (safety 0)
350 (ext:inhibit-warnings 3))
351 (type string string) (type dictionary dictionary))
352 (let* ((codebook (dictionary-cdbk dictionary))
353 (current 0)
354 (posn 0)
355 (stack '()))
356 (declare (type (unsigned-byte 32) current) (type lisp::index posn))
357 (loop
358 (let ((keyv (ash (aref (dictionary-nextv dictionary) current) -18)))
359 (dotimes (i (aref (dictionary-keyl dictionary) keyv)
360 (if stack
361 (let ((next (pop stack)))
362 (setq posn (car next) current (cdr next)))
363 (return-from exact-match-dictionary nil)))
364 (let* ((str (aref codebook (aref (dictionary-keyv dictionary)
365 (+ keyv i))))
366 (len (length str)))
367 (declare (type simple-base-string str))
368 (when (and (>= (length string) (+ posn len))
369 (string= string str :start1 posn :end1 (+ posn len)))
370 (setq current
371 (+ (logand (aref (dictionary-nextv dictionary) current)
372 #x3FFFF)
373 i))
374 (when (= (incf posn len) (length string))
375 (return-from exact-match-dictionary current))
376 (return)) ; from DOTIMES - loop again
377 (when (or (string= str " ") (string= str "-"))
378 (push (cons posn
379 (+ (logand (aref (dictionary-nextv dictionary)
380 current)
381 #x3FFFF)
382 i))
383 stack))))))))
384
385 (defun search-range (code range)
386 (declare (optimize (speed 3) (space 0) (safety 0))
387 (type codepoint code) (type range range))
388 (let* ((set (range-codes range))
389 (min 0)
390 (max (length set)))
391 (declare (type lisp::index min max))
392 (loop for n of-type lisp::index = (logand #xFFFFFE (ash (+ min max) -1)) do
393 (let* ((dmin (logand #x1FFFFF (aref set n)))
394 (dmax (logand #x1FFFFF (aref set (1+ n)))))
395 (cond ((< code dmin)
396 (when (= (the lisp::index (setq max n)) min) (return nil)))
397 ((> code dmax)
398 (when (= (setq min (+ n 2)) max) (return nil)))
399 (t (return (ash n -1))))))))
400
401 (declaim (inline qref qref1 qref2 qref4 qref8 qref16 qref32))
402 (defun qref (ntrie code)
403 (declare (optimize (speed 3) (space 0) (safety 0))
404 (type ntrie ntrie) (type codepoint code))
405 (let* ((mbits (1+ (ash (ntrie-split ntrie) -4)))
406 (lbits (1+ (logand (ntrie-split ntrie) 15)))
407 (hvec (ntrie-hvec ntrie))
408 (mvec (ntrie-mvec ntrie))
409 (hi (aref hvec (ash code (- (+ mbits lbits)))))
410 (md (logand (ash code (- lbits)) (lognot (ash -1 mbits))))
411 (lo (logand code (lognot (ash -1 lbits)))))
412 (declare (type (simple-array (unsigned-byte 16) (*)) hvec mvec))
413 (if (= hi #xFFFF)
414 nil
415 (let ((md (aref mvec (+ hi md))))
416 (if (= md #xFFFF)
417 nil
418 (+ md lo))))))
419
420 (defun qref1 (ntrie code)
421 (declare (optimize (speed 3) (space 0) (safety 0))
422 (type ntrie1 ntrie) (type codepoint code))
423 (let ((n (qref ntrie code)))
424 (if n (aref (ntrie1-lvec ntrie) n) 0)))
425
426 (defun qref2 (ntrie code)
427 (declare (optimize (speed 3) (space 0) (safety 0))
428 (type ntrie2 ntrie) (type codepoint code))
429 (let ((n (qref ntrie code)))
430 (if n (aref (ntrie2-lvec ntrie) n) 0)))
431
432 (defun qref4 (ntrie code)
433 (declare (optimize (speed 3) (space 0) (safety 0))
434 (type ntrie4 ntrie) (type codepoint code))
435 (let ((n (qref ntrie code)))
436 (if n (aref (ntrie4-lvec ntrie) n) 0)))
437
438 (defun qref8 (ntrie code)
439 (declare (optimize (speed 3) (space 0) (safety 0))
440 (type ntrie8 ntrie) (type codepoint code))
441 (let ((n (qref ntrie code)))
442 (if n (aref (ntrie8-lvec ntrie) n) 0)))
443
444 (defun qref16 (ntrie code)
445 (declare (optimize (speed 3) (space 0) (safety 0))
446 (type ntrie16 ntrie) (type codepoint code))
447 (let ((n (qref ntrie code)))
448 (if n (aref (ntrie16-lvec ntrie) n) 0)))
449
450 (defun qref32 (ntrie code)
451 (declare (optimize (speed 3) (space 0) (safety 0)
452 (ext:inhibit-warnings 3)) ;; shut up about boxing return
453 (type ntrie32 ntrie) (type codepoint code))
454 (let ((n (qref ntrie code)))
455 (if n (aref (ntrie32-lvec ntrie) n) 0)))
456
457
458
459 (defun unidata-locate (stream index)
460 (labels ((read16 (stm)
461 (logior (ash (read-byte stm) 8) (read-byte stm)))
462 (read32 (stm)
463 (logior (ash (read16 stm) 16) (read16 stm))))
464 (unless (and (= (read32 stream) +unicode-magic-number+)
465 (= (read-byte stream) +unicode-format-version+))
466 (error (intl:gettext "The Unicode data file is broken.")))
467 (let ((a (read-byte stream))
468 (b (read-byte stream))
469 (c (read-byte stream)))
470 (unless (and (= a +unicode-major-version+)
471 (= b +unicode-minor-version+)
472 (= c +unicode-update-version+))
473 (warn (intl:gettext "Unicode data file is for Unicode ~D.~D.~D") a b c)))
474 (dotimes (i index)
475 (when (zerop (read32 stream))
476 (return-from unidata-locate nil)))
477 (let ((n (read32 stream)))
478 (and (plusp n) (file-position stream n)))))
479
480 ;; List of all defined defloaders
481 (defvar *defloaders* nil)
482
483 (defmacro defloader (name (stm locn) &body body)
484 `(progn
485 (push ',name *defloaders*)
486 (defun ,name ()
487 (labels ((read16 (stm)
488 (logior (ash (read-byte stm) 8) (read-byte stm)))
489 (read32 (stm)
490 (logior (ash (read16 stm) 16) (read16 stm)))
491 (read-ntrie (bits stm)
492 (let* ((split (read-byte stm))
493 (hlen (read16 stm))
494 (mlen (read16 stm))
495 (llen (read16 stm))
496 (hvec (make-array hlen
497 :element-type '(unsigned-byte 16)))
498 (mvec (make-array mlen
499 :element-type '(unsigned-byte 16)))
500 (lvec (make-array llen
501 :element-type (list 'unsigned-byte bits))))
502 (read-vector hvec stm :endian-swap :network-order)
503 (read-vector mvec stm :endian-swap :network-order)
504 (read-vector lvec stm :endian-swap :network-order)
505 (values split hvec mvec lvec))))
506 (declare (ignorable #'read16 #'read32 #'read-ntrie))
507 (with-open-file (,stm *unidata-path* :direction :input
508 :element-type '(unsigned-byte 8))
509 (unless (unidata-locate ,stm ,locn)
510 (error (intl:gettext "No data in file.")))
511 ,@body)))))
512
513 (defloader load-range (stm 0)
514 (let* ((n (read32 stm))
515 (codes (make-array n :element-type '(unsigned-byte 32))))
516 (read-vector codes stm :endian-swap :network-order)
517 (setf (unidata-range *unicode-data*)
518 (make-range :codes codes))))
519
520 (defloader load-names (stm 1)
521 (let* ((cb (1+ (read-byte stm)))
522 (kv (read16 stm))
523 (cv (read32 stm))
524 (codebook (make-array cb))
525 (keyv (make-array kv :element-type '(unsigned-byte 8)))
526 (keyl (make-array kv :element-type '(unsigned-byte 8)))
527 (codev (make-array cv :element-type '(signed-byte 32)))
528 (nextv (make-array cv :element-type '(unsigned-byte 32)))
529 (namev (make-array cv :element-type '(unsigned-byte 32))))
530 (dotimes (i cb)
531 (let* ((n (read-byte stm))
532 (s (make-string n)))
533 (setf (aref codebook i) s)
534 (dotimes (i n) (setf (char s i) (code-char (read-byte stm))))))
535 (read-vector keyv stm :endian-swap :network-order)
536 (read-vector keyl stm :endian-swap :network-order)
537 (read-vector codev stm :endian-swap :network-order)
538 (read-vector nextv stm :endian-swap :network-order)
539 (read-vector namev stm :endian-swap :network-order)
540 (setf (unidata-name+ *unicode-data*)
541 (make-dictionary :cdbk codebook :keyv keyv :keyl keyl
542 :codev codev :nextv nextv :namev namev))))
543
544 (defloader load-name (stm 2)
545 (multiple-value-bind (split hvec mvec lvec) (read-ntrie 32 stm)
546 (setf (unidata-name *unicode-data*)
547 (make-ntrie32 :split split :hvec hvec :mvec mvec :lvec lvec))))
548
549 (defloader load-categories (stm 3)
550 (multiple-value-bind (split hvec mvec lvec) (read-ntrie 8 stm)
551 (setf (unidata-category *unicode-data*)
552 (make-ntrie8 :split split :hvec hvec :mvec mvec :lvec lvec))))
553
554 (defloader load-scase (stm 4)
555 (multiple-value-bind (split hvec mvec lvec) (read-ntrie 32 stm)
556 (let* ((slen (read-byte stm))
557 (svec (make-array slen :element-type '(unsigned-byte 16))))
558 (read-vector svec stm :endian-swap :network-order)
559 (setf (unidata-scase *unicode-data*)
560 (make-scase :split split :hvec hvec :mvec mvec :lvec lvec
561 :svec svec)))))
562
563 (defloader load-numerics (stm 5)
564 (multiple-value-bind (split hvec mvec lvec) (read-ntrie 32 stm)
565 (setf (unidata-numeric *unicode-data*)
566 (make-ntrie32 :split split :hvec hvec :mvec mvec :lvec lvec))))
567
568 (defloader load-decomp (stm 6)
569 (multiple-value-bind (split hvec mvec lvec) (read-ntrie 32 stm)
570 (let* ((tlen (read16 stm))
571 (tabl (make-array tlen :element-type '(unsigned-byte 16))))
572 (read-vector tabl stm :endian-swap :network-order)
573 (setf (unidata-decomp *unicode-data*)
574 (make-decomp :split split :hvec hvec :mvec mvec :lvec lvec
575 :tabl (map 'simple-string #'code-char tabl))))))
576
577 (defloader load-combining (stm 7)
578 (multiple-value-bind (split hvec mvec lvec) (read-ntrie 8 stm)
579 (setf (unidata-combining *unicode-data*)
580 (make-ntrie8 :split split :hvec hvec :mvec mvec :lvec lvec))))
581
582 (defloader load-bidi (stm 8)
583 (multiple-value-bind (split hvec mvec lvec) (read-ntrie 16 stm)
584 (let* ((tlen (read-byte stm))
585 (tabl (make-array tlen :element-type '(unsigned-byte 16))))
586 (read-vector tabl stm :endian-swap :network-order)
587 (setf (unidata-bidi *unicode-data*)
588 (make-bidi :split split :hvec hvec :mvec mvec :lvec lvec
589 :tabl tabl)))))
590
591 (defloader load-1.0-names (stm 9)
592 (let* ((cb (1+ (read-byte stm)))
593 (kv (read16 stm))
594 (cv (read32 stm))
595 (codebook (make-array cb))
596 (keyv (make-array kv :element-type '(unsigned-byte 8)))
597 (keyl (make-array kv :element-type '(unsigned-byte 8)))
598 (codev (make-array cv :element-type '(signed-byte 32)))
599 (nextv (make-array cv :element-type '(unsigned-byte 32)))
600 (namev (make-array cv :element-type '(unsigned-byte 32))))
601 (dotimes (i cb)
602 (let* ((n (read-byte stm))
603 (s (make-string n)))
604 (setf (aref codebook i) s)
605 (dotimes (i n) (setf (char s i) (code-char (read-byte stm))))))
606 (read-vector keyv stm :endian-swap :network-order)
607 (read-vector keyl stm :endian-swap :network-order)
608 (read-vector codev stm :endian-swap :network-order)
609 (read-vector nextv stm :endian-swap :network-order)
610 (read-vector namev stm :endian-swap :network-order)
611 (setf (unidata-name1+ *unicode-data*)
612 (make-dictionary :cdbk codebook :keyv keyv :keyl keyl
613 :codev codev :nextv nextv :namev namev))))
614
615 (defloader load-1.0-name (stm 10)
616 (multiple-value-bind (split hvec mvec lvec) (read-ntrie 32 stm)
617 (setf (unidata-name1 *unicode-data*)
618 (make-ntrie32 :split split :hvec hvec :mvec mvec :lvec lvec))))
619
620 (defloader load-normalization-qc (stm 11)
621 (multiple-value-bind (split hvec mvec lvec) (read-ntrie 1 stm)
622 (setf (unidata-qc-nfd *unicode-data*)
623 (make-ntrie1 :split split :hvec hvec :mvec mvec :lvec lvec)))
624 (multiple-value-bind (split hvec mvec lvec) (read-ntrie 1 stm)
625 (setf (unidata-qc-nfkd *unicode-data*)
626 (make-ntrie1 :split split :hvec hvec :mvec mvec :lvec lvec)))
627 (multiple-value-bind (split hvec mvec lvec) (read-ntrie 2 stm)
628 (setf (unidata-qc-nfc *unicode-data*)
629 (make-ntrie2 :split split :hvec hvec :mvec mvec :lvec lvec)))
630 (multiple-value-bind (split hvec mvec lvec) (read-ntrie 2 stm)
631 (setf (unidata-qc-nfkc *unicode-data*)
632 (make-ntrie2 :split split :hvec hvec :mvec mvec :lvec lvec))))
633
634 (defloader load-composition-exclusions (stm 12)
635 (let* ((len (read16 stm))
636 (ex (make-array len :element-type '(unsigned-byte 32))))
637 (read-vector ex stm :endian-swap :network-order)
638 (setf (unidata-comp-exclusions *unicode-data*) ex)))
639
640 (defloader load-full-case-lower (stm 13)
641 (multiple-value-bind (split hvec mvec lvec)
642 (read-ntrie 32 stm)
643 (let* ((tlen (read16 stm))
644 (tabl (make-array tlen :element-type '(unsigned-byte 16))))
645 (read-vector tabl stm :endian-swap :network-order)
646 (setf (unidata-full-case-lower *unicode-data*)
647 (make-full-case :split split :hvec hvec :mvec mvec :lvec lvec
648 :tabl (map 'simple-string #'code-char tabl))))))
649
650 (defloader load-full-case-title (stm 14)
651 (multiple-value-bind (split hvec mvec lvec)
652 (read-ntrie 32 stm)
653 (let* ((tlen (read16 stm))
654 (tabl (make-array tlen :element-type '(unsigned-byte 16))))
655 (read-vector tabl stm :endian-swap :network-order)
656 (setf (unidata-full-case-title *unicode-data*)
657 (make-full-case :split split :hvec hvec :mvec mvec :lvec lvec
658 :tabl (map 'simple-string #'code-char tabl))))))
659
660 (defloader load-full-case-upper (stm 15)
661 (multiple-value-bind (split hvec mvec lvec)
662 (read-ntrie 32 stm)
663 (let* ((tlen (read16 stm))
664 (tabl (make-array tlen :element-type '(unsigned-byte 16))))
665 (read-vector tabl stm :endian-swap :network-order)
666 (setf (unidata-full-case-upper *unicode-data*)
667 (make-full-case :split split :hvec hvec :mvec mvec :lvec lvec
668 :tabl (map 'simple-string #'code-char tabl))))))
669
670 (defloader load-case-fold-simple (stm 16)
671 (multiple-value-bind (split hvec mvec lvec)
672 (read-ntrie 32 stm)
673 (setf (unidata-case-fold-simple *unicode-data*)
674 (make-ntrie32 :split split :hvec hvec :mvec mvec :lvec lvec))))
675
676 (defloader load-case-fold-full (stm 17)
677 (multiple-value-bind (split hvec mvec lvec)
678 (read-ntrie 32 stm)
679 (let* ((tlen (read16 stm))
680 (tabl (make-array tlen :element-type '(unsigned-byte 16))))
681 (read-vector tabl stm :endian-swap :network-order)
682 (setf (unidata-case-fold-full *unicode-data*)
683 (make-case-fold-full :split split :hvec hvec :mvec mvec :lvec lvec
684 :tabl (map 'simple-string #'code-char tabl))))))
685
686 (defloader load-word-break (stm 18)
687 (multiple-value-bind (split hvec mvec lvec)
688 (read-ntrie 4 stm)
689 (setf (unidata-word-break *unicode-data*)
690 (make-ntrie4 :split split :hvec hvec :mvec mvec :lvec lvec))))
691
692 ;;; Accessor functions.
693
694 (defvar *reverse-hangul-choseong*)
695 (defvar *reverse-hangul-jungseong*)
696 (defvar *reverse-hangul-jongseong*)
697
698 (declaim (inline cjk-ideograph-p hangul-syllable-p))
699 (defun cjk-ideograph-p (code)
700 ;; Search src/i18n/UnicodeData.txt for "CJK Ideograph" to find the
701 ;; values here.
702 (or (<= #x3400 code #x4DB5) ; CJK Ideograph Extension A
703 (<= #x4E00 code #x9FCB) ; CJK Ideograph
704 (<= #x20000 code #x2A6D6) ; CJK Ideograph Extension B
705 (<= #X2A700 code #X2B734))) ; CJK Ideograph Extension C
706
707 (defun hangul-syllable-p (code)
708 ;; Search src/i18n/UnicodeData.txt for "Hangule Syllable" to find
709 ;; the values here.
710 (<= #xAC00 code #xD7A3))
711
712 (defun initialize-reverse-hangul-tables ()
713 (unless (boundp '*reverse-hangul-choseong*)
714 (setq *reverse-hangul-choseong*
715 (sort (coerce (loop for x across +hangul-choseong+
716 as i upfrom 0 by 588
717 collect (cons x i))
718 'vector)
719 #'> :key (lambda (x) (length (car x)))))
720 (setq *reverse-hangul-jungseong*
721 (sort (coerce (loop for x across +hangul-jungseong+
722 as i upfrom 0 by 28
723 collect (cons x i))
724 'vector)
725 #'> :key (lambda (x) (length (car x)))))
726 (setq *reverse-hangul-jongseong*
727 (sort (coerce (loop for x across +hangul-jongseong+
728 as i upfrom 1
729 collect (cons x i))
730 'vector)
731 #'> :key (lambda (x) (length (car x)))))))
732
733 (defun unicode-name-to-codepoint (name)
734 (declare (type string name))
735 (cond ((and (> (length name) 22)
736 (or (string= name "CJK UNIFIED" :end1 11)
737 (string= name "CJKUNIFIED" :end1 10)))
738 (let* ((x (search "IDEOGRAPH" name))
739 (n (and x (position-if (lambda (x) (digit-char-p x 16)) name
740 :start (+ x 9))))
741 (code (and n (values (parse-integer name :start n :radix 16)))))
742
743 (when (and code (cjk-ideograph-p code))
744 code)))
745 ((and (> (length name) 15)
746 (or (string= name "HANGUL SYLLABLE" :end1 15)
747 (string= name "HANGULSYLLABLE" :end1 14)))
748 (let* ((x (search "SYLLABLE" name))
749 (n (position-if (lambda (x) (alpha-char-p x)) name
750 :start (+ x 8)))
751 (ll nil) (vv nil) (tt 0))
752 (unless n (return-from unicode-name-to-codepoint nil))
753 (initialize-reverse-hangul-tables)
754 (loop for (x . y) across *reverse-hangul-choseong*
755 when (and (<= (+ n (length x)) (length name))
756 (string= name x :start1 n :end1 (+ n (length x))))
757 do (incf n (length x))
758 (setq ll y)
759 (return))
760 (loop for (x . y) across *reverse-hangul-jungseong*
761 when (and (<= (+ n (length x)) (length name))
762 (string= name x :start1 n :end1 (+ n (length x))))
763 do (incf n (length x))
764 (setq vv y)
765 (return))
766 (when (< n (length name))
767 (loop for (x . y) across *reverse-hangul-jongseong*
768 when (and (<= (+ n (length x)) (length name))
769 (string= name x :start1 n :end1 (+ n (length x))))
770 do (incf n (length x))
771 (setq tt y)
772 (return)))
773 (if (and ll vv (= n (length name)))
774 (+ ll vv tt #xAC00)
775 nil)))
776 (t
777 (unless (unidata-name+ *unicode-data*) (load-names))
778 (let* ((names (unidata-name+ *unicode-data*))
779 (n (exact-match-dictionary name names)))
780 (when n
781 (let ((cp (aref (dictionary-codev names) n)))
782 (if (minusp cp) nil cp)))))))
783
784 (defun unicode-1.0-name-to-codepoint (name)
785 (declare (type string name))
786 (unless (unidata-name1+ *unicode-data*) (load-1.0-names))
787 (let* ((names (unidata-name1+ *unicode-data*))
788 (n (exact-match-dictionary name names)))
789 (when n
790 (let ((cp (aref (dictionary-codev names) n)))
791 (if (minusp cp) nil cp)))))
792
793 (defun unicode-name+ (code ntrie dict)
794 (declare (optimize (speed 3) (space 0) (safety 0)
795 (ext:inhibit-warnings 3))
796 (type codepoint code)
797 (type ntrie32 ntrie) (type dictionary dict))
798 (let ((n (qref32 ntrie code)))
799 (when (plusp n)
800 (let* ((codebook (dictionary-cdbk dict))
801 (namev (dictionary-namev dict))
802 (nextv (dictionary-nextv dict))
803 (keyv (dictionary-keyv dict))
804 (p (ash (aref namev n) -18))
805 (s (make-string p)))
806 (loop while (plusp n) do
807 (let* ((prev (logand (aref namev n) #x3FFFF))
808 (temp (aref nextv prev))
809 (base (logand temp #x3FFFF))
810 (str (aref codebook
811 (aref keyv (+ (ash temp -18) (- n base))))))
812 (declare (type simple-base-string str))
813 (setq p (- p (length str)) n prev)
814 (replace s str :start1 p)))
815 s))))
816
817 (defun unicode-name (code)
818 (cond ((cjk-ideograph-p code)
819 (format nil "CJK UNIFIED IDEOGRAPH-~4,'0X" code))
820 ((hangul-syllable-p code) ; Hangul Syllable
821 (apply #'concatenate 'string "HANGUL SYLLABLE "
822 (loop for ch across (unicode-decomp code)
823 as code = (char-code ch)
824 collect (cond ((<= #x1100 code #x1112)
825 (aref +hangul-choseong+ (- code #x1100)))
826 ((<= #x1161 code #x1175)
827 (aref +hangul-jungseong+ (- code #x1161)))
828 ((<= #x11A8 code #x11C2)
829 (aref +hangul-jongseong+ (- code #x11A8)))))))
830 (t
831 (unless (unidata-name+ *unicode-data*) (load-names))
832 (unless (unidata-name *unicode-data*) (load-name))
833 (unicode-name+ code (unidata-name *unicode-data*)
834 (unidata-name+ *unicode-data*)))))
835
836 (defun unicode-1.0-name (code)
837 (unless (unidata-name1+ *unicode-data*) (load-1.0-names))
838 (unless (unidata-name1 *unicode-data*) (load-1.0-name))
839 (unicode-name+ code (unidata-name1 *unicode-data*)
840 (unidata-name1+ *unicode-data*)))
841
842 (declaim (inline unicode-category))
843 (defun unicode-category (code)
844 (declare (type codepoint code))
845 (unless (unidata-category *unicode-data*) (load-categories))
846 (qref8 (the ntrie8 (unidata-category *unicode-data*)) code))
847
848 (defun unicode-category-string (code)
849 (let ((n (unicode-category code))
850 (s (make-string 2)))
851 (setf (schar s 0) (schar "CZMPNLS?????????" (ldb (byte 4 4) n))
852 (schar s 1) (schar "nsifepkcdmulto??" (ldb (byte 4 0) n)))
853 s))
854
855 (declaim (inline unicode-assigned-codepoint-p))
856 (defun unicode-assigned-codepoint-p (code)
857 (not (zerop (unicode-category code))))
858
859 ;; Look at unicode-category-string to see how we get these numbers.
860 ;; +unicode-category-graphic+ is the first graphic character
861 ;; +unicode-category-letter+ is the first letter
862 ;; +unicode-category-upper+, +unicode-category-lower+, +unicode-category-title+
863 ;; are uppercase, lowercase, and titlecase letters respectively.
864 (defconstant +unicode-category-graphic+ #x30)
865 (defconstant +unicode-category-letter+ #x50)
866 (defconstant +unicode-category-upper+ #x5a)
867 (defconstant +unicode-category-lower+ #x5b)
868 (defconstant +unicode-category-title+ #x5c)
869 (defconstant +unicode-category-other+ #x5d)
870
871 (defun unicode-upper (code)
872 (declare (optimize (speed 3) (space 0) (safety 0))
873 (type codepoint code))
874 (unless (unidata-scase *unicode-data*) (load-scase))
875 (let* ((scase (unidata-scase *unicode-data*))
876 (n (logand (qref32 scase code) #xFF)))
877 (if (zerop n)
878 code
879 (let* ((m (aref (scase-svec scase) (logand n #x7F))))
880 (if (logbitp 7 n) (+ code m) (- code m))))))
881
882 (defun unicode-lower (code)
883 (declare (optimize (speed 3) (space 0) (safety 0))
884 (type codepoint code))
885 (unless (unidata-scase *unicode-data*) (load-scase))
886 (let* ((scase (unidata-scase *unicode-data*))
887 (n (logand (ash (qref32 scase code) -8) #xFF)))
888 (if (zerop n)
889 code
890 (let ((m (aref (scase-svec scase) (logand n #x7F))))
891 (if (logbitp 7 n) (+ code m) (- code m))))))
892
893 (defun unicode-title (code)
894 (declare (optimize (speed 3) (space 0) (safety 0))
895 (type codepoint code))
896 (unless (unidata-scase *unicode-data*) (load-scase))
897 (let* ((scase (unidata-scase *unicode-data*))
898 (n (logand (ash (qref32 scase code) -16) #xFF)))
899 (if (zerop n)
900 code
901 (let ((m (aref (scase-svec scase) (logand n #x7F))))
902 (if (logbitp 7 n) (+ code m) (- code m))))))
903
904 (defun unicode-num1 (code)
905 (declare (type codepoint code))
906 (unless (unidata-numeric *unicode-data*) (load-numerics))
907 (let ((n (qref32 (unidata-numeric *unicode-data*) code)))
908 (if (logbitp 25 n) (logand (ash n -3) #xF) nil)))
909
910 (defun unicode-num2 (code)
911 (declare (type codepoint code))
912 (unless (unidata-numeric *unicode-data*) (load-numerics))
913 (let ((n (qref32 (unidata-numeric *unicode-data*) code)))
914 (if (logbitp 24 n) (logand (ash n -3) #xF) nil)))
915
916 (defun unicode-num3 (code)
917 (declare (type codepoint code))
918 (unless (unidata-numeric *unicode-data*) (load-numerics))
919 (let ((n (qref32 (unidata-numeric *unicode-data*) code)))
920 (if (logbitp 23 n)
921 (let ((num (/ (logand (ash n -3) #x1FFFF) (1+ (logand n 7)))))
922 (if (logbitp 20 n) (- num) num))
923 nil)))
924
925 (defun unicode-decomp (code &optional (compatibility t))
926 (declare (optimize (speed 3) (space 0) (safety 0))
927 (type codepoint code))
928 (if (hangul-syllable-p code)
929 ;; Hangul syllables. (See
930 ;; http://www.unicode.org/reports/tr15/#Hangul for the
931 ;; algorithm.)
932 (multiple-value-bind (q1 r1)
933 (truncate (- code #xAC00) 588)
934 (declare (type (integer 0 18) q1) (type (integer 0 587) r1))
935 (multiple-value-bind (q2 r2)
936 (truncate r1 28)
937 (declare (type (integer 0 20) q2) (type (integer 0 27) r2))
938 (let ((decomp (make-string (if (zerop r2) 2 3))))
939 (setf (schar decomp 0) (code-char (+ #x1100 q1))
940 (schar decomp 1) (code-char (+ #x1161 q2)))
941 (unless (zerop r2)
942 (setf (schar decomp 2) (code-char (+ #x11A7 r2))))
943 decomp)))
944 (progn
945 (unless (unidata-decomp *unicode-data*) (load-decomp))
946 (let* ((decomp (unidata-decomp *unicode-data*))
947 (n (qref32 decomp code))
948 (type (ldb (byte 5 27) n)))
949 (if (= n 0)
950 nil
951 (if (or compatibility (zerop type))
952 (let ((off (logand n #xFFFF))
953 (len (ldb (byte 6 16) n)))
954 (values (subseq (decomp-tabl decomp) off (+ off len))
955 type))
956 nil))))))
957
958 (declaim (ftype (function (codepoint) (unsigned-byte 8))
959 unicode-combining-class))
960 (defun unicode-combining-class (code)
961 (declare (optimize (speed 3) (space 0) (safety 0))
962 (type codepoint code))
963 (unless (unidata-combining *unicode-data*) (load-combining))
964 (the (unsigned-byte 8) (qref8 (unidata-combining *unicode-data*) code)))
965
966 (defun unicode-bidi-class (code)
967 (declare (optimize (speed 3) (space 0) (safety 0))
968 (type codepoint code))
969 (unless (unidata-bidi *unicode-data*) (load-bidi))
970 (logand (qref16 (unidata-bidi *unicode-data*) code) #x1F))
971
972 (defun unicode-bidi-class-string (code)
973 (aref +bidi-class+ (unicode-bidi-class code)))
974
975 (defun unicode-bidi-mirror-p (code)
976 (declare (optimize (speed 3) (space 0) (safety 0))
977 (type codepoint code))
978 (unless (unidata-bidi *unicode-data*) (load-bidi))
979 (logbitp 5 (qref16 (unidata-bidi *unicode-data*) code)))
980
981 (defun unicode-mirror-codepoint (code)
982 (declare (optimize (speed 3) (space 0) (safety 0))
983 (type codepoint code))
984 (unless (unidata-bidi *unicode-data*) (load-bidi))
985 (let* ((d (unidata-bidi *unicode-data*))
986 (x (ash (qref16 d code) -6))
987 (i (logand x #x0F))
988 (n (if (logbitp 5 x) (aref (bidi-tabl d) i) i)))
989 (cond ((= x 0) nil)
990 ((logbitp 4 x) (- code n))
991 (t (+ code n)))))
992
993 (declaim (inline unicode-nfc-qc unicode-nfkc-qc
994 unicode-nfd-qc unicode-nfkd-qc))
995
996 (defun unicode-nfc-qc (code)
997 (declare (optimize (speed 3) (space 0) (safety 0))
998 (type codepoint code))
999 (unless (unidata-qc-nfc *unicode-data*) (load-normalization-qc))
1000 (ecase (qref2 (unidata-qc-nfc *unicode-data*) code)
1001 (0 :Y) (1 :M) (2 :N)))
1002
1003 (defun unicode-nfkc-qc (code)
1004 (declare (optimize (speed 3) (space 0) (safety 0))
1005 (type codepoint code))
1006 (unless (unidata-qc-nfkc *unicode-data*) (load-normalization-qc))
1007 (ecase (qref2 (unidata-qc-nfkc *unicode-data*) code)
1008 (0 :Y) (1 :M) (2 :N)))
1009
1010 (defun unicode-nfd-qc (code)
1011 (declare (optimize (speed 3) (space 0) (safety 0))
1012 (type codepoint code))
1013 (unless (unidata-qc-nfd *unicode-data*) (load-normalization-qc))
1014 (ecase (qref1 (unidata-qc-nfd *unicode-data*) code)
1015 (0 :Y) (1 :N)))
1016
1017 (defun unicode-nfkd-qc (code)
1018 (declare (optimize (speed 3) (space 0) (safety 0))
1019 (type codepoint code))
1020 (unless (unidata-qc-nfkd *unicode-data*) (load-normalization-qc))
1021 (ecase (qref1 (unidata-qc-nfkd *unicode-data*) code)
1022 (0 :Y) (1 :N)))
1023
1024 (defun unicode-composition-exclusions ()
1025 (unless (unidata-comp-exclusions *unicode-data*)
1026 (load-composition-exclusions))
1027 (unidata-comp-exclusions *unicode-data*))
1028
1029 (defun %unicode-full-case (code data default)
1030 (let* ((n (qref32 data code)))
1031 (if (= n 0)
1032 (multiple-value-bind (hi lo)
1033 (surrogates (funcall default code))
1034 (let ((s (make-string (if lo 2 1))))
1035 (setf (schar s 0) hi)
1036 (when lo
1037 (setf (schar s 1) lo))
1038 s))
1039 (let ((off (logand n #xffff))
1040 (len (ldb (byte 6 16) n)))
1041 (subseq (full-case-tabl data) off (+ off len))))))
1042
1043 (defun unicode-full-case-lower (code)
1044 (unless (unidata-full-case-lower *unicode-data*)
1045 (load-full-case-lower))
1046 (%unicode-full-case code (unidata-full-case-lower *unicode-data*) #'unicode-lower))
1047
1048 (defun unicode-full-case-title (code)
1049 (unless (unidata-full-case-title *unicode-data*)
1050 (load-full-case-title))
1051 (%unicode-full-case code (unidata-full-case-title *unicode-data*) #'unicode-title))
1052
1053 (defun unicode-full-case-upper (code)
1054 (unless (unidata-full-case-upper *unicode-data*)
1055 (load-full-case-upper))
1056 (%unicode-full-case code (unidata-full-case-upper *unicode-data*) #'unicode-upper))
1057
1058 (defun unicode-case-fold-simple (code)
1059 (unless (unidata-case-fold-simple *unicode-data*)
1060 (load-case-fold-simple))
1061 (let* ((data (unidata-case-fold-simple *unicode-data*))
1062 (n (qref32 data code)))
1063 (if (= n 0)
1064 code
1065 n)))
1066
1067 (defun unicode-case-fold-full (code)
1068 (unless (unidata-case-fold-full *unicode-data*)
1069 (load-case-fold-full))
1070 (let* ((data (unidata-case-fold-full *unicode-data*))
1071 (n (qref32 data code)))
1072 (if (= n 0)
1073 (string (code-char (unicode-case-fold-simple code)))
1074 (let ((off (logand n #xffff))
1075 (len (ldb (byte 6 16) n)))
1076 (subseq (case-fold-full-tabl data) off (+ off len))))))
1077
1078
1079 (declaim (inline composition-table-key))
1080 (defun composition-table-key (c1 c2)
1081 (declare (type codepoint c1 c2))
1082 ;; Compute the key for the composition table from two code points.
1083 ;; Note that each codepoint is 21 bits long. We just cat the
1084 ;; codepoints together to create the key. Based on tests with
1085 ;; Unicode 5.2.0 this is good enough because the low 29 bits are
1086 ;; unique, so each key will be in its own bucket.
1087 (logior (ash c1 21) c2))
1088
1089 ;; Build the composition pair table.
1090 (defun build-composition-table ()
1091 (let ((table (make-hash-table)))
1092 (dotimes (cp #x10ffff)
1093 ;; Ignore Hangul characters, which can be done algorithmically.
1094 (unless (<= #xac00 cp #xd7a3)
1095 (let ((decomp (unicode-decomp cp nil)))
1096 ;; Also ignore any characters whose canonical decomposition
1097 ;; consists of a sequence of characters, the first of which
1098 ;; has a non-zero combining class, or if the decomposition
1099 ;; consists of a single codepoint.
1100 (when (and decomp
1101 (zerop (unicode-combining-class (codepoint decomp 0))))
1102 (multiple-value-bind (c1 widep)
1103 (codepoint decomp 0)
1104 (setf widep (if widep 2 1))
1105 (when (> (length decomp) widep)
1106 (let ((c2 (codepoint decomp widep)))
1107 (setf (gethash (composition-table-key c1 c2) table) cp))))))))
1108 ;; Remove any in the exclusion list
1109 (loop for cp across (unicode-composition-exclusions)
1110 do
1111 (let ((decomp (unicode-decomp cp nil)))
1112 (when decomp
1113 (multiple-value-bind (c1 widep)
1114 (codepoint decomp 0)
1115 (when (> (length decomp) (if widep 2 1))
1116 (let ((c2 (codepoint decomp (if widep 2 1))))
1117 (remhash (composition-table-key c1 c2) table)))))))
1118 (values table)))
1119
1120 (defvar *composition-pair-table* nil)
1121
1122 ;; Based on the sample code from
1123 ;; http://www.unicode.org/reports/tr15/#Hangul
1124 (declaim (inline compose-hangul))
1125 (defun compose-hangul (c1 c2)
1126 (declare (type codepoint c1 c2)
1127 (optimize (speed 3)))
1128 (let ((index-l (- c1 #x1100)))
1129 (cond ((and (<= 0 index-l)
1130 (< index-l 19))
1131 (let ((index-v (- c2 #x1161)))
1132 (when (and (<= 0 index-v)
1133 (< index-v 21))
1134 (+ #xac00 (* 28 (+ (* index-l 21) index-v))))))
1135 (t
1136 (let ((index-s (- c1 #xac00)))
1137 (when (and (<= 0 index-s)
1138 (< index-s 11172)
1139 (zerop (rem index-s 28)))
1140 (let ((index-t (- c2 #x11a7)))
1141 (when (and (plusp index-t)
1142 (< index-t 28))
1143 (+ c1 index-t)))))))))
1144
1145 (defun unicode-pairwise-composition (c1 c2)
1146 (declare (type codepoint c1 c2)
1147 (optimize (speed 3)))
1148 (unless *composition-pair-table*
1149 (setf *composition-pair-table* (build-composition-table)))
1150 (cond ((compose-hangul c1 c2))
1151 (t
1152 (gethash (composition-table-key c1 c2) *composition-pair-table* nil))))
1153
1154 (defun unicode-word-break-code (code)
1155 (unless (unidata-word-break *unicode-data*)
1156 (load-word-break))
1157 (let* ((data (unidata-word-break *unicode-data*))
1158 (n (qref4 data code)))
1159 n))
1160
1161 (defun unicode-word-break (code)
1162 ;; The order of the array here MUST match the order used in
1163 ;; pack-word-break in tools/build-unidata.lisp!
1164 (aref #(:other :cr :lf :newline :extend :format
1165 :katakana :aletter :midnumlet :midletter :midnum
1166 :numeric :extendnumlet)
1167 (unicode-word-break-code code)))
1168
1169 ;; Support for character name completion for slime.
1170 ;;
1171 ;; Code written by Paul Foley, with some modifications by Raymond Toy.
1172 ;;
1173
1174 ;; These hold dictionaries for the Hangul syllables and the CJK
1175 ;; unified ideographs. Note that these could be stored in
1176 ;; unidata.bin, but that adds almost a megabyte to the size of
1177 ;; unidata.bin. That seems way to much bloat for something that is
1178 ;; probably not used that much. However, this incurs a runtime cost
1179 ;; the first time it needs to be accessed. On a 450 MHz sparc, it
1180 ;; takes 55 sec for the cjk dictionary and 9 sec for the Hangul
1181 ;; dictionary. A bit long but not too bad. On a 2 GHz mac mini, it
1182 ;; takes 5 sec and .8 sec, respectively. This seems reasonable,
1183 ;; especially since the intent is for character completion, which
1184 ;; doesn't have to be too fast.
1185 (defvar *hangul-syllable-dictionary* nil
1186 "Dictionary of Hangul syllables")
1187 (defvar *cjk-unified-ideograph-dictionary* nil
1188 "Dictionary of CJK Unified ideographs")
1189
1190 ;; Convert the string into the form we want for character names.
1191 ;; Basically the Unicode name has spaces replaced by underscores, and
1192 ;; the result is capitalized.
1193 (declaim (inline %str %strx))
1194 (defun %str (x)
1195 (nsubstitute #\_ #\Space (string-capitalize x)))
1196
1197 (defun %strx (x)
1198 (%str (car x)))
1199
1200 (declaim (inline %match))
1201 #+(or)
1202 (defun %match (part prefix posn)
1203 (and (>= (length part) (- (length prefix) posn))
1204 (string= part prefix :start2 posn :end1 (- (length prefix) posn))))
1205
1206 #+(or)
1207 (defun %match (part prefix posn)
1208 (let ((s1 (search part prefix :start2 posn))
1209 (s2 (search prefix part :start1 posn)))
1210 (or (and s1 (= s1 posn))
1211 (and s2 (zerop s2)))))
1212
1213 ;; Test if the string PART matches the string PREFIX starting from
1214 ;; position POSN. Basically test that the initial parts of the
1215 ;; strings match each other exactly. For if the prefix is "BO", then
1216 ;; both "B" and "BOX" should match. (This is needed to get the
1217 ;; completion of "cjk_radical_bo" to match "cjk_radical_box" as well
1218 ;; as "cjk_radical_bone" and others because at one point in the
1219 ;; algorithm the part is "B", which we do want to match "BO" so that
1220 ;; we can get the possible completions BONE" and "BOLT OF CLOTH".
1221 (defun %match (part prefix posn)
1222 (let ((len (min (length part)
1223 (- (length prefix) posn))))
1224 (string= part prefix :end1 len :start2 posn :end2 (+ posn len))))
1225
1226
1227 (defun unicode-complete-name (prefix
1228 &optional (dict (unidata-name+
1229 *unicode-data*)))
1230 "Try to complete the string Prefix using the dictionary in Dict.
1231 Three values are returned: (1) The best match of prefix, (2) a list
1232 of possible completions, (3) a boolean indicating whether the best
1233 match is a complete unicode name. "
1234
1235 (unless dict
1236 ;; Load the names dictionary, if needed.
1237 (unless (unidata-name+ *unicode-data*)
1238 (load-names))
1239 (setf dict (unidata-name+ *unicode-data*)))
1240 (let ((prefix (nsubstitute #\Space #\_ (string-upcase prefix)))
1241 completep)
1242 (multiple-value-bind (n p)
1243 (search-dictionary prefix dict)
1244 (when n
1245 (setq completep (> (aref (dictionary-codev dict) n) -1)))
1246 #+(or debug-uc)
1247 (progn
1248 (format t "n,p,complete = ~S ~S ~S~%" n p completep)
1249 (when n (format t "match = ~S~%" (subseq prefix 0 p))))
1250 (cond ((not p)
1251 (values (%str prefix) nil nil))
1252 ((= p (length prefix))
1253 ;; The prefix is an exact match to something in the code
1254 ;; book. Try to find possible completions of this
1255 ;; prefix.
1256 (let ((x (node-next n dict))
1257 (suffix ""))
1258 #+(or debug-uc)
1259 (format t "init x = ~S~%" x)
1260 (when (= (length x) 1)
1261 ;; There was only one possible extension. Try to
1262 ;; extend from there.
1263 #+(or debug-uc)
1264 (format t "extending~%")
1265 (setq suffix (caar x)
1266 n (cdar x)
1267 x (node-next (cdar x) dict)))
1268 #+(or debug-uc)
1269 (progn
1270 (format t "x = ~S~%" x)
1271 (format t "suffix = ~S~%" suffix))
1272 (when (<= (length x) 1)
1273 (setq prefix (concatenate 'string prefix suffix))
1274 (setf suffix ""))
1275 (values (%str prefix)
1276 (sort (mapcar #'(lambda (e)
1277 (%str (concatenate 'string suffix (car e))))
1278 x)
1279 #'string<)
1280 (or (> (aref (dictionary-codev dict) n) -1)
1281 completep))))
1282 (t
1283 ;; The prefix was not an exact match of some entry in the
1284 ;; codebook. Try to find some completions from there.
1285 (let* ((nodex (node-next n dict))
1286 (x (remove-if-not (lambda (x)
1287 (%match (car x) prefix p))
1288 nodex)))
1289 #+(or debug-uc)
1290 (progn
1291 (format t "nodex = ~S~%" nodex)
1292 (format t "x = ~S~%" x))
1293 (setq prefix (subseq prefix 0 p))
1294 (cond ((= (length x) 1)
1295 ;; Only one possible completion. Try to extend
1296 ;; the completions from there.
1297 (setq prefix (concatenate 'string prefix (caar x))
1298 n (cdar x)
1299 x (node-next (cdar x) dict))
1300 (values (%str prefix)
1301 (sort (mapcar #'%strx x) #'string<)
1302 (> (aref (dictionary-codev dict) n) -1)))
1303 (t
1304 ;; There's more than one possible completion.
1305 ;; Try to extend each of those completions one
1306 ;; more step, but we still want to keep the
1307 ;; original completions.
1308 (let* ((p (append (mapcar #'car x)
1309 (mapcan #'(lambda (ex)
1310 (let ((next (node-next (cdr ex) dict)))
1311 (if next
1312 (mapcar #'(lambda (n)
1313 (concatenate 'string (car ex) (car n)))
1314 (node-next (cdr ex) dict))
1315 (list (car ex)))))
1316 x)))
1317 (q (%mip p)))
1318 (setq prefix (concatenate 'string prefix q))
1319
1320 (do ((tmp p (cdr tmp)))
1321 ((endp tmp))
1322 (setf (car tmp) (subseq (car tmp) (length q))))
1323 (values (%str prefix)
1324 (sort (mapcar #'%str p) #'string<)
1325 nil))))))))))
1326
1327 ;; Like unicode-complete-name, but we also try to handle the names
1328 ;; that can be computed algorithmically like the Hangul syllables and
1329 ;; the CJK Unified Ideographs.
1330 (defun unicode-complete (prefix
1331 &optional (dict (unidata-name+ *unicode-data*)))
1332 "Search the dictionary in Dict and return a list of the possible
1333 completions starting with Prefix. If there is no match, NIL is
1334 returned."
1335 (let (names)
1336 (multiple-value-bind (prefix-match next completep)
1337 (unicode-complete-name prefix dict)
1338 (loop for x in next
1339 do (push (concatenate 'string prefix-match x) names))
1340 (when completep
1341 (push prefix-match names))
1342 (flet ((han-or-cjk-completion (prefix-match prefix dictionary)
1343 (let* ((prefix-tail (subseq prefix-match
1344 (min (length prefix)
1345 (length prefix-match))))
1346 (full-prefix (concatenate 'string prefix prefix-tail)))
1347 (multiple-value-bind (m suffixes)
1348 (unicode-complete-name prefix-tail dictionary)
1349 (declare (ignore m))
1350 (if suffixes
1351 (loop for n in suffixes
1352 do (push (concatenate 'string full-prefix n) names))
1353 ;; No suffixes. So either the prefix is the
1354 ;; only possible completion or it's not valid.
1355 ;; Figure that out. If it's valid, add it to
1356 ;; names.
1357 (when (search-dictionary (string-upcase prefix-tail) dictionary)
1358 (push prefix-match names)))))))
1359 ;; Match prefix for Hangul syllables or CJK unified ideographs.
1360 (cond ((char= (char prefix-match 0) #\H)
1361 ;; Add "Hangul_Syllable_" as possible completion for
1362 ;; anything beginning with "H".
1363 (push "Hangul_Syllable_" names)
1364 (when (<= (length names) 1)
1365 ;; Hangul_Syllable is the only match, so let's extend it.
1366 (unless *hangul-syllable-dictionary*
1367 (initialize-reverse-hangul-tables)
1368 (build-hangul-syllable-dictionary))
1369 (han-or-cjk-completion prefix-match "Hangul_Syllable_"
1370 *hangul-syllable-dictionary*)))
1371 ((char= (char prefix-match 0) #\C)
1372 ;; Add "Cjk_Unified_Ideograph-" as possible completion
1373 ;; for anything beginning with "C".
1374 (push "Cjk_Unified_Ideograph-" names)
1375 (when (<= (length names) 1)
1376 (unless *cjk-unified-ideograph-dictionary*
1377 (build-cjk-unified-ideograph-dictionary))
1378 (han-or-cjk-completion prefix-match "Cjk_Unified_Ideograph-"
1379 *cjk-unified-ideograph-dictionary*)
1380 ))))
1381 (setf names (mapcar #'string-capitalize names))
1382 ;;(format t "Final names = ~S~%" names)
1383 names)))
1384
1385 ;; Find the longest initial substring of the STRINGS.
1386 (defun %mip (strings)
1387 (let* ((first (first strings))
1388 (posn (length first)))
1389 (dolist (string (rest strings))
1390 (setq posn (or (mismatch first string :end1 posn) posn)))
1391 (subseq first 0 posn)))
1392
1393 (defun node-next (i &optional (dict (unidata-name+ *unicode-data*)))
1394 (let* ((j (aref (dictionary-nextv dict) i))
1395 (x (ldb (byte 14 18) j))
1396 (y (ldb (byte 18 0) j)))
1397 (loop for i from 0 below (aref (dictionary-keyl dict) x)
1398 collect (close-node (cons (aref (dictionary-cdbk dict)
1399 (aref (dictionary-keyv dict) (+ x i)))
1400 (+ y i))
1401 dict))))
1402
1403 (defun close-node (i &optional (dict (unidata-name+ *unicode-data*)))
1404 (loop
1405 (if (> (aref (dictionary-codev dict) (cdr i)) -1)
1406 (return i)
1407 (let* ((j (aref (dictionary-nextv dict) (cdr i)))
1408 (x (ldb (byte 14 18) j))
1409 (y (ldb (byte 18 0) j)))
1410 (if (> (aref (dictionary-keyl dict) x) 1)
1411 (return i)
1412 (let ((k (aref (dictionary-cdbk dict)
1413 (aref (dictionary-keyv dict) x))))
1414 (setf (car i) (concatenate 'string (car i) k)
1415 (cdr i) y)))))))
1416
1417 (defun build-hangul-syllable-dictionary ()
1418 "Build the dictionary for Hangul syllables"
1419 (format t "~&Building Hangul Syllable dictionary. Please wait...~%")
1420 (force-output)
1421 (initialize-reverse-hangul-tables)
1422 (let ((hangul-codebook
1423 ;; For our codebook, combine all the choseong, jungseong, and
1424 ;; jonseong syllables, but removing empty strings (there's at
1425 ;; least one). Then sort these according to length. This
1426 ;; ensures that if A is an initial substring of B, then B
1427 ;; must come before A (or A will never be used). (See
1428 ;; tools/build-unidata.lisp, *codebook*.)
1429 (sort (map 'vector #'car
1430 (delete ""
1431 (concatenate 'vector
1432 *reverse-hangul-choseong*
1433 *reverse-hangul-jungseong*
1434 *reverse-hangul-jongseong*)
1435 :test #'string= :key #'car))
1436 #'> :key #'length))
1437 (names
1438 (loop for codepoint from 0 below codepoint-limit
1439 when (hangul-syllable-p codepoint)
1440 collect (cons (subseq (format nil "~A"
1441 (string-upcase (char-name (code-char codepoint))))
1442 16)
1443 codepoint))))
1444
1445 (setf *hangul-syllable-dictionary*
1446 (build-dictionary hangul-codebook names))
1447 (format t "~&Done.~%")
1448 (force-output)
1449 (values)))
1450
1451 (defun build-cjk-unified-ideograph-dictionary ()
1452 "Build the dictionary for CJK Unified Ideographs"
1453 (format t "~&Building CJK Unified Ideographs dictionary. Please wait...~%")
1454 (force-output)
1455 (let ((codebook (coerce (loop for k from 0 to 15
1456 collect (format nil "~X" k))
1457 'vector))
1458 (names (loop for codepoint from 0 below codepoint-limit
1459 when (cjk-ideograph-p codepoint)
1460 collect (cons (format nil "~X" codepoint)
1461 codepoint))))
1462 (setf *cjk-unified-ideograph-dictionary*
1463 (build-dictionary codebook names))
1464 (format t "~&Done.~%")
1465 (force-output)
1466 (values)))
1467
1468 ;; The definitions of BUILD-DICTIONARY, NAME-LOOKUP, and ENCODE-NAME
1469 ;; were taken from build-unidata.lisp.
1470 (defun build-dictionary (codebook entries)
1471 (let ((khash (make-hash-table :test 'equalp))
1472 (thash (make-hash-table))
1473 (top 0)
1474 (keyl (make-array 0 :element-type '(unsigned-byte 8)))
1475 (keyv (make-array 0 :element-type '(unsigned-byte 8)))
1476 vec1 vec2 vec3)
1477 (labels ((add-to-trie (trie name codepoint)
1478 (loop for ch across (encode-name name codebook) do
1479 (let ((sub (cdr (assoc ch (rest trie)))))
1480 (if sub
1481 (setq trie sub)
1482 (setq trie (cdar (push (cons ch (cons nil nil))
1483 (rest trie)))))))
1484 (unless (or (null (car trie)) (= (car trie) codepoint))
1485 (error "Codepoints #x~4,'0X and #x~4,'0X are both named ~S."
1486 (car trie) codepoint name))
1487 (setf (car trie) codepoint))
1488 (key (trie)
1489 (map '(simple-array (unsigned-byte 8) (*)) #'car (rest trie)))
1490 (pass1 (trie depth)
1491 (setf (rest trie) (sort (rest trie) #'< :key #'car))
1492 (setf (gethash trie thash)
1493 (list depth (1- (incf top)) (length (rest trie))))
1494 (setf (gethash (key trie) khash) t)
1495 (mapc (lambda (x) (pass1 (cdr x) (1+ depth))) (rest trie)))
1496 (pass2 (trie)
1497 (let* ((x (gethash (gethash trie thash) thash))
1498 (n (car x)))
1499 (setf (aref vec1 n) (if (first trie) (first trie) -1)
1500 (aref vec2 n) (logior (ash (gethash (key trie) khash)
1501 18)
1502 (cdr x))))
1503 (mapc (lambda (x) (pass2 (cdr x))) (rest trie))))
1504 (format t "~& Initializing...~%")
1505 (let ((trie (cons nil nil)))
1506 (loop for (name . code) in entries do (add-to-trie trie name code))
1507 (format t "~& Pass 1...~%")
1508 (pass1 trie 0)
1509 (format t "~& Sorting...~%")
1510 (dolist (key (sort (loop for k being the hash-keys of khash
1511 collect k)
1512 #'> :key #'length))
1513 (let ((pos -1))
1514 (loop
1515 (setq pos (search key keyv :start2 (1+ pos)))
1516 (when (and pos (zerop (aref keyl pos)))
1517 (setf (aref keyl pos) (length key)))
1518 (when (and pos (= (aref keyl pos) (length key)))
1519 (setf (gethash key khash) pos)
1520 (return))
1521 (when (null pos)
1522 (setf (gethash key khash) (length keyv))
1523 (setf keyl (adjust-array keyl (+ (length keyv) (length key))))
1524 (setf (aref keyl (length keyv)) (length key))
1525 (setf keyv (concatenate '(simple-array (unsigned-byte 8) (*))
1526 keyv key))
1527 (return)))))
1528 (loop with off = 1
1529 for key in (sort (loop for x being the hash-values of thash
1530 collect x)
1531 (lambda (a b) (if (= (first a) (first b))
1532 (< (second a) (second b))
1533 (< (first a) (first b)))))
1534 as i upfrom 0
1535 do (setf (gethash key thash) (cons i off) off (+ off (third key))))
1536 (setq vec1 (make-array top :element-type '(signed-byte 32))
1537 vec2 (make-array top :element-type '(unsigned-byte 32))
1538 vec3 (make-array top :element-type '(unsigned-byte 32)))
1539 (format t "~& Pass 2...~%")
1540 (pass2 trie)
1541 (format t "~& Finalizing~%")
1542 (dotimes (i top)
1543 (let ((xxx (aref vec2 i)))
1544 (dotimes (j (aref keyl (ash xxx -18)))
1545 (setf (aref vec3 (+ (logand xxx #x3FFFF) j)) i))))
1546 (loop for (name . code) in entries do
1547 (let ((n (name-lookup name codebook keyv keyl vec2)))
1548 (unless n (error "Codepoint not found for ~S." name))
1549 (setf (ldb (byte 14 18) (aref vec3 n)) (length name))))))
1550 (make-dictionary :cdbk codebook
1551 :keyv keyv :keyl keyl
1552 :codev vec1 :nextv vec2 :namev vec3)))
1553
1554 (defun name-lookup (name codebook keyv keyl nextv)
1555 (let* ((current 0)
1556 (posn 0))
1557 (loop
1558 (let ((keyp (ash (aref nextv current) -18)))
1559 (dotimes (i (aref keyl keyp)
1560 (return-from name-lookup nil)) ; shouldn't happen
1561 (let* ((str (aref codebook (aref keyv (+ keyp i))))
1562 (len (length str)))
1563 (when (and (>= (length name) (+ posn len))
1564 (string= name str :start1 posn :end1 (+ posn len)))
1565 (setq current
1566 (+ (logand (aref nextv current) #x3FFFF) i))
1567 (if (= (incf posn len) (length name))
1568 (return-from name-lookup current)
1569 (return)))))))))
1570
1571 (defun encode-name (string codebook)
1572 (let ((p 0)
1573 (res '()))
1574 (loop while (< p (length string)) do
1575 (dotimes (i (length codebook)
1576 (error "\"~C\" is not in the codebook." (char string p)))
1577 (let ((code (aref codebook i)))
1578 (when (and (<= (length code) (- (length string) p))
1579 (string= string code :start1 p :end1 (+ p (length code))))
1580 (push i res)
1581 (incf p (length code))
1582 (return)))))
1583 (nreverse (coerce res 'vector))))
1584
1585 ;; This is primarily intended for users who what to create a core
1586 ;; image that contains all of the unicode data. By doing this, the
1587 ;; resulting image no longer needs unidata.bin anymore. This is
1588 ;; useful for an executable image.
1589 (defun load-all-unicode-data ()
1590 "Load all unicode data and set *UNIDATA-PATH* to NIL.
1591 Normally, the unicode data is loaded as needed. This loads all of the
1592 data, which is useful for creating a core that no longer needs
1593 unidata.bin."
1594 (dolist (loader (reverse *defloaders*))
1595 (funcall loader))
1596 t)
1597
1598 ;; CHeck to see if all of the unicode data has been loaded.
1599 (defun unicode-data-loaded-p ()
1600 ;; FIXME: Would be nice to be able to do this automatically from the
1601 ;; structure without having to list every slot here.
1602 (and (unidata-range *unicode-data*)
1603 (unidata-name+ *unicode-data*)
1604 (unidata-name *unicode-data*)
1605 (unidata-category *unicode-data*)
1606 (unidata-scase *unicode-data*)
1607 (unidata-numeric *unicode-data*)
1608 (unidata-decomp *unicode-data*)
1609 (unidata-combining *unicode-data*)
1610 (unidata-bidi *unicode-data*)
1611 (unidata-name1+ *unicode-data*)
1612 (unidata-name1 *unicode-data*)
1613 (unidata-qc-nfd *unicode-data*)
1614 (unidata-qc-nfkd *unicode-data*)
1615 (unidata-qc-nfc *unicode-data*)
1616 (unidata-qc-nfkc *unicode-data*)
1617 (unidata-comp-exclusions *unicode-data*)
1618 (unidata-full-case-lower *unicode-data*)
1619 (unidata-full-case-title *unicode-data*)
1620 (unidata-full-case-upper *unicode-data*)
1621 (unidata-case-fold-simple *unicode-data*)
1622 (unidata-case-fold-full *unicode-data*)
1623 (unidata-word-break *unicode-data*)
1624 t))

  ViewVC Help
Powered by ViewVC 1.1.5