/[cmucl]/src/hemlock/spell-build.lisp
ViewVC logotype

Contents of /src/hemlock/spell-build.lisp

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.3 - (show annotations)
Mon Oct 31 04:50:12 1994 UTC (19 years, 5 months ago) by ram
Branch: MAIN
CVS Tags: sparc-tramp-assem-base, double-double-array-base, post-merge-intl-branch, release-19b-pre1, release-19b-pre2, merged-unicode-utf16-extfmt-2009-06-11, double-double-init-sparc-2, unicode-utf16-extfmt-2009-03-27, double-double-base, snapshot-2007-09, snapshot-2007-08, snapshot-2008-08, snapshot-2008-09, ppc_gencgc_snap_2006-01-06, sse2-packed-2008-11-12, snapshot-2008-05, snapshot-2008-06, snapshot-2008-07, snapshot-2007-05, snapshot-2008-01, snapshot-2008-02, snapshot-2008-03, intl-branch-working-2010-02-19-1000, snapshot-2006-11, snapshot-2006-10, double-double-init-sparc, snapshot-2006-12, unicode-string-buffer-impl-base, sse2-base, release-20b-pre1, release-20b-pre2, unicode-string-buffer-base, RELEASE_18d, sse2-packed-base, sparc-tramp-assem-2010-07-19, amd64-dd-start, snapshot-2003-10, snapshot-2004-10, release-18e-base, release-19f-pre1, snapshot-2008-12, snapshot-2008-11, intl-2-branch-base, snapshot-2004-08, snapshot-2004-09, remove_negative_zero_not_zero, snapshot-2007-01, snapshot-2007-02, snapshot-2004-05, snapshot-2004-06, snapshot-2004-07, release-19e, release-19d, GIT-CONVERSION, double-double-init-ppc, release-19c, dynamic-extent-base, unicode-utf16-sync-2008-12, LINKAGE_TABLE, release-19c-base, cross-sol-x86-merged, label-2009-03-16, release-19f-base, PRE_LINKAGE_TABLE, merge-sse2-packed, mod-arith-base, sparc_gencgc_merge, merge-with-19f, snapshot-2004-12, snapshot-2004-11, intl-branch-working-2010-02-11-1000, RELEASE_18a, RELEASE_18b, RELEASE_18c, unicode-snapshot-2009-05, unicode-snapshot-2009-06, amd64-merge-start, ppc_gencgc_snap_2005-12-17, double-double-init-%make-sparc, unicode-utf16-sync-2008-07, release-18e-pre2, unicode-utf16-sync-2008-09, unicode-utf16-extfmts-sync-2008-12, prm-before-macosx-merge-tag, cold-pcl-base, RELEASE_20b, snapshot-2008-04, snapshot-2003-11, snapshot-2005-07, unicode-utf16-sync-label-2009-03-16, RELEASE_19f, snapshot-2007-03, release-20a-base, cross-sol-x86-base, unicode-utf16-char-support-2009-03-26, unicode-utf16-char-support-2009-03-25, release-19a-base, unicode-utf16-extfmts-pre-sync-2008-11, snapshot-2008-10, sparc_gencgc, snapshot-2007-04, snapshot-2010-12, snapshot-2010-11, unicode-utf16-sync-2008-11, snapshot-2007-07, snapshot-2011-09, snapshot-2011-06, snapshot-2011-07, snapshot-2011-04, snapshot-2007-06, snapshot-2011-02, snapshot-2011-03, snapshot-2011-01, snapshot-2003-12, release-19a-pre1, release-19a-pre3, release-19a-pre2, pre-merge-intl-branch, release-19a, UNICODE-BASE, double-double-array-checkpoint, double-double-reader-checkpoint-1, release-19d-base, release-19e-pre1, double-double-irrat-end, release-19e-pre2, snapshot-2010-05, snapshot-2010-04, snapshot-2010-07, snapshot-2010-06, snapshot-2010-01, snapshot-2010-03, snapshot-2010-02, release-19d-pre2, release-19d-pre1, snapshot-2010-08, release-18e, double-double-init-checkpoint-1, double-double-reader-base, label-2009-03-25, snapshot-2005-03, release-19b-base, cross-sol-x86-2010-12-20, double-double-init-x86, sse2-checkpoint-2008-10-01, intl-branch-2010-03-18-1300, snapshot-2005-11, double-double-sparc-checkpoint-1, snapshot-2004-04, sse2-merge-with-2008-11, sse2-merge-with-2008-10, snapshot-2005-10, RELEASE_20a, snapshot-2005-12, release-20a-pre1, snapshot-2005-01, snapshot-2009-11, snapshot-2009-12, unicode-utf16-extfmt-2009-06-11, portable-clx-import-2009-06-16, unicode-utf16-string-support, release-19c-pre1, cross-sparc-branch-base, release-19e-base, intl-branch-base, double-double-irrat-start, snapshot-2005-06, snapshot-2005-05, snapshot-2005-04, ppc_gencgc_snap_2005-05-14, snapshot-2005-02, unicode-utf16-base, portable-clx-base, snapshot-2005-09, snapshot-2005-08, lisp-executable-base, snapshot-2009-08, snapshot-2007-12, snapshot-2007-10, snapshot-2007-11, snapshot-2009-02, snapshot-2009-01, snapshot-2009-07, snapshot-2009-05, snapshot-2009-04, snapshot-2006-02, snapshot-2006-03, release-18e-pre1, snapshot-2006-01, snapshot-2006-06, snapshot-2006-07, snapshot-2006-04, snapshot-2006-05, pre-telent-clx, snapshot-2006-08, snapshot-2006-09, HEAD
Branch point for: release-19b-branch, double-double-reader-branch, double-double-array-branch, mod-arith-branch, RELEASE-19F-BRANCH, portable-clx-branch, sparc_gencgc_branch, cross-sparc-branch, RELEASE-20B-BRANCH, RELENG_18, unicode-string-buffer-branch, sparc-tramp-assem-branch, dynamic-extent, UNICODE-BRANCH, release-19d-branch, ppc_gencgc_branch, sse2-packed-branch, lisp-executable, RELEASE-20A-BRANCH, amd64-dd-branch, double-double-branch, unicode-string-buffer-impl-branch, intl-branch, release-18e-branch, cold-pcl, unicode-utf16-branch, cross-sol-x86-branch, release-19e-branch, sse2-branch, release-19a-branch, release-19c-branch, intl-2-branch, unicode-utf16-extfmt-branch
Changes since 1.2: +1 -3 lines
Fix headed boilerplate.
1 ;;; -*- Log: hemlock.log; Package: Spell -*-
2 ;;;
3 ;;; **********************************************************************
4 ;;; This code was written as part of the CMU Common Lisp project at
5 ;;; Carnegie Mellon University, and has been placed in the public domain.
6 ;;;
7 (ext:file-comment
8 "$Header: /tiger/var/lib/cvsroots/cmucl/src/hemlock/spell-build.lisp,v 1.3 1994/10/31 04:50:12 ram Rel $")
9 ;;;
10 ;;; **********************************************************************
11 ;;;
12 ;;; Written by Bill Chiles
13 ;;; Designed by Bill Chiles and Rob Maclachlan
14 ;;;
15
16 ;;; This file contains code to build a new binary dictionary file from
17 ;;; text in system space. This code relies on implementation dependent
18 ;;; code from spell-rt.lisp. Also, it is expected that spell-corr.lisp
19 ;;; and spell-aug.lisp have been loaded. In order to compile this file,
20 ;;; you must first compile spell-rt, spell-corr.lisp, and spell-aug.lisp.
21
22 ;;; The text file must be in the following format:
23 ;;; entry1/flag1/flag2/flag3
24 ;;; entry2
25 ;;; entry3/flag1/flag2/flag3/flag4/flag5.
26 ;;; The flags are single letter indicators of legal suffixes for the entry;
27 ;;; the available flags and their correct use may be found at the beginning
28 ;;; of spell-corr.lisp in the Hemlock sources. There must be exactly one
29 ;;; entry per line, and each line must be flushleft.
30
31 ;;; The dictionary is built in system space as three distinct
32 ;;; blocks of memory: the dictionary which is a hash table whose elements
33 ;;; are one machine word or of type '(unsigned-byte 16); a descriptors
34 ;;; vector which is described below; and a string table. After all the
35 ;;; entries are read in from the text file, one large block of memory is
36 ;;; validated, and the three structures are moved into it. Then the file
37 ;;; is written. When the large block of memory is validated, enough
38 ;;; memory is allocated to write the three vector such that they are page
39 ;;; aligned. This is important for the speed it allows in growing the
40 ;;; "dictionary" when augmenting it from a user's text file (see
41 ;;; spell-aug.lisp).
42
43
44 (in-package "SPELL")
45
46
47
48 ;;;; Constants
49
50 ;;; This is an upper bound estimate of the number of stored entries in the
51 ;;; dictionary. It should not be more than 21,845 because the dictionary
52 ;;; is a vector of type '(unsigned-byte 16), and the descriptors' vector
53 ;;; for the entries uses three '(unsigned-byte 16) elements per descriptor
54 ;;; unit. See the beginning of Spell-Correct.Lisp.
55 ;;;
56 (eval-when (compile load eval)
57
58 (defconstant max-entry-count-estimate 15600)
59
60 (defconstant new-dictionary-size 20011)
61
62 (defconstant new-descriptors-size (1+ (* 3 max-entry-count-estimate)))
63
64 (defconstant max-string-table-length (* 10 max-entry-count-estimate))
65
66 ); eval-when
67
68
69 ;;;; Hashing
70
71 ;;; These hashing macros are different from the ones in Spell-Correct.Lisp
72 ;;; simply because we are using separate space and global specials/constants.
73 ;;; Of course, they should be identical, but it doesn't seem worth cluttering
74 ;;; up Spell-Correct with macro generating macros for this file.
75
76 (eval-when (compile eval)
77
78 (defmacro new-hash2-increment (hash)
79 `(- new-dictionary-size
80 2
81 (the fixnum (rem ,hash (- new-dictionary-size 2)))))
82
83 (defmacro new-hash2-loop (loc hash dictionary)
84 (let ((incr (gensym))
85 (loop-loc (gensym)))
86 `(let* ((,incr (new-hash2-increment ,hash))
87 (,loop-loc ,loc))
88 (declare (fixnum ,incr ,loop-loc))
89 (loop (setf ,loop-loc
90 (rem (+ ,loop-loc ,incr) new-dictionary-size))
91 (when (zerop (the fixnum (aref ,dictionary ,loop-loc)))
92 (return ,loop-loc))
93 (when (= ,loop-loc ,loc) (return nil))))))
94
95 (defmacro new-hash-entry (entry entry-len dictionary)
96 (let ((hash (gensym))
97 (loc (gensym)))
98 `(let* ((,hash (string-hash ,entry ,entry-len))
99 (,loc (rem ,hash new-dictionary-size)))
100 (declare (fixnum ,loc))
101 (cond ((not (zerop (the fixnum (aref ,dictionary ,loc))))
102 (incf *collision-count*)
103 (new-hash2-loop ,loc ,hash ,dictionary))
104 (t ,loc)))))
105
106 ) ;eval-when
107
108
109
110 ;;;; Build-Dictionary
111
112 ;;; An interesting value when building an initial dictionary.
113 (defvar *collision-count* 0)
114
115 (defvar *new-dictionary*)
116 (defvar *new-descriptors*)
117 (defvar *new-string-table*)
118
119 (defun build-dictionary (input output &optional save-structures-p)
120 (let ((dictionary (make-array new-dictionary-size
121 :element-type '(unsigned-byte 16)))
122 (descriptors (make-array new-descriptors-size
123 :element-type '(unsigned-byte 16)))
124 (string-table (make-string max-string-table-length)))
125 (write-line "Reading dictionary ...")
126 (force-output)
127 (setf *collision-count* 0)
128 (multiple-value-bind (entry-count string-table-length)
129 (read-initial-dictionary input dictionary
130 descriptors string-table)
131 (write-line "Writing dictionary ...")
132 (force-output)
133 (write-dictionary output dictionary descriptors entry-count
134 string-table string-table-length)
135 (when save-structures-p
136 (setf *new-dictionary* dictionary)
137 (setf *new-descriptors* descriptors)
138 (setf *new-string-table* string-table))
139 (format t "~D entries processed with ~D collisions."
140 entry-count *collision-count*))))
141
142 (defun read-initial-dictionary (f dictionary descriptors string-table)
143 (let* ((filename (pathname f))
144 (s (open filename :direction :input :if-does-not-exist nil)))
145 (unless s (error "File ~S does not exist." f))
146 (multiple-value-prog1
147 (let ((descriptor-ptr 1)
148 (string-ptr 0)
149 (entry-count 0))
150 (declare (fixnum descriptor-ptr string-ptr entry-count))
151 (loop (multiple-value-bind (line eofp) (read-line s nil nil)
152 (declare (type (or null simple-string) line))
153 (unless line (return (values entry-count string-ptr)))
154 (incf entry-count)
155 (when (> entry-count max-entry-count-estimate)
156 (error "There are too many entries in text file!~%~
157 Please change constants in spell-build.lisp, ~
158 recompile the file, and reload it.~%~
159 Be sure to understand the constraints of permissible ~
160 values."))
161 (let ((flags (or (position #\/ line :test #'char=) (length line))))
162 (declare (fixnum flags))
163 (cond ((> flags max-entry-length)
164 (format t "Entry ~s too long." (subseq line 0 flags))
165 (force-output))
166 (t (let ((new-string-ptr (+ string-ptr flags)))
167 (declare (fixnum new-string-ptr))
168 (when (> new-string-ptr max-string-table-length)
169 (error "Spell string table overflow!~%~
170 Please change constants in ~
171 spell-build.lisp, recompile the file, ~
172 and reload it.~%~
173 Be sure to understand the constraints ~
174 of permissible values."))
175 (spell-place-entry line flags
176 dictionary descriptors string-table
177 descriptor-ptr string-ptr)
178 (incf descriptor-ptr 3)
179 (setf string-ptr new-string-ptr)))))
180 (when eofp (return (values entry-count string-ptr))))))
181 (close s))))
182
183 (defun spell-place-entry (line word-end dictionary descriptors string-table
184 descriptor-ptr string-ptr)
185 (declare (simple-string line string-table)
186 (fixnum word-end descriptor-ptr string-ptr)
187 (type (array (unsigned-byte 16) (*)) dictionary descriptors))
188 (nstring-upcase line :end word-end)
189 (let* ((hash-loc (new-hash-entry line word-end dictionary))
190 (descriptor-ptr+1 (1+ descriptor-ptr))
191 (descriptor-ptr+2 (1+ descriptor-ptr+1)))
192 (unless hash-loc (error "Dictionary Overflow!"))
193 (setf (aref dictionary hash-loc) descriptor-ptr)
194 (setf (aref descriptors descriptor-ptr)
195 (dpb (the fixnum
196 (ldb new-hash-byte (string-hash line word-end)))
197 stored-hash-byte
198 word-end))
199 (setf (aref descriptors descriptor-ptr+1)
200 (ldb whole-index-low-byte string-ptr))
201 (setf (aref descriptors descriptor-ptr+2)
202 (dpb (the fixnum (ldb whole-index-high-byte string-ptr))
203 stored-index-high-byte
204 0))
205 (new-add-flags descriptors descriptor-ptr+2 line word-end)
206 (replace string-table line :start1 string-ptr :end2 word-end)))
207
208 (defun new-add-flags (descriptors loc line word-end)
209 (declare (simple-string line)
210 (fixnum word-end)
211 (type (array (unsigned-byte 16) (*)) descriptors))
212 (do ((flag (1+ word-end) (+ 2 flag))
213 (line-end (length line)))
214 ((>= flag line-end))
215 (declare (fixnum flag line-end))
216 (let ((flag-mask (flag-mask (schar line flag))))
217 (declare (fixnum flag-mask))
218 (if (zerop flag-mask)
219 (format t "Illegal flag ~S on word ~S."
220 (schar line flag) (subseq line 0 word-end))
221 (setf (aref descriptors loc)
222 (logior flag-mask (aref descriptors loc)))))))
223
224 (defun write-dictionary (f dictionary descriptors entry-count
225 string-table string-table-length)
226 (declare (type (array (unsigned-byte 16) (*)) dictionary descriptors)
227 (simple-string string-table)
228 (fixnum string-table-length))
229 (let ((filename (ext:unix-namestring (pathname f) nil)))
230 (with-open-file (s filename :direction :output
231 :element-type '(unsigned-byte 16)
232 :if-exists :overwrite
233 :if-does-not-exist :create)
234 (let ((descriptors-size (1+ (* 3 entry-count))))
235 (write-byte magic-file-id s)
236 (write-byte new-dictionary-size s)
237 (write-byte descriptors-size s)
238 (write-byte (ldb whole-index-low-byte string-table-length) s)
239 (write-byte (ldb whole-index-high-byte string-table-length) s)
240 (dotimes (i new-dictionary-size)
241 (write-byte (aref dictionary i) s))
242 (dotimes (i descriptors-size)
243 (write-byte (aref descriptors i) s))))
244 (with-open-file (s f :direction :output :element-type 'base-char
245 :if-exists :append)
246 (write-string string-table s :end string-table-length))))

  ViewVC Help
Powered by ViewVC 1.1.5