/[cmucl]/src/hemlock/spell-corr.lisp
ViewVC logotype

Contents of /src/hemlock/spell-corr.lisp

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.3 - (show annotations)
Mon Oct 31 04:50:12 1994 UTC (19 years, 5 months ago) by ram
Branch: MAIN
CVS Tags: sparc-tramp-assem-base, double-double-array-base, post-merge-intl-branch, release-19b-pre1, release-19b-pre2, merged-unicode-utf16-extfmt-2009-06-11, double-double-init-sparc-2, unicode-utf16-extfmt-2009-03-27, double-double-base, snapshot-2007-09, snapshot-2007-08, snapshot-2008-08, snapshot-2008-09, ppc_gencgc_snap_2006-01-06, sse2-packed-2008-11-12, snapshot-2008-05, snapshot-2008-06, snapshot-2008-07, snapshot-2007-05, snapshot-2008-01, snapshot-2008-02, snapshot-2008-03, intl-branch-working-2010-02-19-1000, snapshot-2006-11, snapshot-2006-10, double-double-init-sparc, snapshot-2006-12, unicode-string-buffer-impl-base, sse2-base, release-20b-pre1, release-20b-pre2, unicode-string-buffer-base, RELEASE_18d, sse2-packed-base, sparc-tramp-assem-2010-07-19, amd64-dd-start, snapshot-2003-10, snapshot-2004-10, release-18e-base, release-19f-pre1, snapshot-2008-12, snapshot-2008-11, intl-2-branch-base, snapshot-2004-08, snapshot-2004-09, remove_negative_zero_not_zero, snapshot-2007-01, snapshot-2007-02, snapshot-2004-05, snapshot-2004-06, snapshot-2004-07, release-19e, release-19d, GIT-CONVERSION, double-double-init-ppc, release-19c, dynamic-extent-base, unicode-utf16-sync-2008-12, LINKAGE_TABLE, release-19c-base, cross-sol-x86-merged, label-2009-03-16, release-19f-base, PRE_LINKAGE_TABLE, merge-sse2-packed, mod-arith-base, sparc_gencgc_merge, merge-with-19f, snapshot-2004-12, snapshot-2004-11, intl-branch-working-2010-02-11-1000, RELEASE_18a, RELEASE_18b, RELEASE_18c, unicode-snapshot-2009-05, unicode-snapshot-2009-06, amd64-merge-start, ppc_gencgc_snap_2005-12-17, double-double-init-%make-sparc, unicode-utf16-sync-2008-07, release-18e-pre2, unicode-utf16-sync-2008-09, unicode-utf16-extfmts-sync-2008-12, prm-before-macosx-merge-tag, cold-pcl-base, RELEASE_20b, snapshot-2008-04, snapshot-2003-11, snapshot-2005-07, unicode-utf16-sync-label-2009-03-16, RELEASE_19f, snapshot-2007-03, release-20a-base, cross-sol-x86-base, unicode-utf16-char-support-2009-03-26, unicode-utf16-char-support-2009-03-25, release-19a-base, unicode-utf16-extfmts-pre-sync-2008-11, snapshot-2008-10, sparc_gencgc, snapshot-2007-04, snapshot-2010-12, snapshot-2010-11, unicode-utf16-sync-2008-11, snapshot-2007-07, snapshot-2011-09, snapshot-2011-06, snapshot-2011-07, snapshot-2011-04, snapshot-2007-06, snapshot-2011-02, snapshot-2011-03, snapshot-2011-01, snapshot-2003-12, release-19a-pre1, release-19a-pre3, release-19a-pre2, pre-merge-intl-branch, release-19a, UNICODE-BASE, double-double-array-checkpoint, double-double-reader-checkpoint-1, release-19d-base, release-19e-pre1, double-double-irrat-end, release-19e-pre2, snapshot-2010-05, snapshot-2010-04, snapshot-2010-07, snapshot-2010-06, snapshot-2010-01, snapshot-2010-03, snapshot-2010-02, release-19d-pre2, release-19d-pre1, snapshot-2010-08, release-18e, double-double-init-checkpoint-1, double-double-reader-base, label-2009-03-25, snapshot-2005-03, release-19b-base, cross-sol-x86-2010-12-20, double-double-init-x86, sse2-checkpoint-2008-10-01, intl-branch-2010-03-18-1300, snapshot-2005-11, double-double-sparc-checkpoint-1, snapshot-2004-04, sse2-merge-with-2008-11, sse2-merge-with-2008-10, snapshot-2005-10, RELEASE_20a, snapshot-2005-12, release-20a-pre1, snapshot-2005-01, snapshot-2009-11, snapshot-2009-12, unicode-utf16-extfmt-2009-06-11, portable-clx-import-2009-06-16, unicode-utf16-string-support, release-19c-pre1, cross-sparc-branch-base, release-19e-base, intl-branch-base, double-double-irrat-start, snapshot-2005-06, snapshot-2005-05, snapshot-2005-04, ppc_gencgc_snap_2005-05-14, snapshot-2005-02, unicode-utf16-base, portable-clx-base, snapshot-2005-09, snapshot-2005-08, lisp-executable-base, snapshot-2009-08, snapshot-2007-12, snapshot-2007-10, snapshot-2007-11, snapshot-2009-02, snapshot-2009-01, snapshot-2009-07, snapshot-2009-05, snapshot-2009-04, snapshot-2006-02, snapshot-2006-03, release-18e-pre1, snapshot-2006-01, snapshot-2006-06, snapshot-2006-07, snapshot-2006-04, snapshot-2006-05, pre-telent-clx, snapshot-2006-08, snapshot-2006-09, HEAD
Branch point for: release-19b-branch, double-double-reader-branch, double-double-array-branch, mod-arith-branch, RELEASE-19F-BRANCH, portable-clx-branch, sparc_gencgc_branch, cross-sparc-branch, RELEASE-20B-BRANCH, RELENG_18, unicode-string-buffer-branch, sparc-tramp-assem-branch, dynamic-extent, UNICODE-BRANCH, release-19d-branch, ppc_gencgc_branch, sse2-packed-branch, lisp-executable, RELEASE-20A-BRANCH, amd64-dd-branch, double-double-branch, unicode-string-buffer-impl-branch, intl-branch, release-18e-branch, cold-pcl, unicode-utf16-branch, cross-sol-x86-branch, release-19e-branch, sse2-branch, release-19a-branch, release-19c-branch, intl-2-branch, unicode-utf16-extfmt-branch
Changes since 1.2: +1 -3 lines
Fix headed boilerplate.
1 ;;; -*- Log: hemlock.log; Package: Spell -*-
2 ;;;
3 ;;; **********************************************************************
4 ;;; This code was written as part of the CMU Common Lisp project at
5 ;;; Carnegie Mellon University, and has been placed in the public domain.
6 ;;;
7 (ext:file-comment
8 "$Header: /tiger/var/lib/cvsroots/cmucl/src/hemlock/spell-corr.lisp,v 1.3 1994/10/31 04:50:12 ram Rel $")
9 ;;;
10 ;;; **********************************************************************
11 ;;;
12 ;;; Written by Bill Chiles
13 ;;; Designed by Bill Chiles and Rob Maclachlan
14 ;;;
15
16 ;;; This is the file that deals with checking and correcting words
17 ;;; using a dictionary read in from a binary file. It has been written
18 ;;; from the basic ideas used in Ispell (on DEC-20's) which originated as
19 ;;; Spell on the ITS machines at MIT. There are flags which have proper
20 ;;; uses defined for them that indicate permissible suffixes to entries.
21 ;;; This allows for about three times as many known words than are actually
22 ;;; stored. When checking the spelling of a word, first it is looked up;
23 ;;; if this fails, then possible roots are looked up, and if any has the
24 ;;; appropriate suffix flag, then the word is considered to be correctly
25 ;;; spelled. For an unknown word, the following rules define "close" words
26 ;;; which are possible corrections:
27 ;;; 1] two adjacent letters are transposed to form a correct spelling;
28 ;;; 2] one letter is changed to form a correct spelling;
29 ;;; 3] one letter is added to form a correct spelling; and/or
30 ;;; 4] one letter is removed to form a correct spelling.
31 ;;; There are two restrictions on the length of a word in regards to its
32 ;;; worthiness of recognition: it must be at least more than two letters
33 ;;; long, and if it has a suffix, then it must be at least four letters
34 ;;; long. More will be said about this when the flags are discussed.
35 ;;; This is implemented in as tense a fashion as possible, and it uses
36 ;;; implementation dependent code from Spell-RT.Lisp to accomplish this.
37 ;;; In general the file I/O and structure accesses encompass the system
38 ;;; dependencies.
39
40 ;;; This next section will discuss the storage of the dictionary
41 ;;; information. There are three data structures that "are" the
42 ;;; dictionary: a hash table, descriptors table, and a string table. The
43 ;;; hash table is a vector of type '(unsigned-byte 16), whose elements
44 ;;; point into the descriptors table. This is a cyclic hash table to
45 ;;; facilitate dumping it to a file. The descriptors table (also of type
46 ;;; '(unsigned-byte 16)) dedicates three elements to each entry in the
47 ;;; dictionary. Each group of three elements has the following organization
48 ;;; imposed on them:
49 ;;; ----------------------------------------------
50 ;;; | 15..5 hash code | 4..0 length |
51 ;;; ----------------------------------------------
52 ;;; | 15..0 character index |
53 ;;; ----------------------------------------------
54 ;;; | 15..14 character index | 13..0 flags |
55 ;;; ----------------------------------------------
56 ;;; "Length" is the number of characters in the entry; "hash code" is some
57 ;;; eleven bits from the hash code to allow for quicker lookup, "flags"
58 ;;; indicate possible suffixes for the basic entry, and "character index"
59 ;;; is the index of the start of the entry in the string table.
60 ;;; This was originally adopted due to the Perq's word size (can you guess?
61 ;;; 16 bits, that's right). Note the constraint that is placed on the number
62 ;;; of the entries, 21845, because the hash table could not point to more
63 ;;; descriptor units (16 bits of pointer divided by three). Since a value of
64 ;;; zero as a hash table element indicates an empty location, the zeroth element
65 ;;; of the descriptors table must be unused (it cannot be pointed to).
66
67
68 ;;; The following is a short discussion with examples of the correct
69 ;;; use of the suffix flags. Let # and @ be symbols that can stand for any
70 ;;; single letter. Upper case letters are constants. "..." stands for any
71 ;;; string of zero or more letters, but note that no word may exist in the
72 ;;; dictionary which is not at least 2 letters long, so, for example, FLY
73 ;;; may not be produced by placing the "Y" flag on "F". Also, no flag is
74 ;;; effective unless the word that it creates is at least 4 letters long,
75 ;;; so, for example, WED may not be produced by placing the "D" flag on
76 ;;; "WE". These flags and examples are from the Ispell documentation with
77 ;;; only slight modifications. Here are the correct uses of the flags:
78 ;;;
79 ;;; "V" flag:
80 ;;; ...E => ...IVE as in create => creative
81 ;;; if # .ne. E, then ...# => ...#IVE as in prevent => preventive
82 ;;;
83 ;;; "N" flag:
84 ;;; ...E => ...ION as in create => creation
85 ;;; ...Y => ...ICATION as in multiply => multiplication
86 ;;; if # .ne. E or Y, then ...# => ...#EN as in fall => fallen
87 ;;;
88 ;;; "X" flag:
89 ;;; ...E => ...IONS as in create => creations
90 ;;; ...Y => ...ICATIONS as in multiply => multiplications
91 ;;; if # .ne. E or Y, ...# => ...#ENS as in weak => weakens
92 ;;;
93 ;;; "H" flag:
94 ;;; ...Y => ...IETH as in twenty => twentieth
95 ;;; if # .ne. Y, then ...# => ...#TH as in hundred => hundredth
96 ;;;
97 ;;; "Y" FLAG:
98 ;;; ... => ...LY as in quick => quickly
99 ;;;
100 ;;; "G" FLAG:
101 ;;; ...E => ...ING as in file => filing
102 ;;; if # .ne. E, then ...# => ...#ING as in cross => crossing
103 ;;;
104 ;;; "J" FLAG"
105 ;;; ...E => ...INGS as in file => filings
106 ;;; if # .ne. E, then ...# => ...#INGS as in cross => crossings
107 ;;;
108 ;;; "D" FLAG:
109 ;;; ...E => ...ED as in create => created
110 ;;; if @ .ne. A, E, I, O, or U,
111 ;;; then ...@Y => ...@IED as in imply => implied
112 ;;; if # = Y, and @ = A, E, I, O, or U,
113 ;;; then ...@# => ...@#ED as in convey => conveyed
114 ;;; if # .ne. E or Y, then ...# => ...#ED as in cross => crossed
115 ;;;
116 ;;; "T" FLAG:
117 ;;; ...E => ...EST as in late => latest
118 ;;; if @ .ne. A, E, I, O, or U,
119 ;;; then ...@Y => ...@IEST as in dirty => dirtiest
120 ;;; if # = Y, and @ = A, E, I, O, or U,
121 ;;; then ...@# => ...@#EST as in gray => grayest
122 ;;; if # .ne. E or Y, then ...# => ...#EST as in small => smallest
123 ;;;
124 ;;; "R" FLAG:
125 ;;; ...E => ...ER as in skate => skater
126 ;;; if @ .ne. A, E, I, O, or U,
127 ;;; then ...@Y => ...@IER as in multiply => multiplier
128 ;;; if # = Y, and @ = A, E, I, O, or U,
129 ;;; then ...@# => ...@#ER as in convey => conveyer
130 ;;; if # .ne. E or Y, then ...# => ...#ER as in build => builder
131 ;;;
132
133 ;;; "Z FLAG:
134 ;;; ...E => ...ERS as in skate => skaters
135 ;;; if @ .ne. A, E, I, O, or U,
136 ;;; then ...@Y => ...@IERS as in multiply => multipliers
137 ;;; if # = Y, and @ = A, E, I, O, or U,
138 ;;; then ...@# => ...@#ERS as in slay => slayers
139 ;;; if # .ne. E or Y, then ...@# => ...@#ERS as in build => builders
140 ;;;
141 ;;; "S" FLAG:
142 ;;; if @ .ne. A, E, I, O, or U,
143 ;;; then ...@Y => ...@IES as in imply => implies
144 ;;; if # .eq. S, X, Z, or H,
145 ;;; then ...# => ...#ES as in fix => fixes
146 ;;; if # .ne. S, X, Z, H, or Y,
147 ;;; then ...# => ...#S as in bat => bats
148 ;;; if # = Y, and @ = A, E, I, O, or U,
149 ;;; then ...@# => ...@#S as in convey => conveys
150 ;;;
151 ;;; "P" FLAG:
152 ;;; if # .ne. Y, or @ = A, E, I, O, or U,
153 ;;; then ...@# => ...@#NESS as in late => lateness and
154 ;;; gray => grayness
155 ;;; if @ .ne. A, E, I, O, or U,
156 ;;; then ...@Y => ...@INESS as in cloudy => cloudiness
157 ;;;
158 ;;; "M" FLAG:
159 ;;; ... => ...'S as in DOG => DOG'S
160
161 (in-package "SPELL")
162
163
164 ;;;; Some Constants
165
166 (eval-when (compile load eval)
167
168 (defconstant spell-deleted-entry #xFFFF)
169
170 ;;; The next number (using 6 bits) is 63, and that's pretty silly because
171 ;;; "supercalafragalistic" is less than 31 characters long.
172 ;;;
173 (defconstant max-entry-length 31
174 "This the maximum number of characters an entry may have.")
175
176 ;;; These are the flags (described above), and an entry is allowed a
177 ;;; certain suffix if the appropriate bit is on in the third element of
178 ;;; its descriptor unit (described above).
179 ;;;
180 (defconstant V-mask (ash 1 13))
181 (defconstant N-mask (ash 1 12))
182 (defconstant X-mask (ash 1 11))
183 (defconstant H-mask (ash 1 10))
184 (defconstant Y-mask (ash 1 9))
185 (defconstant G-mask (ash 1 8))
186 (defconstant J-mask (ash 1 7))
187 (defconstant D-mask (ash 1 6))
188 (defconstant T-mask (ash 1 5))
189 (defconstant R-mask (ash 1 4))
190 (defconstant Z-mask (ash 1 3))
191 (defconstant S-mask (ash 1 2))
192 (defconstant P-mask (ash 1 1))
193 (defconstant M-mask 1)
194
195
196 ;;; These are the eleven bits of a computed hash that are stored as part of
197 ;;; an entries descriptor unit. The shifting constant is how much the
198 ;;; eleven bits need to be shifted to the right, so they take up the upper
199 ;;; eleven bits of one 16-bit element in a descriptor unit.
200 ;;;
201 (defconstant new-hash-byte (byte 11 13))
202 (defconstant stored-hash-byte (byte 11 5))
203
204
205 ;;; The next two constants are used to extract information from an entry's
206 ;;; descriptor unit. The first is the two most significant bits of 18
207 ;;; bits that hold an index into the string table where the entry is
208 ;;; located. If this is confusing, regard the diagram of the descriptor
209 ;;; units above.
210 ;;;
211 (defconstant whole-index-high-byte (byte 2 16))
212 (defconstant stored-index-high-byte (byte 2 14))
213 (defconstant stored-length-byte (byte 5 0))
214
215
216 ); eval-when (compile load eval)
217
218
219 ;;;; Some Specials and Accesses
220
221 ;;; *spell-aeiou* will have bits on that represent the capital letters
222 ;;; A, E, I, O, and U to be used to determine if some word roots are legal
223 ;;; for looking up.
224 ;;;
225 (defvar *aeiou*
226 (make-array 128 :element-type 'bit :initial-element 0))
227
228 (setf (aref *aeiou* (char-code #\A)) 1)
229 (setf (aref *aeiou* (char-code #\E)) 1)
230 (setf (aref *aeiou* (char-code #\I)) 1)
231 (setf (aref *aeiou* (char-code #\O)) 1)
232 (setf (aref *aeiou* (char-code #\U)) 1)
233
234
235 ;;; *sxzh* will have bits on that represent the capital letters
236 ;;; S, X, Z, and H to be used to determine if some word roots are legal for
237 ;;; looking up.
238 ;;;
239 (defvar *sxzh*
240 (make-array 128 :element-type 'bit :initial-element 0))
241
242 (setf (aref *sxzh* (char-code #\S)) 1)
243 (setf (aref *sxzh* (char-code #\X)) 1)
244 (setf (aref *sxzh* (char-code #\Z)) 1)
245 (setf (aref *sxzh* (char-code #\H)) 1)
246
247
248 ;;; SET-MEMBER-P will be used with *aeiou* and *sxzh* to determine if a
249 ;;; character is in the specified set.
250 ;;;
251 (eval-when (compile eval)
252 (defmacro set-member-p (char set)
253 `(not (zerop (the fixnum (aref (the simple-bit-vector ,set)
254 (char-code ,char))))))
255 ) ;eval-when
256
257
258 (defvar *dictionary*)
259 (defvar *dictionary-size*)
260 (defvar *descriptors*)
261 (defvar *descriptors-size*)
262 (defvar *string-table*)
263 (defvar *string-table-size*)
264
265
266 (eval-when (compile eval)
267
268 ;;; DICTIONARY-REF and DESCRIPTOR-REF are references to implementation
269 ;;; dependent structures. *dictionary* and *descriptors* are "system
270 ;;; area pointers" as a result of the way the binary file is opened for
271 ;;; fast access.
272 ;;;
273 (defmacro dictionary-ref (idx)
274 `(sapref *dictionary* ,idx))
275
276 (defmacro descriptor-ref (idx)
277 `(sapref *descriptors* ,idx))
278
279
280 ;;; DESCRIPTOR-STRING-START access an entry's (indicated by idx)
281 ;;; descriptor unit (described at the beginning of the file) and returns
282 ;;; the start index of the entry in the string table. The second of three
283 ;;; words in the descriptor holds the 16 least significant bits of 18, and
284 ;;; the top two bits of the third word are the 2 most significant bits.
285 ;;; These 18 bits are the index into the string table.
286 ;;;
287 (defmacro descriptor-string-start (idx)
288 `(dpb (the fixnum (ldb stored-index-high-byte
289 (the fixnum (descriptor-ref (+ 2 ,idx)))))
290 whole-index-high-byte
291 (the fixnum (descriptor-ref (1+ ,idx)))))
292
293 ) ;eval-when
294
295
296
297 ;;;; Top level Checking/Correcting
298
299 ;;; CORRECT-SPELLING can be called from top level to check/correct a words
300 ;;; spelling. It is not used for any other purpose.
301 ;;;
302 (defun correct-spelling (word)
303 "Check/correct the spelling of word. Output is done to *standard-output*."
304 (setf word (coerce word 'simple-string))
305 (let ((word (string-upcase (the simple-string word)))
306 (word-len (length (the simple-string word))))
307 (declare (simple-string word) (fixnum word-len))
308 (maybe-read-spell-dictionary)
309 (when (= word-len 1)
310 (error "Single character words are not in the dictionary."))
311 (when (> word-len max-entry-length)
312 (error "~A is too long for the dictionary." word))
313 (multiple-value-bind (idx used-flag-p)
314 (spell-try-word word word-len)
315 (if idx
316 (format t "Found it~:[~; because of ~A~]." used-flag-p
317 (spell-root-word idx))
318 (let ((close-words (spell-collect-close-words word)))
319 (if close-words
320 (format *standard-output*
321 "The possible correct spelling~[~; is~:;s are~]:~
322 ~:*~[~; ~{~A~}~;~{ ~A~^ and~}~:;~
323 ~{~#[~; and~] ~A~^,~}~]."
324 (length close-words)
325 close-words)
326 (format *standard-output* "Word not found.")))))))
327
328
329 (defvar *dictionary-read-p* nil)
330
331 ;;; MAYBE-READ-SPELL-DICTIONARY -- Public
332 ;;;
333 (defun maybe-read-spell-dictionary ()
334 "Read the spelling dictionary if it has not be read already."
335 (unless *dictionary-read-p* (read-dictionary)))
336
337
338 (defun spell-root-word (index)
339 "Return the root word corresponding to a dictionary entry at index."
340 (let* ((start (descriptor-string-start index))
341 (len (the fixnum (ldb stored-length-byte
342 (the fixnum (descriptor-ref index)))))
343 (result (make-string len)))
344 (declare (fixnum start len)
345 (simple-string result))
346 (sap-replace result (the system-area-pointer *string-table*)
347 start 0 len)
348 result))
349
350
351 (eval-when (compile eval)
352 (defmacro check-closeness (word word-len closeness-list)
353 `(if (spell-try-word ,word ,word-len)
354 (pushnew (subseq ,word 0 ,word-len) ,closeness-list :test #'string=)))
355 ) ;eval-when
356
357 (defconstant spell-alphabet
358 (list #\A #\B #\C #\D #\E #\F #\G #\H
359 #\I #\J #\K #\L #\M #\N #\O #\P
360 #\Q #\R #\S #\T #\U #\V #\W #\X #\Y #\Z))
361
362 ;;; SPELL-COLLECT-CLOSE-WORDS Returns a list of all "close" correctly spelled
363 ;;; words. The definition of "close" is at the beginning of the file, and
364 ;;; there are four sections to this function which collect each of the four
365 ;;; different kinds of close words.
366 ;;;
367 (defun spell-collect-close-words (word)
368 "Returns a list of all \"close\" correctly spelled words. This has the
369 same contraints as SPELL-TRY-WORD, which you have probably already called
370 if you are calling this."
371 (declare (simple-string word))
372 (let* ((word-len (length word))
373 (word-len--1 (1- word-len))
374 (word-len-+1 (1+ word-len))
375 (result ())
376 (correcting-buffer (make-string max-entry-length)))
377 (declare (simple-string correcting-buffer)
378 (fixnum word-len word-len--1 word-len-+1))
379 (replace correcting-buffer word :end1 word-len :end2 word-len)
380
381 ;; Misspelled because one letter is different.
382 (dotimes (i word-len)
383 (do ((save-char (schar correcting-buffer i))
384 (alphabet spell-alphabet (cdr alphabet)))
385 ((null alphabet)
386 (setf (schar correcting-buffer i) save-char))
387 (setf (schar correcting-buffer i) (car alphabet))
388 (check-closeness correcting-buffer word-len result)))
389
390 ;; Misspelled because two adjacent letters are transposed.
391 (dotimes (i word-len--1)
392 (rotatef (schar correcting-buffer i) (schar correcting-buffer (1+ i)))
393 (check-closeness correcting-buffer word-len result)
394 (rotatef (schar correcting-buffer i) (schar correcting-buffer (1+ i))))
395
396 ;; Misspelled because of extraneous letter.
397 (replace correcting-buffer word
398 :start2 1 :end1 word-len--1 :end2 word-len)
399 (check-closeness correcting-buffer word-len--1 result)
400 (dotimes (i word-len--1)
401 (setf (schar correcting-buffer i) (schar word i))
402 (replace correcting-buffer word
403 :start1 (1+ i) :start2 (+ i 2) :end1 word-len--1 :end2 word-len)
404 (check-closeness correcting-buffer word-len--1 result))
405
406 ;; Misspelled because a letter is missing.
407 (replace correcting-buffer word
408 :start1 1 :end1 word-len-+1 :end2 word-len)
409 (dotimes (i word-len-+1)
410 (do ((alphabet spell-alphabet (cdr alphabet)))
411 ((null alphabet)
412 (rotatef (schar correcting-buffer i)
413 (schar correcting-buffer (1+ i))))
414 (setf (schar correcting-buffer i) (car alphabet))
415 (check-closeness correcting-buffer word-len-+1 result)))
416 result))
417
418 ;;; SPELL-TRY-WORD The literal 4 is not a constant defined somewhere since it
419 ;;; is part of the definition of the function of looking up words.
420 ;;; TRY-WORD-ENDINGS relies on the guarantee that word-len is at least 4.
421 ;;;
422 (defun spell-try-word (word word-len)
423 "See if the word or an appropriate root is in the spelling dicitionary.
424 Word-len must be inclusively in the range 2..max-entry-length."
425 (or (lookup-entry word word-len)
426 (if (>= (the fixnum word-len) 4)
427 (try-word-endings word word-len))))
428
429
430
431 ;;;; Divining Correct Spelling
432
433 (eval-when (compile eval)
434
435 (defmacro setup-root-buffer (word buffer root-len)
436 `(replace ,buffer ,word :end1 ,root-len :end2 ,root-len))
437
438 (defmacro try-root (word root-len flag-mask)
439 (let ((result (gensym)))
440 `(let ((,result (lookup-entry ,word ,root-len)))
441 (if (and ,result (descriptor-flag ,result ,flag-mask))
442 (return (values ,result ,flag-mask))))))
443
444 ;;; TRY-MODIFIED-ROOT is used for root words that become truncated
445 ;;; when suffixes are added (e.g., skate => skating). Char-idx is the last
446 ;;; character in the root that has to typically be changed from a #\I to a
447 ;;; #\Y or #\E.
448 ;;;
449 (defmacro try-modified-root (word buffer root-len flag-mask char-idx new-char)
450 (let ((root-word (gensym)))
451 `(let ((,root-word (setup-root-buffer ,word ,buffer ,root-len)))
452 (setf (schar ,root-word ,char-idx) ,new-char)
453 (try-root ,root-word ,root-len ,flag-mask))))
454
455 ) ;eval-when
456
457
458 (defvar *rooting-buffer* (make-string max-entry-length))
459
460 ;;; TRY-WORD-ENDINGS takes a word that is at least of length 4 and
461 ;;; returns multiple values on success (the index where the word's root's
462 ;;; descriptor starts and :used-flag), otherwise nil. It looks at
463 ;;; characters from the end to the beginning of the word to determine if it
464 ;;; has any known suffixes. This is a VERY simple finite state machine
465 ;;; where all of the suffixes are narrowed down to one possible one in at
466 ;;; most two state changes. This is a PROG form for speed, and in some sense,
467 ;;; readability. The states of the machine are the flag names that denote
468 ;;; suffixes. The two points of branching to labels are the very beginning
469 ;;; of the PROG and the S state. This is a fairly straight forward
470 ;;; implementation of the flag rules presented at the beginning of this
471 ;;; file, with char-idx checks, so we do not index the string below zero.
472
473 (defun try-word-endings (word word-len)
474 (declare (simple-string word)
475 (fixnum word-len))
476 (prog* ((char-idx (1- word-len))
477 (char (schar word char-idx))
478 (rooting-buffer *rooting-buffer*)
479 flag-mask)
480 (declare (simple-string rooting-buffer)
481 (fixnum char-idx))
482 (case char
483 (#\S (go S)) ;This covers over half of the possible endings
484 ;by branching off the second to last character
485 ;to other flag states that have plural endings.
486 (#\R (setf flag-mask R-mask) ;"er" and "ier"
487 (go D-R-Z-FLAG))
488 (#\T (go T-FLAG)) ;"est" and "iest"
489 (#\D (setf flag-mask D-mask) ;"ed" and "ied"
490 (go D-R-Z-FLAG))
491 (#\H (go H-FLAG)) ;"th" and "ieth"
492 (#\N (setf flag-mask N-mask) ;"ion", "ication", and "en"
493 (go N-X-FLAG))
494 (#\G (setf flag-mask G-mask) ;"ing"
495 (go G-J-FLAG))
496 (#\Y (go Y-FLAG)) ;"ly"
497 (#\E (go V-FLAG))) ;"ive"
498 (return nil)
499
500 S
501 (setf char-idx (1- char-idx))
502 (setf char (schar word char-idx))
503 (if (char= char #\Y)
504 (if (set-member-p (schar word (1- char-idx)) *aeiou*)
505 (try-root word (1+ char-idx) S-mask)
506 (return nil))
507 (if (not (set-member-p char *sxzh*))
508 (try-root word (1+ char-idx) S-mask)))
509 (case char
510 (#\E (go S-FLAG)) ;"es" and "ies"
511 (#\R (setf flag-mask Z-mask) ;"ers" and "iers"
512 (go D-R-Z-FLAG))
513 (#\G (setf flag-mask J-mask) ;"ings"
514 (go G-J-FLAG))
515 (#\S (go P-FLAG)) ;"ness" and "iness"
516 (#\N (setf flag-mask X-mask) ;"ions", "ications", and "ens"
517 (go N-X-FLAG))
518 (#\' (try-root word char-idx M-mask)))
519 (return nil)
520
521 S-FLAG
522 (setf char-idx (1- char-idx))
523 (setf char (schar word char-idx))
524 (if (set-member-p char *sxzh*)
525 (try-root word (1+ char-idx) S-mask))
526 (if (and (char= char #\I)
527 (not (set-member-p (schar word (1- char-idx)) *aeiou*)))
528 (try-modified-root word rooting-buffer (1+ char-idx)
529 S-mask char-idx #\Y))
530 (return nil)
531
532 D-R-Z-FLAG
533 (if (char/= (schar word (1- char-idx)) #\E) (return nil))
534 (try-root word char-idx flag-mask)
535 (if (<= (setf char-idx (- char-idx 2)) 0) (return nil))
536 (setf char (schar word char-idx))
537 (if (char= char #\Y)
538 (if (set-member-p (schar word (1- char-idx)) *aeiou*)
539 (try-root word (1+ char-idx) flag-mask)
540 (return nil))
541 (if (char/= (schar word char-idx) #\E)
542 (try-root word (1+ char-idx) flag-mask)))
543 (if (and (char= char #\I)
544 (not (set-member-p (schar word (1- char-idx)) *aeiou*)))
545 (try-modified-root word rooting-buffer (1+ char-idx)
546 flag-mask char-idx #\Y))
547 (return nil)
548
549 P-FLAG
550 (if (or (char/= (schar word (1- char-idx)) #\E)
551 (char/= (schar word (- char-idx 2)) #\N))
552 (return nil))
553 (if (<= (setf char-idx (- char-idx 3)) 0) (return nil))
554 (setf char (schar word char-idx))
555 (if (char= char #\Y)
556 (if (set-member-p (schar word (1- char-idx)) *aeiou*)
557 (try-root word (1+ char-idx) P-mask)
558 (return nil)))
559 (try-root word (1+ char-idx) P-mask)
560 (if (and (char= char #\I)
561 (not (set-member-p (schar word (1- char-idx)) *aeiou*)))
562 (try-modified-root word rooting-buffer (1+ char-idx)
563 P-mask char-idx #\Y))
564 (return nil)
565
566 G-J-FLAG
567 (if (< char-idx 3) (return nil))
568 (setf char-idx (- char-idx 2))
569 (setf char (schar word char-idx))
570 (if (or (char/= char #\I) (char/= (schar word (1+ char-idx)) #\N))
571 (return nil))
572 (if (char/= (schar word (1- char-idx)) #\E)
573 (try-root word char-idx flag-mask))
574 (try-modified-root word rooting-buffer (1+ char-idx)
575 flag-mask char-idx #\E)
576 (return nil)
577
578 N-X-FLAG
579 (setf char-idx (1- char-idx))
580 (setf char (schar word char-idx))
581 (cond ((char= char #\E)
582 (setf char (schar word (1- char-idx)))
583 (if (and (char/= char #\Y) (char/= char #\E))
584 (try-root word char-idx flag-mask))
585 (return nil))
586 ((char= char #\O)
587 (if (char= (schar word (1- char-idx)) #\I)
588 (try-modified-root word rooting-buffer char-idx
589 flag-mask (1- char-idx) #\E)
590 (return nil))
591 (if (< char-idx 5) (return nil))
592 (if (or (char/= (schar word (- char-idx 2)) #\T)
593 (char/= (schar word (- char-idx 3)) #\A)
594 (char/= (schar word (- char-idx 4)) #\C)
595 (char/= (schar word (- char-idx 5)) #\I))
596 (return nil)
597 (setf char-idx (- char-idx 4)))
598 (try-modified-root word rooting-buffer char-idx
599 flag-mask (1- char-idx) #\Y))
600 (t (return nil)))
601
602 T-FLAG
603 (if (or (char/= (schar word (1- char-idx)) #\S)
604 (char/= (schar word (- char-idx 2)) #\E))
605 (return nil)
606 (setf char-idx (1- char-idx)))
607 (try-root word char-idx T-mask)
608 (if (<= (setf char-idx (- char-idx 2)) 0) (return nil))
609 (setf char (schar word char-idx))
610 (if (char= char #\Y)
611 (if (set-member-p (schar word (1- char-idx)) *aeiou*)
612 (try-root word (1+ char-idx) T-mask)
613 (return nil))
614 (if (char/= (schar word char-idx) #\E)
615 (try-root word (1+ char-idx) T-mask)))
616 (if (and (char= char #\I)
617 (not (set-member-p (schar word (1- char-idx)) *aeiou*)))
618 (try-modified-root word rooting-buffer (1+ char-idx)
619 T-mask char-idx #\Y))
620 (return nil)
621
622 H-FLAG
623 (setf char-idx (1- char-idx))
624 (setf char (schar word char-idx))
625 (if (char/= char #\T) (return nil))
626 (if (char/= (schar word (1- char-idx)) #\Y)
627 (try-root word char-idx H-mask))
628 (if (and (char= (schar word (1- char-idx)) #\E)
629 (char= (schar word (- char-idx 2)) #\I))
630 (try-modified-root word rooting-buffer (1- char-idx)
631 H-mask (- char-idx 2) #\Y))
632 (return nil)
633
634 Y-FLAG
635 (setf char-idx (1- char-idx))
636 (setf char (schar word char-idx))
637 (if (char= char #\L)
638 (try-root word char-idx Y-mask))
639 (return nil)
640
641 V-FLAG
642 (setf char-idx (- char-idx 2))
643 (setf char (schar word char-idx))
644 (if (or (char/= char #\I) (char/= (schar word (1+ char-idx)) #\V))
645 (return nil))
646 (if (char/= (schar word (1- char-idx)) #\E)
647 (try-root word char-idx V-mask))
648 (try-modified-root word rooting-buffer (1+ char-idx)
649 V-mask char-idx #\E)
650 (return nil)))
651
652
653
654 ;;; DESCRIPTOR-FLAG returns t or nil based on whether the flag is on.
655 ;;; From the diagram at the beginning of the file, we see that the flags
656 ;;; are stored two words off of the first word in the descriptor unit for
657 ;;; an entry.
658 ;;;
659 (defun descriptor-flag (descriptor-start flag-mask)
660 (not (zerop
661 (the fixnum
662 (logand
663 (the fixnum (descriptor-ref (+ 2 (the fixnum descriptor-start))))
664 (the fixnum flag-mask))))))
665
666
667 ;;;; Looking up Trials
668
669 (eval-when (compile eval)
670
671 ;;; SPELL-STRING= determines if string1 and string2 are the same. Before
672 ;;; it is called it is known that they are both of (- end1 0) length, and
673 ;;; string2 is in system space. This is used in FOUND-ENTRY-P.
674 ;;;
675 (defmacro spell-string= (string1 string2 end1 start2)
676 (let ((idx1 (gensym))
677 (idx2 (gensym)))
678 `(do ((,idx1 0 (1+ ,idx1))
679 (,idx2 ,start2 (1+ ,idx2)))
680 ((= ,idx1 ,end1) t)
681 (declare (fixnum ,idx1 ,idx2))
682 (unless (= (the fixnum (char-code (schar ,string1 ,idx1)))
683 (the fixnum (string-sapref ,string2 ,idx2)))
684 (return nil)))))
685
686 ;;; FOUND-ENTRY-P determines if entry is what is described at idx.
687 ;;; Hash-and-length is 16 bits that look just like the first word of any
688 ;;; entry's descriptor unit (see diagram at the beginning of the file). If
689 ;;; the word stored at idx and entry have the same hash bits and length,
690 ;;; then we compare characters to see if they are the same.
691 ;;;
692 (defmacro found-entry-p (idx entry entry-len hash-and-length)
693 `(if (= (the fixnum (descriptor-ref ,idx))
694 (the fixnum ,hash-and-length))
695 (spell-string= ,entry *string-table* ,entry-len
696 (descriptor-string-start ,idx))))
697
698 (defmacro hash2-increment (hash)
699 `(- (the fixnum *dictionary-size*)
700 2
701 (the fixnum (rem ,hash (- (the fixnum *dictionary-size*) 2)))))
702
703 (defmacro hash2-loop ((location-var contents-var)
704 loc hash zero-contents-form
705 &optional body-form (for-insertion-p nil))
706 (let ((incr (gensym)))
707 `(let* ((,incr (hash2-increment ,hash))
708 (,location-var ,loc)
709 (,contents-var 0))
710 (declare (fixnum ,location-var ,contents-var ,incr))
711 (loop (setf ,location-var
712 (rem (+ ,location-var ,incr) (the fixnum *dictionary-size*)))
713 (setf ,contents-var (dictionary-ref ,location-var))
714 (if (zerop ,contents-var) (return ,zero-contents-form))
715 ,@(if for-insertion-p
716 `((if (= ,contents-var spell-deleted-entry)
717 (return ,zero-contents-form))))
718 (if (= ,location-var ,loc) (return nil))
719 ,@(if body-form `(,body-form))))))
720
721 ) ;eval-when
722
723
724 ;;; LOOKUP-ENTRY returns the index of the first element of entry's
725 ;;; descriptor unit on success, otherwise nil.
726 ;;;
727 (defun lookup-entry (entry &optional len)
728 (declare (simple-string entry))
729 (let* ((entry-len (or len (length entry)))
730 (hash (string-hash entry entry-len))
731 (hash-and-len (dpb (the fixnum (ldb new-hash-byte hash))
732 stored-hash-byte
733 (the fixnum entry-len)))
734 (loc (rem hash (the fixnum *dictionary-size*)))
735 (loc-contents (dictionary-ref loc)))
736 (declare (fixnum entry-len hash hash-and-len loc))
737 (cond ((zerop loc-contents) nil)
738 ((found-entry-p loc-contents entry entry-len hash-and-len)
739 loc-contents)
740 (t
741 (hash2-loop (loop-loc loc-contents) loc hash
742 nil
743 (if (found-entry-p loc-contents entry entry-len hash-and-len)
744 (return loc-contents)))))))
745
746 ;;;; Binary File Reading
747
748 (defparameter default-binary-dictionary
749 "library:spell-dictionary.bin")
750
751 ;;; This is the first thing in a spell binary dictionary file to serve as a
752 ;;; quick check of its proposed contents. This particular number is
753 ;;; "BILLS" on a calculator held upside-down.
754 ;;;
755 (defconstant magic-file-id 57718)
756
757 ;;; These constants are derived from the order things are written to the
758 ;;; binary dictionary in Spell-Build.Lisp.
759 ;;;
760 (defconstant magic-file-id-loc 0)
761 (defconstant dictionary-size-loc 1)
762 (defconstant descriptors-size-loc 2)
763 (defconstant string-table-size-low-byte-loc 3)
764 (defconstant string-table-size-high-byte-loc 4)
765 (defconstant file-header-bytes 10)
766
767 ;;; Initially, there are no free descriptor elements and string table bytes,
768 ;;; but when these structures are grown, they are grown by more than that
769 ;;; which is necessary.
770 ;;;
771 (defvar *free-descriptor-elements* 0)
772 (defvar *free-string-table-bytes* 0)
773
774 ;;; READ-DICTIONARY opens the dictionary and sets up the global structures
775 ;;; manifesting the spelling dictionary. When computing the start addresses
776 ;;; of these structures, we multiply by two since their sizes are in 16bit
777 ;;; lengths while the RT is 8bit-byte addressable.
778 ;;;
779 (defun read-dictionary (&optional (f default-binary-dictionary))
780 (when *dictionary-read-p*
781 (setf *dictionary-read-p* nil)
782 (deallocate-bytes (system-address *dictionary*)
783 (* 2 (the fixnum *dictionary-size*)))
784 (deallocate-bytes (system-address *descriptors*)
785 (* 2 (the fixnum
786 (+ (the fixnum *descriptors-size*)
787 (the fixnum *free-descriptor-elements*)))))
788 (deallocate-bytes (system-address *string-table*)
789 (+ (the fixnum *string-table-size*)
790 (the fixnum *free-string-table-bytes*))))
791 (setf *free-descriptor-elements* 0)
792 (setf *free-string-table-bytes* 0)
793 (let* ((fd (open-dictionary f))
794 (header-info (read-dictionary-structure fd file-header-bytes)))
795 (unless (= (sapref header-info magic-file-id-loc) magic-file-id)
796 (deallocate-bytes (system-address header-info) file-header-bytes)
797 (error "File is not a dictionary: ~S." f))
798 (setf *dictionary-size* (sapref header-info dictionary-size-loc))
799 (setf *descriptors-size* (sapref header-info descriptors-size-loc))
800 (setf *string-table-size* (sapref header-info string-table-size-low-byte-loc))
801 (setf (ldb (byte 12 16) (the fixnum *string-table-size*))
802 (the fixnum (sapref header-info string-table-size-high-byte-loc)))
803 (deallocate-bytes (system-address header-info) file-header-bytes)
804 (setf *dictionary*
805 (read-dictionary-structure fd (* 2 (the fixnum *dictionary-size*))))
806 (setf *descriptors*
807 (read-dictionary-structure fd (* 2 (the fixnum *descriptors-size*))))
808 (setf *string-table* (read-dictionary-structure fd *string-table-size*))
809 (setf *dictionary-read-p* t)
810 (close-dictionary fd)))

  ViewVC Help
Powered by ViewVC 1.1.5