/[cmucl]/src/hemlock/spell-corr.lisp
ViewVC logotype

Contents of /src/hemlock/spell-corr.lisp

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.3 - (hide annotations)
Mon Oct 31 04:50:12 1994 UTC (19 years, 5 months ago) by ram
Branch: MAIN
CVS Tags: sparc-tramp-assem-base, double-double-array-base, post-merge-intl-branch, release-19b-pre1, release-19b-pre2, merged-unicode-utf16-extfmt-2009-06-11, double-double-init-sparc-2, unicode-utf16-extfmt-2009-03-27, double-double-base, snapshot-2007-09, snapshot-2007-08, snapshot-2008-08, snapshot-2008-09, ppc_gencgc_snap_2006-01-06, sse2-packed-2008-11-12, snapshot-2008-05, snapshot-2008-06, snapshot-2008-07, snapshot-2007-05, snapshot-2008-01, snapshot-2008-02, snapshot-2008-03, intl-branch-working-2010-02-19-1000, snapshot-2006-11, snapshot-2006-10, double-double-init-sparc, snapshot-2006-12, unicode-string-buffer-impl-base, sse2-base, release-20b-pre1, release-20b-pre2, unicode-string-buffer-base, RELEASE_18d, sse2-packed-base, sparc-tramp-assem-2010-07-19, amd64-dd-start, snapshot-2003-10, snapshot-2004-10, release-18e-base, release-19f-pre1, snapshot-2008-12, snapshot-2008-11, intl-2-branch-base, snapshot-2004-08, snapshot-2004-09, remove_negative_zero_not_zero, snapshot-2007-01, snapshot-2007-02, snapshot-2004-05, snapshot-2004-06, snapshot-2004-07, release-19e, release-19d, GIT-CONVERSION, double-double-init-ppc, release-19c, dynamic-extent-base, unicode-utf16-sync-2008-12, LINKAGE_TABLE, release-19c-base, cross-sol-x86-merged, label-2009-03-16, release-19f-base, PRE_LINKAGE_TABLE, merge-sse2-packed, mod-arith-base, sparc_gencgc_merge, merge-with-19f, snapshot-2004-12, snapshot-2004-11, intl-branch-working-2010-02-11-1000, RELEASE_18a, RELEASE_18b, RELEASE_18c, unicode-snapshot-2009-05, unicode-snapshot-2009-06, amd64-merge-start, ppc_gencgc_snap_2005-12-17, double-double-init-%make-sparc, unicode-utf16-sync-2008-07, release-18e-pre2, unicode-utf16-sync-2008-09, unicode-utf16-extfmts-sync-2008-12, prm-before-macosx-merge-tag, cold-pcl-base, RELEASE_20b, snapshot-2008-04, snapshot-2003-11, snapshot-2005-07, unicode-utf16-sync-label-2009-03-16, RELEASE_19f, snapshot-2007-03, release-20a-base, cross-sol-x86-base, unicode-utf16-char-support-2009-03-26, unicode-utf16-char-support-2009-03-25, release-19a-base, unicode-utf16-extfmts-pre-sync-2008-11, snapshot-2008-10, sparc_gencgc, snapshot-2007-04, snapshot-2010-12, snapshot-2010-11, unicode-utf16-sync-2008-11, snapshot-2007-07, snapshot-2011-09, snapshot-2011-06, snapshot-2011-07, snapshot-2011-04, snapshot-2007-06, snapshot-2011-02, snapshot-2011-03, snapshot-2011-01, snapshot-2003-12, release-19a-pre1, release-19a-pre3, release-19a-pre2, pre-merge-intl-branch, release-19a, UNICODE-BASE, double-double-array-checkpoint, double-double-reader-checkpoint-1, release-19d-base, release-19e-pre1, double-double-irrat-end, release-19e-pre2, snapshot-2010-05, snapshot-2010-04, snapshot-2010-07, snapshot-2010-06, snapshot-2010-01, snapshot-2010-03, snapshot-2010-02, release-19d-pre2, release-19d-pre1, snapshot-2010-08, release-18e, double-double-init-checkpoint-1, double-double-reader-base, label-2009-03-25, snapshot-2005-03, release-19b-base, cross-sol-x86-2010-12-20, double-double-init-x86, sse2-checkpoint-2008-10-01, intl-branch-2010-03-18-1300, snapshot-2005-11, double-double-sparc-checkpoint-1, snapshot-2004-04, sse2-merge-with-2008-11, sse2-merge-with-2008-10, snapshot-2005-10, RELEASE_20a, snapshot-2005-12, release-20a-pre1, snapshot-2005-01, snapshot-2009-11, snapshot-2009-12, unicode-utf16-extfmt-2009-06-11, portable-clx-import-2009-06-16, unicode-utf16-string-support, release-19c-pre1, cross-sparc-branch-base, release-19e-base, intl-branch-base, double-double-irrat-start, snapshot-2005-06, snapshot-2005-05, snapshot-2005-04, ppc_gencgc_snap_2005-05-14, snapshot-2005-02, unicode-utf16-base, portable-clx-base, snapshot-2005-09, snapshot-2005-08, lisp-executable-base, snapshot-2009-08, snapshot-2007-12, snapshot-2007-10, snapshot-2007-11, snapshot-2009-02, snapshot-2009-01, snapshot-2009-07, snapshot-2009-05, snapshot-2009-04, snapshot-2006-02, snapshot-2006-03, release-18e-pre1, snapshot-2006-01, snapshot-2006-06, snapshot-2006-07, snapshot-2006-04, snapshot-2006-05, pre-telent-clx, snapshot-2006-08, snapshot-2006-09, HEAD
Branch point for: release-19b-branch, double-double-reader-branch, double-double-array-branch, mod-arith-branch, RELEASE-19F-BRANCH, portable-clx-branch, sparc_gencgc_branch, cross-sparc-branch, RELEASE-20B-BRANCH, RELENG_18, unicode-string-buffer-branch, sparc-tramp-assem-branch, dynamic-extent, UNICODE-BRANCH, release-19d-branch, ppc_gencgc_branch, sse2-packed-branch, lisp-executable, RELEASE-20A-BRANCH, amd64-dd-branch, double-double-branch, unicode-string-buffer-impl-branch, intl-branch, release-18e-branch, cold-pcl, unicode-utf16-branch, cross-sol-x86-branch, release-19e-branch, sse2-branch, release-19a-branch, release-19c-branch, intl-2-branch, unicode-utf16-extfmt-branch
Changes since 1.2: +1 -3 lines
Fix headed boilerplate.
1 ram 1.1 ;;; -*- Log: hemlock.log; Package: Spell -*-
2     ;;;
3     ;;; **********************************************************************
4 ram 1.2 ;;; This code was written as part of the CMU Common Lisp project at
5     ;;; Carnegie Mellon University, and has been placed in the public domain.
6     ;;;
7     (ext:file-comment
8 ram 1.3 "$Header: /tiger/var/lib/cvsroots/cmucl/src/hemlock/spell-corr.lisp,v 1.3 1994/10/31 04:50:12 ram Rel $")
9 ram 1.2 ;;;
10 ram 1.1 ;;; **********************************************************************
11     ;;;
12     ;;; Written by Bill Chiles
13     ;;; Designed by Bill Chiles and Rob Maclachlan
14     ;;;
15    
16     ;;; This is the file that deals with checking and correcting words
17     ;;; using a dictionary read in from a binary file. It has been written
18     ;;; from the basic ideas used in Ispell (on DEC-20's) which originated as
19     ;;; Spell on the ITS machines at MIT. There are flags which have proper
20     ;;; uses defined for them that indicate permissible suffixes to entries.
21     ;;; This allows for about three times as many known words than are actually
22     ;;; stored. When checking the spelling of a word, first it is looked up;
23     ;;; if this fails, then possible roots are looked up, and if any has the
24     ;;; appropriate suffix flag, then the word is considered to be correctly
25     ;;; spelled. For an unknown word, the following rules define "close" words
26     ;;; which are possible corrections:
27     ;;; 1] two adjacent letters are transposed to form a correct spelling;
28     ;;; 2] one letter is changed to form a correct spelling;
29     ;;; 3] one letter is added to form a correct spelling; and/or
30     ;;; 4] one letter is removed to form a correct spelling.
31     ;;; There are two restrictions on the length of a word in regards to its
32     ;;; worthiness of recognition: it must be at least more than two letters
33     ;;; long, and if it has a suffix, then it must be at least four letters
34     ;;; long. More will be said about this when the flags are discussed.
35     ;;; This is implemented in as tense a fashion as possible, and it uses
36     ;;; implementation dependent code from Spell-RT.Lisp to accomplish this.
37     ;;; In general the file I/O and structure accesses encompass the system
38     ;;; dependencies.
39    
40     ;;; This next section will discuss the storage of the dictionary
41     ;;; information. There are three data structures that "are" the
42     ;;; dictionary: a hash table, descriptors table, and a string table. The
43     ;;; hash table is a vector of type '(unsigned-byte 16), whose elements
44     ;;; point into the descriptors table. This is a cyclic hash table to
45     ;;; facilitate dumping it to a file. The descriptors table (also of type
46     ;;; '(unsigned-byte 16)) dedicates three elements to each entry in the
47     ;;; dictionary. Each group of three elements has the following organization
48     ;;; imposed on them:
49     ;;; ----------------------------------------------
50     ;;; | 15..5 hash code | 4..0 length |
51     ;;; ----------------------------------------------
52     ;;; | 15..0 character index |
53     ;;; ----------------------------------------------
54     ;;; | 15..14 character index | 13..0 flags |
55     ;;; ----------------------------------------------
56     ;;; "Length" is the number of characters in the entry; "hash code" is some
57     ;;; eleven bits from the hash code to allow for quicker lookup, "flags"
58     ;;; indicate possible suffixes for the basic entry, and "character index"
59     ;;; is the index of the start of the entry in the string table.
60     ;;; This was originally adopted due to the Perq's word size (can you guess?
61     ;;; 16 bits, that's right). Note the constraint that is placed on the number
62     ;;; of the entries, 21845, because the hash table could not point to more
63     ;;; descriptor units (16 bits of pointer divided by three). Since a value of
64     ;;; zero as a hash table element indicates an empty location, the zeroth element
65     ;;; of the descriptors table must be unused (it cannot be pointed to).
66    
67    
68     ;;; The following is a short discussion with examples of the correct
69     ;;; use of the suffix flags. Let # and @ be symbols that can stand for any
70     ;;; single letter. Upper case letters are constants. "..." stands for any
71     ;;; string of zero or more letters, but note that no word may exist in the
72     ;;; dictionary which is not at least 2 letters long, so, for example, FLY
73     ;;; may not be produced by placing the "Y" flag on "F". Also, no flag is
74     ;;; effective unless the word that it creates is at least 4 letters long,
75     ;;; so, for example, WED may not be produced by placing the "D" flag on
76     ;;; "WE". These flags and examples are from the Ispell documentation with
77     ;;; only slight modifications. Here are the correct uses of the flags:
78     ;;;
79     ;;; "V" flag:
80     ;;; ...E => ...IVE as in create => creative
81     ;;; if # .ne. E, then ...# => ...#IVE as in prevent => preventive
82     ;;;
83     ;;; "N" flag:
84     ;;; ...E => ...ION as in create => creation
85     ;;; ...Y => ...ICATION as in multiply => multiplication
86     ;;; if # .ne. E or Y, then ...# => ...#EN as in fall => fallen
87     ;;;
88     ;;; "X" flag:
89     ;;; ...E => ...IONS as in create => creations
90     ;;; ...Y => ...ICATIONS as in multiply => multiplications
91     ;;; if # .ne. E or Y, ...# => ...#ENS as in weak => weakens
92     ;;;
93     ;;; "H" flag:
94     ;;; ...Y => ...IETH as in twenty => twentieth
95     ;;; if # .ne. Y, then ...# => ...#TH as in hundred => hundredth
96     ;;;
97     ;;; "Y" FLAG:
98     ;;; ... => ...LY as in quick => quickly
99     ;;;
100     ;;; "G" FLAG:
101     ;;; ...E => ...ING as in file => filing
102     ;;; if # .ne. E, then ...# => ...#ING as in cross => crossing
103     ;;;
104     ;;; "J" FLAG"
105     ;;; ...E => ...INGS as in file => filings
106     ;;; if # .ne. E, then ...# => ...#INGS as in cross => crossings
107     ;;;
108     ;;; "D" FLAG:
109     ;;; ...E => ...ED as in create => created
110     ;;; if @ .ne. A, E, I, O, or U,
111     ;;; then ...@Y => ...@IED as in imply => implied
112     ;;; if # = Y, and @ = A, E, I, O, or U,
113     ;;; then ...@# => ...@#ED as in convey => conveyed
114     ;;; if # .ne. E or Y, then ...# => ...#ED as in cross => crossed
115     ;;;
116     ;;; "T" FLAG:
117     ;;; ...E => ...EST as in late => latest
118     ;;; if @ .ne. A, E, I, O, or U,
119     ;;; then ...@Y => ...@IEST as in dirty => dirtiest
120     ;;; if # = Y, and @ = A, E, I, O, or U,
121     ;;; then ...@# => ...@#EST as in gray => grayest
122     ;;; if # .ne. E or Y, then ...# => ...#EST as in small => smallest
123     ;;;
124     ;;; "R" FLAG:
125     ;;; ...E => ...ER as in skate => skater
126     ;;; if @ .ne. A, E, I, O, or U,
127     ;;; then ...@Y => ...@IER as in multiply => multiplier
128     ;;; if # = Y, and @ = A, E, I, O, or U,
129     ;;; then ...@# => ...@#ER as in convey => conveyer
130     ;;; if # .ne. E or Y, then ...# => ...#ER as in build => builder
131     ;;;
132    
133     ;;; "Z FLAG:
134     ;;; ...E => ...ERS as in skate => skaters
135     ;;; if @ .ne. A, E, I, O, or U,
136     ;;; then ...@Y => ...@IERS as in multiply => multipliers
137     ;;; if # = Y, and @ = A, E, I, O, or U,
138     ;;; then ...@# => ...@#ERS as in slay => slayers
139     ;;; if # .ne. E or Y, then ...@# => ...@#ERS as in build => builders
140     ;;;
141     ;;; "S" FLAG:
142     ;;; if @ .ne. A, E, I, O, or U,
143     ;;; then ...@Y => ...@IES as in imply => implies
144     ;;; if # .eq. S, X, Z, or H,
145     ;;; then ...# => ...#ES as in fix => fixes
146     ;;; if # .ne. S, X, Z, H, or Y,
147     ;;; then ...# => ...#S as in bat => bats
148     ;;; if # = Y, and @ = A, E, I, O, or U,
149     ;;; then ...@# => ...@#S as in convey => conveys
150     ;;;
151     ;;; "P" FLAG:
152     ;;; if # .ne. Y, or @ = A, E, I, O, or U,
153     ;;; then ...@# => ...@#NESS as in late => lateness and
154     ;;; gray => grayness
155     ;;; if @ .ne. A, E, I, O, or U,
156     ;;; then ...@Y => ...@INESS as in cloudy => cloudiness
157     ;;;
158     ;;; "M" FLAG:
159     ;;; ... => ...'S as in DOG => DOG'S
160    
161 ram 1.2 (in-package "SPELL")
162 ram 1.1
163 ram 1.2
164     ;;;; Some Constants
165 ram 1.1
166 ram 1.2 (eval-when (compile load eval)
167 ram 1.1
168 ram 1.2 (defconstant spell-deleted-entry #xFFFF)
169 ram 1.1
170     ;;; The next number (using 6 bits) is 63, and that's pretty silly because
171     ;;; "supercalafragalistic" is less than 31 characters long.
172     ;;;
173     (defconstant max-entry-length 31
174     "This the maximum number of characters an entry may have.")
175    
176     ;;; These are the flags (described above), and an entry is allowed a
177     ;;; certain suffix if the appropriate bit is on in the third element of
178     ;;; its descriptor unit (described above).
179     ;;;
180     (defconstant V-mask (ash 1 13))
181     (defconstant N-mask (ash 1 12))
182     (defconstant X-mask (ash 1 11))
183     (defconstant H-mask (ash 1 10))
184     (defconstant Y-mask (ash 1 9))
185     (defconstant G-mask (ash 1 8))
186     (defconstant J-mask (ash 1 7))
187     (defconstant D-mask (ash 1 6))
188     (defconstant T-mask (ash 1 5))
189     (defconstant R-mask (ash 1 4))
190     (defconstant Z-mask (ash 1 3))
191     (defconstant S-mask (ash 1 2))
192     (defconstant P-mask (ash 1 1))
193     (defconstant M-mask 1)
194    
195    
196     ;;; These are the eleven bits of a computed hash that are stored as part of
197     ;;; an entries descriptor unit. The shifting constant is how much the
198     ;;; eleven bits need to be shifted to the right, so they take up the upper
199     ;;; eleven bits of one 16-bit element in a descriptor unit.
200     ;;;
201     (defconstant new-hash-byte (byte 11 13))
202     (defconstant stored-hash-byte (byte 11 5))
203    
204    
205     ;;; The next two constants are used to extract information from an entry's
206     ;;; descriptor unit. The first is the two most significant bits of 18
207     ;;; bits that hold an index into the string table where the entry is
208     ;;; located. If this is confusing, regard the diagram of the descriptor
209     ;;; units above.
210     ;;;
211     (defconstant whole-index-high-byte (byte 2 16))
212     (defconstant stored-index-high-byte (byte 2 14))
213     (defconstant stored-length-byte (byte 5 0))
214    
215    
216 ram 1.2 ); eval-when (compile load eval)
217    
218 ram 1.1
219     ;;;; Some Specials and Accesses
220    
221     ;;; *spell-aeiou* will have bits on that represent the capital letters
222     ;;; A, E, I, O, and U to be used to determine if some word roots are legal
223     ;;; for looking up.
224     ;;;
225     (defvar *aeiou*
226     (make-array 128 :element-type 'bit :initial-element 0))
227    
228     (setf (aref *aeiou* (char-code #\A)) 1)
229     (setf (aref *aeiou* (char-code #\E)) 1)
230     (setf (aref *aeiou* (char-code #\I)) 1)
231     (setf (aref *aeiou* (char-code #\O)) 1)
232     (setf (aref *aeiou* (char-code #\U)) 1)
233    
234    
235     ;;; *sxzh* will have bits on that represent the capital letters
236     ;;; S, X, Z, and H to be used to determine if some word roots are legal for
237     ;;; looking up.
238     ;;;
239     (defvar *sxzh*
240     (make-array 128 :element-type 'bit :initial-element 0))
241    
242     (setf (aref *sxzh* (char-code #\S)) 1)
243     (setf (aref *sxzh* (char-code #\X)) 1)
244     (setf (aref *sxzh* (char-code #\Z)) 1)
245     (setf (aref *sxzh* (char-code #\H)) 1)
246    
247    
248     ;;; SET-MEMBER-P will be used with *aeiou* and *sxzh* to determine if a
249     ;;; character is in the specified set.
250     ;;;
251     (eval-when (compile eval)
252     (defmacro set-member-p (char set)
253     `(not (zerop (the fixnum (aref (the simple-bit-vector ,set)
254     (char-code ,char))))))
255     ) ;eval-when
256    
257    
258     (defvar *dictionary*)
259     (defvar *dictionary-size*)
260     (defvar *descriptors*)
261     (defvar *descriptors-size*)
262     (defvar *string-table*)
263     (defvar *string-table-size*)
264    
265    
266     (eval-when (compile eval)
267    
268     ;;; DICTIONARY-REF and DESCRIPTOR-REF are references to implementation
269     ;;; dependent structures. *dictionary* and *descriptors* are "system
270     ;;; area pointers" as a result of the way the binary file is opened for
271     ;;; fast access.
272     ;;;
273     (defmacro dictionary-ref (idx)
274     `(sapref *dictionary* ,idx))
275    
276     (defmacro descriptor-ref (idx)
277     `(sapref *descriptors* ,idx))
278    
279    
280     ;;; DESCRIPTOR-STRING-START access an entry's (indicated by idx)
281     ;;; descriptor unit (described at the beginning of the file) and returns
282     ;;; the start index of the entry in the string table. The second of three
283     ;;; words in the descriptor holds the 16 least significant bits of 18, and
284     ;;; the top two bits of the third word are the 2 most significant bits.
285     ;;; These 18 bits are the index into the string table.
286     ;;;
287     (defmacro descriptor-string-start (idx)
288     `(dpb (the fixnum (ldb stored-index-high-byte
289     (the fixnum (descriptor-ref (+ 2 ,idx)))))
290     whole-index-high-byte
291     (the fixnum (descriptor-ref (1+ ,idx)))))
292    
293     ) ;eval-when
294    
295    
296    
297     ;;;; Top level Checking/Correcting
298    
299     ;;; CORRECT-SPELLING can be called from top level to check/correct a words
300     ;;; spelling. It is not used for any other purpose.
301     ;;;
302     (defun correct-spelling (word)
303     "Check/correct the spelling of word. Output is done to *standard-output*."
304     (setf word (coerce word 'simple-string))
305     (let ((word (string-upcase (the simple-string word)))
306     (word-len (length (the simple-string word))))
307     (declare (simple-string word) (fixnum word-len))
308     (maybe-read-spell-dictionary)
309     (when (= word-len 1)
310     (error "Single character words are not in the dictionary."))
311     (when (> word-len max-entry-length)
312     (error "~A is too long for the dictionary." word))
313     (multiple-value-bind (idx used-flag-p)
314     (spell-try-word word word-len)
315     (if idx
316     (format t "Found it~:[~; because of ~A~]." used-flag-p
317     (spell-root-word idx))
318     (let ((close-words (spell-collect-close-words word)))
319     (if close-words
320     (format *standard-output*
321     "The possible correct spelling~[~; is~:;s are~]:~
322     ~:*~[~; ~{~A~}~;~{ ~A~^ and~}~:;~
323     ~{~#[~; and~] ~A~^,~}~]."
324     (length close-words)
325     close-words)
326     (format *standard-output* "Word not found.")))))))
327    
328    
329     (defvar *dictionary-read-p* nil)
330    
331     ;;; MAYBE-READ-SPELL-DICTIONARY -- Public
332     ;;;
333     (defun maybe-read-spell-dictionary ()
334     "Read the spelling dictionary if it has not be read already."
335     (unless *dictionary-read-p* (read-dictionary)))
336    
337    
338     (defun spell-root-word (index)
339     "Return the root word corresponding to a dictionary entry at index."
340     (let* ((start (descriptor-string-start index))
341 ram 1.2 (len (the fixnum (ldb stored-length-byte
342     (the fixnum (descriptor-ref index)))))
343     (result (make-string len)))
344     (declare (fixnum start len)
345     (simple-string result))
346     (sap-replace result (the system-area-pointer *string-table*)
347     start 0 len)
348     result))
349 ram 1.1
350    
351     (eval-when (compile eval)
352     (defmacro check-closeness (word word-len closeness-list)
353     `(if (spell-try-word ,word ,word-len)
354     (pushnew (subseq ,word 0 ,word-len) ,closeness-list :test #'string=)))
355     ) ;eval-when
356    
357     (defconstant spell-alphabet
358     (list #\A #\B #\C #\D #\E #\F #\G #\H
359     #\I #\J #\K #\L #\M #\N #\O #\P
360     #\Q #\R #\S #\T #\U #\V #\W #\X #\Y #\Z))
361    
362     ;;; SPELL-COLLECT-CLOSE-WORDS Returns a list of all "close" correctly spelled
363     ;;; words. The definition of "close" is at the beginning of the file, and
364     ;;; there are four sections to this function which collect each of the four
365     ;;; different kinds of close words.
366     ;;;
367     (defun spell-collect-close-words (word)
368     "Returns a list of all \"close\" correctly spelled words. This has the
369     same contraints as SPELL-TRY-WORD, which you have probably already called
370     if you are calling this."
371     (declare (simple-string word))
372     (let* ((word-len (length word))
373     (word-len--1 (1- word-len))
374     (word-len-+1 (1+ word-len))
375     (result ())
376     (correcting-buffer (make-string max-entry-length)))
377     (declare (simple-string correcting-buffer)
378     (fixnum word-len word-len--1 word-len-+1))
379     (replace correcting-buffer word :end1 word-len :end2 word-len)
380    
381     ;; Misspelled because one letter is different.
382     (dotimes (i word-len)
383     (do ((save-char (schar correcting-buffer i))
384     (alphabet spell-alphabet (cdr alphabet)))
385     ((null alphabet)
386     (setf (schar correcting-buffer i) save-char))
387     (setf (schar correcting-buffer i) (car alphabet))
388     (check-closeness correcting-buffer word-len result)))
389    
390     ;; Misspelled because two adjacent letters are transposed.
391     (dotimes (i word-len--1)
392     (rotatef (schar correcting-buffer i) (schar correcting-buffer (1+ i)))
393     (check-closeness correcting-buffer word-len result)
394     (rotatef (schar correcting-buffer i) (schar correcting-buffer (1+ i))))
395    
396     ;; Misspelled because of extraneous letter.
397     (replace correcting-buffer word
398     :start2 1 :end1 word-len--1 :end2 word-len)
399     (check-closeness correcting-buffer word-len--1 result)
400     (dotimes (i word-len--1)
401     (setf (schar correcting-buffer i) (schar word i))
402     (replace correcting-buffer word
403     :start1 (1+ i) :start2 (+ i 2) :end1 word-len--1 :end2 word-len)
404     (check-closeness correcting-buffer word-len--1 result))
405    
406     ;; Misspelled because a letter is missing.
407     (replace correcting-buffer word
408     :start1 1 :end1 word-len-+1 :end2 word-len)
409     (dotimes (i word-len-+1)
410     (do ((alphabet spell-alphabet (cdr alphabet)))
411     ((null alphabet)
412     (rotatef (schar correcting-buffer i)
413     (schar correcting-buffer (1+ i))))
414     (setf (schar correcting-buffer i) (car alphabet))
415     (check-closeness correcting-buffer word-len-+1 result)))
416     result))
417    
418     ;;; SPELL-TRY-WORD The literal 4 is not a constant defined somewhere since it
419     ;;; is part of the definition of the function of looking up words.
420     ;;; TRY-WORD-ENDINGS relies on the guarantee that word-len is at least 4.
421     ;;;
422     (defun spell-try-word (word word-len)
423     "See if the word or an appropriate root is in the spelling dicitionary.
424     Word-len must be inclusively in the range 2..max-entry-length."
425     (or (lookup-entry word word-len)
426     (if (>= (the fixnum word-len) 4)
427     (try-word-endings word word-len))))
428    
429    
430    
431     ;;;; Divining Correct Spelling
432    
433     (eval-when (compile eval)
434    
435     (defmacro setup-root-buffer (word buffer root-len)
436     `(replace ,buffer ,word :end1 ,root-len :end2 ,root-len))
437    
438     (defmacro try-root (word root-len flag-mask)
439     (let ((result (gensym)))
440     `(let ((,result (lookup-entry ,word ,root-len)))
441     (if (and ,result (descriptor-flag ,result ,flag-mask))
442     (return (values ,result ,flag-mask))))))
443    
444     ;;; TRY-MODIFIED-ROOT is used for root words that become truncated
445     ;;; when suffixes are added (e.g., skate => skating). Char-idx is the last
446     ;;; character in the root that has to typically be changed from a #\I to a
447     ;;; #\Y or #\E.
448     ;;;
449     (defmacro try-modified-root (word buffer root-len flag-mask char-idx new-char)
450     (let ((root-word (gensym)))
451     `(let ((,root-word (setup-root-buffer ,word ,buffer ,root-len)))
452     (setf (schar ,root-word ,char-idx) ,new-char)
453     (try-root ,root-word ,root-len ,flag-mask))))
454    
455     ) ;eval-when
456    
457    
458     (defvar *rooting-buffer* (make-string max-entry-length))
459    
460     ;;; TRY-WORD-ENDINGS takes a word that is at least of length 4 and
461     ;;; returns multiple values on success (the index where the word's root's
462     ;;; descriptor starts and :used-flag), otherwise nil. It looks at
463     ;;; characters from the end to the beginning of the word to determine if it
464     ;;; has any known suffixes. This is a VERY simple finite state machine
465     ;;; where all of the suffixes are narrowed down to one possible one in at
466     ;;; most two state changes. This is a PROG form for speed, and in some sense,
467     ;;; readability. The states of the machine are the flag names that denote
468     ;;; suffixes. The two points of branching to labels are the very beginning
469     ;;; of the PROG and the S state. This is a fairly straight forward
470     ;;; implementation of the flag rules presented at the beginning of this
471     ;;; file, with char-idx checks, so we do not index the string below zero.
472    
473     (defun try-word-endings (word word-len)
474     (declare (simple-string word)
475     (fixnum word-len))
476     (prog* ((char-idx (1- word-len))
477     (char (schar word char-idx))
478     (rooting-buffer *rooting-buffer*)
479     flag-mask)
480     (declare (simple-string rooting-buffer)
481     (fixnum char-idx))
482     (case char
483     (#\S (go S)) ;This covers over half of the possible endings
484     ;by branching off the second to last character
485     ;to other flag states that have plural endings.
486     (#\R (setf flag-mask R-mask) ;"er" and "ier"
487     (go D-R-Z-FLAG))
488     (#\T (go T-FLAG)) ;"est" and "iest"
489     (#\D (setf flag-mask D-mask) ;"ed" and "ied"
490     (go D-R-Z-FLAG))
491     (#\H (go H-FLAG)) ;"th" and "ieth"
492     (#\N (setf flag-mask N-mask) ;"ion", "ication", and "en"
493     (go N-X-FLAG))
494     (#\G (setf flag-mask G-mask) ;"ing"
495     (go G-J-FLAG))
496     (#\Y (go Y-FLAG)) ;"ly"
497     (#\E (go V-FLAG))) ;"ive"
498     (return nil)
499    
500     S
501     (setf char-idx (1- char-idx))
502     (setf char (schar word char-idx))
503     (if (char= char #\Y)
504     (if (set-member-p (schar word (1- char-idx)) *aeiou*)
505     (try-root word (1+ char-idx) S-mask)
506     (return nil))
507     (if (not (set-member-p char *sxzh*))
508     (try-root word (1+ char-idx) S-mask)))
509     (case char
510     (#\E (go S-FLAG)) ;"es" and "ies"
511     (#\R (setf flag-mask Z-mask) ;"ers" and "iers"
512     (go D-R-Z-FLAG))
513     (#\G (setf flag-mask J-mask) ;"ings"
514     (go G-J-FLAG))
515     (#\S (go P-FLAG)) ;"ness" and "iness"
516     (#\N (setf flag-mask X-mask) ;"ions", "ications", and "ens"
517     (go N-X-FLAG))
518     (#\' (try-root word char-idx M-mask)))
519     (return nil)
520    
521     S-FLAG
522     (setf char-idx (1- char-idx))
523     (setf char (schar word char-idx))
524     (if (set-member-p char *sxzh*)
525     (try-root word (1+ char-idx) S-mask))
526     (if (and (char= char #\I)
527     (not (set-member-p (schar word (1- char-idx)) *aeiou*)))
528     (try-modified-root word rooting-buffer (1+ char-idx)
529     S-mask char-idx #\Y))
530     (return nil)
531    
532     D-R-Z-FLAG
533     (if (char/= (schar word (1- char-idx)) #\E) (return nil))
534     (try-root word char-idx flag-mask)
535     (if (<= (setf char-idx (- char-idx 2)) 0) (return nil))
536     (setf char (schar word char-idx))
537     (if (char= char #\Y)
538     (if (set-member-p (schar word (1- char-idx)) *aeiou*)
539     (try-root word (1+ char-idx) flag-mask)
540     (return nil))
541     (if (char/= (schar word char-idx) #\E)
542     (try-root word (1+ char-idx) flag-mask)))
543     (if (and (char= char #\I)
544     (not (set-member-p (schar word (1- char-idx)) *aeiou*)))
545     (try-modified-root word rooting-buffer (1+ char-idx)
546     flag-mask char-idx #\Y))
547     (return nil)
548    
549     P-FLAG
550     (if (or (char/= (schar word (1- char-idx)) #\E)
551     (char/= (schar word (- char-idx 2)) #\N))
552     (return nil))
553     (if (<= (setf char-idx (- char-idx 3)) 0) (return nil))
554     (setf char (schar word char-idx))
555     (if (char= char #\Y)
556     (if (set-member-p (schar word (1- char-idx)) *aeiou*)
557     (try-root word (1+ char-idx) P-mask)
558     (return nil)))
559     (try-root word (1+ char-idx) P-mask)
560     (if (and (char= char #\I)
561     (not (set-member-p (schar word (1- char-idx)) *aeiou*)))
562     (try-modified-root word rooting-buffer (1+ char-idx)
563     P-mask char-idx #\Y))
564     (return nil)
565    
566     G-J-FLAG
567     (if (< char-idx 3) (return nil))
568     (setf char-idx (- char-idx 2))
569     (setf char (schar word char-idx))
570     (if (or (char/= char #\I) (char/= (schar word (1+ char-idx)) #\N))
571     (return nil))
572     (if (char/= (schar word (1- char-idx)) #\E)
573     (try-root word char-idx flag-mask))
574     (try-modified-root word rooting-buffer (1+ char-idx)
575     flag-mask char-idx #\E)
576     (return nil)
577    
578     N-X-FLAG
579     (setf char-idx (1- char-idx))
580     (setf char (schar word char-idx))
581     (cond ((char= char #\E)
582     (setf char (schar word (1- char-idx)))
583     (if (and (char/= char #\Y) (char/= char #\E))
584     (try-root word char-idx flag-mask))
585     (return nil))
586     ((char= char #\O)
587     (if (char= (schar word (1- char-idx)) #\I)
588     (try-modified-root word rooting-buffer char-idx
589     flag-mask (1- char-idx) #\E)
590     (return nil))
591     (if (< char-idx 5) (return nil))
592     (if (or (char/= (schar word (- char-idx 2)) #\T)
593     (char/= (schar word (- char-idx 3)) #\A)
594     (char/= (schar word (- char-idx 4)) #\C)
595     (char/= (schar word (- char-idx 5)) #\I))
596     (return nil)
597     (setf char-idx (- char-idx 4)))
598     (try-modified-root word rooting-buffer char-idx
599     flag-mask (1- char-idx) #\Y))
600     (t (return nil)))
601    
602     T-FLAG
603     (if (or (char/= (schar word (1- char-idx)) #\S)
604     (char/= (schar word (- char-idx 2)) #\E))
605     (return nil)
606     (setf char-idx (1- char-idx)))
607     (try-root word char-idx T-mask)
608     (if (<= (setf char-idx (- char-idx 2)) 0) (return nil))
609     (setf char (schar word char-idx))
610     (if (char= char #\Y)
611     (if (set-member-p (schar word (1- char-idx)) *aeiou*)
612     (try-root word (1+ char-idx) T-mask)
613     (return nil))
614     (if (char/= (schar word char-idx) #\E)
615     (try-root word (1+ char-idx) T-mask)))
616     (if (and (char= char #\I)
617     (not (set-member-p (schar word (1- char-idx)) *aeiou*)))
618     (try-modified-root word rooting-buffer (1+ char-idx)
619     T-mask char-idx #\Y))
620     (return nil)
621    
622     H-FLAG
623     (setf char-idx (1- char-idx))
624     (setf char (schar word char-idx))
625     (if (char/= char #\T) (return nil))
626     (if (char/= (schar word (1- char-idx)) #\Y)
627     (try-root word char-idx H-mask))
628     (if (and (char= (schar word (1- char-idx)) #\E)
629     (char= (schar word (- char-idx 2)) #\I))
630     (try-modified-root word rooting-buffer (1- char-idx)
631     H-mask (- char-idx 2) #\Y))
632     (return nil)
633    
634     Y-FLAG
635     (setf char-idx (1- char-idx))
636     (setf char (schar word char-idx))
637     (if (char= char #\L)
638     (try-root word char-idx Y-mask))
639     (return nil)
640    
641     V-FLAG
642     (setf char-idx (- char-idx 2))
643     (setf char (schar word char-idx))
644     (if (or (char/= char #\I) (char/= (schar word (1+ char-idx)) #\V))
645     (return nil))
646     (if (char/= (schar word (1- char-idx)) #\E)
647     (try-root word char-idx V-mask))
648     (try-modified-root word rooting-buffer (1+ char-idx)
649     V-mask char-idx #\E)
650     (return nil)))
651    
652    
653    
654     ;;; DESCRIPTOR-FLAG returns t or nil based on whether the flag is on.
655     ;;; From the diagram at the beginning of the file, we see that the flags
656     ;;; are stored two words off of the first word in the descriptor unit for
657     ;;; an entry.
658     ;;;
659     (defun descriptor-flag (descriptor-start flag-mask)
660     (not (zerop
661     (the fixnum
662     (logand
663     (the fixnum (descriptor-ref (+ 2 (the fixnum descriptor-start))))
664     (the fixnum flag-mask))))))
665    
666    
667     ;;;; Looking up Trials
668    
669     (eval-when (compile eval)
670    
671     ;;; SPELL-STRING= determines if string1 and string2 are the same. Before
672     ;;; it is called it is known that they are both of (- end1 0) length, and
673     ;;; string2 is in system space. This is used in FOUND-ENTRY-P.
674     ;;;
675     (defmacro spell-string= (string1 string2 end1 start2)
676     (let ((idx1 (gensym))
677     (idx2 (gensym)))
678     `(do ((,idx1 0 (1+ ,idx1))
679     (,idx2 ,start2 (1+ ,idx2)))
680     ((= ,idx1 ,end1) t)
681     (declare (fixnum ,idx1 ,idx2))
682     (unless (= (the fixnum (char-code (schar ,string1 ,idx1)))
683     (the fixnum (string-sapref ,string2 ,idx2)))
684     (return nil)))))
685    
686     ;;; FOUND-ENTRY-P determines if entry is what is described at idx.
687     ;;; Hash-and-length is 16 bits that look just like the first word of any
688     ;;; entry's descriptor unit (see diagram at the beginning of the file). If
689     ;;; the word stored at idx and entry have the same hash bits and length,
690     ;;; then we compare characters to see if they are the same.
691     ;;;
692     (defmacro found-entry-p (idx entry entry-len hash-and-length)
693     `(if (= (the fixnum (descriptor-ref ,idx))
694     (the fixnum ,hash-and-length))
695     (spell-string= ,entry *string-table* ,entry-len
696     (descriptor-string-start ,idx))))
697    
698     (defmacro hash2-increment (hash)
699     `(- (the fixnum *dictionary-size*)
700     2
701     (the fixnum (rem ,hash (- (the fixnum *dictionary-size*) 2)))))
702    
703     (defmacro hash2-loop ((location-var contents-var)
704     loc hash zero-contents-form
705     &optional body-form (for-insertion-p nil))
706     (let ((incr (gensym)))
707     `(let* ((,incr (hash2-increment ,hash))
708     (,location-var ,loc)
709 ram 1.2 (,contents-var 0))
710 ram 1.1 (declare (fixnum ,location-var ,contents-var ,incr))
711     (loop (setf ,location-var
712     (rem (+ ,location-var ,incr) (the fixnum *dictionary-size*)))
713     (setf ,contents-var (dictionary-ref ,location-var))
714     (if (zerop ,contents-var) (return ,zero-contents-form))
715     ,@(if for-insertion-p
716 ram 1.2 `((if (= ,contents-var spell-deleted-entry)
717     (return ,zero-contents-form))))
718 ram 1.1 (if (= ,location-var ,loc) (return nil))
719     ,@(if body-form `(,body-form))))))
720    
721     ) ;eval-when
722    
723    
724     ;;; LOOKUP-ENTRY returns the index of the first element of entry's
725     ;;; descriptor unit on success, otherwise nil.
726     ;;;
727     (defun lookup-entry (entry &optional len)
728     (declare (simple-string entry))
729     (let* ((entry-len (or len (length entry)))
730     (hash (string-hash entry entry-len))
731     (hash-and-len (dpb (the fixnum (ldb new-hash-byte hash))
732     stored-hash-byte
733     (the fixnum entry-len)))
734     (loc (rem hash (the fixnum *dictionary-size*)))
735     (loc-contents (dictionary-ref loc)))
736     (declare (fixnum entry-len hash hash-and-len loc))
737     (cond ((zerop loc-contents) nil)
738     ((found-entry-p loc-contents entry entry-len hash-and-len)
739     loc-contents)
740     (t
741     (hash2-loop (loop-loc loc-contents) loc hash
742     nil
743     (if (found-entry-p loc-contents entry entry-len hash-and-len)
744     (return loc-contents)))))))
745    
746     ;;;; Binary File Reading
747    
748     (defparameter default-binary-dictionary
749 ram 1.2 "library:spell-dictionary.bin")
750 ram 1.1
751     ;;; This is the first thing in a spell binary dictionary file to serve as a
752     ;;; quick check of its proposed contents. This particular number is
753     ;;; "BILLS" on a calculator held upside-down.
754     ;;;
755     (defconstant magic-file-id 57718)
756    
757     ;;; These constants are derived from the order things are written to the
758     ;;; binary dictionary in Spell-Build.Lisp.
759     ;;;
760     (defconstant magic-file-id-loc 0)
761     (defconstant dictionary-size-loc 1)
762     (defconstant descriptors-size-loc 2)
763     (defconstant string-table-size-low-byte-loc 3)
764     (defconstant string-table-size-high-byte-loc 4)
765     (defconstant file-header-bytes 10)
766    
767     ;;; Initially, there are no free descriptor elements and string table bytes,
768     ;;; but when these structures are grown, they are grown by more than that
769     ;;; which is necessary.
770     ;;;
771     (defvar *free-descriptor-elements* 0)
772     (defvar *free-string-table-bytes* 0)
773    
774     ;;; READ-DICTIONARY opens the dictionary and sets up the global structures
775     ;;; manifesting the spelling dictionary. When computing the start addresses
776     ;;; of these structures, we multiply by two since their sizes are in 16bit
777     ;;; lengths while the RT is 8bit-byte addressable.
778     ;;;
779     (defun read-dictionary (&optional (f default-binary-dictionary))
780     (when *dictionary-read-p*
781     (setf *dictionary-read-p* nil)
782     (deallocate-bytes (system-address *dictionary*)
783     (* 2 (the fixnum *dictionary-size*)))
784     (deallocate-bytes (system-address *descriptors*)
785     (* 2 (the fixnum
786     (+ (the fixnum *descriptors-size*)
787     (the fixnum *free-descriptor-elements*)))))
788     (deallocate-bytes (system-address *string-table*)
789     (+ (the fixnum *string-table-size*)
790     (the fixnum *free-string-table-bytes*))))
791     (setf *free-descriptor-elements* 0)
792     (setf *free-string-table-bytes* 0)
793     (let* ((fd (open-dictionary f))
794     (header-info (read-dictionary-structure fd file-header-bytes)))
795     (unless (= (sapref header-info magic-file-id-loc) magic-file-id)
796     (deallocate-bytes (system-address header-info) file-header-bytes)
797     (error "File is not a dictionary: ~S." f))
798     (setf *dictionary-size* (sapref header-info dictionary-size-loc))
799     (setf *descriptors-size* (sapref header-info descriptors-size-loc))
800     (setf *string-table-size* (sapref header-info string-table-size-low-byte-loc))
801     (setf (ldb (byte 12 16) (the fixnum *string-table-size*))
802     (the fixnum (sapref header-info string-table-size-high-byte-loc)))
803     (deallocate-bytes (system-address header-info) file-header-bytes)
804     (setf *dictionary*
805     (read-dictionary-structure fd (* 2 (the fixnum *dictionary-size*))))
806     (setf *descriptors*
807     (read-dictionary-structure fd (* 2 (the fixnum *descriptors-size*))))
808     (setf *string-table* (read-dictionary-structure fd *string-table-size*))
809     (setf *dictionary-read-p* t)
810     (close-dictionary fd)))

  ViewVC Help
Powered by ViewVC 1.1.5