/[cmucl]/src/benchmarks/cascor1.lisp

Revision 1.6 - Thu Sep 7 02:18:49 2000 UTC by dtc
Branch: MAIN
Changes since 1.5: +1 -16 lines
Update the setup of the random state.

;;; -*- Mode: Lisp; Package: User -*-
;;; ***************************************************************************
;;; Common Lisp implementation of Cascade-Correlation learning algorithm.
;;; This version for export. Non-portable user-interface stuff excised.
;;;
;;; Written by: Scott E. Fahlman
;;;             School of Computer Science
;;;             Carnegie-Mellon University
;;;             Pittsburgh, PA 15217
;;;
;;; Phone: (412) 268-2575
;;; Internet: fahlman@cs.cmu.edu
;;;
;;; This code has been placed in the public domain by the author. As a
;;; matter of simple courtesy, anyone using or adapting this code is
;;; expected to acknowledge the source. The author would like to hear
;;; about any attempts to use this system, successful or not.
;;;
;;; For an explanation of this algorithm and some results, see "The
;;; Cascade-Correlation Learning Architecture" by Scott E. Fahlman and
;;; Christian Lebiere in D. S. Touretzky (ed.), "Advances in Neural
;;; Information Processing Systems 2", Morgan Kaufmann, 1990. A somewhat
;;; longer version is available as CMU Computer Science Tech Report
;;; CMU-CS-90-100.
;;;
;;; ***************************************************************************
;;; EDIT HISTORY SINCE FIRST RELEASE:
;;;
;;; 8/24/90:
;;; Modified TEST-EPOCH so that it wouldn't mess up error statistics being
;;; passed from the output-training to input-training phase. Thanks to
;;; Scott Crowder for spotting this.
;;;
;;; 6/1/90:
;;; Fixed bug in INSTALL-NEW-UNIT. New unit's initial weight was being
;;; computed using *CAND-COR* values of the successful candidate, which is
;;; already zero. Now uses *CAND-PREV-COR*. Thanks to Tim Howells for
;;; spotting this.
;;;
;;; Modify BUILD-NET to check that *MAX-UNITS* is large enough. A couple of
;;; people got mysterious failures the first time they used lots of inputs.
;;;
;;; Made a small change in QUICKPROP-UPDATE to prevent rare divide-by-zero
;;; errors when p = s = 0.0. Thanks to Tom Dietterich.
;;;
;;; Added CHANGED-TRAINING-SET, which should be called when the training set
;;; is changed but you don't want to reinitialize the net. This rebuilds
;;; the caches.
;;;
;;; 11/9/90:
;;; Added some additional type declarations for maximum speed under certain
;;; Common Lisp compilers.
;;; ***************************************************************************
;;;
(in-package "USER")

;;; This proclamation buys a certain amount of overall speed at the expense
;;; of runtime checking. Comment it out when debugging new, bug-infested code.
;;;
#+declare-unsafe
(proclaim '(optimize (speed 3) (space 0) (safety 0)))

;;; Style note: Because some of these runs take a long time, this code is
;;; extensively hacked for good performance under a couple of Common Lisp
;;; systems, some of which have poor performance on multi-dimensional
;;; arrays and some of which have weak type-inference in the compiler.
;;; Elegance and clarity have in some cases been sacrificed for speed.

;;; In some problems, floating point underflow errors may occur as a result
;;; of weight-decay and other operations. Most Common Lisp implementations
;;; have an option to turn floating underflows into zero values without
;;; signalling an error. You should enable this facility if it is
;;; available. If not, you'll either have to write a condition handler for
;;; floating underflows or live with the occasional underflow error.

;;; In CMU Common Lisp, we use the following incantation:
;;;   (setq extensions:*ignore-floating-point-underflow* t)
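;;;
;;; As a sketch, the same form guarded so that it only takes effect under
;;; CMUCL (the EXTENSIONS package is CMUCL-specific; other Lisps need
;;; their own implementation-specific switch):
#+cmu
(setq extensions:*ignore-floating-point-underflow* t)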


;;; Compensate for the clumsy Common Lisp declaration system and weak
;;; type-inference in some Common Lisp compilers.

;;; INCF-SF, *SF, etc. are like INCF, *, etc., but they declare their
;;; operands and results to be short-floats. The code gets unreadable
;;; quickly if you insert all these declarations by hand.

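;;; For example, (incf-sf sum delta) expands into
;;;   (the short-float (incf (the short-float sum) (the short-float delta)))
;;; so even a compiler with weak type inference can keep the update in
;;; unboxed short-float arithmetic.
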
(defmacro incf-sf (place &optional (increment 1.0))
  `(the short-float (incf (the short-float ,place)
                          (the short-float ,increment))))

(defmacro decf-sf (place &optional (increment 1.0))
  `(the short-float (decf (the short-float ,place)
                          (the short-float ,increment))))

(defmacro *sf (&rest args)
  `(the short-float
        (* ,@(mapcar #'(lambda (x) (list 'the 'short-float x)) args))))

(defmacro +sf (&rest args)
  `(the short-float
        (+ ,@(mapcar #'(lambda (x) (list 'the 'short-float x)) args))))

(defmacro -sf (&rest args)
  `(the short-float
        (- ,@(mapcar #'(lambda (x) (list 'the 'short-float x)) args))))

(defmacro /sf (&rest args)
  `(the short-float
        (/ ,@(mapcar #'(lambda (x) (list 'the 'short-float x)) args))))

;;; DOTIMES1 is like DOTIMES, only with the loop counter declared as a
;;; fixnum. This is for compilers with weak type inference.

(defmacro dotimes1 (form1 &body body)
  `(dotimes ,form1 (declare (fixnum ,(car form1))) . ,body))

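;;; For example, (dotimes1 (i n) (f i)) expands into
;;;   (dotimes (i n) (declare (fixnum i)) (f i))
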
;;; Create vector-access forms similar to SVREF, but for vectors of
;;; element-type SHORT-FLOAT and FIXNUM.

(eval-when (compile load eval)
  (defconstant fvector-type
    (array-element-type (make-array '(1) :element-type 'short-float)))
  (defconstant ivector-type
    (array-element-type (make-array '(1) :element-type 'fixnum))))

(defmacro fvref (a i)
  "Like SVREF, but with vectors of element-type SHORT-FLOAT."
  (if (eq fvector-type t)
      `(the short-float (svref ,a ,i))
      `(the short-float
            (aref (the (simple-array ,fvector-type (*)) ,a) ,i))))

(defmacro ivref (a i)
  "Like SVREF, but with vectors of element-type FIXNUM."
  (if (eq ivector-type t)
      `(the fixnum (svref ,a ,i))
      `(the fixnum
            (aref (the (simple-array ,ivector-type (*)) ,a) ,i))))
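
;;; For example, on an implementation that specializes short-float vectors,
;;; (fvref *values* i) becomes an AREF with an exact simple-array type
;;; declaration, so the access can be open-coded instead of going through
;;; a generic AREF; otherwise it falls back to a declared SVREF.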


;;;; Assorted Parameters and Controls.

;;; These parameters and switches control the quickprop learning algorithm
;;; used to train the output weights and candidate units.

(defvar *unit-type* :sigmoid
  "The type of activation function used by the hidden units. Options
  currently implemented are :sigmoid, :asigmoid, and :gaussian. Sigmoid is
  symmetric in range -0.5 to +0.5, while Asigmoid is asymmetric, 0.0 to
  1.0.")

(defvar *output-type* :sigmoid
  "The activation function to use on the output units. Options currently
  implemented are :linear and :sigmoid.")

(defvar *raw-error* nil
  "If T, candidate units will try to correlate with the raw difference
  between actual and desired outputs. Else, they use the difference modified
  by the derivative of the output unit activation function.")

(defvar *sigmoid-prime-offset* 0.1
  "This is added to the derivative of the sigmoid function to prevent the
  system from getting stuck at the points where sigmoid-prime goes to
  zero.")
(proclaim '(short-float *sigmoid-prime-offset*))

(defvar *weight-range* 1.0
  "Input weights in the network get initial random values between plus and
  minus *weight-range*. This parameter also controls the initial weights
  on direct input-to-output links.")
(proclaim '(short-float *weight-range*))

(defvar *weight-multiplier* 1.0
  "The output weights for candidate units get an initial value that is the
  negative of the correlation times this factor.")
(proclaim '(short-float *weight-multiplier*))

(defvar *output-mu* 2.0
  "Mu parameter used for quickprop training of output weights. The
  step size is limited to mu times the previous step.")
(proclaim '(short-float *output-mu*))

(defvar *output-shrink-factor* (/ *output-mu* (+ 1.0 *output-mu*))
  "Derived from *output-mu*. Used in computing whether the proposed step is
  too large.")
(proclaim '(short-float *output-shrink-factor*))

(defvar *output-epsilon* 0.35
  "Controls the amount of linear gradient descent to use in updating
  output weights.")
(proclaim '(short-float *output-epsilon*))

(defvar *output-decay* 0.0001
  "This factor times the current weight is added to the slope at the
  start of each output-training epoch. Keeps weights from growing too big.")
(proclaim '(short-float *output-decay*))

(defvar *output-patience* 8
  "If we go for this many epochs with no significant change, it's time to
  stop tuning. If 0, go on forever.")
(proclaim '(fixnum *output-patience*))

(defvar *output-change-threshold* 0.01
  "The error must change by at least this fraction of its old value in
  order to count as a significant change.")
(proclaim '(short-float *output-change-threshold*))

(defvar *input-mu* 2.0
  "Mu parameter used for quickprop training of input weights. The
  step size is limited to mu times the previous step.")
(proclaim '(short-float *input-mu*))

(defvar *input-shrink-factor* (/ *input-mu* (+ 1.0 *input-mu*))
  "Derived from *input-mu*. Used in computing whether the proposed step is
  too large.")
(proclaim '(short-float *input-shrink-factor*))

(defvar *input-epsilon* 1.0
  "Controls the amount of linear gradient descent to use in updating
  unit input weights.")
(proclaim '(short-float *input-epsilon*))

(defvar *input-decay* 0.0
  "This factor times the current weight is added to the slope at the
  start of each input-training epoch. Keeps weights from growing too big.")
(proclaim '(short-float *input-decay*))

(defvar *input-patience* 8
  "If we go for this many epochs with no significant change, it's time to
  stop tuning. If 0, go on forever.")
(proclaim '(fixnum *input-patience*))

(defvar *input-change-threshold* 0.03
  "The correlation score for the best unit must change by at least
  this fraction of its old value in order to count as a significant
  change.")
(proclaim '(short-float *input-change-threshold*))

;;; Variables related to error and correlation.

(defvar *score-threshold* 0.4
  "An output is counted as correct for a given case if the difference
  between that output and the desired value is smaller in magnitude than
  this value.")
(proclaim '(short-float *score-threshold*))

(defvar *error-bits* 0
  "Count number of bits in epoch that are wrong by more than
  *SCORE-THRESHOLD*.")
(proclaim '(fixnum *error-bits*))

(defvar *true-error* 0.0
  "The sum-squared error at the network outputs. This is the value the
  algorithm is ultimately trying to minimize.")
(proclaim '(short-float *true-error*))

(defvar *sum-error* 0.0
  "Accumulate the sum of the error values after output training phase.")
(proclaim '(short-float *sum-error*))

(defvar *sum-sq-error* 0.0
  "Accumulate the sum of the squared error values after output
  training phase.")
(proclaim '(short-float *sum-sq-error*))

(defvar *avg-error* 0.0
  "Holds the average of error values after output training phase.")
(proclaim '(short-float *avg-error*))

(defvar *best-candidate-score* 0.0
  "The best correlation score found among all candidate units being
  trained.")
(proclaim '(short-float *best-candidate-score*))

(defvar *best-candidate* 0
  "The index of the candidate unit whose correlation score is best
  at present.")
(proclaim '(fixnum *best-candidate*))

;;; These variables and switches control the simulation and display.

(defvar *use-cache* t
  "If T, cache the forward-pass values instead of repeatedly
  computing them. This can save a *lot* of time if all the cached values
  fit into memory.")

(defparameter *epoch* 0
  "Count of the number of times the entire training set has been presented.")
(proclaim '(fixnum *epoch*))

(defvar *test* nil
  "If T, run a test epoch every so often during output training.")

(defvar *test-interval* 0
  "Run a test epoch every *test-interval* output training cycles.")
(proclaim '(fixnum *test-interval*))

(defvar *single-pass* nil
  "When on, pause after next forward/backward cycle.")

(defvar *single-epoch* nil
  "When on, pause after next training epoch.")

(defparameter *step* nil
  "Turned briefly to T in order to continue after a pause.")

;;; The sets of training inputs and outputs are stored in parallel vectors.
;;; Each element is a SIMPLE-VECTOR holding short-float values, one for
;;; each input or output. Note: this is a simple vector, not a specialized
;;; vector of element-type short-float.
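
;;; For example, a hypothetical two-input, one-output training set (not
;;; part of the original benchmark, just an illustration of the layout)
;;; would be set up as:
;;;
;;;   (setq *training-inputs*
;;;         (vector (vector 0.0 0.0) (vector 0.0 1.0)
;;;                 (vector 1.0 0.0) (vector 1.0 1.0)))
;;;   (setq *training-outputs*
;;;         (vector (vector -0.5) (vector 0.5) (vector 0.5) (vector -0.5)))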

(defvar *training-inputs* (make-array 0)
  "Vector of input patterns for training the net.")
(proclaim '(simple-vector *training-inputs*))

(defvar *training-outputs* (make-array 0)
  "Vector of output patterns for training the net.")
(proclaim '(simple-vector *training-outputs*))

(defvar *goal* (make-array 0)
  "The goal vector for the current training or testing case.")
(proclaim '(simple-vector *goal*))

(defvar *max-cases* 0
  "Maximum number of training cases that can be accommodated by the current
  data structures.")
(proclaim '(fixnum *max-cases*))

(defvar *ncases* 0
  "Number of training cases currently in use. Assume a contiguous block
  beginning with *FIRST-CASE*.")
(proclaim '(fixnum *ncases*))

(defvar *first-case* 0
  "Address of the first training case in the currently active set. Usually
  zero, but may differ if we are training on different chunks of the training
  set at different times.")
(proclaim '(fixnum *first-case*))

;;; For some benchmarks there is a separate set of values used for testing
;;; the network's ability to generalize. These values are not used during
;;; training.

(defvar *test-inputs* '#()
  "Vector of input patterns for testing the net.")
(proclaim '(simple-vector *test-inputs*))

(defvar *test-outputs* '#()
  "Vector of output patterns for testing the net.")
(proclaim '(simple-vector *test-outputs*))


;;;; Fundamental data structures.

;;; Unit values and weights are short flonums.

;;; Instead of representing each unit by a structure, we represent the
;;; unit by a fixnum. This is used to index into various vectors that hold
;;; per-unit information, such as the activation value of each unit.
;;; So the information concerning a given unit is found in a slice of values
;;; across many vectors, all with the same unit-index.

;;; Per-connection information for each connection COMING INTO a unit is
;;; stored in a vector of vectors. The outer vector is indexed by the unit
;;; number, and the inner vector is then indexed by connection number.
;;; This is a sleazy way of implementing a 2-D array, faster in most Lisp
;;; systems than multiplying to do the index arithmetic, and more efficient
;;; if the units are sparsely connected.

;;; Unit 0, the "bias unit", is always at a maximum-on value. Next come
;;; some input "units", then some hidden units.

;;; Output units have their own separate set of data structures and
;;; indices. The units and outputs together form the "active" network.
;;; There are also separate data structures and indices for the "candidate"
;;; units that have not yet been added to the network.
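
;;; For example, everything about hidden unit J lives at index J of these
;;; parallel vectors: (fvref *values* j) is its activation,
;;; (ivref *nconnections* j) its fan-in, (svref *connections* j) the vector
;;; of source-unit indices, and (svref *weights* j) the matching weights.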

(defvar *max-units* 30
  "Maximum number of input values and hidden units in the network.")
(proclaim '(fixnum *max-units*))

(defvar *ninputs* 0
  "Number of inputs for this problem.")
(proclaim '(fixnum *ninputs*))

(defvar *noutputs* 0
  "Number of outputs for this problem.")
(proclaim '(fixnum *noutputs*))

(defvar *nunits* 0
  "Current number of active units in the network. This count includes all
  inputs to the network and the bias unit.")
(proclaim '(fixnum *nunits*))

(defvar *ncandidates* 8
  "Number of candidate units whose inputs will be trained at once.")
(proclaim '(fixnum *ncandidates*))

;;; The following vectors hold values related to hidden units in the active
;;; net and their input weights. The vectors are created by BUILD-NET, after
;;; the dimension variables have been set up.

(defvar *values* nil
  "Vector holding the current activation value for each unit and input in
  the active net.")

(defvar *values-cache* nil
  "Holds a distinct *VALUES* vector for each of the *MAX-CASES* training
  cases. Once we have computed the *VALUES* vector for each training case,
  we can use it repeatedly until the weights or training cases change.")

(defvar *extra-values* nil
  "Extra values vector to use when not using the cache.")

;;; Note: the *NCONNECTIONS* and *CONNECTIONS* vectors could be eliminated
;;; if we wanted to commit to total connectivity for all units.
;;; For now, we want to allow for sparse or irregular connectivity.

(defvar *nconnections* nil
  "Vector holding the number of incoming connections for each unit.")

(defvar *connections* nil
  "Vector that holds a connection vector for each unit J.
  Each entry in the connection vector holds a unit index I,
  indicating that this connection is from I to J.")

(defvar *weights* nil
  "Vector of vectors with structure parallel to the *connections* vector.
  Each entry gives the weight associated with an incoming connection.")

;;; The following vectors hold values for the outputs of the active
;;; network and the output-side weights.

(defvar *outputs* nil
  "Vector holding the network output values.")

(defvar *errors* nil
  "Vector holding the current error value for each output.")

(defvar *errors-cache* nil
  "Holds a distinct *ERRORS* vector for each of the *MAX-CASES* training
  cases. Once we have computed the *ERRORS* vector for a given training
  case, we can use it repeatedly until the weights or the training cases
  change.")

(defvar *extra-errors* nil
  "Extra errors vector to use when not using the cache.")

(defvar *output-weights* nil
  "Vector of vectors. For each output, we have a vector of output weights
  coming from the unit indicated by the index.")

(defvar *output-deltas* nil
  "Vector of vectors, parallel with output weights. Each entry is the
  amount by which the corresponding output weight was changed last time.")

(defvar *output-slopes* nil
  "Vector of vectors, parallel with output weights. Each entry is the
  partial derivative of the total error with respect to the corresponding
  weight.")

(defvar *output-prev-slopes* nil
  "Vector of vectors, parallel with output weights. Each entry is the
  previous value of the corresponding *OUTPUT-SLOPES* entry.")

(defvar *output-weights-record* nil
  "The vector of output weights is recorded here after each output-training
  phase and just prior to the addition of the next unit. This record
  allows us to reconstruct the network's performance at each of these
  points in time.")

;;; The following vectors have one entry for each candidate unit in the
;;; pool of trainees.

(defvar *cand-sum-values* nil
  "For each candidate unit, the sum of its values over an entire
  training set.")

(defvar *cand-cor* nil
  "A vector with one entry for each candidate unit. This entry is a vector
  that holds the correlation between this unit's value and the residual
  error at each of the outputs, computed over a whole epoch.")

(defvar *cand-prev-cor* nil
  "Holds the *cand-cor* values computed in the previous candidate training
  epoch.")

(defvar *cand-weights* nil
  "A vector with one entry for each candidate unit. This entry is a vector
  that holds the current input weights for that candidate unit.")

(defvar *cand-deltas* nil
  "A vector with one entry for each candidate unit. This entry is a vector
  that holds the input weight deltas for that candidate unit.")

(defvar *cand-slopes* nil
  "A vector with one entry for each candidate unit. This entry is a vector
  that holds the input weight slopes for that candidate unit.")

(defvar *cand-prev-slopes* nil
  "A vector with one entry for each candidate unit. This entry is a vector
  that holds the previous values of the input weight slopes for that
  candidate unit.")

;;; At present, each candidate receives a connection from every input and
;;; pre-existing unit. Rather than cons up a new *connections* vector for
;;; each of these, we can just use this one for all of them.
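;;;
;;; For example, after INIT-NET this vector is simply #(0 1 2 ...), so a
;;; totally connected unit J can use its first J entries as its incoming
;;; connection list.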

(defvar *all-connections* nil
  "A *CONNECTIONS* vector that can be used by any unit that connects to
  all lower-numbered units, in order.")


;;;; Network-building utilities.

(defun build-net (ninputs noutputs)
  "Create the network data structures, given the number of input and output
  connections. Get *MAX-UNITS* and other dimensions from variables."
  (declare (fixnum ninputs noutputs))
  ;; Check to make sure *MAX-UNITS* is big enough.
  (unless (> *max-units* (+ ninputs 1))
    (error "*MAX-UNITS* must be greater than number of inputs plus 1."))
  ;; Fill in assorted variables and create top-level vectors.
  (setq *ninputs* ninputs
        *noutputs* noutputs
        *max-cases* (length *training-inputs*)
        *ncases* *max-cases*
        *first-case* 0
        *nunits* (+ 1 *ninputs*)
        *values-cache* (make-array *max-cases* :initial-element nil)
        *extra-values* (make-array *max-units*
                                   :element-type 'short-float
                                   :initial-element 0.0)
        *values* *extra-values*
        *nconnections* (make-array *max-units*
                                   :element-type 'fixnum
                                   :initial-element 0)
        *connections* (make-array *max-units* :initial-element nil)
        *weights* (make-array *max-units* :initial-element nil)
        *outputs* (make-array *noutputs*
                              :element-type 'short-float
                              :initial-element 0.0)
        *errors-cache* (make-array *max-cases* :initial-element nil)
        *extra-errors* (make-array *noutputs*
                                   :element-type 'short-float
                                   :initial-element 0.0)
        *errors* *extra-errors*
        *output-weights* (make-array *noutputs* :initial-element nil)
        *output-weights-record* (make-array *max-units* :initial-element nil)
        *output-deltas* (make-array *noutputs* :initial-element nil)
        *output-slopes* (make-array *noutputs* :initial-element nil)
        *output-prev-slopes* (make-array *noutputs* :initial-element nil)
        *cand-sum-values* (make-array *ncandidates*
                                      :element-type 'short-float
                                      :initial-element 0.0)
        *cand-cor* (make-array *ncandidates* :initial-element nil)
        *cand-prev-cor* (make-array *ncandidates* :initial-element nil)
        *cand-weights* (make-array *ncandidates* :initial-element nil)
        *cand-deltas* (make-array *ncandidates* :initial-element nil)
        *cand-slopes* (make-array *ncandidates* :initial-element nil)
        *cand-prev-slopes* (make-array *ncandidates* :initial-element nil))
  ;; Only create the caches if *USE-CACHE* is on -- may not always have room.
  (when *use-cache*
    (dotimes1 (i *max-cases*)
      (setf (svref *values-cache* i)
            (make-array *max-units*
                        :element-type 'short-float
                        :initial-element 0.0))
      (setf (svref *errors-cache* i)
            (make-array *noutputs*
                        :element-type 'short-float
                        :initial-element 0.0))))
  ;; For each output, create the vectors holding per-weight information.
  (dotimes1 (i *noutputs*)
    (setf (svref *output-weights* i)
          (make-array *max-units*
                      :element-type 'short-float
                      :initial-element 0.0))
    (setf (svref *output-deltas* i)
          (make-array *max-units*
                      :element-type 'short-float
                      :initial-element 0.0))
    (setf (svref *output-slopes* i)
          (make-array *max-units*
                      :element-type 'short-float
                      :initial-element 0.0))
    (setf (svref *output-prev-slopes* i)
          (make-array *max-units*
                      :element-type 'short-float
                      :initial-element 0.0)))
  ;; For each candidate unit, create the vectors holding the correlations,
  ;; incoming weights, and other stats.
  (dotimes1 (i *ncandidates*)
    (setf (svref *cand-cor* i)
          (make-array *noutputs*
                      :element-type 'short-float
                      :initial-element 0.0))
    (setf (svref *cand-prev-cor* i)
          (make-array *noutputs*
                      :element-type 'short-float
                      :initial-element 0.0))
    (setf (svref *cand-weights* i)
          (make-array *max-units*
                      :element-type 'short-float
                      :initial-element 0.0))
    (setf (svref *cand-deltas* i)
          (make-array *max-units*
                      :element-type 'short-float
                      :initial-element 0.0))
    (setf (svref *cand-slopes* i)
          (make-array *max-units*
                      :element-type 'short-float
                      :initial-element 0.0))
    (setf (svref *cand-prev-slopes* i)
          (make-array *max-units*
                      :element-type 'short-float
                      :initial-element 0.0))))

(defun random-weight ()
  "Select a random weight, uniformly distributed over the
  interval from minus to plus *weight-range*."
  (-sf (random (*sf 2.0 *weight-range*)) *weight-range*))

(defun init-net ()
  "Set up the network for a learning problem. Clean up all the data
  structures that may have become corrupted. Initialize the output weights
  to random values controlled by *weight-range*."
  ;; Set up the *ALL-CONNECTIONS* vector.
  (setq *all-connections*
        (make-array *max-units* :element-type 'fixnum))
  (dotimes1 (i *max-units*)
    (setf (ivref *all-connections* i) i))
  ;; Initialize the active unit data structures.
  (dotimes1 (i *max-units*)
    (setf (fvref *extra-values* i) 0.0)
    (setf (ivref *nconnections* i) 0)
    (setf (svref *connections* i) nil)
    (setf (svref *weights* i) nil)
    (setf (svref *output-weights-record* i) nil))
  ;; Initialize the per-output data structures.
  (dotimes1 (i *noutputs*)
    (setf (fvref *outputs* i) 0.0)
    (setf (fvref *extra-errors* i) 0.0)
    (let ((ow (svref *output-weights* i))
          (od (svref *output-deltas* i))
          (os (svref *output-slopes* i))
          (op (svref *output-prev-slopes* i)))
      (dotimes1 (j *max-units*)
        (setf (fvref ow j) 0.0)
        (setf (fvref od j) 0.0)
        (setf (fvref os j) 0.0)
        (setf (fvref op j) 0.0))
      ;; Set up initial random weights for the input-to-output connections.
      (dotimes1 (j (1+ *ninputs*))
        (setf (fvref ow j) (random-weight)))))
  ;; Initialize the caches if they are in use.
  (when *use-cache*
    (dotimes1 (j *max-cases*)
      (let ((v (svref *values-cache* j))
            (e (svref *errors-cache* j)))
        (dotimes1 (i *max-units*)
          (setf (fvref v i) 0.0))
        (dotimes1 (i *noutputs*)
          (setf (fvref e i) 0.0)))))
  ;; Candidate units get initialized in a separate routine.
  (init-candidates)
  ;; Do some other assorted housekeeping.
  (setf (fvref *extra-values* 0) 1.0)
  (setq *epoch* 0)
  (setq *nunits* (+ 1 *ninputs*))
  (setq *error-bits* 0)
  (setq *true-error* 0.0)
  (setq *sum-error* 0.0)
  (setq *sum-sq-error* 0.0)
  (setq *best-candidate-score* 0.0)
  (setq *best-candidate* 0))

(defun changed-training-set ()
  "Call this instead of BUILD-NET and INIT-NET if you want to leave
  existing hidden units in place and start from here with new training
  examples. Assumes that the number of net inputs and outputs remains the
  same, but the number of cases may have changed. Rebuilds the caches."
  (setq *max-cases* (length *training-inputs*)
        *ncases* *max-cases*
        *first-case* 0
        *values-cache* (make-array *max-cases* :initial-element nil)
        *errors-cache* (make-array *max-cases* :initial-element nil))
  ;; Only create the caches if *USE-CACHE* is on -- may not always have room.
  (when *use-cache*
    (dotimes1 (i *max-cases*)
      (setf (svref *errors-cache* i)
            (make-array *noutputs*
                        :element-type 'short-float
                        :initial-element 0.0))
      (setq *values* (make-array *max-units*
                                 :element-type 'short-float
                                 :initial-element 0.0))
      (setf (svref *values-cache* i) *values*)
      (set-up-inputs (svref *training-inputs* i))
      (do ((j (1+ *ninputs*) (1+ j)))
          ((= j *nunits*))
        (declare (fixnum j))
        (compute-unit-value j)))))

;;;; Utilities for learning.

(proclaim '(inline activation activation-prime))

(defun activation (sum)
  "Given the sum of weighted inputs, compute the unit's activation value.
  Defined unit types are :sigmoid, :asigmoid, and :gaussian."
  (declare (short-float sum))
  (ecase *unit-type*
    (:sigmoid
     ;; Symmetric sigmoid function in range -0.5 to +0.5.
     (cond ((< sum -15.0) -0.5)
           ((> sum 15.0) +0.5)
           (t (-sf (/sf (+sf 1.0 (exp (-sf sum)))) 0.5))))
    (:asigmoid
     ;; Asymmetric sigmoid in range 0.0 to 1.0.
     (cond ((< sum -15.0) 0.0)
           ((> sum 15.0) 1.0)
           (t (/sf 1.0 (+sf 1.0 (exp (-sf sum)))))))
    (:gaussian
     ;; Gaussian activation function in range 0.0 to 1.0.
     (let ((x (*sf -0.5 sum sum)))
       (if (< x -75.0) 0.0 (exp x))))))

;;; Note: do not use *SIGMOID-PRIME-OFFSET* here, as it confuses the
;;; correlation machinery. But do use it in output-prime, since it does no
;;; harm there and the output units often get stuck at extreme values.

(defun activation-prime (value sum)
  "Given the unit's activation value and sum of weighted inputs, compute
  the derivative of the activation with respect to the sum. Defined unit
  types are :sigmoid, :asigmoid, and :gaussian."
  (declare (short-float value sum))
  (ecase *unit-type*
    (:sigmoid
     (-sf 0.25 (*sf value value)))
    (:asigmoid
     (*sf value (-sf 1.0 value)))
    (:gaussian
     (*sf (-sf value) sum))))
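
;;; For the symmetric sigmoid, VALUE = 1/(1 + exp(-SUM)) - 0.5, so its
;;; derivative with respect to SUM is (VALUE + 0.5)(0.5 - VALUE), which
;;; simplifies to the 0.25 - VALUE^2 form used above.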

(proclaim '(inline output-function output-prime))

(defun output-function (sum)
  "Compute the value of an output, given the weighted sum of incoming values.
  Defined output types are :sigmoid and :linear."
  (declare (short-float sum))
  (ecase *output-type*
    (:sigmoid (cond ((< sum -15.0) -0.5)
                    ((> sum 15.0) +0.5)
                    (t (-sf (/sf 1.0 (+sf 1.0 (exp (-sf sum)))) 0.5))))
    (:linear sum)))

(defun output-prime (output)
  "Compute the derivative of an output with respect to the weighted sum of
  incoming values. Defined output types are :sigmoid and :linear."
  (declare (short-float output))
  (ecase *output-type*
    (:sigmoid
     (+sf *sigmoid-prime-offset* (-sf 0.25 (*sf output output))))
    (:linear 1.0)))

;;; The basic routine for doing Quickprop-style update of weights.
;;; Distilled essence of a year's work...
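;;;
;;; The quadratic term below comes from fitting a parabola through the
;;; current slope S and previous slope P, measured one step D apart, and
;;; jumping to the parabola's minimum:
;;;   next-step = D * S / (P - S)
;;; The step is clipped to MU times the previous step when S is close to
;;; (or beyond) SHRINK-FACTOR * P, which also avoids dividing by a
;;; near-zero P - S.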

(proclaim '(inline quickprop-update))

(defun quickprop-update (i weights deltas slopes prevs
                           epsilon decay mu shrink-factor)
  "Given vectors holding weights, deltas, slopes, and previous slopes,
  and an index i, update weight(i) and delta(i) appropriately. Move
  slope(i) to prev(i) and zero out slope(i). Add weight decay term to
  each slope before doing the update."
  (let* ((w (fvref weights i))
         (d (fvref deltas i))
         (s (+sf (fvref slopes i) (*sf decay w)))
         (p (fvref prevs i))
         (next-step 0.0))
    (declare (short-float w p s d next-step))
    ;; The step must always be downhill.
    (cond
      ;; If last step was negative...
      ((minusp d)
       ;; First, add in linear term if current slope is still positive.
       (when (plusp s)
         (decf-sf next-step (*sf epsilon s)))
       (cond
         ;; If current slope is close to or larger than prev slope...
         ((>= s (*sf shrink-factor p))
          ;; Take maximum size negative step.
          (incf-sf next-step (*sf mu d)))
         ;; Else, use quadratic estimate.
         (t (incf-sf next-step (*sf d (/sf s (-sf p s)))))))
      ;; If last step was positive...
      ((plusp d)
       ;; First, add in linear term if current slope is still negative.
       (when (minusp s)
         (decf-sf next-step (*sf epsilon s)))
       (cond
         ;; If current slope is close to or more neg than prev slope...
         ((<= s (*sf shrink-factor p))
          ;; Take maximum size positive step.
          (incf-sf next-step (*sf mu d)))
         ;; Else, use quadratic estimate.
         (t (incf-sf next-step (*sf d (/sf s (-sf p s)))))))
      ;; Last step was zero, so use only linear term.
      (t (decf-sf next-step (*sf epsilon s))))
    ;; Having computed the next step, update the data vectors.
    (setf (fvref deltas i) next-step)
    (setf (fvref weights i) (+sf w next-step))
    (setf (fvref prevs i) s)
    (setf (fvref slopes i) 0.0)
    nil))


;;;; Machinery for training output weights.

(defun set-up-inputs (input)
  "Set up all the inputs from the INPUT vector as the first few entries
  in the values vector."
  (declare (simple-vector input))
  (setf (fvref *values* 0) 1.0)
  (dotimes1 (i *ninputs*)
    (setf (fvref *values* (1+ i))
          (the short-float (svref input i)))))

(defun output-forward-pass ()
  "Assume the *VALUES* vector has been set up. Just compute the network's
  outputs."
  (dotimes1 (j *noutputs*)
    (let ((ow (svref *output-weights* j))
          (sum 0.0))
      (declare (short-float sum))
      (dotimes1 (i *nunits*)
        (incf-sf sum (*sf (fvref *values* i) (fvref ow i))))
      (setf (fvref *outputs* j)
            (output-function sum)))))

(defun compute-unit-value (j)
  "Assume that the *VALUES* vector has been set up for all units with index
  less than J. Compute and record the value for unit J."
  (declare (fixnum j))
  (let* ((c (svref *connections* j))
         (w (svref *weights* j))
         (sum 0.0))
    (declare (short-float sum))
    (dotimes1 (i (ivref *nconnections* j))
      (incf-sf sum (*sf (fvref *values* (ivref c i))
                        (fvref w i))))
    (setf (fvref *values* j) (activation sum))
    nil))

(defun full-forward-pass (input)
  "Set up the inputs from the INPUT vector, then propagate activation values
  forward through all hidden units and output units."
  (set-up-inputs input)
  ;; For each hidden unit J, compute the activation value.
  (do ((j (1+ *ninputs*) (1+ j)))
      ((= j *nunits*))
    (declare (fixnum j))
    (compute-unit-value j))
  ;; Now compute outputs.
  (output-forward-pass))

;;; Note: We fill the *ERRORS* vector and related statistics with either
;;; the raw error or the error after modification by output-prime,
;;; depending on the *RAW-ERROR* switch. This controls what form of error
;;; the candidate units try to correlate with. All experiments reported in
;;; TR CMU-CS-90-100 assume *RAW-ERROR* is NIL, but this might not always
;;; be the best choice.

(defun compute-errors (goal output-slopes-p stats-p)
  "GOAL is a vector of desired values for the output units. Compute and
  record the output errors for the current training case. If
  OUTPUT-SLOPES-P is T, then use errors to compute slopes for output
  weights. If STATS-P is T, accumulate error statistics."
  (declare (simple-vector goal))
  (dotimes1 (j *noutputs*)
    (let* ((out (fvref *outputs* j))
           (dif (-sf out (svref goal j)))
           (err-prime (*sf dif (output-prime out)))
           (os (svref *output-slopes* j)))
      (declare (short-float dif err-prime))
      (when stats-p
        (unless (< (abs dif) *score-threshold*)
          (incf *error-bits*))
        (incf-sf *true-error* (*sf dif dif)))
      (cond (*raw-error*
             (setf (fvref *errors* j) dif)
             (incf-sf *sum-error* dif)
             (incf-sf *sum-sq-error* (*sf dif dif)))
            (t
             (setf (fvref *errors* j) err-prime)
             (incf-sf *sum-error* err-prime)
             (incf-sf *sum-sq-error* (*sf err-prime err-prime))))
      (when output-slopes-p
        (dotimes1 (i *nunits*)
          (incf-sf (fvref os i) (*sf err-prime (fvref *values* i))))))))

;;; Note: Scaling *OUTPUT-EPSILON* by the number of cases seems to keep the
;;; quickprop update in a good range across many different-sized training
;;; sets, but it's something of a hack. Choosing good epsilon values
;;; still requires some trial and error.

(defun update-output-weights ()
  "Update the output weights, using the pre-computed slopes, prev-slopes,
  and delta values. Uses the quickprop update function."
  (let ((eps (/ *output-epsilon* *ncases*)))
    (dotimes1 (j *noutputs*)
      (let ((ow (svref *output-weights* j))
            (od (svref *output-deltas* j))
            (os (svref *output-slopes* j))
            (op (svref *output-prev-slopes* j)))
        (dotimes1 (i *nunits*)
          (quickprop-update i ow od os op eps *output-decay*
                            *output-mu* *output-shrink-factor*))))))


;;;; Outer loops for training output weights.

(defun train-outputs-epoch ()
  "Perform forward propagation once for each training case, computing
  errors and slopes. Then update the output weights."
  ;; Zero error accumulators.
  (setq *error-bits* 0)
  (setq *true-error* 0.0)
  (setq *sum-error* 0.0)
  (setq *sum-sq-error* 0.0)
  ;; User may have changed mu between epochs, so fix shrink-factor.
  (setq *output-shrink-factor*
        (/sf *output-mu* (+sf 1.0 *output-mu*)))
  ;; Now run through the training examples.
  (do ((i *first-case* (1+ i)))
      ((= i (the fixnum (+ *first-case* *ncases*))))
    (declare (fixnum i))
    (setq *goal* (svref *training-outputs* i))
    (cond (*use-cache*
           (setq *values* (svref *values-cache* i))
           (setq *errors* (svref *errors-cache* i))
           (output-forward-pass))
          (t (setq *values* *extra-values*)
             (setq *errors* *extra-errors*)
             (full-forward-pass (svref *training-inputs* i))))
    (compute-errors *goal* t t))
  ;; Do not change weights or count epoch if this run was perfect.
  (unless (= 0 *error-bits*)
    (update-output-weights)
    (incf *epoch*)))

(defun record-output-weights ()
  "Store the output weights developed after each output-training phase
  in the *output-weights-record* vector."
  (let ((record (make-array *noutputs* :initial-element nil)))
    (dotimes1 (o *noutputs*)
      (let ((original (svref *output-weights* o))
            (copy (make-array *nunits* :element-type 'short-float
                              :initial-element 0.0)))
        (dotimes1 (u *nunits*)
          (setf (fvref copy u) (fvref original u)))
        (setf (svref record o) copy)))
    (setf (svref *output-weights-record* (1- *nunits*)) record)))

(defun train-outputs (max-epochs)
  "Train the output weights. If we exhaust MAX-EPOCHS, stop with value
  :TIMEOUT. If there are zero error bits, stop with value :WIN. Else,
  keep going until the true error has not changed by a significant amount
  for *OUTPUT-PATIENCE* epochs. Then return :STAGNANT. If
  *OUTPUT-PATIENCE* is zero, we do not stop until victory or until
  MAX-EPOCHS is used up."
  (declare (fixnum max-epochs))
  (let ((last-error 0.0)
        (quit-epoch (+ *epoch* *output-patience*))
        (first-time t))
    (declare (fixnum quit-epoch)
             (short-float last-error))
    (dotimes1 (i max-epochs (progn
                              (record-output-weights)
                              :timeout))
      ;; Maybe run a test epoch to see how we're doing.
      (when (and *test*
                 (not (= 0 *test-interval*))
                 (= 0 (mod i *test-interval*)))
        (test-epoch))
      (train-outputs-epoch)
      (cond ((zerop *error-bits*)
             (record-output-weights)
             (return :win))
            ((zerop *output-patience*))
            (first-time
             (setq first-time nil)
             (setq last-error *true-error*))
            ((> (abs (- *true-error* last-error))
                (* last-error *output-change-threshold*))
             (setq last-error *true-error*)
             (setq quit-epoch (+ *epoch* *output-patience*)))
            ((>= *epoch* quit-epoch)
             (record-output-weights)
             (return :stagnant))))))


;;;; Machinery for Training, Selecting, and Installing Candidate Units.

(defun init-candidates ()
  "Give new random weights to all of the candidate units. Zero the other
  candidate-unit statistics."
  (dotimes1 (i *ncandidates*)
    (setf (fvref *cand-sum-values* i) 0.0)
    (let ((cw (svref *cand-weights* i))
          (cd (svref *cand-deltas* i))
          (cs (svref *cand-slopes* i))
          (cp (svref *cand-prev-slopes* i))
          (cc (svref *cand-cor* i))
          (cpc (svref *cand-prev-cor* i)))
      (dotimes1 (j *nunits*)
        (setf (fvref cw j) (random-weight))
        (setf (fvref cd j) 0.0)
        (setf (fvref cs j) 0.0)
        (setf (fvref cp j) 0.0))
      (dotimes1 (o *noutputs*)
        (setf (fvref cc o) 0.0)
        (setf (fvref cpc o) 0.0)))))

(defun install-new-unit ()
  "Add the candidate unit with the best correlation score to the active
  network. Then reinitialize the candidate pool."
  (when (>= *nunits* *max-units*)
    (error "Cannot add any more units."))
  ;; For now, assume total connectivity.
  (setf (ivref *nconnections* *nunits*) *nunits*)
  (setf (svref *connections* *nunits*) *all-connections*)
  ;; Copy the weight vector for the new unit.
  (let ((w (make-array *nunits* :element-type 'short-float))
        (cw (svref *cand-weights* *best-candidate*)))
    (dotimes1 (i *nunits*)
      (setf (fvref w i) (fvref cw i)))
    (setf (svref *weights* *nunits*) w)
    ;; Tell user about the new unit.
    (format t " Add unit ~S: ~S~%"
            (+ 1 *nunits*) w))
  ;; Fix up output weights for candidate unit.
  ;; Use minus the correlation times the *weight-multiplier* as an
  ;; initial guess. At least the sign should be right.
  (dotimes1 (o *noutputs*)
    (setf (fvref (svref *output-weights* o) *nunits*)
          (*sf (-sf (fvref (svref *cand-prev-cor* *best-candidate*) o))
               *weight-multiplier*)))
  ;; If using cache, run an epoch to compute this unit's values.
  (when *use-cache*
    (dotimes1 (i *max-cases*)
      (setq *values* (svref *values-cache* i))
      (compute-unit-value *nunits*)))
  ;; Reinitialize candidate units with random weights.
  (incf *nunits*)
  (init-candidates))

;;; Note: Ideally, after each adjustment of the candidate weights, we would
;;; run two epochs. The first would just determine the correlations
;;; between the candidate unit outputs and the residual error. Then, in a
;;; second pass, we would adjust each candidate's input weights so as to
;;; maximize the absolute value of the correlation. We need to know the
;;; sign of the correlation for each candidate-output pair so that we know
;;; which direction to tune the input weights.

;;; Since this ideal method doubles the number of epochs required for
;;; training candidates, we cheat slightly and use the correlation values
;;; computed BEFORE the most recent weight update. This combines the two
;;; epochs, saving us almost a factor of two. To bootstrap the process, we
;;; begin with a single epoch that computes only the correlation.

;;; Since we look only at the sign of the correlation and since that sign
;;; should change very infrequently, this probably is OK. But keep a
;;; lookout for pathological situations in which this might cause
;;; oscillation.


;;; This function is used only once at the start of each input-training
;;; phase to prime the pump. After that, each call to compute-slopes also
;;; computes the error-value products for the next epoch.

(defun compute-correlations ()
  "For the current training pattern, compute the value of each candidate
  unit and begin to compute the correlation between that unit's value and
  the error at each output. We have already done a forward-prop and
  computed the error values for active units."
  (dotimes1 (u *ncandidates*)
    (let ((sum 0.0)
          (v 0.0)
          (cw (svref *cand-weights* u))
          (cc (svref *cand-cor* u)))
      (declare (short-float sum v))
      ;; Determine activation value of each candidate unit.
      (dotimes1 (i *nunits*)
        (incf-sf sum (*sf (fvref cw i)
                          (fvref *values* i))))
      (setq v (activation sum))
      (incf-sf (fvref *cand-sum-values* u) v)
      ;; Accumulate value of each unit times error at each output.
      (dotimes1 (o *noutputs*)
        (incf-sf (fvref cc o) (*sf v (fvref *errors* o)))))))

;;; Note: When we were computing true correlations between candidates and
;;; outputs, this is where the normalization factors went in. Currently we
;;; are just using covariances, as explained in the tech report. So we
;;; make only two adjustments here. First, we subtract out the product of
;;; the mean error and the mean candidate value to keep things from
;;; exploding when the error has a non-zero mean. Second, we effectively
;;; scale the error values by the sum-squared error over all training
;;; cases. This just keeps us from having to adjust *input-epsilon*
;;; repeatedly as the error is gradually reduced to a small fraction of its
;;; initial size.
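;;;
;;; In symbols, over an epoch of training patterns p, the score kept for
;;; candidate u and output o is
;;;   s(u,o) = (sum_p v(u,p)*e(o,p) - (sum_p v(u,p)) * e-bar) / sum-sq-error
;;; which ADJUST-CORRELATIONS below computes from the accumulated sums.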

(defun adjust-correlations ()
  "Normalize each accumulated correlation value, and stuff the normalized
  form into the *cand-prev-cor* data structure. Then zero *cand-cor* to
  prepare for the next round. Note the unit with the best total
  correlation score."
  (setq *best-candidate* 0)
  (setq *best-candidate-score* 0.0)
  (dotimes1 (u *ncandidates*)
    (let* ((cc (svref *cand-cor* u))
           (cpc (svref *cand-prev-cor* u))
           (offset (*sf (fvref *cand-sum-values* u) *avg-error*))
           (cor 0.0)
           (score 0.0))
      (declare (short-float offset cor score))
      (dotimes1 (o *noutputs*)
        (setq cor (/sf (-sf (fvref cc o) offset) *sum-sq-error*))
        (setf (fvref cpc o) cor)
        (setf (fvref cc o) 0.0)
        (incf-sf score (abs cor)))
      ;; Keep track of the candidate with the best overall correlation.
      (when (> score *best-candidate-score*)
        (setq *best-candidate-score* score)
        (setq *best-candidate* u)))))

;;; This is the key function in the candidate training process.

(defun compute-slopes ()
  "Given the correlation values for each candidate-output pair, compute
  the derivative of the candidate's score with respect to each incoming
  weight."
  (dotimes1 (u *ncandidates*)
    (let* ((sum 0.0)
           (value 0.0)
           (actprime 0.0)
           (direction 0.0)
           (cw (svref *cand-weights* u))
           (cs (svref *cand-slopes* u))
           (cc (svref *cand-cor* u))
           (cpc (svref *cand-prev-cor* u)))
      (declare (short-float sum value actprime direction))
      ;; Forward pass through each candidate unit to compute activation-prime.
      (dotimes1 (i *nunits*)
        (incf-sf sum (*sf (fvref cw i)
                          (fvref *values* i))))
      (setq value (activation sum))
      (setq actprime (activation-prime value sum))
      ;; Now compute which way we want to adjust each unit's incoming
      ;; activation.
      (dotimes1 (o *noutputs*)
        (let ((error (fvref *errors* o)))
          (decf-sf direction
                   (*sf (if (minusp (fvref cpc o)) -1.0 1.0)
                        (*sf actprime
                             (/sf (-sf error *avg-error*)
                                  *sum-sq-error*))))
          ;; Also accumulate the error-value products for use next epoch.
          (incf-sf (fvref cc o) (*sf error value))))
      ;; Given the direction we want to push the candidate, compute
      ;; which way we want to tweak each incoming weight.
      (dotimes1 (i *nunits*)
        (incf-sf (fvref cs i)
                 (*sf direction (fvref *values* i)))))))

;;; Note: Scaling *INPUT-EPSILON* by the number of cases and number of
;;; inputs to each unit seems to keep the quickprop update in a good range,
;;; as the network goes from small to large, and across many
;;; different-sized training sets. Still, choosing a good epsilon value
;;; requires some trial and error.

(defun update-input-weights ()
  "Update the input weights, using the pre-computed slopes, prev-slopes,
  and delta values. Uses the quickprop update function."
  (let ((eps (/ *input-epsilon* (* *ncases* *nunits*))))
    (dotimes1 (u *ncandidates*)
      (let ((cw (svref *cand-weights* u))
            (cd (svref *cand-deltas* u))
            (cs (svref *cand-slopes* u))
            (cp (svref *cand-prev-slopes* u)))
        (dotimes1 (i *nunits*)
          (quickprop-update i cw cd cs cp eps *input-decay*
                            *input-mu* *input-shrink-factor*))))))

;;; Outer loop for training the candidate unit(s).

(defun train-inputs-epoch ()
  "For each training pattern, perform a forward pass. Tune the candidate
  units' weights to maximize the correlation score of each."
  (do ((i *first-case* (1+ i)))
      ((= i (the fixnum (+ *first-case* *ncases*))))
    (declare (fixnum i))
    (setq *goal* (svref *training-outputs* i))
    ;; Compute values and errors, or recall cached values.
    (cond (*use-cache*
           (setq *values* (svref *values-cache* i))
           (setq *errors* (svref *errors-cache* i)))
          (t (setq *values* *extra-values*)
             (setq *errors* *extra-errors*)
             (full-forward-pass (svref *training-inputs* i))
             (compute-errors *goal* nil nil)))
    ;; Compute the slopes we will use to adjust candidate weights.
    (compute-slopes))
  ;; User may have changed mu between epochs, so fix shrink-factor.
  (setq *input-shrink-factor* (/sf *input-mu*
                                   (+sf 1.0 *input-mu*)))
  ;; Now adjust the candidate unit input weights using quickprop.
  (update-input-weights)
  ;; Fix up the correlation values for the next epoch.
  (adjust-correlations)
  (incf *epoch*))

(defun correlations-epoch ()
  "Do an epoch through all active training patterns just to compute the
  initial correlations. After this one pass, we will update the
  correlations as we train."
  (do ((i *first-case* (1+ i)))
      ((= i (the fixnum (+ *first-case* *ncases*))))
    (declare (fixnum i))
    (setq *goal* (svref *training-outputs* i))
    (cond (*use-cache*
           (setq *values* (svref *values-cache* i))
           (setq *errors* (svref *errors-cache* i)))
          (t (setq *values* *extra-values*)
             (setq *errors* *extra-errors*)
             (full-forward-pass (svref *training-inputs* i))
             (compute-errors *goal* nil nil)))
    (compute-correlations))
  (adjust-correlations)
  (incf *epoch*))

(defun train-inputs (max-epochs)
  "Train the input weights of all candidates. If we exhaust MAX-EPOCHS,
  stop with value :TIMEOUT. Else, keep going until the best candidate
  unit's score has changed by a significant amount, and then until it does
  not change significantly for *INPUT-PATIENCE* epochs. Then return
  :STAGNANT. If *INPUT-PATIENCE* is zero, we do not stop until victory or
  until MAX-EPOCHS is used up."
  (declare (fixnum max-epochs))
  (setq *avg-error* (/ *sum-error* (* *ncases* *noutputs*)))
  (correlations-epoch)
  (let ((last-score 0.0)
        (quit max-epochs)
        (first-time t))
    (declare (fixnum quit)
             (short-float last-score))
    (dotimes1 (i max-epochs :timeout)
      (train-inputs-epoch)
      (cond ((zerop *input-patience*))
            (first-time
             (setq first-time nil)
             (setq last-score *best-candidate-score*))
            ((> (abs (-sf *best-candidate-score* last-score))
                (* last-score *input-change-threshold*))
             (setq last-score *best-candidate-score*)
             (setq quit (+ i *input-patience*)))
            ((>= i quit)
             (return :stagnant))))))

;;;; Outer Loop.

(defun list-parameters ()
  "Print out the current training parameters in abbreviated form."
  (format t "SigOff ~,2F, WtRng ~,2F, WtMul ~,2F~%"
          *sigmoid-prime-offset* *weight-range* *weight-multiplier*)
  (format t "OMu ~,2F, OEps ~,2F, ODcy ~,4F, OPat ~D, OChange ~,3F~%"
          *output-mu* *output-epsilon* *output-decay* *output-patience*
          *output-change-threshold*)
  (format t "IMu ~,2F, IEps ~,2F, IDcy ~,4F, IPat ~D, IChange ~,3F~%"
          *input-mu* *input-epsilon* *input-decay* *input-patience*
          *input-change-threshold*)
  (format t "Utype ~S, Otype ~S, RawErr ~S, Pool ~D~%"
          *unit-type* *output-type* *raw-error* *ncandidates*))

(defun train (outlimit inlimit rounds &optional (restart nil))
  "Train the output weights until stagnation or victory is reached. Then
  train the input weights to stagnation or victory. Then install the best
  candidate unit and repeat. OUTLIMIT and INLIMIT are upper limits on the
  number of cycles in each output and input phase. ROUNDS is an upper limit
  on the number of unit-installation cycles. If RESTART is non-nil, we are
  restarting training from the current point -- do not reinitialize the net."
  (declare (fixnum outlimit inlimit rounds))
  (unless restart (init-net))
  (list-parameters)
  (when *use-cache*
    (dotimes1 (i *max-cases*)
      (setq *values* (svref *values-cache* i))
      (set-up-inputs (svref *training-inputs* i))))
  (dotimes1 (r rounds :lose)
    (case (train-outputs outlimit)
      (:win
       (list-parameters)
       (format t "Victory at ~S epochs, ~S units, ~S hidden, Error ~S.~%"
               *epoch* *nunits* (- *nunits* *ninputs* 1) *true-error*)
       (return nil))
      (:timeout
       (format t "Epoch ~D: Out Timeout ~D bits wrong, error ~S.~2%"
               *epoch* *error-bits* *true-error*))
      (:stagnant
       (format t "Epoch ~D: Out Stagnant ~D bits wrong, error ~S.~2%"
               *epoch* *error-bits* *true-error*)))
    (when *test* (test-epoch))
    (case (train-inputs inlimit)
      (:timeout
       (format t "Epoch ~D: In Timeout. Cor: ~D~%"
               *epoch* *best-candidate-score*))
      (:stagnant
       (format t "Epoch ~D: In Stagnant. Cor: ~D~%"
               *epoch* *best-candidate-score*)))
    (install-new-unit)))

(defun test-epoch (&optional (*score-threshold* 0.49999))
  "Perform forward propagation once for each case in the training and
  testing vectors, reporting the performance. Do not change any weights.
  Do not use the caches."
  (let ((*use-cache* nil)
        (*values* *extra-values*)
        (*errors* *extra-errors*)
        (*error-bits* 0)
        (*true-error* 0.0)
        (*sum-error* 0.0)
        (*sum-sq-error* 0.0))
    ;; Run all training patterns and count errors.
    (dotimes1 (i (length *training-inputs*))
      (setq *goal* (svref *training-outputs* i))
      (full-forward-pass (svref *training-inputs* i))
      (compute-errors *goal* nil t))
    (format t "Training: ~D of ~D wrong, error ~S."
            *error-bits* (length *training-inputs*) *true-error*)
    ;; Zero some accumulators again.
    (setq *error-bits* 0)
    (setq *true-error* 0.0)
    (setq *sum-error* 0.0)
    (setq *sum-sq-error* 0.0)
    ;; Now run all test patterns and report the results.
    (when *test-inputs*
      (dotimes1 (i (length *test-inputs*))
        (setq *goal* (svref *test-outputs* i))
        (full-forward-pass (svref *test-inputs* i))
        (compute-errors *goal* nil t)))
    (format t " Test: ~D of ~D wrong, error ~S.~%"
            *error-bits* (length *test-inputs*) *true-error*)))

(defun test-setup (nunits weights output-weights)
  "Set up a network for testing, given stored weights and output weights."
  (init-net)
  (setq *weights* weights)
  (setq *output-weights* output-weights)
  (setq *nunits* nunits)
  (do ((i (1+ *ninputs*) (1+ i)))
      ((= i *nunits*))
    (declare (fixnum i))
    (setf (ivref *nconnections* i) i)
    (setf (svref *connections* i) *all-connections*)))


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; Example Applications ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;; Zig-Zag problem. An easy one, useful for testing the code.

(defun build-zig-zag (n)
  "Build N pairs of 1-D zig-zag."
  (declare (fixnum n))
  (setq *ninputs* 1)
  (setq *noutputs* 1)
  (let ((ti (make-array (* 2 n)))
        (to (make-array (* 2 n))))
    (dotimes1 (i n)
      (setf (svref ti (* i 2))
            (vector (+ i 1.0)))
      (setf (svref to (* i 2))
            (vector (if (evenp i) 0.5 -0.5)))
      (setf (svref ti (1+ (* i 2)))
            (vector (- (+ i 1.0))))
      (setf (svref to (1+ (* i 2)))
            (vector (if (evenp i) -0.5 0.5))))
    (setq *training-inputs* ti)
    (setq *training-outputs* to)
    (setq *test-inputs* ti)
    (setq *test-outputs* to))
  (build-net 1 1)
  (init-net))

;;; Call this with something like (BUILD-ZIG-ZAG 4), then call
;;; something like (TRAIN 100 100 25).


;;; Two spirals problem.

(defun build-two-spirals (&optional (n 97))
  "Build N point-pairs of the two-spiral problem, with standard default
  of 97 pairs."
  (declare (fixnum n))
  (setq *ninputs* 2)
  (setq *noutputs* 1)
  (let ((ti (make-array (* 2 n)))
        (to (make-array (* 2 n))))
    (dotimes1 (i n)
      (let* ((angle (/ (* i (coerce pi 'short-float)) 16.0))
             (radius (/ (* 6.5 (- 104.0 i)) 104))
             (x (* radius (sin angle)))
             (y (* radius (cos angle))))
        (setf (svref ti (* i 2))
              (vector x y))
        (setf (svref to (* i 2))
              (vector 0.5))
        (setf (svref ti (1+ (* i 2)))
              (vector (- x) (- y)))
        (setf (svref to (1+ (* i 2)))
              (vector -0.5))))
    ;; Put the inner part of the spiral first on the list.
    (setq ti (nreverse ti))
    (setq to (nreverse to))
    (setq *training-inputs* ti)
    (setq *training-outputs* to)
    (setq *test-inputs* ti)
    (setq *test-outputs* to))
  (build-net 2 1)
  (init-net))

;;; To run this, call (BUILD-TWO-SPIRALS), set various control parameters,
;;; and then call something like (TRAIN 100 100 25).

;;; For parameters, try these:
;;; SigOff 0.10, WtRng 1.00, WtMul 1.00
;;; OMu 2.00, OEps 1.00, ODcy 0.0001, OPat 12, OChange 0.010
;;; IMu 2.00, IEps 100.00, IDcy 0.00000, IPat 8, IChange 0.030
;;; Utype :SIGMOID, Otype :SIGMOID, RawErr NIL, Pool 8

(defvar *save-random-state*
  (make-random-state))

(defun time-two-spirals ()
  (setq *random-state* (make-random-state *save-random-state*))
  (build-two-spirals)
  (time (train 100 100 25)))

;;; The End.

