/src/benchmarks/cascor1.lisp

Revision 1.2
Wed Feb 19 18:08:52 1992 UTC by ram
Branch: MAIN
Changes since 1.1: +83 -101 lines
;;; 11/9/90:
;;; Added some additional type declarations for maximum speed under certain
;;; Common Lisp compilers.

;;; -*- Mode:Lisp -*-
;;; ***************************************************************************
;;; Common Lisp implementation of Cascade-Correlation learning algorithm.
;;; This version for export. Non-portable user-interface stuff excised.
;;;
;;; Written by: Scott E. Fahlman
;;;             School of Computer Science
;;;             Carnegie-Mellon University
;;;             Pittsburgh, PA 15217
;;;
;;; Phone: (412) 268-2575
;;; Internet: fahlman@cs.cmu.edu
;;;
;;; This code has been placed in the public domain by the author. As a
;;; matter of simple courtesy, anyone using or adapting this code is
;;; expected to acknowledge the source. The author would like to hear
;;; about any attempts to use this system, successful or not.
;;;
;;; For an explanation of this algorithm and some results, see "The
;;; Cascade-Correlation Learning Architecture" by Scott E. Fahlman and
;;; Christian Lebiere in D. S. Touretzky (ed.), "Advances in Neural
;;; Information Processing Systems 2", Morgan Kaufmann, 1990. A somewhat
;;; longer version is available as CMU Computer Science Tech Report
;;; CMU-CS-90-100.
;;;
;;; ***************************************************************************
;;; EDIT HISTORY SINCE FIRST RELEASE:
;;;
;;; 8/24/90:
;;; Modified TEST-EPOCH so that it wouldn't mess up error statistics being
;;; passed from the output-training to input-training phase. Thanks to
;;; Scott Crowder for spotting this.
;;;
;;; 6/1/90:
;;; Fixed bug in INSTALL-NEW-UNIT. New unit's initial weight was being
;;; computed using *CAND-COR* values of the successful candidate, which is
;;; already zero. Now uses *CAND-PREV-COR*. Thanks to Tim Howells for
;;; spotting this.
;;;
;;; Modified BUILD-NET to check that *MAX-UNITS* is large enough. A couple of
;;; people got mysterious failures the first time they used lots of inputs.
;;;
;;; Made a small change in QUICKPROP-UPDATE to prevent rare divide-by-zero
;;; errors when p = s = 0.0. Thanks to Tom Dietterich.
;;;
;;; Added CHANGED-TRAINING-SET, which should be called when the training set
;;; is changed but you don't want to reinitialize the net. This rebuilds
;;; the caches.
;;;
;;; 11/9/90:
;;; Added some additional type declarations for maximum speed under certain
;;; Common Lisp compilers.
;;; ***************************************************************************

;;; This proclamation buys a certain amount of overall speed at the expense
;;; of runtime checking. Comment it out when debugging new, bug-infested code.
(proclaim '(optimize (speed 3) (space 0) (safety 0)))

;;; Style note: Because some of these runs take a long time, this code is
;;; extensively hacked for good performance under a couple of Common Lisp
;;; systems, some of which have poor performance on multi-dimensional
;;; arrays and some of which have weak type-inference in the compiler.
;;; Elegance and clarity have in some cases been sacrificed for speed.

;;; In some problems, floating point underflow errors may occur as a result
;;; of weight-decay and other operations. Most Common Lisp implementations
;;; have an option to turn floating underflows into zero values without
;;; signalling an error. You should enable this facility if it is
;;; available. If not, you'll either have to write a condition handler for
;;; floating underflows or live with the occasional underflow error.

;;; In CMU Common Lisp, we use the following incantation:
;;; (setq extensions:*ignore-floating-point-underflow* t)
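;;;
;;; A minimal sketch of such a condition handler, assuming (this varies by
;;; implementation) that the standard FLOATING-POINT-UNDERFLOW condition is
;;; signalled and that a USE-VALUE restart is available at the error point:
;;; (handler-bind ((floating-point-underflow
;;;                  #'(lambda (c)
;;;                      (declare (ignore c))          ; treat underflow as zero
;;;                      (invoke-restart 'use-value 0.0))))
;;;   (train 100 100 25))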


;;; Compensate for the clumsy Common Lisp declaration system and weak
;;; type-inference in some Common Lisp compilers.

;;; INCF-SF, *SF, etc. are like INCF, *, etc., but they declare their
;;; operands and results to be short-floats. The code gets unreadable
;;; quickly if you insert all these declarations by hand.

(defmacro incf-sf (place &optional (increment 1.0))
  `(the short-float (incf (the short-float ,place)
                          (the short-float ,increment))))

(defmacro decf-sf (place &optional (increment 1.0))
  `(the short-float (decf (the short-float ,place)
                          (the short-float ,increment))))

(defmacro *sf (&rest args)
  `(the short-float
        (* ,@(mapcar #'(lambda (x) (list 'the 'short-float x)) args))))

(defmacro +sf (&rest args)
  `(the short-float
        (+ ,@(mapcar #'(lambda (x) (list 'the 'short-float x)) args))))

(defmacro -sf (&rest args)
  `(the short-float
        (- ,@(mapcar #'(lambda (x) (list 'the 'short-float x)) args))))

(defmacro /sf (&rest args)
  `(the short-float
        (/ ,@(mapcar #'(lambda (x) (list 'the 'short-float x)) args))))

;;; DOTIMES1 is like DOTIMES, only with the loop counter declared as a
;;; fixnum. This is for compilers with weak type inference.

(defmacro dotimes1 (form1 &body body)
  `(dotimes ,form1 (declare (fixnum ,(car form1))) . ,body))
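;;;
;;; For example, (dotimes1 (i n) (foo i)) expands into
;;; (dotimes (i n) (declare (fixnum i)) (foo i)).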

;;; Create vector-access forms similar to SVREF, but for vectors of
;;; element-type SHORT-FLOAT and FIXNUM.

(eval-when (compile load eval)
  (defconstant fvector-type
    (array-element-type (make-array '(1) :element-type 'short-float)))
  (defconstant ivector-type
    (array-element-type (make-array '(1) :element-type 'fixnum))))

(defmacro fvref (a i)
  "Like SVREF, but with vectors of element-type SHORT-FLOAT."
  (if (eq fvector-type t)
      `(the short-float (svref ,a ,i))
      `(the short-float
            (aref (the (simple-array ,fvector-type (*)) ,a) ,i))))

(defmacro ivref (a i)
  "Like SVREF, but with vectors of element-type FIXNUM."
  (if (eq ivector-type t)
      `(the fixnum (svref ,a ,i))
      `(the fixnum
            (aref (the (simple-array ,ivector-type (*)) ,a) ,i))))
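;;;
;;; For example, on an implementation with specialized short-float vectors,
;;; (fvref v i) expands into
;;;   (the short-float (aref (the (simple-array short-float (*)) v) i))
;;; so the compiler can open-code the access with no runtime type dispatch.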


;;;; Assorted Parameters and Controls.

;;; These parameters and switches control the quickprop learning algorithm
;;; used to train the output weights and candidate units.

(defvar *unit-type* :sigmoid
  "The type of activation function used by the hidden units. Options
  currently implemented are :sigmoid, :asigmoid, and :gaussian. Sigmoid is
  symmetric in range -0.5 to +0.5, while Asigmoid is asymmetric, 0.0 to
  1.0.")

(defvar *output-type* :sigmoid
  "The activation function to use on the output units. Options currently
  implemented are :linear and :sigmoid.")

(defvar *raw-error* nil
  "If T, candidate units will try to correlate with the raw difference
  between actual and desired outputs. Else, they use the difference modified
  by the derivative of the output unit activation function.")

(defvar *sigmoid-prime-offset* 0.1
  "This is added to the derivative of the sigmoid function to prevent the
  system from getting stuck at the points where sigmoid-prime goes to
  zero.")
(proclaim '(short-float *sigmoid-prime-offset*))

(defvar *weight-range* 1.0
  "Input weights in the network get initial random values between plus and
  minus *weight-range*. This parameter also controls the initial weights
  on direct input-to-output links.")
(proclaim '(short-float *weight-range*))

(defvar *weight-multiplier* 1.0
  "The output weights for candidate units get an initial value that is the
  negative of the correlation times this factor.")
(proclaim '(short-float *weight-multiplier*))

(defvar *output-mu* 2.0
  "Mu parameter used for quickprop training of output weights. The
  step size is limited to mu times the previous step.")
(proclaim '(short-float *output-mu*))

(defvar *output-shrink-factor* (/ *output-mu* (+ 1.0 *output-mu*))
  "Derived from *output-mu*. Used in computing whether the proposed step is
  too large.")
(proclaim '(short-float *output-shrink-factor*))

(defvar *output-epsilon* 0.35
  "Controls the amount of linear gradient descent to use in updating
  output weights.")
(proclaim '(short-float *output-epsilon*))

(defvar *output-decay* 0.0001
  "This factor times the current weight is added to the slope at the
  start of each output-training epoch. Keeps weights from growing too big.")
(proclaim '(short-float *output-decay*))

(defvar *output-patience* 8
  "If we go for this many epochs with no significant change, it's time to
  stop tuning. If 0, go on forever.")
(proclaim '(fixnum *output-patience*))

(defvar *output-change-threshold* 0.01
  "The error must change by at least this fraction of its old value in
  order to count as a significant change.")
(proclaim '(short-float *output-change-threshold*))

(defvar *input-mu* 2.0
  "Mu parameter used for quickprop training of input weights. The
  step size is limited to mu times the previous step.")
(proclaim '(short-float *input-mu*))

(defvar *input-shrink-factor* (/ *input-mu* (+ 1.0 *input-mu*))
  "Derived from *input-mu*. Used in computing whether the proposed step is
  too large.")
(proclaim '(short-float *input-shrink-factor*))

(defvar *input-epsilon* 1.0
  "Controls the amount of linear gradient descent to use in updating
  unit input weights.")
(proclaim '(short-float *input-epsilon*))

(defvar *input-decay* 0.0
  "This factor times the current weight is added to the slope at the
  start of each input-training epoch. Keeps weights from growing too big.")
(proclaim '(short-float *input-decay*))

(defvar *input-patience* 8
  "If we go for this many epochs with no significant change, it's time to
  stop tuning. If 0, go on forever.")
(proclaim '(fixnum *input-patience*))

(defvar *input-change-threshold* 0.03
  "The correlation score for the best unit must change by at least
  this fraction of its old value in order to count as a significant
  change.")
(proclaim '(short-float *input-change-threshold*))

;;; Variables related to error and correlation.

(defvar *score-threshold* 0.4
  "An output is counted as correct for a given case if the difference
  between that output and the desired value is smaller in magnitude than
  this value.")
(proclaim '(short-float *score-threshold*))

(defvar *error-bits* 0
  "Count number of bits in epoch that are wrong by more than
  *SCORE-THRESHOLD*.")
(proclaim '(fixnum *error-bits*))

(defvar *true-error* 0.0
  "The sum-squared error at the network outputs. This is the value the
  algorithm is ultimately trying to minimize.")
(proclaim '(short-float *true-error*))

(defvar *sum-error* 0.0
  "Accumulate the sum of the error values after output training phase.")
(proclaim '(short-float *sum-error*))

(defvar *sum-sq-error* 0.0
  "Accumulate the sum of the squared error values after output
  training phase.")
(proclaim '(short-float *sum-sq-error*))

(defvar *avg-error* 0.0
  "Holds the average of error values after output training phase.")
(proclaim '(short-float *avg-error*))

(defvar *best-candidate-score* 0.0
  "The best correlation score found among all candidate units being
  trained.")
(proclaim '(short-float *best-candidate-score*))

(defvar *best-candidate* 0
  "The index of the candidate unit whose correlation score is best
  at present.")
(proclaim '(fixnum *best-candidate*))

;;; These variables and switches control the simulation and display.

(defvar *use-cache* t
  "If T, cache the forward-pass values instead of repeatedly
  computing them. This can save a *lot* of time if all the cached values
  fit into memory.")

(defparameter *epoch* 0
  "Count of the number of times the entire training set has been presented.")
(proclaim '(fixnum *epoch*))

(defvar *test* nil
  "If T, run a test epoch every so often during output training.")

(defvar *test-interval* 0
  "Run a test epoch every *test-interval* output training cycles.")
(proclaim '(fixnum *test-interval*))

(defvar *single-pass* nil
  "When on, pause after next forward/backward cycle.")

(defvar *single-epoch* nil
  "When on, pause after next training epoch.")

(defparameter *step* nil
  "Turned briefly to T in order to continue after a pause.")

;;; The sets of training inputs and outputs are stored in parallel vectors.
;;; Each element is a SIMPLE-VECTOR holding short-float values, one for
;;; each input or output. Note: this is a simple vector, not a specialized
;;; vector of element-type short-float.
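;;;
;;; For example, a hypothetical one-case training set with two inputs and
;;; one output could be set up as:
;;;   (setq *training-inputs*  (vector (vector 0.0 1.0)))
;;;   (setq *training-outputs* (vector (vector 0.5)))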

(defvar *training-inputs* (make-array 0)
  "Vector of input patterns for training the net.")
(proclaim '(simple-vector *training-inputs*))

(defvar *training-outputs* (make-array 0)
  "Vector of output patterns for training the net.")
(proclaim '(simple-vector *training-outputs*))

(defvar *goal* (make-array 0)
  "The goal vector for the current training or testing case.")
(proclaim '(simple-vector *goal*))

(defvar *max-cases* 0
  "Maximum number of training cases that can be accommodated by the current
  data structures.")
(proclaim '(fixnum *max-cases*))

(defvar *ncases* 0
  "Number of training cases currently in use. Assume a contiguous block
  beginning with *FIRST-CASE*.")
(proclaim '(fixnum *ncases*))

(defvar *first-case* 0
  "Address of the first training case in the currently active set. Usually
  zero, but may differ if we are training on different chunks of the training
  set at different times.")
(proclaim '(fixnum *first-case*))

;;; For some benchmarks there is a separate set of values used for testing
;;; the network's ability to generalize. These values are not used during
;;; training.

(defvar *test-inputs* nil
  "Vector of input patterns for testing the net.")
(proclaim '(simple-vector *test-inputs*))

(defvar *test-outputs* nil
  "Vector of output patterns for testing the net.")
(proclaim '(simple-vector *test-outputs*))

;;;; Fundamental data structures.

;;; Unit values and weights are short flonums.

;;; Instead of representing each unit by a structure, we represent the
;;; unit by a fixnum. This is used to index into various vectors that hold
;;; per-unit information, such as the activation value of each unit.
;;; So the information concerning a given unit is found in a slice of values
;;; across many vectors, all with the same unit-index.

;;; Per-connection information for each connection COMING INTO a unit is
;;; stored in a vector of vectors. The outer vector is indexed by the unit
;;; number, and the inner vector is then indexed by connection number.
;;; This is a sleazy way of implementing a 2-D array, faster in most Lisp
;;; systems than multiplying to do the index arithmetic, and more efficient
;;; if the units are sparsely connected.
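;;;
;;; So, for example, the weight on connection I into unit J is
;;; (fvref (svref *weights* j) i), and the unit that connection comes from
;;; is (ivref (svref *connections* j) i).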

;;; Unit 0, the "bias unit", is always at a maximum-on value. Next come
;;; some input "units", then some hidden units.

;;; Output units have their own separate set of data structures and
;;; indices. The units and outputs together form the "active" network.
;;; There are also separate data structures and indices for the "candidate"
;;; units that have not yet been added to the network.

(defvar *max-units* 30
  "Maximum number of input values and hidden units in the network.")
(proclaim '(fixnum *max-units*))

(defvar *ninputs* 0
  "Number of inputs for this problem.")
(proclaim '(fixnum *ninputs*))

(defvar *noutputs* 0
  "Number of outputs for this problem.")
(proclaim '(fixnum *noutputs*))

(defvar *nunits* 0
  "Current number of active units in the network. This count includes all
  inputs to the network and the bias unit.")
(proclaim '(fixnum *nunits*))

(defvar *ncandidates* 8
  "Number of candidate units whose inputs will be trained at once.")
(proclaim '(fixnum *ncandidates*))

;;; The following vectors hold values related to hidden units in the active
;;; net and their input weights. The vectors are created by BUILD-NET, after
;;; the dimension variables have been set up.

(defvar *values* nil
  "Vector holding the current activation value for each unit and input in
  the active net.")

(defvar *values-cache* nil
  "Holds a distinct *VALUES* vector for each of the *MAX-CASES* training
  cases. Once we have computed the *VALUES* vector for each training case,
  we can use it repeatedly until the weights or training cases change.")

(defvar *extra-values* nil
  "Extra values vector to use when not using the cache.")

;;; Note: the *NCONNECTIONS* and *CONNECTIONS* vectors could be eliminated
;;; if we wanted to commit to total connectivity for all units.
;;; For now, we want to allow for sparse or irregular connectivity.

(defvar *nconnections* nil
  "Vector holding the number of incoming connections for each unit.")

(defvar *connections* nil
  "Vector that holds a connection vector for each unit J.
  Each entry in the connection vector holds a unit index I,
  indicating that this connection is from I to J.")

(defvar *weights* nil
  "Vector of vectors with structure parallel to the *connections* vector.
  Each entry gives the weight associated with an incoming connection.")

;;; The following vectors hold values for the outputs of the active
;;; network and the output-side weights.

(defvar *outputs* nil
  "Vector holding the network output values.")

(defvar *errors* nil
  "Vector holding the current error value for each output.")

(defvar *errors-cache* nil
  "Holds a distinct *ERRORS* vector for each of the *MAX-CASES* training
  cases. Once we have computed the *ERRORS* vector for a given training
  case, we can use it repeatedly until the weights or the training cases
  change.")

(defvar *extra-errors* nil
  "Extra errors vector to use when not using the cache.")

(defvar *output-weights* nil
  "Vector of vectors. For each output, we have a vector of output weights
  coming from the unit indicated by the index.")

(defvar *output-deltas* nil
  "Vector of vectors, parallel with output weights. Each entry is the
  amount by which the corresponding output weight was changed last time.")

(defvar *output-slopes* nil
  "Vector of vectors, parallel with output weights. Each entry is the
  partial derivative of the total error with respect to the corresponding
  weight.")

(defvar *output-prev-slopes* nil
  "Vector of vectors, parallel with output weights. Each entry is the
  previous value of the corresponding *OUTPUT-SLOPES* entry.")

(defvar *output-weights-record* nil
  "The vector of output weights is recorded here after each output-training
  phase and just prior to the addition of the next unit. This record
  allows us to reconstruct the network's performance at each of these
  points in time.")

;;; The following vectors have one entry for each candidate unit in the
;;; pool of trainees.

(defvar *cand-sum-values* nil
  "For each candidate unit, the sum of its values over an entire
  training set.")

(defvar *cand-cor* nil
  "A vector with one entry for each candidate unit. This entry is a vector
  that holds the correlation between this unit's value and the residual
  error at each of the outputs, computed over a whole epoch.")

(defvar *cand-prev-cor* nil
  "Holds the *cand-cor* values computed in the previous candidate training
  epoch.")

(defvar *cand-weights* nil
  "A vector with one entry for each candidate unit. This entry is a vector
  that holds the current input weights for that candidate unit.")

(defvar *cand-deltas* nil
  "A vector with one entry for each candidate unit. This entry is a vector
  that holds the input weight deltas for that candidate unit.")

(defvar *cand-slopes* nil
  "A vector with one entry for each candidate unit. This entry is a vector
  that holds the input weight slopes for that candidate unit.")

(defvar *cand-prev-slopes* nil
  "A vector with one entry for each candidate unit. This entry is a vector
  that holds the previous values of the input weight slopes for that
  candidate unit.")

;;; At present, each candidate receives a connection from every input and
;;; pre-existing unit. Rather than cons up a new *connections* vector for
;;; each of these, we can just use this one for all of them.

(defvar *all-connections* nil
  "A *CONNECTIONS* vector that can be used by any unit that connects to
  all lower-numbered units, in order.")


;;;; Network-building utilities.

(defun build-net (ninputs noutputs)
  "Create the network data structures, given the number of input and output
  connections. Get *MAX-UNITS* and other dimensions from variables."
  (declare (fixnum ninputs noutputs))
  ;; Check to make sure *MAX-UNITS* is big enough.
  (unless (> *max-units* (+ ninputs 1))
    (error "*MAX-UNITS* must be greater than number of inputs plus 1."))
  ;; Fill in assorted variables and create top-level vectors.
  (setq *ninputs* ninputs
        *noutputs* noutputs
        *max-cases* (length *training-inputs*)
        *ncases* *max-cases*
        *first-case* 0
        *nunits* (+ 1 *ninputs*)
        *values-cache* (make-array *max-cases* :initial-element nil)
        *extra-values* (make-array *max-units*
                                   :element-type 'short-float
                                   :initial-element 0.0)
        *values* *extra-values*
        *nconnections* (make-array *max-units*
                                   :element-type 'fixnum
                                   :initial-element 0)
        *connections* (make-array *max-units* :initial-element nil)
        *weights* (make-array *max-units* :initial-element nil)
        *outputs* (make-array *noutputs*
                              :element-type 'short-float
                              :initial-element 0.0)
        *errors-cache* (make-array *max-cases* :initial-element nil)
        *extra-errors* (make-array *noutputs*
                                   :element-type 'short-float
                                   :initial-element 0.0)
        *errors* *extra-errors*
        *output-weights* (make-array *noutputs* :initial-element nil)
        *output-weights-record* (make-array *max-units* :initial-element nil)
        *output-deltas* (make-array *noutputs* :initial-element nil)
        *output-slopes* (make-array *noutputs* :initial-element nil)
        *output-prev-slopes* (make-array *noutputs* :initial-element nil)
        *cand-sum-values* (make-array *ncandidates*
                                      :element-type 'short-float
                                      :initial-element 0.0)
        *cand-cor* (make-array *ncandidates* :initial-element nil)
        *cand-prev-cor* (make-array *ncandidates* :initial-element nil)
        *cand-weights* (make-array *ncandidates* :initial-element nil)
        *cand-deltas* (make-array *ncandidates* :initial-element nil)
        *cand-slopes* (make-array *ncandidates* :initial-element nil)
        *cand-prev-slopes* (make-array *ncandidates* :initial-element nil))
  ;; Only create the caches if *USE-CACHE* is on -- may not always have room.
  (when *use-cache*
    (dotimes1 (i *max-cases*)
      (setf (svref *values-cache* i)
            (make-array *max-units*
                        :element-type 'short-float
                        :initial-element 0.0))
      (setf (svref *errors-cache* i)
            (make-array *noutputs*
                        :element-type 'short-float
                        :initial-element 0.0))))
  ;; For each output, create the vectors holding per-weight information.
  (dotimes1 (i *noutputs*)
    (setf (svref *output-weights* i)
          (make-array *max-units*
                      :element-type 'short-float
                      :initial-element 0.0))
    (setf (svref *output-deltas* i)
          (make-array *max-units*
                      :element-type 'short-float
                      :initial-element 0.0))
    (setf (svref *output-slopes* i)
          (make-array *max-units*
                      :element-type 'short-float
                      :initial-element 0.0))
    (setf (svref *output-prev-slopes* i)
          (make-array *max-units*
                      :element-type 'short-float
                      :initial-element 0.0)))
  ;; For each candidate unit, create the vectors holding the correlations,
  ;; incoming weights, and other stats.
  (dotimes1 (i *ncandidates*)
    (setf (svref *cand-cor* i)
          (make-array *noutputs*
                      :element-type 'short-float
                      :initial-element 0.0))
    (setf (svref *cand-prev-cor* i)
          (make-array *noutputs*
                      :element-type 'short-float
                      :initial-element 0.0))
    (setf (svref *cand-weights* i)
          (make-array *max-units*
                      :element-type 'short-float
                      :initial-element 0.0))
    (setf (svref *cand-deltas* i)
          (make-array *max-units*
                      :element-type 'short-float
                      :initial-element 0.0))
    (setf (svref *cand-slopes* i)
          (make-array *max-units*
                      :element-type 'short-float
                      :initial-element 0.0))
    (setf (svref *cand-prev-slopes* i)
          (make-array *max-units*
                      :element-type 'short-float
                      :initial-element 0.0))))

(defun random-weight ()
  "Select a random weight, uniformly distributed over the
  interval from minus to plus *weight-range*."
  (-sf (random (*sf 2.0 *weight-range*)) *weight-range*))

(defun init-net ()
  "Set up the network for a learning problem. Clean up all the data
  structures that may have become corrupted. Initialize the output weights
  to random values controlled by *weight-range*."
  ;; Set up the *ALL-CONNECTIONS* vector.
  (setq *all-connections*
        (make-array *max-units* :element-type 'fixnum))
  (dotimes1 (i *max-units*)
    (setf (ivref *all-connections* i) i))
  ;; Initialize the active unit data structures.
  (dotimes1 (i *max-units*)
    (setf (fvref *extra-values* i) 0.0)
    (setf (ivref *nconnections* i) 0)
    (setf (svref *connections* i) nil)
    (setf (svref *weights* i) nil)
    (setf (svref *output-weights-record* i) nil))
  ;; Initialize the per-output data structures.
  (dotimes1 (i *noutputs*)
    (setf (fvref *outputs* i) 0.0)
    (setf (fvref *extra-errors* i) 0.0)
    (let ((ow (svref *output-weights* i))
          (od (svref *output-deltas* i))
          (os (svref *output-slopes* i))
          (op (svref *output-prev-slopes* i)))
      (dotimes1 (j *max-units*)
        (setf (fvref ow j) 0.0)
        (setf (fvref od j) 0.0)
        (setf (fvref os j) 0.0)
        (setf (fvref op j) 0.0))
      ;; Set up initial random weights for the input-to-output connections.
      (dotimes1 (j (1+ *ninputs*))
        (setf (fvref ow j) (random-weight)))))
  ;; Initialize the caches if they are in use.
  (when *use-cache*
    (dotimes1 (j *max-cases*)
      (let ((v (svref *values-cache* j))
            (e (svref *errors-cache* j)))
        (dotimes1 (i *max-units*)
          (setf (fvref v i) 0.0))
        (dotimes1 (i *noutputs*)
          (setf (fvref e i) 0.0)))))
  ;; Candidate units get initialized in a separate routine.
  (init-candidates)
  ;; Do some other assorted housekeeping.
  (setf (fvref *extra-values* 0) 1.0)
  (setq *epoch* 0)
  (setq *nunits* (+ 1 *ninputs*))
  (setq *error-bits* 0)
  (setq *true-error* 0.0)
  (setq *sum-error* 0.0)
  (setq *sum-sq-error* 0.0)
  (setq *best-candidate-score* 0.0)
  (setq *best-candidate* 0))

(defun changed-training-set ()
  "Call this instead of BUILD-NET and INIT-NET if you want to leave
  existing hidden units in place and start from here with new training
  examples. Assumes that the number of net inputs and outputs remains the
  same, but the number of cases may have changed. Rebuilds the caches."
  (setq *max-cases* (length *training-inputs*)
        *ncases* *max-cases*
        *first-case* 0
        *values-cache* (make-array *max-cases* :initial-element nil)
        *errors-cache* (make-array *max-cases* :initial-element nil))
  ;; Only create the caches if *USE-CACHE* is on -- may not always have room.
  (when *use-cache*
    (dotimes1 (i *max-cases*)
      (setf (svref *errors-cache* i)
            (make-array *noutputs*
                        :element-type 'short-float
                        :initial-element 0.0))
      (setq *values* (make-array *max-units*
                                 :element-type 'short-float
                                 :initial-element 0.0))
      (setf (svref *values-cache* i) *values*)
      (set-up-inputs (svref *training-inputs* i))
      (do ((j (1+ *ninputs*) (1+ j)))
          ((= j *nunits*))
        (declare (fixnum j))
        (compute-unit-value j)))))

;;;; Utilities for learning.

(proclaim '(inline activation activation-prime))

(defun activation (sum)
  "Given the sum of weighted inputs, compute the unit's activation value.
  Defined unit types are :sigmoid, :asigmoid, and :gaussian."
  (declare (short-float sum))
  (ecase *unit-type*
    (:sigmoid
     ;; Symmetric sigmoid function in range -0.5 to +0.5.
     (cond ((< sum -15.0) -0.5)
           ((> sum 15.0) +0.5)
           (t (-sf (/sf (+sf 1.0 (exp (-sf sum)))) 0.5))))
    (:asigmoid
     ;; Asymmetric sigmoid in range 0.0 to 1.0.
     (cond ((< sum -15.0) 0.0)
           ((> sum 15.0) 1.0)
           (t (/sf 1.0 (+sf 1.0 (exp (-sf sum)))))))
    (:gaussian
     ;; Gaussian activation function in range 0.0 to 1.0.
     (let ((x (*sf -0.5 sum sum)))
       (if (< x -75.0) 0.0 (exp x))))))

;;; Note: do not use *SIGMOID-PRIME-OFFSET* here, as it confuses the
;;; correlation machinery. But do use it in output-prime, since it does no
;;; harm there and the output units often get stuck at extreme values.

(defun activation-prime (value sum)
  "Given the unit's activation value and sum of weighted inputs, compute
  the derivative of the activation with respect to the sum. Defined unit
  types are :sigmoid, :asigmoid, and :gaussian."
  (declare (short-float value sum))
  (ecase *unit-type*
    (:sigmoid
     (-sf 0.25 (*sf value value)))
    (:asigmoid
     (*sf value (-sf 1.0 value)))
    (:gaussian
     (*sf (-sf value) sum))))

(proclaim '(inline output-function output-prime))

(defun output-function (sum)
  "Compute the value of an output, given the weighted sum of incoming values.
  Defined output types are :sigmoid and :linear."
  (declare (short-float sum))
  (ecase *output-type*
    (:sigmoid (cond ((< sum -15.0) -0.5)
                    ((> sum 15.0) +0.5)
                    (t (-sf (/sf 1.0 (+sf 1.0 (exp (-sf sum)))) 0.5))))
    (:linear sum)))

(defun output-prime (output)
  "Compute the derivative of an output with respect to the weighted sum of
  incoming values. Defined output types are :sigmoid and :linear."
  (declare (short-float output))
  (ecase *output-type*
    (:sigmoid
     (+sf *sigmoid-prime-offset* (-sf 0.25 (*sf output output))))
    (:linear 1.0)))

;;; The basic routine for doing Quickprop-style update of weights.
;;; Distilled essence of a year's work...

(proclaim '(inline quickprop-update))

(defun quickprop-update (i weights deltas slopes prevs
                         epsilon decay mu shrink-factor)
  "Given vectors holding weights, deltas, slopes, and previous slopes,
  and an index i, update weight(i) and delta(i) appropriately. Move
  slope(i) to prev(i) and zero out slope(i). Add weight decay term to
  each slope before doing the update."
  (let* ((w (fvref weights i))
         (d (fvref deltas i))
         (s (+sf (fvref slopes i) (*sf decay w)))
         (p (fvref prevs i))
         (next-step 0.0))
    (declare (short-float w p s d next-step))
    ;; The step must always be downhill.
    (cond
      ;; If last step was negative...
      ((minusp d)
       ;; First, add in linear term if current slope is still positive.
       (when (plusp s)
         (decf-sf next-step (*sf epsilon s)))
       (cond
         ;; If current slope is close to or larger than prev slope...
         ((>= s (*sf shrink-factor p))
          ;; Take maximum size negative step.
          (incf-sf next-step (*sf mu d)))
         ;; Else, use quadratic estimate.
         (t (incf-sf next-step (*sf d (/sf s (-sf p s)))))))
      ;; If last step was positive...
      ((plusp d)
       ;; First, add in linear term if current slope is still negative.
       (when (minusp s)
         (decf-sf next-step (*sf epsilon s)))
       (cond
         ;; If current slope is close to or more neg than prev slope...
         ((<= s (*sf shrink-factor p))
          ;; Take maximum size positive step.
          (incf-sf next-step (*sf mu d)))
         ;; Else, use quadratic estimate.
         (t (incf-sf next-step (*sf d (/sf s (-sf p s)))))))
      ;; Last step was zero, so use only linear term.
      (t (decf-sf next-step (*sf epsilon s))))
    ;; Having computed the next step, update the data vectors.
    (setf (fvref deltas i) next-step)
    (setf (fvref weights i) (+sf w next-step))
    (setf (fvref prevs i) s)
    (setf (fvref slopes i) 0.0)
    nil))

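;;; The quadratic estimate above comes from fitting a parabola through the
;;; last two slope measurements: if the slope changed from p to s over the
;;; previous step d, treating the error curve as quadratic puts its minimum
;;; at
;;;   next-step = d * s / (p - s)
;;; and mu times the previous step caps this when p and s are nearly equal.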

;;;; Machinery for training output weights.

(defun set-up-inputs (input)
  "Set up all the inputs from the INPUT vector as the first few entries
  in the values vector."
  (declare (simple-vector input))
  (setf (fvref *values* 0) 1.0)
  (dotimes1 (i *ninputs*)
    (setf (fvref *values* (1+ i))
          (the short-float (svref input i)))))

(defun output-forward-pass ()
  "Assume the *VALUES* vector has been set up. Just compute the network's
  outputs."
  (dotimes1 (j *noutputs*)
    (let ((ow (svref *output-weights* j))
          (sum 0.0))
      (declare (short-float sum))
      (dotimes1 (i *nunits*)
        (incf-sf sum (*sf (fvref *values* i) (fvref ow i))))
      (setf (fvref *outputs* j)
            (output-function sum)))))

(defun compute-unit-value (j)
  "Assume that the *VALUES* vector has been set up for all units with index
  less than J. Compute and record the value for unit J."
  (declare (fixnum j))
  (let* ((c (svref *connections* j))
         (w (svref *weights* j))
         (sum 0.0))
    (declare (short-float sum))
    (dotimes1 (i (ivref *nconnections* j))
      (incf-sf sum (*sf (fvref *values* (ivref c i))
                        (fvref w i))))
    (setf (fvref *values* j) (activation sum))
    nil))

(defun full-forward-pass (input)
  "Set up the inputs from the INPUT vector, then propagate activation values
  forward through all hidden units and output units."
  (set-up-inputs input)
  ;; For each hidden unit J, compute the activation value.
  (do ((j (1+ *ninputs*) (1+ j)))
      ((= j *nunits*))
    (declare (fixnum j))
    (compute-unit-value j))
  ;; Now compute outputs.
  (output-forward-pass))

;;; Note: We fill the *ERRORS* vector and related statistics with either
;;; the raw error or the error after modification by output-prime,
;;; depending on the *RAW-ERROR* switch. This controls what form of error
;;; the candidate units try to correlate with. All experiments reported in
;;; TR CMU-CS-90-100 assume *RAW-ERROR* is NIL, but this might not always
;;; be the best choice.

(defun compute-errors (goal output-slopes-p stats-p)
  "GOAL is a vector of desired values for the output units. Compute and
  record the output errors for the current training case. If
  OUTPUT-SLOPES-P is T, then use errors to compute slopes for output
  weights. If STATS-P is T, accumulate error statistics."
  (declare (simple-vector goal))
  (dotimes1 (j *noutputs*)
    (let* ((out (fvref *outputs* j))
           (dif (-sf out (svref goal j)))
           (err-prime (*sf dif (output-prime out)))
           (os (svref *output-slopes* j)))
      (declare (short-float dif err-prime))
      (when stats-p
        (unless (< (abs dif) *score-threshold*)
          (incf *error-bits*))
        (incf-sf *true-error* (*sf dif dif)))
      (cond (*raw-error*
             (setf (fvref *errors* j) dif)
             (incf-sf *sum-error* dif)
             (incf-sf *sum-sq-error* (*sf dif dif)))
            (t
             (setf (fvref *errors* j) err-prime)
             (incf-sf *sum-error* err-prime)
             (incf-sf *sum-sq-error* (*sf err-prime err-prime))))
      (when output-slopes-p
        (dotimes1 (i *nunits*)
          (incf-sf (fvref os i) (*sf err-prime (fvref *values* i))))))))

;;; Note: Scaling *OUTPUT-EPSILON* by the number of cases seems to keep the
;;; quickprop update in a good range across many different-sized training
;;; sets, but it's something of a hack. Choosing good epsilon values
;;; still requires some trial and error.

(defun update-output-weights ()
  "Update the output weights, using the pre-computed slopes, prev-slopes,
  and delta values. Uses the quickprop update function."
  (let ((eps (/ *output-epsilon* *ncases*)))
    (dotimes1 (j *noutputs*)
      (let ((ow (svref *output-weights* j))
            (od (svref *output-deltas* j))
            (os (svref *output-slopes* j))
            (op (svref *output-prev-slopes* j)))
        (dotimes1 (i *nunits*)
          (quickprop-update i ow od os op eps *output-decay*
                            *output-mu* *output-shrink-factor*))))))


;;;; Outer loops for training output weights.

(defun train-outputs-epoch ()
  "Perform forward propagation once for each case in the training
  vectors, computing errors and slopes. Then update the output
  weights."
  ;; Zero error accumulators.
  (setq *error-bits* 0)
  (setq *true-error* 0.0)
  (setq *sum-error* 0.0)
  (setq *sum-sq-error* 0.0)
  ;; User may have changed mu between epochs, so fix shrink-factor.
  (setq *output-shrink-factor*
        (/sf *output-mu* (+sf 1.0 *output-mu*)))
  ;; Now run through the training examples.
  (do ((i *first-case* (1+ i)))
      ((= i (the fixnum (+ *first-case* *ncases*))))
    (declare (fixnum i))
    (setq *goal* (svref *training-outputs* i))
    (cond (*use-cache*
           (setq *values* (svref *values-cache* i))
           (setq *errors* (svref *errors-cache* i))
           (output-forward-pass))
          (t (setq *values* *extra-values*)
             (setq *errors* *extra-errors*)
             (full-forward-pass (svref *training-inputs* i))))
    (compute-errors *goal* t t))
  ;; Do not change weights or count epoch if this run was perfect.
  (unless (= 0 *error-bits*)
    (update-output-weights)
    (incf *epoch*)))

(defun record-output-weights ()
  "Store the output weights developed after each output-training phase
  in the *output-weights-record* vector."
  (let ((record (make-array *noutputs* :initial-element nil)))
    (dotimes1 (o *noutputs*)
      (let ((original (svref *output-weights* o))
            (copy (make-array *nunits* :element-type 'short-float
                              :initial-element 0.0)))
        (dotimes1 (u *nunits*)
          (setf (fvref copy u) (fvref original u)))
        (setf (svref record o) copy)))
    (setf (svref *output-weights-record* (1- *nunits*)) record)))

(defun train-outputs (max-epochs)
  "Train the output weights. If we exhaust MAX-EPOCHS, stop with value
  :TIMEOUT. If there are zero error bits, stop with value :WIN. Else,
  keep going until the true error has not changed by a significant amount
  for *OUTPUT-PATIENCE* epochs. Then return :STAGNANT. If
  *OUTPUT-PATIENCE* is zero, we do not stop until victory or until
  MAX-EPOCHS is used up."
  (declare (fixnum max-epochs))
  (let ((last-error 0.0)
        (quit-epoch (+ *epoch* *output-patience*))
        (first-time t))
    (declare (fixnum quit-epoch)
             (short-float last-error))
    (dotimes1 (i max-epochs (progn
                              (record-output-weights)
                              :timeout))
      ;; Maybe run a test epoch to see how we're doing.
      (when (and *test*
                 (not (= 0 *test-interval*))
                 (= 0 (mod i *test-interval*)))
        (test-epoch))
      (train-outputs-epoch)
      (cond ((zerop *error-bits*)
             (record-output-weights)
             (return :win))
            ((zerop *output-patience*))
            (first-time
             (setq first-time nil)
             (setq last-error *true-error*))
            ((> (abs (- *true-error* last-error))
                (* last-error *output-change-threshold*))
             (setq last-error *true-error*)
             (setq quit-epoch (+ *epoch* *output-patience*)))
            ((>= *epoch* quit-epoch)
             (record-output-weights)
             (return :stagnant))))))


;;;; Machinery for Training, Selecting, and Installing Candidate Units.

(defun init-candidates ()
  "Give new random weights to all of the candidate units. Zero the other
  candidate-unit statistics."
  (dotimes1 (i *ncandidates*)
    (setf (fvref *cand-sum-values* i) 0.0)
    (let ((cw (svref *cand-weights* i))
          (cd (svref *cand-deltas* i))
          (cs (svref *cand-slopes* i))
          (cp (svref *cand-prev-slopes* i))
          (cc (svref *cand-cor* i))
          (cpc (svref *cand-prev-cor* i)))
      (dotimes1 (j *nunits*)
        (setf (fvref cw j) (random-weight))
        (setf (fvref cd j) 0.0)
        (setf (fvref cs j) 0.0)
        (setf (fvref cp j) 0.0))
      (dotimes1 (o *noutputs*)
        (setf (fvref cc o) 0.0)
        (setf (fvref cpc o) 0.0)))))

(defun install-new-unit ()
  "Add the candidate unit with the best correlation score to the active
  network. Then reinitialize the candidate pool."
  (when (>= *nunits* *max-units*)
    (error "Cannot add any more units."))
  ;; For now, assume total connectivity.
  (setf (ivref *nconnections* *nunits*) *nunits*)
  (setf (svref *connections* *nunits*) *all-connections*)
  ;; Copy the weight vector for the new unit.
  (let ((w (make-array *nunits* :element-type 'short-float))
        (cw (svref *cand-weights* *best-candidate*)))
    (dotimes1 (i *nunits*)
      (setf (fvref w i) (fvref cw i)))
    (setf (svref *weights* *nunits*) w)
    ;; Tell user about the new unit.
    (format t " Add unit ~S: ~S~%"
            (+ 1 *nunits*) w))
  ;; Fix up output weights for the candidate unit.
  ;; Use minus the correlation times the *weight-multiplier* as an
  ;; initial guess. At least the sign should be right.
  (dotimes1 (o *noutputs*)
    (setf (fvref (svref *output-weights* o) *nunits*)
          (*sf (-sf (fvref (svref *cand-prev-cor* *best-candidate*) o))
               *weight-multiplier*)))
  ;; If using cache, run an epoch to compute this unit's values.
  (when *use-cache*
    (dotimes1 (i *max-cases*)
      (setq *values* (svref *values-cache* i))
      (compute-unit-value *nunits*)))
  ;; Reinitialize candidate units with random weights.
  (incf *nunits*)
  (init-candidates))

;;; Note: Ideally, after each adjustment of the candidate weights, we would
;;; run two epochs. The first would just determine the correlations
;;; between the candidate unit outputs and the residual error. Then, in a
;;; second pass, we would adjust each candidate's input weights so as to
;;; maximize the absolute value of the correlation. We need to know the
;;; sign of the correlation for each candidate-output pair so that we know
;;; which direction to tune the input weights.

;;; Since this ideal method doubles the number of epochs required for
;;; training candidates, we cheat slightly and use the correlation values
;;; computed BEFORE the most recent weight update. This combines the two
;;; epochs, saving us almost a factor of two. To bootstrap the process, we
;;; begin with a single epoch that computes only the correlation.

;;; Since we look only at the sign of the correlation and since that sign
;;; should change very infrequently, this probably is OK. But keep a
;;; lookout for pathological situations in which this might cause
;;; oscillation.


;;; This function is used only once at the start of each input-training
;;; phase to prime the pump. After that, each call to compute-slopes also
;;; computes the error-value products for the next epoch.

(defun compute-correlations ()
  "For the current training pattern, compute the value of each candidate
  unit and begin to compute the correlation between that unit's value and
  the error at each output. We have already done a forward-prop and
  computed the error values for active units."
  (dotimes1 (u *ncandidates*)
    (let ((sum 0.0)
          (v 0.0)
          (cw (svref *cand-weights* u))
          (cc (svref *cand-cor* u)))
      (declare (short-float sum v))
      ;; Determine activation value of each candidate unit.
      (dotimes1 (i *nunits*)
        (incf-sf sum (*sf (fvref cw i)
                          (fvref *values* i))))
      (setq v (activation sum))
      (incf-sf (fvref *cand-sum-values* u) v)
      ;; Accumulate value of each unit times error at each output.
      (dotimes1 (o *noutputs*)
        (incf-sf (fvref cc o) (*sf v (fvref *errors* o)))))))

;;; Note: When we were computing true correlations between candidates and
;;; outputs, this is where the normalization factors went in. Currently we
;;; are just using covariances, as explained in the tech report. So we
;;; make only two adjustments here. First, we subtract out the product of
;;; the mean error and the mean candidate value to keep things from
;;; exploding when the error has a non-zero mean. Second, we effectively
;;; scale the error values by the sum-squared error over all training
;;; cases. This just keeps us from having to adjust *input-epsilon*
;;; repeatedly as the error is gradually reduced to a small fraction of its
;;; initial size.

(defun adjust-correlations ()
  "Normalize each accumulated correlation value, and stuff the normalized
  form into the *cand-prev-cor* data structure. Then zero *cand-cor* to
  prepare for the next round. Note the unit with the best total
  correlation score."
  (setq *best-candidate* 0)
  (setq *best-candidate-score* 0.0)
  (dotimes1 (u *ncandidates*)
    (let* ((cc (svref *cand-cor* u))
           (cpc (svref *cand-prev-cor* u))
           (offset (*sf (fvref *cand-sum-values* u) *avg-error*))
           (cor 0.0)
           (score 0.0))
      (declare (short-float offset cor score))
      (dotimes1 (o *noutputs*)
        (setq cor (/sf (-sf (fvref cc o) offset) *sum-sq-error*))
        (setf (fvref cpc o) cor)
        (setf (fvref cc o) 0.0)
        (incf-sf score (abs cor)))
      ;; Keep track of the candidate with the best overall correlation.
      (when (> score *best-candidate-score*)
        (setq *best-candidate-score* score)
        (setq *best-candidate* u)))))

;;; This is the key function in the candidate training process.

(defun compute-slopes ()
  "Given the correlation values for each candidate-output pair, compute
  the derivative of the candidate's score with respect to each incoming
  weight."
  (dotimes1 (u *ncandidates*)
    (let* ((sum 0.0)
           (value 0.0)
           (actprime 0.0)
           (direction 0.0)
           (cw (svref *cand-weights* u))
           (cs (svref *cand-slopes* u))
           (cc (svref *cand-cor* u))
           (cpc (svref *cand-prev-cor* u)))
      (declare (short-float sum value actprime direction))
      ;; Forward pass through each candidate unit to compute activation-prime.
      (dotimes1 (i *nunits*)
        (incf-sf sum (*sf (fvref cw i)
                          (fvref *values* i))))
      (setq value (activation sum))
      (setq actprime (activation-prime value sum))
      ;; Now compute which way we want to adjust each unit's incoming
      ;; activation.
      (dotimes1 (o *noutputs*)
        (let ((error (fvref *errors* o)))
          (decf-sf direction
                   (*sf (if (minusp (fvref cpc o)) -1.0 1.0)
                        (*sf actprime
                             (/sf (-sf error *avg-error*)
                                  *sum-sq-error*))))
          ;; Also accumulate the error-value products for use next epoch.
          (incf-sf (fvref cc o) (*sf error value))))
      ;; Given the direction we want to push the candidate, compute
      ;; which way we want to tweak each incoming weight.
      (dotimes1 (i *nunits*)
        (incf-sf (fvref cs i)
                 (*sf direction (fvref *values* i)))))))

;;; Note: Scaling *INPUT-EPSILON* by the number of cases and number of
;;; inputs to each unit seems to keep the quickprop update in a good range,
;;; as the network goes from small to large, and across many
;;; different-sized training sets. Still, choosing a good epsilon value
;;; requires some trial and error.

(defun update-input-weights ()
  "Update the input weights, using the pre-computed slopes, prev-slopes,
  and delta values. Uses the quickprop update function."
  (let ((eps (/ *input-epsilon* (* *ncases* *nunits*))))
    (dotimes1 (u *ncandidates*)
      (let ((cw (svref *cand-weights* u))
            (cd (svref *cand-deltas* u))
            (cs (svref *cand-slopes* u))
            (cp (svref *cand-prev-slopes* u)))
        (dotimes1 (i *nunits*)
          (quickprop-update i cw cd cs cp eps *input-decay*
                            *input-mu* *input-shrink-factor*))))))

;;; Outer loop for training the candidate unit(s).

(defun train-inputs-epoch ()
  "For each training pattern, perform a forward pass. Tune the candidate
  units' weights to maximize the correlation score of each."
  (do ((i *first-case* (1+ i)))
      ((= i (the fixnum (+ *first-case* *ncases*))))
    (declare (fixnum i))
    (setq *goal* (svref *training-outputs* i))
    ;; Compute values and errors, or recall cached values.
    (cond (*use-cache*
           (setq *values* (svref *values-cache* i))
           (setq *errors* (svref *errors-cache* i)))
          (t (setq *values* *extra-values*)
             (setq *errors* *extra-errors*)
             (full-forward-pass (svref *training-inputs* i))
             (compute-errors *goal* nil nil)))
    ;; Compute the slopes we will use to adjust candidate weights.
    (compute-slopes))
  ;; User may have changed mu between epochs, so fix shrink-factor.
  (setq *input-shrink-factor* (/sf *input-mu*
                                   (+sf 1.0 *input-mu*)))
  ;; Now adjust the candidate unit input weights using quickprop.
  (update-input-weights)
  ;; Fix up the correlation values for the next epoch.
  (adjust-correlations)
  (incf *epoch*))

(defun correlations-epoch ()
  "Do an epoch through all active training patterns just to compute the
  initial correlations. After this one pass, we will update the
  correlations as we train."
  (do ((i *first-case* (1+ i)))
      ((= i (the fixnum (+ *first-case* *ncases*))))
    (declare (fixnum i))
    (setq *goal* (svref *training-outputs* i))
    (cond (*use-cache*
           (setq *values* (svref *values-cache* i))
           (setq *errors* (svref *errors-cache* i)))
          (t (setq *values* *extra-values*)
             (setq *errors* *extra-errors*)
             (full-forward-pass (svref *training-inputs* i))
             (compute-errors *goal* nil nil)))
    (compute-correlations))
  (adjust-correlations)
  (incf *epoch*))

(defun train-inputs (max-epochs)
  "Train the input weights of all candidates. If we exhaust MAX-EPOCHS,
  stop with value :TIMEOUT. Else, keep going until the best candidate
  unit's score has changed by a significant amount, and then until it does
  not change significantly for *INPUT-PATIENCE* epochs. Then return
  :STAGNANT. If *INPUT-PATIENCE* is zero, we do not stop until victory or
  until MAX-EPOCHS is used up."
  (declare (fixnum max-epochs))
  (setq *avg-error* (/ *sum-error* (* *ncases* *noutputs*)))
  (correlations-epoch)
  (let ((last-score 0.0)
        (quit max-epochs)
        (first-time t))
    (declare (fixnum quit)
             (short-float last-score))
    (dotimes1 (i max-epochs :timeout)
      (train-inputs-epoch)
      (cond ((zerop *input-patience*))
            (first-time
             (setq first-time nil)
             (setq last-score *best-candidate-score*))
            ((> (abs (-sf *best-candidate-score* last-score))
                (* last-score *input-change-threshold*))
             (setq last-score *best-candidate-score*)
             (setq quit (+ i *input-patience*)))
            ((>= i quit)
             (return :stagnant))))))

;;;; Outer Loop.

(defun list-parameters ()
  "Print out the current training parameters in abbreviated form."
  (format t "SigOff ~,2F, WtRng ~,2F, WtMul ~,2F~%"
          *sigmoid-prime-offset* *weight-range* *weight-multiplier*)
  (format t "OMu ~,2F, OEps ~,2F, ODcy ~,4F, OPat ~D, OChange ~,3F~%"
          *output-mu* *output-epsilon* *output-decay* *output-patience*
          *output-change-threshold*)
  (format t "IMu ~,2F, IEps ~,2F, IDcy ~,4F, IPat ~D, IChange ~,3F~%"
          *input-mu* *input-epsilon* *input-decay* *input-patience*
          *input-change-threshold*)
  (format t "Utype ~S, Otype ~S, RawErr ~S, Pool ~D~%"
          *unit-type* *output-type* *raw-error* *ncandidates*))

(defun train (outlimit inlimit rounds &optional (restart nil))
  "Train the output weights until stagnation or victory is reached. Then
  train the input weights to stagnation or victory. Then install the best
  candidate unit and repeat. OUTLIMIT and INLIMIT are upper limits on the
  number of cycles in each output and input phase. ROUNDS is an upper limit
  on the number of unit-installation cycles. If RESTART is non-nil, we are
  restarting training from the current point -- do not reinitialize the net."
  (declare (fixnum outlimit inlimit rounds))
  (unless restart (init-net))
  (list-parameters)
  (when *use-cache*
    (dotimes1 (i *max-cases*)
      (setq *values* (svref *values-cache* i))
      (set-up-inputs (svref *training-inputs* i))))
  (dotimes1 (r rounds :lose)
    (case (train-outputs outlimit)
      (:win
       (list-parameters)
       (format t "Victory at ~S epochs, ~S units, ~S hidden, Error ~S.~%"
               *epoch* *nunits* (- *nunits* *ninputs* 1) *true-error*)
       (return nil))
      (:timeout
       (format t "Epoch ~D: Out Timeout ~D bits wrong, error ~S.~2%"
               *epoch* *error-bits* *true-error*))
      (:stagnant
       (format t "Epoch ~D: Out Stagnant ~D bits wrong, error ~S.~2%"
               *epoch* *error-bits* *true-error*)))
    (when *test* (test-epoch))
    (case (train-inputs inlimit)
      (:timeout
       (format t "Epoch ~D: In Timeout. Cor: ~D~%"
               *epoch* *best-candidate-score*))
      (:stagnant
       (format t "Epoch ~D: In Stagnant. Cor: ~D~%"
               *epoch* *best-candidate-score*)))
    (install-new-unit)))

(defun test-epoch (&optional (*score-threshold* 0.49999))
  "Perform forward propagation once for each case in the training and
  testing vectors, reporting the performance. Do not change any
  weights. Do not use the caches."
  (let ((*use-cache* nil)
        (*values* *extra-values*)
        (*errors* *extra-errors*)
        (*error-bits* 0)
        (*true-error* 0.0)
        (*sum-error* 0.0)
        (*sum-sq-error* 0.0))
    ;; Run all training patterns and count errors.
    (dotimes1 (i (length *training-inputs*))
      (setq *goal* (svref *training-outputs* i))
      (full-forward-pass (svref *training-inputs* i))
      (compute-errors *goal* nil t))
    (format t "Training: ~D of ~D wrong, error ~S."
            *error-bits* (length *training-inputs*) *true-error*)
    ;; Zero some accumulators again.
    (setq *error-bits* 0)
    (setq *true-error* 0.0)
    (setq *sum-error* 0.0)
    (setq *sum-sq-error* 0.0)
    ;; Now run all test patterns and report the results.
    (when *test-inputs*
      (dotimes1 (i (length *test-inputs*))
        (setq *goal* (svref *test-outputs* i))
        (full-forward-pass (svref *test-inputs* i))
        (compute-errors *goal* nil t)))
    (format t " Test: ~D of ~D wrong, error ~S.~%"
            *error-bits* (length *test-inputs*) *true-error*)))

(defun test-setup (nunits weights output-weights)
  "Set up a network for testing, given stored weights and output weights."
  (init-net)
  (setq *weights* weights)
  (setq *output-weights* output-weights)
  (setq *nunits* nunits)
  (do ((i (1+ *ninputs*) (1+ i)))
      ((= i *nunits*))
    (declare (fixnum i))
    (setf (ivref *nconnections* i) i)
    (setf (svref *connections* i) *all-connections*)))


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; Example Applications ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;; Zig-Zag problem. An easy one, useful for testing the code.

(defun build-zig-zag (n)
  "Build N pairs of 1-D zig-zag."
  (declare (fixnum n))
  (setq *ninputs* 1)
  (setq *noutputs* 1)
  (let ((ti (make-array (* 2 n)))
        (to (make-array (* 2 n))))
    (dotimes1 (i n)
      (setf (svref ti (* i 2))
            (vector (+ i 1.0)))
      (setf (svref to (* i 2))
            (vector (if (evenp i) 0.5 -0.5)))
      (setf (svref ti (1+ (* i 2)))
            (vector (- (+ i 1.0))))
      (setf (svref to (1+ (* i 2)))
            (vector (if (evenp i) -0.5 0.5))))
    (setq *training-inputs* ti)
    (setq *training-outputs* to)
    (setq *test-inputs* ti)
    (setq *test-outputs* to))
  (build-net 1 1)
  (init-net))

;;; Call this with something like (BUILD-ZIG-ZAG 4), then call
;;; something like (TRAIN 100 100 25).


;;; Two spirals problem.

(defun build-two-spirals (&optional (n 97))
  "Build N point-pairs of the two-spiral problem, with standard default
  of 97 pairs."
  (declare (fixnum n))
  (setq *ninputs* 2)
  (setq *noutputs* 1)
  (let ((ti (make-array (* 2 n)))
        (to (make-array (* 2 n))))
    (dotimes1 (i n)
      (let* ((angle (/ (* i (coerce pi 'short-float)) 16.0))
             (radius (/ (* 6.5 (- 104.0 i)) 104))
             (x (* radius (sin angle)))
             (y (* radius (cos angle))))
        (setf (svref ti (* i 2))
              (vector x y))
        (setf (svref to (* i 2))
              (vector 0.5))
        (setf (svref ti (1+ (* i 2)))
              (vector (- x) (- y)))
        (setf (svref to (1+ (* i 2)))
              (vector -0.5))))
    ;; Put the inner part of the spiral first on the list.
    (setq ti (nreverse ti))
    (setq to (nreverse to))
    (setq *training-inputs* ti)
    (setq *training-outputs* to)
    (setq *test-inputs* ti)
    (setq *test-outputs* to))
  (build-net 2 1)
  (init-net))

;;; To run this, call (BUILD-TWO-SPIRALS), set various control parameters,
;;; and then call something like (TRAIN 100 100 25).

;;; For parameters, try these:
;;;   SigOff 0.10, WtRng 1.00, WtMul 1.00
;;;   OMu 2.00, OEps 1.00, ODcy 0.0001, OPat 12, OChange 0.010
;;;   IMu 2.00, IEps 100.00, IDcy 0.00000, IPat 8, IChange 0.030
;;;   Utype :SIGMOID, Otype :SIGMOID, RawErr NIL, Pool 8
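;;;
;;; Expressed as Lisp, using the variable names that LIST-PARAMETERS
;;; abbreviates, only three of those values differ from the defaults
;;; defined above:
;;; (setq *output-epsilon* 1.0     ; OEps
;;;       *output-patience* 12     ; OPat
;;;       *input-epsilon* 100.0)   ; IEps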

(defun time-two-spirals ()
  (setq *random-state* (make-random-state))
  (build-two-spirals)
  (time (train 100 100 25)))

;;; The End.

