/[cmucl]/src/benchmarks/cascor1.lisp

Revision 1.4
Sat Mar 14 14:17:09 1992 UTC by ram
Branch: MAIN
Changes since 1.3: +21 -3 lines
Put random state in the file so that it really is deterministic.  Supply legal
initial values for *test-{inputs,outputs}*.
;;; -*- Mode:Lisp -*-
;;; ***************************************************************************
;;; Common Lisp implementation of Cascade-Correlation learning algorithm.
;;; This version for export.  Non-portable user-interface stuff excised.
;;;
;;; Written by:  Scott E. Fahlman
;;;              School of Computer Science
;;;              Carnegie-Mellon University
;;;              Pittsburgh, PA 15217
;;;
;;; Phone:    (412) 268-2575
;;; Internet: fahlman@cs.cmu.edu
;;;
;;; This code has been placed in the public domain by the author.  As a
;;; matter of simple courtesy, anyone using or adapting this code is
;;; expected to acknowledge the source.  The author would like to hear
;;; about any attempts to use this system, successful or not.
;;;
;;; For an explanation of this algorithm and some results, see "The
;;; Cascade-Correlation Learning Architecture" by Scott E. Fahlman and
;;; Christian Lebiere in D. S. Touretzky (ed.), "Advances in Neural
;;; Information Processing Systems 2", Morgan Kaufmann, 1990.  A somewhat
;;; longer version is available as CMU Computer Science Tech Report
;;; CMU-CS-90-100.
;;;
;;; ***************************************************************************
;;; EDIT HISTORY SINCE FIRST RELEASE:
;;;
;;; 8/24/90:
;;; Modified TEST-EPOCH so that it wouldn't mess up error statistics being
;;; passed from the output-training to input-training phase.  Thanks to
;;; Scott Crowder for spotting this.
;;;
;;; 6/1/90:
;;; Fixed bug in INSTALL-NEW-UNIT.  The new unit's initial weight was being
;;; computed using the *CAND-COR* values of the successful candidate, which
;;; are already zero.  Now uses *CAND-PREV-COR*.  Thanks to Tim Howells for
;;; spotting this.
;;;
;;; Modified BUILD-NET to check that *MAX-UNITS* is large enough.  A couple
;;; of people got mysterious failures the first time they used lots of
;;; inputs.
;;;
;;; Made a small change in QUICKPROP-UPDATE to prevent rare divide-by-zero
;;; errors when p = s = 0.0.  Thanks to Tom Dietterich.
;;;
;;; Added CHANGED-TRAINING-SET, which should be called when the training set
;;; is changed but you don't want to reinitialize the net.  This rebuilds
;;; the caches.
;;;
;;; 11/9/90:
;;; Added some additional type declarations for maximum speed under certain
;;; Common Lisp compilers.
;;; ***************************************************************************

;;; This proclamation buys a certain amount of overall speed at the expense
;;; of runtime checking.  Comment it out when debugging new, bug-infested code.
#+declare-unsafe
(proclaim '(optimize (speed 3) (space 0) (safety 0)))

;;; Style note: Because some of these runs take a long time, this code is
;;; extensively hacked for good performance under a couple of Common Lisp
;;; systems, some of which have poor performance on multi-dimensional
;;; arrays and some of which have weak type-inference in the compiler.
;;; Elegance and clarity have in some cases been sacrificed for speed.

;;; In some problems, floating point underflow errors may occur as a result
;;; of weight-decay and other operations.  Most Common Lisp implementations
;;; have an option to turn floating underflows into zero values without
;;; signalling an error.  You should enable this facility if it is
;;; available.  If not, you'll either have to write a condition handler for
;;; floating underflows or live with the occasional underflow error.

;;; In CMU Common Lisp, we use the following incantation:
;;; (setq extensions:*ignore-floating-point-underflow* t)

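;;; Editorial sketch (not part of the original benchmark): one portable
;;; alternative is to catch the underflow with HANDLER-CASE and fall back
;;; to zero.  Note that this unwinds the whole guarded expression rather
;;; than fixing up the single offending operation, so an implementation
;;; switch like the CMU CL one above is preferable when available.

(defun demo-underflow-to-zero (x y)
  "Illustrative only: multiply two floats, treating underflow as 0.0."
  (handler-case (* x y)
    (floating-point-underflow () 0.0)))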

;;; Compensate for the clumsy Common Lisp declaration system and weak
;;; type-inference in some Common Lisp compilers.

;;; INCF-SF, *SF, etc. are like INCF, *, etc., but they declare their
;;; operands and results to be short-floats.  The code gets unreadable
;;; quickly if you insert all these declarations by hand.

(defmacro incf-sf (place &optional (increment 1.0))
  `(the short-float (incf (the short-float ,place)
                          (the short-float ,increment))))

(defmacro decf-sf (place &optional (increment 1.0))
  `(the short-float (decf (the short-float ,place)
                          (the short-float ,increment))))

(defmacro *sf (&rest args)
  `(the short-float
        (* ,@(mapcar #'(lambda (x) (list 'the 'short-float x)) args))))

(defmacro +sf (&rest args)
  `(the short-float
        (+ ,@(mapcar #'(lambda (x) (list 'the 'short-float x)) args))))

(defmacro -sf (&rest args)
  `(the short-float
        (- ,@(mapcar #'(lambda (x) (list 'the 'short-float x)) args))))

(defmacro /sf (&rest args)
  `(the short-float
        (/ ,@(mapcar #'(lambda (x) (list 'the 'short-float x)) args))))

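;;; Editorial example (not in the original file): each of these macros
;;; just wraps THE declarations around its operands, so
;;;   (+sf x y) == (the short-float (+ (the short-float x)
;;;                                    (the short-float y))).
;;; A toy function in this style, assuming W and V are simple vectors
;;; whose elements are short-floats:

(defun demo-weighted-sum-sf (w v n)
  "Illustrative only: sum W[i]*V[i] for I below N using the SF macros."
  (declare (simple-vector w v) (fixnum n))
  (let ((sum 0.0))
    (declare (short-float sum))
    (dotimes (i n sum)
      (incf-sf sum (*sf (svref w i) (svref v i))))))
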
;;; DOTIMES1 is like DOTIMES, only with the loop counter declared as a
;;; fixnum.  This is for compilers with weak type inference.

(defmacro dotimes1 (form1 &body body)
  `(dotimes ,form1 (declare (fixnum ,(car form1))) . ,body))

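;;; For example (editorial note), (dotimes1 (i n) (f i)) expands to
;;; (dotimes (i n) (declare (fixnum i)) (f i)).
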
;;; Create vector-access forms similar to SVREF, but for vectors of
;;; element-type SHORT-FLOAT and FIXNUM.

(eval-when (compile load eval)
  (defconstant fvector-type
    (array-element-type (make-array '(1) :element-type 'short-float)))
  (defconstant ivector-type
    (array-element-type (make-array '(1) :element-type 'fixnum))))

(defmacro fvref (a i)
  "Like SVREF, but with vectors of element-type SHORT-FLOAT."
  (if (eq fvector-type t)
      `(the short-float (svref ,a ,i))
      `(the short-float
            (aref (the (simple-array ,fvector-type (*)) ,a) ,i))))

(defmacro ivref (a i)
  "Like SVREF, but with vectors of element-type FIXNUM."
  (if (eq ivector-type t)
      `(the fixnum (svref ,a ,i))
      `(the fixnum
            (aref (the (simple-array ,ivector-type (*)) ,a) ,i))))

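;;; Editorial illustration: on an implementation with specialized
;;; short-float vectors, (fvref v i) expands to something like
;;; (the short-float (aref (the (simple-array single-float (*)) v) i)),
;;; while on one whose "specialized" float vectors degrade to general
;;; vectors it is just (the short-float (svref v i)).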

;;;; Assorted Parameters and Controls.

;;; These parameters and switches control the quickprop learning algorithm
;;; used to train the output weights and candidate units.

(defvar *unit-type* :sigmoid
  "The type of activation function used by the hidden units.  Options
  currently implemented are :sigmoid, :asigmoid, and :gaussian.  Sigmoid is
  symmetric in range -0.5 to +0.5, while Asigmoid is asymmetric, 0.0 to
  1.0.")

(defvar *output-type* :sigmoid
  "The activation function to use on the output units.  Options currently
  implemented are :linear and :sigmoid.")

(defvar *raw-error* nil
  "If T, candidate units will try to correlate with the raw difference
  between actual and desired outputs.  Else, they use the difference modified
  by the derivative of the output unit activation function.")

(defvar *sigmoid-prime-offset* 0.1
  "This is added to the derivative of the sigmoid function to prevent the
  system from getting stuck at the points where sigmoid-prime goes to
  zero.")
(proclaim '(short-float *sigmoid-prime-offset*))

(defvar *weight-range* 1.0
  "Input weights in the network get initial random values between plus and
  minus *weight-range*.  This parameter also controls the initial weights
  on direct input-to-output links.")
(proclaim '(short-float *weight-range*))

(defvar *weight-multiplier* 1.0
  "The output weights for candidate units get an initial value that is the
  negative of the correlation times this factor.")
(proclaim '(short-float *weight-multiplier*))

(defvar *output-mu* 2.0
  "Mu parameter used for quickprop training of output weights.  The
  step size is limited to mu times the previous step.")
(proclaim '(short-float *output-mu*))

(defvar *output-shrink-factor* (/ *output-mu* (+ 1.0 *output-mu*))
  "Derived from *output-mu*.  Used in computing whether the proposed step is
  too large.")
(proclaim '(short-float *output-shrink-factor*))

(defvar *output-epsilon* 0.35
  "Controls the amount of linear gradient descent to use in updating
  output weights.")
(proclaim '(short-float *output-epsilon*))

(defvar *output-decay* 0.0001
  "This factor times the current weight is added to the slope at the
  start of each output-training epoch.  Keeps weights from growing too big.")
(proclaim '(short-float *output-decay*))

(defvar *output-patience* 8
  "If we go for this many epochs with no significant change, it's time to
  stop tuning.  If 0, go on forever.")
(proclaim '(fixnum *output-patience*))

(defvar *output-change-threshold* 0.01
  "The error must change by at least this fraction of its old value in
  order to count as a significant change.")
(proclaim '(short-float *output-change-threshold*))

(defvar *input-mu* 2.0
  "Mu parameter used for quickprop training of input weights.  The
  step size is limited to mu times the previous step.")
(proclaim '(short-float *input-mu*))

(defvar *input-shrink-factor* (/ *input-mu* (+ 1.0 *input-mu*))
  "Derived from *input-mu*.  Used in computing whether the proposed step is
  too large.")
(proclaim '(short-float *input-shrink-factor*))

(defvar *input-epsilon* 1.0
  "Controls the amount of linear gradient descent to use in updating
  unit input weights.")
(proclaim '(short-float *input-epsilon*))

(defvar *input-decay* 0.0
  "This factor times the current weight is added to the slope at the
  start of each input-training epoch.  Keeps weights from growing too big.")
(proclaim '(short-float *input-decay*))

(defvar *input-patience* 8
  "If we go for this many epochs with no significant change, it's time to
  stop tuning.  If 0, go on forever.")
(proclaim '(fixnum *input-patience*))

(defvar *input-change-threshold* 0.03
  "The correlation score for the best unit must change by at least
  this fraction of its old value in order to count as a significant
  change.")
(proclaim '(short-float *input-change-threshold*))

;;; Variables related to error and correlation.

(defvar *score-threshold* 0.4
  "An output is counted as correct for a given case if the difference
  between that output and the desired value is smaller in magnitude than
  this value.")
(proclaim '(short-float *score-threshold*))

(defvar *error-bits* 0
  "Count of the number of bits in the epoch that are wrong by more than
  *SCORE-THRESHOLD*.")
(proclaim '(fixnum *error-bits*))

(defvar *true-error* 0.0
  "The sum-squared error at the network outputs.  This is the value the
  algorithm is ultimately trying to minimize.")
(proclaim '(short-float *true-error*))

(defvar *sum-error* 0.0
  "Accumulates the sum of the error values after the output training phase.")
(proclaim '(short-float *sum-error*))

(defvar *sum-sq-error* 0.0
  "Accumulates the sum of the squared error values after the output
  training phase.")
(proclaim '(short-float *sum-sq-error*))

(defvar *avg-error* 0.0
  "Holds the average of error values after the output training phase.")
(proclaim '(short-float *avg-error*))

(defvar *best-candidate-score* 0.0
  "The best correlation score found among all candidate units being
  trained.")
(proclaim '(short-float *best-candidate-score*))

(defvar *best-candidate* 0
  "The index of the candidate unit whose correlation score is best
  at present.")
(proclaim '(fixnum *best-candidate*))

;;; These variables and switches control the simulation and display.

(defvar *use-cache* t
  "If T, cache the forward-pass values instead of repeatedly
  computing them.  This can save a *lot* of time if all the cached values
  fit into memory.")

(defparameter *epoch* 0
  "Count of the number of times the entire training set has been presented.")
(proclaim '(fixnum *epoch*))

(defvar *test* nil
  "If T, run a test epoch every so often during output training.")

(defvar *test-interval* 0
  "Run a test epoch every *test-interval* output training cycles.")
(proclaim '(fixnum *test-interval*))

(defvar *single-pass* nil
  "When on, pause after the next forward/backward cycle.")

(defvar *single-epoch* nil
  "When on, pause after the next training epoch.")

(defparameter *step* nil
  "Turned briefly to T in order to continue after a pause.")

;;; The sets of training inputs and outputs are stored in parallel vectors.
;;; Each element is a SIMPLE-VECTOR holding short-float values, one for
;;; each input or output.  Note: this is a simple vector, not a specialized
;;; vector of element-type short-float.

(defvar *training-inputs* (make-array 0)
  "Vector of input patterns for training the net.")
(proclaim '(simple-vector *training-inputs*))

(defvar *training-outputs* (make-array 0)
  "Vector of output patterns for training the net.")
(proclaim '(simple-vector *training-outputs*))

(defvar *goal* (make-array 0)
  "The goal vector for the current training or testing case.")
(proclaim '(simple-vector *goal*))

(defvar *max-cases* 0
  "Maximum number of training cases that can be accommodated by the current
  data structures.")
(proclaim '(fixnum *max-cases*))

(defvar *ncases* 0
  "Number of training cases currently in use.  Assume a contiguous block
  beginning with *FIRST-CASE*.")
(proclaim '(fixnum *ncases*))

(defvar *first-case* 0
  "Address of the first training case in the currently active set.  Usually
  zero, but may differ if we are training on different chunks of the training
  set at different times.")
(proclaim '(fixnum *first-case*))

;;; For some benchmarks there is a separate set of values used for testing
;;; the network's ability to generalize.  These values are not used during
;;; training.

(defvar *test-inputs* '#()
  "Vector of input patterns for testing the net.")
(proclaim '(simple-vector *test-inputs*))

(defvar *test-outputs* '#()
  "Vector of output patterns for testing the net.")
(proclaim '(simple-vector *test-outputs*))


;;;; Fundamental data structures.

;;; Unit values and weights are short flonums.

;;; Instead of representing each unit by a structure, we represent the
;;; unit by a fixnum.  This is used to index into various vectors that hold
;;; per-unit information, such as the activation value of each unit.
;;; So the information concerning a given unit is found in a slice of values
;;; across many vectors, all with the same unit-index.

;;; Per-connection information for each connection COMING INTO a unit is
;;; stored in a vector of vectors.  The outer vector is indexed by the unit
;;; number, and the inner vector is then indexed by connection number.
;;; This is a sleazy way of implementing a 2-D array, faster in most Lisp
;;; systems than multiplying to do the index arithmetic, and more efficient
;;; if the units are sparsely connected.

;;; Unit 0, the "bias unit", is always at a maximum-on value.  Next come
;;; some input "units", then some hidden units.

;;; Output units have their own separate set of data structures and
;;; indices.  The units and outputs together form the "active" network.
;;; There are also separate data structures and indices for the "candidate"
;;; units that have not yet been added to the network.

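;;; Editorial illustration of the layout: in a net with two inputs and
;;; one installed hidden unit, the unit indices are 0 (bias), 1-2
;;; (inputs), and 3 (hidden).  (svref *connections* 3) would then be the
;;; fixnum vector #(0 1 2), and (svref *weights* 3) a parallel vector of
;;; three short-float weights, one per incoming connection.
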
(defvar *max-units* 30
  "Maximum number of input values and hidden units in the network.")
(proclaim '(fixnum *max-units*))

(defvar *ninputs* 0
  "Number of inputs for this problem.")
(proclaim '(fixnum *ninputs*))

(defvar *noutputs* 0
  "Number of outputs for this problem.")
(proclaim '(fixnum *noutputs*))

(defvar *nunits* 0
  "Current number of active units in the network.  This count includes all
  inputs to the network and the bias unit.")
(proclaim '(fixnum *nunits*))

(defvar *ncandidates* 8
  "Number of candidate units whose inputs will be trained at once.")
(proclaim '(fixnum *ncandidates*))

;;; The following vectors hold values related to hidden units in the active
;;; net and their input weights.  The vectors are created by BUILD-NET, after
;;; the dimension variables have been set up.

(defvar *values* nil
  "Vector holding the current activation value for each unit and input in
  the active net.")

(defvar *values-cache* nil
  "Holds a distinct *VALUES* vector for each of the *MAX-CASES* training
  cases.  Once we have computed the *VALUES* vector for each training case,
  we can use it repeatedly until the weights or training cases change.")

(defvar *extra-values* nil
  "Extra values vector to use when not using the cache.")

;;; Note: the *NCONNECTIONS* and *CONNECTIONS* vectors could be eliminated
;;; if we wanted to commit to total connectivity for all units.
;;; For now, we want to allow for sparse or irregular connectivity.

(defvar *nconnections* nil
  "Vector holding the number of incoming connections for each unit.")

(defvar *connections* nil
  "Vector that holds a connection vector for each unit J.
  Each entry in the connection vector holds a unit index I,
  indicating that this connection is from I to J.")

(defvar *weights* nil
  "Vector of vectors with structure parallel to the *connections* vector.
  Each entry gives the weight associated with an incoming connection.")

;;; The following vectors hold values for the outputs of the active
;;; network and the output-side weights.

(defvar *outputs* nil
  "Vector holding the network output values.")

(defvar *errors* nil
  "Vector holding the current error value for each output.")

(defvar *errors-cache* nil
  "Holds a distinct *ERRORS* vector for each of the *MAX-CASES* training
  cases.  Once we have computed the *ERRORS* vector for a given training
  case, we can use it repeatedly until the weights or the training cases
  change.")

(defvar *extra-errors* nil
  "Extra errors vector to use when not using the cache.")

(defvar *output-weights* nil
  "Vector of vectors.  For each output, we have a vector of output weights
  coming from the unit indicated by the index.")

(defvar *output-deltas* nil
  "Vector of vectors, parallel with output weights.  Each entry is the
  amount by which the corresponding output weight was changed last time.")

(defvar *output-slopes* nil
  "Vector of vectors, parallel with output weights.  Each entry is the
  partial derivative of the total error with respect to the corresponding
  weight.")

(defvar *output-prev-slopes* nil
  "Vector of vectors, parallel with output weights.  Each entry is the
  previous value of the corresponding *OUTPUT-SLOPES* entry.")

(defvar *output-weights-record* nil
  "The vector of output weights is recorded here after each output-training
  phase and just prior to the addition of the next unit.  This record
  allows us to reconstruct the network's performance at each of these
  points in time.")

;;; The following vectors have one entry for each candidate unit in the
;;; pool of trainees.

(defvar *cand-sum-values* nil
  "For each candidate unit, the sum of its values over an entire
  training set.")

(defvar *cand-cor* nil
  "A vector with one entry for each candidate unit.  This entry is a vector
  that holds the correlation between this unit's value and the residual
  error at each of the outputs, computed over a whole epoch.")

(defvar *cand-prev-cor* nil
  "Holds the *cand-cor* values computed in the previous candidate training
  epoch.")

(defvar *cand-weights* nil
  "A vector with one entry for each candidate unit.  This entry is a vector
  that holds the current input weights for that candidate unit.")

(defvar *cand-deltas* nil
  "A vector with one entry for each candidate unit.  This entry is a vector
  that holds the input weight deltas for that candidate unit.")

(defvar *cand-slopes* nil
  "A vector with one entry for each candidate unit.  This entry is a vector
  that holds the input weight slopes for that candidate unit.")

(defvar *cand-prev-slopes* nil
  "A vector with one entry for each candidate unit.  This entry is a vector
  that holds the previous values of the input weight slopes for that
  candidate unit.")

;;; At present, each candidate receives a connection from every input and
;;; pre-existing unit.  Rather than cons up a new *connections* vector for
;;; each of these, we can just use this one for all of them.

(defvar *all-connections* nil
  "A *CONNECTIONS* vector that can be used by any unit that connects to
  all lower-numbered units, in order.")


;;;; Network-building utilities.

(defun build-net (ninputs noutputs)
  "Create the network data structures, given the number of input and output
  connections.  Get *MAX-UNITS* and other dimensions from variables."
  (declare (fixnum ninputs noutputs))
  ;; Check to make sure *MAX-UNITS* is big enough.
  (unless (> *max-units* (+ ninputs 1))
    (error "*MAX-UNITS* must be greater than number of inputs plus 1."))
  ;; Fill in assorted variables and create top-level vectors.
  (setq *ninputs* ninputs
        *noutputs* noutputs
        *max-cases* (length *training-inputs*)
        *ncases* *max-cases*
        *first-case* 0
        *nunits* (+ 1 *ninputs*)
        *values-cache* (make-array *max-cases* :initial-element nil)
        *extra-values* (make-array *max-units*
                                   :element-type 'short-float
                                   :initial-element 0.0)
        *values* *extra-values*
        *nconnections* (make-array *max-units*
                                   :element-type 'fixnum
                                   :initial-element 0)
        *connections* (make-array *max-units* :initial-element nil)
        *weights* (make-array *max-units* :initial-element nil)
        *outputs* (make-array *noutputs*
                              :element-type 'short-float
                              :initial-element 0.0)
        *errors-cache* (make-array *max-cases* :initial-element nil)
        *extra-errors* (make-array *noutputs*
                                   :element-type 'short-float
                                   :initial-element 0.0)
        *errors* *extra-errors*
        *output-weights* (make-array *noutputs* :initial-element nil)
        *output-weights-record* (make-array *max-units* :initial-element nil)
        *output-deltas* (make-array *noutputs* :initial-element nil)
        *output-slopes* (make-array *noutputs* :initial-element nil)
        *output-prev-slopes* (make-array *noutputs* :initial-element nil)
        *cand-sum-values* (make-array *ncandidates*
                                      :element-type 'short-float
                                      :initial-element 0.0)
        *cand-cor* (make-array *ncandidates* :initial-element nil)
        *cand-prev-cor* (make-array *ncandidates* :initial-element nil)
        *cand-weights* (make-array *ncandidates* :initial-element nil)
        *cand-deltas* (make-array *ncandidates* :initial-element nil)
        *cand-slopes* (make-array *ncandidates* :initial-element nil)
        *cand-prev-slopes* (make-array *ncandidates* :initial-element nil))
  ;; Only create the caches if *USE-CACHE* is on -- may not always have room.
  (when *use-cache*
    (dotimes1 (i *max-cases*)
      (setf (svref *values-cache* i)
            (make-array *max-units*
                        :element-type 'short-float
                        :initial-element 0.0))
      (setf (svref *errors-cache* i)
            (make-array *noutputs*
                        :element-type 'short-float
                        :initial-element 0.0))))
  ;; For each output, create the vectors holding per-weight information.
  (dotimes1 (i *noutputs*)
    (setf (svref *output-weights* i)
          (make-array *max-units*
                      :element-type 'short-float
                      :initial-element 0.0))
    (setf (svref *output-deltas* i)
          (make-array *max-units*
                      :element-type 'short-float
                      :initial-element 0.0))
    (setf (svref *output-slopes* i)
          (make-array *max-units*
                      :element-type 'short-float
                      :initial-element 0.0))
    (setf (svref *output-prev-slopes* i)
          (make-array *max-units*
                      :element-type 'short-float
                      :initial-element 0.0)))
  ;; For each candidate unit, create the vectors holding the correlations,
  ;; incoming weights, and other stats.
  (dotimes1 (i *ncandidates*)
    (setf (svref *cand-cor* i)
          (make-array *noutputs*
                      :element-type 'short-float
                      :initial-element 0.0))
    (setf (svref *cand-prev-cor* i)
          (make-array *noutputs*
                      :element-type 'short-float
                      :initial-element 0.0))
    (setf (svref *cand-weights* i)
          (make-array *max-units*
                      :element-type 'short-float
                      :initial-element 0.0))
    (setf (svref *cand-deltas* i)
          (make-array *max-units*
                      :element-type 'short-float
                      :initial-element 0.0))
    (setf (svref *cand-slopes* i)
          (make-array *max-units*
                      :element-type 'short-float
                      :initial-element 0.0))
    (setf (svref *cand-prev-slopes* i)
          (make-array *max-units*
                      :element-type 'short-float
                      :initial-element 0.0))))

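;;; Example call (editorial note): after *training-inputs* and
;;; *training-outputs* have been filled in, (build-net 2 1) sizes every
;;; structure for two inputs and one output; BUILD-TWO-SPIRALS near the
;;; end of this file does exactly that.
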
(defun random-weight ()
  "Select a random weight, uniformly distributed over the
  interval from minus to plus *weight-range*."
  (-sf (random (*sf 2.0 *weight-range*)) *weight-range*))

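;;; For example (editorial note), with *weight-range* at its default of
;;; 1.0, (random-weight) returns a short-float in the half-open interval
;;; [-1.0, 1.0).
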
(defun init-net ()
  "Set up the network for a learning problem.  Clean up all the data
  structures that may have become corrupted.  Initialize the output weights
  to random values controlled by *weight-range*."
  ;; Set up the *ALL-CONNECTIONS* vector.
  (setq *all-connections*
        (make-array *max-units* :element-type 'fixnum))
  (dotimes1 (i *max-units*)
    (setf (ivref *all-connections* i) i))
  ;; Initialize the active unit data structures.
  (dotimes1 (i *max-units*)
    (setf (fvref *extra-values* i) 0.0)
    (setf (ivref *nconnections* i) 0)
    (setf (svref *connections* i) nil)
    (setf (svref *weights* i) nil)
    (setf (svref *output-weights-record* i) nil))
  ;; Initialize the per-output data structures.
  (dotimes1 (i *noutputs*)
    (setf (fvref *outputs* i) 0.0)
    (setf (fvref *extra-errors* i) 0.0)
    (let ((ow (svref *output-weights* i))
          (od (svref *output-deltas* i))
          (os (svref *output-slopes* i))
          (op (svref *output-prev-slopes* i)))
      (dotimes1 (j *max-units*)
        (setf (fvref ow j) 0.0)
        (setf (fvref od j) 0.0)
        (setf (fvref os j) 0.0)
        (setf (fvref op j) 0.0))
      ;; Set up initial random weights for the input-to-output connections.
      (dotimes1 (j (1+ *ninputs*))
        (setf (fvref ow j) (random-weight)))))
  ;; Initialize the caches if they are in use.
  (when *use-cache*
    (dotimes1 (j *max-cases*)
      (let ((v (svref *values-cache* j))
            (e (svref *errors-cache* j)))
        (dotimes1 (i *max-units*)
          (setf (fvref v i) 0.0))
        (dotimes1 (i *noutputs*)
          (setf (fvref e i) 0.0)))))
  ;; Candidate units get initialized in a separate routine.
  (init-candidates)
  ;; Do some other assorted housekeeping.
  (setf (fvref *extra-values* 0) 1.0)
  (setq *epoch* 0)
  (setq *nunits* (+ 1 *ninputs*))
  (setq *error-bits* 0)
  (setq *true-error* 0.0)
  (setq *sum-error* 0.0)
  (setq *sum-sq-error* 0.0)
  (setq *best-candidate-score* 0.0)
  (setq *best-candidate* 0))

(defun changed-training-set ()
  "Call this instead of BUILD-NET and INIT-NET if you want to leave
  existing hidden units in place and start from here with new training
  examples.  Assumes that the number of net inputs and outputs remains the
  same, but the number of cases may have changed.  Rebuilds the caches."
  (setq *max-cases* (length *training-inputs*)
        *ncases* *max-cases*
        *first-case* 0
        *values-cache* (make-array *max-cases* :initial-element nil)
        *errors-cache* (make-array *max-cases* :initial-element nil))
  ;; Only create the caches if *USE-CACHE* is on -- may not always have room.
  (when *use-cache*
    (dotimes1 (i *max-cases*)
      (setf (svref *errors-cache* i)
            (make-array *noutputs*
                        :element-type 'short-float
                        :initial-element 0.0))
      (setq *values* (make-array *max-units*
                                 :element-type 'short-float
                                 :initial-element 0.0))
      (setf (svref *values-cache* i) *values*)
      (set-up-inputs (svref *training-inputs* i))
      (do ((j (1+ *ninputs*) (1+ j)))
          ((= j *nunits*))
        (declare (fixnum j))
        (compute-unit-value j)))))

;;;; Utilities for learning.

(proclaim '(inline activation activation-prime))

(defun activation (sum)
  "Given the sum of weighted inputs, compute the unit's activation value.
  Defined unit types are :sigmoid, :asigmoid, and :gaussian."
  (declare (short-float sum))
  (ecase *unit-type*
    (:sigmoid
     ;; Symmetric sigmoid function in range -0.5 to +0.5.
     (cond ((< sum -15.0) -0.5)
           ((> sum 15.0) +0.5)
           (t (-sf (/sf (+sf 1.0 (exp (-sf sum)))) 0.5))))
    (:asigmoid
     ;; Asymmetric sigmoid in range 0.0 to 1.0.
     (cond ((< sum -15.0) 0.0)
           ((> sum 15.0) 1.0)
           (t (/sf 1.0 (+sf 1.0 (exp (-sf sum)))))))
    (:gaussian
     ;; Gaussian activation function in range 0.0 to 1.0.
     (let ((x (*sf -0.5 sum sum)))
       (if (< x -75.0) 0.0 (exp x))))))

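;;; In closed form (editorial note): :sigmoid computes 1/(1+e^-sum) - 0.5,
;;; :asigmoid computes 1/(1+e^-sum), and :gaussian computes e^(-sum^2/2),
;;; each with its argument clipped so that EXP cannot overflow or
;;; underflow.
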
;;; Note: do not use *SIGMOID-PRIME-OFFSET* here, as it confuses the
;;; correlation machinery.  But do use it in output-prime, since it does no
;;; harm there and the output units often get stuck at extreme values.

(defun activation-prime (value sum)
  "Given the unit's activation value and sum of weighted inputs, compute
  the derivative of the activation with respect to the sum.  Defined unit
  types are :sigmoid, :asigmoid, and :gaussian."
  (declare (short-float value sum))
  (ecase *unit-type*
    (:sigmoid
     (-sf 0.25 (*sf value value)))
    (:asigmoid
     (*sf value (-sf 1.0 value)))
    (:gaussian
     (*sf (-sf value) sum))))

(proclaim '(inline output-function output-prime))

(defun output-function (sum)
  "Compute the value of an output, given the weighted sum of incoming values.
  Defined output types are :sigmoid and :linear."
  (declare (short-float sum))
  (ecase *output-type*
    (:sigmoid (cond ((< sum -15.0) -0.5)
                    ((> sum 15.0) +0.5)
                    (t (-sf (/sf 1.0 (+sf 1.0 (exp (-sf sum)))) 0.5))))
    (:linear sum)))

(defun output-prime (output)
  "Compute the derivative of an output with respect to the weighted sum of
  incoming values.  Defined output types are :sigmoid and :linear."
  (declare (short-float output))
  (ecase *output-type*
    (:sigmoid
     (+sf *sigmoid-prime-offset* (-sf 0.25 (*sf output output))))
    (:linear 1.0)))

;;; The basic routine for doing Quickprop-style update of weights.
;;; Distilled essence of a year's work...

(proclaim '(inline quickprop-update))

(defun quickprop-update (i weights deltas slopes prevs
                           epsilon decay mu shrink-factor)
  "Given vectors holding weights, deltas, slopes, and previous slopes,
  and an index i, update weight(i) and delta(i) appropriately.  Move
  slope(i) to prev(i) and zero out slope(i).  Add weight decay term to
  each slope before doing the update."
  (let* ((w (fvref weights i))
         (d (fvref deltas i))
         (s (+sf (fvref slopes i) (*sf decay w)))
         (p (fvref prevs i))
         (next-step 0.0))
    (declare (short-float w p s d next-step))
    ;; The step must always be downhill.
    (cond
     ;; If last step was negative...
     ((minusp d)
      ;; First, add in linear term if current slope is still positive.
      (when (plusp s)
        (decf-sf next-step (*sf epsilon s)))
      (cond
       ;; If current slope is close to or larger than prev slope...
       ((>= s (*sf shrink-factor p))
        ;; Take maximum size negative step.
        (incf-sf next-step (*sf mu d)))
       ;; Else, use quadratic estimate.
       (t (incf-sf next-step (*sf d (/sf s (-sf p s)))))))
     ;; If last step was positive...
     ((plusp d)
      ;; First, add in linear term if current slope is still negative.
      (when (minusp s)
        (decf-sf next-step (*sf epsilon s)))
      (cond
       ;; If current slope is close to or more neg than prev slope...
       ((<= s (*sf shrink-factor p))
        ;; Take maximum size positive step.
        (incf-sf next-step (*sf mu d)))
       ;; Else, use quadratic estimate.
       (t (incf-sf next-step (*sf d (/sf s (-sf p s)))))))
     ;; Last step was zero, so use only linear term.
     (t (decf-sf next-step (*sf epsilon s))))
    ;; Having computed the next step, update the data vectors.
    (setf (fvref deltas i) next-step)
    (setf (fvref weights i) (+sf w next-step))
    (setf (fvref prevs i) s)
    (setf (fvref slopes i) 0.0)
    nil))

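;;; Editorial note on the update above: the quadratic estimate is the
;;; minimum of the parabola fitted through the current and previous slope
;;; measurements, step = d * s / (p - s), where d is the previous step and
;;; s, p are the current and previous slopes.  It is clipped to at most mu
;;; times the previous step, and a plain gradient term -epsilon*s is added
;;; whenever the slope still points in the direction of the last step.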

;;;; Machinery for training output weights.

(defun set-up-inputs (input)
  "Set up all the inputs from the INPUT vector as the first few entries in
  the values vector."
  (declare (simple-vector input))
  (setf (fvref *values* 0) 1.0)
  (dotimes1 (i *ninputs*)
    (setf (fvref *values* (1+ i))
          (the short-float (svref input i)))))

(defun output-forward-pass ()
  "Assume the *VALUES* vector has been set up.  Just compute the network's
  outputs."
  (dotimes1 (j *noutputs*)
    (let ((ow (svref *output-weights* j))
          (sum 0.0))
      (declare (short-float sum))
      (dotimes1 (i *nunits*)
        (incf-sf sum (*sf (fvref *values* i) (fvref ow i))))
      (setf (fvref *outputs* j)
            (output-function sum)))))

(defun compute-unit-value (j)
  "Assume that the *VALUES* vector has been set up for all units with index
  less than J.  Compute and record the value for unit J."
  (declare (fixnum j))
  (let* ((c (svref *connections* j))
         (w (svref *weights* j))
         (sum 0.0))
    (declare (short-float sum))
    (dotimes1 (i (ivref *nconnections* j))
      (incf-sf sum (*sf (fvref *values* (ivref c i))
                        (fvref w i))))
    (setf (fvref *values* j) (activation sum))
    nil))

(defun full-forward-pass (input)
  "Set up the inputs from the INPUT vector, then propagate activation values
  forward through all hidden units and output units."
  (set-up-inputs input)
  ;; For each hidden unit J, compute the activation value.
  (do ((j (1+ *ninputs*) (1+ j)))
      ((= j *nunits*))
    (declare (fixnum j))
    (compute-unit-value j))
  ;; Now compute outputs.
  (output-forward-pass))

;;; Note: We fill the *ERRORS* vector and related statistics with either
;;; the raw error or the error after modification by output-prime,
;;; depending on the *RAW-ERROR* switch.  This controls what form of error
;;; the candidate units try to correlate with.  All experiments reported in
;;; TR CMU-CS-90-100 assume *RAW-ERROR* is NIL, but this might not always
;;; be the best choice.

(defun compute-errors (goal output-slopes-p stats-p)
  "GOAL is a vector of desired values for the output units.  Compute and
  record the output errors for the current training case.  If
  OUTPUT-SLOPES-P is T, then use errors to compute slopes for output
  weights.  If STATS-P is T, accumulate error statistics."
  (declare (simple-vector goal))
  (dotimes1 (j *noutputs*)
    (let* ((out (fvref *outputs* j))
           (dif (-sf out (svref goal j)))
           (err-prime (*sf dif (output-prime out)))
           (os (svref *output-slopes* j)))
      (declare (short-float dif err-prime))
      (when stats-p
        (unless (< (abs dif) *score-threshold*)
          (incf *error-bits*))
        (incf-sf *true-error* (*sf dif dif)))
      (cond (*raw-error*
             (setf (fvref *errors* j) dif)
             (incf-sf *sum-error* dif)
             (incf-sf *sum-sq-error* (*sf dif dif)))
            (t
             (setf (fvref *errors* j) err-prime)
             (incf-sf *sum-error* err-prime)
             (incf-sf *sum-sq-error* (*sf err-prime err-prime))))
      (when output-slopes-p
        (dotimes1 (i *nunits*)
          (incf-sf (fvref os i) (*sf err-prime (fvref *values* i))))))))

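;;; In symbols (editorial note): for each output j, dif = out_j - goal_j
;;; and err-prime = dif * output-prime(out_j); the slope accumulated for
;;; the weight from unit i to output j is err-prime * value_i, which is
;;; proportional to the partial derivative of the summed squared error
;;; with respect to that weight.
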
;;; Note: Scaling *OUTPUT-EPSILON* by the number of cases seems to keep the
;;; quickprop update in a good range across many different-sized training
;;; sets, but it's something of a hack.  Choosing good epsilon values
;;; still requires some trial and error.

(defun update-output-weights ()
  "Update the output weights, using the pre-computed slopes, prev-slopes,
  and delta values.  Uses the quickprop update function."
  (let ((eps (/ *output-epsilon* *ncases*)))
    (dotimes1 (j *noutputs*)
      (let ((ow (svref *output-weights* j))
            (od (svref *output-deltas* j))
            (os (svref *output-slopes* j))
            (op (svref *output-prev-slopes* j)))
        (dotimes1 (i *nunits*)
          (quickprop-update i ow od os op eps *output-decay*
                            *output-mu* *output-shrink-factor*))))))


;;;; Outer loops for training output weights.

(defun train-outputs-epoch ()
  "Perform forward propagation once for each case in the training vectors,
  computing errors and slopes.  Then update the output weights."
  ;; Zero error accumulators.
  (setq *error-bits* 0)
  (setq *true-error* 0.0)
  (setq *sum-error* 0.0)
  (setq *sum-sq-error* 0.0)
  ;; User may have changed mu between epochs, so fix shrink-factor.
  (setq *output-shrink-factor*
        (/sf *output-mu* (+sf 1.0 *output-mu*)))
  ;; Now run through the training examples.
  (do ((i *first-case* (1+ i)))
      ((= i (the fixnum (+ *first-case* *ncases*))))
    (declare (fixnum i))
    (setq *goal* (svref *training-outputs* i))
    (cond (*use-cache*
           (setq *values* (svref *values-cache* i))
           (setq *errors* (svref *errors-cache* i))
           (output-forward-pass))
          (t (setq *values* *extra-values*)
             (setq *errors* *extra-errors*)
             (full-forward-pass (svref *training-inputs* i))))
    (compute-errors *goal* t t))
  ;; Do not change weights or count epoch if this run was perfect.
  (unless (= 0 *error-bits*)
    (update-output-weights)
    (incf *epoch*)))

(defun record-output-weights ()
  "Store the output weights developed after each output-training phase
  in the *output-weights-record* vector."
  (let ((record (make-array *noutputs* :initial-element nil)))
    (dotimes1 (o *noutputs*)
      (let ((original (svref *output-weights* o))
            (copy (make-array *nunits* :element-type 'short-float
                              :initial-element 0.0)))
        (dotimes1 (u *nunits*)
          (setf (fvref copy u) (fvref original u)))
        (setf (svref record o) copy)))
    (setf (svref *output-weights-record* (1- *nunits*)) record)))

(defun train-outputs (max-epochs)
  "Train the output weights.  If we exhaust MAX-EPOCHS, stop with value
  :TIMEOUT.  If there are zero error bits, stop with value :WIN.  Else,
  keep going until the true error has not changed by a significant amount
  for *OUTPUT-PATIENCE* epochs.  Then return :STAGNANT.  If
  *OUTPUT-PATIENCE* is zero, we do not stop until victory or until
  MAX-EPOCHS is used up."
  (declare (fixnum max-epochs))
  (let ((last-error 0.0)
        (quit-epoch (+ *epoch* *output-patience*))
        (first-time t))
    (declare (fixnum quit-epoch)
             (short-float last-error))
    (dotimes1 (i max-epochs (progn
                              (record-output-weights)
                              :timeout))
      ;; Maybe run a test epoch to see how we're doing.
      (when (and *test*
                 (not (= 0 *test-interval*))
                 (= 0 (mod i *test-interval*)))
        (test-epoch))
      (train-outputs-epoch)
      (cond ((zerop *error-bits*)
             (record-output-weights)
             (return :win))
            ((zerop *output-patience*))
            (first-time
             (setq first-time nil)
             (setq last-error *true-error*))
            ((> (abs (- *true-error* last-error))
                (* last-error *output-change-threshold*))
             (setq last-error *true-error*)
             (setq quit-epoch (+ *epoch* *output-patience*)))
            ((>= *epoch* quit-epoch)
             (record-output-weights)
             (return :stagnant))))))


;;;; Machinery for Training, Selecting, and Installing Candidate Units.

(defun init-candidates ()
  "Give new random weights to all of the candidate units.  Zero the other
  candidate-unit statistics."
  (dotimes1 (i *ncandidates*)
    (setf (fvref *cand-sum-values* i) 0.0)
    (let ((cw (svref *cand-weights* i))
          (cd (svref *cand-deltas* i))
          (cs (svref *cand-slopes* i))
          (cp (svref *cand-prev-slopes* i))
          (cc (svref *cand-cor* i))
          (cpc (svref *cand-prev-cor* i)))
      (dotimes1 (j *nunits*)
        (setf (fvref cw j) (random-weight))
        (setf (fvref cd j) 0.0)
        (setf (fvref cs j) 0.0)
        (setf (fvref cp j) 0.0))
      (dotimes1 (o *noutputs*)
        (setf (fvref cc o) 0.0)
        (setf (fvref cpc o) 0.0)))))

(defun install-new-unit ()
  "Add the candidate unit with the best correlation score to the active
  network.  Then reinitialize the candidate pool."
  (when (>= *nunits* *max-units*)
    (error "Cannot add any more units."))
  ;; For now, assume total connectivity.
  (setf (ivref *nconnections* *nunits*) *nunits*)
  (setf (svref *connections* *nunits*) *all-connections*)
  ;; Copy the weight vector for the new unit.
  (let ((w (make-array *nunits* :element-type 'short-float))
        (cw (svref *cand-weights* *best-candidate*)))
    (dotimes1 (i *nunits*)
      (setf (fvref w i) (fvref cw i)))
    (setf (svref *weights* *nunits*) w)
    ;; Tell user about the new unit.
    (format t " Add unit ~S: ~S~%"
            (+ 1 *nunits*) w))
  ;; Fix up output weights for candidate unit.
  ;; Use minus the correlation times the *weight-multiplier* as an
  ;; initial guess.  At least the sign should be right.
  (dotimes1 (o *noutputs*)
    (setf (fvref (svref *output-weights* o) *nunits*)
          (*sf (-sf (fvref (svref *cand-prev-cor* *best-candidate*) o))
               *weight-multiplier*)))
  ;; If using cache, run an epoch to compute this unit's values.
  (when *use-cache*
    (dotimes1 (i *max-cases*)
      (setq *values* (svref *values-cache* i))
      (compute-unit-value *nunits*)))
  ;; Reinitialize candidate units with random weights.
  (incf *nunits*)
  (init-candidates))

;;; Note: Ideally, after each adjustment of the candidate weights, we would
;;; run two epochs.  The first would just determine the correlations
;;; between the candidate unit outputs and the residual error.  Then, in a
;;; second pass, we would adjust each candidate's input weights so as to
;;; maximize the absolute value of the correlation.  We need to know the
;;; sign of the correlation for each candidate-output pair so that we know
;;; which direction to tune the input weights.

;;; Since this ideal method doubles the number of epochs required for
;;; training candidates, we cheat slightly and use the correlation values
;;; computed BEFORE the most recent weight update.  This combines the two
;;; epochs, saving us almost a factor of two.  To bootstrap the process, we
;;; begin with a single epoch that computes only the correlation.

;;; Since we look only at the sign of the correlation and since that sign
;;; should change very infrequently, this probably is OK.  But keep a
;;; lookout for pathological situations in which this might cause
;;; oscillation.


;;; This function is used only once at the start of each input-training
;;; phase to prime the pump.  After that, each call to compute-slopes also
;;; computes the error-value products for the next epoch.

(defun compute-correlations ()
  "For the current training pattern, compute the value of each candidate
  unit and begin to compute the correlation between that unit's value and
  the error at each output.  We have already done a forward-prop and
  computed the error values for active units."
  (dotimes1 (u *ncandidates*)
    (let ((sum 0.0)
          (v 0.0)
          (cw (svref *cand-weights* u))
          (cc (svref *cand-cor* u)))
      (declare (short-float sum v))
      ;; Determine activation value of each candidate unit.
      (dotimes1 (i *nunits*)
        (incf-sf sum (*sf (fvref cw i)
                          (fvref *values* i))))
      (setq v (activation sum))
      (incf-sf (fvref *cand-sum-values* u) v)
      ;; Accumulate value of each unit times error at each output.
      (dotimes1 (o *noutputs*)
        (incf-sf (fvref cc o) (*sf v (fvref *errors* o)))))))

;;; Note: When we were computing true correlations between candidates and
;;; outputs, this is where the normalization factors went in.  Currently we
;;; are just using covariances, as explained in the tech report.  So we
;;; make only two adjustments here.  First, we subtract out the product of
;;; the mean error and the mean candidate value to keep things from
;;; exploding when the error has a non-zero mean.  Second, we effectively
;;; scale the error values by the sum-squared error over all training
;;; cases.  This just keeps us from having to adjust *input-epsilon*
;;; repeatedly as the error is gradually reduced to a small fraction of its
;;; initial size.

(defun adjust-correlations ()
  "Normalize each accumulated correlation value, and stuff the normalized
  form into the *cand-prev-cor* data structure.  Then zero *cand-cor* to
  prepare for the next round.  Note the unit with the best total
  correlation score."
  (setq *best-candidate* 0)
  (setq *best-candidate-score* 0.0)
  (dotimes1 (u *ncandidates*)
    (let* ((cc (svref *cand-cor* u))
           (cpc (svref *cand-prev-cor* u))
           (offset (*sf (fvref *cand-sum-values* u) *avg-error*))
           (cor 0.0)
           (score 0.0))
      (declare (short-float offset cor score))
      (dotimes1 (o *noutputs*)
        (setq cor (/sf (-sf (fvref cc o) offset) *sum-sq-error*))
        (setf (fvref cpc o) cor)
        (setf (fvref cc o) 0.0)
        (incf-sf score (abs cor)))
      ;; Keep track of the candidate with the best overall correlation.
      (when (> score *best-candidate-score*)
        (setq *best-candidate-score* score)
        (setq *best-candidate* u)))))

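;;; In symbols (editorial note): for candidate u and output o, the stored
;;; value is S(u,o) = (sum_p V(u,p)*E(o,p) - (sum_p V(u,p)) * E-bar) /
;;; *sum-sq-error*, a covariance between the unit's value and the residual
;;; error scaled by the total sum-squared error, and the unit's score is
;;; sum_o |S(u,o)|.
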
;;; This is the key function in the candidate training process.

(defun compute-slopes ()
  "Given the correlation values for each candidate-output pair, compute
  the derivative of the candidate's score with respect to each incoming
  weight."
  (dotimes1 (u *ncandidates*)
    (let* ((sum 0.0)
           (value 0.0)
           (actprime 0.0)
           (direction 0.0)
           (cw (svref *cand-weights* u))
           (cs (svref *cand-slopes* u))
           (cc (svref *cand-cor* u))
           (cpc (svref *cand-prev-cor* u)))
      (declare (short-float sum value actprime direction))
      ;; Forward pass through each candidate unit to compute activation-prime.
      (dotimes1 (i *nunits*)
        (incf-sf sum (*sf (fvref cw i)
                          (fvref *values* i))))
      (setq value (activation sum))
      (setq actprime (activation-prime value sum))
      ;; Now compute which way we want to adjust each unit's incoming
      ;; activation.
      (dotimes1 (o *noutputs*)
        (let ((error (fvref *errors* o)))
          (decf-sf direction
                   (*sf (if (minusp (fvref cpc o)) -1.0 1.0)
                        (*sf actprime
                             (/sf (-sf error *avg-error*)
                                  *sum-sq-error*))))
          ;; Also accumulate the error-value products for use next epoch.
          (incf-sf (fvref cc o) (*sf error value))))
      ;; Given the direction we want to push the candidate, compute
      ;; which way we want to tweak each incoming weight.
      (dotimes1 (i *nunits*)
        (incf-sf (fvref cs i)
                 (*sf direction (fvref *values* i)))))))

;;; Note: Scaling *INPUT-EPSILON* by the number of cases and number of
;;; inputs to each unit seems to keep the quickprop update in a good range,
;;; as the network goes from small to large, and across many
;;; different-sized training sets.  Still, choosing a good epsilon value
;;; requires some trial and error.

(defun update-input-weights ()
  "Update the input weights, using the pre-computed slopes, prev-slopes,
  and delta values.  Uses the quickprop update function."
  (let ((eps (/ *input-epsilon* (* *ncases* *nunits*))))
    (dotimes1 (u *ncandidates*)
      (let ((cw (svref *cand-weights* u))
            (cd (svref *cand-deltas* u))
            (cs (svref *cand-slopes* u))
            (cp (svref *cand-prev-slopes* u)))
        (dotimes1 (i *nunits*)
          (quickprop-update i cw cd cs cp eps *input-decay*
                            *input-mu* *input-shrink-factor*))))))

;;; Outer loop for training the candidate unit(s).

(defun train-inputs-epoch ()
  "For each training pattern, perform a forward pass.  Tune the candidate
  units' weights to maximize the correlation score of each."
  (do ((i *first-case* (1+ i)))
      ((= i (the fixnum (+ *first-case* *ncases*))))
    (declare (fixnum i))
    (setq *goal* (svref *training-outputs* i))
    ;; Compute values and errors, or recall cached values.
    (cond (*use-cache*
           (setq *values* (svref *values-cache* i))
           (setq *errors* (svref *errors-cache* i)))
          (t (setq *values* *extra-values*)
             (setq *errors* *extra-errors*)
             (full-forward-pass (svref *training-inputs* i))
             (compute-errors *goal* nil nil)))
    ;; Compute the slopes we will use to adjust candidate weights.
    (compute-slopes))
  ;; User may have changed mu between epochs, so fix shrink-factor.
  (setq *input-shrink-factor* (/sf *input-mu*
                                   (+sf 1.0 *input-mu*)))
  ;; Now adjust the candidate unit input weights using quickprop.
  (update-input-weights)
  ;; Fix up the correlation values for the next epoch.
  (adjust-correlations)
  (incf *epoch*))

(defun correlations-epoch ()
  "Do an epoch through all active training patterns just to compute the
  initial correlations.  After this one pass, we will update the
  correlations as we train."
  (do ((i *first-case* (1+ i)))
      ((= i (the fixnum (+ *first-case* *ncases*))))
    (declare (fixnum i))
    (setq *goal* (svref *training-outputs* i))
    (cond (*use-cache*
           (setq *values* (svref *values-cache* i))
           (setq *errors* (svref *errors-cache* i)))
          (t (setq *values* *extra-values*)
             (setq *errors* *extra-errors*)
             (full-forward-pass (svref *training-inputs* i))
             (compute-errors *goal* nil nil)))
    (compute-correlations))
  (adjust-correlations)
  (incf *epoch*))

(defun train-inputs (max-epochs)
  "Train the input weights of all candidates.  If we exhaust MAX-EPOCHS,
  stop with value :TIMEOUT.  Else, keep going until the best candidate
  unit's score has changed by a significant amount, and then until it does
  not change significantly for *INPUT-PATIENCE* epochs.  Then return
  :STAGNANT.  If *INPUT-PATIENCE* is zero, we do not stop until victory or
  until MAX-EPOCHS is used up."
  (declare (fixnum max-epochs))
  (setq *avg-error* (/ *sum-error* (* *ncases* *noutputs*)))
  (correlations-epoch)
  (let ((last-score 0.0)
        (quit max-epochs)
        (first-time t))
    (declare (fixnum quit)
             (short-float last-score))
    (dotimes1 (i max-epochs :timeout)
      (train-inputs-epoch)
      (cond ((zerop *input-patience*))
            (first-time
             (setq first-time nil)
             (setq last-score *best-candidate-score*))
            ((> (abs (-sf *best-candidate-score* last-score))
                (* last-score *input-change-threshold*))
             (setq last-score *best-candidate-score*)
             (setq quit (+ i *input-patience*)))
            ((>= i quit)
             (return :stagnant))))))

;;;; Outer Loop.

(defun list-parameters ()
  "Print out the current training parameters in abbreviated form."
  (format t "SigOff ~,2F, WtRng ~,2F, WtMul ~,2F~%"
          *sigmoid-prime-offset* *weight-range* *weight-multiplier*)
  (format t "OMu ~,2F, OEps ~,2F, ODcy ~,4F, OPat ~D, OChange ~,3F~%"
          *output-mu* *output-epsilon* *output-decay* *output-patience*
          *output-change-threshold*)
  (format t "IMu ~,2F, IEps ~,2F, IDcy ~,4F, IPat ~D, IChange ~,3F~%"
          *input-mu* *input-epsilon* *input-decay* *input-patience*
          *input-change-threshold*)
  (format t "Utype ~S, Otype ~S, RawErr ~S, Pool ~D~%"
          *unit-type* *output-type* *raw-error* *ncandidates*))

(defun train (outlimit inlimit rounds &optional (restart nil))
  "Train the output weights until stagnation or victory is reached.  Then
  train the input weights to stagnation or victory.  Then install the best
  candidate unit and repeat.  OUTLIMIT and INLIMIT are upper limits on the
  number of cycles in each output and input phase.  ROUNDS is an upper
  limit on the number of unit-installation cycles.  If RESTART is non-nil,
  we are restarting training from the current point -- do not reinitialize
  the net."
  (declare (fixnum outlimit inlimit rounds))
  (unless restart (init-net))
  (list-parameters)
  (when *use-cache*
    (dotimes1 (i *max-cases*)
      (setq *values* (svref *values-cache* i))
      (set-up-inputs (svref *training-inputs* i))))
  (dotimes1 (r rounds :lose)
    (case (train-outputs outlimit)
      (:win
       (list-parameters)
       (format t "Victory at ~S epochs, ~S units, ~S hidden, Error ~S.~%"
               *epoch* *nunits* (- *nunits* *ninputs* 1) *true-error*)
       (return nil))
      (:timeout
       (format t "Epoch ~D: Out Timeout ~D bits wrong, error ~S.~2%"
               *epoch* *error-bits* *true-error*))
      (:stagnant
       (format t "Epoch ~D: Out Stagnant ~D bits wrong, error ~S.~2%"
               *epoch* *error-bits* *true-error*)))
    (when *test* (test-epoch))
    (case (train-inputs inlimit)
      (:timeout
       (format t "Epoch ~D: In Timeout.  Cor: ~D~%"
               *epoch* *best-candidate-score*))
      (:stagnant
       (format t "Epoch ~D: In Stagnant.  Cor: ~D~%"
               *epoch* *best-candidate-score*)))
    (install-new-unit)))

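;;; Typical use (editorial note, mirroring the examples below): call
;;; (build-two-spirals) and then (train 100 100 25), which allows up to
;;; 100 epochs per output-training phase and per input-training phase,
;;; and at most 25 rounds of unit installation.
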
(defun test-epoch (&optional (*score-threshold* 0.49999))
  "Perform forward propagation once for each case in the training and
  testing vectors.  Report the performance.  Do not change any weights.
  Do not use the caches."
  (let ((*use-cache* nil)
        (*values* *extra-values*)
        (*errors* *extra-errors*)
        (*error-bits* 0)
        (*true-error* 0.0)
        (*sum-error* 0.0)
        (*sum-sq-error* 0.0))
    ;; Run all training patterns and count errors.
    (dotimes1 (i (length *training-inputs*))
      (setq *goal* (svref *training-outputs* i))
      (full-forward-pass (svref *training-inputs* i))
      (compute-errors *goal* nil t))
    (format t "Training: ~D of ~D wrong, error ~S."
            *error-bits* (length *training-inputs*) *true-error*)
    ;; Zero some accumulators again.
    (setq *error-bits* 0)
    (setq *true-error* 0.0)
    (setq *sum-error* 0.0)
    (setq *sum-sq-error* 0.0)
    ;; Now run all test patterns and report the results.
    (when *test-inputs*
      (dotimes1 (i (length *test-inputs*))
        (setq *goal* (svref *test-outputs* i))
        (full-forward-pass (svref *test-inputs* i))
        (compute-errors *goal* nil t)))
    (format t "  Test: ~D of ~D wrong, error ~S.~%"
            *error-bits* (length *test-inputs*) *true-error*)))

(defun test-setup (nunits weights output-weights)
  "Set up a network for testing, given stored weights and output weights."
  (init-net)
  (setq *weights* weights)
  (setq *output-weights* output-weights)
  (setq *nunits* nunits)
  (do ((i (1+ *ninputs*) (1+ i)))
      ((= i *nunits*))
    (declare (fixnum i))
    (setf (ivref *nconnections* i) i)
    (setf (svref *connections* i) *all-connections*)))


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; Example Applications ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;; Zig-Zag problem.  An easy one, useful for testing the code.

(defun build-zig-zag (n)
  "Build N pairs of 1-D zig-zag."
  (declare (fixnum n))
  (setq *ninputs* 1)
  (setq *noutputs* 1)
  (let ((ti (make-array (* 2 n)))
        (to (make-array (* 2 n))))
    (dotimes1 (i n)
      (setf (svref ti (* i 2))
            (vector (+ i 1.0)))
      (setf (svref to (* i 2))
            (vector (if (evenp i) 0.5 -0.5)))
      (setf (svref ti (1+ (* i 2)))
            (vector (- (+ i 1.0))))
      (setf (svref to (1+ (* i 2)))
            (vector (if (evenp i) -0.5 0.5))))
    (setq *training-inputs* ti)
    (setq *training-outputs* to)
    (setq *test-inputs* ti)
    (setq *test-outputs* to))
  (build-net 1 1)
  (init-net))

;;; Call this with something like (BUILD-ZIG-ZAG 4), then call
;;; something like (TRAIN 100 100 25).


;;; Two-spirals problem.

(defun build-two-spirals (&optional (n 97))
  "Build N point-pairs of the two-spiral problem, with the standard default
  of 97 pairs."
  (declare (fixnum n))
  (setq *ninputs* 2)
  (setq *noutputs* 1)
  (let ((ti (make-array (* 2 n)))
        (to (make-array (* 2 n))))
    (dotimes1 (i n)
      (let* ((angle (/ (* i (coerce pi 'short-float)) 16.0))
             (radius (/ (* 6.5 (- 104.0 i)) 104))
             (x (* radius (sin angle)))
             (y (* radius (cos angle))))
        (setf (svref ti (* i 2))
              (vector x y))
        (setf (svref to (* i 2))
              (vector 0.5))
        (setf (svref ti (1+ (* i 2)))
              (vector (- x) (- y)))
        (setf (svref to (1+ (* i 2)))
              (vector -0.5))))
    ;; Put the inner part of the spiral first on the list.
    (setq ti (nreverse ti))
    (setq to (nreverse to))
    (setq *training-inputs* ti)
    (setq *training-outputs* to)
    (setq *test-inputs* ti)
    (setq *test-outputs* to))
  (build-net 2 1)
  (init-net))

;;; To run this, call (BUILD-TWO-SPIRALS), set various control parameters,
;;; and then call something like (TRAIN 100 100 25).

;;; For parameters, try these:
;;; SigOff 0.10, WtRng 1.00, WtMul 1.00
;;; OMu 2.00, OEps 1.00, ODcy 0.0001, OPat 12, OChange 0.010
;;; IMu 2.00, IEps 100.00, IDcy 0.00000, IPat 8, IChange 0.030
;;; Utype :SIGMOID, Otype :SIGMOID, RawErr NIL, Pool 8

(defvar *save-random-state*
  #+cmu
  #S(RANDOM-STATE
     J 6
     K 37
     SEED #(495959915 319289337 100972028 135321524 137984323
            177928266 385820814 500959740 328846885 254554634
            354844676 133704123 362896421 217869951 380210131
            323670005 366246053 40575617 346460653 10417936
            34276234 300730891 211595838 199986777 291429322
            1196272 425488031 328015953 24567252 297307474
            82341400 29130711 247126684 98716216 478723257
            47355455 81983578 248879315 97829219 533448623
            148633156 77868250 28344376 162872116 404460195
            321766796 8557425 441861346 455213668 302826847
            256874625 271153816 11749650 277043774 234844262))
  #-cmu (make-random-state))

(defun time-two-spirals ()
  (setq *random-state* (make-random-state *save-random-state*))
  (build-two-spirals)
  (time (train 100 100 25)))

;;; The End.

