qd.lisp

;;;; -*- Mode: lisp -*-
;;;;
;;;; Copyright (c) 2007, 2008, 2011 Raymond Toy
;;;;
;;;; Permission is hereby granted, free of charge, to any person
;;;; obtaining a copy of this software and associated documentation
;;;; files (the "Software"), to deal in the Software without
;;;; restriction, including without limitation the rights to use,
;;;; copy, modify, merge, publish, distribute, sublicense, and/or sell
;;;; copies of the Software, and to permit persons to whom the
;;;; Software is furnished to do so, subject to the following
;;;; conditions:
;;;;
;;;; The above copyright notice and this permission notice shall be
;;;; included in all copies or substantial portions of the Software.
;;;;
;;;; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
;;;; EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
;;;; OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
;;;; NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
;;;; HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
;;;; WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
;;;; FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
;;;; OTHER DEALINGS IN THE SOFTWARE.

;;; This file contains the core routines for basic arithmetic
;;; operations on a %quad-double.  This includes addition,
;;; subtraction, multiplication, division, negation. and absolute
;;; value.  Some additional helper functions are included such as
;;; raising to an integer power. and the n'th root of a (non-negative)
;;; %quad-double.  The basic comparison operators are included, and
;;; some simple tests for zerop, onep, plusp, and minusp. 
;;;
;;; The basic algorithms are based on Yozo Hida's double-double
;;; implementation.  However, some were copied from CMUCL and modified
;;; to support quad-doubles.


(in-package #:octi)

#+cmu
(eval-when (:compile-toplevel)
  (setf ext:*inline-expansion-limit* 1600))

;; All of the following functions should be inline.
;;(declaim (inline three-sum2))

;; Internal routines for implementing quad-double.

#+cmu
(defmacro two-sum (s e x y)
  `(multiple-value-setq (,s ,e)
    (c::two-sum ,x ,y)))


(defmacro three-sum (s0 s1 s2 a b c)
  (let ((aa (gensym))
	(bb (gensym))
	(cc (gensym))
	(t1 (gensym))
	(t2 (gensym))
	(t3 (gensym)))
    `(let* ((,aa ,a)
	    (,bb ,b)
	    (,cc ,c)
	    (,t1 0d0)
	    (,t2 ,t1)
	    (,t3 ,t1))
      (declare (double-float ,t1 ,t2 ,t3))
      (two-sum ,t1 ,t2 ,aa ,bb)
      (two-sum ,s0 ,t3 ,cc ,t1)
      (two-sum ,s1 ,s2 ,t2 ,t3))))
      
	  
#+nil
(defun three-sum2 (a b c)
  (declare (double-float a b c)
	   (optimize (speed 3)))
  (let* ((t1 0d0)
	 (t2 t1)
	 (t3 t1))
    (two-sum t1 t2 a b)
    (two-sum a t3 c t1)
    (values a (cl:+ t2 t3))))

(defmacro three-sum2 (s0 s1 a b c)
  (let ((aa (gensym))
	(bb (gensym))
	(cc (gensym))
	(t1 (gensym))
	(t2 (gensym))
	(t3 (gensym)))
    `(let* ((,aa ,a)
	    (,bb ,b)
	    (,cc ,c)
	    (,t1 0d0)
	    (,t2 ,t1)
	    (,t3 ,t1))
       (declare (double-float ,t1 ,t2 ,t3))
       (two-sum ,t1 ,t2 ,aa ,bb)
       (two-sum ,s0 ,t3 ,cc ,t1)
       (setf ,s1 (+ ,t2 ,t3)))))

;; Not needed????
#+nil
(declaim (inline quick-three-accum))
#+nil
(defun quick-three-accum (a b c)
  (declare (double-float a b c)
	   (optimize (speed 3) (space 0)))
  (multiple-value-bind (s b)
      (two-sum b c)
    (multiple-value-bind (s a)
	(two-sum a s)
      (let ((za (/= a 0))
	    (zb (/= b 0)))
	(when (and za zb)
	  (return-from quick-three-accum (values (cl:+ s 0d0) (cl:+ a 0d0) (cl:+ b 0d0))))
	(when (not za)
	  (setf a s)
	  (setf s 0d0))
	(when (not zb)
	  (setf b a)
	  (setf a s))
	(values 0d0 a b)))))


;; These functions are quite short, so we inline them to minimize
;; consing.
(declaim (inline make-qd-d
		 add-d-qd
		 add-dd-qd
		 neg-qd
		 neg-qd-t
		 sub-qd
		 sub-qd-dd
		 sub-qd-d
		 sub-d-qd
		 #+cmu
		 make-qd-dd
		 abs-qd
		 qd->
		 qd-<
		 qd->=
		 qd-<=
		 zerop-qd
		 onep-qd
		 plusp-qd
		 minusp-qd
		 qd-=
		 scale-float-qd))

;; Should these functions be inline?  The QD C++ source has these as
;; inline functions, but these are fairly large functions.  However,
;; inlining makes quite a big difference in speed and consing.
#+cmu
(declaim (#+qd-inline inline
	 #-qd-inline ext:maybe-inline
	 renorm-4
	 renorm-5
	 add-qd-d
	 add-qd-d-t
	 add-qd-dd
	 add-qd
	 add-qd-t
	 mul-qd-d
	 mul-qd-d-t
	 mul-qd-dd
	 mul-qd
	 mul-qd-t
	 sqr-qd
	 sqr-qd-t
	 div-qd
	 div-qd-t
	 div-qd-d
	 div-qd-dd))

#+cmu
(defmacro quick-two-sum (s e x y)
  `(multiple-value-setq (,s ,e)
    (c::quick-two-sum ,x ,y)))

#-(or qd-inline (not cmu))
(declaim (ext:start-block renorm-4 renorm-5
			  make-qd-d
			  add-qd-d add-d-qd add-qd-dd
			  add-dd-qd
			  add-qd add-qd-t
			  add-qd-d-t
			  neg-qd
			  sub-qd sub-qd-dd sub-qd-d sub-d-qd
			  mul-qd-d mul-qd-dd
			  mul-qd
			  mul-qd-t
			  mul-qd-d-t
			  sqr-qd
			  sqr-qd-t
			  div-qd div-qd-d div-qd-dd
			  div-qd-t
			  make-qd-dd
			  ))

#+(or)
(defun quick-renorm (c0 c1 c2 c3 c4)
  (declare (double-float c0 c1 c2 c3 c4)
	   (optimize (speed 3)))
  (multiple-value-bind (s t3)
      (quick-two-sum c3 c4)
    (multiple-value-bind (s t2)
	(quick-two-sum c2 s)
      (multiple-value-bind (s t1)
	  (quick-two-sum c1 s)
	(multiple-value-bind (c0 t0)
	    (quick-two-sum c0 s)
	  (multiple-value-bind (s t2)
	      (quick-two-sum t2 t3)
	    (multiple-value-bind (s t1)
		(quick-two-sum t1 s)
	      (multiple-value-bind (c1 t0)
		  (quick-two-sum t0 s)
		(multiple-value-bind (s t1)
		    (quick-two-sum t1 t2)
		  (multiple-value-bind (c2 t0)
		      (quick-two-sum t0 s)
		    (values c0 c1 c2 (cl:+ t0 t1))))))))))))

(defun renorm-4 (c0 c1 c2 c3)
  (declare (double-float c0 c1 c2 c3)
	   (optimize (speed 3) (safety #-allegro 0 #+allegro 1) (debug 0)))
  (let ((s0 0d0)
	(s1 0d0)
	(s2 0d0)
	(s3 0d0))
    (declare (double-float s0 s1 s2 s3))
    (quick-two-sum s0 c3 c2 c3)
    (quick-two-sum s0 c2 c1 s0)
    (quick-two-sum c0 c1 c0 s0)
    (setf s0 c0)
    (setf s1 c1)
    (cond ((/= s1 0)
	   (quick-two-sum s1 s2 s1 c2)
	   (if (/= s2 0)
	       (quick-two-sum s2 s3 s2 c3)
	       (quick-two-sum s1 s2 s1 c3)))
	  (t
	   (quick-two-sum s0 s1 s0 c2)
	   (if (/= s1 0)
	       (quick-two-sum s1 s2 s1 c3)
	       (quick-two-sum s0 s1 s0 c3))))
    (values s0 s1 s2 s3)))
  
(defun renorm-5 (c0 c1 c2 c3 c4)
  (declare (double-float c0 c1 c2 c3 c4)
	   (optimize (speed 3) (safety #-allegro 0 #+allegro 1)))
  (let ((s0 0d0)
	(s1 0d0)
	(s2 0d0)
	(s3 0d0))
    (declare (double-float s0 s1 s2 s3))
    (quick-two-sum s0 c4 c3 c4)
    (quick-two-sum s0 c3 c2 s0)
    (quick-two-sum s0 c2 c1 s0)
    (quick-two-sum c0 c1 c0 s0)
    (quick-two-sum s0 s1 c0 c1)
    (cond ((/= s1 0)
	   (quick-two-sum s1 s2 s1 c2)
	   (cond ((/= s2 0)
		  (quick-two-sum s2 s3 s2 c3)
		  (if (/= s3 0)
		      (incf s3 c4)
		      (incf s2 c4)))
		 (t
		  (quick-two-sum s1 s2 s1 c3)
		  (if (/= s2 0)
		      (quick-two-sum s2 s3 s2 c4)
		      (quick-two-sum s1 s2 s1 c4)))))
	  (t
	   (quick-two-sum s0 s1 s0 c2)
	   (cond ((/= s1 0)
		  (quick-two-sum s1 s2 s1 c3)
		  (if (/= s2 0)
		      (quick-two-sum s2 s3 s2 c4)
		      (quick-two-sum s1 s2 s1 c4)))
		 (t
		  (quick-two-sum s0 s1 s0 c3)
		  (if (/= s1 0)
		      (quick-two-sum s1 s2 s1 c4)
		      (quick-two-sum s0 s1 s0 c4))))))
    (values s0 s1 s2 s3)))

(defun make-qd-d (a0 &optional (a1 0d0 a1-p) (a2 0d0) (a3 0d0))
  "Create a %QUAD-DOUBLE from four double-floats, appropriately
normalizing the result from the four double-floats.  A0 is the most
significant part of the %QUAD-DOUBLE, and A3 is the least.  The optional
parameters default to 0.
"
  (declare (double-float a0 a1 a2 a3)
	   (optimize (speed 3)
		     #+cmu
		     (ext:inhibit-warnings 3)))
  (if a1-p
      (multiple-value-bind (s0 s1 s2 s3)
	  (renorm-4 a0 a1 a2 a3)
	(%make-qd-d s0 s1 s2 s3))
      (%make-qd-d a0 0d0 0d0 0d0)))

;;;; Addition

;; Quad-double + double
(defun add-qd-d (a b &optional (target #+oct-array (%make-qd-d 0d0 0d0 0d0 0d0)))
  "Return the sum of the %QUAD-DOUBLE A and the DOUBLE-FLOAT B.
If TARGET is given, TARGET is destructively modified to contain the result."
  (add-qd-d-t a b target))
  
(defun add-qd-d-t (a b target)
  "Add a quad-double A and a double-float B"
  (declare (type %quad-double a #+oct-array target)
	   (double-float b)
	   (optimize (speed 3)
		     (space 0))
	   (inline float-infinity-p)
	   #+(and cmu (not oct-array)) (ignore target))
  (let* ((c0 0d0)
	 (e c0)
	 (c1 c0)
	 (c2 c0)
	 (c3 c0))
    (declare (double-float e c0 c1 c2 c3))
    (two-sum c0 e (qd-0 a) b)

    (when (float-infinity-p c0)
      (return-from add-qd-d-t (%store-qd-d target c0 0d0 0d0 0d0)))
    (two-sum c1 e (qd-1 a) e)
    (two-sum c2 e (qd-2 a) e)
    (two-sum c3 e (qd-3 a) e)
    (multiple-value-bind (r0 r1 r2 r3)
	(renorm-5 c0 c1 c2 c3 e)
      (if (and (zerop (qd-0 a)) (zerop b))
	  (%store-qd-d target c0 0d0 0d0 0d0)
	  (%store-qd-d target r0 r1 r2 r3)))))

(defun add-d-qd (a b &optional (target #+oct-array (%make-qd-d 0d0 0d0 0d0 0d0)))
  "Return the sum of the DOUBLE-FLOAT A and the %QUAD-DOUBLE B.
If TARGET is given, TARGET is destructively modified to contain the result."
  (declare (double-float a)
	   (type %quad-double b)
	   (optimize (speed 3))
	   #+(and cmu (not oct-array)) (ignore target))
  (add-qd-d b a #+oct-array target))

#+cmu
(defun add-qd-dd (a b)
  "Add a quad-double A and a double-double B"
  (declare (type %quad-double a)
	   (double-double-float b)
	   (optimize (speed 3)
		     (space 0)))
  (let* ((s0 0d0)
	 (t0 s0)
	 (s1 s0)
	 (t1 s0)
	 (s2 s0)
	 (s3 s0))
    (declare (double-float s0 s1 s3 t0 t1 s2))
    (two-sum s0 t1 (qd-0 a) (kernel:double-double-hi b))
    (two-sum s1 t1 (qd-1 a) (kernel:double-double-lo b))
    (two-sum s1 t0 s1 t0)
    (three-sum s2 t0 t1 (qd-2 a) t0 t1)
    (two-sum s3 t0 t0 (qd-3 a))
    (incf t0 t1)
    (multiple-value-bind (a0 a1 a2 a3)
	(renorm-5 s0 s1 s2 s3 t0)
      (declare (double-float a0 a1 a2 a3))
      (%make-qd-d a0 a1 a2 a3))))

#+cmu
(defun add-dd-qd (a b)
  (declare (double-double-float a)
	   (type %quad-double b)
	   (optimize (speed 3)
		     (space 0)))
  (add-qd-dd b a))


#+(or)
(defun add-qd-1 (a b)
  (declare (type %quad-double a b)
	   (optimize (speed 3)))
  (multiple-value-bind (s0 t0)
      (two-sum (qd-0 a) (qd-0 b))
    (multiple-value-bind (s1 t1)
	(two-sum (qd-1 a) (qd-1 b))
      (multiple-value-bind (s2 t2)
	  (two-sum (qd-2 a) (qd-2 b))
	(multiple-value-bind (s3 t3)
	    (two-sum (qd-3 a) (qd-3 b))
	  (multiple-value-bind (s1 t0)
	      (two-sum s1 t0)
	    (multiple-value-bind (s2 t0 t1)
		(three-sum s2 t0 t1)
	      (multiple-value-bind (s3 t0)
		  (three-sum2 s3 t0 t2)
		(let ((t0 (cl:+ t0 t1 t3)))
		  (multiple-value-call #'%make-qd-d
		    (renorm-5 s0 s1 s2 s3 t0)))))))))))

;; Same as above, except that everything is expanded out for compilers
;; which don't do a very good job with dataflow.  CMUCL is one of
;; those compilers.


(defun add-qd-t (a b target)
  (declare (type %quad-double a b #+oct-array target)
	   (optimize (speed 3)
		     (space 0))
	   #+(and cmu (not oct-array))
	   (ignore target))
  ;; This is the version that is NOT IEEE.  Should we use the IEEE
  ;; version?  It's quite a bit more complicated.
  ;;
  ;; In addition, this is reorganized to minimize data dependency.
  (with-qd-parts (a0 a1 a2 a3)
      a
    (declare (double-float a0 a1 a2 a3))
    (with-qd-parts (b0 b1 b2 b3)
	b
      (declare (double-float b0 b1 b2 b3))
      (let ((s0 (cl:+ a0 b0))
	    (s1 (cl:+ a1 b1))
	    (s2 (cl:+ a2 b2))
	    (s3 (cl:+ a3 b3)))
	(declare (double-float s0 s1 s2 s3)
		 (inline float-infinity-p))

	(when (float-infinity-p s0)
	  (return-from add-qd-t (%store-qd-d target s0 0d0 0d0 0d0)))
	(let ((v0 (cl:- s0 a0))
	      (v1 (cl:- s1 a1))
	      (v2 (cl:- s2 a2))
	      (v3 (cl:- s3 a3)))
	  (declare (double-float v0 v1 v2 v3))
	  (let ((u0 (cl:- s0 v0))
		(u1 (cl:- s1 v1))
		(u2 (cl:- s2 v2))
		(u3 (cl:- s3 v3)))
	    (declare (double-float u0 u1 u2 u3))
	    (let ((w0 (cl:- a0 u0))
		  (w1 (cl:- a1 u1))
		  (w2 (cl:- a2 u2))
		  (w3 (cl:- a3 u3)))
	      (declare (double-float w0 w1 w2 w3))
	      (let ((u0 (cl:- b0 v0))
		    (u1 (cl:- b1 v1))
		    (u2 (cl:- b2 v2))
		    (u3 (cl:- b3 v3)))
		(declare (double-float u0 u1 u2 u3))
		(let ((t0 (cl:+ w0 u0))
		      (t1 (cl:+ w1 u1))
		      (t2 (cl:+ w2 u2))
		      (t3 (cl:+ w3 u3)))
		  (declare (double-float t0 t1 t2 t3))
		  (two-sum s1 t0 s1 t0)
		  (three-sum s2 t0 t1 s2 t0 t1)
		  (three-sum2 s3 t0 s3 t0 t2)
		  (setf t0 (cl:+ t0 t1 t3))
		  ;; Renormalize
		  (multiple-value-setq (s0 s1 s2 s3)
		    (renorm-5 s0 s1 s2 s3 t0))
		  (if (and (zerop a0) (zerop b0))
		      (%store-qd-d target (+ a0 b0) 0d0 0d0 0d0)
		      (%store-qd-d target s0 s1 s2 s3)))))))))))

(defun add-qd (a b &optional (target #+oct-array (%make-qd-d 0d0 0d0 0d0 0d0)))
  "Return the sum of the %QUAD-DOUBLE numbers A and B.
If TARGET is given, TARGET is destructively modified to contain the result."
  (add-qd-t a b target))


(defun neg-qd-t (a target)
  (declare (type %quad-double a #+oct-array target)
	   #+(and cmu (not oct-array)) (ignore target))
  (with-qd-parts (a0 a1 a2 a3)
      a
    (declare (double-float a0 a1 a2 a3))
    (%store-qd-d target (cl:- a0) (cl:- a1) (cl:- a2) (cl:- a3))))

(defun neg-qd (a &optional (target #+oct-array (%make-qd-d 0d0 0d0 0d0 0d0)))
  "Return the negative of the %QUAD-DOUBLE number A.
If TARGET is given, TARGET is destructively modified to contain the result."
  (neg-qd-t a target))


(defun sub-qd (a b &optional (target #+oct-array (%make-qd-d 0d0 0d0 0d0 0d0)))
  "Return the difference between the %QUAD-DOUBLE numbers A and B.
If TARGET is given, TARGET is destructively modified to contain the result."
  (declare (type %quad-double a b))
  (add-qd-t a (neg-qd b) target))

(defun sub-qd-t (a b target)
  (add-qd-t a (neg-qd b) target))

#+cmu
(defun sub-qd-dd (a b)
  (declare (type %quad-double a)
	   (type double-double-float b))
  (add-qd-dd a (cl:- b)))

(defun sub-qd-d (a b &optional (target #+oct-array (%make-qd-d 0d0 0d0 0d0 0d0)))
  "Return the difference between the %QUAD-DOUBLE number A and
the DOUBLE-FLOAT number B.

If TARGET is given, TARGET is destructively modified to contain the result."
  (declare (type %quad-double a)
	   (type double-float b)
	   #+(and cmu (not oct-array)) (ignore target))
  (add-qd-d a (cl:- b) #+oct-array target))

(defun sub-d-qd (a b &optional (target #+oct-array (%make-qd-d 0d0 0d0 0d0 0d0)))
  "Return the difference between the DOUBLE-FLOAT number A and
the %QUAD-DOUBLE number B.

If TARGET is given, TARGET is destructively modified to contain the result."
  (declare (type double-float a)
	   (type %quad-double b)
	   #+(and cmu (not oct-array)) (ignore target))
  ;; a - b = a + (-b)
  (add-d-qd a (neg-qd b) #+oct-array target))
  

;; Works
;; (mul-qd-d (sqrt-qd (make-qd-dd 2w0 0w0)) 10d0) ->
;; 14.1421356237309504880168872420969807856967187537694807317667973799q0
;;
;; Clisp says
;; 14.142135623730950488016887242096980785696718753769480731766797379908L0
;;
(defun mul-qd-d (a b &optional (target #+oct-array (%make-qd-d 0d0 0d0 0d0 0d0)))
  "Return the product of the %QUAD-DOUBLE number A and
the DOUBLE-FLOAT number B.

If TARGET is given, TARGET is destructively modified to contain the result."
  (mul-qd-d-t a b target))

(defun mul-qd-d-t (a b target)
  "Multiply quad-double A with B"
  (declare (type %quad-double a #+oct-array target)
	   (double-float b)
	   (optimize (speed 3)
		     (space 0))
	   (inline float-infinity-p)
	   #+(and cmu (not oct-array)) (ignore target))
  (multiple-value-bind (p0 q0)
      (two-prod (qd-0 a) b)

    (when (float-infinity-p p0)
      (return-from mul-qd-d-t (%store-qd-d target p0 0d0 0d0 0d0)))
    (multiple-value-bind (p1 q1)
	(two-prod (qd-1 a) b)
      (declare (double-float p1 q1))
      (multiple-value-bind (p2 q2)
	  (two-prod (qd-2 a) b)
	(declare (double-float p2 q2))
	(let* ((p3 (cl:* (qd-3 a) b))
	       (s0 p0)
	       (s1 p0)
	       (s2 p0))
	  (declare (double-float s0 s1 s2 p3))
	  (two-sum s1 s2 q0 p1)
	  (three-sum s2 q1 p2 s2 q1 p2)
	  (three-sum2 q1 q2 q1 q2 p3)
	  (let ((s3 q1)
		(s4 (cl:+ q2 p2)))
	    (multiple-value-bind (s0 s1 s2 s3)
		(renorm-5 s0 s1 s2 s3 s4)
	      (if (zerop s0)
		  (%store-qd-d target (float-sign p0 0d0) 0d0 0d0 0d0)
		  (%store-qd-d target s0 s1 s2 s3)))))))))

;; a0 * b0                        0
;;      a0 * b1                   1
;;      a1 * b0                   2
;;           a1 * b1              3
;;           a2 * b0              4
;;                a2 * b1         5
;;                a3 * b0         6
;;                     a3 * b1    7

;; Not working.
;; (mul-qd-dd (sqrt-qd (make-qd-dd 2w0 0w0)) 10w0) ->
;; 14.142135623730950488016887242097022172449805747901877456053837224q0
;;
;; But clisp says
;; 14.142135623730950488016887242096980785696718753769480731766797379908L0
;;                                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
;;
;; Running a test program using qd (2.1.210) shows that we get the
;; same wrong answer.
#+(or)
(defun mul-qd-dd (a b)
  (declare (type %quad-double a)
	   (double-double-float b)
	   (optimize (speed 3)))
  (multiple-value-bind (p0 q0)
      (two-prod (qd-0 a) (kernel:double-double-hi b))
    (multiple-value-bind (p1 q1)
	(two-prod (qd-0 a) (kernel:double-double-lo b))
      (multiple-value-bind (p2 q2)
	  (two-prod (qd-1 a) (kernel:double-double-hi b))
	(multiple-value-bind (p3 q3)
	    (two-prod (qd-1 a) (kernel:double-double-lo b))
	  (multiple-value-bind (p4 q4)
	      (two-prod (qd-2 a) (kernel:double-double-hi b))
	    (format t "p0, q0 = ~A ~A~%" p0 q0)
	    (format t "p1, q1 = ~A ~A~%" p1 q1)
	    (format t "p2, q2 = ~A ~A~%" p2 q2)
	    (format t "p3, q3 = ~A ~A~%" p3 q3)
	    (format t "p4, q4 = ~A ~A~%" p4 q4)
	    (multiple-value-setq (p1 p2 q0)
	      (three-sum p1 p2 q0))
	    (format t "p1 = ~A~%" p1)
	    (format t "p2 = ~A~%" p2)
	    (format t "q0 = ~A~%" q0)
	    ;; five-three-sum
	    (multiple-value-setq (p2 p3 p4)
	      (three-sum p2 p3 p4))
	    (format t "p2 = ~A~%" p2)
	    (format t "p3 = ~A~%" p3)
	    (format t "p4 = ~A~%" p4)
	    (multiple-value-setq (q1 q2)
	      (two-sum q1 q2))
	    (multiple-value-bind (s0 t0)
		(two-sum p2 q1)
	      (multiple-value-bind (s1 t1)
		  (two-sum p3 q2)
		(multiple-value-setq (s1 t0)
		  (two-sum s1 t0))
		(let ((s2 (cl:+ t0 t1 p4))
		      (p2 s0)
		      (p3 (cl:+ (cl:* (qd-2 a)
				(kernel:double-double-hi b))
			     (cl:* (qd-3 a)
				(kernel:double-double-lo b))
			     q3 q4)))
		  (multiple-value-setq (p3 q0)
		    (three-sum2 p3 q0 s1))
		  (let ((p4 (cl:+ q0 s2)))
		    (multiple-value-call #'%make-qd-d
		      (renorm-5 p0 p1 p2 p3 p4))))))))))))

;; a0 * b0                    0
;;      a0 * b1               1
;;      a1 * b0               2
;;           a0 * b2          3
;;           a1 * b1          4
;;           a2 * b0          5
;;                a0 * b3     6
;;                a1 * b2     7
;;                a2 * b1     8
;;                a3 * b0     9 

;; Works
;; (mul-qd (sqrt-qd (make-qd-dd 2w0 0w0)) (make-qd-dd 10w0 0w0)) ->
;; 14.1421356237309504880168872420969807856967187537694807317667973799q0
;;
;; Clisp says
;; 14.142135623730950488016887242096980785696718753769480731766797379908L0

(defun mul-qd-t (a b target)
  (declare (type %quad-double a b #+oct-array target)
	   (optimize (speed 3)
		     (space 0))
	   (inline float-infinity-p)
	   #+(and cmu (not oct-array))
	   (ignore target))
  (with-qd-parts (a0 a1 a2 a3)
      a
    (declare (double-float a0 a1 a2 a3))
    (with-qd-parts (b0 b1 b2 b3)
	b
      (declare (double-float b0 b1 b2 b3))
      (multiple-value-bind (p0 q0)
	  (two-prod a0 b0)
	#+cmu
	(when (float-infinity-p p0)
	  (return-from mul-qd-t (%store-qd-d target p0 0d0 0d0 0d0)))
	(multiple-value-bind (p1 q1)
	    (two-prod a0 b1)
	  (multiple-value-bind (p2 q2)
	      (two-prod a1 b0)
	    (multiple-value-bind (p3 q3)
		(two-prod a0 b2)
	      (multiple-value-bind (p4 q4)
		  (two-prod a1 b1)
		(multiple-value-bind (p5 q5)
		    (two-prod a2 b0)
		  ;; Start accumulation
		  (three-sum p1 p2 q0 p1 p2 q0)

		  ;; six-three-sum of p2, q1, q2, p3, p4, p5
		  (three-sum p2 q1 q2 p2 q1 q2)
		  (three-sum p3 p4 p5 p3 p4 p5)
		  ;; Compute (s0,s1,s2) = (p2,q1,q2) + (p3,p4,p5)
		  (let* ((s0 0d0)
			 (s1 s0)
			 (t0 s0)
			 (t1 s0))
		    (declare (double-float s0 s1 t0 t1))
		    (two-sum s0 t0 p2 p3)
		    (two-sum s1 t1 q1 p4)
		    (let ((s2 (cl:+ q2 p5)))
		      (declare (double-float s2))
		      (two-sum s1 t0 s1 t0)
		      (incf s2 (cl:+ t0 t1))
		      ;; O(eps^3) order terms.  This is the sloppy
		      ;; multiplication version.  Should we use
		      ;; the precise version?  It's significantly
		      ;; more complex.
			  
		      (incf s1 (cl:+ (cl:* a0 b3)
				     (cl:* a1 b2)
				     (cl:* a2 b1)
				     (cl:* a3 b0)
				     q0 q3 q4 q5))
		      #+nil
		      (format t "p0,p1,s0,s1,s2 = ~a ~a ~a ~a ~a~%"
			      p0 p1 s0 s1 s2)
		      (multiple-value-bind (r0 r1 s0 s1)
			  (renorm-5 p0 p1 s0 s1 s2)
			(if (zerop r0)
			    (%store-qd-d target p0 0d0 0d0 0d0)
			    (%store-qd-d target r0 r1 s0 s1))))))))))))))

(defun mul-qd (a b &optional (target #+oct-array (%make-qd-d 0d0 0d0 0d0 0d0)))
  "Returns the product of the %QUAD-DOUBLE numbers A and B.
If TARGET is given, TARGET is destructively modified to contain the result."
  (mul-qd-t a b target))

;; This is the non-sloppy version.  I think this works just fine, but
;; since qd defaults to the sloppy multiplication version, we do the
;; same.
#+nil
(defun mul-qd (a b)
  (declare (type %quad-double a b)
	   (optimize (speed 3)))
  (multiple-value-bind (a0 a1 a2 a3)
      (qd-parts a)
    (multiple-value-bind (b0 b1 b2 b3)
	(qd-parts b)
      (multiple-value-bind (p0 q0)
	  (two-prod a0 b0)
	(multiple-value-bind (p1 q1)
	    (two-prod a0 b1)
	  (multiple-value-bind (p2 q2)
	      (two-prod a1 b0)
	    (multiple-value-bind (p3 q3)
		(two-prod a0 b2)
	      (multiple-value-bind (p4 q4)
		  (two-prod a1 b1)
		(declare (double-float q4))
		#+nil
		(progn
		  (format t"p0, q0 = ~a ~a~%" p0 q0)
		  (format t"p1, q1 = ~a ~a~%" p1 q1)
		  (format t"p2, q2 = ~a ~a~%" p2 q2)
		  (format t"p3, q3 = ~a ~a~%" p3 q3)
		  (format t"p4, q4 = ~a ~a~%" p4 q4))
		(multiple-value-bind (p5 q5)
		    (two-prod a2 b0)
		  #+nil
		  (format t"p5, q5 = ~a ~a~%" p5 q5)
		  ;; Start accumulation
		  (multiple-value-setq (p1 p2 q0)
		    (three-sum p1 p2 q0))
		  #+nil
		  (format t "p1 p2 q0 = ~a ~a ~a~%" p1 p2 q0)
		  ;; six-three-sum of p2, q1, q2, p3, p4, p5
		  (multiple-value-setq (p2 q1 q2)
		    (three-sum p2 q1 q2))
		  (multiple-value-setq (p3 p4 p5)
		    (three-sum p3 p4 p5))
		  ;; Compute (s0,s1,s2) = (p2,q1,q2) + (p3,p4,p5)
		  (multiple-value-bind (s0 t0)
		      (two-sum p2 p3)
		    (multiple-value-bind (s1 t1)
			(two-sum q1 p4)
		      (declare (double-float t1))
		      (let ((s2 (cl:+ q2 p5)))
			(declare (double-float s2))
			(multiple-value-bind (s1 t0)
			    (two-sum s1 t0)
			  (declare (double-float s1))
			  (incf s2 (cl:+ t0 t1))
			  (multiple-value-bind (p6 q6)
			      (two-prod a0 b3)
			    (multiple-value-bind (p7 q7)
				(two-prod a1 b2)
			      (multiple-value-bind (p8 q8)
				  (two-prod a2 b1)
				(multiple-value-bind (p9 q9)
				    (two-prod a3 b0)
				  (multiple-value-setq (q0 q3)
				    (two-sum q0 q3))
				  (multiple-value-setq (q4 q5)
				    (two-sum q4 q5))
				  (multiple-value-setq (p6 p7)
				    (two-sum p6 p7))
				  (multiple-value-setq (p8 p9)
				    (two-sum p8 p9))
				  ;; (t0,t1) = (q0,q3)+(q4,q5)
				  (multiple-value-setq (t0 t1)
				    (two-sum q0 q4))
				  (setf t1 (cl:+ q3 q5))
				  ;; (r0,r1) = (p6,p7)+(p8,p9)
				  (multiple-value-bind (r0 r1)
				      (two-sum p6 p8)
				    (declare (double-float r1))
				    (incf r1 (cl:+ p7 p9))
				    ;; (q3,q4) = (t0,t1)+(r0,r1)
				    (multiple-value-setq (q3 q4)
				      (two-sum t0 r0))
				    (incf q4 (cl:+ t1 r1))
				    ;; (t0,t1)=(q3,q4)+s1
				    (multiple-value-setq (t0 t1)
				      (two-sum q3 s1))
				    (incf t1 q4)
				    ;; O(eps^4) terms
				    (incf t1
					  (cl:+ (cl:* a1 b3)
					     (cl:* a2 b2)
					     (cl:* a3 b1)
					     q6 q7 q8 q9
					     s2))
				    #+nil
				    (format t "p0,p1,s0,t0,t1 = ~a ~a ~a ~a ~a~%"
					    p0 p1 s0 t0 t1)
				    (multiple-value-call #'%make-qd-d
				      (renorm-5 p0 p1 s0 t0 t1))))))))))))))))))))

(defun sqr-qd-t (a target)
  "Square A"
  (declare (type %quad-double a #+oct-array target)
	   (optimize (speed 3)
		     (space 0))
	   #+(and cmu (not oct-array))
	   (ignore target))
  (multiple-value-bind (p0 q0)
      (two-sqr (qd-0 a))
    (multiple-value-bind (p1 q1)
	(two-prod (cl:* 2 (qd-0 a)) (qd-1 a))
      (multiple-value-bind (p2 q2)
	  (two-prod (cl:* 2 (qd-0 a)) (qd-2 a))
	(multiple-value-bind (p3 q3)
	    (two-sqr (qd-1 a))
	  (two-sum p1 q0 q0 p1)
	  (two-sum q0 q1 q0 q1)
	  (two-sum p2 p3 p2 p3)

	  (let* ((s0 0d0)
		 (t0 s0)
		 (s1 s0)
		 (t1 s0))
	    (declare (double-float s0 s1 t0 t1))
	    (two-sum s0 t0 q0 p2)
	    (two-sum s1 t1 q1 p3)
	    (two-sum s1 t0 s1 t0)
	    (incf t0 t1)
	    (quick-two-sum s1 t0 s1 t0)
	    (quick-two-sum p2 t1 s0 s1)
	    (quick-two-sum p3 q0 t1 t0)

	    (let ((p4 (cl:* 2 (qd-0 a) (qd-3 a)))
		  (p5 (cl:* 2 (qd-1 a) (qd-2 a))))
	      (declare (double-float p4 p5))
	      (two-sum p4 p5 p4 p5)
	      (two-sum q2 q3 q2 q3)
	      (two-sum t0 t1 p4 q2)

	      (incf t1 (cl:+ p5 q3))

	      (two-sum p3 p4 p3 t0)
	      (incf p4 (cl:+ q0 t1))

	      (multiple-value-bind (a0 a1 a2 a3)
		  (renorm-5 p0 p1 p2 p3 p4)
		(%store-qd-d target a0 a1 a2 a3)))))))))

(defun sqr-qd (a &optional (target #+oct-array (%make-qd-d 0d0 0d0 0d0 0d0)))
  "Return the square of the %QUAD-DOUBLE number A.  If TARGET is also given,
it is destructively modified with the result."
  (sqr-qd-t a target))

#+nil
(defun div-qd-t (a b target)
  (declare (type %quad-double a b)
	   (optimize (speed 3)
		     (space 0))
	   (inline float-infinity-p)
	   #+cmu
	   (ignore target))
  (let ((a0 (qd-0 a))
	(b0 (qd-0 b)))
    (let* ((q0 (cl:/ a0 b0))
	   (r (sub-qd a (mul-qd-d b q0)))
	   (q1 (cl:/ (qd-0 r) b0)))

      (when (float-infinity-p q0)
	(return-from div-qd-t (%store-qd-d target q0 0d0 0d0 0d0)))
      (setf r (sub-qd r (mul-qd-d b q1)))
      (let ((q2 (cl:/ (qd-0 r) b0)))
	(setf r (sub-qd r (mul-qd-d b q2)))
	(let ((q3 (cl:/ (qd-0 r) b0)))
	  (multiple-value-bind (q0 q1 q2 q3)
	      (renorm-4 q0 q1 q2 q3)
	    (%store-qd-d target q0 q1 q2 q3)))))))

(defun div-qd-t (a b target)
  (declare (type %quad-double a b #+oct-array target)
	   (optimize (speed 3)
		     (space 0))
	   (inline float-infinity-p)
	   #+(and cmu (not oct-array))
	   (ignore target))
  (let ((a0 (qd-0 a))
	(b0 (qd-0 b)))
    (let* ((q0 (cl:/ a0 b0))
	   (r (%make-qd-d 0d0 0d0 0d0 0d0)))
      (mul-qd-d b q0 r)
      (sub-qd a r r)
      (let* ((q1 (cl:/ (qd-0 r) b0))
	     (temp (mul-qd-d b q1)))
	(when (float-infinity-p q0)
	  (return-from div-qd-t (%store-qd-d target q0 0d0 0d0 0d0)))
	
	(sub-qd r temp r)
	(let ((q2 (cl:/ (qd-0 r) b0)))
	  (mul-qd-d b q2 temp)
	  (sub-qd r temp r)
	  (let ((q3 (cl:/ (qd-0 r) b0)))
	    (multiple-value-bind (q0 q1 q2 q3)
		(renorm-4 q0 q1 q2 q3)
	      (%store-qd-d target q0 q1 q2 q3))))))))

(defun div-qd (a b &optional (target #+oct-array (%make-qd-d 0d0 0d0 0d0 0d0)))
  "Return the quotient of the two %QUAD-DOUBLE numbers A and B.
If TARGET is given, it destrutively modified with the result."
  (div-qd-t a b target))

(declaim (inline invert-qd))

(defun invert-qd (v)
  ;; a quartic newton iteration for 1/v
  ;; to invert v, start with a good guess, x.
  ;; let h= 1-v*x  ;; h is small
  ;; return x+ x*(h+h^2+h^3) . compute h3 in double-float
  ;; enough accuracy.
    
  (let* 
      ((x (%make-qd-d (cl:/ (qd-0 v)) 0d0 0d0 0d0))
       (h (add-qd-d (neg-qd (mul-qd v x))
		    1.0d0))
       (h2 (mul-qd h h)) ;also use h2 for target
       (h3 (* (qd-0 h) (qd-0 h2))))
    (declare (type %quad-double v h h2)
	     (double-float h3))
    (add-qd x
	    (mul-qd x
		    (add-qd h (add-qd-d h2 h3))))))

(defun div-qd-i (a b)
  (declare (type %quad-double a b)
	   (optimize (speed 3)
		     (space 0)))
  (mul-qd a (invert-qd b)))
  
;; Non-sloppy divide
#+(or)
(defun div-qd (a b)
  (declare (type %quad-double a b))
  (let ((a0 (qd-0 a))
	(b0 (qd-0 b)))
    (let* ((q0 (cl:/ a0 b0))
	   (r (sub-qd a (mul-qd-d b q0)))
	   (q1 (cl:/ (qd-0 r) b0)))
      (setf r (sub-qd r (mul-qd-d b q1)))
      (let ((q2 (cl:/ (qd-0 r) b0)))
	(setf r (sub-qd r (mul-qd-d b q2)))
	(let ((q3 (cl:/ (qd-0 r) b0)))
	  (setf r (sub-qd r (mul-qd-d b q3)))
	  (let ((q4 (cl:/ (qd-0 r) b0)))
	  (multiple-value-bind (q0 q1 q2 q3)
	      (renorm-5 q0 q1 q2 q3 q4)
	    (%make-qd-d q0 q1 q2 q3))))))))

;; quad-double / double
(defun div-qd-d (a b)
  "Divides the %QUAD-DOUBLE number A by the DOUBLE-FLOAT number B."
  (declare (type %quad-double a)
	   (double-float b)