qd-fun.lisp

;;;; -*- Mode: lisp -*-
;;;;
;;;; Copyright (c) 2007, 2008, 2011 Raymond Toy
;;;;
;;;; Permission is hereby granted, free of charge, to any person
;;;; obtaining a copy of this software and associated documentation
;;;; files (the "Software"), to deal in the Software without
;;;; restriction, including without limitation the rights to use,
;;;; copy, modify, merge, publish, distribute, sublicense, and/or sell
;;;; copies of the Software, and to permit persons to whom the
;;;; Software is furnished to do so, subject to the following
;;;; conditions:
;;;;
;;;; The above copyright notice and this permission notice shall be
;;;; included in all copies or substantial portions of the Software.
;;;;
;;;; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
;;;; EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
;;;; OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
;;;; NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
;;;; HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
;;;; WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
;;;; FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
;;;; OTHER DEALINGS IN THE SOFTWARE.

;;; Basic special functions operating on %quad-double numbers.  This
;;; includes sqrt, rounding to the nearest integer, floor, exp, log,
;;; log1p, sin, cos, tan, asin, acos, atan, atan2, sinh, cosh, tanh,
;;; asinh, acosh, atanh, and random.
;;;
;;; These special functions only work on the main domains where the
;;; argument is real and the result is real.  Behavior is undefined if
;;; this doesn't hold.

(in-package #:octi)

(defun logb-finite (x)
  "Same as logb but X is not infinity and non-zero and not a NaN, so
that we can always return an integer"
  (declare (type cl:double-float x))
  (multiple-value-bind (signif expon sign)
      (cl:decode-float x)
    (declare (ignore signif sign))
    ;; decode-float is almost right, except that the exponent
    ;; is off by one
    (1- expon)))

#+cmu
(declaim (ext:maybe-inline sqrt-qd))
#+nil
(defun sqrt-qd (a)
  "Square root of the (non-negative) quad-float"
  (declare (type %quad-double a)
	   (optimize (speed 3) (space 0)))
  ;; Perform the following Newton iteration:
  ;;
  ;;  x' = x + (1 - a * x^2) * x / 2
  ;;
  ;; which converges to 1/sqrt(a).
  ;;
  ;; However, there appear to be round-off errors when x is either
  ;; very large or very small.  So let x = f*2^(2*k).  Then sqrt(x) =
  ;; 2^k*sqrt(f), and sqrt(f) doesn't have round-off problems.
  (when (zerop-qd a)
    (return-from sqrt-qd a))
  (when (float-infinity-p (qd-0 a))
    (return-from sqrt-qd a))

  (let* ((k (logandc2 (logb-finite (qd-0 a)) 1))
	 (new-a (scale-float-qd a (- k))))
    (assert (evenp k))
    (let* ((r (make-qd-d (cl:/ (sqrt (the (double-float (0d0))
				       (qd-0 new-a))))))
	   (half 0.5d0)
	   (h (mul-qd-d new-a half)))
      (declare (type %quad-double r))
      ;; Since we start with double-float precision, three more
      ;; iterations should give us full accuracy.
      (dotimes (k 3)
	(setf r (add-qd r (mul-qd r (sub-d-qd half (mul-qd h (sqr-qd r)))))))
      (scale-float-qd (mul-qd r new-a) (ash k -1)))))

(defun sqrt-qd (a)
  "Return the square root of the (non-negative) %QUAD-DOUBLE number A"
  (declare (type %quad-double a)
	   (optimize (speed 3) (space 0)))
  ;; Perform the following Newton iteration:
  ;;
  ;;  x' = x + (1 - a * x^2) * x / 2
  ;;
  ;; which converges to 1/sqrt(a).
  ;;
  ;; However, there appear to be round-off errors when x is either
  ;; very large or very small.  So let x = f*2^(2*k).  Then sqrt(x) =
  ;; 2^k*sqrt(f), and sqrt(f) doesn't have round-off problems.
  (when (zerop-qd a)
    (return-from sqrt-qd a))
  (when (float-infinity-p (qd-0 a))
    (return-from sqrt-qd a))

  (let* ((k (logandc2 (logb-finite (qd-0 a)) 1))
	 (new-a (scale-float-qd a (- k))))
    (assert (evenp k))
    (let* ((r (make-qd-d (cl:/ (sqrt (the (double-float (0d0))
				       (qd-0 new-a))))))
	   (temp (%make-qd-d 0d0 0d0 0d0 0d0))
	   (half 0.5d0)
	   (h (mul-qd-d new-a half)))
      (declare (type %quad-double r))
      ;; Since we start with double-float precision, three more
      ;; iterations should give us full accuracy.
      (dotimes (k 3)
	#+nil
	(setf r (add-qd r (mul-qd r (sub-d-qd half (mul-qd h (sqr-qd r))))))
	(sqr-qd r temp)
	(mul-qd h temp temp)
	(sub-d-qd half temp temp)
	(mul-qd r temp temp)
	(add-qd r temp r)
	)
      (mul-qd r new-a r)
      (scale-float-qd r (ash k -1)))))

(defun hypot-aux-qd (x y)
  (declare (type %quad-double x y))
  (let ((k (- (logb-finite (max (cl:abs (qd-0 x))
				(cl:abs (qd-0 y)))))))
    (values (add-qd (sqr-qd (scale-float-qd x k))
		    (sqr-qd (scale-float-qd y k)))
	    (- k))))

(defun hypot-qd (x y)
  "sqrt(x^2+y^2) computed carefully without unnecessary overflow for
the %QUAD-DOUBLE numbers X and Y"
  (multiple-value-bind (abs^2 rho)
      (hypot-aux-qd x y)
    (scale-float-qd (sqrt-qd abs^2) rho)))

(defun nint-qd (a)
  "Round the quad-float to the nearest integer, which is returned as a
  quad-float"
  (let ((x0 (fround (qd-0 a)))
	(x1 0d0)
	(x2 0d0)
	(x3 0d0))
    (cond ((= x0 (qd-0 a))
	   ;; First double is already an integer
	   (setf x1 (fround (qd-1 a)))
	   (cond ((= x1 (qd-1 a))
		  ;; Second is an integer
		  (setf x2 (fround (qd-2 a)))
		  (cond ((= x2 (qd-2 a))
			 ;; Third is an integer
			 (setf x3 (fround (qd-3 a))))
			(t
			 (when (and (zerop (abs (cl:- x2 (qd-2 a))))
				    (minusp (qd-3 a)))
			   (decf x2)))))
		 (t
		  (when (and (zerop (abs (cl:- x1 (qd-1 a))))
			     (minusp (qd-2 a)))
		    (decf x1)))))
	  (t
	   (when (and (zerop (abs (cl:- x0 (qd-0 a))))
		      (minusp (qd-1 a)))
	     (decf x0))))
    (multiple-value-bind (s0 s1 s2 s3)
	(renorm-4 x0 x1 x2 x3)
      (make-qd-d s0 s1 s2 s3))))

(defun ffloor-qd (a)
  "The floor of the %QUAD-DOUBLE A, returned as a %QUAD-DOUBLE number"
  (let ((x0 (ffloor (qd-0 a)))
	(x1 0d0)
	(x2 0d0)
	(x3 0d0))
    (cond ((= x0 (qd-0 a))
	   (setf x1 (ffloor (qd-1 a)))
	   (when (= x1 (qd-1 a))
	     (setf x2 (ffloor (qd-2 a)))
	     (when (= x2 (qd-2 a))
	       (setf x3 (ffloor (qd-3 a)))))
	   (make-qd-d x0 x1 x2 x3))
	  (t
	   (%make-qd-d x0 x1 x2 x3)))))
	
			 
(defun exp-qd/reduce (a)
  ;; Strategy:  Reduce the size of x by noting that
  ;;
  ;; exp(k*r+m) = exp(m) * exp(r)^k
  ;;
  ;; Thus, by choosing m to be a multiple of log(2) closest to x, we
  ;; can make |kr| < log(2)/2 = 0.3466.  Now we can set k = 256, so
  ;; that |r| <= 0.00136.
  ;;
  ;; Then
  ;;
  ;; exp(x) = exp(k*r+s*log(2)) = 2^s*(exp(r))^256
  ;;
  ;; We can use Taylor series to evaluate exp(r).

  (let* ((k 256)
	 (z (truncate (qd-0 (nint-qd (div-qd a +qd-log2+)))))
	 (r (div-qd-d (sub-qd a (mul-qd-d +qd-log2+ (float z 1d0)))
		      (float k 1d0)))
	 ;; For Taylor series.  p = r^2/2, the first term
	 (p (div-qd-d (sqr-qd r) 2d0))
	 ;; s = 1+r+p, the sum of the first 3 terms
	 (s (add-qd-d (add-qd r p) 1d0))
	 ;; Denominator of term
	 (m 2d0))
    ;; Taylor series until the term is small enough.
    ;;
    ;; Note that exp(x) = sinh(x) + sqrt(1+sinh(x)^2).  The Taylor
    ;; series for sinh has half as many terms as for exp, so it should
    ;; be less work to compute sinh.  Then a few additional operations
    ;; and a square root gives us exp.
    (loop
       (incf m)
       (setf p (mul-qd p r))
       (setf p (div-qd-d p m))
       (setf s (add-qd s p))
       (unless (> (abs (qd-0 p)) +qd-eps+)
	 (return)))

    (setf r (npow s k))
    (setf r (scale-float-qd r z))
    r))

(defun expm1-qd/duplication (a)
  (declare (type %quad-double a))
  ;; Brent gives expm1(2*x) = expm1(x)*(2+expm1(x))
  ;;
  ;; Hence
  ;;
  ;; expm1(x) = expm1(x/2)*(2+expm1(x/2))
  ;;
  ;; Keep applying this formula until x is small enough.  Then use
  ;; Taylor series to compute expm1(x).
  (cond ((< (abs (qd-0 a)) .0001d0)
	 ;; What is the right threshold?
	 ;;
	 ;; Taylor series for exp(x)-1
	 ;; = x+x^2/2!+x^3/3!+x^4/4!+...
	 ;; = x*(1+x/2!+x^2/3!+x^3/4!+...)
	 (let ((sum +qd-one+)
	       (term +qd-one+))
	   (dotimes (k 28)
	     (setf term (div-qd-d (mul-qd term a) (float (cl:+ k 2) 1d0)))
	     (setf sum (add-qd sum term)))
	   (mul-qd a sum)))
	(t
	 (let ((d (expm1-qd/duplication (scale-float-qd a -1))))
	   (mul-qd d (add-qd-d d 2d0))))))

(defun expm1-qd (a)
  "Return exp(a) - 1 for the %QUAD-DOUBLE number A.  This is more
  accurate than just computing exp(a) - 1 directly."
  (declare (type %quad-double a))

  (when (float-infinity-p (qd-0 a))
    (return-from expm1-qd
      (if (minusp (float-sign (qd-0 a)))
	  +qd-zero+
	  a)))
  (expm1-qd/duplication a))

(defun exp-qd (a)
  "Return the expnential of the %QUAD-DOUBLE number A"
  (declare (type %quad-double a))
  ;; Should we try to be more accurate than just 709?
  (when (< (qd-0 a) (log least-positive-normalized-double-float))
    (return-from exp-qd +qd-zero+))

  (when (> (qd-0 a) (log most-positive-double-float))
    #-cmu
    (error 'floating-point-overflow
	   :operation 'exp
	   :operands (list a))
    #+cmu
    (return-from exp-qd (%make-qd-d (/ 1d0 0d0) 0d0 0d0 0d0)))

  (when (zerop-qd a)
    (return-from exp-qd +qd-one+))

  ;; Default for now is exp-qd/reduce
  (exp-qd/reduce a))

(defun log-qd/halley (a)
  (declare (type %quad-double a))
  ;; Halley iteration:
  ;;
  ;; x' = x - 2*(exp(x)-a)/(exp(x)+a)
  ;;
  ;; But the above has problems if a is near
  ;; most-positive-double-float.  Rearrange the computation:
  ;;
  ;; x' = x - 2*(exp(x)/a-1)/(exp(x)/a+1)
  ;;
  ;; I think this works better, but it's also probably a little bit
  ;; more expensive because each iteration has two divisions.
  (let ((x (make-qd-d (log (qd-0 a)))))
    (flet ((iter (est)
	     (let ((exp (div-qd (exp-qd est)
				a)))
	       (sub-qd est
		       (scale-float-qd
			(div-qd (sub-qd-d exp 1d0)
				(add-qd-d exp 1d0))
			1)))))
      ;; Two iterations should be enough
      (setf x (iter x))
      (setf x (iter x))
      x)))
  

(defun log1p-qd/duplication (x)
  (declare (type %quad-double x)
	   (optimize (speed 3)))
  ;; Brent gives the following duplication formula for log1p(x) =
  ;; log(1+x):
  ;;
  ;; log1p(x) = 2*log1p(x/(1+sqrt(1+x)))
  ;;
  ;; So we apply the duplication formula until x is small enough, and
  ;; then use the series
  ;;
  ;; log(1+x) = 2*sum((x/(2+x))^(2*k+1)/(2*k+1),k,0,inf)
  ;;
  ;; Currently "small enough" means x < 0.005.  What is the right
  ;; cutoff?
  (cond ((> (abs (qd-0 x)) .005d0)
	 ;; log1p(x) = 2*log1p(x/(1+sqrt(1+x)))
	 (mul-qd-d (log1p-qd/duplication
		    (div-qd x
			    (add-d-qd 1d0
				      (sqrt-qd (add-d-qd 1d0 x)))))
		   2d0))
	(t
	 ;; Use the series
	 (let* ((term (div-qd x (add-qd-d x 2d0)))
		(mult (sqr-qd term))
		(sum term))
	   (loop for k of-type double-float from 3d0 by 2d0
	      while (> (abs (qd-0 term)) +qd-eps+)
	      do
		(setf term (mul-qd term mult))
		(setf sum (add-qd sum (div-qd-d term k))))
	   (mul-qd-d sum 2d0)))))

(defun log1p-qd (x)
  "Return log1p(x) = log(1+x), done more accurately than just evaluating
  log(1+x). X is a non-negative %QUAD-DOUBLE number"
  (declare (type %quad-double x))

  (when (float-infinity-p (qd-0 x))
    x)
  (log1p-qd/duplication x))

(defun log-qd (a)
  "Return the (natural) log of the non-negative %QUAD-DOUBLE number A"
  (declare (type %quad-double a))
  (cond ((onep-qd a)
	 +qd-zero+)
	((and (zerop-qd a)
	      (plusp (float-sign (qd-0 a))))
	 (%make-qd-d (/ -1d0 (qd-0 a)) 0d0 0d0 0d0))
	((float-infinity-p (qd-0 a))
	 a)
	((minusp (float-sign (qd-0 a)))
	 (error "log of negative"))
	(t
	 ;; Default is Halley's method
	 (log-qd/halley a))))
  

;; sin(a) and cos(a) using Taylor series
;;
;; Assumes |a| <= pi/2048
(defun sincos-taylor (a)
  (declare (type %quad-double a))
  (let ((thresh (cl:* +qd-eps+ (abs (qd-0 a)))))
    (when (zerop-qd a)
      (return-from sincos-taylor
	(values +qd-zero+
		+qd-one+)))
    (let* ((x (neg-qd (sqr-qd a)))
	   (s a)
	   (p a)
	   (m 1d0))
      (loop
	 (setf p (mul-qd p x))
	 (incf m 2)
	 (setf p (div-qd-d p (cl:* m (cl:1- m))))
	 (setf s (add-qd s p))
	 ;;(format t "p = ~A~%" (qd-0 p))
	 (when (<= (abs (qd-0 p)) thresh)
	   (return)))
      ;; cos(c) = sqrt(1-sin(c)^2).  This seems to work ok, even
      ;; though I would have expected some round-off errors in
      ;; computing this.  sqrt(1-x^2) is normally better computed as
      ;; sqrt(1-x)*sqrt(1+x) for small x.
      (values s (sqrt-qd (add-qd-d (neg-qd (sqr-qd s)) 1d0))))))

(defun drem-qd (a b)
  (declare (type %quad-double a b))
  (let ((n (nint-qd (div-qd a b))))
    (sub-qd a (mul-qd n b))))

(defun divrem-qd (a b)
  (declare (type %quad-double a b))
  (let ((n (nint-qd (div-qd a b))))
    (values n (sub-qd a (mul-qd n b)))))

;; Old, original routines.  These are correct, but they don't handle
;; large args because drem-qd isn't accurate enough.
#+(or)
(progn
(defun sin-qd (a)
  "Sin(a)"
  (declare (type %quad-double a))
  ;; To compute sin(x), choose integers a, b so that
  ;;
  ;; x = s + a * (pi/2) + b*(pi/1024)
  ;;
  ;; with |x| <= pi/2048.  Using a precomputed table of sin(k*pi/1024)
  ;; and cos(k*pi/1024), we can compute sin(x) from sin(s) and cos(s).
  ;;
  ;; sin(x) = sin(s+k*(pi/1024) + j*pi/2)
  ;;        = sin(s+k*(pi/1024))*cos(j*pi/2)
  ;;            + cos(s+k*(pi/1024))*sin(j*pi/2)
  ;;
  ;; sin(s+k*pi/1024) = sin(s)*cos(k*pi/1024)
  ;;                     + cos(s)*sin(k*pi/1024)
  ;;
  ;; cos(s+k*pi/1024) = cos(s)*cos(k*pi/1024)
  ;;                     - sin(s)*sin(k*pi/1024)
  (when (zerop-qd a)
    (return-from sin-qd a))

  ;; Reduce modulo 2*pi
  (let ((r (drem-qd a +qd-2pi+)))
    ;; Now reduce by pi/2 and then by pi/1024 so that we obtain
    ;; numbers a, b, t
    (multiple-value-bind (j tmp)
	(divrem-qd r +qd-pi/2+)
      (let* ((j (truncate (qd-0 j)))
	     (abs-j (abs j)))
	(multiple-value-bind (k tmp)
	    (divrem-qd tmp +qd-pi/1024+)
	  (let* ((k (truncate (qd-0 k)))
		 (abs-k (abs k)))
	    (assert (<= abs-j 2))
	    (assert (<= abs-k 256))
	    ;; Compute sin(s) and cos(s)
	    (multiple-value-bind (sin-t cos-t)
		(sincos-taylor tmp)
	      (multiple-value-bind (s c)
		  (cond ((zerop abs-k)
			 (values sin-t cos-t))
			(t
			 ;; Compute sin(s+k*pi/1024), cos(s+k*pi/1024)
			 (let ((u (aref +qd-cos-table+ (cl:1- abs-k)))
			       (v (aref +qd-sin-table+ (cl:1- abs-k))))
			   (cond ((plusp k)
				  ;; sin(s) * cos(k*pi/1024)
				  ;; + cos(s) * sin(k*pi/1024)
				  ;;
				  ;; cos(s) * cos(k*pi/1024)
				  ;; - sin(s) * sin(k*pi/1024)
				  (values (add-qd (mul-qd u sin-t)
						  (mul-qd v cos-t))
					  (sub-qd (mul-qd u cos-t)
						  (mul-qd v sin-t))))
				 (t
				  ;; sin(s) * cos(k*pi/1024)
				  ;; - cos(s) * sin(|k|*pi/1024)
				  ;;
				  ;; cos(s) * cos(k*pi/1024)
				  ;; + sin(s) * sin(|k|*pi/1024)
				  (values (sub-qd (mul-qd u sin-t)
						  (mul-qd v cos-t))
					  (add-qd (mul-qd u cos-t)
						  (mul-qd v sin-t))))))))
		;;(format t "s = ~/qd::qd-format/~%" s)
		;;(format t "c = ~/qd::qd-format/~%" c)
		;; sin(x) =  sin(s+k*pi/1024) * cos(j*pi/2)
		;;         + cos(s+k*pi/1024) * sin(j*pi/2)
		(cond ((zerop abs-j)
		       ;; cos(j*pi/2) = 1, sin(j*pi/2) = 0
		       s)
		      ((= j 1)
		       ;; cos(j*pi/2) = 0, sin(j*pi/2) = 1
		       c)
		      ((= j -1)
		       ;; cos(j*pi/2) = 0, sin(j*pi/2) = -1
		       (neg-qd c))
		      (t
		       ;; cos(j*pi/2) = -1, sin(j*pi/2) = 0
		       (neg-qd s)))))))))))
		     
(defun cos-qd (a)
  "Return the cosine of the %QUAD-DOUBLE number A"
  ;; Just like sin-qd, but for cos.
  (declare (type %quad-double a))
  ;; To compute sin(x), choose integers a, b so that
  ;;
  ;; x = s + a * (pi/2) + b*(pi/1024)
  ;;
  ;; with |x| <= pi/2048.  Using a precomputed table of sin(k*pi/1024)
  ;; and cos(k*pi/1024), we can compute sin(x) from sin(s) and cos(s).
  ;;
  ;; sin(x) = sin(s+k*(pi/1024) + j*pi/2)
  ;;        = sin(s+k*(pi/1024))*cos(j*pi/2)
  ;;            + cos(s+k*(pi/1024))*sin(j*pi/2)
  ;;
  ;; sin(s+k*pi/1024) = sin(s)*cos(k*pi/1024)
  ;;                     + cos(s)*sin(k*pi/1024)
  ;;
  ;; cos(s+k*pi/1024) = cos(s)*cos(k*pi/1024)
  ;;                     - sin(s)*sin(k*pi/1024)
  (when (zerop-qd a)
    (return-from cos-qd +qd-one+))

  ;; Reduce modulo 2*pi
  (let ((r (drem-qd a +qd-2pi+)))
    ;; Now reduce by pi/2 and then by pi/1024 so that we obtain
    ;; numbers a, b, t
    (multiple-value-bind (j tmp)
	(divrem-qd r +qd-pi/2+)
      (let* ((j (truncate (qd-0 j)))
	     (abs-j (abs j)))
	(multiple-value-bind (k tmp)
	    (divrem-qd tmp +qd-pi/1024+)
	  (let* ((k (truncate (qd-0 k)))
		 (abs-k (abs k)))
	    (assert (<= abs-j 2))
	    (assert (<= abs-k 256))
	    ;; Compute sin(s) and cos(s)
	    (multiple-value-bind (sin-t cos-t)
		(sincos-taylor tmp)
	      (multiple-value-bind (s c)
		  (cond ((zerop abs-k)
			 (values sin-t cos-t))
			(t
			 ;; Compute sin(s+k*pi/1024), cos(s+k*pi/1024)
			 (let ((u (aref +qd-cos-table+ (cl:1- abs-k)))
			       (v (aref +qd-sin-table+ (cl:1- abs-k))))
			   (cond ((plusp k)
				  ;; sin(s) * cos(k*pi/1024)
				  ;; + cos(s) * sin(k*pi/1024)
				  ;;
				  ;; cos(s) * cos(k*pi/1024)
				  ;; - sin(s) * sin(k*pi/1024)
				  (values (add-qd (mul-qd u sin-t)
						  (mul-qd v cos-t))
					  (sub-qd (mul-qd u cos-t)
						  (mul-qd v sin-t))))
				 (t
				  ;; sin(s) * cos(k*pi/1024)
				  ;; - cos(s) * sin(|k|*pi/1024)
				  ;;
				  ;; cos(s) * cos(k*pi/1024)
				  ;; + sin(s) * sin(|k|*pi/1024)
				  (values (sub-qd (mul-qd u sin-t)
						  (mul-qd v cos-t))
					  (add-qd (mul-qd u cos-t)
						  (mul-qd v sin-t))))))))
		#+nil
		(progn
		  (format t "s = ~/qd::qd-format/~%" s)
		  (format t "c = ~/qd::qd-format/~%" c))
		;; sin(x) =  sin(s+k*pi/1024) * cos(j*pi/2)
		;;         + cos(s+k*pi/1024) * sin(j*pi/2)
		(cond ((zerop abs-j)
		       ;; cos(j*pi/2) = 1, sin(j*pi/2) = 0
		       c)
		      ((= j 1)
		       ;; cos(j*pi/2) = 0, sin(j*pi/2) = 1
		       (neg-qd s))
		      ((= j -1)
		       ;; cos(j*pi/2) = 0, sin(j*pi/2) = -1
		       s)
		      (t
		       ;; cos(j*pi/2) = -1, sin(j*pi/2) = 0
		       (neg-qd c)))))))))))

;; Compute sin and cos of a
(defun sincos-qd (a)
  "Return the sine of the %QUAD-DOUBLE number A.  The second returned value
is the cosine of A"
  (declare (type %quad-double a))
  (when (zerop-qd a)
    (return-from sincos-qd
      (values +qd-zero+
	      +qd-one+)))

  ;; Reduce modulo 2*pi
  (let ((r (drem-qd a +qd-2pi+)))
    ;; Now reduce by pi/2 and then by pi/1024 so that we obtain
    ;; numbers a, b, t
    (multiple-value-bind (j tmp)
	(divrem-qd r +qd-pi/2+)
      (let* ((j (truncate (qd-0 j)))
	     (abs-j (abs j)))
	(multiple-value-bind (k tmp)
	    (divrem-qd tmp +qd-pi/1024+)
	  (let* ((k (truncate (qd-0 k)))
		 (abs-k (abs k)))
	    (assert (<= abs-j 2))
	    (assert (<= abs-k 256))
	    ;; Compute sin(s) and cos(s)
	    (multiple-value-bind (sin-t cos-t)
		(sincos-taylor tmp)
	      (multiple-value-bind (s c)
		  (cond ((zerop abs-k)
			 (values sin-t cos-t))
			(t
			 ;; Compute sin(s+k*pi/1024), cos(s+k*pi/1024)
			 (let ((u (aref +qd-cos-table+ (cl:1- abs-k)))
			       (v (aref +qd-sin-table+ (cl:1- abs-k))))
			   (cond ((plusp k)
				  ;; sin(s) * cos(k*pi/1024)
				  ;; + cos(s) * sin(k*pi/1024)
				  ;;
				  ;; cos(s) * cos(k*pi/1024)
				  ;; - sin(s) * sin(k*pi/1024)
				  (values (add-qd (mul-qd u sin-t)
						  (mul-qd v cos-t))
					  (sub-qd (mul-qd u cos-t)
						  (mul-qd v sin-t))))
				 (t
				  ;; sin(s) * cos(k*pi/1024)
				  ;; - cos(s) * sin(|k|*pi/1024)
				  ;;
				  ;; cos(s) * cos(k*pi/1024)
				  ;; + sin(s) * sin(|k|*pi/1024)
				  (values (sub-qd (mul-qd u sin-t)
						  (mul-qd v cos-t))
					  (add-qd (mul-qd u cos-t)
						  (mul-qd v sin-t))))))))
		#+nil
		(progn
		  (format t "s = ~/qd::qd-format/~%" s)
		  (format t "c = ~/qd::qd-format/~%" c))
		;; sin(x) =  sin(s+k*pi/1024) * cos(j*pi/2)
		;;         + cos(s+k*pi/1024) * sin(j*pi/2)
		(cond ((zerop abs-j)
		       ;; cos(j*pi/2) = 1, sin(j*pi/2) = 0
		       (values s c))
		      ((= j 1)
		       ;; cos(j*pi/2) = 0, sin(j*pi/2) = 1
		       (values c (neg-qd s)))
		      ((= j -1)
		       ;; cos(j*pi/2) = 0, sin(j*pi/2) = -1
		       (values (neg-qd c) s))
		      (t
		       ;; cos(j*pi/2) = -1, sin(j*pi/2) = 0
		       (values (neg-qd s)
			       (neg-qd c))))))))))))
)

;; A more accurate implementation of sin and cos.  We use a more
;; accurate argument reduction using 1584 bits of 2/pi.  This makes
;; sin and cos more expensive, of course.

#+(or)
(defun rem-pi/2-int (qd)
  ;; Compute qd rem pi/2 = k*pi/2+y.  So we compute k + y*2/pi =
  ;; qd*2/pi.
  ;;
  ;; First convert qd to 2^e*I.  We already have 2/pi in the form
  ;; 2^-1584*J.  Then qd*2/pi = 2^(e-1584)*I*J.  Extract out the
  ;; integer and fractional parts of this.  For the integer part we
  ;; only need it modulo 4, because of the periodicity.  For the
  ;; fractional part, we only need 212 (or so bits of fraction).
  ;;
  ;; FIXME: But we don't really need to compute all the bits of I*J.
  ;; In the product, we really only need the 2 bits to the left of the
  ;; binary point, and then 212 bits to the right.  This doesn't
  ;; require doing the full multiplication.
  (multiple-value-bind (i e s)
      (integer-decode-qd qd)
    (let* ((exp (- e 1584))
	   (prod (* (* s i) +2/pi-bits+))
	   (int (ash prod exp))
	   (mod (logand int 3))
	   (frac (ldb (byte 212 (- (- exp) 212)) prod))
	   (f (mul-qd (scale-float-qd (rational-to-qd frac) -212)
		      +qd-pi/2+)))
      ;; We want the remainder part to be <= pi/4 because the trig
      ;; functions want that.  So if the fraction is too big, adjust
      ;; it, and mod value.
      (if (<= (abs (qd-0 f)) (/ pi 4))
	  (values mod f)
	  (values (mod (1+ mod) 4) (sub-qd f +qd-pi/2+))))))

(defun split-prod-2/pi (qd)
  ;; Compute k and f such that k + f = qd*2/pi, where k is an intege
  ;; and f < 1. Since we do not need more than the the least
  ;; two bits from k, only the least two bits of k are returned.
  (multiple-value-bind (i e)
      (integer-decode-qd qd)
    ;; First convert qd to 2^e*I.  We already have 2/pi in the form
    ;; 2^-1584*J.  Then qd*2/pi = 2^(e-1584)*I*J.  Extract out the
    ;; integer and fractional parts of this.  For the integer part we
    ;; only need it modulo 4, because of the periodicity. For the
    ;; fractional bits, we return the actual fraction as a ratio.
    ;;
    ;; FIXME: We compute the entire product I*J, but we only need the
    ;; 2 bits to the left of the binary point.  But we need many of
    ;; the bits to the right of the binary point becaue we have no
    ;; lower bound on the difference between a quad-double and pi.
    (let* ((exp (- e (integer-length +2/pi-bits+)))
	   (binary-point (- exp))
	   (prod (* i +2/pi-bits+))
	   (int (ldb (byte 2 binary-point) prod))
	   (frac (/ (ldb (byte binary-point 0) prod)
		    (ash 1 binary-point))))
      (values int frac))))

(defun rem-pi/2-int (qd)
  "Compute qd rem pi/2 = k + f, where k is an integer and |f| <
  pi/4. Two values are returned: k mod 4 and f."
  (if (minusp-qd qd)
      (multiple-value-bind (k f)
	  (rem-pi/2-int (neg-qd qd))
	(values (mod (- k) 4) (neg-qd f)))
      (multiple-value-bind (k f)
	  (split-prod-2/pi qd)
	;; The trig functions want the arg to be less than pi/4.  That
	;; is f < 1/2.  Thus adjust k and f accordingly.
	(when (> f 1/2)
	  (setf k (mod (1+ k) 4))
	  (setf f (- f 1)))
	(values k (mul-qd +qd-pi/2+ (rational-to-qd f))))))
	    
(defun rem-pi/2-int-b (qd)
  (declare (type %quad-double qd))
  (multiple-value-bind (i e s)
      (integer-decode-qd qd)
    ;; This computes qd = k*pi/2+s.  And because of the 2*pi
    ;; periodicity of trig functions, we really only need mod(k, 4),
    ;; the least 2 bits of k.
    ;;
    ;; Write qd = 2^e*i.  Write the value 2/pi as sum f[k]*2^(-k), k =
    ;; 0, 1, 2,... and f[k] is 0 or 1.  Then the product, p, is
    ;;
    ;; p = sum i*f[k]*2^(e-k).
    ;;
    ;; Assume 2^m <= i < 2^(m+1).  Then it's obvious that
    ;; i*f[k]*2^(e-k) is greater than 2 if e-k+m > 1, or k < e+m-1.
    ;; Thus we can ignore the bits of 2/pi for k < e+m-1, because the
    ;; contribution is greater than 2.  For k >= e+m-1, we will have
    ;; the product that is between 0 and 4, which is the part we want.
    ;; We just need to select how many fraction bits we want.
    ;; 4*53=212 seems a little low, and 5*53=265 should be more than
    ;; enough.  Let's choose 220.
    ;;
    ;; This code would be faster if we knew how many bits are in i.
    ;; You might think it should be 212 (4*53), but no.  Consider #q1
    ;; + #q1q-100.  This has way more than 212 bits.
    (let* ((qd-bits 220)
	   (m (integer-length i))
	   (frac-bits (+ qd-bits (- m 2)))
	   (bits (ldb (byte qd-bits
			    (- (integer-length +2/pi-bits+) e frac-bits))
		      +2/pi-bits+))
	   ;; Multiply these together.
	   (prod (* s i bits))
	   ;; Extract out the integer and fractional parts
	   (frac (ldb (byte frac-bits 0) prod))
	   (mod (ldb (byte 2 frac-bits) prod))
	   ;;(mod (ash prod (- frac-bits)))
	   (f (mul-qd (scale-float-qd (rational-to-qd frac) (- frac-bits))
		      +qd-pi/2+)))
      ;; If we did this right, (ash prod (- frac-bits)) should be 2
      ;; bits long at most.
      (if (qd-<= (abs-qd f) +qd-pi/4+)
	  (values mod f)
	  (values (mod (1+ mod) 4)
		  (sub-qd f +qd-pi/2+))))))

(defun rem-pi/2 (a)
  "Compute qd rem pi/2 = k + f, where k is an integer and |f| <
  pi/4. Two values are returned: k mod 4 and f."
  (cond ((qd-<= (abs-qd a) +qd-pi/4+)
	 (values 0 a))
	(t
	 (rem-pi/2-int a))))
      

(defun sin-qd (a)
  "Return the sine of the %QUAD-DOUBLE number A"
  (declare (type %quad-double a))
  ;; To compute sin(x), choose integers a, b so that
  ;;
  ;; x = s + a * (pi/2) + b*(pi/1024)
  ;;
  ;; with |x| <= pi/2048.  Using a precomputed table of sin(k*pi/1024)
  ;; and cos(k*pi/1024), we can compute sin(x) from sin(s) and cos(s).
  ;;
  ;; sin(x) = sin(s+k*(pi/1024) + j*pi/2)
  ;;        = sin(s+k*(pi/1024))*cos(j*pi/2)
  ;;            + cos(s+k*(pi/1024))*sin(j*pi/2)
  ;;
  ;; sin(s+k*pi/1024) = sin(s)*cos(k*pi/1024)
  ;;                     + cos(s)*sin(k*pi/1024)
  ;;
  ;; cos(s+k*pi/1024) = cos(s)*cos(k*pi/1024)
  ;;                     - sin(s)*sin(k*pi/1024)
  (when (zerop-qd a)
    (return-from sin-qd a))

  (multiple-value-bind (j r)
      (rem-pi/2 a)
    (multiple-value-bind (k tmp)
	(divrem-qd r +qd-pi/1024+)
      (let* ((k (truncate (qd-0 k)))
	     (abs-k (abs k)))
	(assert (<= 0 j 3))
	(assert (<= abs-k 256))
	;; Compute sin(s) and cos(s)
	(multiple-value-bind (sin-t cos-t)
	    (sincos-taylor tmp)
	  (multiple-value-bind (s c)
	      (cond ((zerop abs-k)
		     (values sin-t cos-t))
		    (t
		     ;; Compute sin(s+k*pi/1024), cos(s+k*pi/1024)
		     (let ((u (aref +qd-cos-table+ (cl:1- abs-k)))
			   (v (aref +qd-sin-table+ (cl:1- abs-k))))
		       (cond ((plusp k)
			      ;; sin(s) * cos(k*pi/1024)
			      ;; + cos(s) * sin(k*pi/1024)
			      ;;
			      ;; cos(s) * cos(k*pi/1024)
			      ;; - sin(s) * sin(k*pi/1024)
			      (values (add-qd (mul-qd u sin-t)
					      (mul-qd v cos-t))
				      (sub-qd (mul-qd u cos-t)
					      (mul-qd v sin-t))))
			     (t
			      ;; sin(s) * cos(k*pi/1024)
			      ;; - cos(s) * sin(|k|*pi/1024)
			      ;;
			      ;; cos(s) * cos(k*pi/1024)
			      ;; + sin(s) * sin(|k|*pi/1024)
			      (values (sub-qd (mul-qd u sin-t)
					      (mul-qd v cos-t))
				      (add-qd (mul-qd u cos-t)
					      (mul-qd v sin-t))))))))
	    ;;(format t "s = ~/qd::qd-format/~%" s)
	    ;;(format t "c = ~/qd::qd-format/~%" c)
	    ;; sin(x) =  sin(s+k*pi/1024) * cos(j*pi/2)
	    ;;         + cos(s+k*pi/1024) * sin(j*pi/2)
	    (cond ((zerop j)
		   ;; cos(j*pi/2) = 1, sin(j*pi/2) = 0
		   s)
		  ((= j 1)
		   ;; cos(j*pi/2) = 0, sin(j*pi/2) = 1
		   c)
		  ((= j 2)
		   ;; cos(j*pi/2) = -1, sin(j*pi/2) = 0
		   (neg-qd s))
		  ((= j 3)
		   ;; cos(j*pi/2) = 0, sin(j*pi/2) = -1
		   (neg-qd c)))))))))

(defun cos-qd (a)
  "Return the cosine of the %QUAD-DOUBLE number A"
  ;; Just like sin-qd, but for cos.
  (declare (type %quad-double a))
  ;; To compute sin(x), choose integers a, b so that
  ;;
  ;; x = s + a * (pi/2) + b*(pi/1024)
  ;;
  ;; with |x| <= pi/2048.  Using a precomputed table of sin(k*pi/1024)
  ;; and cos(k*pi/1024), we can compute sin(x) from sin(s) and cos(s).
  ;;
  ;; sin(x) = sin(s+k*(pi/1024) + j*pi/2)
  ;;        = sin(s+k*(pi/1024))*cos(j*pi/2)
  ;;            + cos(s+k*(pi/1024))*sin(j*pi/2)
  ;;
  ;; sin(s+k*pi/1024) = sin(s)*cos(k*pi/1024)
  ;;                     + cos(s)*sin(k*pi/1024)
  ;;
  ;; cos(s+k*pi/1024) = cos(s)*cos(k*pi/1024)
  ;;                     - sin(s)*sin(k*pi/1024)
  (when (zerop-qd a)
    (return-from cos-qd +qd-one+))

  (multiple-value-bind (j r)
      (rem-pi/2 a)
    (multiple-value-bind (k tmp)
	(divrem-qd r +qd-pi/1024+)
      (let* ((k (truncate (qd-0 k)))
	     (abs-k (abs k)))
	(assert (<= 0 j 3))
	(assert (<= abs-k 256))
	;; Compute sin(s) and cos(s)
	(multiple-value-bind (sin-t cos-t)
	    (sincos-taylor tmp)
	  (multiple-value-bind (s c)
	      (cond ((zerop abs-k)
		     (values sin-t cos-t))
		    (t
		     ;; Compute sin(s+k*pi/1024), cos(s+k*pi/1024)
		     (let ((u (aref +qd-cos-table+ (cl:1- abs-k)))
			   (v (aref +qd-sin-table+ (cl:1- abs-k))))
		       (cond ((plusp k)
			      ;; sin(s) * cos(k*pi/1024)
			      ;; + cos(s) * sin(k*pi/1024)
			      ;;
			      ;; cos(s) * cos(k*pi/1024)
			      ;; - sin(s) * sin(k*pi/1024)
			      (values (add-qd (mul-qd u sin-t)
					      (mul-qd v cos-t))
				      (sub-qd (mul-qd u cos-t)
					      (mul-qd v sin-t))))
			     (t
			      ;; sin(s) * cos(k*pi/1024)
			      ;; - cos(s) * sin(|k|*pi/1024)
			      ;;
			      ;; cos(s) * cos(k*pi/1024)
			      ;; + sin(s) * sin(|k|*pi/1024)
			      (values (sub-qd (mul-qd u sin-t)
					      (mul-qd v cos-t))
				      (add-qd (mul-qd u cos-t)
					      (mul-qd v sin-t))))))))
	    #+nil
	    (progn
	      (format t "s = ~/qd::qd-format/~%" s)
	      (format t "c = ~/qd::qd-format/~%" c))
	    ;; sin(x) =  sin(s+k*pi/1024) * cos(j*pi/2)
	    ;;         + cos(s+k*pi/1024) * sin(j*pi/2)
	    (cond ((zerop j)
		   ;; cos(j*pi/2) = 1, sin(j*pi/2) = 0
		   c)
		  ((= j 1)
		   ;; cos(j*pi/2) = 0, sin(j*pi/2) = 1
		   (neg-qd s))
		  ((= j 2)
		   ;; cos(j*pi/2) = -1, sin(j*pi/2) = 0
		   (neg-qd c))
		  ((= j 3)
		   ;; cos(j*pi/2) = 0, sin(j*pi/2) = -1
		   s))))))))

(defun sincos-qd (a)
  (declare (type %quad-double a))
  (when (zerop-qd a)
    (return-from sincos-qd
      (values +qd-zero+
	      +qd-one+)))

  (multiple-value-bind (j r)
      (rem-pi/2 a)
    (multiple-value-bind (k tmp)
	(divrem-qd r +qd-pi/1024+)
      (let* ((k (truncate (qd-0 k)))
	     (abs-k (abs k)))
	(assert (<= 0 j 3))
	(assert (<= abs-k 256))
	;; Compute sin(s) and cos(s)
	(multiple-value-bind (sin-t cos-t)
	    (sincos-taylor tmp)
	  (multiple-value-bind (s c)
	      (cond ((zerop abs-k)
		     (values sin-t cos-t))
		    (t
		     ;; Compute sin(s+k*pi/1024), cos(s+k*pi/1024)
		     (let ((u (aref +qd-cos-table+ (cl:1- abs-k)))
			   (v (aref +qd-sin-table+ (cl:1- abs-k))))
		       (cond ((plusp k)
			      ;; sin(s) * cos(k*pi/1024)
			      ;; + cos(s) * sin(k*pi/1024)
			      ;;
			      ;; cos(s) * cos(k*pi/1024)
			      ;; - sin(s) * sin(k*pi/1024)
			      (values (add-qd (mul-qd u sin-t)
					      (mul-qd v cos-t))
				      (sub-qd (mul-qd u cos-t)
					      (mul-qd v sin-t))))
			     (t
			      ;; sin(s) * cos(k*pi/1024)
			      ;; - cos(s) * sin(|k|*pi/1024)
			      ;;
			      ;; cos(s) * cos(k*pi/1024)
			      ;; + sin(s) * sin(|k|*pi/1024)
			      (values (sub-qd (mul-qd u sin-t)
					      (mul-qd v cos-t))
				      (add-qd (mul-qd u cos-t)
					      (mul-qd v sin-t))))))))
	    #+nil
	    (progn
	      (format t "s = ~/qd::qd-format/~%" s)
	      (format t "c = ~/qd::qd-format/~%" c))
	    ;; sin(x) =  sin(s+k*pi/1024) * cos(j*pi/2)