From b03917e7efea6e2c6f3e17cafc00d3f65eeae075 Mon Sep 17 00:00:00 2001
From: Neven Sajko <nsajko@gmail.com>
Date: Mon, 23 Mar 2020 21:53:15 +0000
Subject: [PATCH] add function sncs1cs to get accurate 1-cosine

Currently in three places in the code there is an expression like:
1-cosine(subexpr). The problem with that is that it sacrifices quite a
bit of precision the floating point type in use could provide. Because
of the subtraction, and the fact that floating point is MUCH denser near
zero than near one, 1-cosine(x) values that should be close to zero will
just be truncated to exactly zero.

The added function sncs1cs is not exported.

The changes apply only if the user has not specified a custom mfloat_t.

This improves mat3_rotation_axis, mat4_rotation_axis and
sine_ease_in_out. A side improvement in the former two is that the
computation of sine, cosine and 1-cosine should be cheaper when
computing them all at the same time than when calling a separate
function for each of the three (because the majority of the code is
shared between the three).

I checked that the improvements do materialize and, more importantly,
at the same time searched for big regressions:

https://github.com/nsajko/numericcompfricas (see the README.md)

Basically, the results are that there are no significant differences for
sin and cos (I think the accuracy only varies by one bit), but there are
big improvements for 1-cos (AKA omc AKA 1cs). The repository contains
the results of the check against glibc and against musl.

FriCAS (for the check) can be found here:
https://github.com/fricas/fricas
---
 mathc.c | 152 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 152 insertions(+)

diff --git a/mathc.c b/mathc.c
index 888d9f3..ef92afe 100644
--- a/mathc.c
+++ b/mathc.c
@@ -20,6 +20,140 @@ the following restrictions:
 
 #include "mathc.h"
 
+#if !defined(MATHC_FLOATING_POINT_TYPE) && defined(MATHC_USE_FLOATING_POINT)
+/* The following code (sincos1cos and float and double versions of
+ * sncs1cs) is Copyright © 1985, 1995, 2000 Stephen L. Moshier and
+ * Copyright © 2020 Neven Sajko. The intention is to get accurate
+ * 1-cosine, while also getting the sine and cosine as a bonus. The
+ * implementation is derived from the Cephes Math Library's sin.c and
+ * sinf.c. To be more specific, I took Stephen Moshier's sin, cos, sinf
+ * and cosf (without changing the polynomials) and adapted them to give
+ * all three required function values (in double and float versions),
+ * without unnecessary accuracy losses.
+ *
+ * sncs1cs is not correct for values of x of huge magnitude. That can
+ * be fixed by more elaborate range reduction.
+ */
+
+typedef struct {
+	/* Sine, cosine, 1-cosine */
+	mfloat_t sin, cos, omc;
+} sincos1cos;
+
+static
+sincos1cos sncs1cs(mfloat_t x)
+{
+	const mfloat_t fourOverPi = 1.27323954473516268615;
+
+	mfloat_t y, z, zz;
+	mint_t j, sign = 1, csign = 1;
+	sincos1cos r;
+
+	/* Handle +-0. */
+	if (x == (mfloat_t)0) {
+		r.sin = x;
+		r.cos = 1;
+		r.omc = 0;
+		return r;
+	}
+	if (isnan(x)) {
+		r.sin = r.cos = r.omc = x;
+		return r;
+	}
+	if (isinf(x)) {
+		r.sin = r.cos = r.omc = x - x;
+		return r;
+	}
+	if (x < 0) {
+		sign = -1;
+		x = -x;
+	}
+	j = (mint_t)(x * fourOverPi);
+	y = (mfloat_t)j;
+	/* map zeros to origin */
+	if ((j & 1)) {
+		j += 1;
+		y += 1;
+	}
+	j = j & 7; /* octant modulo one turn */
+	/* reflect in x axis */
+	if (j > 3) {
+		sign = -sign;
+		csign = -csign;
+		j -= 4;
+	}
+	if (j > 1) {
+		csign = -csign;
+	}
+
+# if defined(MATHC_USE_SINGLE_FLOATING_POINT)
+	const float sc[] = {-1.9515295891E-4, 8.3321608736E-3, -1.6666654611E-1};
+
+	const float cc[] = {2.443315711809948E-5, -1.388731625493765E-3, 4.166664568298827E-2};
+
+	/* These are for a 24-bit significand: */
+	const float DP1 = 0.78515625;
+	const float DP2 = 2.4187564849853515625e-4;
+	const float DP3 = 3.77489497744594108e-8;
+
+	/* Extended precision modular arithmetic */
+	z = ((x - y * DP1) - y * DP2) - y * DP3;
+	zz = z * z;
+	r.sin = z + zz*z*((sc[0]*zz + sc[1])*zz + sc[2]);
+	r.omc = (mfloat_t)0.5*zz - zz*zz*((cc[0]*zz + cc[1])*zz + cc[2]);
+# elif defined(MATHC_USE_DOUBLE_FLOATING_POINT)
+	const double sc[] = {
+		1.58962301576546568060E-10,
+		-2.50507477628578072866E-8,
+		2.75573136213857245213E-6,
+		-1.98412698295895385996E-4,
+		8.33333333332211858878E-3,
+		-1.66666666666666307295E-1,
+	};
+
+	const double cc[] = {
+		-1.13585365213876817300E-11,
+		2.08757008419747316778E-9,
+		-2.75573141792967388112E-7,
+		2.48015872888517045348E-5,
+		-1.38888888888730564116E-3,
+		4.16666666666665929218E-2,
+	};
+
+	const double DP1 = 7.85398125648498535156E-1;
+	const double DP2 = 3.77489470793079817668E-8;
+	const double DP3 = 2.69515142907905952645E-15;
+
+	/* Extended precision modular arithmetic */
+	z = ((x - y * DP1) - y * DP2) - y * DP3;
+	zz = z * z;
+	r.sin = z + zz*z*(((((sc[0]*zz + sc[1])*zz + sc[2])*zz + sc[3])*zz + sc[4])*zz + sc[5]);
+	r.omc = (mfloat_t)0.5*zz - zz*zz*(((((cc[0]*zz + cc[1])*zz + cc[2])*zz + cc[3])*zz + cc[4])*zz + cc[5]);
+# else
+#  error "Unknown preprocessing configuration."
+# endif
+	if (j == 1 || j == 2) {
+		if (csign < 0) {
+			r.sin = -r.sin;
+		}
+		r.cos = r.sin;
+		r.sin = 1 - r.omc;
+		r.omc = 1 - r.cos;
+	} else {
+		if (csign < 0) {
+			r.cos = r.omc - 1;
+			r.omc = 1 - r.cos;
+		} else {
+			r.cos = 1 - r.omc;
+		}
+	}
+	if (sign < 0) {
+		r.sin = -r.sin;
+	}
+	return r;
+}
+#endif
+
 #if defined(MATHC_USE_INT)
 mint_t clampi(mint_t value, mint_t min, mint_t max)
 {
@@ -2520,9 +2654,15 @@ mfloat_t *mat3_rotation_z(mfloat_t *result, mfloat_t f)
 
 mfloat_t *mat3_rotation_axis(mfloat_t *result, mfloat_t *v0, mfloat_t f)
 {
+#if !defined(MATHC_FLOATING_POINT_TYPE)
+	// Get higher accuracy 1-cosine when mfloat_t is known.
+	sincos1cos sc1c = sncs1cs(f);
+	mfloat_t c = sc1c.cos, s = sc1c.sin, one_c = sc1c.omc;
+#else
 	mfloat_t c = MCOS(f);
 	mfloat_t s = MSIN(f);
 	mfloat_t one_c = MFLOAT_C(1.0) - c;
+#endif
 	mfloat_t x = v0[0];
 	mfloat_t y = v0[4];
 	mfloat_t z = v0[8];
@@ -2977,9 +3117,15 @@ mfloat_t *mat4_rotation_z(mfloat_t *result, mfloat_t f)
 
 mfloat_t *mat4_rotation_axis(mfloat_t *result, mfloat_t *v0, mfloat_t f)
 {
+#if !defined(MATHC_FLOATING_POINT_TYPE)
+	// Get higher accuracy 1-cosine when mfloat_t is known.
+	sincos1cos sc1c = sncs1cs(f);
+	mfloat_t c = sc1c.cos, s = sc1c.sin, one_c = sc1c.omc;
+#else
 	mfloat_t c = MCOS(f);
 	mfloat_t s = MSIN(f);
 	mfloat_t one_c = MFLOAT_C(1.0) - c;
+#endif
 	mfloat_t x = v0[0];
 	mfloat_t y = v0[1];
 	mfloat_t z = v0[2];
@@ -6753,7 +6899,13 @@ mfloat_t sine_ease_in(mfloat_t f)
 
 mfloat_t sine_ease_in_out(mfloat_t f)
 {
+#if !defined(MATHC_FLOATING_POINT_TYPE)
+	// Get higher accuracy 1-cosine when mfloat_t is known.
+	sincos1cos sc1c = sncs1cs(f * MPI);
+	return MFLOAT_C(0.5) * sc1c.omc;
+#else
 	return MFLOAT_C(0.5) * (MFLOAT_C(1.0) - MCOS(f * MPI));
+#endif
 }
 
 mfloat_t circular_ease_out(mfloat_t f)