From deea19512915a3d20942878ce509c8f5a955610f Mon Sep 17 00:00:00 2001
From: Jeong YunWon <jeong@youknowone.org>
Date: Tue, 22 Apr 2025 07:45:38 +0900
Subject: [PATCH] mul_add feature + macos CI

---
 .github/workflows/rust.yml | 10 ++++++----
 Cargo.toml                 |  5 +++++
 src/gamma.rs               | 15 ++++++++++++---
 3 files changed, 23 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml
index 98ec583..0b5c9dc 100644
--- a/.github/workflows/rust.yml
+++ b/.github/workflows/rust.yml
@@ -19,7 +19,7 @@ jobs:
         os: [
           ubuntu-latest,
           windows-latest,
-          # macos-latest  # disabled due to incompatibility. See issue #1
+          macos-latest,
         ]
         rust: [stable]
     steps:
@@ -38,6 +38,8 @@ jobs:
     - name: Build
       run: cargo build --verbose
     - name: Run tests
-      run: cargo test --verbose
-    - name: Run tests on Release
-      run: cargo test --release --verbose
+      if: matrix.os != 'macos-latest'
+      run: cargo test --verbose && cargo test --release --verbose
+    - name: Run tests with FMA (macOS)
+      if: matrix.os == 'macos-latest'
+      run: cargo test --verbose --features mul_add && cargo test --release --verbose --features mul_add
diff --git a/Cargo.toml b/Cargo.toml
index 6347416..e95f84d 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -3,6 +3,11 @@ name = "pymath"
 version = "0.1.0"
 edition = "2024"
 
+[features]
+# Turning on this feature on aarch64-apple-darwin helps bit representation compatibility
+# See also: https://github.com/python/cpython/issues/132763
+mul_add = []
+
 [dev-dependencies]
 proptest = "1.6.0"
 pyo3 = { version = "0.24", features = ["abi3"] }
diff --git a/src/gamma.rs b/src/gamma.rs
index 7dd6f98..db96333 100644
--- a/src/gamma.rs
+++ b/src/gamma.rs
@@ -37,6 +37,14 @@ const LANCZOS_DEN_COEFFS: [f64; LANCZOS_N] = [
     1.0,
 ];
 
+fn mul_add(a: f64, b: f64, c: f64) -> f64 {
+    if cfg!(feature = "mul_add") {
+        a.mul_add(b, c)
+    } else {
+        a * b + c
+    }
+}
+
 fn lanczos_sum(x: f64) -> f64 {
     let mut num = 0.0;
     let mut den = 0.0;
@@ -50,8 +58,8 @@ fn lanczos_sum(x: f64) -> f64 {
     // this resulted in lower accuracy.
     if x < 5.0 {
         for i in (0..LANCZOS_N).rev() {
-            num = num * x + LANCZOS_NUM_COEFFS[i];
-            den = den * x + LANCZOS_DEN_COEFFS[i];
+            num = mul_add(num, x, LANCZOS_NUM_COEFFS[i]);
+            den = mul_add(den, x, LANCZOS_DEN_COEFFS[i]);
         }
     } else {
         for i in 0..LANCZOS_N {
@@ -237,7 +245,8 @@ pub fn lgamma(x: f64) -> Result<f64, Error> {
     // absorbed the exp(-lanczos_g) term, and throwing out the lanczos_g
     // subtraction below; it's probably not worth it.
     let mut r = lanczos_sum(absx).ln() - LANCZOS_G;
-    r += (absx - 0.5) * ((absx + LANCZOS_G - 0.5).ln() - 1.0);
+    let t = absx - 0.5;
+    r = mul_add(t, (absx + LANCZOS_G - 0.5).ln() - 1.0, r);
 
     if x < 0.0 {
         // Use reflection formula to get value for negative x