deepmodeling · A-006 · Apr 8, 2025 · Apr 8, 2025 · Apr 8, 2025 · Apr 8, 2025
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -31,7 +31,7 @@ jobs:
 
       - name: Configure
         run: |
-          cmake -B build -DBUILD_TESTING=ON -DENABLE_DEEPKS=ON -DENABLE_MLKEDF=ON -DENABLE_LIBXC=ON -DENABLE_LIBRI=ON -DENABLE_PAW=ON -DENABLE_GOOGLEBENCH=ON -DENABLE_RAPIDJSON=ON  -DCMAKE_EXPORT_COMPILE_COMMANDS=1
+          cmake -B build -DBUILD_TESTING=ON -DENABLE_DEEPKS=ON -DENABLE_MLKEDF=ON -DENABLE_LIBXC=ON -DENABLE_LIBRI=ON -DENABLE_PAW=ON -DENABLE_GOOGLEBENCH=ON -DENABLE_RAPIDJSON=ON  -DCMAKE_EXPORT_COMPILE_COMMANDS=1 -DENABLE_FLOAT_FFTW=ON
 
 # Temporarily removed because no one maintains this now.
 # And it will break the CI test workflow.

diff --git a/.gitignore b/.gitignore
@@ -23,4 +23,4 @@ __pycache__
 abacus.json
 *.npy
 toolchain/install/
-toolchain/abacus_env.sh
+toolchain/abacus_env.sh
diff --git a/source/module_base/test/math_chebyshev_test.cpp b/source/module_base/test/math_chebyshev_test.cpp
@@ -14,9 +14,6 @@
  *   - calfinalvec_real
  *   - calfinalvec_complex
  *   - tracepolyA
- *   - checkconverge
- *
- *
  */
 class toolfunc
 {
@@ -625,6 +622,8 @@ TEST_F(MathChebyshevTest, tracepolyA_float)
 
 TEST_F(MathChebyshevTest, checkconverge_float)
 {
+    #ifdef __MPI
+    #undef __MPI
     const int norder = 100;
     p_fchetest = new ModuleBase::Chebyshev<float>(norder);
 
@@ -648,5 +647,6 @@ TEST_F(MathChebyshevTest, checkconverge_float)
 
     delete[] v;
     delete p_fchetest;
+    #endif
 }
 #endif
diff --git a/source/module_base/test_parallel/CMakeLists.txt b/source/module_base/test_parallel/CMakeLists.txt
@@ -40,6 +40,12 @@ AddTest(
   SOURCES test_para_gemm.cpp
 )
 
+AddTest(
+  TARGET base_math_chebyshev_mpi
+  LIBS MPI::MPI_CXX parameter ${math_libs} base device container
+  SOURCES math_chebyshev_mpi_test.cpp
+)
+
 add_test(NAME base_para_gemm_parallel
       COMMAND mpirun -np 4 ./base_para_gemm
       WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}

diff --git a/source/module_base/test_parallel/math_chebyshev_mpi_test.cpp b/source/module_base/test_parallel/math_chebyshev_mpi_test.cpp
@@ -0,0 +1,207 @@
+#include "../math_chebyshev.h"
+#include "mpi.h"
+#include "module_base/parallel_comm.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+/************************************************
+ *  unit test of class Chebyshev MPI part
+ ***********************************************/
+
+ /**
+  * - Tested Functions:
+  * - checkconverge
+  */
+/// Helper functions for the Chebyshev tests; the tests in this file use sigma_y and factor.
+class toolfunc
+{
+  public:
+    double x7(double x)
+    {
+        return pow(x, 7);
+    }
+    double x6(double x)
+    {
+        return pow(x, 6);
+    }
+    double expr(double x)
+    {
+        return exp(x);
+    }
+    // expi(x) = exp(i * x); expi2(x) = exp(i * pi / 2 * x)
+    std::complex<double> expi(std::complex<double> x)
+    {
+        const std::complex<double> j(0.0, 1.0);
+        return exp(j * x);
+    }
+    std::complex<double> expi2(std::complex<double> x)
+    {
+        const std::complex<double> j(0.0, 1.0);
+        const double PI = 3.14159265358979323846;
+        return exp(j * PI / 2.0 * x);
+    }
+    int LDA = 2;       ///< stride between consecutive 2-component vectors; sigma_y clamps it to >= 2
+    double factor = 1; ///< scale applied to the Pauli matrix by both sigma_y overloads
+    /// spin_out = factor * [0,-i;i,0] * spin_in for each of the m vectors (LDA elements apart).
+    void sigma_y(std::complex<double>* spin_in, std::complex<double>* spin_out, const int m = 1)
+    {
+        const std::complex<double> j(0.0, 1.0);
+        this->LDA = (this->LDA < 2) ? 2 : this->LDA;
+        for (int i = 0; i < m; ++i)
+        {
+            spin_out[LDA * i] = -factor * j * spin_in[LDA * i + 1];
+            spin_out[LDA * i + 1] = factor * j * spin_in[LDA * i];
+        }
+    }
+#ifdef __ENABLE_FLOAT_FFTW
+    float x7(float x)
+    {
+        return pow(x, 7);
+    }
+    float x6(float x)
+    {
+        return pow(x, 6);
+    }
+    float expr(float x)
+    {
+        return exp(x);
+    }
+    std::complex<float> expi(std::complex<float> x)
+    {
+        const std::complex<float> j(0.0, 1.0);
+        return exp(j * x);
+    }
+    std::complex<float> expi2(std::complex<float> x)
+    {
+        const std::complex<float> j(0.0, 1.0);
+        const float PI = 3.14159265358979323846f;
+        return exp(j * PI / 2.0f * x);
+    }
+    /// Float overload of sigma_y; applies `factor` in the same way as the double overload.
+    void sigma_y(std::complex<float>* spin_in, std::complex<float>* spin_out, const int m = 1)
+    {
+        const std::complex<float> j(0.0, 1.0);
+        const float scale = static_cast<float>(this->factor); // double * complex<float> is ill-formed
+        this->LDA = (this->LDA < 2) ? 2 : this->LDA;
+        for (int i = 0; i < m; ++i)
+        {
+            spin_out[LDA * i] = -scale * j * spin_in[LDA * i + 1];
+            spin_out[LDA * i + 1] = scale * j * spin_in[LDA * i];
+        }
+    }
+#endif
+};
+/// Fixture: splits MPI_COMM_WORLD into two halves and stores the result in POOL_WORLD.
+class MathChebyshevTest : public testing::Test
+{
+  protected:
+    ModuleBase::Chebyshev<double>* p_chetest = nullptr; ///< allocated and deleted inside each test
+    ModuleBase::Chebyshev<float>* p_fchetest = nullptr; ///< float counterpart of p_chetest
+    toolfunc fun;
+    int dsize = 0;
+    int my_rank = 0;
+    /// Ranks below world_size / 2 get color 0, the rest color 1; the world rank is the split key.
+    void SetUp() override
+    {
+        int world_rank = 0;
+        int world_size = 1;
+        MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
+        MPI_Comm_size(MPI_COMM_WORLD, &world_size);
+        const int color = (world_rank < world_size / 2) ? 0 : 1;
+        MPI_Comm_split(MPI_COMM_WORLD, color, world_rank, &POOL_WORLD);
+    }
+    /// Frees the communicator created by SetUp(); without this every test leaks one.
+    void TearDown() override
+    {
+        if (POOL_WORLD != MPI_COMM_NULL)
+        {
+            MPI_Comm_free(&POOL_WORLD);
+        }
+    }
+};
+
+// Runs Chebyshev<double>::checkconverge on the sigma_y operator: factor 1 is expected to
+// converge, factor +/-1.5 is expected not to.
+TEST_F(MathChebyshevTest, checkconverge)
+{
+    const int norder = 100;
+    p_chetest = new ModuleBase::Chebyshev<double>(norder);
+    auto apply_sigma_y = [&](std::complex<double>* in, std::complex<double>* out, const int m = 1) {
+        fun.sigma_y(in, out, m);
+    };
+
+    // two 2-component vectors stored back to back: [1 0; 0 1]
+    std::complex<double> spinors[4] = {1.0, 0.0, 0.0, 1.0};
+    double tmin = -1.1;
+    double tmax = 1.1;
+
+    // each vector is expected to converge and the bounds to come back unchanged
+    bool ok = p_chetest->checkconverge(apply_sigma_y, spinors, 2, 2, tmax, tmin, 0.2);
+    EXPECT_TRUE(ok);
+    ok = p_chetest->checkconverge(apply_sigma_y, spinors + 2, 2, 2, tmax, tmin, 0.2);
+    EXPECT_TRUE(ok);
+    EXPECT_NEAR(tmin, -1.1, 1e-8);
+    EXPECT_NEAR(tmax, 1.1, 1e-8);
+
+    // start from tmax = -1.1 with step 2.2; tmax is expected to come back as 1.1
+    tmax = -1.1;
+    ok = p_chetest->checkconverge(apply_sigma_y, spinors, 2, 2, tmax, tmin, 2.2);
+    EXPECT_TRUE(ok);
+    EXPECT_NEAR(tmin, -1.1, 1e-8);
+    EXPECT_NEAR(tmax, 1.1, 1e-8);
+
+    // not converge: scale the operator by +1.5 and then by -1.5
+    spinors[0] = std::complex<double>(0, 1);
+    spinors[1] = 1;
+    const double scales[] = {1.5, -1.5};
+    for (const double scale : scales)
+    {
+        fun.factor = scale;
+        tmin = -1.1;
+        tmax = 1.1;
+        EXPECT_FALSE(p_chetest->checkconverge(apply_sigma_y, spinors, 2, 2, tmax, tmin, 0.2));
+    }
+    fun.factor = 1;
+    delete p_chetest;
+}
+
+#ifdef __ENABLE_FLOAT_FFTW
+// Float variant of the convergence check: both vectors of [1 0; 0 1] are expected to
+// converge and the bounds to come back unchanged.
+TEST_F(MathChebyshevTest, checkconverge_float)
+{
+    const int norder = 100;
+    p_fchetest = new ModuleBase::Chebyshev<float>(norder);
+    auto apply_sigma_yf = [&](std::complex<float>* in, std::complex<float>* out, const int m = 1) {
+        fun.sigma_y(in, out, m);
+    };
+
+    // two 2-component vectors stored back to back: [1 0; 0 1]
+    std::complex<float> spinors[4] = {1.0f, 0.0f, 0.0f, 1.0f};
+    float tmin = -1.1;
+    float tmax = 1.1;
+
+    bool ok = p_fchetest->checkconverge(apply_sigma_yf, spinors, 2, 2, tmax, tmin, 0.2);
+    EXPECT_TRUE(ok);
+    ok = p_fchetest->checkconverge(apply_sigma_yf, spinors + 2, 2, 2, tmax, tmin, 0.2);
+    EXPECT_TRUE(ok);
+
+    // float bounds are compared with double literals, so the tolerance is 1e-6 rather than 1e-8
+    EXPECT_NEAR(tmin, -1.1, 1e-6);
+    EXPECT_NEAR(tmax, 1.1, 1e-6);
+
+    delete p_fchetest;
+}
+#endif
+
+int main(int argc, char** argv)
+{
+    // The fixture's SetUp() issues MPI calls regardless of __MPI, so MPI is
+    // initialised unconditionally here; guarding it with #ifdef __MPI would let
+    // a build without that macro call MPI_Comm_rank before MPI_Init.
+    MPI_Init(&argc, &argv);
+    testing::InitGoogleTest(&argc, argv);
+    const int result = RUN_ALL_TESTS();
+    // Finalize only after every test has finished its MPI calls.
+    MPI_Finalize();
+    return result;
+}
diff --git a/source/module_basis/module_pw/module_fft/fft_cpu.cpp b/source/module_basis/module_pw/module_fft/fft_cpu.cpp
@@ -347,18 +347,22 @@ void FFT_CPU<double>::fftxyfor(std::complex<double>* in, std::complex<double>* o
     int npy = this->nplane * this->ny;
     if (this->xprime)
     {
+
         fftw_execute_dft(this->planxfor1, (fftw_complex*)in, (fftw_complex*)out);
+        #pragma omp parallel for
         for (int i = 0; i < this->lixy + 1; ++i)
         {
             fftw_execute_dft(this->planyfor, (fftw_complex*)&in[i * npy], (fftw_complex*)&out[i * npy]);
         }
+        #pragma omp parallel for
         for (int i = rixy; i < this->nx; ++i)
         {
             fftw_execute_dft(this->planyfor, (fftw_complex*)&in[i * npy], (fftw_complex*)&out[i * npy]);
         }
     }
     else
     {
+        #pragma omp parallel for
         for (int i = 0; i < this->nx; ++i)
         {
             fftw_execute_dft(this->planyfor, (fftw_complex*)&in[i * npy], (fftw_complex*)&out[i * npy]);
@@ -374,10 +378,12 @@ void FFT_CPU<double>::fftxybac(std::complex<double>* in,std::complex<double>* ou
     int npy = this->nplane * this->ny;
     if (this->xprime)
     {
+        #pragma omp parallel for
         for (int i = 0; i < this->lixy + 1; ++i)
         {
             fftw_execute_dft(this->planybac, (fftw_complex*)&in[i * npy], (fftw_complex*)&out[i * npy]);
         }
+        #pragma omp parallel for
         for (int i = rixy; i < this->nx; ++i)
         {
             fftw_execute_dft(this->planybac, (fftw_complex*)&in[i * npy], (fftw_complex*)&out[i * npy]);
@@ -388,6 +394,7 @@ void FFT_CPU<double>::fftxybac(std::complex<double>* in,std::complex<double>* ou
     {
         fftw_execute_dft(this->planxbac1, (fftw_complex*)in, (fftw_complex*)out);
         fftw_execute_dft(this->planxbac2, (fftw_complex*)&in[rixy * nplane], (fftw_complex*)&out[rixy * nplane]);
+        #pragma omp parallel for
         for (int i = 0; i < this->nx; ++i)
         {
             fftw_execute_dft(this->planybac, (fftw_complex*)&in[i * npy], (fftw_complex*)&out[i * npy]);
@@ -414,13 +421,15 @@ void FFT_CPU<double>::fftxyr2c(double* in, std::complex<double>* out) const
     if (this->xprime)
     {
         fftw_execute_dft_r2c(this->planxr2c, in, (fftw_complex*)out);
+        #pragma omp parallel for
         for (int i = 0; i < this->lixy + 1; ++i)
         {
             fftw_execute_dft(this->planyfor, (fftw_complex*)&out[i * npy], (fftw_complex*)&out[i * npy]);
         }
     }
     else
     {
+        #pragma omp parallel for
         for (int i = 0; i < this->nx; ++i)
         {
             fftw_execute_dft_r2c(this->planyr2c, &in[i * npy], (fftw_complex*)&out[i * npy]);
@@ -435,6 +444,7 @@ void FFT_CPU<double>::fftxyc2r(std::complex<double> *in,double *out) const
     int npy = this->nplane * this->ny;
     if (this->xprime)
     {
+        #pragma omp parallel for
         for (int i = 0; i < this->lixy + 1; ++i)
         {
             fftw_execute_dft(this->planybac, (fftw_complex*)&in[i * npy], (fftw_complex*)&in[i * npy]);
@@ -444,6 +454,7 @@ void FFT_CPU<double>::fftxyc2r(std::complex<double> *in,double *out) const
     else
     {
         fftw_execute_dft(this->planxbac1, (fftw_complex*)in, (fftw_complex*)in);
+        #pragma omp parallel for
         for (int i = 0; i < this->nx; ++i)
         {
             fftw_execute_dft_c2r(this->planyc2r, (fftw_complex*)&in[i * npy], &out[i * npy]);

diff --git a/source/module_basis/module_pw/pw_basis.cpp b/source/module_basis/module_pw/pw_basis.cpp
@@ -17,7 +17,7 @@ PW_Basis::PW_Basis(std::string device_, std::string precision_) : device(std::mo
     classname="PW_Basis";
     this->fft_bundle.setfft("cpu",this->precision);
     this->double_data_ = (this->precision == "double") || (this->precision == "mixing");
-    this->float_data_ = (this->precision == "single") || (this->precision == "mixing");
+    this->float_data_ = (this->precision == "single")  || (this->precision == "mixing");
 }
 
 PW_Basis:: ~PW_Basis()

diff --git a/source/module_basis/module_pw/pw_basis_k.cpp b/source/module_basis/module_pw/pw_basis_k.cpp
@@ -203,11 +203,11 @@ void PW_Basis_K::setuptransform()
     this->getstartgr();
     this->setupIndGk();
     this->fft_bundle.clear();
+    std::string fft_device = this->device;
 #if defined(__DSP)
-    this->fft_bundle.setfft("dsp", this->precision);
-#else
-    this->fft_bundle.setfft(this->device, this->precision);
+    fft_device = "dsp";
 #endif
+    this->fft_bundle.setfft(fft_device, this->precision);
     if (this->xprime)
     {
         this->fft_bundle.initfft(this->nx,

diff --git a/source/module_basis/module_pw/pw_gatherscatter.h b/source/module_basis/module_pw/pw_gatherscatter.h
@@ -98,8 +98,7 @@ void PW_Basis::gatherp_scatters(std::complex<T>* in, std::complex<T>* out) const
 template <typename T>
 void PW_Basis::gathers_scatterp(std::complex<T>* in, std::complex<T>* out) const
 {
-    //ModuleBase::timer::tick(this->classname, "gathers_scatterp");
-
+    // ModuleBase::timer::tick(this->classname, "gathers_scatterp");
     if(this->poolnproc == 1) //In this case nrxx=fftnx*fftny*nz, nst = nstot, 
     {
 #ifdef _OPENMP
@@ -183,7 +182,7 @@ void PW_Basis::gathers_scatterp(std::complex<T>* in, std::complex<T>* out) const
 		}
     }
 #endif
-    //ModuleBase::timer::tick(this->classname, "gathers_scatterp");
+    // ModuleBase::timer::tick(this->classname, "gathers_scatterp");
     return;
 }
 

diff --git a/source/module_basis/module_pw/pw_transform.cpp b/source/module_basis/module_pw/pw_transform.cpp
@@ -210,7 +210,7 @@ void PW_Basis::recip2real(const std::complex<FPTYPE>* in, FPTYPE* out, const boo
 #endif
     for (int i = 0; i < this->nst * this->nz; ++i)
     {
-        fft_bundle.get_auxg_data<FPTYPE>()[i] = std::complex<double>(0, 0);
+        fft_bundle.get_auxg_data<FPTYPE>()[i] = std::complex<FPTYPE>(0, 0);
     }
 
 #ifdef _OPENMP

diff --git a/source/module_basis/module_pw/pw_transform_k.cpp b/source/module_basis/module_pw/pw_transform_k.cpp
@@ -187,7 +187,6 @@ void PW_Basis_K::recip2real(const std::complex<FPTYPE>* in,
     this->gathers_scatterp(this->fft_bundle.get_auxg_data<FPTYPE>(), this->fft_bundle.get_auxr_data<FPTYPE>());
 
     this->fft_bundle.fftxybac(fft_bundle.get_auxr_data<FPTYPE>(), fft_bundle.get_auxr_data<FPTYPE>());
-
     auto* auxr = this->fft_bundle.get_auxr_data<FPTYPE>();
     if (add)
     {