deepmodeling · jieli-matrix · Feb 8, 2025
diff --git a/source/module_base/kernels/cuda/math_kernel_op_vec.cu b/source/module_base/kernels/cuda/math_kernel_op_vec.cu
@@ -52,6 +52,19 @@ __global__ void vector_mul_vector_kernel(const int size,
     }
 }
 
+template <typename T>
+__global__ void vector_div_constant_kernel(const int size,
+                                         T* result,
+                                         const T* vector,
+                                         const typename GetTypeReal<T>::type constant)
+{
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i < size)
+    {
+        result[i] = vector[i] / constant;
+    }
+}
+
 template <typename T>
 __global__ void vector_div_vector_kernel(const int size,
                                          T* result,
@@ -127,6 +140,55 @@ void vector_mul_real_op<std::complex<double>, base_device::DEVICE_GPU>::operator
     vector_mul_real_wrapper(dim, result, vector, constant);
 }
 
+// vector operator: result[i] = vector[i] / constant
+template <>
+void vector_div_constant_op<double, base_device::DEVICE_GPU>::operator()(const int& dim,
+                                                                     double* result,
+                                                                     const double* vector,
+                                                                     const double constant)
+{
+    // In small cases, 1024 threads per block will only utilize 17 blocks, much less than 40
+    int thread = thread_per_block;
+    int block = (dim + thread - 1) / thread;
+    vector_div_constant_kernel<double><<<block, thread>>>(dim, result, vector, constant);
+
+    cudaCheckOnDebug();
+}
+
+template <typename FPTYPE>
+inline void vector_div_constant_wrapper(const int& dim,
+                                    std::complex<FPTYPE>* result,
+                                    const std::complex<FPTYPE>* vector,
+                                    const FPTYPE constant)
+{
+    thrust::complex<FPTYPE>* result_tmp = reinterpret_cast<thrust::complex<FPTYPE>*>(result);
+    const thrust::complex<FPTYPE>* vector_tmp = reinterpret_cast<const thrust::complex<FPTYPE>*>(vector);
+
+    int thread = thread_per_block;
+    int block = (dim + thread - 1) / thread;
+    vector_div_constant_kernel<thrust::complex<FPTYPE>><<<block, thread>>>(dim, result_tmp, vector_tmp, constant);
+
+    cudaCheckOnDebug();
+}
+
+template <>
+void vector_div_constant_op<std::complex<float>, base_device::DEVICE_GPU>::operator()(const int& dim,
+                                                                                  std::complex<float>* result,
+                                                                                  const std::complex<float>* vector,
+                                                                                  const float constant)
+{
+    vector_div_constant_wrapper(dim, result, vector, constant);
+}
+
+template <>
+void vector_div_constant_op<std::complex<double>, base_device::DEVICE_GPU>::operator()(const int& dim,
+                                                                                   std::complex<double>* result,
+                                                                                   const std::complex<double>* vector,
+                                                                                   const double constant)
+{
+    vector_div_constant_wrapper(dim, result, vector, constant);
+}
+
 // vector operator: result[i] = vector1[i](not complex) * vector2[i](not complex)
 template <>
 void vector_mul_vector_op<double, base_device::DEVICE_GPU>::operator()(const int& dim,
@@ -306,6 +368,10 @@ template struct vector_mul_real_op<std::complex<float>, base_device::DEVICE_GPU>
 template struct vector_mul_real_op<double, base_device::DEVICE_GPU>;
 template struct vector_mul_real_op<std::complex<double>, base_device::DEVICE_GPU>;
 
+template struct vector_div_constant_op<std::complex<float>, base_device::DEVICE_GPU>;
+template struct vector_div_constant_op<double, base_device::DEVICE_GPU>;
+template struct vector_div_constant_op<std::complex<double>, base_device::DEVICE_GPU>;
+
 template struct vector_mul_vector_op<float, base_device::DEVICE_GPU>;
 template struct vector_mul_vector_op<std::complex<float>, base_device::DEVICE_GPU>;
 template struct vector_mul_vector_op<double, base_device::DEVICE_GPU>;

diff --git a/source/module_base/kernels/math_kernel_op.h b/source/module_base/kernels/math_kernel_op.h
@@ -98,6 +98,21 @@ template <typename T, typename Device> struct vector_mul_vector_op {
   void operator()(const int& dim, T* result, const T* vector1, const Real* vector2, const bool& add = false);
 };
 
+// vector operator: result[i] = vector[i] / constant
+template <typename T, typename Device> struct vector_div_constant_op {
+  using Real = typename GetTypeReal<T>::type;
+  /// @brief result[i] = vector[i] / constant
+  ///
+  /// Input Parameters
+  /// \param dim : array size
+  /// \param vector : input array 
+  /// \param constant : input constant
+  ///
+  /// Output Parameters
+  /// \param result : output array
+  void operator()(const int& dim, T* result, const T* vector, const Real constant);
+};
+
 // vector operator: result[i] = vector1[i](complex) / vector2[i](not complex)
 template <typename T, typename Device> struct vector_div_vector_op {
   using Real = typename GetTypeReal<T>::type;
@@ -284,6 +299,48 @@ template <typename T, typename Device> struct matrixCopy {
   void operator()(const int& n1, const int& n2, const T* A, const int& LDA, T* B, const int& LDB);
 };
 
+template <typename T, typename Device>
+struct apply_eigenvalues_op {
+    using Real = typename GetTypeReal<T>::type;
+
+    void operator()(const Device *d, const int &nbase, const int &nbase_x, const int &notconv,
+                    T *result, const T *vectors, const Real *eigenvalues);
+};
+
+template <typename T, typename Device>
+struct precondition_op {
+    using Real = typename GetTypeReal<T>::type;
+    void operator()(const Device* d,
+                   const int& dim,
+                   T* psi_iter,
+                   const int& nbase,
+                   const int& notconv,
+                   const Real* precondition,  
+                   const Real* eigenvalues);
+};
+
+template <typename T, typename Device>
+struct normalize_op {
+    using Real = typename GetTypeReal<T>::type;
+    void operator()(const Device* d,
+                   const int& dim,
+                   T* psi_iter,
+                   const int& nbase,
+                   const int& notconv,
+                   Real* psi_norm = nullptr);
+};
+
+template <typename T>
+struct normalize_op<T, base_device::DEVICE_GPU> {
+    using Real = typename GetTypeReal<T>::type;
+    void operator()(const base_device::DEVICE_GPU* d,
+                   const int& dim,
+                   T* psi_iter,
+                   const int& nbase,
+                   const int& notconv,
+                   Real* psi_norm);
+};
+
 #if __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM
 // Partially specialize functor for base_device::GpuDevice.
 template <typename T> struct dot_real_op<T, base_device::DEVICE_GPU> {
@@ -306,6 +363,12 @@ template <typename T> struct vector_mul_vector_op<T, base_device::DEVICE_GPU> {
   void operator()(const int& dim, T* result, const T* vector1, const Real* vector2, const bool& add = false);
 };
 
+// vector operator: result[i] = vector[i] / constant
+template <typename T> struct vector_div_constant_op<T, base_device::DEVICE_GPU> {
+  using Real = typename GetTypeReal<T>::type;
+  void operator()(const int& dim, T* result, const T* vector, const Real constant);
+};
+
 // vector operator: result[i] = vector1[i](complex) / vector2[i](not complex)
 template <typename T> struct vector_div_vector_op<T, base_device::DEVICE_GPU> {
   using Real = typename GetTypeReal<T>::type;
@@ -334,6 +397,26 @@ template <typename T> struct matrixCopy<T, base_device::DEVICE_GPU> {
 void createGpuBlasHandle();
 void destoryBLAShandle();
 
+// vector operator: result[i] = -lambda[i] * vector[i]
+template <typename T> struct apply_eigenvalues_op<T, base_device::DEVICE_GPU> {
+    using Real = typename GetTypeReal<T>::type;
+
+    void operator()(const base_device::DEVICE_GPU *d, const int &nbase, const int &nbase_x, const int &notconv,
+                    T *result, const T *vectors, const Real *eigenvalues);
+};
+
+template <typename T>
+struct precondition_op<T, base_device::DEVICE_GPU> {
+    using Real = typename GetTypeReal<T>::type;
+    void operator()(const base_device::DEVICE_GPU* d,
+                   const int& dim,
+                   T* psi_iter,
+                   const int& nbase,
+                   const int& notconv,
+                   const Real* precondition,
+                   const Real* eigenvalues);
+};
+
 #endif // __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM
 } // namespace hsolver
 

diff --git a/source/module_base/kernels/math_kernel_op_vec.cpp b/source/module_base/kernels/math_kernel_op_vec.cpp
@@ -60,6 +60,22 @@ struct vector_mul_vector_op<T, base_device::DEVICE_CPU>
     }
 };
 
+template <typename T>
+struct vector_div_constant_op<T, base_device::DEVICE_CPU>
+{
+    using Real = typename GetTypeReal<T>::type;
+    void operator()(const int& dim, T* result, const T* vector, const Real constant)
+    {
+#ifdef _OPENMP
+#pragma omp parallel for schedule(static, 4096 / sizeof(Real))
+#endif
+        for (int i = 0; i < dim; i++)
+        {
+            result[i] = vector[i] / constant;
+        }
+    }
+};
+
 template <typename T>
 struct vector_div_vector_op<T, base_device::DEVICE_CPU>
 {
@@ -159,6 +175,10 @@ template struct vector_mul_vector_op<std::complex<float>, base_device::DEVICE_CP
 template struct vector_mul_vector_op<double, base_device::DEVICE_CPU>;
 template struct vector_mul_vector_op<std::complex<double>, base_device::DEVICE_CPU>;
 
+template struct vector_div_constant_op<std::complex<float>, base_device::DEVICE_CPU>;
+template struct vector_div_constant_op<double, base_device::DEVICE_CPU>;
+template struct vector_div_constant_op<std::complex<double>, base_device::DEVICE_CPU>;
+
 template struct vector_div_vector_op<std::complex<float>, base_device::DEVICE_CPU>;
 template struct vector_div_vector_op<double, base_device::DEVICE_CPU>;
 template struct vector_div_vector_op<std::complex<double>, base_device::DEVICE_CPU>;

diff --git a/source/module_base/kernels/rocm/math_kernel_op_vec.hip.cu b/source/module_base/kernels/rocm/math_kernel_op_vec.hip.cu
@@ -50,6 +50,19 @@ __launch_bounds__(1024) __global__ void vector_mul_vector_kernel(const int size,
     }
 }
 
+template <typename T>
+__launch_bounds__(1024) __global__ void vector_div_constant_kernel(const int size,
+                                                                 T* result,
+                                                                 const T* vector,
+                                                                 const typename GetTypeReal<T>::type constant)
+{
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i < size)
+    {
+        result[i] = vector[i] / constant;
+    }
+}
+
 template <typename T>
 __launch_bounds__(1024) __global__ void vector_div_vector_kernel(const int size,
                                                                  T* result,
@@ -142,6 +155,69 @@ void vector_mul_real_op<std::complex<double>, base_device::DEVICE_GPU>::operator
     hipCheckOnDebug();
 }
 
+// vector operator: result[i] = vector[i] / constant
+template <>
+void vector_div_constant_op<double, base_device::DEVICE_GPU>::operator()(const int& dim,
+                                                                     double* result,
+                                                                     const double* vector,
+                                                                     const double constant)
+{
+    int thread = 1024;
+    int block = (dim + thread - 1) / thread;
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(vector_div_constant_kernel<double>),
+                       dim3(block),
+                       dim3(thread),
+                       0,
+                       0,
+                       dim,
+                       result,
+                       vector,
+                       constant);
+
+    hipCheckOnDebug();
+}
+
+template <typename FPTYPE>
+inline void vector_div_constant_wrapper(const int& dim,
+                                    std::complex<FPTYPE>* result,
+                                    const std::complex<FPTYPE>* vector,
+                                    const FPTYPE constant)
+{
+    thrust::complex<FPTYPE>* result_tmp = reinterpret_cast<thrust::complex<FPTYPE>*>(result);
+    const thrust::complex<FPTYPE>* vector_tmp = reinterpret_cast<const thrust::complex<FPTYPE>*>(vector);
+    int thread = 1024;
+    int block = (dim + thread - 1) / thread;
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(vector_div_constant_kernel<thrust::complex<FPTYPE>>),
+                       dim3(block),
+                       dim3(thread),
+                       0,
+                       0,
+                       dim,
+                       result_tmp,
+                       vector_tmp,
+                       constant);
+
+    hipCheckOnDebug();
+}
+
+template <>
+void vector_div_constant_op<std::complex<float>, base_device::DEVICE_GPU>::operator()(const int& dim,
+                                                                                  std::complex<float>* result,
+                                                                                  const std::complex<float>* vector,
+                                                                                  const float constant)
+{
+    vector_div_constant_wrapper(dim, result, vector, constant);
+}
+
+template <>
+void vector_div_constant_op<std::complex<double>, base_device::DEVICE_GPU>::operator()(const int& dim,
+                                                                                   std::complex<double>* result,
+                                                                                   const std::complex<double>* vector,
+                                                                                   const double constant)
+{
+    vector_div_constant_wrapper(dim, result, vector, constant);
+}
+
 // vector operator: result[i] = vector1[i](not complex) * vector2[i](not complex)
 template <>
 void vector_mul_vector_op<double, base_device::DEVICE_GPU>::operator()(const int& dim,
@@ -357,6 +433,10 @@ template struct vector_mul_real_op<std::complex<float>, base_device::DEVICE_GPU>
 template struct vector_mul_real_op<double, base_device::DEVICE_GPU>;
 template struct vector_mul_real_op<std::complex<double>, base_device::DEVICE_GPU>;
 
+template struct vector_div_constant_op<std::complex<float>, base_device::DEVICE_GPU>;
+template struct vector_div_constant_op<double, base_device::DEVICE_GPU>;
+template struct vector_div_constant_op<std::complex<double>, base_device::DEVICE_GPU>;
+
 template struct vector_mul_vector_op<float, base_device::DEVICE_GPU>;
 template struct vector_mul_vector_op<std::complex<float>, base_device::DEVICE_GPU>;
 template struct vector_mul_vector_op<double, base_device::DEVICE_GPU>;

diff --git a/source/module_base/module_device/device.cpp b/source/module_base/module_device/device.cpp
@@ -5,7 +5,7 @@
 
 #include <base/macros/macros.h>
 #include <cstring>
-
+#include <iostream>
 #ifdef __MPI
 #include "mpi.h"
 #endif
@@ -166,6 +166,11 @@ int device_count = -1;
 cudaGetDeviceCount(&device_count);
 #elif defined(__ROCM)
 hipGetDeviceCount(&device_count);
+/***auto start_time = std::chrono::high_resolution_clock::now();
+std::cout << "Starting hipGetDeviceCount.." << std::endl;
+auto end_time = std::chrono::high_resolution_clock::now();
+auto duration = std::chrono::duration_cast<std::chrono::duration<double>>(end_time - start_time);
+std::cout << "hipGetDeviceCount took " << duration.count() << "seconds" << std::endl;***/
 #endif
 if (device_count <= 0)
 {
@@ -711,4 +716,4 @@ void record_device_memory<base_device::DEVICE_GPU>(
 #endif
 
 } // end of namespace information
-} // end of namespace base_device
+} // end of namespace base_device
diff --git a/source/module_esolver/esolver.cpp b/source/module_esolver/esolver.cpp
@@ -114,8 +114,10 @@ std::string determine_type()
     }
 
     GlobalV::ofs_running << "\n RUNNING WITH DEVICE  : " << device_info << " / "
-                         << base_device::information::get_device_info(PARAM.inp.device) << std::endl;
-
+                         << base_device::information::get_device_info(PARAM.inp.device) << std::endl; 
+    /***auto end_time = std::chrono::high_resolution_clock::now();
+    auto duration = std::chrono::duration_cast<std::chrono::duration<double>>(end_time - start_time);
+    std::cout << "hipGetDeviceInfo took " << duration.count() << " seconds" << std::endl;***/
     return esolver_type;
 }
 

diff --git a/source/module_esolver/esolver_ks_pw.cpp b/source/module_esolver/esolver_ks_pw.cpp
@@ -572,7 +572,8 @@ void ESolver_KS_PW<T, Device>::hamilt2rho_single(UnitCell& ucell,
                                                      hsolver::DiagoIterAssist<T, Device>::SCF_ITER,
                                                      hsolver::DiagoIterAssist<T, Device>::PW_DIAG_NMAX,
                                                      hsolver::DiagoIterAssist<T, Device>::PW_DIAG_THR,
-                                                     hsolver::DiagoIterAssist<T, Device>::need_subspace);
+                                                     hsolver::DiagoIterAssist<T, Device>::need_subspace,
+                                                     PARAM.inp.use_k_continuity);
 
         hsolver_pw_obj.solve(this->p_hamilt,
                              this->kspw_psi[0],