diff --git a/source/module_base/kernels/cuda/math_kernel_op_vec.cu b/source/module_base/kernels/cuda/math_kernel_op_vec.cu
index c0a3b102d7..182d4c54d9 100644
--- a/source/module_base/kernels/cuda/math_kernel_op_vec.cu
+++ b/source/module_base/kernels/cuda/math_kernel_op_vec.cu
@@ -52,6 +52,19 @@ __global__ void vector_mul_vector_kernel(const int size,
     }
 }
 
+template <typename T>
+__global__ void vector_div_constant_kernel(const int size,
+                                         T* result,
+                                         const T* vector,
+                                         const typename GetTypeReal<T>::type constant)
+{
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i < size)
+    {
+        result[i] = vector[i] / constant;
+    }
+}
+
 template <typename T>
 __global__ void vector_div_vector_kernel(const int size,
                                          T* result,
@@ -127,6 +140,55 @@ void vector_mul_real_op<std::complex<double>, base_device::DEVICE_GPU>::operator
     vector_mul_real_wrapper(dim, result, vector, constant);
 }
 
+// vector operator: result[i] = vector[i] / constant
+template <>
+void vector_div_constant_op<double, base_device::DEVICE_GPU>::operator()(const int& dim,
+                                                                     double* result,
+                                                                     const double* vector,
+                                                                     const double constant)
+{
+    // In small cases, 1024 threads per block will only utilize 17 blocks, much less than 40
+    int thread = thread_per_block;
+    int block = (dim + thread - 1) / thread;
+    vector_div_constant_kernel<double><<<block, thread>>>(dim, result, vector, constant);
+
+    cudaCheckOnDebug();
+}
+
+template <typename FPTYPE>
+inline void vector_div_constant_wrapper(const int& dim,
+                                    std::complex<FPTYPE>* result,
+                                    const std::complex<FPTYPE>* vector,
+                                    const FPTYPE constant)
+{
+    thrust::complex<FPTYPE>* result_tmp = reinterpret_cast<thrust::complex<FPTYPE>*>(result);
+    const thrust::complex<FPTYPE>* vector_tmp = reinterpret_cast<const thrust::complex<FPTYPE>*>(vector);
+
+    int thread = thread_per_block;
+    int block = (dim + thread - 1) / thread;
+    vector_div_constant_kernel<thrust::complex<FPTYPE>><<<block, thread>>>(dim, result_tmp, vector_tmp, constant);
+
+    cudaCheckOnDebug();
+}
+
+template <>
+void vector_div_constant_op<std::complex<float>, base_device::DEVICE_GPU>::operator()(const int& dim,
+                                                                                  std::complex<float>* result,
+                                                                                  const std::complex<float>* vector,
+                                                                                  const float constant)
+{
+    vector_div_constant_wrapper(dim, result, vector, constant);
+}
+
+template <>
+void vector_div_constant_op<std::complex<double>, base_device::DEVICE_GPU>::operator()(const int& dim,
+                                                                                   std::complex<double>* result,
+                                                                                   const std::complex<double>* vector,
+                                                                                   const double constant)
+{
+    vector_div_constant_wrapper(dim, result, vector, constant);
+}
+
 // vector operator: result[i] = vector1[i](not complex) * vector2[i](not complex)
 template <>
 void vector_mul_vector_op<double, base_device::DEVICE_GPU>::operator()(const int& dim,
@@ -306,6 +368,10 @@ template struct vector_mul_real_op<std::complex<float>, base_device::DEVICE_GPU>
 template struct vector_mul_real_op<double, base_device::DEVICE_GPU>;
 template struct vector_mul_real_op<std::complex<double>, base_device::DEVICE_GPU>;
 
+template struct vector_div_constant_op<std::complex<float>, base_device::DEVICE_GPU>;
+template struct vector_div_constant_op<double, base_device::DEVICE_GPU>;
+template struct vector_div_constant_op<std::complex<double>, base_device::DEVICE_GPU>;
+
 template struct vector_mul_vector_op<float, base_device::DEVICE_GPU>;
 template struct vector_mul_vector_op<std::complex<float>, base_device::DEVICE_GPU>;
 template struct vector_mul_vector_op<double, base_device::DEVICE_GPU>;
diff --git a/source/module_base/kernels/math_kernel_op.h b/source/module_base/kernels/math_kernel_op.h
index bae77b23d6..d26e96171c 100644
--- a/source/module_base/kernels/math_kernel_op.h
+++ b/source/module_base/kernels/math_kernel_op.h
@@ -98,6 +98,21 @@ template <typename T, typename Device> struct vector_mul_vector_op {
   void operator()(const int& dim, T* result, const T* vector1, const Real* vector2, const bool& add = false);
 };
 
+// vector operator: result[i] = vector[i] / constant
+template <typename T, typename Device> struct vector_div_constant_op {
+  using Real = typename GetTypeReal<T>::type;
+  /// @brief result[i] = vector[i] / constant
+  ///
+  /// Input Parameters
+  /// \param dim : array size
+  /// \param vector : input array 
+  /// \param constant : input constant
+  ///
+  /// Output Parameters
+  /// \param result : output array
+  void operator()(const int& dim, T* result, const T* vector, const Real constant);
+};
+
 // vector operator: result[i] = vector1[i](complex) / vector2[i](not complex)
 template <typename T, typename Device> struct vector_div_vector_op {
   using Real = typename GetTypeReal<T>::type;
@@ -284,6 +299,48 @@ template <typename T, typename Device> struct matrixCopy {
   void operator()(const int& n1, const int& n2, const T* A, const int& LDA, T* B, const int& LDB);
 };
 
+template <typename T, typename Device>
+struct apply_eigenvalues_op {
+    using Real = typename GetTypeReal<T>::type;
+
+    void operator()(const Device *d, const int &nbase, const int &nbase_x, const int &notconv,
+                    T *result, const T *vectors, const Real *eigenvalues);
+};
+
+template <typename T, typename Device>
+struct precondition_op {
+    using Real = typename GetTypeReal<T>::type;
+    void operator()(const Device* d,
+                   const int& dim,
+                   T* psi_iter,
+                   const int& nbase,
+                   const int& notconv,
+                   const Real* precondition,  
+                   const Real* eigenvalues);
+};
+
+template <typename T, typename Device>
+struct normalize_op {
+    using Real = typename GetTypeReal<T>::type;
+    void operator()(const Device* d,
+                   const int& dim,
+                   T* psi_iter,
+                   const int& nbase,
+                   const int& notconv,
+                   Real* psi_norm = nullptr);
+};
+
+template <typename T>
+struct normalize_op<T, base_device::DEVICE_GPU> {
+    using Real = typename GetTypeReal<T>::type;
+    void operator()(const base_device::DEVICE_GPU* d,
+                   const int& dim,
+                   T* psi_iter,
+                   const int& nbase,
+                   const int& notconv,
+                   Real* psi_norm);
+};
+
 #if __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM
 // Partially specialize functor for base_device::GpuDevice.
 template <typename T> struct dot_real_op<T, base_device::DEVICE_GPU> {
@@ -306,6 +363,12 @@ template <typename T> struct vector_mul_vector_op<T, base_device::DEVICE_GPU> {
   void operator()(const int& dim, T* result, const T* vector1, const Real* vector2, const bool& add = false);
 };
 
+// vector operator: result[i] = vector[i] / constant
+template <typename T> struct vector_div_constant_op<T, base_device::DEVICE_GPU> {
+  using Real = typename GetTypeReal<T>::type;
+  void operator()(const int& dim, T* result, const T* vector, const Real constant);
+};
+
 // vector operator: result[i] = vector1[i](complex) / vector2[i](not complex)
 template <typename T> struct vector_div_vector_op<T, base_device::DEVICE_GPU> {
   using Real = typename GetTypeReal<T>::type;
@@ -334,6 +397,26 @@ template <typename T> struct matrixCopy<T, base_device::DEVICE_GPU> {
 void createGpuBlasHandle();
 void destoryBLAShandle();
 
+// vector operator: result[i] = -lambda[i] * vector[i]
+template <typename T> struct apply_eigenvalues_op<T, base_device::DEVICE_GPU> {
+    using Real = typename GetTypeReal<T>::type;
+
+    void operator()(const base_device::DEVICE_GPU *d, const int &nbase, const int &nbase_x, const int &notconv,
+                    T *result, const T *vectors, const Real *eigenvalues);
+};
+
+template <typename T>
+struct precondition_op<T, base_device::DEVICE_GPU> {
+    using Real = typename GetTypeReal<T>::type;
+    void operator()(const base_device::DEVICE_GPU* d,
+                   const int& dim,
+                   T* psi_iter,
+                   const int& nbase,
+                   const int& notconv,
+                   const Real* precondition,
+                   const Real* eigenvalues);
+};
+
 #endif // __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM
 } // namespace hsolver
 
diff --git a/source/module_base/kernels/math_kernel_op_vec.cpp b/source/module_base/kernels/math_kernel_op_vec.cpp
index 4030edbb48..605f97ff19 100644
--- a/source/module_base/kernels/math_kernel_op_vec.cpp
+++ b/source/module_base/kernels/math_kernel_op_vec.cpp
@@ -60,6 +60,22 @@ struct vector_mul_vector_op<T, base_device::DEVICE_CPU>
     }
 };
 
+template <typename T>
+struct vector_div_constant_op<T, base_device::DEVICE_CPU>
+{
+    using Real = typename GetTypeReal<T>::type;
+    void operator()(const int& dim, T* result, const T* vector, const Real constant)
+    {
+#ifdef _OPENMP
+#pragma omp parallel for schedule(static, 4096 / sizeof(Real))
+#endif
+        for (int i = 0; i < dim; i++)
+        {
+            result[i] = vector[i] / constant;
+        }
+    }
+};
+
 template <typename T>
 struct vector_div_vector_op<T, base_device::DEVICE_CPU>
 {
@@ -159,6 +175,10 @@ template struct vector_mul_vector_op<std::complex<float>, base_device::DEVICE_CP
 template struct vector_mul_vector_op<double, base_device::DEVICE_CPU>;
 template struct vector_mul_vector_op<std::complex<double>, base_device::DEVICE_CPU>;
 
+template struct vector_div_constant_op<std::complex<float>, base_device::DEVICE_CPU>;
+template struct vector_div_constant_op<double, base_device::DEVICE_CPU>;
+template struct vector_div_constant_op<std::complex<double>, base_device::DEVICE_CPU>;
+
 template struct vector_div_vector_op<std::complex<float>, base_device::DEVICE_CPU>;
 template struct vector_div_vector_op<double, base_device::DEVICE_CPU>;
 template struct vector_div_vector_op<std::complex<double>, base_device::DEVICE_CPU>;
diff --git a/source/module_base/kernels/rocm/math_kernel_op_vec.hip.cu b/source/module_base/kernels/rocm/math_kernel_op_vec.hip.cu
index a9009fe5c6..9e05160c54 100644
--- a/source/module_base/kernels/rocm/math_kernel_op_vec.hip.cu
+++ b/source/module_base/kernels/rocm/math_kernel_op_vec.hip.cu
@@ -50,6 +50,19 @@ __launch_bounds__(1024) __global__ void vector_mul_vector_kernel(const int size,
     }
 }
 
+template <typename T>
+__launch_bounds__(1024) __global__ void vector_div_constant_kernel(const int size,
+                                                                 T* result,
+                                                                 const T* vector,
+                                                                 const typename GetTypeReal<T>::type constant)
+{
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i < size)
+    {
+        result[i] = vector[i] / constant;
+    }
+}
+
 template <typename T>
 __launch_bounds__(1024) __global__ void vector_div_vector_kernel(const int size,
                                                                  T* result,
@@ -142,6 +155,69 @@ void vector_mul_real_op<std::complex<double>, base_device::DEVICE_GPU>::operator
     hipCheckOnDebug();
 }
 
+// vector operator: result[i] = vector[i] / constant
+template <>
+void vector_div_constant_op<double, base_device::DEVICE_GPU>::operator()(const int& dim,
+                                                                     double* result,
+                                                                     const double* vector,
+                                                                     const double constant)
+{
+    int thread = 1024;
+    int block = (dim + thread - 1) / thread;
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(vector_div_constant_kernel<double>),
+                       dim3(block),
+                       dim3(thread),
+                       0,
+                       0,
+                       dim,
+                       result,
+                       vector,
+                       constant);
+
+    hipCheckOnDebug();
+}
+
+template <typename FPTYPE>
+inline void vector_div_constant_wrapper(const int& dim,
+                                    std::complex<FPTYPE>* result,
+                                    const std::complex<FPTYPE>* vector,
+                                    const FPTYPE constant)
+{
+    thrust::complex<FPTYPE>* result_tmp = reinterpret_cast<thrust::complex<FPTYPE>*>(result);
+    const thrust::complex<FPTYPE>* vector_tmp = reinterpret_cast<const thrust::complex<FPTYPE>*>(vector);
+    int thread = 1024;
+    int block = (dim + thread - 1) / thread;
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(vector_div_constant_kernel<thrust::complex<FPTYPE>>),
+                       dim3(block),
+                       dim3(thread),
+                       0,
+                       0,
+                       dim,
+                       result_tmp,
+                       vector_tmp,
+                       constant);
+
+    hipCheckOnDebug();
+}
+
+template <>
+void vector_div_constant_op<std::complex<float>, base_device::DEVICE_GPU>::operator()(const int& dim,
+                                                                                  std::complex<float>* result,
+                                                                                  const std::complex<float>* vector,
+                                                                                  const float constant)
+{
+    vector_div_constant_wrapper(dim, result, vector, constant);
+}
+
+template <>
+void vector_div_constant_op<std::complex<double>, base_device::DEVICE_GPU>::operator()(const int& dim,
+                                                                                   std::complex<double>* result,
+                                                                                   const std::complex<double>* vector,
+                                                                                   const double constant)
+{
+    vector_div_constant_wrapper(dim, result, vector, constant);
+}
+
 // vector operator: result[i] = vector1[i](not complex) * vector2[i](not complex)
 template <>
 void vector_mul_vector_op<double, base_device::DEVICE_GPU>::operator()(const int& dim,
@@ -357,6 +433,10 @@ template struct vector_mul_real_op<std::complex<float>, base_device::DEVICE_GPU>
 template struct vector_mul_real_op<double, base_device::DEVICE_GPU>;
 template struct vector_mul_real_op<std::complex<double>, base_device::DEVICE_GPU>;
 
+template struct vector_div_constant_op<std::complex<float>, base_device::DEVICE_GPU>;
+template struct vector_div_constant_op<double, base_device::DEVICE_GPU>;
+template struct vector_div_constant_op<std::complex<double>, base_device::DEVICE_GPU>;
+
 template struct vector_mul_vector_op<float, base_device::DEVICE_GPU>;
 template struct vector_mul_vector_op<std::complex<float>, base_device::DEVICE_GPU>;
 template struct vector_mul_vector_op<double, base_device::DEVICE_GPU>;
diff --git a/source/module_base/module_device/device.cpp b/source/module_base/module_device/device.cpp
index b20ea9f3ad..9373a5a31a 100644
--- a/source/module_base/module_device/device.cpp
+++ b/source/module_base/module_device/device.cpp
@@ -5,7 +5,7 @@
 
 #include <base/macros/macros.h>
 #include <cstring>
-
+#include <iostream>
 #ifdef __MPI
 #include "mpi.h"
 #endif
@@ -166,6 +166,11 @@ int device_count = -1;
 cudaGetDeviceCount(&device_count);
 #elif defined(__ROCM)
 hipGetDeviceCount(&device_count);
+/***auto start_time = std::chrono::high_resolution_clock::now();
+std::cout << "Starting hipGetDeviceCount.." << std::endl;
+auto end_time = std::chrono::high_resolution_clock::now();
+auto duration = std::chrono::duration_cast<std::chrono::duration<double>>(end_time - start_time);
+std::cout << "hipGetDeviceCount took " << duration.count() << "seconds" << std::endl;***/
 #endif
 if (device_count <= 0)
 {
@@ -711,4 +716,4 @@ void record_device_memory<base_device::DEVICE_GPU>(
 #endif
 
 } // end of namespace information
-} // end of namespace base_device
\ No newline at end of file
+} // end of namespace base_device
diff --git a/source/module_esolver/esolver.cpp b/source/module_esolver/esolver.cpp
index 9d5559dd0d..baf5caf4cc 100644
--- a/source/module_esolver/esolver.cpp
+++ b/source/module_esolver/esolver.cpp
@@ -114,8 +114,10 @@ std::string determine_type()
     }
 
     GlobalV::ofs_running << "\n RUNNING WITH DEVICE  : " << device_info << " / "
-                         << base_device::information::get_device_info(PARAM.inp.device) << std::endl;
-
+                         << base_device::information::get_device_info(PARAM.inp.device) << std::endl; 
+    /***auto end_time = std::chrono::high_resolution_clock::now();
+    auto duration = std::chrono::duration_cast<std::chrono::duration<double>>(end_time - start_time);
+    std::cout << "hipGetDeviceInfo took " << duration.count() << " seconds" << std::endl;***/
     return esolver_type;
 }
 
diff --git a/source/module_esolver/esolver_ks_pw.cpp b/source/module_esolver/esolver_ks_pw.cpp
index 14afe6c3c9..1673f537bc 100644
--- a/source/module_esolver/esolver_ks_pw.cpp
+++ b/source/module_esolver/esolver_ks_pw.cpp
@@ -572,7 +572,8 @@ void ESolver_KS_PW<T, Device>::hamilt2rho_single(UnitCell& ucell,
                                                      hsolver::DiagoIterAssist<T, Device>::SCF_ITER,
                                                      hsolver::DiagoIterAssist<T, Device>::PW_DIAG_NMAX,
                                                      hsolver::DiagoIterAssist<T, Device>::PW_DIAG_THR,
-                                                     hsolver::DiagoIterAssist<T, Device>::need_subspace);
+                                                     hsolver::DiagoIterAssist<T, Device>::need_subspace,
+                                                     PARAM.inp.use_k_continuity);
 
         hsolver_pw_obj.solve(this->p_hamilt,
                              this->kspw_psi[0],
diff --git a/source/module_hsolver/diago_dav_subspace.cpp b/source/module_hsolver/diago_dav_subspace.cpp
index 9e8aa2f066..3440a778e8 100644
--- a/source/module_hsolver/diago_dav_subspace.cpp
+++ b/source/module_hsolver/diago_dav_subspace.cpp
@@ -6,6 +6,7 @@
 #include "module_base/timer.h"
 #include "module_hsolver/kernels/dngvd_op.h"
 #include "module_base/kernels/math_kernel_op.h"
+#include "module_hsolver/kernels/bpcg_kernel_op.h"
 #include "module_base/kernels/dsp/dsp_connector.h"
 
 #include "module_hsolver/diag_hs_para.h"
@@ -293,26 +294,28 @@ void Diago_DavSubspace<T, Device>::cal_grad(const HPsiFunc& hpsi_func,
                          psi_iter + (nbase) * this->dim,
                          this->dim);
 
-    std::vector<Real> e_temp_cpu(nbase, 0);
+    // Eigenvalues operation section
+    std::vector<Real> e_temp_cpu(this->notconv, 0);
     Real* e_temp_hd = e_temp_cpu.data();
-    if(this->device == base_device::GpuDevice)
+    if (this->device == base_device::GpuDevice)
     {
         e_temp_hd = nullptr;
         resmem_real_op()(e_temp_hd, nbase);
     }
-    for (int m = 0; m < notconv; m++)
+
+    for (int m = 0; m < this->notconv; m++)
     {
-        e_temp_cpu.assign(nbase, (-1.0 * (*eigenvalue_iter)[m]));
-        if (this->device == base_device::GpuDevice)
-        {
-            syncmem_var_h2d_op()(e_temp_hd, e_temp_cpu.data(), nbase);
-        }
-        ModuleBase::vector_mul_vector_op<T, Device>()(nbase,
-                                                      vcc + m * this->nbase_x,
-                                                      vcc + m * this->nbase_x,
-                                                      e_temp_hd);
+        e_temp_cpu[m] = -(*eigenvalue_iter)[m];
     }
-    if(this->device == base_device::GpuDevice)
+
+    if (this->device == base_device::GpuDevice)
+    {
+        syncmem_var_h2d_op()(e_temp_hd, e_temp_cpu.data(), this->notconv);
+    }
+    
+    apply_eigenvalues_op<T, Device>()(nbase, this->nbase_x, this->notconv, this->vcc, this->vcc, e_temp_hd);
+
+    if (this->device == base_device::GpuDevice)
     {
         delmem_real_op()(e_temp_hd);
     }
@@ -336,48 +339,58 @@ void Diago_DavSubspace<T, Device>::cal_grad(const HPsiFunc& hpsi_func,
          psi_iter + nbase * this->dim,
          this->dim);
 
-    // "precondition!!!"
-    std::vector<Real> pre(this->dim, 0.0);
-    for (int m = 0; m < notconv; m++)
-    {
-        for (size_t i = 0; i < this->dim; i++)
-        {
-            // pre[i] = std::abs(this->precondition[i] - (*eigenvalue_iter)[m]);
-            double x = std::abs(this->precondition[i] - (*eigenvalue_iter)[m]);
-            pre[i] = 0.5 * (1.0 + x + sqrt(1 + (x - 1.0) * (x - 1.0)));
-        }
+    // Precondition section
 #if defined(__CUDA) || defined(__ROCM)
-        if (this->device == base_device::GpuDevice)
-        {
-            syncmem_var_h2d_op()(this->d_precondition, pre.data(), this->dim);
-            ModuleBase::vector_div_vector_op<T, Device>()(this->dim,
-                                                          psi_iter + (nbase + m) * this->dim,
-                                                          psi_iter + (nbase + m) * this->dim,
-                                                          this->d_precondition);
-        }
-        else
+    if (this->device == base_device::GpuDevice)
+    {
+        Real* eigenvalues_gpu = nullptr;
+        resmem_real_op()(eigenvalues_gpu, notconv);
+        syncmem_var_h2d_op()(eigenvalues_gpu, (*eigenvalue_iter).data(), notconv);
+        
+        precondition_op<T, Device>()(this->dim,
+                                    psi_iter,
+                                    nbase,
+                                    notconv,
+                                    d_precondition,
+                                    eigenvalues_gpu);
+        delmem_real_op()(eigenvalues_gpu);
+    }
+    else
 #endif
-        {
-            ModuleBase::vector_div_vector_op<T, Device>()(this->dim,
-                                                          psi_iter + (nbase + m) * this->dim,
-                                                          psi_iter + (nbase + m) * this->dim,
-                                                          pre.data());
-        }
+    {
+        precondition_op<T, Device>()(this->dim,
+                                    psi_iter,
+                                    nbase,
+                                    notconv,
+                                    this->precondition.data(),
+                                    (*eigenvalue_iter).data());
     }
 
-    // "normalize!!!" in order to improve numerical stability of subspace diagonalization
-    for (size_t i = 0; i < notconv; i++)
+    // Normalize section
+#if defined(__CUDA) || defined(__ROCM)
+    if (this->device == base_device::GpuDevice)
+    {
+        Real* psi_norm = nullptr;
+        resmem_real_op()(psi_norm, notconv);
+        using setmem_real_op = base_device::memory::set_memory_op<Real, Device>;
+        setmem_real_op()(psi_norm, 0.0, notconv);
+        
+        normalize_op<T, Device>()(this->dim,
+                                psi_iter,
+                                nbase,
+                                notconv,
+                                psi_norm);
+        delmem_real_op()(psi_norm);
+    }
+    else
+#endif
     {
-        Real psi_norm = ModuleBase::dot_real_op<T, Device>()(this->dim,
-                                                           psi_iter + (nbase + i) * this->dim,
-                                                           psi_iter + (nbase + i) * this->dim,
-                                                           true);
-        assert(psi_norm > 0.0);
-        psi_norm = sqrt(psi_norm);
-        ModuleBase::vector_mul_real_op<T, Device>()(this->dim,
-                                                       psi_iter + (nbase + i) * this->dim,
-                                                       psi_iter + (nbase + i) * this->dim,
-                                                       Real(1.0 / psi_norm));
+        Real* psi_norm = nullptr;
+        normalize_op<T, Device>()(this->dim,
+                                psi_iter,
+                                nbase,
+                                notconv,
+                                psi_norm);
     }
 
     // update hpsi[:, nbase:nbase+notconv]
diff --git a/source/module_hsolver/hsolver_pw.cpp b/source/module_hsolver/hsolver_pw.cpp
index a8b9c02eec..2868fd2f0a 100644
--- a/source/module_hsolver/hsolver_pw.cpp
+++ b/source/module_hsolver/hsolver_pw.cpp
@@ -282,51 +282,106 @@ void HSolverPW<T, Device>::solve(hamilt::Hamilt<T, Device>* pHamilt,
     std::vector<Real> eigenvalues(this->wfc_basis->nks * psi.get_nbands(), 0.0);
     ethr_band.resize(psi.get_nbands(), this->diag_thr);
 
-    /// Loop over k points for solve Hamiltonian to charge density
-    for (int ik = 0; ik < this->wfc_basis->nks; ++ik)
-    {
-        /// update H(k) for each k point
-        pHamilt->updateHk(ik);
+    // Initialize k-point continuity if enabled
+    static int count = 0;
+    if (use_k_continuity) {
+        build_k_neighbors();
+    }
+
+    // Loop over k points for solve Hamiltonian to charge density
+    if (use_k_continuity) {
+        // K-point continuity case
+        for (int i = 0; i < this->wfc_basis->nks; ++i)
+        {
+            const int ik = k_order[i];
+            
+            // update H(k) for each k point
+            pHamilt->updateHk(ik);
 
 #ifdef USE_PAW
-        this->paw_func_in_kloop(ik, tpiba);
+            this->paw_func_in_kloop(ik, tpiba);
 #endif
 
-        /// update psi pointer for each k point
-        psi.fix_k(ik);
+            // update psi pointer for each k point
+            psi.fix_k(ik);
+
+            // If using k-point continuity and not first k-point, propagate from parent
+            if (ik > 0 && count == 0 && k_parent.find(ik) != k_parent.end()) {
+                propagate_psi(psi, k_parent[ik], ik);
+            }
 
-        // template add precondition calculating here
-        update_precondition(precondition, ik, this->wfc_basis->npwk[ik], Real(pes->pot->get_vl_of_0()));
+            // template add precondition calculating here
+            update_precondition(precondition, ik, this->wfc_basis->npwk[ik], Real(pes->pot->get_vl_of_0()));
 
-        // use smooth threshold for all iter methods
-        if (PARAM.inp.diago_smooth_ethr == true)
-        {
-            this->cal_smooth_ethr(pes->klist->wk[ik],
-                                  &pes->wg(ik, 0),
-                                  DiagoIterAssist<T, Device>::PW_DIAG_THR,
-                                  ethr_band);
-        }
+            // use smooth threshold for all iter methods
+            if (PARAM.inp.diago_smooth_ethr == true)
+            {
+                this->cal_smooth_ethr(pes->klist->wk[ik],
+                                    &pes->wg(ik, 0),
+                                    DiagoIterAssist<T, Device>::PW_DIAG_THR,
+                                    ethr_band);
+            }
 
 #ifdef USE_PAW
-        this->call_paw_cell_set_currentk(ik);
+            this->call_paw_cell_set_currentk(ik);
 #endif
 
-        /// solve eigenvector and eigenvalue for H(k)
-        this->hamiltSolvePsiK(pHamilt,
-                              psi,
-                              precondition,
-                              eigenvalues.data() + ik * psi.get_nbands(),
-                              this->wfc_basis->nks);
+            // solve eigenvector and eigenvalue for H(k)
+            this->hamiltSolvePsiK(pHamilt, psi, precondition, eigenvalues.data() + ik * psi.get_nbands(), this->wfc_basis->nks);
 
-        if (skip_charge)
+            if (skip_charge)
+            {
+                GlobalV::ofs_running << "Average iterative diagonalization steps for k-points " << ik
+                                    << " is: " << DiagoIterAssist<T, Device>::avg_iter
+                                    << " ; where current threshold is: " << this->diag_thr << " . " << std::endl;
+                DiagoIterAssist<T, Device>::avg_iter = 0.0;
+            }
+        }
+    }
+    else {
+        // Original code without k-point continuity
+        for (int ik = 0; ik < this->wfc_basis->nks; ++ik)
         {
-            GlobalV::ofs_running << "Average iterative diagonalization steps for k-points " << ik
-                                 << " is: " << DiagoIterAssist<T, Device>::avg_iter
-                                 << " ; where current threshold is: " << this->diag_thr << " . " << std::endl;
-            DiagoIterAssist<T, Device>::avg_iter = 0.0;
+            // update H(k) for each k point
+            pHamilt->updateHk(ik);
+
+#ifdef USE_PAW
+            this->paw_func_in_kloop(ik, tpiba);
+#endif
+
+            // update psi pointer for each k point
+            psi.fix_k(ik);
+
+            // template add precondition calculating here
+            update_precondition(precondition, ik, this->wfc_basis->npwk[ik], Real(pes->pot->get_vl_of_0()));
+
+            // use smooth threshold for all iter methods
+            if (PARAM.inp.diago_smooth_ethr == true)
+            {
+                this->cal_smooth_ethr(pes->klist->wk[ik],
+                                    &pes->wg(ik, 0),
+                                    DiagoIterAssist<T, Device>::PW_DIAG_THR,
+                                    ethr_band);
+            }
+
+#ifdef USE_PAW
+            this->call_paw_cell_set_currentk(ik);
+#endif
+
+            // solve eigenvector and eigenvalue for H(k)
+            this->hamiltSolvePsiK(pHamilt, psi, precondition, eigenvalues.data() + ik * psi.get_nbands(), this->wfc_basis->nks);
+
+            if (skip_charge)
+            {
+                GlobalV::ofs_running << "Average iterative diagonalization steps for k-points " << ik
+                                    << " is: " << DiagoIterAssist<T, Device>::avg_iter
+                                    << " ; where current threshold is: " << this->diag_thr << " . " << std::endl;
+                DiagoIterAssist<T, Device>::avg_iter = 0.0;
+            }
         }
-        /// calculate the contribution of Psi for charge density rho
     }
+    
+    count++;
     // END Loop over k points
 
     // copy eigenvalues to ekb in ElecState
@@ -661,6 +716,101 @@ void HSolverPW<T, Device>::output_iterInfo()
     }
 }
 
+template <typename T, typename Device>
+void HSolverPW<T, Device>::build_k_neighbors() {
+    const int nk = this->wfc_basis->nks;
+    kvecs_c.resize(nk);
+    k_order.clear();
+    k_order.reserve(nk);
+    
+    // Store k-points and corresponding indices
+    struct KPoint {
+        ModuleBase::Vector3<double> kvec;
+        int index;
+        double norm;
+        
+        KPoint(const ModuleBase::Vector3<double>& v, int i) : 
+            kvec(v), index(i), norm(v.norm()) {}
+    };
+    
+    // Build k-point list
+    std::vector<KPoint> klist;
+    for (int ik = 0; ik < nk; ++ik) {
+        kvecs_c[ik] = this->wfc_basis->kvec_c[ik];
+        klist.push_back(KPoint(kvecs_c[ik], ik));
+    }
+    
+    // Sort k-points by distance from origin
+    std::sort(klist.begin(), klist.end(),
+        [](const KPoint& a, const KPoint& b) {
+            return a.norm < b.norm;
+        });
+    
+    // Build parent-child relationships
+    k_order.push_back(klist[0].index);
+    
+    // Find nearest processed k-point as parent for each k-point
+    for (int i = 1; i < nk; ++i) {
+        int current_k = klist[i].index;
+        double min_dist = 1e10;
+        int parent = -1;
+        
+        // find the nearest k-point as parent
+        for (int j = 0; j < k_order.size(); ++j) {
+            int processed_k = k_order[j];
+            double dist = (kvecs_c[current_k] - kvecs_c[processed_k]).norm2();
+            if (dist < min_dist) {
+                min_dist = dist;
+                parent = processed_k;
+            }
+        }
+        
+        k_parent[current_k] = parent;
+        k_order.push_back(current_k);
+    }
+}
+
+template <typename T, typename Device>
+void HSolverPW<T, Device>::propagate_psi(psi::Psi<T, Device>& psi, const int from_ik, const int to_ik) {
+    const int nbands = psi.get_nbands();
+    const int npwk = this->wfc_basis->npwk[to_ik];
+    
+    // Get k-point difference
+    ModuleBase::Vector3<double> dk = kvecs_c[to_ik] - kvecs_c[from_ik];
+    
+    // Allocate porter locally
+    T* porter = nullptr;
+    resmem_complex_op()(porter, this->wfc_basis->nmaxgr, "HSolverPW::porter");
+    
+    // Process each band
+    for (int ib = 0; ib < nbands; ib++)
+    {
+        // Fix current k-point and band
+        // psi.fix_k(from_ik);
+        
+        // FFT to real space
+        // this->wfc_basis->recip_to_real(this->ctx, psi.get_pointer(ib), porter, from_ik);
+        this->wfc_basis->recip_to_real(this->ctx, &psi(from_ik, ib, 0), porter, from_ik);
+        
+        // Apply phase factor
+        //     // TODO: Check how to get the r vector
+        //     ModuleBase::Vector3<double> r = this->wfc_basis->get_ir2r(ir);
+        //     double phase = this->wfc_basis->tpiba * (dk.x * r.x + dk.y * r.y + dk.z * r.z);
+        //     psi_real[ir] *= std::exp(std::complex<double>(0.0, phase));
+        // }
+        
+        // Fix k-point for target
+        // psi.fix_k(to_ik);
+        
+        // FFT back to reciprocal space
+        // this->wfc_basis->real_to_recip(this->ctx, porter, psi.get_pointer(ib), to_ik, true);
+        this->wfc_basis->real_to_recip(this->ctx, porter, &psi(to_ik, ib, 0), to_ik);
+    }
+
+    // Clean up porter
+    delmem_complex_op()(porter);
+}
+
 template class HSolverPW<std::complex<float>, base_device::DEVICE_CPU>;
 template class HSolverPW<std::complex<double>, base_device::DEVICE_CPU>;
 #if ((defined __CUDA) || (defined __ROCM))
diff --git a/source/module_hsolver/hsolver_pw.h b/source/module_hsolver/hsolver_pw.h
index 5f44bc10a4..3a52293fb2 100644
--- a/source/module_hsolver/hsolver_pw.h
+++ b/source/module_hsolver/hsolver_pw.h
@@ -5,6 +5,8 @@
 #include "module_hamilt_general/hamilt.h"
 #include "module_base/macros.h"
 #include "module_basis/module_pw/pw_basis_k.h"
+#include <unordered_map>
+#include "module_base/memory.h"
 
 namespace hsolver
 {
@@ -17,6 +19,9 @@ class HSolverPW
     // return T if T is real type(float, double),
     // otherwise return the real type of T(complex<float>, complex<double>)
     using Real = typename GetTypeReal<T>::type;
+    using resmem_complex_op = base_device::memory::resize_memory_op<T, Device>;
+    using delmem_complex_op = base_device::memory::delete_memory_op<T, Device>;
+    using setmem_complex_op = base_device::memory::set_memory_op<T, Device>;
 
   public:
     HSolverPW(ModulePW::PW_Basis_K* wfc_basis_in,
@@ -29,10 +34,12 @@ class HSolverPW
               const int scf_iter_in,
               const int diag_iter_max_in,
               const double diag_thr_in,
-              const bool need_subspace_in)
+              const bool need_subspace_in,
+              const bool use_k_continuity_in = false)
         : wfc_basis(wfc_basis_in), calculation_type(calculation_type_in), basis_type(basis_type_in), method(method_in),
           use_paw(use_paw_in), use_uspp(use_uspp_in), nspin(nspin_in), scf_iter(scf_iter_in),
-          diag_iter_max(diag_iter_max_in), diag_thr(diag_thr_in), need_subspace(need_subspace_in){};
+          diag_iter_max(diag_iter_max_in), diag_thr(diag_thr_in), need_subspace(need_subspace_in),
+          use_k_continuity(use_k_continuity_in) {};
 
     /// @brief solve function for pw
     /// @param pHamilt interface to hamilt
@@ -50,6 +57,7 @@ class HSolverPW
                const double tpiba,
                const int nat);
 
+
   protected:
     // diago caller
     void hamiltSolvePsiK(hamilt::Hamilt<T, Device>* hm,
@@ -78,6 +86,8 @@ class HSolverPW
 
     const bool need_subspace; // for cg or dav_subspace
 
+    const bool use_k_continuity;
+
   protected:
     Device* ctx = {};
 
@@ -98,8 +108,16 @@ class HSolverPW
 
     void paw_func_after_kloop(psi::Psi<T, Device>& psi, elecstate::ElecState* pes,const double tpiba,const int nat);
 #endif
+
+    // K-point continuity related members
+    std::vector<int> k_order;
+    std::unordered_map<int, int> k_parent;
+    std::vector<ModuleBase::Vector3<double>> kvecs_c;
+    
+    void build_k_neighbors();
+    void propagate_psi(psi::Psi<T, Device>& psi, const int from_ik, const int to_ik);
 };
 
 } // namespace hsolver
 
-#endif
\ No newline at end of file
+#endif
diff --git a/source/module_hsolver/kernels/bpcg_kernel_op.cpp b/source/module_hsolver/kernels/bpcg_kernel_op.cpp
index bf05ac94e1..97cb7e5302 100644
--- a/source/module_hsolver/kernels/bpcg_kernel_op.cpp
+++ b/source/module_hsolver/kernels/bpcg_kernel_op.cpp
@@ -2,6 +2,7 @@
 #include "module_base/blas_connector.h"
 #include "module_base/kernels/math_kernel_op.h"
 #include "module_base/parallel_reduce.h"
+#include <vector>
 namespace hsolver
 {
 
@@ -106,8 +107,93 @@ struct calc_grad_with_block_op<T, base_device::DEVICE_CPU>
     }
 };
 
+template <typename T>
+struct apply_eigenvalues_op<T, base_device::DEVICE_CPU>
+{
+    using Real = typename GetTypeReal<T>::type;
+    void operator()(const int& nbase, const int& nbase_x, const int& notconv, T* result, const T* vectors, const Real* eigenvalues)
+    {
+        for (int m = 0; m < notconv; m++)
+        {
+            for (int idx = 0; idx < nbase; idx++)
+            {
+                result[m * nbase_x + idx] = eigenvalues[m] * vectors[m * nbase_x + idx];
+            }
+        }
+    }
+};
+
+template <typename T>
+struct precondition_op<T, base_device::DEVICE_CPU> {
+    using Real = typename GetTypeReal<T>::type;
+    void operator()(const int& dim,
+                   T* psi_iter,
+                   const int& nbase,
+                   const int& notconv,
+                   const Real* precondition,
+                   const Real* eigenvalues)
+    {
+        std::vector<Real> pre(dim, 0.0);
+        for (int m = 0; m < notconv; m++)
+        {
+            for (size_t i = 0; i < dim; i++)
+            {
+                Real x = std::abs(precondition[i] - eigenvalues[m]);
+                pre[i] = 0.5 * (1.0 + x + sqrt(1 + (x - 1.0) * (x - 1.0)));
+            }
+            ModuleBase::vector_div_vector_op<T, base_device::DEVICE_CPU>()(
+                                                             dim,
+                                                             psi_iter + (nbase + m) * dim,
+                                                             psi_iter + (nbase + m) * dim,
+                                                             pre.data());
+        }
+    }
+};
+
+template <typename T>
+struct normalize_op<T, base_device::DEVICE_CPU> {
+    void operator()(const int& dim,
+                   T* psi_iter,
+                   const int& nbase,
+                   const int& notconv,
+                   typename GetTypeReal<T>::type* psi_norm)
+    {
+        using Real = typename GetTypeReal<T>::type;
+        for (int m = 0; m < notconv; m++)
+        {
+            // Calculate norm using dot_real_op
+            Real psi_m_norm = ModuleBase::dot_real_op<T, base_device::DEVICE_CPU>()(
+                                                                dim,
+                                                                psi_iter + (nbase + m) * dim,
+                                                                psi_iter + (nbase + m) * dim,
+                                                                true);
+            assert(psi_m_norm > 0.0);
+            psi_m_norm = sqrt(psi_m_norm);
+
+            // Normalize using vector_div_constant_op
+            ModuleBase::vector_div_constant_op<T, base_device::DEVICE_CPU>()(
+                                                              dim,
+                                                              psi_iter + (nbase + m) * dim,
+                                                              psi_iter + (nbase + m) * dim,
+                                                              psi_m_norm);
+            if (psi_norm) {
+                psi_norm[m] = psi_m_norm;
+            }
+        }
+    }
+};
+
 template struct calc_grad_with_block_op<std::complex<float>, base_device::DEVICE_CPU>;
 template struct line_minimize_with_block_op<std::complex<float>, base_device::DEVICE_CPU>;
 template struct calc_grad_with_block_op<std::complex<double>, base_device::DEVICE_CPU>;
 template struct line_minimize_with_block_op<std::complex<double>, base_device::DEVICE_CPU>;
+template struct apply_eigenvalues_op<std::complex<float>, base_device::DEVICE_CPU>;
+template struct apply_eigenvalues_op<std::complex<double>, base_device::DEVICE_CPU>;
+template struct apply_eigenvalues_op<double, base_device::DEVICE_CPU>;
+template struct precondition_op<std::complex<float>, base_device::DEVICE_CPU>;
+template struct precondition_op<std::complex<double>, base_device::DEVICE_CPU>;
+template struct precondition_op<double, base_device::DEVICE_CPU>;
+template struct normalize_op<std::complex<float>, base_device::DEVICE_CPU>;
+template struct normalize_op<std::complex<double>, base_device::DEVICE_CPU>;
+template struct normalize_op<double, base_device::DEVICE_CPU>;
 } // namespace hsolver
\ No newline at end of file
diff --git a/source/module_hsolver/kernels/bpcg_kernel_op.h b/source/module_hsolver/kernels/bpcg_kernel_op.h
index fcf8f99267..31afb74caa 100644
--- a/source/module_hsolver/kernels/bpcg_kernel_op.h
+++ b/source/module_hsolver/kernels/bpcg_kernel_op.h
@@ -59,6 +59,39 @@ struct calc_grad_with_block_op
                     const int& n_band);
 };
 
+template <typename T, typename Device>
+struct apply_eigenvalues_op
+{
+    using Real = typename GetTypeReal<T>::type;
+    void operator()(const int& nbase, 
+                    const int& nbase_x, 
+                    const int& notconv, 
+                    T* result, 
+                    const T* vectors, 
+                    const Real* eigenvalues);
+};
+
+template <typename T, typename Device>
+struct precondition_op {
+    using Real = typename GetTypeReal<T>::type;
+    void operator()(const int& dim,
+                   T* psi_iter,
+                   const int& nbase,
+                   const int& notconv,
+                   const Real* precondition,
+                   const Real* eigenvalues);
+};
+
+template <typename T, typename Device>
+struct normalize_op {
+    using Real = typename GetTypeReal<T>::type;
+    void operator()(const int& dim,
+                   T* psi_iter,
+                   const int& nbase,
+                   const int& notconv,
+                   Real* psi_norm = nullptr);
+};
+
 #if __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM
 
 template <typename T>
@@ -78,6 +111,38 @@ struct calc_grad_with_block_op<T, base_device::DEVICE_GPU> {
                   const int &n_band);
 };
 
+template <typename T>
+struct apply_eigenvalues_op<T, base_device::DEVICE_GPU> {
+  using Real = typename GetTypeReal<T>::type;
+  void operator()(const int& nbase, 
+                  const int& nbase_x, 
+                  const int& notconv, 
+                  T* result, 
+                  const T* vectors, 
+                  const Real* eigenvalues);
+};
+
+template <typename T>
+struct precondition_op<T, base_device::DEVICE_GPU> {
+  using Real = typename GetTypeReal<T>::type;
+  void operator()(const int& dim,
+                 T* psi_iter,
+                 const int& nbase,
+                 const int& notconv,
+                 const Real* precondition,
+                 const Real* eigenvalues);
+};
+
+template <typename T>
+struct normalize_op<T, base_device::DEVICE_GPU> {
+  using Real = typename GetTypeReal<T>::type;
+  void operator()(const int& dim,
+                 T* psi_iter,
+                 const int& nbase,
+                 const int& notconv,
+                 Real* psi_norm = nullptr);
+};
+
 #endif
 } // namespace hsolver
 
diff --git a/source/module_hsolver/kernels/cuda/bpcg_kernel_op.cu b/source/module_hsolver/kernels/cuda/bpcg_kernel_op.cu
index d851265a1b..dd94823aad 100644
--- a/source/module_hsolver/kernels/cuda/bpcg_kernel_op.cu
+++ b/source/module_hsolver/kernels/cuda/bpcg_kernel_op.cu
@@ -246,6 +246,96 @@ __global__ void calc_grad_with_block(
     }
 }
 
+template <typename Real>
+__global__ void apply_eigenvalues_kernel(
+        const thrust::complex<Real>* vectors,
+        thrust::complex<Real>* result,
+        const Real* eigenvalues,
+        const int nbase,
+        const int nbase_x,
+        const int notconv)
+{
+    int m = blockIdx.x;
+    int idx = threadIdx.x + blockIdx.y * blockDim.x;
+    
+    if (m < notconv && idx < nbase) {
+        result[m * nbase_x + idx] = eigenvalues[m] * vectors[m * nbase_x + idx];
+    }
+}
+
+template <typename Real>
+__global__ void precondition_kernel(
+        thrust::complex<Real>* psi_iter,
+        const Real* precondition,
+        const Real* eigenvalues,
+        const int dim,
+        const int nbase,
+        const int notconv)
+{
+    int m = blockIdx.x;
+    int i = threadIdx.x + blockIdx.y * blockDim.x;
+    
+    if (m < notconv && i < dim) {
+        Real x = abs(precondition[i] - eigenvalues[m]);
+        Real pre = 0.5 * (1.0 + x + sqrt(1 + (x - 1.0) * (x - 1.0)));
+        psi_iter[(nbase + m) * dim + i] = psi_iter[(nbase + m) * dim + i] / pre;
+    }
+}
+
+template <typename Real>
+__global__ void normalize_kernel(
+        thrust::complex<Real>* psi_iter,
+        Real* psi_norm,
+        const int dim,
+        const int nbase,
+        const int notconv)
+{
+    int m = blockIdx.x;
+    int tid = threadIdx.x;
+    __shared__ Real sum[thread_per_block];
+    
+    sum[tid] = 0.0;
+    
+    // Calculate the sum for normalization
+    for (int i = tid; i < dim; i += thread_per_block) {
+        auto val = psi_iter[(nbase + m) * dim + i];
+        sum[tid] += (val * thrust::conj(val)).real();
+    }
+    
+    __syncthreads();
+    
+    // Parallel reduction in shared memory
+    for (int s = thread_per_block/2; s > warp_size; s >>= 1) {
+        if (tid < s) {
+            sum[tid] += sum[tid + s];
+        }
+        __syncthreads();
+    }
+    
+    if (tid < warp_size) {
+        sum[tid] += sum[tid + 32]; __syncwarp();
+        sum[tid] += sum[tid + 16]; __syncwarp();
+        sum[tid] += sum[tid + 8]; __syncwarp();
+        sum[tid] += sum[tid + 4]; __syncwarp();
+        sum[tid] += sum[tid + 2]; __syncwarp();
+        sum[tid] += sum[tid + 1]; __syncwarp();
+    }
+    
+    __syncthreads();
+    
+    Real norm = sqrt(sum[0]);
+    
+    // Normalize the vector
+    for (int i = tid; i < dim; i += thread_per_block) {
+        psi_iter[(nbase + m) * dim + i] /= norm;
+    }
+    
+    // Store the norm if needed
+    if (tid == 0 && psi_norm != nullptr) {
+        psi_norm[m] = norm;
+    }
+}
+
 template <typename T>
 void line_minimize_with_block_op<T, base_device::DEVICE_GPU>::operator()(T* grad_out,
                                                                          T* hgrad_out,
@@ -292,9 +382,75 @@ void calc_grad_with_block_op<T, base_device::DEVICE_GPU>::operator()(const Real*
     cudaCheckOnDebug();
 }
 
+template <typename T>
+void apply_eigenvalues_op<T, base_device::DEVICE_GPU>::operator()(const int& nbase,
+                                                                const int& nbase_x,
+                                                                const int& notconv,
+                                                                T* result,
+                                                                const T* vectors,
+                                                                const Real* eigenvalues)
+{
+    const int threads_per_block = 256;
+    const int blocks_per_grid_y = (nbase + threads_per_block - 1) / threads_per_block;
+    
+    dim3 grid(notconv, blocks_per_grid_y);
+    
+    auto vec_complex = reinterpret_cast<const thrust::complex<Real>*>(vectors);
+    auto res_complex = reinterpret_cast<thrust::complex<Real>*>(result);
+    
+    apply_eigenvalues_kernel<Real><<<grid, threads_per_block>>>(
+        vec_complex, res_complex, eigenvalues, nbase, nbase_x, notconv);
+    
+    cudaCheckOnDebug();
+}
+
+template <typename T>
+void precondition_op<T, base_device::DEVICE_GPU>::operator()(const int& dim,
+                                                           T* psi_iter,
+                                                           const int& nbase,
+                                                           const int& notconv,
+                                                           const Real* precondition,
+                                                           const Real* eigenvalues)
+{
+    const int threads_per_block = 256;
+    const int blocks_per_grid_y = (dim + threads_per_block - 1) / threads_per_block;
+    
+    dim3 grid(notconv, blocks_per_grid_y);
+    
+    auto psi_complex = reinterpret_cast<thrust::complex<Real>*>(psi_iter);
+    
+    precondition_kernel<Real><<<grid, threads_per_block>>>(
+        psi_complex, precondition, eigenvalues, dim, nbase, notconv);
+    
+    cudaCheckOnDebug();
+}
+
+template <typename T>
+void normalize_op<T, base_device::DEVICE_GPU>::operator()(const int& dim,
+                                                        T* psi_iter,
+                                                        const int& nbase,
+                                                        const int& notconv,
+                                                        Real* psi_norm)
+{
+    auto psi_complex = reinterpret_cast<thrust::complex<Real>*>(psi_iter);
+    
+    normalize_kernel<Real><<<notconv, thread_per_block>>>(
+        psi_complex, psi_norm, dim, nbase, notconv);
+    
+    cudaCheckOnDebug();
+}
+
 template struct calc_grad_with_block_op<std::complex<float>, base_device::DEVICE_GPU>;
 template struct line_minimize_with_block_op<std::complex<float>, base_device::DEVICE_GPU>;
 template struct calc_grad_with_block_op<std::complex<double>, base_device::DEVICE_GPU>;
 template struct line_minimize_with_block_op<std::complex<double>, base_device::DEVICE_GPU>;
-
+template struct apply_eigenvalues_op<std::complex<float>, base_device::DEVICE_GPU>;
+template struct apply_eigenvalues_op<std::complex<double>, base_device::DEVICE_GPU>;
+template struct apply_eigenvalues_op<double, base_device::DEVICE_GPU>;
+template struct precondition_op<std::complex<float>, base_device::DEVICE_GPU>;
+template struct precondition_op<std::complex<double>, base_device::DEVICE_GPU>;
+template struct precondition_op<double, base_device::DEVICE_GPU>;
+template struct normalize_op<std::complex<float>, base_device::DEVICE_GPU>;
+template struct normalize_op<std::complex<double>, base_device::DEVICE_GPU>;
+template struct normalize_op<double, base_device::DEVICE_GPU>;
 }
\ No newline at end of file
diff --git a/source/module_hsolver/kernels/rocm/bpcg_kernel_op.hip.cu b/source/module_hsolver/kernels/rocm/bpcg_kernel_op.hip.cu
index 850b48bf2e..d7faf727a8 100644
--- a/source/module_hsolver/kernels/rocm/bpcg_kernel_op.hip.cu
+++ b/source/module_hsolver/kernels/rocm/bpcg_kernel_op.hip.cu
@@ -171,6 +171,85 @@ __global__ void calc_grad_with_block(
     }
 }
 
+template <typename Real>
+__global__ void apply_eigenvalues_kernel(
+        const thrust::complex<Real>* vectors,
+        thrust::complex<Real>* result,
+        const Real* eigenvalues,
+        const int nbase,
+        const int nbase_x,
+        const int notconv)
+{
+    int m = blockIdx.x;
+    int idx = threadIdx.x + blockIdx.y * blockDim.x;
+    
+    if (m < notconv && idx < nbase) {
+        result[m * nbase_x + idx] = eigenvalues[m] * vectors[m * nbase_x + idx];
+    }
+}
+
+template <typename Real>
+__global__ void precondition_kernel(
+        thrust::complex<Real>* psi_iter,
+        const Real* precondition,
+        const Real* eigenvalues,
+        const int dim,
+        const int nbase,
+        const int notconv)
+{
+    int m = blockIdx.x;
+    int i = threadIdx.x + blockIdx.y * blockDim.x;
+    
+    if (m < notconv && i < dim) {
+        Real x = abs(precondition[i] - eigenvalues[m]);
+        Real pre = 0.5 * (1.0 + x + sqrt(1 + (x - 1.0) * (x - 1.0)));
+        psi_iter[(nbase + m) * dim + i] = psi_iter[(nbase + m) * dim + i] / pre;
+    }
+}
+
+template <typename Real>
+__global__ void normalize_kernel(
+        thrust::complex<Real>* psi_iter,
+        Real* psi_norm,
+        const int dim,
+        const int nbase,
+        const int notconv)
+{
+    int m = blockIdx.x;
+    int tid = threadIdx.x;
+    __shared__ Real sum[THREAD_PER_BLOCK];
+    
+    sum[tid] = 0.0;
+    
+    // Calculate the sum for normalization
+    for (int i = tid; i < dim; i += THREAD_PER_BLOCK) {
+        auto val = psi_iter[(nbase + m) * dim + i];
+        sum[tid] += (val * thrust::conj(val)).real();
+    }
+    
+    __syncthreads();
+    
+    // Parallel reduction in shared memory
+    for (int s = THREAD_PER_BLOCK/2; s > 0; s >>= 1) {
+        if (tid < s) {
+            sum[tid] += sum[tid + s];
+        }
+        __syncthreads();
+    }
+    
+    Real norm = sqrt(sum[0]);
+    
+    // Normalize the vector
+    for (int i = tid; i < dim; i += THREAD_PER_BLOCK) {
+        psi_iter[(nbase + m) * dim + i] /= norm;
+    }
+    
+    // Store the norm if needed
+    if (tid == 0 && psi_norm != nullptr) {
+        psi_norm[m] = norm;
+    }
+}
+
 template <typename T>
 void line_minimize_with_block_op<T, base_device::DEVICE_GPU>::operator()(T* grad_out,
                                                                          T* hgrad_out,
@@ -217,8 +296,75 @@ void calc_grad_with_block_op<T, base_device::DEVICE_GPU>::operator()(const Real*
     hipCheckOnDebug();
 }
 
+template <typename T>
+void apply_eigenvalues_op<T, base_device::DEVICE_GPU>::operator()(const int& nbase,
+                                                                const int& nbase_x,
+                                                                const int& notconv,
+                                                                T* result,
+                                                                const T* vectors,
+                                                                const Real* eigenvalues)
+{
+    const int threads_per_block = 256;
+    const int blocks_per_grid_y = (nbase + threads_per_block - 1) / threads_per_block;
+    
+    dim3 grid(notconv, blocks_per_grid_y);
+    
+    auto vec_complex = reinterpret_cast<const thrust::complex<Real>*>(vectors);
+    auto res_complex = reinterpret_cast<thrust::complex<Real>*>(result);
+    
+    apply_eigenvalues_kernel<Real><<<grid, threads_per_block>>>(
+        vec_complex, res_complex, eigenvalues, nbase, nbase_x, notconv);
+    
+    hipCheckOnDebug();
+}
+
+template <typename T>
+void precondition_op<T, base_device::DEVICE_GPU>::operator()(const int& dim,
+                                                           T* psi_iter,
+                                                           const int& nbase,
+                                                           const int& notconv,
+                                                           const Real* precondition,
+                                                           const Real* eigenvalues)
+{
+    const int threads_per_block = 256;
+    const int blocks_per_grid_y = (dim + threads_per_block - 1) / threads_per_block;
+    
+    dim3 grid(notconv, blocks_per_grid_y);
+    
+    auto psi_complex = reinterpret_cast<thrust::complex<Real>*>(psi_iter);
+    
+    precondition_kernel<Real><<<grid, threads_per_block>>>(
+        psi_complex, precondition, eigenvalues, dim, nbase, notconv);
+    
+    hipCheckOnDebug();
+}
+
+template <typename T>
+void normalize_op<T, base_device::DEVICE_GPU>::operator()(const int& dim,
+                                                        T* psi_iter,
+                                                        const int& nbase,
+                                                        const int& notconv,
+                                                        Real* psi_norm)
+{
+    auto psi_complex = reinterpret_cast<thrust::complex<Real>*>(psi_iter);
+    
+    normalize_kernel<Real><<<notconv, THREAD_PER_BLOCK>>>(
+        psi_complex, psi_norm, dim, nbase, notconv);
+    
+    hipCheckOnDebug();
+}
+
 template struct calc_grad_with_block_op<std::complex<float>, base_device::DEVICE_GPU>;
 template struct line_minimize_with_block_op<std::complex<float>, base_device::DEVICE_GPU>;
 template struct calc_grad_with_block_op<std::complex<double>, base_device::DEVICE_GPU>;
 template struct line_minimize_with_block_op<std::complex<double>, base_device::DEVICE_GPU>;
+template struct apply_eigenvalues_op<std::complex<float>, base_device::DEVICE_GPU>;
+template struct apply_eigenvalues_op<std::complex<double>, base_device::DEVICE_GPU>;
+template struct apply_eigenvalues_op<double, base_device::DEVICE_GPU>;
+template struct precondition_op<std::complex<float>, base_device::DEVICE_GPU>;
+template struct precondition_op<std::complex<double>, base_device::DEVICE_GPU>;
+template struct precondition_op<double, base_device::DEVICE_GPU>;
+template struct normalize_op<std::complex<float>, base_device::DEVICE_GPU>;
+template struct normalize_op<std::complex<double>, base_device::DEVICE_GPU>;
+template struct normalize_op<double, base_device::DEVICE_GPU>;
 }
\ No newline at end of file
diff --git a/source/module_hsolver/kernels/rocm/dngvd_op.hip.cu b/source/module_hsolver/kernels/rocm/dngvd_op.hip.cu
index 5eeb1697e8..8cba06db1b 100644
--- a/source/module_hsolver/kernels/rocm/dngvd_op.hip.cu
+++ b/source/module_hsolver/kernels/rocm/dngvd_op.hip.cu
@@ -5,12 +5,25 @@
 
 namespace hsolver {
 
+// NOTE: mimicked from ../cuda/dngvd_op.cu for three dngvd_op
+
+static hipsolverHandle_t hipsolver_H = nullptr;
+// Test on DCU platform. When nstart is greater than 234, code on DCU performs better. 
+const int N_DCU = 234;
+
 void createGpuSolverHandle() {
-    return;
+    if (hipsolver_H == nullptr)
+    {
+        hipsolverErrcheck(hipsolverCreate(&hipsolver_H));
+    }
 }
 
 void destroyGpuSolverHandle() {
-    return;
+    if (hipsolver_H != nullptr)
+    {
+        hipsolverErrcheck(hipsolverDestroy(hipsolver_H));
+        hipsolver_H = nullptr;
+    }
 }
 
 #ifdef __LCAO
@@ -23,22 +36,68 @@ void dngvd_op<double, base_device::DEVICE_GPU>::operator()(const base_device::DE
                                                            double* _eigenvalue,
                                                            double* _vcc)
 {
-    std::vector<double> hcc(nstart * nstart, 0.0);
-    std::vector<double> scc(nstart * nstart, 0.0);
-    std::vector<double> vcc(nstart * nstart, 0.0);
-    std::vector<double> eigenvalue(nstart, 0);
-    hipErrcheck(hipMemcpy(hcc.data(), _hcc, sizeof(double) * hcc.size(), hipMemcpyDeviceToHost));
-    hipErrcheck(hipMemcpy(scc.data(), _scc, sizeof(double) * scc.size(), hipMemcpyDeviceToHost));
-    base_device::DEVICE_CPU* cpu_ctx = {};
-    dngvd_op<double, base_device::DEVICE_CPU>()(cpu_ctx,
-                                                nstart,
-                                                ldh,
-                                                hcc.data(),
-                                                scc.data(),
-                                                eigenvalue.data(),
-                                                vcc.data());
-    hipErrcheck(hipMemcpy(_vcc, vcc.data(), sizeof(double) * vcc.size(), hipMemcpyHostToDevice));
-    hipErrcheck(hipMemcpy(_eigenvalue, eigenvalue.data(), sizeof(double) * eigenvalue.size(), hipMemcpyHostToDevice));
+    // copied from ../cuda/dngvd_op.cu, "dngvd_op"
+    assert(nstart == ldh);
+
+    if (nstart > N_DCU){
+        hipErrcheck(hipMemcpy(_vcc, _hcc, sizeof(double) * ldh * nstart, hipMemcpyDeviceToDevice));
+        // now vcc contains hcc
+
+        // prepare some values for hipsolverDnZhegvd_bufferSize
+        int * devInfo = nullptr;
+        int lwork = 0, info_gpu = 0;
+        double * work = nullptr;
+        hipErrcheck(hipMalloc((void**)&devInfo, sizeof(int)));
+        hipsolverFillMode_t uplo = HIPSOLVER_FILL_MODE_UPPER;
+
+        // calculate the sizes needed for pre-allocated buffer.
+        hipsolverErrcheck(hipsolverDnDsygvd_bufferSize(
+            hipsolver_H, HIPSOLVER_EIG_TYPE_1, HIPSOLVER_EIG_MODE_VECTOR, uplo,
+            nstart,
+            _vcc, ldh,
+            _scc, ldh,
+            _eigenvalue,
+            &lwork));
+
+        // allocate memery
+        hipErrcheck(hipMalloc((void**)&work, sizeof(double) * lwork));
+
+        // compute eigenvalues and eigenvectors.
+        hipsolverErrcheck(hipsolverDnDsygvd(
+            hipsolver_H, HIPSOLVER_EIG_TYPE_1, HIPSOLVER_EIG_MODE_VECTOR, uplo,
+            nstart,
+            _vcc, ldh,
+            const_cast<double *>(_scc), ldh,
+            _eigenvalue,
+            work, lwork, devInfo));
+
+        hipErrcheck(hipMemcpy(&info_gpu, devInfo, sizeof(int), hipMemcpyDeviceToHost));
+
+        // free the buffer
+        hipErrcheck(hipFree(work));
+        hipErrcheck(hipFree(devInfo));
+    }
+    // if(fail_info != nullptr) *fail_info = info_gpu;
+    else{
+        std::vector<double> hcc(nstart * nstart, 0.0);
+        std::vector<double> scc(nstart * nstart, 0.0);
+        std::vector<double> vcc(nstart * nstart, 0.0);
+        std::vector<double> eigenvalue(nstart, 0);
+        hipErrcheck(hipMemcpy(hcc.data(), _hcc, sizeof(double) * hcc.size(), hipMemcpyDeviceToHost));
+        hipErrcheck(hipMemcpy(scc.data(), _scc, sizeof(double) * scc.size(), hipMemcpyDeviceToHost));
+        base_device::DEVICE_CPU* cpu_ctx = {};
+        dngvd_op<double, base_device::DEVICE_CPU>()(cpu_ctx,
+                                                   nstart,
+                                                   ldh,
+                                                   hcc.data(),
+                                                   scc.data(),
+                                                   eigenvalue.data(),
+                                                   vcc.data());
+        hipErrcheck(hipMemcpy(_vcc, vcc.data(), sizeof(double) * vcc.size(), hipMemcpyHostToDevice));
+        hipErrcheck(hipMemcpy(_eigenvalue, eigenvalue.data(), sizeof(double) * eigenvalue.size(), hipMemcpyHostToDevice));
+    }
+
+    
 }
 #endif // __LCAO
 
@@ -51,22 +110,67 @@ void dngvd_op<std::complex<float>, base_device::DEVICE_GPU>::operator()(const ba
                                                                         float* _eigenvalue,
                                                                         std::complex<float>* _vcc)
 {
-    std::vector<std::complex<float>> hcc(nstart * nstart, {0, 0});
-    std::vector<std::complex<float>> scc(nstart * nstart, {0, 0});
-    std::vector<std::complex<float>> vcc(nstart * nstart, {0, 0});
-    std::vector<float> eigenvalue(nstart, 0);
-    hipErrcheck(hipMemcpy(hcc.data(), _hcc, sizeof(std::complex<float>) * hcc.size(), hipMemcpyDeviceToHost));
-    hipErrcheck(hipMemcpy(scc.data(), _scc, sizeof(std::complex<float>) * scc.size(), hipMemcpyDeviceToHost));
-    base_device::DEVICE_CPU* cpu_ctx = {};
-    dngvd_op<std::complex<float>, base_device::DEVICE_CPU>()(cpu_ctx,
-                                                             nstart,
-                                                             ldh,
-                                                             hcc.data(),
-                                                             scc.data(),
-                                                             eigenvalue.data(),
-                                                             vcc.data());
-    hipErrcheck(hipMemcpy(_vcc, vcc.data(), sizeof(std::complex<float>) * vcc.size(), hipMemcpyHostToDevice));
-    hipErrcheck(hipMemcpy(_eigenvalue, eigenvalue.data(), sizeof(float) * eigenvalue.size(), hipMemcpyHostToDevice));
+    // copied from ../cuda/dngvd_op.cu, "dngvd_op"
+    assert(nstart == ldh);
+    
+    if (nstart > N_DCU){
+        hipErrcheck(hipMemcpy(_vcc, _hcc, sizeof(std::complex<float>) * ldh * nstart, hipMemcpyDeviceToDevice));
+        // now vcc contains hcc
+
+        // prepare some values for hipsolverDnZhegvd_bufferSize
+        int * devInfo = nullptr;
+        int lwork = 0, info_gpu = 0;
+        float2 * work = nullptr;
+        hipErrcheck(hipMalloc((void**)&devInfo, sizeof(int)));
+        hipsolverFillMode_t uplo = HIPSOLVER_FILL_MODE_UPPER;
+
+        // calculate the sizes needed for pre-allocated buffer.
+        hipsolverErrcheck(hipsolverDnChegvd_bufferSize(
+            hipsolver_H, HIPSOLVER_EIG_TYPE_1, HIPSOLVER_EIG_MODE_VECTOR, uplo,
+            nstart,
+            reinterpret_cast<const float2 *>(_vcc), ldh,
+            reinterpret_cast<const float2 *>(_scc), ldh,
+            _eigenvalue,
+            &lwork));
+
+        // allocate memery
+        hipErrcheck(hipMalloc((void**)&work, sizeof(float2) * lwork));
+
+        // compute eigenvalues and eigenvectors.
+        hipsolverErrcheck(hipsolverDnChegvd(
+            hipsolver_H, HIPSOLVER_EIG_TYPE_1, HIPSOLVER_EIG_MODE_VECTOR, uplo,
+            nstart,
+            reinterpret_cast<float2 *>(_vcc), ldh,
+            const_cast<float2 *>(reinterpret_cast<const float2 *>(_scc)), ldh,
+            _eigenvalue,
+            work, lwork, devInfo));
+
+        hipErrcheck(hipMemcpy(&info_gpu, devInfo, sizeof(int), hipMemcpyDeviceToHost));
+        // free the buffer
+        hipErrcheck(hipFree(work));
+        hipErrcheck(hipFree(devInfo));
+    }
+    // if(fail_info != nullptr) *fail_info = info_gpu;
+    else{
+        std::vector<std::complex<float>> hcc(nstart * nstart, {0, 0});
+        std::vector<std::complex<float>> scc(nstart * nstart, {0, 0});
+        std::vector<std::complex<float>> vcc(nstart * nstart, {0, 0});
+        std::vector<float> eigenvalue(nstart, 0);
+        hipErrcheck(hipMemcpy(hcc.data(), _hcc, sizeof(std::complex<float>) * hcc.size(), hipMemcpyDeviceToHost));
+        hipErrcheck(hipMemcpy(scc.data(), _scc, sizeof(std::complex<float>) * scc.size(), hipMemcpyDeviceToHost));
+        base_device::DEVICE_CPU* cpu_ctx = {};
+        dngvd_op<std::complex<float>, base_device::DEVICE_CPU>()(cpu_ctx,
+                                                                nstart,
+                                                                ldh,
+                                                                hcc.data(),
+                                                                scc.data(),
+                                                                eigenvalue.data(),
+                                                                vcc.data());
+        hipErrcheck(hipMemcpy(_vcc, vcc.data(), sizeof(std::complex<float>) * vcc.size(), hipMemcpyHostToDevice));
+        hipErrcheck(hipMemcpy(_eigenvalue, eigenvalue.data(), sizeof(float) * eigenvalue.size(), hipMemcpyHostToDevice));
+    }
+
+    
 }
 
 template <>
@@ -76,24 +180,80 @@ void dngvd_op<std::complex<double>, base_device::DEVICE_GPU>::operator()(const b
                                                                          const std::complex<double>* _hcc,
                                                                          const std::complex<double>* _scc,
                                                                          double* _eigenvalue,
-                                                                         std::complex<double>* _vcc)
+                                                                         std::complex<double>* _vcc
+                                                                        )
 {
-    std::vector<std::complex<double>> hcc(nstart * nstart, {0, 0});
-    std::vector<std::complex<double>> scc(nstart * nstart, {0, 0});
-    std::vector<std::complex<double>> vcc(nstart * nstart, {0, 0});
-    std::vector<double> eigenvalue(nstart, 0);
-    hipErrcheck(hipMemcpy(hcc.data(), _hcc, sizeof(std::complex<double>) * hcc.size(), hipMemcpyDeviceToHost));
-    hipErrcheck(hipMemcpy(scc.data(), _scc, sizeof(std::complex<double>) * scc.size(), hipMemcpyDeviceToHost));
-    base_device::DEVICE_CPU* cpu_ctx = {};
-    dngvd_op<std::complex<double>, base_device::DEVICE_CPU>()(cpu_ctx,
-                                                              nstart,
-                                                              ldh,
-                                                              hcc.data(),
-                                                              scc.data(),
-                                                              eigenvalue.data(),
-                                                              vcc.data());
-    hipErrcheck(hipMemcpy(_vcc, vcc.data(), sizeof(std::complex<double>) * vcc.size(), hipMemcpyHostToDevice));
-    hipErrcheck(hipMemcpy(_eigenvalue, eigenvalue.data(), sizeof(double) * eigenvalue.size(), hipMemcpyHostToDevice));
+    // copied from ../cuda/dngvd_op.cu, "dngvd_op"
+    assert(nstart == ldh);
+
+    // save a copy of scc in case the diagonalization fails
+    if (nstart > N_DCU){
+        std::vector<std::complex<double>> scc(nstart * nstart, {0, 0});
+        hipErrcheck(hipMemcpy(scc.data(), _scc, sizeof(std::complex<double>) * scc.size(), hipMemcpyDeviceToHost));
+
+        hipErrcheck(hipMemcpy(_vcc, _hcc, sizeof(std::complex<double>) * ldh * nstart, hipMemcpyDeviceToDevice));
+
+        // now vcc contains hcc
+
+        // prepare some values for hipsolverDnZhegvd_bufferSize
+        int * devInfo = nullptr;
+        int lwork = 0, info_gpu = 0;
+        double2 * work = nullptr;
+        hipErrcheck(hipMalloc((void**)&devInfo, sizeof(int)));
+        hipsolverFillMode_t uplo = HIPSOLVER_FILL_MODE_UPPER;
+
+        // calculate the sizes needed for pre-allocated buffer.
+        hipsolverErrcheck(hipsolverDnZhegvd_bufferSize(
+            hipsolver_H, HIPSOLVER_EIG_TYPE_1, HIPSOLVER_EIG_MODE_VECTOR, uplo,
+            nstart,
+            reinterpret_cast<const double2 *>(_vcc), ldh,
+            reinterpret_cast<const double2 *>(_scc), ldh,
+            _eigenvalue,
+            &lwork));
+
+        // allocate memery
+        hipErrcheck(hipMalloc((void**)&work, sizeof(double2) * lwork));
+
+        // compute eigenvalues and eigenvectors.
+        hipsolverErrcheck(hipsolverDnZhegvd(
+            hipsolver_H, HIPSOLVER_EIG_TYPE_1, HIPSOLVER_EIG_MODE_VECTOR, uplo,
+            nstart,
+            reinterpret_cast<double2 *>(_vcc), ldh,
+            const_cast<double2 *>(reinterpret_cast<const double2 *>(_scc)), ldh,
+            _eigenvalue,
+            work, lwork, devInfo));
+
+        hipErrcheck(hipMemcpy(&info_gpu, devInfo, sizeof(int), hipMemcpyDeviceToHost));
+        // free the buffer
+        hipErrcheck(hipFree(work));
+        hipErrcheck(hipFree(devInfo));
+    }
+    // if(fail_info != nullptr) *fail_info = info_gpu;
+    else{
+        std::vector<std::complex<double>> hcc(nstart * nstart, {0, 0});
+        std::vector<std::complex<double>> scc(nstart * nstart, {0, 0});
+        std::vector<std::complex<double>> vcc(nstart * nstart, {0, 0});
+        std::vector<double> eigenvalue(nstart, 0);
+        hipErrcheck(hipMemcpy(hcc.data(), _hcc, sizeof(std::complex<double>) * hcc.size(), hipMemcpyDeviceToHost));
+        hipErrcheck(hipMemcpy(scc.data(), _scc, sizeof(std::complex<double>) * scc.size(), hipMemcpyDeviceToHost));
+        base_device::DEVICE_CPU* cpu_ctx = {};
+        dngvd_op<std::complex<double>, base_device::DEVICE_CPU>()(cpu_ctx,
+                                                                nstart,
+                                                                ldh,
+                                                                hcc.data(),
+                                                                scc.data(),
+                                                                eigenvalue.data(),
+                                                                vcc.data());
+        hipErrcheck(hipMemcpy(_vcc, vcc.data(), sizeof(std::complex<double>) * vcc.size(), hipMemcpyHostToDevice));
+        hipErrcheck(hipMemcpy(_eigenvalue, eigenvalue.data(), sizeof(double) * eigenvalue.size(), hipMemcpyHostToDevice));
+    }
+
+
+
+
+
+
+    
 }
 
 #ifdef __LCAO
diff --git a/source/module_hsolver/test/test_hsolver_pw.cpp b/source/module_hsolver/test/test_hsolver_pw.cpp
index 6b0e8c5a80..92be00fcbd 100644
--- a/source/module_hsolver/test/test_hsolver_pw.cpp
+++ b/source/module_hsolver/test/test_hsolver_pw.cpp
@@ -14,6 +14,68 @@
 #include "module_hsolver/hsolver_pw.h"
 #undef private
 #undef protected
+
+// Mock implementations for the template functions causing linking errors
+namespace ModulePW {
+    // Mock implementation for recip_to_real
+    template<typename FPTYPE, typename Device>
+    void PW_Basis_K::recip_to_real(const Device* ctx,
+                                  const std::complex<FPTYPE>* in,
+                                  std::complex<FPTYPE>* out,
+                                  const int ik,
+                                  const bool add,
+                                  const FPTYPE factor) const
+    {
+        // Simple mock implementation that does nothing
+        // In a real test, you might want to implement behavior that simulates the actual function
+    }
+
+    // Mock implementation for real_to_recip
+    template<typename FPTYPE, typename Device>
+    void PW_Basis_K::real_to_recip(const Device* ctx,
+                                  const std::complex<FPTYPE>* in,
+                                  std::complex<FPTYPE>* out,
+                                  const int ik,
+                                  const bool add,
+                                  const FPTYPE factor) const
+    {
+        // Simple mock implementation that does nothing
+    }
+
+    // Explicit template instantiations
+    template void PW_Basis_K::recip_to_real<float, base_device::DEVICE_CPU>(
+        const base_device::DEVICE_CPU* ctx,
+        const std::complex<float>* in,
+        std::complex<float>* out,
+        const int ik,
+        const bool add,
+        const float factor) const;
+
+    template void PW_Basis_K::recip_to_real<double, base_device::DEVICE_CPU>(
+        const base_device::DEVICE_CPU* ctx,
+        const std::complex<double>* in,
+        std::complex<double>* out,
+        const int ik,
+        const bool add,
+        const double factor) const;
+
+    template void PW_Basis_K::real_to_recip<float, base_device::DEVICE_CPU>(
+        const base_device::DEVICE_CPU* ctx,
+        const std::complex<float>* in,
+        std::complex<float>* out,
+        const int ik,
+        const bool add,
+        const float factor) const;
+
+    template void PW_Basis_K::real_to_recip<double, base_device::DEVICE_CPU>(
+        const base_device::DEVICE_CPU* ctx,
+        const std::complex<double>* in,
+        std::complex<double>* out,
+        const int ik,
+        const bool add,
+        const double factor) const;
+}
+
 /************************************************
  *  unit test of HSolverPW class
  ***********************************************/
diff --git a/source/module_hsolver/test/test_hsolver_sdft.cpp b/source/module_hsolver/test/test_hsolver_sdft.cpp
index d1f7bbca13..83cbb8f361 100644
--- a/source/module_hsolver/test/test_hsolver_sdft.cpp
+++ b/source/module_hsolver/test/test_hsolver_sdft.cpp
@@ -186,6 +186,66 @@ template class Stochastic_Iter<std::complex<double>, base_device::DEVICE_CPU>;
 Charge::Charge(){};
 Charge::~Charge(){};
 
+// Mock implementations for the template functions causing linking errors
+namespace ModulePW {
+    // Mock implementation for recip_to_real
+    template<typename FPTYPE, typename Device>
+    void PW_Basis_K::recip_to_real(const Device* ctx,
+                                  const std::complex<FPTYPE>* in,
+                                  std::complex<FPTYPE>* out,
+                                  const int ik,
+                                  const bool add,
+                                  const FPTYPE factor) const
+    {
+        // Simple mock implementation that does nothing
+        // In a real test, you might want to implement behavior that simulates the actual function
+    }
+
+    // Mock implementation for real_to_recip
+    template<typename FPTYPE, typename Device>
+    void PW_Basis_K::real_to_recip(const Device* ctx,
+                                  const std::complex<FPTYPE>* in,
+                                  std::complex<FPTYPE>* out,
+                                  const int ik,
+                                  const bool add,
+                                  const FPTYPE factor) const
+    {
+        // Simple mock implementation that does nothing
+    }
+
+    // Explicit template instantiations
+    template void PW_Basis_K::recip_to_real<float, base_device::DEVICE_CPU>(
+        const base_device::DEVICE_CPU* ctx,
+        const std::complex<float>* in,
+        std::complex<float>* out,
+        const int ik,
+        const bool add,
+        const float factor) const;
+
+    template void PW_Basis_K::recip_to_real<double, base_device::DEVICE_CPU>(
+        const base_device::DEVICE_CPU* ctx,
+        const std::complex<double>* in,
+        std::complex<double>* out,
+        const int ik,
+        const bool add,
+        const double factor) const;
+
+    template void PW_Basis_K::real_to_recip<float, base_device::DEVICE_CPU>(
+        const base_device::DEVICE_CPU* ctx,
+        const std::complex<float>* in,
+        std::complex<float>* out,
+        const int ik,
+        const bool add,
+        const float factor) const;
+
+    template void PW_Basis_K::real_to_recip<double, base_device::DEVICE_CPU>(
+        const base_device::DEVICE_CPU* ctx,
+        const std::complex<double>* in,
+        std::complex<double>* out,
+        const int ik,
+        const bool add,
+        const double factor) const;
+}
 /************************************************
  *  unit test of HSolverPW_SDFT class
  ***********************************************/
diff --git a/source/module_io/read_input_item_elec_stru.cpp b/source/module_io/read_input_item_elec_stru.cpp
index 1c03f5c9ff..16d08031a9 100644
--- a/source/module_io/read_input_item_elec_stru.cpp
+++ b/source/module_io/read_input_item_elec_stru.cpp
@@ -335,6 +335,26 @@ void ReadInput::item_elec_stru()
         read_sync_bool(input.diago_smooth_ethr);
         this->add_item(item);
     }
+    {
+        Input_Item item("use_k_continuity");
+        item.annotation = "whether to use k-point continuity for initializing wave functions";
+        read_sync_bool(input.use_k_continuity);
+        item.check_value = [](const Input_Item& item, const Parameter& para) {
+            if (para.input.use_k_continuity && para.input.basis_type != "pw") {
+                ModuleBase::WARNING_QUIT("ReadInput", "use_k_continuity only works for PW basis");
+            }
+            if (para.input.use_k_continuity && para.input.calculation == "nscf") {
+                ModuleBase::WARNING_QUIT("ReadInput", "use_k_continuity cannot work for NSCF calculation");
+            }
+            if (para.input.use_k_continuity && para.input.nspin == 2) {
+                ModuleBase::WARNING_QUIT("ReadInput", "use_k_continuity cannot work for spin-polarized calculation");
+            }
+            if (para.input.use_k_continuity && para.input.esolver_type == "sdft") {
+                ModuleBase::WARNING_QUIT("ReadInput", "use_k_continuity cannot work for SDFT calculation");
+            }
+        };
+        this->add_item(item);
+    }
     {
         Input_Item item("pw_diag_ndim");
         item.annotation = "dimension of workspace for Davidson diagonalization";
diff --git a/source/module_parameter/input_parameter.h b/source/module_parameter/input_parameter.h
index 5c0291b5b5..cbac66ed74 100644
--- a/source/module_parameter/input_parameter.h
+++ b/source/module_parameter/input_parameter.h
@@ -89,6 +89,7 @@ struct Input_para
     int pw_diag_ndim = 4;           ///< dimension of workspace for Davidson diagonalization
     int diago_cg_prec = 1;          ///< mohan add 2012-03-31
     int diag_subspace = 0;          // 0: Lapack, 1: elpa, 2: scalapack
+    bool use_k_continuity = false;   ///< whether to use k-point continuity for initializing wave functions
 
     std::string smearing_method = "gauss"; ///< "gauss",
                                            ///< "mp","methfessel-paxton"
diff --git a/source/module_psi/psi_init.cpp b/source/module_psi/psi_init.cpp
index d89b8ab477..11c9ecb7b7 100644
--- a/source/module_psi/psi_init.cpp
+++ b/source/module_psi/psi_init.cpp
@@ -133,6 +133,7 @@ void PSIInit<T, Device>::initialize_psi(Psi<std::complex<double>>* psi,
     // like (1, nbands, npwx), in which npwx is the maximal npw of all kpoints
     for (int ik = 0; ik < this->pw_wfc.nks; ik++)
     {
+        if(PARAM.inp.use_k_continuity && ik > 0) continue;
         //! Fix the wavefunction to initialize at given kpoint
         psi->fix_k(ik);
         kspw_psi->fix_k(ik);
@@ -147,6 +148,7 @@ void PSIInit<T, Device>::initialize_psi(Psi<std::complex<double>>* psi,
             {
                 syncmem_h2d_op()(psi_device->get_pointer(), psi_cpu->get_pointer(), nbands_start * nbasis);
             }
+                
 
             if (this->ks_solver == "cg")
             {
@@ -268,4 +270,4 @@ template class PSIInit<std::complex<double>, base_device::DEVICE_CPU>;
 template class PSIInit<std::complex<float>, base_device::DEVICE_GPU>;
 template class PSIInit<std::complex<double>, base_device::DEVICE_GPU>;
 #endif
-} // namespace psi
\ No newline at end of file
+} // namespace psi