diff --git a/source/module_base/kernels/cuda/math_kernel_op_vec.cu b/source/module_base/kernels/cuda/math_kernel_op_vec.cu index c0a3b102d7..182d4c54d9 100644 --- a/source/module_base/kernels/cuda/math_kernel_op_vec.cu +++ b/source/module_base/kernels/cuda/math_kernel_op_vec.cu @@ -52,6 +52,19 @@ __global__ void vector_mul_vector_kernel(const int size, } } +template +__global__ void vector_div_constant_kernel(const int size, + T* result, + const T* vector, + const typename GetTypeReal::type constant) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < size) + { + result[i] = vector[i] / constant; + } +} + template __global__ void vector_div_vector_kernel(const int size, T* result, @@ -127,6 +140,55 @@ void vector_mul_real_op, base_device::DEVICE_GPU>::operator vector_mul_real_wrapper(dim, result, vector, constant); } +// vector operator: result[i] = vector[i] / constant +template <> +void vector_div_constant_op::operator()(const int& dim, + double* result, + const double* vector, + const double constant) +{ + // In small cases, 1024 threads per block will only utilize 17 blocks, much less than 40 + int thread = thread_per_block; + int block = (dim + thread - 1) / thread; + vector_div_constant_kernel<<>>(dim, result, vector, constant); + + cudaCheckOnDebug(); +} + +template +inline void vector_div_constant_wrapper(const int& dim, + std::complex* result, + const std::complex* vector, + const FPTYPE constant) +{ + thrust::complex* result_tmp = reinterpret_cast*>(result); + const thrust::complex* vector_tmp = reinterpret_cast*>(vector); + + int thread = thread_per_block; + int block = (dim + thread - 1) / thread; + vector_div_constant_kernel><<>>(dim, result_tmp, vector_tmp, constant); + + cudaCheckOnDebug(); +} + +template <> +void vector_div_constant_op, base_device::DEVICE_GPU>::operator()(const int& dim, + std::complex* result, + const std::complex* vector, + const float constant) +{ + vector_div_constant_wrapper(dim, result, vector, constant); +} + +template <> +void vector_div_constant_op, base_device::DEVICE_GPU>::operator()(const int& dim, + std::complex* result, + const std::complex* vector, + const double constant) +{ + vector_div_constant_wrapper(dim, result, vector, constant); +} + // vector operator: result[i] = vector1[i](not complex) * vector2[i](not complex) template <> void vector_mul_vector_op::operator()(const int& dim, @@ -306,6 +368,10 @@ template struct vector_mul_real_op, base_device::DEVICE_GPU> template struct vector_mul_real_op; template struct vector_mul_real_op, base_device::DEVICE_GPU>; +template struct vector_div_constant_op, base_device::DEVICE_GPU>; +template struct vector_div_constant_op; +template struct vector_div_constant_op, base_device::DEVICE_GPU>; + template struct vector_mul_vector_op; template struct vector_mul_vector_op, base_device::DEVICE_GPU>; template struct vector_mul_vector_op; diff --git a/source/module_base/kernels/math_kernel_op.h b/source/module_base/kernels/math_kernel_op.h index bae77b23d6..d26e96171c 100644 --- a/source/module_base/kernels/math_kernel_op.h +++ b/source/module_base/kernels/math_kernel_op.h @@ -98,6 +98,21 @@ template struct vector_mul_vector_op { void operator()(const int& dim, T* result, const T* vector1, const Real* vector2, const bool& add = false); }; +// vector operator: result[i] = vector[i] / constant +template struct vector_div_constant_op { + using Real = typename GetTypeReal::type; + /// @brief result[i] = vector[i] / constant + /// + /// Input Parameters + /// \param dim : array size + /// \param vector : input array + /// \param constant : input constant + /// + /// Output Parameters + /// \param result : output array + void operator()(const int& dim, T* result, const T* vector, const Real constant); +}; + // vector operator: result[i] = vector1[i](complex) / vector2[i](not complex) template struct vector_div_vector_op { using Real = typename GetTypeReal::type; @@ -284,6 +299,48 @@ template struct matrixCopy { void operator()(const int& n1, const int& n2, const T* A, const int& LDA, T* B, const int& LDB); }; +template +struct apply_eigenvalues_op { + using Real = typename GetTypeReal::type; + + void operator()(const Device *d, const int &nbase, const int &nbase_x, const int ¬conv, + T *result, const T *vectors, const Real *eigenvalues); +}; + +template +struct precondition_op { + using Real = typename GetTypeReal::type; + void operator()(const Device* d, + const int& dim, + T* psi_iter, + const int& nbase, + const int& notconv, + const Real* precondition, + const Real* eigenvalues); +}; + +template +struct normalize_op { + using Real = typename GetTypeReal::type; + void operator()(const Device* d, + const int& dim, + T* psi_iter, + const int& nbase, + const int& notconv, + Real* psi_norm = nullptr); +}; + +template +struct normalize_op { + using Real = typename GetTypeReal::type; + void operator()(const base_device::DEVICE_GPU* d, + const int& dim, + T* psi_iter, + const int& nbase, + const int& notconv, + Real* psi_norm); +}; + #if __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM // Partially specialize functor for base_device::GpuDevice. template struct dot_real_op { @@ -306,6 +363,12 @@ template struct vector_mul_vector_op { void operator()(const int& dim, T* result, const T* vector1, const Real* vector2, const bool& add = false); }; +// vector operator: result[i] = vector[i] / constant +template struct vector_div_constant_op { + using Real = typename GetTypeReal::type; + void operator()(const int& dim, T* result, const T* vector, const Real constant); +}; + // vector operator: result[i] = vector1[i](complex) / vector2[i](not complex) template struct vector_div_vector_op { using Real = typename GetTypeReal::type; @@ -334,6 +397,26 @@ template struct matrixCopy { void createGpuBlasHandle(); void destoryBLAShandle(); +// vector operator: result[i] = -lambda[i] * vector[i] +template struct apply_eigenvalues_op { + using Real = typename GetTypeReal::type; + + void operator()(const base_device::DEVICE_GPU *d, const int &nbase, const int &nbase_x, const int ¬conv, + T *result, const T *vectors, const Real *eigenvalues); +}; + +template +struct precondition_op { + using Real = typename GetTypeReal::type; + void operator()(const base_device::DEVICE_GPU* d, + const int& dim, + T* psi_iter, + const int& nbase, + const int& notconv, + const Real* precondition, + const Real* eigenvalues); +}; + #endif // __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM } // namespace hsolver diff --git a/source/module_base/kernels/math_kernel_op_vec.cpp b/source/module_base/kernels/math_kernel_op_vec.cpp index 4030edbb48..605f97ff19 100644 --- a/source/module_base/kernels/math_kernel_op_vec.cpp +++ b/source/module_base/kernels/math_kernel_op_vec.cpp @@ -60,6 +60,22 @@ struct vector_mul_vector_op } }; +template +struct vector_div_constant_op +{ + using Real = typename GetTypeReal::type; + void operator()(const int& dim, T* result, const T* vector, const Real constant) + { +#ifdef _OPENMP +#pragma omp parallel for schedule(static, 4096 / sizeof(Real)) +#endif + for (int i = 0; i < dim; i++) + { + result[i] = vector[i] / constant; + } + } +}; + template struct vector_div_vector_op { @@ -159,6 +175,10 @@ template struct vector_mul_vector_op, base_device::DEVICE_CP template struct vector_mul_vector_op; template struct vector_mul_vector_op, base_device::DEVICE_CPU>; +template struct vector_div_constant_op, base_device::DEVICE_CPU>; +template struct vector_div_constant_op; +template struct vector_div_constant_op, base_device::DEVICE_CPU>; + template struct vector_div_vector_op, base_device::DEVICE_CPU>; template struct vector_div_vector_op; template struct vector_div_vector_op, base_device::DEVICE_CPU>; diff --git a/source/module_base/kernels/rocm/math_kernel_op_vec.hip.cu b/source/module_base/kernels/rocm/math_kernel_op_vec.hip.cu index a9009fe5c6..9e05160c54 100644 --- a/source/module_base/kernels/rocm/math_kernel_op_vec.hip.cu +++ b/source/module_base/kernels/rocm/math_kernel_op_vec.hip.cu @@ -50,6 +50,19 @@ __launch_bounds__(1024) __global__ void vector_mul_vector_kernel(const int size, } } +template +__launch_bounds__(1024) __global__ void vector_div_constant_kernel(const int size, + T* result, + const T* vector, + const typename GetTypeReal::type constant) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < size) + { + result[i] = vector[i] / constant; + } +} + template __launch_bounds__(1024) __global__ void vector_div_vector_kernel(const int size, T* result, @@ -142,6 +155,69 @@ void vector_mul_real_op, base_device::DEVICE_GPU>::operator hipCheckOnDebug(); } +// vector operator: result[i] = vector[i] / constant +template <> +void vector_div_constant_op::operator()(const int& dim, + double* result, + const double* vector, + const double constant) +{ + int thread = 1024; + int block = (dim + thread - 1) / thread; + hipLaunchKernelGGL(HIP_KERNEL_NAME(vector_div_constant_kernel), + dim3(block), + dim3(thread), + 0, + 0, + dim, + result, + vector, + constant); + + hipCheckOnDebug(); +} + +template +inline void vector_div_constant_wrapper(const int& dim, + std::complex* result, + const std::complex* vector, + const FPTYPE constant) +{ + thrust::complex* result_tmp = reinterpret_cast*>(result); + const thrust::complex* vector_tmp = reinterpret_cast*>(vector); + int thread = 1024; + int block = (dim + thread - 1) / thread; + hipLaunchKernelGGL(HIP_KERNEL_NAME(vector_div_constant_kernel>), + dim3(block), + dim3(thread), + 0, + 0, + dim, + result_tmp, + vector_tmp, + constant); + + hipCheckOnDebug(); +} + +template <> +void vector_div_constant_op, base_device::DEVICE_GPU>::operator()(const int& dim, + std::complex* result, + const std::complex* vector, + const float constant) +{ + vector_div_constant_wrapper(dim, result, vector, constant); +} + +template <> +void vector_div_constant_op, base_device::DEVICE_GPU>::operator()(const int& dim, + std::complex* result, + const std::complex* vector, + const double constant) +{ + vector_div_constant_wrapper(dim, result, vector, constant); +} + // vector operator: result[i] = vector1[i](not complex) * vector2[i](not complex) template <> void vector_mul_vector_op::operator()(const int& dim, @@ -357,6 +433,10 @@ template struct vector_mul_real_op, base_device::DEVICE_GPU> template struct vector_mul_real_op; template struct vector_mul_real_op, base_device::DEVICE_GPU>; +template struct vector_div_constant_op, base_device::DEVICE_GPU>; +template struct vector_div_constant_op; +template struct vector_div_constant_op, base_device::DEVICE_GPU>; + template struct vector_mul_vector_op; template struct vector_mul_vector_op, base_device::DEVICE_GPU>; template struct vector_mul_vector_op; diff --git a/source/module_base/module_device/device.cpp b/source/module_base/module_device/device.cpp index b20ea9f3ad..9373a5a31a 100644 --- a/source/module_base/module_device/device.cpp +++ b/source/module_base/module_device/device.cpp @@ -5,7 +5,7 @@ #include #include - +#include #ifdef __MPI #include "mpi.h" #endif @@ -166,6 +166,11 @@ int device_count = -1; cudaGetDeviceCount(&device_count); #elif defined(__ROCM) hipGetDeviceCount(&device_count); +/***auto start_time = std::chrono::high_resolution_clock::now(); +std::cout << "Starting hipGetDeviceCount.." << std::endl; +auto end_time = std::chrono::high_resolution_clock::now(); +auto duration = std::chrono::duration_cast>(end_time - start_time); +std::cout << "hipGetDeviceCount took " << duration.count() << "seconds" << std::endl;***/ #endif if (device_count <= 0) { @@ -711,4 +716,4 @@ void record_device_memory( #endif } // end of namespace information -} // end of namespace base_device \ No newline at end of file +} // end of namespace base_device diff --git a/source/module_esolver/esolver.cpp b/source/module_esolver/esolver.cpp index 9d5559dd0d..baf5caf4cc 100644 --- a/source/module_esolver/esolver.cpp +++ b/source/module_esolver/esolver.cpp @@ -114,8 +114,10 @@ std::string determine_type() } GlobalV::ofs_running << "\n RUNNING WITH DEVICE : " << device_info << " / " - << base_device::information::get_device_info(PARAM.inp.device) << std::endl; - + << base_device::information::get_device_info(PARAM.inp.device) << std::endl; + /***auto end_time = std::chrono::high_resolution_clock::now(); + auto duration = std::chrono::duration_cast>(end_time - start_time); + std::cout << "hipGetDeviceInfo took " << duration.count() << " seconds" << std::endl;***/ return esolver_type; } diff --git a/source/module_esolver/esolver_ks_pw.cpp b/source/module_esolver/esolver_ks_pw.cpp index 14afe6c3c9..1673f537bc 100644 --- a/source/module_esolver/esolver_ks_pw.cpp +++ b/source/module_esolver/esolver_ks_pw.cpp @@ -572,7 +572,8 @@ void ESolver_KS_PW::hamilt2rho_single(UnitCell& ucell, hsolver::DiagoIterAssist::SCF_ITER, hsolver::DiagoIterAssist::PW_DIAG_NMAX, hsolver::DiagoIterAssist::PW_DIAG_THR, - hsolver::DiagoIterAssist::need_subspace); + hsolver::DiagoIterAssist::need_subspace, + PARAM.inp.use_k_continuity); hsolver_pw_obj.solve(this->p_hamilt, this->kspw_psi[0], diff --git a/source/module_hsolver/diago_dav_subspace.cpp b/source/module_hsolver/diago_dav_subspace.cpp index 9e8aa2f066..3440a778e8 100644 --- a/source/module_hsolver/diago_dav_subspace.cpp +++ b/source/module_hsolver/diago_dav_subspace.cpp @@ -6,6 +6,7 @@ #include "module_base/timer.h" #include "module_hsolver/kernels/dngvd_op.h" #include "module_base/kernels/math_kernel_op.h" +#include "module_hsolver/kernels/bpcg_kernel_op.h" #include "module_base/kernels/dsp/dsp_connector.h" #include "module_hsolver/diag_hs_para.h" @@ -293,26 +294,28 @@ void Diago_DavSubspace::cal_grad(const HPsiFunc& hpsi_func, psi_iter + (nbase) * this->dim, this->dim); - std::vector e_temp_cpu(nbase, 0); + // Eigenvalues operation section + std::vector e_temp_cpu(this->notconv, 0); Real* e_temp_hd = e_temp_cpu.data(); - if(this->device == base_device::GpuDevice) + if (this->device == base_device::GpuDevice) { e_temp_hd = nullptr; resmem_real_op()(e_temp_hd, nbase); } - for (int m = 0; m < notconv; m++) + + for (int m = 0; m < this->notconv; m++) { - e_temp_cpu.assign(nbase, (-1.0 * (*eigenvalue_iter)[m])); - if (this->device == base_device::GpuDevice) - { - syncmem_var_h2d_op()(e_temp_hd, e_temp_cpu.data(), nbase); - } - ModuleBase::vector_mul_vector_op()(nbase, - vcc + m * this->nbase_x, - vcc + m * this->nbase_x, - e_temp_hd); + e_temp_cpu[m] = -(*eigenvalue_iter)[m]; } - if(this->device == base_device::GpuDevice) + + if (this->device == base_device::GpuDevice) + { + syncmem_var_h2d_op()(e_temp_hd, e_temp_cpu.data(), this->notconv); + } + + apply_eigenvalues_op()(nbase, this->nbase_x, this->notconv, this->vcc, this->vcc, e_temp_hd); + + if (this->device == base_device::GpuDevice) { delmem_real_op()(e_temp_hd); } @@ -336,48 +339,58 @@ void Diago_DavSubspace::cal_grad(const HPsiFunc& hpsi_func, psi_iter + nbase * this->dim, this->dim); - // "precondition!!!" - std::vector pre(this->dim, 0.0); - for (int m = 0; m < notconv; m++) - { - for (size_t i = 0; i < this->dim; i++) - { - // pre[i] = std::abs(this->precondition[i] - (*eigenvalue_iter)[m]); - double x = std::abs(this->precondition[i] - (*eigenvalue_iter)[m]); - pre[i] = 0.5 * (1.0 + x + sqrt(1 + (x - 1.0) * (x - 1.0))); - } + // Precondition section #if defined(__CUDA) || defined(__ROCM) - if (this->device == base_device::GpuDevice) - { - syncmem_var_h2d_op()(this->d_precondition, pre.data(), this->dim); - ModuleBase::vector_div_vector_op()(this->dim, - psi_iter + (nbase + m) * this->dim, - psi_iter + (nbase + m) * this->dim, - this->d_precondition); - } - else + if (this->device == base_device::GpuDevice) + { + Real* eigenvalues_gpu = nullptr; + resmem_real_op()(eigenvalues_gpu, notconv); + syncmem_var_h2d_op()(eigenvalues_gpu, (*eigenvalue_iter).data(), notconv); + + precondition_op()(this->dim, + psi_iter, + nbase, + notconv, + d_precondition, + eigenvalues_gpu); + delmem_real_op()(eigenvalues_gpu); + } + else #endif - { - ModuleBase::vector_div_vector_op()(this->dim, - psi_iter + (nbase + m) * this->dim, - psi_iter + (nbase + m) * this->dim, - pre.data()); - } + { + precondition_op()(this->dim, + psi_iter, + nbase, + notconv, + this->precondition.data(), + (*eigenvalue_iter).data()); } - // "normalize!!!" in order to improve numerical stability of subspace diagonalization - for (size_t i = 0; i < notconv; i++) + // Normalize section +#if defined(__CUDA) || defined(__ROCM) + if (this->device == base_device::GpuDevice) + { + Real* psi_norm = nullptr; + resmem_real_op()(psi_norm, notconv); + using setmem_real_op = base_device::memory::set_memory_op; + setmem_real_op()(psi_norm, 0.0, notconv); + + normalize_op()(this->dim, + psi_iter, + nbase, + notconv, + psi_norm); + delmem_real_op()(psi_norm); + } + else +#endif { - Real psi_norm = ModuleBase::dot_real_op()(this->dim, - psi_iter + (nbase + i) * this->dim, - psi_iter + (nbase + i) * this->dim, - true); - assert(psi_norm > 0.0); - psi_norm = sqrt(psi_norm); - ModuleBase::vector_mul_real_op()(this->dim, - psi_iter + (nbase + i) * this->dim, - psi_iter + (nbase + i) * this->dim, - Real(1.0 / psi_norm)); + Real* psi_norm = nullptr; + normalize_op()(this->dim, + psi_iter, + nbase, + notconv, + psi_norm); } // update hpsi[:, nbase:nbase+notconv] diff --git a/source/module_hsolver/hsolver_pw.cpp b/source/module_hsolver/hsolver_pw.cpp index a8b9c02eec..2868fd2f0a 100644 --- a/source/module_hsolver/hsolver_pw.cpp +++ b/source/module_hsolver/hsolver_pw.cpp @@ -282,51 +282,106 @@ void HSolverPW::solve(hamilt::Hamilt* pHamilt, std::vector eigenvalues(this->wfc_basis->nks * psi.get_nbands(), 0.0); ethr_band.resize(psi.get_nbands(), this->diag_thr); - /// Loop over k points for solve Hamiltonian to charge density - for (int ik = 0; ik < this->wfc_basis->nks; ++ik) - { - /// update H(k) for each k point - pHamilt->updateHk(ik); + // Initialize k-point continuity if enabled + static int count = 0; + if (use_k_continuity) { + build_k_neighbors(); + } + + // Loop over k points for solve Hamiltonian to charge density + if (use_k_continuity) { + // K-point continuity case + for (int i = 0; i < this->wfc_basis->nks; ++i) + { + const int ik = k_order[i]; + + // update H(k) for each k point + pHamilt->updateHk(ik); #ifdef USE_PAW - this->paw_func_in_kloop(ik, tpiba); + this->paw_func_in_kloop(ik, tpiba); #endif - /// update psi pointer for each k point - psi.fix_k(ik); + // update psi pointer for each k point + psi.fix_k(ik); + + // If using k-point continuity and not first k-point, propagate from parent + if (ik > 0 && count == 0 && k_parent.find(ik) != k_parent.end()) { + propagate_psi(psi, k_parent[ik], ik); + } - // template add precondition calculating here - update_precondition(precondition, ik, this->wfc_basis->npwk[ik], Real(pes->pot->get_vl_of_0())); + // template add precondition calculating here + update_precondition(precondition, ik, this->wfc_basis->npwk[ik], Real(pes->pot->get_vl_of_0())); - // use smooth threshold for all iter methods - if (PARAM.inp.diago_smooth_ethr == true) - { - this->cal_smooth_ethr(pes->klist->wk[ik], - &pes->wg(ik, 0), - DiagoIterAssist::PW_DIAG_THR, - ethr_band); - } + // use smooth threshold for all iter methods + if (PARAM.inp.diago_smooth_ethr == true) + { + this->cal_smooth_ethr(pes->klist->wk[ik], + &pes->wg(ik, 0), + DiagoIterAssist::PW_DIAG_THR, + ethr_band); + } #ifdef USE_PAW - this->call_paw_cell_set_currentk(ik); + this->call_paw_cell_set_currentk(ik); #endif - /// solve eigenvector and eigenvalue for H(k) - this->hamiltSolvePsiK(pHamilt, - psi, - precondition, - eigenvalues.data() + ik * psi.get_nbands(), - this->wfc_basis->nks); + // solve eigenvector and eigenvalue for H(k) + this->hamiltSolvePsiK(pHamilt, psi, precondition, eigenvalues.data() + ik * psi.get_nbands(), this->wfc_basis->nks); - if (skip_charge) + if (skip_charge) + { + GlobalV::ofs_running << "Average iterative diagonalization steps for k-points " << ik + << " is: " << DiagoIterAssist::avg_iter + << " ; where current threshold is: " << this->diag_thr << " . " << std::endl; + DiagoIterAssist::avg_iter = 0.0; + } + } + } + else { + // Original code without k-point continuity + for (int ik = 0; ik < this->wfc_basis->nks; ++ik) { - GlobalV::ofs_running << "Average iterative diagonalization steps for k-points " << ik - << " is: " << DiagoIterAssist::avg_iter - << " ; where current threshold is: " << this->diag_thr << " . " << std::endl; - DiagoIterAssist::avg_iter = 0.0; + // update H(k) for each k point + pHamilt->updateHk(ik); + +#ifdef USE_PAW + this->paw_func_in_kloop(ik, tpiba); +#endif + + // update psi pointer for each k point + psi.fix_k(ik); + + // template add precondition calculating here + update_precondition(precondition, ik, this->wfc_basis->npwk[ik], Real(pes->pot->get_vl_of_0())); + + // use smooth threshold for all iter methods + if (PARAM.inp.diago_smooth_ethr == true) + { + this->cal_smooth_ethr(pes->klist->wk[ik], + &pes->wg(ik, 0), + DiagoIterAssist::PW_DIAG_THR, + ethr_band); + } + +#ifdef USE_PAW + this->call_paw_cell_set_currentk(ik); +#endif + + // solve eigenvector and eigenvalue for H(k) + this->hamiltSolvePsiK(pHamilt, psi, precondition, eigenvalues.data() + ik * psi.get_nbands(), this->wfc_basis->nks); + + if (skip_charge) + { + GlobalV::ofs_running << "Average iterative diagonalization steps for k-points " << ik + << " is: " << DiagoIterAssist::avg_iter + << " ; where current threshold is: " << this->diag_thr << " . " << std::endl; + DiagoIterAssist::avg_iter = 0.0; + } } - /// calculate the contribution of Psi for charge density rho } + + count++; // END Loop over k points // copy eigenvalues to ekb in ElecState @@ -661,6 +716,101 @@ void HSolverPW::output_iterInfo() } } +template +void HSolverPW::build_k_neighbors() { + const int nk = this->wfc_basis->nks; + kvecs_c.resize(nk); + k_order.clear(); + k_order.reserve(nk); + + // Store k-points and corresponding indices + struct KPoint { + ModuleBase::Vector3 kvec; + int index; + double norm; + + KPoint(const ModuleBase::Vector3& v, int i) : + kvec(v), index(i), norm(v.norm()) {} + }; + + // Build k-point list + std::vector klist; + for (int ik = 0; ik < nk; ++ik) { + kvecs_c[ik] = this->wfc_basis->kvec_c[ik]; + klist.push_back(KPoint(kvecs_c[ik], ik)); + } + + // Sort k-points by distance from origin + std::sort(klist.begin(), klist.end(), + [](const KPoint& a, const KPoint& b) { + return a.norm < b.norm; + }); + + // Build parent-child relationships + k_order.push_back(klist[0].index); + + // Find nearest processed k-point as parent for each k-point + for (int i = 1; i < nk; ++i) { + int current_k = klist[i].index; + double min_dist = 1e10; + int parent = -1; + + // find the nearest k-point as parent + for (int j = 0; j < k_order.size(); ++j) { + int processed_k = k_order[j]; + double dist = (kvecs_c[current_k] - kvecs_c[processed_k]).norm2(); + if (dist < min_dist) { + min_dist = dist; + parent = processed_k; + } + } + + k_parent[current_k] = parent; + k_order.push_back(current_k); + } +} + +template +void HSolverPW::propagate_psi(psi::Psi& psi, const int from_ik, const int to_ik) { + const int nbands = psi.get_nbands(); + const int npwk = this->wfc_basis->npwk[to_ik]; + + // Get k-point difference + ModuleBase::Vector3 dk = kvecs_c[to_ik] - kvecs_c[from_ik]; + + // Allocate porter locally + T* porter = nullptr; + resmem_complex_op()(porter, this->wfc_basis->nmaxgr, "HSolverPW::porter"); + + // Process each band + for (int ib = 0; ib < nbands; ib++) + { + // Fix current k-point and band + // psi.fix_k(from_ik); + + // FFT to real space + // this->wfc_basis->recip_to_real(this->ctx, psi.get_pointer(ib), porter, from_ik); + this->wfc_basis->recip_to_real(this->ctx, &psi(from_ik, ib, 0), porter, from_ik); + + // Apply phase factor + // // TODO: Check how to get the r vector + // ModuleBase::Vector3 r = this->wfc_basis->get_ir2r(ir); + // double phase = this->wfc_basis->tpiba * (dk.x * r.x + dk.y * r.y + dk.z * r.z); + // psi_real[ir] *= std::exp(std::complex(0.0, phase)); + // } + + // Fix k-point for target + // psi.fix_k(to_ik); + + // FFT back to reciprocal space + // this->wfc_basis->real_to_recip(this->ctx, porter, psi.get_pointer(ib), to_ik, true); + this->wfc_basis->real_to_recip(this->ctx, porter, &psi(to_ik, ib, 0), to_ik); + } + + // Clean up porter + delmem_complex_op()(porter); +} + template class HSolverPW, base_device::DEVICE_CPU>; template class HSolverPW, base_device::DEVICE_CPU>; #if ((defined __CUDA) || (defined __ROCM)) diff --git a/source/module_hsolver/hsolver_pw.h b/source/module_hsolver/hsolver_pw.h index 5f44bc10a4..3a52293fb2 100644 --- a/source/module_hsolver/hsolver_pw.h +++ b/source/module_hsolver/hsolver_pw.h @@ -5,6 +5,8 @@ #include "module_hamilt_general/hamilt.h" #include "module_base/macros.h" #include "module_basis/module_pw/pw_basis_k.h" +#include +#include "module_base/memory.h" namespace hsolver { @@ -17,6 +19,9 @@ class HSolverPW // return T if T is real type(float, double), // otherwise return the real type of T(complex, complex) using Real = typename GetTypeReal::type; + using resmem_complex_op = base_device::memory::resize_memory_op; + using delmem_complex_op = base_device::memory::delete_memory_op; + using setmem_complex_op = base_device::memory::set_memory_op; public: HSolverPW(ModulePW::PW_Basis_K* wfc_basis_in, @@ -29,10 +34,12 @@ class HSolverPW const int scf_iter_in, const int diag_iter_max_in, const double diag_thr_in, - const bool need_subspace_in) + const bool need_subspace_in, + const bool use_k_continuity_in = false) : wfc_basis(wfc_basis_in), calculation_type(calculation_type_in), basis_type(basis_type_in), method(method_in), use_paw(use_paw_in), use_uspp(use_uspp_in), nspin(nspin_in), scf_iter(scf_iter_in), - diag_iter_max(diag_iter_max_in), diag_thr(diag_thr_in), need_subspace(need_subspace_in){}; + diag_iter_max(diag_iter_max_in), diag_thr(diag_thr_in), need_subspace(need_subspace_in), + use_k_continuity(use_k_continuity_in) {}; /// @brief solve function for pw /// @param pHamilt interface to hamilt @@ -50,6 +57,7 @@ class HSolverPW const double tpiba, const int nat); + protected: // diago caller void hamiltSolvePsiK(hamilt::Hamilt* hm, @@ -78,6 +86,8 @@ class HSolverPW const bool need_subspace; // for cg or dav_subspace + const bool use_k_continuity; + protected: Device* ctx = {}; @@ -98,8 +108,16 @@ class HSolverPW void paw_func_after_kloop(psi::Psi& psi, elecstate::ElecState* pes,const double tpiba,const int nat); #endif + + // K-point continuity related members + std::vector k_order; + std::unordered_map k_parent; + std::vector> kvecs_c; + + void build_k_neighbors(); + void propagate_psi(psi::Psi& psi, const int from_ik, const int to_ik); }; } // namespace hsolver -#endif \ No newline at end of file +#endif diff --git a/source/module_hsolver/kernels/bpcg_kernel_op.cpp b/source/module_hsolver/kernels/bpcg_kernel_op.cpp index bf05ac94e1..97cb7e5302 100644 --- a/source/module_hsolver/kernels/bpcg_kernel_op.cpp +++ b/source/module_hsolver/kernels/bpcg_kernel_op.cpp @@ -2,6 +2,7 @@ #include "module_base/blas_connector.h" #include "module_base/kernels/math_kernel_op.h" #include "module_base/parallel_reduce.h" +#include namespace hsolver { @@ -106,8 +107,93 @@ struct calc_grad_with_block_op } }; +template +struct apply_eigenvalues_op +{ + using Real = typename GetTypeReal::type; + void operator()(const int& nbase, const int& nbase_x, const int& notconv, T* result, const T* vectors, const Real* eigenvalues) + { + for (int m = 0; m < notconv; m++) + { + for (int idx = 0; idx < nbase; idx++) + { + result[m * nbase_x + idx] = eigenvalues[m] * vectors[m * nbase_x + idx]; + } + } + } +}; + +template +struct precondition_op { + using Real = typename GetTypeReal::type; + void operator()(const int& dim, + T* psi_iter, + const int& nbase, + const int& notconv, + const Real* precondition, + const Real* eigenvalues) + { + std::vector pre(dim, 0.0); + for (int m = 0; m < notconv; m++) + { + for (size_t i = 0; i < dim; i++) + { + Real x = std::abs(precondition[i] - eigenvalues[m]); + pre[i] = 0.5 * (1.0 + x + sqrt(1 + (x - 1.0) * (x - 1.0))); + } + ModuleBase::vector_div_vector_op()( + dim, + psi_iter + (nbase + m) * dim, + psi_iter + (nbase + m) * dim, + pre.data()); + } + } +}; + +template +struct normalize_op { + void operator()(const int& dim, + T* psi_iter, + const int& nbase, + const int& notconv, + typename GetTypeReal::type* psi_norm) + { + using Real = typename GetTypeReal::type; + for (int m = 0; m < notconv; m++) + { + // Calculate norm using dot_real_op + Real psi_m_norm = ModuleBase::dot_real_op()( + dim, + psi_iter + (nbase + m) * dim, + psi_iter + (nbase + m) * dim, + true); + assert(psi_m_norm > 0.0); + psi_m_norm = sqrt(psi_m_norm); + + // Normalize using vector_div_constant_op + ModuleBase::vector_div_constant_op()( + dim, + psi_iter + (nbase + m) * dim, + psi_iter + (nbase + m) * dim, + psi_m_norm); + if (psi_norm) { + psi_norm[m] = psi_m_norm; + } + } + } +}; + template struct calc_grad_with_block_op, base_device::DEVICE_CPU>; template struct line_minimize_with_block_op, base_device::DEVICE_CPU>; template struct calc_grad_with_block_op, base_device::DEVICE_CPU>; template struct line_minimize_with_block_op, base_device::DEVICE_CPU>; +template struct apply_eigenvalues_op, base_device::DEVICE_CPU>; +template struct apply_eigenvalues_op, base_device::DEVICE_CPU>; +template struct apply_eigenvalues_op; +template struct precondition_op, base_device::DEVICE_CPU>; +template struct precondition_op, base_device::DEVICE_CPU>; +template struct precondition_op; +template struct normalize_op, base_device::DEVICE_CPU>; +template struct normalize_op, base_device::DEVICE_CPU>; +template struct normalize_op; } // namespace hsolver \ No newline at end of file diff --git a/source/module_hsolver/kernels/bpcg_kernel_op.h b/source/module_hsolver/kernels/bpcg_kernel_op.h index fcf8f99267..31afb74caa 100644 --- a/source/module_hsolver/kernels/bpcg_kernel_op.h +++ b/source/module_hsolver/kernels/bpcg_kernel_op.h @@ -59,6 +59,39 @@ struct calc_grad_with_block_op const int& n_band); }; +template +struct apply_eigenvalues_op +{ + using Real = typename GetTypeReal::type; + void operator()(const int& nbase, + const int& nbase_x, + const int& notconv, + T* result, + const T* vectors, + const Real* eigenvalues); +}; + +template +struct precondition_op { + using Real = typename GetTypeReal::type; + void operator()(const int& dim, + T* psi_iter, + const int& nbase, + const int& notconv, + const Real* precondition, + const Real* eigenvalues); +}; + +template +struct normalize_op { + using Real = typename GetTypeReal::type; + void operator()(const int& dim, + T* psi_iter, + const int& nbase, + const int& notconv, + Real* psi_norm = nullptr); +}; + #if __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM template @@ -78,6 +111,38 @@ struct calc_grad_with_block_op { const int &n_band); }; +template +struct apply_eigenvalues_op { + using Real = typename GetTypeReal::type; + void operator()(const int& nbase, + const int& nbase_x, + const int& notconv, + T* result, + const T* vectors, + const Real* eigenvalues); +}; + +template +struct precondition_op { + using Real = typename GetTypeReal::type; + void operator()(const int& dim, + T* psi_iter, + const int& nbase, + const int& notconv, + const Real* precondition, + const Real* eigenvalues); +}; + +template +struct normalize_op { + using Real = typename GetTypeReal::type; + void operator()(const int& dim, + T* psi_iter, + const int& nbase, + const int& notconv, + Real* psi_norm = nullptr); +}; + #endif } // namespace hsolver diff --git a/source/module_hsolver/kernels/cuda/bpcg_kernel_op.cu b/source/module_hsolver/kernels/cuda/bpcg_kernel_op.cu index d851265a1b..dd94823aad 100644 --- a/source/module_hsolver/kernels/cuda/bpcg_kernel_op.cu +++ b/source/module_hsolver/kernels/cuda/bpcg_kernel_op.cu @@ -246,6 +246,96 @@ __global__ void calc_grad_with_block( } } +template +__global__ void apply_eigenvalues_kernel( + const thrust::complex* vectors, + thrust::complex* result, + const Real* eigenvalues, + const int nbase, + const int nbase_x, + const int notconv) +{ + int m = blockIdx.x; + int idx = threadIdx.x + blockIdx.y * blockDim.x; + + if (m < notconv && idx < nbase) { + result[m * nbase_x + idx] = eigenvalues[m] * vectors[m * nbase_x + idx]; + } +} + +template +__global__ void precondition_kernel( + thrust::complex* psi_iter, + const Real* precondition, + const Real* eigenvalues, + const int dim, + const int nbase, + const int notconv) +{ + int m = blockIdx.x; + int i = threadIdx.x + blockIdx.y * blockDim.x; + + if (m < notconv && i < dim) { + Real x = abs(precondition[i] - eigenvalues[m]); + Real pre = 0.5 * (1.0 + x + sqrt(1 + (x - 1.0) * (x - 1.0))); + psi_iter[(nbase + m) * dim + i] = psi_iter[(nbase + m) * dim + i] / pre; + } +} + +template +__global__ void normalize_kernel( + thrust::complex* psi_iter, + Real* psi_norm, + const int dim, + const int nbase, + const int notconv) +{ + int m = blockIdx.x; + int tid = threadIdx.x; + __shared__ Real sum[thread_per_block]; + + sum[tid] = 0.0; + + // Calculate the sum for normalization + for (int i = tid; i < dim; i += thread_per_block) { + auto val = psi_iter[(nbase + m) * dim + i]; + sum[tid] += (val * thrust::conj(val)).real(); + } + + __syncthreads(); + + // Parallel reduction in shared memory + for (int s = thread_per_block/2; s > warp_size; s >>= 1) { + if (tid < s) { + sum[tid] += sum[tid + s]; + } + __syncthreads(); + } + + if (tid < warp_size) { + sum[tid] += sum[tid + 32]; __syncwarp(); + sum[tid] += sum[tid + 16]; __syncwarp(); + sum[tid] += sum[tid + 8]; __syncwarp(); + sum[tid] += sum[tid + 4]; __syncwarp(); + sum[tid] += sum[tid + 2]; __syncwarp(); + sum[tid] += sum[tid + 1]; __syncwarp(); + } + + __syncthreads(); + + Real norm = sqrt(sum[0]); + + // Normalize the vector + for (int i = tid; i < dim; i += thread_per_block) { + psi_iter[(nbase + m) * dim + i] /= norm; + } + + // Store the norm if needed + if (tid == 0 && psi_norm != nullptr) { + psi_norm[m] = norm; + } +} + template void line_minimize_with_block_op::operator()(T* grad_out, T* hgrad_out, @@ -292,9 +382,75 @@ void calc_grad_with_block_op::operator()(const Real* cudaCheckOnDebug(); } +template +void apply_eigenvalues_op::operator()(const int& nbase, + const int& nbase_x, + const int& notconv, + T* result, + const T* vectors, + const Real* eigenvalues) +{ + const int threads_per_block = 256; + const int blocks_per_grid_y = (nbase + threads_per_block - 1) / threads_per_block; + + dim3 grid(notconv, blocks_per_grid_y); + + auto vec_complex = reinterpret_cast*>(vectors); + auto res_complex = reinterpret_cast*>(result); + + apply_eigenvalues_kernel<<>>( + vec_complex, res_complex, eigenvalues, nbase, nbase_x, notconv); + + cudaCheckOnDebug(); +} + +template +void precondition_op::operator()(const int& dim, + T* psi_iter, + const int& nbase, + const int& notconv, + const Real* precondition, + const Real* eigenvalues) +{ + const int threads_per_block = 256; + const int blocks_per_grid_y = (dim + threads_per_block - 1) / threads_per_block; + + dim3 grid(notconv, blocks_per_grid_y); + + auto psi_complex = reinterpret_cast*>(psi_iter); + + precondition_kernel<<>>( + psi_complex, precondition, eigenvalues, dim, nbase, notconv); + + cudaCheckOnDebug(); +} + +template +void normalize_op::operator()(const int& dim, + T* psi_iter, + const int& nbase, + const int& notconv, + Real* psi_norm) +{ + auto psi_complex = reinterpret_cast*>(psi_iter); + + normalize_kernel<<>>( + psi_complex, psi_norm, dim, nbase, notconv); + + cudaCheckOnDebug(); +} + template struct calc_grad_with_block_op, base_device::DEVICE_GPU>; template struct line_minimize_with_block_op, base_device::DEVICE_GPU>; template struct calc_grad_with_block_op, base_device::DEVICE_GPU>; template struct line_minimize_with_block_op, base_device::DEVICE_GPU>; - +template struct apply_eigenvalues_op, base_device::DEVICE_GPU>; +template struct apply_eigenvalues_op, base_device::DEVICE_GPU>; +template struct apply_eigenvalues_op; +template struct precondition_op, base_device::DEVICE_GPU>; +template struct precondition_op, base_device::DEVICE_GPU>; +template struct precondition_op; +template struct normalize_op, base_device::DEVICE_GPU>; +template struct normalize_op, base_device::DEVICE_GPU>; +template struct normalize_op; } \ No newline at end of file diff --git a/source/module_hsolver/kernels/rocm/bpcg_kernel_op.hip.cu b/source/module_hsolver/kernels/rocm/bpcg_kernel_op.hip.cu index 850b48bf2e..d7faf727a8 100644 --- a/source/module_hsolver/kernels/rocm/bpcg_kernel_op.hip.cu +++ b/source/module_hsolver/kernels/rocm/bpcg_kernel_op.hip.cu @@ -171,6 +171,85 @@ __global__ void calc_grad_with_block( } } +template +__global__ void apply_eigenvalues_kernel( + const thrust::complex* vectors, + thrust::complex* result, + const Real* eigenvalues, + const int nbase, + const int nbase_x, + const int notconv) +{ + int m = blockIdx.x; + int idx = threadIdx.x + blockIdx.y * blockDim.x; + + if (m < notconv && idx < nbase) { + result[m * nbase_x + idx] = eigenvalues[m] * vectors[m * nbase_x + idx]; + } +} + +template +__global__ void precondition_kernel( + thrust::complex* psi_iter, + const Real* precondition, + const Real* eigenvalues, + const int dim, + const int nbase, + const int notconv) +{ + int m = blockIdx.x; + int i = threadIdx.x + blockIdx.y * blockDim.x; + + if (m < notconv && i < dim) { + Real x = abs(precondition[i] - eigenvalues[m]); + Real pre = 0.5 * (1.0 + x + sqrt(1 + (x - 1.0) * (x - 1.0))); + psi_iter[(nbase + m) * dim + i] = psi_iter[(nbase + m) * dim + i] / pre; + } +} + +template +__global__ void normalize_kernel( + thrust::complex* psi_iter, + Real* psi_norm, + const int dim, + const int nbase, + const int notconv) +{ + int m = blockIdx.x; + int tid = threadIdx.x; + __shared__ Real sum[THREAD_PER_BLOCK]; + + sum[tid] = 0.0; + + // Calculate the sum for normalization + for (int i = tid; i < dim; i += THREAD_PER_BLOCK) { + auto val = psi_iter[(nbase + m) * dim + i]; + sum[tid] += (val * thrust::conj(val)).real(); + } + + __syncthreads(); + + // Parallel reduction in shared memory + for (int s = THREAD_PER_BLOCK/2; s > 0; s >>= 1) { + if (tid < s) { + sum[tid] += sum[tid + s]; + } + __syncthreads(); + } + + Real norm = sqrt(sum[0]); + + // Normalize the vector + for (int i = tid; i < dim; i += THREAD_PER_BLOCK) { + psi_iter[(nbase + m) * dim + i] /= norm; + } + + // Store the norm if needed + if (tid == 0 && psi_norm != nullptr) { + psi_norm[m] = norm; + } +} + template void line_minimize_with_block_op::operator()(T* grad_out, T* hgrad_out, @@ -217,8 +296,75 @@ void calc_grad_with_block_op::operator()(const Real* hipCheckOnDebug(); } +template +void apply_eigenvalues_op::operator()(const int& nbase, + const int& nbase_x, + const int& notconv, + T* result, + const T* vectors, + const Real* eigenvalues) +{ + const int threads_per_block = 256; + const int blocks_per_grid_y = (nbase + threads_per_block - 1) / threads_per_block; + + dim3 grid(notconv, blocks_per_grid_y); + + auto vec_complex = reinterpret_cast*>(vectors); + auto res_complex = reinterpret_cast*>(result); + + apply_eigenvalues_kernel<<>>( + vec_complex, res_complex, eigenvalues, nbase, nbase_x, notconv); + + hipCheckOnDebug(); +} + +template +void precondition_op::operator()(const int& dim, + T* psi_iter, + const int& nbase, + const int& notconv, + const Real* precondition, + const Real* eigenvalues) +{ + const int threads_per_block = 256; + const int blocks_per_grid_y = (dim + threads_per_block - 1) / threads_per_block; + + dim3 grid(notconv, blocks_per_grid_y); + + auto psi_complex = reinterpret_cast*>(psi_iter); + + precondition_kernel<<>>( + psi_complex, precondition, eigenvalues, dim, nbase, notconv); + + hipCheckOnDebug(); +} + +template +void normalize_op::operator()(const int& dim, + T* psi_iter, + const int& nbase, + const int& notconv, + Real* psi_norm) +{ + auto psi_complex = reinterpret_cast*>(psi_iter); + + normalize_kernel<<>>( + psi_complex, psi_norm, dim, nbase, notconv); + + hipCheckOnDebug(); +} + template struct calc_grad_with_block_op, base_device::DEVICE_GPU>; template struct line_minimize_with_block_op, base_device::DEVICE_GPU>; template struct calc_grad_with_block_op, base_device::DEVICE_GPU>; template struct line_minimize_with_block_op, base_device::DEVICE_GPU>; +template struct apply_eigenvalues_op, base_device::DEVICE_GPU>; +template struct apply_eigenvalues_op, base_device::DEVICE_GPU>; +template struct apply_eigenvalues_op; +template struct precondition_op, base_device::DEVICE_GPU>; +template struct precondition_op, base_device::DEVICE_GPU>; +template struct precondition_op; +template struct normalize_op, base_device::DEVICE_GPU>; +template struct normalize_op, base_device::DEVICE_GPU>; +template struct normalize_op; } \ No newline at end of file diff --git a/source/module_hsolver/kernels/rocm/dngvd_op.hip.cu b/source/module_hsolver/kernels/rocm/dngvd_op.hip.cu index 5eeb1697e8..8cba06db1b 100644 --- a/source/module_hsolver/kernels/rocm/dngvd_op.hip.cu +++ b/source/module_hsolver/kernels/rocm/dngvd_op.hip.cu @@ -5,12 +5,25 @@ namespace hsolver { +// NOTE: mimicked from ../cuda/dngvd_op.cu for three dngvd_op + +static hipsolverHandle_t hipsolver_H = nullptr; +// Test on DCU platform. When nstart is greater than 234, code on DCU performs better. +const int N_DCU = 234; + void createGpuSolverHandle() { - return; + if (hipsolver_H == nullptr) + { + hipsolverErrcheck(hipsolverCreate(&hipsolver_H)); + } } void destroyGpuSolverHandle() { - return; + if (hipsolver_H != nullptr) + { + hipsolverErrcheck(hipsolverDestroy(hipsolver_H)); + hipsolver_H = nullptr; + } } #ifdef __LCAO @@ -23,22 +36,68 @@ void dngvd_op::operator()(const base_device::DE double* _eigenvalue, double* _vcc) { - std::vector hcc(nstart * nstart, 0.0); - std::vector scc(nstart * nstart, 0.0); - std::vector vcc(nstart * nstart, 0.0); - std::vector eigenvalue(nstart, 0); - hipErrcheck(hipMemcpy(hcc.data(), _hcc, sizeof(double) * hcc.size(), hipMemcpyDeviceToHost)); - hipErrcheck(hipMemcpy(scc.data(), _scc, sizeof(double) * scc.size(), hipMemcpyDeviceToHost)); - base_device::DEVICE_CPU* cpu_ctx = {}; - dngvd_op()(cpu_ctx, - nstart, - ldh, - hcc.data(), - scc.data(), - eigenvalue.data(), - vcc.data()); - hipErrcheck(hipMemcpy(_vcc, vcc.data(), sizeof(double) * vcc.size(), hipMemcpyHostToDevice)); - hipErrcheck(hipMemcpy(_eigenvalue, eigenvalue.data(), sizeof(double) * eigenvalue.size(), hipMemcpyHostToDevice)); + // copied from ../cuda/dngvd_op.cu, "dngvd_op" + assert(nstart == ldh); + + if (nstart > N_DCU){ + hipErrcheck(hipMemcpy(_vcc, _hcc, sizeof(double) * ldh * nstart, hipMemcpyDeviceToDevice)); + // now vcc contains hcc + + // prepare some values for hipsolverDnZhegvd_bufferSize + int * devInfo = nullptr; + int lwork = 0, info_gpu = 0; + double * work = nullptr; + hipErrcheck(hipMalloc((void**)&devInfo, sizeof(int))); + hipsolverFillMode_t uplo = HIPSOLVER_FILL_MODE_UPPER; + + // calculate the sizes needed for pre-allocated buffer. + hipsolverErrcheck(hipsolverDnDsygvd_bufferSize( + hipsolver_H, HIPSOLVER_EIG_TYPE_1, HIPSOLVER_EIG_MODE_VECTOR, uplo, + nstart, + _vcc, ldh, + _scc, ldh, + _eigenvalue, + &lwork)); + + // allocate memery + hipErrcheck(hipMalloc((void**)&work, sizeof(double) * lwork)); + + // compute eigenvalues and eigenvectors. + hipsolverErrcheck(hipsolverDnDsygvd( + hipsolver_H, HIPSOLVER_EIG_TYPE_1, HIPSOLVER_EIG_MODE_VECTOR, uplo, + nstart, + _vcc, ldh, + const_cast(_scc), ldh, + _eigenvalue, + work, lwork, devInfo)); + + hipErrcheck(hipMemcpy(&info_gpu, devInfo, sizeof(int), hipMemcpyDeviceToHost)); + + // free the buffer + hipErrcheck(hipFree(work)); + hipErrcheck(hipFree(devInfo)); + } + // if(fail_info != nullptr) *fail_info = info_gpu; + else{ + std::vector hcc(nstart * nstart, 0.0); + std::vector scc(nstart * nstart, 0.0); + std::vector vcc(nstart * nstart, 0.0); + std::vector eigenvalue(nstart, 0); + hipErrcheck(hipMemcpy(hcc.data(), _hcc, sizeof(double) * hcc.size(), hipMemcpyDeviceToHost)); + hipErrcheck(hipMemcpy(scc.data(), _scc, sizeof(double) * scc.size(), hipMemcpyDeviceToHost)); + base_device::DEVICE_CPU* cpu_ctx = {}; + dngvd_op()(cpu_ctx, + nstart, + ldh, + hcc.data(), + scc.data(), + eigenvalue.data(), + vcc.data()); + hipErrcheck(hipMemcpy(_vcc, vcc.data(), sizeof(double) * vcc.size(), hipMemcpyHostToDevice)); + hipErrcheck(hipMemcpy(_eigenvalue, eigenvalue.data(), sizeof(double) * eigenvalue.size(), hipMemcpyHostToDevice)); + } + + } #endif // __LCAO @@ -51,22 +110,67 @@ void dngvd_op, base_device::DEVICE_GPU>::operator()(const ba float* _eigenvalue, std::complex* _vcc) { - std::vector> hcc(nstart * nstart, {0, 0}); - std::vector> scc(nstart * nstart, {0, 0}); - std::vector> vcc(nstart * nstart, {0, 0}); - std::vector eigenvalue(nstart, 0); - hipErrcheck(hipMemcpy(hcc.data(), _hcc, sizeof(std::complex) * hcc.size(), hipMemcpyDeviceToHost)); - hipErrcheck(hipMemcpy(scc.data(), _scc, sizeof(std::complex) * scc.size(), hipMemcpyDeviceToHost)); - base_device::DEVICE_CPU* cpu_ctx = {}; - dngvd_op, base_device::DEVICE_CPU>()(cpu_ctx, - nstart, - ldh, - hcc.data(), - scc.data(), - eigenvalue.data(), - vcc.data()); - hipErrcheck(hipMemcpy(_vcc, vcc.data(), sizeof(std::complex) * vcc.size(), hipMemcpyHostToDevice)); - hipErrcheck(hipMemcpy(_eigenvalue, eigenvalue.data(), sizeof(float) * eigenvalue.size(), hipMemcpyHostToDevice)); + // copied from ../cuda/dngvd_op.cu, "dngvd_op" + assert(nstart == ldh); + + if (nstart > N_DCU){ + hipErrcheck(hipMemcpy(_vcc, _hcc, sizeof(std::complex) * ldh * nstart, hipMemcpyDeviceToDevice)); + // now vcc contains hcc + + // prepare some values for hipsolverDnZhegvd_bufferSize + int * devInfo = nullptr; + int lwork = 0, info_gpu = 0; + float2 * work = nullptr; + hipErrcheck(hipMalloc((void**)&devInfo, sizeof(int))); + hipsolverFillMode_t uplo = HIPSOLVER_FILL_MODE_UPPER; + + // calculate the sizes needed for pre-allocated buffer. + hipsolverErrcheck(hipsolverDnChegvd_bufferSize( + hipsolver_H, HIPSOLVER_EIG_TYPE_1, HIPSOLVER_EIG_MODE_VECTOR, uplo, + nstart, + reinterpret_cast(_vcc), ldh, + reinterpret_cast(_scc), ldh, + _eigenvalue, + &lwork)); + + // allocate memery + hipErrcheck(hipMalloc((void**)&work, sizeof(float2) * lwork)); + + // compute eigenvalues and eigenvectors. + hipsolverErrcheck(hipsolverDnChegvd( + hipsolver_H, HIPSOLVER_EIG_TYPE_1, HIPSOLVER_EIG_MODE_VECTOR, uplo, + nstart, + reinterpret_cast(_vcc), ldh, + const_cast(reinterpret_cast(_scc)), ldh, + _eigenvalue, + work, lwork, devInfo)); + + hipErrcheck(hipMemcpy(&info_gpu, devInfo, sizeof(int), hipMemcpyDeviceToHost)); + // free the buffer + hipErrcheck(hipFree(work)); + hipErrcheck(hipFree(devInfo)); + } + // if(fail_info != nullptr) *fail_info = info_gpu; + else{ + std::vector> hcc(nstart * nstart, {0, 0}); + std::vector> scc(nstart * nstart, {0, 0}); + std::vector> vcc(nstart * nstart, {0, 0}); + std::vector eigenvalue(nstart, 0); + hipErrcheck(hipMemcpy(hcc.data(), _hcc, sizeof(std::complex) * hcc.size(), hipMemcpyDeviceToHost)); + hipErrcheck(hipMemcpy(scc.data(), _scc, sizeof(std::complex) * scc.size(), hipMemcpyDeviceToHost)); + base_device::DEVICE_CPU* cpu_ctx = {}; + dngvd_op, base_device::DEVICE_CPU>()(cpu_ctx, + nstart, + ldh, + hcc.data(), + scc.data(), + eigenvalue.data(), + vcc.data()); + hipErrcheck(hipMemcpy(_vcc, vcc.data(), sizeof(std::complex) * vcc.size(), hipMemcpyHostToDevice)); + hipErrcheck(hipMemcpy(_eigenvalue, eigenvalue.data(), sizeof(float) * eigenvalue.size(), hipMemcpyHostToDevice)); + } + + } template <> @@ -76,24 +180,80 @@ void dngvd_op, base_device::DEVICE_GPU>::operator()(const b const std::complex* _hcc, const std::complex* _scc, double* _eigenvalue, - std::complex* _vcc) + std::complex* _vcc + ) { - std::vector> hcc(nstart * nstart, {0, 0}); - std::vector> scc(nstart * nstart, {0, 0}); - std::vector> vcc(nstart * nstart, {0, 0}); - std::vector eigenvalue(nstart, 0); - hipErrcheck(hipMemcpy(hcc.data(), _hcc, sizeof(std::complex) * hcc.size(), hipMemcpyDeviceToHost)); - hipErrcheck(hipMemcpy(scc.data(), _scc, sizeof(std::complex) * scc.size(), hipMemcpyDeviceToHost)); - base_device::DEVICE_CPU* cpu_ctx = {}; - dngvd_op, base_device::DEVICE_CPU>()(cpu_ctx, - nstart, - ldh, - hcc.data(), - scc.data(), - eigenvalue.data(), - vcc.data()); - hipErrcheck(hipMemcpy(_vcc, vcc.data(), sizeof(std::complex) * vcc.size(), hipMemcpyHostToDevice)); - hipErrcheck(hipMemcpy(_eigenvalue, eigenvalue.data(), sizeof(double) * eigenvalue.size(), hipMemcpyHostToDevice)); + // copied from ../cuda/dngvd_op.cu, "dngvd_op" + assert(nstart == ldh); + + // save a copy of scc in case the diagonalization fails + if (nstart > N_DCU){ + std::vector> scc(nstart * nstart, {0, 0}); + hipErrcheck(hipMemcpy(scc.data(), _scc, sizeof(std::complex) * scc.size(), hipMemcpyDeviceToHost)); + + hipErrcheck(hipMemcpy(_vcc, _hcc, sizeof(std::complex) * ldh * nstart, hipMemcpyDeviceToDevice)); + + // now vcc contains hcc + + // prepare some values for hipsolverDnZhegvd_bufferSize + int * devInfo = nullptr; + int lwork = 0, info_gpu = 0; + double2 * work = nullptr; + hipErrcheck(hipMalloc((void**)&devInfo, sizeof(int))); + hipsolverFillMode_t uplo = HIPSOLVER_FILL_MODE_UPPER; + + // calculate the sizes needed for pre-allocated buffer. + hipsolverErrcheck(hipsolverDnZhegvd_bufferSize( + hipsolver_H, HIPSOLVER_EIG_TYPE_1, HIPSOLVER_EIG_MODE_VECTOR, uplo, + nstart, + reinterpret_cast(_vcc), ldh, + reinterpret_cast(_scc), ldh, + _eigenvalue, + &lwork)); + + // allocate memery + hipErrcheck(hipMalloc((void**)&work, sizeof(double2) * lwork)); + + // compute eigenvalues and eigenvectors. + hipsolverErrcheck(hipsolverDnZhegvd( + hipsolver_H, HIPSOLVER_EIG_TYPE_1, HIPSOLVER_EIG_MODE_VECTOR, uplo, + nstart, + reinterpret_cast(_vcc), ldh, + const_cast(reinterpret_cast(_scc)), ldh, + _eigenvalue, + work, lwork, devInfo)); + + hipErrcheck(hipMemcpy(&info_gpu, devInfo, sizeof(int), hipMemcpyDeviceToHost)); + // free the buffer + hipErrcheck(hipFree(work)); + hipErrcheck(hipFree(devInfo)); + } + // if(fail_info != nullptr) *fail_info = info_gpu; + else{ + std::vector> hcc(nstart * nstart, {0, 0}); + std::vector> scc(nstart * nstart, {0, 0}); + std::vector> vcc(nstart * nstart, {0, 0}); + std::vector eigenvalue(nstart, 0); + hipErrcheck(hipMemcpy(hcc.data(), _hcc, sizeof(std::complex) * hcc.size(), hipMemcpyDeviceToHost)); + hipErrcheck(hipMemcpy(scc.data(), _scc, sizeof(std::complex) * scc.size(), hipMemcpyDeviceToHost)); + base_device::DEVICE_CPU* cpu_ctx = {}; + dngvd_op, base_device::DEVICE_CPU>()(cpu_ctx, + nstart, + ldh, + hcc.data(), + scc.data(), + eigenvalue.data(), + vcc.data()); + hipErrcheck(hipMemcpy(_vcc, vcc.data(), sizeof(std::complex) * vcc.size(), hipMemcpyHostToDevice)); + hipErrcheck(hipMemcpy(_eigenvalue, eigenvalue.data(), sizeof(double) * eigenvalue.size(), hipMemcpyHostToDevice)); + } + + + + + + + } #ifdef __LCAO diff --git a/source/module_hsolver/test/test_hsolver_pw.cpp b/source/module_hsolver/test/test_hsolver_pw.cpp index 6b0e8c5a80..92be00fcbd 100644 --- a/source/module_hsolver/test/test_hsolver_pw.cpp +++ b/source/module_hsolver/test/test_hsolver_pw.cpp @@ -14,6 +14,68 @@ #include "module_hsolver/hsolver_pw.h" #undef private #undef protected + +// Mock implementations for the template functions causing linking errors +namespace ModulePW { + // Mock implementation for recip_to_real + template + void PW_Basis_K::recip_to_real(const Device* ctx, + const std::complex* in, + std::complex* out, + const int ik, + const bool add, + const FPTYPE factor) const + { + // Simple mock implementation that does nothing + // In a real test, you might want to implement behavior that simulates the actual function + } + + // Mock implementation for real_to_recip + template + void PW_Basis_K::real_to_recip(const Device* ctx, + const std::complex* in, + std::complex* out, + const int ik, + const bool add, + const FPTYPE factor) const + { + // Simple mock implementation that does nothing + } + + // Explicit template instantiations + template void PW_Basis_K::recip_to_real( + const base_device::DEVICE_CPU* ctx, + const std::complex* in, + std::complex* out, + const int ik, + const bool add, + const float factor) const; + + template void PW_Basis_K::recip_to_real( + const base_device::DEVICE_CPU* ctx, + const std::complex* in, + std::complex* out, + const int ik, + const bool add, + const double factor) const; + + template void PW_Basis_K::real_to_recip( + const base_device::DEVICE_CPU* ctx, + const std::complex* in, + std::complex* out, + const int ik, + const bool add, + const float factor) const; + + template void PW_Basis_K::real_to_recip( + const base_device::DEVICE_CPU* ctx, + const std::complex* in, + std::complex* out, + const int ik, + const bool add, + const double factor) const; +} + /************************************************ * unit test of HSolverPW class ***********************************************/ diff --git a/source/module_hsolver/test/test_hsolver_sdft.cpp b/source/module_hsolver/test/test_hsolver_sdft.cpp index d1f7bbca13..83cbb8f361 100644 --- a/source/module_hsolver/test/test_hsolver_sdft.cpp +++ b/source/module_hsolver/test/test_hsolver_sdft.cpp @@ -186,6 +186,66 @@ template class Stochastic_Iter, base_device::DEVICE_CPU>; Charge::Charge(){}; Charge::~Charge(){}; +// Mock implementations for the template functions causing linking errors +namespace ModulePW { + // Mock implementation for recip_to_real + template + void PW_Basis_K::recip_to_real(const Device* ctx, + const std::complex* in, + std::complex* out, + const int ik, + const bool add, + const FPTYPE factor) const + { + // Simple mock implementation that does nothing + // In a real test, you might want to implement behavior that simulates the actual function + } + + // Mock implementation for real_to_recip + template + void PW_Basis_K::real_to_recip(const Device* ctx, + const std::complex* in, + std::complex* out, + const int ik, + const bool add, + const FPTYPE factor) const + { + // Simple mock implementation that does nothing + } + + // Explicit template instantiations + template void PW_Basis_K::recip_to_real( + const base_device::DEVICE_CPU* ctx, + const std::complex* in, + std::complex* out, + const int ik, + const bool add, + const float factor) const; + + template void PW_Basis_K::recip_to_real( + const base_device::DEVICE_CPU* ctx, + const std::complex* in, + std::complex* out, + const int ik, + const bool add, + const double factor) const; + + template void PW_Basis_K::real_to_recip( + const base_device::DEVICE_CPU* ctx, + const std::complex* in, + std::complex* out, + const int ik, + const bool add, + const float factor) const; + + template void PW_Basis_K::real_to_recip( + const base_device::DEVICE_CPU* ctx, + const std::complex* in, + std::complex* out, + const int ik, + const bool add, + const double factor) const; +} /************************************************ * unit test of HSolverPW_SDFT class ***********************************************/ diff --git a/source/module_io/read_input_item_elec_stru.cpp b/source/module_io/read_input_item_elec_stru.cpp index 1c03f5c9ff..16d08031a9 100644 --- a/source/module_io/read_input_item_elec_stru.cpp +++ b/source/module_io/read_input_item_elec_stru.cpp @@ -335,6 +335,26 @@ void ReadInput::item_elec_stru() read_sync_bool(input.diago_smooth_ethr); this->add_item(item); } + { + Input_Item item("use_k_continuity"); + item.annotation = "whether to use k-point continuity for initializing wave functions"; + read_sync_bool(input.use_k_continuity); + item.check_value = [](const Input_Item& item, const Parameter& para) { + if (para.input.use_k_continuity && para.input.basis_type != "pw") { + ModuleBase::WARNING_QUIT("ReadInput", "use_k_continuity only works for PW basis"); + } + if (para.input.use_k_continuity && para.input.calculation == "nscf") { + ModuleBase::WARNING_QUIT("ReadInput", "use_k_continuity cannot work for NSCF calculation"); + } + if (para.input.use_k_continuity && para.input.nspin == 2) { + ModuleBase::WARNING_QUIT("ReadInput", "use_k_continuity cannot work for spin-polarized calculation"); + } + if (para.input.use_k_continuity && para.input.esolver_type == "sdft") { + ModuleBase::WARNING_QUIT("ReadInput", "use_k_continuity cannot work for SDFT calculation"); + } + }; + this->add_item(item); + } { Input_Item item("pw_diag_ndim"); item.annotation = "dimension of workspace for Davidson diagonalization"; diff --git a/source/module_parameter/input_parameter.h b/source/module_parameter/input_parameter.h index 5c0291b5b5..cbac66ed74 100644 --- a/source/module_parameter/input_parameter.h +++ b/source/module_parameter/input_parameter.h @@ -89,6 +89,7 @@ struct Input_para int pw_diag_ndim = 4; ///< dimension of workspace for Davidson diagonalization int diago_cg_prec = 1; ///< mohan add 2012-03-31 int diag_subspace = 0; // 0: Lapack, 1: elpa, 2: scalapack + bool use_k_continuity = false; ///< whether to use k-point continuity for initializing wave functions std::string smearing_method = "gauss"; ///< "gauss", ///< "mp","methfessel-paxton" diff --git a/source/module_psi/psi_init.cpp b/source/module_psi/psi_init.cpp index d89b8ab477..11c9ecb7b7 100644 --- a/source/module_psi/psi_init.cpp +++ b/source/module_psi/psi_init.cpp @@ -133,6 +133,7 @@ void PSIInit::initialize_psi(Psi>* psi, // like (1, nbands, npwx), in which npwx is the maximal npw of all kpoints for (int ik = 0; ik < this->pw_wfc.nks; ik++) { + if(PARAM.inp.use_k_continuity && ik > 0) continue; //! Fix the wavefunction to initialize at given kpoint psi->fix_k(ik); kspw_psi->fix_k(ik); @@ -147,6 +148,7 @@ void PSIInit::initialize_psi(Psi>* psi, { syncmem_h2d_op()(psi_device->get_pointer(), psi_cpu->get_pointer(), nbands_start * nbasis); } + if (this->ks_solver == "cg") { @@ -268,4 +270,4 @@ template class PSIInit, base_device::DEVICE_CPU>; template class PSIInit, base_device::DEVICE_GPU>; template class PSIInit, base_device::DEVICE_GPU>; #endif -} // namespace psi \ No newline at end of file +} // namespace psi