Skip to content

Feature: implement k continuity initialization strategy & kernel #6171

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 66 additions & 0 deletions source/module_base/kernels/cuda/math_kernel_op_vec.cu
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,19 @@ __global__ void vector_mul_vector_kernel(const int size,
}
}

template <typename T>
__global__ void vector_div_constant_kernel(const int size,
T* result,
const T* vector,
const typename GetTypeReal<T>::type constant)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < size)
{
result[i] = vector[i] / constant;
}
}

template <typename T>
__global__ void vector_div_vector_kernel(const int size,
T* result,
Expand Down Expand Up @@ -127,6 +140,55 @@ void vector_mul_real_op<std::complex<double>, base_device::DEVICE_GPU>::operator
vector_mul_real_wrapper(dim, result, vector, constant);
}

// vector operator: result[i] = vector[i] / constant
template <>
void vector_div_constant_op<double, base_device::DEVICE_GPU>::operator()(const int& dim,
double* result,
const double* vector,
const double constant)
{
// In small cases, 1024 threads per block will only utilize 17 blocks, much less than 40
int thread = thread_per_block;
int block = (dim + thread - 1) / thread;
vector_div_constant_kernel<double><<<block, thread>>>(dim, result, vector, constant);

cudaCheckOnDebug();
}

template <typename FPTYPE>
inline void vector_div_constant_wrapper(const int& dim,
std::complex<FPTYPE>* result,
const std::complex<FPTYPE>* vector,
const FPTYPE constant)
{
thrust::complex<FPTYPE>* result_tmp = reinterpret_cast<thrust::complex<FPTYPE>*>(result);
const thrust::complex<FPTYPE>* vector_tmp = reinterpret_cast<const thrust::complex<FPTYPE>*>(vector);

int thread = thread_per_block;
int block = (dim + thread - 1) / thread;
vector_div_constant_kernel<thrust::complex<FPTYPE>><<<block, thread>>>(dim, result_tmp, vector_tmp, constant);

cudaCheckOnDebug();
}

template <>
void vector_div_constant_op<std::complex<float>, base_device::DEVICE_GPU>::operator()(const int& dim,
std::complex<float>* result,
const std::complex<float>* vector,
const float constant)
{
vector_div_constant_wrapper(dim, result, vector, constant);
}

template <>
void vector_div_constant_op<std::complex<double>, base_device::DEVICE_GPU>::operator()(const int& dim,
std::complex<double>* result,
const std::complex<double>* vector,
const double constant)
{
vector_div_constant_wrapper(dim, result, vector, constant);
}

// vector operator: result[i] = vector1[i](not complex) * vector2[i](not complex)
template <>
void vector_mul_vector_op<double, base_device::DEVICE_GPU>::operator()(const int& dim,
Expand Down Expand Up @@ -306,6 +368,10 @@ template struct vector_mul_real_op<std::complex<float>, base_device::DEVICE_GPU>
template struct vector_mul_real_op<double, base_device::DEVICE_GPU>;
template struct vector_mul_real_op<std::complex<double>, base_device::DEVICE_GPU>;

template struct vector_div_constant_op<std::complex<float>, base_device::DEVICE_GPU>;
template struct vector_div_constant_op<double, base_device::DEVICE_GPU>;
template struct vector_div_constant_op<std::complex<double>, base_device::DEVICE_GPU>;

template struct vector_mul_vector_op<float, base_device::DEVICE_GPU>;
template struct vector_mul_vector_op<std::complex<float>, base_device::DEVICE_GPU>;
template struct vector_mul_vector_op<double, base_device::DEVICE_GPU>;
Expand Down
83 changes: 83 additions & 0 deletions source/module_base/kernels/math_kernel_op.h
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,21 @@ template <typename T, typename Device> struct vector_mul_vector_op {
void operator()(const int& dim, T* result, const T* vector1, const Real* vector2, const bool& add = false);
};

// vector operator: result[i] = vector[i] / constant
template <typename T, typename Device> struct vector_div_constant_op {
using Real = typename GetTypeReal<T>::type;
/// @brief result[i] = vector[i] / constant
///
/// Input Parameters
/// \param dim : array size
/// \param vector : input array
/// \param constant : input constant
///
/// Output Parameters
/// \param result : output array
void operator()(const int& dim, T* result, const T* vector, const Real constant);
};

// vector operator: result[i] = vector1[i](complex) / vector2[i](not complex)
template <typename T, typename Device> struct vector_div_vector_op {
using Real = typename GetTypeReal<T>::type;
Expand Down Expand Up @@ -284,6 +299,48 @@ template <typename T, typename Device> struct matrixCopy {
void operator()(const int& n1, const int& n2, const T* A, const int& LDA, T* B, const int& LDB);
};

template <typename T, typename Device>
struct apply_eigenvalues_op {
using Real = typename GetTypeReal<T>::type;

void operator()(const Device *d, const int &nbase, const int &nbase_x, const int &notconv,
T *result, const T *vectors, const Real *eigenvalues);
};

template <typename T, typename Device>
struct precondition_op {
using Real = typename GetTypeReal<T>::type;
void operator()(const Device* d,
const int& dim,
T* psi_iter,
const int& nbase,
const int& notconv,
const Real* precondition,
const Real* eigenvalues);
};

template <typename T, typename Device>
struct normalize_op {
using Real = typename GetTypeReal<T>::type;
void operator()(const Device* d,
const int& dim,
T* psi_iter,
const int& nbase,
const int& notconv,
Real* psi_norm = nullptr);
};

template <typename T>
struct normalize_op<T, base_device::DEVICE_GPU> {
using Real = typename GetTypeReal<T>::type;
void operator()(const base_device::DEVICE_GPU* d,
const int& dim,
T* psi_iter,
const int& nbase,
const int& notconv,
Real* psi_norm);
};

#if __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM
// Partially specialize functor for base_device::GpuDevice.
template <typename T> struct dot_real_op<T, base_device::DEVICE_GPU> {
Expand All @@ -306,6 +363,12 @@ template <typename T> struct vector_mul_vector_op<T, base_device::DEVICE_GPU> {
void operator()(const int& dim, T* result, const T* vector1, const Real* vector2, const bool& add = false);
};

// vector operator: result[i] = vector[i] / constant
template <typename T> struct vector_div_constant_op<T, base_device::DEVICE_GPU> {
using Real = typename GetTypeReal<T>::type;
void operator()(const int& dim, T* result, const T* vector, const Real constant);
};

// vector operator: result[i] = vector1[i](complex) / vector2[i](not complex)
template <typename T> struct vector_div_vector_op<T, base_device::DEVICE_GPU> {
using Real = typename GetTypeReal<T>::type;
Expand Down Expand Up @@ -334,6 +397,26 @@ template <typename T> struct matrixCopy<T, base_device::DEVICE_GPU> {
void createGpuBlasHandle();
void destoryBLAShandle();

// vector operator: result[i] = -lambda[i] * vector[i]
template <typename T> struct apply_eigenvalues_op<T, base_device::DEVICE_GPU> {
using Real = typename GetTypeReal<T>::type;

void operator()(const base_device::DEVICE_GPU *d, const int &nbase, const int &nbase_x, const int &notconv,
T *result, const T *vectors, const Real *eigenvalues);
};

template <typename T>
struct precondition_op<T, base_device::DEVICE_GPU> {
using Real = typename GetTypeReal<T>::type;
void operator()(const base_device::DEVICE_GPU* d,
const int& dim,
T* psi_iter,
const int& nbase,
const int& notconv,
const Real* precondition,
const Real* eigenvalues);
};

#endif // __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM
} // namespace hsolver

Expand Down
20 changes: 20 additions & 0 deletions source/module_base/kernels/math_kernel_op_vec.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,22 @@ struct vector_mul_vector_op<T, base_device::DEVICE_CPU>
}
};

template <typename T>
struct vector_div_constant_op<T, base_device::DEVICE_CPU>
{
using Real = typename GetTypeReal<T>::type;
void operator()(const int& dim, T* result, const T* vector, const Real constant)
{
#ifdef _OPENMP
#pragma omp parallel for schedule(static, 4096 / sizeof(Real))
#endif
for (int i = 0; i < dim; i++)
{
result[i] = vector[i] / constant;
}
}
};

template <typename T>
struct vector_div_vector_op<T, base_device::DEVICE_CPU>
{
Expand Down Expand Up @@ -159,6 +175,10 @@ template struct vector_mul_vector_op<std::complex<float>, base_device::DEVICE_CP
template struct vector_mul_vector_op<double, base_device::DEVICE_CPU>;
template struct vector_mul_vector_op<std::complex<double>, base_device::DEVICE_CPU>;

template struct vector_div_constant_op<std::complex<float>, base_device::DEVICE_CPU>;
template struct vector_div_constant_op<double, base_device::DEVICE_CPU>;
template struct vector_div_constant_op<std::complex<double>, base_device::DEVICE_CPU>;

template struct vector_div_vector_op<std::complex<float>, base_device::DEVICE_CPU>;
template struct vector_div_vector_op<double, base_device::DEVICE_CPU>;
template struct vector_div_vector_op<std::complex<double>, base_device::DEVICE_CPU>;
Expand Down
80 changes: 80 additions & 0 deletions source/module_base/kernels/rocm/math_kernel_op_vec.hip.cu
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,19 @@ __launch_bounds__(1024) __global__ void vector_mul_vector_kernel(const int size,
}
}

template <typename T>
__launch_bounds__(1024) __global__ void vector_div_constant_kernel(const int size,
T* result,
const T* vector,
const typename GetTypeReal<T>::type constant)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < size)
{
result[i] = vector[i] / constant;
}
}

template <typename T>
__launch_bounds__(1024) __global__ void vector_div_vector_kernel(const int size,
T* result,
Expand Down Expand Up @@ -142,6 +155,69 @@ void vector_mul_real_op<std::complex<double>, base_device::DEVICE_GPU>::operator
hipCheckOnDebug();
}

// vector operator: result[i] = vector[i] / constant
template <>
void vector_div_constant_op<double, base_device::DEVICE_GPU>::operator()(const int& dim,
double* result,
const double* vector,
const double constant)
{
int thread = 1024;
int block = (dim + thread - 1) / thread;
hipLaunchKernelGGL(HIP_KERNEL_NAME(vector_div_constant_kernel<double>),
dim3(block),
dim3(thread),
0,
0,
dim,
result,
vector,
constant);

hipCheckOnDebug();
}

template <typename FPTYPE>
inline void vector_div_constant_wrapper(const int& dim,
std::complex<FPTYPE>* result,
const std::complex<FPTYPE>* vector,
const FPTYPE constant)
{
thrust::complex<FPTYPE>* result_tmp = reinterpret_cast<thrust::complex<FPTYPE>*>(result);
const thrust::complex<FPTYPE>* vector_tmp = reinterpret_cast<const thrust::complex<FPTYPE>*>(vector);
int thread = 1024;
int block = (dim + thread - 1) / thread;
hipLaunchKernelGGL(HIP_KERNEL_NAME(vector_div_constant_kernel<thrust::complex<FPTYPE>>),
dim3(block),
dim3(thread),
0,
0,
dim,
result_tmp,
vector_tmp,
constant);

hipCheckOnDebug();
}

template <>
void vector_div_constant_op<std::complex<float>, base_device::DEVICE_GPU>::operator()(const int& dim,
std::complex<float>* result,
const std::complex<float>* vector,
const float constant)
{
vector_div_constant_wrapper(dim, result, vector, constant);
}

template <>
void vector_div_constant_op<std::complex<double>, base_device::DEVICE_GPU>::operator()(const int& dim,
std::complex<double>* result,
const std::complex<double>* vector,
const double constant)
{
vector_div_constant_wrapper(dim, result, vector, constant);
}

// vector operator: result[i] = vector1[i](not complex) * vector2[i](not complex)
template <>
void vector_mul_vector_op<double, base_device::DEVICE_GPU>::operator()(const int& dim,
Expand Down Expand Up @@ -357,6 +433,10 @@ template struct vector_mul_real_op<std::complex<float>, base_device::DEVICE_GPU>
template struct vector_mul_real_op<double, base_device::DEVICE_GPU>;
template struct vector_mul_real_op<std::complex<double>, base_device::DEVICE_GPU>;

template struct vector_div_constant_op<std::complex<float>, base_device::DEVICE_GPU>;
template struct vector_div_constant_op<double, base_device::DEVICE_GPU>;
template struct vector_div_constant_op<std::complex<double>, base_device::DEVICE_GPU>;

template struct vector_mul_vector_op<float, base_device::DEVICE_GPU>;
template struct vector_mul_vector_op<std::complex<float>, base_device::DEVICE_GPU>;
template struct vector_mul_vector_op<double, base_device::DEVICE_GPU>;
Expand Down
9 changes: 7 additions & 2 deletions source/module_base/module_device/device.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

#include <base/macros/macros.h>
#include <cstring>

#include <iostream>
#ifdef __MPI
#include "mpi.h"
#endif
Expand Down Expand Up @@ -166,6 +166,11 @@ int device_count = -1;
cudaGetDeviceCount(&device_count);
#elif defined(__ROCM)
hipGetDeviceCount(&device_count);
/***auto start_time = std::chrono::high_resolution_clock::now();
std::cout << "Starting hipGetDeviceCount.." << std::endl;
auto end_time = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::duration<double>>(end_time - start_time);
std::cout << "hipGetDeviceCount took " << duration.count() << "seconds" << std::endl;***/
#endif
if (device_count <= 0)
{
Expand Down Expand Up @@ -711,4 +716,4 @@ void record_device_memory<base_device::DEVICE_GPU>(
#endif

} // end of namespace information
} // end of namespace base_device
} // end of namespace base_device
6 changes: 4 additions & 2 deletions source/module_esolver/esolver.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -114,8 +114,10 @@ std::string determine_type()
}

GlobalV::ofs_running << "\n RUNNING WITH DEVICE : " << device_info << " / "
<< base_device::information::get_device_info(PARAM.inp.device) << std::endl;

<< base_device::information::get_device_info(PARAM.inp.device) << std::endl;
/***auto end_time = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::duration<double>>(end_time - start_time);
std::cout << "hipGetDeviceInfo took " << duration.count() << " seconds" << std::endl;***/
return esolver_type;
}

Expand Down
3 changes: 2 additions & 1 deletion source/module_esolver/esolver_ks_pw.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -572,7 +572,8 @@ void ESolver_KS_PW<T, Device>::hamilt2rho_single(UnitCell& ucell,
hsolver::DiagoIterAssist<T, Device>::SCF_ITER,
hsolver::DiagoIterAssist<T, Device>::PW_DIAG_NMAX,
hsolver::DiagoIterAssist<T, Device>::PW_DIAG_THR,
hsolver::DiagoIterAssist<T, Device>::need_subspace);
hsolver::DiagoIterAssist<T, Device>::need_subspace,
PARAM.inp.use_k_continuity);

hsolver_pw_obj.solve(this->p_hamilt,
this->kspw_psi[0],
Expand Down
Loading
Loading