#include "PoissonSolver3DGPU.h" #include #include // GPU constant variables __device__ __constant__ int d_coef_StartPos; __device__ __constant__ int d_grid_StartPos; __device__ __constant__ float d_h2; __device__ __constant__ float d_ih2; __device__ __constant__ float d_tempRatioZ; /* GPU kernels start */ __global__ void relaxationGaussSeidelRed ( float *VPotential, float *RhoChargeDensity, const int RRow, const int ZColumn, const int PhiSlice, float *coef1, float *coef2, float *coef3, float *coef4 ) { int index_x, index_y, index, index_left, index_right, index_up, index_down, index_front, index_back, index_coef; index_x = blockIdx.x * blockDim.x + threadIdx.x; index_y = blockIdx.y * blockDim.y + threadIdx.y; index = d_grid_StartPos + blockIdx.z * RRow * ZColumn + index_y * ZColumn + index_x; index_left = d_grid_StartPos + blockIdx.z * RRow * ZColumn + index_y * ZColumn + (index_x - 1); index_right = d_grid_StartPos + blockIdx.z * RRow * ZColumn + index_y * ZColumn + (index_x + 1); index_up = d_grid_StartPos + blockIdx.z * RRow * ZColumn + (index_y - 1) * ZColumn + index_x; index_down = d_grid_StartPos + blockIdx.z * RRow * ZColumn + (index_y + 1) * ZColumn + index_x; index_front = d_grid_StartPos + ((blockIdx.z - 1 + PhiSlice) % PhiSlice) * RRow * ZColumn + index_y * ZColumn + index_x; index_back = d_grid_StartPos + ((blockIdx.z + 1) % PhiSlice) * RRow * ZColumn + index_y * ZColumn + index_x; index_coef = d_coef_StartPos + index_y; if (index_x != 0 && index_x < (ZColumn - 1) && index_y != 0 && index_y < (RRow - 1)) { //calculate red if ((blockIdx.z % 2 == 0 && (index_x + index_y) % 2 == 0) || (blockIdx.z % 2 != 0 && (index_x + index_y) % 2 != 0)) { VPotential[index] = (coef2[index_coef] * VPotential[index_up] + coef1[index_coef] * VPotential[index_down] + d_tempRatioZ * (VPotential[index_left] + VPotential[index_right]) + coef3[index_coef] * (VPotential[index_front] + VPotential[index_back]) + d_h2 * RhoChargeDensity[index]) * coef4[index_coef]; } } } __global__ void relaxationGaussSeidelBlack ( float *VPotential, float *RhoChargeDensity, const int RRow, const int ZColumn, const int PhiSlice, float *coef1, float *coef2, float *coef3, float *coef4 ) { int index_x, index_y, index, index_left, index_right, index_up, index_down, index_front, index_back, index_coef; index_x = blockIdx.x * blockDim.x + threadIdx.x; index_y = blockIdx.y * blockDim.y + threadIdx.y; index = d_grid_StartPos + blockIdx.z * RRow * ZColumn + index_y * ZColumn + index_x; index_left = d_grid_StartPos + blockIdx.z * RRow * ZColumn + index_y * ZColumn + (index_x - 1); index_right = d_grid_StartPos + blockIdx.z * RRow * ZColumn + index_y * ZColumn + (index_x + 1); index_up = d_grid_StartPos + blockIdx.z * RRow * ZColumn + (index_y - 1) * ZColumn + index_x; index_down = d_grid_StartPos + blockIdx.z * RRow * ZColumn + (index_y + 1) * ZColumn + index_x; index_front = d_grid_StartPos + ((blockIdx.z - 1 + PhiSlice) % PhiSlice) * RRow * ZColumn + index_y * ZColumn + index_x; index_back = d_grid_StartPos + ((blockIdx.z + 1) % PhiSlice) * RRow * ZColumn + index_y * ZColumn + index_x; index_coef = d_coef_StartPos + index_y; if (index_x != 0 && index_x < (ZColumn - 1) && index_y != 0 && index_y < (RRow - 1)) { //calculate black if ((blockIdx.z % 2 == 0 && (index_x + index_y) % 2 != 0) || (blockIdx.z % 2 != 0 && (index_x + index_y) % 2 == 0)) { VPotential[index] = (coef2[index_coef] * VPotential[index_up] + coef1[index_coef] * VPotential[index_down] + d_tempRatioZ * (VPotential[index_left] + VPotential[index_right]) + 
coef3[index_coef] * (VPotential[index_front] + VPotential[index_back]) + d_h2 * RhoChargeDensity[index]) * coef4[index_coef]; } } } __global__ void residueCalculation ( float *VPotential, float *RhoChargeDensity, float *DeltaResidue, const int RRow, const int ZColumn, const int PhiSlice, float *coef1, float *coef2, float *coef3, float *icoef4 ) { int index_x, index_y, index, index_left, index_right, index_up, index_down, index_front, index_back, index_coef; index_x = blockIdx.x * blockDim.x + threadIdx.x; index_y = blockIdx.y * blockDim.y + threadIdx.y; index = d_grid_StartPos + blockIdx.z * RRow * ZColumn + index_y * ZColumn + index_x; index_left = d_grid_StartPos + blockIdx.z * RRow * ZColumn + index_y * ZColumn + (index_x - 1); index_right = d_grid_StartPos + blockIdx.z * RRow * ZColumn + index_y * ZColumn + (index_x + 1); index_up = d_grid_StartPos + blockIdx.z * RRow * ZColumn + (index_y - 1) * ZColumn + index_x; index_down = d_grid_StartPos + blockIdx.z * RRow * ZColumn + (index_y + 1) * ZColumn + index_x; index_front = d_grid_StartPos + ((blockIdx.z - 1 + PhiSlice) % PhiSlice) * RRow * ZColumn + index_y * ZColumn + index_x; index_back = d_grid_StartPos + ((blockIdx.z + 1) % PhiSlice) * RRow * ZColumn + index_y * ZColumn + index_x; index_coef = d_coef_StartPos + index_y; if (index_x != 0 && index_x < (ZColumn - 1) && index_y != 0 && index_y < (RRow - 1)) { DeltaResidue[index] = d_ih2 * (coef2[index_coef] * VPotential[index_up] + coef1[index_coef] * VPotential[index_down] + d_tempRatioZ * (VPotential[index_left] + VPotential[index_right]) + coef3[index_coef] * (VPotential[index_front] + VPotential[index_back]) - icoef4[index_coef] * VPotential[index]) + RhoChargeDensity[index]; } } __global__ void restriction2DHalf ( float *RhoChargeDensity, float *DeltaResidue, const int RRow, const int ZColumn, const int PhiSlice ) { int index_x, index_y, index; int finer_RRow, finer_ZColumn, finer_grid_StartPos; int finer_index_x, finer_index_y, finer_index, finer_index_left, finer_index_right, finer_index_up, finer_index_down; index_x = blockIdx.x * blockDim.x + threadIdx.x; index_y = blockIdx.y * blockDim.y + threadIdx.y; index = d_grid_StartPos + blockIdx.z * RRow * ZColumn + index_y * ZColumn + index_x; finer_RRow = 2 * RRow - 1; finer_ZColumn = 2 * ZColumn - 1; finer_grid_StartPos = d_grid_StartPos - (finer_RRow * finer_ZColumn * PhiSlice); finer_index_x = index_x * 2; finer_index_y = index_y * 2; finer_index = finer_grid_StartPos + blockIdx.z * finer_RRow * finer_ZColumn + finer_index_y * finer_ZColumn + finer_index_x; finer_index_left = finer_grid_StartPos + blockIdx.z * finer_RRow * finer_ZColumn + finer_index_y * finer_ZColumn + (finer_index_x - 1); finer_index_right = finer_grid_StartPos + blockIdx.z * finer_RRow * finer_ZColumn + finer_index_y * finer_ZColumn + (finer_index_x + 1); finer_index_up = finer_grid_StartPos + blockIdx.z * finer_RRow * finer_ZColumn + (finer_index_y - 1) * finer_ZColumn + finer_index_x; finer_index_down = finer_grid_StartPos + blockIdx.z * finer_RRow * finer_ZColumn + (finer_index_y + 1) * finer_ZColumn + finer_index_x; if (index_x != 0 && index_x < (ZColumn - 1) && index_y != 0 && index_y < (RRow - 1)) { RhoChargeDensity[index] = 0.5 * DeltaResidue[finer_index] + 0.125 * (DeltaResidue[finer_index_left] + DeltaResidue[finer_index_right] + DeltaResidue[finer_index_up] + DeltaResidue[finer_index_down]); } } __global__ void restriction2DFull ( float *RhoChargeDensity, float *DeltaResidue, const int RRow, const int ZColumn, const int PhiSlice ) { int index_x, 
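/*
  Restriction stencils: restriction2DHalf above is half weighting in the (r,z)
  plane (1/2 on the centre node, 1/8 on each of the four edge neighbours);
  restriction2DFull below also takes the four diagonal neighbours with weight
  1/16 and reduces the centre to 1/4 (full weighting). Both read the finer
  grid at (2r, 2z) of the coarse node and leave phi uncoarsened, which is the
  semi-coarsening in this solver's name.
*/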
// Full-weighting restriction of the residue onto the next coarser (r,z) grid.
__global__ void restriction2DFull ( float *RhoChargeDensity, float *DeltaResidue, const int RRow, const int ZColumn, const int PhiSlice )
{
  int index_x, index_y, index;
  int finer_RRow, finer_ZColumn, finer_grid_StartPos;
  int finer_index_x, finer_index_y, finer_index, finer_index_left, finer_index_right, finer_index_up, finer_index_down;
  int finer_index_up_left, finer_index_up_right, finer_index_down_left, finer_index_down_right;

  index_x = blockIdx.x * blockDim.x + threadIdx.x;
  index_y = blockIdx.y * blockDim.y + threadIdx.y;
  index = d_grid_StartPos + blockIdx.z * RRow * ZColumn + index_y * ZColumn + index_x;

  finer_RRow = 2 * RRow - 1;
  finer_ZColumn = 2 * ZColumn - 1;
  finer_grid_StartPos = d_grid_StartPos - (finer_RRow * finer_ZColumn * PhiSlice);

  finer_index_x = index_x * 2;
  finer_index_y = index_y * 2;
  finer_index = finer_grid_StartPos + blockIdx.z * finer_RRow * finer_ZColumn + finer_index_y * finer_ZColumn + finer_index_x;
  finer_index_left = finer_grid_StartPos + blockIdx.z * finer_RRow * finer_ZColumn + finer_index_y * finer_ZColumn + (finer_index_x - 1);
  finer_index_right = finer_grid_StartPos + blockIdx.z * finer_RRow * finer_ZColumn + finer_index_y * finer_ZColumn + (finer_index_x + 1);
  finer_index_up = finer_grid_StartPos + blockIdx.z * finer_RRow * finer_ZColumn + (finer_index_y - 1) * finer_ZColumn + finer_index_x;
  finer_index_down = finer_grid_StartPos + blockIdx.z * finer_RRow * finer_ZColumn + (finer_index_y + 1) * finer_ZColumn + finer_index_x;
  finer_index_up_left = finer_grid_StartPos + blockIdx.z * finer_RRow * finer_ZColumn + (finer_index_y - 1) * finer_ZColumn + (finer_index_x - 1);
  finer_index_up_right = finer_grid_StartPos + blockIdx.z * finer_RRow * finer_ZColumn + (finer_index_y - 1) * finer_ZColumn + (finer_index_x + 1);
  finer_index_down_left = finer_grid_StartPos + blockIdx.z * finer_RRow * finer_ZColumn + (finer_index_y + 1) * finer_ZColumn + (finer_index_x - 1);
  finer_index_down_right = finer_grid_StartPos + blockIdx.z * finer_RRow * finer_ZColumn + (finer_index_y + 1) * finer_ZColumn + (finer_index_x + 1);

  if (index_x != 0 && index_x < (ZColumn - 1) && index_y != 0 && index_y < (RRow - 1)) {
    RhoChargeDensity[index] = 0.25 * DeltaResidue[finer_index] +
                              0.125 * (DeltaResidue[finer_index_left] + DeltaResidue[finer_index_right] + DeltaResidue[finer_index_up] + DeltaResidue[finer_index_down]) +
                              0.0625 * (DeltaResidue[finer_index_up_left] + DeltaResidue[finer_index_up_right] + DeltaResidue[finer_index_down_left] + DeltaResidue[finer_index_down_right]);
  } else {
    // boundary nodes are injected straight from the finer grid
    RhoChargeDensity[index] = DeltaResidue[finer_index];
  }
}

// Clear the potential of one multigrid level before coarse-grid smoothing.
__global__ void zeroingVPotential ( float *VPotential, const int RRow, const int ZColumn, const int PhiSlice )
{
  int index_x, index_y, index;

  index_x = blockIdx.x * blockDim.x + threadIdx.x;
  index_y = blockIdx.y * blockDim.y + threadIdx.y;
  index = d_grid_StartPos + blockIdx.z * RRow * ZColumn + index_y * ZColumn + index_x;

  if (index_x != 0 && index_x < (ZColumn - 1) && index_y != 0 && index_y < (RRow - 1)) {
    // zeroing V
    VPotential[index] = 0;
  }
  // the interior guard stops at ZColumn - 2, so that thread also clears the last z column
  if (index_x == ZColumn - 2) {
    index_x++;
    index = d_grid_StartPos + blockIdx.z * RRow * ZColumn + index_y * ZColumn + index_x;
    VPotential[index] = 0;
  }
}
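/*
  On the coarse levels the solver works on the error equation, whose Dirichlet
  boundary values are identically zero, so after each restriction the V-cycle
  driver below clears the boundary nodes of the coarse potential with the two
  kernels that follow: one for the top/bottom rows, one for the left/right
  columns of every phi slice.
*/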
// Zero the first and last r rows of every phi slice.
__global__ void zeroingBoundaryTopBottom ( float *VPotential, int RRow, int ZColumn, int PhiSlice )
{
  int index_x, index_top, index_bottom;

  index_x = blockIdx.x * blockDim.x + threadIdx.x;
  index_top = d_grid_StartPos + blockIdx.z * RRow * ZColumn + 0 * ZColumn + index_x;
  index_bottom = d_grid_StartPos + blockIdx.z * RRow * ZColumn + (RRow - 1) * ZColumn + index_x;

  // a top/bottom edge spans ZColumn nodes
  if (index_x < ZColumn) {
    VPotential[index_top] = 0.0;
    VPotential[index_bottom] = 0.0;
  }
}

// Zero the first and last z columns of every phi slice.
__global__ void zeroingBoundaryLeftRight ( float *VPotential, int RRow, int ZColumn, int PhiSlice )
{
  int index_y, index_left, index_right;

  index_y = blockIdx.x * blockDim.x + threadIdx.x;
  index_left = d_grid_StartPos + blockIdx.z * RRow * ZColumn + index_y * ZColumn + 0;
  index_right = d_grid_StartPos + blockIdx.z * RRow * ZColumn + index_y * ZColumn + (ZColumn - 1);

  // a left/right edge spans RRow nodes
  if (index_y < RRow) {
    VPotential[index_left] = 0.0;
    VPotential[index_right] = 0.0;
  }
}

// Bilinear prolongation in the (r,z) plane: add the interpolated coarse-grid correction.
__global__ void prolongation2DHalf ( float *VPotential, const int RRow, const int ZColumn, const int PhiSlice )
{
  int index_x, index_y, index;
  int coarser_RRow = (RRow >> 1) + 1;
  int coarser_ZColumn = (ZColumn >> 1) + 1;
  // the coarser level is stored immediately after this one in the packed arrays
  int coarser_grid_StartPos = d_grid_StartPos + RRow * ZColumn * PhiSlice;
  int coarser_index_self;
  int coarser_index_up, coarser_index_down, coarser_index_left, coarser_index_right;
  int coarser_index_up_left, coarser_index_up_right, coarser_index_down_left, coarser_index_down_right;

  index_x = blockIdx.x * blockDim.x + threadIdx.x;
  index_y = blockIdx.y * blockDim.y + threadIdx.y;
  index = d_grid_StartPos + blockIdx.z * RRow * ZColumn + index_y * ZColumn + index_x;

  if (index_x != 0 && index_x < (ZColumn - 1) && index_y != 0 && index_y < (RRow - 1)) {
    // x odd, y odd: average of the four surrounding coarse nodes
    if ((index_x % 2 != 0) && (index_y % 2 != 0)) {
      coarser_index_up_left = coarser_grid_StartPos + blockIdx.z * coarser_RRow * coarser_ZColumn + (index_y / 2) * coarser_ZColumn + (index_x / 2);
      coarser_index_up_right = coarser_grid_StartPos + blockIdx.z * coarser_RRow * coarser_ZColumn + (index_y / 2) * coarser_ZColumn + (index_x / 2 + 1);
      coarser_index_down_left = coarser_grid_StartPos + blockIdx.z * coarser_RRow * coarser_ZColumn + (index_y / 2 + 1) * coarser_ZColumn + (index_x / 2);
      coarser_index_down_right = coarser_grid_StartPos + blockIdx.z * coarser_RRow * coarser_ZColumn + (index_y / 2 + 1) * coarser_ZColumn + (index_x / 2 + 1);
      VPotential[index] += 0.25 * (VPotential[coarser_index_up_left] + VPotential[coarser_index_up_right] + VPotential[coarser_index_down_left] + VPotential[coarser_index_down_right]);
    }
    // x even, y odd: average of the two coarse nodes above and below
    else if ((index_x % 2 == 0) && (index_y % 2 != 0)) {
      coarser_index_up = coarser_grid_StartPos + blockIdx.z * coarser_RRow * coarser_ZColumn + (index_y / 2) * coarser_ZColumn + (index_x / 2);
      coarser_index_down = coarser_grid_StartPos + blockIdx.z * coarser_RRow * coarser_ZColumn + (index_y / 2 + 1) * coarser_ZColumn + (index_x / 2);
      VPotential[index] += 0.5 * (VPotential[coarser_index_up] + VPotential[coarser_index_down]);
    }
    // x odd, y even: average of the two coarse nodes left and right
    else if ((index_x % 2 != 0) && (index_y % 2 == 0)) {
      coarser_index_left = coarser_grid_StartPos + blockIdx.z * coarser_RRow * coarser_ZColumn + (index_y / 2) * coarser_ZColumn + (index_x / 2);
      coarser_index_right = coarser_grid_StartPos + blockIdx.z * coarser_RRow * coarser_ZColumn + (index_y / 2) * coarser_ZColumn + (index_x / 2 + 1);
      VPotential[index] += 0.5 * (VPotential[coarser_index_left] + VPotential[coarser_index_right]);
    }
    // x even, y even: coincident coarse node
    else {
      coarser_index_self = coarser_grid_StartPos + blockIdx.z * coarser_RRow * coarser_ZColumn + (index_y / 2) * coarser_ZColumn + (index_x / 2);
      VPotential[index] += VPotential[coarser_index_self];
    }
  }
}
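/*
  prolongation2DHalf above adds the interpolated coarse-grid correction onto
  the finer grid (the coarse-grid correction step of a V- or W-cycle).
  prolongation2DHalfNoAdd below overwrites instead of accumulating, which is
  what an F-cycle needs when it promotes a coarse solution to serve as the
  initial guess on the next finer grid.
*/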
// Bilinear prolongation that overwrites the finer grid instead of adding to it.
__global__ void prolongation2DHalfNoAdd ( float *VPotential, const int RRow, const int ZColumn, const int PhiSlice )
{
  int index_x, index_y, index;
  int coarser_RRow = (RRow >> 1) + 1;
  int coarser_ZColumn = (ZColumn >> 1) + 1;
  int coarser_grid_StartPos = d_grid_StartPos + RRow * ZColumn * PhiSlice;
  int coarser_index_self;
  int coarser_index_up, coarser_index_down, coarser_index_left, coarser_index_right;
  int coarser_index_up_left, coarser_index_up_right, coarser_index_down_left, coarser_index_down_right;

  index_x = blockIdx.x * blockDim.x + threadIdx.x;
  index_y = blockIdx.y * blockDim.y + threadIdx.y;
  index = d_grid_StartPos + blockIdx.z * RRow * ZColumn + index_y * ZColumn + index_x;

  if (index_x != 0 && index_x < (ZColumn - 1) && index_y != 0 && index_y < (RRow - 1)) {
    // x odd, y odd
    if ((index_x % 2 != 0) && (index_y % 2 != 0)) {
      coarser_index_up_left = coarser_grid_StartPos + blockIdx.z * coarser_RRow * coarser_ZColumn + (index_y / 2) * coarser_ZColumn + (index_x / 2);
      coarser_index_up_right = coarser_grid_StartPos + blockIdx.z * coarser_RRow * coarser_ZColumn + (index_y / 2) * coarser_ZColumn + (index_x / 2 + 1);
      coarser_index_down_left = coarser_grid_StartPos + blockIdx.z * coarser_RRow * coarser_ZColumn + (index_y / 2 + 1) * coarser_ZColumn + (index_x / 2);
      coarser_index_down_right = coarser_grid_StartPos + blockIdx.z * coarser_RRow * coarser_ZColumn + (index_y / 2 + 1) * coarser_ZColumn + (index_x / 2 + 1);
      VPotential[index] = 0.25 * (VPotential[coarser_index_up_left] + VPotential[coarser_index_up_right] + VPotential[coarser_index_down_left] + VPotential[coarser_index_down_right]);
    }
    // x even, y odd
    else if ((index_x % 2 == 0) && (index_y % 2 != 0)) {
      coarser_index_up = coarser_grid_StartPos + blockIdx.z * coarser_RRow * coarser_ZColumn + (index_y / 2) * coarser_ZColumn + (index_x / 2);
      coarser_index_down = coarser_grid_StartPos + blockIdx.z * coarser_RRow * coarser_ZColumn + (index_y / 2 + 1) * coarser_ZColumn + (index_x / 2);
      VPotential[index] = 0.5 * (VPotential[coarser_index_up] + VPotential[coarser_index_down]);
    }
    // x odd, y even
    else if ((index_x % 2 != 0) && (index_y % 2 == 0)) {
      coarser_index_left = coarser_grid_StartPos + blockIdx.z * coarser_RRow * coarser_ZColumn + (index_y / 2) * coarser_ZColumn + (index_x / 2);
      coarser_index_right = coarser_grid_StartPos + blockIdx.z * coarser_RRow * coarser_ZColumn + (index_y / 2) * coarser_ZColumn + (index_x / 2 + 1);
      VPotential[index] = 0.5 * (VPotential[coarser_index_left] + VPotential[coarser_index_right]);
    }
    // x even, y even
    else {
      coarser_index_self = coarser_grid_StartPos + blockIdx.z * coarser_RRow * coarser_ZColumn + (index_y / 2) * coarser_ZColumn + (index_x / 2);
      VPotential[index] = VPotential[coarser_index_self];
    }
  }
}

// Accumulate the squared change of the finest-grid potential into one global scalar.
__global__ void errorCalculation ( float *VPotentialPrev, float *VPotential, float *EpsilonError, const int RRow, const int ZColumn, const int PhiSlice )
{
  int index_x, index_y, index;
  float error;
  float sum_error;

  index_x = blockIdx.x * blockDim.x + threadIdx.x;
  index_y = blockIdx.y * blockDim.y + threadIdx.y;
  // finest grid only, so no d_grid_StartPos offset
  index = blockIdx.z * RRow * ZColumn + index_y * ZColumn + index_x;

  if (index_x != 0 && index_x < (ZColumn - 1) && index_y != 0 && index_y < (RRow - 1)) {
    error = VPotential[index] - VPotentialPrev[index];
    sum_error = error * error;
    atomicAdd( EpsilonError, sum_error );
  }
}
/* GPU kernels end */

/* Error related functions start */
// The loop bodies of these two helpers are reconstructed from their call
// sites (an assumption): a weighted, normalised L2 norm of the difference of
// two fields, and the maximum absolute value of a field.
float GetErrorNorm2 ( float *VPotential, float *VPotentialPrev, const int rows, const int cols, float weight )
{
  float error = 0.0;
  float sum_error = 0.0;
  for (int i = 0; i < rows * cols; i++) {
    error = (VPotential[i] - VPotentialPrev[i]) / weight;
    sum_error += error * error;
  }
  return sum_error / (rows * cols);
}

float GetAbsMax ( float *VPotentialExact, const int n )
{
  float mymax = 0.0;
  for (int i = 0; i < n; i++) {
    if (fabs(VPotentialExact[i]) > mymax) mymax = fabs(VPotentialExact[i]);
  }
  return mymax;
}
/* Error related functions end */
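/*
  For the F-cycle the Dirichlet boundary values themselves have to be carried
  down to the coarse grids (unlike in the V-cycle error equation, they are not
  zero there). Restrict_Boundary below does this on the CPU by straight
  injection: every second fine boundary node is copied to the coarse boundary.
*/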
/* Restrict Boundary for FCycle start -- a CPU implementation is sufficient here */
void Restrict_Boundary ( float *VPotential, const int RRow, const int ZColumn, const int PhiSlice, const int Offset )
{
  int i, ii, j, jj;
  int finer_RRow = 2 * RRow - 1;
  int finer_ZColumn = 2 * ZColumn - 1;
  int finer_Offset = Offset - (finer_RRow * finer_ZColumn * PhiSlice);
  int sliceStart;
  int finer_SliceStart;

  //printf("(%d,%d,%d) -> (%d,%d,%d)\n", RRow, ZColumn, Offset, finer_RRow, finer_ZColumn, finer_Offset);

  // do for each slice
  for (int m = 0; m < PhiSlice; m++) {
    sliceStart = m * (RRow * ZColumn);
    finer_SliceStart = m * (finer_RRow * finer_ZColumn);

    // copy top and bottom boundary rows (injection, stride 2 on the finer grid)
    for (j = 0, jj = 0; j < ZColumn; j++, jj += 2) {
      VPotential[Offset + sliceStart + (0 * ZColumn) + j] = VPotential[finer_Offset + finer_SliceStart + (0 * finer_ZColumn) + jj];
      VPotential[Offset + sliceStart + ((RRow - 1) * ZColumn) + j] = VPotential[finer_Offset + finer_SliceStart + ((finer_RRow - 1) * finer_ZColumn) + jj];
    }
    // copy left and right boundary columns
    for (i = 0, ii = 0; i < RRow; i++, ii += 2) {
      VPotential[Offset + sliceStart + (i * ZColumn)] = VPotential[finer_Offset + finer_SliceStart + (ii * finer_ZColumn)];
      VPotential[Offset + sliceStart + (i * ZColumn) + (ZColumn - 1)] = VPotential[finer_Offset + finer_SliceStart + (ii * finer_ZColumn) + (finer_ZColumn - 1)];
    }
  }

  /** disabled alternative: full-weighting restriction of the boundary
  // boundary in top and down
  for ( j = 1, jj = 2; j < ZColumn - 1; j++, jj += 2) {
    VPotential[Offset + sliceStart + (0 * ZColumn) + j] = 0.5 * VPotential[finer_Offset + finer_SliceStart + (0 * finer_ZColumn) + jj] + 0.25 * VPotential[finer_Offset + finer_SliceStart + (0 * finer_ZColumn) + jj - 1] + 0.25 * VPotential[finer_Offset + finer_SliceStart + (0 * finer_ZColumn) + jj + 1];
    VPotential[Offset + sliceStart + ((RRow - 1) * ZColumn) + j] = 0.5 * VPotential[finer_Offset + finer_SliceStart + ((finer_RRow - 1) * finer_ZColumn) + jj] + 0.25 * VPotential[finer_Offset + finer_SliceStart + ((finer_RRow - 1) * finer_ZColumn) + jj - 1] + 0.25 * VPotential[finer_Offset + finer_SliceStart + ((finer_RRow - 1) * finer_ZColumn) + jj + 1];
  }
  // boundary in left and right
  for ( i = 1, ii = 2; i < RRow - 1; i++, ii += 2) {
    VPotential[Offset + sliceStart + (i * ZColumn)] = 0.5 * VPotential[finer_Offset + finer_SliceStart + (ii * finer_ZColumn)] + 0.25 * VPotential[finer_Offset + finer_SliceStart + ((ii - 1) * finer_ZColumn)] + 0.25 * VPotential[finer_Offset + finer_SliceStart + ((ii + 1) * finer_ZColumn)];
    VPotential[Offset + sliceStart + (i * ZColumn) + (ZColumn - 1)] = 0.5 * VPotential[finer_Offset + finer_SliceStart + (ii * finer_ZColumn) + jj + (finer_ZColumn - 1)] + 0.25 * VPotential[finer_Offset + finer_SliceStart + ((ii - 1) * finer_ZColumn) + (finer_ZColumn - 1)] + 0.25 * VPotential[finer_Offset + finer_SliceStart + ((ii + 1) * finer_ZColumn) + (finer_ZColumn - 1)];
  }
  // top left (0,0)
  VPotential[Offset + sliceStart + (0 * ZColumn) + 0] = 0.5 * VPotential[finer_Offset + finer_SliceStart] + 0.25 * VPotential[finer_Offset + finer_SliceStart + (0 * finer_ZColumn) + 1] + 0.25 * VPotential[finer_Offset + finer_SliceStart + (1 * finer_ZColumn)];
  // top right
  VPotential[Offset + sliceStart + (0 * ZColumn) + (ZColumn - 1)] = 0.5 * VPotential[finer_Offset + finer_SliceStart + (0 * finer_ZColumn) + (finer_ZColumn - 1)] + 0.25 * VPotential[finer_Offset + finer_SliceStart + (0 * finer_ZColumn) + (finer_ZColumn - 2)] + 0.25 * VPotential[finer_Offset + finer_SliceStart + (1 * finer_ZColumn) + (finer_ZColumn - 1)];
  // bottom left
  VPotential[Offset + sliceStart + ((RRow - 1) * ZColumn) + 0] = 0.5 * VPotential[finer_Offset + finer_SliceStart + ((finer_RRow - 1) * finer_ZColumn) + 0] + 0.25 * VPotential[finer_Offset + finer_SliceStart + ((finer_RRow - 1) * finer_ZColumn) + 1] + 0.25 * VPotential[finer_Offset + finer_SliceStart + ((finer_RRow - 2) * finer_ZColumn) + 0];
  // bottom right
  VPotential[Offset + sliceStart + ((RRow - 1) * ZColumn) + (ZColumn - 1)] = 0.5 * VPotential[finer_Offset + finer_SliceStart + ((finer_RRow - 1) * finer_ZColumn) + (finer_ZColumn - 1)] + 0.25 * VPotential[finer_Offset + finer_SliceStart + ((finer_RRow - 1) * finer_ZColumn) + (finer_ZColumn - 2)] + 0.25 * VPotential[finer_Offset + finer_SliceStart + ((finer_RRow - 2) * finer_ZColumn) + (finer_ZColumn - 1)];
  **/
}
/* Restrict Boundary for FCycle end */

/** Print matrix **/
void PrintMatrix ( float *Mat, const int Row, const int Column )
{
  printf("Matrix (%d,%d)\n", Row, Column);
  // assumed body of the lost loop: row-major dump, one text row per matrix row
  for (int i = 0; i < Row; i++) {
    for (int j = 0; j < Column; j++) printf("%f ", Mat[i * Column + j]);
    printf("\n");
  }
}

/* Cycle functions start */
// Signature reconstructed from the call sites below; the local bookkeeping
// matches the identical per-level code inlined in the W-cycle driver.
void VCycleSemiCoarseningGPU ( float *d_VPotential, float *d_RhoChargeDensity, float *d_DeltaResidue, float *d_coef1, float *d_coef2, float *d_coef3, float *d_coef4, float *d_icoef4, float gridSizeR, float ratioZ, float ratioPhi, const int RRow, const int ZColumn, const int PhiSlice, const int gridFrom, const int gridTo, const int nPre, const int nPost )
{
  int grid_RRow, grid_ZColumn;
  int grid_PhiSlice = PhiSlice;
  int grid_StartPos, coef_StartPos;
  int iOne, jOne;
  float h, h2, ih2, tempRatioZ, tempRatioPhi;

  // V-Cycle => Finest Grid
  iOne = 1 << (gridFrom - 1);
  jOne = 1 << (gridFrom - 1);
  //grid_RRow = ((RRow - 1) / iOne) + 1;
  //grid_ZColumn = ((ZColumn - 1) / jOne) + 1;

  // change accordingly to gridFrom
  grid_StartPos = 0;
  coef_StartPos = 0;
  for (int step = 1; step < gridFrom; step++) {
    grid_RRow = ((RRow - 1) / (1 << (step - 1))) + 1;
    grid_ZColumn = ((ZColumn - 1) / (1 << (step - 1))) + 1;
    grid_StartPos += grid_RRow * grid_ZColumn * grid_PhiSlice;
    coef_StartPos += grid_RRow;
  }
  grid_RRow = ((RRow - 1) / iOne) + 1;
  grid_ZColumn = ((ZColumn - 1) / jOne) + 1;

  // pre-compute constant memory
  h = gridSizeR * iOne;
  h2 = h * h;
  ih2 = 1.0 / h2;
  tempRatioZ = ratioZ * iOne * iOne / (jOne * jOne);
  tempRatioPhi = ratioPhi * iOne * iOne;

  // copy constants to device memory
  cudaMemcpyToSymbol( d_grid_StartPos, &grid_StartPos, 1 * sizeof(int), 0, cudaMemcpyHostToDevice );
  cudaMemcpyToSymbol( d_coef_StartPos, &coef_StartPos, 1 * sizeof(int), 0, cudaMemcpyHostToDevice );
  cudaMemcpyToSymbol( d_h2, &h2, 1 * sizeof(float), 0, cudaMemcpyHostToDevice );
  cudaMemcpyToSymbol( d_ih2, &ih2, 1 * sizeof(float), 0, cudaMemcpyHostToDevice );
  cudaMemcpyToSymbol( d_tempRatioZ, &tempRatioZ, 1 * sizeof(float), 0, cudaMemcpyHostToDevice );
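  /*
    Launch geometry used throughout: one 16x16 thread block per (z,r) tile and
    one launch-grid block in z per phi slice, so each thread owns at most one
    grid node. This assumes grids of size 2^k + 1 per dimension, where the
    dim/16 tiles cover all interior nodes; surplus threads are masked by the
    interior guard inside every kernel.
  */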
  // set kernel grid size and block size
  dim3 grid_BlockPerGrid((grid_RRow < 16) ? 1 : (grid_RRow / 16), (grid_ZColumn < 16) ? 1 : (grid_ZColumn / 16), PhiSlice);
  dim3 grid_ThreadPerBlock(16, 16);

  // red-black gauss seidel relaxation (nPre times)
  for (int i = 0; i < nPre; i++) {
    relaxationGaussSeidelRed<<< grid_BlockPerGrid, grid_ThreadPerBlock >>>( d_VPotential, d_RhoChargeDensity, grid_RRow, grid_ZColumn, grid_PhiSlice, d_coef1, d_coef2, d_coef3, d_coef4 );
    relaxationGaussSeidelBlack<<< grid_BlockPerGrid, grid_ThreadPerBlock >>>( d_VPotential, d_RhoChargeDensity, grid_RRow, grid_ZColumn, grid_PhiSlice, d_coef1, d_coef2, d_coef3, d_coef4 );
  }

  // residue calculation
  residueCalculation<<< grid_BlockPerGrid, grid_ThreadPerBlock >>>( d_VPotential, d_RhoChargeDensity, d_DeltaResidue, grid_RRow, grid_ZColumn, grid_PhiSlice, d_coef1, d_coef2, d_coef3, d_icoef4 );

  // V-Cycle => from finer to coarsest grid
  for (int step = gridFrom + 1; step <= gridTo; step++) {
    iOne = 1 << (step - 1);
    jOne = 1 << (step - 1);
    grid_StartPos += grid_RRow * grid_ZColumn * PhiSlice;  // skip past the finer level
    coef_StartPos += grid_RRow;
    grid_RRow = ((RRow - 1) / iOne) + 1;
    grid_ZColumn = ((ZColumn - 1) / jOne) + 1;

    // pre-compute constant memory
    h = gridSizeR * iOne;
    h2 = h * h;
    ih2 = 1.0 / h2;
    tempRatioZ = ratioZ * iOne * iOne / (jOne * jOne);
    tempRatioPhi = ratioPhi * iOne * iOne;

    // copy constants to device memory
    cudaMemcpyToSymbol( d_grid_StartPos, &grid_StartPos, 1 * sizeof(int), 0, cudaMemcpyHostToDevice );
    cudaMemcpyToSymbol( d_coef_StartPos, &coef_StartPos, 1 * sizeof(int), 0, cudaMemcpyHostToDevice );
    cudaMemcpyToSymbol( d_h2, &h2, 1 * sizeof(float), 0, cudaMemcpyHostToDevice );
    cudaMemcpyToSymbol( d_ih2, &ih2, 1 * sizeof(float), 0, cudaMemcpyHostToDevice );
    cudaMemcpyToSymbol( d_tempRatioZ, &tempRatioZ, 1 * sizeof(float), 0, cudaMemcpyHostToDevice );

    // set kernel grid size and block size
    dim3 grid_BlockPerGrid((grid_RRow < 16) ? 1 : (grid_RRow / 16), (grid_ZColumn < 16) ? 1 : (grid_ZColumn / 16), PhiSlice);
    dim3 grid_ThreadPerBlock(16, 16);

    // restriction
    restriction2DFull<<< grid_BlockPerGrid, grid_ThreadPerBlock >>>( d_RhoChargeDensity, d_DeltaResidue, grid_RRow, grid_ZColumn, grid_PhiSlice );
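    /*
      The restricted residue becomes the right-hand side of the coarse
      problem A*e = r; its unknown e starts from zero, hence the potential on
      this level (interior and boundary) is cleared before smoothing.
    */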
    // zeroing V
    zeroingVPotential<<< grid_BlockPerGrid, grid_ThreadPerBlock >>>( d_VPotential, grid_RRow, grid_ZColumn, grid_PhiSlice );

    // zeroing boundaries (top/bottom edges span grid_ZColumn nodes, left/right edges grid_RRow nodes)
    dim3 grid_BlockPerGridTopBottom((grid_ZColumn < 16) ? 1 : ((grid_ZColumn / 16) + 1), 1, PhiSlice);
    dim3 grid_BlockPerGridLeftRight((grid_RRow < 16) ? 1 : ((grid_RRow / 16) + 1), 1, PhiSlice);
    dim3 grid_ThreadPerBlockBoundary(16);
    zeroingBoundaryTopBottom<<< grid_BlockPerGridTopBottom, grid_ThreadPerBlockBoundary >>>( d_VPotential, grid_RRow, grid_ZColumn, PhiSlice );
    zeroingBoundaryLeftRight<<< grid_BlockPerGridLeftRight, grid_ThreadPerBlockBoundary >>>( d_VPotential, grid_RRow, grid_ZColumn, PhiSlice );

    // red-black gauss seidel relaxation (nPre times)
    for (int i = 0; i < nPre; i++) {
      relaxationGaussSeidelRed<<< grid_BlockPerGrid, grid_ThreadPerBlock >>>( d_VPotential, d_RhoChargeDensity, grid_RRow, grid_ZColumn, grid_PhiSlice, d_coef1, d_coef2, d_coef3, d_coef4 );
      relaxationGaussSeidelBlack<<< grid_BlockPerGrid, grid_ThreadPerBlock >>>( d_VPotential, d_RhoChargeDensity, grid_RRow, grid_ZColumn, grid_PhiSlice, d_coef1, d_coef2, d_coef3, d_coef4 );
    }

    // residue calculation (not needed on the coarsest grid)
    if (step < gridTo) {
      residueCalculation<<< grid_BlockPerGrid, grid_ThreadPerBlock >>>( d_VPotential, d_RhoChargeDensity, d_DeltaResidue, grid_RRow, grid_ZColumn, grid_PhiSlice, d_coef1, d_coef2, d_coef3, d_icoef4 );
    }
  }

  // V-Cycle => from coarser to finer grid
  for (int step = (gridTo - 1); step >= gridFrom; step--) {
    iOne = iOne / 2;
    jOne = jOne / 2;
    grid_RRow = ((RRow - 1) / iOne) + 1;
    grid_ZColumn = ((ZColumn - 1) / jOne) + 1;
    grid_StartPos -= grid_RRow * grid_ZColumn * PhiSlice;  // step back to the finer level
    coef_StartPos -= grid_RRow;

    h = gridSizeR * iOne;
    h2 = h * h;
    ih2 = 1.0 / h2;
    tempRatioZ = ratioZ * iOne * iOne / (jOne * jOne);
    tempRatioPhi = ratioPhi * iOne * iOne;

    // copy constants to device memory
    cudaMemcpyToSymbol( d_grid_StartPos, &grid_StartPos, 1 * sizeof(int), 0, cudaMemcpyHostToDevice );
    cudaMemcpyToSymbol( d_coef_StartPos, &coef_StartPos, 1 * sizeof(int), 0, cudaMemcpyHostToDevice );
    cudaMemcpyToSymbol( d_h2, &h2, 1 * sizeof(float), 0, cudaMemcpyHostToDevice );
    cudaMemcpyToSymbol( d_ih2, &ih2, 1 * sizeof(float), 0, cudaMemcpyHostToDevice );
    cudaMemcpyToSymbol( d_tempRatioZ, &tempRatioZ, 1 * sizeof(float), 0, cudaMemcpyHostToDevice );
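    /*
      Ascent: interpolate the coarse-grid correction up one level, add it to
      the current approximation, then run nPost post-smoothing sweeps to damp
      the interpolation error.
    */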
    // set kernel grid size and block size
    dim3 grid_BlockPerGrid((grid_RRow < 16) ? 1 : (grid_RRow / 16), (grid_ZColumn < 16) ? 1 : (grid_ZColumn / 16), PhiSlice);
    dim3 grid_ThreadPerBlock(16, 16);

    // prolongation
    prolongation2DHalf<<< grid_BlockPerGrid, grid_ThreadPerBlock >>>( d_VPotential, grid_RRow, grid_ZColumn, grid_PhiSlice );

    // red-black gauss seidel relaxation (nPost times)
    for (int i = 0; i < nPost; i++) {
      relaxationGaussSeidelRed<<< grid_BlockPerGrid, grid_ThreadPerBlock >>>( d_VPotential, d_RhoChargeDensity, grid_RRow, grid_ZColumn, grid_PhiSlice, d_coef1, d_coef2, d_coef3, d_coef4 );
      relaxationGaussSeidelBlack<<< grid_BlockPerGrid, grid_ThreadPerBlock >>>( d_VPotential, d_RhoChargeDensity, grid_RRow, grid_ZColumn, grid_PhiSlice, d_coef1, d_coef2, d_coef3, d_coef4 );
    }
  }
}
/* Cycle functions end */
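/*
  Usage sketch (illustrative, not part of the original source): a host caller
  might drive the V-cycle entry point below like this. The fparam / iparam
  layout is inferred from the unpacking code in this file; every other name
  here is hypothetical.

    float fparam[7] = { gridSizeR, gridSizePhi, gridSizeZ,
                        ratioPhi, ratioZ, convErr, IFCRadius };
    int   iparam[4] = { nPre, nPost, maxLoop, nCycle };
    PoissonMultigrid3DSemiCoarseningGPUError(V, Rho, RRow, ZColumn, PhiSlice,
        symmetry, fparam, iparam, false, errConv, errExact, NULL);
    // on return, iparam[3] holds the number of cycles actually executed

  errConv must have room for nCycle floats; errExact and the exact potential
  are only read when the isExactPresent flag is true.
*/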
/* extern function */
extern "C" void PoissonMultigrid3DSemiCoarseningGPUError ( float *VPotential, float *RhoChargeDensity, const int RRow, const int ZColumn, const int PhiSlice, const int Symmetry, float *fparam, int *iparam, bool isExactPresent, float *errorConv, float *errorExact, float *VPotentialExact /* allocated by the client */ )
{
  // variables for CPU memory
  float *temp_VPotential;
  float *VPotentialPrev;
  float *EpsilonError;

  // variables for GPU memory
  float *d_VPotential;
  float *d_RhoChargeDensity;
  float *d_DeltaResidue;
  float *d_VPotentialPrev;
  float *d_EpsilonError;
  float *d_coef1, *d_coef2, *d_coef3, *d_coef4, *d_icoef4;

  // variables for coefficient calculations
  float *coef1, *coef2, *coef3, *coef4, *icoef4;
  float tempRatioZ;
  float tempRatioPhi;
  float radius;
  int gridFrom;
  int gridTo;
  int loops;

  // variables passed from AliRoot
  float gridSizeR = fparam[0];
  float gridSizePhi = fparam[1];
  float gridSizeZ = fparam[2];
  float ratioPhi = fparam[3];
  float ratioZ = fparam[4];
  float convErr = fparam[5];
  float IFCRadius = fparam[6];
  int nPre = iparam[0];
  int nPost = iparam[1];
  int maxLoop = iparam[2];
  int nCycle = iparam[3];

  // variables for calculating GPU memory allocation
  int grid_RRow;
  int grid_ZColumn;
  int grid_PhiSlice = PhiSlice;
  int grid_Size = 0;
  int grid_StartPos;
  int coef_Size = 0;
  int coef_StartPos;
  int iOne, jOne;
  float h, h2, ih2;

  // variables for calculating multigrid maximum depth
  int depth_RRow = 0;
  int depth_ZColumn = 0;
  int temp_RRow = RRow;
  int temp_ZColumn = ZColumn;

  // calculate depth for multigrid
  while (temp_RRow >>= 1) depth_RRow++;
  while (temp_ZColumn >>= 1) depth_ZColumn++;
  loops = (depth_RRow > depth_ZColumn) ? depth_ZColumn : depth_RRow;
  loops = (loops > maxLoop) ? maxLoop : loops;
  gridFrom = 1;
  gridTo = loops;

  // calculate GPU memory allocation for multigrid
  for (int step = gridFrom; step <= gridTo; step++) {
    grid_RRow = ((RRow - 1) / (1 << (step - 1))) + 1;
    grid_ZColumn = ((ZColumn - 1) / (1 << (step - 1))) + 1;
    grid_Size += grid_RRow * grid_ZColumn * grid_PhiSlice;
    coef_Size += grid_RRow;
  }

  // allocate memory for temporary output
  temp_VPotential = (float *) malloc(grid_Size * sizeof(float));
  VPotentialPrev = (float *) malloc(RRow * ZColumn * PhiSlice * sizeof(float));
  EpsilonError = (float *) malloc(1 * sizeof(float));

  // allocate memory for relaxation coefficients
  coef1 = (float *) malloc(coef_Size * sizeof(float));
  coef2 = (float *) malloc(coef_Size * sizeof(float));
  coef3 = (float *) malloc(coef_Size * sizeof(float));
  coef4 = (float *) malloc(coef_Size * sizeof(float));
  icoef4 = (float *) malloc(coef_Size * sizeof(float));

  // pre-compute relaxation coefficients
  coef_StartPos = 0;
  iOne = 1 << (gridFrom - 1);
  jOne = 1 << (gridFrom - 1);
  for (int step = gridFrom; step <= gridTo; step++) {
    grid_RRow = ((RRow - 1) / iOne) + 1;
    h = gridSizeR * iOne;
    h2 = h * h;
    ih2 = 1.0 / h2;
    tempRatioZ = ratioZ * iOne * iOne / (jOne * jOne);
    tempRatioPhi = ratioPhi * iOne * iOne;
    for (int i = 1; i < grid_RRow - 1; i++) {
      radius = IFCRadius + i * h;
      coef1[coef_StartPos + i] = 1.0 + h / (2 * radius);           // weight of the node at r + h
      coef2[coef_StartPos + i] = 1.0 - h / (2 * radius);           // weight of the node at r - h
      coef3[coef_StartPos + i] = tempRatioPhi / (radius * radius); // phi term of the cylindrical Laplacian
      coef4[coef_StartPos + i] = 0.5 / (1.0 + tempRatioZ + coef3[coef_StartPos + i]); // inverse diagonal
      icoef4[coef_StartPos + i] = 1.0 / coef4[coef_StartPos + i];
    }
    coef_StartPos += grid_RRow;
    iOne = 2 * iOne;
    jOne = 2 * jOne;
  }
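  /*
    All multigrid levels are packed into single linear device arrays: level k
    occupies grid_RRow_k * grid_ZColumn_k * PhiSlice floats starting at the
    offset accumulated into grid_StartPos (likewise coef_StartPos for the
    per-row coefficient tables). This is why the kernels address everything
    relative to d_grid_StartPos / d_coef_StartPos, and why grid_Size and
    coef_Size above sum over all levels.
  */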
  // device memory allocation
  cudaMalloc( &d_VPotential, grid_Size * sizeof(float) );
  cudaMalloc( &d_VPotentialPrev, RRow * ZColumn * PhiSlice * sizeof(float) );
  cudaMalloc( &d_EpsilonError, 1 * sizeof(float) );
  cudaMalloc( &d_DeltaResidue, grid_Size * sizeof(float) );
  cudaMalloc( &d_RhoChargeDensity, grid_Size * sizeof(float) );
  cudaMalloc( &d_coef1, coef_Size * sizeof(float) );
  cudaMalloc( &d_coef2, coef_Size * sizeof(float) );
  cudaMalloc( &d_coef3, coef_Size * sizeof(float) );
  cudaMalloc( &d_coef4, coef_Size * sizeof(float) );
  cudaMalloc( &d_icoef4, coef_Size * sizeof(float) );

  // set memory to zero
  cudaMemset( d_VPotential, 0, grid_Size * sizeof(float) );
  cudaMemset( d_DeltaResidue, 0, grid_Size * sizeof(float) );
  cudaMemset( d_RhoChargeDensity, 0, grid_Size * sizeof(float) );
  cudaMemset( d_VPotentialPrev, 0, RRow * ZColumn * PhiSlice * sizeof(float) );
  cudaMemset( d_EpsilonError, 0, 1 * sizeof(float) );

  // copy data from host to device (finest level only)
  cudaMemcpy( d_VPotential, VPotential, RRow * ZColumn * PhiSlice * sizeof(float), cudaMemcpyHostToDevice ); //check
  cudaMemcpy( d_RhoChargeDensity, RhoChargeDensity, RRow * ZColumn * PhiSlice * sizeof(float), cudaMemcpyHostToDevice ); //check
  cudaMemcpy( d_coef1, coef1, coef_Size * sizeof(float), cudaMemcpyHostToDevice );
  cudaMemcpy( d_coef2, coef2, coef_Size * sizeof(float), cudaMemcpyHostToDevice );
  cudaMemcpy( d_coef3, coef3, coef_Size * sizeof(float), cudaMemcpyHostToDevice );
  cudaMemcpy( d_coef4, coef4, coef_Size * sizeof(float), cudaMemcpyHostToDevice );
  cudaMemcpy( d_icoef4, icoef4, coef_Size * sizeof(float), cudaMemcpyHostToDevice );
  cudaMemcpy( d_VPotentialPrev, VPotential, RRow * ZColumn * PhiSlice * sizeof(float), cudaMemcpyHostToDevice );

  // max exact
  // float maxAbsExact = GetAbsMax(VPotentialExact, RRow * PhiSlice * ZColumn);
  float maxAbsExact = 1.0;
  if (isExactPresent == true) maxAbsExact = GetAbsMax(VPotentialExact, RRow * PhiSlice * ZColumn);

  dim3 error_BlockPerGrid((RRow < 16) ? 1 : (RRow / 16), (ZColumn < 16) ? 1 : (ZColumn / 16), PhiSlice);
  dim3 error_ThreadPerBlock(16, 16);

  for (int cycle = 0; cycle < nCycle; cycle++) {
    cudaMemcpy( temp_VPotential, d_VPotential, RRow * ZColumn * PhiSlice * sizeof(float), cudaMemcpyDeviceToHost );
    if (isExactPresent == true) errorExact[cycle] = GetErrorNorm2(temp_VPotential, VPotentialExact, RRow * PhiSlice, ZColumn, maxAbsExact);

    VCycleSemiCoarseningGPU(d_VPotential, d_RhoChargeDensity, d_DeltaResidue, d_coef1, d_coef2, d_coef3, d_coef4, d_icoef4, gridSizeR, ratioZ, ratioPhi, RRow, ZColumn, PhiSlice, gridFrom, gridTo, nPre, nPost);

    // convergence: mean squared change of the potential over one cycle
    errorCalculation<<< error_BlockPerGrid, error_ThreadPerBlock >>>( d_VPotentialPrev, d_VPotential, d_EpsilonError, RRow, ZColumn, PhiSlice );
    cudaMemcpy( EpsilonError, d_EpsilonError, 1 * sizeof(float), cudaMemcpyDeviceToHost );
    errorConv[cycle] = *EpsilonError / (RRow * ZColumn * PhiSlice);
    if (errorConv[cycle] < convErr) {
      nCycle = cycle;
      break;
    }
    cudaMemcpy( d_VPotentialPrev, d_VPotential, RRow * ZColumn * PhiSlice * sizeof(float), cudaMemcpyDeviceToDevice );
    cudaMemset( d_EpsilonError, 0, 1 * sizeof(float) );
  }
  iparam[3] = nCycle; // report the number of cycles actually run

  // for (int cycle = 0; cycle < nCycle; cycle++)
  // {
  //   cudaMemcpy( temp_VPotentialPrev, d_VPotential, RRow * ZColumn * PhiSlice * sizeof(float), cudaMemcpyDeviceToHost );
  //   VCycleSemiCoarseningGPU(d_VPotential, d_RhoChargeDensity, d_DeltaResidue, d_coef1, d_coef2, d_coef3, d_coef4, d_icoef4, gridSizeR, ratioZ, ratioPhi, RRow, ZColumn, PhiSlice, gridFrom, gridTo, nPre, nPost);
  //   cudaMemcpy( temp_VPotential, d_VPotential, RRow * ZColumn * PhiSlice * sizeof(float), cudaMemcpyDeviceToHost );
  //   errorConv[cycle] = GetErrorNorm2(temp_VPotential, temp_VPotentialPrev, RRow * PhiSlice, ZColumn, 1.0);
  //   //errorExact[cycle] = GetErrorNorm2(temp_VPotential, VPotentialExact, RRow * PhiSlice, ZColumn, 1.0);
  // }

  // copy result from device to host
  cudaMemcpy( temp_VPotential, d_VPotential, RRow * ZColumn * PhiSlice * sizeof(float), cudaMemcpyDeviceToHost );
  memcpy(VPotential, temp_VPotential, RRow * ZColumn * PhiSlice * sizeof(float));

  // free device memory
  cudaFree( d_VPotential );
  cudaFree( d_DeltaResidue );
  cudaFree( d_RhoChargeDensity );
  cudaFree( d_VPotentialPrev );
  cudaFree( d_EpsilonError );
  cudaFree( d_coef1 );
  cudaFree( d_coef2 );
  cudaFree( d_coef3 );
  cudaFree( d_coef4 );
  cudaFree( d_icoef4 );

  // free host memory
  free( coef1 );
  free( coef2 );
  free( coef3 );
  free( coef4 );
  free( icoef4 );
  free( temp_VPotential );
  free( VPotentialPrev );
  free( EpsilonError );
}
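/*
  The W-cycle variant below inlines the whole cycle instead of calling
  VCycleSemiCoarseningGPU: after the main descent it revisits the coarsest
  levels (one extra up/down leg, a three-level up/down leg, then another
  up/down leg) before the final ascent. The inner legs reach level
  gridTo - 3, so this routine presumes at least four multigrid levels.
*/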
/* extern function */
extern "C" void PoissonMultigrid3DSemiCoarseningGPUErrorWCycle ( float *VPotential, float *RhoChargeDensity, const int RRow, const int ZColumn, const int PhiSlice, const int Symmetry, float *fparam, int *iparam, float *errorConv, float *errorExact, float *VPotentialExact /* allocated by the client */ )
{
  // variables for CPU memory
  float *temp_VPotential;
  float *VPotentialPrev;
  float *EpsilonError;

  // variables for GPU memory
  float *d_VPotential;
  float *d_RhoChargeDensity;
  float *d_DeltaResidue;
  float *d_coef1, *d_coef2, *d_coef3, *d_coef4, *d_icoef4;
  float *d_VPotentialPrev;
  float *d_EpsilonError;

  // variables for coefficient calculations
  float *coef1, *coef2, *coef3, *coef4, *icoef4;
  float tempRatioZ;
  float tempRatioPhi;
  float radius;
  int gridFrom;
  int gridTo;
  int loops;

  // variables passed from AliRoot
  float gridSizeR = fparam[0];
  //float gridSizePhi = fparam[1];
  //float gridSizeZ = fparam[2];
  float ratioPhi = fparam[3];
  float ratioZ = fparam[4];
  float convErr = fparam[5];
  float IFCRadius = fparam[6];
  int nPre = iparam[0];
  int nPost = iparam[1];
  int maxLoop = iparam[2];
  int nCycle = iparam[3];

  // variables for calculating GPU memory allocation
  int grid_RRow;
  int grid_ZColumn;
  int grid_PhiSlice = PhiSlice;
  int grid_Size = 0;
  int grid_StartPos;
  int coef_Size = 0;
  int coef_StartPos;
  int iOne, jOne;
  float h, h2, ih2;

  // variables for calculating multigrid maximum depth
  int depth_RRow = 0;
  int depth_ZColumn = 0;
  int temp_RRow = RRow;
  int temp_ZColumn = ZColumn;

  // calculate depth for multigrid
  while (temp_RRow >>= 1) depth_RRow++;
  while (temp_ZColumn >>= 1) depth_ZColumn++;
  loops = (depth_RRow > depth_ZColumn) ? depth_ZColumn : depth_RRow;
  loops = (loops > maxLoop) ? maxLoop : loops;
  gridFrom = 1;
  gridTo = loops;

  // calculate GPU memory allocation for multigrid
  for (int step = gridFrom; step <= gridTo; step++) {
    grid_RRow = ((RRow - 1) / (1 << (step - 1))) + 1;
    grid_ZColumn = ((ZColumn - 1) / (1 << (step - 1))) + 1;
    grid_Size += grid_RRow * grid_ZColumn * grid_PhiSlice;
    coef_Size += grid_RRow;
  }

  // allocate memory for temporary output
  temp_VPotential = (float *) malloc(grid_Size * sizeof(float));
  VPotentialPrev = (float *) malloc(RRow * ZColumn * PhiSlice * sizeof(float));
  EpsilonError = (float *) malloc(1 * sizeof(float));

  // allocate memory for relaxation coefficients
  coef1 = (float *) malloc(coef_Size * sizeof(float));
  coef2 = (float *) malloc(coef_Size * sizeof(float));
  coef3 = (float *) malloc(coef_Size * sizeof(float));
  coef4 = (float *) malloc(coef_Size * sizeof(float));
  icoef4 = (float *) malloc(coef_Size * sizeof(float));

  // pre-compute relaxation coefficients
  coef_StartPos = 0;
  iOne = 1 << (gridFrom - 1);
  jOne = 1 << (gridFrom - 1);
  for (int step = gridFrom; step <= gridTo; step++) {
    grid_RRow = ((RRow - 1) / iOne) + 1;
    h = gridSizeR * iOne;
    h2 = h * h;
    ih2 = 1.0 / h2;
    tempRatioZ = ratioZ * iOne * iOne / (jOne * jOne);
    tempRatioPhi = ratioPhi * iOne * iOne;
    for (int i = 1; i < grid_RRow - 1; i++) {
      radius = IFCRadius + i * h;
      coef1[coef_StartPos + i] = 1.0 + h / (2 * radius);
      coef2[coef_StartPos + i] = 1.0 - h / (2 * radius);
      coef3[coef_StartPos + i] = tempRatioPhi / (radius * radius);
      coef4[coef_StartPos + i] = 0.5 / (1.0 + tempRatioZ + coef3[coef_StartPos + i]);
      icoef4[coef_StartPos + i] = 1.0 / coef4[coef_StartPos + i];
    }
    coef_StartPos += grid_RRow;
    iOne = 2 * iOne;
    jOne = 2 * jOne;
  }

  // device memory allocation
  cudaMalloc( &d_VPotential, grid_Size * sizeof(float) );
  cudaMalloc( &d_DeltaResidue, grid_Size * sizeof(float) );
  cudaMalloc( &d_VPotentialPrev, RRow * ZColumn * PhiSlice * sizeof(float) );
  cudaMalloc( &d_EpsilonError, 1 * sizeof(float) );
  cudaMalloc( &d_RhoChargeDensity, grid_Size * sizeof(float) );
  cudaMalloc( &d_coef1, coef_Size * sizeof(float) );
  cudaMalloc( &d_coef2, coef_Size * sizeof(float) );
  cudaMalloc( &d_coef3, coef_Size * sizeof(float) );
  cudaMalloc( &d_coef4, coef_Size * sizeof(float) );
  cudaMalloc( &d_icoef4, coef_Size * sizeof(float) );

  // set memory to zero
  cudaMemset( d_VPotential, 0, grid_Size * sizeof(float) );
  cudaMemset( d_DeltaResidue, 0, grid_Size * sizeof(float) );
  cudaMemset( d_RhoChargeDensity, 0, grid_Size * sizeof(float) );
  cudaMemset( d_VPotentialPrev, 0, RRow * ZColumn * PhiSlice * sizeof(float) );
  cudaMemset( d_EpsilonError, 0, 1 * sizeof(float) );
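  /*
    Only the finest level of the packed arrays is seeded from the host; the
    coarser-level segments are filled on the device by restriction during the
    cycles, so the copies below cover RRow * ZColumn * PhiSlice floats each.
  */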
  // copy data from host to device
  cudaMemcpy( d_VPotential, VPotential, RRow * ZColumn * PhiSlice * sizeof(float), cudaMemcpyHostToDevice ); //check
  cudaMemcpy( d_RhoChargeDensity, RhoChargeDensity, RRow * ZColumn * PhiSlice * sizeof(float), cudaMemcpyHostToDevice ); //check
  cudaMemcpy( d_coef1, coef1, coef_Size * sizeof(float), cudaMemcpyHostToDevice );
  cudaMemcpy( d_coef2, coef2, coef_Size * sizeof(float), cudaMemcpyHostToDevice );
  cudaMemcpy( d_coef3, coef3, coef_Size * sizeof(float), cudaMemcpyHostToDevice );
  cudaMemcpy( d_coef4, coef4, coef_Size * sizeof(float), cudaMemcpyHostToDevice );
  cudaMemcpy( d_icoef4, icoef4, coef_Size * sizeof(float), cudaMemcpyHostToDevice );
  cudaMemcpy( d_VPotentialPrev, VPotential, RRow * ZColumn * PhiSlice * sizeof(float), cudaMemcpyHostToDevice );

  // max exact
  float maxAbsExact = GetAbsMax(VPotentialExact, RRow * PhiSlice * ZColumn);

  dim3 error_BlockPerGrid((RRow < 16) ? 1 : (RRow / 16), (ZColumn < 16) ? 1 : (ZColumn / 16), PhiSlice);
  dim3 error_ThreadPerBlock(16, 16);

  for (int cycle = 0; cycle < nCycle; cycle++) {
    /* W-Cycle starts */
    // error conv
    // cudaMemcpy( temp_VPotentialPrev, d_VPotential, RRow * ZColumn * PhiSlice * sizeof(float), cudaMemcpyDeviceToHost );
    cudaMemcpy( temp_VPotential, d_VPotential, RRow * ZColumn * PhiSlice * sizeof(float), cudaMemcpyDeviceToHost );
    errorExact[cycle] = GetErrorNorm2(temp_VPotential, VPotentialExact, RRow * PhiSlice, ZColumn, maxAbsExact);

    // W-Cycle => finest grid
    iOne = 1 << (gridFrom - 1);
    jOne = 1 << (gridFrom - 1);
    grid_RRow = ((RRow - 1) / iOne) + 1;
    grid_ZColumn = ((ZColumn - 1) / jOne) + 1;
    grid_StartPos = 0;
    coef_StartPos = 0;

    // pre-compute constant memory
    h = gridSizeR * iOne;
    h2 = h * h;
    ih2 = 1.0 / h2;
    tempRatioZ = ratioZ * iOne * iOne / (jOne * jOne);
    tempRatioPhi = ratioPhi * iOne * iOne;

    // copy constants to device memory
    cudaMemcpyToSymbol( d_grid_StartPos, &grid_StartPos, 1 * sizeof(int), 0, cudaMemcpyHostToDevice );
    cudaMemcpyToSymbol( d_coef_StartPos, &coef_StartPos, 1 * sizeof(int), 0, cudaMemcpyHostToDevice );
    cudaMemcpyToSymbol( d_h2, &h2, 1 * sizeof(float), 0, cudaMemcpyHostToDevice );
    cudaMemcpyToSymbol( d_ih2, &ih2, 1 * sizeof(float), 0, cudaMemcpyHostToDevice );
    cudaMemcpyToSymbol( d_tempRatioZ, &tempRatioZ, 1 * sizeof(float), 0, cudaMemcpyHostToDevice );
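    /*
      d_grid_StartPos, d_coef_StartPos, d_h2, d_ih2 and d_tempRatioZ live in
      __constant__ memory, so they must be re-uploaded before the kernels of
      every level; the same five cudaMemcpyToSymbol calls therefore recur at
      each level change below.
    */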
    // set kernel grid size and block size
    dim3 grid_BlockPerGrid((grid_RRow < 16) ? 1 : (grid_RRow / 16), (grid_ZColumn < 16) ? 1 : (grid_ZColumn / 16), PhiSlice);
    dim3 grid_ThreadPerBlock(16, 16);

    // red-black gauss seidel relaxation (nPre times)
    for (int i = 0; i < nPre; i++) {
      relaxationGaussSeidelRed<<< grid_BlockPerGrid, grid_ThreadPerBlock >>>( d_VPotential, d_RhoChargeDensity, grid_RRow, grid_ZColumn, grid_PhiSlice, d_coef1, d_coef2, d_coef3, d_coef4 ); //cudaDeviceSynchronize();
      relaxationGaussSeidelBlack<<< grid_BlockPerGrid, grid_ThreadPerBlock >>>( d_VPotential, d_RhoChargeDensity, grid_RRow, grid_ZColumn, grid_PhiSlice, d_coef1, d_coef2, d_coef3, d_coef4 ); //cudaDeviceSynchronize();
    }

    // residue calculation
    residueCalculation<<< grid_BlockPerGrid, grid_ThreadPerBlock >>>( d_VPotential, d_RhoChargeDensity, d_DeltaResidue, grid_RRow, grid_ZColumn, grid_PhiSlice, d_coef1, d_coef2, d_coef3, d_icoef4 ); //cudaDeviceSynchronize();

    // W-Cycle => from finer to coarsest grid
    for (int step = gridFrom + 1; step <= gridTo; step++) {
      iOne = 1 << (step - 1);
      jOne = 1 << (step - 1);
      grid_StartPos += grid_RRow * grid_ZColumn * PhiSlice;
      coef_StartPos += grid_RRow;
      grid_RRow = ((RRow - 1) / iOne) + 1;
      grid_ZColumn = ((ZColumn - 1) / jOne) + 1;

      // pre-compute constant memory
      h = gridSizeR * iOne;
      h2 = h * h;
      ih2 = 1.0 / h2;
      tempRatioZ = ratioZ * iOne * iOne / (jOne * jOne);
      tempRatioPhi = ratioPhi * iOne * iOne;

      // copy constants to device memory
      cudaMemcpyToSymbol( d_grid_StartPos, &grid_StartPos, 1 * sizeof(int), 0, cudaMemcpyHostToDevice );
      cudaMemcpyToSymbol( d_coef_StartPos, &coef_StartPos, 1 * sizeof(int), 0, cudaMemcpyHostToDevice );
      cudaMemcpyToSymbol( d_h2, &h2, 1 * sizeof(float), 0, cudaMemcpyHostToDevice );
      cudaMemcpyToSymbol( d_ih2, &ih2, 1 * sizeof(float), 0, cudaMemcpyHostToDevice );
      cudaMemcpyToSymbol( d_tempRatioZ, &tempRatioZ, 1 * sizeof(float), 0, cudaMemcpyHostToDevice );
      // set kernel grid size and block size
      dim3 grid_BlockPerGrid((grid_RRow < 16) ? 1 : (grid_RRow / 16), (grid_ZColumn < 16) ? 1 : (grid_ZColumn / 16), PhiSlice);
      dim3 grid_ThreadPerBlock(16, 16);

      // restriction
      restriction2DFull<<< grid_BlockPerGrid, grid_ThreadPerBlock >>>( d_RhoChargeDensity, d_DeltaResidue, grid_RRow, grid_ZColumn, grid_PhiSlice ); //cudaDeviceSynchronize();

      // zeroing V
      zeroingVPotential<<< grid_BlockPerGrid, grid_ThreadPerBlock >>>( d_VPotential, grid_RRow, grid_ZColumn, grid_PhiSlice ); //cudaDeviceSynchronize();

      // red-black gauss seidel relaxation (nPre times)
      for (int i = 0; i < nPre; i++) {
        relaxationGaussSeidelRed<<< grid_BlockPerGrid, grid_ThreadPerBlock >>>( d_VPotential, d_RhoChargeDensity, grid_RRow, grid_ZColumn, grid_PhiSlice, d_coef1, d_coef2, d_coef3, d_coef4 ); //cudaDeviceSynchronize();
        relaxationGaussSeidelBlack<<< grid_BlockPerGrid, grid_ThreadPerBlock >>>( d_VPotential, d_RhoChargeDensity, grid_RRow, grid_ZColumn, grid_PhiSlice, d_coef1, d_coef2, d_coef3, d_coef4 ); //cudaDeviceSynchronize();
      }

      // residue calculation
      if (step < gridTo) {
        residueCalculation<<< grid_BlockPerGrid, grid_ThreadPerBlock >>>( d_VPotential, d_RhoChargeDensity, d_DeltaResidue, grid_RRow, grid_ZColumn, grid_PhiSlice, d_coef1, d_coef2, d_coef3, d_icoef4 ); //cudaDeviceSynchronize();
      }
    }

    /////////// inner W-cycle
    /// up one down one
    // up one
    {
      int step = (gridTo - 1); // kept for symmetry with the loops; not otherwise used
      iOne = iOne / 2;
      jOne = jOne / 2;
      grid_RRow = ((RRow - 1) / iOne) + 1;
      grid_ZColumn = ((ZColumn - 1) / jOne) + 1;
      grid_StartPos -= grid_RRow * grid_ZColumn * PhiSlice;
      coef_StartPos -= grid_RRow;
      h = gridSizeR * iOne;
      h2 = h * h;
      ih2 = 1.0 / h2;
      // tempRatioZ needs no recomputation here: iOne == jOne, so it stays equal to ratioZ
      tempRatioPhi = ratioPhi * iOne * iOne;

      // copy constants to device memory
      cudaMemcpyToSymbol( d_grid_StartPos, &grid_StartPos, 1 * sizeof(int), 0, cudaMemcpyHostToDevice );
      cudaMemcpyToSymbol( d_coef_StartPos, &coef_StartPos, 1 * sizeof(int), 0, cudaMemcpyHostToDevice );
      cudaMemcpyToSymbol( d_h2, &h2, 1 * sizeof(float), 0, cudaMemcpyHostToDevice );
      cudaMemcpyToSymbol( d_ih2, &ih2, 1 * sizeof(float), 0, cudaMemcpyHostToDevice );
      cudaMemcpyToSymbol( d_tempRatioZ, &tempRatioZ, 1 * sizeof(float), 0, cudaMemcpyHostToDevice );
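      /*
        Inner W-cycle legs: each "up" block below is one prolongation plus
        nPost smoothing sweeps, each "down" block one residue + restriction
        plus nPre sweeps, replayed over the coarsest levels.
      */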
      // set kernel grid size and block size
      dim3 grid_BlockPerGrid((grid_RRow < 16) ? 1 : (grid_RRow / 16), (grid_ZColumn < 16) ? 1 : (grid_ZColumn / 16), PhiSlice);
      dim3 grid_ThreadPerBlock(16, 16);

      // prolongation
      prolongation2DHalf<<< grid_BlockPerGrid, grid_ThreadPerBlock >>>( d_VPotential, grid_RRow, grid_ZColumn, grid_PhiSlice ); // cudaDeviceSynchronize();

      // red-black gauss seidel relaxation (nPost times)
      for (int i = 0; i < nPost; i++) {
        relaxationGaussSeidelRed<<< grid_BlockPerGrid, grid_ThreadPerBlock >>>( d_VPotential, d_RhoChargeDensity, grid_RRow, grid_ZColumn, grid_PhiSlice, d_coef1, d_coef2, d_coef3, d_coef4 ); // cudaDeviceSynchronize();
        relaxationGaussSeidelBlack<<< grid_BlockPerGrid, grid_ThreadPerBlock >>>( d_VPotential, d_RhoChargeDensity, grid_RRow, grid_ZColumn, grid_PhiSlice, d_coef1, d_coef2, d_coef3, d_coef4 ); // cudaDeviceSynchronize();
      }
    }

    // down one
    {
      // reuses the launch geometry of the enclosing scope; the kernel's interior guard makes the over-launch safe
      residueCalculation<<< grid_BlockPerGrid, grid_ThreadPerBlock >>>( d_VPotential, d_RhoChargeDensity, d_DeltaResidue, grid_RRow, grid_ZColumn, grid_PhiSlice, d_coef1, d_coef2, d_coef3, d_icoef4 );

      iOne = iOne * 2;
      jOne = jOne * 2;
      grid_StartPos += grid_RRow * grid_ZColumn * PhiSlice;
      coef_StartPos += grid_RRow;
      grid_RRow = ((RRow - 1) / iOne) + 1;
      grid_ZColumn = ((ZColumn - 1) / jOne) + 1;

      // pre-compute constant memory
      h = gridSizeR * iOne;
      h2 = h * h;
      ih2 = 1.0 / h2;
      tempRatioZ = ratioZ * iOne * iOne / (jOne * jOne);
      tempRatioPhi = ratioPhi * iOne * iOne;

      // copy constants to device memory
      cudaMemcpyToSymbol( d_grid_StartPos, &grid_StartPos, 1 * sizeof(int), 0, cudaMemcpyHostToDevice );
      cudaMemcpyToSymbol( d_coef_StartPos, &coef_StartPos, 1 * sizeof(int), 0, cudaMemcpyHostToDevice );
      cudaMemcpyToSymbol( d_h2, &h2, 1 * sizeof(float), 0, cudaMemcpyHostToDevice );
      cudaMemcpyToSymbol( d_ih2, &ih2, 1 * sizeof(float), 0, cudaMemcpyHostToDevice );
      cudaMemcpyToSymbol( d_tempRatioZ, &tempRatioZ, 1 * sizeof(float), 0, cudaMemcpyHostToDevice );
      // set kernel grid size and block size
      dim3 grid_BlockPerGrid((grid_RRow < 16) ? 1 : (grid_RRow / 16), (grid_ZColumn < 16) ? 1 : (grid_ZColumn / 16), PhiSlice);
      dim3 grid_ThreadPerBlock(16, 16);

      // restriction
      restriction2DFull<<< grid_BlockPerGrid, grid_ThreadPerBlock >>>( d_RhoChargeDensity, d_DeltaResidue, grid_RRow, grid_ZColumn, grid_PhiSlice ); //cudaDeviceSynchronize();

      // zeroing V
      zeroingVPotential<<< grid_BlockPerGrid, grid_ThreadPerBlock >>>( d_VPotential, grid_RRow, grid_ZColumn, grid_PhiSlice ); //cudaDeviceSynchronize();

      // red-black gauss seidel relaxation (nPre times)
      for (int i = 0; i < nPre; i++) {
        relaxationGaussSeidelRed<<< grid_BlockPerGrid, grid_ThreadPerBlock >>>( d_VPotential, d_RhoChargeDensity, grid_RRow, grid_ZColumn, grid_PhiSlice, d_coef1, d_coef2, d_coef3, d_coef4 ); //cudaDeviceSynchronize();
        relaxationGaussSeidelBlack<<< grid_BlockPerGrid, grid_ThreadPerBlock >>>( d_VPotential, d_RhoChargeDensity, grid_RRow, grid_ZColumn, grid_PhiSlice, d_coef1, d_coef2, d_coef3, d_coef4 ); //cudaDeviceSynchronize();
      }
    }
    /// end up one down one

    /// up two down two
    // up two: from gridTo - 1 to gridTo - 3
    for (int step = (gridTo - 1); step >= gridTo - 3; step--) {
      iOne = iOne / 2;
      jOne = jOne / 2;
      grid_RRow = ((RRow - 1) / iOne) + 1;
      grid_ZColumn = ((ZColumn - 1) / jOne) + 1;
      grid_StartPos -= grid_RRow * grid_ZColumn * PhiSlice;
      coef_StartPos -= grid_RRow;

      // pre-compute constant memory
      h = gridSizeR * iOne;
      h2 = h * h;
      ih2 = 1.0 / h2;
      tempRatioZ = ratioZ * iOne * iOne / (jOne * jOne);
      tempRatioPhi = ratioPhi * iOne * iOne;

      // copy constants to device memory
      cudaMemcpyToSymbol( d_grid_StartPos, &grid_StartPos, 1 * sizeof(int), 0, cudaMemcpyHostToDevice );
      cudaMemcpyToSymbol( d_coef_StartPos, &coef_StartPos, 1 * sizeof(int), 0, cudaMemcpyHostToDevice );
      cudaMemcpyToSymbol( d_h2, &h2, 1 * sizeof(float), 0, cudaMemcpyHostToDevice );
      cudaMemcpyToSymbol( d_ih2, &ih2, 1 * sizeof(float), 0, cudaMemcpyHostToDevice );
      cudaMemcpyToSymbol( d_tempRatioZ, &tempRatioZ, 1 * sizeof(float), 0, cudaMemcpyHostToDevice );
      // set kernel grid size and block size
      dim3 grid_BlockPerGrid((grid_RRow < 16) ? 1 : (grid_RRow / 16), (grid_ZColumn < 16) ? 1 : (grid_ZColumn / 16), PhiSlice);
      dim3 grid_ThreadPerBlock(16, 16);

      // prolongation
      prolongation2DHalf<<< grid_BlockPerGrid, grid_ThreadPerBlock >>>( d_VPotential, grid_RRow, grid_ZColumn, grid_PhiSlice ); // cudaDeviceSynchronize();

      // red-black gauss seidel relaxation (nPost times)
      for (int i = 0; i < nPost; i++) {
        relaxationGaussSeidelRed<<< grid_BlockPerGrid, grid_ThreadPerBlock >>>( d_VPotential, d_RhoChargeDensity, grid_RRow, grid_ZColumn, grid_PhiSlice, d_coef1, d_coef2, d_coef3, d_coef4 ); // cudaDeviceSynchronize();
        relaxationGaussSeidelBlack<<< grid_BlockPerGrid, grid_ThreadPerBlock >>>( d_VPotential, d_RhoChargeDensity, grid_RRow, grid_ZColumn, grid_PhiSlice, d_coef1, d_coef2, d_coef3, d_coef4 ); // cudaDeviceSynchronize();
      }
    }

    // down two: from gridTo - 3 back to gridTo - 1
    for (int step = gridTo - 3; step <= gridTo - 1; step++) {
      iOne = iOne * 2;
      jOne = jOne * 2;
      grid_StartPos += grid_RRow * grid_ZColumn * PhiSlice;
      coef_StartPos += grid_RRow;
      grid_RRow = ((RRow - 1) / iOne) + 1;
      grid_ZColumn = ((ZColumn - 1) / jOne) + 1;

      // pre-compute constant memory
      h = gridSizeR * iOne;
      h2 = h * h;
      ih2 = 1.0 / h2;
      tempRatioZ = ratioZ * iOne * iOne / (jOne * jOne);
      tempRatioPhi = ratioPhi * iOne * iOne;

      // copy constants to device memory
      cudaMemcpyToSymbol( d_grid_StartPos, &grid_StartPos, 1 * sizeof(int), 0, cudaMemcpyHostToDevice );
      cudaMemcpyToSymbol( d_coef_StartPos, &coef_StartPos, 1 * sizeof(int), 0, cudaMemcpyHostToDevice );
      cudaMemcpyToSymbol( d_h2, &h2, 1 * sizeof(float), 0, cudaMemcpyHostToDevice );
      cudaMemcpyToSymbol( d_ih2, &ih2, 1 * sizeof(float), 0, cudaMemcpyHostToDevice );
      cudaMemcpyToSymbol( d_tempRatioZ, &tempRatioZ, 1 * sizeof(float), 0, cudaMemcpyHostToDevice );
      // set kernel grid size and block size
      dim3 grid_BlockPerGrid((grid_RRow < 16) ? 1 : (grid_RRow / 16), (grid_ZColumn < 16) ? 1 : (grid_ZColumn / 16), PhiSlice);
      dim3 grid_ThreadPerBlock(16, 16);

      // restriction
      restriction2DFull<<< grid_BlockPerGrid, grid_ThreadPerBlock >>>( d_RhoChargeDensity, d_DeltaResidue, grid_RRow, grid_ZColumn, grid_PhiSlice ); //cudaDeviceSynchronize();

      // zeroing V
      zeroingVPotential<<< grid_BlockPerGrid, grid_ThreadPerBlock >>>( d_VPotential, grid_RRow, grid_ZColumn, grid_PhiSlice ); //cudaDeviceSynchronize();

      // red-black gauss seidel relaxation (nPre times)
      for (int i = 0; i < nPre; i++) {
        relaxationGaussSeidelRed<<< grid_BlockPerGrid, grid_ThreadPerBlock >>>( d_VPotential, d_RhoChargeDensity, grid_RRow, grid_ZColumn, grid_PhiSlice, d_coef1, d_coef2, d_coef3, d_coef4 ); //cudaDeviceSynchronize();
        relaxationGaussSeidelBlack<<< grid_BlockPerGrid, grid_ThreadPerBlock >>>( d_VPotential, d_RhoChargeDensity, grid_RRow, grid_ZColumn, grid_PhiSlice, d_coef1, d_coef2, d_coef3, d_coef4 ); //cudaDeviceSynchronize();
      }

      // residue calculation
      if (step < gridTo) {
        residueCalculation<<< grid_BlockPerGrid, grid_ThreadPerBlock >>>( d_VPotential, d_RhoChargeDensity, d_DeltaResidue, grid_RRow, grid_ZColumn, grid_PhiSlice, d_coef1, d_coef2, d_coef3, d_icoef4 ); //cudaDeviceSynchronize();
      }
    }

    /// up one down one
    // up one
    {
      int step = (gridTo - 1); // kept for symmetry with the loops; not otherwise used
      iOne = iOne / 2;
      jOne = jOne / 2;
      grid_RRow = ((RRow - 1) / iOne) + 1;
      grid_ZColumn = ((ZColumn - 1) / jOne) + 1;
      grid_StartPos -= grid_RRow * grid_ZColumn * PhiSlice;
      coef_StartPos -= grid_RRow;

      // pre-compute constant memory
      h = gridSizeR * iOne;
      h2 = h * h;
      ih2 = 1.0 / h2;
      tempRatioZ = ratioZ * iOne * iOne / (jOne * jOne);
      tempRatioPhi = ratioPhi * iOne * iOne;

      // copy constants to device memory
      cudaMemcpyToSymbol( d_grid_StartPos, &grid_StartPos, 1 * sizeof(int), 0, cudaMemcpyHostToDevice );
      cudaMemcpyToSymbol( d_coef_StartPos, &coef_StartPos, 1 * sizeof(int), 0, cudaMemcpyHostToDevice );
      cudaMemcpyToSymbol( d_h2, &h2, 1 * sizeof(float), 0, cudaMemcpyHostToDevice );
      cudaMemcpyToSymbol( d_ih2, &ih2, 1 * sizeof(float), 0, cudaMemcpyHostToDevice );
      cudaMemcpyToSymbol( d_tempRatioZ, &tempRatioZ, 1 * sizeof(float), 0, cudaMemcpyHostToDevice );
      // set kernel grid size and block size
      dim3 grid_BlockPerGrid((grid_RRow < 16) ? 1 : (grid_RRow / 16), (grid_ZColumn < 16) ? 1 : (grid_ZColumn / 16), PhiSlice);
      dim3 grid_ThreadPerBlock(16, 16);

      // prolongation
      prolongation2DHalf<<< grid_BlockPerGrid, grid_ThreadPerBlock >>>( d_VPotential, grid_RRow, grid_ZColumn, grid_PhiSlice ); // cudaDeviceSynchronize();

      // red-black gauss seidel relaxation (nPost times)
      for (int i = 0; i < nPost; i++) {
        relaxationGaussSeidelRed<<< grid_BlockPerGrid, grid_ThreadPerBlock >>>( d_VPotential, d_RhoChargeDensity, grid_RRow, grid_ZColumn, grid_PhiSlice, d_coef1, d_coef2, d_coef3, d_coef4 ); // cudaDeviceSynchronize();
        relaxationGaussSeidelBlack<<< grid_BlockPerGrid, grid_ThreadPerBlock >>>( d_VPotential, d_RhoChargeDensity, grid_RRow, grid_ZColumn, grid_PhiSlice, d_coef1, d_coef2, d_coef3, d_coef4 ); // cudaDeviceSynchronize();
      }
    }

    // down one
    {
      residueCalculation<<< grid_BlockPerGrid, grid_ThreadPerBlock >>>( d_VPotential, d_RhoChargeDensity, d_DeltaResidue, grid_RRow, grid_ZColumn, grid_PhiSlice, d_coef1, d_coef2, d_coef3, d_icoef4 );

      iOne = iOne * 2;
      jOne = jOne * 2;
      grid_StartPos += grid_RRow * grid_ZColumn * PhiSlice;
      coef_StartPos += grid_RRow;
      grid_RRow = ((RRow - 1) / iOne) + 1;
      grid_ZColumn = ((ZColumn - 1) / jOne) + 1;

      // pre-compute constant memory
      h = gridSizeR * iOne;
      h2 = h * h;
      ih2 = 1.0 / h2;
      tempRatioZ = ratioZ * iOne * iOne / (jOne * jOne);
      tempRatioPhi = ratioPhi * iOne * iOne;

      // copy constants to device memory
      cudaMemcpyToSymbol( d_grid_StartPos, &grid_StartPos, 1 * sizeof(int), 0, cudaMemcpyHostToDevice );
      cudaMemcpyToSymbol( d_coef_StartPos, &coef_StartPos, 1 * sizeof(int), 0, cudaMemcpyHostToDevice );
      cudaMemcpyToSymbol( d_h2, &h2, 1 * sizeof(float), 0, cudaMemcpyHostToDevice );
      cudaMemcpyToSymbol( d_ih2, &ih2, 1 * sizeof(float), 0, cudaMemcpyHostToDevice );
      cudaMemcpyToSymbol( d_tempRatioZ, &tempRatioZ, 1 * sizeof(float), 0, cudaMemcpyHostToDevice );
	// set kernel grid size and block size
	dim3 grid_BlockPerGrid((grid_RRow < 16) ? 1 : (grid_RRow / 16), (grid_ZColumn < 16) ? 1 : (grid_ZColumn / 16), PhiSlice);
	dim3 grid_ThreadPerBlock(16, 16);

	// restriction
	restriction2DFull<<< grid_BlockPerGrid, grid_ThreadPerBlock >>>( d_RhoChargeDensity, d_DeltaResidue, grid_RRow, grid_ZColumn, grid_PhiSlice );
	//cudaDeviceSynchronize();

	// zeroing V
	zeroingVPotential<<< grid_BlockPerGrid, grid_ThreadPerBlock >>>( d_VPotential, grid_RRow, grid_ZColumn, grid_PhiSlice );
	//cudaDeviceSynchronize();

	// red-black Gauss-Seidel relaxation (nPre times)
	for (int i = 0; i < nPre; i++) {
		relaxationGaussSeidelRed<<< grid_BlockPerGrid, grid_ThreadPerBlock >>>( d_VPotential, d_RhoChargeDensity, grid_RRow, grid_ZColumn, grid_PhiSlice, d_coef1, d_coef2, d_coef3, d_coef4 );
		//cudaDeviceSynchronize();
		relaxationGaussSeidelBlack<<< grid_BlockPerGrid, grid_ThreadPerBlock >>>( d_VPotential, d_RhoChargeDensity, grid_RRow, grid_ZColumn, grid_PhiSlice, d_coef1, d_coef2, d_coef3, d_coef4 );
		//cudaDeviceSynchronize();
	}
}
/// end up one, down one
/////////// end inner W-cycle

// V-cycle => from coarser to finer grid
for (int step = (gridTo - 1); step >= gridFrom; step--) {
	iOne = iOne / 2;
	jOne = jOne / 2;

	grid_RRow    = ((RRow - 1) / iOne) + 1;
	grid_ZColumn = ((ZColumn - 1) / jOne) + 1;

	grid_StartPos -= grid_RRow * grid_ZColumn * PhiSlice;
	coef_StartPos -= grid_RRow;

	h   = gridSizeR * iOne;
	h2  = h * h;
	ih2 = 1.0 / h2;

	tempRatioZ   = ratioZ * iOne * iOne / (jOne * jOne);
	tempRatioPhi = ratioPhi * iOne * iOne;

	// copy constants to device memory
	cudaMemcpyToSymbol( d_grid_StartPos, &grid_StartPos, 1 * sizeof(int), 0, cudaMemcpyHostToDevice );
	cudaMemcpyToSymbol( d_coef_StartPos, &coef_StartPos, 1 * sizeof(int), 0, cudaMemcpyHostToDevice );
	cudaMemcpyToSymbol( d_h2, &h2, 1 * sizeof(float), 0, cudaMemcpyHostToDevice );
	cudaMemcpyToSymbol( d_ih2, &ih2, 1 * sizeof(float), 0, cudaMemcpyHostToDevice );
	cudaMemcpyToSymbol( d_tempRatioZ, &tempRatioZ, 1 * sizeof(float), 0, cudaMemcpyHostToDevice );
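	/* Up-leg coupling (added note): prolongation2DHalf below interpolates the
	   coarse-grid correction back onto this finer level (judging by the
	   kernel names, this variant adds the correction, while the NoAdd variant
	   used in the F-cycle overwrites instead), and the nPost red/black sweeps
	   smooth the interpolation error. tempRatioPhi is recomputed on every
	   level but never uploaded to constant memory here; the phi coupling
	   reaches the kernels through the precomputed d_coef3 array instead. */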
	// set kernel grid size and block size
	dim3 grid_BlockPerGrid((grid_RRow < 16) ? 1 : (grid_RRow / 16), (grid_ZColumn < 16) ? 1 : (grid_ZColumn / 16), PhiSlice);
	dim3 grid_ThreadPerBlock(16, 16);

	// prolongation
	prolongation2DHalf<<< grid_BlockPerGrid, grid_ThreadPerBlock >>>( d_VPotential, grid_RRow, grid_ZColumn, grid_PhiSlice );
	// cudaDeviceSynchronize();

	// red-black Gauss-Seidel relaxation (nPost times)
	for (int i = 0; i < nPost; i++) {
		relaxationGaussSeidelRed<<< grid_BlockPerGrid, grid_ThreadPerBlock >>>( d_VPotential, d_RhoChargeDensity, grid_RRow, grid_ZColumn, grid_PhiSlice, d_coef1, d_coef2, d_coef3, d_coef4 );
		// cudaDeviceSynchronize();
		relaxationGaussSeidelBlack<<< grid_BlockPerGrid, grid_ThreadPerBlock >>>( d_VPotential, d_RhoChargeDensity, grid_RRow, grid_ZColumn, grid_PhiSlice, d_coef1, d_coef2, d_coef3, d_coef4 );
		// cudaDeviceSynchronize();
	}
}
/* V-cycle ends */

errorCalculation<<< error_BlockPerGrid, error_ThreadPerBlock >>>( d_VPotentialPrev, d_VPotential, d_EpsilonError, RRow, ZColumn, PhiSlice );
cudaMemcpy( EpsilonError, d_EpsilonError, 1 * sizeof(float), cudaMemcpyDeviceToHost );

errorConv[cycle] = *EpsilonError / (RRow * ZColumn * PhiSlice);
if (errorConv[cycle] < convErr) {
	nCycle = cycle;
	iparam[3] = nCycle;
	break;
}

cudaMemcpy( d_VPotentialPrev, d_VPotential, RRow * ZColumn * PhiSlice * sizeof(float), cudaMemcpyDeviceToDevice );
cudaMemset( d_EpsilonError, 0, 1 * sizeof(float) );
}

cudaDeviceSynchronize();

// copy result from device to host
cudaMemcpy( temp_VPotential, d_VPotential, RRow * ZColumn * PhiSlice * sizeof(float), cudaMemcpyDeviceToHost );
memcpy(VPotential, temp_VPotential, RRow * ZColumn * PhiSlice * sizeof(float));

// free device memory
cudaFree( d_VPotential );
cudaFree( d_VPotentialPrev );
cudaFree( d_EpsilonError );
cudaFree( d_DeltaResidue );
cudaFree( d_RhoChargeDensity );
cudaFree( d_coef1 );
cudaFree( d_coef2 );
cudaFree( d_coef3 );
cudaFree( d_coef4 );
cudaFree( d_icoef4 );

// free host memory
free( coef1 );
free( coef2 );
free( coef3 );
free( coef4 );
free( icoef4 );
free( temp_VPotential );
//free( temp_VPotentialPrev );
}

/* extern function */
extern "C" void PoissonMultigrid3DSemiCoarseningGPUErrorFCycle
(
	float *VPotential,
	float *RhoChargeDensity,
	const int RRow,
	const int ZColumn,
	const int PhiSlice,
	const int Symmetry,
	float *fparam,
	int *iparam,
	bool isExactPresent,
	float *errorConv,
	float *errorExact,
	float *VPotentialExact // allocated by the client
)
{
	// variables for CPU memory
	float *temp_VPotential;
	float *VPotentialPrev;
	float *EpsilonError;

	// variables for GPU memory
	float *d_VPotential;
	float *d_RhoChargeDensity;
	float *d_DeltaResidue;
	float *d_coef1;
	float *d_coef2;
	float *d_coef3;
	float *d_coef4;
	float *d_icoef4;
	float *d_VPotentialPrev;
	float *d_EpsilonError;

	// variables for coefficient calculations
	float *coef1;
	float *coef2;
	float *coef3;
	float *coef4;
	float *icoef4;

	float tempRatioZ;
	float tempRatioPhi;
	float radius;

	int gridFrom;
	int gridTo;
	int loops;

	// variables passed from ALIROOT
	float gridSizeR = fparam[0];
	//float gridSizePhi = fparam[1];
	//float gridSizeZ = fparam[2];
	float ratioPhi  = fparam[3];
	float ratioZ    = fparam[4];
	float convErr   = fparam[5];
	float IFCRadius = fparam[6];

	int nPre    = iparam[0];
	int nPost   = iparam[1];
	int maxLoop = iparam[2];
	int nCycle  = iparam[3];
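	/* Storage layout (added note): all multigrid levels are packed
	   back-to-back into single device arrays, and grid_StartPos /
	   coef_StartPos (held in __constant__ memory) select the active level.
	   The sizing loop below sums every level's footprint; e.g.
	   RRow = ZColumn = 129 yields levels of 129, 65, 33, ... points per
	   direction, while PhiSlice is identical on all levels because phi is
	   never coarsened. */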
	// variables for calculating GPU memory allocation
	int grid_RRow;
	int grid_ZColumn;
	int grid_PhiSlice = PhiSlice;
	int grid_Size = 0;
	int grid_StartPos;
	int coef_Size = 0;
	int coef_StartPos;
	int iOne, jOne;
	float h, h2, ih2;

	// variables for calculating multigrid maximum depth
	int depth_RRow = 0;
	int depth_ZColumn = 0;
	int temp_RRow = RRow;
	int temp_ZColumn = ZColumn;

	// calculate depth for multigrid
	while (temp_RRow >>= 1) depth_RRow++;
	while (temp_ZColumn >>= 1) depth_ZColumn++;

	loops = (depth_RRow > depth_ZColumn) ? depth_ZColumn : depth_RRow;
	loops = (loops > maxLoop) ? maxLoop : loops;

	gridFrom = 1;
	gridTo = loops;

	// calculate GPU memory allocation for multigrid
	for (int step = gridFrom; step <= gridTo; step++) {
		grid_RRow = ((RRow - 1) / (1 << (step - 1))) + 1;
		grid_ZColumn = ((ZColumn - 1) / (1 << (step - 1))) + 1;
		grid_Size += grid_RRow * grid_ZColumn * grid_PhiSlice;
		coef_Size += grid_RRow;
	}

	// allocate memory for temporary output
	temp_VPotential = (float *) malloc(grid_Size * sizeof(float));
	VPotentialPrev = (float *) malloc(grid_Size * sizeof(float));
	EpsilonError = (float *) malloc(1 * sizeof(float));

	/* NOTE: the source text is truncated here. A host initialization loop
	   ("for (int i=0;i..."), the coefficient setup (coef1..coef4, icoef4),
	   the device allocations and input copies, and the opening of the
	   F-cycle downward sweep (its loop header, per-level constant uploads
	   and dim3 setup) are missing. The fragment resumes inside that sweep,
	   mid kernel launch; the kernel name was lost, and restriction2DFull is
	   assumed by symmetry with the V-potential restriction below. */
		restriction2DFull<<< grid_BlockPerGrid, grid_ThreadPerBlock >>>( d_RhoChargeDensity, d_RhoChargeDensity, grid_RRow, grid_ZColumn, grid_PhiSlice ); // restrict boundary (already done in cpu)
		/// cudaMemcpy( temp_VPotential, d_RhoChargeDensity + grid_StartPos, grid_RRow * grid_ZColumn * PhiSlice * sizeof(float), cudaMemcpyDeviceToHost );
		// PrintMatrix(temp_VPotential, grid_RRow * PhiSlice, grid_ZColumn);
		restriction2DFull<<< grid_BlockPerGrid, grid_ThreadPerBlock >>>( d_VPotential, d_VPotential, grid_RRow, grid_ZColumn, grid_PhiSlice );
	} // closes the (truncated) downward-sweep loop

	dim3 grid_BlockPerGrid((grid_RRow < 16) ? 1 : (grid_RRow / 16), (grid_ZColumn < 16) ? 1 : (grid_ZColumn / 16), PhiSlice);
	dim3 grid_ThreadPerBlock(16, 16);

	// relax on the coarsest grid
	// red-black Gauss-Seidel relaxation (nPre times)
	// printf("rho\n");
	// cudaMemcpy( temp_VPotential, d_RhoChargeDensity + grid_StartPos, grid_RRow * grid_ZColumn * PhiSlice * sizeof(float), cudaMemcpyDeviceToHost );
	// PrintMatrix(temp_VPotential, grid_RRow, grid_ZColumn);
	// printf("v\n");
	// cudaMemcpy( temp_VPotential, d_VPotential + grid_StartPos, grid_RRow * grid_ZColumn * PhiSlice * sizeof(float), cudaMemcpyDeviceToHost );
	// PrintMatrix(temp_VPotential, grid_RRow, grid_ZColumn);
	for (int i = 0; i < nPre; i++) {
		relaxationGaussSeidelRed<<< grid_BlockPerGrid, grid_ThreadPerBlock >>>( d_VPotential, d_RhoChargeDensity, grid_RRow, grid_ZColumn, grid_PhiSlice, d_coef1, d_coef2, d_coef3, d_coef4 );
		cudaDeviceSynchronize();
		relaxationGaussSeidelBlack<<< grid_BlockPerGrid, grid_ThreadPerBlock >>>( d_VPotential, d_RhoChargeDensity, grid_RRow, grid_ZColumn, grid_PhiSlice, d_coef1, d_coef2, d_coef3, d_coef4 );
		cudaDeviceSynchronize();
	}
	// printf("v after relax\n");
	// cudaMemcpy( temp_VPotential, d_VPotential + grid_StartPos, grid_RRow * grid_ZColumn * PhiSlice * sizeof(float), cudaMemcpyDeviceToHost );
	// PrintMatrix(temp_VPotential, grid_RRow, grid_ZColumn);

	// V-cycle => from coarser to finer grid
	for (int step = gridTo - 1; step >= gridFrom; step--) {
		iOne = iOne / 2;
		jOne = jOne / 2;

		grid_RRow    = ((RRow - 1) / iOne) + 1;
		grid_ZColumn = ((ZColumn - 1) / jOne) + 1;

		grid_StartPos -= grid_RRow * grid_ZColumn * PhiSlice;
		coef_StartPos -= grid_RRow;

		h   = gridSizeR * iOne;
		h2  = h * h;
		ih2 = 1.0 / h2;

		tempRatioZ   = ratioZ * iOne * iOne / (jOne * jOne);
		tempRatioPhi = ratioPhi * iOne * iOne;

		// copy constants to device memory
		cudaMemcpyToSymbol( d_grid_StartPos, &grid_StartPos, 1 * sizeof(int), 0, cudaMemcpyHostToDevice );
		cudaMemcpyToSymbol( d_coef_StartPos, &coef_StartPos, 1 * sizeof(int), 0, cudaMemcpyHostToDevice );
		cudaMemcpyToSymbol( d_h2, &h2, 1 * sizeof(float), 0, cudaMemcpyHostToDevice );
		cudaMemcpyToSymbol( d_ih2, &ih2, 1 * sizeof(float), 0, cudaMemcpyHostToDevice );
		cudaMemcpyToSymbol( d_tempRatioZ, &tempRatioZ, 1 * sizeof(float), 0, cudaMemcpyHostToDevice );
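		/* F-cycle schedule (added note): each pass of this loop prolongates
		   the current solution one level finer (prolongation2DHalfNoAdd
		   below) and then runs up to nCycle V-cycles rooted at that level,
		   stopping early once the cycle-to-cycle change per grid point falls
		   below convErr. The error against VPotentialExact is recorded only
		   at the finest level (step == gridFrom), where the exact solution
		   is defined. */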
		// set kernel grid size and block size
		dim3 grid_BlockPerGrid((grid_RRow < 16) ? 1 : (grid_RRow / 16), (grid_ZColumn < 16) ? 1 : (grid_ZColumn / 16), PhiSlice);
		dim3 grid_ThreadPerBlock(16, 16);

		prolongation2DHalfNoAdd<<< grid_BlockPerGrid, grid_ThreadPerBlock >>>( d_VPotential, grid_RRow, grid_ZColumn, grid_PhiSlice ); // just interpolate ("NoAdd": no correction is added)

		cudaMemcpy( d_VPotentialPrev + grid_StartPos, d_VPotential + grid_StartPos, grid_RRow * grid_ZColumn * PhiSlice * sizeof(float), cudaMemcpyDeviceToDevice );

		// max exact (normalization for the exact-error norm)
		float maxAbsExact = 1.0;
		if (isExactPresent == true) maxAbsExact = GetAbsMax(VPotentialExact, RRow * PhiSlice * ZColumn);

		dim3 error_BlockPerGrid((grid_RRow < 16) ? 1 : (grid_RRow / 16), (grid_ZColumn < 16) ? 1 : (grid_ZColumn / 16), PhiSlice);
		dim3 error_ThreadPerBlock(16, 16);

		for (int cycle = 0; cycle < nCycle; cycle++) {
			if (step == gridFrom) {
				cudaMemcpy( temp_VPotential, d_VPotential, RRow * ZColumn * PhiSlice * sizeof(float), cudaMemcpyDeviceToHost );
				if (isExactPresent == true) errorExact[cycle] = GetErrorNorm2(temp_VPotential, VPotentialExact, RRow * PhiSlice, ZColumn, maxAbsExact);
			}
			//cudaDeviceSynchronize();

			VCycleSemiCoarseningGPU(d_VPotential, d_RhoChargeDensity, d_DeltaResidue, d_coef1, d_coef2, d_coef3, d_coef4, d_icoef4, gridSizeR, ratioZ, ratioPhi, RRow, ZColumn, PhiSlice, step, gridTo, nPre, nPost);

			//if (step == gridFrom) {
			//cudaMemcpy( temp_VPotential, d_VPotential, RRow * ZColumn * PhiSlice * sizeof(float), cudaMemcpyDeviceToHost );
			//errorConv[cycle] = GetErrorNorm2(temp_VPotential, VPotentialPrev, RRow * PhiSlice, ZColumn, 1.0);

			errorCalculation<<< error_BlockPerGrid, error_ThreadPerBlock >>>( d_VPotentialPrev + grid_StartPos, d_VPotential + grid_StartPos, d_EpsilonError, grid_RRow, grid_ZColumn, PhiSlice );
			cudaMemcpy( EpsilonError, d_EpsilonError, 1 * sizeof(float), cudaMemcpyDeviceToHost );

			errorConv[cycle] = *EpsilonError / (grid_RRow * grid_ZColumn * PhiSlice);
			if (errorConv[cycle] < convErr) {
				nCycle = cycle;
				break;
			}

			cudaMemcpy( d_VPotentialPrev + grid_StartPos, d_VPotential + grid_StartPos, grid_RRow * grid_ZColumn * PhiSlice * sizeof(float), cudaMemcpyDeviceToDevice );
			cudaMemset( d_EpsilonError, 0, 1 * sizeof(float) );
		}
	}

	iparam[3] = nCycle;

	// copy result from device to host
	cudaMemcpy( temp_VPotential, d_VPotential, RRow * ZColumn * PhiSlice * sizeof(float), cudaMemcpyDeviceToHost );
	memcpy(VPotential, temp_VPotential, RRow * ZColumn * PhiSlice * sizeof(float));

	// free device memory
	cudaFree( d_VPotential );
	cudaFree( d_DeltaResidue );
	cudaFree( d_RhoChargeDensity );
	cudaFree( d_coef1 );
	cudaFree( d_coef2 );
	cudaFree( d_coef3 );
	cudaFree( d_coef4 );
	cudaFree( d_icoef4 );
	cudaFree( d_VPotentialPrev );
	cudaFree( d_EpsilonError );

	// free host memory
	free( coef1 );
	free( coef2 );
	free( coef3 );
	free( coef4 );
	free( icoef4 );
	free( temp_VPotential );
	free( VPotentialPrev );
	free( EpsilonError );
}
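/* Illustrative usage sketch (added for documentation; not part of the
   original source, and compiled out via #if 0). It shows one plausible way a
   client could drive the F-cycle entry point above. All grid dimensions and
   fparam/iparam values below are placeholders, not values mandated by this
   file; a real caller (e.g. the ALIROOT client) fills RhoChargeDensity and
   the boundary values of VPotential before the call. */
#if 0
#include <stdlib.h>

static void exampleUsage()
{
	const int RRow = 129, ZColumn = 129, PhiSlice = 144, Symmetry = 0; // placeholder sizes
	const int nPoints = RRow * ZColumn * PhiSlice;

	float *VPotential       = (float *) calloc(nPoints, sizeof(float)); // boundary values go here
	float *RhoChargeDensity = (float *) calloc(nPoints, sizeof(float)); // space-charge input
	float *errorConv        = (float *) calloc(200, sizeof(float));     // one entry per cycle
	float *errorExact       = (float *) calloc(200, sizeof(float));

	float fparam[7] = { 0.0f };
	fparam[0] = 0.1f;   // gridSizeR (placeholder spacing)
	fparam[3] = 1.0f;   // ratioPhi  (placeholder)
	fparam[4] = 1.0f;   // ratioZ    (placeholder)
	fparam[5] = 1e-8f;  // convErr: convergence threshold per grid point
	fparam[6] = 83.5f;  // IFCRadius (placeholder)

	int iparam[4];
	iparam[0] = 2;      // nPre:  pre-smoothing sweeps
	iparam[1] = 2;      // nPost: post-smoothing sweeps
	iparam[2] = 6;      // maxLoop: cap on multigrid depth
	iparam[3] = 200;    // nCycle: max cycles; overwritten with the count actually used

	// no exact solution available here, so isExactPresent = false and
	// VPotentialExact may be NULL (it is only read when the flag is true)
	PoissonMultigrid3DSemiCoarseningGPUErrorFCycle(VPotential, RhoChargeDensity, RRow, ZColumn, PhiSlice, Symmetry, fparam, iparam, false, errorConv, errorExact, NULL);

	free(VPotential); free(RhoChargeDensity); free(errorConv); free(errorExact);
}
#endif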