frank921 ревизий этого фрагмента . К ревизии
1 file changed, 110 insertions
sum.cpp(файл создан)
| @@ -0,0 +1,110 @@ | |||
#include <algorithm>
#include <chrono>
#include <cstdlib>
#include <exception>
#include <iostream>
#include <math.h>
#include <stdexcept>
#include <string>
| 5 | + | ||
| 6 | + | //------------------------------------------------ | |
| 7 | + | ||
/**
 * @brief Adds up the elements of a host array one after another.
 * @param arr - pointer to the first element of the array
 * @param N - number of elements to add
 * @return the accumulated total; 0 when N is 0
 */
double Sum(double* arr, size_t N) {
    double total = 0.;
    const double* const end = arr + N;
    // Walk the range front to back so the additions happen in index order.
    for (const double* it = arr; it != end; ++it) {
        total += *it;
    }
    return total;
}
| 15 | + | ||
| 16 | + | //------------------------------------------------- | |
| 17 | + | ||
/**
 * @brief One pairwise-reduction step: res[i] = x[2*i] + y[2*i + 1].
 *
 * Each thread produces at most one output element. When N is odd, the last
 * input element has no partner and is added to 0 instead.
 *
 * Launch layout: 1D grid of 1D blocks with at least ceil(N / 2) threads in
 * total; threads past that bound do nothing. No shared memory is used.
 *
 * @param x - device array supplying the even-indexed operands (N elements)
 * @param y - device array supplying the odd-indexed operands (N elements);
 *            may be the same array as x
 * @param res - device output array with room for ceil(N / 2) elements; must
 *              not overlap x or y, since other threads read those concurrently
 * @param N - number of input elements
 */
__global__
void pairwiseSum(const double *x, const double *y, double *res, size_t N) {
    // Widen before multiplying: blockIdx.x * blockDim.x would otherwise be
    // computed in 32-bit unsigned arithmetic and wrap around on large grids.
    const size_t index = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
    const size_t sumIndex = index * 2;

    if (sumIndex < N) {
        // The right operand is missing for the final element of an odd-length input.
        const double right = (sumIndex + 1 < N) ? y[sumIndex + 1] : 0.;
        res[index] = x[sumIndex] + right;
    }
}
| 27 | + | ||
| 28 | + | ||
| 29 | + | ||
| 30 | + | ||
/// Throws std::runtime_error naming the failed step when a CUDA call did not succeed.
static void throwOnCudaError(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        throw std::runtime_error(std::string(what) + ": " + cudaGetErrorString(status));
    }
}

/**
 * @brief Computes the sum of the array on the GPU
 *
 * The array is copied to the device and repeatedly halved with pairwiseSum
 * until at most 20 partial sums remain; those are copied back and added up
 * on the host.
 *
 * @param arr - the pointer to the beginning of a host array
 * @param N - the length of the array
 * @return the sum of the N elements; 0 when N is 0
 * @throws std::runtime_error if a CUDA allocation, copy or kernel launch fails
 */
double SumGPU(double *arr, size_t N) {
    // Reduction stops once this many (or fewer) partial sums are left.
    constexpr size_t kHostTail = 20;
    constexpr unsigned int kThreadsPerBlock = 1024;

    if (N == 0) {
        return 0.;
    }

    // Two device buffers used alternately as input and output of each pass,
    // allocated once instead of once per pass.
    double *src_d = nullptr;
    double *dst_d = nullptr;
    double tail_h[kHostTail];
    size_t currentSize = N;

    try {
        throwOnCudaError(cudaMalloc(&src_d, N * sizeof(double)), "cudaMalloc (input buffer)");
        if (N > kHostTail) {
            // The first pass produces the largest output: ceil(N / 2) elements.
            throwOnCudaError(cudaMalloc(&dst_d, ((N + 1) / 2) * sizeof(double)),
                             "cudaMalloc (scratch buffer)");
        }
        throwOnCudaError(cudaMemcpy(src_d, arr, N * sizeof(double), cudaMemcpyHostToDevice),
                         "cudaMemcpy (host to device)");

        while (currentSize > kHostTail) {
            const size_t nextSize = (currentSize + 1) / 2;
            const unsigned int blocksPerGrid =
                static_cast<unsigned int>((nextSize + kThreadsPerBlock - 1) / kThreadsPerBlock);

            pairwiseSum<<<blocksPerGrid, kThreadsPerBlock>>>(src_d, src_d, dst_d, currentSize);
            // A launch does not return a status; query it explicitly.
            throwOnCudaError(cudaGetLastError(), "pairwiseSum launch");

            // The output of this pass is the input of the next one.
            double *swapTmp = src_d;
            src_d = dst_d;
            dst_d = swapTmp;
            currentSize = nextSize;
        }

        // Blocking copy: also reports any error raised while the kernels ran.
        throwOnCudaError(cudaMemcpy(tail_h, src_d, currentSize * sizeof(double), cudaMemcpyDeviceToHost),
                         "cudaMemcpy (device to host)");
    } catch (...) {
        cudaFree(src_d);
        cudaFree(dst_d);
        throw;
    }

    cudaFree(src_d);
    cudaFree(dst_d);

    double finalResult = 0;
    for (size_t i = 0; i < currentSize; i++) {
        finalResult += tail_h[i];
    }
    return finalResult;
}
| 71 | + | ||
| 72 | + | //--------------------------------------------------- | |
| 73 | + | ||
/**
 * @brief Times one summation routine over 2^26 pseudo-random doubles in [0, 1].
 *
 * Usage: <program> <alg_ind>, where alg_ind 0 runs Sum (CPU) and 1 runs
 * SumGPU. Prints the elapsed time in microseconds and the computed total.
 *
 * @return 0 on success; 1 on a missing or invalid argument, a failed
 *         allocation, or an exception raised by the selected routine
 */
int main(int argc, char* argv[]) {
    // taking as input, which algo to run
    if (argc < 2) {
        std::cerr << "Usage: " << (argc > 0 ? argv[0] : "sum")
                  << " <alg_ind: 0 = CPU, 1 = GPU>" << std::endl;
        return 1;
    }

    int alg_ind = 0;
    try {
        alg_ind = std::stoi(argv[1]);
    } catch (const std::exception&) {
        std::cerr << "Invalid algorithm index: " << argv[1] << std::endl;
        return 1;
    }
    if (alg_ind != 0 && alg_ind != 1) {
        std::cerr << "Unknown algorithm index: " << alg_ind << std::endl;
        return 1;
    }

    // setting the random seed to get the same result each time
    srand(42);

    // Generating data
    size_t N = 1 << 26;
    double* arr = (double*) malloc(N * sizeof(double));
    if (arr == nullptr) {
        std::cerr << "Failed to allocate " << N * sizeof(double) << " bytes" << std::endl;
        return 1;
    }
    for (size_t i = 0; i < N; ++i) {
        arr[i] = static_cast <double> (rand()) / static_cast <double> (RAND_MAX);
    }

    double result = 0.;
    auto start = std::chrono::steady_clock::now();
    try {
        switch (alg_ind) {
            case 0:
                result = Sum(arr, N);
                break;
            case 1:
                result = SumGPU(arr, N);
                break;
        }
    } catch (const std::exception& e) {
        std::cerr << "Summation failed: " << e.what() << std::endl;
        free(arr);
        return 1;
    }
    auto finish = std::chrono::steady_clock::now();
    auto elapsed = std::chrono::duration_cast<std::chrono::microseconds>(finish - start).count();
    std::cout << "Elapsed time: " << elapsed << std::endl;
    std::cout << "Total result: " << result << std::endl;

    // The buffer came from malloc, so it must be released with free, not delete[].
    free(arr);
    return 0;
}
| 106 | + | // total computation time on CPU: 198,664 ms | |
| 107 | + | ||
| 108 | + | // total computation time on GPU: 149,512 ms | |
| 109 | + | ||
| 110 | + | // actual computation time on GPU: 2,451 ms | |
Новее
Позже