sum.cpp
#include <chrono>
#include <cstdlib>   // srand, rand, malloc, free, exit
#include <iostream>
#include <string>    // std::stoi
//------------------------------------------------
// Straightforward sequential sum on the CPU, used as the baseline.
double Sum(double* arr, size_t N) {
    double result = 0.;
    for (size_t i = 0; i < N; ++i) {
        result += arr[i];
    }
    return result;
}
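//-------------------------------------------------
// Aside (not in the source): a sequential sum and the GPU's pairwise
// reduction round in different orders, so the two results can differ in the
// last few bits. A compensated (Kahan) sum is a minimal sketch of one way to
// tighten the CPU baseline; KahanSum is a hypothetical helper, not part of
// the original program.
double KahanSum(const double* arr, size_t N) {
    double sum = 0., c = 0.;          // c carries the lost low-order bits
    for (size_t i = 0; i < N; ++i) {
        double y = arr[i] - c;        // apply the running compensation
        double t = sum + y;           // big + small: low bits of y are lost
        c = (t - sum) - y;            // algebraically 0; recovers what was lost
        sum = t;
    }
    return sum;
}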
//-------------------------------------------------
__global__
void pairwiseSum(const double *in, double *out, size_t N) {
    // Each thread adds one adjacent pair: out[i] = in[2i] + in[2i + 1].
    // When N is odd, the last element has no partner and is carried through.
    size_t index = blockIdx.x * blockDim.x + threadIdx.x;
    size_t sumIndex = index * 2;
    if (sumIndex < N) {
        out[index] = in[sumIndex] + ((sumIndex + 1) < N ? in[sumIndex + 1] : 0.);
    }
}
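// Example of one reduction step (illustrative values, not from the source):
//   in  = {1, 2, 3, 4, 5}              (currentSize = 5)
//   out = {1+2, 3+4, 5} = {3, 7, 5}    (nextSize = (5+1)/2 = 3)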
/**
 * @brief Computes the sum of the array by repeated pairwise reduction on the GPU
 * @param arr - the pointer to the beginning of an array
 * @param N - the length of the array
 * @return the sum of the N elements
 */
double SumGPU(double *arr, size_t N) {
    double *currentArr_d, *nextArr_d;
    size_t currentSize = N, nextSize;

    cudaMalloc(&currentArr_d, currentSize * sizeof(double));
    cudaMemcpy(currentArr_d, arr, currentSize * sizeof(double), cudaMemcpyHostToDevice);

    // Halve the array on the device until it is small enough that finishing
    // the sum on the host is cheaper than another kernel launch.
    while (currentSize > 20) {
        nextSize = (currentSize + 1) / 2;
        cudaMalloc(&nextArr_d, nextSize * sizeof(double));

        size_t threadsPerBlock = 1024;
        size_t blocksPerGrid = (nextSize + threadsPerBlock - 1) / threadsPerBlock;

        pairwiseSum<<<blocksPerGrid, threadsPerBlock>>>(currentArr_d, nextArr_d, currentSize);

        // cudaFree implicitly synchronizes with the pending kernel,
        // so the input buffer is not released while still in use.
        cudaFree(currentArr_d);
        currentArr_d = nextArr_d;
        currentSize = nextSize;
    }

    // Copy the remaining few partial sums back and finish on the CPU.
    double *currentArr_h = (double *) malloc(currentSize * sizeof(double));
    cudaMemcpy(currentArr_h, currentArr_d, currentSize * sizeof(double), cudaMemcpyDeviceToHost);
    double finalResult = 0.;
    for (size_t i = 0; i < currentSize; i++) {
        finalResult += currentArr_h[i];
    }
    free(currentArr_h);
    cudaFree(currentArr_d);
    return finalResult;
}
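// Note (not in the source): the code above ignores CUDA return codes. A
// minimal sketch of an error-checking wrapper; CUDA_CHECK is a hypothetical
// helper name, while cudaGetErrorString is the real runtime API.
#define CUDA_CHECK(call)                                               \
    do {                                                               \
        cudaError_t err = (call);                                      \
        if (err != cudaSuccess) {                                      \
            std::cerr << "CUDA error: " << cudaGetErrorString(err)     \
                      << std::endl;                                    \
            std::exit(EXIT_FAILURE);                                   \
        }                                                              \
    } while (0)
// Usage: CUDA_CHECK(cudaMalloc(&currentArr_d, currentSize * sizeof(double)));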
//---------------------------------------------------
int main(int argc, char* argv[]) {
    // setting the random seed to get the same result each time
    srand(42);

    // taking as input which algorithm to run: 0 = CPU, 1 = GPU
    if (argc < 2) {
        std::cerr << "usage: " << argv[0] << " <0|1>" << std::endl;
        return 1;
    }
    int alg_ind = std::stoi(argv[1]);

    // Generating data
    size_t N = 1 << 26;
    double* arr = (double*) malloc(N * sizeof(double));
    for (size_t i = 0; i < N; ++i) {
        arr[i] = static_cast<double>(rand()) / static_cast<double>(RAND_MAX);
    }

    double result = 0.;
    auto start = std::chrono::steady_clock::now();
    switch (alg_ind) {
        case 0:
            result = Sum(arr, N);
            break;
        case 1:
            result = SumGPU(arr, N);
            break;
    }
    auto finish = std::chrono::steady_clock::now();
    auto elapsed = std::chrono::duration_cast<std::chrono::microseconds>(finish - start).count();
    std::cout << "Elapsed time: " << elapsed << " us" << std::endl;
    std::cout << "Total result: " << result << std::endl;

    free(arr);  // arr came from malloc, so free it (not delete[])
    return 0;
}
// The program reports microseconds, so the measured times are:
// total computation time on CPU: 198,664 us (~199 ms)
// total computation time on GPU: 149,512 us (~150 ms)
// actual computation time on GPU:  2,451 us (~2.5 ms)
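// Build note (an assumption about the toolchain, not from the source): nvcc
// compiles .cpp files as plain host C++, so either rename the file to sum.cu
// or pass -x cu, e.g.:
//   nvcc -x cu -O2 sum.cpp -o sum
//   ./sum 0   # CPU version
//   ./sum 1   # GPU version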