Dernière activité 1713812421

sum.cpp Brut
#include <algorithm>
#include <chrono>
#include <cstdlib>
#include <iostream>
#include <math.h>
5
6//------------------------------------------------
7
/**
 * @brief Sequentially adds up the elements of an array on the host.
 * @param arr - pointer to the first element
 * @param N - number of elements to add
 * @return the left-to-right running total (0 when N is 0)
 */
double Sum(double* arr, size_t N) {
    double total = 0.;
    const double* const stop = arr + N;
    // Walk the range front to back so the additions happen in index order.
    for (const double* cursor = arr; cursor != stop; ++cursor) {
        total += *cursor;
    }
    return total;
}
15
16//-------------------------------------------------
17
/**
 * @brief Adds adjacent element pairs: res[i] = x[2*i] + y[2*i + 1].
 *
 * Each thread produces at most one output element, selected by its flat 1D
 * global index, so the launch must provide at least ceil(N / 2) threads in
 * total. When N is odd the last output has no partner and receives x[N - 1]
 * unchanged. Threads whose pair starts at or beyond N do nothing.
 *
 * @param x - array supplying the even-indexed operands (N elements)
 * @param y - array supplying the odd-indexed operands (N elements)
 * @param res - output array, at least ceil(N / 2) elements
 * @param N - number of input elements
 */
__global__
void pairwiseSum(double *x, double *y, double *res, size_t N) {
    // Widen before multiplying: blockIdx.x * blockDim.x would otherwise be
    // evaluated in 32-bit unsigned arithmetic and wrap on very large grids.
    const size_t index = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
    const size_t sumIndex = index * 2;

    if (sumIndex < N) {
        // The odd-indexed partner is missing only for the last pair of an odd-length input.
        const double partner = (sumIndex + 1 < N) ? y[sumIndex + 1] : 0.0;
        res[index] = x[sumIndex] + partner;
    }
}
27
28
29
30
/**
 * @brief Aborts the program with a diagnostic if a CUDA runtime call failed.
 * @param status - the value returned by the CUDA runtime call
 * @param what - short description of the call, printed in the diagnostic
 */
static void sumGpuCheck(cudaError_t status, const char* what) {
    if (status != cudaSuccess) {
        std::cerr << "CUDA error in " << what << ": "
                  << cudaGetErrorString(status) << std::endl;
        std::abort();
    }
}

/**
 * @brief Computes the sum of the array
 *
 * The input is copied to the GPU and repeatedly halved with pairwiseSum until
 * at most 20 partial sums remain; those are copied back and added on the host.
 * Two device buffers are allocated once and swapped between passes instead of
 * allocating and freeing a buffer on every pass.
 *
 * @param arr - the pointer to the beginning of an array
 * @param N - the length of the array
 * @return the sum of the N elements (0 when N is 0)
 */
double SumGPU(double *arr, size_t N) {
    // Below this many elements the remaining partial sums are added on the host.
    constexpr size_t kHostTail = 20;
    constexpr size_t kThreadsPerBlock = 1024;

    if (N == 0) {
        return 0.;
    }

    double *currentArr_d = nullptr;
    double *nextArr_d = nullptr;
    size_t currentSize = N;

    sumGpuCheck(cudaMalloc(&currentArr_d, currentSize * sizeof(double)), "cudaMalloc (input)");
    sumGpuCheck(cudaMemcpy(currentArr_d, arr, currentSize * sizeof(double), cudaMemcpyHostToDevice),
                "cudaMemcpy (host to device)");

    if (currentSize > kHostTail) {
        // The first pass produces the largest output, so this capacity covers every
        // later pass; after the first swap the other buffer (N elements) is larger still.
        sumGpuCheck(cudaMalloc(&nextArr_d, ((currentSize + 1) / 2) * sizeof(double)),
                    "cudaMalloc (scratch)");
    }

    while (currentSize > kHostTail) {
        const size_t nextSize = (currentSize + 1) / 2;
        const unsigned int blocksPerGrid =
            static_cast<unsigned int>((nextSize + kThreadsPerBlock - 1) / kThreadsPerBlock);

        pairwiseSum<<<blocksPerGrid, static_cast<unsigned int>(kThreadsPerBlock)>>>(
            currentArr_d, currentArr_d, nextArr_d, currentSize);
        // A kernel launch returns nothing; configuration errors surface here.
        sumGpuCheck(cudaGetLastError(), "pairwiseSum launch");

        // The output of this pass is the input of the next one. Launches are issued
        // to the same (default) stream, so the next pass starts after this one ends.
        std::swap(currentArr_d, nextArr_d);
        currentSize = nextSize;
    }

    // currentSize <= kHostTail here, so a fixed-size stack buffer is enough.
    double tail_h[kHostTail];
    sumGpuCheck(cudaMemcpy(tail_h, currentArr_d, currentSize * sizeof(double), cudaMemcpyDeviceToHost),
                "cudaMemcpy (device to host)");

    double finalResult = 0;
    for (size_t i = 0; i < currentSize; i++) {
        finalResult += tail_h[i];
    }

    sumGpuCheck(cudaFree(currentArr_d), "cudaFree");
    sumGpuCheck(cudaFree(nextArr_d), "cudaFree");  // no-op when the scratch buffer was never allocated

    return finalResult;
}
71
72//---------------------------------------------------
73
/**
 * @brief Times one summation routine on 2^26 pseudo-random doubles in [0, 1].
 *
 * Usage: <program> <alg_ind>, where 0 selects Sum (CPU) and 1 selects SumGPU.
 * Prints the elapsed time in microseconds and the computed total.
 *
 * @return 0 on success, 1 on a missing/invalid argument or a failed allocation
 */
int main(int argc, char* argv[]) {
    // setting the random seed to get the same result each time
    srand(42);

    // taking as input, which algo to run
    if (argc < 2) {
        std::cerr << "Usage: " << (argc > 0 ? argv[0] : "sum")
                  << " <alg_ind: 0 = CPU, 1 = GPU>" << std::endl;
        return 1;
    }
    char* parseEnd = nullptr;
    const long alg_ind = std::strtol(argv[1], &parseEnd, 10);
    const bool parsed = (parseEnd != argv[1]) && (*parseEnd == '\0');
    if (!parsed || (alg_ind != 0 && alg_ind != 1)) {
        std::cerr << "Invalid algorithm index '" << argv[1]
                  << "': expected 0 (CPU) or 1 (GPU)" << std::endl;
        return 1;
    }

    // Generating data
    const size_t N = static_cast<size_t>(1) << 26;
    double* arr = static_cast<double*>(malloc(N * sizeof(double)));
    if (arr == nullptr) {
        std::cerr << "Failed to allocate " << N * sizeof(double) << " bytes" << std::endl;
        return 1;
    }
    for (size_t i = 0; i < N; ++i) {
        arr[i] = static_cast<double>(rand()) / static_cast<double>(RAND_MAX);
    }

    double result = 0.;
    auto start = std::chrono::steady_clock::now();
    switch (alg_ind) {
        case 0:
            result = Sum(arr, N);
            break;
        case 1:
            result = SumGPU(arr, N);
            break;
    }
    auto finish = std::chrono::steady_clock::now();
    auto elapsed = std::chrono::duration_cast<std::chrono::microseconds>(finish - start).count();
    std::cout << "Elapsed time: " << elapsed << std::endl;
    std::cout << "Total result: " << result << std::endl;

    // The buffer came from malloc, so it must be released with free, not delete[].
    free(arr);
    return 0;
}
106// total computation time on CPU: 198,664 ms
107
108// total computation time on GPU: 149,512 ms
109
110// actual computation time on GPU: 2,451 ms