Naposledy aktivní 1713812421

frank921 revidoval tento gist 1713812420. Přejít na revizi

1 file changed, 110 insertions

sum.cpp(vytvořil soubor)

@@ -0,0 +1,110 @@
#include <algorithm>
#include <chrono>
#include <cstdlib>
#include <iostream>
#include <math.h>
5 +
6 + //------------------------------------------------
7 +
/**
 * @brief Sequentially adds up the elements of a host array.
 * @param arr - pointer to the first element
 * @param N - number of elements to add
 * @return the total, accumulated front to back starting from 0
 */
double Sum(double* arr, size_t N) {
    double total = 0.;
    double* const stop = arr + N;
    // Walk the array with a cursor; the accumulation order is first to last.
    for (double* cursor = arr; cursor != stop; ++cursor) {
        total += *cursor;
    }
    return total;
}
15 +
16 + //-------------------------------------------------
17 +
/**
 * @brief One pairwise-reduction step: res[i] = x[2*i] + y[2*i + 1].
 *
 * Expects a 1D grid of 1D blocks providing at least ceil(N / 2) threads in
 * total; threads whose pair starts at or beyond N do nothing. When N is odd
 * the last element has no partner and is added to 0.
 *
 * @param x - device array supplying the even-indexed operand of each pair
 * @param y - device array supplying the odd-indexed operand (may be the same array as x)
 * @param res - device output array with room for (N + 1) / 2 elements; it must not
 *              overlap x or y, since other threads read them while res is written
 * @param N - number of input elements
 */
__global__
void pairwiseSum(const double *x, const double *y, double *res, size_t N) {
    // Widen before multiplying: blockIdx.x * blockDim.x would otherwise be
    // evaluated in 32-bit unsigned arithmetic and could wrap on very large grids.
    const size_t index = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
    const size_t sumIndex = index * 2;

    if (sumIndex < N) {
        // The odd-length tail has no right-hand neighbour.
        const double right = (sumIndex + 1 < N) ? y[sumIndex + 1] : 0.0;
        res[index] = x[sumIndex] + right;
    }
}
27 +
28 +
29 +
30 +
// Reports a failed CUDA runtime call on stderr and terminates the process.
static void checkCuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        std::cerr << "CUDA error during " << what << ": "
                  << cudaGetErrorString(status) << std::endl;
        std::exit(EXIT_FAILURE);
    }
}

/**
 * @brief Computes the sum of the array on the GPU by repeated pairwise reduction
 *
 * The input is copied to the device and halved with pairwiseSum until at most
 * 20 partial sums remain; those are copied back and added on the host.
 * Two device buffers are allocated once and used alternately as source and
 * destination, so no allocation happens inside the reduction loop.
 * Any CUDA runtime failure is reported on stderr and terminates the process.
 *
 * @param arr - the pointer to the beginning of a host array
 * @param N - the length of the array
 * @return the sum of the N elements (0 for an empty array)
 */
double SumGPU(double *arr, size_t N) {
    // Once this few partial sums remain, finishing on the host is cheaper than another launch.
    const size_t hostTailSize = 20;
    const size_t threadsPerBlock = 1024;

    if (N == 0) {
        return 0.;
    }

    // buffers_d[0] holds the input (N elements); buffers_d[1] is the scratch
    // buffer sized for the largest reduction output, (N + 1) / 2 elements.
    double *buffers_d[2] = {nullptr, nullptr};
    checkCuda(cudaMalloc(&buffers_d[0], N * sizeof(double)), "cudaMalloc (input buffer)");
    if (N > hostTailSize) {
        checkCuda(cudaMalloc(&buffers_d[1], ((N + 1) / 2) * sizeof(double)),
                  "cudaMalloc (scratch buffer)");
    }
    checkCuda(cudaMemcpy(buffers_d[0], arr, N * sizeof(double), cudaMemcpyHostToDevice),
              "cudaMemcpy (host to device)");

    size_t currentSize = N;
    int src = 0;
    while (currentSize > hostTailSize) {
        const size_t nextSize = (currentSize + 1) / 2;
        const size_t blocksPerGrid = (nextSize + threadsPerBlock - 1) / threadsPerBlock;
        const int dst = 1 - src;

        // Each output is never larger than the destination buffer: the first
        // step writes (N + 1) / 2 elements and every later step writes fewer.
        pairwiseSum<<<static_cast<unsigned int>(blocksPerGrid),
                      static_cast<unsigned int>(threadsPerBlock)>>>(
            buffers_d[src], buffers_d[src], buffers_d[dst], currentSize);
        // Launches do not return a status; configuration errors show up here.
        checkCuda(cudaGetLastError(), "pairwiseSum launch");

        src = dst;
        currentSize = nextSize;
    }

    // At most hostTailSize values are left; this blocking copy also waits for
    // the queued kernels and surfaces any error raised while they ran.
    double tail_h[hostTailSize];
    checkCuda(cudaMemcpy(tail_h, buffers_d[src], currentSize * sizeof(double),
                         cudaMemcpyDeviceToHost),
              "cudaMemcpy (device to host)");

    double finalResult = 0;
    for (size_t i = 0; i < currentSize; i++) {
        finalResult += tail_h[i];
    }

    checkCuda(cudaFree(buffers_d[0]), "cudaFree (input buffer)");
    checkCuda(cudaFree(buffers_d[1]), "cudaFree (scratch buffer)");

    return finalResult;
}
71 +
72 + //---------------------------------------------------
73 +
/**
 * @brief Times one summation algorithm over 2^26 pseudo-random doubles.
 *
 * Usage: <program> <algorithm>, where 0 selects Sum (CPU) and 1 selects
 * SumGPU. Prints the elapsed time in microseconds and the computed total.
 *
 * @return 0 on success, 1 on a missing/invalid argument or failed allocation
 */
int main(int argc, char* argv[]) {
    // taking as input, which algo to run; validate before touching argv[1]
    if (argc < 2) {
        std::cerr << "Usage: " << (argc > 0 ? argv[0] : "sum")
                  << " <algorithm: 0 = CPU, 1 = GPU>" << std::endl;
        return 1;
    }
    char* parseEnd = nullptr;
    const long alg_ind = std::strtol(argv[1], &parseEnd, 10);
    if (parseEnd == argv[1] || *parseEnd != '\0' || (alg_ind != 0 && alg_ind != 1)) {
        std::cerr << "Unknown algorithm '" << argv[1]
                  << "': expected 0 (CPU) or 1 (GPU)" << std::endl;
        return 1;
    }

    // setting the random seed to get the same result each time
    srand(42);

    // Generating data
    const size_t N = static_cast<size_t>(1) << 26;
    double* arr = static_cast<double*>(malloc(N * sizeof(double)));
    if (arr == nullptr) {
        std::cerr << "Failed to allocate " << N * sizeof(double)
                  << " bytes for the input array" << std::endl;
        return 1;
    }
    for (size_t i = 0; i < N; ++i) {
        arr[i] = static_cast<double>(rand()) / static_cast<double>(RAND_MAX);
    }

    double result = 0.;
    // The timed region wraps the whole call, so for the GPU path it includes
    // device allocation and host<->device copies, not just the kernels.
    auto start = std::chrono::steady_clock::now();
    switch (alg_ind) {
        case 0:
            result = Sum(arr, N);
            break;
        case 1:
            result = SumGPU(arr, N);
            break;
    }
    auto finish = std::chrono::steady_clock::now();
    // Reported in microseconds.
    auto elapsed = std::chrono::duration_cast<std::chrono::microseconds>(finish - start).count();
    std::cout << "Elapsed time: " << elapsed << std::endl;
    std::cout << "Total result: " << result << std::endl;

    // arr came from malloc, so it must be released with free, not delete[].
    free(arr);
    return 0;
}
// total computation time on CPU: 198.664 ms

// total computation time on GPU: 149.512 ms

// actual computation time on GPU: 2.451 ms
Novější Starší