Revision of sum.cpp

frank921 revidoval tento gist 1713812420. Přejít na revizi

1 file changed, 110 insertions

sum.cpp(vytvořil soubor)

		@@ -0,0 +1,110 @@
#include <math.h>

#include <algorithm>
#include <chrono>
#include <cstdlib>
#include <iostream>
#include <stdexcept>
#include <string>
#include <vector>
5	+
6	+	//------------------------------------------------
7	+
/**
 * @brief Serial reference sum of an array on the host.
 * @param arr - pointer to the first element
 * @param N - number of elements to add
 * @return arr[0] + ... + arr[N - 1], accumulated left to right (0 when N is 0)
 */
double Sum(double* arr, size_t N) {
    double total = 0.;
    const double* const end = arr + N;
    // Walk the range by pointer; the accumulation order is strictly front to back.
    for (const double* cursor = arr; cursor != end; ++cursor) {
        total += *cursor;
    }
    return total;
}
15	+
16	+	//-------------------------------------------------
17	+
/**
 * @brief One pairwise-reduction step: res[i] = x[2i] + y[2i + 1].
 *
 * Each thread produces one output element. When N is odd, the last output has no
 * right-hand partner and receives x[N - 1] + 0.
 *
 * Launch layout: 1-D grid of 1-D blocks providing at least ceil(N / 2) threads in
 * total; surplus threads do nothing. No shared memory is used.
 *
 * @param x - device array supplying the even-indexed operands (at least N elements)
 * @param y - device array supplying the odd-indexed operands (at least N elements);
 *            may be the same array as x
 * @param res - device output array with at least ceil(N / 2) elements; must not
 *              overlap x or y, since other threads read those while this one writes
 * @param N - number of input elements
 */
__global__
void pairwiseSum(const double* x, const double* y, double* res, size_t N) {
    // Widen before multiplying: blockIdx.x * blockDim.x is otherwise a 32-bit product.
    const size_t index = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
    const size_t sumIndex = index * 2;

    if (sumIndex < N) {
        // The right-hand operand does not exist for the last element of an odd-length input.
        const double right = (sumIndex + 1 < N) ? y[sumIndex + 1] : 0;
        res[index] = x[sumIndex] + right;
    }
}
27	+
28	+
29	+
30	+
/**
 * @brief Aborts with a diagnostic when a CUDA runtime call did not succeed.
 * @param status - return code of the CUDA call
 * @param what - short description of the call, used in the error message
 */
static void checkCuda(cudaError_t status, const char* what) {
    if (status != cudaSuccess) {
        std::cerr << "CUDA error in " << what << ": "
                  << cudaGetErrorString(status) << std::endl;
        std::abort();
    }
}

/**
 * @brief Computes the sum of the array on the GPU.
 *
 * The input is copied to the device and repeatedly halved with pairwiseSum until
 * at most 20 partial sums remain; those are copied back and added on the host.
 * Two device buffers are allocated once and used alternately as input and output
 * of each halving step. Any CUDA failure prints a message and aborts the process.
 *
 * @param arr - the pointer to the beginning of a host array
 * @param N - the length of the array
 * @return the sum of all N elements (0 when N is 0)
 */
double SumGPU(double *arr, size_t N) {
    // Number of partial sums at or below which the remainder is finished on the host.
    constexpr size_t kHostTailSize = 20;
    constexpr unsigned int kThreadsPerBlock = 1024;

    if (N == 0) {
        return 0.;
    }

    double* current_d = nullptr;  // holds the values still to be reduced
    double* next_d = nullptr;     // receives the output of the next halving step
    size_t currentSize = N;

    checkCuda(cudaMalloc(&current_d, N * sizeof(double)), "cudaMalloc(input)");
    checkCuda(cudaMemcpy(current_d, arr, N * sizeof(double), cudaMemcpyHostToDevice),
              "cudaMemcpy(host to device)");

    if (currentSize > kHostTailSize) {
        // The first step yields ceil(N / 2) values, the largest output of any step.
        checkCuda(cudaMalloc(&next_d, ((N + 1) / 2) * sizeof(double)),
                  "cudaMalloc(scratch)");
    }

    while (currentSize > kHostTailSize) {
        const size_t nextSize = (currentSize + 1) / 2;
        const unsigned int blocksPerGrid =
            static_cast<unsigned int>((nextSize + kThreadsPerBlock - 1) / kThreadsPerBlock);

        pairwiseSum<<<blocksPerGrid, kThreadsPerBlock>>>(current_d, current_d, next_d, currentSize);
        checkCuda(cudaGetLastError(), "pairwiseSum launch");

        // The step's output becomes the next step's input; the old input is reused as scratch.
        double* const previous_d = current_d;
        current_d = next_d;
        next_d = previous_d;
        currentSize = nextSize;
    }

    // currentSize <= kHostTailSize here, so the remainder fits in a fixed stack buffer.
    double tail_h[kHostTailSize];
    // This blocking copy also waits for the kernels above and reports their execution errors.
    checkCuda(cudaMemcpy(tail_h, current_d, currentSize * sizeof(double), cudaMemcpyDeviceToHost),
              "cudaMemcpy(device to host)");

    checkCuda(cudaFree(current_d), "cudaFree(current)");
    // next_d is still null when no halving step ran; cudaFree accepts a null pointer.
    checkCuda(cudaFree(next_d), "cudaFree(scratch)");

    double finalResult = 0;
    for (size_t i = 0; i < currentSize; i++) {
        finalResult += tail_h[i];
    }
    return finalResult;
}
71	+
72	+	//---------------------------------------------------
73	+
/**
 * @brief Times one summation routine on 2^26 pseudo-random doubles in [0, 1].
 *
 * Usage: <program> <alg_ind>, where 0 selects the CPU loop (Sum) and 1 selects the
 * GPU reduction (SumGPU). Prints the elapsed time in microseconds and the total.
 *
 * @return 0 on success, 1 when the algorithm index is missing, unparsable or unknown
 */
int main(int argc, char* argv[]) {
    // setting the random seed to get the same result each time
    std::srand(42);

    // taking as input, which algo to run
    if (argc < 2) {
        std::cerr << "Usage: " << (argc > 0 ? argv[0] : "sum")
                  << " <alg_ind: 0 = CPU, 1 = GPU>" << std::endl;
        return 1;
    }
    int alg_ind = 0;
    try {
        alg_ind = std::stoi(argv[1]);
    } catch (const std::logic_error&) {
        // std::stoi throws invalid_argument / out_of_range, both derived from logic_error.
        std::cerr << "Invalid algorithm index: " << argv[1] << std::endl;
        return 1;
    }
    // Reject unknown indices before spending time on data generation.
    if (alg_ind != 0 && alg_ind != 1) {
        std::cerr << "Unknown algorithm index: " << alg_ind << std::endl;
        return 1;
    }

    // Generating data; the vector owns the buffer and releases it on every return path.
    const size_t N = static_cast<size_t>(1) << 26;
    std::vector<double> arr(N);
    for (size_t i = 0; i < N; ++i) {
        arr[i] = static_cast<double>(std::rand()) / static_cast<double>(RAND_MAX);
    }

    double result = 0.;
    auto start = std::chrono::steady_clock::now();
    switch (alg_ind) {
        case 0:
            result = Sum(arr.data(), N);
            break;
        case 1:
            result = SumGPU(arr.data(), N);
            break;
    }
    auto finish = std::chrono::steady_clock::now();
    auto elapsed = std::chrono::duration_cast<std::chrono::microseconds>(finish - start).count();
    std::cout << "Elapsed time: " << elapsed << std::endl;
    std::cout << "Total result: " << result << std::endl;

    return 0;
}
106	+	// total computation time on CPU: 198.664 ms
107	+
108	+	// total computation time on GPU: 149.512 ms
109	+
110	+	// actual computation time on GPU: 2.451 ms

Novější Starší