#include <math.h>

#include <algorithm>
#include <chrono>
#include <cstdlib>
#include <exception>
#include <iostream>
#include <string>

//------------------------------------------------

/**
 * @brief Sequential host-side reference sum.
 * @param arr - pointer to the first element of the array
 * @param N - number of elements to add up
 * @return the elements added left to right, starting from 0
 */
double Sum(double* arr, size_t N) {
    double total = 0.;
    const double* const last = arr + N;
    // Walk the range front to back so the accumulation order is the
    // plain left-to-right one.
    for (const double* cursor = arr; cursor != last; ++cursor) {
        total += *cursor;
    }
    return total;
}

//-------------------------------------------------

/**
 * @brief One level of a pairwise reduction.
 *
 * The thread with global index i writes res[i] = x[2i] + y[2i + 1]. When
 * 2i + 1 falls outside the input (odd N), the missing right-hand element is
 * treated as 0. Threads with 2i >= N do nothing.
 *
 * Expects a 1D launch with at least ceil(N / 2) threads in total; only the
 * .x components of blockIdx / blockDim / threadIdx are used.
 *
 * @param x - array the left element of each pair (index 2i) is read from
 * @param y - array the right element of each pair (index 2i + 1) is read from
 * @param res - output array; element i is written for every 2i < N
 * @param N - number of input elements
 */
__global__
void pairwiseSum(double *x, double *y, double *res, size_t N) {
    // Widen before multiplying: blockIdx.x * blockDim.x would otherwise be
    // computed in 32-bit unsigned arithmetic and could wrap for huge grids.
    const size_t index = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
    const size_t sumIndex = index * 2;

    if (sumIndex < N) {
        // The last pair has no right-hand partner when N is odd.
        const double right = (sumIndex + 1 < N) ? y[sumIndex + 1] : 0.;
        res[index] = x[sumIndex] + right;
    }
}

    


/**
 * @brief Reports a failed CUDA runtime call on stderr and terminates.
 * @param status - value returned by the CUDA runtime call
 * @param what - short description of the operation, used in the message
 */
static void checkCudaStatus(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        std::cerr << "CUDA error during " << what << ": "
                  << cudaGetErrorString(status) << std::endl;
        std::exit(EXIT_FAILURE);
    }
}

/**
 * @brief Computes the sum of the array
 *
 * The input is copied to the device and repeatedly halved with pairwiseSum
 * until at most 20 partial sums remain; those are copied back and added up
 * on the host. Any CUDA failure is reported on stderr and terminates the
 * process.
 *
 * @param arr - the pointer to the beginning of an array
 * @param N - the length of the array
 * @return the sum of the N elements (0 for an empty array)
 */
double SumGPU(double *arr, size_t N) {
    // Once this few partial sums remain they are finished on the host.
    const size_t kHostTailSize = 20;
    const size_t threadsPerBlock = 1024;

    if (N == 0) {
        return 0.;
    }

    double *currentArr_d = nullptr, *nextArr_d = nullptr;
    size_t currentSize = N;

    checkCudaStatus(cudaMalloc(&currentArr_d, N * sizeof(double)),
                    "input buffer allocation");
    checkCudaStatus(cudaMemcpy(currentArr_d, arr, N * sizeof(double), cudaMemcpyHostToDevice),
                    "host-to-device copy");

    // Allocate the second buffer once and ping-pong between the two instead
    // of calling cudaMalloc/cudaFree on every reduction level. Every level
    // writes at most (N + 1) / 2 elements, so that size is enough for the
    // smaller buffer; the larger one is reused as output on alternate levels.
    if (currentSize > kHostTailSize) {
        checkCudaStatus(cudaMalloc(&nextArr_d, ((N + 1) / 2) * sizeof(double)),
                        "scratch buffer allocation");
    }

    while (currentSize > kHostTailSize) {
        const size_t nextSize = (currentSize + 1) / 2;
        const size_t blocksPerGrid = (nextSize + threadsPerBlock - 1) / threadsPerBlock;

        pairwiseSum<<<static_cast<unsigned int>(blocksPerGrid),
                      static_cast<unsigned int>(threadsPerBlock)>>>(
            currentArr_d, currentArr_d, nextArr_d, currentSize);
        // A launch does not return a status itself; configuration errors
        // are only visible through cudaGetLastError().
        checkCudaStatus(cudaGetLastError(), "pairwiseSum launch");

        // The freshly written buffer becomes the input of the next level.
        double *swapTmp = currentArr_d;
        currentArr_d = nextArr_d;
        nextArr_d = swapTmp;
        currentSize = nextSize;
    }

    // currentSize <= kHostTailSize here, so a fixed-size host buffer is
    // enough. The blocking copy also surfaces errors from the kernels above.
    double tail_h[kHostTailSize];
    checkCudaStatus(cudaMemcpy(tail_h, currentArr_d, currentSize * sizeof(double), cudaMemcpyDeviceToHost),
                    "device-to-host copy");

    double finalResult = 0;
    for (size_t i = 0; i < currentSize; i++) {
        finalResult += tail_h[i];
    }

    checkCudaStatus(cudaFree(currentArr_d), "device buffer release");
    // nextArr_d is still nullptr when no reduction level ran; cudaFree
    // accepts that as a no-op.
    checkCudaStatus(cudaFree(nextArr_d), "device buffer release");

    return finalResult;
}

//---------------------------------------------------

/**
 * @brief Benchmark driver: fills an array with pseudo-random values in
 *        [0, 1], sums it with the selected algorithm and prints the elapsed
 *        time in microseconds together with the result.
 *
 * argv[1] selects the algorithm: 0 runs Sum (CPU), 1 runs SumGPU.
 *
 * @return 0 on success, 1 on a missing/invalid argument or a failed
 *         allocation
 */
int main(int argc, char* argv[]) {
    // setting the random seed to get the same result each time
    srand(42);

    // taking as input, which algo to run
    if (argc < 2) {
        std::cerr << "Usage: " << (argc > 0 ? argv[0] : "sum")
                  << " <algorithm: 0 = CPU, 1 = GPU>" << std::endl;
        return 1;
    }
    int alg_ind = 0;
    try {
        alg_ind = std::stoi(argv[1]);
    } catch (const std::exception&) {
        // std::stoi throws on non-numeric or out-of-range input.
        std::cerr << "Invalid algorithm index: " << argv[1] << std::endl;
        return 1;
    }
    // Reject unknown indices before generating the data, instead of
    // silently reporting a result of 0.
    if (alg_ind != 0 && alg_ind != 1) {
        std::cerr << "Unknown algorithm index: " << alg_ind
                  << " (expected 0 or 1)" << std::endl;
        return 1;
    }

    // Generating data
    size_t N = 1 << 26;
    double* arr = (double*) malloc(N * sizeof(double));
    if (arr == nullptr) {
        std::cerr << "Failed to allocate " << N * sizeof(double)
                  << " bytes of host memory" << std::endl;
        return 1;
    }
    for (size_t i = 0; i < N; ++i) {
        arr[i] = static_cast <double> (rand()) / static_cast <double> (RAND_MAX);
    }

    double result = 0.;
    auto start = std::chrono::steady_clock::now();
    switch (alg_ind) {
        case 0:
            result = Sum(arr, N);
            break;
        case 1:
            result = SumGPU(arr, N);
            break;
    }
    auto finish = std::chrono::steady_clock::now();
    auto elapsed = std::chrono::duration_cast<std::chrono::microseconds>(finish - start).count();
    std::cout << "Elapsed time: " << elapsed << std::endl;
    std::cout << "Total result: " << result << std::endl;

    // The buffer came from malloc, so it must be released with free;
    // delete[] on it is undefined behaviour.
    free(arr);
    return 0;
}
// Measured timings (the program prints microseconds; converted to ms here):
// total computation time on CPU: 198.664 ms

// total computation time on GPU: 149.512 ms

// actual computation time on GPU: 2.451 ms