C++/TBC++ 2021. 3. 30. 00:06

멀티쓰레딩 예제 (벡터 내적)

  • 병렬로 처리할 때, 정확한 계산을 위해 레이스 컨디션을 고려해주는 것은 필수이다.

  • 시간적인 요소 또한 중요하므로, 쓰레드의 개수 등을 조절하여 최적화를 해야 한다.


코드

  • Release 모드, x64 환경에서 컴파일해야 빠르다. Debug 모드에서는 최적화가 적용되지 않아 실행 시간이 매우 길어진다.

    #include <iostream>
    #include <vector>
    #include <random>
    #include <chrono>
    #include <numeric>
    #include <thread>
    #include <mutex>
    #include <atomic>
    #include <future>
    #include <execution>
    
    // Global mutex; dotProductLock holds it while adding its partial sum to the shared total.
    std::mutex mtx;
    
    /// Computes the partial dot product of v0 and v1 over the half-open index
    /// range [i_start, i_end) and returns it.
    ///
    /// The function only reads the two vectors and accumulates into a local
    /// variable, so it touches no shared mutable state.
    ///
    /// @param v0       First input vector (indexed with operator[], no bounds check).
    /// @param v1       Second input vector (indexed with operator[], no bounds check).
    /// @param i_start  First index to process (inclusive).
    /// @param i_end    One past the last index to process.
    /// @return Sum of v0[i] * v1[i] for i in [i_start, i_end); the deduced
    ///         return type is unsigned long long.
    auto    dotProductFuture(const std::vector<int>& v0, const std::vector<int>& v1,
        const unsigned i_start, const unsigned i_end)
    {
        unsigned long long sum = 0;

        for (unsigned i = i_start; i < i_end; ++i)
        {
            // Widen before multiplying: an int * int product overflows
            // (undefined behavior) once the elements get large.
            sum += static_cast<long long>(v0[i]) * v1[i];
        }
        return sum;
    }
    
    /// Computes the partial dot product of v0 and v1 over [i_start, i_end) and
    /// adds it to an atomic accumulator.
    ///
    /// The products are summed into a local variable first, so the atomic is
    /// modified exactly once per call instead of once per element.
    ///
    /// @param v0       First input vector (indexed with operator[], no bounds check).
    /// @param v1       Second input vector (indexed with operator[], no bounds check).
    /// @param i_start  First index to process (inclusive).
    /// @param i_end    One past the last index to process.
    /// @param sum      Atomic accumulator that receives this call's partial sum.
    void    dotProductAtomic(const std::vector<int>& v0, const std::vector<int>& v1,
        const unsigned i_start, const unsigned i_end,
        std::atomic<unsigned long long>& sum)
    {
        unsigned long long tmp = 0;
        for (unsigned i = i_start; i < i_end; ++i)
        {
            // Widen before multiplying: an int * int product overflows
            // (undefined behavior) once the elements get large.
            tmp += static_cast<long long>(v0[i]) * v1[i];
        }
        sum += tmp;  // single atomic read-modify-write per call
    }
    
    /// Computes the partial dot product of v0 and v1 over [i_start, i_end) and
    /// adds it to a shared accumulator while holding the global mutex `mtx`.
    ///
    /// The products are summed into a local variable first, so the mutex is
    /// taken once per call and held only for the final addition.
    ///
    /// @param v0       First input vector (indexed with operator[], no bounds check).
    /// @param v1       Second input vector (indexed with operator[], no bounds check).
    /// @param i_start  First index to process (inclusive).
    /// @param i_end    One past the last index to process.
    /// @param sum      Shared accumulator; modified only while `mtx` is locked.
    void    dotProductLock(const std::vector<int>& v0, const std::vector<int>& v1,
        const unsigned i_start, const unsigned i_end, unsigned long long& sum)
    {
        unsigned long long tmp = 0;

        for (unsigned i = i_start; i < i_end; ++i)
        {
            // Widen before multiplying: an int * int product overflows
            // (undefined behavior) once the elements get large.
            tmp += static_cast<long long>(v0[i]) * v1[i];
        }

        {
            // The lock is released at the end of this scope.
            std::scoped_lock lock(mtx);
            sum += tmp;
        }
    }
    
    /// Adds the partial dot product of v0 and v1 over [i_start, i_end) directly
    /// into `sum`, one element at a time, with no synchronization.
    ///
    /// NOTE: this is the deliberately broken variant of the example. When
    /// several threads call it with the same `sum`, the unsynchronized
    /// read-modify-write is a data race and the total is unreliable. It is
    /// only correct when a single thread owns `sum`.
    ///
    /// @param v0       First input vector (indexed with operator[], no bounds check).
    /// @param v1       Second input vector (indexed with operator[], no bounds check).
    /// @param i_start  First index to process (inclusive).
    /// @param i_end    One past the last index to process.
    /// @param sum      Accumulator updated on every iteration.
    void    dotProductNaive(const std::vector<int>& v0, const std::vector<int>& v1,
        const unsigned i_start, const unsigned i_end, unsigned long long& sum)
    {
        for (unsigned i = i_start; i < i_end; ++i)
        {
            // Widen before multiplying: an int * int product overflows
            // (undefined behavior) once the elements get large.
            sum += static_cast<long long>(v0[i]) * v1[i];
        }
    }
    
    /// Benchmark driver: fills two vectors with random integers in [1, 10] and
    /// computes their dot product six ways, printing the elapsed time (measured
    /// with steady_clock) and the resulting sum for each:
    ///
    ///   1. std::inner_product         - single-threaded baseline
    ///   2. dotProductNaive            - threads adding to one unsynchronized sum
    ///   3. dotProductLock             - partial sums merged under a mutex
    ///   4. dotProductAtomic           - partial sums merged into an atomic
    ///   5. dotProductFuture           - partial sums returned through futures
    ///   6. std::transform_reduce(par) - standard parallel algorithm
    int        main()
    {
        using namespace std;

        const long long n_data = 100'000'000;
        const unsigned  n_threads = 4;

        vector<int> v0, v1;
        v0.reserve(n_data);
        v1.reserve(n_data);

        random_device seed;
        mt19937 engine(seed());

        uniform_int_distribution<> uniformDist(1, 10);

        for (long long i = 0; i < n_data; ++i)
        {
            v0.push_back(uniformDist(engine));
            v1.push_back(uniformDist(engine));
        }

        // Work split shared by every threaded variant: worker t handles the
        // half-open range [chunkBegin(t), chunkEnd(t)). The last worker also
        // takes the remainder, so no element is dropped when n_data is not a
        // multiple of n_threads.
        const unsigned n_total = static_cast<unsigned>(n_data);
        const unsigned n_per_thread = n_total / n_threads;
        const auto chunkBegin = [=](const unsigned t) { return t * n_per_thread; };
        const auto chunkEnd = [=](const unsigned t)
        {
            return (t + 1 == n_threads) ? n_total : (t + 1) * n_per_thread;
        };

        // Prints the timing/result lines that follow each variant's title.
        const auto report = [](const chrono::duration<double>& elapsed, const unsigned long long total)
        {
            cout << "Elapsed time : " << elapsed.count() << 's' << '\n';
            cout << "Sum : " << total << '\n';
            cout << '\n';
        };

        {
            cout << "std::inner_product (Not parallel computing)\n";

            const auto sta = chrono::steady_clock::now();
            // 0ull makes the accumulator unsigned long long.
            const auto sum = std::inner_product(v0.begin(), v0.end(), v1.begin(), 0ull);
            const chrono::duration<double> dur = chrono::steady_clock::now() - sta;

            report(dur, sum);
        }

        {
            cout << "Naive (Race Condition)\n";

            const auto sta = chrono::steady_clock::now();
            unsigned long long sum = 0;

            vector<thread> threads;
            threads.reserve(n_threads);

            // Every worker updates the same `sum` without synchronization;
            // the wrong result is the point of this variant.
            for (unsigned t = 0; t < n_threads; ++t)
                threads.emplace_back(dotProductNaive, ref(v0), ref(v1),
                    chunkBegin(t), chunkEnd(t), ref(sum));
            for (auto& worker : threads)
                worker.join();

            const chrono::duration<double> dur = chrono::steady_clock::now() - sta;

            report(dur, sum);
        }

        {
            cout << "Lockguard\n";

            const auto sta = chrono::steady_clock::now();
            unsigned long long sum = 0;

            vector<thread> threads;
            threads.reserve(n_threads);

            for (unsigned t = 0; t < n_threads; ++t)
                threads.emplace_back(dotProductLock, ref(v0), ref(v1),
                    chunkBegin(t), chunkEnd(t), ref(sum));
            for (auto& worker : threads)
                worker.join();

            const chrono::duration<double> dur = chrono::steady_clock::now() - sta;

            report(dur, sum);
        }

        {
            cout << "Atomic\n";

            const auto sta = chrono::steady_clock::now();
            atomic<unsigned long long> sum{0};

            vector<thread> threads;
            threads.reserve(n_threads);

            for (unsigned t = 0; t < n_threads; ++t)
                threads.emplace_back(dotProductAtomic, ref(v0), ref(v1),
                    chunkBegin(t), chunkEnd(t), ref(sum));
            for (auto& worker : threads)
                worker.join();

            const chrono::duration<double> dur = chrono::steady_clock::now() - sta;

            report(dur, sum.load());
        }

        {
            cout << "Future\n";

            const auto sta = chrono::steady_clock::now();
            unsigned long long sum = 0;

            vector<future<unsigned long long>> futures;
            futures.reserve(n_threads);

            // launch::async forces each task onto its own thread. With the
            // default policy the implementation may defer the task and run it
            // serially inside get().
            for (unsigned t = 0; t < n_threads; ++t)
                futures.push_back(async(launch::async, dotProductFuture, ref(v0), ref(v1),
                    chunkBegin(t), chunkEnd(t)));
            for (auto& partial : futures)
                sum += partial.get();

            const chrono::duration<double> dur = chrono::steady_clock::now() - sta;

            report(dur, sum);
        }

        {
            cout << "std::transform_reduce (Parallel)\n";

            const auto sta = chrono::steady_clock::now();
            // Two-range overload: multiplies element pairs and adds the
            // products onto the 0ull initial value.
            const auto sum = transform_reduce(execution::par, v0.begin(), v0.end(),
                v1.begin(), 0ull);

            const chrono::duration<double> dur = chrono::steady_clock::now() - sta;

            report(dur, sum);
        }
    }
    
    /* stdout
    std::inner_product (Not parallel computing)
    Elapsed time : 0.0370022s
    Sum : 3024919192
    
    Naive (Race Condition)
    Elapsed time : 0.302984s
    Sum : 877824373
    
    Lockguard
    Elapsed time : 0.0200525s
    Sum : 3024919192
    
    Atomic
    Elapsed time : 0.0201991s
    Sum : 3024919192
    
    Future
    Elapsed time : 0.020037s
    Sum : 3024919192
    
    std::transform_reduce (Parallel)
    Elapsed time : 0.0214739s
    Sum : 3024919192
    */

'C++ > TBC++' 카테고리의 다른 글

C++ TCP/IP 네트워킹 (TCP/IP Networking)  (0) 2021.03.30
따라하며 배우는 C++ 20장  (0) 2021.03.30
따라하며 배우는 C++ 19장  (0) 2021.03.30
따라하며 배우는 C++ 18장  (0) 2021.03.26
따라하며 배우는 C++ 17장  (0) 2021.03.26