C++ 멀티쓰레딩 예제 (벡터 내적)

2021. 3. 30. 00:06·C++/TBC++

멀티쓰레딩 예제 (벡터 내적)

  • 병렬로 처리할 때, 정확한 계산을 위해 레이스 컨디션을 고려해주는 것은 필수이다.

    • mutex, atomic, future 등을 사용할 수 있다.
  • 시간적인 요소 또한 중요하므로, 쓰레드의 개수 등을 조절하여 최적화를 해야 한다.


코드

  • Release 모드, x64 환경에서 컴파일해야 빠르다. Debug 모드에서는 최적화가 적용되지 않아 실행 시간이 매우 길어진다.

    #include <iostream>
    #include <vector>
    #include <random>
    #include <chrono>
    #include <numeric>
    #include <thread>
    #include <mutex>
    #include <atomic>
    #include <future>
    #include <execution>
    
    // Global mutex locked by dotProductLock while it adds its partial result
    // to the shared sum.
    std::mutex mtx;
    
    /**
     * Computes the partial dot product of v0 and v1 over the half-open index
     * range [i_start, i_end) and returns it by value.
     *
     * Nothing shared is written, so the function can be run concurrently on
     * disjoint (or even overlapping) ranges, e.g. through std::async.
     *
     * @param v0      first operand; must have at least i_end elements
     * @param v1      second operand; must have at least i_end elements
     * @param i_start first index to include
     * @param i_end   one past the last index to include
     * @return the partial sum as unsigned long long (deduced return type)
     */
    auto    dotProductFuture(const std::vector<int>& v0, const std::vector<int>& v1,
        const unsigned i_start, const unsigned i_end)
    {
        unsigned long long partial = 0;

        for (unsigned i = i_start; i < i_end; ++i)
        {
            // Widen before multiplying: an int * int product can overflow
            // (undefined behaviour) even though the accumulator is 64-bit.
            partial += static_cast<long long>(v0[i]) * v1[i];
        }
        return partial;
    }
    
    /**
     * Adds the partial dot product of v0 and v1 over [i_start, i_end) to an
     * atomic accumulator.
     *
     * The range is summed into a local variable first, so the shared atomic
     * is touched exactly once per call.
     *
     * @param v0      first operand; must have at least i_end elements
     * @param v1      second operand; must have at least i_end elements
     * @param i_start first index to include
     * @param i_end   one past the last index to include
     * @param sum     atomic accumulator that receives the partial result
     */
    void    dotProductAtomic(const std::vector<int>& v0, const std::vector<int>& v1,
        const unsigned i_start, const unsigned i_end,
        std::atomic<unsigned long long>& sum)
    {
        unsigned long long partial = 0;
        for (unsigned i = i_start; i < i_end; ++i)
        {
            // Widen before multiplying so a large int * int product cannot
            // overflow (undefined behaviour) before reaching the accumulator.
            partial += static_cast<long long>(v0[i]) * v1[i];
        }
        sum += partial;
    }
    
    /**
     * Adds the partial dot product of v0 and v1 over [i_start, i_end) to a
     * shared accumulator, holding the global mutex `mtx` during the update.
     *
     * The range is summed into a local variable without the lock; the mutex
     * is taken only for the single addition to `sum`.
     *
     * @param v0      first operand; must have at least i_end elements
     * @param v1      second operand; must have at least i_end elements
     * @param i_start first index to include
     * @param i_end   one past the last index to include
     * @param sum     shared accumulator that receives the partial result
     */
    void    dotProductLock(const std::vector<int>& v0, const std::vector<int>& v1,
        const unsigned i_start, const unsigned i_end, unsigned long long& sum)
    {
        unsigned long long partial = 0;

        for (unsigned i = i_start; i < i_end; ++i)
        {
            // Widen before multiplying so a large int * int product cannot
            // overflow (undefined behaviour) before reaching the accumulator.
            partial += static_cast<long long>(v0[i]) * v1[i];
        }

        // The lock is released when `lock` goes out of scope at function exit.
        const std::scoped_lock lock(mtx);
        sum += partial;
    }
    
    /**
     * Adds the dot product of v0 and v1 over [i_start, i_end) directly into
     * `sum`, one element at a time, with no synchronization.
     *
     * NOTE: when several threads call this with the same `sum`, the
     * read-modify-write on `sum` is a data race and the result is wrong.
     * main() uses this function for its "Naive (Race Condition)" case, so the
     * per-element unsynchronized update is intentionally left as is.
     *
     * @param v0      first operand; must have at least i_end elements
     * @param v1      second operand; must have at least i_end elements
     * @param i_start first index to include
     * @param i_end   one past the last index to include
     * @param sum     accumulator updated in place
     */
    void    dotProductNaive(const std::vector<int>& v0, const std::vector<int>& v1,
        const unsigned i_start, const unsigned i_end, unsigned long long& sum)
    {
        for (unsigned i = i_start; i < i_end; ++i)
        {
            // Widen before multiplying so a large int * int product cannot
            // overflow (undefined behaviour) before reaching the accumulator.
            sum += static_cast<long long>(v0[i]) * v1[i];
        }
    }
    
    /**
     * Benchmark driver.
     *
     * Fills two vectors with n_data random integers in [1, 10] and computes
     * their dot product six ways, printing the elapsed time and the result of
     * each: std::inner_product, unsynchronized threads (racy on purpose),
     * threads + mutex, threads + atomic, std::async futures, and the parallel
     * std::transform_reduce.
     */
    int        main()
    {
        using namespace std;
        using Clock = chrono::steady_clock;

        const long long n_data = 100'000'000;
        const unsigned  n_threads = 4;

        vector<int> v0, v1;
        v0.reserve(n_data);
        v1.reserve(n_data);

        random_device seed;
        mt19937 engine(seed());

        uniform_int_distribution<> uniformDist(1, 10);

        for (long long i = 0; i < n_data; ++i)
        {
            v0.push_back(uniformDist(engine));
            v1.push_back(uniformDist(engine));
        }

        // Worker t handles the half-open index range [chunkBegin(t), chunkEnd(t)).
        // The last worker runs up to n_data so that no elements are dropped when
        // n_data is not a multiple of n_threads.
        const unsigned n_per_thread = static_cast<unsigned>(n_data / n_threads);
        const auto chunkBegin = [=](const unsigned t) {
            return t * n_per_thread;
        };
        const auto chunkEnd = [=](const unsigned t) {
            return (t + 1 == n_threads) ? static_cast<unsigned>(n_data)
                                        : (t + 1) * n_per_thread;
        };

        // Stops the clock first, then prints the timing and the result.
        const auto report = [](const Clock::time_point sta, const unsigned long long sum) {
            const chrono::duration<double> dur = Clock::now() - sta;

            cout << "Elapsed time : " << dur.count() << 's' << '\n';
            cout << "Sum : " << sum << '\n';
            cout << '\n';
        };

        // Launches n_threads threads running `worker` on its chunk with the
        // shared accumulator `sum`, then joins them all.
        const auto runThreaded = [&](auto worker, auto& sum) {
            vector<thread> threads;
            threads.reserve(n_threads);

            for (unsigned t = 0; t < n_threads; ++t)
                threads.emplace_back(worker, ref(v0), ref(v1),
                    chunkBegin(t), chunkEnd(t), ref(sum));
            for (auto& th : threads)
                th.join();
        };

        {
            cout << "std::inner_product (Not parallel computing)\n";

            const auto sta = Clock::now();
            const auto sum = std::inner_product(v0.begin(), v0.end(), v1.begin(), 0ull);

            report(sta, sum);
        }

        {
            // Deliberately racy: every thread updates `sum` without
            // synchronization, so the printed value is expected to be wrong.
            cout << "Naive (Race Condition)\n";

            const auto sta = Clock::now();
            unsigned long long sum = 0;

            runThreaded(dotProductNaive, sum);
            report(sta, sum);
        }

        {
            cout << "Lockguard\n";

            const auto sta = Clock::now();
            unsigned long long sum = 0;

            runThreaded(dotProductLock, sum);
            report(sta, sum);
        }

        {
            cout << "Atomic\n";

            const auto sta = Clock::now();
            atomic<unsigned long long> sum{0};

            runThreaded(dotProductAtomic, sum);
            report(sta, sum.load());
        }

        {
            cout << "Future\n";

            const auto sta = Clock::now();
            unsigned long long sum = 0;

            vector<future<unsigned long long>> futures;
            futures.reserve(n_threads);

            // launch::async forces each task onto its own thread; with the
            // default policy the implementation may defer the task and run it
            // serially inside get().
            for (unsigned t = 0; t < n_threads; ++t)
                futures.push_back(async(launch::async, dotProductFuture,
                    ref(v0), ref(v1), chunkBegin(t), chunkEnd(t)));
            for (auto& partial : futures)
                sum += partial.get();

            report(sta, sum);
        }

        {
            cout << "std::transform_reduce (Parallel)\n";

            const auto sta = Clock::now();
            const auto sum = transform_reduce(execution::par, v0.begin(), v0.end(),
                v1.begin(), 0ull);

            report(sta, sum);
        }
    }
    
    /* stdout
    std::inner_product (Not parallel computing)
    Elapsed time : 0.0370022s
    Sum : 3024919192
    
    Naive (Race Condition)
    Elapsed time : 0.302984s
    Sum : 877824373
    
    Lockguard
    Elapsed time : 0.0200525s
    Sum : 3024919192
    
    Atomic
    Elapsed time : 0.0201991s
    Sum : 3024919192
    
    Future
    Elapsed time : 0.020037s
    Sum : 3024919192
    
    std::transform_reduce (Parallel)
    Elapsed time : 0.0214739s
    Sum : 3024919192
    */
저작자표시 (새창열림)

'C++ > TBC++' 카테고리의 다른 글

C++ TCP/IP 네트워킹 (TCP/IP Networking)  (0) 2021.03.30
따라하며 배우는 C++ 20장  (0) 2021.03.30
따라하며 배우는 C++ 19장  (0) 2021.03.30
따라하며 배우는 C++ 18장  (0) 2021.03.26
따라하며 배우는 C++ 17장  (0) 2021.03.26
'C++/TBC++' 카테고리의 다른 글
  • C++ TCP/IP 네트워킹 (TCP/IP Networking)
  • 따라하며 배우는 C++ 20장
  • 따라하며 배우는 C++ 19장
  • 따라하며 배우는 C++ 18장
Caniro
Caniro
  • Caniro
    Minimalism
    Caniro
  • 전체
    오늘
    어제
    • 분류 전체보기 (317)
      • Algorithm (13)
        • 알기 쉬운 알고리즘 (10)
        • Search (1)
        • Sort (2)
      • Arduino (0)
      • C++ (185)
        • Class (46)
        • Exception (6)
        • Library (51)
        • Overloading (10)
        • SmartPointer (5)
        • Syntax (33)
        • TBC++ (23)
        • Templates (9)
        • VisualStudio (2)
      • Embedded (1)
      • Git (4)
      • Java (5)
      • Linux (16)
        • Error (1)
        • Linux Structure (11)
      • MacOS (7)
      • OS (1)
        • Concurrency (1)
      • Python (21)
        • Class (1)
        • Function (2)
        • Syntax (17)
      • Raspberrypi (9)
      • Review (1)
      • Utility (12)
        • VSCode (5)
        • VirtualBox (3)
      • Web (8)
        • Nginx (1)
        • React (3)
        • Django (1)
      • Windows (20)
        • Registry (3)
        • WSL (1)
        • DeviceDriver (6)
  • 블로그 메뉴

    • 홈
    • 태그
    • 방명록
  • 링크

  • 공지사항

  • 인기 글

  • 태그

    spring
    알림
    EXCLUDE
    SunOS 5.1
    MacOS
    Workspace
    vscode
    윈도우 명령어
    windows
    citrix workspace
    시스템 복구
    스프링
    dism
    로지텍 마우스 제스처
    logi options
    java
    백기선
    Solaris 10
    mspaint
    맥북 카카오톡 알림 안뜸
    윈도우
    그림판
    스프링 프레임워크 핵심 기술
    KakaoTalk
    unix
    SFC
    제외
    Windows 11
  • 최근 댓글

  • 최근 글

  • hELLO· Designed By정상우.v4.10.3
Caniro
C++ 멀티쓰레딩 예제 (벡터 내적)
상단으로

티스토리툴바