C++/TBC++
2021. 3. 30. 00:06
멀티쓰레딩 예제 (벡터 내적)
병렬로 처리할 때, 정확한 계산을 위해 레이스 컨디션을 고려해주는 것은 필수이다.
시간적인 요소 또한 중요하므로, 쓰레드의 개수 등을 조절하여 최적화를 해야 한다.
코드
Release 모드, x64 환경에서 컴파일해야 빠르다. Debug
// Debug mode is painfully slow to wait on; build in Release mode for x64 instead.

#include <iostream>
#include <vector>
#include <random>
#include <chrono>
#include <numeric>
#include <thread>
#include <mutex>
#include <atomic>
#include <future>
#include <execution>

/// Guards the shared accumulator written by dotProductLock.
std::mutex mtx;

/// @brief Computes the partial dot product of v0 and v1 over [i_start, i_end).
/// @return The partial sum; intended to be retrieved through a std::future.
unsigned long long dotProductFuture(const std::vector<int>& v0, const std::vector<int>& v1,
                                    const unsigned i_start, const unsigned i_end)
{
    unsigned long long sum = 0;
    for (unsigned i = i_start; i < i_end; ++i)
        // Widen before multiplying so the product is never formed in int.
        sum += static_cast<unsigned long long>(v0[i]) * static_cast<unsigned long long>(v1[i]);
    return sum;
}

/// @brief Adds the partial dot product over [i_start, i_end) to an atomic accumulator.
/// @param sum Shared accumulator; updated once per call with a single atomic add.
void dotProductAtomic(const std::vector<int>& v0, const std::vector<int>& v1,
                      const unsigned i_start, const unsigned i_end,
                      std::atomic<unsigned long long>& sum)
{
    // Accumulate locally so the atomic is touched only once per thread.
    unsigned long long tmp = 0;
    for (unsigned i = i_start; i < i_end; ++i)
        tmp += static_cast<unsigned long long>(v0[i]) * static_cast<unsigned long long>(v1[i]);
    sum += tmp;
}

/// @brief Adds the partial dot product over [i_start, i_end) to sum while holding mtx.
/// @param sum Shared accumulator; only modified inside the critical section.
void dotProductLock(const std::vector<int>& v0, const std::vector<int>& v1,
                    const unsigned i_start, const unsigned i_end, unsigned long long& sum)
{
    // Do the heavy work outside the lock; the critical section is a single add.
    unsigned long long tmp = 0;
    for (unsigned i = i_start; i < i_end; ++i)
        tmp += static_cast<unsigned long long>(v0[i]) * static_cast<unsigned long long>(v1[i]);
    {
        std::scoped_lock lock(mtx);
        sum += tmp;
    }
}

/// @brief Adds the partial dot product over [i_start, i_end) directly into sum.
/// @warning Deliberately unsynchronized: when several threads share `sum` this is
///          a data race, kept on purpose to demonstrate the wrong result it produces.
void dotProductNaive(const std::vector<int>& v0, const std::vector<int>& v1,
                     const unsigned i_start, const unsigned i_end, unsigned long long& sum)
{
    for (unsigned i = i_start; i < i_end; ++i)
        sum += static_cast<unsigned long long>(v0[i]) * static_cast<unsigned long long>(v1[i]);
}

namespace
{

/// Prints `title`, times `compute()`, then prints the elapsed seconds and the returned sum.
template <typename Compute>
void runTimed(const char* title, Compute&& compute)
{
    std::cout << title;
    const auto start = std::chrono::steady_clock::now();
    const auto sum = compute();
    const std::chrono::duration<double> elapsed = std::chrono::steady_clock::now() - start;
    std::cout << "Elapsed time : " << elapsed.count() << 's' << '\n';
    std::cout << "Sum : " << sum << '\n';
    std::cout << std::endl;  // one flush per section so results show up as they finish
}

/// Splits [0, v0.size()) into n_threads contiguous chunks, runs `worker` on each
/// chunk in its own thread with `sum` as the shared accumulator, and joins them all.
/// n_threads must be non-zero.
template <typename Worker, typename Sum>
void runOnThreads(Worker worker, const std::vector<int>& v0, const std::vector<int>& v1,
                  const unsigned n_threads, Sum& sum)
{
    const unsigned n_data = static_cast<unsigned>(v0.size());
    const unsigned n_per_thread = n_data / n_threads;

    std::vector<std::thread> threads;
    threads.reserve(n_threads);
    for (unsigned t = 0; t < n_threads; ++t)
    {
        const unsigned i_start = t * n_per_thread;
        // The last chunk absorbs the remainder when n_data is not a multiple of n_threads.
        const unsigned i_end = (t + 1 == n_threads) ? n_data : i_start + n_per_thread;
        threads.emplace_back(worker, std::ref(v0), std::ref(v1), i_start, i_end, std::ref(sum));
    }
    for (auto& th : threads)
        th.join();
}

} // namespace

/// Benchmarks several ways of computing the dot product of two random vectors:
/// sequential, racy threads, mutex, atomic, futures, and a parallel algorithm.
int main()
{
    const long long n_data = 100'000'000;
    const unsigned n_threads = 4;

    std::vector<int> v0, v1;
    v0.reserve(n_data);
    v1.reserve(n_data);

    std::random_device seed;
    std::mt19937 engine(seed());
    std::uniform_int_distribution<> uniformDist(1, 10);

    for (long long i = 0; i < n_data; ++i)
    {
        v0.push_back(uniformDist(engine));
        v1.push_back(uniformDist(engine));
    }

    runTimed("std::inner_product (Not parallel computing)\n", [&] {
        return std::inner_product(v0.begin(), v0.end(), v1.begin(), 0ull);
    });

    runTimed("Naive (Race Condition)\n", [&] {
        unsigned long long sum = 0;
        runOnThreads(dotProductNaive, v0, v1, n_threads, sum);
        return sum;
    });

    runTimed("Lockguard\n", [&] {
        unsigned long long sum = 0;
        runOnThreads(dotProductLock, v0, v1, n_threads, sum);
        return sum;
    });

    runTimed("Atomic\n", [&] {
        std::atomic<unsigned long long> sum{0};
        runOnThreads(dotProductAtomic, v0, v1, n_threads, sum);
        return sum.load();
    });

    runTimed("Future\n", [&] {
        const unsigned n_total = static_cast<unsigned>(v0.size());
        const unsigned n_per_thread = n_total / n_threads;

        std::vector<std::future<unsigned long long>> futures;
        futures.reserve(n_threads);
        for (unsigned t = 0; t < n_threads; ++t)
        {
            const unsigned i_start = t * n_per_thread;
            const unsigned i_end = (t + 1 == n_threads) ? n_total : i_start + n_per_thread;
            // Force a real thread; the default policy may defer and run serially at get().
            futures.push_back(std::async(std::launch::async, dotProductFuture,
                                         std::ref(v0), std::ref(v1), i_start, i_end));
        }

        unsigned long long sum = 0;
        for (auto& f : futures)
            sum += f.get();
        return sum;
    });

    runTimed("std::transform_reduce (Parallel)\n", [&] {
        // Explicit 64-bit operations keep every intermediate value in unsigned long long,
        // however the implementation groups the reduction.
        return std::transform_reduce(
            std::execution::par, v0.begin(), v0.end(), v1.begin(), 0ull,
            [](unsigned long long a, unsigned long long b) { return a + b; },
            [](int a, int b) {
                return static_cast<unsigned long long>(a) * static_cast<unsigned long long>(b);
            });
    });
}

/* stdout
std::inner_product (Not parallel computing)
Elapsed time : 0.0370022s
Sum : 3024919192

Naive (Race Condition)
Elapsed time : 0.302984s
Sum : 877824373

Lockguard
Elapsed time : 0.0200525s
Sum : 3024919192

Atomic
Elapsed time : 0.0201991s
Sum : 3024919192

Future
Elapsed time : 0.020037s
Sum : 3024919192

std::transform_reduce (Parallel)
Elapsed time : 0.0214739s
Sum : 3024919192
*/
'C++ > TBC++' 카테고리의 다른 글
C++ TCP/IP 네트워킹 (TCP/IP Networking) (0) | 2021.03.30 |
---|---|
따라하며 배우는 C++ 20장 (0) | 2021.03.30 |
따라하며 배우는 C++ 19장 (0) | 2021.03.30 |
따라하며 배우는 C++ 18장 (0) | 2021.03.26 |
따라하며 배우는 C++ 17장 (0) | 2021.03.26 |