// Build in Release mode for x64; otherwise this benchmark runs slowly.
// A Debug build takes painfully long to finish.
#include <iostream>
#include <vector>
#include <random>
#include <chrono>
#include <numeric>
#include <thread>
#include <mutex>
#include <atomic>
#include <future>
#include <execution>
// Guards the shared accumulator that dotProductLock() adds its partial sum to.
std::mutex mtx;
/// Computes the partial dot product of v0 and v1 over the index range
/// [i_start, i_end) and hands it back through the return value, so no shared
/// state is touched.
///
/// @param v0      First operand vector.
/// @param v1      Second operand vector.
/// @param i_start Index of the first element to include.
/// @param i_end   One past the index of the last element to include.
/// @return Sum of v0[i] * v1[i] for every i in [i_start, i_end); 0 for an
///         empty range.
/// @note No bounds checking is performed: both vectors must hold at least
///       i_end elements.
unsigned long long dotProductFuture(const std::vector<int>& v0, const std::vector<int>& v1,
                                    const unsigned i_start, const unsigned i_end)
{
    unsigned long long sum = 0;
    for (unsigned i = i_start; i < i_end; ++i)
    {
        // Widen before multiplying: int * int overflows (undefined behavior)
        // for large elements even though the accumulator is 64-bit.
        const long long product = static_cast<long long>(v0[i]) * v1[i];
        sum += static_cast<unsigned long long>(product);
    }
    return sum;
}
/// Computes the partial dot product of v0 and v1 over [i_start, i_end) and
/// adds it to a shared atomic accumulator.
///
/// The partial sum is accumulated in a local variable and published with a
/// single atomic addition, so the atomic is touched once per call rather
/// than once per element.
///
/// @param v0      First operand vector.
/// @param v1      Second operand vector.
/// @param i_start Index of the first element to include.
/// @param i_end   One past the index of the last element to include.
/// @param sum     Atomic accumulator that receives the partial result.
/// @note No bounds checking is performed: both vectors must hold at least
///       i_end elements.
void dotProductAtomic(const std::vector<int>& v0, const std::vector<int>& v1,
                      const unsigned i_start, const unsigned i_end,
                      std::atomic<unsigned long long>& sum)
{
    unsigned long long tmp = 0;
    for (unsigned i = i_start; i < i_end; ++i)
    {
        // Widen before multiplying: int * int overflows (undefined behavior)
        // for large elements even though the accumulator is 64-bit.
        const long long product = static_cast<long long>(v0[i]) * v1[i];
        tmp += static_cast<unsigned long long>(product);
    }
    sum += tmp;
}
/// Computes the partial dot product of v0 and v1 over [i_start, i_end) and
/// adds it to a shared accumulator while holding the global mutex `mtx`.
///
/// The partial sum is accumulated in a local variable first, so the lock is
/// taken once per call and held only for the final addition.
///
/// @param v0      First operand vector.
/// @param v1      Second operand vector.
/// @param i_start Index of the first element to include.
/// @param i_end   One past the index of the last element to include.
/// @param sum     Shared accumulator that receives the partial result.
/// @note No bounds checking is performed: both vectors must hold at least
///       i_end elements.
void dotProductLock(const std::vector<int>& v0, const std::vector<int>& v1,
                    const unsigned i_start, const unsigned i_end, unsigned long long& sum)
{
    unsigned long long tmp = 0;
    for (unsigned i = i_start; i < i_end; ++i)
    {
        // Widen before multiplying: int * int overflows (undefined behavior)
        // for large elements even though the accumulator is 64-bit.
        const long long product = static_cast<long long>(v0[i]) * v1[i];
        tmp += static_cast<unsigned long long>(product);
    }

    // Critical section: only the single update of the shared sum is locked.
    std::scoped_lock lock(mtx);
    sum += tmp;
}
/// Computes the partial dot product of v0 and v1 over [i_start, i_end),
/// adding every product directly to the caller's accumulator.
///
/// The accumulator is updated per element with no synchronization at all.
/// That is correct for a single caller, but calling this concurrently on
/// the same `sum` is a data race; the benchmark in main() does so on purpose
/// to show the wrong result.
///
/// @param v0      First operand vector.
/// @param v1      Second operand vector.
/// @param i_start Index of the first element to include.
/// @param i_end   One past the index of the last element to include.
/// @param sum     Accumulator that each product is added to.
/// @note No bounds checking is performed: both vectors must hold at least
///       i_end elements.
void dotProductNaive(const std::vector<int>& v0, const std::vector<int>& v1,
                     const unsigned i_start, const unsigned i_end, unsigned long long& sum)
{
    for (unsigned i = i_start; i < i_end; ++i)
    {
        // Widen before multiplying: int * int overflows (undefined behavior)
        // for large elements even though the accumulator is 64-bit.
        const long long product = static_cast<long long>(v0[i]) * v1[i];
        sum += static_cast<unsigned long long>(product);
    }
}
/// Benchmarks several ways of computing the dot product of two large random
/// integer vectors, printing the elapsed time and the result of each:
///   1. std::inner_product (sequential baseline)
///   2. std::thread workers adding to an unsynchronized shared sum
///      (deliberate data race, the printed sum is expected to be wrong)
///   3. std::thread workers adding to a mutex-protected shared sum
///   4. std::thread workers adding to a std::atomic shared sum
///   5. std::async tasks returning partial sums through futures
///   6. std::transform_reduce with the parallel execution policy
int main()
{
    using namespace std;

    const long long n_data = 100'000'000;
    const unsigned n_threads = 4;

    // The worker functions index with `unsigned`, so the element count must fit.
    static_assert(n_data >= 0 && n_data <= static_cast<long long>(~0u),
                  "n_data must be representable as unsigned");

    // Fill both operands with random values in [1, 10].
    vector<int> v0, v1;
    v0.reserve(n_data);
    v1.reserve(n_data);

    random_device seed;
    mt19937 engine(seed());
    uniform_int_distribution<> uniformDist(1, 10);
    for (long long i = 0; i < n_data; ++i)
    {
        v0.push_back(uniformDist(engine));
        v1.push_back(uniformDist(engine));
    }

    // Work partitioning: worker t handles [chunkBegin(t), chunkEnd(t)).
    // The last worker also takes the remainder so that no element is dropped
    // when n_data is not a multiple of n_threads.
    const unsigned n_total = static_cast<unsigned>(n_data);
    const unsigned n_per_thread = n_total / n_threads;
    const auto chunkBegin = [=](const unsigned t) { return t * n_per_thread; };
    const auto chunkEnd = [=](const unsigned t) {
        return (t + 1 == n_threads) ? n_total : (t + 1) * n_per_thread;
    };

    // Prints one benchmark result; flushes once so each result shows up as
    // soon as its section finishes.
    const auto report = [](const chrono::duration<double>& elapsed, const unsigned long long total) {
        cout << "Elapsed time : " << elapsed.count() << 's' << '\n';
        cout << "Sum : " << total << '\n';
        cout << endl;
    };

    {
        cout << "std::inner_product (Not parallel computing)\n";
        const auto sta = chrono::steady_clock::now();

        // 0ull makes the accumulator (and the result) unsigned long long.
        const auto sum = std::inner_product(v0.begin(), v0.end(), v1.begin(), 0ull);

        const chrono::duration<double> dur = chrono::steady_clock::now() - sta;
        report(dur, sum);
    }

    {
        cout << "Naive (Race Condition)\n";
        const auto sta = chrono::steady_clock::now();

        // Intentionally shared without synchronization to demonstrate the race.
        unsigned long long sum = 0;
        vector<thread> threads;
        threads.reserve(n_threads);
        for (unsigned t = 0; t < n_threads; ++t)
            threads.emplace_back(dotProductNaive, ref(v0), ref(v1),
                                 chunkBegin(t), chunkEnd(t), ref(sum));
        for (auto& worker : threads)
            worker.join();

        const chrono::duration<double> dur = chrono::steady_clock::now() - sta;
        report(dur, sum);
    }

    {
        cout << "Lockguard\n";
        const auto sta = chrono::steady_clock::now();

        unsigned long long sum = 0;
        vector<thread> threads;
        threads.reserve(n_threads);
        for (unsigned t = 0; t < n_threads; ++t)
            threads.emplace_back(dotProductLock, ref(v0), ref(v1),
                                 chunkBegin(t), chunkEnd(t), ref(sum));
        for (auto& worker : threads)
            worker.join();

        const chrono::duration<double> dur = chrono::steady_clock::now() - sta;
        report(dur, sum);
    }

    {
        cout << "Atomic\n";
        const auto sta = chrono::steady_clock::now();

        atomic<unsigned long long> sum{0};
        vector<thread> threads;
        threads.reserve(n_threads);
        for (unsigned t = 0; t < n_threads; ++t)
            threads.emplace_back(dotProductAtomic, ref(v0), ref(v1),
                                 chunkBegin(t), chunkEnd(t), ref(sum));
        for (auto& worker : threads)
            worker.join();

        const chrono::duration<double> dur = chrono::steady_clock::now() - sta;
        report(dur, sum.load());
    }

    {
        cout << "Future\n";
        const auto sta = chrono::steady_clock::now();

        vector<future<unsigned long long>> futures;
        futures.reserve(n_threads);
        for (unsigned t = 0; t < n_threads; ++t)
        {
            // launch::async forces a real thread per task; the default policy
            // lets the implementation defer the work until get(), which would
            // make this section run sequentially.
            futures.push_back(async(launch::async, dotProductFuture, ref(v0), ref(v1),
                                    chunkBegin(t), chunkEnd(t)));
        }

        unsigned long long sum = 0;
        for (auto& partial : futures)
            sum += partial.get();

        const chrono::duration<double> dur = chrono::steady_clock::now() - sta;
        report(dur, sum);
    }

    {
        cout << "std::transform_reduce (Parallel)\n";
        const auto sta = chrono::steady_clock::now();

        const auto sum = transform_reduce(execution::par, v0.begin(), v0.end(),
                                          v1.begin(), 0ull);

        const chrono::duration<double> dur = chrono::steady_clock::now() - sta;
        report(dur, sum);
    }
}
/* stdout
std::inner_product (Not parallel computing)
Elapsed time : 0.0370022s
Sum : 3024919192
Naive (Race Condition)
Elapsed time : 0.302984s
Sum : 877824373
Lockguard
Elapsed time : 0.0200525s
Sum : 3024919192
Atomic
Elapsed time : 0.0201991s
Sum : 3024919192
Future
Elapsed time : 0.020037s
Sum : 3024919192
std::transform_reduce (Parallel)
Elapsed time : 0.0214739s
Sum : 3024919192
*/