转载请注明出处:
http://www.cnblogs.com/darkknightzh/p/4988264.html
参考网址:
关于mt19937:http://www.cnblogs.com/egmkang/archive/2012/09/06/2673253.html
代码如下:
1 #include "stdafx.h"
2 #include <iostream>
3 #include <random> // mt19937的头文件
4 #include <ppl.h> // parfor的头文件
5 #include <windows.h> // QueryPerformanceFrequency等函数的头文件
6
7 using namespace concurrency; // parfor使用
8 using namespace std;
9
10
// Allocates a zero-filled n-by-n matrix of doubles as one contiguous
// row-major buffer.
//
// m: out-parameter; *m is overwritten with the address of the new buffer.
//    Any previous value of *m is not released here.
// n: number of rows (and columns); n*n elements are allocated.
//
// The buffer is created with new[] and must be released with delete[]
// (see FreeMatrix).
void AllocMatrix(double** m, size_t n)
{
    // The trailing "()" value-initializes every element to 0.0, so no
    // separate memset call (and no <cstring> dependency) is required.
    *m = new double[n * n]();
}
17
18
// Fills an n-by-n row-major matrix with values produced by a generator.
//
// m:   buffer holding at least n*n doubles.
// n:   number of rows (and columns).
// gen: callable invoked once per element, in row-major order; each result
//      is converted to double before being stored.
template <class Gen>
void IniMatrix(double* m, size_t n, Gen& gen)
{
    for (size_t row = 0; row < n; ++row)
    {
        double* const rowBegin = m + row * n;
        for (size_t col = 0; col < n; ++col)
        {
            rowBegin[col] = static_cast<double>(gen());
        }
    }
}
31
32
// Releases a matrix buffer that was allocated with new[] and resets the
// caller's pointer to nullptr.
//
// m: address of the pointer to release. If *m is already nullptr the call
//    does nothing, so calling it twice on the same pointer is harmless.
void FreeMatrix(double** m)
{
    double*& buffer = *m;
    if (buffer == nullptr)
    {
        return;
    }

    delete[] buffer;
    buffer = nullptr;
}
42
43
// Serial matrix product using plain for loops.
//
// Computes res[i][j] = sum over k of m1[i][k] * m2[k][j] for the upper
// triangle only (j >= i). Entries of res below the diagonal are not
// written and keep whatever value they had before the call.
//
// res: output, n-by-n row-major.
// m1:  left operand, n-by-n row-major.
// m2:  right operand, n-by-n row-major.
// n:   number of rows (and columns) of all three matrices.
void matrixMultiplyFor(double* res, const double* m1, const double* m2, size_t n)
{
    for (size_t row = 0; row < n; ++row)
    {
        const double* const lhsRow = m1 + row * n;
        double* const outRow = res + row * n;

        // Start at the diagonal: the amount of work shrinks as row grows.
        for (size_t col = row; col < n; ++col)
        {
            double dot = 0;
            for (size_t k = 0; k < n; ++k)
            {
                dot += lhsRow[k] * m2[k * n + col];
            }
            outRow[col] = dot;
        }
    }
}
60
61
62 // 矩阵相乘,外层使用parfor
63 void matrixMultiplyParForOuter(double* res, const double* m1, const double* m2, size_t n)
64 {
65 parallel_for(size_t(0), n, [&](size_t i)
66 {
67 for (size_t j = i; j < n; j++)
68 {
69 double temp = 0;
70 for (size_t k = 0; k < n; k++)
71 {
72 temp += m1[i * n + k] * m2[k * n + j];
73 }
74 res[i*n + j] = temp;
75 }
76 });
77 }
78
79
80 // 矩阵相乘,内层使用parfor
81 void matrixMultiplyParForInner(double* res, const double* m1, const double* m2, size_t n)
82 {
83 for (size_t i = 0; i < n; i++)
84 {
85 parallel_for(size_t(i), n, [&](size_t j)
86 {
87 double temp = 0;
88 for (size_t k = 0; k < n; k++)
89 {
90 temp += m1[i * n + k] * m2[k * n + j];
91 }
92 res[i*n + j] = temp;
93 });
94 }
95 }
96
97
98 // 测试矩阵相乘,使用for的时间
99 double testmatrixMultiplyFor(double* res, const double* m1, const double* m2, size_t n)
100 {
101 LARGE_INTEGER nFreq, nBeginTime, nEndTime;
102 QueryPerformanceFrequency(&nFreq);
103 QueryPerformanceCounter(&nBeginTime);
104
105 matrixMultiplyFor(res, m1, m2, n);
106
107 QueryPerformanceCounter(&nEndTime);
108 return (double)(nEndTime.QuadPart - nBeginTime.QuadPart) * 1000 / (double)nFreq.QuadPart;
109 }
110
111
112 // 测试矩阵相乘,外层使用parfor的时间
113 double testmatrixMultiplyParForOuter(double* res, const double* m1, const double* m2, size_t n)
114 {
115 LARGE_INTEGER nFreq, nBeginTime, nEndTime;
116 QueryPerformanceFrequency(&nFreq);
117 QueryPerformanceCounter(&nBeginTime);
118
119 matrixMultiplyParForOuter(res, m1, m2, n);
120
121 QueryPerformanceCounter(&nEndTime);
122 return (double)(nEndTime.QuadPart - nBeginTime.QuadPart) * 1000 / (double)nFreq.QuadPart;
123 }
124
125
126 // 测试矩阵相乘,内层使用parfor的时间
127 double testmatrixMultiplyParForInner(double* res, const double* m1, const double* m2, size_t n)
128 {
129 LARGE_INTEGER nFreq, nBeginTime, nEndTime;
130 QueryPerformanceFrequency(&nFreq);
131 QueryPerformanceCounter(&nBeginTime);
132
133 matrixMultiplyParForInner(res, m1, m2, n);
134
135 QueryPerformanceCounter(&nEndTime);
136 return (double)(nEndTime.QuadPart - nBeginTime.QuadPart) * 1000 / (double)nFreq.QuadPart;
137 }
138
139
140 // 主函数
141 int _tmain(int argc, _TCHAR* argv[])
142 {
143 const size_t n = 1024;
144 double* dM1 = NULL;
145 double* dM2 = NULL;
146 double* dRes1 = NULL;
147 double* dRes2 = NULL;
148 double* dRes3 = NULL;
149
150 random_device rd;
151 mt19937 gen(rd());
152
153 AllocMatrix(&dM1, n);
154 AllocMatrix(&dM2, n);
155 IniMatrix(dM1, n, gen);
156 IniMatrix(dM2, n, gen);
157
158 AllocMatrix(&dRes1, n);
159 AllocMatrix(&dRes2, n);
160 AllocMatrix(&dRes3, n);
161
162 double dTimeFor = testmatrixMultiplyFor(dRes1, dM1, dM2, n);
163 double dTimeParForOuter = testmatrixMultiplyParForOuter(dRes2, dM1, dM2, n);
164 double dTimeParForInner = testmatrixMultiplyParForInner(dRes3, dM1, dM2, n);
165
166 printf("time(ms)\nfor: %f \nparforOunter: %f \nparforInner: %f\n", dTimeFor, dTimeParForOuter, dTimeParForInner);
167
168 FreeMatrix(&dM1);
169 FreeMatrix(&dM2);
170 FreeMatrix(&dRes1);
171 FreeMatrix(&dRes2);
172 FreeMatrix(&dRes3);
173
174 return 0;
175 }
debug:
time(ms)
for: 7761.769099
parforOunter: 3416.670736
parforInner: 3423.701265
release:
time(ms)
for: 3884.167485
parforOunter: 1062.581817
parforInner: 1083.642302
说明:此处同时测试outer和inner两种形式,原因是:在matlab里面,对外层循环使用parfor做并行计算时,如果循环像本例这样呈三角形式(内层循环的次数随外层下标变化),各次迭代的计算量并不均匀,有些核先跑完,有些核后跑完,最终会出现一个核累死累活地跑程序、另外N-1个核围观的状态,使总的计算时间变长(不过在matlab中未实际测试outer和inner使用parfor的时间对比)。
但是,在C++里面,不知道是否是编译器优化的原因,outer使用parfor比inner使用parfor要快。此处还测试了n=2048,结果同样是outer形式比inner形式快。
来源:https://www.cnblogs.com/darkknightzh/p/4988264.html