Suppose a1
,
b1
, c1
, and
d1
point to heap memory and my numerical code has the following
core loop.
const int n =
100000;
for (int j = 0; j < n; j++) {
a1[j] += b1[j];
c1[j] +=
d1[j];
}
This
loop is executed 10,000 times via another outer for
loop. To
speed it up, I changed the code
to:
for (int j = 0; j < n; j++)
{
a1[j] += b1[j];
}
for (int j = 0;
j < n; j++) {
c1[j] +=
d1[j];
}
Compiled
on MS rel="noreferrer">Visual C++ 10.0 with full optimization and href="http://en.wikipedia.org/wiki/SSE2" rel="noreferrer">SSE2 enabled for
32-bit on a rel="noreferrer">Intel Core 2 Duo (x64), the first example takes
5.5 seconds and the double-loop example takes only 1.9 seconds. My question is: (Please
refer to the my rephrased question at the
bottom)
PS: I am not sure, if this
helps:
Disassembly for the first loop
basically looks like this (this block is repeated about five times in the full
program):
movsd xmm0,mmword ptr
[edx+18h]
addsd xmm0,mmword ptr [ecx+20h]
movsd mmword ptr
[ecx+20h],xmm0
movsd xmm0,mmword ptr [esi+10h]
addsd xmm0,mmword ptr
[eax+30h]
movsd mmword ptr [eax+30h],xmm0
movsd xmm0,mmword ptr
[edx+20h]
addsd xmm0,mmword ptr [ecx+28h]
movsd mmword
ptr [ecx+28h],xmm0
movsd xmm0,mmword ptr [esi+18h]
addsd xmm0,mmword
ptr [eax+38h]
Each
loop of the double loop example produces this code (the following block is repeated
about three times):
addsd
xmm0,mmword ptr [eax+28h]
movsd mmword ptr
[eax+28h],xmm0
movsd xmm0,mmword ptr [ecx+20h]
addsd
xmm0,mmword ptr [eax+30h]
movsd mmword ptr [eax+30h],xmm0
movsd
xmm0,mmword ptr [ecx+28h]
addsd xmm0,mmword ptr [eax+38h]
movsd
mmword ptr [eax+38h],xmm0
movsd xmm0,mmword ptr [ecx+30h]
addsd
xmm0,mmword ptr [eax+40h]
movsd mmword ptr
[eax+40h],xmm0
The
question turned out to be of no relevance, as the behavior severely depends on the sizes
of the arrays (n) and the CPU cache. So if there is further interest, I rephrase the
question:
Could you provide some
solid insight into the details that lead to the different cache behaviors as illustrated
by the five regions on the following
graph?
It might
also be interesting to point out the differences between CPU/cache architectures, by
providing a similar graph for these
CPUs.
PPS: Here is the full code.
It uses rel="noreferrer">TBB Tick_Count
for higher
resolution timing, which can be disabled by not defining the
TBB_TIMING
Macro:
#include
#include
#include
#include
//#define
TBB_TIMING
#ifdef TBB_TIMING
#include
using
tbb::tick_count;
#else
#include
#endif
using namespace
std;
//#define preallocate_memory
new_cont
enum { new_cont, new_sep };
double
*a1, *b1, *c1, *d1;
void allo(int cont, int
n)
{
switch(cont) {
case new_cont:
a1 = new
double[n*4];
b1 = a1 + n;
c1 = b1 + n;
d1 = c1 +
n;
break;
case new_sep:
a1 = new
double[n];
b1 = new double[n];
c1 = new double[n];
d1 =
new double[n];
break;
}
for (int i = 0; i
< n; i++) {
a1[i] = 1.0;
d1[i] = 1.0;
c1[i] = 1.0;
b1[i] = 1.0;
}
}
void
ff(int cont)
{
switch(cont){
case
new_sep:
delete[] b1;
delete[] c1;
delete[]
d1;
case new_cont:
delete[] a1;
}
}
double plain(int n, int m, int cont, int
loops)
{
#ifndef preallocate_memory
allo(cont,n);
#endif
#ifdef TBB_TIMING
tick_count t0 = tick_count::now();
#else
clock_t start =
clock();
#endif
if (loops == 1) {
for (int i = 0; i < m; i++) {
for (int j = 0; j < n; j++){
a1[j] += b1[j];
c1[j] += d1[j];
}
}
} else
{
for (int i = 0; i < m; i++) {
for (int j = 0; j
< n; j++) {
a1[j] += b1[j];
}
for (int j = 0; j <
n; j++) {
c1[j] += d1[j];
}
}
}
double ret;
#ifdef TBB_TIMING
tick_count t1
= tick_count::now();
ret =
2.0*double(n)*double(m)/(t1-t0).seconds();
#else
clock_t end =
clock();
ret = 2.0*double(n)*double(m)/(double)(end - start)
*double(CLOCKS_PER_SEC);
#endif
#ifndef
preallocate_memory
ff(cont);
#endif
return
ret;
}
void main()
{
freopen("C:\\test.csv", "w", stdout);
char *s = "
";
string na[2] ={"new_cont", "new_sep"};
cout << "n";
for (int j = 0; j < 2; j++)
for
(int i = 1; i <= 2; i++)
#ifdef preallocate_memory
cout <<
s << i << "_loops_" <<
na[preallocate_memory];
#else
cout << s << i
<< "_loops_" << na[j];
#endif
cout <<
endl;
long long nmax = 1000000;
#ifdef
preallocate_memory
allo(preallocate_memory,
nmax);
#endif
for (long long n = 1L; n <
nmax; n = max(n+1, long long(n*1.2)))
{
const long long m =
10000000/n;
cout << n;
for (int j = 0; j < 2;
j++)
for (int i = 1; i <= 2; i++)
cout << s <<
plain(n, m, j, i);
cout << endl;
}
}
(It
shows FLOP/s for different values of
n
.)
src="https://i.stack.imgur.com/keuWU.gif" alt="enter image description
here">
No comments:
Post a Comment