Why doesn't Intel compiler vectorize this simple loop (while MSVC does)?

up vote
0
down vote

favorite

I am investigating vectorization of vector operations. As an example I took common multiplication and addition of 3 vectors. I used Eigen's Vector data types to ensure alignment.

#include "pch.h"

#include <iostream>

#include "Core"

#include <chrono>

using Eigen::RowVectorXd;

using std::chrono::high_resolution_clock;

using std::chrono::nanoseconds;

using std::chrono::duration_cast;

//using tbb::tick_count;

/// Benchmark driver: fills three Eigen row vectors with constants, runs an
/// element-wise `v1 + v2 * v3` loop over them, and prints the first result
/// together with the loop's elapsed time.
///
/// @return 0 on completion.
int main()
{
    // Print the value of Eigen's EIGEN_DEFAULT_ALIGN_BYTES macro, then a newline.
    std::cout << EIGEN_DEFAULT_ALIGN_BYTES << '\n';

    const int length = 1000;
    RowVectorXd v1 = RowVectorXd::Constant(length, 4.0);
    RowVectorXd v2 = RowVectorXd::Constant(length, 6.0);
    RowVectorXd v3 = RowVectorXd::Constant(length, 7.0);
    RowVectorXd output(length);

    // Timed region: the plain index loop whose code generation is under study.
    auto s = high_resolution_clock::now();
    for (int i = 0; i < length; ++i)
        output(i) = v1(i) + v2(i) * v3(i);
    auto e = high_resolution_clock::now();

    // Elapsed time is end minus start (the reverse order yields a negative
    // duration). The nanosecond count divided by 1000.0 is in microseconds.
    // output(0) is printed so the loop's result is observably used.
    std::cout << output(0) << " Plain loop:"
              << duration_cast<nanoseconds>(e - s).count() / 1000.0 << '\n';

    return 0;
}

MSVC 17 happily unrolls and vectorizes the loop.
Settings:

/permissive- /Yu"pch.h" /GS /GL /analyze- /W3 /Gy /Zc:wchar_t /Zi /Gm- /O2 /sdl /Fd"Releasevc141.pdb" /Zc:inline /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_UNICODE" /D "UNICODE" /errorReport:prompt /WX- /Zc:forScope /Gd /Oy- /MD /std:c++14 /FC /Fa"Release" /EHsc /nologo /Fo"Release" /Fp"Releasetest_loops.pch" /diagnostics:classic

Disassembly:

            output(i) = v1(i) + v2(i) * v3(i);

01121580  movups      xmm1,xmmword ptr [edx+esi*8]  

01121584  mov         edx,dword ptr [ebp-6Ch]  

01121587  add         esi,8  

0112158A  movups      xmm0,xmmword ptr [edi-10h]  

0112158E  mulpd       xmm1,xmm0  

01121592  movups      xmm0,xmmword ptr [eax-20h]  

01121596  addpd       xmm1,xmm0  

0112159A  movups      xmm0,xmmword ptr [edi]  

0112159D  movups      xmmword ptr [ecx-30h],xmm1  

011215A1  movups      xmm1,xmmword ptr [edx+edi]  

011215A5  mov         edx,dword ptr [ebp-4Ch]  

011215A8  mulpd       xmm1,xmm0  

011215AC  movups      xmm0,xmmword ptr [edx+edi]  

011215B0  mov         edx,dword ptr [ebp-94h]  

011215B6  addpd       xmm1,xmm0  

011215BA  movups      xmm0,xmmword ptr [edi+10h]  

011215BE  movups      xmmword ptr [edx+edi],xmm1  

011215C2  mov         edx,dword ptr [ebp-64h]  

011215C5  movups      xmm1,xmmword ptr [edx+eax]  

011215C9  mov         edx,dword ptr [ebp-8Ch]  

011215CF  mulpd       xmm1,xmm0  

011215D3  movups      xmm0,xmmword ptr [eax]  

011215D6  addpd       xmm1,xmm0  

011215DA  movups      xmm0,xmmword ptr [edi+20h]  

011215DE  add         edi,40h  

011215E1  movups      xmmword ptr [edx+eax],xmm1  

011215E5  mov         edx,dword ptr [ebp-34h]  

011215E8  movups      xmm1,xmmword ptr [edx+ecx]  

011215EC  mov         edx,dword ptr [v3]  

011215EF  mulpd       xmm1,xmm0  

011215F3  movups      xmm0,xmmword ptr [eax+10h]  

011215F7  add         eax,40h  

011215FA  addpd       xmm1,xmm0  

011215FE  movups      xmmword ptr [ecx],xmm1  

01121601  add         ecx,40h  

01121604  cmp         esi,3E8h  

0112160A  jl          main+3E0h (01121580h)

However, the Intel Compiler 18 update 4 fails to vectorize. Settings:

/permissive- /Yu"pch.h" /GS /W3 /Gy /Zc:wchar_t /Zi /O2 /Qopt-report:5 /Qopt-report-phase:vec /Fd"Releasevc141.pdb" /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_UNICODE" /D "UNICODE" /Qipo /Zc:forScope /Gd /MD /std:c++14 /FC /Fa"Release" /EHsc /nologo /Fo"Release" /Qprof-dir "Release" /Fp"Releasetest_loops.pch"

Disassembly:

for (int i = 0; i < length; ++i)

002113D0  xor         edx,edx  

002113D2  xor         eax,eax  

002113D4  nop         dword ptr [eax+eax]  

002113D9  nop         dword ptr [eax]  

        output(i) = v1(i) + v2(i) * v3(i);

002113E0  mov         ecx,dword ptr [v2]  

    for (int i = 0; i < length; ++i)

002113E3  inc         edx  

        output(i) = v1(i) + v2(i) * v3(i);

002113E4  mov         esi,dword ptr [v3]  

002113E7  mov         edi,dword ptr [v1]  

002113EA  movsd       xmm0,mmword ptr [ecx+eax]  

002113EF  mulsd       xmm0,mmword ptr [esi+eax]  

002113F4  mov         ecx,dword ptr [output]  

002113F7  addsd       xmm0,mmword ptr [edi+eax]  

002113FC  movsd       mmword ptr [ecx+eax],xmm0  

00211401  mov         esi,dword ptr [v2]  

00211404  mov         edi,dword ptr [v3]  

00211407  mov         ecx,dword ptr [v1]  

0021140A  movsd       xmm1,mmword ptr [esi+eax+8]  

00211410  mulsd       xmm1,mmword ptr [edi+eax+8]  

00211416  mov         esi,dword ptr [output]  

00211419  addsd       xmm1,mmword ptr [ecx+eax+8]  

        output(i) = v1(i) + v2(i) * v3(i);

0021141F  movsd       mmword ptr [esi+eax+8],xmm1  

    for (int i = 0; i < length; ++i)

00211425  add         eax,10h  

00211428  cmp         edx,1F4h  

0021142E  jb          main+3E0h (02113E0h)

I enabled detailed vectorization reporting, and it shows the following.
enter image description here

So what's wrong? Why is Intel compiler unable to vectorize in contrast to MSVC?

edited Nov 10 at 19:23

Pezo

920512

asked Nov 10 at 15:48

Andrey Pro

256217

6

We're not Intel's customer support.
– n.m.
Nov 10 at 16:17

You must have a reason to use Intel’s compiler… right? Well, does this experience change anything for you?
– Kuba Ober
Nov 10 at 18:21

Quality of implementation issue. Not all compilers are equally good at all things and no compiler is best at all things.
– Jesper Juhl
Nov 10 at 18:32

I don't know about you but I was taught that a compiler bug is the last thing to suspect. And that the most likely bug sits in the front of the monitor. You say, that is a compiler bug? OK, but before I assume that and ditch the compiler, I want to make sure that is not my fault.
– Andrey Pro
Nov 10 at 18:52

A missed optimization is not a compiler bug.
– Pezo
Nov 10 at 19:15

add a comment |

up vote
0
down vote

favorite

I am investigating vectorization of vector operations. As an example I took common multiplication and addition of 3 vectors. I used Eigen's Vector data types to ensure alignment.

#include "pch.h"

#include <iostream>

#include "Core"

#include <chrono>

using Eigen::RowVectorXd;

using std::chrono::high_resolution_clock;

using std::chrono::nanoseconds;

using std::chrono::duration_cast;

//using tbb::tick_count;

/// Benchmark driver: fills three Eigen row vectors with constants, runs an
/// element-wise `v1 + v2 * v3` loop over them, and prints the first result
/// together with the loop's elapsed time.
///
/// @return 0 on completion.
int main()
{
    // Print the value of Eigen's EIGEN_DEFAULT_ALIGN_BYTES macro, then a newline.
    std::cout << EIGEN_DEFAULT_ALIGN_BYTES << '\n';

    const int length = 1000;
    RowVectorXd v1 = RowVectorXd::Constant(length, 4.0);
    RowVectorXd v2 = RowVectorXd::Constant(length, 6.0);
    RowVectorXd v3 = RowVectorXd::Constant(length, 7.0);
    RowVectorXd output(length);

    // Timed region: the plain index loop whose code generation is under study.
    auto s = high_resolution_clock::now();
    for (int i = 0; i < length; ++i)
        output(i) = v1(i) + v2(i) * v3(i);
    auto e = high_resolution_clock::now();

    // Elapsed time is end minus start (the reverse order yields a negative
    // duration). The nanosecond count divided by 1000.0 is in microseconds.
    // output(0) is printed so the loop's result is observably used.
    std::cout << output(0) << " Plain loop:"
              << duration_cast<nanoseconds>(e - s).count() / 1000.0 << '\n';

    return 0;
}

MSVC 17 happily unrolls and vectorizes the loop.
Settings:

/permissive- /Yu"pch.h" /GS /GL /analyze- /W3 /Gy /Zc:wchar_t /Zi /Gm- /O2 /sdl /Fd"Releasevc141.pdb" /Zc:inline /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_UNICODE" /D "UNICODE" /errorReport:prompt /WX- /Zc:forScope /Gd /Oy- /MD /std:c++14 /FC /Fa"Release" /EHsc /nologo /Fo"Release" /Fp"Releasetest_loops.pch" /diagnostics:classic

Disassembly:

            output(i) = v1(i) + v2(i) * v3(i);

01121580  movups      xmm1,xmmword ptr [edx+esi*8]  

01121584  mov         edx,dword ptr [ebp-6Ch]  

01121587  add         esi,8  

0112158A  movups      xmm0,xmmword ptr [edi-10h]  

0112158E  mulpd       xmm1,xmm0  

01121592  movups      xmm0,xmmword ptr [eax-20h]  

01121596  addpd       xmm1,xmm0  

0112159A  movups      xmm0,xmmword ptr [edi]  

0112159D  movups      xmmword ptr [ecx-30h],xmm1  

011215A1  movups      xmm1,xmmword ptr [edx+edi]  

011215A5  mov         edx,dword ptr [ebp-4Ch]  

011215A8  mulpd       xmm1,xmm0  

011215AC  movups      xmm0,xmmword ptr [edx+edi]  

011215B0  mov         edx,dword ptr [ebp-94h]  

011215B6  addpd       xmm1,xmm0  

011215BA  movups      xmm0,xmmword ptr [edi+10h]  

011215BE  movups      xmmword ptr [edx+edi],xmm1  

011215C2  mov         edx,dword ptr [ebp-64h]  

011215C5  movups      xmm1,xmmword ptr [edx+eax]  

011215C9  mov         edx,dword ptr [ebp-8Ch]  

011215CF  mulpd       xmm1,xmm0  

011215D3  movups      xmm0,xmmword ptr [eax]  

011215D6  addpd       xmm1,xmm0  

011215DA  movups      xmm0,xmmword ptr [edi+20h]  

011215DE  add         edi,40h  

011215E1  movups      xmmword ptr [edx+eax],xmm1  

011215E5  mov         edx,dword ptr [ebp-34h]  

011215E8  movups      xmm1,xmmword ptr [edx+ecx]  

011215EC  mov         edx,dword ptr [v3]  

011215EF  mulpd       xmm1,xmm0  

011215F3  movups      xmm0,xmmword ptr [eax+10h]  

011215F7  add         eax,40h  

011215FA  addpd       xmm1,xmm0  

011215FE  movups      xmmword ptr [ecx],xmm1  

01121601  add         ecx,40h  

01121604  cmp         esi,3E8h  

0112160A  jl          main+3E0h (01121580h)

However, the Intel Compiler 18 update 4 fails to vectorize. Settings:

/permissive- /Yu"pch.h" /GS /W3 /Gy /Zc:wchar_t /Zi /O2 /Qopt-report:5 /Qopt-report-phase:vec /Fd"Releasevc141.pdb" /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_UNICODE" /D "UNICODE" /Qipo /Zc:forScope /Gd /MD /std:c++14 /FC /Fa"Release" /EHsc /nologo /Fo"Release" /Qprof-dir "Release" /Fp"Releasetest_loops.pch"

Disassembly:

for (int i = 0; i < length; ++i)

002113D0  xor         edx,edx  

002113D2  xor         eax,eax  

002113D4  nop         dword ptr [eax+eax]  

002113D9  nop         dword ptr [eax]  

        output(i) = v1(i) + v2(i) * v3(i);

002113E0  mov         ecx,dword ptr [v2]  

    for (int i = 0; i < length; ++i)

002113E3  inc         edx  

        output(i) = v1(i) + v2(i) * v3(i);

002113E4  mov         esi,dword ptr [v3]  

002113E7  mov         edi,dword ptr [v1]  

002113EA  movsd       xmm0,mmword ptr [ecx+eax]  

002113EF  mulsd       xmm0,mmword ptr [esi+eax]  

002113F4  mov         ecx,dword ptr [output]  

002113F7  addsd       xmm0,mmword ptr [edi+eax]  

002113FC  movsd       mmword ptr [ecx+eax],xmm0  

00211401  mov         esi,dword ptr [v2]  

00211404  mov         edi,dword ptr [v3]  

00211407  mov         ecx,dword ptr [v1]  

0021140A  movsd       xmm1,mmword ptr [esi+eax+8]  

00211410  mulsd       xmm1,mmword ptr [edi+eax+8]  

00211416  mov         esi,dword ptr [output]  

00211419  addsd       xmm1,mmword ptr [ecx+eax+8]  

        output(i) = v1(i) + v2(i) * v3(i);

0021141F  movsd       mmword ptr [esi+eax+8],xmm1  

    for (int i = 0; i < length; ++i)

00211425  add         eax,10h  

00211428  cmp         edx,1F4h  

0021142E  jb          main+3E0h (02113E0h)

I enabled detailed vectorization reporting, and it shows the following.
enter image description here

So what's wrong? Why is Intel compiler unable to vectorize in contrast to MSVC?

edited Nov 10 at 19:23

Pezo

920512

asked Nov 10 at 15:48

Andrey Pro

256217

6

We're not Intel's customer support.
– n.m.
Nov 10 at 16:17

You must have a reason to use Intel’s compiler… right? Well, does this experience change anything for you?
– Kuba Ober
Nov 10 at 18:21

Quality of implementation issue. Not all compilers are equally good at all things and no compiler is best at all things.
– Jesper Juhl
Nov 10 at 18:32

I don't know about you but I was taught that a compiler bug is the last thing to suspect. And that the most likely bug sits in the front of the monitor. You say, that is a compiler bug? OK, but before I assume that and ditch the compiler, I want to make sure that is not my fault.
– Andrey Pro
Nov 10 at 18:52

A missed optimization is not a compiler bug.
– Pezo
Nov 10 at 19:15

add a comment |

up vote
0
down vote

favorite

I am investigating vectorization of vector operations. As an example I took common multiplication and addition of 3 vectors. I used Eigen's Vector data types to ensure alignment.

#include "pch.h"

#include <iostream>

#include "Core"

#include <chrono>

using Eigen::RowVectorXd;

using std::chrono::high_resolution_clock;

using std::chrono::nanoseconds;

using std::chrono::duration_cast;

//using tbb::tick_count;

/// Benchmark driver: fills three Eigen row vectors with constants, runs an
/// element-wise `v1 + v2 * v3` loop over them, and prints the first result
/// together with the loop's elapsed time.
///
/// @return 0 on completion.
int main()
{
    // Print the value of Eigen's EIGEN_DEFAULT_ALIGN_BYTES macro, then a newline.
    std::cout << EIGEN_DEFAULT_ALIGN_BYTES << '\n';

    const int length = 1000;
    RowVectorXd v1 = RowVectorXd::Constant(length, 4.0);
    RowVectorXd v2 = RowVectorXd::Constant(length, 6.0);
    RowVectorXd v3 = RowVectorXd::Constant(length, 7.0);
    RowVectorXd output(length);

    // Timed region: the plain index loop whose code generation is under study.
    auto s = high_resolution_clock::now();
    for (int i = 0; i < length; ++i)
        output(i) = v1(i) + v2(i) * v3(i);
    auto e = high_resolution_clock::now();

    // Elapsed time is end minus start (the reverse order yields a negative
    // duration). The nanosecond count divided by 1000.0 is in microseconds.
    // output(0) is printed so the loop's result is observably used.
    std::cout << output(0) << " Plain loop:"
              << duration_cast<nanoseconds>(e - s).count() / 1000.0 << '\n';

    return 0;
}

MSVC 17 happily unrolls and vectorizes the loop.
Settings:

/permissive- /Yu"pch.h" /GS /GL /analyze- /W3 /Gy /Zc:wchar_t /Zi /Gm- /O2 /sdl /Fd"Releasevc141.pdb" /Zc:inline /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_UNICODE" /D "UNICODE" /errorReport:prompt /WX- /Zc:forScope /Gd /Oy- /MD /std:c++14 /FC /Fa"Release" /EHsc /nologo /Fo"Release" /Fp"Releasetest_loops.pch" /diagnostics:classic

Disassembly:

            output(i) = v1(i) + v2(i) * v3(i);

01121580  movups      xmm1,xmmword ptr [edx+esi*8]  

01121584  mov         edx,dword ptr [ebp-6Ch]  

01121587  add         esi,8  

0112158A  movups      xmm0,xmmword ptr [edi-10h]  

0112158E  mulpd       xmm1,xmm0  

01121592  movups      xmm0,xmmword ptr [eax-20h]  

01121596  addpd       xmm1,xmm0  

0112159A  movups      xmm0,xmmword ptr [edi]  

0112159D  movups      xmmword ptr [ecx-30h],xmm1  

011215A1  movups      xmm1,xmmword ptr [edx+edi]  

011215A5  mov         edx,dword ptr [ebp-4Ch]  

011215A8  mulpd       xmm1,xmm0  

011215AC  movups      xmm0,xmmword ptr [edx+edi]  

011215B0  mov         edx,dword ptr [ebp-94h]  

011215B6  addpd       xmm1,xmm0  

011215BA  movups      xmm0,xmmword ptr [edi+10h]  

011215BE  movups      xmmword ptr [edx+edi],xmm1  

011215C2  mov         edx,dword ptr [ebp-64h]  

011215C5  movups      xmm1,xmmword ptr [edx+eax]  

011215C9  mov         edx,dword ptr [ebp-8Ch]  

011215CF  mulpd       xmm1,xmm0  

011215D3  movups      xmm0,xmmword ptr [eax]  

011215D6  addpd       xmm1,xmm0  

011215DA  movups      xmm0,xmmword ptr [edi+20h]  

011215DE  add         edi,40h  

011215E1  movups      xmmword ptr [edx+eax],xmm1  

011215E5  mov         edx,dword ptr [ebp-34h]  

011215E8  movups      xmm1,xmmword ptr [edx+ecx]  

011215EC  mov         edx,dword ptr [v3]  

011215EF  mulpd       xmm1,xmm0  

011215F3  movups      xmm0,xmmword ptr [eax+10h]  

011215F7  add         eax,40h  

011215FA  addpd       xmm1,xmm0  

011215FE  movups      xmmword ptr [ecx],xmm1  

01121601  add         ecx,40h  

01121604  cmp         esi,3E8h  

0112160A  jl          main+3E0h (01121580h)

However, the Intel Compiler 18 update 4 fails to vectorize. Settings:

/permissive- /Yu"pch.h" /GS /W3 /Gy /Zc:wchar_t /Zi /O2 /Qopt-report:5 /Qopt-report-phase:vec /Fd"Releasevc141.pdb" /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_UNICODE" /D "UNICODE" /Qipo /Zc:forScope /Gd /MD /std:c++14 /FC /Fa"Release" /EHsc /nologo /Fo"Release" /Qprof-dir "Release" /Fp"Releasetest_loops.pch"

Disassembly:

for (int i = 0; i < length; ++i)

002113D0  xor         edx,edx  

002113D2  xor         eax,eax  

002113D4  nop         dword ptr [eax+eax]  

002113D9  nop         dword ptr [eax]  

        output(i) = v1(i) + v2(i) * v3(i);

002113E0  mov         ecx,dword ptr [v2]  

    for (int i = 0; i < length; ++i)

002113E3  inc         edx  

        output(i) = v1(i) + v2(i) * v3(i);

002113E4  mov         esi,dword ptr [v3]  

002113E7  mov         edi,dword ptr [v1]  

002113EA  movsd       xmm0,mmword ptr [ecx+eax]  

002113EF  mulsd       xmm0,mmword ptr [esi+eax]  

002113F4  mov         ecx,dword ptr [output]  

002113F7  addsd       xmm0,mmword ptr [edi+eax]  

002113FC  movsd       mmword ptr [ecx+eax],xmm0  

00211401  mov         esi,dword ptr [v2]  

00211404  mov         edi,dword ptr [v3]  

00211407  mov         ecx,dword ptr [v1]  

0021140A  movsd       xmm1,mmword ptr [esi+eax+8]  

00211410  mulsd       xmm1,mmword ptr [edi+eax+8]  

00211416  mov         esi,dword ptr [output]  

00211419  addsd       xmm1,mmword ptr [ecx+eax+8]  

        output(i) = v1(i) + v2(i) * v3(i);

0021141F  movsd       mmword ptr [esi+eax+8],xmm1  

    for (int i = 0; i < length; ++i)

00211425  add         eax,10h  

00211428  cmp         edx,1F4h  

0021142E  jb          main+3E0h (02113E0h)

I enabled detailed vectorization reporting, and it shows the following.
enter image description here

So what's wrong? Why is Intel compiler unable to vectorize in contrast to MSVC?

edited Nov 10 at 19:23

Pezo

920512

asked Nov 10 at 15:48

Andrey Pro

256217

I am investigating vectorization of vector operations. As an example I took common multiplication and addition of 3 vectors. I used Eigen's Vector data types to ensure alignment.

#include "pch.h"

#include <iostream>

#include "Core"

#include <chrono>

using Eigen::RowVectorXd;

using std::chrono::high_resolution_clock;

using std::chrono::nanoseconds;

using std::chrono::duration_cast;

//using tbb::tick_count;

/// Benchmark driver: fills three Eigen row vectors with constants, runs an
/// element-wise `v1 + v2 * v3` loop over them, and prints the first result
/// together with the loop's elapsed time.
///
/// @return 0 on completion.
int main()
{
    // Print the value of Eigen's EIGEN_DEFAULT_ALIGN_BYTES macro, then a newline.
    std::cout << EIGEN_DEFAULT_ALIGN_BYTES << '\n';

    const int length = 1000;
    RowVectorXd v1 = RowVectorXd::Constant(length, 4.0);
    RowVectorXd v2 = RowVectorXd::Constant(length, 6.0);
    RowVectorXd v3 = RowVectorXd::Constant(length, 7.0);
    RowVectorXd output(length);

    // Timed region: the plain index loop whose code generation is under study.
    auto s = high_resolution_clock::now();
    for (int i = 0; i < length; ++i)
        output(i) = v1(i) + v2(i) * v3(i);
    auto e = high_resolution_clock::now();

    // Elapsed time is end minus start (the reverse order yields a negative
    // duration). The nanosecond count divided by 1000.0 is in microseconds.
    // output(0) is printed so the loop's result is observably used.
    std::cout << output(0) << " Plain loop:"
              << duration_cast<nanoseconds>(e - s).count() / 1000.0 << '\n';

    return 0;
}

MSVC 17 happily unrolls and vectorizes the loop.
Settings:

/permissive- /Yu"pch.h" /GS /GL /analyze- /W3 /Gy /Zc:wchar_t /Zi /Gm- /O2 /sdl /Fd"Releasevc141.pdb" /Zc:inline /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_UNICODE" /D "UNICODE" /errorReport:prompt /WX- /Zc:forScope /Gd /Oy- /MD /std:c++14 /FC /Fa"Release" /EHsc /nologo /Fo"Release" /Fp"Releasetest_loops.pch" /diagnostics:classic

Disassembly:

            output(i) = v1(i) + v2(i) * v3(i);

01121580  movups      xmm1,xmmword ptr [edx+esi*8]  

01121584  mov         edx,dword ptr [ebp-6Ch]  

01121587  add         esi,8  

0112158A  movups      xmm0,xmmword ptr [edi-10h]  

0112158E  mulpd       xmm1,xmm0  

01121592  movups      xmm0,xmmword ptr [eax-20h]  

01121596  addpd       xmm1,xmm0  

0112159A  movups      xmm0,xmmword ptr [edi]  

0112159D  movups      xmmword ptr [ecx-30h],xmm1  

011215A1  movups      xmm1,xmmword ptr [edx+edi]  

011215A5  mov         edx,dword ptr [ebp-4Ch]  

011215A8  mulpd       xmm1,xmm0  

011215AC  movups      xmm0,xmmword ptr [edx+edi]  

011215B0  mov         edx,dword ptr [ebp-94h]  

011215B6  addpd       xmm1,xmm0  

011215BA  movups      xmm0,xmmword ptr [edi+10h]  

011215BE  movups      xmmword ptr [edx+edi],xmm1  

011215C2  mov         edx,dword ptr [ebp-64h]  

011215C5  movups      xmm1,xmmword ptr [edx+eax]  

011215C9  mov         edx,dword ptr [ebp-8Ch]  

011215CF  mulpd       xmm1,xmm0  

011215D3  movups      xmm0,xmmword ptr [eax]  

011215D6  addpd       xmm1,xmm0  

011215DA  movups      xmm0,xmmword ptr [edi+20h]  

011215DE  add         edi,40h  

011215E1  movups      xmmword ptr [edx+eax],xmm1  

011215E5  mov         edx,dword ptr [ebp-34h]  

011215E8  movups      xmm1,xmmword ptr [edx+ecx]  

011215EC  mov         edx,dword ptr [v3]  

011215EF  mulpd       xmm1,xmm0  

011215F3  movups      xmm0,xmmword ptr [eax+10h]  

011215F7  add         eax,40h  

011215FA  addpd       xmm1,xmm0  

011215FE  movups      xmmword ptr [ecx],xmm1  

01121601  add         ecx,40h  

01121604  cmp         esi,3E8h  

0112160A  jl          main+3E0h (01121580h)

However, the Intel Compiler 18 update 4 fails to vectorize. Settings:

/permissive- /Yu"pch.h" /GS /W3 /Gy /Zc:wchar_t /Zi /O2 /Qopt-report:5 /Qopt-report-phase:vec /Fd"Releasevc141.pdb" /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_UNICODE" /D "UNICODE" /Qipo /Zc:forScope /Gd /MD /std:c++14 /FC /Fa"Release" /EHsc /nologo /Fo"Release" /Qprof-dir "Release" /Fp"Releasetest_loops.pch"

Disassembly:

for (int i = 0; i < length; ++i)

002113D0  xor         edx,edx  

002113D2  xor         eax,eax  

002113D4  nop         dword ptr [eax+eax]  

002113D9  nop         dword ptr [eax]  

        output(i) = v1(i) + v2(i) * v3(i);

002113E0  mov         ecx,dword ptr [v2]  

    for (int i = 0; i < length; ++i)

002113E3  inc         edx  

        output(i) = v1(i) + v2(i) * v3(i);

002113E4  mov         esi,dword ptr [v3]  

002113E7  mov         edi,dword ptr [v1]  

002113EA  movsd       xmm0,mmword ptr [ecx+eax]  

002113EF  mulsd       xmm0,mmword ptr [esi+eax]  

002113F4  mov         ecx,dword ptr [output]  

002113F7  addsd       xmm0,mmword ptr [edi+eax]  

002113FC  movsd       mmword ptr [ecx+eax],xmm0  

00211401  mov         esi,dword ptr [v2]  

00211404  mov         edi,dword ptr [v3]  

00211407  mov         ecx,dword ptr [v1]  

0021140A  movsd       xmm1,mmword ptr [esi+eax+8]  

00211410  mulsd       xmm1,mmword ptr [edi+eax+8]  

00211416  mov         esi,dword ptr [output]  

00211419  addsd       xmm1,mmword ptr [ecx+eax+8]  

        output(i) = v1(i) + v2(i) * v3(i);

0021141F  movsd       mmword ptr [esi+eax+8],xmm1  

    for (int i = 0; i < length; ++i)

00211425  add         eax,10h  

00211428  cmp         edx,1F4h  

0021142E  jb          main+3E0h (02113E0h)

I enabled detailed vectorization reporting, and it shows the following.
enter image description here

So what's wrong? Why is Intel compiler unable to vectorize in contrast to MSVC?

c++ vectorization icc

edited Nov 10 at 19:23

Pezo

920512

asked Nov 10 at 15:48

Andrey Pro

256217

edited Nov 10 at 19:23

Pezo

920512

asked Nov 10 at 15:48

Andrey Pro

256217

edited Nov 10 at 19:23

Pezo

920512

edited Nov 10 at 19:23

Pezo

920512

edited Nov 10 at 19:23

Pezo

920512

asked Nov 10 at 15:48

Andrey Pro

256217

asked Nov 10 at 15:48

Andrey Pro

256217

asked Nov 10 at 15:48

Andrey Pro

256217

6

We're not Intel's customer support.
– n.m.
Nov 10 at 16:17

You must have a reason to use Intel’s compiler… right? Well, does this experience change anything for you?
– Kuba Ober
Nov 10 at 18:21

Quality of implementation issue. Not all compilers are equally good at all things and no compiler is best at all things.
– Jesper Juhl
Nov 10 at 18:32

I don't know about you but I was taught that a compiler bug is the last thing to suspect. And that the most likely bug sits in the front of the monitor. You say, that is a compiler bug? OK, but before I assume that and ditch the compiler, I want to make sure that is not my fault.
– Andrey Pro
Nov 10 at 18:52

A missed optimization is not a compiler bug.
– Pezo
Nov 10 at 19:15

add a comment |

6

We're not Intel's customer support.
– n.m.
Nov 10 at 16:17

You must have a reason to use Intel’s compiler… right? Well, does this experience change anything for you?
– Kuba Ober
Nov 10 at 18:21

Quality of implementation issue. Not all compilers are equally good at all things and no compiler is best at all things.
– Jesper Juhl
Nov 10 at 18:32

I don't know about you but I was taught that a compiler bug is the last thing to suspect. And that the most likely bug sits in the front of the monitor. You say, that is a compiler bug? OK, but before I assume that and ditch the compiler, I want to make sure that is not my fault.
– Andrey Pro
Nov 10 at 18:52

A missed optimization is not a compiler bug.
– Pezo
Nov 10 at 19:15

We're not Intel's customer support.
– n.m.
Nov 10 at 16:17

You must have a reason to use Intel’s compiler… right? Well, does this experience change anything for you?
– Kuba Ober
Nov 10 at 18:21

Quality of implementation issue. Not all compilers are equally good at all things and no compiler is best at all things.
– Jesper Juhl
Nov 10 at 18:32

I don't know about you but I was taught that a compiler bug is the last thing to suspect. And that the most likely bug sits in the front of the monitor. You say, that is a compiler bug? OK, but before I assume that and ditch the compiler, I want to make sure that is not my fault.
– Andrey Pro
Nov 10 at 18:52

A missed optimization is not a compiler bug.
– Pezo
Nov 10 at 19:15

add a comment |

active

oldest

votes

Your Answer

// Calls StackExchange.snippets.init() from inside three nested loader
// callbacks ("editor" -> "externalEditor" -> "snippets").
// NOTE(review): the semantics of ifUsing/using and of the trailing
// "code-snippets" argument are defined by the host page's StackExchange
// object, which is not visible here - confirm before changing the nesting.
StackExchange.ifUsing("editor", function () {
StackExchange.using("externalEditor", function () {
StackExchange.using("snippets", function () {
StackExchange.snippets.init();
});
});
}, "code-snippets");

// Answer-editor bootstrap passed to StackExchange.ready: sets up the tag
// renderer with empty tag lists, then creates the editor through the
// "externalEditor" module (after "snippets" when snippets are enabled).
// NOTE(review): StackExchange and initTagRenderer are provided by the host
// page and are not visible here; their exact semantics should be confirmed
// against the page scripts.
StackExchange.ready(function() {
    var channelOptions = {
        tags: "".split(" "),
        id: "1"
    };
    initTagRenderer("".split(" "), "".split(" "), channelOptions);

    StackExchange.using("externalEditor", function() {
        // Have to fire editor after snippets, if snippets enabled
        if (StackExchange.settings.snippets.snippetsEnabled) {
            StackExchange.using("snippets", function() {
                createEditor();
            });
        }
        else {
            createEditor();
        }
    });

    // Hands the answer-editor options to StackExchange.prepareEditor.
    function createEditor() {
        StackExchange.prepareEditor({
            heartbeatType: 'answer',
            convertImagesToLinks: true,
            noModals: true,
            showLowRepImageUploadWarning: true,
            reputationToPostImages: 10,
            bindNavPrevention: true,
            postfix: "",
            imageUploader: {
                // The HTML snippets must keep their inner quotes escaped and
                // their angle brackets as \u003c / \u003e escapes; without the
                // backslashes these literals do not parse.
                brandingHtml: "Powered by \u003ca class=\"icon-imgur-white\" href=\"https://imgur.com/\"\u003e\u003c/a\u003e",
                contentPolicyHtml: "User contributions licensed under \u003ca href=\"https://creativecommons.org/licenses/by-sa/3.0/\"\u003ecc by-sa 3.0 with attribution required\u003c/a\u003e \u003ca href=\"https://stackoverflow.com/legal/content-policy\"\u003e(content policy)\u003c/a\u003e",
                allowUrls: true
            },
            onDemand: true,
            discardSelector: ".discard-answer"
            ,immediatelyShowMarkdownHelp:true
        });
    }
});

draft saved

draft discarded

Sign up or log in

// Passes StackExchange.ready a callback that calls
// StackExchange.helpers.onClickDraftSave with the '#login-link' selector.
// NOTE(review): presumably this saves the answer draft when the login link is
// clicked - the helper is defined by the host page, so confirm there.
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

// Passes StackExchange.ready a callback that calls
// StackExchange.openid.initPostLogin with the '.new-post-login' selector, a
// URL-encoded link to this question's #new-answer anchor, and the
// 'question_page' tag.
// NOTE(review): the meaning of each argument is defined by the host page's
// StackExchange.openid module, which is not visible here - confirm there.
StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f53240606%2fwhy-doesnt-intel-compiler-vectorize-this-simple-loop-while-msvc-does%23new-answer', 'question_page');
}
);

Post as a guest

Name

Required, but never shown

active

oldest

votes

draft saved

draft discarded

draft saved

draft discarded

Sign up or log in

// Passes StackExchange.ready a callback that calls
// StackExchange.helpers.onClickDraftSave with the '#login-link' selector.
// NOTE(review): presumably this saves the answer draft when the login link is
// clicked - the helper is defined by the host page, so confirm there.
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

Post as a guest

Name

Required, but never shown

Sign up or log in

// Passes StackExchange.ready a callback that calls
// StackExchange.helpers.onClickDraftSave with the '#login-link' selector.
// NOTE(review): presumably this saves the answer draft when the login link is
// clicked - the helper is defined by the host page, so confirm there.
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

Sign up or log in

// Passes StackExchange.ready a callback that calls
// StackExchange.helpers.onClickDraftSave with the '#login-link' selector.
// NOTE(review): presumably this saves the answer draft when the login link is
// clicked - the helper is defined by the host page, so confirm there.
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

Sign up or log in

// Passes StackExchange.ready a callback that calls
// StackExchange.helpers.onClickDraftSave with the '#login-link' selector.
// NOTE(review): presumably this saves the answer draft when the login link is
// clicked - the helper is defined by the host page, so confirm there.
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

Name

Required, but never shown

Name

Required, but never shown

This page is only for reference. If you need detailed information, please check here.

5ebJY8WhYPQd ou6bqJQkHcKa2DQL2RWrS6f b5oDGcgG4FzEpV,U8ezaK9vsnhdr,y7lZgtPo2VrCY9XKuPpoFkCpYheztTuQe

搜尋此網誌

Nrthugu