MMX code generation in Visual C++ 2010 Express
After installing Visual C++ 2010 Express, I decided to try MMX code generation on a whim:
#include <emmintrin.h>

double ComputeVariance(const unsigned char *samples, int quads) {
    __m64 zero = _mm_setzero_si64();
    __m64 one = _mm_set1_pi16(1);
    __m64 sum = zero;
    __m64 sumsq = zero;

    for(int i=0; i<quads; ++i) {
        int raw = *(int *)samples;
        samples += 4;

        // unpack four bytes to words, then accumulate sum and sum of squares
        __m64 p = _m_punpcklbw(_m_from_int(raw), zero);
        __m64 x = _m_pmaddwd(p, one);
        __m64 x2 = _m_pmaddwd(p, p);

        sum = _m_paddd(sum, x);
        sumsq = _m_paddd(sumsq, x2);
    }

    // horizontal add of the two dword halves of each accumulator
    unsigned int isum = _m_to_int(_m_paddd(_m_psrlqi(sum, 32), sum));
    unsigned int isumsq = _m_to_int(_m_paddd(_m_psrlqi(sumsq, 32), sumsq));

    _mm_empty();

    double n = (double)quads * 4;
    double fsum = (double)isum;
    double fsumsq = (double)isumsq;
    return (n*fsumsq - fsum*fsum) / (n*(n-1));
}
This routine uses MMX intrinsics to compute the variance of a series of samples stored as unsigned bytes. SSE intrinsics got some attention in the VS2010 compiler, but MMX intrinsics have long been the neglected stepchild, and I hadn't heard anything about them.
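For comparison, here is what the routine computes in plain scalar code. This is just the textbook sample-variance formula; the scalar function below is a sketch of mine, not part of the test program:

double ComputeVarianceScalar(const unsigned char *samples, int quads) {
    unsigned int sum = 0;
    unsigned int sumsq = 0;
    int n = quads * 4;    // four byte samples per quad

    for(int i=0; i<n; ++i) {
        unsigned int v = samples[i];
        sum += v;
        sumsq += v*v;
    }

    // sample variance: (n*sum(x^2) - sum(x)^2) / (n*(n-1))
    double fn = (double)n;
    double fsum = (double)sum;
    double fsumsq = (double)sumsq;
    return (fn*fsumsq - fsum*fsum) / (fn*(fn - 1));
}

Well, let's look at the disassembly: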
VS2008 SP1 (VC9):

00: push ebp
01: mov ebp,esp
03: and esp,0FFFFFFF8h
06: mov edx,dword ptr [ebp+0Ch]
09: pxor mm3,mm3
0C: mov eax,1
11: movd mm0,eax
14: movq mm1,mm0
17: punpcklwd mm1,mm0
1A: movq mm0,mm1
1D: punpcklwd mm1,mm0
20: sub esp,8
23: movq mm4,mm1
26: movq mm1,mm3
29: movq mm2,mm3
2C: test edx,edx
2E: jle 0000005B
30: mov ecx,dword ptr [ebp+8]
33: mov eax,dword ptr [ecx]
35: movq mm5,mm3
38: movd mm0,eax
3B: punpcklbw mm0,mm5
3E: movq mm5,mm0
41: movq mm6,mm4
44: pmaddwd mm5,mm6
47: paddd mm1,mm5
4A: add ecx,4
4D: sub edx,1
50: movq mm5,mm0
53: pmaddwd mm5,mm0
56: paddd mm2,mm5
59: jne 00000033
5B: movq mm0,mm1
5E: psrlq mm0,20h
62: paddd mm0,mm1
65: movd eax,mm0
68: movq mm0,mm2
6B: psrlq mm0,20h
6F: paddd mm0,mm2
72: movd ecx,mm0
75: emms
77: fild dword ptr [ebp+0Ch]
7A: mov dword ptr [esp+4],eax
7E: fmul qword ptr [__real@4010000000000000]
84: fild dword ptr [esp+4]
88: test eax,eax
8A: jge 00000092
8C: fadd qword ptr [__real@41f0000000000000]
92: mov dword ptr [esp+4],ecx
96: fild dword ptr [esp+4]
9A: test ecx,ecx
9C: jge 000000A4
9E: fadd qword ptr [__real@41f0000000000000]
A4: fmul st,st(2)
A6: fld st(1)
A8: fmulp st(2),st
AA: fsubrp st(1),st
AC: fld st(1)
AE: fsub qword ptr [__real@3ff0000000000000]
B4: fmulp st(2),st
B6: fdivrp st(1),st
B8: mov esp,ebp
BA: pop ebp
BB: ret

VS2010 (VC10):

[VC10 disassembly missing]
The inner loop is the block from offset 33 through the jne at offset 59. The first thing I'll point out is that the code is correct: MMX intrinsics were troublesome in VC7.1 because the compiler had a tendency to hoist floating-point operations above calls to _mm_empty(), but fortunately that has long since been fixed.
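To illustrate why that ordering matters: the MMX registers alias the x87 register stack, so any x87 math has to stay below the _mm_empty(). Here is a minimal sketch (the function and its name are mine) of the kind of code the old bug could break:

#include <mmintrin.h>

double AverageWords(const short *v, int quads) {
    __m64 sum = _mm_setzero_si64();
    __m64 one = _mm_set1_pi16(1);

    for(int i=0; i<quads; ++i)
        sum = _m_paddd(sum, _m_pmaddwd(((const __m64 *)v)[i], one));

    int total = _m_to_int(_m_paddd(_m_psrlqi(sum, 32), sum));

    _mm_empty();    // frees the x87 stack for the float math below

    // VC7.1 could schedule this conversion and divide above the emms,
    // corrupting the x87 state; VC8 and later keep the ordering.
    return (double)total / (quads * 4);
}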
I've omitted the disassembly for VS2005 SP1 because it's nearly identical to the VS2008 SP1 output, except for a couple of very minor differences like add eax,1 vs. inc eax. One immediately noticeable difference is that the VS2010 compiler (VC10) generated smaller code than the VS2008 SP1 compiler (VC9), about 13% shorter. Digging into the details, we can see that:
- VC9 emitted an unnecessary aligned stack frame, which VC10 was able to avoid.
- VC10 is better at copy propagation in MMX registers and generated fewer useless moves, although it still emitted one (offset 3B).
This is a bit of a nice surprise, given that I hadn't expected any improvement in MMX code generation at all. The reduction in code size is accompanied by a slight increase in execution speed, which I measure at 2412 clocks vs. 2537 clocks for a 2K block on my 45nm Core 2. 5% isn't much, but I'll take it. Unfortunately, although the SSE set intrinsics have been improved, the MMX set intrinsics haven't, and the compiler still emits a bunch of code to compute the (1, 1, 1, 1) vector at runtime instead of folding it to a constant. The compiler is also still unable to emit a direct 32-bit load into an MMX register, always preferring to bounce through a GPR. That is the main problem I've had with the VC++ implementation of MMX/SSE2 intrinsics, as I work a lot with 32-bit pixels.
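For what it's worth, the set intrinsics can be dodged entirely by building the constant in memory yourself. A sketch of the usual workaround (the names here are mine):

// An 8-byte aligned (1, 1, 1, 1) word vector, laid down at compile time
// instead of being synthesized with movd + punpcklwd at runtime.
__declspec(align(8)) static const unsigned short kOnes[4] = { 1, 1, 1, 1 };

inline __m64 LoadOnes() {
    return *(const __m64 *)kOnes;    // should compile to a single movq
}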
I did check SSE2 code generation as well, and the differences there are fewer. VC8/9 already had improvements in SSE copy propagation, so no advantage there. However, VC10 still pulls ahead due to omitting the aligned stack frame and much better code generation for the set intrinsics. This means that entry/initialization code will tend to benefit a lot more than inner loops. (I can no longer use _mm_set_epi8() as my poster child for bad code generation; it was my favorite as it generated 18 instructions in VC8 and 74 instructions in VC9. VC10 generates a single instruction with constant input.)
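If you want to reproduce this, a one-liner like the following is enough to compare compilers (the test function is mine):

#include <emmintrin.h>

__m128i MakeByteRamp() {
    // All-constant arguments: VC10 folds this to one load of a 16-byte
    // constant, where VC8 and VC9 emitted 18 and 74 instructions.
    return _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8,
                         7,  6,  5,  4,  3,  2, 1, 0);
}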
It's nice to see improvement in intrinsics support, but after all this time, I still don't like intrinsics that much. I've warmed up to them a bit, though, since my tolerance for fiddling with manual register allocation is not quite what it used to be and they're handy for prototyping. My wish list:
- More speed. Hey, why do you think I'm using intrinsics in the first place?
- Fewer underscores. I'm getting flashbacks to Managed C++ here. How about a namespace so I can do a "using namespace"?
- Better syntax: something like VMX's vec_add() instead of nasty functions like _mm_add_epi32(), or even a+b (see the sketch after this list).
- Constant folding: even in VC10, _mm_add_epi32(_mm_set1_epi32(1), _mm_set1_epi32(2)) generates a runtime add of two constants.
- Correct pointer typing: why does _mm_loadl_epi64() take a __m128i* when it only reads 64 bits and has no alignment requirement?
- Scalar-vector mixing: combining scalar float operations and SSE intrinsics results in bouncing floats through memory even with /arch:SSE2.
- Immediate arguments: the compiler should handle promoting constant shift counts to the immediate shift forms, i.e. emit an immediate shift whenever the count is known at compile time, instead of requiring the programmer to call _mm_srli_epi16() with a constant expression.
- Aligned operator new() (also covered in the sketch below).
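To make the operator-syntax and aligned-new items concrete, here is the kind of wrapper I mean. Everything below is a rough sketch with made-up names, not a finished library:

#include <emmintrin.h>
#include <malloc.h>
#include <new>

struct vec4i {
    __m128i v;

    vec4i() {}
    vec4i(__m128i x) : v(x) {}

    // the syntax I'd rather write than _mm_add_epi32() / _mm_sub_epi32()
    vec4i operator+(const vec4i& y) const { return vec4i(_mm_add_epi32(v, y.v)); }
    vec4i operator-(const vec4i& y) const { return vec4i(_mm_sub_epi32(v, y.v)); }

    // The default heap only guarantees 8-byte alignment on 32-bit Windows,
    // so a heap-allocated object with a __m128i member needs help.
    void *operator new(size_t n) {
        if (void *p = _aligned_malloc(n, 16))
            return p;
        throw std::bad_alloc();
    }
    void operator delete(void *p) { _aligned_free(p); }
};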