NSieve details

I know I need to generate 64-bit code in order to be competitive with other compilers out there. Even though my code is cross-platform, I'm writing it at home on the weekends, where I run Windows XP. This morning, went out and bought Windows 7. One package has both 32- and 64-bit DVDs, and I got the Ultimate edition, because the difference in the upgrades was only 20.

Also downloaded Visual Studio 2010 Beta (free 6-month trial), which comes with a 64-bit compiler. Good deal! Half a year of free development. I didn't expect nsieve to run MUCH faster, maybe 10 percent faster because of prefetching, which I expected the Microsoft compiler to do aggressively. The original 32-bit version under XP 32 ran in 0.81 seconds; On the same computer under Windows 7, the new version ran in 0.88. What gives?

unsigned int nsieve2(unsigned char *c, unsigned int m) {
   unsigned int count = 0;
   for (unsigned int i=2; i < m; i++) {
      if(c[i]) {
         ++count;
         for (unsigned int j=i+i; j < m; j = j+i) {
             c[j]=0;
         }
      }
   }
   return count;
}

unsigned int nsieve(unsigned int m) {
   vector<unsigned char> vec(m,1);
   return nsieve2(& vec[0],m);
}

Disassembly of nsieve2:

...
000000013F1D115B 4C 8D 59 04          lea         r11,[rcx+4]
000000013F1D115F 90                   nop
000000013F1D1160 41 80 3C 19 00       cmp         byte ptr [r9+rbx],0
000000013F1D1165 74 27                je          000000013F1D118E
000000013F1D1167 43 8D 0C 00          lea         ecx,[r8+r8]
000000013F1D116B FF C0                inc         eax
000000013F1D116D 41 3B CA             cmp         ecx,r10d
000000013F1D1170 73 1C                jae         000000013F1D118E
000000013F1D1172 49 8B D3             mov         rdx,r11
I can't explain this part. What is this?
000000013F1D1175 66 66 66 0F 1F 84 00 00 00 00 00 nop         word ptr [rax+rax]
000000013F1D1180 41 03 C8             add         ecx,r8d    ; innermost loop
000000013F1D1183 C6 02 00             mov         byte ptr [rdx],0
000000013F1D1186 49 03 D1             add         rdx,r9
000000013F1D1189 41 3B CA             cmp         ecx,r10d
000000013F1D118C 72 F2                jb          000000013F1D1180 ; end innermost loop
000000013F1D118E 41 FF C0             inc         r8d
000000013F1D1191 49 FF C1             inc         r9
000000013F1D1194 49 83 C3 02          add         r11,2
000000013F1D1198 45 3B C2             cmp         r8d,r10d
000000013F1D119B 72 C3                jb          000000013F1D1160
000000013F1D119D 48 8B 5C 24 08       mov         rbx,qword ptr [rsp+8]
000000013F1D11A2 C3                   ret

Notice that compared to the 32-bit verison, there is extra work going on here: the address variables live a life of their own (rdx is the pointer equivalent or ecx, and r9 is pointer the equivalent of r8d). Apparently, because the default size of int is 32 bits, while addresses are always 64-bits, the compiler can't mix 32- and 64-bit arithmetic. Let's try with 64-bit integers:

unsigned int nsieve2(unsigned char *c, unsigned __int64 m) {
   unsigned __int64 count = 0;
   for (unsigned __int64 i=2; i<m; i++) {
      if(c[i]) {
         ++count;
		 for (unsigned __int64 j=i+i; j<m; j = j+i) {
			 c[j]=0;
		 }
      }
   }
   return count;
}

...
000000013F781164 41 80 3C 38 00       cmp         byte ptr [r8+rdi],0
000000013F781169 74 21                je          000000013F78118C
000000013F78116B 4B 8D 14 00          lea         rdx,[r8+r8]
000000013F78116F 49 FF C3             inc         r11
000000013F781172 48 3B D3             cmp         rdx,rbx
000000013F781175 73 15                jae         000000013F78118C
000000013F781177 66 0F 1F 84 00 00 00 00 00 nop         word ptr [rax+rax]
000000013F781180 C6 04 3A 00          mov         byte ptr [rdx+rdi],0
000000013F781184 49 03 D0             add         rdx,r8
000000013F781187 48 3B D3             cmp         rdx,rbx
000000013F78118A 72 F4                jb          000000013F781180
000000013F78118C 49 FF C0             inc         r8
000000013F78118F 4C 3B C3             cmp         r8,rbx
000000013F781192 72 D0                jb          000000013F781164
000000013F781194 41 8B C3             mov         eax,r11d
000000013F781197 48 8B 5C 24 30       mov         rbx,qword ptr [rsp+30h]
000000013F78119C 48 83 C4 20          add         rsp,20h
000000013F7811A0 5F                   pop         rdi
000000013F7811A1 C3                   ret

Much better! The code now runs in 0.83 seconds, almost as fast as the original 32-bit version. I still can't explain the big fat noop in the middle of the loop.
Perhaps the Intel C++ compiler will perform better? Downloaded their free trial (all 622 megs of it), which integrates with Studio.
Unfortunately, the Intel compiler proved to be not as compatible with Microsoft as they claimed. The install failed to integrate with Studio (there was a note in the installation wizard about the failure, marked with an asterisk, but no footnote that would explain the asterisk). Experimentation resulted in the following makefile:

INCLUDE = $(INCLUDE);c:\mvs10\VC\include;c:\mssdk70\include;

i_nsieve:
!IF "$(VCINSTALLDIR)" == "" && "$(NOSETENV)" == ""
	"C:\Program Files (x86)\Intel\Compiler\11.1\054\bin\iclvars.bat" intel64 \
    vs2008 & nmake NOSETENV=YES i_nsieve
!ELSE
	icl /Og /Ox /Oi /fp:fast /arch:SSE2 nsieve.cpp
!ENDIF

Alas, this didn't work too well. Ironically, the Intel compiler crashed when parsing an invocation of the windows crash debugger:

c:\mvs10\VC\include\crtdefs.h(527): internal error: assertion failed at:
"shared/edgcpfe/lexical.c", line 16664

  void __cdecl _invoke_watson(_In_opt_z_ const wchar_t *, _In_opt_z_ const wchar_t *,
 _In_opt_z_ const wchar_t *, unsigned int, uintptr_t);

Some digging around revealed an interesting VC++ header, sal.h, which contains a whole (incredibly convoluted) mini-language for specifying function pre- and post-conditions, boundary conditions, buffer sizes, in- and out-parameters, and other annotations. The Intel compiler crash was avoided with /D_USE_DECLSPECS_FOR_SAL /U_USE_ATTRIBUTES_FOR_SAL. Let's try again...

c:\mvs10\VC\include\xtr1common(355): catastrophic error:
rvalue references not yet implemented
        struct _Remove_reference<_Ty&&>

Oops. OK, no big deal. I don't really need Microsoft's vector. I can just use malloc instead.

unsigned int nsieve(unsigned int m) {
   unsigned char *c = (unsigned char*) malloc(m);
   memset(c,1,m);
   unsigned int result = nsieve2(c,m);
   free(c);
   return result;
}

The resulting code:

.B1.7::
        movzx     edx, BYTE PTR [rax+r15]
        test      edx, edx
        je        .B1.12
.B1.8::
        inc       rbx
        lea       rdx, QWORD PTR [rax+rax]
        cmp       rdx, rbp
        jae       .B1.12
.B1.10::
        mov       BYTE PTR [rdx+r15], 0
        add       rdx, rax
        cmp       rdx, rbp
        jb        .B1.10
.B1.12::
        inc       rax
        cmp       rax, rbp
        jb        .B1.7

Result: same as VC++, as you might have guessed, because the innermost loop is the same 4 instructions.
Yes, I know this test is entirely memory-bound and puts no pressure on the compiler's register allocator. But still I shouldn't be seeing such wide variation.

Some impressions of the VC++ 10.0 Beta. The shell has goten better. However, F12 still doesn't work (even in my trivial project with 20 files, it misses a lot of definitions, and seemingly doesn't parse macros).
Also, the disassembler has gotten MUCH worse. It takes two to three seconds for each instruction step. In effect, it takes around 10 billion instructions for studio to execute one debugee instruction. Quite an achievement.

GCC disassembly below. My GCC is not on a comparable machine, so I don't know if this code is faster than the ones above. The inner loop is not that good-looking -- keeps re-reading the vector base pointer from memory

.L18
        movq    (%rsp), %rsi
        mov     %edi, %eax
        cmpb    $0, (%rsi,%rax)
        je      .L19
        addl    $1, %ebp
        cmpl    %ebx, %r8d
        movl    %r8d, %ecx
        movl    %r9d, %edx
        jbe     .L22
        jmp     .L19
        .p2align 4,,7
.L33:
        movq    (%rsp), %rsi
.L22:
        mov     %ecx, %eax
        addl    %edi, %edx
        addl    %edi, %ecx
        movb    $0, (%rsi,%rax)
        movl    %edx, %eax
        subl    %edi, %eax
        cmpl    %ebx, %eax
        jbe     .L33
.L19:
        addl    $1, %edi
        addl    $3, %r9d
        addl    $2, %r8d
        cmpl    %edi, %ebx
        jae     .L18

Reading the address of vector's first element into a stand-alone variable resolves the issue. Command line:

 gcc nsieve.cpp -O3 -fstrict-aliasing -lstdc++ -S

Source:

void nsieve(unsigned m)
{
   unsigned i, j;
   unsigned count=0;
   std::vector<bool> b(m+1, true);
   bool *base = &b[0];   // using b[i] will cause an extra read at each cycle
   for (i=2; i<=m; ++i) {
      if (base[i]) {
         ++count;
         j=i*2;
         while (j<=m) {
            base[j]=false;
            j+=i;
         }
      }
   }
   std::cout
      << "Primes up to "
      << std::setw(8) << m << " "
      << std::setw(8) << count << "\n";
}

Disassembly (relevant part):

.L18:
        mov     %esi, %eax
        cmpb    $0, (%r8,%rax)
        je      .L19
        addl    $1, %ebp
        cmpl    %ebx, %edi
        ja      .L19
        movl    %edi, %ecx
        movl    %r9d, %edx
        .p2align 4,,7
.L22:
        mov     %ecx, %eax
        addl    %esi, %edx
        addl    %esi, %ecx
        movb    $0, (%r8,%rax)
        movl    %edx, %eax
        subl    %esi, %eax
        cmpl    %eax, %ebx
        jae     .L22
.L19:
        addl    $1, %esi
        addl    $3, %r9d
        addl    $2, %edi
        cmpl    %esi, %ebx
        jae     .L18