/* xor128()をSSE2で実装してみました。XorShift は SIMD に向かない事がわかりました。*/
#include <stdio.h>
#include <time.h>
#include <emmintrin.h>
unsigned long xor128(void){
static unsigned long x=123456789UL,y=362436069UL,z=521288629UL,w=88675123UL;
unsigned long t;t=(x^(x<<11));x=y;y=z;z=w;return(w=(w^(w>>19))^(t^(t>>8)));}

unsigned long xor128sse2(void){
static union{unsigned long v[4];__m128i m;}u={123456789UL,362436069UL,521288629UL,88675123UL};
__m128i x=u.m, w, t, r; r=_mm_slli_epi32(x,11);t=_mm_xor_si128(x,r); w=_mm_srli_si128(x,12);
x=_mm_srli_si128(x,4); r=_mm_srli_epi32(w,19);w=_mm_xor_si128(w,r); r=_mm_srli_epi32(t,8);
t=_mm_xor_si128(t,r); w=_mm_xor_si128(w,t); r=_mm_slli_si128(w,12);x=_mm_or_si128(x,r);
_mm_store_si128(&u.m,x);return(u.v[3]);}

void main(void){long i,c;
for(i=0;i<16;i++)printf("%8lx %8lx\n",xor128(),xor128sse2());
c=clock();for(i=0;i<50000000L;i++)xor128(); printf("xor128(): %4lu msec\n",clock()-c);
c=clock();for(i=0;i<50000000L;i++)xor128sse2();printf("xor128sse2():%4lu msec\n",clock()-c);}