SIMD型拡張命令をもっと使った最適化への道のり
科学技術振興調整費「並列化コンパイラ向け共通インフラストラクチャの研究」
藤波順久(ソニー株式会社) 阿部正佳(東京大学)
2002年1月11日
第43回プログラミング・シンポジウム
short a[1000];
short b[1000];
short c[1000];
void f() {
int i;
for(i=0;i<1000;i++) c[i]=a[i]+b[i];
}
Function: _f
XOR EAX, EAX
ALIGN 00000010, 00000008
L0001:
MOVQ MM2, QWORD PTR _b[EAX*2]
MOVQ MM0, QWORD PTR _a[EAX*2]
PADDW MM0, MM2
MOVQ MM1, QWORD PTR _a+00000008[EAX*2]
MOVQ QWORD PTR _c[EAX*2], MM0
MOVQ MM0, QWORD PTR _b+00000008[EAX*2]
PADDW MM1, MM0
MOVQ MM2, QWORD PTR _a+00000010[EAX*2]
MOVQ QWORD PTR _c+00000008[EAX*2], MM1
MOVQ MM1, QWORD PTR _b+00000010[EAX*2]
PADDW MM2, MM1
MOVQ MM0, QWORD PTR _a+00000018[EAX*2]
MOVQ QWORD PTR _c+00000010[EAX*2], MM2
MOVQ MM2, QWORD PTR _b+00000018[EAX*2]
PADDW MM0, MM2
MOVQ MM1, QWORD PTR _a+00000020[EAX*2]
MOVQ QWORD PTR _c+00000018[EAX*2], MM0
MOVQ MM0, QWORD PTR _b+00000020[EAX*2]
PADDW MM1, MM0
MOVQ MM2, QWORD PTR _a+00000028[EAX*2]
MOVQ QWORD PTR _c+00000020[EAX*2], MM1
MOVQ MM1, QWORD PTR _b+00000028[EAX*2]
PADDW MM2, MM1
MOVQ MM0, QWORD PTR _a+00000030[EAX*2]
MOVQ QWORD PTR _c+00000028[EAX*2], MM2
MOVQ MM2, QWORD PTR _b+00000030[EAX*2]
PADDW MM0, MM2
MOVQ MM1, QWORD PTR _a+00000038[EAX*2]
MOVQ QWORD PTR _c+00000030[EAX*2], MM0
MOVQ MM0, QWORD PTR _b+00000038[EAX*2]
PADDW MM1, MM0
MOVQ QWORD PTR _c+00000038[EAX*2], MM1
ADD EAX, 00000020
CMP EAX, 000003C8
JL L0001
MOV CX, WORD PTR _a+00000002[EAX*2]
MOV DX, WORD PTR _a[EAX*2]
ADD CX, WORD PTR _b+00000002[EAX*2]
ADD DX, WORD PTR _b[EAX*2]
MOV WORD PTR _c[EAX*2], DX
MOV WORD PTR _c+00000002[EAX*2], CX
MOV CX, WORD PTR _a+00000006[EAX*2]
MOV DX, WORD PTR _a+00000004[EAX*2]
ADD CX, WORD PTR _b+00000006[EAX*2]
ADD DX, WORD PTR _b+00000004[EAX*2]
MOV WORD PTR _c+00000004[EAX*2], DX
MOV WORD PTR _c+00000006[EAX*2], CX
MOV CX, WORD PTR _a+0000000A[EAX*2]
MOV DX, WORD PTR _a+00000008[EAX*2]
ADD CX, WORD PTR _b+0000000A[EAX*2]
ADD DX, WORD PTR _b+00000008[EAX*2]
MOV WORD PTR _c+00000008[EAX*2], DX
MOV WORD PTR _c+0000000A[EAX*2], CX
MOV CX, WORD PTR _a+0000000E[EAX*2]
MOV DX, WORD PTR _a+0000000C[EAX*2]
ADD DX, WORD PTR _b+0000000C[EAX*2]
ADD CX, WORD PTR _b+0000000E[EAX*2]
EMMS
MOV WORD PTR _c+0000000C[EAX*2], DX
MOV WORD PTR _c+0000000E[EAX*2], CX
L0000:
RETN
__m128 _mm_add_ps(__m128 a, __m128 b)これは、aとbの4つの単精度浮動小数点値を加算する。
static unsigned char sa,sr,sg,sb;
static short da,dr,dg,db;
static short k;
static void
hLineRight(unsigned char *p,int n,
unsigned char a,short ea,
unsigned char r,short er,
unsigned char g,short eg,
unsigned char b,short eb) {
while(n!=0) {
*p++=b; *p++=g; *p++=r; *p++=a;
a+=sa; r+=sr; g+=sg; b+=sb;
if((ea+=da)>=0) { a++; ea-=k; }
if((er+=dr)>=0) { r++; er-=k; }
if((eg+=dg)>=0) { g++; eg-=k; }
if((eb+=db)>=0) { b++; eb-=k; }
--n;
}
}
mwcc -opt all -inst comp,mmx,sse -machinecodelist -c simdlp16.c
Function: _hLineRight
CMP DWORD PTR 00000008[ESP], 00000000
PUSH EBX
MOV ECX, DWORD PTR 00000028[ESP]
PUSH ESI
MOV EDX, DWORD PTR 00000024[ESP]
PUSH EDI
MOV EDI, DWORD PTR 00000024[ESP]
PUSH EBP
MOV EBP, DWORD PTR 00000038[ESP]
MOV ESI, DWORD PTR 00000014[ESP]
MOV EBX, DWORD PTR 00000024[ESP]
JE L0001
ALIGN 00000010, 00000008
L0002:
MOV BYTE PTR 00000000[ESI], CL
MOV BYTE PTR 00000001[ESI], DL
MOV BYTE PTR 00000002[ESI], BL
MOV AL, BYTE PTR 0000001C[ESP]
MOV BYTE PTR 00000003[ESI], AL
MOV AL, BYTE PTR 0000001C[ESP]
ADD AL, BYTE PTR _sa
ADD ESI, 00000004
MOV BYTE PTR 0000001C[ESP], AL
MOV AX, WORD PTR 00000020[ESP]
ADD AX, WORD PTR _da
ADD BL, BYTE PTR _sr
MOV WORD PTR 00000020[ESP], AX
ADD DL, BYTE PTR _sg
ADD CL, BYTE PTR _sb
CMP WORD PTR 00000020[ESP], 0000
JL L0003
MOV AX, WORD PTR 00000020[ESP]
INC BYTE PTR 0000001C[ESP]
SUB AX, WORD PTR _k
MOV WORD PTR 00000020[ESP], AX
L0003:
ADD DI, WORD PTR _dr
TEST DI, DI
JL L0004
INC BL
SUB DI, WORD PTR _k
L0004:
MOV AX, WORD PTR 00000030[ESP]
ADD AX, WORD PTR _dg
MOV WORD PTR 00000030[ESP], AX
CMP WORD PTR 00000030[ESP], 0000
JL L0005
MOV AX, WORD PTR 00000030[ESP]
INC DL
SUB AX, WORD PTR _k
MOV WORD PTR 00000030[ESP], AX
L0005:
ADD BP, WORD PTR _db
TEST BP, BP
JL L0006
INC CL
SUB BP, WORD PTR _k
L0006:
DEC DWORD PTR 00000018[ESP]
CMP DWORD PTR 00000018[ESP], 00000000
JNE L0002
L0001:
L0000:
POP EBP
POP EDI
POP ESI
POP EBX
RETN
_hLineRight:
CMP DWORD PTR _n[ESP],0
JZ L2
MOVD MM0,DWORD PTR [ESP+_b]
MOVD MM1,DWORD PTR _sb
MOVQ MM2,QWORD PTR [ESP+_eb]
MOVQ MM3,QWORD PTR _db
MOVZX EAX,_k
IMUL EAX,10001H
MOVD MM6,EAX
MOVQ MM7,MM6
PSLLQ MM6,32
POR MM7,MM6
PCMPEQB MM4,MM4 ; MM4=0FFF...FH
PSUBB MM1,MM4
MOV EAX,_p[ESP]
MOV ECX,_n[ESP]
L1:
MOVD [EAX],MM0
ADD EAX,4
PADDB MM0,MM1
PADDW MM2,MM3
PXOR MM4,MM4 ; MM4=0
PCMPGTW MM4,MM2
MOVQ MM5,MM4
PACKSSWB MM5,MM5
PADDB MM0,MM5
PANDN MM4,MM7
PSUBW MM2,MM4
DEC ECX
JNZ L1
L2:
RET
r1:=r2+r3 r4:=mem(r5+12)
PADDW MM0,MM1 parallel( subregI16(MM0I64,0):=subregI16(MM0I64,0)+subregI16(MM1I64,0) subregI16(MM0I64,1):=subregI16(MM0I64,1)+subregI16(MM1I64,1) subregI16(MM0I64,2):=subregI16(MM0I64,2)+subregI16(MM1I64,2) subregI16(MM0I64,3):=subregI16(MM0I64,3)+subregI16(MM1I64,3) )
if c then
:
v:=t
:
else → v:=(t bitand c) bitor (f bitand bitnot c)
:
v:=f
:
endif
簡略化できる場合:
if t8>=0 then t16:=t8>=0 t0:=t0+1 → t0:=t0-t16 t8:=t8-t28 t8:=t8-(t28 bitand t16) endif
parallel(
s4:=t0+t4
s8:=t1+t5
s12:=t2+t6
s16:=t3+t7
)
↓
parallel(
t0:=t0+t4
t1:=t1+t5
t2:=t2+t6
t3:=t3+t7
)
parallel(
s6:=s5<z0
s10:=s9<z1
s14:=s13<z2
s18:=s17<z3
)
↓
parallel(
s6:=s5>z0
s10:=s9>z1
s14:=s13>z2
s18:=s17>z3
)