An example of SIMD Parallelization


Candidate for Parallelization

The following loop is the principal part of the motion vector estimation process in an MPEG encoder. SIMD optimization is performed to this loop.
#define ABSDIF(x, y) ((x <= y) ? y-x : x-y)

int error(char xx[16][16], char yy[16][16]){
	char *across;
	char *cacross;
	//	int localDiff;
	int diff=0;
	int y;

	for (y=0;y<16;y++) {
		across = &xx[y][0];
		cacross = &yy[y][0];
		diff += ABSDIF(across[0], cacross[0]);
		diff += ABSDIF(across[1], cacross[1]);
		diff += ABSDIF(across[2], cacross[2]);
		diff += ABSDIF(across[3], cacross[3]);
		diff += ABSDIF(across[4], cacross[4]);
		diff += ABSDIF(across[5], cacross[5]);
		diff += ABSDIF(across[6], cacross[6]);
		diff += ABSDIF(across[7], cacross[7]);
		diff += ABSDIF(across[8], cacross[8]);
		diff += ABSDIF(across[9], cacross[9]);
		diff += ABSDIF(across[10], cacross[10]);
		diff += ABSDIF(across[11], cacross[11]);
		diff += ABSDIF(across[12], cacross[12]);
		diff += ABSDIF(across[13], cacross[13]);
		diff += ABSDIF(across[14], cacross[14]);
		diff += ABSDIF(across[15], cacross[15]);
	};

	return diff;
}

Parallelization method

Note: SIR (S-expression Intermediate Representation) is similar to LIR, but operations required for SIMD optimization are added. Most of them will be adopted in the future LIR.

In the case of the SIMD code generation, the data for a pattern match is used. It is created based on the instruction description file which strictly describes the meaning of SIMD instructions in detail. For example, in the above-mentioned loop, the pattern based on the PDIST instruction (it accumulates absolute differences of 8 pairs of 8-bit numbers) of UltraSPARC processors is matched.

PDIST instruction description in SIR is as follows:

;; PDIST
(DEFINST ("pdist ?1f, ?2f, ?3f" "pdist ?1f, ?2f, ?3f")
  (PARALLEL
    (SET (SUBREG I32 (REG F64 (HOLE 3 FREG)) 0)
      (ADD I32 (ADD I32 (ADD I32 (ADD I32 (ADD I32 (ADD I32 (ADD I32
        (ADD I32
          (SUBREG I32 (REG F64 (HOLE 3 FREG)) 0)
          (BOR I32
            (BAND I32
              (SUB I32 (CONVZX I32 (SUBREG I8 (REG F64 (HOLE 2 FREG)) 0))
                      (CONVZX I32 (SUBREG I8 (REG F64 (HOLE 1 FREG)) 0)))
              (TSTLES I32 (CONVZX I32 (SUBREG I8 (REG F64 (HOLE 1 FREG)) 0))
                          (CONVZX I32 (SUBREG I8 (REG F64 (HOLE 2 FREG)) 0))))
            (BAND I32
              (SUB I32 (CONVZX I32 (SUBREG I8 (REG F64 (HOLE 1 FREG)) 0))
                      (CONVZX I32 (SUBREG I8 (REG F64 (HOLE 2 FREG)) 0)))
              (BNOT I32
                (TSTLES I32 (CONVZX I32 (SUBREG I8 (REG F64 (HOLE 1 FREG)) 0))
                            (CONVZX I32 (SUBREG I8 (REG F64 (HOLE 2 FREG)) 0)))))))
          (BOR I32
            (BAND I32
              (SUB I32 (CONVZX I32 (SUBREG I8 (REG F64 (HOLE 2 FREG)) 1))
                      (CONVZX I32 (SUBREG I8 (REG F64 (HOLE 1 FREG)) 1)))
              (TSTLES I32 (CONVZX I32 (SUBREG I8 (REG F64 (HOLE 1 FREG)) 1))
                          (CONVZX I32 (SUBREG I8 (REG F64 (HOLE 2 FREG)) 1))))
            (BAND I32
              (SUB I32 (CONVZX I32 (SUBREG I8 (REG F64 (HOLE 1 FREG)) 1))
                      (CONVZX I32 (SUBREG I8 (REG F64 (HOLE 2 FREG)) 1)))
              (BNOT I32
                (TSTLES I32 (CONVZX I32 (SUBREG I8 (REG F64 (HOLE 1 FREG)) 1))
                            (CONVZX I32 (SUBREG I8 (REG F64 (HOLE 2 FREG)) 1)))))))
          (BOR I32
            (BAND I32
              (SUB I32 (CONVZX I32 (SUBREG I8 (REG F64 (HOLE 2 FREG)) 2))
                      (CONVZX I32 (SUBREG I8 (REG F64 (HOLE 1 FREG)) 2)))
              (TSTLES I32 (CONVZX I32 (SUBREG I8 (REG F64 (HOLE 1 FREG)) 2))
                          (CONVZX I32 (SUBREG I8 (REG F64 (HOLE 2 FREG)) 2))))
            (BAND I32
              (SUB I32 (CONVZX I32 (SUBREG I8 (REG F64 (HOLE 1 FREG)) 2))
                      (CONVZX I32 (SUBREG I8 (REG F64 (HOLE 2 FREG)) 2)))
              (BNOT I32
                (TSTLES I32 (CONVZX I32 (SUBREG I8 (REG F64 (HOLE 1 FREG)) 2))
                            (CONVZX I32 (SUBREG I8 (REG F64 (HOLE 2 FREG)) 2)))))))
          (BOR I32
            (BAND I32
              (SUB I32 (CONVZX I32 (SUBREG I8 (REG F64 (HOLE 2 FREG)) 3))
                      (CONVZX I32 (SUBREG I8 (REG F64 (HOLE 1 FREG)) 3)))
              (TSTLES I32 (CONVZX I32 (SUBREG I8 (REG F64 (HOLE 1 FREG)) 3))
                          (CONVZX I32 (SUBREG I8 (REG F64 (HOLE 2 FREG)) 3))))
            (BAND I32
              (SUB I32 (CONVZX I32 (SUBREG I8 (REG F64 (HOLE 1 FREG)) 3))
                      (CONVZX I32 (SUBREG I8 (REG F64 (HOLE 2 FREG)) 3)))
              (BNOT I32
                (TSTLES I32 (CONVZX I32 (SUBREG I8 (REG F64 (HOLE 1 FREG)) 3))
                            (CONVZX I32 (SUBREG I8 (REG F64 (HOLE 2 FREG)) 3)))))))
          (BOR I32
            (BAND I32
              (SUB I32 (CONVZX I32 (SUBREG I8 (REG F64 (HOLE 2 FREG)) 4))
                      (CONVZX I32 (SUBREG I8 (REG F64 (HOLE 1 FREG)) 4)))
              (TSTLES I32 (CONVZX I32 (SUBREG I8 (REG F64 (HOLE 1 FREG)) 4))
                          (CONVZX I32 (SUBREG I8 (REG F64 (HOLE 2 FREG)) 4))))
            (BAND I32
              (SUB I32 (CONVZX I32 (SUBREG I8 (REG F64 (HOLE 1 FREG)) 4))
                      (CONVZX I32 (SUBREG I8 (REG F64 (HOLE 2 FREG)) 4)))
              (BNOT I32
                (TSTLES I32 (CONVZX I32 (SUBREG I8 (REG F64 (HOLE 1 FREG)) 4))
                            (CONVZX I32 (SUBREG I8 (REG F64 (HOLE 2 FREG)) 4)))))))
          (BOR I32
            (BAND I32
              (SUB I32 (CONVZX I32 (SUBREG I8 (REG F64 (HOLE 2 FREG)) 5))
                      (CONVZX I32 (SUBREG I8 (REG F64 (HOLE 1 FREG)) 5)))
              (TSTLES I32 (CONVZX I32 (SUBREG I8 (REG F64 (HOLE 1 FREG)) 5))
                          (CONVZX I32 (SUBREG I8 (REG F64 (HOLE 2 FREG)) 5))))
            (BAND I32
              (SUB I32 (CONVZX I32 (SUBREG I8 (REG F64 (HOLE 1 FREG)) 5))
                      (CONVZX I32 (SUBREG I8 (REG F64 (HOLE 2 FREG)) 5)))
              (BNOT I32
                (TSTLES I32 (CONVZX I32 (SUBREG I8 (REG F64 (HOLE 1 FREG)) 5))
                            (CONVZX I32 (SUBREG I8 (REG F64 (HOLE 2 FREG)) 5)))))))
          (BOR I32
            (BAND I32
              (SUB I32 (CONVZX I32 (SUBREG I8 (REG F64 (HOLE 2 FREG)) 6))
                      (CONVZX I32 (SUBREG I8 (REG F64 (HOLE 1 FREG)) 6)))
              (TSTLES I32 (CONVZX I32 (SUBREG I8 (REG F64 (HOLE 1 FREG)) 6))
                          (CONVZX I32 (SUBREG I8 (REG F64 (HOLE 2 FREG)) 6))))
            (BAND I32
              (SUB I32 (CONVZX I32 (SUBREG I8 (REG F64 (HOLE 1 FREG)) 6))
                      (CONVZX I32 (SUBREG I8 (REG F64 (HOLE 2 FREG)) 6)))
              (BNOT I32
                (TSTLES I32 (CONVZX I32 (SUBREG I8 (REG F64 (HOLE 1 FREG)) 6))
                            (CONVZX I32 (SUBREG I8 (REG F64 (HOLE 2 FREG)) 6)))))))
          (BOR I32
            (BAND I32
              (SUB I32 (CONVZX I32 (SUBREG I8 (REG F64 (HOLE 2 FREG)) 7))
                      (CONVZX I32 (SUBREG I8 (REG F64 (HOLE 1 FREG)) 7)))
              (TSTLES I32 (CONVZX I32 (SUBREG I8 (REG F64 (HOLE 1 FREG)) 7))
                          (CONVZX I32 (SUBREG I8 (REG F64 (HOLE 2 FREG)) 7))))
            (BAND I32
              (SUB I32 (CONVZX I32 (SUBREG I8 (REG F64 (HOLE 1 FREG)) 7))
                      (CONVZX I32 (SUBREG I8 (REG F64 (HOLE 2 FREG)) 7)))
              (BNOT I32
                (TSTLES I32 (CONVZX I32 (SUBREG I8 (REG F64 (HOLE 1 FREG)) 7))
                            (CONVZX I32 (SUBREG I8 (REG F64 (HOLE 2 FREG)) 7)))))))
)
))

Parallelization result

The following is an assembly language output after optimizing to UltraSPARC processors. In the loop, the PDIST instruction is used twice to accumulate 16 absolute differences.
        .file	"testsimd/testerror-1.c"
.section	".text"
        .align	4
        .global	error
        .type	error,#function
        .proc	04
error:
        !#PROLOGUE# 0
        save	%sp, -8, %sp
        !#PROLOGUE# 1
_lab1:
        mov (0), %l0
        mov (0), %l1
_lab5:
        cmp %l1, (16)
        bl _lab6
        nop
        ba _lab4
        nop
_lab6:
        mov %i0, %l2
        smul %l1, (16), %l3
        add %l2, %l3, %l4
        mov %i1, %l5
        add %l5, %l3, %l6
        ldd [%l4], %f0
        ldd [%l6], %f2
        st %l0, [%i6-(4)]
        st %g0, [%i6-(8)]
        ldd [%i6-(8)], %f4
        pdist %f0, %f2, %f4
        ldd [%l4+(8)], %f0
        ldd [%l6+(8)], %f2
        pdist %f0, %f2, %f4
        std %f4, [%i6-(8)]
        ld [%i6-(4)], %l0
        add %l1, (1), %l1
        ba _lab5
        nop
_lab4:
        mov %l0,%i0
        ret
        restore

Performance measurement result

The above-mentioned optimization result and the programs generated by other compiling methods were executed 10 million times on UltraSPARC, and their execution time was measured. The processor of the machine is UltraSPARC-IIi 360MHz, and a memory is 256MB. The compared compiling methods are as follows:
GCC
Compiled by gcc-2.95.3 with no optimization
GCC -O2
Compiled by gcc-2.95.3 with -O2 option
COINS
Compiled by COINS C Compiler
SIMD
Our method
SIMD+manual
Our method with manual improvement. If our method is unified with the COINS base part and various kinds of basic optimization will be carried out, this speed will be possible.
Compiling methodExecution time (second)
GCC223
GCC -O245
COINS327
SIMD13
SIMD+manual8