[Haskell-cafe] [2/16] SBM: Inner loops of the hand-tweaked assembly
benchmarks
Peter Firefly Brodersen Lund
firefly at vax64.dk
Sat Dec 22 04:16:56 EST 2007
I've taken the two benchmarks byte-bs----acc and space-bs-c8-acc-1 and
gradually tweaked their inner loops from something that used memory all the
time to something that used registers more and more efficiently. I've done
this gradually, pretty much one register at a time. Along the way, I've also
done a simple common-subexpression/loop-hoisting transformation in which I combined the
pointer to the start of the string and the index into the string into a single
pointer. Doing this in real life may cause bad problems with the garbage
collector, which expects to see the object's base pointer rather than an interior pointer.
At the end, I go a bit mad and start doing heroic optimizations (reading four
bytes at a time, using MMX registers to read 8 bytes at a time, twisted MMX
math to keep 8 space counters in an MMX register + a bit of loop unrolling).
Here follows first the two original inner loops and then the 23 hand-tweaked
versions.
I used the following shell code to isolate the inner loops:
(for F in hs/byte-bs----acc.s hs/space-bs-c8-acc-1.s hand/*.s ; \
do echo "------------------------------"; \
echo "$F:"; \
echo ; \
cat "$F" | perl -e 'while(<>){ if (/Main_zdwcnt_info:/ .. /.section .data/) { print; }}' | head -n-1; \
done; \
echo "=============================="; \
) > xx.txt
-Peter
------------------------------
hs/byte-bs----acc.s:
/* Original GHC output for byte-bs----acc: count every byte of the string. */
/* %ebp is presumably GHC's STG stack pointer (Sp); observed slot usage:   */
/*   (%ebp) = count accumulator, 12(%ebp) = index, 16(%ebp) = bytes left.  */
/* All loop state lives in memory and is reloaded/stored every iteration.  */
Main_zdwcnt_info:
.LcYL:
cmpl $0,16(%ebp)	/* any bytes remaining? */
jle .LcYO	/* no: return */
movl 12(%ebp),%eax	/* eax = index */
incl %eax
movl (%ebp),%ecx	/* ecx = accumulator */
incl %ecx
subl $1,16(%ebp)	/* one fewer byte remaining */
movl %eax,12(%ebp)	/* write both back every iteration */
movl %ecx,(%ebp)
jmp Main_zdwcnt_info	/* loop by re-entering the info-table entry point */
.LcYO:
movl (%ebp),%esi	/* result in %esi -- presumably GHC's R1 register */
addl $20,%ebp	/* pop 5 stack slots */
jmp *(%ebp)	/* enter continuation on the STG stack */
------------------------------
hs/space-bs-c8-acc-1.s:
/* Original GHC output for space-bs-c8-acc-1: count the spaces in a string. */
/* Observed STG stack slots: (%ebp)=space count, 4(%ebp)=string pointer,    */
/* 12(%ebp)=index, 16(%ebp)=bytes remaining.  All state lives in memory.    */
Main_zdwcnt_info:
.Lc16u:
cmpl $0,16(%ebp)	/* any bytes remaining? */
jle .Lc16x
movl 4(%ebp),%eax	/* string base pointer */
movl 12(%ebp),%ecx	/* index */
movzbl (%eax,%ecx,1),%eax	/* zero-extend current byte */
cmpl $32,%eax	/* is it a space (ASCII 32)? */
jne .Lc16F
movl 12(%ebp),%eax	/* space path: bump index and count */
incl %eax
movl (%ebp),%ecx
incl %ecx
subl $1,16(%ebp)
movl %eax,12(%ebp)
movl %ecx,(%ebp)
jmp Main_zdwcnt_info
.Lc16x:
movl (%ebp),%esi	/* return count in %esi (presumably GHC's R1) */
addl $20,%ebp	/* pop 5 slots */
jmp *(%ebp)	/* enter continuation */
.Lc16F:	/* non-space path: bump index only */
movl 12(%ebp),%eax
incl %eax
subl $1,16(%ebp)
movl %eax,12(%ebp)
jmp Main_zdwcnt_info
------------------------------
hand/byte-bs----acc-a.s:
/* Tweak a: same per-iteration memory traffic as the original, but all     */
/* three state slots are loaded into registers first, updated, and stored  */
/* back -- a stepping stone toward keeping them in registers permanently.  */
Main_zdwcnt_info:
.LcYN:
cmpl $0,16(%ebp)	/* bytes remaining */
jle .LcYQ
movl 00(%ebp),%ecx	/* ecx = accumulator */
movl 12(%ebp),%eax	/* eax = index */
movl 16(%ebp),%edx	/* edx = remaining */
incl %ecx
incl %eax
decl %edx
movl %ecx,00(%ebp)	/* ...and written straight back each iteration */
movl %eax,12(%ebp)
movl %edx,16(%ebp)
jmp Main_zdwcnt_info
.LcYQ:
movl (%ebp),%esi	/* result in %esi; pop frame; enter continuation */
addl $20,%ebp
jmp *(%ebp)
------------------------------
hand/byte-bs----acc-b.s:
/* Tweak b: the counting loop (.L_again) now runs entirely in registers;   */
/* the stack slots are read once before and written once after the loop.   */
Main_zdwcnt_info:
.LcYN:
cmpl $0,16(%ebp)
jle .LcYQ
movl 00(%ebp),%ecx	/* ecx = accumulator */
movl 12(%ebp),%eax	/* eax = index */
movl 16(%ebp),%edx	/* edx = remaining */
.L_again:	/* register-only inner loop, test at the top */
cmpl $0,%edx
jle .L_out
incl %ecx
incl %eax
decl %edx
jmp .L_again
.L_out:	/* single write-back after the loop */
movl %ecx,00(%ebp)
movl %eax,12(%ebp)
movl %edx,16(%ebp)
jmp Main_zdwcnt_info	/* re-enter; the top test now fails and we return */
.LcYQ:
movl (%ebp),%esi
addl $20,%ebp
jmp *(%ebp)
------------------------------
hand/byte-bs----acc-c.s:
/* Tweak c: like b, but the loop is rotated so the test sits at the bottom */
/* (one taken branch per iteration instead of two).                        */
Main_zdwcnt_info:
.LcYN:
cmpl $0,16(%ebp)
jle .LcYQ
movl 00(%ebp),%ecx	/* ecx = accumulator */
movl 12(%ebp),%eax	/* eax = index */
movl 16(%ebp),%edx	/* edx = remaining */
cmpl $0,%edx	/* guard before entering the bottom-tested loop */
jle .L_out
.L_again:
incl %ecx
incl %eax
decl %edx
cmpl $0,%edx	/* loop test at the bottom */
jg .L_again
.L_out:
movl %ecx,00(%ebp)
movl %eax,12(%ebp)
movl %edx,16(%ebp)
jmp Main_zdwcnt_info
.LcYQ:
movl (%ebp),%esi
addl $20,%ebp
jmp *(%ebp)
------------------------------
hand/byte-bs----acc-d.s:
/* Tweak d: identical to c, plus a 16-byte alignment of the hot loop head. */
Main_zdwcnt_info:
.LcYN:
cmpl $0,16(%ebp)
jle .LcYQ
movl 00(%ebp),%ecx	/* ecx = accumulator */
movl 12(%ebp),%eax	/* eax = index */
movl 16(%ebp),%edx	/* edx = remaining */
cmpl $0,%edx
jle .L_out
.align 16	/* align the hot loop entry for the fetch/decode units */
.L_again:
incl %ecx
incl %eax
decl %edx
cmpl $0,%edx
jg .L_again
.L_out:
movl %ecx,00(%ebp)
movl %eax,12(%ebp)
movl %edx,16(%ebp)
jmp Main_zdwcnt_info
.LcYQ:
movl (%ebp),%esi
addl $20,%ebp
jmp *(%ebp)
------------------------------
hand/space-bs-c8-acc-1-a.s:
/* Tweak a: instruction-for-instruction the same as the GHC original; only */
/* the basic blocks were reordered (non-space path before the exit block). */
Main_zdwcnt_info:
.Lc16w:
cmpl $0,16(%ebp)	/* bytes remaining */
jle .Lc16z
movl 4(%ebp),%eax	/* string base pointer */
movl 12(%ebp),%ecx	/* index */
movzbl (%eax,%ecx,1),%eax	/* load current byte */
cmpl $32,%eax	/* space? */
jne .Lc16H
movl 12(%ebp),%eax	/* space path: bump index and count */
incl %eax
movl (%ebp),%ecx
incl %ecx
subl $1,16(%ebp)
movl %eax,12(%ebp)
movl %ecx,(%ebp)
jmp Main_zdwcnt_info
.Lc16H:	/* non-space path: bump index only */
movl 12(%ebp),%eax
incl %eax
subl $1,16(%ebp)
movl %eax,12(%ebp)
jmp Main_zdwcnt_info
.Lc16z:	/* done: count in %esi, pop frame, enter continuation */
movl (%ebp),%esi
addl $20,%ebp
jmp *(%ebp)
------------------------------
hand/space-bs-c8-acc-1-b.s:
/* Tweak b: branch sense inverted (je instead of jne), so the non-space    */
/* case falls through and the space case is the out-of-line block.         */
Main_zdwcnt_info:
.Lc16w:
cmpl $0,16(%ebp)
jle .Lc16z
movl 4(%ebp),%eax	/* base pointer */
movl 12(%ebp),%ecx	/* index */
movzbl (%eax,%ecx,1),%eax
cmpl $32,%eax
je .Lc16H	/* space jumps away; non-space falls through */
movl 12(%ebp),%eax	/* non-space: bump index only */
incl %eax
subl $1,16(%ebp)
movl %eax,12(%ebp)
jmp Main_zdwcnt_info
.Lc16H:	/* space: bump index and count */
movl 12(%ebp),%eax
incl %eax
movl (%ebp),%ecx
incl %ecx
subl $1,16(%ebp)
movl %eax,12(%ebp)
movl %ecx,(%ebp)
jmp Main_zdwcnt_info
.Lc16z:
movl (%ebp),%esi
addl $20,%ebp
jmp *(%ebp)
------------------------------
hand/space-bs-c8-acc-1-c.s:
/* Tweak c: same work as a, but loads/increments/stores are regrouped so   */
/* each value is updated immediately after being loaded.                   */
Main_zdwcnt_info:
.Lc16w:
cmpl $0,16(%ebp)
jle .Lc16z
movl 4(%ebp),%eax	/* base pointer */
movl 12(%ebp),%ecx	/* index */
movzbl (%eax,%ecx,1),%eax
cmpl $32,%eax
jne .Lc16H
movl (%ebp),%ecx	/* space: count+1, index+1 */
incl %ecx
movl 12(%ebp),%eax
incl %eax
movl %ecx,(%ebp)
movl %eax,12(%ebp)
subl $1,16(%ebp)
jmp Main_zdwcnt_info
.Lc16z:
movl (%ebp),%esi
addl $20,%ebp
jmp *(%ebp)
.Lc16H:	/* non-space: index+1 only */
movl 12(%ebp),%eax
incl %eax
movl %eax,12(%ebp)
subl $1,16(%ebp)
jmp Main_zdwcnt_info
------------------------------
hand/space-bs-c8-acc-1-d.s:
/* Tweak d: read-modify-write directly in memory (addl/subl on the stack   */
/* slots) instead of going through registers.                              */
Main_zdwcnt_info:
.Lc16w:
cmpl $0,16(%ebp)
jle .Lc16z
movl 4(%ebp),%eax	/* base pointer */
movl 12(%ebp),%ecx	/* index */
movzbl (%eax,%ecx,1),%eax
cmpl $32,%eax
jne .Lc16H
addl $1,(%ebp)	/* space: count, index, remaining all updated in memory */
addl $1,12(%ebp)
subl $1,16(%ebp)
jmp Main_zdwcnt_info
.Lc16z:
movl (%ebp),%esi
addl $20,%ebp
jmp *(%ebp)
.Lc16H:	/* non-space: index and remaining only */
addl $1,12(%ebp)
subl $1,16(%ebp)
jmp Main_zdwcnt_info
------------------------------
hand/space-bs-c8-acc-1-e.s:
/* Tweak e: reuse %ecx, which already holds the index from the byte load,  */
/* instead of re-reading 12(%ebp).                                         */
Main_zdwcnt_info:
.Lc16w:
cmpl $0,16(%ebp)
jle .Lc16z
movl 4(%ebp),%eax	/* base pointer */
movl 12(%ebp),%ecx	/* index (kept live into both paths below) */
movzbl (%eax,%ecx,1),%eax
cmpl $32,%eax
jne .Lc16H
/* NOTE(review): the next two instructions are dead -- %eax is overwritten */
/* two instructions later before being read; presumably a leftover from    */
/* the previous version (removed in tweak f).                               */
movl 12(%ebp),%eax
incl %eax
incl %ecx	/* index+1 (in the register loaded above) */
movl (%ebp),%eax	/* count+1 */
incl %eax
subl $1,16(%ebp)
movl %ecx,12(%ebp)
movl %eax,(%ebp)
jmp Main_zdwcnt_info
.Lc16z:
movl (%ebp),%esi
addl $20,%ebp
jmp *(%ebp)
.Lc16H:	/* non-space: index+1 via the already-loaded %ecx */
incl %ecx
subl $1,16(%ebp)
movl %ecx,12(%ebp)
jmp Main_zdwcnt_info
------------------------------
hand/space-bs-c8-acc-1-f.s:
/* Tweak f: tweak e with the dead instructions removed; the count is now   */
/* bumped directly in memory (addl $1,(%ebp)).                             */
Main_zdwcnt_info:
.Lc16w:
cmpl $0,16(%ebp)
jle .Lc16z
movl 4(%ebp),%eax	/* base pointer */
movl 12(%ebp),%ecx	/* index */
movzbl (%eax,%ecx,1),%eax
cmpl $32,%eax
jne .Lc16H
incl %ecx	/* space: index+1, remaining-1, count+1 in memory */
subl $1,16(%ebp)
addl $1,(%ebp)
movl %ecx,12(%ebp)
jmp Main_zdwcnt_info
.Lc16z:
movl (%ebp),%esi
addl $20,%ebp
jmp *(%ebp)
.Lc16H:	/* non-space: index+1, remaining-1 */
incl %ecx
subl $1,16(%ebp)
movl %ecx,12(%ebp)
jmp Main_zdwcnt_info
------------------------------
hand/space-bs-c8-acc-1-g.s:
/* Tweak g: the space count now lives in %esi for the whole loop (loaded   */
/* once at entry; %esi is also the return register, so no write-back is    */
/* needed).  The loop jumps to .Lc16w instead of re-entering via the       */
/* function label.                                                          */
Main_zdwcnt_info:
movl (%ebp),%esi	/* esi = running space count */
.Lc16w:
cmpl $0,16(%ebp)
jle .Lc16z
movl 4(%ebp),%eax	/* base pointer (still reloaded each iteration) */
movl 12(%ebp),%ecx	/* index */
movzbl (%eax,%ecx,1),%eax
cmpl $32,%eax
jne .Lc16H
incl %ecx	/* space: index+1, remaining-1, count+1 (in %esi) */
subl $1,16(%ebp)
inc %esi
movl %ecx,12(%ebp)
jmp .Lc16w
.Lc16z:
addl $20,%ebp	/* count already in %esi */
jmp *(%ebp)
.Lc16H:	/* non-space */
incl %ecx
subl $1,16(%ebp)
movl %ecx,12(%ebp)
jmp .Lc16w
------------------------------
hand/space-bs-c8-acc-1-h.s:
/* Tweak h: the index also moves into a register (%ecx) for the whole      */
/* loop.  It is never written back -- the frame is popped at exit anyway.  */
Main_zdwcnt_info:
movl (%ebp),%esi	/* esi = space count */
movl 12(%ebp),%ecx	/* ecx = index, loaded once */
.Lc16w:
cmpl $0,16(%ebp)	/* remaining still decremented in memory */
jle .Lc16z
movl 4(%ebp),%eax	/* base pointer */
movzbl (%eax,%ecx,1),%eax
cmpl $32,%eax
jne .Lc16H
incl %ecx
subl $1,16(%ebp)
inc %esi
jmp .Lc16w
.Lc16z:
addl $20,%ebp
jmp *(%ebp)
.Lc16H:
incl %ecx
subl $1,16(%ebp)
jmp .Lc16w
------------------------------
hand/space-bs-c8-acc-1-i.s:
/* Tweak i: the remaining-bytes counter also moves into a register (%edx); */
/* the loop now touches memory only for the byte load itself (and the      */
/* base-pointer reload).                                                    */
Main_zdwcnt_info:
movl (%ebp),%esi	/* esi = space count */
movl 12(%ebp),%ecx	/* ecx = index */
movl 16(%ebp),%edx	/* edx = bytes remaining */
.Lc16w:
cmpl $0,%edx
jle .Lc16z
movl 4(%ebp),%eax	/* base pointer (still reloaded) */
movzbl (%eax,%ecx,1),%eax
cmpl $32,%eax
jne .Lc16H
incl %ecx
decl %edx
inc %esi
jmp .Lc16w
.Lc16z:
addl $20,%ebp
jmp *(%ebp)
.Lc16H:
incl %ecx
decl %edx
jmp .Lc16w
------------------------------
hand/space-bs-c8-acc-1-j.s:
/* Tweak j: base pointer and index fused into one roving pointer (%ecx) -- */
/* the hoisting step the cover letter warns may upset the GC, since the    */
/* stack would then hold an interior pointer.                               */
Main_zdwcnt_info:
movl (%ebp),%esi	/* esi = space count */
movl 4(%ebp),%ecx	/* ecx = base pointer ... */
addl 12(%ebp),%ecx	/* ... + index = current byte address */
movl 16(%ebp),%edx	/* edx = bytes remaining */
.Lc16w:
cmpl $0,%edx
jle .Lc16z
movzbl (%ecx),%eax	/* no indexed addressing needed any more */
cmpl $32,%eax
jne .Lc16H
incl %ecx
decl %edx
inc %esi
jmp .Lc16w
.Lc16z:
addl $20,%ebp
jmp *(%ebp)
.Lc16H:
incl %ecx
decl %edx
jmp .Lc16w
------------------------------
hand/space-bs-c8-acc-1-k.s:
/* Tweak k: identical instructions to j; only the block layout changes     */
/* (the exit block .Lc16z moved below the non-space block).                */
Main_zdwcnt_info:
movl (%ebp),%esi	/* esi = space count */
movl 4(%ebp),%ecx	/* ecx = base + index (fused pointer) */
addl 12(%ebp),%ecx
movl 16(%ebp),%edx	/* edx = bytes remaining */
.Lc16w:
cmpl $0,%edx
jle .Lc16z
movzbl (%ecx),%eax
cmpl $32,%eax
jne .Lc16H
incl %ecx
decl %edx
inc %esi
jmp .Lc16w
.Lc16H:
incl %ecx
decl %edx
jmp .Lc16w
.Lc16z:	/* exit block now placed last */
addl $20,%ebp
jmp *(%ebp)
------------------------------
hand/space-bs-c8-acc-1-l.s:
/* Tweak l: the pointer/counter updates are hoisted above the space test   */
/* so both paths share them.  Note .Lc16H is now an empty block that just  */
/* jumps back -- removed in tweak m.                                        */
Main_zdwcnt_info:
movl (%ebp),%esi	/* esi = space count */
movl 4(%ebp),%ecx	/* ecx = base + index */
addl 12(%ebp),%ecx
movl 16(%ebp),%edx	/* edx = bytes remaining */
.Lc16w:
cmpl $0,%edx
jle .Lc16z
movzbl (%ecx),%eax
incl %ecx	/* advance unconditionally, before the test */
decl %edx
cmpl $32,%eax
jne .Lc16H
inc %esi	/* only the count bump is conditional now */
jmp .Lc16w
.Lc16H:	/* redundant trampoline; kept as an intermediate step */
jmp .Lc16w
.Lc16z:
addl $20,%ebp
jmp *(%ebp)
------------------------------
hand/space-bs-c8-acc-1-m.s:
/* Tweak m: tweak l with the redundant trampoline removed -- the non-space */
/* branch targets the loop head directly.                                  */
Main_zdwcnt_info:
movl (%ebp),%esi	/* esi = space count */
movl 4(%ebp),%ecx	/* ecx = base + index */
addl 12(%ebp),%ecx
movl 16(%ebp),%edx	/* edx = bytes remaining */
.Lc16w:
cmpl $0,%edx
jle .Lc16z
movzbl (%ecx),%eax
incl %ecx
decl %edx
cmpl $32,%eax
jne .Lc16w	/* non-space: straight back to the top */
inc %esi
jmp .Lc16w
.Lc16z:
addl $20,%ebp
jmp *(%ebp)
------------------------------
hand/space-bs-c8-acc-1-n.s:
/* Tweak n: after counting a space, test-and-loop at the bottom            */
/* (.Lc16xx entry) instead of jumping back through the top test.           */
Main_zdwcnt_info:
movl (%ebp),%esi	/* esi = space count */
movl 4(%ebp),%ecx	/* ecx = base + index */
addl 12(%ebp),%ecx
movl 16(%ebp),%edx	/* edx = bytes remaining */
.Lc16w:
cmpl $0,%edx
jle .Lc16z
.Lc16xx:	/* loop body entry, reached only with %edx > 0 */
movzbl (%ecx),%eax
incl %ecx
decl %edx
cmpl $32,%eax
jne .Lc16w	/* non-space: recheck at the top */
inc %esi
cmpl $0,%edx	/* space path: bottom test avoids the extra jump */
jg .Lc16xx
.Lc16z:
addl $20,%ebp
jmp *(%ebp)
------------------------------
hand/space-bs-c8-acc-1-o.s:
/* Tweak o: the byte loop body from n, unrolled twice; each copy keeps its */
/* own zero test, so no byte is read past the end.                         */
Main_zdwcnt_info:
movl (%ebp),%esi	/* esi = space count */
movl 4(%ebp),%ecx	/* ecx = base + index */
addl 12(%ebp),%ecx
movl 16(%ebp),%edx	/* edx = bytes remaining */
.Lc16w:
cmpl $0,%edx
jle .Lc16z
.Lc16xx:	/* first unrolled copy */
movzbl (%ecx),%eax
incl %ecx
decl %edx
cmpl $32,%eax
jne .Lc16w
inc %esi
cmpl $0,%edx	/* re-test before the second copy */
jle .Lc16z
movzbl (%ecx),%eax	/* second unrolled copy */
incl %ecx
decl %edx
cmpl $32,%eax
jne .Lc16w
inc %esi
cmpl $0,%edx
jg .Lc16xx
.Lc16z:
addl $20,%ebp
jmp *(%ebp)
------------------------------
hand/space-bs-c8-acc-1-p.s:
/* Tweak p: process four bytes per iteration by loading a whole dword and  */
/* testing al/ah twice (shifting the upper half down); a byte-at-a-time    */
/* loop handles the remaining 0-3 bytes.                                    */
/* State: %esi = space count, %ecx = base+index (fused), %edx = bytes left. */
Main_zdwcnt_info:
movl (%ebp),%esi
movl 4(%ebp),%ecx
addl 12(%ebp),%ecx
movl 16(%ebp),%edx
.Lc16w4:
cmpl $4,%edx
jl .Lc16w1	/* BUGFIX: was "jl .Lc16wxx", which bypassed the zero test -- */
	/* with %edx == 0 the old code read one byte past the buffer   */
	/* and counted it if it happened to be a space.                 */
movl (%ecx),%eax	/* load 4 chars (unaligned dword loads are legal on x86) */
addl $4,%ecx
subl $4,%edx
cmpb $32,%al	/* test bytes 0 and 1 in al/ah */
jne .Lc16wa
incl %esi
.Lc16wa:
cmpb $32,%ah
jne .Lc16wb
incl %esi
.Lc16wb:
shrl $16,%eax	/* bring bytes 2 and 3 down into al/ah */
cmpb $32,%al
jne .Lc16wc
incl %esi
.Lc16wc:
cmpb $32,%ah
jne .Lc16w4
incl %esi
jmp .Lc16w4
.Lc16w1:	/* byte-at-a-time remainder loop (0-3 bytes) */
cmpl $0,%edx
jle .Lc16z
.Lc16wxx:
movzbl (%ecx),%eax
incl %ecx
decl %edx
cmpl $32,%eax
jne .Lc16w1
inc %esi
jmp .Lc16w1
.Lc16z:	/* count already in %esi; pop frame and enter continuation */
addl $20,%ebp
jmp *(%ebp)
------------------------------
hand/space-bs-c8-acc-1-q.s:
/* Tweak q: MMX version.  pcmpeqb compares 8 bytes at once against ' ';    */
/* the 00/FF lanes are masked to 00/01 and summed into %esi with scalar    */
/* al/ah adds.  Each al+ah pair sums two 0/1 bytes, so it is at most 2 and */
/* the $0x03 mask preserves it.                                             */
Main_zdwcnt_info:
movl (%ebp),%esi /* #spaces found */
movl 4(%ebp),%ecx /* ptr */
addl 12(%ebp),%ecx /* ... + idx */
movl 16(%ebp),%edx /* cnt of remaining bytes */
emms /* clear fp tags so we can use mmx instrs */
mov $0x20202020,%eax
movd %eax,%mm1 /* mm1: 0000000020202020 */
movq %mm1,%mm0 /* mm0: 0000000020202020 */
psllq $32,%mm1 /* mm1: 2020202000000000 */
por %mm0,%mm1 /* mm1: 2020202020202020 */
mov $0x01010101,%eax
movd %eax,%mm2 /* mm2: 0000000001010101 */
movq %mm2,%mm0 /* mm0: 0000000001010101 */
psllq $32,%mm2 /* mm2: 0101010100000000 */
por %mm0,%mm2 /* mm2: 0101010101010101 */
/* MMX loads can use any alignment (potentially at a speed-hit) */
/* this loop looks at 8 bytes at a time */
.Lc16w8:
cmpl $8,%edx
jl .Lc16w1
movq (%ecx),%mm0 /* mm0 holds 8 characters */
addl $8,%ecx
subl $8,%edx
pcmpeqb %mm1,%mm0 /* cmp byte for byte with ' ' */
/* the result flag is 00 or FF */
pand %mm2,%mm0 /* turn FF into 01, which is actually useful */
/* if we could just add the bytes up horizontally in %mm0, sigh.. .*/
movd %mm0,%eax /* sum low dword's four 0/1 bytes, two at a time */
push %eax
add %ah, %al /* al+ah <= 2 here */
and $0x03,%eax /* keep the 2-bit sum, clear the stale ah bits */
add %eax,%esi
pop %eax
shr $16,%eax /* now the upper two bytes of the low dword */
add %ah,%al
and $0x03,%eax
add %eax,%esi
psrlq $32,%mm0 /* repeat for the high dword */
movd %mm0,%eax
push %eax
add %ah, %al
and $0x03,%eax
add %eax,%esi
pop %eax
shr $16,%eax
add %ah,%al
and $0x03,%eax
add %eax,%esi
jmp .Lc16w8
/* this loop looks at one byte at a time to handle the remainder */
.Lc16w1:
cmpl $0,%edx
jle .Lc16z
movzbl (%ecx),%eax
incl %ecx
decl %edx
cmpl $32,%eax
jne .Lc16w1
inc %esi
jmp .Lc16w1
/* done, remember to clear fp/mmx tags with emms */
.Lc16z:
emms
addl $20,%ebp
jmp *(%ebp)
------------------------------
hand/space-bs-c8-acc-1-r.s:
/* Tweak r: like q, but paddb accumulates 8 per-lane byte counters in mm3  */
/* across up to 127 iterations (1016 bytes) before one horizontal sum,     */
/* moving almost all scalar work out of the hot loop.                      */
Main_zdwcnt_info:
movl (%ebp),%esi /* #spaces found */
movl 4(%ebp),%ecx /* ptr */
addl 12(%ebp),%ecx /* ... + idx */
movl 16(%ebp),%edx /* cnt of remaining bytes */
emms /* clear fp tags so we can use mmx instrs */
mov $0x20202020,%eax
movd %eax,%mm1 /* mm1: 0000000020202020 */
movq %mm1,%mm0 /* mm0: 0000000020202020 */
psllq $32,%mm1 /* mm1: 2020202000000000 */
por %mm0,%mm1 /* mm1: 2020202020202020 */
mov $0x01010101,%eax
movd %eax,%mm2 /* mm2: 0000000001010101 */
movq %mm2,%mm0 /* mm0: 0000000001010101 */
psllq $32,%mm2 /* mm2: 0101010100000000 */
por %mm0,%mm2 /* mm2: 0101010101010101 */
/* MMX loads can use any alignment (potentially at a speed-hit) */
/* therefore we don't have to try to read 1-7 bytes one at a time */
/* first in order to end up with an aligned %ecx. */
.Lc16_mainloop:
cmpl $8,%edx
jl .Lc16w1
movl %edx,%eax /* eax = min(edx/8, 127) = # of 8-byte chunks this round */
shr $3,%eax
cmpl $127,%eax
jle .Lc16_127
movl $127,%eax
.Lc16_127:
shl $3,%eax /* subtract the bytes this round will consume... */
sub %eax,%edx
shr $3,%eax /* ...and restore the chunk count */
pxor %mm3,%mm3 /* clear block of space counters */
/* loop up to 127 times in a loop that looks at 8 bytes at a time. */
/* Going above 255 could overflow the 8 counters in mm3. */
/* Going above 127 could overflow the horizontal summation code. */
.Lc16w8:
cmpl $0,%eax
jle .Lc16w8end
movq (%ecx),%mm0 /* mm0 holds 8 characters */
addl $8,%ecx
decl %eax
pcmpeqb %mm1,%mm0 /* cmp byte for byte with ' ' */
/* the result flag is 00 or FF */
pand %mm2,%mm0 /* turn FF into 01, which is actually useful */
paddb %mm0,%mm3 /* add to the 8 space counters */
jmp .Lc16w8
.Lc16w8end:
/* sum the 8 space counters in mm3 and add to %esi */
/* if only MMX had horizontal byte adds... */
/* (with the 127-iteration cap each counter is <= 127, so al+ah <= 254 */
/* still fits in %al -- the overflow notes below are conservative)     */
movd %mm3,%eax
push %eax
add %ah, %al /* NOTE! potential overflow! */
and $0xFF,%eax /* keep only %al (the byte sum) */
add %eax,%esi
pop %eax
shr $16,%eax
add %ah,%al /* NOTE! potential overflow! */
and $0xFF,%eax
add %eax,%esi
psrlq $32,%mm3
movd %mm3,%eax
push %eax
add %ah, %al /* NOTE! potential overflow! */
and $0xFF,%eax
add %eax,%esi
pop %eax
shr $16,%eax
add %ah,%al /* NOTE! potential overflow! */
and $0xFF,%eax
add %eax,%esi
jmp .Lc16_mainloop
/* this loop looks at one byte at a time to handle the remainder */
.Lc16w1:
cmpl $0,%edx
jle .Lc16z
movzbl (%ecx),%eax
incl %ecx
decl %edx
cmpl $32,%eax
jne .Lc16w1
inc %esi
jmp .Lc16w1
/* done, remember to clear fp/mmx tags with emms */
.Lc16z:
emms
addl $20,%ebp
jmp *(%ebp)
------------------------------
hand/space-bs-c8-acc-1-s.s:
/* Tweak s: tweak r with the 8-byte MMX loop unrolled twice.  An odd chunk */
/* count is handled by rounding %eax up and entering at the second copy    */
/* (.Lc16w8x), so exactly the original number of chunks is processed.      */
Main_zdwcnt_info:
movl (%ebp),%esi /* #spaces found */
movl 4(%ebp),%ecx /* ptr */
addl 12(%ebp),%ecx /* ... + idx */
movl 16(%ebp),%edx /* cnt of remaining bytes */
emms /* clear fp tags so we can use mmx instrs */
mov $0x20202020,%eax
movd %eax,%mm1 /* mm1: 0000000020202020 */
movq %mm1,%mm0 /* mm0: 0000000020202020 */
psllq $32,%mm1 /* mm1: 2020202000000000 */
por %mm0,%mm1 /* mm1: 2020202020202020 */
mov $0x01010101,%eax
movd %eax,%mm2 /* mm2: 0000000001010101 */
movq %mm2,%mm0 /* mm0: 0000000001010101 */
psllq $32,%mm2 /* mm2: 0101010100000000 */
por %mm0,%mm2 /* mm2: 0101010101010101 */
/* MMX loads can use any alignment (potentially at a speed-hit) */
/* therefore we don't have to try to read 1-7 bytes one at a time */
/* first in order to end up with an aligned %ecx. */
.Lc16_mainloop:
cmpl $8,%edx
jl .Lc16w1
movl %edx,%eax /* eax = min(edx/8, 127) chunks this round */
shr $3,%eax
cmpl $127,%eax
jle .Lc16_127
movl $127,%eax
.Lc16_127:
shl $3,%eax
sub %eax,%edx
shr $3,%eax
pxor %mm3,%mm3 /* clear block of space counters */
/* loop up to 127 times in a loop that looks at 8 bytes at a time. */
/* Going above 255 could overflow the 8 counters in mm3. */
/* Going above 127 could overflow the horizontal summation code. */
cmpl $0,%eax
jle .Lc16w8end
/* this is an unspeakably ugly and sloppy loop unrolling. Doesn't */
/* seem to help much on an Athlon64 3000+. */
test $1,%eax /* odd chunk count? */
jz .Lc16w8
incl %eax /* round up and skip the first copy, so the */
jmp .Lc16w8x /* subl $2 bookkeeping below stays correct */
.Lc16w8:
movq (%ecx),%mm0 /* mm0 holds 8 characters */
addl $8,%ecx
pcmpeqb %mm1,%mm0 /* cmp byte for byte with ' ' */
/* the result flag is 00 or FF */
pand %mm2,%mm0 /* turn FF into 01, which is actually useful */
paddb %mm0,%mm3 /* add to the 8 space counters */
.Lc16w8x:
movq (%ecx),%mm0 /* mm0 holds 8 characters */
addl $8,%ecx
pcmpeqb %mm1,%mm0 /* cmp byte for byte with ' ' */
/* the result flag is 00 or FF */
pand %mm2,%mm0 /* turn FF into 01, which is actually useful */
paddb %mm0,%mm3 /* add to the 8 space counters */
subl $2,%eax /* two chunks per trip */
jnz .Lc16w8
.Lc16w8end:
/* sum the 8 space counters in mm3 and add to %esi */
/* if only MMX had horizontal byte adds... */
movd %mm3,%eax
push %eax
add %ah, %al /* NOTE! potential overflow! */
and $0xFF,%eax /* keep only %al, the byte sum */
add %eax,%esi
pop %eax
shr $16,%eax
add %ah,%al /* NOTE! potential overflow! */
and $0xFF,%eax
add %eax,%esi
psrlq $32,%mm3
movd %mm3,%eax
push %eax
add %ah, %al /* NOTE! potential overflow! */
and $0xFF,%eax
add %eax,%esi
pop %eax
shr $16,%eax
add %ah,%al /* NOTE! potential overflow! */
and $0xFF,%eax
add %eax,%esi
jmp .Lc16_mainloop
/* this loop looks at one byte at a time to handle the remainder */
.Lc16w1:
cmpl $0,%edx
jle .Lc16z
movzbl (%ecx),%eax
incl %ecx
decl %edx
cmpl $32,%eax
jne .Lc16w1
inc %esi
jmp .Lc16w1
/* done, remember to clear fp/mmx tags with emms */
.Lc16z:
emms
addl $20,%ebp
jmp *(%ebp)
==============================
More information about the Haskell-Cafe
mailing list