[Haskell-cafe] [2/16] SBM: Inner loops of the hand-tweaked assembly benchmarks

Peter Firefly Brodersen Lund firefly at vax64.dk
Sat Dec 22 04:16:56 EST 2007


I've taken the two benchmarks byte-bs----acc and space-bs-c8-acc-1 and
gradually tweaked their inner loops from code that goes through memory on
every step to code that keeps more and more of its state in registers,
pretty much one register at a time.  Along the way, I've also done a simple
bit of common-subexpression elimination and loop-invariant hoisting: I
combined the pointer to the start of the string and the index into the
string into a single pointer.  Doing this in real life may cause bad
problems with the garbage collector.
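
To make the progression easier to follow without reading all the assembly,
here is roughly the same transformation sketched in C for the space-counting
loop.  The names are mine and this is only an illustration, not the benchmark
code; the real loops work on GHC's stack slots rather than on a struct, and a
C compiler would of course register-allocate both versions anyway.

#include <stddef.h>

/* Roughly what the original inner loop does: every iteration re-reads and
   re-writes the accumulator, the index and the remaining count in memory,
   and recomputes base+index for the byte load. */
int count_spaces_memory(const unsigned char *base, size_t len)
{
    struct { size_t idx; size_t left; int acc; } st = { 0, len, 0 };

    while (st.left > 0) {
        if (base[st.idx] == ' ')
            st.acc += 1;
        st.idx  += 1;
        st.left -= 1;
    }
    return st.acc;
}

/* After the hand-tweaking: all three values live in locals (registers), and
   the base pointer and the index have been folded into one moving pointer -
   the interior pointer that a garbage collector might not appreciate. */
int count_spaces_registers(const unsigned char *base, size_t len)
{
    const unsigned char *p = base;   /* base + idx combined    */
    size_t left = len;               /* remaining byte count   */
    int acc = 0;                     /* spaces seen so far     */

    while (left > 0) {
        if (*p == ' ')
            acc += 1;
        p++;
        left--;
    }
    return acc;
}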

At the end, I go a bit mad and start doing heroic optimizations: reading four
bytes at a time, using MMX registers to read eight bytes at a time, twisted
MMX arithmetic to keep eight per-byte space counters in one MMX register, and
a bit of loop unrolling.
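
For the curious, the 8-bytes-at-a-time space counting in the -q/-r/-s
versions boils down to the following C sketch using MMX intrinsics
(<mmintrin.h>).  The function name and structure are mine, not taken from the
benchmarks, and the horizontal sum is done the boring way instead of with the
ah/al trick; it should build with something like gcc -O2 -mmmx.

#include <mmintrin.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

size_t count_spaces_mmx(const unsigned char *p, size_t len)
{
    const __m64 spaces = _mm_set1_pi8(' ');   /* 0x2020202020202020 */
    const __m64 ones   = _mm_set1_pi8(1);     /* 0x0101010101010101 */
    size_t total = 0;

    while (len >= 8) {
        /* Spill the eight byte-sized counters often enough that none of
           them can overflow; the assembly caps the block count at 127
           because of its ah/al summation trick. */
        size_t blocks = len / 8;
        if (blocks > 127)
            blocks = 127;
        len -= blocks * 8;

        __m64 counters = _mm_setzero_si64();  /* 8 per-byte space counters */
        for (size_t i = 0; i < blocks; i++) {
            __m64 chunk;
            memcpy(&chunk, p, 8);             /* unaligned 8-byte load     */
            p += 8;
            /* 0xFF where the byte == ' ', 0x00 elsewhere, then turned
               into 0x01/0x00 and added to the counters (paddb). */
            __m64 eq = _mm_cmpeq_pi8(chunk, spaces);
            counters = _mm_add_pi8(counters, _mm_and_si64(eq, ones));
        }

        /* Horizontal sum of the eight byte-sized counters. */
        uint64_t c;
        memcpy(&c, &counters, 8);
        for (int i = 0; i < 8; i++)
            total += (c >> (8 * i)) & 0xff;
    }

    for (; len > 0; len--, p++)               /* 0-7 leftover bytes        */
        if (*p == ' ')
            total++;

    _mm_empty();                              /* emms: release FP/MMX state */
    return total;
}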

First come the two original inner loops, then the 23 hand-tweaked versions.

I used the following shell code to isolate the inner loops:

 (for F in hs/byte-bs----acc.s hs/space-bs-c8-acc-1.s hand/*.s ; \
        do echo "------------------------------"; \
           echo "$F:";                            \
           echo ;                                 \
           cat "$F" | perl -e 'while(<>){ if (/Main_zdwcnt_info:/ .. /.section .data/) { print; }}' | head -n-1;                         \
        done; \
           echo "=============================="; \
 ) > xx.txt


-Peter

------------------------------
hs/byte-bs----acc.s:

Main_zdwcnt_info:
.LcYL:
	cmpl $0,16(%ebp)	/* any bytes left?          */
	jle .LcYO
	movl 12(%ebp),%eax	/* idx                      */
	incl %eax
	movl (%ebp),%ecx	/* accumulator (byte count) */
	incl %ecx
	subl $1,16(%ebp)	/* remaining byte count     */
	movl %eax,12(%ebp)
	movl %ecx,(%ebp)
	jmp Main_zdwcnt_info
.LcYO:
	movl (%ebp),%esi	/* result goes in %esi      */
	addl $20,%ebp		/* pop our five stack slots */
	jmp *(%ebp)		/* return                   */
------------------------------
hs/space-bs-c8-acc-1.s:

Main_zdwcnt_info:
.Lc16u:
	cmpl $0,16(%ebp)	/* any bytes left?           */
	jle .Lc16x
	movl 4(%ebp),%eax	/* ptr to start of string    */
	movl 12(%ebp),%ecx	/* idx                       */
	movzbl (%eax,%ecx,1),%eax
	cmpl $32,%eax		/* is the byte a space?      */
	jne .Lc16F
	movl 12(%ebp),%eax	/* space: bump idx...        */
	incl %eax
	movl (%ebp),%ecx	/* ...and the space count    */
	incl %ecx
	subl $1,16(%ebp)	/* remaining byte count      */
	movl %eax,12(%ebp)
	movl %ecx,(%ebp)
	jmp Main_zdwcnt_info
.Lc16x:
	movl (%ebp),%esi	/* result goes in %esi       */
	addl $20,%ebp		/* pop our five stack slots  */
	jmp *(%ebp)		/* return                    */
.Lc16F:
	movl 12(%ebp),%eax	/* non-space: bump idx only  */
	incl %eax
	subl $1,16(%ebp)
	movl %eax,12(%ebp)
	jmp Main_zdwcnt_info
------------------------------
hand/byte-bs----acc-a.s:

Main_zdwcnt_info:
.LcYN:
	cmpl $0,16(%ebp)
	jle .LcYQ

	movl 00(%ebp),%ecx
	movl 12(%ebp),%eax
	movl 16(%ebp),%edx

	incl %ecx
	incl %eax
	decl %edx

	movl %ecx,00(%ebp)
	movl %eax,12(%ebp)
	movl %edx,16(%ebp)
	jmp Main_zdwcnt_info

.LcYQ:
	movl (%ebp),%esi
	addl $20,%ebp
	jmp *(%ebp)
------------------------------
hand/byte-bs----acc-b.s:

Main_zdwcnt_info:
.LcYN:
	cmpl $0,16(%ebp)
	jle .LcYQ

	movl 00(%ebp),%ecx
	movl 12(%ebp),%eax
	movl 16(%ebp),%edx

.L_again:
	cmpl $0,%edx
	jle  .L_out
	incl %ecx
	incl %eax
	decl %edx
	jmp  .L_again
.L_out:
	movl %ecx,00(%ebp)
	movl %eax,12(%ebp)
	movl %edx,16(%ebp)
	jmp Main_zdwcnt_info

.LcYQ:
	movl (%ebp),%esi
	addl $20,%ebp
	jmp *(%ebp)
------------------------------
hand/byte-bs----acc-c.s:

Main_zdwcnt_info:
.LcYN:
	cmpl $0,16(%ebp)
	jle .LcYQ

	movl 00(%ebp),%ecx
	movl 12(%ebp),%eax
	movl 16(%ebp),%edx

	cmpl $0,%edx
	jle  .L_out
.L_again:
	incl %ecx
	incl %eax
	decl %edx
	cmpl $0,%edx
	jg   .L_again

.L_out:
	movl %ecx,00(%ebp)
	movl %eax,12(%ebp)
	movl %edx,16(%ebp)
	jmp Main_zdwcnt_info

.LcYQ:
	movl (%ebp),%esi
	addl $20,%ebp
	jmp *(%ebp)
------------------------------
hand/byte-bs----acc-d.s:

Main_zdwcnt_info:
.LcYN:
	cmpl $0,16(%ebp)
	jle .LcYQ

	movl 00(%ebp),%ecx
	movl 12(%ebp),%eax
	movl 16(%ebp),%edx

	cmpl $0,%edx
	jle  .L_out
	.align 16
.L_again:
	incl %ecx
	incl %eax
	decl %edx
	cmpl $0,%edx
	jg   .L_again

.L_out:
	movl %ecx,00(%ebp)
	movl %eax,12(%ebp)
	movl %edx,16(%ebp)
	jmp Main_zdwcnt_info

.LcYQ:
	movl (%ebp),%esi
	addl $20,%ebp
	jmp *(%ebp)
------------------------------
hand/space-bs-c8-acc-1-a.s:

Main_zdwcnt_info:

.Lc16w:
	cmpl $0,16(%ebp)
	jle .Lc16z
	movl 4(%ebp),%eax
	movl 12(%ebp),%ecx
	movzbl (%eax,%ecx,1),%eax
	cmpl $32,%eax
	jne .Lc16H
	movl 12(%ebp),%eax
	incl %eax
	movl (%ebp),%ecx
	incl %ecx
	subl $1,16(%ebp)
	movl %eax,12(%ebp)
	movl %ecx,(%ebp)
	jmp Main_zdwcnt_info
.Lc16H:
	movl 12(%ebp),%eax
	incl %eax
	subl $1,16(%ebp)
	movl %eax,12(%ebp)
	jmp Main_zdwcnt_info
.Lc16z:
	movl (%ebp),%esi
	addl $20,%ebp
	jmp *(%ebp)



------------------------------
hand/space-bs-c8-acc-1-b.s:

Main_zdwcnt_info:

.Lc16w:
	cmpl $0,16(%ebp)
	jle .Lc16z
	movl 4(%ebp),%eax
	movl 12(%ebp),%ecx
	movzbl (%eax,%ecx,1),%eax
	cmpl $32,%eax
	je .Lc16H

	movl 12(%ebp),%eax
	incl %eax
	subl $1,16(%ebp)
	movl %eax,12(%ebp)
	jmp Main_zdwcnt_info

.Lc16H:
	movl 12(%ebp),%eax
	incl %eax
	movl (%ebp),%ecx
	incl %ecx
	subl $1,16(%ebp)
	movl %eax,12(%ebp)
	movl %ecx,(%ebp)
	jmp Main_zdwcnt_info

.Lc16z:
	movl (%ebp),%esi
	addl $20,%ebp
	jmp *(%ebp)



------------------------------
hand/space-bs-c8-acc-1-c.s:

Main_zdwcnt_info:

.Lc16w:
	cmpl $0,16(%ebp)
	jle .Lc16z
	movl 4(%ebp),%eax
	movl 12(%ebp),%ecx
	movzbl (%eax,%ecx,1),%eax
	cmpl $32,%eax
	jne .Lc16H
	movl (%ebp),%ecx
	incl %ecx
	movl 12(%ebp),%eax
	incl %eax
	movl %ecx,(%ebp)
	movl %eax,12(%ebp)
	subl $1,16(%ebp)
	jmp Main_zdwcnt_info
.Lc16z:
	movl (%ebp),%esi
	addl $20,%ebp
	jmp *(%ebp)
.Lc16H:
	movl 12(%ebp),%eax
	incl %eax
	movl %eax,12(%ebp)
	subl $1,16(%ebp)
	jmp Main_zdwcnt_info



------------------------------
hand/space-bs-c8-acc-1-d.s:

Main_zdwcnt_info:

.Lc16w:
	cmpl $0,16(%ebp)
	jle .Lc16z
	movl 4(%ebp),%eax
	movl 12(%ebp),%ecx
	movzbl (%eax,%ecx,1),%eax
	cmpl $32,%eax
	jne .Lc16H

	addl $1,(%ebp)
	addl $1,12(%ebp)
	subl $1,16(%ebp)
	jmp Main_zdwcnt_info
.Lc16z:
	movl (%ebp),%esi
	addl $20,%ebp
	jmp *(%ebp)
.Lc16H:
	addl $1,12(%ebp)
	subl $1,16(%ebp)
	jmp Main_zdwcnt_info



------------------------------
hand/space-bs-c8-acc-1-e.s:

Main_zdwcnt_info:

.Lc16w:
	cmpl $0,16(%ebp)
	jle .Lc16z
	movl 4(%ebp),%eax
	movl 12(%ebp),%ecx
	movzbl (%eax,%ecx,1),%eax
	cmpl $32,%eax
	jne .Lc16H

	movl 12(%ebp),%eax
	incl %eax

	incl  %ecx
	movl (%ebp),%eax
	incl %eax
	subl $1,16(%ebp)
	movl %ecx,12(%ebp)
	movl %eax,(%ebp)
	jmp Main_zdwcnt_info
.Lc16z:
	movl (%ebp),%esi
	addl $20,%ebp
	jmp *(%ebp)
.Lc16H:
	incl %ecx
	subl $1,16(%ebp)
	movl %ecx,12(%ebp)
	jmp Main_zdwcnt_info



------------------------------
hand/space-bs-c8-acc-1-f.s:

Main_zdwcnt_info:

.Lc16w:
	cmpl $0,16(%ebp)
	jle .Lc16z
	movl 4(%ebp),%eax
	movl 12(%ebp),%ecx
	movzbl (%eax,%ecx,1),%eax
	cmpl $32,%eax
	jne .Lc16H

	incl  %ecx
	subl $1,16(%ebp)
	addl $1,(%ebp)
	movl %ecx,12(%ebp)
	jmp Main_zdwcnt_info
.Lc16z:
	movl (%ebp),%esi
	addl $20,%ebp
	jmp *(%ebp)
.Lc16H:
	incl %ecx
	subl $1,16(%ebp)
	movl %ecx,12(%ebp)
	jmp Main_zdwcnt_info



------------------------------
hand/space-bs-c8-acc-1-g.s:

Main_zdwcnt_info:
	movl (%ebp),%esi

.Lc16w:
	cmpl $0,16(%ebp)
	jle .Lc16z
	movl 4(%ebp),%eax
	movl 12(%ebp),%ecx
	movzbl (%eax,%ecx,1),%eax
	cmpl $32,%eax
	jne .Lc16H

	incl  %ecx
	subl $1,16(%ebp)
	inc  %esi
	movl %ecx,12(%ebp)
	jmp .Lc16w
.Lc16z:
	addl $20,%ebp
	jmp *(%ebp)
.Lc16H:
	incl %ecx
	subl $1,16(%ebp)
	movl %ecx,12(%ebp)
	jmp .Lc16w



------------------------------
hand/space-bs-c8-acc-1-h.s:

Main_zdwcnt_info:
	movl (%ebp),%esi
	movl 12(%ebp),%ecx

.Lc16w:
	cmpl $0,16(%ebp)
	jle .Lc16z
	movl 4(%ebp),%eax
	movzbl (%eax,%ecx,1),%eax
	cmpl $32,%eax
	jne .Lc16H

	incl  %ecx
	subl $1,16(%ebp)
	inc  %esi
	jmp .Lc16w
.Lc16z:
	addl $20,%ebp
	jmp *(%ebp)
.Lc16H:
	incl %ecx
	subl $1,16(%ebp)
	jmp .Lc16w



------------------------------
hand/space-bs-c8-acc-1-i.s:

Main_zdwcnt_info:
	movl (%ebp),%esi
	movl 12(%ebp),%ecx
	movl 16(%ebp),%edx

.Lc16w:
	cmpl $0,%edx
	jle .Lc16z
	movl 4(%ebp),%eax
	movzbl (%eax,%ecx,1),%eax
	cmpl $32,%eax
	jne .Lc16H

	incl  %ecx
	decl %edx
	inc  %esi
	jmp .Lc16w
.Lc16z:
	addl $20,%ebp
	jmp *(%ebp)
.Lc16H:
	incl %ecx
	decl %edx
	jmp .Lc16w



------------------------------
hand/space-bs-c8-acc-1-j.s:

Main_zdwcnt_info:
	movl (%ebp),%esi
	movl 4(%ebp),%ecx
	addl 12(%ebp),%ecx
	movl 16(%ebp),%edx

.Lc16w:
	cmpl $0,%edx
	jle .Lc16z
	movzbl (%ecx),%eax
	cmpl $32,%eax
	jne .Lc16H

	incl %ecx
	decl %edx
	inc  %esi
	jmp .Lc16w
.Lc16z:
	addl $20,%ebp
	jmp *(%ebp)
.Lc16H:
	incl %ecx
	decl %edx
	jmp .Lc16w



------------------------------
hand/space-bs-c8-acc-1-k.s:

Main_zdwcnt_info:
	movl (%ebp),%esi
	movl 4(%ebp),%ecx
	addl 12(%ebp),%ecx
	movl 16(%ebp),%edx

.Lc16w:
	cmpl $0,%edx
	jle .Lc16z
	movzbl (%ecx),%eax
	cmpl $32,%eax
	jne .Lc16H

	incl %ecx
	decl %edx
	inc  %esi
	jmp .Lc16w
.Lc16H:
	incl %ecx
	decl %edx
	jmp .Lc16w
.Lc16z:
	addl $20,%ebp
	jmp *(%ebp)



------------------------------
hand/space-bs-c8-acc-1-l.s:

Main_zdwcnt_info:
	movl (%ebp),%esi
	movl 4(%ebp),%ecx
	addl 12(%ebp),%ecx
	movl 16(%ebp),%edx

.Lc16w:
	cmpl $0,%edx
	jle .Lc16z
	movzbl (%ecx),%eax
	incl %ecx
	decl %edx
	cmpl $32,%eax
	jne .Lc16H

	inc  %esi
	jmp .Lc16w
.Lc16H:
	jmp .Lc16w
.Lc16z:
	addl $20,%ebp
	jmp *(%ebp)



------------------------------
hand/space-bs-c8-acc-1-m.s:

Main_zdwcnt_info:
	movl (%ebp),%esi
	movl 4(%ebp),%ecx
	addl 12(%ebp),%ecx
	movl 16(%ebp),%edx

.Lc16w:
	cmpl $0,%edx
	jle .Lc16z
	movzbl (%ecx),%eax
	incl %ecx
	decl %edx
	cmpl $32,%eax
	jne .Lc16w

	inc  %esi
	jmp .Lc16w

.Lc16z:
	addl $20,%ebp
	jmp *(%ebp)



------------------------------
hand/space-bs-c8-acc-1-n.s:

Main_zdwcnt_info:
	movl (%ebp),%esi
	movl 4(%ebp),%ecx
	addl 12(%ebp),%ecx
	movl 16(%ebp),%edx

.Lc16w:
	cmpl $0,%edx
	jle .Lc16z
.Lc16xx:
	movzbl (%ecx),%eax
	incl %ecx
	decl %edx
	cmpl $32,%eax
	jne .Lc16w

	inc  %esi
	cmpl $0,%edx
	jg  .Lc16xx

.Lc16z:
	addl $20,%ebp
	jmp *(%ebp)



------------------------------
hand/space-bs-c8-acc-1-o.s:

Main_zdwcnt_info:
	movl (%ebp),%esi
	movl 4(%ebp),%ecx
	addl 12(%ebp),%ecx
	movl 16(%ebp),%edx

.Lc16w:
	cmpl $0,%edx
	jle .Lc16z
.Lc16xx:
	movzbl (%ecx),%eax
	incl %ecx
	decl %edx
	cmpl $32,%eax
	jne .Lc16w

	inc  %esi
	cmpl $0,%edx
	jle  .Lc16z

	movzbl (%ecx),%eax
	incl %ecx
	decl %edx
	cmpl $32,%eax
	jne .Lc16w

	inc  %esi
	cmpl $0,%edx
	jg  .Lc16xx

.Lc16z:
	addl $20,%ebp
	jmp *(%ebp)



------------------------------
hand/space-bs-c8-acc-1-p.s:

Main_zdwcnt_info:
	movl (%ebp),%esi
	movl 4(%ebp),%ecx
	addl 12(%ebp),%ecx
	movl 16(%ebp),%edx

.Lc16w4:
	cmpl $4,%edx
	jl  .Lc16wxx
	movl (%ecx),%eax

	addl $4,%ecx
	subl $4,%edx

	cmpb $32,%al
	jne .Lc16wa
	incl %esi
.Lc16wa:
	cmpb $32,%ah
	jne .Lc16wb
	incl %esi
.Lc16wb:
	shrl $16,%eax

	cmpb $32,%al
	jne .Lc16wc
	incl %esi
.Lc16wc:
	cmpb $32,%ah
	jne .Lc16w4
	incl %esi
	jmp .Lc16w4



.Lc16w1:
	cmpl $0,%edx
	jle .Lc16z
.Lc16wxx:
	movzbl (%ecx),%eax
	incl %ecx
	decl %edx
	cmpl $32,%eax
	jne .Lc16w1

	inc  %esi
	jmp .Lc16w1


.Lc16z:
	addl $20,%ebp
	jmp *(%ebp)



------------------------------
hand/space-bs-c8-acc-1-q.s:

Main_zdwcnt_info:
	movl (%ebp),%esi	/* #spaces found          */
	movl 4(%ebp),%ecx	/* ptr                    */
	addl 12(%ebp),%ecx	/* ... + idx              */
	movl 16(%ebp),%edx	/* cnt of remaining bytes */

	emms			/* clear fp tags so we can use mmx instrs */

	mov   $0x20202020,%eax
	movd  %eax,%mm1		/* mm1: 0000000020202020 */
	movq  %mm1,%mm0         /* mm0: 0000000020202020 */
	psllq $32,%mm1          /* mm1: 2020202000000000 */
	por   %mm0,%mm1		/* mm1: 2020202020202020 */

	mov   $0x01010101,%eax
	movd  %eax,%mm2		/* mm2: 0000000001010101 */
	movq  %mm2,%mm0		/* mm0: 0000000001010101 */
	psllq $32,%mm2		/* mm2: 0101010100000000 */
	por   %mm0,%mm2		/* mm2: 0101010101010101 */

	/* MMX loads can use any alignment (potentially at a speed-hit) */

	/* this loop looks at 8 bytes at a time */
.Lc16w8:
	cmpl $8,%edx
	jl  .Lc16w1
	movq (%ecx),%mm0	/* mm0 holds 8 characters */
	addl $8,%ecx
	subl $8,%edx
	pcmpeqb %mm1,%mm0	/* cmp byte for byte with ' ' */
				/* the result flag is 00 or FF */
	pand  %mm2,%mm0		/* turn FF into 01, which is actually useful */
	
	/* if we could just add the bytes up horizontally in %mm0, sigh.. .*/
	movd  %mm0,%eax
	push  %eax
	add   %ah, %al
	and   $0x03,%eax
	add   %eax,%esi
	pop   %eax
	shr   $16,%eax
	add   %ah,%al
	and   $0x03,%eax
	add   %eax,%esi

	psrlq $32,%mm0
	movd  %mm0,%eax
	push  %eax
	add   %ah, %al
	and   $0x03,%eax
	add   %eax,%esi
	pop   %eax
	shr   $16,%eax
	add   %ah,%al
	and   $0x03,%eax
	add   %eax,%esi

	jmp .Lc16w8


	/* this loop looks at one byte at a time to handle the remainder */
.Lc16w1:
	cmpl $0,%edx
	jle .Lc16z
	movzbl (%ecx),%eax
	incl %ecx
	decl %edx
	cmpl $32,%eax
	jne .Lc16w1

	inc  %esi
	jmp .Lc16w1


	/* done, remember to clear fp/mmx tags with emms */
.Lc16z:
	emms
	addl $20,%ebp
	jmp *(%ebp)



------------------------------
hand/space-bs-c8-acc-1-r.s:

Main_zdwcnt_info:
	movl (%ebp),%esi	/* #spaces found          */
	movl 4(%ebp),%ecx	/* ptr                    */
	addl 12(%ebp),%ecx	/* ... + idx              */
	movl 16(%ebp),%edx	/* cnt of remaining bytes */

	emms			/* clear fp tags so we can use mmx instrs */

	mov   $0x20202020,%eax
	movd  %eax,%mm1		/* mm1: 0000000020202020 */
	movq  %mm1,%mm0         /* mm0: 0000000020202020 */
	psllq $32,%mm1          /* mm1: 2020202000000000 */
	por   %mm0,%mm1		/* mm1: 2020202020202020 */

	mov   $0x01010101,%eax
	movd  %eax,%mm2		/* mm2: 0000000001010101 */
	movq  %mm2,%mm0		/* mm0: 0000000001010101 */
	psllq $32,%mm2		/* mm2: 0101010100000000 */
	por   %mm0,%mm2		/* mm2: 0101010101010101 */

	/* MMX loads can use any alignment (potentially at a speed-hit)   */
	/* therefore we don't have to try to read 1-7 bytes one at a time */
	/* first in order to end up with an aligned %ecx.                 */

.Lc16_mainloop:
	cmpl $8,%edx
	jl   .Lc16w1
	movl %edx,%eax
	shr  $3,%eax
	cmpl $127,%eax
	jle  .Lc16_127
	movl $127,%eax
.Lc16_127:

	shl  $3,%eax
	sub  %eax,%edx
	shr  $3,%eax
	
	pxor %mm3,%mm3		/* clear block of space counters */

	/* loop up to 127 times in a loop that looks at 8 bytes at a time. */
	/* Going above 255 could overflow the 8 counters in mm3. */
	/* Going above 127 could overflow the horizontal summation code. */

.Lc16w8:
	cmpl $0,%eax
	jle  .Lc16w8end
	movq (%ecx),%mm0	/* mm0 holds 8 characters */
	addl $8,%ecx
	decl %eax
	pcmpeqb %mm1,%mm0	/* cmp byte for byte with ' ' */
				/* the result flag is 00 or FF */
	pand  %mm2,%mm0		/* turn FF into 01, which is actually useful */
	paddb %mm0,%mm3		/* add to the 8 space counters */
	jmp   .Lc16w8

.Lc16w8end:
	/* sum the 8 space counters in mm3 and add to %esi */

	/* if only MMX had horizontal byte adds... */
	movd  %mm3,%eax
	push  %eax
	add   %ah, %al		/* NOTE!  potential overflow! */
	and   $0xFF,%eax
	add   %eax,%esi
	pop   %eax
	shr   $16,%eax
	add   %ah,%al		/* NOTE!  potential overflow! */
	and   $0xFF,%eax
	add   %eax,%esi

	psrlq $32,%mm3
	movd  %mm3,%eax
	push  %eax
	add   %ah, %al		/* NOTE!  potential overflow! */
	and   $0xFF,%eax
	add   %eax,%esi
	pop   %eax
	shr   $16,%eax
	add   %ah,%al		/* NOTE!  potential overflow! */
	and   $0xFF,%eax
	add   %eax,%esi

	jmp .Lc16_mainloop


	/* this loop looks at one byte at a time to handle the remainder */
.Lc16w1:
	cmpl $0,%edx
	jle .Lc16z
	movzbl (%ecx),%eax
	incl %ecx
	decl %edx
	cmpl $32,%eax
	jne .Lc16w1

	inc  %esi
	jmp .Lc16w1


	/* done, remember to clear fp/mmx tags with emms */
.Lc16z:
	emms
	addl $20,%ebp
	jmp *(%ebp)



------------------------------
hand/space-bs-c8-acc-1-s.s:

Main_zdwcnt_info:
	movl (%ebp),%esi	/* #spaces found          */
	movl 4(%ebp),%ecx	/* ptr                    */
	addl 12(%ebp),%ecx	/* ... + idx              */
	movl 16(%ebp),%edx	/* cnt of remaining bytes */

	emms			/* clear fp tags so we can use mmx instrs */

	mov   $0x20202020,%eax
	movd  %eax,%mm1		/* mm1: 0000000020202020 */
	movq  %mm1,%mm0         /* mm0: 0000000020202020 */
	psllq $32,%mm1          /* mm1: 2020202000000000 */
	por   %mm0,%mm1		/* mm1: 2020202020202020 */

	mov   $0x01010101,%eax
	movd  %eax,%mm2		/* mm2: 0000000001010101 */
	movq  %mm2,%mm0		/* mm0: 0000000001010101 */
	psllq $32,%mm2		/* mm2: 0101010100000000 */
	por   %mm0,%mm2		/* mm2: 0101010101010101 */

	/* MMX loads can use any alignment (potentially at a speed-hit)   */
	/* therefore we don't have to try to read 1-7 bytes one at a time */
	/* first in order to end up with an aligned %ecx.                 */

.Lc16_mainloop:
	cmpl $8,%edx
	jl   .Lc16w1
	movl %edx,%eax
	shr  $3,%eax
	cmpl $127,%eax
	jle  .Lc16_127
	movl $127,%eax
.Lc16_127:

	shl  $3,%eax
	sub  %eax,%edx
	shr  $3,%eax
	
	pxor %mm3,%mm3		/* clear block of space counters */

	/* loop up to 127 times in a loop that looks at 8 bytes at a time. */
	/* Going above 255 could overflow the 8 counters in mm3. */
	/* Going above 127 could overflow the horizontal summation code. */
	

	cmpl $0,%eax
	jle  .Lc16w8end

	/* this is an unspeakably ugly and sloppy loop unrolling.  Doesn't  */
	/* seem to help much on an Athlon64 3000+.                          */
	test $1,%eax
	jz   .Lc16w8
	incl %eax
	jmp  .Lc16w8x
	
.Lc16w8:
	movq (%ecx),%mm0	/* mm0 holds 8 characters */
	addl $8,%ecx
	pcmpeqb %mm1,%mm0	/* cmp byte for byte with ' ' */
				/* the result flag is 00 or FF */
	pand  %mm2,%mm0		/* turn FF into 01, which is actually useful */
	paddb %mm0,%mm3		/* add to the 8 space counters */

.Lc16w8x:
	movq (%ecx),%mm0	/* mm0 holds 8 characters */
	addl $8,%ecx
	pcmpeqb %mm1,%mm0	/* cmp byte for byte with ' ' */
				/* the result flag is 00 or FF */
	pand  %mm2,%mm0		/* turn FF into 01, which is actually useful */
	paddb %mm0,%mm3		/* add to the 8 space counters */

	subl  $2,%eax
	jnz  .Lc16w8

.Lc16w8end:
	/* sum the 8 space counters in mm3 and add to %esi */

	/* if only MMX had horizontal byte adds... */
	movd  %mm3,%eax
	push  %eax
	add   %ah, %al		/* NOTE!  potential overflow! */
	and   $0xFF,%eax
	add   %eax,%esi
	pop   %eax
	shr   $16,%eax
	add   %ah,%al		/* NOTE!  potential overflow! */
	and   $0xFF,%eax
	add   %eax,%esi

	psrlq $32,%mm3
	movd  %mm3,%eax
	push  %eax
	add   %ah, %al		/* NOTE!  potential overflow! */
	and   $0xFF,%eax
	add   %eax,%esi
	pop   %eax
	shr   $16,%eax
	add   %ah,%al		/* NOTE!  potential overflow! */
	and   $0xFF,%eax
	add   %eax,%esi

	jmp .Lc16_mainloop


	/* this loop looks at one byte at a time to handle the remainder */
.Lc16w1:
	cmpl $0,%edx
	jle .Lc16z
	movzbl (%ecx),%eax
	incl %ecx
	decl %edx
	cmpl $32,%eax
	jne .Lc16w1

	inc  %esi
	jmp .Lc16w1


	/* done, remember to clear fp/mmx tags with emms */
.Lc16z:
	emms
	addl $20,%ebp
	jmp *(%ebp)



==============================


