soup up, fix bugs, replace PSHUFW with pre-Katmai-friendly PSRLQ mm,32

This commit is contained in:
Josh Coalson 2001-05-21 23:40:16 +00:00
parent d1299d9626
commit f5490343ba

View File

@ -106,27 +106,35 @@ FLAC__fixed_compute_best_predictor_asm:
add edi, ebx
sub edi, ebp ; last_error_3 = last_error_2 - (data[-2] - 2*data[-3] + data[-4]);
mov ebx, [esp + 28] ; ebx = data[]
mov ecx, [esp + 32] ; ecx = data_len
movd mm6, ecx ; mm6 = 0:last_error_0
movd mm3, edx ; mm3 = 0:last_error_1
movd mm7, esi ; mm7 = 0:last_error_2
movd mm4, edi ; mm4 = 0:last_error_3
punpckldq mm6, mm3 ; mm6 = last_error_1:last_error_0
punpckldq mm7, mm4 ; mm7 = last_error_3:last_error_2
mov ecx, [esp + 32] ; ecx = data_len
; for(i = 0; i < data_len; i++) {
; error_0 = data[i] ; save = error_0; total_error_0 += local_abs(error_0);
; error_1 -= last_error_0; last_error_0 = save; save = error_1; total_error_1 += local_abs(error_1);
; error_2 -= last_error_1; last_error_1 = save; save = error_2; total_error_2 += local_abs(error_2);
; error_3 -= last_error_2; last_error_2 = save; save = error_3; total_error_3 += local_abs(error_3);
; error_4 -= last_error_3; last_error_3 = save; total_error_4 += local_abs(error_4);
; }
.loop:
mov eax, [ebx] ; eax = error_0 = data[i]
add ebx, 4
mov edx, eax ; edx = error_0
mov edi, eax ; edi == save = error_0
mov edx, eax ; edx = error_0
neg edx ; edx = -error_0
cmovns eax, edx ; eax = abs(error_0)
movd mm5, eax ; mm5 = 0:abs(error_0)
movd edx, mm6 ; edx = last_error_0
mov eax, edi ; eax = error(error_0)
pshufw mm3, mm6, 4eh ; 4eh=1-0-3-2, mm3 = last_error_0:last_error_1
movaps mm3, mm6 ; mm3 = last_error_1:last_error_0
psrlq mm3, 32 ; mm3 = 0:last_error_1
movd mm6, edi ; mm6 = 0:last_error_0(=save)
punpckldq mm6, mm3 ; mm6 = last_error_1:last_error_0
punpckldq mm6, mm3 ; mm6 = last_error_1:last_error_0(=save)
sub eax, edx ; error -= last_error_0
mov edi, eax ; edi == save = error_1
mov edx, eax ; edx = error_1
@ -134,21 +142,23 @@ FLAC__fixed_compute_best_predictor_asm:
cmovns eax, edx ; eax = abs(error_1)
movd mm4, eax ; mm4 = 0:abs(error_1)
punpckldq mm5, mm4 ; mm5 = abs(error_1):abs(error_0)
pshufw mm3, mm6, 4eh ; 4eh=1-0-3-2, mm3 = last_error_0:last_error_1
movaps mm3, mm6 ; mm3 = last_error_1:last_error_0
psrlq mm3, 32 ; mm3 = 0:last_error_1
movd edx, mm3 ; edx = last_error_1
mov eax, edi ; eax = error(error_1)
movd mm4, edi ; mm4 = 0:save
punpckldq mm6, mm4 ; mm6 = last_error_1(=save):last_error_0
sub eax, edx ; error -= last_error_1
mov edi, eax ; edi == save = error_2
paddd mm0, mm5 ; [CR] total_error_1 += abs(error_1) ; total_error_0 += abs(error_0)
mov edx, eax ; edx = error_2
paddd mm0, mm5 ; [CR] total_error_1 += abs(error_1) ; total_error_0 += abs(error_0)
neg edx ; edx = -error_2
cmovns eax, edx ; eax = abs(error_2)
movd mm5, eax ; mm5 = 0:abs(error_2)
movd edx, mm7 ; edx = last_error_2
mov eax, edi ; eax = error(error_2)
pshufw mm3, mm7, 4eh ; 4eh=1-0-3-2, mm3 = last_error_2:last_error_3
movaps mm3, mm7 ; mm3 = last_error_3:last_error_2
psrlq mm3, 32 ; mm3 = 0:last_error_3
movd mm7, edi ; mm7 = 0:last_error_2(=save)
punpckldq mm7, mm3 ; mm7 = last_error_3:last_error_2
sub eax, edx ; error -= last_error_2
@ -158,14 +168,15 @@ FLAC__fixed_compute_best_predictor_asm:
cmovns eax, edx ; eax = abs(error_3)
movd mm4, eax ; mm4 = 0:abs(error_3)
punpckldq mm5, mm4 ; mm5 = abs(error_3):abs(error_2)
pshufw mm3, mm7, 4eh ; 4eh=1-0-3-2, mm3 = last_error_2:last_error_3
movaps mm3, mm7
psrlq mm3, 32 ; mm3 = 0:last_error_3
movd edx, mm3 ; edx = last_error_3
mov eax, edi ; eax = error(error_3)
movd mm4, edi ; mm4 = 0:save
punpckldq mm7, mm4 ; mm7 = last_error_3(=save):last_error_2
sub eax, edx ; error -= last_error_3
paddd mm1, mm5 ; [CR] total_error_3 += abs(error_3) ; total_error_2 += abs(error_2)
mov edx, eax ; edx = error_4
paddd mm1, mm5 ; [CR] total_error_3 += abs(error_3) ; total_error_2 += abs(error_2)
neg edx ; edx = -error_4
cmovns eax, edx ; eax = abs(error_4)
movd mm5, eax ; mm5 = 0:abs(error_4)
@ -184,10 +195,12 @@ FLAC__fixed_compute_best_predictor_asm:
; else
; order = 4;
movd edi, mm2 ; edi = total_error_4
pshufw mm4, mm1, 4eh ; 4eh=1-0-4-2, mm3 = total_error_2:total_error_3
movaps mm4, mm1 ; mm4 = total_error_3:total_error_2
psrlq mm4, 32 ; mm4 = 0:total_error_3
movd edx, mm1 ; edx = total_error_2
movd esi, mm4 ; esi = total_error_3
pshufw mm3, mm0, 4eh ; 4eh=1-0-3-2, mm3 = total_error_0:total_error_1
movaps mm3, mm0 ; mm3 = total_error_1:total_error_0
psrlq mm3, 32 ; mm3 = 0:total_error_1
movd ebx, mm0 ; ebx = total_error_0
movd ecx, mm3 ; ecx = total_error_1
emms
@ -207,17 +220,17 @@ FLAC__fixed_compute_best_predictor_asm:
jmp short .got_order
.not_order_0:
cmp eax, ecx
jne .not_order_0
jne .not_order_1
mov ebp, 1
jmp short .got_order
.not_order_1:
cmp eax, edx
jne .not_order_0
jne .not_order_2
mov ebp, 2
jmp short .got_order
.not_order_2:
cmp eax, esi
jne .not_order_0
jne .not_order_3
mov ebp, 3
jmp short .got_order
.not_order_3:
@ -228,90 +241,88 @@ FLAC__fixed_compute_best_predictor_asm:
; residual_bits_per_sample[2] = (real)((data_len > 0 && total_error_2 > 0) ? log(M_LN2 * (real)total_error_2 / (real) data_len) / M_LN2 : 0.0);
; residual_bits_per_sample[3] = (real)((data_len > 0 && total_error_3 > 0) ? log(M_LN2 * (real)total_error_3 / (real) data_len) / M_LN2 : 0.0);
; residual_bits_per_sample[4] = (real)((data_len > 0 && total_error_4 > 0) ? log(M_LN2 * (real)total_error_4 / (real) data_len) / M_LN2 : 0.0);
fild dword [esp + 32] ; ST = data_len (NOTE: assumes data_len is <2gigs)
fldz ; ST = 0.0 data_len
xor eax, eax
cmp eax, [esp + 32]
je near .data_len_is_0
fild dword [esp + 32] ; ST = data_len (NOTE: assumes data_len is <2gigs)
.rbps_0:
cmp eax, ebx
je .total_error_0_is_0
fld1 ; ST = 1.0 0.0 data_len
test ebx, ebx
jz .total_error_0_is_0
fld1 ; ST = 1.0 data_len
mov [esp], ebx
mov [esp + 4], eax ; [esp + 0] = (uint64)total_error_0
fild qword [esp] ; ST = total_error_0 1.0 0.0 data_len
fdiv st3 ; ST = total_error_0/data_len 1.0 0.0 data_len
fldln2 ; ST = ln2 total_error_0/data_len 1.0 0.0 data_len
fmulp st1 ; ST = ln2*total_error_0/data_len 1.0 0.0 data_len
fyl2x ; ST = log2(ln2*total_error_0/data_len) 0.0 data_len
mov ebx, [esp + 36]
fstp dword [ebx] ; residual_bits_per_sample[0] = log2(ln2*total_error_0/data_len) ST = 0.0 data_len
fild qword [esp] ; ST = total_error_0 1.0 data_len
fdiv st2 ; ST = total_error_0/data_len 1.0 data_len
fldln2 ; ST = ln2 total_error_0/data_len 1.0 data_len
fmulp st1 ; ST = ln2*total_error_0/data_len 1.0 data_len
fyl2x ; ST = log2(ln2*total_error_0/data_len) data_len
fstp dword [ebx] ; residual_bits_per_sample[0] = log2(ln2*total_error_0/data_len) ST = data_len
jmp short .rbps_1
.total_error_0_is_0:
mov ebx, [esp + 36]
fst dword [ebx] ; ST = 0.0 data_len
mov [ebx], eax ; residual_bits_per_sample[0] = 0.0
.rbps_1:
cmp eax, ecx
je .total_error_1_is_0
fld1 ; ST = 1.0 0.0 data_len
test ecx, ecx
jz .total_error_1_is_0
fld1 ; ST = 1.0 data_len
mov [esp], ecx
mov [esp + 4], eax ; [esp + 0] = (uint64)total_error_1
fild qword [esp] ; ST = total_error_1 1.0 0.0 data_len
fdiv st3 ; ST = total_error_1/data_len 1.0 0.0 data_len
fldln2 ; ST = ln2 total_error_1/data_len 1.0 0.0 data_len
fmulp st1 ; ST = ln2*total_error_1/data_len 1.0 0.0 data_len
fyl2x ; ST = log2(ln2*total_error_1/data_len) 0.0 data_len
fstp dword [ebx + 4] ; residual_bits_per_sample[1] = log2(ln2*total_error_1/data_len) ST = 0.0 data_len
fild qword [esp] ; ST = total_error_1 1.0 data_len
fdiv st2 ; ST = total_error_1/data_len 1.0 data_len
fldln2 ; ST = ln2 total_error_1/data_len 1.0 data_len
fmulp st1 ; ST = ln2*total_error_1/data_len 1.0 data_len
fyl2x ; ST = log2(ln2*total_error_1/data_len) data_len
fstp dword [ebx + 4] ; residual_bits_per_sample[1] = log2(ln2*total_error_1/data_len) ST = data_len
jmp short .rbps_2
.total_error_1_is_0:
fst dword [ebx + 4] ; residual_bits_per_sample[1] = 0.0 ST = 0.0 data_len
mov [ebx + 4], eax ; residual_bits_per_sample[1] = 0.0
.rbps_2:
cmp eax, edx
je .total_error_2_is_0
fld1 ; ST = 1.0 0.0 data_len
test edx, edx
jz .total_error_2_is_0
fld1 ; ST = 1.0 data_len
mov [esp], edx
mov [esp + 4], eax ; [esp + 0] = (uint64)total_error_2
fild qword [esp] ; ST = total_error_2 1.0 0.0 data_len
fdiv st3 ; ST = total_error_2/data_len 1.0 0.0 data_len
fldln2 ; ST = ln2 total_error_2/data_len 1.0 0.0 data_len
fmulp st1 ; ST = ln2*total_error_2/data_len 1.0 0.0 data_len
fyl2x ; ST = log2(ln2*total_error_2/data_len) 0.0 data_len
fstp dword [ebx + 8] ; residual_bits_per_sample[2] = log2(ln2*total_error_2/data_len) ST = 0.0 data_len
fild qword [esp] ; ST = total_error_2 1.0 data_len
fdiv st2 ; ST = total_error_2/data_len 1.0 data_len
fldln2 ; ST = ln2 total_error_2/data_len 1.0 data_len
fmulp st1 ; ST = ln2*total_error_2/data_len 1.0 data_len
fyl2x ; ST = log2(ln2*total_error_2/data_len) data_len
fstp dword [ebx + 8] ; residual_bits_per_sample[2] = log2(ln2*total_error_2/data_len) ST = data_len
jmp short .rbps_3
.total_error_2_is_0:
fst dword [ebx + 8] ; residual_bits_per_sample[2] = 0.0 ST = 0.0 data_len
mov [ebx + 8], eax ; residual_bits_per_sample[2] = 0.0
.rbps_3:
cmp eax, esi
je .total_error_3_is_0
fld1 ; ST = 1.0 0.0 data_len
test esi, esi
jz .total_error_3_is_0
fld1 ; ST = 1.0 data_len
mov [esp], esi
mov [esp + 4], eax ; [esp + 0] = (uint64)total_error_3
fild qword [esp] ; ST = total_error_3 1.0 0.0 data_len
fdiv st3 ; ST = total_error_3/data_len 1.0 0.0 data_len
fldln2 ; ST = ln2 total_error_3/data_len 1.0 0.0 data_len
fmulp st1 ; ST = ln2*total_error_3/data_len 1.0 0.0 data_len
fyl2x ; ST = log2(ln2*total_error_3/data_len) 0.0 data_len
fstp dword [ebx + 12] ; residual_bits_per_sample[3] = log2(ln2*total_error_3/data_len) ST = 0.0 data_len
fild qword [esp] ; ST = total_error_3 1.0 data_len
fdiv st2 ; ST = total_error_3/data_len 1.0 data_len
fldln2 ; ST = ln2 total_error_3/data_len 1.0 data_len
fmulp st1 ; ST = ln2*total_error_3/data_len 1.0 data_len
fyl2x ; ST = log2(ln2*total_error_3/data_len) data_len
fstp dword [ebx + 12] ; residual_bits_per_sample[3] = log2(ln2*total_error_3/data_len) ST = data_len
jmp short .rbps_4
.total_error_3_is_0:
fst dword [ebx + 12] ; residual_bits_per_sample[3] = 0.0 ST = 0.0 data_len
mov [ebx + 12], eax ; residual_bits_per_sample[3] = 0.0
.rbps_4:
cmp eax, edi
je .total_error_4_is_0
fld1 ; ST = 1.0 0.0 data_len
test edi, edi
jz .total_error_4_is_0
fld1 ; ST = 1.0 data_len
mov [esp], edi
mov [esp + 4], eax ; [esp + 0] = (uint64)total_error_4
fild qword [esp] ; ST = total_error_4 1.0 0.0 data_len
fdiv st3 ; ST = total_error_4/data_len 1.0 0.0 data_len
fldln2 ; ST = ln2 total_error_4/data_len 1.0 0.0 data_len
fmulp st1 ; ST = ln2*total_error_4/data_len 1.0 0.0 data_len
fyl2x ; ST = log2(ln2*total_error_4/data_len) 0.0 data_len
fstp dword [ebx + 16] ; residual_bits_per_sample[2] = log2(ln2*total_error_4/data_len) ST = 0.0 data_len
fild qword [esp] ; ST = total_error_4 1.0 data_len
fdiv st2 ; ST = total_error_4/data_len 1.0 data_len
fldln2 ; ST = ln2 total_error_4/data_len 1.0 data_len
fmulp st1 ; ST = ln2*total_error_4/data_len 1.0 data_len
fyl2x ; ST = log2(ln2*total_error_4/data_len) data_len
fstp dword [ebx + 16] ; residual_bits_per_sample[4] = log2(ln2*total_error_4/data_len) ST = data_len
jmp short .rbps_end
.total_error_4_is_0:
fst dword [ebx + 16] ; residual_bits_per_sample[2] = 0.0 ST = 0.0 data_len
mov [ebx + 16], eax ; residual_bits_per_sample[4] = 0.0
.rbps_end:
fstp st0 ; ST = data_len
fstp st0 ; ST = [empty]
jmp short .end
.data_len_is_0: