diff --git a/src/libFLAC/ia32/lpc_asm.nasm b/src/libFLAC/ia32/lpc_asm.nasm index 882b82b9..56e6f536 100644 --- a/src/libFLAC/ia32/lpc_asm.nasm +++ b/src/libFLAC/ia32/lpc_asm.nasm @@ -59,10 +59,10 @@ cglobal FLAC__lpc_restore_signal_asm_ia32_mmx ; ALIGN 16 cident FLAC__lpc_compute_autocorrelation_asm_ia32 - ;[esp + 24] == autoc[] - ;[esp + 20] == lag - ;[esp + 16] == data_len - ;[esp + 12] == data[] + ;[esp + 28] == autoc[] + ;[esp + 24] == lag + ;[esp + 20] == data_len + ;[esp + 16] == data[] ;ASSERT(lag > 0) ;ASSERT(lag <= 33) @@ -71,21 +71,22 @@ cident FLAC__lpc_compute_autocorrelation_asm_ia32 .begin: push esi push edi + push ebx ; for(coeff = 0; coeff < lag; coeff++) ; autoc[coeff] = 0.0; - mov edi, [esp + 24] ; edi == autoc - mov ecx, [esp + 20] ; ecx = # of dwords (=lag) of 0 to write + mov edi, [esp + 28] ; edi == autoc + mov ecx, [esp + 24] ; ecx = # of dwords (=lag) of 0 to write xor eax, eax rep stosd ; const unsigned limit = data_len - lag; - mov eax, [esp + 20] ; eax == lag - mov ecx, [esp + 16] + mov eax, [esp + 24] ; eax == lag + mov ecx, [esp + 20] sub ecx, eax ; ecx == limit - mov edi, [esp + 24] ; edi == autoc - mov esi, [esp + 12] ; esi == data + mov edi, [esp + 28] ; edi == autoc + mov esi, [esp + 16] ; esi == data inc ecx ; we are looping <= limit so we add one to the counter ; for(sample = 0; sample <= limit; sample++) { @@ -97,7 +98,11 @@ cident FLAC__lpc_compute_autocorrelation_asm_ia32 ; each iteration is 11 bytes so we need (-eax)*11, so we do (-12*eax + eax) lea edx, [eax + eax*2] neg edx - lea edx, [eax + edx*4 + .jumper1_0] + lea edx, [eax + edx*4 + .jumper1_0 - .get_eip1] + call .get_eip1 +.get_eip1: + pop ebx + add edx, ebx inc edx ; compensate for the shorter opcode on the last iteration inc edx ; compensate for the shorter opcode on the last iteration inc edx ; compensate for the shorter opcode on the last iteration @@ -254,7 +259,7 @@ cident FLAC__lpc_compute_autocorrelation_asm_ia32 ; for(coeff = 0; coeff < data_len - sample; coeff++) ; autoc[coeff] += d * data[sample+coeff]; ; } - mov ecx, [esp + 20] ; ecx <- lag + mov ecx, [esp + 24] ; ecx <- lag dec ecx ; ecx <- lag - 1 jz near .end ; skip loop if 0 (i.e. lag == 1) @@ -263,7 +268,11 @@ cident FLAC__lpc_compute_autocorrelation_asm_ia32 ; each iteration is 11 bytes so we need (-eax)*11, so we do (-12*eax + eax) lea edx, [eax + eax*2] neg edx - lea edx, [eax + edx*4 + .jumper2_0] + lea edx, [eax + edx*4 + .jumper2_0 - .get_eip2] + call .get_eip2 +.get_eip2: + pop ebx + add edx, ebx inc edx ; compensate for the shorter opcode on the last iteration inc edx ; compensate for the shorter opcode on the last iteration inc edx ; compensate for the shorter opcode on the last iteration @@ -409,6 +418,7 @@ cident FLAC__lpc_compute_autocorrelation_asm_ia32 .loop2_end: .end: + pop ebx pop edi pop esi ret @@ -804,7 +814,11 @@ cident FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32 .i_32: sub edi, esi neg eax - lea edx, [eax + eax * 8 + .jumper_0] + lea edx, [eax + eax * 8 + .jumper_0 - .get_eip0] + call .get_eip0 +.get_eip0: + pop eax + add edx, eax inc edx mov eax, [esp + 28] ; eax = qlp_coeff[] xor ebp, ebp @@ -1203,7 +1217,11 @@ cident FLAC__lpc_restore_signal_asm_ia32 .x87_32: sub esi, edi neg eax - lea edx, [eax + eax * 8 + .jumper_0] + lea edx, [eax + eax * 8 + .jumper_0 - .get_eip0] + call .get_eip0 +.get_eip0: + pop eax + add edx, eax inc edx ; compensate for the shorter opcode on the last iteration mov eax, [esp + 28] ; eax = qlp_coeff[] xor ebp, ebp