Patch downloaded from http://bugs.gentoo.org/show_bug.cgi?id=121871 http://bugs.gentoo.org/attachment.cgi?id=98094 --- libdv-0.104-old/libdv/asm_common.S +++ libdv-0.104/libdv/asm_common.S @@ -0,0 +1,29 @@ +/* public domain, do what you want */ + +#ifdef __PIC__ +# define MUNG(sym) sym##@GOTOFF(%ebp) +# define MUNG_ARR(sym, args...) sym##@GOTOFF(%ebp,##args) +#else +# define MUNG(sym) sym +# define MUNG_ARR(sym, args...) sym(,##args) +#endif + +#ifdef __PIC__ +# undef __i686 /* gcc define gets in our way */ +# define LOAD_PIC_REG(reg) \ + .ifndef __i686.get_pc_thunk.reg; \ + .section .gnu.linkonce.t.__i686.get_pc_thunk.reg,"ax",@progbits; \ + .global __i686.get_pc_thunk.reg; \ + .hidden __i686.get_pc_thunk.reg; \ + .type __i686.get_pc_thunk.reg,@function; \ + __i686.get_pc_thunk.reg: \ + movl (%esp), %e##reg; \ + ret; \ + .size __i686.get_pc_thunk.reg,.-__i686.get_pc_thunk.reg; \ + .previous; \ + .endif; \ + call __i686.get_pc_thunk.reg; \ + addl $_GLOBAL_OFFSET_TABLE_, %e##reg +#else +# define LOAD_PIC_REG(reg) +#endif --- libdv-0.104-old/libdv/dct_block_mmx.S +++ libdv-0.104/libdv/dct_block_mmx.S @@ -53,19 +53,22 @@ scratch2: .quad 0 .section .note.GNU-stack, "", @progbits +#include "asm_common.S" + .text .align 8 .global _dv_dct_88_block_mmx .hidden _dv_dct_88_block_mmx .type _dv_dct_88_block_mmx,@function _dv_dct_88_block_mmx: pushl %ebp - movl %esp, %ebp pushl %esi - movl 8(%ebp), %esi # source + LOAD_PIC_REG(bp) + + movl 12(%esp), %esi # source # column 0 movq 16*0(%esi), %mm0 # v0 @@ -86,22 +91,22 @@ _dv_dct_88_block_mmx: movq 16*3(%esi), %mm5 # v3 movq 16*4(%esi), %mm7 # v4 - movq %mm7, scratch1 # scratch1: v4 ; + movq %mm7, MUNG(scratch1) # scratch1: v4 ; movq %mm5, %mm7 # duplicate v3 - paddw scratch1, %mm5 # v03: v3+v4 - psubw scratch1, %mm7 # v04: v3-v4 - movq %mm5, scratch2 # scratch2: v03 + paddw MUNG(scratch1), %mm5 # v03: v3+v4 + psubw MUNG(scratch1), %mm7 # v04: v3-v4 + movq %mm5, MUNG(scratch2) # scratch2: v03 movq %mm0, %mm5 # mm5: v00 - paddw 
scratch2, %mm0 # v10: v00+v03 - psubw scratch2, %mm5 # v13: v00-v03 - movq %mm3, scratch3 # scratch3: v02 + paddw MUNG(scratch2), %mm0 # v10: v00+v03 + psubw MUNG(scratch2), %mm5 # v13: v00-v03 + movq %mm3, MUNG(scratch3) # scratch3: v02 movq %mm1, %mm3 # duplicate v01 - paddw scratch3, %mm1 # v11: v01+v02 - psubw scratch3, %mm3 # v12: v01-v02 + paddw MUNG(scratch3), %mm1 # v11: v01+v02 + psubw MUNG(scratch3), %mm3 # v12: v01-v02 - movq %mm6, scratch4 # scratch4: v05 + movq %mm6, MUNG(scratch4) # scratch4: v05 movq %mm0, %mm6 # duplicate v10 paddw %mm1, %mm0 # v10+v11 @@ -111,10 +116,10 @@ _dv_dct_88_block_mmx: movq %mm6, 16*4(%esi) # out4: v10-v11 movq %mm4, %mm0 # mm0: v06 - paddw scratch4, %mm4 # v15: v05+v06 + paddw MUNG(scratch4), %mm4 # v15: v05+v06 paddw %mm2, %mm0 # v16: v07+v06 - pmulhw WA3, %mm4 # v35~: WA3*v15 + pmulhw MUNG(WA3), %mm4 # v35~: WA3*v15 psllw $1, %mm4 # v35: compensate the coeefient scale movq %mm4, %mm6 # duplicate v35 @@ -123,7 +128,7 @@ _dv_dct_88_block_mmx: paddw %mm5, %mm3 # v22: v12+v13 - pmulhw WA1, %mm3 # v32~: WA1*v22 + pmulhw MUNG(WA1), %mm3 # v32~: WA1*v22 psllw $16-NSHIFT, %mm3 # v32: compensate the coeefient scale movq %mm5, %mm6 # duplicate v13 @@ -134,13 +139,13 @@ _dv_dct_88_block_mmx: movq %mm6, 16*6(%esi) # out6: v13-v32 - paddw scratch4, %mm7 # v14n: v04+v05 + paddw MUNG(scratch4), %mm7 # v14n: v04+v05 movq %mm0, %mm5 # duplicate v16 psubw %mm7, %mm0 # va1: v16-v14n - pmulhw WA5, %mm0 # va0~: va1*WA5 - pmulhw WA4, %mm5 # v36~~: v16*WA4 - pmulhw WA2, %mm7 # v34~~: v14n*WA2 + pmulhw MUNG(WA5), %mm0 # va0~: va1*WA5 + pmulhw MUNG(WA4), %mm5 # v36~~: v16*WA4 + pmulhw MUNG(WA2), %mm7 # v34~~: v14n*WA2 psllw $16-WA4_SHIFT, %mm5 # v36: compensate the coeefient scale psllw $16-NSHIFT, %mm7 # v34: compensate the coeefient scale @@ -188,22 +193,22 @@ _dv_dct_88_block_mmx: movq 16*3(%esi), %mm5 # v3 movq 16*4(%esi), %mm7 # v4 - movq %mm7, scratch1 # scratch1: v4 ; + movq %mm7, MUNG(scratch1) # scratch1: v4 ; movq %mm5, %mm7 # 
duplicate v3 - paddw scratch1, %mm5 # v03: v3+v4 - psubw scratch1, %mm7 # v04: v3-v4 - movq %mm5, scratch2 # scratch2: v03 + paddw MUNG(scratch1), %mm5 # v03: v3+v4 + psubw MUNG(scratch1), %mm7 # v04: v3-v4 + movq %mm5, MUNG(scratch2) # scratch2: v03 movq %mm0, %mm5 # mm5: v00 - paddw scratch2, %mm0 # v10: v00+v03 - psubw scratch2, %mm5 # v13: v00-v03 - movq %mm3, scratch3 # scratc3: v02 + paddw MUNG(scratch2), %mm0 # v10: v00+v03 + psubw MUNG(scratch2), %mm5 # v13: v00-v03 + movq %mm3, MUNG(scratch3) # scratc3: v02 movq %mm1, %mm3 # duplicate v01 - paddw scratch3, %mm1 # v11: v01+v02 - psubw scratch3, %mm3 # v12: v01-v02 + paddw MUNG(scratch3), %mm1 # v11: v01+v02 + psubw MUNG(scratch3), %mm3 # v12: v01-v02 - movq %mm6, scratch4 # scratc4: v05 + movq %mm6, MUNG(scratch4) # scratc4: v05 movq %mm0, %mm6 # duplicate v10 paddw %mm1, %mm0 # v10+v11 @@ -213,10 +218,10 @@ _dv_dct_88_block_mmx: movq %mm6, 16*4(%esi) # out4: v10-v11 movq %mm4, %mm0 # mm0: v06 - paddw scratch4, %mm4 # v15: v05+v06 + paddw MUNG(scratch4), %mm4 # v15: v05+v06 paddw %mm2, %mm0 # v16: v07+v06 - pmulhw WA3, %mm4 # v35~: WA3*v15 + pmulhw MUNG(WA3), %mm4 # v35~: WA3*v15 psllw $16-NSHIFT, %mm4 # v35: compensate the coeefient scale movq %mm4, %mm6 # duplicate v35 @@ -225,7 +230,7 @@ _dv_dct_88_block_mmx: paddw %mm5, %mm3 # v22: v12+v13 - pmulhw WA1, %mm3 # v32~: WA3*v15 + pmulhw MUNG(WA1), %mm3 # v32~: WA3*v15 psllw $16-NSHIFT, %mm3 # v32: compensate the coeefient scale movq %mm5, %mm6 # duplicate v13 @@ -235,13 +240,13 @@ _dv_dct_88_block_mmx: movq %mm5, 16*2(%esi) # out2: v13+v32 movq %mm6, 16*6(%esi) # out6: v13-v32 - paddw scratch4, %mm7 # v14n: v04+v05 + paddw MUNG(scratch4), %mm7 # v14n: v04+v05 movq %mm0, %mm5 # duplicate v16 psubw %mm7, %mm0 # va1: v16-v14n - pmulhw WA2, %mm7 # v34~~: v14n*WA2 - pmulhw WA5, %mm0 # va0~: va1*WA5 - pmulhw WA4, %mm5 # v36~~: v16*WA4 + pmulhw MUNG(WA2), %mm7 # v34~~: v14n*WA2 + pmulhw MUNG(WA5), %mm0 # va0~: va1*WA5 + pmulhw MUNG(WA4), %mm5 # v36~~: v16*WA4 
psllw $16-NSHIFT, %mm7 psllw $16-WA4_SHIFT, %mm5 # v36: compensate the coeffient # scale note that WA4 is shifted 1 bit less than the others @@ -748,11 +755,12 @@ _dv_dct_block_mmx_postscale_88: _dv_dct_248_block_mmx: pushl %ebp - movl %esp, %ebp pushl %esi pushl %edi - movl 8(%ebp), %esi # source + LOAD_PIC_REG(bp) + + movl 16(%esp), %esi # source # column 0 @@ -779,7 +789,7 @@ _dv_dct_248_block_mmx: paddw %mm1, %mm0 # v20: v10+v11 psubw %mm1, %mm3 # v21: v10-v11 - pmulhw WA1, %mm5 # v32~: WA1*v22 + pmulhw MUNG(WA1), %mm5 # v32~: WA1*v22 movq %mm4, %mm2 psllw $16-NSHIFT, %mm5 # v32: compensate the coeffient scale @@ -818,7 +828,7 @@ _dv_dct_248_block_mmx: paddw %mm1, %mm0 # v20: v10+v11 psubw %mm1, %mm3 # v21: v10-v11 - pmulhw WA1, %mm5 # v32~: WA1*v22 + pmulhw MUNG(WA1), %mm5 # v32~: WA1*v22 movq %mm4, %mm2 psllw $16-NSHIFT, %mm5 # v32: compensate the coeffient scale @@ -855,7 +865,7 @@ _dv_dct_248_block_mmx: paddw %mm1, %mm0 # v20: v10+v11 psubw %mm1, %mm3 # v21: v10-v11 - pmulhw WA1, %mm5 # v32~: WA1*v22 + pmulhw MUNG(WA1), %mm5 # v32~: WA1*v22 movq %mm4, %mm2 psllw $16-NSHIFT, %mm5 # v32: compensate the coeffient scale @@ -892,7 +902,7 @@ _dv_dct_248_block_mmx: paddw %mm1, %mm0 # v20: v10+v11 psubw %mm1, %mm3 # v21: v10-v11 - pmulhw WA1, %mm5 # v32~: WA1*v22 + pmulhw MUNG(WA1), %mm5 # v32~: WA1*v22 movq %mm4, %mm2 psllw $16-NSHIFT, %mm5 # v32: compensate the coeffient scale --- libdv-0.104-old/libdv/dv.c +++ libdv-0.104/libdv/dv.c @@ -205,6 +205,9 @@ dv_reconfigure(int clamp_luma, int clamp } /* dv_reconfigure */ +extern uint8_t dv_quant_offset[4]; +extern uint8_t dv_quant_shifts[22][4]; + static inline void dv_decode_macroblock(dv_decoder_t *dv, dv_macroblock_t *mb, unsigned int quality) { int i; @@ -218,7 +221,7 @@ dv_decode_macroblock(dv_decoder_t *dv, d dv_idct_248 (co248, mb->b[i].coeffs); } else { #if ARCH_X86 - _dv_quant_88_inverse_x86(mb->b[i].coeffs,mb->qno,mb->b[i].class_no); + 
_dv_quant_88_inverse_x86(mb->b[i].coeffs,mb->qno,mb->b[i].class_no,dv_quant_offset,dv_quant_shifts); _dv_idct_88(mb->b[i].coeffs); #elif ARCH_X86_64 _dv_quant_88_inverse_x86_64(mb->b[i].coeffs,mb->qno,mb->b[i].class_no); @@ -250,7 +253,7 @@ dv_decode_video_segment(dv_decoder_t *dv dv_idct_248 (co248, mb->b[b].coeffs); } else { #if ARCH_X86 - _dv_quant_88_inverse_x86(bl->coeffs,mb->qno,bl->class_no); + _dv_quant_88_inverse_x86(bl->coeffs,mb->qno,bl->class_no,dv_quant_offset,dv_quant_shifts); _dv_weight_88_inverse(bl->coeffs); _dv_idct_88(bl->coeffs); #elif ARCH_X86_64 --- libdv-0.104-old/libdv/encode.c +++ libdv-0.104/libdv/encode.c @@ -521,7 +521,8 @@ static void reorder_block(dv_block_t *bl } extern unsigned long _dv_vlc_encode_block_mmx(dv_coeff_t* coeffs, - dv_vlc_entry_t ** out); + dv_vlc_entry_t ** out, + dv_vlc_entry_t * lookup); extern unsigned long _dv_vlc_encode_block_mmx_x86_64(dv_coeff_t* coeffs, dv_vlc_entry_t ** out); @@ -558,7 +559,7 @@ static unsigned long vlc_encode_block(dv #elif ARCH_X86 int num_bits; - num_bits = _dv_vlc_encode_block_mmx(coeffs, &o); + num_bits = _dv_vlc_encode_block_mmx(coeffs, &o, vlc_encode_lookup); emms(); #else int num_bits; @@ -574,7 +575,7 @@ static unsigned long vlc_encode_block(dv return num_bits; } -extern unsigned long _dv_vlc_num_bits_block_x86(dv_coeff_t* coeffs); +extern unsigned long _dv_vlc_num_bits_block_x86(dv_coeff_t* coeffs, unsigned char* lookup); extern unsigned long _dv_vlc_num_bits_block_x86_64(dv_coeff_t* coeffs); extern unsigned long _dv_vlc_num_bits_block(dv_coeff_t* coeffs) @@ -600,7 +601,7 @@ extern unsigned long _dv_vlc_num_bits_bl #elif ARCH_X86_64 return _dv_vlc_num_bits_block_x86_64(coeffs); #else - return _dv_vlc_num_bits_block_x86(coeffs); + return _dv_vlc_num_bits_block_x86(coeffs, vlc_num_bits_lookup); #endif } --- libdv-0.104-old/libdv/encode_x86.S +++ libdv-0.104/libdv/encode_x86.S @@ -23,9 +23,6 @@ * The libdv homepage is http://libdv.sourceforge.net/. 
*/ -.data -ALLONE: .word 1,1,1,1 -VLCADDMASK: .byte 255,0,0,0,255,0,0,0 .section .note.GNU-stack, "", @progbits @@ -45,11 +43,14 @@ _dv_vlc_encode_block_mmx: movl $63, %ecx - movl vlc_encode_lookup, %esi + movl 4+4*4+8(%esp), %esi # vlc_encode_lookup pxor %mm0, %mm0 pxor %mm2, %mm2 - movq VLCADDMASK, %mm1 + pushl $0x000000FF # these four lines + pushl $0x000000FF # load VLCADDMASK + movq (%esp), %mm1 # into %mm1 off the stack + addl $8, %esp # --> no TEXTRELs xorl %ebp, %ebp subl $8, %edx vlc_encode_block_mmx_loop: @@ -121,7 +124,7 @@ _dv_vlc_num_bits_block_x86: addl $2, %edi movl $63, %ecx - movl vlc_num_bits_lookup, %esi + movl 4+4*4+4(%esp), %esi # vlc_num_bits_lookup vlc_num_bits_block_x86_loop: movw (%edi), %ax @@ -579,8 +590,11 @@ _dv_need_dct_248_mmx_rows: paddw %mm5, %mm1 paddw %mm1, %mm0 - - pmaddwd ALLONE, %mm0 + + pushl $0x00010001 # these four lines + pushl $0x00010001 # load ALLONE + pmaddwd (%esp), %mm0 # into %mm0 off the stack + addl $8, %esp # --> no TEXTRELs movq %mm0, %mm1 psrlq $32, %mm1 paddd %mm1, %mm0 --- libdv-0.104-old/libdv/idct_block_mmx.S +++ libdv-0.104/libdv/idct_block_mmx.S @@ -8,17 +8,21 @@ .section .note.GNU-stack, "", @progbits +#include "asm_common.S" + .text .align 4 .global _dv_idct_block_mmx .hidden _dv_idct_block_mmx .type _dv_idct_block_mmx,@function _dv_idct_block_mmx: pushl %ebp - movl %esp,%ebp pushl %esi - leal preSC, %ecx - movl 8(%ebp),%esi /* source matrix */ + + LOAD_PIC_REG(bp) + + leal MUNG(preSC), %ecx + movl 12(%esp),%esi /* source matrix */ /* * column 0: even part @@ -35,7 +41,7 @@ _dv_idct_block_mmx: movq %mm1, %mm2 /* added 11/1/96 */ pmulhw 8*8(%esi),%mm5 /* V8 */ psubsw %mm0, %mm1 /* V16 */ - pmulhw x5a825a825a825a82, %mm1 /* 23170 ->V18 */ + pmulhw MUNG(x5a825a825a825a82), %mm1 /* 23170 ->V18 */ paddsw %mm0, %mm2 /* V17 */ movq %mm2, %mm0 /* duplicate V17 */ psraw $1, %mm2 /* t75=t82 */ @@ -76,7 +82,7 @@ _dv_idct_block_mmx: paddsw %mm0, %mm3 /* V29 ; free mm0 */ movq %mm7, %mm1 /* duplicate V26 */ psraw $1, 
%mm3 /* t91=t94 */ - pmulhw x539f539f539f539f,%mm7 /* V33 */ + pmulhw MUNG(x539f539f539f539f),%mm7 /* V33 */ psraw $1, %mm1 /* t96 */ movq %mm5, %mm0 /* duplicate V2 */ psraw $2, %mm4 /* t85=t87 */ @@ -84,15 +90,15 @@ _dv_idct_block_mmx: psubsw %mm4, %mm0 /* V28 ; free mm4 */ movq %mm0, %mm2 /* duplicate V28 */ psraw $1, %mm5 /* t90=t93 */ - pmulhw x4546454645464546,%mm0 /* V35 */ + pmulhw MUNG(x4546454645464546),%mm0 /* V35 */ psraw $1, %mm2 /* t97 */ movq %mm5, %mm4 /* duplicate t90=t93 */ psubsw %mm2, %mm1 /* V32 ; free mm2 */ - pmulhw x61f861f861f861f8,%mm1 /* V36 */ + pmulhw MUNG(x61f861f861f861f8),%mm1 /* V36 */ psllw $1, %mm7 /* t107 */ paddsw %mm3, %mm5 /* V31 */ psubsw %mm3, %mm4 /* V30 ; free mm3 */ - pmulhw x5a825a825a825a82,%mm4 /* V34 */ + pmulhw MUNG(x5a825a825a825a82),%mm4 /* V34 */ nop psubsw %mm1, %mm0 /* V38 */ psubsw %mm7, %mm1 /* V37 ; free mm7 */ @@ -159,7 +165,7 @@ _dv_idct_block_mmx: psubsw %mm7, %mm1 /* V50 */ pmulhw 8*9(%esi), %mm5 /* V9 */ paddsw %mm7, %mm2 /* V51 */ - pmulhw x5a825a825a825a82, %mm1 /* 23170 ->V52 */ + pmulhw MUNG(x5a825a825a825a82), %mm1 /* 23170 ->V52 */ movq %mm2, %mm6 /* duplicate V51 */ psraw $1, %mm2 /* t138=t144 */ movq %mm3, %mm4 /* duplicate V1 */ @@ -200,11 +206,11 @@ _dv_idct_block_mmx: * even more by doing the correction step in a later stage when the number * is actually multiplied by 16 */ - paddw x0005000200010001, %mm4 + paddw MUNG(x0005000200010001), %mm4 psubsw %mm6, %mm3 /* V60 ; free mm6 */ psraw $1, %mm0 /* t154=t156 */ movq %mm3, %mm1 /* duplicate V60 */ - pmulhw x539f539f539f539f, %mm1 /* V67 */ + pmulhw MUNG(x539f539f539f539f), %mm1 /* V67 */ movq %mm5, %mm6 /* duplicate V3 */ psraw $2, %mm4 /* t148=t150 */ paddsw %mm4, %mm5 /* V61 */ @@ -213,13 +219,13 @@ _dv_idct_block_mmx: psllw $1, %mm1 /* t169 */ paddsw %mm0, %mm5 /* V65 -> result */ psubsw %mm0, %mm4 /* V64 ; free mm0 */ - pmulhw x5a825a825a825a82, %mm4 /* V68 */ + pmulhw MUNG(x5a825a825a825a82), %mm4 /* V68 */ psraw $1, %mm3 /* t158 */ psubsw 
%mm6, %mm3 /* V66 */ movq %mm5, %mm2 /* duplicate V65 */ - pmulhw x61f861f861f861f8, %mm3 /* V70 */ + pmulhw MUNG(x61f861f861f861f8), %mm3 /* V70 */ psllw $1, %mm6 /* t165 */ - pmulhw x4546454645464546, %mm6 /* V69 */ + pmulhw MUNG(x4546454645464546), %mm6 /* V69 */ psraw $1, %mm2 /* t172 */ /* moved from next block */ movq 8*5(%esi), %mm0 /* V56 */ @@ -344,7 +350,7 @@ _dv_idct_block_mmx: * movq 8*13(%esi), %mm4 tmt13 */ psubsw %mm4, %mm3 /* V134 */ - pmulhw x5a825a825a825a82, %mm3 /* 23170 ->V136 */ + pmulhw MUNG(x5a825a825a825a82), %mm3 /* 23170 ->V136 */ movq 8*9(%esi), %mm6 /* tmt9 */ paddsw %mm4, %mm5 /* V135 ; mm4 free */ movq %mm0, %mm4 /* duplicate tmt1 */ @@ -373,17 +379,17 @@ _dv_idct_block_mmx: psubsw %mm7, %mm0 /* V144 */ movq %mm0, %mm3 /* duplicate V144 */ paddsw %mm7, %mm2 /* V147 ; free mm7 */ - pmulhw x539f539f539f539f, %mm0 /* 21407-> V151 */ + pmulhw MUNG(x539f539f539f539f), %mm0 /* 21407-> V151 */ movq %mm1, %mm7 /* duplicate tmt3 */ paddsw %mm5, %mm7 /* V145 */ psubsw %mm5, %mm1 /* V146 ; free mm5 */ psubsw %mm1, %mm3 /* V150 */ movq %mm7, %mm5 /* duplicate V145 */ - pmulhw x4546454645464546, %mm1 /* 17734-> V153 */ + pmulhw MUNG(x4546454645464546), %mm1 /* 17734-> V153 */ psubsw %mm2, %mm5 /* V148 */ - pmulhw x61f861f861f861f8, %mm3 /* 25080-> V154 */ + pmulhw MUNG(x61f861f861f861f8), %mm3 /* 25080-> V154 */ psllw $2, %mm0 /* t311 */ - pmulhw x5a825a825a825a82, %mm5 /* 23170-> V152 */ + pmulhw MUNG(x5a825a825a825a82), %mm5 /* 23170-> V152 */ paddsw %mm2, %mm7 /* V149 ; free mm2 */ psllw $1, %mm1 /* t313 */ nop /* without the nop - freeze here for one clock */ @@ -409,7 +415,7 @@ _dv_idct_block_mmx: paddsw %mm3, %mm6 /* V164 ; free mm3 */ movq %mm4, %mm3 /* duplicate V142 */ psubsw %mm5, %mm4 /* V165 ; free mm5 */ - movq %mm2, scratch7 /* out7 */ + movq %mm2, MUNG(scratch7) /* out7 */ psraw $4, %mm6 psraw $4, %mm4 paddsw %mm5, %mm3 /* V162 */ @@ -420,11 +426,11 @@ _dv_idct_block_mmx: */ movq %mm6, 8*9(%esi) /* out9 */ paddsw %mm1, %mm0 /* V161 
*/ - movq %mm3, scratch5 /* out5 */ + movq %mm3, MUNG(scratch5) /* out5 */ psubsw %mm1, %mm5 /* V166 ; free mm1 */ movq %mm4, 8*11(%esi) /* out11 */ psraw $4, %mm5 - movq %mm0, scratch3 /* out3 */ + movq %mm0, MUNG(scratch3) /* out3 */ movq %mm2, %mm4 /* duplicate V140 */ movq %mm5, 8*13(%esi) /* out13 */ paddsw %mm7, %mm2 /* V160 */ @@ -434,7 +440,7 @@ _dv_idct_block_mmx: /* moved from the next block */ movq 8*3(%esi), %mm7 psraw $4, %mm4 - movq %mm2, scratch1 /* out1 */ + movq %mm2, MUNG(scratch1) /* out1 */ /* moved from the next block */ movq %mm0, %mm1 movq %mm4, 8*15(%esi) /* out15 */ @@ -491,15 +497,15 @@ _dv_idct_block_mmx: paddsw %mm4, %mm3 /* V113 ; free mm4 */ movq %mm0, %mm4 /* duplicate V110 */ paddsw %mm1, %mm2 /* V111 */ - pmulhw x539f539f539f539f, %mm0 /* 21407-> V117 */ + pmulhw MUNG(x539f539f539f539f), %mm0 /* 21407-> V117 */ psubsw %mm1, %mm5 /* V112 ; free mm1 */ psubsw %mm5, %mm4 /* V116 */ movq %mm2, %mm1 /* duplicate V111 */ - pmulhw x4546454645464546, %mm5 /* 17734-> V119 */ + pmulhw MUNG(x4546454645464546), %mm5 /* 17734-> V119 */ psubsw %mm3, %mm2 /* V114 */ - pmulhw x61f861f861f861f8, %mm4 /* 25080-> V120 */ + pmulhw MUNG(x61f861f861f861f8), %mm4 /* 25080-> V120 */ paddsw %mm3, %mm1 /* V115 ; free mm3 */ - pmulhw x5a825a825a825a82, %mm2 /* 23170-> V118 */ + pmulhw MUNG(x5a825a825a825a82), %mm2 /* 23170-> V118 */ psllw $2, %mm0 /* t266 */ movq %mm1, (%esi) /* save V115 */ psllw $1, %mm5 /* t268 */ @@ -517,7 +523,7 @@ _dv_idct_block_mmx: movq %mm6, %mm3 /* duplicate tmt4 */ psubsw %mm0, %mm6 /* V100 */ paddsw %mm0, %mm3 /* V101 ; free mm0 */ - pmulhw x5a825a825a825a82, %mm6 /* 23170 ->V102 */ + pmulhw MUNG(x5a825a825a825a82), %mm6 /* 23170 ->V102 */ movq %mm7, %mm5 /* duplicate tmt0 */ movq 8*8(%esi), %mm1 /* tmt8 */ paddsw %mm1, %mm7 /* V103 */ @@ -551,10 +557,10 @@ _dv_idct_block_mmx: movq 8*2(%esi), %mm3 /* V123 */ paddsw %mm4, %mm7 /* out0 */ /* moved up from next block */ - movq scratch3, %mm0 + movq MUNG(scratch3), %mm0 psraw $4, %mm7 
/* moved up from next block */ - movq scratch5, %mm6 + movq MUNG(scratch5), %mm6 psubsw %mm4, %mm1 /* out14 ; free mm4 */ paddsw %mm3, %mm5 /* out2 */ psraw $4, %mm1 @@ -565,7 +571,7 @@ _dv_idct_block_mmx: movq %mm5, 8*2(%esi) /* out2 ; free mm5 */ psraw $4, %mm2 /* moved up to the prev block */ - movq scratch7, %mm4 + movq MUNG(scratch7), %mm4 /* moved up to the prev block */ psraw $4, %mm0 movq %mm2, 8*12(%esi) /* out12 ; free mm2 */ @@ -579,7 +585,7 @@ _dv_idct_block_mmx: * psraw $4, %mm0 * psraw $4, %mm6 */ - movq scratch1, %mm1 + movq MUNG(scratch1), %mm1 psraw $4, %mm4 movq %mm0, 8*3(%esi) /* out3 */ psraw $4, %mm1 --- libdv-0.104-old/libdv/parse.c +++ libdv-0.104/libdv/parse.c @@ -477,6 +477,13 @@ dv_parse_ac_coeffs(dv_videosegment_t *se exit(0); #endif } /* dv_parse_ac_coeffs */ +#if defined __GNUC__ && __ELF__ +# define dv_strong_hidden_alias(name, aliasname) \ + extern __typeof (name) aliasname __attribute__ ((alias (#name), visibility ("hidden"))) +dv_strong_hidden_alias(dv_parse_ac_coeffs, asm_dv_parse_ac_coeffs); +#else +int asm_dv_parse_ac_coeffs(dv_videosegment_t *seg) { return dv_parse_ac_coeffs(seg); } +#endif /* --------------------------------------------------------------------------- */ --- libdv-0.104-old/libdv/quant.c +++ libdv-0.104/libdv/quant.c @@ -144,7 +144,7 @@ uint8_t dv_quant_offset[4] = { 6,3,0,1 uint32_t dv_quant_248_mul_tab [2] [22] [64]; uint32_t dv_quant_88_mul_tab [2] [22] [64]; -extern void _dv_quant_x86(dv_coeff_t *block,int qno,int klass); +extern void _dv_quant_x86(dv_coeff_t *block,int qno,int klass,uint8_t *dv_quant_offset,uint8_t *dv_quant_shifts); extern void _dv_quant_x86_64(dv_coeff_t *block,int qno,int klass); static void quant_248_inverse_std(dv_coeff_t *block,int qno,int klass,dv_248_coeff_t *co); static void quant_248_inverse_mmx(dv_coeff_t *block,int qno,int klass,dv_248_coeff_t *co); @@ -210,7 +210,7 @@ void _dv_quant(dv_coeff_t *block,int qno _dv_quant_x86_64(block, qno, klass); emms(); #else - 
_dv_quant_x86(block, qno, klass); + _dv_quant_x86(block, qno, klass, dv_quant_offset, dv_quant_shifts); emms(); #endif } --- libdv-0.104-old/libdv/quant.h +++ libdv-0.104/libdv/quant.h @@ -27,7 +27,7 @@ extern void _dv_quant(dv_coeff_t *block, extern void _dv_quant_88_inverse(dv_coeff_t *block,int qno,int klass); extern void (*_dv_quant_248_inverse) (dv_coeff_t *block,int qno,int klass, dv_248_coeff_t *co); -extern void _dv_quant_88_inverse_x86(dv_coeff_t *block,int qno,int klass); +extern void _dv_quant_88_inverse_x86(dv_coeff_t *block,int qno,int klass, uint8_t *offset, uint8_t *shifts); extern void _dv_quant_88_inverse_x86_64(dv_coeff_t *block,int qno,int klass); extern void dv_quant_init (void); #ifdef __cplusplus --- libdv-0.104-old/libdv/quant_x86.S +++ libdv-0.104/libdv/quant_x86.S @@ -71,10 +73,13 @@ _dv_quant_88_inverse_x86: /* pq = dv_quant_shifts[qno + dv_quant_offset[class]]; */ movl ARGn(1),%eax /* qno */ + movl ARGn(3),%ebx /* dv_quant_offset */ + addl ARGn(2),%ebx /* class */ + movzbl (%ebx),%ecx movl ARGn(2),%ebx /* class */ - movzbl dv_quant_offset(%ebx),%ecx addl %ecx,%eax - leal dv_quant_shifts(,%eax,4),%edx /* edx is pq */ + movl ARGn(4),%edx /* dv_quant_shifts */ + leal (%edx,%eax,4),%edx /* edx is pq */ /* extra = (class == 3); */ /* 0 1 2 3 */ @@ -212,11 +219,13 @@ _dv_quant_x86: /* pq = dv_quant_shifts[qno + dv_quant_offset[class]]; */ movl ARGn(1),%eax /* qno */ + movl ARGn(3),%ebx /* offset */ + addl ARGn(2),%ebx /* class */ + movzbl (%ebx),%ecx movl ARGn(2),%ebx /* class */ - - movzbl dv_quant_offset(%ebx),%ecx + movl ARGn(4),%edx /* shifts */ addl %ecx,%eax - leal dv_quant_shifts(,%eax,4),%edx /* edx is pq */ + leal (%edx,%eax,4),%edx /* edx is pq */ /* extra = (class == 3); */ /* 0 1 2 3 */ --- libdv-0.104-old/libdv/rgbtoyuv.S +++ libdv-0.104/libdv/rgbtoyuv.S @@ -41,9 +41,6 @@ #define DV_WIDTH_SHORT_HALF 720 #define DV_WIDTH_BYTE_HALF 360 -.global _dv_rgbtoycb_mmx -# .global yuvtoycb_mmx - .data .align 8 @@ -110,25 +107,26 @@ VR0GR: 
.long 0,0 VBG0B: .long 0,0 #endif - + +#include "asm_common.S" + .section .note.GNU-stack, "", @progbits .text -#define _inPtr 8 -#define _rows 12 -#define _columns 16 -#define _outyPtr 20 -#define _outuPtr 24 -#define _outvPtr 28 +#define _inPtr 24+8 +#define _rows 24+12 +#define _columns 24+16 +#define _outyPtr 24+20 +#define _outuPtr 24+24 +#define _outvPtr 24+28 .global _dv_rgbtoycb_mmx .hidden _dv_rgbtoycb_mmx .type _dv_rgbtoycb_mmx,@function _dv_rgbtoycb_mmx: pushl %ebp - movl %esp, %ebp pushl %eax pushl %ebx pushl %ecx @@ -131,46 +132,47 @@ _dv_rgbtoycb_mmx: pushl %esi pushl %edi - leal ZEROSX, %eax #This section gets around a bug + LOAD_PIC_REG(bp) + + leal MUNG(ZEROSX), %eax #This section gets around a bug movq (%eax), %mm0 #unlikely to persist - movq %mm0, ZEROS - leal OFFSETDX, %eax + movq %mm0, MUNG(ZEROS) + leal MUNG(OFFSETDX), %eax movq (%eax), %mm0 - movq %mm0, OFFSETD - leal OFFSETWX, %eax + movq %mm0, MUNG(OFFSETD) + leal MUNG(OFFSETWX), %eax movq (%eax), %mm0 - movq %mm0, OFFSETW - leal OFFSETBX, %eax + movq %mm0, MUNG(OFFSETW) + leal MUNG(OFFSETBX), %eax movq (%eax), %mm0 - movq %mm0, OFFSETB - leal YR0GRX, %eax + movq %mm0, MUNG(OFFSETB) + leal MUNG(YR0GRX), %eax movq (%eax), %mm0 - movq %mm0, YR0GR - leal YBG0BX, %eax + movq %mm0, MUNG(YR0GR) + leal MUNG(YBG0BX), %eax movq (%eax), %mm0 - movq %mm0, YBG0B - leal UR0GRX, %eax + movq %mm0, MUNG(YBG0B) + leal MUNG(UR0GRX), %eax movq (%eax), %mm0 - movq %mm0, UR0GR - leal UBG0BX, %eax + movq %mm0, MUNG(UR0GR) + leal MUNG(UBG0BX), %eax movq (%eax), %mm0 - movq %mm0, UBG0B - leal VR0GRX, %eax + movq %mm0, MUNG(UBG0B) + leal MUNG(VR0GRX), %eax movq (%eax), %mm0 - movq %mm0, VR0GR - leal VBG0BX, %eax + movq %mm0, MUNG(VR0GR) + leal MUNG(VBG0BX), %eax movq (%eax), %mm0 - movq %mm0, VBG0B - - movl _rows(%ebp), %eax - movl _columns(%ebp), %ebx + movq %mm0, MUNG(VBG0B) + movl _rows(%esp), %eax + movl _columns(%esp), %ebx mull %ebx #number pixels shrl $3, %eax #number of loops movl %eax, %edi #loop counter 
in edi - movl _inPtr(%ebp), %eax - movl _outyPtr(%ebp), %ebx - movl _outuPtr(%ebp), %ecx - movl _outvPtr(%ebp), %edx + movl _inPtr(%esp), %eax + movl _outyPtr(%esp), %ebx + movl _outuPtr(%esp), %ecx + movl _outvPtr(%esp), %edx rgbtoycb_mmx_loop: movq (%eax), %mm1 #load G2R2B1G1R1B0G0R0 pxor %mm6, %mm6 #0 -> mm6 @@ -184,29 +186,29 @@ rgbtoycb_mmx_loop: punpcklbw %mm6, %mm1 #B1G1R1B0 -> mm1 movq %mm0, %mm2 #R1B0G0R0 -> mm2 - pmaddwd YR0GR, %mm0 #yrR1,ygG0+yrR0 -> mm0 + pmaddwd MUNG(YR0GR), %mm0 #yrR1,ygG0+yrR0 -> mm0 movq %mm1, %mm3 #B1G1R1B0 -> mm3 - pmaddwd YBG0B, %mm1 #ybB1+ygG1,ybB0 -> mm1 + pmaddwd MUNG(YBG0B), %mm1 #ybB1+ygG1,ybB0 -> mm1 movq %mm2, %mm4 #R1B0G0R0 -> mm4 - pmaddwd UR0GR, %mm2 #urR1,ugG0+urR0 -> mm2 + pmaddwd MUNG(UR0GR), %mm2 #urR1,ugG0+urR0 -> mm2 movq %mm3, %mm5 #B1G1R1B0 -> mm5 - pmaddwd UBG0B, %mm3 #ubB1+ugG1,ubB0 -> mm3 + pmaddwd MUNG(UBG0B), %mm3 #ubB1+ugG1,ubB0 -> mm3 punpckhbw %mm6, %mm7 # 00G2R2 -> mm7 - pmaddwd VR0GR, %mm4 #vrR1,vgG0+vrR0 -> mm4 + pmaddwd MUNG(VR0GR), %mm4 #vrR1,vgG0+vrR0 -> mm4 paddd %mm1, %mm0 #Y1Y0 -> mm0 - pmaddwd VBG0B, %mm5 #vbB1+vgG1,vbB0 -> mm5 + pmaddwd MUNG(VBG0B), %mm5 #vbB1+vgG1,vbB0 -> mm5 movq 8(%eax), %mm1 #R5B4G4R4B3G3R3B2 -> mm1 paddd %mm3, %mm2 #U1U0 -> mm2 movq %mm1, %mm6 #R5B4G4R4B3G3R3B2 -> mm6 - punpcklbw ZEROS, %mm1 #B3G3R3B2 -> mm1 + punpcklbw MUNG(ZEROS), %mm1 #B3G3R3B2 -> mm1 paddd %mm5, %mm4 #V1V0 -> mm4 movq %mm1, %mm5 #B3G3R3B2 -> mm5 @@ -214,29 +216,29 @@ rgbtoycb_mmx_loop: paddd %mm7, %mm1 #R3B200+00G2R2=R3B2G2R2->mm1 - punpckhbw ZEROS, %mm6 #R5B4G4R3 -> mm6 + punpckhbw MUNG(ZEROS), %mm6 #R5B4G4R3 -> mm6 movq %mm1, %mm3 #R3B2G2R2 -> mm3 - pmaddwd YR0GR, %mm1 #yrR3,ygG2+yrR2 -> mm1 + pmaddwd MUNG(YR0GR), %mm1 #yrR3,ygG2+yrR2 -> mm1 movq %mm5, %mm7 #B3G3R3B2 -> mm7 - pmaddwd YBG0B, %mm5 #ybB3+ygG3,ybB2 -> mm5 + pmaddwd MUNG(YBG0B), %mm5 #ybB3+ygG3,ybB2 -> mm5 psrad $FIXPSHIFT, %mm0 #32-bit scaled Y1Y0 -> mm0 - movq %mm6, TEMP0 #R5B4G4R4 -> TEMP0 + movq %mm6, MUNG(TEMP0) #R5B4G4R4 -> TEMP0 
movq %mm3, %mm6 #R3B2G2R2 -> mm6 - pmaddwd UR0GR, %mm6 #urR3,ugG2+urR2 -> mm6 + pmaddwd MUNG(UR0GR), %mm6 #urR3,ugG2+urR2 -> mm6 psrad $FIXPSHIFT, %mm2 #32-bit scaled U1U0 -> mm2 paddd %mm5, %mm1 #Y3Y2 -> mm1 movq %mm7, %mm5 #B3G3R3B2 -> mm5 - pmaddwd UBG0B, %mm7 #ubB3+ugG3,ubB2 + pmaddwd MUNG(UBG0B), %mm7 #ubB3+ugG3,ubB2 psrad $FIXPSHIFT, %mm1 #32-bit scaled Y3Y2 -> mm1 - pmaddwd VR0GR, %mm3 #vrR3,vgG2+vgR2 + pmaddwd MUNG(VR0GR), %mm3 #vrR3,vgG2+vgR2 packssdw %mm1, %mm0 #Y3Y2Y1Y0 -> mm0 - pmaddwd VBG0B, %mm5 #vbB3+vgG3,vbB2 -> mm5 + pmaddwd MUNG(VBG0B), %mm5 #vbB3+vgG3,vbB2 -> mm5 psrad $FIXPSHIFT, %mm4 #32-bit scaled V1V0 -> mm4 movq 16(%eax), %mm1 #B7G7R7B6G6R6B5G5 -> mm7 @@ -251,58 +253,58 @@ rgbtoycb_mmx_loop: movq %mm7, %mm5 #R7B6G6R6B5G500 -> mm5 psrad $FIXPSHIFT, %mm3 #32-bit scaled V3V2 -> mm3 - paddw OFFSETY, %mm0 + paddw MUNG(OFFSETY), %mm0 movq %mm0, (%ebx) #store Y3Y2Y1Y0 packssdw %mm6, %mm2 #32-bit scaled U3U2U1U0 -> mm2 - movq TEMP0, %mm0 #R5B4G4R4 -> mm0 + movq MUNG(TEMP0), %mm0 #R5B4G4R4 -> mm0 addl $8, %ebx - - punpcklbw ZEROS, %mm7 #B5G500 -> mm7 + + punpcklbw MUNG(ZEROS), %mm7 #B5G500 -> mm7 movq %mm0, %mm6 #R5B4G4R4 -> mm6 - movq %mm2, TEMPU #32-bit scaled U3U2U1U0 -> TEMPU + movq %mm2, MUNG(TEMPU) #32-bit scaled U3U2U1U0 -> TEMPU psrlq $32, %mm0 #00R5B4 -> mm0 paddw %mm0, %mm7 #B5G5R5B4 -> mm7 movq %mm6, %mm2 #B5B4G4R4 -> mm2 - pmaddwd YR0GR, %mm2 #yrR5,ygG4+yrR4 -> mm2 + pmaddwd MUNG(YR0GR), %mm2 #yrR5,ygG4+yrR4 -> mm2 movq %mm7, %mm0 #B5G5R5B4 -> mm0 - pmaddwd YBG0B, %mm7 #ybB5+ygG5,ybB4 -> mm7 + pmaddwd MUNG(YBG0B), %mm7 #ybB5+ygG5,ybB4 -> mm7 packssdw %mm3, %mm4 #32-bit scaled V3V2V1V0 -> mm4 addl $24, %eax #increment RGB count - movq %mm4, TEMPV #(V3V2V1V0)/256 -> mm4 + movq %mm4, MUNG(TEMPV) #(V3V2V1V0)/256 -> mm4 movq %mm6, %mm4 #B5B4G4R4 -> mm4 - pmaddwd UR0GR, %mm6 #urR5,ugG4+urR4 + pmaddwd MUNG(UR0GR), %mm6 #urR5,ugG4+urR4 movq %mm0, %mm3 #B5G5R5B4 -> mm0 - pmaddwd UBG0B, %mm0 #ubB5+ugG5,ubB4 + pmaddwd MUNG(UBG0B), %mm0 
#ubB5+ugG5,ubB4 paddd %mm7, %mm2 #Y5Y4 -> mm2 - pmaddwd VR0GR, %mm4 #vrR5,vgG4+vrR4 -> mm4 + pmaddwd MUNG(VR0GR), %mm4 #vrR5,vgG4+vrR4 -> mm4 pxor %mm7, %mm7 #0 -> mm7 - pmaddwd VBG0B, %mm3 #vbB5+vgG5,vbB4 -> mm3 + pmaddwd MUNG(VBG0B), %mm3 #vbB5+vgG5,vbB4 -> mm3 punpckhbw %mm7, %mm1 #B7G7R7B6 -> mm1 paddd %mm6, %mm0 #U5U4 -> mm0 movq %mm1, %mm6 #B7G7R7B6 -> mm6 - pmaddwd YBG0B, %mm6 #ybB7+ygG7,ybB6 -> mm6 + pmaddwd MUNG(YBG0B), %mm6 #ybB7+ygG7,ybB6 -> mm6 punpckhbw %mm7, %mm5 #R7B6G6R6 -> mm5 movq %mm5, %mm7 #R7B6G6R6 -> mm7 paddd %mm4, %mm3 #V5V4 -> mm3 - pmaddwd YR0GR, %mm5 #yrR7,ygG6+yrR6 -> mm5 + pmaddwd MUNG(YR0GR), %mm5 #yrR7,ygG6+yrR6 -> mm5 movq %mm1, %mm4 #B7G7R7B6 -> mm4 - pmaddwd UBG0B, %mm4 #ubB7+ugG7,ubB6 -> mm4 + pmaddwd MUNG(UBG0B), %mm4 #ubB7+ugG7,ubB6 -> mm4 psrad $FIXPSHIFT, %mm0 #32-bit scaled U5U4 -> mm0 psrad $FIXPSHIFT, %mm2 #32-bit scaled Y5Y4 -> mm2 @@ -310,25 +312,25 @@ rgbtoycb_mmx_loop: paddd %mm5, %mm6 #Y7Y6 -> mm6 movq %mm7, %mm5 #R7B6G6R6 -> mm5 - pmaddwd UR0GR, %mm7 #urR7,ugG6+ugR6 -> mm7 + pmaddwd MUNG(UR0GR), %mm7 #urR7,ugG6+ugR6 -> mm7 psrad $FIXPSHIFT, %mm3 #32-bit scaled V5V4 -> mm3 - pmaddwd VBG0B, %mm1 #vbB7+vgG7,vbB6 -> mm1 + pmaddwd MUNG(VBG0B), %mm1 #vbB7+vgG7,vbB6 -> mm1 psrad $FIXPSHIFT, %mm6 #32-bit scaled Y7Y6 -> mm6 packssdw %mm6, %mm2 #Y7Y6Y5Y4 -> mm2 - pmaddwd VR0GR, %mm5 #vrR7,vgG6+vrR6 -> mm5 + pmaddwd MUNG(VR0GR), %mm5 #vrR7,vgG6+vrR6 -> mm5 paddd %mm4, %mm7 #U7U6 -> mm7 psrad $FIXPSHIFT, %mm7 #32-bit scaled U7U6 -> mm7 - paddw OFFSETY, %mm2 + paddw MUNG(OFFSETY), %mm2 movq %mm2, (%ebx) #store Y7Y6Y5Y4 - movq ALLONE, %mm6 + movq MUNG(ALLONE), %mm6 packssdw %mm7, %mm0 #32-bit scaled U7U6U5U4 -> mm0 - movq TEMPU, %mm4 #32-bit scaled U3U2U1U0 -> mm4 + movq MUNG(TEMPU), %mm4 #32-bit scaled U3U2U1U0 -> mm4 pmaddwd %mm6, %mm0 #U7U6U5U4 averaged -> (U7U6)(U5U4)=UU3 UU2->mm0 pmaddwd %mm6, %mm4 #U3U2U1U0 averaged -> (U3U2)(U1U0)=UU1 UU0->mm4 @@ -338,8 +340,8 @@ rgbtoycb_mmx_loop: psrad $FIXPSHIFT, %mm1 #32-bit scaled V7V6 
-> mm1 psraw $1, %mm4 #divide UU3 UU2 UU1 UU0 by 2 -> mm4 - - movq TEMPV, %mm5 #32-bit scaled V3V2V1V0 -> mm5 + + movq MUNG(TEMPV), %mm5 #32-bit scaled V3V2V1V0 -> mm5 movq %mm4, (%ecx) # store U @@ -422,14 +426,15 @@ _dv_ppm_copy_y_block_mmx: _dv_pgm_copy_y_block_mmx: pushl %ebp - movl %esp, %ebp pushl %esi pushl %edi - - movl 8(%ebp), %edi # dest - movl 12(%ebp), %esi # src - movq OFFSETY, %mm7 + LOAD_PIC_REG(bp) + + movl 16(%esp), %edi # dest + movl 20(%esp), %esi # src + + movq MUNG(OFFSETY), %mm7 pxor %mm6, %mm6 movq (%esi), %mm0 @@ -564,14 +571,15 @@ _dv_pgm_copy_y_block_mmx: _dv_video_copy_y_block_mmx: pushl %ebp - movl %esp, %ebp pushl %esi pushl %edi - - movl 8(%ebp), %edi # dest - movl 12(%ebp), %esi # src - movq OFFSETBX, %mm7 + LOAD_PIC_REG(bp) + + movl 16(%esp), %edi # dest + movl 20(%esp), %esi # src + + movq MUNG(OFFSETBX), %mm7 pxor %mm6, %mm6 movq (%esi), %mm0 @@ -852,16 +864,16 @@ _dv_ppm_copy_pal_c_block_mmx: _dv_pgm_copy_pal_c_block_mmx: pushl %ebp - movl %esp, %ebp pushl %esi pushl %edi pushl %ebx - - movl 8(%ebp), %edi # dest - movl 12(%ebp), %esi # src + LOAD_PIC_REG(bp) + + movl 20(%esp), %edi # dest + movl 24(%esp), %esi # src - movq OFFSETBX, %mm7 + movq MUNG(OFFSETBX), %mm7 pxor %mm6, %mm6 @@ -1000,15 +1014,16 @@ _dv_pgm_copy_pal_c_block_mmx: _dv_video_copy_pal_c_block_mmx: pushl %ebp - movl %esp, %ebp pushl %esi pushl %edi pushl %ebx - - movl 8(%ebp), %edi # dest - movl 12(%ebp), %esi # src - movq OFFSETBX, %mm7 + LOAD_PIC_REG(bp) + + movl 20(%esp), %edi # dest + movl 24(%esp), %esi # src + + movq MUNG(OFFSETBX), %mm7 paddw %mm7, %mm7 pxor %mm6, %mm6 @@ -1095,18 +1112,18 @@ video_copy_pal_c_block_mmx_loop: _dv_ppm_copy_ntsc_c_block_mmx: pushl %ebp - movl %esp, %ebp pushl %esi pushl %edi pushl %ebx - - movl 8(%ebp), %edi # dest - movl 12(%ebp), %esi # src + + LOAD_PIC_REG(bp) + + movl 20(%esp), %edi # dest + movl 24(%esp), %esi # src movl $4, %ebx - movq ALLONE, %mm6 - + movq MUNG(ALLONE), %mm6 ppm_copy_ntsc_c_block_mmx_loop: movq (%esi), 
%mm0 @@ -1168,14 +1187,15 @@ ppm_copy_ntsc_c_block_mmx_loop: _dv_pgm_copy_ntsc_c_block_mmx: pushl %ebp - movl %esp, %ebp pushl %esi pushl %edi - - movl 8(%ebp), %edi # dest - movl 12(%ebp), %esi # src - movq OFFSETBX, %mm7 + LOAD_PIC_REG(bp) + + movl 16(%esp), %edi # dest + movl 20(%esp), %esi # src + + movq MUNG(OFFSETBX), %mm7 paddw %mm7, %mm7 pxor %mm6, %mm6 @@ -1325,15 +1347,16 @@ _dv_pgm_copy_ntsc_c_block_mmx: _dv_video_copy_ntsc_c_block_mmx: pushl %ebp - movl %esp, %ebp pushl %esi pushl %edi pushl %ebx - - movl 8(%ebp), %edi # dest - movl 12(%ebp), %esi # src - movq OFFSETBX, %mm7 + LOAD_PIC_REG(bp) + + movl 20(%esp), %edi # dest + movl 24(%esp), %esi # src + + movq MUNG(OFFSETBX), %mm7 paddw %mm7, %mm7 pxor %mm6, %mm6 --- libdv-0.104-old/libdv/rgbtoyuv_x86_64.S +++ libdv-0.104/libdv/rgbtoyuv_x86_64.S @@ -41,9 +41,6 @@ #define DV_WIDTH_SHORT_HALF 720 #define DV_WIDTH_BYTE_HALF 360 -.global _dv_rgbtoycb_mmx_x86_64 -# .global yuvtoycb_mmx_x86_64 - .data .align 8 --- libdv-0.104-old/libdv/vlc_x86.S +++ libdv-0.104/libdv/vlc_x86.S @@ -1,31 +1,39 @@ #include "asmoff.h" .section .note.GNU-stack, "", @progbits + #include "asm_common.S" .text .align 4 .globl dv_decode_vlc +.globl asm_dv_decode_vlc +.hidden asm_dv_decode_vlc +asm_dv_decode_vlc = dv_decode_vlc + .type dv_decode_vlc,@function dv_decode_vlc: pushl %ebx + pushl %ebp + + LOAD_PIC_REG(bp) - /* Args are at 8(%esp). */ - movl 8(%esp),%eax /* %eax is bits */ - movl 12(%esp),%ebx /* %ebx is maxbits */ + /* Args are at 12(%esp). 
*/ + movl 12(%esp),%eax /* %eax is bits */ + movl 16(%esp),%ebx /* %ebx is maxbits */ andl $0x3f,%ebx /* limit index range STL*/ - movl dv_vlc_class_index_mask(,%ebx,4),%edx + movl MUNG_ARR(dv_vlc_class_index_mask,%ebx,4),%edx andl %eax,%edx - movl dv_vlc_class_index_rshift(,%ebx,4),%ecx + movl MUNG_ARR(dv_vlc_class_index_rshift,%ebx,4),%ecx sarl %cl,%edx - movl dv_vlc_classes(,%ebx,4),%ecx + movl MUNG_ARR(dv_vlc_classes,%ebx,4),%ecx movsbl (%ecx,%edx,1),%edx /* %edx is class */ - movl dv_vlc_index_mask(,%edx,4),%ebx - movl dv_vlc_index_rshift(,%edx,4),%ecx + movl MUNG_ARR(dv_vlc_index_mask,%edx,4),%ebx + movl MUNG_ARR(dv_vlc_index_rshift,%edx,4),%ecx andl %eax,%ebx sarl %cl,%ebx - movl dv_vlc_lookups(,%edx,4),%edx + movl MUNG_ARR(dv_vlc_lookups,%edx,4),%edx movl (%edx,%ebx,4),%edx /* Now %edx holds result, like this: @@ -42,7 +51,7 @@ dv_decode_vlc: movl %edx,%ecx sarl $8,%ecx andl $0xff,%ecx - movl sign_mask(,%ecx,4),%ebx + movl MUNG_ARR(sign_mask,%ecx,4),%ebx andl %ebx,%eax negl %eax sarl $31,%eax @@ -63,14 +72,14 @@ dv_decode_vlc: *result = broken; Note that the 'broken' pattern is all ones (i.e. 0xffffffff) */ - movl 12(%esp),%ebx /* %ebx is maxbits */ + movl 16(%esp),%ebx /* %ebx is maxbits */ subl %ecx,%ebx sbbl %ebx,%ebx orl %ebx,%edx - movl 16(%esp),%eax + movl 20(%esp),%eax movl %edx,(%eax) - + popl %ebp popl %ebx ret @@ -80,21 +89,28 @@ dv_decode_vlc: .type __dv_decode_vlc,@function __dv_decode_vlc: pushl %ebx + pushl %ebp + + LOAD_PIC_REG(bp) - /* Args are at 8(%esp). */ - movl 8(%esp),%eax /* %eax is bits */ + /* Args are at 12(%esp). 
*/ + movl 12(%esp),%eax /* %eax is bits */ movl %eax,%edx /* %edx is class */ andl $0xfe00,%edx sarl $9,%edx +#ifdef __PIC__ + movsbl dv_vlc_class_lookup5@GOTOFF(%ebp,%edx),%edx +#else movsbl dv_vlc_class_lookup5(%edx),%edx - - movl dv_vlc_index_mask(,%edx,4),%ebx - movl dv_vlc_index_rshift(,%edx,4),%ecx +#endif + + movl MUNG_ARR(dv_vlc_index_mask,%edx,4),%ebx + movl MUNG_ARR(dv_vlc_index_rshift,%edx,4),%ecx andl %eax,%ebx sarl %cl,%ebx - movl dv_vlc_lookups(,%edx,4),%edx + movl MUNG_ARR(dv_vlc_lookups,%edx,4),%edx movl (%edx,%ebx,4),%edx /* Now %edx holds result, like this: @@ -112,7 +128,7 @@ __dv_decode_vlc: movl %edx,%ecx sarl $8,%ecx andl $0xff,%ecx - movl sign_mask(,%ecx,4),%ecx + movl MUNG_ARR(sign_mask,%ecx,4),%ecx andl %ecx,%eax negl %eax sarl $31,%eax @@ -127,9 +143,9 @@ __dv_decode_vlc: xorl %eax,%edx subl %eax,%edx - movl 12(%esp),%eax + movl 16(%esp),%eax movl %edx,(%eax) - + popl %ebp popl %ebx ret @@ -140,14 +156,20 @@ void dv_parse_ac_coeffs_pass0(bitstream_ */ .text .align 4 +.globl asm_dv_parse_ac_coeffs_pass0 +.hidden asm_dv_parse_ac_coeffs_pass0 + asm_dv_parse_ac_coeffs_pass0 = dv_parse_ac_coeffs_pass0 + .globl dv_parse_ac_coeffs_pass0 .type dv_parse_ac_coeffs_pass0,@function dv_parse_ac_coeffs_pass0: pushl %ebx pushl %edi pushl %esi pushl %ebp + LOAD_PIC_REG(si) + #define ARGn(N) (20+(4*(N)))(%esp) /* @@ -159,8 +182,10 @@ dv_parse_ac_coeffs_pass0: ebp bl */ movl ARGn(2),%ebp +#ifndef __PIC__ movl ARGn(0),%esi movl bitstream_t_buf(%esi),%esi +#endif movl dv_block_t_offset(%ebp),%edi movl dv_block_t_reorder(%ebp),%ebx @@ -170,7 +195,11 @@ dv_parse_ac_coeffs_pass0: movq dv_block_t_coeffs(%ebp),%mm1 pxor %mm0,%mm0 +#ifdef __PIC__ + pand const_f_0_0_0@GOTOFF(%esi),%mm1 +#else pand const_f_0_0_0,%mm1 +#endif movq %mm1,dv_block_t_coeffs(%ebp) movq %mm0,(dv_block_t_coeffs + 8)(%ebp) movq %mm0,(dv_block_t_coeffs + 16)(%ebp) @@ -191,9 +220,17 @@ dv_parse_ac_coeffs_pass0: readloop: movl %edi,%ecx shrl $3,%ecx +#ifdef __PIC__ + movl ARGn(0),%eax + addl 
bitstream_t_buf(%eax),%ecx + movzbl (%ecx),%eax + movzbl 1(%ecx),%edx + movzbl 2(%ecx),%ecx +#else movzbl (%esi,%ecx,1),%eax movzbl 1(%esi,%ecx,1),%edx movzbl 2(%esi,%ecx,1),%ecx +#endif shll $16,%eax shll $8,%edx orl %ecx,%eax @@ -217,7 +254,11 @@ readloop: /* Attempt to use the shortcut first. If it hits, then this vlc term has been decoded. */ +#ifdef __PIC__ + movl dv_vlc_class1_shortcut@GOTOFF(%esi,%ecx,4),%edx +#else movl dv_vlc_class1_shortcut(,%ecx,4),%edx +#endif test $0x80,%edx je done_decode @@ -228,12 +269,19 @@ readloop: movl %ebx,dv_block_t_reorder(%ebp) /* %eax is bits */ - +#ifdef __PIC__ + movsbl dv_vlc_class_lookup5@GOTOFF(%esi,%ecx),%ecx + + movl dv_vlc_index_mask@GOTOFF(%esi,%ecx,4),%ebx + movl dv_vlc_lookups@GOTOFF(%esi,%ecx,4),%edx + movl dv_vlc_index_rshift@GOTOFF(%esi,%ecx,4),%ecx +#else movsbl dv_vlc_class_lookup5(%ecx),%ecx movl dv_vlc_index_mask(,%ecx,4),%ebx movl dv_vlc_lookups(,%ecx,4),%edx movl dv_vlc_index_rshift(,%ecx,4),%ecx +#endif andl %eax,%ebx sarl %cl,%ebx @@ -256,7 +304,11 @@ readloop: movl %edx,%ecx sarl $8,%ecx andl $0xff,%ecx +#ifdef __PIC__ + movl sign_mask@GOTOFF(%esi,%ecx,4),%ecx +#else movl sign_mask(,%ecx,4),%ecx +#endif andl %ecx,%eax negl %eax sarl $31,%eax @@ -326,10 +378,16 @@ alldone: slowpath: /* slow path: use dv_decode_vlc */; +#ifdef __PIC__ + pushl %esi + leal vlc@GOTOFF(%esi),%esi + xchgl %esi,(%esp) /* last parameter is &vlc */ +#else pushl $vlc /* last parameter is &vlc */ +#endif pushl %edx /* bits_left */ pushl %eax /* bits */ - call dv_decode_vlc + call asm_dv_decode_vlc addl $12,%esp test $0x80,%edx /* If (vlc.run < 0) break */ jne escape @@ -359,6 +417,8 @@ show16: pushl %esi pushl %ebp + LOAD_PIC_REG(si) + #define ARGn(N) (20+(4*(N)))(%esp) movl ARGn(1),%eax /* quality */ @@ -373,7 +434,11 @@ dv_parse_video_segment: jz its_mono movl $6,%ebx its_mono: +#ifdef __PIC__ + movl %ebx,n_blocks@GOTOFF(%esi) +#else movl %ebx,n_blocks +#endif /* * ebx seg/b @@ -384,15 +449,22 @@ its_mono: * ebp bl */ movl 
ARGn(0),%ebx +#ifndef __PIC__ movl dv_videosegment_t_bs(%ebx),%esi movl bitstream_t_buf(%esi),%esi +#endif leal dv_videosegment_t_mb(%ebx),%edi movl $0,%eax movl $0,%ecx macloop: +#ifdef __PIC__ + movl %eax,m@GOTOFF(%esi) + movl %ecx,mb_start@GOTOFF(%esi) +#else movl %eax,m movl %ecx,mb_start +#endif movl ARGn(0),%ebx @@ -400,7 +472,13 @@ macloop: /* mb->qno = bitstream_get(bs,4); */ movl %ecx,%edx shr $3,%edx +#ifdef __PIC__ + movl dv_videosegment_t_bs(%ebx),%ecx + movl bitstream_t_buf(%ecx),%ecx + movzbl 3(%ecx,%edx,1),%edx +#else movzbl 3(%esi,%edx,1),%edx +#endif andl $0xf,%edx movl %edx,dv_macroblock_t_qno(%edi) @@ -411,7 +489,11 @@ macloop: movl %edx,dv_macroblock_t_eob_count(%edi) /* mb->i = (seg->i + dv_super_map_vertical[m]) % (seg->isPAL?12:10); */ +#ifdef __PIC__ + movl dv_super_map_vertical@GOTOFF(%esi,%eax,4),%edx +#else movl dv_super_map_vertical(,%eax,4),%edx +#endif movl dv_videosegment_t_i(%ebx),%ecx addl %ecx,%edx @@ -422,11 +504,20 @@ skarly: andl $1,%ecx shll $5,%ecx /* ecx = (isPAL ? 
32 : 0) */ +#ifdef __PIC__ + leal mod_10@GOTOFF(%esi,%edx),%edx /* &mod_10[edx]: keep the index held in %edx */ + movzbl (%edx,%ecx,1),%edx /* uses mod_12 for PAL */ +#else movzbl mod_10(%edx,%ecx,1),%edx /* uses mod_12 for PAL */ +#endif movl %edx,dv_macroblock_t_i(%edi) /* mb->j = dv_super_map_horizontal[m]; */ +#ifdef __PIC__ + movl dv_super_map_horizontal@GOTOFF(%esi,%eax,4),%edx +#else movl dv_super_map_horizontal(,%eax,4),%edx +#endif movl %edx,dv_macroblock_t_j(%edi) /* mb->k = seg->k; */ @@ -445,12 +536,28 @@ blkloop: +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+ */ /* dc = bitstream_get(bs,9); */ +#ifdef __PIC__ + movl mb_start@GOTOFF(%esi),%ecx +#else movl mb_start,%ecx +#endif shr $3,%ecx +#ifdef __PIC__ + movzbl blk_start@GOTOFF(%esi,%ebx),%edx +#else movzbl blk_start(%ebx),%edx +#endif addl %ecx,%edx +#ifdef __PIC__ + movl ARGn(0),%ecx + movl dv_videosegment_t_bs(%ecx),%ecx + movl bitstream_t_buf(%ecx),%ecx + movzbl (%ecx,%edx,1),%eax /* hi byte */ + movzbl 1(%ecx,%edx,1),%ecx /* lo byte */ +#else movzbl (%esi,%edx,1),%eax /* hi byte */ movzbl 1(%esi,%edx,1),%ecx /* lo byte */ +#endif shll $8,%eax orl %ecx,%eax @@ -477,7 +584,11 @@ blkloop: /* bl->reorder = &dv_reorder[bl->dct_mode][1]; */ shll $6,%eax +#ifdef __PIC__ + leal dv_reorder@GOTOFF+1(%esi,%eax),%eax +#else addl $(dv_reorder+1),%eax +#endif movl %eax,dv_block_t_reorder(%ebp) /* bl->reorder_sentinel = bl->reorder + 63; */ @@ -485,13 +596,22 @@ blkloop: movl %eax,dv_block_t_reorder_sentinel(%ebp) /* bl->offset= mb_start + dv_parse_bit_start[b]; */ +#ifdef __PIC__ + movl mb_start@GOTOFF(%esi),%ecx + movl dv_parse_bit_start@GOTOFF(%esi,%ebx,4),%eax +#else movl mb_start,%ecx movl dv_parse_bit_start(,%ebx,4),%eax +#endif addl %ecx,%eax movl %eax,dv_block_t_offset(%ebp) /* bl->end= mb_start + dv_parse_bit_end[b]; */ +#ifdef __PIC__ + movl dv_parse_bit_end@GOTOFF(%esi,%ebx,4),%eax +#else movl dv_parse_bit_end(,%ebx,4),%eax +#endif addl %ecx,%eax movl %eax,dv_block_t_end(%ebp) @@ -503,7 +623,11 @@ blkloop: /* no AC pass.
Just zero out the remaining coeffs */ movq dv_block_t_coeffs(%ebp),%mm1 pxor %mm0,%mm0 +#ifdef __PIC__ + pand const_f_0_0_0@GOTOFF(%esi),%mm1 +#else pand const_f_0_0_0,%mm1 +#endif movq %mm1,dv_block_t_coeffs(%ebp) movq %mm0,(dv_block_t_coeffs + 8)(%ebp) movq %mm0,(dv_block_t_coeffs + 16)(%ebp) @@ -528,18 +652,27 @@ do_ac_pass: pushl %ebp pushl %edi pushl %eax - call dv_parse_ac_coeffs_pass0 + call asm_dv_parse_ac_coeffs_pass0 addl $12,%esp done_ac: +#ifdef __PIC__ + movl n_blocks@GOTOFF(%esi),%eax +#else movl n_blocks,%eax +#endif addl $dv_block_t_size,%ebp incl %ebx cmpl %eax,%ebx jnz blkloop +#ifdef __PIC__ + movl m@GOTOFF(%esi),%eax + movl mb_start@GOTOFF(%esi),%ecx +#else movl m,%eax movl mb_start,%ecx +#endif addl $(8 * 80),%ecx addl $dv_macroblock_t_size,%edi incl %eax @@ -557,7 +690,7 @@ done_ac: andl $DV_QUALITY_AC_MASK,%eax cmpl $DV_QUALITY_AC_2,%eax - jz dv_parse_ac_coeffs + jz asm_dv_parse_ac_coeffs movl $0,%eax ret