Upstream: yes
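This fixes 32-bit builds with PIC enabled: position-independent x86-32
code cannot encode absolute memory operands such as [pw_8], so constants
must instead be addressed relative to a base register holding a
runtime-computed address. A minimal sketch of the idiom (labels taken
from the diff below; an illustration only, not part of the upstream change):

    LEA  t1, prep_sse2            ; x86inc macro: on x86-32 PIC it
                                  ; materializes the label's runtime
                                  ; address with a call/pop PC thunk
    mova m0, [t1-prep_sse2+pw_8]  ; pw_8-prep_sse2 is a link-time
                                  ; constant, so the access needs no
                                  ; load-time relocation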
From 464ca6c2f37b93180cc27ea41889ffaf1eab388e Mon Sep 17 00:00:00 2001
From: Henrik Gramner <gramner@twoorioles.com>
Date: Thu, 25 Jun 2020 01:27:28 +0200
Subject: [PATCH] x86: Fix 32-bit build with PIC enabled
---
src/x86/mc_sse.asm | 147 +++++++++++++++++----------------------------
1 file changed, 56 insertions(+), 91 deletions(-)
diff --git a/src/x86/mc_sse.asm b/src/x86/mc_sse.asm
index d98ac621..5d5c5e3f 100644
--- a/src/x86/mc_sse.asm
+++ b/src/x86/mc_sse.asm
@@ -1263,7 +1263,7 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
%if ARCH_X86_64
mova m8, [pw_8]
%else
- %define m8 [pw_8]
+ %define m8 [t1-prep_sse2+pw_8]
%endif
pxor m7, m7
%endif
@@ -1272,13 +1272,11 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
pshuflw m6, m6, q0000
%if cpuflag(ssse3)
punpcklqdq m6, m6
-%else
- %if ARCH_X86_64
+%elif ARCH_X86_64
psrlw m0, m8, 3
punpcklwd m6, m0
- %else
+%else
punpcklwd m6, [base+pw_1]
- %endif
%endif
%if ARCH_X86_32
mov t1, t2 ; save base reg for w4
@@ -1396,8 +1394,8 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
PUSH r7
%endif
mov r7, tmpq
+ mov r5, srcq
%endif
- mov t1, srcq
.hv_w16_hloop:
movu m0, [srcq+strideq*0+8*0]
movu m1, [srcq+strideq*0+8*1]
@@ -1440,14 +1438,17 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
sub hd, 2
jg .hv_w16_vloop
movzx hd, t2w
- add t1, 16
- mov srcq, t1
%if ARCH_X86_64
+ add r5, 16
add r7, 2*16
+ mov srcq, r5
mov tmpq, r7
%else
+ mov srcq, srcmp
mov tmpq, tmpmp
+ add srcq, 16
add tmpq, 2*16
+ mov srcmp, srcq
mov tmpmp, tmpq
%endif
sub t2d, 1<<16
@@ -2624,22 +2625,20 @@ cglobal put_8tap, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
%macro PHADDW 4 ; dst, src, pw_1/tmp, load_pw_1
%if cpuflag(ssse3)
phaddw %1, %2
- %else
- %ifnidn %1, %2
+ %elifnidn %1, %2
%if %4 == 1
- mova %3, [pw_1]
+ mova %3, [base+pw_1]
%endif
pmaddwd %1, %3
pmaddwd %2, %3
packssdw %1, %2
- %else
+ %else
%if %4 == 1
- pmaddwd %1, [pw_1]
+ pmaddwd %1, [base+pw_1]
%else
pmaddwd %1, %3
%endif
packssdw %1, %1
- %endif
%endif
%endmacro
@@ -2795,11 +2794,9 @@ PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH
%if ARCH_X86_32
%define base_reg r2
%define base base_reg-prep%+SUFFIX
- %define W32_RESTORE_SSQ mov strideq, stridem
%else
%define base_reg r7
%define base 0
- %define W32_RESTORE_SSQ
%endif
cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
%assign org_stack_offset stack_offset
@@ -2834,6 +2831,10 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
WIN64_SPILL_XMM 12
%else
WIN64_SPILL_XMM 16
+%endif
+%if ARCH_X86_32
+ %define strideq r6
+ mov strideq, stridem
%endif
cmp wd, 4
je .h_w4
@@ -2894,7 +2895,6 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
punpcklbw m4, m4
psraw m4, 8
%endif
- W32_RESTORE_SSQ
%if ARCH_X86_64
lea stride3q, [strideq*3]
%endif
@@ -2916,8 +2916,7 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
pshufb m1, m5
pshufb m2, m5
pshufb m3, m5
-%else
- %if ARCH_X86_64
+%elif ARCH_X86_64
movd m0, [srcq+strideq*0+0]
movd m12, [srcq+strideq*0+1]
movd m1, [srcq+strideq*1+0]
@@ -2947,7 +2946,7 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
punpcklqdq m1, m5 ; 1
punpcklqdq m2, m13 ; 2
punpcklqdq m3, m7 ; 3
- %else
+%else
movd m0, [srcq+strideq*0+0]
movd m1, [srcq+strideq*0+1]
movd m2, [srcq+strideq*0+2]
@@ -2978,7 +2977,6 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
lea srcq, [srcq+strideq*2]
punpckldq m7, m5
punpcklqdq m3, m7 ; 3
- %endif
%endif
PMADDUBSW m0, m4, m5, m7, 1 ; subpel_filters + 2
PMADDUBSW m1, m4, m5, m7, 0
@@ -2994,14 +2992,7 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
sub hd, 4
jg .h_w4_loop
RET
- ;
.h_w8:
-%if ARCH_X86_32
- mov r3, r2
- %define base_reg r3
- W32_RESTORE_SSQ
-%endif
-.h_w8_loop:
%if cpuflag(ssse3)
PREP_8TAP_H 0, srcq+strideq*0
PREP_8TAP_H 1, srcq+strideq*1
@@ -3017,51 +3008,42 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
add tmpq, 16
dec hd
%endif
- jg .h_w8_loop
+ jg .h_w8
RET
.h_w16:
- mov r6, -16*1
+ mov r3, -16*1
jmp .h_start
.h_w32:
- mov r6, -16*2
+ mov r3, -16*2
jmp .h_start
.h_w64:
- mov r6, -16*4
+ mov r3, -16*4
jmp .h_start
.h_w128:
- mov r6, -16*8
+ mov r3, -16*8
.h_start:
-%if ARCH_X86_32
- mov r3, r2
- %define base_reg r3
-%endif
- sub srcq, r6
- mov r5, r6
- W32_RESTORE_SSQ
+ sub srcq, r3
+ mov r5, r3
.h_loop:
%if cpuflag(ssse3)
- PREP_8TAP_H 0, srcq+r6+8*0
- PREP_8TAP_H 1, srcq+r6+8*1
+ PREP_8TAP_H 0, srcq+r3+8*0
+ PREP_8TAP_H 1, srcq+r3+8*1
mova [tmpq+16*0], m0
mova [tmpq+16*1], m1
add tmpq, 32
- add r6, 16
+ add r3, 16
%else
- PREP_8TAP_H 0, srcq+r6
+ PREP_8TAP_H 0, srcq+r3
mova [tmpq], m0
add tmpq, 16
- add r6, 8
+ add r3, 8
%endif
jl .h_loop
add srcq, strideq
- mov r6, r5
+ mov r3, r5
dec hd
jg .h_loop
RET
-%if ARCH_X86_32
- %define base_reg r2
-%endif
- ;
.v:
LEA base_reg, prep%+SUFFIX
%if ARCH_X86_32
@@ -3086,7 +3068,7 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
%define subpel1 [rsp+mmsize*1]
%define subpel2 [rsp+mmsize*2]
%define subpel3 [rsp+mmsize*3]
-%assign regs_used 2 ; use r1 (src) as tmp for stack alignment if needed
+%assign regs_used 6 ; use r5 (mx) as tmp for stack alignment if needed
%if cpuflag(ssse3)
ALLOC_STACK -mmsize*4
%else
@@ -3105,15 +3087,9 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
movd m0, [myq+6]
PSHUFB_0X1X m0, m2
mova subpel3, m0
- %if notcpuflag(ssse3)
- mov r6, base_reg
- %define base_reg r6
- %endif
- mov strideq, [rstk+stack_offset+gprsize*3]
- lea strideq, [strideq*3]
- sub [rstk+stack_offset+gprsize*2], strideq
mov strideq, [rstk+stack_offset+gprsize*3]
- mov srcq, [rstk+stack_offset+gprsize*2]
+ lea r5, [strideq*3]
+ sub srcq, r5
%else
%define subpel0 m8
%define subpel1 m9
@@ -3245,10 +3221,6 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
jg .v_w4_loop0
%endif
RET
-%if ARCH_X86_32 && notcpuflag(ssse3)
- %define base_reg r2
-%endif
- ;
%if ARCH_X86_64
.v_w8:
lea r5d, [wq - 8] ; horizontal loop
@@ -3373,16 +3345,12 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
cmp hd, 6
cmovs myd, mxd
movq m0, [base_reg+myq*8+subpel_filters-prep%+SUFFIX]
- mov r5, r2; use as new base
- %define base_reg r5
- %assign regs_used 2
+ mov strideq, stridem
+ %assign regs_used 6
ALLOC_STACK -mmsize*14
%assign regs_used 7
- mov strideq, [rstk+stack_offset+gprsize*3]
- lea strideq, [strideq*3 + 1]
- sub [rstk+stack_offset+gprsize*2], strideq
- mov strideq, [rstk+stack_offset+gprsize*3]
- mov srcq, [rstk+stack_offset+gprsize*2]
+ lea r5, [strideq*3+1]
+ sub srcq, r5
%define subpelv0 [rsp+mmsize*0]
%define subpelv1 [rsp+mmsize*1]
%define subpelv2 [rsp+mmsize*2]
@@ -3445,9 +3413,9 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
%define hv4_line_1_3 13
%if ARCH_X86_32
%if cpuflag(ssse3)
- %define w8192reg [base+pw_8192]
+ %define w8192reg [base+pw_8192]
%else
- %define w8192reg [base+pw_2]
+ %define w8192reg [base+pw_2]
%endif
%define d32reg [base+pd_32]
%else
@@ -3676,7 +3644,6 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
%define hv8_line_6 4
shr mxd, 16
%if ARCH_X86_32
- %define base_reg r2
%define subpelh0 [rsp+mmsize*5]
%define subpelh1 [rsp+mmsize*6]
%define subpelv0 [rsp+mmsize*7]
@@ -3692,16 +3659,16 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
cmp hd, 6
cmovs myd, mxd
movq m5, [base_reg+myq*8+subpel_filters-prep%+SUFFIX]
- ALLOC_STACK -mmsize*13
+ mov strideq, stridem
+ %assign regs_used 6
+ ALLOC_STACK -mmsize*14
+ %assign regs_used 7
%if STACK_ALIGNMENT < mmsize
- mov rstk, r2m
- %define tmpm [rsp+mmsize*13+gprsize*1]
- %define srcm [rsp+mmsize*13+gprsize*2]
- %define stridem [rsp+mmsize*13+gprsize*3]
- mov stridem, rstk
+ %define tmpm [rsp+mmsize*13+gprsize*1]
+ %define srcm [rsp+mmsize*13+gprsize*2]
+ %define stridem [rsp+mmsize*13+gprsize*3]
+ mov stridem, strideq
%endif
- mov r6, r2
- %define base_reg r6
pshufd m0, m1, q0000
pshufd m1, m1, q1111
punpcklbw m5, m5
@@ -3724,12 +3691,9 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
mova subpelv1, m3
mova subpelv2, m4
mova subpelv3, m5
- W32_RESTORE_SSQ
- lea strided, [strided*3]
- sub srcd, strided
- sub srcd, 3
- mov srcm, srcd
- W32_RESTORE_SSQ
+ lea r5, [strideq*3+3]
+ sub srcq, r5
+ mov srcm, srcq
%else
ALLOC_STACK mmsize*5, 16
%define subpelh0 m10
@@ -3765,7 +3729,7 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
%if notcpuflag(ssse3)
mova m7, [base+pw_2]
%endif
- lea stride3q, [strideq*3]
+ lea stride3q, [strideq*3]
sub srcq, 3
sub srcq, stride3q
mov r6, srcq
@@ -3939,11 +3903,12 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
.hv_w8_outer:
movzx hd, r5w
%if ARCH_X86_32
- add dword tmpm, 8
- mov tmpq, tmpm
mov srcq, srcm
+ mov tmpq, tmpm
add srcq, 4
+ add tmpq, 8
mov srcm, srcq
+ mov tmpm, tmpq
%else
add r8, 8
mov tmpq, r8
--
2.26.2