Diffstat (limited to 'meta-oe/recipes-graphics/xorg-lib/pixman-0.23.6/0003-ARM-NEON-Some-cleanup-of-bilinear-scanline-functions.patch')
-rw-r--r--  meta-oe/recipes-graphics/xorg-lib/pixman-0.23.6/0003-ARM-NEON-Some-cleanup-of-bilinear-scanline-functions.patch  331
1 files changed, 331 insertions, 0 deletions
diff --git a/meta-oe/recipes-graphics/xorg-lib/pixman-0.23.6/0003-ARM-NEON-Some-cleanup-of-bilinear-scanline-functions.patch b/meta-oe/recipes-graphics/xorg-lib/pixman-0.23.6/0003-ARM-NEON-Some-cleanup-of-bilinear-scanline-functions.patch
new file mode 100644
index 0000000000..338e2ad83e
--- /dev/null
+++ b/meta-oe/recipes-graphics/xorg-lib/pixman-0.23.6/0003-ARM-NEON-Some-cleanup-of-bilinear-scanline-functions.patch
@@ -0,0 +1,331 @@
From ed7580525054e6a543694088c561dee525b4ae28 Mon Sep 17 00:00:00 2001
From: Taekyun Kim <tkq.kim@samsung.com>
Date: Tue, 20 Sep 2011 19:46:25 +0900
Subject: [PATCH 3/8] ARM: NEON: Some cleanup of bilinear scanline functions

Use a single STRIDE register instead of separate TOP and BOTTOM
pointers, and do the initial horizontal weight update before entering
the interpolation loop. Add cache preload for mask and dst.
---
 pixman/pixman-arm-neon-asm-bilinear.S | 128 +++++++++++++++++----------------
 1 files changed, 67 insertions(+), 61 deletions(-)

diff --git a/pixman/pixman-arm-neon-asm-bilinear.S b/pixman/pixman-arm-neon-asm-bilinear.S
index 3c7fe0f..c5ba929 100644
--- a/pixman/pixman-arm-neon-asm-bilinear.S
+++ b/pixman/pixman-arm-neon-asm-bilinear.S
@@ -44,10 +44,6 @@
 * All temp registers can be used freely outside the code block.
 * Assume that symbol(register .req) OUT and MASK are defined at caller of these macro blocks.
 *
- * TODOs
- * Support 0565 pixel format
- * Optimization for two and last pixel cases
- *
 * Remarks
 * There can be lots of pipeline stalls inside code block and between code blocks.
 * Further optimizations will be done by new macro templates using head/tail_head/tail scheme.
@@ -92,21 +88,19 @@ fname:
 */

 .macro bilinear_load_8888 reg1, reg2, tmp
- mov TMP2, X, asr #16
+ mov TMP1, X, asr #16
 add X, X, UX
- add TMP1, TOP, TMP2, asl #2
- add TMP2, BOTTOM, TMP2, asl #2
- vld1.32 {reg1}, [TMP1]
- vld1.32 {reg2}, [TMP2]
+ add TMP1, TOP, TMP1, asl #2
+ vld1.32 {reg1}, [TMP1], STRIDE
+ vld1.32 {reg2}, [TMP1]
 .endm

 .macro bilinear_load_0565 reg1, reg2, tmp
- mov TMP2, X, asr #16
+ mov TMP1, X, asr #16
 add X, X, UX
- add TMP1, TOP, TMP2, asl #1
- add TMP2, BOTTOM, TMP2, asl #1
- vld1.32 {reg2[0]}, [TMP1]
- vld1.32 {reg2[1]}, [TMP2]
+ add TMP1, TOP, TMP1, asl #1
+ vld1.32 {reg2[0]}, [TMP1], STRIDE
+ vld1.32 {reg2[1]}, [TMP1]
 convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp
 .endm

@@ -134,18 +128,16 @@ fname:
 .macro bilinear_load_and_vertical_interpolate_two_0565 \
 acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi

- mov TMP2, X, asr #16
+ mov TMP1, X, asr #16
 add X, X, UX
- mov TMP4, X, asr #16
+ add TMP1, TOP, TMP1, asl #1
+ mov TMP2, X, asr #16
 add X, X, UX
- add TMP1, TOP, TMP2, asl #1
- add TMP2, BOTTOM, TMP2, asl #1
- add TMP3, TOP, TMP4, asl #1
- add TMP4, BOTTOM, TMP4, asl #1
- vld1.32 {acc2lo[0]}, [TMP1]
- vld1.32 {acc2hi[0]}, [TMP3]
- vld1.32 {acc2lo[1]}, [TMP2]
- vld1.32 {acc2hi[1]}, [TMP4]
+ add TMP2, TOP, TMP2, asl #1
+ vld1.32 {acc2lo[0]}, [TMP1], STRIDE
+ vld1.32 {acc2hi[0]}, [TMP2], STRIDE
+ vld1.32 {acc2lo[1]}, [TMP1]
+ vld1.32 {acc2hi[1]}, [TMP2]
 convert_0565_to_x888 acc2, reg3, reg2, reg1
 vzip.u8 reg1, reg3
 vzip.u8 reg2, reg4
@@ -161,34 +153,30 @@ fname:
 xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
 yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi

- mov TMP2, X, asr #16
+ mov TMP1, X, asr #16
 add X, X, UX
- mov TMP4, X, asr #16
+ add TMP1, TOP, TMP1, asl #1
+ mov TMP2, X, asr #16
 add X, X, UX
- add TMP1, TOP, TMP2, asl #1
- add TMP2, BOTTOM, TMP2, asl #1
- add TMP3, TOP, TMP4, asl #1
- add TMP4, BOTTOM, TMP4, asl #1
- vld1.32 {xacc2lo[0]}, [TMP1]
- vld1.32 {xacc2hi[0]}, [TMP3]
- vld1.32 {xacc2lo[1]}, [TMP2]
- vld1.32 {xacc2hi[1]}, [TMP4]
+ add TMP2, TOP, TMP2, asl #1
+ vld1.32 {xacc2lo[0]}, [TMP1], STRIDE
+ vld1.32 {xacc2hi[0]}, [TMP2], STRIDE
+ vld1.32 {xacc2lo[1]}, [TMP1]
+ vld1.32 {xacc2hi[1]}, [TMP2]
 convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1
- mov TMP2, X, asr #16
+ mov TMP1, X, asr #16
 add X, X, UX
- mov TMP4, X, asr #16
+ add TMP1, TOP, TMP1, asl #1
+ mov TMP2, X, asr #16
 add X, X, UX
- add TMP1, TOP, TMP2, asl #1
- add TMP2, BOTTOM, TMP2, asl #1
- add TMP3, TOP, TMP4, asl #1
- add TMP4, BOTTOM, TMP4, asl #1
- vld1.32 {yacc2lo[0]}, [TMP1]
+ add TMP2, TOP, TMP2, asl #1
+ vld1.32 {yacc2lo[0]}, [TMP1], STRIDE
 vzip.u8 xreg1, xreg3
- vld1.32 {yacc2hi[0]}, [TMP3]
+ vld1.32 {yacc2hi[0]}, [TMP2], STRIDE
 vzip.u8 xreg2, xreg4
- vld1.32 {yacc2lo[1]}, [TMP2]
+ vld1.32 {yacc2lo[1]}, [TMP1]
 vzip.u8 xreg3, xreg4
- vld1.32 {yacc2hi[1]}, [TMP4]
+ vld1.32 {yacc2hi[1]}, [TMP2]
 vzip.u8 xreg1, xreg2
 convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1
 vmull.u8 xacc1, xreg1, d28
@@ -252,6 +240,7 @@ fname:
 .else
 .error bilinear_load_mask_8 numpix is unsupported
 .endif
+ pld [MASK, #prefetch_offset]
 .endm

 .macro bilinear_load_mask mask_fmt, numpix, mask
@@ -279,6 +268,7 @@ fname:
 .else
 .error bilinear_load_dst_8888 numpix is unsupported
 .endif
+ pld [OUT, #(prefetch_offset * 4)]
 .endm

 .macro bilinear_load_dst_8888_over numpix, dst0, dst1, dst01
@@ -303,7 +293,7 @@ fname:
 * For two pixel case
 * (r0, r1, x, x, g0, g1, x, x) x (m0, m1, m0, m1, m0, m1, m0, m1)
 * (b0, b1, x, x, a0, a1, x, x) x (m0, m1, m0, m1, m0, m1, m0, m1)
- * We can do some optimizations for this including one pixel cases.
+ * We can do some optimizations for this including last pixel cases.
 */
 .macro bilinear_duplicate_mask_x numpix, mask
 .endm
@@ -497,8 +487,7 @@ fname:
 bilinear_load_dst dst_fmt, op, 1, d18, d19, q9
 vmull.u8 q1, d0, d28
 vmlal.u8 q1, d1, d29
- vshr.u16 d30, d24, #8
- /* 4 cycles bubble */
+ /* 5 cycles bubble */
 vshll.u16 q0, d2, #8
 vmlsl.u16 q0, d2, d30
 vmlal.u16 q0, d3, d30
@@ -525,18 +514,18 @@ fname:
 q1, q11, d0, d1, d20, d21, d22, d23
 bilinear_load_mask mask_fmt, 2, d4
 bilinear_load_dst dst_fmt, op, 2, d18, d19, q9
- vshr.u16 q15, q12, #8
- vadd.u16 q12, q12, q13
 vshll.u16 q0, d2, #8
 vmlsl.u16 q0, d2, d30
 vmlal.u16 q0, d3, d30
 vshll.u16 q10, d22, #8
 vmlsl.u16 q10, d22, d31
 vmlal.u16 q10, d23, d31
- vshrn.u32 d30, q0, #16
- vshrn.u32 d31, q10, #16
+ vshrn.u32 d0, q0, #16
+ vshrn.u32 d1, q10, #16
 bilinear_duplicate_mask mask_fmt, 2, d4
- vmovn.u16 d0, q15
+ vshr.u16 q15, q12, #8
+ vadd.u16 q12, q12, q13
+ vmovn.u16 d0, q0
 bilinear_interleave_src_dst \
 mask_fmt, op, 2, d0, d1, q0, d18, d19, q9
 bilinear_apply_mask_to_src \
@@ -554,8 +543,7 @@ fname:
 q1, q11, d0, d1, d20, d21, d22, d23 \
 q3, q9, d4, d5, d16, d17, d18, d19
 pld [TMP1, PF_OFFS]
- vshr.u16 q15, q12, #8
- vadd.u16 q12, q12, q13
+ sub TMP1, TMP1, STRIDE
 vshll.u16 q0, d2, #8
 vmlsl.u16 q0, d2, d30
 vmlal.u16 q0, d3, d30
@@ -567,9 +555,9 @@ fname:
 vmlsl.u16 q2, d6, d30
 vmlal.u16 q2, d7, d30
 vshll.u16 q8, d18, #8
- bilinear_load_mask mask_fmt, 4, d30
+ bilinear_load_mask mask_fmt, 4, d22
 bilinear_load_dst dst_fmt, op, 4, d2, d3, q1
- pld [TMP2, PF_OFFS]
+ pld [TMP1, PF_OFFS]
 vmlsl.u16 q8, d18, d31
 vmlal.u16 q8, d19, d31
 vadd.u16 q12, q12, q13
@@ -577,17 +565,19 @@ fname:
 vshrn.u32 d1, q10, #16
 vshrn.u32 d4, q2, #16
 vshrn.u32 d5, q8, #16
- bilinear_duplicate_mask mask_fmt, 4, d30
+ bilinear_duplicate_mask mask_fmt, 4, d22
+ vshr.u16 q15, q12, #8
 vmovn.u16 d0, q0
 vmovn.u16 d1, q2
+ vadd.u16 q12, q12, q13
 bilinear_interleave_src_dst \
 mask_fmt, op, 4, d0, d1, q0, d2, d3, q1
 bilinear_apply_mask_to_src \
- mask_fmt, 4, d0, d1, q0, d30, \
+ mask_fmt, 4, d0, d1, q0, d22, \
 q3, q8, q9, q10
 bilinear_combine \
 op, 4, d0, d1, q0, d2, d3, q1, \
- q3, q8, q9, q10, d22
+ q3, q8, q9, q10, d23
 bilinear_deinterleave_dst mask_fmt, op, 4, d0, d1, q0
 bilinear_store_&dst_fmt 4, q2, q3
 .endm
@@ -610,6 +600,7 @@ pixman_asm_function fname
 PF_OFFS .req r7
 TMP3 .req r8
 TMP4 .req r9
+ STRIDE .req r2

 mov ip, sp
 push {r4, r5, r6, r7, r8, r9}
@@ -617,6 +608,11 @@ pixman_asm_function fname
 ldmia ip, {WB, X, UX, WIDTH}
 mul PF_OFFS, PF_OFFS, UX

+ .set prefetch_offset, prefetch_distance
+
+ sub STRIDE, BOTTOM, TOP
+ .unreq BOTTOM
+
 cmp WIDTH, #0
 ble 3f

@@ -626,6 +622,8 @@ pixman_asm_function fname
 vdup.u8 d29, WB
 vadd.u16 d25, d25, d26
 vadd.u16 q13, q13, q13
+ vshr.u16 q15, q12, #8
+ vadd.u16 q12, q12, q13

 subs WIDTH, WIDTH, #4
 blt 1f
@@ -648,7 +646,6 @@ pixman_asm_function fname

 .unreq OUT
 .unreq TOP
- .unreq BOTTOM
 .unreq WT
 .unreq WB
 .unreq X
@@ -659,6 +656,7 @@ pixman_asm_function fname
 .unreq PF_OFFS
 .unreq TMP3
 .unreq TMP4
+ .unreq STRIDE
 .endfunc

 .endm
@@ -682,6 +680,7 @@ pixman_asm_function fname
 PF_OFFS .req r8
 TMP3 .req r9
 TMP4 .req r10
+ STRIDE .req r3

 mov ip, sp
 push {r4, r5, r6, r7, r8, r9, r10, ip}
@@ -689,6 +688,11 @@ pixman_asm_function fname
 ldmia ip, {WT, WB, X, UX, WIDTH}
 mul PF_OFFS, PF_OFFS, UX

+ .set prefetch_offset, prefetch_distance
+
+ sub STRIDE, BOTTOM, TOP
+ .unreq BOTTOM
+
 cmp WIDTH, #0
 ble 3f

@@ -698,6 +702,8 @@ pixman_asm_function fname
 vdup.u8 d29, WB
 vadd.u16 d25, d25, d26
 vadd.u16 q13, q13, q13
+ vshr.u16 q15, q12, #8
+ vadd.u16 q12, q12, q13

 subs WIDTH, WIDTH, #4
 blt 1f
@@ -720,7 +726,6 @@ pixman_asm_function fname

 .unreq OUT
 .unreq TOP
- .unreq BOTTOM
 .unreq WT
 .unreq WB
 .unreq X
@@ -732,6 +737,7 @@ pixman_asm_function fname
 .unreq PF_OFFS
 .unreq TMP3
 .unreq TMP4
+ .unreq STRIDE
 .endfunc

 .endm
--
1.6.6.1

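For orientation, here is a minimal scalar C sketch of the computation the NEON scanline code above vectorizes, and of the STRIDE trick the patch introduces: with STRIDE = BOTTOM - TOP, a single post-incremented pointer reaches both source rows, which is why the BOTTOM register can be freed. This is illustrative only and not part of the patch; the function name and signature are hypothetical, modeled on pixman's 16.16 fixed-point x/ux and 8-bit vertical weights with wt + wb == 256.

#include <stdint.h>
#include <stddef.h>

/* Scalar model of one a8r8g8b8 bilinear scanline pass (sketch only). */
static void
bilinear_scanline_8888 (uint32_t       *out,
                        const uint32_t *top,
                        ptrdiff_t       stride,  /* BOTTOM - TOP, in pixels */
                        int32_t         x,       /* 16.16 fixed-point position */
                        int32_t         ux,      /* 16.16 fixed-point step */
                        unsigned        wt,      /* top weight, wt + wb == 256 */
                        unsigned        wb,      /* bottom weight */
                        int             width)
{
    while (width--)
    {
        const uint32_t *t = top + (x >> 16);       /* mov TMP1, X, asr #16      */
        const uint32_t *b = t + stride;            /* vld1.32 ..., [TMP1], STRIDE */
        unsigned wr = ((uint32_t) x >> 8) & 0xff;  /* vshr.u16 q15, q12, #8     */
        unsigned wl = 256 - wr;
        uint32_t d = 0;

        x += ux;                                   /* vadd.u16 q12, q12, q13    */

        /* Per 8-bit channel: vertical lerp (vmull.u8/vmlal.u8), then
         * horizontal lerp with 8 more fractional bits
         * (vshll.u16/vmlsl.u16/vmlal.u16), narrowed by vshrn.u32 #16. */
        for (int shift = 0; shift < 32; shift += 8)
        {
            unsigned tl = (t[0] >> shift) & 0xff, tr = (t[1] >> shift) & 0xff;
            unsigned bl = (b[0] >> shift) & 0xff, br = (b[1] >> shift) & 0xff;
            unsigned left  = tl * wt + bl * wb;
            unsigned right = tr * wt + br * wb;
            d |= (((left * wl + right * wr) >> 16) & 0xffu) << shift;
        }
        *out++ = d;
    }
}

In this model, the hoisted vshr.u16/vadd.u16 pair added to the init sections corresponds to computing wr and advancing x for the first pixels before the loop is entered, so that inside the loop the weight update for the next iteration can overlap the in-flight loads instead of stalling behind them.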
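The commit message's "cache preload for mask and dst" maps to the added pld instructions in bilinear_load_mask_8 and bilinear_load_dst_8888. A rough C analogue, illustrative only (prefetch_offset is the patch's assembler symbol; __builtin_prefetch is a GCC/Clang builtin, not pixman API):

/* Hint the caches a few pixels ahead of the current mask read and
 * dst write positions, as the added pld [MASK, #prefetch_offset] and
 * pld [OUT, #(prefetch_offset * 4)] do. */
static inline void
preload_mask_and_dst (const uint8_t *mask, const uint32_t *out,
                      int prefetch_offset)
{
    __builtin_prefetch (mask + prefetch_offset);     /* 8-bit mask scanline  */
    __builtin_prefetch (out + prefetch_offset, 1);   /* 32-bit dst, for write */
}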