summaryrefslogtreecommitdiffstats
path: root/meta-oe/recipes-graphics/xorg-lib/pixman-0.23.6/0007-ARM-NEON-Instruction-scheduling-of-bilinear-over_888.patch
diff options
context:
space:
mode:
Diffstat (limited to 'meta-oe/recipes-graphics/xorg-lib/pixman-0.23.6/0007-ARM-NEON-Instruction-scheduling-of-bilinear-over_888.patch')
-rw-r--r--meta-oe/recipes-graphics/xorg-lib/pixman-0.23.6/0007-ARM-NEON-Instruction-scheduling-of-bilinear-over_888.patch206
1 files changed, 206 insertions, 0 deletions
diff --git a/meta-oe/recipes-graphics/xorg-lib/pixman-0.23.6/0007-ARM-NEON-Instruction-scheduling-of-bilinear-over_888.patch b/meta-oe/recipes-graphics/xorg-lib/pixman-0.23.6/0007-ARM-NEON-Instruction-scheduling-of-bilinear-over_888.patch
new file mode 100644
index 0000000000..e4e741f906
--- /dev/null
+++ b/meta-oe/recipes-graphics/xorg-lib/pixman-0.23.6/0007-ARM-NEON-Instruction-scheduling-of-bilinear-over_888.patch
@@ -0,0 +1,206 @@
1From 94585f9a618821a5c06c3a497902579b4a08b05f Mon Sep 17 00:00:00 2001
2From: Taekyun Kim <tkq.kim@samsung.com>
3Date: Mon, 26 Sep 2011 19:04:53 +0900
4Subject: [PATCH 7/8] ARM: NEON: Instruction scheduling of bilinear over_8888_8_8888
5
6Instructions are reordered to eliminate pipeline stalls and get
7better memory access.
8
9Performance of before/after on cortex-a8 @ 1GHz
10
11<< 2000 x 2000 with scale factor close to 1.x >>
12before : 40.53 Mpix/s
13after : 50.76 Mpix/s
14---
15 pixman/pixman-arm-neon-asm-bilinear.S | 162 ++++++++++++++++++++++++++++++++-
16 1 files changed, 158 insertions(+), 4 deletions(-)
17
18diff --git a/pixman/pixman-arm-neon-asm-bilinear.S b/pixman/pixman-arm-neon-asm-bilinear.S
19index 76937e0..4ab46e1 100644
20--- a/pixman/pixman-arm-neon-asm-bilinear.S
21+++ b/pixman/pixman-arm-neon-asm-bilinear.S
22@@ -949,7 +949,7 @@ pixman_asm_function fname
23 vshrn.u32 d0, q0, #16
24 vshrn.u32 d1, q1, #16
25 vld1.32 {d2, d3}, [OUT, :128]
26- pld [OUT, PF_OFFS]
27+ pld [OUT, #(prefetch_offset * 4)]
28 vshrn.u32 d4, q2, #16
29 vshr.u16 q15, q12, #8
30 vshrn.u32 d5, q3, #16
31@@ -1061,15 +1061,169 @@ pixman_asm_function fname
32 .endm
33
34 .macro bilinear_over_8888_8_8888_process_pixblock_head
35- bilinear_over_8888_8_8888_process_four_pixels
36+ mov TMP1, X, asr #16 /* head: start a four-pixel block. TMP1 = X arithmetically shifted right by 16 (presumably the integer part of a 16.16 coordinate; confirm) */
37+ add X, X, UX /* advance X by UX for the next pixel */
38+ add TMP1, TOP, TMP1, asl #2 /* TMP1 = TOP + TMP1 * 4 */
39+ vld1.32 {d0}, [TMP1], STRIDE /* d0 = 8 bytes at TMP1, then TMP1 += STRIDE */
40+ mov TMP2, X, asr #16 /* same address computation for the second pixel */
41+ add X, X, UX
42+ add TMP2, TOP, TMP2, asl #2
43+ vld1.32 {d1}, [TMP1] /* d1 = 8 bytes one STRIDE after the d0 load */
44+ mov TMP3, X, asr #16 /* third pixel address; its loads are deferred until after the first blend */
45+ add X, X, UX
46+ add TMP3, TOP, TMP3, asl #2
47+ vld1.32 {d2}, [TMP2], STRIDE /* second pixel: first row into d2, TMP2 += STRIDE */
48+ mov TMP4, X, asr #16 /* fourth pixel address */
49+ add X, X, UX
50+ add TMP4, TOP, TMP4, asl #2
51+ vld1.32 {d3}, [TMP2] /* second pixel: next row into d3 */
52+ vmull.u8 q2, d0, d28 /* q2 = d0 * d28, bytes widened to u16 */
53+ vmull.u8 q3, d2, d28
54+ vmlal.u8 q2, d1, d29 /* q2 += d1 * d29. NOTE(review): d28/d29 look like the two row weights; confirm where they are set */
55+ vmlal.u8 q3, d3, d29
56+ vshll.u16 q0, d4, #8 /* q0 = d4 * 256, widened to u32 */
57+ vshll.u16 q1, d6, #8
58+ vmlsl.u16 q0, d4, d30 /* q0 -= d4 * d30 */
59+ vmlsl.u16 q1, d6, d31
60+ vmlal.u16 q0, d5, d30 /* q0 += d5 * d30, so q0 = d4 * (256 - d30) + d5 * d30 */
61+ vmlal.u16 q1, d7, d31
62+ vshrn.u32 d0, q0, #16 /* narrow to u16 after shifting right by 16; pixel 1 result in d0 */
63+ vshrn.u32 d1, q1, #16 /* pixel 2 result in d1 */
64+ vld1.32 {d2}, [TMP3], STRIDE /* third pixel: two rows into d2 and d3 */
65+ vld1.32 {d3}, [TMP3]
66+ pld [TMP4, PF_OFFS] /* prefetch hint at TMP4 + PF_OFFS, before TMP4 is advanced */
67+ vld1.32 {d4}, [TMP4], STRIDE /* fourth pixel: two rows into d4 and d5 */
68+ vld1.32 {d5}, [TMP4]
69+ pld [TMP4, PF_OFFS] /* same hint again, now that TMP4 has moved on by STRIDE */
70+ vmull.u8 q3, d2, d28 /* row blend for pixel 3, left in q3 (d6, d7) for the tail macro */
71+ vmlal.u8 q3, d3, d29
72+ vmull.u8 q1, d4, d28 /* row blend for pixel 4, left in q1 (d2, d3) for the tail macro */
73+ vmlal.u8 q1, d5, d29
74+ vshr.u16 q15, q12, #8 /* q15 = q12 shifted right by 8; refreshes d30/d31 for the multiplies in the tail macro */
75+ vld1.32 {d22[0]}, [MASK]! /* load 32 bits from MASK into lane 0 of d22; MASK is post-incremented */
76+ pld [MASK, #prefetch_offset] /* prefetch hint ahead in the mask */
77+ vadd.u16 q12, q12, q13 /* q12 += q13 */
78+ vmovn.u16 d16, q0 /* narrow the u16 lanes of q0 (d0, d1) to bytes in d16 */
79 .endm
80
81 .macro bilinear_over_8888_8_8888_process_pixblock_tail
82+ vshll.u16 q9, d6, #8 /* tail: finish pixels 3 and 4 from q3 (d6, d7) and q1 (d2, d3) left by the head macro; q9 = d6 * 256 */
83+ vshll.u16 q10, d2, #8
84+ vmlsl.u16 q9, d6, d30 /* q9 -= d6 * d30 */
85+ vmlsl.u16 q10, d2, d31
86+ vmlal.u16 q9, d7, d30 /* q9 += d7 * d30, so q9 = d6 * (256 - d30) + d7 * d30 */
87+ vmlal.u16 q10, d3, d31
88+ vshr.u16 q15, q12, #8 /* q15 = q12 shifted right by 8; d30/d31 are no longer needed by this block */
89+ vadd.u16 q12, q12, q13 /* q12 += q13 */
90+ vdup.32 d22, d22[0] /* copy lane 0 of d22 (the 32 bits loaded from MASK in head) into both lanes */
91+ vshrn.u32 d18, q9, #16 /* narrow to u16 after shifting right by 16 */
92+ vshrn.u32 d19, q10, #16
93+ vmovn.u16 d17, q9 /* q9 is d18:d19; narrow its u16 lanes to bytes in d17 */
94+ vld1.32 {d18, d19}, [OUT, :128] /* load 16 bytes from OUT (128-bit alignment hint) into d18, d19 */
95+ pld [OUT, PF_OFFS] /* NOTE(review): the other OUT prefetches in this patch use #(prefetch_offset * 4); confirm PF_OFFS is intended here */
96+ vuzp.8 d16, d17 /* two unzip passes per register pair regroup the bytes of d16:d17 and d18:d19 */
97+ vuzp.8 d18, d19
98+ vuzp.8 d16, d17
99+ vuzp.8 d18, d19
100+ vmull.u8 q10, d16, d22 /* multiply the d16/d17 bytes by the d22 bytes, widened to u16 */
101+ vmull.u8 q11, d17, d22
102+ vrsra.u16 q10, q10, #8 /* q10 += rounded (q10 shifted right by 8) */
103+ vrsra.u16 q11, q11, #8
104+ vrshrn.u16 d16, q10, #8 /* rounded narrowing shift by 8; with the step above this looks like the usual rounded divide by 255 */
105+ vrshrn.u16 d17, q11, #8
106+ vdup.32 d22, d17[1] /* copy the upper 32 bits of d17 into both lanes of d22 (presumably the alpha bytes; confirm) */
107+ vmvn.8 d22, d22 /* bitwise NOT of every byte, i.e. 255 - value */
108+ vmull.u8 q10, d18, d22 /* multiply the bytes loaded from OUT by the inverted bytes */
109+ vmull.u8 q11, d19, d22
110+ vrshr.u16 q9, q10, #8 /* q9 = rounded (q10 shifted right by 8) */
111+ vrshr.u16 q0, q11, #8 /* q0 is used as scratch here */
112+ vraddhn.u16 d18, q9, q10 /* rounded high half of (q9 + q10); completes the second rounded normalisation */
113+ vraddhn.u16 d19, q0, q11
114+ vqadd.u8 q9, q8, q9 /* saturating byte add of q8 (d16, d17) and q9 (d18, d19) */
115+ vuzp.8 d18, d19 /* two more unzip passes on the result before the store; presumably restores the packed pixel order (confirm) */
116+ vuzp.8 d18, d19
117+ vst1.32 {d18, d19}, [OUT, :128]! /* store 16 bytes to OUT; OUT is post-incremented */
118 .endm
119
120 .macro bilinear_over_8888_8_8888_process_pixblock_tail_head
121- bilinear_over_8888_8_8888_process_pixblock_tail
122- bilinear_over_8888_8_8888_process_pixblock_head
123+ vshll.u16 q9, d6, #8 /* [tail] this macro interleaves the tail of the current block with the head of the next one; q9 = d6 * 256 */
124+ mov TMP1, X, asr #16 /* [head] TMP1 = X arithmetically shifted right by 16 */
125+ add X, X, UX /* [head] advance X by UX */
126+ add TMP1, TOP, TMP1, asl #2 /* [head] TMP1 = TOP + TMP1 * 4 */
127+ vshll.u16 q10, d2, #8 /* [tail] */
128+ vld1.32 {d0}, [TMP1], STRIDE /* [head] d0 = 8 bytes at TMP1, then TMP1 += STRIDE */
129+ mov TMP2, X, asr #16 /* [head] second pixel address */
130+ add X, X, UX
131+ add TMP2, TOP, TMP2, asl #2
132+ vmlsl.u16 q9, d6, d30 /* [tail] q9 -= d6 * d30 */
133+ vmlsl.u16 q10, d2, d31 /* [tail] */
134+ vld1.32 {d1}, [TMP1] /* [head] d1 = 8 bytes one STRIDE after the d0 load */
135+ mov TMP3, X, asr #16 /* [head] third pixel address */
136+ add X, X, UX
137+ add TMP3, TOP, TMP3, asl #2
138+ vmlal.u16 q9, d7, d30 /* [tail] q9 += d7 * d30 */
139+ vmlal.u16 q10, d3, d31 /* [tail] last read of d2/d3 before the head loads below overwrite them */
140+ vld1.32 {d2}, [TMP2], STRIDE /* [head] */
141+ mov TMP4, X, asr #16 /* [head] fourth pixel address */
142+ add X, X, UX
143+ add TMP4, TOP, TMP4, asl #2
144+ vshr.u16 q15, q12, #8 /* [tail] q15 = q12 shifted right by 8 */
145+ vadd.u16 q12, q12, q13 /* [tail] q12 += q13 */
146+ vld1.32 {d3}, [TMP2] /* [head] */
147+ vdup.32 d22, d22[0] /* [tail] copy lane 0 of d22 into both lanes */
148+ vshrn.u32 d18, q9, #16 /* [tail] narrow to u16 after shifting right by 16 */
149+ vshrn.u32 d19, q10, #16 /* [tail] */
150+ vmull.u8 q2, d0, d28 /* [head] q2 = d0 * d28, bytes widened to u16 */
151+ vmull.u8 q3, d2, d28 /* [head] */
152+ vmovn.u16 d17, q9 /* [tail] narrow the u16 lanes of q9 (d18, d19) to bytes in d17 */
153+ vld1.32 {d18, d19}, [OUT, :128] /* [tail] load 16 bytes from OUT (128-bit alignment hint) */
154+ pld [OUT, #(prefetch_offset * 4)] /* [tail] prefetch hint ahead in OUT; the standalone tail macro uses PF_OFFS as the offset instead */
155+ vmlal.u8 q2, d1, d29 /* [head] q2 += d1 * d29 */
156+ vmlal.u8 q3, d3, d29 /* [head] */
157+ vuzp.8 d16, d17 /* [tail] first unzip pass */
158+ vuzp.8 d18, d19 /* [tail] */
159+ vshll.u16 q0, d4, #8 /* [head] q0 = d4 * 256, widened to u32 */
160+ vshll.u16 q1, d6, #8 /* [head] */
161+ vuzp.8 d16, d17 /* [tail] second unzip pass */
162+ vuzp.8 d18, d19 /* [tail] */
163+ vmlsl.u16 q0, d4, d30 /* [head] q0 -= d4 * d30 */
164+ vmlsl.u16 q1, d6, d31 /* [head] */
165+ vmull.u8 q10, d16, d22 /* [tail] multiply the d16/d17 bytes by the d22 bytes */
166+ vmull.u8 q11, d17, d22 /* [tail] */
167+ vmlal.u16 q0, d5, d30 /* [head] q0 += d5 * d30 */
168+ vmlal.u16 q1, d7, d31 /* [head] last read of d30/d31 before q15 is reused as scratch below */
169+ vrsra.u16 q10, q10, #8 /* [tail] q10 += rounded (q10 shifted right by 8) */
170+ vrsra.u16 q11, q11, #8 /* [tail] */
171+ vshrn.u32 d0, q0, #16 /* [head] pixel 1 result in d0 */
172+ vshrn.u32 d1, q1, #16 /* [head] pixel 2 result in d1 */
173+ vrshrn.u16 d16, q10, #8 /* [tail] rounded narrowing shift by 8 */
174+ vrshrn.u16 d17, q11, #8 /* [tail] */
175+ vld1.32 {d2}, [TMP3], STRIDE /* [head] third pixel rows into d2, d3 */
176+ vdup.32 d22, d17[1] /* [tail] copy the upper 32 bits of d17 into both lanes of d22 */
177+ vld1.32 {d3}, [TMP3] /* [head] */
178+ vmvn.8 d22, d22 /* [tail] bitwise NOT of every byte */
179+ pld [TMP4, PF_OFFS] /* [head] prefetch hint before TMP4 is advanced */
180+ vld1.32 {d4}, [TMP4], STRIDE /* [head] fourth pixel rows into d4, d5 */
181+ vmull.u8 q10, d18, d22 /* [tail] multiply the bytes loaded from OUT by the inverted bytes */
182+ vmull.u8 q11, d19, d22 /* [tail] */
183+ vld1.32 {d5}, [TMP4] /* [head] */
184+ pld [TMP4, PF_OFFS] /* [head] same hint after TMP4 has moved on by STRIDE */
185+ vmull.u8 q3, d2, d28 /* [head] row blend for pixel 3, left in q3 for the next tail */
186+ vrshr.u16 q9, q10, #8 /* [tail] */
187+ vrshr.u16 q15, q11, #8 /* [tail] q15 is scratch here (the standalone tail uses q0, which holds head results in this macro) */
188+ vmlal.u8 q3, d3, d29 /* [head] */
189+ vmull.u8 q1, d4, d28 /* [head] row blend for pixel 4, left in q1 for the next tail */
190+ vraddhn.u16 d18, q9, q10 /* [tail] rounded high half of (q9 + q10) */
191+ vraddhn.u16 d19, q15, q11 /* [tail] last use of the scratch value in q15 */
192+ vmlal.u8 q1, d5, d29 /* [head] */
193+ vshr.u16 q15, q12, #8 /* [head] q15 = q12 shifted right by 8; restores d30/d31 for the next tail */
194+ vqadd.u8 q9, q8, q9 /* [tail] saturating byte add; last read of q8 before d16 is overwritten below */
195+ vld1.32 {d22[0]}, [MASK]! /* [head] next 32 bits of mask; NOTE(review): unlike the standalone head there is no pld [MASK, ...] here */
196+ vuzp.8 d18, d19 /* [tail] */
197+ vadd.u16 q12, q12, q13 /* [head] q12 += q13 */
198+ vuzp.8 d18, d19 /* [tail] */
199+ vmovn.u16 d16, q0 /* [head] narrow the u16 lanes of q0 to bytes in d16 */
200+ vst1.32 {d18, d19}, [OUT, :128]! /* [tail] store 16 bytes to OUT; OUT is post-incremented */
201 .endm
202
203 /* add_8888_8888 */
204--
2051.6.6.1
206