Diffstat (limited to 'meta-oe/recipes-graphics/xorg-lib/pixman-0.23.6/0007-ARM-NEON-Instruction-scheduling-of-bilinear-over_888.patch')
-rw-r--r-- | meta-oe/recipes-graphics/xorg-lib/pixman-0.23.6/0007-ARM-NEON-Instruction-scheduling-of-bilinear-over_888.patch | 206
1 file changed, 206 insertions, 0 deletions
diff --git a/meta-oe/recipes-graphics/xorg-lib/pixman-0.23.6/0007-ARM-NEON-Instruction-scheduling-of-bilinear-over_888.patch b/meta-oe/recipes-graphics/xorg-lib/pixman-0.23.6/0007-ARM-NEON-Instruction-scheduling-of-bilinear-over_888.patch
new file mode 100644
index 0000000000..e4e741f906
--- /dev/null
+++ b/meta-oe/recipes-graphics/xorg-lib/pixman-0.23.6/0007-ARM-NEON-Instruction-scheduling-of-bilinear-over_888.patch
@@ -0,0 +1,206 @@
From 94585f9a618821a5c06c3a497902579b4a08b05f Mon Sep 17 00:00:00 2001
From: Taekyun Kim <tkq.kim@samsung.com>
Date: Mon, 26 Sep 2011 19:04:53 +0900
Subject: [PATCH 7/8] ARM: NEON: Instruction scheduling of bilinear over_8888_8_8888

Instructions are reordered to eliminate pipeline stalls and get
better memory access.

Performance of before/after on cortex-a8 @ 1GHz

<< 2000 x 2000 with scale factor close to 1.x >>
before : 40.53 Mpix/s
after  : 50.76 Mpix/s
---
 pixman/pixman-arm-neon-asm-bilinear.S | 162 ++++++++++++++++++++++++++++++++-
 1 files changed, 158 insertions(+), 4 deletions(-)

diff --git a/pixman/pixman-arm-neon-asm-bilinear.S b/pixman/pixman-arm-neon-asm-bilinear.S
index 76937e0..4ab46e1 100644
--- a/pixman/pixman-arm-neon-asm-bilinear.S
+++ b/pixman/pixman-arm-neon-asm-bilinear.S
@@ -949,7 +949,7 @@ pixman_asm_function fname
     vshrn.u32 d0, q0, #16
     vshrn.u32 d1, q1, #16
     vld1.32 {d2, d3}, [OUT, :128]
-    pld [OUT, PF_OFFS]
+    pld [OUT, #(prefetch_offset * 4)]
     vshrn.u32 d4, q2, #16
     vshr.u16 q15, q12, #8
     vshrn.u32 d5, q3, #16
@@ -1061,15 +1061,169 @@ pixman_asm_function fname
 .endm

 .macro bilinear_over_8888_8_8888_process_pixblock_head
-    bilinear_over_8888_8_8888_process_four_pixels
+    mov TMP1, X, asr #16
+    add X, X, UX
+    add TMP1, TOP, TMP1, asl #2
+    vld1.32 {d0}, [TMP1], STRIDE
+    mov TMP2, X, asr #16
+    add X, X, UX
+    add TMP2, TOP, TMP2, asl #2
+    vld1.32 {d1}, [TMP1]
+    mov TMP3, X, asr #16
+    add X, X, UX
+    add TMP3, TOP, TMP3, asl #2
+    vld1.32 {d2}, [TMP2], STRIDE
+    mov TMP4, X, asr #16
+    add X, X, UX
+    add TMP4, TOP, TMP4, asl #2
+    vld1.32 {d3}, [TMP2]
+    vmull.u8 q2, d0, d28
+    vmull.u8 q3, d2, d28
+    vmlal.u8 q2, d1, d29
+    vmlal.u8 q3, d3, d29
+    vshll.u16 q0, d4, #8
+    vshll.u16 q1, d6, #8
+    vmlsl.u16 q0, d4, d30
+    vmlsl.u16 q1, d6, d31
+    vmlal.u16 q0, d5, d30
+    vmlal.u16 q1, d7, d31
+    vshrn.u32 d0, q0, #16
+    vshrn.u32 d1, q1, #16
+    vld1.32 {d2}, [TMP3], STRIDE
+    vld1.32 {d3}, [TMP3]
+    pld [TMP4, PF_OFFS]
+    vld1.32 {d4}, [TMP4], STRIDE
+    vld1.32 {d5}, [TMP4]
+    pld [TMP4, PF_OFFS]
+    vmull.u8 q3, d2, d28
+    vmlal.u8 q3, d3, d29
+    vmull.u8 q1, d4, d28
+    vmlal.u8 q1, d5, d29
+    vshr.u16 q15, q12, #8
+    vld1.32 {d22[0]}, [MASK]!
+    pld [MASK, #prefetch_offset]
+    vadd.u16 q12, q12, q13
+    vmovn.u16 d16, q0
 .endm

 .macro bilinear_over_8888_8_8888_process_pixblock_tail
+    vshll.u16 q9, d6, #8
+    vshll.u16 q10, d2, #8
+    vmlsl.u16 q9, d6, d30
+    vmlsl.u16 q10, d2, d31
+    vmlal.u16 q9, d7, d30
+    vmlal.u16 q10, d3, d31
+    vshr.u16 q15, q12, #8
+    vadd.u16 q12, q12, q13
+    vdup.32 d22, d22[0]
+    vshrn.u32 d18, q9, #16
+    vshrn.u32 d19, q10, #16
+    vmovn.u16 d17, q9
+    vld1.32 {d18, d19}, [OUT, :128]
+    pld [OUT, PF_OFFS]
+    vuzp.8 d16, d17
+    vuzp.8 d18, d19
+    vuzp.8 d16, d17
+    vuzp.8 d18, d19
+    vmull.u8 q10, d16, d22
+    vmull.u8 q11, d17, d22
+    vrsra.u16 q10, q10, #8
+    vrsra.u16 q11, q11, #8
+    vrshrn.u16 d16, q10, #8
+    vrshrn.u16 d17, q11, #8
+    vdup.32 d22, d17[1]
+    vmvn.8 d22, d22
+    vmull.u8 q10, d18, d22
+    vmull.u8 q11, d19, d22
+    vrshr.u16 q9, q10, #8
+    vrshr.u16 q0, q11, #8
+    vraddhn.u16 d18, q9, q10
+    vraddhn.u16 d19, q0, q11
+    vqadd.u8 q9, q8, q9
+    vuzp.8 d18, d19
+    vuzp.8 d18, d19
+    vst1.32 {d18, d19}, [OUT, :128]!
 .endm

 .macro bilinear_over_8888_8_8888_process_pixblock_tail_head
-    bilinear_over_8888_8_8888_process_pixblock_tail
-    bilinear_over_8888_8_8888_process_pixblock_head
+    vshll.u16 q9, d6, #8
+    mov TMP1, X, asr #16
+    add X, X, UX
+    add TMP1, TOP, TMP1, asl #2
+    vshll.u16 q10, d2, #8
+    vld1.32 {d0}, [TMP1], STRIDE
+    mov TMP2, X, asr #16
+    add X, X, UX
+    add TMP2, TOP, TMP2, asl #2
+    vmlsl.u16 q9, d6, d30
+    vmlsl.u16 q10, d2, d31
+    vld1.32 {d1}, [TMP1]
+    mov TMP3, X, asr #16
+    add X, X, UX
+    add TMP3, TOP, TMP3, asl #2
+    vmlal.u16 q9, d7, d30
+    vmlal.u16 q10, d3, d31
+    vld1.32 {d2}, [TMP2], STRIDE
+    mov TMP4, X, asr #16
+    add X, X, UX
+    add TMP4, TOP, TMP4, asl #2
+    vshr.u16 q15, q12, #8
+    vadd.u16 q12, q12, q13
+    vld1.32 {d3}, [TMP2]
+    vdup.32 d22, d22[0]
+    vshrn.u32 d18, q9, #16
+    vshrn.u32 d19, q10, #16
+    vmull.u8 q2, d0, d28
+    vmull.u8 q3, d2, d28
+    vmovn.u16 d17, q9
+    vld1.32 {d18, d19}, [OUT, :128]
+    pld [OUT, #(prefetch_offset * 4)]
+    vmlal.u8 q2, d1, d29
+    vmlal.u8 q3, d3, d29
+    vuzp.8 d16, d17
+    vuzp.8 d18, d19
+    vshll.u16 q0, d4, #8
+    vshll.u16 q1, d6, #8
+    vuzp.8 d16, d17
+    vuzp.8 d18, d19
+    vmlsl.u16 q0, d4, d30
+    vmlsl.u16 q1, d6, d31
+    vmull.u8 q10, d16, d22
+    vmull.u8 q11, d17, d22
+    vmlal.u16 q0, d5, d30
+    vmlal.u16 q1, d7, d31
+    vrsra.u16 q10, q10, #8
+    vrsra.u16 q11, q11, #8
+    vshrn.u32 d0, q0, #16
+    vshrn.u32 d1, q1, #16
+    vrshrn.u16 d16, q10, #8
+    vrshrn.u16 d17, q11, #8
+    vld1.32 {d2}, [TMP3], STRIDE
+    vdup.32 d22, d17[1]
+    vld1.32 {d3}, [TMP3]
+    vmvn.8 d22, d22
+    pld [TMP4, PF_OFFS]
+    vld1.32 {d4}, [TMP4], STRIDE
+    vmull.u8 q10, d18, d22
+    vmull.u8 q11, d19, d22
+    vld1.32 {d5}, [TMP4]
+    pld [TMP4, PF_OFFS]
+    vmull.u8 q3, d2, d28
+    vrshr.u16 q9, q10, #8
+    vrshr.u16 q15, q11, #8
+    vmlal.u8 q3, d3, d29
+    vmull.u8 q1, d4, d28
+    vraddhn.u16 d18, q9, q10
+    vraddhn.u16 d19, q15, q11
+    vmlal.u8 q1, d5, d29
+    vshr.u16 q15, q12, #8
+    vqadd.u8 q9, q8, q9
+    vld1.32 {d22[0]}, [MASK]!
+    vuzp.8 d18, d19
+    vadd.u16 q12, q12, q13
+    vuzp.8 d18, d19
+    vmovn.u16 d16, q0
+    vst1.32 {d18, d19}, [OUT, :128]!
 .endm

 /* add_8888_8888 */
--
1.6.6.1

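For context on the scheduling change above: pixman's NEON framework builds each fast path from three macros: a "head" that starts a block of pixels (address arithmetic, loads, and the first long-latency multiplies), a "tail" that finishes and stores the block, and a "tail_head" used in the steady-state loop. Before this patch, the tail_head for this operation was just the generic tail followed by head (the two deleted lines); the patch replaces it with a fused version that issues the next block's loads while the current block's arithmetic is still in flight. Below is a minimal sketch of that pattern, not code from the patch: the macro names are hypothetical and r0/r1 (destination/source pointers) and d28/d29 (interpolation weights) are placeholder registers.

/* Sketch only: hypothetical macros illustrating the head/tail/tail_head
 * software-pipelining scheme used by the patch. */
.macro sketch_process_head
    vld1.32    {d0, d1}, [r1]!      /* load a block of source pixels */
    vmull.u8   q2, d0, d28          /* start long-latency multiplies */
    vmull.u8   q3, d1, d29          /*   while the loads retire */
.endm

.macro sketch_process_tail
    vrshrn.u16 d4, q2, #8           /* narrow results back to 8 bit */
    vrshrn.u16 d5, q3, #8
    vst1.32    {d4, d5}, [r0]!      /* store the finished block */
.endm

/* Fused tail + head: block N's epilogue overlaps block N+1's loads. */
.macro sketch_process_tail_head
    vld1.32    {d0, d1}, [r1]!      /* issue block N+1's loads first */
    vrshrn.u16 d4, q2, #8           /* block N narrows while the */
    vrshrn.u16 d5, q3, #8           /*   loads complete */
    vmull.u8   q2, d0, d28          /* begin block N+1's multiplies */
    vmull.u8   q3, d1, d29
    vst1.32    {d4, d5}, [r0]!      /* retire block N */
.endm

The framework expands these into one head, a loop of tail_head iterations, and one final tail, so in the steady state each NEON load is issued well before its consumers and the in-order Cortex-A8 pipeline stalls far less often, which is where the 40.53 -> 50.76 Mpix/s gain comes from.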