Diffstat (limited to 'recipes-graphics/xorg-lib/pixman-0.21.2/0007-ARM-better-NEON-instructions-scheduling-for-over_n_8.patch')

 -rw-r--r--  recipes-graphics/xorg-lib/pixman-0.21.2/0007-ARM-better-NEON-instructions-scheduling-for-over_n_8.patch | 170
 1 file changed, 170 insertions(+), 0 deletions(-)
diff --git a/recipes-graphics/xorg-lib/pixman-0.21.2/0007-ARM-better-NEON-instructions-scheduling-for-over_n_8.patch b/recipes-graphics/xorg-lib/pixman-0.21.2/0007-ARM-better-NEON-instructions-scheduling-for-over_n_8.patch
new file mode 100644
index 0000000000..acdfdf873d
--- /dev/null
+++ b/recipes-graphics/xorg-lib/pixman-0.21.2/0007-ARM-better-NEON-instructions-scheduling-for-over_n_8.patch
@@ -0,0 +1,170 @@
From e6814837a6ccd3e4db329e0131eaf2055d2c864b Mon Sep 17 00:00:00 2001
From: Siarhei Siamashka <siarhei.siamashka@nokia.com>
Date: Fri, 26 Nov 2010 17:06:58 +0200
Subject: [PATCH 07/24] ARM: better NEON instructions scheduling for over_n_8_0565

Code rearranged to get better instruction scheduling for ARM Cortex-A8/A9.
Now it is ~30% faster for pixel data in the L1 cache and makes better use
of memory bandwidth when running at lower clock frequencies (e.g. 500MHz).
Also register d24 (pixels from the mask image) is no longer clobbered by
the supplementary macros, which allows them to be reused for the other
variants of compositing operations later.

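The vmull.u8 / vrshr.u16 / vraddhn.u16 triplets in the code below are the
standard NEON idiom for multiplying two 8-bit channels with exact rounded
division by 255 (the "IN" operation). A minimal C model of what each triplet
computes per byte; the helper name mul_div_255 is illustrative, not from
pixman:

    #include <assert.h>
    #include <stdint.h>

    static uint8_t mul_div_255(uint8_t p, uint8_t m)
    {
        uint16_t x = (uint16_t)(p * m);       /* vmull.u8: 8x8 -> 16 bit   */
        uint16_t t = (x + 128) >> 8;          /* vrshr.u16 #8: round-shift */
        return (uint8_t)((x + t + 128) >> 8); /* vraddhn.u16: rounding
                                                 add-and-narrow, high half */
    }

    int main(void)
    {
        /* Exhaustive check: matches exact rounded division by 255. */
        for (int p = 0; p < 256; p++)
            for (int m = 0; m < 256; m++)
                assert(mul_div_255(p, m) == (p * m + 127) / 255);
        return 0;
    }
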
Benchmark from ARM Cortex-A8 @500MHz:

== before ==

 over_n_8_0565 = L1: 63.90  L2: 63.15  M: 60.97 ( 73.53%)
                 HT: 28.89  VT: 24.14  R: 21.33  RT: 6.78 ( 67Kops/s)

== after ==

 over_n_8_0565 = L1: 82.64  L2: 75.19  M: 71.52 ( 84.14%)
                 HT: 30.49  VT: 25.56  R: 22.36  RT: 6.89 ( 68Kops/s)
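
The "convert DST_R data to 32-bpp" steps in the head macro below expand each
r5g6b5 destination pixel to 8-bit channels, replicating the top bits into the
low bits rather than zero-filling. A per-pixel C sketch of the same
conversion; the function name is illustrative, not pixman's:

    #include <stdint.h>
    #include <stdio.h>

    /* Expand one r5g6b5 pixel to 8-bit R, G, B with bit replication. */
    static void rgb565_to_888(uint16_t p, uint8_t *r, uint8_t *g, uint8_t *b)
    {
        uint8_t r5 = (p >> 11) & 0x1f, g6 = (p >> 5) & 0x3f, b5 = p & 0x1f;
        *r = (uint8_t)((r5 << 3) | (r5 >> 2)); /* vshrn #8, then vsri #5 */
        *g = (uint8_t)((g6 << 2) | (g6 >> 4)); /* vshrn #3, then vsri #6 */
        *b = (uint8_t)((b5 << 3) | (b5 >> 2)); /* vsli #5, then vshrn #2 */
    }

    int main(void)
    {
        uint8_t r, g, b;
        rgb565_to_888(0xffff, &r, &g, &b); /* white stays 255/255/255 */
        printf("%u %u %u\n", r, g, b);
        return 0;
    }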
---
 pixman/pixman-arm-neon-asm.S | 120 +++++++++++++++++++++++++++---------------
 1 files changed, 77 insertions(+), 43 deletions(-)

diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S
index 155a236..ffffc1c 100644
--- a/pixman/pixman-arm-neon-asm.S
+++ b/pixman/pixman-arm-neon-asm.S
@@ -792,58 +792,92 @@ generate_composite_function \
 /******************************************************************************/

 .macro pixman_composite_over_n_8_0565_process_pixblock_head
-    /* in */
-    vmull.u8    q0, d24, d8
-    vmull.u8    q1, d24, d9
-    vmull.u8    q6, d24, d10
-    vmull.u8    q7, d24, d11
-    vrshr.u16   q10, q0, #8
-    vrshr.u16   q11, q1, #8
-    vrshr.u16   q12, q6, #8
-    vrshr.u16   q13, q7, #8
-    vraddhn.u16 d0, q0, q10
-    vraddhn.u16 d1, q1, q11
-    vraddhn.u16 d2, q6, q12
-    vraddhn.u16 d3, q7, q13
-
-    vshrn.u16   d6, q2, #8
-    vshrn.u16   d7, q2, #3
-    vsli.u16    q2, q2, #5
-    vsri.u8     d6, d6, #5
-    vmvn.8      d3, d3
-    vsri.u8     d7, d7, #6
-    vshrn.u16   d30, q2, #2
-    /* now do alpha blending */
-    vmull.u8    q10, d3, d6
-    vmull.u8    q11, d3, d7
-    vmull.u8    q12, d3, d30
-    vrshr.u16   q13, q10, #8
-    vrshr.u16   q3, q11, #8
-    vrshr.u16   q15, q12, #8
-    vraddhn.u16 d20, q10, q13
-    vraddhn.u16 d23, q11, q3
-    vraddhn.u16 d22, q12, q15
+    vmull.u8    q0,  d24, d8    /* IN for SRC pixels (part1) */
+    vmull.u8    q1,  d24, d9
+    vmull.u8    q6,  d24, d10
+    vmull.u8    q7,  d24, d11
+    vshrn.u16   d6,  q2, #8     /* convert DST_R data to 32-bpp (part1) */
+    vshrn.u16   d7,  q2, #3
+    vsli.u16    q2,  q2, #5
+    vrshr.u16   q8,  q0, #8     /* IN for SRC pixels (part2) */
+    vrshr.u16   q9,  q1, #8
+    vrshr.u16   q10, q6, #8
+    vrshr.u16   q11, q7, #8
+    vraddhn.u16 d0,  q0, q8
+    vraddhn.u16 d1,  q1, q9
+    vraddhn.u16 d2,  q6, q10
+    vraddhn.u16 d3,  q7, q11
+    vsri.u8     d6,  d6, #5     /* convert DST_R data to 32-bpp (part2) */
+    vsri.u8     d7,  d7, #6
+    vmvn.8      d3,  d3
+    vshrn.u16   d30, q2, #2
+    vmull.u8    q8,  d3, d6     /* now do alpha blending */
+    vmull.u8    q9,  d3, d7
+    vmull.u8    q10, d3, d30
 .endm

 .macro pixman_composite_over_n_8_0565_process_pixblock_tail
-    vqadd.u8    d16, d2, d20
-    vqadd.u8    q9, q0, q11
-    /* convert to r5g6b5 */
-    vshll.u8    q14, d16, #8
-    vshll.u8    q8, d19, #8
-    vshll.u8    q9, d18, #8
-    vsri.u16    q14, q8, #5
-    vsri.u16    q14, q9, #11
+    /* 3 cycle bubble (after vmull.u8) */
+    vrshr.u16   q13, q8, #8
+    vrshr.u16   q11, q9, #8
+    vrshr.u16   q15, q10, #8
+    vraddhn.u16 d16, q8, q13
+    vraddhn.u16 d27, q9, q11
+    vraddhn.u16 d26, q10, q15
+    vqadd.u8    d16, d2, d16
+    /* 1 cycle bubble */
+    vqadd.u8    q9, q0, q13
+    vshll.u8    q14, d16, #8    /* convert to 16bpp */
+    vshll.u8    q8, d19, #8
+    vshll.u8    q9, d18, #8
+    vsri.u16    q14, q8, #5
+    /* 1 cycle bubble */
+    vsri.u16    q14, q9, #11
 .endm

-/* TODO: expand macros and do better instructions scheduling */
 .macro pixman_composite_over_n_8_0565_process_pixblock_tail_head
-    pixman_composite_over_n_8_0565_process_pixblock_tail
-    vst1.16     {d28, d29}, [DST_W, :128]!
     vld1.16     {d4, d5}, [DST_R, :128]!
+    vshrn.u16   d6, q2, #8
     fetch_mask_pixblock
+    vshrn.u16   d7, q2, #3
+    fetch_src_pixblock
+    vmull.u8    q6, d24, d10
+    vrshr.u16   q13, q8, #8
+    vrshr.u16   q11, q9, #8
+    vrshr.u16   q15, q10, #8
+    vraddhn.u16 d16, q8, q13
+    vraddhn.u16 d27, q9, q11
+    vraddhn.u16 d26, q10, q15
+    vqadd.u8    d16, d2, d16
+    vmull.u8    q1, d24, d9
+    vqadd.u8    q9, q0, q13
+    vshll.u8    q14, d16, #8
+    vmull.u8    q0, d24, d8
+    vshll.u8    q8, d19, #8
+    vshll.u8    q9, d18, #8
+    vsri.u16    q14, q8, #5
+    vmull.u8    q7, d24, d11
+    vsri.u16    q14, q9, #11
+
     cache_preload 8, 8
-    pixman_composite_over_n_8_0565_process_pixblock_head
+
+    vsli.u16    q2, q2, #5
+    vrshr.u16   q8, q0, #8
+    vrshr.u16   q9, q1, #8
+    vrshr.u16   q10, q6, #8
+    vrshr.u16   q11, q7, #8
+    vraddhn.u16 d0, q0, q8
+    vraddhn.u16 d1, q1, q9
+    vraddhn.u16 d2, q6, q10
+    vraddhn.u16 d3, q7, q11
+    vsri.u8     d6, d6, #5
+    vsri.u8     d7, d7, #6
+    vmvn.8      d3, d3
+    vshrn.u16   d30, q2, #2
+    vst1.16     {d28, d29}, [DST_W, :128]!
+    vmull.u8    q8, d3, d6
+    vmull.u8    q9, d3, d7
+    vmull.u8    q10, d3, d30
 .endm

 /*
-- 
1.6.6.1

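For context on the head / tail / tail_head split above: pixman's
generate_composite_function template software-pipelines the 8-pixel block
loop, so the tail_head macro retires one block while already issuing loads
and multiplies for the next. A loose C model of that control flow, assuming
nothing beyond what the macro names suggest (the functions here are
illustrative stand-ins, not pixman code):

    #include <stdio.h>

    static void head(int i) { printf("start block %d\n", i); }
    static void tail(int i) { printf("finish block %d\n", i); }

    /* In the asm the two halves are interleaved instruction-by-instruction,
     * so stores for block i-1 overlap the multiply latency of block i. */
    static void tail_head(int i) { tail(i - 1); head(i); }

    static void process_scanline(int nblocks)
    {
        head(0);
        for (int i = 1; i < nblocks; i++)
            tail_head(i);
        tail(nblocks - 1);
    }

    int main(void) { process_scanline(4); return 0; }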