diff options
Diffstat (limited to 'recipes-graphics/xorg-lib/pixman-0.21.2/0007-ARM-better-NEON-instructions-scheduling-for-over_n_8.patch')
-rw-r--r-- | recipes-graphics/xorg-lib/pixman-0.21.2/0007-ARM-better-NEON-instructions-scheduling-for-over_n_8.patch | 170 |
1 files changed, 170 insertions, 0 deletions
diff --git a/recipes-graphics/xorg-lib/pixman-0.21.2/0007-ARM-better-NEON-instructions-scheduling-for-over_n_8.patch b/recipes-graphics/xorg-lib/pixman-0.21.2/0007-ARM-better-NEON-instructions-scheduling-for-over_n_8.patch new file mode 100644 index 0000000000..acdfdf873d --- /dev/null +++ b/recipes-graphics/xorg-lib/pixman-0.21.2/0007-ARM-better-NEON-instructions-scheduling-for-over_n_8.patch | |||
@@ -0,0 +1,170 @@ | |||
1 | From e6814837a6ccd3e4db329e0131eaf2055d2c864b Mon Sep 17 00:00:00 2001 | ||
2 | From: Siarhei Siamashka <siarhei.siamashka@nokia.com> | ||
3 | Date: Fri, 26 Nov 2010 17:06:58 +0200 | ||
4 | Subject: [PATCH 07/24] ARM: better NEON instructions scheduling for over_n_8_0565 | ||
5 | |||
6 | Code rearranged to get better instruction scheduling for ARM Cortex-A8/A9. | ||
7 | Now it is ~30% faster for the pixel data in L1 cache and makes better use | ||
8 | of memory bandwidth when running at lower clock frequencies (e.g. 500MHz). | ||
9 | Also register d24 (pixels from the mask image) is no longer clobbered by | ||
10 | the supplementary macros, which allows them to be reused for other variants | ||
11 | of compositing operations later. | ||
12 | |||
13 | Benchmark from ARM Cortex-A8 @500MHz: | ||
14 | |||
15 | == before == | ||
16 | |||
17 | over_n_8_0565 = L1: 63.90 L2: 63.15 M: 60.97 ( 73.53%) | ||
18 | HT: 28.89 VT: 24.14 R: 21.33 RT: 6.78 ( 67Kops/s) | ||
19 | |||
20 | == after == | ||
21 | |||
22 | over_n_8_0565 = L1: 82.64 L2: 75.19 M: 71.52 ( 84.14%) | ||
23 | HT: 30.49 VT: 25.56 R: 22.36 RT: 6.89 ( 68Kops/s) | ||
24 | --- | ||
25 | pixman/pixman-arm-neon-asm.S | 120 +++++++++++++++++++++++++++--------------- | ||
26 | 1 files changed, 77 insertions(+), 43 deletions(-) | ||
27 | |||
28 | diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S | ||
29 | index 155a236..ffffc1c 100644 | ||
30 | --- a/pixman/pixman-arm-neon-asm.S | ||
31 | +++ b/pixman/pixman-arm-neon-asm.S | ||
32 | @@ -792,58 +792,92 @@ generate_composite_function \ | ||
33 | /******************************************************************************/ | ||
34 | |||
35 | .macro pixman_composite_over_n_8_0565_process_pixblock_head | ||
36 | - /* in */ | ||
37 | - vmull.u8 q0, d24, d8 | ||
38 | - vmull.u8 q1, d24, d9 | ||
39 | - vmull.u8 q6, d24, d10 | ||
40 | - vmull.u8 q7, d24, d11 | ||
41 | - vrshr.u16 q10, q0, #8 | ||
42 | - vrshr.u16 q11, q1, #8 | ||
43 | - vrshr.u16 q12, q6, #8 | ||
44 | - vrshr.u16 q13, q7, #8 | ||
45 | - vraddhn.u16 d0, q0, q10 | ||
46 | - vraddhn.u16 d1, q1, q11 | ||
47 | - vraddhn.u16 d2, q6, q12 | ||
48 | - vraddhn.u16 d3, q7, q13 | ||
49 | - | ||
50 | - vshrn.u16 d6, q2, #8 | ||
51 | - vshrn.u16 d7, q2, #3 | ||
52 | - vsli.u16 q2, q2, #5 | ||
53 | - vsri.u8 d6, d6, #5 | ||
54 | - vmvn.8 d3, d3 | ||
55 | - vsri.u8 d7, d7, #6 | ||
56 | - vshrn.u16 d30, q2, #2 | ||
57 | - /* now do alpha blending */ | ||
58 | - vmull.u8 q10, d3, d6 | ||
59 | - vmull.u8 q11, d3, d7 | ||
60 | - vmull.u8 q12, d3, d30 | ||
61 | - vrshr.u16 q13, q10, #8 | ||
62 | - vrshr.u16 q3, q11, #8 | ||
63 | - vrshr.u16 q15, q12, #8 | ||
64 | - vraddhn.u16 d20, q10, q13 | ||
65 | - vraddhn.u16 d23, q11, q3 | ||
66 | - vraddhn.u16 d22, q12, q15 | ||
67 | + vmull.u8 q0, d24, d8 /* IN for SRC pixels (part1) */ | ||
68 | + vmull.u8 q1, d24, d9 | ||
69 | + vmull.u8 q6, d24, d10 | ||
70 | + vmull.u8 q7, d24, d11 | ||
71 | + vshrn.u16 d6, q2, #8 /* convert DST_R data to 32-bpp (part1) */ | ||
72 | + vshrn.u16 d7, q2, #3 | ||
73 | + vsli.u16 q2, q2, #5 | ||
74 | + vrshr.u16 q8, q0, #8 /* IN for SRC pixels (part2) */ | ||
75 | + vrshr.u16 q9, q1, #8 | ||
76 | + vrshr.u16 q10, q6, #8 | ||
77 | + vrshr.u16 q11, q7, #8 | ||
78 | + vraddhn.u16 d0, q0, q8 | ||
79 | + vraddhn.u16 d1, q1, q9 | ||
80 | + vraddhn.u16 d2, q6, q10 | ||
81 | + vraddhn.u16 d3, q7, q11 | ||
82 | + vsri.u8 d6, d6, #5 /* convert DST_R data to 32-bpp (part2) */ | ||
83 | + vsri.u8 d7, d7, #6 | ||
84 | + vmvn.8 d3, d3 | ||
85 | + vshrn.u16 d30, q2, #2 | ||
86 | + vmull.u8 q8, d3, d6 /* now do alpha blending */ | ||
87 | + vmull.u8 q9, d3, d7 | ||
88 | + vmull.u8 q10, d3, d30 | ||
89 | .endm | ||
90 | |||
91 | .macro pixman_composite_over_n_8_0565_process_pixblock_tail | ||
92 | - vqadd.u8 d16, d2, d20 | ||
93 | - vqadd.u8 q9, q0, q11 | ||
94 | - /* convert to r5g6b5 */ | ||
95 | - vshll.u8 q14, d16, #8 | ||
96 | - vshll.u8 q8, d19, #8 | ||
97 | - vshll.u8 q9, d18, #8 | ||
98 | - vsri.u16 q14, q8, #5 | ||
99 | - vsri.u16 q14, q9, #11 | ||
100 | + /* 3 cycle bubble (after vmull.u8) */ | ||
101 | + vrshr.u16 q13, q8, #8 | ||
102 | + vrshr.u16 q11, q9, #8 | ||
103 | + vrshr.u16 q15, q10, #8 | ||
104 | + vraddhn.u16 d16, q8, q13 | ||
105 | + vraddhn.u16 d27, q9, q11 | ||
106 | + vraddhn.u16 d26, q10, q15 | ||
107 | + vqadd.u8 d16, d2, d16 | ||
108 | + /* 1 cycle bubble */ | ||
109 | + vqadd.u8 q9, q0, q13 | ||
110 | + vshll.u8 q14, d16, #8 /* convert to 16bpp */ | ||
111 | + vshll.u8 q8, d19, #8 | ||
112 | + vshll.u8 q9, d18, #8 | ||
113 | + vsri.u16 q14, q8, #5 | ||
114 | + /* 1 cycle bubble */ | ||
115 | + vsri.u16 q14, q9, #11 | ||
116 | .endm | ||
117 | |||
118 | -/* TODO: expand macros and do better instructions scheduling */ | ||
119 | .macro pixman_composite_over_n_8_0565_process_pixblock_tail_head | ||
120 | - pixman_composite_over_n_8_0565_process_pixblock_tail | ||
121 | - vst1.16 {d28, d29}, [DST_W, :128]! | ||
122 | vld1.16 {d4, d5}, [DST_R, :128]! | ||
123 | + vshrn.u16 d6, q2, #8 | ||
124 | fetch_mask_pixblock | ||
125 | + vshrn.u16 d7, q2, #3 | ||
126 | + fetch_src_pixblock | ||
127 | + vmull.u8 q6, d24, d10 | ||
128 | + vrshr.u16 q13, q8, #8 | ||
129 | + vrshr.u16 q11, q9, #8 | ||
130 | + vrshr.u16 q15, q10, #8 | ||
131 | + vraddhn.u16 d16, q8, q13 | ||
132 | + vraddhn.u16 d27, q9, q11 | ||
133 | + vraddhn.u16 d26, q10, q15 | ||
134 | + vqadd.u8 d16, d2, d16 | ||
135 | + vmull.u8 q1, d24, d9 | ||
136 | + vqadd.u8 q9, q0, q13 | ||
137 | + vshll.u8 q14, d16, #8 | ||
138 | + vmull.u8 q0, d24, d8 | ||
139 | + vshll.u8 q8, d19, #8 | ||
140 | + vshll.u8 q9, d18, #8 | ||
141 | + vsri.u16 q14, q8, #5 | ||
142 | + vmull.u8 q7, d24, d11 | ||
143 | + vsri.u16 q14, q9, #11 | ||
144 | + | ||
145 | cache_preload 8, 8 | ||
146 | - pixman_composite_over_n_8_0565_process_pixblock_head | ||
147 | + | ||
148 | + vsli.u16 q2, q2, #5 | ||
149 | + vrshr.u16 q8, q0, #8 | ||
150 | + vrshr.u16 q9, q1, #8 | ||
151 | + vrshr.u16 q10, q6, #8 | ||
152 | + vrshr.u16 q11, q7, #8 | ||
153 | + vraddhn.u16 d0, q0, q8 | ||
154 | + vraddhn.u16 d1, q1, q9 | ||
155 | + vraddhn.u16 d2, q6, q10 | ||
156 | + vraddhn.u16 d3, q7, q11 | ||
157 | + vsri.u8 d6, d6, #5 | ||
158 | + vsri.u8 d7, d7, #6 | ||
159 | + vmvn.8 d3, d3 | ||
160 | + vshrn.u16 d30, q2, #2 | ||
161 | + vst1.16 {d28, d29}, [DST_W, :128]! | ||
162 | + vmull.u8 q8, d3, d6 | ||
163 | + vmull.u8 q9, d3, d7 | ||
164 | + vmull.u8 q10, d3, d30 | ||
165 | .endm | ||
166 | |||
167 | /* | ||
168 | -- | ||
169 | 1.6.6.1 | ||
170 | |||