1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
|
--- src/hermes/mmxp2_32.asm
+++ src/hermes/mmxp2_32.asm
@@ -27,22 +27,37 @@ GLOBAL _ConvertMMXpII32_16BGR565
GLOBAL _ConvertMMXpII32_16RGB555
GLOBAL _ConvertMMXpII32_16BGR555
-SECTION .data
-
-ALIGN 8
-
-;; Constants for conversion routines
-
-mmx32_rgb888_mask dd 00ffffffh,00ffffffh
-
-mmx32_rgb565_b dd 000000f8h, 000000f8h
-mmx32_rgb565_g dd 0000fc00h, 0000fc00h
-mmx32_rgb565_r dd 00f80000h, 00f80000h
-
-mmx32_rgb555_rb dd 00f800f8h,00f800f8h
-mmx32_rgb555_g dd 0000f800h,0000f800h
-mmx32_rgb555_mul dd 20000008h,20000008h
-mmx32_bgr555_mul dd 00082000h,00082000h
+
+;; Macros for conversion routines
+
+%macro _push_immq_mask 1
+ push dword %1
+ push dword %1
+%endmacro
+
+%macro load_immq 2
+ _push_immq_mask %2
+ movq %1, [esp]
+%endmacro
+
+%macro pand_immq 2
+ _push_immq_mask %2
+ pand %1, [esp]
+%endmacro
+
+%define CLEANUP_IMMQ_LOADS(num) \
+ add esp, byte 8 * num
+
+%define mmx32_rgb888_mask 00ffffffh
+
+%define mmx32_rgb565_b 000000f8h
+%define mmx32_rgb565_g 0000fc00h
+%define mmx32_rgb565_r 00f80000h
+
+%define mmx32_rgb555_rb 00f800f8h
+%define mmx32_rgb555_g 0000f800h
+%define mmx32_rgb555_mul 20000008h
+%define mmx32_bgr555_mul 00082000h
@@ -53,7 +66,8 @@ SECTION .text
_ConvertMMXpII32_24RGB888:
; set up mm6 as the mask, mm7 as zero
- movq mm6, qword [mmx32_rgb888_mask]
+ load_immq mm6, mmx32_rgb888_mask
+ CLEANUP_IMMQ_LOADS(1)
pxor mm7, mm7
mov edx, ecx ; save ecx
@@ -108,9 +122,10 @@ _ConvertMMXpII32_24RGB888:
_ConvertMMXpII32_16RGB565:
; set up masks
- movq mm5, [mmx32_rgb565_b]
- movq mm6, [mmx32_rgb565_g]
- movq mm7, [mmx32_rgb565_r]
+ load_immq mm5, mmx32_rgb565_b
+ load_immq mm6, mmx32_rgb565_g
+ load_immq mm7, mmx32_rgb565_r
+ CLEANUP_IMMQ_LOADS(3)
mov edx, ecx
shr ecx, 2
@@ -176,9 +191,10 @@ _ConvertMMXpII32_16RGB565:
_ConvertMMXpII32_16BGR565:
- movq mm5, [mmx32_rgb565_r]
- movq mm6, [mmx32_rgb565_g]
- movq mm7, [mmx32_rgb565_b]
+ load_immq mm5, mmx32_rgb565_r
+ load_immq mm6, mmx32_rgb565_g
+ load_immq mm7, mmx32_rgb565_b
+ CLEANUP_IMMQ_LOADS(3)
mov edx, ecx
shr ecx, 2
@@ -253,7 +269,7 @@ _ConvertMMXpII32_16BGR555:
; except it uses a different multiplier for the pmaddwd
; instruction. cool huh.
- movq mm7, qword [mmx32_bgr555_mul]
+ load_immq mm7, mmx32_bgr555_mul
jmp _convert_bgr555_cheat
; This is the same as the Intel version.. they obviously went to
@@ -263,9 +279,10 @@ _ConvertMMXpII32_16BGR555:
; (I think) a more accurate name..
_ConvertMMXpII32_16RGB555:
- movq mm7,qword [mmx32_rgb555_mul]
+ load_immq mm7, mmx32_rgb555_mul
_convert_bgr555_cheat:
- movq mm6,qword [mmx32_rgb555_g]
+ load_immq mm6, mmx32_rgb555_g
+ CLEANUP_IMMQ_LOADS(2)
mov edx,ecx ; Save ecx
@@ -280,12 +297,14 @@ _convert_bgr555_cheat:
movq mm0,[esi]
movq mm3,mm2
- pand mm3,qword [mmx32_rgb555_rb]
+ pand_immq mm3, mmx32_rgb555_rb
movq mm1,mm0
- pand mm1,qword [mmx32_rgb555_rb]
+ pand_immq mm1, mmx32_rgb555_rb
pmaddwd mm3,mm7
+ CLEANUP_IMMQ_LOADS(2)
+
pmaddwd mm1,mm7
pand mm2,mm6
@@ -302,13 +321,13 @@ _convert_bgr555_cheat:
movq mm0,mm4
psrld mm1,6
- pand mm0,qword [mmx32_rgb555_rb]
+ pand_immq mm0, mmx32_rgb555_rb
packssdw mm1,mm3
movq mm3,mm5
pmaddwd mm0,mm7
- pand mm3,qword [mmx32_rgb555_rb]
+ pand_immq mm3, mmx32_rgb555_rb
pand mm4,mm6
movq [edi],mm1
@@ -329,12 +348,14 @@ _convert_bgr555_cheat:
movq mm3,mm2
movq mm1,mm0
- pand mm3,qword [mmx32_rgb555_rb]
+ pand_immq mm3, mmx32_rgb555_rb
packssdw mm5,mm4
- pand mm1,qword [mmx32_rgb555_rb]
+ pand_immq mm1, mmx32_rgb555_rb
pand mm2,mm6
+ CLEANUP_IMMQ_LOADS(4)
+
movq [edi+8],mm5
pmaddwd mm3,mm7
|