@@ -120,16 +120,28 @@ extern jlib_decl void _rev(size32_t len, void * ptr);
120
120
#endif
121
121
122
122
inline void _cpyrev2(void * _tgt, const void * _src) {
    // Copy 2 bytes from _src to _tgt with the byte order reversed.
    // Byte-wise access is well defined for unaligned input and avoids the
    // strict-aliasing violation of loading through a casted unsigned short *.
    // NOTE: Modern compilers recognise this pattern and emit a single
    // byte-swap (e.g. rol/bswap) instruction at -O2, so no speed is lost.
    unsigned char * tgt = (unsigned char *)_tgt;
    const unsigned char * src = (const unsigned char *)_src;
    tgt[1] = src[0];
    tgt[0] = src[1];
}
126
131
inline void _cpyrev3(void * _tgt, const void * _src) {
    // Copy 3 bytes from _src to _tgt, reversing their order.
    char * out = (char *)_tgt;
    const char * in = (const char *)_src;
    out[0] = in[2];
    out[1] = in[1];
    out[2] = in[0];
}
130
inline void _cpyrev4(void * _tgt, const void * _src) {
    // Copy 4 bytes from _src to _tgt with the byte order reversed.
    // Byte-wise access is well defined for unaligned input and avoids the
    // strict-aliasing violation of loading through a casted unsigned *
    // (the punned version was documented as "technically undefined").
    // NOTE: Modern compilers recognise this reversed-copy pattern and emit
    // a single byte-swap instruction at -O2, so no speed is lost.
    unsigned char * tgt = (unsigned char *)_tgt;
    const unsigned char * src = (const unsigned char *)_src;
    tgt[3] = src[0];
    tgt[2] = src[1];
    tgt[1] = src[2];
    tgt[0] = src[3];
}
134
146
inline void _cpyrev5 (void * _tgt, const void * _src) {
135
147
char * tgt = (char *)_tgt; const char * src = (const char *)_src;
@@ -147,9 +159,19 @@ inline void _cpyrev7(void * _tgt, const void * _src) {
147
159
tgt[3 ] = src[3 ]; tgt[2 ]=src[4 ]; tgt[1 ]=src[5 ]; tgt[0 ]=src[6 ];
148
160
}
149
161
inline void _cpyrev8(void * _tgt, const void * _src) {
    // Copy 8 bytes from _src to _tgt with the byte order reversed.
    // Byte-wise access is well defined for unaligned input and avoids the
    // strict-aliasing violation of loading through a casted 64-bit pointer
    // (the punned version was documented as "technically undefined").
    // NOTE: Modern compilers recognise this reversed-copy pattern and emit
    // a single 64-bit byte-swap instruction at -O2, so no speed is lost.
    unsigned char * tgt = (unsigned char *)_tgt;
    const unsigned char * src = (const unsigned char *)_src;
    tgt[7] = src[0];
    tgt[6] = src[1];
    tgt[5] = src[2];
    tgt[4] = src[3];
    tgt[3] = src[4];
    tgt[2] = src[5];
    tgt[1] = src[6];
    tgt[0] = src[7];
}
154
176
inline void _cpyrevn (void * _tgt, const void * _src, unsigned len) {
155
177
char * tgt = (char *)_tgt; const char * src = (const char *)_src+len;
@@ -158,6 +180,25 @@ inline void _cpyrevn(void * _tgt, const void * _src, unsigned len) {
158
180
}
159
181
}
160
182
183
+ // Define a template class to allow the common byte reversal operations to be optimized
184
+ template <unsigned LEN>
185
+ inline void doCopyRev (void * tgt, const void * src) {
186
+ _cpyrevn (tgt, src, LEN);
187
+ }
188
+
189
+ template <>
190
+ inline void doCopyRev<2 >(void * tgt, const void * src) {
191
+ _cpyrev2 (tgt, src);
192
+ }
193
+ template <>
194
+ inline void doCopyRev<4 >(void * tgt, const void * src) {
195
+ _cpyrev4 (tgt, src);
196
+ }
197
+ template <>
198
+ inline void doCopyRev<8 >(void * tgt, const void * src) {
199
+ _cpyrev8 (tgt, src);
200
+ }
201
+
161
202
#if __BYTE_ORDER == __LITTLE_ENDIAN
162
203
#define _WINCPYREV (x, y, len ) _cpyrevn(x, y, len)
163
204
#define _WINCPYREV2 (x, y ) _cpyrev2(x, y)
0 commit comments