@@ -120,16 +120,28 @@ extern jlib_decl void _rev(size32_t len, void * ptr);
120120#endif
121121
#include <string.h> // memcpy (idempotent include; needed for the well-defined load/store below)
inline void _cpyrev2 (void * _tgt, const void * _src) {
    // Copy 2 bytes from _src to _tgt with the byte order reversed.
    // memcpy is used for the load/store instead of dereferencing a cast
    // (unsigned short *) pointer: the cast breaks strict-aliasing and
    // alignment rules when _src is an arbitrary byte stream, which is
    // undefined behaviour.  memcpy of a compile-time-constant small size is
    // well defined and compiled to the identical single load/store.
    unsigned short value;
    memcpy(&value, _src, sizeof(value));
    // NOTE: The compiler spots this pattern and optimizes it into a byte-swap
    value = (unsigned short)(((value & 0xFF00) >> 8) |
                             ((value & 0x00FF) << 8));
    memcpy(_tgt, &value, sizeof(value));
}
inline void _cpyrev3 (void * _tgt, const void * _src) {
    // Copy 3 bytes from _src to _tgt with the byte order reversed.
    char * out = (char *)_tgt;
    const char * in = (const char *)_src;
    // Stores run from the highest index downwards — the same store order as
    // the original byte-wise assignments, so aliased buffers behave identically.
    for (unsigned i = 3; i-- != 0; )
        out[i] = in[2 - i];
}
#include <string.h> // memcpy (idempotent include; needed for the well-defined load/store below)
inline void _cpyrev4 (void * _tgt, const void * _src) {
    // Copy 4 bytes from _src to _tgt with the byte order reversed.
    // memcpy avoids the undefined behaviour of dereferencing a cast
    // (unsigned *) pointer — strict aliasing plus possible misalignment when
    // _src is a byte stream.  For a constant small size it compiles to the
    // identical single load/store.
    unsigned value;
    memcpy(&value, _src, sizeof(value));
    // NOTE: The compiler spots this pattern and optimizes it into a byte-swap
    value = ((value & 0xFF000000U) >> 24) |
            ((value & 0x00FF0000U) >> 8)  |
            ((value & 0x0000FF00U) << 8)  |
            ((value & 0x000000FFU) << 24);
    memcpy(_tgt, &value, sizeof(value));
}
134146inline void _cpyrev5 (void * _tgt, const void * _src) {
135147 char * tgt = (char *)_tgt; const char * src = (const char *)_src;
@@ -147,9 +159,19 @@ inline void _cpyrev7(void * _tgt, const void * _src) {
147159 tgt[3 ] = src[3 ]; tgt[2 ]=src[4 ]; tgt[1 ]=src[5 ]; tgt[0 ]=src[6 ];
148160}
#include <string.h> // memcpy (idempotent include; needed for the well-defined load/store below)
inline void _cpyrev8 (void * _tgt, const void * _src) {
    // Copy 8 bytes from _src to _tgt with the byte order reversed.
    // memcpy avoids the undefined behaviour of dereferencing a cast 64-bit
    // pointer — strict aliasing plus possible misalignment when _src is a
    // byte stream.  For a constant small size it compiles to the identical
    // single load/store.
    // unsigned long long is used rather than the non-standard unsigned
    // __int64 — same width on all supported targets.
    unsigned long long value;
    memcpy(&value, _src, sizeof(value));
    // NOTE: The compiler spots this pattern and optimizes it into a byte-swap
    value = ((value & 0xFF00000000000000ULL) >> 56) |
            ((value & 0x00FF000000000000ULL) >> 40) |
            ((value & 0x0000FF0000000000ULL) >> 24) |
            ((value & 0x000000FF00000000ULL) >> 8)  |
            ((value & 0x00000000FF000000ULL) << 8)  |
            ((value & 0x0000000000FF0000ULL) << 24) |
            ((value & 0x000000000000FF00ULL) << 40) |
            ((value & 0x00000000000000FFULL) << 56);
    memcpy(_tgt, &value, sizeof(value));
}
154176inline void _cpyrevn (void * _tgt, const void * _src, unsigned len) {
155177 char * tgt = (char *)_tgt; const char * src = (const char *)_src+len;
@@ -158,6 +180,25 @@ inline void _cpyrevn(void * _tgt, const void * _src, unsigned len) {
158180 }
159181}
160182
183+ // Define a template class to allow the common byte reversal operations to be optimized
184+ template <unsigned LEN>
185+ inline void doCopyRev (void * tgt, const void * src) {
186+ _cpyrevn (tgt, src, LEN);
187+ }
188+
189+ template <>
190+ inline void doCopyRev<2 >(void * tgt, const void * src) {
191+ _cpyrev2 (tgt, src);
192+ }
193+ template <>
194+ inline void doCopyRev<4 >(void * tgt, const void * src) {
195+ _cpyrev4 (tgt, src);
196+ }
197+ template <>
198+ inline void doCopyRev<8 >(void * tgt, const void * src) {
199+ _cpyrev8 (tgt, src);
200+ }
201+
161202#if __BYTE_ORDER == __LITTLE_ENDIAN
162203#define _WINCPYREV (x, y, len ) _cpyrevn(x, y, len)
163204#define _WINCPYREV2 (x, y ) _cpyrev2(x, y)
0 commit comments