root / host / lib / convert / sse2_sc8_to_fc32.cpp @ 4fa889c4
History | View | Annotate | Download (5.48 KB)
| 1 |
//
|
|---|---|
| 2 |
// Copyright 2012 Ettus Research LLC
|
| 3 |
//
|
| 4 |
// This program is free software: you can redistribute it and/or modify
|
| 5 |
// it under the terms of the GNU General Public License as published by
|
| 6 |
// the Free Software Foundation, either version 3 of the License, or
|
| 7 |
// (at your option) any later version.
|
| 8 |
//
|
| 9 |
// This program is distributed in the hope that it will be useful,
|
| 10 |
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 11 |
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
| 12 |
// GNU General Public License for more details.
|
| 13 |
//
|
| 14 |
// You should have received a copy of the GNU General Public License
|
| 15 |
// along with this program. If not, see <http://www.gnu.org/licenses/>.
|
| 16 |
//
|
| 17 |
|
| 18 |
#include "convert_common.hpp" |
| 19 |
#include <uhd/utils/byteswap.hpp> |
| 20 |
#include <emmintrin.h> |
| 21 |
|
| 22 |
using namespace uhd::convert; |
| 23 |
|
| 24 |
// All-zero 128-bit integer vector. unpack_sc32_4x interleaves it with the
// packed input so that every 8-bit value ends up in the top byte of a wider lane.
static const __m128i zeroi = _mm_setzero_si128(); |
| 25 |
|
| 26 |
template <const int shuf> |
| 27 |
UHD_INLINE void unpack_sc32_4x(
|
| 28 |
const __m128i &in,
|
| 29 |
__m128 &out0, __m128 &out1, |
| 30 |
__m128 &out2, __m128 &out3, |
| 31 |
const __m128 &scalar
|
| 32 |
){
|
| 33 |
const __m128i tmplo = _mm_unpacklo_epi8(zeroi, in); /* value in upper 8 bits */ |
| 34 |
__m128i tmp0 = _mm_shuffle_epi32(_mm_unpacklo_epi16(zeroi, tmplo), shuf); /* value in upper 16 bits */
|
| 35 |
__m128i tmp1 = _mm_shuffle_epi32(_mm_unpackhi_epi16(zeroi, tmplo), shuf); |
| 36 |
out0 = _mm_mul_ps(_mm_cvtepi32_ps(tmp0), scalar); |
| 37 |
out1 = _mm_mul_ps(_mm_cvtepi32_ps(tmp1), scalar); |
| 38 |
|
| 39 |
const __m128i tmphi = _mm_unpackhi_epi8(zeroi, in);
|
| 40 |
__m128i tmp2 = _mm_shuffle_epi32(_mm_unpacklo_epi16(zeroi, tmphi), shuf); |
| 41 |
__m128i tmp3 = _mm_shuffle_epi32(_mm_unpackhi_epi16(zeroi, tmphi), shuf); |
| 42 |
out2 = _mm_mul_ps(_mm_cvtepi32_ps(tmp2), scalar); |
| 43 |
out3 = _mm_mul_ps(_mm_cvtepi32_ps(tmp3), scalar); |
| 44 |
} |
| 45 |
|
| 46 |
/*!
 * SSE2 converter registered for sc8_item32_be -> fc32 (one input, one output).
 *
 * Layout of the work:
 *  - The input pointer is rounded down to a 4-byte boundary. If the caller's
 *    pointer was not 4-byte aligned, one sample is first converted through the
 *    scalar helper item32_sc8_to_xx<uhd::ntohx>, consuming one item32 word.
 *  - The SIMD loop then handles 8 samples (4 item32 words, 16 bytes) per pass.
 *  - Whatever is left (fewer than 8 samples) goes through the scalar helper.
 *
 * A request for zero samples performs no stores and leaves the counters at 0.
 */
DECLARE_CONVERTER(sc8_item32_be, 1, fc32, 1, PRIORITY_SIMD){
    // Work on whole 32-bit words: drop the low two address bits of the input.
    const item32_t *input = reinterpret_cast<const item32_t *>(size_t(inputs[0]) & ~0x3);
    fc32_t *output = reinterpret_cast<fc32_t *>(outputs[0]);

    // unpack_sc32_4x leaves every 8-bit value in the top byte of a 32-bit lane
    // (value * 2^24), so the 2^24 is folded into the scale factor here.
    const __m128 scalar = _mm_set_ps1(float(scale_factor)/(1 << 24));

    // Lane selector {2, 3, 0, 1}: exchanges the two 64-bit halves of each vector.
    const int shuf = _MM_SHUFFLE(1, 0, 3, 2);

    size_t i = 0, j = 0; // i indexes item32 words, j indexes output samples
    size_t num_samps = nsamps;

    // Misaligned start: convert a single sample with the scalar helper.
    // The num_samps check keeps an empty request from writing a sample and
    // from wrapping the unsigned counter below zero.
    if (num_samps != 0 && (size_t(inputs[0]) & 0x3) != 0){
        item32_sc8_to_xx<uhd::ntohx>(input++, output++, 1, scale_factor);
        num_samps--;
    }

    // Loop body shared by both store flavours; _al_ is pasted into the store
    // intrinsic name: "_" gives _mm_store_ps, "u_" gives _mm_storeu_ps.
    #define convert_sc8_item32_1_to_fc32_1_bswap_guts(_al_) \
    for (; j+7 < num_samps; j+=8, i+=4){ \
        /* load from input */ \
        __m128i tmpi = _mm_loadu_si128(reinterpret_cast<const __m128i *>(input+i)); \
        \
        /* unpack + swap 8-bit pairs */ \
        __m128 tmp0, tmp1, tmp2, tmp3; \
        unpack_sc32_4x<shuf>(tmpi, tmp0, tmp1, tmp2, tmp3, scalar); \
        \
        /* store to output */ \
        _mm_store ## _al_ ## ps(reinterpret_cast<float *>(output+j+0), tmp0); \
        _mm_store ## _al_ ## ps(reinterpret_cast<float *>(output+j+2), tmp1); \
        _mm_store ## _al_ ## ps(reinterpret_cast<float *>(output+j+4), tmp2); \
        _mm_store ## _al_ ## ps(reinterpret_cast<float *>(output+j+6), tmp3); \
    }

    //dispatch according to alignment of the output buffer (16-byte boundary)
    if ((size_t(output) & 0xf) == 0){
        convert_sc8_item32_1_to_fc32_1_bswap_guts(_)
    }
    else{
        convert_sc8_item32_1_to_fc32_1_bswap_guts(u_)
    }

    //convert remainder
    item32_sc8_to_xx<uhd::ntohx>(input+i, output+j, num_samps-j, scale_factor);
}
| 89 |
|
| 90 |
/*!
 * SSE2 converter registered for sc8_item32_le -> fc32 (one input, one output).
 *
 * Layout of the work:
 *  - The input pointer is rounded down to a 4-byte boundary. If the caller's
 *    pointer was not 4-byte aligned, one sample is first converted through the
 *    scalar helper item32_sc8_to_xx<uhd::wtohx>, consuming one item32 word.
 *  - The SIMD loop then handles 8 samples (4 item32 words, 16 bytes) per pass.
 *  - Whatever is left (fewer than 8 samples) goes through the scalar helper.
 *
 * A request for zero samples performs no stores and leaves the counters at 0.
 */
DECLARE_CONVERTER(sc8_item32_le, 1, fc32, 1, PRIORITY_SIMD){
    // Work on whole 32-bit words: drop the low two address bits of the input.
    const item32_t *input = reinterpret_cast<const item32_t *>(size_t(inputs[0]) & ~0x3);
    fc32_t *output = reinterpret_cast<fc32_t *>(outputs[0]);

    // unpack_sc32_4x leaves every 8-bit value in the top byte of a 32-bit lane
    // (value * 2^24), so the 2^24 is folded into the scale factor here.
    const __m128 scalar = _mm_set_ps1(float(scale_factor)/(1 << 24));

    // Lane selector {1, 0, 3, 2}: exchanges neighbouring 32-bit lanes.
    const int shuf = _MM_SHUFFLE(2, 3, 0, 1);

    size_t i = 0, j = 0; // i indexes item32 words, j indexes output samples
    size_t num_samps = nsamps;

    // Misaligned start: convert a single sample with the scalar helper.
    // The num_samps check keeps an empty request from writing a sample and
    // from wrapping the unsigned counter below zero.
    if (num_samps != 0 && (size_t(inputs[0]) & 0x3) != 0){
        item32_sc8_to_xx<uhd::wtohx>(input++, output++, 1, scale_factor);
        num_samps--;
    }

    // Loop body shared by both store flavours; _al_ is pasted into the store
    // intrinsic name: "_" gives _mm_store_ps, "u_" gives _mm_storeu_ps.
    #define convert_sc8_item32_1_to_fc32_1_nswap_guts(_al_) \
    for (; j+7 < num_samps; j+=8, i+=4){ \
        /* load from input */ \
        __m128i tmpi = _mm_loadu_si128(reinterpret_cast<const __m128i *>(input+i)); \
        \
        /* unpack + swap 8-bit pairs */ \
        __m128 tmp0, tmp1, tmp2, tmp3; \
        unpack_sc32_4x<shuf>(tmpi, tmp0, tmp1, tmp2, tmp3, scalar); \
        \
        /* store to output */ \
        _mm_store ## _al_ ## ps(reinterpret_cast<float *>(output+j+0), tmp0); \
        _mm_store ## _al_ ## ps(reinterpret_cast<float *>(output+j+2), tmp1); \
        _mm_store ## _al_ ## ps(reinterpret_cast<float *>(output+j+4), tmp2); \
        _mm_store ## _al_ ## ps(reinterpret_cast<float *>(output+j+6), tmp3); \
    }

    //dispatch according to alignment of the output buffer (16-byte boundary)
    if ((size_t(output) & 0xf) == 0){
        convert_sc8_item32_1_to_fc32_1_nswap_guts(_)
    }
    else{
        convert_sc8_item32_1_to_fc32_1_nswap_guts(u_)
    }

    //convert remainder
    item32_sc8_to_xx<uhd::wtohx>(input+i, output+j, num_samps-j, scale_factor);
}