GNU Radio 3.5.3.2 C++ API
volk_32f_x2_dot_prod_32f_a.h
Go to the documentation of this file.
1 #ifndef INCLUDED_volk_32f_x2_dot_prod_32f_a_H
2 #define INCLUDED_volk_32f_x2_dot_prod_32f_a_H
3 
4 #include <volk/volk_common.h>
5 #include<stdio.h>
6 
7 
8 #ifdef LV_HAVE_GENERIC
9 
10 
11 static inline void volk_32f_x2_dot_prod_32f_a_generic(float * result, const float * input, const float * taps, unsigned int num_points) {
12 
13  float dotProduct = 0;
14  const float* aPtr = input;
15  const float* bPtr= taps;
16  unsigned int number = 0;
17 
18  for(number = 0; number < num_points; number++){
19  dotProduct += ((*aPtr++) * (*bPtr++));
20  }
21 
22  *result = dotProduct;
23 }
24 
25 #endif /*LV_HAVE_GENERIC*/
26 
27 
28 #ifdef LV_HAVE_SSE
29 
30 
31 static inline void volk_32f_x2_dot_prod_32f_a_sse( float* result, const float* input, const float* taps, unsigned int num_points) {
32 
33  unsigned int number = 0;
34  const unsigned int quarterPoints = num_points / 4;
35 
36  float dotProduct = 0;
37  const float* aPtr = input;
38  const float* bPtr = taps;
39 
40  __m128 aVal, bVal, cVal;
41 
42  __m128 dotProdVal = _mm_setzero_ps();
43 
44  for(;number < quarterPoints; number++){
45 
46  aVal = _mm_load_ps(aPtr);
47  bVal = _mm_load_ps(bPtr);
48 
49  cVal = _mm_mul_ps(aVal, bVal);
50 
51  dotProdVal = _mm_add_ps(cVal, dotProdVal);
52 
53  aPtr += 4;
54  bPtr += 4;
55  }
56 
57  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
58 
59  _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector
60 
61  dotProduct = dotProductVector[0];
62  dotProduct += dotProductVector[1];
63  dotProduct += dotProductVector[2];
64  dotProduct += dotProductVector[3];
65 
66  number = quarterPoints * 4;
67  for(;number < num_points; number++){
68  dotProduct += ((*aPtr++) * (*bPtr++));
69  }
70 
71  *result = dotProduct;
72 
73 }
74 
75 #endif /*LV_HAVE_SSE*/
76 
77 #ifdef LV_HAVE_SSE3
78 
79 #include <pmmintrin.h>
80 
81 static inline void volk_32f_x2_dot_prod_32f_a_sse3(float * result, const float * input, const float * taps, unsigned int num_points) {
82  unsigned int number = 0;
83  const unsigned int quarterPoints = num_points / 4;
84 
85  float dotProduct = 0;
86  const float* aPtr = input;
87  const float* bPtr = taps;
88 
89  __m128 aVal, bVal, cVal;
90 
91  __m128 dotProdVal = _mm_setzero_ps();
92 
93  for(;number < quarterPoints; number++){
94 
95  aVal = _mm_load_ps(aPtr);
96  bVal = _mm_load_ps(bPtr);
97 
98  cVal = _mm_mul_ps(aVal, bVal);
99 
100  dotProdVal = _mm_hadd_ps(dotProdVal, cVal);
101 
102  aPtr += 4;
103  bPtr += 4;
104  }
105 
106  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
107  dotProdVal = _mm_hadd_ps(dotProdVal, dotProdVal);
108 
109  _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector
110 
111  dotProduct = dotProductVector[0];
112  dotProduct += dotProductVector[1];
113 
114  number = quarterPoints * 4;
115  for(;number < num_points; number++){
116  dotProduct += ((*aPtr++) * (*bPtr++));
117  }
118 
119  *result = dotProduct;
120 }
121 
122 #endif /*LV_HAVE_SSE3*/
123 
124 #ifdef LV_HAVE_SSE4_1
125 
126 #include <smmintrin.h>
127 
128 static inline void volk_32f_x2_dot_prod_32f_a_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) {
129  unsigned int number = 0;
130  const unsigned int sixteenthPoints = num_points / 16;
131 
132  float dotProduct = 0;
133  const float* aPtr = input;
134  const float* bPtr = taps;
135 
136  __m128 aVal1, bVal1, cVal1;
137  __m128 aVal2, bVal2, cVal2;
138  __m128 aVal3, bVal3, cVal3;
139  __m128 aVal4, bVal4, cVal4;
140 
141  __m128 dotProdVal = _mm_setzero_ps();
142 
143  for(;number < sixteenthPoints; number++){
144 
145  aVal1 = _mm_load_ps(aPtr); aPtr += 4;
146  aVal2 = _mm_load_ps(aPtr); aPtr += 4;
147  aVal3 = _mm_load_ps(aPtr); aPtr += 4;
148  aVal4 = _mm_load_ps(aPtr); aPtr += 4;
149 
150  bVal1 = _mm_load_ps(bPtr); bPtr += 4;
151  bVal2 = _mm_load_ps(bPtr); bPtr += 4;
152  bVal3 = _mm_load_ps(bPtr); bPtr += 4;
153  bVal4 = _mm_load_ps(bPtr); bPtr += 4;
154 
155  cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1);
156  cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2);
157  cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4);
158  cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8);
159 
160  cVal1 = _mm_or_ps(cVal1, cVal2);
161  cVal3 = _mm_or_ps(cVal3, cVal4);
162  cVal1 = _mm_or_ps(cVal1, cVal3);
163 
164  dotProdVal = _mm_add_ps(dotProdVal, cVal1);
165  }
166 
167  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
168  _mm_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector
169 
170  dotProduct = dotProductVector[0];
171  dotProduct += dotProductVector[1];
172  dotProduct += dotProductVector[2];
173  dotProduct += dotProductVector[3];
174 
175  number = sixteenthPoints * 16;
176  for(;number < num_points; number++){
177  dotProduct += ((*aPtr++) * (*bPtr++));
178  }
179 
180  *result = dotProduct;
181 }
182 
183 #endif /*LV_HAVE_SSE4_1*/
184 
185 #endif /*INCLUDED_volk_32f_x2_dot_prod_32f_a_H*/