TI DSP FFT库函数 / 憋错料

在目录c64plus\dsplib_v210\src\DSP_fft16x16，包含了三个层次的FFT库函数，分别是natural C version， intrinsic C version， serial SA version，最后一个是汇编级。在DSP_fft16x16_d.c中有三个测试用例对比耗时。三个函数用法差不多，例如：

void DSP_fft16x16_cn (
    const short * ptr_w,
    int npoints,
    short * ptr_x,
    short * ptr_y
);

void test_fft(void)
{
    const int N = 512;
    short x[2*N], y[2*N], w[2*N];
    gen_twiddle_fft16x16(w, N);

    double fs = 8000.0, f1 = 114.8, f2 = 186.2;
    short x8k[8000];
    for (int i=0; i<8000; ++i)
        x8k[i] = 1800.0 * (cos(2*PI*f1*i/fs) + cos(2*PI*f2*i/fs));

    short x5[500]; // downsample
    for (int i=0; i<500; ++i)
        x5[i] = x8k[16*i];

    int k = 0;
    for (; k<6; k++) {
        x[2*k + 0] = 0;
        x[2*k + 1] = 0;
    }
    for (; k<106; k++) {
        x[2*k + 0] = x5[k-6];
        x[2*k + 1] = 0;
    }
    for (; k<N; k++) {
        x[2*k + 0] = 0;
        x[2*k + 1] = 0;
    }

    DSP_fft16x16_cn(w, N, x, y);
    for (int i=0; i<N; ++i) {
        printf("%d\n", y[2*i]);
        printf("%d\n", y[2*i+1]);
    }
}

输出y是实部、虚部交替，可以求出功率值。

其中库函数的等效C源码是：

  1 /* ======================================================================== */
  2 /*  TEXAS INSTRUMENTS, INC.                                                 */
  3 /*                                                                          */
  4 /*  NAME                                                                    */
  5 /*      DSP_fft16x16_cn -- fft16x16                                         */
  6 /*                                                                          */
  7 /*      USAGE                                                               */
  8 /*           This routine is C-callable and can be called as:               */
  9 /*                                                                          */
 10 /*          void DSP_fft16x16_cn (                                          */
 11 /*              const short * ptr_w,                                        */
 12 /*              int npoints,                                                */
 13 /*              short * ptr_x,                                              */
 14 /*              short * ptr_y                                               */
 15 /*          );                                                              */
 16 /*                                                                          */
 17 /*            ptr_w   =  input twiddle factors                              */
 18 /*            npoints =  number of points                                   */
 19 /*            ptr_x   =  transformed data reversed                          */
 20 /*            ptr_y   =  linear transformed data                            */
 21 /*                                                                          */
 22 /*                                                                          */
 23 /*  DESCRIPTION                                                             */
 24 /*      The following code performs a mixed radix FFT for "npoints" which   */
 25 /*      is either a multiple of 4 or 2. It uses logN4 - 1 stages of radix4  */
 26 /*      transform and performs either a radix2 or radix4 transform on the   */
 27 /*      last stage depending on "npoints". If "npoints" is a multiple of 4, */
 28 /*      then this last stage is also a radix4 transform, otherwise it is a  */
 29 /*      radix2 transform.                                                   */
 30 /*                                                                          */
 31 /*                                                                          */
 32 /* int gen_twiddle_fft16x16(short *w, int n)                                */
 33 /*                                                                          */
 34 /*    int i, j, k;                                                          */
 35 /*     double M = 32767.5;                                                  */
 36 /*                                                                          */
 37 /*    for (j = 1, k = 0; j < n >> 2; j = j << 2)                            */
 38 /*    {                                                                     */
 39 /*        for (i = 0; i < n >> 2; i += j << 1)                              */
 40 /*        {                                                                 */
 41 /*                                                                          */
 42 /*          w[k + 11] = d2s(M * cos(6.0 * PI * (i + j) / n));               */
 43 /*          w[k + 10] = d2s(M * sin(6.0 * PI * (i + j) / n));               */
 44 /*          w[k +  9] = d2s(M * cos(6.0 * PI * (i    ) / n));               */
 45 /*          w[k +  8] = d2s(M * sin(6.0 * PI * (i    ) / n));               */
 46 /*                                                                          */
 47 /*          w[k +  7] = d2s(M * cos(4.0 * PI * (i + j) / n));               */
 48 /*          w[k +  6] = d2s(M * sin(4.0 * PI * (i + j) / n));               */
 49 /*          w[k +  5] = d2s(M * cos(4.0 * PI * (i    ) / n));               */
 50 /*          w[k +  4] = d2s(M * sin(4.0 * PI * (i    ) / n));               */
 51 /*                                                                          */
 52 /*          w[k +  3] = d2s(M * cos(2.0 * PI * (i + j) / n));               */
 53 /*          w[k +  2] = d2s(M * sin(2.0 * PI * (i + j) / n));               */
 54 /*          w[k +  1] = d2s(M * cos(2.0 * PI * (i    ) / n));               */
 55 /*          w[k +  0] = d2s(M * sin(2.0 * PI * (i    ) / n));               */
 56 /*                                                                          */
 57 /*          k += 12;                                                        */
 58 /*                                                                          */
 59 /*                                                                          */
 60 /*      }                                                                   */
 61 /*    }                                                                     */
 62 /*                                                                          */
 63 /*    return k;                                                             */
 64 /*                                                                          */
 65 /*                                                                          */
 66 /*  ASSUMPTIONS                                                             */
 67 /*      This code works for  both "npoints" a multiple of 2 or 4.           */
 68 /*      The arrays ‘x[]‘, ‘y[]‘, and ‘w[]‘ all must be aligned on a         */
 69 /*      double-word boundary for the "optimized" implementations.           */
 70 /*                                                                          */
 71 /*      The input and output data are complex, with the real/imaginary      */
 72 /*      components stored in adjacent locations in the array.  The real     */
 73 /*      components are stored at even array indices, and the imaginary      */
 74 /*      components are stored at odd array indices.                         */
 75 /*                                                                          */
 76 /*  TECHNIQUES                                                              */
 77 /*      The following C code represents an implementation of the Cooley     */
 78 /*      Tukey radix 4 DIF FFT. It accepts the inputs in normal order and    */
 79 /*      produces the outputs in digit reversed order. The natural C code    */
 80 /*      shown in this file on the other hand, accepts the inputs in nor-    */
 81 /*      mal order and produces the outputs in normal order.                 */
 82 /*                                                                          */
 83 /*      Several transformations have been applied to the original Cooley    */
 84 /*      Tukey code to produce the natural C code description shown here.    */
 85 /*      In order to understand these it would first be educational to       */
 86 /*      understand some of the issues involved in the conventional Cooley   */
 87 /*      Tukey FFT code.                                                     */
 88 /*                                                                          */
 89 /*      void radix4(int n, short x[], short wn[])                           */
 90 /*      {                                                                   */
 91 /*          int    n1,  n2,  ie,   ia1,  ia2, ia3;                          */
 92 /*          int    i0,  i1,  i2,    i3,    i, j,     k;                     */
 93 /*          short  co1, co2, co3,  si1,  si2, si3;                          */
 94 /*          short  xt0, yt0, xt1,  yt1,  xt2, yt2;                          */
 95 /*          short  xh0, xh1, xh20, xh21, xl0, xl1,xl20,xl21;                */
 96 /*                                                                          */
 97 /*          n2 = n;                                                         */
 98 /*          ie = 1;                                                         */
 99 /*          for (k = n; k > 1; k >>= 2)                                     */
100 /*          {                                                               */
101 /*              n1 = n2;                                                    */
102 /*              n2 >>= 2;                                                   */
103 /*              ia1 = 0;                                                    */
104 /*                                                                          */
105 /*              for (j = 0; j < n2; j++)                                    */
106 /*              {                                                           */
107 /*                   ia2 = ia1 + ia1;                                       */
108 /*                   ia3 = ia2 + ia1;                                       */
109 /*                                                                          */
110 /*                   co1 = wn[2 * ia1    ];                                 */
111 /*                   si1 = wn[2 * ia1 + 1];                                 */
112 /*                   co2 = wn[2 * ia2    ];                                 */
113 /*                   si2 = wn[2 * ia2 + 1];                                 */
114 /*                   co3 = wn[2 * ia3    ];                                 */
115 /*                   si3 = wn[2 * ia3 + 1];                                 */
116 /*                   ia1 = ia1 + ie;                                        */
117 /*                                                                          */
118 /*                   for (i0 = j; i0< n; i0 += n1)                          */
119 /*                   {                                                      */
120 /*                       i1 = i0 + n2;                                      */
121 /*                       i2 = i1 + n2;                                      */
122 /*                       i3 = i2 + n2;                                      */
123 /*                                                                          */
124 /*                       xh0  = x[2 * i0    ] + x[2 * i2    ];              */
125 /*                       xh1  = x[2 * i0 + 1] + x[2 * i2 + 1];              */
126 /*                       xl0  = x[2 * i0    ] - x[2 * i2    ];              */
127 /*                       xl1  = x[2 * i0 + 1] - x[2 * i2 + 1];              */
128 /*                                                                          */
129 /*                       xh20 = x[2 * i1    ] + x[2 * i3    ];              */
130 /*                       xh21 = x[2 * i1 + 1] + x[2 * i3 + 1];              */
131 /*                       xl20 = x[2 * i1    ] - x[2 * i3    ];              */
132 /*                       xl21 = x[2 * i1 + 1] - x[2 * i3 + 1];              */
133 /*                                                                          */
134 /*                       x[2 * i0    ] = xh0 + xh20;                        */
135 /*                       x[2 * i0 + 1] = xh1 + xh21;                        */
136 /*                                                                          */
137 /*                       xt0  = xh0 - xh20;                                 */
138 /*                       yt0  = xh1 - xh21;                                 */
139 /*                       xt1  = xl0 + xl21;                                 */
140 /*                       yt2  = xl1 + xl20;                                 */
141 /*                       xt2  = xl0 - xl21;                                 */
142 /*                       yt1  = xl1 - xl20;                                 */
143 /*                                                                          */
144 /*                       x[2 * i1    ] = (xt1 * co1 + yt1 * si1) >> 15;     */
145 /*                       x[2 * i1 + 1] = (yt1 * co1 - xt1 * si1) >> 15;     */
146 /*                       x[2 * i2    ] = (xt0 * co2 + yt0 * si2) >> 15;     */
147 /*                       x[2 * i2 + 1] = (yt0 * co2 - xt0 * si2) >> 15;     */
148 /*                       x[2 * i3    ] = (xt2 * co3 + yt2 * si3) >> 15;     */
149 /*                       x[2 * i3 + 1] = (yt2 * co3 - xt2 * si3) >> 15;     */
150 /*                   }                                                      */
151 /*             }                                                            */
152 /*             ie <<= 2;                                                    */
153 /*         }                                                                */
154 /*     }                                                                    */
155 /*                                                                          */
156 /*      The conventional Cooley Tukey FFT, is written using three loops.    */
157 /*      The outermost loop "k" cycles through the stages. There are log     */
158 /*      N to the base 4 stages in all. The loop "j" cycles through the      */
159 /*      groups of butterflies with different twiddle factors, loop "i"      */
160 /*      reuses the twiddle factors for the different butterflies within     */
161 /*      a stage. It is interesting to note the following:                   */
162 /*                                                                          */
163 /* ------------------------------------------------------------------------ */
164 /*      Stage#     #Groups     # Butterflies with common     #Groups*Bflys  */
165 /*                               twiddle factors                            */
166 /* ------------------------------------------------------------------------ */
167 /*       1         N/4          1                            N/4            */
168 /*       2         N/16         4                            N/4            */
169 /*       ..                                                                 */
170 /*       logN      1            N/4                          N/4            */
171 /* ------------------------------------------------------------------------ */
172 /*                                                                          */
173 /*      The following statements can be made based on above observations:   */
174 /*                                                                          */
175 /*      a) Inner loop "i0" iterates a veriable number of times. In          */
176 /*      particular the number of iterations quadruples every time from      */
177 /*      1..N/4. Hence software pipelining a loop that iterates a vraiable   */
178 /*      number of times is not profitable.                                  */
179 /*                                                                          */
180 /*      b) Outer loop "j" iterates a variable number of times as well.      */
181 /*      However the number of iterations is quartered every time from       */
182 /*      N/4 ..1. Hence the behaviour in (a) and (b) are exactly opposite    */
183 /*      to each other.                                                      */
184 /*                                                                          */
185 /*      c) If the two loops "i" and "j" are colaesced together then they    */
186 /*      will iterate for a fixed number of times namely N/4. This allows    */
187 /*      us to combine the "i" and "j" loops into 1 loop. Optimized impl-    */
188 /*      ementations will make use of this fact.                             */
189 /*                                                                          */
190 /*      In addition the Cooley Tukey FFT accesses three twiddle factors     */
191 /*      per iteration of the inner loop, as the butterflies that re-use     */
192 /*      twiddle factors are lumped together. This leads to accessing the    */
193 /*      twiddle factor array at three points each sepearted by "ie". Note   */
194 /*      that "ie" is initially 1, and is quadrupled with every iteration.   */
195 /*      Therfore these three twiddle factors are not even contiguous in     */
196 /*      the array.                                                          */
197 /*                                                                          */
198 /*      In order to vectorize the FFT, it is desirable to access twiddle    */
199 /*      factor array using double word wide loads and fetch the twiddle     */
200 /*      factors needed. In order to do this a modified twiddle factor       */
201 /*      array is created, in which the factors WN/4, WN/2, W3N/4 are        */
202 /*      arranged to be contiguous. This eliminates the seperation between   */
203 /*      twiddle factors within a butterfly. However this implies that as    */
204 /*      the loop is traversed from one stage to another, that we maintain   */
205 /*      a redundant version of the twiddle factor array. Hence the size     */
206 /*      of the twiddle factor array increases as compared to the normal     */
207 /*      Cooley Tukey FFT.  The modified twiddle factor array is of size     */
208 /*      "2 * N" where the conventional Cooley Tukey FFT is of size"3N/4"    */
209 /*      where N is the number of complex points to be transformed. The      */
210 /*      routine that generates the modified twiddle factor array was        */
211 /*      presented earlier. With the above transformation of the FFT,        */
212 /*      both the input data and the twiddle factor array can be accessed    */
213 /*      using double-word wide loads to enable packed data processing.      */
214 /*                                                                          */
215 /*      The final stage is optimised to remove the multiplication as        */
216 /*      w0 = 1.  This stage also performs digit reversal on the data,       */
217 /*      so the final output is in natural order.                            */
218 /*                                                                          */
219 /*      The fft() code shown here performs the bulk of the computation      */
220 /*      in place. However, because digit-reversal cannot be performed       */
221 /*      in-place, the final result is written to a separate array, y[].     */
222 /*                                                                          */
223 /*      The actual twiddle factors for the FFT are cosine, - sine. The      */
224 /*      twiddle factors stored in the table are csine and sine, hence       */
225 /*      the sign of the "sine" term is comprehended during multipli-        */
226 /*      cation as shown above.                                              */
227 /*                                                                          */
228 /*  MEMORY NOTE                                                             */
229 /*      The optimized implementations are written for LITTLE ENDIAN.        */
230 /*                                                                          */
231 /* ------------------------------------------------------------------------ */
232 /*            Copyright (c) 2007 Texas Instruments, Incorporated.           */
233 /*                           All Rights Reserved.                           */
234 /* ======================================================================== */
235 #pragma CODE_SECTION(DSP_fft16x16_cn, ".text:ansi");
236
237 #include "DSP_fft16x16_cn.h"
238
239 /*--------------------------------------------------------------------------*/
240 /* The following macro is used to obtain a digit reversed index, of a given */
241 /* number i, into j where the number of bits in "i" is "m". For the natural */
242 /* form of C code, this is done by first interchanging every set of "2 bit" */
243 /* pairs, followed by exchanging nibbles, followed by exchanging bytes, and */
244 /* finally halfwords. To give an example, condider the following number:    */
245 /*                                                                          */
246 /* N = FEDCBA9876543210, where each digit represents a bit, the following   */
247 /* steps illustrate the changes as the exchanges are performed:             */
248 /* M = DCFE98BA54761032 is the number after every "2 bits" are exchanged.   */
249 /* O = 98BADCFE10325476 is the number after every nibble is exchanged.      */
250 /* P = 1032547698BADCFE is the number after every byte is exchanged.        */
251 /* Since only 16 digits were considered this represents the digit reversed  */
252 /* index. Since the numbers are represented as 32 bits, there is one more   */
253 /* step typically of exchanging the half words as well.                     */
254 /*--------------------------------------------------------------------------*/
255 #if 0
256 # define DIG_REV(i, m, j) ((j) = (_shfl(_rotl(_bitr(_deal(i)), 16)) >> (m)))
257 #else
258 # define DIG_REV(i, m, j)                                                   259     do {                                                                    260         unsigned _ = (i);                                                   261         _ = ((_ & 0x33333333) <<  2) | ((_ & ~0x33333333) >>  2);           262         _ = ((_ & 0x0F0F0F0F) <<  4) | ((_ & ~0x0F0F0F0F) >>  4);           263         _ = ((_ & 0x00FF00FF) <<  8) | ((_ & ~0x00FF00FF) >>  8);           264         _ = ((_ & 0x0000FFFF) << 16) | ((_ & ~0x0000FFFF) >> 16);           265         (j) = _ >> (m);                                                     266     } while (0)
267 #endif
268
269
270 void DSP_fft16x16_cn (
271     const short * ptr_w,
272     int npoints,
273     short * ptr_x,
274     short * ptr_y
275 )
276 {
277     const short *w;
278     short * x, * x2, * x0;
279     short * y0, * y1, * y2, *y3;
280
281     short xt0_0, yt0_0, xt1_0, yt1_0, xt2_0, yt2_0;
282     short xt0_1, yt0_1, xt1_1, yt1_1, xt2_1, yt2_1;
283     short xh0_0, xh1_0, xh20_0, xh21_0, xl0_0, xl1_0, xl20_0, xl21_0;
284     short xh0_1, xh1_1, xh20_1, xh21_1, xl0_1, xl1_1, xl20_1, xl21_1;
285     short x_0, x_1, x_2, x_3, x_l1_0, x_l1_1, x_l1_2, x_l1_3, x_l2_0, x_l2_1;
286     short xh0_2, xh1_2, xl0_2, xl1_2, xh0_3, xh1_3, xl0_3, xl1_3;
287     short x_4, x_5, x_6, x_7, x_l2_2, x_l2_3, x_h2_0, x_h2_1, x_h2_2, x_h2_3;
288     short x_8, x_9, x_a, x_b, x_c, x_d, x_e, x_f;
289     short si10, si20, si30, co10, co20, co30;
290     short si11, si21, si31, co11, co21, co31;
291     short n00, n10, n20, n30, n01, n11, n21, n31;
292     short n02, n12, n22, n32, n03, n13, n23, n33;
293     short n0, j0;
294
295     int i, j, l1, l2, h2, predj, tw_offset, stride, fft_jmp;
296     int radix, norm, m;
297
298     /*---------------------------------------------------------------------*/
299     /* Determine the magnitude od the number of points to be transformed.  */
300     /* Check whether we can use a radix4 decomposition or a mixed radix    */
301     /* transformation, by determining modulo 2.                            */
302     /*---------------------------------------------------------------------*/
303     for (i = 31, m = 1; (npoints & (1 << i)) == 0; i--, m++)
304         ;
305     norm = m - 2;
306
307     radix = m & 1 ? 2 : 4;
308
309     /*----------------------------------------------------------------------*/
310     /* The stride is quartered with every iteration of the outer loop. It   */
311     /* denotes the seperation between any two adjacent inputs to the butter */
312     /* -fly. This should start out at N/4, hence stride is initially set to */
313     /* N. For every stride, 6*stride twiddle factors are accessed. The      */
314     /* "tw_offset" is the offset within the current twiddle factor sub-     */
315     /* table. This is set to zero, at the start of the code and is used to  */
316     /* obtain the appropriate sub-table twiddle pointer by offseting it     */
317     /* with the base pointer "ptr_w".                                       */
318     /*----------------------------------------------------------------------*/
319
320     stride = npoints;
321     tw_offset = 0;
322     fft_jmp = 6 * stride;
323
324     #ifndef NOASSUME
325     _nassert(stride > 4);
326     #pragma MUST_ITERATE(1,,1);
327     #endif
328
329     while (stride > 4) {
330         /*-----------------------------------------------------------------*/
331         /* At the start of every iteration of the outer loop, "j" is set   */
332         /* to zero, as "w" is pointing to the correct location within the  */
333         /* twiddle factor array. For every iteration of the inner loop     */
334         /* 6 * stride twiddle factors are accessed. For eg,                */
335         /*                                                                 */
336         /* #Iteration of outer loop  # twiddle factors    #times cycled    */
337         /*  1                          6 N/4               1               */
338         /*  2                          6 N/16              4               */
339         /*  ...                                                            */
340         /*-----------------------------------------------------------------*/
341         j = 0;
342         fft_jmp >>= 2;
343
344         /*-----------------------------------------------------------------*/
345         /* Set up offsets to access "N/4", "N/2", "3N/4" complex point or  */
346         /* "N/2", "N", "3N/2" half word                                    */
347         /*-----------------------------------------------------------------*/
348         h2 = stride >> 1;
349         l1 = stride;
350         l2 = stride + (stride >> 1);
351
352         /*-----------------------------------------------------------------*/
353         /*  Reset "x" to point to the start of the input data array.       */
354         /* "tw_offset" starts off at 0, and increments by "6 * stride"     */
355         /*  The stride quarters with every iteration of the outer loop     */
356         /*-----------------------------------------------------------------*/
357         x = ptr_x;
358         w = ptr_w + tw_offset;
359         tw_offset += fft_jmp;
360         stride  >>= 2;
361
362         /*----------------------------------------------------------------*/
363         /* The following loop iterates through the different butterflies, */
364         /* within a given stage. Recall that there are logN to base 4     */
365         /* stages. Certain butterflies share the twiddle factors. These   */
366         /* are grouped together. On the very first stage there are no     */
367         /* butterflies that share the twiddle factor, all N/4 butter-     */
368         /* flies have different factors. On the next stage two sets of    */
369         /* N/8 butterflies share the same twiddle factor. Hence after     */
370         /* half the butterflies are performed, j the index into the       */
371         /* factor array resets to 0, and the twiddle factors are reused.  */
372         /* When this happens, the data pointer ‘x‘ is incremented by the  */
373         /* fft_jmp amount. In addition the following code is unrolled to  */
374         /* perform "2" radix4 butterflies in parallel.                    */
375         /*----------------------------------------------------------------*/
376         #ifndef NOASSUME
377         _nassert((int)(w) % 8 == 0);
378         _nassert((int)(x) % 8 == 0);
379         _nassert(h2 % 8 == 0);
380         _nassert(l1 % 8 == 0);
381         _nassert(l2 % 8 == 0);
382         #pragma MUST_ITERATE(1,,1);
383         #endif
384
385         for (i = 0; i < (npoints >> 3); i ++) {
386             /*------------------------------------------------------------*/
387             /* Read the first 12 twiddle factors, six of which are used   */
388             /* for one radix 4 butterfly and six of which are used for    */
389             /* next one.                                                  */
390             /*------------------------------------------------------------*/
391
392             // twiddle factors for first butterfly
393             co10 = w[j+1];
394             si10 = w[j+0];
395             co20 = w[j+5];
396             si20 = w[j+4];
397             co30 = w[j+9];
398             si30 = w[j+8];
399
400             // twiddle factors for second butterfly
401             co11 = w[j+3];
402             si11 = w[j+2];
403             co21 = w[j+7];
404             si21 = w[j+6];
405             co31 = w[j+11];
406             si31 = w[j+10];
407
408             /*------------------------------------------------------------*/
409             /* Read in the first complex input for the butterflies.       */
410             /* 1st complex input to 1st butterfly: x[0] + jx[1]           */
411             /* 1st complex input to 2nd butterfly: x[2] + jx[3]           */
412             /*------------------------------------------------------------*/
413             x_0 = x[0];  // Re[x(k)]
414             x_1 = x[1];  // Im[x(k)]
415             x_2 = x[2];  // second butterfly
416             x_3 = x[3];
417
418             /*------------------------------------------------------------*/
419             /* Read in the complex inputs for the butterflies. Each of the*/
420             /* successive complex inputs of the butterfly are seperated   */
421             /* by a fixed amount known as stride. The stride starts out   */
422             /* at N/4, and quarters with every stage.                     */
423             /*------------------------------------------------------------*/
424             x_l1_0 = x[l1  ]; // Re[x(k+N/2)]
425             x_l1_1 = x[l1+1]; // Im[x(k+N/2)]
426             x_l1_2 = x[l1+2]; // second butterfly
427             x_l1_3 = x[l1+3];
428
429             x_l2_0 = x[l2  ]; // Re[x(k+3*N/2)]
430             x_l2_1 = x[l2+1]; // Im[x(k+3*N/2)]
431             x_l2_2 = x[l2+2]; // second butterfly
432             x_l2_3 = x[l2+3];
433
434             x_h2_0 = x[h2  ]; // Re[x(k+N/4)]
435             x_h2_1 = x[h2+1]; // Im[x(k+N/4)]
436             x_h2_2 = x[h2+2]; // second butterfly
437             x_h2_3 = x[h2+3];
438
439             /*-----------------------------------------------------------*/
440             /* Two butterflies are evaluated in parallel. The following  */
441             /* results will be shown for one butterfly only, although    */
442             /* both are being evaluated in parallel.                     */
443             /*                                                           */
444             /* Perform radix2 style DIF butterflies.                     */
445             /*-----------------------------------------------------------*/
446             xh0_0 = x_0 + x_l1_0;    xh1_0 = x_1 + x_l1_1;
447             xh0_1 = x_2 + x_l1_2;    xh1_1 = x_3 + x_l1_3;
448
449             xl0_0 = x_0 - x_l1_0;    xl1_0 = x_1 - x_l1_1;
450             xl0_1 = x_2 - x_l1_2;    xl1_1 = x_3 - x_l1_3;
451
452             xh20_0 = x_h2_0 + x_l2_0;    xh21_0 = x_h2_1 + x_l2_1;
453             xh20_1 = x_h2_2 + x_l2_2;    xh21_1 = x_h2_3 + x_l2_3;
454
455             xl20_0 = x_h2_0 - x_l2_0;    xl21_0 = x_h2_1 - x_l2_1;
456             xl20_1 = x_h2_2 - x_l2_2;    xl21_1 = x_h2_3 - x_l2_3;
457
458             /*-----------------------------------------------------------*/
459             /* Derive output pointers using the input pointer "x"        */
460             /*-----------------------------------------------------------*/
461             x0 = x;
462             x2 = x0;
463
464             /*-----------------------------------------------------------*/
465             /* When the twiddle factors are not to be re-used, j is      */
466             /* incremented by 12, to reflect the fact that 12 half words */
467             /* are consumed in every iteration. The input data pointer   */
468             /* increments by 4. Note that within a stage, the stride     */
469             /* does not change and hence the offsets for the other three */
470             /* legs, 0, h2, l1, l2.                                      */
471             /*-----------------------------------------------------------*/
472             j += 12;
473             x += 4;
474
475             predj = (j - fft_jmp);
476             if (!predj) x += fft_jmp;
477             if (!predj) j = 0;
478
479             /*----------------------------------------------------------*/
480             /* These four partial results can be re-written to show     */
481             /* the underlying DIF structure similar to radix2 as        */
482             /* follows:                                                 */
483             /*                                                          */
484             /* X(4k)  = (x(n)+x(n + N/2)) + (x(n+N/4)+ x(n + 3N/4))     */
485             /* X(4k+1)= (x(n)-x(n + N/2)) -j(x(n+N/4) - x(n + 3N/4))    */
486             /* x(4k+2)= (x(n)+x(n + N/2)) - (x(n+N/4)+ x(n + 3N/4))     */
487             /* X(4k+3)= (x(n)-x(n + N/2)) +j(x(n+N/4) - x(n + 3N/4))    */
488             /*                                                          */
489             /* which leads to the real and imaginary values as foll:    */
490             /*                                                          */
491             /* y0r = x0r + x2r +  x1r +  x3r    =  xh0 + xh20           */
492             /* y0i = x0i + x2i +  x1i +  x3i    =  xh1 + xh21           */
493             /* y1r = x0r - x2r + (x1i -  x3i)   =  xl0 + xl21           */
494             /* y1i = x0i - x2i - (x1r -  x3r)   =  xl1 - xl20           */
495             /* y2r = x0r + x2r - (x1r +  x3r)   =  xh0 - xh20           */
496             /* y2i = x0i + x2i - (x1i +  x3i    =  xh1 - xh21           */
497             /* y3r = x0r - x2r - (x1i -  x3i)   =  xl0 - xl21           */
498             /* y3i = x0i - x2i + (x1r -  x3r)   =  xl1 + xl20           */
499             /* ---------------------------------------------------------*/
500             x0[0] = (xh0_0 + xh20_0 + 1) >> 1;
501             x0[1] = (xh1_0 + xh21_0 + 1) >> 1;
502             x0[2] = (xh0_1 + xh20_1 + 1) >> 1;
503             x0[3] = (xh1_1 + xh21_1 + 1) >> 1;
504
505             xt0_0 = xh0_0 - xh20_0;
506             yt0_0 = xh1_0 - xh21_0;
507             xt0_1 = xh0_1 - xh20_1;
508             yt0_1 = xh1_1 - xh21_1;
509
510             xt1_0 = xl0_0 + xl21_0;
511             yt2_0 = xl1_0 + xl20_0;
512             xt2_0 = xl0_0 - xl21_0;
513             yt1_0 = xl1_0 - xl20_0;
514
515             xt1_1 = xl0_1 + xl21_1;    yt2_1 = xl1_1 + xl20_1;
516             xt2_1 = xl0_1 - xl21_1;    yt1_1 = xl1_1 - xl20_1;
517
518             /*---------------------------------------------------------*/
519             /* Perform twiddle factor multiplies of three terms,top    */
520             /* term does not have any multiplies. Note the twiddle     */
521             /* factors for a normal FFT are C + j (-S). Since the      */
522             /* factors that are stored are C + j S, this is            */
523             /* corrected for in the multiplies.                        */
524             /*                                                         */
525             /* Y1 = (xt1 + jyt1) (c + js) = (xc + ys) + (yc -xs)       */
526             /*---------------------------------------------------------*/
527             x2[l1  ] = (-co20 * xt0_0 - si20 * yt0_0 + 0x8000) >> 16;
528             x2[l1+1] = (-co20 * yt0_0 + si20 * xt0_0 + 0x8000) >> 16;
529
530             x2[l1+2] = (-co21 * xt0_1 - si21 * yt0_1 + 0x8000) >> 16;
531             x2[l1+3] = (-co21 * yt0_1 + si21 * xt0_1 + 0x8000) >> 16;
532
533             x2[h2  ] = (co10 * xt1_0 + si10 * yt1_0 + 0x8000) >> 16;
534             x2[h2+1] = (co10 * yt1_0 - si10 * xt1_0 + 0x8000) >> 16;
535
536             x2[h2+2] = (co11 * xt1_1 + si11 * yt1_1 + 0x8000) >> 16;
537             x2[h2+3] = (co11 * yt1_1 - si11 * xt1_1 + 0x8000) >> 16;
538
539             x2[l2  ] = (co30 * xt2_0 + si30 * yt2_0 + 0x8000) >> 16;
540             x2[l2+1] = (co30 * yt2_0 - si30 * xt2_0 + 0x8000) >> 16;
541
542             x2[l2+2] = (co31 * xt2_1 + si31 * yt2_1 + 0x8000) >> 16;
543             x2[l2+3] = (co31 * yt2_1 - si31 * xt2_1 + 0x8000) >> 16;
544         }
545     }
546
547     /*-----------------------------------------------------------------*/
548     /* The following code performs either a standard radix4 pass or a  */
549     /* radix2 pass. Two pointers are used to access the input data.    */
550     /* The input data is read "N/4" complex samples apart or "N/2"     */
551     /* words apart using pointers "x0" and "x2". This produces out-    */
552     /* puts that are 0, N/4, N/2, 3N/4 for a radix4 FFT, and 0, N/8    */
553     /* N/2, 3N/8 for radix 2.                                          */
554     /*-----------------------------------------------------------------*/
555     y0 = ptr_y;
556     y2 = ptr_y + (int)npoints;
557     x0 = ptr_x;
558     x2 = ptr_x + (int)(npoints >> 1);
559
560     if (radix == 2) {
561         /*----------------------------------------------------------------*/
562         /* The pointers are set at the following locations which are half */
563         /* the offsets of a radix4 FFT.                                   */
564         /*----------------------------------------------------------------*/
565         y1 = y0 + (int)(npoints >> 2);
566         y3 = y2 + (int)(npoints >> 2);
567         l1 = norm + 1;
568         j0 = 8;
569         n0 = npoints >> 1;
570     }
571     else {
572         y1 = y0 + (int)(npoints >> 1);
573         y3 = y2 + (int)(npoints >> 1);
574         l1 = norm + 2;
575         j0 = 4;
576         n0 = npoints >> 2;
577     }
578
579     /*--------------------------------------------------------------------*/
580     /* The following code reads data indentically for either a radix 4    */
581     /* or a radix 2 style decomposition. It writes out at different       */
582     /* locations though. It checks if either half the points, or a        */
583     /* quarter of the complex points have been exhausted to jump to       */
584     /* pervent double reversal.                                           */
585     /*--------------------------------------------------------------------*/
586     j = 0;
587
588     #ifndef NOASSUME
589     _nassert((int)(n0) % 4  == 0);
590     _nassert((int)(x0) % 8 == 0);
591     _nassert((int)(x2) % 8 == 0);
592     _nassert((int)(y0) % 8 == 0);
593     #pragma MUST_ITERATE(2,,2);
594     #endif
595
596     for (i = 0; i < npoints; i += 8) {
597         /*----------------------------------------------------------------*/
598         /* Digit reverse the index starting from 0. The increment to "j"  */
599         /* is either by 4, or 8.                                          */
600         /*----------------------------------------------------------------*/
601         DIG_REV(j, l1, h2);
602
603         /*----------------------------------------------------------------*/
604         /* Read in the input data, from the first eight locations. These  */
605         /* are transformed either as a radix4 or as a radix 2.            */
606         /*----------------------------------------------------------------*/
607         x_0 = x0[0];
608         x_1 = x0[1];
609         x_2 = x0[2];
610         x_3 = x0[3];
611         x_4 = x0[4];
612         x_5 = x0[5];
613         x_6 = x0[6];
614         x_7 = x0[7];
615         x0 += 8;
616
617         xh0_0 = x_0 + x_4;
618         xh1_0 = x_1 + x_5;
619         xl0_0 = x_0 - x_4;
620         xl1_0 = x_1 - x_5;
621         xh0_1 = x_2 + x_6;
622         xh1_1 = x_3 + x_7;
623         xl0_1 = x_2 - x_6;
624         xl1_1 = x_3 - x_7;
625
626         n00 = xh0_0 + xh0_1;
627         n01 = xh1_0 + xh1_1;
628         n10 = xl0_0 + xl1_1;
629         n11 = xl1_0 - xl0_1;
630         n20 = xh0_0 - xh0_1;
631         n21 = xh1_0 - xh1_1;
632         n30 = xl0_0 - xl1_1;
633         n31 = xl1_0 + xl0_1;
634
635         if (radix == 2) {
636             /*-------------------------------------------------------------*/
637             /* Perform radix2 style decomposition.                         */
638             /*-------------------------------------------------------------*/
639             n00 = x_0 + x_2;
640             n01 = x_1 + x_3;
641             n20 = x_0 - x_2;
642             n21 = x_1 - x_3;
643             n10 = x_4 + x_6;
644             n11 = x_5 + x_7;
645             n30 = x_4 - x_6;
646             n31 = x_5 - x_7;
647         }
648
649         y0[2*h2] = n00;
650         y0[2*h2 + 1] = n01;
651         y1[2*h2] = n10;
652         y1[2*h2 + 1] = n11;
653         y2[2*h2] = n20;
654         y2[2*h2 + 1] = n21;
655         y3[2*h2] = n30;
656         y3[2*h2 + 1] = n31;
657
658         /*----------------------------------------------------------------*/
659         /* Read in ht enext eight inputs, and perform radix4 or radix2    */
660         /* decomposition.                                                 */
661         /*----------------------------------------------------------------*/
662         x_8 = x2[0];    x_9 = x2[1];
663         x_a = x2[2];    x_b = x2[3];
664         x_c = x2[4];    x_d = x2[5];
665         x_e = x2[6];    x_f = x2[7];
666         x2 += 8;
667
668         xh0_2 = x_8 + x_c;    xh1_2 = x_9 + x_d;
669         xl0_2 = x_8 - x_c;    xl1_2 = x_9 - x_d;
670         xh0_3 = x_a + x_e;    xh1_3 = x_b + x_f;
671         xl0_3 = x_a - x_e;    xl1_3 = x_b - x_f;
672
673         n02 = xh0_2 + xh0_3;    n03 = xh1_2 + xh1_3;
674         n12 = xl0_2 + xl1_3;    n13 = xl1_2 - xl0_3;
675         n22 = xh0_2 - xh0_3;    n23 = xh1_2 - xh1_3;
676         n32 = xl0_2 - xl1_3;    n33 = xl1_2 + xl0_3;
677
678         if (radix == 2) {
679             n02 = x_8 + x_a;    n03 = x_9 + x_b;
680             n22 = x_8 - x_a;    n23 = x_9 - x_b;
681             n12 = x_c + x_e;    n13 = x_d + x_f;
682             n32 = x_c - x_e;    n33 = x_d - x_f;
683         }
684
685         /*-----------------------------------------------------------------*/
686         /* Points that are read from succesive locations map to y, y[N/4]  */
687         /* y[N/2], y[3N/4] in a radix4 scheme, y, y[N/8], y[N/2],y[5N/8]   */
688         /*-----------------------------------------------------------------*/
689         y0[2*h2+2] = n02;    y0[2*h2+3] = n03;
690         y1[2*h2+2] = n12;    y1[2*h2+3] = n13;
691         y2[2*h2+2] = n22;    y2[2*h2+3] = n23;
692         y3[2*h2+2] = n32;    y3[2*h2+3] = n33;
693
694         j += j0;
695         if (j == n0) {
696             j += n0;
697             x0 += (int)npoints >> 1;
698             x2 += (int)npoints >> 1;
699         }
700     }
701 }
702
703 /* ======================================================================== */
704 /*  End of file: DSP_fft16x16_cn.c                                           */
705 /* ------------------------------------------------------------------------ */
706 /*          Copyright (C) 2007 Texas Instruments, Incorporated.             */
707 /*                          All Rights Reserved.                            */
708 /* ======================================================================== */

DSP_fft16x16_cn

用于计算旋转因子的gen_twiddle_fft16x16函数在同一目录下，源代码是：

  1 /* ======================================================================== */
  2 /*      gen_twiddle_fft16x16.c -- File with twiddle factor generators.      */
  3 /* ======================================================================== */
  4 /*      This code requires a special sequence of twiddle factors stored     */
  5 /*      in 1Q15 fixed-point format.  The following C code is used for       */
  6 /*      the natural C and intrinsic C implementations.                      */
  7 /*                                                                          */
  8 /*      In order to vectorize the FFT, it is desirable to access twiddle    */
  9 /*      factor array using double word wide loads and fetch the twiddle     */
 10 /*      factors needed. In order to do this a modified twiddle factor       */
 11 /*      array is created, in which the factors WN/4, WN/2, W3N/4 are        */
 12 /*      arranged to be contiguous. This eliminates the seperation between   */
 13 /*      twiddle factors within a butterfly. However this implies that as    */
 14 /*      the loop is traversed from one stage to another, that we maintain   */
 15 /*      a redundant version of the twiddle factor array. Hence the size     */
 16 /*      of the twiddle factor array increases as compared to the normal     */
 17 /*      Cooley Tukey FFT.  The modified twiddle factor array is of size     */
 18 /*      "2 * N" where the conventional Cooley Tukey FFT is of size"3N/4"    */
 19 /*      where N is the number of complex points to be transformed. The      */
 20 /*      routine that generates the modified twiddle factor array was        */
 21 /*      presented earlier. With the above transformation of the FFT,        */
 22 /*      both the input data and the twiddle factor array can be accessed    */
 23 /*      using double-word wide loads to enable packed data processing.      */
 24 /*                                                                          */
 25 /* ------------------------------------------------------------------------ */
 26 /*            Copyright (c) 2007 Texas Instruments, Incorporated.           */
 27 /*                           All Rights Reserved.                           */
 28 /* ======================================================================== */
 29 #include <math.h>
 30 #include "gen_twiddle_fft16x16.h"
 31
 32 #ifndef PI
 33 # ifdef M_PI
 34 #  define PI M_PI
 35 # else
 36 #  define PI 3.14159265358979323846
 37 # endif
 38 #endif
 39
 40
 41 /* ======================================================================== */
 42 /*  D2S -- Truncate a ‘double‘ to a ‘short‘, with clamping.                 */
 43 /* ======================================================================== */
 44 static short d2s(double d)
 45 {
 46     d = floor(0.5 + d);  // Explicit rounding to integer //
 47     if (d >=  32767.0) return  32767;
 48     if (d <= -32768.0) return -32768;
 49     return (short)d;
 50 }
 51
 52
 53 /* ======================================================================== */
 54 /*  GEN_TWIDDLE -- Generate twiddle factors for TI‘s custom FFTs.           */
 55 /*                                                                          */
 56 /*  USAGE                                                                   */
 57 /*      This routine is called as follows:                                  */
 58 /*                                                                          */
 59 /*          int gen_twiddle_fft16x16(short *w, int n)                       */
 60 /*                                                                          */
 61 /*          short *w      Pointer to twiddle-factor array                   */
 62 /*          int   n       Size of FFT                                       */
 63 /*                                                                          */
 64 /*      The routine will generate the twiddle-factors directly into the     */
 65 /*      array you specify.  The array needs to be approximately 2*N         */
 66 /*      elements long.  (The actual size, which is slightly smaller, is     */
 67 /*      returned by the function.)                                          */
 68 /* ======================================================================== */
 69 int gen_twiddle_fft16x16(short *w, int n)
 70 {
 71     int i, j, k;
 72     double M = 32767.5;
 73
 74     for (j = 1, k = 0; j < n >> 2; j = j << 2) {
 75         for (i = 0; i < n >> 2; i += j << 1) {
 76             w[k + 11] =  d2s(M * cos(6.0 * PI * (i + j) / n));
 77             w[k + 10] =  d2s(M * sin(6.0 * PI * (i + j) / n));
 78             w[k +  9] =  d2s(M * cos(6.0 * PI * (i    ) / n));
 79             w[k +  8] =  d2s(M * sin(6.0 * PI * (i    ) / n));
 80
 81             w[k +  7] = -d2s(M * cos(4.0 * PI * (i + j) / n));
 82             w[k +  6] = -d2s(M * sin(4.0 * PI * (i + j) / n));
 83             w[k +  5] = -d2s(M * cos(4.0 * PI * (i    ) / n));
 84             w[k +  4] = -d2s(M * sin(4.0 * PI * (i    ) / n));
 85
 86             w[k +  3] =  d2s(M * cos(2.0 * PI * (i + j) / n));
 87             w[k +  2] =  d2s(M * sin(2.0 * PI * (i + j) / n));
 88             w[k +  1] =  d2s(M * cos(2.0 * PI * (i    ) / n));
 89             w[k +  0] =  d2s(M * sin(2.0 * PI * (i    ) / n));
 90
 91             k += 12;
 92         }
 93     }
 94     return k;
 95 }
 96
 97 /* ======================================================================= */
 98 /*  End of file:  gen_twiddle_fft16x16.c                                   */
 99 /* ----------------------------------------------------------------------- */
100 /*            Copyright (c) 2007 Texas Instruments, Incorporated.          */
101 /*                           All Rights Reserved.                          */
102 /* ======================================================================= */

gen_twiddle_fft16x16

在c64plus\dsplib_v210\src目录下有其他几个FFT及其余函数的源代码。特别地，在c64plus\dsplib_v210\example目录下有fft_example目录，该目录下fft_example.c列举了几个FFT的例子。和上面的代码差不多。TI DSP的FFT库函数大致是这么使用的。

原文地址：https://www.cnblogs.com/songyj06/p/fft.html

时间： 2024-10-22 16:02:53

TI DSP FFT库函数

TI DSP FFT库函数的相关文章

纪念2014 TI DSP大奖赛

203-基于ARM和双TI DSP TMS320C6678的6UCPCI高清编解码处理平台

121-基于TI DSP TMS320DM8148的全高清1080P 60fs的视频编解码系统机器人主板

175-基于TI DSP TMS320C6455、Xilinx V5 FPGA XC5VSX95T的高速数据处理核心板

140-基于双TI DSP TMS320C6670+XC7K480T的6UCPCI Express高速数据处理平台

202-基于TI DSP TMS320C6678、Xilinx K7 FPGA XC72K325T的高速数据处理核心板

246-基于TI DSP TMS320C6678、Altera FPGA的CPCI处理卡

204-基于Xilinx Virtex-6 XC6VLX240T 和TI DSP TMS320C6678的信号处理板

375-基于TI DSP TMS320C6657、XC7K325T的高速数据处理核心板