dct-test.c
Go to the documentation of this file.
1 /*
2  * (c) 2001 Fabrice Bellard
3  * 2007 Marc Hoffman <marc.hoffman@analog.com>
4  *
5  * This file is part of Libav.
6  *
7  * Libav is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU Lesser General Public
9  * License as published by the Free Software Foundation; either
10  * version 2.1 of the License, or (at your option) any later version.
11  *
12  * Libav is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15  * Lesser General Public License for more details.
16  *
17  * You should have received a copy of the GNU Lesser General Public
18  * License along with Libav; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20  */
21 
28 #include <stdlib.h>
29 #include <stdio.h>
30 #include <string.h>
31 #include <sys/time.h>
32 #include <unistd.h>
33 #include <math.h>
34 
35 #include "libavutil/cpu.h"
36 #include "libavutil/common.h"
37 #include "libavutil/lfg.h"
38 
39 #include "simple_idct.h"
40 #include "aandcttab.h"
41 #include "faandct.h"
42 #include "faanidct.h"
43 #include "x86/idct_xvid.h"
44 #include "dctref.h"
45 
46 #undef printf
47 
48 void ff_mmx_idct(DCTELEM *data);
50 
51 void odivx_idct_c(short *block);
52 
53 // BFIN
56 
57 // ALTIVEC
59 //void idct_altivec(DCTELEM *block);?? no routine
60 
61 // ARM
67 
69 
70 struct algo {
71  const char *name;
76  int nonspec;
77 };
78 
79 #ifndef FAAN_POSTSCALE
80 #define FAAN_SCALE SCALE_PERM
81 #else
82 #define FAAN_SCALE NO_PERM
83 #endif
84 
85 static int cpu_flags;
86 
87 static const struct algo fdct_tab[] = {
88  { "REF-DBL", ff_ref_fdct, NO_PERM },
89  { "FAAN", ff_faandct, FAAN_SCALE },
90  { "IJG-AAN-INT", fdct_ifast, SCALE_PERM },
91  { "IJG-LLM-INT", ff_jpeg_fdct_islow_8, NO_PERM },
92 
93 #if HAVE_MMX
94  { "MMX", ff_fdct_mmx, NO_PERM, AV_CPU_FLAG_MMX },
97 #endif
98 
99 #if HAVE_ALTIVEC
100  { "altivecfdct", fdct_altivec, NO_PERM, AV_CPU_FLAG_ALTIVEC },
101 #endif
102 
103 #if ARCH_BFIN
104  { "BFINfdct", ff_bfin_fdct, NO_PERM },
105 #endif
106 
107  { 0 }
108 };
109 
110 static const struct algo idct_tab[] = {
111  { "FAANI", ff_faanidct, NO_PERM },
112  { "REF-DBL", ff_ref_idct, NO_PERM },
113  { "INT", j_rev_dct, MMX_PERM },
114  { "SIMPLE-C", ff_simple_idct_8, NO_PERM },
115 
116 #if HAVE_MMX
117 #if CONFIG_GPL
118  { "LIBMPEG2-MMX", ff_mmx_idct, MMX_PERM, AV_CPU_FLAG_MMX, 1 },
119  { "LIBMPEG2-MMX2", ff_mmxext_idct, MMX_PERM, AV_CPU_FLAG_MMX2, 1 },
120 #endif
122  { "XVID-MMX", ff_idct_xvid_mmx, NO_PERM, AV_CPU_FLAG_MMX, 1 },
123  { "XVID-MMX2", ff_idct_xvid_mmx2, NO_PERM, AV_CPU_FLAG_MMX2, 1 },
124  { "XVID-SSE2", ff_idct_xvid_sse2, SSE2_PERM, AV_CPU_FLAG_SSE2, 1 },
125 #endif
126 
127 #if ARCH_BFIN
128  { "BFINidct", ff_bfin_idct, NO_PERM },
129 #endif
130 
131 #if ARCH_ARM
132  { "SIMPLE-ARM", ff_simple_idct_arm, NO_PERM },
133  { "INT-ARM", ff_j_rev_dct_arm, MMX_PERM },
134 #endif
135 #if HAVE_ARMV5TE
136  { "SIMPLE-ARMV5TE", ff_simple_idct_armv5te,NO_PERM },
137 #endif
138 #if HAVE_ARMV6
139  { "SIMPLE-ARMV6", ff_simple_idct_armv6, MMX_PERM },
140 #endif
141 #if HAVE_NEON
142  { "SIMPLE-NEON", ff_simple_idct_neon, PARTTRANS_PERM },
143 #endif
144 
145 #if ARCH_ALPHA
146  { "SIMPLE-ALPHA", ff_simple_idct_axp, NO_PERM },
147 #endif
148 
149  { 0 }
150 };
151 
152 #define AANSCALE_BITS 12
153 
154 static int64_t gettime(void)
155 {
156  struct timeval tv;
157  gettimeofday(&tv, NULL);
158  return (int64_t)tv.tv_sec * 1000000 + tv.tv_usec;
159 }
160 
161 #define NB_ITS 20000
162 #define NB_ITS_SPEED 50000
163 
164 static short idct_mmx_perm[64];
165 
166 static short idct_simple_mmx_perm[64] = {
167  0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
168  0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
169  0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
170  0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
171  0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
172  0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
173  0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
174  0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
175 };
176 
177 static const uint8_t idct_sse2_row_perm[8] = { 0, 4, 1, 5, 2, 6, 3, 7 };
178 
179 static void idct_mmx_init(void)
180 {
181  int i;
182 
183  /* the mmx/mmxext idct uses a reordered input, so we patch scan tables */
184  for (i = 0; i < 64; i++) {
185  idct_mmx_perm[i] = (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
186  }
187 }
188 
189 DECLARE_ALIGNED(16, static DCTELEM, block)[64];
191 
192 static inline void mmx_emms(void)
193 {
194 #if HAVE_MMX
196  __asm__ volatile ("emms\n\t");
197 #endif
198 }
199 
200 static void init_block(DCTELEM block[64], int test, int is_idct, AVLFG *prng)
201 {
202  int i, j;
203 
204  memset(block, 0, 64 * sizeof(*block));
205 
206  switch (test) {
207  case 0:
208  for (i = 0; i < 64; i++)
209  block[i] = (av_lfg_get(prng) % 512) - 256;
210  if (is_idct) {
211  ff_ref_fdct(block);
212  for (i = 0; i < 64; i++)
213  block[i] >>= 3;
214  }
215  break;
216  case 1:
217  j = av_lfg_get(prng) % 10 + 1;
218  for (i = 0; i < j; i++)
219  block[av_lfg_get(prng) % 64] = av_lfg_get(prng) % 512 - 256;
220  break;
221  case 2:
222  block[ 0] = av_lfg_get(prng) % 4096 - 2048;
223  block[63] = (block[0] & 1) ^ 1;
224  break;
225  }
226 }
227 
228 static void permute(DCTELEM dst[64], const DCTELEM src[64], int perm)
229 {
230  int i;
231 
232  if (perm == MMX_PERM) {
233  for (i = 0; i < 64; i++)
234  dst[idct_mmx_perm[i]] = src[i];
235  } else if (perm == MMX_SIMPLE_PERM) {
236  for (i = 0; i < 64; i++)
237  dst[idct_simple_mmx_perm[i]] = src[i];
238  } else if (perm == SSE2_PERM) {
239  for (i = 0; i < 64; i++)
240  dst[(i & 0x38) | idct_sse2_row_perm[i & 7]] = src[i];
241  } else if (perm == PARTTRANS_PERM) {
242  for (i = 0; i < 64; i++)
243  dst[(i & 0x24) | ((i & 3) << 3) | ((i >> 3) & 3)] = src[i];
244  } else {
245  for (i = 0; i < 64; i++)
246  dst[i] = src[i];
247  }
248 }
249 
250 static int dct_error(const struct algo *dct, int test, int is_idct, int speed)
251 {
252  void (*ref)(DCTELEM *block) = is_idct ? ff_ref_idct : ff_ref_fdct;
253  int it, i, scale;
254  int err_inf, v;
255  int64_t err2, ti, ti1, it1, err_sum = 0;
256  int64_t sysErr[64], sysErrMax = 0;
257  int maxout = 0;
258  int blockSumErrMax = 0, blockSumErr;
259  AVLFG prng;
260  double omse, ome;
261  int spec_err;
262 
263  av_lfg_init(&prng, 1);
264 
265  err_inf = 0;
266  err2 = 0;
267  for (i = 0; i < 64; i++)
268  sysErr[i] = 0;
269  for (it = 0; it < NB_ITS; it++) {
270  init_block(block1, test, is_idct, &prng);
271  permute(block, block1, dct->format);
272 
273  dct->func(block);
274  mmx_emms();
275 
276  if (dct->format == SCALE_PERM) {
277  for (i = 0; i < 64; i++) {
278  scale = 8 * (1 << (AANSCALE_BITS + 11)) / ff_aanscales[i];
279  block[i] = (block[i] * scale) >> AANSCALE_BITS;
280  }
281  }
282 
283  ref(block1);
284 
285  blockSumErr = 0;
286  for (i = 0; i < 64; i++) {
287  int err = block[i] - block1[i];
288  err_sum += err;
289  v = abs(err);
290  if (v > err_inf)
291  err_inf = v;
292  err2 += v * v;
293  sysErr[i] += block[i] - block1[i];
294  blockSumErr += v;
295  if (abs(block[i]) > maxout)
296  maxout = abs(block[i]);
297  }
298  if (blockSumErrMax < blockSumErr)
299  blockSumErrMax = blockSumErr;
300  }
301  for (i = 0; i < 64; i++)
302  sysErrMax = FFMAX(sysErrMax, FFABS(sysErr[i]));
303 
304  for (i = 0; i < 64; i++) {
305  if (i % 8 == 0)
306  printf("\n");
307  printf("%7d ", (int) sysErr[i]);
308  }
309  printf("\n");
310 
311  omse = (double) err2 / NB_ITS / 64;
312  ome = (double) err_sum / NB_ITS / 64;
313 
314  spec_err = is_idct && (err_inf > 1 || omse > 0.02 || fabs(ome) > 0.0015);
315 
316  printf("%s %s: ppe=%d omse=%0.8f ome=%0.8f syserr=%0.8f maxout=%d blockSumErr=%d\n",
317  is_idct ? "IDCT" : "DCT", dct->name, err_inf,
318  omse, ome, (double) sysErrMax / NB_ITS,
319  maxout, blockSumErrMax);
320 
321  if (spec_err && !dct->nonspec)
322  return 1;
323 
324  if (!speed)
325  return 0;
326 
327  /* speed test */
328  init_block(block, test, is_idct, &prng);
329  permute(block1, block, dct->format);
330 
331  ti = gettime();
332  it1 = 0;
333  do {
334  for (it = 0; it < NB_ITS_SPEED; it++) {
335  memcpy(block, block1, sizeof(block));
336  dct->func(block);
337  }
338  it1 += NB_ITS_SPEED;
339  ti1 = gettime() - ti;
340  } while (ti1 < 1000000);
341  mmx_emms();
342 
343  printf("%s %s: %0.1f kdct/s\n", is_idct ? "IDCT" : "DCT", dct->name,
344  (double) it1 * 1000.0 / (double) ti1);
345 
346  return 0;
347 }
348 
349 DECLARE_ALIGNED(8, static uint8_t, img_dest)[64];
350 DECLARE_ALIGNED(8, static uint8_t, img_dest1)[64];
351 
352 static void idct248_ref(uint8_t *dest, int linesize, int16_t *block)
353 {
354  static int init;
355  static double c8[8][8];
356  static double c4[4][4];
357  double block1[64], block2[64], block3[64];
358  double s, sum, v;
359  int i, j, k;
360 
361  if (!init) {
362  init = 1;
363 
364  for (i = 0; i < 8; i++) {
365  sum = 0;
366  for (j = 0; j < 8; j++) {
367  s = (i == 0) ? sqrt(1.0 / 8.0) : sqrt(1.0 / 4.0);
368  c8[i][j] = s * cos(M_PI * i * (j + 0.5) / 8.0);
369  sum += c8[i][j] * c8[i][j];
370  }
371  }
372 
373  for (i = 0; i < 4; i++) {
374  sum = 0;
375  for (j = 0; j < 4; j++) {
376  s = (i == 0) ? sqrt(1.0 / 4.0) : sqrt(1.0 / 2.0);
377  c4[i][j] = s * cos(M_PI * i * (j + 0.5) / 4.0);
378  sum += c4[i][j] * c4[i][j];
379  }
380  }
381  }
382 
383  /* butterfly */
384  s = 0.5 * sqrt(2.0);
385  for (i = 0; i < 4; i++) {
386  for (j = 0; j < 8; j++) {
387  block1[8 * (2 * i) + j] =
388  (block[8 * (2 * i) + j] + block[8 * (2 * i + 1) + j]) * s;
389  block1[8 * (2 * i + 1) + j] =
390  (block[8 * (2 * i) + j] - block[8 * (2 * i + 1) + j]) * s;
391  }
392  }
393 
394  /* idct8 on lines */
395  for (i = 0; i < 8; i++) {
396  for (j = 0; j < 8; j++) {
397  sum = 0;
398  for (k = 0; k < 8; k++)
399  sum += c8[k][j] * block1[8 * i + k];
400  block2[8 * i + j] = sum;
401  }
402  }
403 
404  /* idct4 */
405  for (i = 0; i < 8; i++) {
406  for (j = 0; j < 4; j++) {
407  /* top */
408  sum = 0;
409  for (k = 0; k < 4; k++)
410  sum += c4[k][j] * block2[8 * (2 * k) + i];
411  block3[8 * (2 * j) + i] = sum;
412 
413  /* bottom */
414  sum = 0;
415  for (k = 0; k < 4; k++)
416  sum += c4[k][j] * block2[8 * (2 * k + 1) + i];
417  block3[8 * (2 * j + 1) + i] = sum;
418  }
419  }
420 
421  /* clamp and store the result */
422  for (i = 0; i < 8; i++) {
423  for (j = 0; j < 8; j++) {
424  v = block3[8 * i + j];
425  if (v < 0) v = 0;
426  else if (v > 255) v = 255;
427  dest[i * linesize + j] = (int) rint(v);
428  }
429  }
430 }
431 
432 static void idct248_error(const char *name,
433  void (*idct248_put)(uint8_t *dest, int line_size,
434  int16_t *block),
435  int speed)
436 {
437  int it, i, it1, ti, ti1, err_max, v;
438  AVLFG prng;
439 
440  av_lfg_init(&prng, 1);
441 
442  /* just one test to see if code is correct (precision is less
443  important here) */
444  err_max = 0;
445  for (it = 0; it < NB_ITS; it++) {
446  /* XXX: use forward transform to generate values */
447  for (i = 0; i < 64; i++)
448  block1[i] = av_lfg_get(&prng) % 256 - 128;
449  block1[0] += 1024;
450 
451  for (i = 0; i < 64; i++)
452  block[i] = block1[i];
453  idct248_ref(img_dest1, 8, block);
454 
455  for (i = 0; i < 64; i++)
456  block[i] = block1[i];
457  idct248_put(img_dest, 8, block);
458 
459  for (i = 0; i < 64; i++) {
460  v = abs((int) img_dest[i] - (int) img_dest1[i]);
461  if (v == 255)
462  printf("%d %d\n", img_dest[i], img_dest1[i]);
463  if (v > err_max)
464  err_max = v;
465  }
466  }
467  printf("%s %s: err_inf=%d\n", 1 ? "IDCT248" : "DCT248", name, err_max);
468 
469  if (!speed)
470  return;
471 
472  ti = gettime();
473  it1 = 0;
474  do {
475  for (it = 0; it < NB_ITS_SPEED; it++) {
476  for (i = 0; i < 64; i++)
477  block[i] = block1[i];
478  idct248_put(img_dest, 8, block);
479  }
480  it1 += NB_ITS_SPEED;
481  ti1 = gettime() - ti;
482  } while (ti1 < 1000000);
483  mmx_emms();
484 
485  printf("%s %s: %0.1f kdct/s\n", 1 ? "IDCT248" : "DCT248", name,
486  (double) it1 * 1000.0 / (double) ti1);
487 }
488 
489 static void help(void)
490 {
491  printf("dct-test [-i] [<test-number>]\n"
492  "test-number 0 -> test with random matrixes\n"
493  " 1 -> test with random sparse matrixes\n"
494  " 2 -> do 3. test from mpeg4 std\n"
495  "-i test IDCT implementations\n"
496  "-4 test IDCT248 implementations\n"
497  "-t speed test\n");
498 }
499 
500 int main(int argc, char **argv)
501 {
502  int test_idct = 0, test_248_dct = 0;
503  int c, i;
504  int test = 1;
505  int speed = 0;
506  int err = 0;
507 
509 
510  ff_ref_dct_init();
511  idct_mmx_init();
512 
513  for (;;) {
514  c = getopt(argc, argv, "ih4t");
515  if (c == -1)
516  break;
517  switch (c) {
518  case 'i':
519  test_idct = 1;
520  break;
521  case '4':
522  test_248_dct = 1;
523  break;
524  case 't':
525  speed = 1;
526  break;
527  default:
528  case 'h':
529  help();
530  return 0;
531  }
532  }
533 
534  if (optind < argc)
535  test = atoi(argv[optind]);
536 
537  printf("Libav DCT/IDCT test\n");
538 
539  if (test_248_dct) {
540  idct248_error("SIMPLE-C", ff_simple_idct248_put, speed);
541  } else {
542  const struct algo *algos = test_idct ? idct_tab : fdct_tab;
543  for (i = 0; algos[i].name; i++)
544  if (!(~cpu_flags & algos[i].mm_support)) {
545  err |= dct_error(&algos[i], test, test_idct, speed);
546  }
547  }
548 
549  return err;
550 }
Definition: lfg.h:25
static void mmx_emms(void)
Definition: dct-test.c:192
#define AV_CPU_FLAG_ALTIVEC
standard
Definition: cpu.h:45
static const struct algo idct_tab[]
Definition: dct-test.c:110
static uint8_t img_dest[64]
Definition: dct-test.c:349
const char * name
Definition: dct-test.c:71
void ff_simple_idct_8(DCTELEM *block)
#define AV_CPU_FLAG_MMX2
SSE integer functions or AMD MMX ext.
Definition: cpu.h:28
#define DECLARE_ALIGNED(n, t, v)
Definition: mem.h:55
int main(int argc, char **argv)
Definition: dct-test.c:500
void j_rev_dct(DCTELEM *data)
formattag
Definition: dct-test.c:73
av_cold void ff_ref_dct_init(void)
Initialize the double precision discrete cosine transform functions fdct & idct.
Definition: dctref.c:41
#define v(n)
Definition: regs.h:34
static int64_t gettime(void)
Definition: dct-test.c:154
int nonspec
Definition: dct-test.c:76
void ff_idct_xvid_sse2(short *block)
int mm_support
Definition: dct-test.c:75
void ff_fdct_mmx(DCTELEM *block)
const char data[16]
Definition: mxf.c:60
const uint16_t ff_aanscales[64]
Definition: aandcttab.c:26
static int init(AVCodecParserContext *s)
Definition: h264_parser.c:336
#define AANSCALE_BITS
Definition: dct-test.c:152
void ff_fdct_sse2(DCTELEM *block)
void ff_simple_idct_mmx(int16_t *block)
void ff_simple_idct248_put(uint8_t *dest, int line_size, DCTELEM *block)
Definition: simple_idct.c:89
#define FFMAX(a, b)
Definition: common.h:53
void ff_faanidct(DCTELEM block[64])
Definition: faanidct.c:131
static const struct algo fdct_tab[]
Definition: dct-test.c:87
Definition: dct-test.c:70
static void idct248_ref(uint8_t *dest, int linesize, int16_t *block)
Definition: dct-test.c:352
void fdct_ifast(DCTELEM *data)
Definition: jfdctfst.c:208
void ff_fdct_mmx2(DCTELEM *block)
static DCTELEM block[64]
Definition: dct-test.c:189
static short idct_simple_mmx_perm[64]
Definition: dct-test.c:166
void ff_simple_idct_arm(DCTELEM *data)
void ff_simple_idct_axp(DCTELEM *data)
static DCTELEM block1[64]
Definition: dct-test.c:190
#define FFABS(a)
Definition: common.h:50
static void init_block(DCTELEM block[64], int test, int is_idct, AVLFG *prng)
Definition: dct-test.c:200
static int cpu_flags
Definition: dct-test.c:85
void ff_simple_idct_armv5te(DCTELEM *data)
static short idct_mmx_perm[64]
Definition: dct-test.c:164
void ff_j_rev_dct_arm(DCTELEM *data)
static void idct_mmx_init(void)
Definition: dct-test.c:179
void ff_bfin_idct(DCTELEM *block)
NULL
Definition: eval.c:50
void ff_mmxext_idct(DCTELEM *data)
void ff_faandct(DCTELEM *data)
Definition: faandct.c:125
#define NB_ITS_SPEED
Definition: dct-test.c:162
#define AV_CPU_FLAG_MMX
standard MMX
Definition: cpu.h:27
static void(WINAPI *cond_broadcast)(pthread_cond_t *cond)
static const uint8_t idct_sse2_row_perm[8]
Definition: dct-test.c:177
static unsigned int av_lfg_get(AVLFG *c)
Get the next random unsigned 32-bit number using an ALFG.
Definition: lfg.h:38
void ff_simple_idct_armv6(DCTELEM *data)
#define printf
Definition: internal.h:151
void ff_ref_fdct(short *block)
Transform 8x8 block of data with a double precision forward DCT This is a reference implementation...
Definition: dctref.c:59
int av_get_cpu_flags(void)
Return the flags which specify extensions supported by the CPU.
Definition: cpu.c:22
void ff_bfin_fdct(DCTELEM *block)
short DCTELEM
Definition: dsputil.h:39
static const uint16_t scale[4]
AAN (Arai Agui Nakajima) (I)DCT tables.
void ff_idct_xvid_mmx2(short *block)
const char * name
Definition: audioconvert.c:61
header for Xvid IDCT functions
void ff_simple_idct_neon(DCTELEM *data)
void odivx_idct_c(short *block)
static uint8_t img_dest1[64]
Definition: dct-test.c:350
common internal and external API header
void ff_idct_xvid_mmx(short *block)
void ff_jpeg_fdct_islow_8(DCTELEM *data)
void(* func)(DCTELEM *block)
Definition: dct-test.c:72
void av_cold av_lfg_init(AVLFG *c, unsigned int seed)
Definition: lfg.c:30
static void idct248_error(const char *name, void(*idct248_put)(uint8_t *dest, int line_size, int16_t *block), int speed)
Definition: dct-test.c:432
void fdct_altivec(DCTELEM *block)
void ff_ref_idct(short *block)
Transform 8x8 block of data with a double precision inverse DCT This is a reference implementation...
Definition: dctref.c:95
#define c4
Definition: idct_sh4.c:30
simple idct header.
static int dct_error(const struct algo *dct, int test, int is_idct, int speed)
Definition: dct-test.c:250
void ff_mmx_idct(DCTELEM *data)
#define NB_ITS
Definition: dct-test.c:161
static void permute(DCTELEM dst[64], const DCTELEM src[64], int perm)
Definition: dct-test.c:228
#define AV_CPU_FLAG_SSE2
PIV SSE2 functions.
Definition: cpu.h:31
#define FAAN_SCALE
Definition: dct-test.c:82
enum algo::formattag format
Floating point AAN DCT
static void help(void)
Definition: dct-test.c:489
#define M_PI
Definition: cos_tablegen.c:28