aboutsummaryrefslogtreecommitdiff
path: root/libpostproc/postprocess_template.c
diff options
context:
space:
mode:
authorTucker DiNapoli2015-04-22 16:27:26 -0400
committerMichael Niedermayer2015-04-22 23:32:35 +0200
commit6264b6227c779af9d2520722f6acb45a2c51cdfd (patch)
treed9f6dada4bb2b2774c1c9344f85001a3d703793c /libpostproc/postprocess_template.c
parent748d4816d92c735f662c7ac299e79ff0f6fe252e (diff)
postproc: Replaced inline asm for prefetching with prefetch functions
Prefetching functions are defined in postprocess_template using the RENAME macro so that prefetching is used when available. For x86 targets, inline asm is used and the functions are non-empty only for CPUs where prefetching is available. For non-x86 targets, the GCC builtin prefetch is used if it is available; otherwise no prefetching is done. Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
Diffstat (limited to 'libpostproc/postprocess_template.c')
-rw-r--r--libpostproc/postprocess_template.c126
1 files changed, 71 insertions, 55 deletions
diff --git a/libpostproc/postprocess_template.c b/libpostproc/postprocess_template.c
index 16e441afe9..e153b13408 100644
--- a/libpostproc/postprocess_template.c
+++ b/libpostproc/postprocess_template.c
@@ -3242,6 +3242,69 @@ static inline void RENAME(duplicate)(uint8_t src[], int stride)
#endif
}
+#if ARCH_X86 && TEMPLATE_PP_MMXEXT
+static inline void RENAME(prefetchnta)(const void *p) /* issue an x86 PREFETCHNTA hint for address p */
+{
+ __asm__ volatile( "prefetchnta (%0)\n\t"
+ : : "r" (p) /* no outputs; p is placed in a register and used as the memory operand */
+ );
+}
+
+static inline void RENAME(prefetcht0)(const void *p) /* issue an x86 PREFETCHT0 hint for address p */
+{
+ __asm__ volatile( "prefetcht0 (%0)\n\t"
+ : : "r" (p) /* no outputs; p is placed in a register and used as the memory operand */
+ );
+}
+
+static inline void RENAME(prefetcht1)(const void *p) /* issue an x86 PREFETCHT1 hint for address p */
+{
+ __asm__ volatile( "prefetcht1 (%0)\n\t"
+ : : "r" (p) /* no outputs; p is placed in a register and used as the memory operand */
+ );
+}
+
+static inline void RENAME(prefetcht2)(const void *p) /* issue an x86 PREFETCHT2 hint for address p */
+{
+ __asm__ volatile( "prefetcht2 (%0)\n\t"
+ : : "r" (p) /* no outputs; p is placed in a register and used as the memory operand */
+ );
+}
+#elif !ARCH_X86 && AV_GCC_VERSION_AT_LEAST(3,2)
+static inline void RENAME(prefetchnta)(const void *p) /* read prefetch hint for p, no temporal locality */
+{
+ __builtin_prefetch(p,0,0); /* args: address, rw=0 (read), locality=0 (need not stay in cache) */
+}
+static inline void RENAME(prefetcht0)(const void *p) /* read prefetch hint for p, highest temporal locality */
+{
+ __builtin_prefetch(p,0,3); /* locality=3 is GCC's "keep in all cache levels", the T0 equivalent */
+}
+static inline void RENAME(prefetcht1)(const void *p) /* read prefetch hint for p, moderate temporal locality */
+{
+ __builtin_prefetch(p,0,2); /* locality=2 sits between T0 (3) and T2 (1) */
+}
+static inline void RENAME(prefetcht2)(const void *p) /* read prefetch hint for p, low temporal locality */
+{
+ __builtin_prefetch(p,0,1); /* locality=1 is the weakest temporal hint above non-temporal (0) */
+}
+#else
+static inline void RENAME(prefetchnta)(const void *p) /* fallback stub: p is ignored, nothing is prefetched */
+{
+ return;
+}
+static inline void RENAME(prefetcht0)(const void *p) /* fallback stub: p is ignored, nothing is prefetched */
+{
+ return;
+}
+static inline void RENAME(prefetcht1)(const void *p) /* fallback stub: p is ignored, nothing is prefetched */
+{
+ return;
+}
+static inline void RENAME(prefetcht2)(const void *p) /* fallback stub: p is ignored, nothing is prefetched */
+{
+ return;
+}
+#endif
/**
* Filter array of bytes (Y or U or V values)
*/
@@ -3368,34 +3431,10 @@ static void RENAME(postProcess)(const uint8_t src[], int srcStride, uint8_t dst[
// finish 1 block before the next otherwise we might have a problem
// with the L1 Cache of the P4 ... or only a few blocks at a time or something
for(x=0; x<width; x+=BLOCK_SIZE){
-
-#if TEMPLATE_PP_MMXEXT && HAVE_6REGS
-/*
- prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32);
- prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32);
- prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32);
- prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32);
-*/
-
- __asm__(
- "mov %4, %%"REG_a" \n\t"
- "shr $2, %%"REG_a" \n\t"
- "and $6, %%"REG_a" \n\t"
- "add %5, %%"REG_a" \n\t"
- "mov %%"REG_a", %%"REG_d" \n\t"
- "imul %1, %%"REG_a" \n\t"
- "imul %3, %%"REG_d" \n\t"
- "prefetchnta 32(%%"REG_a", %0) \n\t"
- "prefetcht0 32(%%"REG_d", %2) \n\t"
- "add %1, %%"REG_a" \n\t"
- "add %3, %%"REG_d" \n\t"
- "prefetchnta 32(%%"REG_a", %0) \n\t"
- "prefetcht0 32(%%"REG_d", %2) \n\t"
- :: "r" (srcBlock), "r" ((x86_reg)srcStride), "r" (dstBlock), "r" ((x86_reg)dstStride),
- "g" ((x86_reg)x), "g" ((x86_reg)copyAhead)
- : "%"REG_a, "%"REG_d
- );
-#endif
+ RENAME(prefetchnta)(srcBlock + (((x>>2)&6) + copyAhead)*srcStride + 32);
+ RENAME(prefetchnta)(srcBlock + (((x>>2)&6) + copyAhead+1)*srcStride + 32);
+ RENAME(prefetcht0)(dstBlock + (((x>>2)&6) + copyAhead)*dstStride + 32);
+ RENAME(prefetcht0)(dstBlock + (((x>>2)&6) + copyAhead+1)*dstStride + 32);
RENAME(blockCopy)(dstBlock + dstStride*8, dstStride,
srcBlock + srcStride*8, srcStride, mode & LEVEL_FIX, &c.packedYOffset);
@@ -3474,33 +3513,10 @@ static void RENAME(postProcess)(const uint8_t src[], int srcStride, uint8_t dst[
uint8_t *dstBlockStart = dstBlock;
const uint8_t *srcBlockStart = srcBlock;
for(; x < endx; x+=BLOCK_SIZE){
-#if TEMPLATE_PP_MMXEXT && HAVE_6REGS
-/*
- prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32);
- prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32);
- prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32);
- prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32);
-*/
-
- __asm__(
- "mov %4, %%"REG_a" \n\t"
- "shr $2, %%"REG_a" \n\t"
- "and $6, %%"REG_a" \n\t"
- "add %5, %%"REG_a" \n\t"
- "mov %%"REG_a", %%"REG_d" \n\t"
- "imul %1, %%"REG_a" \n\t"
- "imul %3, %%"REG_d" \n\t"
- "prefetchnta 32(%%"REG_a", %0) \n\t"
- "prefetcht0 32(%%"REG_d", %2) \n\t"
- "add %1, %%"REG_a" \n\t"
- "add %3, %%"REG_d" \n\t"
- "prefetchnta 32(%%"REG_a", %0) \n\t"
- "prefetcht0 32(%%"REG_d", %2) \n\t"
- :: "r" (srcBlock), "r" ((x86_reg)srcStride), "r" (dstBlock), "r" ((x86_reg)dstStride),
- "g" ((x86_reg)x), "g" ((x86_reg)copyAhead)
- : "%"REG_a, "%"REG_d
- );
-#endif
+ RENAME(prefetchnta)(srcBlock + (((x>>2)&6) + copyAhead)*srcStride + 32);
+ RENAME(prefetchnta)(srcBlock + (((x>>2)&6) + copyAhead+1)*srcStride + 32);
+ RENAME(prefetcht0)(dstBlock + (((x>>2)&6) + copyAhead)*dstStride + 32);
+ RENAME(prefetcht0)(dstBlock + (((x>>2)&6) + copyAhead+1)*dstStride + 32);
RENAME(blockCopy)(dstBlock + dstStride*copyAhead, dstStride,
srcBlock + srcStride*copyAhead, srcStride, mode & LEVEL_FIX, &c.packedYOffset);