From 0ca87f05ba8bdc6791c14878464efc901ad71e99 Mon Sep 17 00:00:00 2001
From: Matt Evans
Date: Wed, 20 Jul 2011 15:51:00 +0000
Subject: net: filter: BPF 'JIT' compiler for PPC64

An implementation of a code generator for BPF programs to speed up packet
filtering on PPC64, inspired by Eric Dumazet's x86-64 version.

Filter code is generated as an ABI-compliant function in module_alloc()'d mem
with stackframe & prologue/epilogue generated if required (simple filters don't
need anything more than an li/blr).  The filter's local variables, M[], live in
registers.  Supports all BPF opcodes, although "complicated" loads from negative
packet offsets (e.g. SKF_LL_OFF) are not yet supported.

There are a couple of further optimisations left for future work; many-pass
assembly with branch-reach reduction and a register allocator to push M[]
variables into volatile registers would improve the code quality further.

This currently supports big-endian 64-bit PowerPC only (but is fairly simple
to port to PPC32 or LE!).

Enabled in the same way as x86-64:

	echo 1 > /proc/sys/net/core/bpf_jit_enable

Or, enabled with extra debug output:

	echo 2 > /proc/sys/net/core/bpf_jit_enable

Signed-off-by: Matt Evans <matt@ozlabs.org>
Acked-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/powerpc/net/bpf_jit_64.S | 138 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 138 insertions(+)
 create mode 100644 arch/powerpc/net/bpf_jit_64.S

(limited to 'arch/powerpc/net/bpf_jit_64.S')

diff --git a/arch/powerpc/net/bpf_jit_64.S b/arch/powerpc/net/bpf_jit_64.S
new file mode 100644
index 000000000000..ff4506e85cce
--- /dev/null
+++ b/arch/powerpc/net/bpf_jit_64.S
@@ -0,0 +1,138 @@
+/* bpf_jit.S: Packet/header access helper functions
+ * for PPC64 BPF compiler.
+ *
+ * Copyright 2011 Matt Evans <matt@ozlabs.org>, IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#include <asm/ppc_asm.h>
+#include "bpf_jit.h"
+
+/*
+ * All of these routines are called directly from generated code,
+ * whose register usage is:
+ *
+ * r3		skb
+ * r4,r5	A,X
+ * r6		*** address parameter to helper ***
+ * r7-r10	scratch
+ * r14		skb->data
+ * r15		skb headlen
+ * r16-31	M[]
+ */
+
+/*
+ * To consider: These helpers are so small it could be better to just
+ * generate them inline.  Inline code can do the simple headlen check
+ * then branch directly to slow_path_XXX if required.  (In fact, could
+ * load a spare GPR with the address of slow_path_generic and pass size
+ * as an argument, making the call site a mtlr, li and bllr.)
+ *
+ * Technically, the "is addr < 0" check is unnecessary & slowing down
+ * the ABS path, as it's statically checked on generation.
+ */
+	.globl	sk_load_word
+sk_load_word:
+	cmpdi	r_addr, 0
+	blt	bpf_error
+	/* Are we accessing past headlen? */
+	subi	r_scratch1, r_HL, 4
+	cmpd	r_scratch1, r_addr
+	blt	bpf_slow_path_word
+	/* Nope, just hitting the header.  cr0 here is eq or gt! */
+	lwzx	r_A, r_D, r_addr
+	/* When big endian we don't need to byteswap. */
+	blr	/* Return success, cr0 != LT */
+
+	.globl	sk_load_half
+sk_load_half:
+	cmpdi	r_addr, 0
+	blt	bpf_error
+	subi	r_scratch1, r_HL, 2
+	cmpd	r_scratch1, r_addr
+	blt	bpf_slow_path_half
+	lhzx	r_A, r_D, r_addr
+	blr
+
+	.globl	sk_load_byte
+sk_load_byte:
+	cmpdi	r_addr, 0
+	blt	bpf_error
+	cmpd	r_HL, r_addr
+	ble	bpf_slow_path_byte
+	lbzx	r_A, r_D, r_addr
+	blr
+
+/*
+ * BPF_S_LDX_B_MSH: ldxb  4*([offset]&0xf)
+ * r_addr is the offset value, already known positive
+ */
+	.globl sk_load_byte_msh
+sk_load_byte_msh:
+	cmpd	r_HL, r_addr
+	ble	bpf_slow_path_byte_msh
+	lbzx	r_X, r_D, r_addr
+	rlwinm	r_X, r_X, 2, 32-4-2, 31-2
+	blr
+
+bpf_error:
+	/* Entered with cr0 = lt */
+	li	r3, 0
+	/* Generated code will 'blt epilogue', returning 0. */
+	blr
+
+/* Call out to skb_copy_bits:
+ * We'll need to back up our volatile regs first; we have
+ * local variable space at r1+(BPF_PPC_STACK_BASIC).
+ * Allocate a new stack frame here to remain ABI-compliant in
+ * stashing LR.
+ */
+#define bpf_slow_path_common(SIZE)				\
+	mflr	r0;						\
+	std	r0, 16(r1);					\
+	/* R3 goes in parameter space of caller's frame */	\
+	std	r_skb, (BPF_PPC_STACKFRAME+48)(r1);		\
+	std	r_A, (BPF_PPC_STACK_BASIC+(0*8))(r1);		\
+	std	r_X, (BPF_PPC_STACK_BASIC+(1*8))(r1);		\
+	addi	r5, r1, BPF_PPC_STACK_BASIC+(2*8);		\
+	stdu	r1, -BPF_PPC_SLOWPATH_FRAME(r1);		\
+	/* R3 = r_skb, as passed */				\
+	mr	r4, r_addr;					\
+	li	r6, SIZE;					\
+	bl	skb_copy_bits;					\
+	/* R3 = 0 on success */					\
+	addi	r1, r1, BPF_PPC_SLOWPATH_FRAME;			\
+	ld	r0, 16(r1);					\
+	ld	r_A, (BPF_PPC_STACK_BASIC+(0*8))(r1);		\
+	ld	r_X, (BPF_PPC_STACK_BASIC+(1*8))(r1);		\
+	mtlr	r0;						\
+	cmpdi	r3, 0;						\
+	blt	bpf_error;	/* cr0 = LT */			\
+	ld	r_skb, (BPF_PPC_STACKFRAME+48)(r1);		\
+	/* Great success! */
+
+bpf_slow_path_word:
+	bpf_slow_path_common(4)
+	/* Data value is on stack, and cr0 != LT */
+	lwz	r_A, BPF_PPC_STACK_BASIC+(2*8)(r1)
+	blr
+
+bpf_slow_path_half:
+	bpf_slow_path_common(2)
+	lhz	r_A, BPF_PPC_STACK_BASIC+(2*8)(r1)
+	blr
+
+bpf_slow_path_byte:
+	bpf_slow_path_common(1)
+	lbz	r_A, BPF_PPC_STACK_BASIC+(2*8)(r1)
+	blr
+
+bpf_slow_path_byte_msh:
+	bpf_slow_path_common(1)
+	lbz	r_X, BPF_PPC_STACK_BASIC+(2*8)(r1)
+	rlwinm	r_X, r_X, 2, 32-4-2, 31-2
+	blr
-- 
cgit v1.2.3