From 2de4ff7bd658c97fb357efa3095a509674dacb5a Mon Sep 17 00:00:00 2001
From: Thomas Graf
Date: Thu, 23 Jun 2005 20:49:30 -0700
Subject: [LIB]: Textsearch infrastructure.

The textsearch infrastructure provides text searching
facitilies for both linear and non-linear data.
Individual search algorithms are implemented in modules
and chosen by the user.

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 lib/Kconfig      |   8 +-
 lib/Makefile     |   2 +
 lib/textsearch.c | 317 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 326 insertions(+), 1 deletion(-)
 create mode 100644 lib/textsearch.c

(limited to 'lib')

diff --git a/lib/Kconfig b/lib/Kconfig
index 2d4d4e3bc4aa..5bc2d523e6d1 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -63,5 +63,11 @@ config REED_SOLOMON_ENC16
 config REED_SOLOMON_DEC16
 	boolean
 
-endmenu
+config TEXTSEARCH
+	boolean "Textsearch infrastructure"
+	default y
+	help
+	  Say Y here if you want to provide a textsearch infrastructure
+	  to other subsystems.
 
+endmenu
diff --git a/lib/Makefile b/lib/Makefile
index dcb4231916e2..3e917436ad60 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -36,6 +36,8 @@ obj-$(CONFIG_ZLIB_INFLATE) += zlib_inflate/
 obj-$(CONFIG_ZLIB_DEFLATE) += zlib_deflate/
 obj-$(CONFIG_REED_SOLOMON) += reed_solomon/
 
+lib-$(CONFIG_TEXTSEARCH) += textsearch.o
+
 hostprogs-y	:= gen_crc32table
 clean-files	:= crc32table.h
 
diff --git a/lib/textsearch.c b/lib/textsearch.c
new file mode 100644
index 000000000000..1e934c196f0f
--- /dev/null
+++ b/lib/textsearch.c
@@ -0,0 +1,317 @@
+/*
+ * lib/textsearch.c	Generic text search interface
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ * Authors:	Thomas Graf <tgraf@suug.ch>
+ * 		Pablo Neira Ayuso <pablo@eurodev.net>
+ *
+ * ==========================================================================
+ *
+ * INTRODUCTION
+ *
+ *   The textsearch infrastructure provides text searching facitilies for
+ *   both linear and non-linear data. Individual search algorithms are
+ *   implemented in modules and chosen by the user.
+ *
+ * ARCHITECTURE
+ *
+ *      User
+ *     +----------------+
+ *     |        finish()|<--------------(6)-----------------+
+ *     |get_next_block()|<--------------(5)---------------+ |
+ *     |                |                     Algorithm   | |
+ *     |                |                    +------------------------------+
+ *     |                |                    |  init()   find()   destroy() |
+ *     |                |                    +------------------------------+
+ *     |                |       Core API           ^       ^          ^
+ *     |                |      +---------------+  (2)     (4)        (8)
+ *     |             (1)|----->| prepare()     |---+       |          |
+ *     |             (3)|----->| find()/next() |-----------+          |
+ *     |             (7)|----->| destroy()     |----------------------+
+ *     +----------------+      +---------------+
+ *  
+ *   (1) User configures a search by calling _prepare() specifying the
+ *       search parameters such as the pattern and algorithm name.
+ *   (2) Core requests the algorithm to allocate and initialize a search
+ *       configuration according to the specified parameters.
+ *   (3) User starts the search(es) by calling _find() or _next() to
+ *       fetch subsequent occurrences. A state variable is provided
+ *       to the algorihtm to store persistant variables.
+ *   (4) Core eventually resets the search offset and forwards the find()
+ *       request to the algorithm.
+ *   (5) Algorithm calls get_next_block() provided by the user continously
+ *       to fetch the data to be searched in block by block.
+ *   (6) Algorithm invokes finish() after the last call to get_next_block
+ *       to clean up any leftovers from get_next_block. (Optional)
+ *   (7) User destroys the configuration by calling _destroy().
+ *   (8) Core notifies the algorithm to destroy algorithm specific
+ *       allocations. (Optional)
+ *
+ * USAGE
+ *
+ *   Before a search can be performed, a configuration must be created
+ *   by calling textsearch_prepare() specyfing the searching algorithm and
+ *   the pattern to look for. The returned configuration may then be used
+ *   for an arbitary amount of times and even in parallel as long as a
+ *   separate struct ts_state variable is provided to every instance.
+ *
+ *   The actual search is performed by either calling textsearch_find_-
+ *   continuous() for linear data or by providing an own get_next_block()
+ *   implementation and calling textsearch_find(). Both functions return
+ *   the position of the first occurrence of the patern or UINT_MAX if
+ *   no match was found. Subsequent occurences can be found by calling
+ *   textsearch_next() regardless of the linearity of the data.
+ *
+ *   Once you're done using a configuration it must be given back via
+ *   textsearch_destroy.
+ *
+ * EXAMPLE
+ *
+ *   int pos;
+ *   struct ts_config *conf;
+ *   struct ts_state state;
+ *   const char *pattern = "chicken";
+ *   const char *example = "We dance the funky chicken";
+ *
+ *   conf = textsearch_prepare("kmp", pattern, strlen(pattern),
+ *                             GFP_KERNEL, TS_AUTOLOAD);
+ *   if (IS_ERR(conf)) {
+ *       err = PTR_ERR(conf);
+ *       goto errout;
+ *   }
+ *
+ *   pos = textsearch_find_continuous(conf, &state, example, strlen(example));
+ *   if (pos != UINT_MAX)
+ *       panic("Oh my god, dancing chickens at %d\n", pos);
+ *
+ *   textsearch_destroy(conf);
+ *
+ * ==========================================================================
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/init.h>
+#include <linux/rcupdate.h>
+#include <linux/err.h>
+#include <linux/textsearch.h>
+
+static LIST_HEAD(ts_ops);
+static DEFINE_SPINLOCK(ts_mod_lock);
+
+static inline struct ts_ops *lookup_ts_algo(const char *name)
+{
+	struct ts_ops *o;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(o, &ts_ops, list) {
+		if (!strcmp(name, o->name)) {
+			if (!try_module_get(o->owner))
+				o = NULL;
+			rcu_read_unlock();
+			return o;
+		}
+	}
+	rcu_read_unlock();
+
+	return NULL;
+}
+
+/**
+ * textsearch_register - register a textsearch module
+ * @ops: operations lookup table
+ *
+ * This function must be called by textsearch modules to announce
+ * their presence. The specified &@ops must have %name set to a
+ * unique identifier and the callbacks find(), init(), get_pattern(),
+ * and get_pattern_len() must be implemented.
+ *
+ * Returns 0 or -EEXISTS if another module has already registered
+ * with same name.
+ */
+int textsearch_register(struct ts_ops *ops)
+{
+	int err = -EEXIST;
+	struct ts_ops *o;
+
+	if (ops->name == NULL || ops->find == NULL || ops->init == NULL ||
+	    ops->get_pattern == NULL || ops->get_pattern_len == NULL)
+		return -EINVAL;
+
+	spin_lock(&ts_mod_lock);
+	list_for_each_entry(o, &ts_ops, list) {
+		if (!strcmp(ops->name, o->name))
+			goto errout;
+	}
+
+	list_add_tail_rcu(&ops->list, &ts_ops);
+	err = 0;
+errout:
+	spin_unlock(&ts_mod_lock);
+	return err;
+}
+
+/**
+ * textsearch_unregister - unregister a textsearch module
+ * @ops: operations lookup table
+ *
+ * This function must be called by textsearch modules to announce
+ * their disappearance for examples when the module gets unloaded.
+ * The &ops parameter must be the same as the one during the
+ * registration.
+ *
+ * Returns 0 on success or -ENOENT if no matching textsearch
+ * registration was found.
+ */
+int textsearch_unregister(struct ts_ops *ops)
+{
+	int err = 0;
+	struct ts_ops *o;
+
+	spin_lock(&ts_mod_lock);
+	list_for_each_entry(o, &ts_ops, list) {
+		if (o == ops) {
+			list_del_rcu(&o->list);
+			goto out;
+		}
+	}
+
+	err = -ENOENT;
+out:
+	spin_unlock(&ts_mod_lock);
+	return err;
+}
+
+struct ts_linear_state
+{
+	unsigned int	len;
+	const void	*data;
+};
+
+static unsigned int get_linear_data(unsigned int consumed, const u8 **dst,
+				    struct ts_config *conf,
+				    struct ts_state *state)
+{
+	struct ts_linear_state *st = (struct ts_linear_state *) state->cb;
+
+	if (likely(consumed < st->len)) {
+		*dst = st->data + consumed;
+		return st->len - consumed;
+	}
+
+	return 0;
+}
+
+/**
+ * textsearch_find_continuous - search a pattern in continuous/linear data
+ * @conf: search configuration
+ * @state: search state
+ * @data: data to search in
+ * @len: length of data
+ *
+ * A simplified version of textsearch_find() for continuous/linear data.
+ * Call textsearch_next() to retrieve subsequent matches.
+ *
+ * Returns the position of first occurrence of the pattern or
+ * UINT_MAX if no occurrence was found.
+ */ 
+unsigned int textsearch_find_continuous(struct ts_config *conf,
+					struct ts_state *state,
+					const void *data, unsigned int len)
+{
+	struct ts_linear_state *st = (struct ts_linear_state *) state->cb;
+
+	conf->get_next_block = get_linear_data;
+	st->data = data;
+	st->len = len;
+
+	return textsearch_find(conf, state);
+}
+
+/**
+ * textsearch_prepare - Prepare a search
+ * @algo: name of search algorithm
+ * @pattern: pattern data
+ * @len: length of pattern
+ * @gfp_mask: allocation mask
+ * @flags: search flags
+ *
+ * Looks up the search algorithm module and creates a new textsearch
+ * configuration for the specified pattern. Upon completion all
+ * necessary refcnts are held and the configuration must be put back
+ * using textsearch_put() after usage.
+ *
+ * Note: The format of the pattern may not be compatible between
+ *       the various search algorithms.
+ *
+ * Returns a new textsearch configuration according to the specified
+ *         parameters or a ERR_PTR().
+ */
+struct ts_config *textsearch_prepare(const char *algo, const void *pattern,
+				     unsigned int len, int gfp_mask, int flags)
+{
+	int err = -ENOENT;
+	struct ts_config *conf;
+	struct ts_ops *ops;
+	
+	ops = lookup_ts_algo(algo);
+#ifdef CONFIG_KMOD
+	/*
+	 * Why not always autoload you may ask. Some users are
+	 * in a situation where requesting a module may deadlock,
+	 * especially when the module is located on a NFS mount.
+	 */
+	if (ops == NULL && flags & TS_AUTOLOAD) {
+		request_module("ts_%s", algo);
+		ops = lookup_ts_algo(algo);
+	}
+#endif
+
+	if (ops == NULL)
+		goto errout;
+
+	conf = ops->init(pattern, len, gfp_mask);
+	if (IS_ERR(conf)) {
+		err = PTR_ERR(conf);
+		goto errout;
+	}
+
+	conf->ops = ops;
+	return conf;
+
+errout:
+	if (ops)
+		module_put(ops->owner);
+		
+	return ERR_PTR(err);
+}
+
+/**
+ * textsearch_destroy - destroy a search configuration
+ * @conf: search configuration
+ *
+ * Releases all references of the configuration and frees
+ * up the memory.
+ */
+void textsearch_destroy(struct ts_config *conf)
+{
+	if (conf->ops) {
+		if (conf->ops->destroy)
+			conf->ops->destroy(conf);
+		module_put(conf->ops->owner);
+	}
+
+	kfree(conf);
+}
+
+EXPORT_SYMBOL(textsearch_register);
+EXPORT_SYMBOL(textsearch_unregister);
+EXPORT_SYMBOL(textsearch_prepare);
+EXPORT_SYMBOL(textsearch_find_continuous);
+EXPORT_SYMBOL(textsearch_destroy);
-- 
cgit v1.2.3


From df3fb93ad9ec0b20c785c0ad82d42d159a1af272 Mon Sep 17 00:00:00 2001
From: Thomas Graf
Date: Thu, 23 Jun 2005 20:58:37 -0700
Subject: [LIB]: Knuth-Morris-Pratt textsearch algorithm

Implements a linear-time string-matching algorithm due to Knuth,
Morris, and Pratt [1]. Their algorithm avoids the explicit
computation of the transition function DELTA altogether. Its
matching time is O(n), for n being length(text), using just an
auxiliary function PI[1..m], for m being length(pattern),
precomputed from the pattern in time O(m). The array PI allows
the transition function DELTA to be computed efficiently
"on the fly" as needed. Roughly speaking, for any state
"q" = 0,1,...,m and any character "a" in SIGMA, the value
PI["q"] contains the information that is independent of "a" and
is needed to compute DELTA("q", "a") [2]. Since the array PI
has only m entries, whereas DELTA has O(m|SIGMA|) entries, we
save a factor of |SIGMA| in the preprocessing time by computing
PI rather than DELTA.
 
[1] Cormen, Leiserson, Rivest, Stein
    Introdcution to Algorithms, 2nd Edition, MIT Press
[2] See finite automation theory

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 lib/Kconfig  |  10 +++++
 lib/Makefile |   1 +
 lib/ts_kmp.c | 145 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 156 insertions(+)
 create mode 100644 lib/ts_kmp.c

(limited to 'lib')

diff --git a/lib/Kconfig b/lib/Kconfig
index 5bc2d523e6d1..16b8fa2175e4 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -70,4 +70,14 @@ config TEXTSEARCH
 	  Say Y here if you want to provide a textsearch infrastructure
 	  to other subsystems.
 
+config TEXTSEARCH_KMP
+	depends on TEXTSEARCH
+	tristate "Knuth-Morris-Pratt"
+	help
+	  Say Y here if you want to be able to search text using the
+	  Knuth-Morris-Pratt textsearch algorithm.
+
+	  To compile this code as a module, choose M here: the
+	  module will be called ts_kmp.
+
 endmenu
diff --git a/lib/Makefile b/lib/Makefile
index 3e917436ad60..6cdb10f312df 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -37,6 +37,7 @@ obj-$(CONFIG_ZLIB_DEFLATE) += zlib_deflate/
 obj-$(CONFIG_REED_SOLOMON) += reed_solomon/
 
 lib-$(CONFIG_TEXTSEARCH) += textsearch.o
+obj-$(CONFIG_TEXTSEARCH_KMP) += ts_kmp.o
 
 hostprogs-y	:= gen_crc32table
 clean-files	:= crc32table.h
diff --git a/lib/ts_kmp.c b/lib/ts_kmp.c
new file mode 100644
index 000000000000..73266b975585
--- /dev/null
+++ b/lib/ts_kmp.c
@@ -0,0 +1,145 @@
+/*
+ * lib/ts_kmp.c		Knuth-Morris-Pratt text search implementation
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ * Authors:	Thomas Graf <tgraf@suug.ch>
+ *
+ * ==========================================================================
+ * 
+ *   Implements a linear-time string-matching algorithm due to Knuth,
+ *   Morris, and Pratt [1]. Their algorithm avoids the explicit
+ *   computation of the transition function DELTA altogether. Its
+ *   matching time is O(n), for n being length(text), using just an
+ *   auxiliary function PI[1..m], for m being length(pattern),
+ *   precomputed from the pattern in time O(m). The array PI allows
+ *   the transition function DELTA to be computed efficiently
+ *   "on the fly" as needed. Roughly speaking, for any state
+ *   "q" = 0,1,...,m and any character "a" in SIGMA, the value
+ *   PI["q"] contains the information that is independent of "a" and
+ *   is needed to compute DELTA("q", "a") [2]. Since the array PI
+ *   has only m entries, whereas DELTA has O(m|SIGMA|) entries, we
+ *   save a factor of |SIGMA| in the preprocessing time by computing
+ *   PI rather than DELTA.
+ *
+ *   [1] Cormen, Leiserson, Rivest, Stein
+ *       Introdcution to Algorithms, 2nd Edition, MIT Press
+ *   [2] See finite automation theory
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/textsearch.h>
+
+struct ts_kmp
+{
+	u8 *		pattern;
+	unsigned int	pattern_len;
+	unsigned int 	prefix_tbl[0];
+};
+
+static unsigned int kmp_find(struct ts_config *conf, struct ts_state *state)
+{
+	struct ts_kmp *kmp = ts_config_priv(conf);
+	unsigned int i, q = 0, text_len, consumed = state->offset;
+	const u8 *text;
+
+	for (;;) {
+		text_len = conf->get_next_block(consumed, &text, conf, state);
+
+		if (unlikely(text_len == 0))
+			break;
+
+		for (i = 0; i < text_len; i++) {
+			while (q > 0 && kmp->pattern[q] != text[i])
+				q = kmp->prefix_tbl[q - 1];
+			if (kmp->pattern[q] == text[i])
+				q++;
+			if (unlikely(q == kmp->pattern_len)) {
+				state->offset = consumed + i + 1;
+				return state->offset - kmp->pattern_len;
+			}
+		}
+
+		consumed += text_len;
+	}
+
+	return UINT_MAX;
+}
+
+static inline void compute_prefix_tbl(const u8 *pattern, unsigned int len,
+				      unsigned int *prefix_tbl)
+{
+	unsigned int k, q;
+
+	for (k = 0, q = 1; q < len; q++) {
+		while (k > 0 && pattern[k] != pattern[q])
+			k = prefix_tbl[k-1];
+		if (pattern[k] == pattern[q])
+			k++;
+		prefix_tbl[q] = k;
+	}
+}
+
+static struct ts_config *kmp_init(const void *pattern, unsigned int len,
+				  int gfp_mask)
+{
+	struct ts_config *conf;
+	struct ts_kmp *kmp;
+	unsigned int prefix_tbl_len = len * sizeof(unsigned int);
+	size_t priv_size = sizeof(*kmp) + len + prefix_tbl_len;
+
+	conf = alloc_ts_config(priv_size, gfp_mask);
+	if (IS_ERR(conf))
+		return conf;
+
+	kmp = ts_config_priv(conf);
+	kmp->pattern_len = len;
+	compute_prefix_tbl(pattern, len, kmp->prefix_tbl);
+	kmp->pattern = (u8 *) kmp->prefix_tbl + prefix_tbl_len;
+	memcpy(kmp->pattern, pattern, len);
+
+	return conf;
+}
+
+static void *kmp_get_pattern(struct ts_config *conf)
+{
+	struct ts_kmp *kmp = ts_config_priv(conf);
+	return kmp->pattern;
+}
+
+static unsigned int kmp_get_pattern_len(struct ts_config *conf)
+{
+	struct ts_kmp *kmp = ts_config_priv(conf);
+	return kmp->pattern_len;
+}
+
+static struct ts_ops kmp_ops = {
+	.name		  = "kmp",
+	.find		  = kmp_find,
+	.init		  = kmp_init,
+	.get_pattern	  = kmp_get_pattern,
+	.get_pattern_len  = kmp_get_pattern_len,
+	.owner		  = THIS_MODULE,
+	.list		  = LIST_HEAD_INIT(kmp_ops.list)
+};
+
+static int __init init_kmp(void)
+{
+	return textsearch_register(&kmp_ops);
+}
+
+static void __exit exit_kmp(void)
+{
+	textsearch_unregister(&kmp_ops);
+}
+
+MODULE_LICENSE("GPL");
+
+module_init(init_kmp);
+module_exit(exit_kmp);
-- 
cgit v1.2.3


From 6408f79cce401e1bfecf923e7156f84f96e021e3 Mon Sep 17 00:00:00 2001
From: Thomas Graf
Date: Thu, 23 Jun 2005 20:59:16 -0700
Subject: [LIB]: Naive finite state machine based textsearch

A finite state machine consists of n states (struct ts_fsm_token)
representing the pattern as a finite automation. The data is read
sequentially on a octet basis. Every state token specifies the number
of recurrences and the type of value accepted which can be either a
specific character or ctype based set of characters. The available
type of recurrences include 1, (0|1), [0 n], and [1 n].

The algorithm differs between strict/non-strict mode specyfing
whether the pattern has to start at the first octect. Strict mode
is enabled by default and can be disabled by inserting
TS_FSM_HEAD_IGNORE as the first token in the chain.

The runtime performance of the algorithm should be around O(n),
however while in strict mode the average runtime can be better.

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/textsearch_fsm.h |  48 ++++++
 lib/Kconfig                    |  11 ++
 lib/Makefile                   |   1 +
 lib/ts_fsm.c                   | 338 +++++++++++++++++++++++++++++++++++++++++
 4 files changed, 398 insertions(+)
 create mode 100644 include/linux/textsearch_fsm.h
 create mode 100644 lib/ts_fsm.c

(limited to 'lib')

diff --git a/include/linux/textsearch_fsm.h b/include/linux/textsearch_fsm.h
new file mode 100644
index 000000000000..fdfa078c66e5
--- /dev/null
+++ b/include/linux/textsearch_fsm.h
@@ -0,0 +1,48 @@
+#ifndef __LINUX_TEXTSEARCH_FSM_H
+#define __LINUX_TEXTSEARCH_FSM_H
+
+#include <linux/types.h>
+
+enum {
+	TS_FSM_SPECIFIC,	/* specific character */
+	TS_FSM_WILDCARD,	/* any character */
+	TS_FSM_DIGIT,		/* isdigit() */
+	TS_FSM_XDIGIT,		/* isxdigit() */
+	TS_FSM_PRINT,		/* isprint() */
+	TS_FSM_ALPHA,		/* isalpha() */
+	TS_FSM_ALNUM,		/* isalnum() */
+	TS_FSM_ASCII,		/* isascii() */
+	TS_FSM_CNTRL,		/* iscntrl() */
+	TS_FSM_GRAPH,		/* isgraph() */
+	TS_FSM_LOWER,		/* islower() */
+	TS_FSM_UPPER,		/* isupper() */
+	TS_FSM_PUNCT,		/* ispunct() */
+	TS_FSM_SPACE,		/* isspace() */
+	__TS_FSM_TYPE_MAX,
+};
+#define TS_FSM_TYPE_MAX (__TS_FSM_TYPE_MAX - 1)
+
+enum {
+	TS_FSM_SINGLE,		/* 1 occurrence */
+	TS_FSM_PERHAPS,		/* 1 or 0 occurrence */
+	TS_FSM_ANY,		/* 0..n occurrences */
+	TS_FSM_MULTI,		/* 1..n occurrences */
+	TS_FSM_HEAD_IGNORE,	/* 0..n ignored occurrences at head */
+	__TS_FSM_RECUR_MAX,
+};
+#define TS_FSM_RECUR_MAX (__TS_FSM_RECUR_MAX - 1)
+
+/**
+ * struct ts_fsm_token - state machine token (state)
+ * @type: type of token
+ * @recur: number of recurrences
+ * @value: character value for TS_FSM_SPECIFIC
+ */
+struct ts_fsm_token
+{
+	__u16		type;
+	__u8		recur;
+	__u8		value;
+};
+
+#endif
diff --git a/lib/Kconfig b/lib/Kconfig
index 16b8fa2175e4..455833a9e31a 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -80,4 +80,15 @@ config TEXTSEARCH_KMP
 	  To compile this code as a module, choose M here: the
 	  module will be called ts_kmp.
 
+config TEXTSEARCH_FSM
+	depends on TEXTSEARCH
+	tristate "Finite state machine"
+	help
+	  Say Y here if you want to be able to search text using a
+	  naive finite state machine approach implementing a subset
+	  of regular expressions.
+
+	  To compile this code as a module, choose M here: the
+	  module will be called ts_fsm.
+
 endmenu
diff --git a/lib/Makefile b/lib/Makefile
index 6cdb10f312df..7f6eda449102 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -38,6 +38,7 @@ obj-$(CONFIG_REED_SOLOMON) += reed_solomon/
 
 lib-$(CONFIG_TEXTSEARCH) += textsearch.o
 obj-$(CONFIG_TEXTSEARCH_KMP) += ts_kmp.o
+obj-$(CONFIG_TEXTSEARCH_FSM) += ts_fsm.o
 
 hostprogs-y	:= gen_crc32table
 clean-files	:= crc32table.h
diff --git a/lib/ts_fsm.c b/lib/ts_fsm.c
new file mode 100644
index 000000000000..d27c0a072940
--- /dev/null
+++ b/lib/ts_fsm.c
@@ -0,0 +1,338 @@
+/*
+ * lib/ts_fsm.c	   A naive finite state machine text search approach
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ * Authors:	Thomas Graf <tgraf@suug.ch>
+ *
+ * ==========================================================================
+ *
+ *   A finite state machine consists of n states (struct ts_fsm_token)
+ *   representing the pattern as a finite automation. The data is read
+ *   sequentially on a octet basis. Every state token specifies the number
+ *   of recurrences and the type of value accepted which can be either a
+ *   specific character or ctype based set of characters. The available
+ *   type of recurrences include 1, (0|1), [0 n], and [1 n].
+ *
+ *   The algorithm differs between strict/non-strict mode specyfing
+ *   whether the pattern has to start at the first octect. Strict mode
+ *   is enabled by default and can be disabled by inserting
+ *   TS_FSM_HEAD_IGNORE as the first token in the chain.
+ *
+ *   The runtime performance of the algorithm should be around O(n),
+ *   however while in strict mode the average runtime can be better.
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/ctype.h>
+#include <linux/textsearch.h>
+#include <linux/textsearch_fsm.h>
+
+struct ts_fsm
+{
+	unsigned int		ntokens;
+	struct ts_fsm_token	tokens[0];
+};
+
+/* other values derived from ctype.h */
+#define _A		0x100 /* ascii */
+#define _W		0x200 /* wildcard */
+
+/* Map to _ctype flags and some magic numbers */
+static u16 token_map[TS_FSM_TYPE_MAX+1] = {
+	[TS_FSM_SPECIFIC] = 0,
+	[TS_FSM_WILDCARD] = _W,
+	[TS_FSM_CNTRL]	  = _C,
+	[TS_FSM_LOWER]	  = _L,
+	[TS_FSM_UPPER]	  = _U,
+	[TS_FSM_PUNCT]	  = _P,
+	[TS_FSM_SPACE]	  = _S,
+	[TS_FSM_DIGIT]	  = _D,
+	[TS_FSM_XDIGIT]	  = _D | _X,
+	[TS_FSM_ALPHA]	  = _U | _L,
+	[TS_FSM_ALNUM]	  = _U | _L | _D,
+	[TS_FSM_PRINT]	  = _P | _U | _L | _D | _SP,
+	[TS_FSM_GRAPH]	  = _P | _U | _L | _D,
+	[TS_FSM_ASCII]	  = _A,
+};
+
+static u16 token_lookup_tbl[256] = {
+_W|_A|_C,      _W|_A|_C,     _W|_A|_C,     _W|_A|_C,		/*   0-  3 */
+_W|_A|_C,      _W|_A|_C,     _W|_A|_C,     _W|_A|_C,		/*   4-  7 */
+_W|_A|_C,      _W|_A|_C|_S,  _W|_A|_C|_S,  _W|_A|_C|_S,		/*   8- 11 */
+_W|_A|_C|_S,   _W|_A|_C|_S,  _W|_A|_C,     _W|_A|_C,		/*  12- 15 */
+_W|_A|_C,      _W|_A|_C,     _W|_A|_C,     _W|_A|_C,		/*  16- 19 */
+_W|_A|_C,      _W|_A|_C,     _W|_A|_C,     _W|_A|_C,		/*  20- 23 */
+_W|_A|_C,      _W|_A|_C,     _W|_A|_C,     _W|_A|_C,		/*  24- 27 */
+_W|_A|_C,      _W|_A|_C,     _W|_A|_C,     _W|_A|_C,		/*  28- 31 */
+_W|_A|_S|_SP,  _W|_A|_P,     _W|_A|_P,     _W|_A|_P,		/*  32- 35 */
+_W|_A|_P,      _W|_A|_P,     _W|_A|_P,     _W|_A|_P,		/*  36- 39 */
+_W|_A|_P,      _W|_A|_P,     _W|_A|_P,     _W|_A|_P,		/*  40- 43 */
+_W|_A|_P,      _W|_A|_P,     _W|_A|_P,     _W|_A|_P,		/*  44- 47 */
+_W|_A|_D,      _W|_A|_D,     _W|_A|_D,     _W|_A|_D,		/*  48- 51 */
+_W|_A|_D,      _W|_A|_D,     _W|_A|_D,     _W|_A|_D,		/*  52- 55 */
+_W|_A|_D,      _W|_A|_D,     _W|_A|_P,     _W|_A|_P,		/*  56- 59 */
+_W|_A|_P,      _W|_A|_P,     _W|_A|_P,     _W|_A|_P,		/*  60- 63 */
+_W|_A|_P,      _W|_A|_U|_X,  _W|_A|_U|_X,  _W|_A|_U|_X,		/*  64- 67 */
+_W|_A|_U|_X,   _W|_A|_U|_X,  _W|_A|_U|_X,  _W|_A|_U,		/*  68- 71 */
+_W|_A|_U,      _W|_A|_U,     _W|_A|_U,     _W|_A|_U,		/*  72- 75 */
+_W|_A|_U,      _W|_A|_U,     _W|_A|_U,     _W|_A|_U,		/*  76- 79 */
+_W|_A|_U,      _W|_A|_U,     _W|_A|_U,     _W|_A|_U,		/*  80- 83 */
+_W|_A|_U,      _W|_A|_U,     _W|_A|_U,     _W|_A|_U,		/*  84- 87 */
+_W|_A|_U,      _W|_A|_U,     _W|_A|_U,     _W|_A|_P,		/*  88- 91 */
+_W|_A|_P,      _W|_A|_P,     _W|_A|_P,     _W|_A|_P,		/*  92- 95 */
+_W|_A|_P,      _W|_A|_L|_X,  _W|_A|_L|_X,  _W|_A|_L|_X,		/*  96- 99 */
+_W|_A|_L|_X,   _W|_A|_L|_X,  _W|_A|_L|_X,  _W|_A|_L,		/* 100-103 */
+_W|_A|_L,      _W|_A|_L,     _W|_A|_L,     _W|_A|_L,		/* 104-107 */
+_W|_A|_L,      _W|_A|_L,     _W|_A|_L,     _W|_A|_L,		/* 108-111 */
+_W|_A|_L,      _W|_A|_L,     _W|_A|_L,     _W|_A|_L,		/* 112-115 */
+_W|_A|_L,      _W|_A|_L,     _W|_A|_L,     _W|_A|_L,		/* 116-119 */
+_W|_A|_L,      _W|_A|_L,     _W|_A|_L,     _W|_A|_P,		/* 120-123 */
+_W|_A|_P,      _W|_A|_P,     _W|_A|_P,     _W|_A|_C,		/* 124-127 */
+_W,            _W,           _W,           _W,			/* 128-131 */
+_W,            _W,           _W,           _W,			/* 132-135 */
+_W,            _W,           _W,           _W,			/* 136-139 */
+_W,            _W,           _W,           _W,			/* 140-143 */
+_W,            _W,           _W,           _W,			/* 144-147 */
+_W,            _W,           _W,           _W,			/* 148-151 */
+_W,            _W,           _W,           _W,			/* 152-155 */
+_W,            _W,           _W,           _W,			/* 156-159 */
+_W|_S|_SP,     _W|_P,        _W|_P,        _W|_P,		/* 160-163 */
+_W|_P,         _W|_P,        _W|_P,        _W|_P,		/* 164-167 */
+_W|_P,         _W|_P,        _W|_P,        _W|_P,		/* 168-171 */
+_W|_P,         _W|_P,        _W|_P,        _W|_P,		/* 172-175 */
+_W|_P,         _W|_P,        _W|_P,        _W|_P,		/* 176-179 */
+_W|_P,         _W|_P,        _W|_P,        _W|_P,		/* 180-183 */
+_W|_P,         _W|_P,        _W|_P,        _W|_P,		/* 184-187 */
+_W|_P,         _W|_P,        _W|_P,        _W|_P,		/* 188-191 */
+_W|_U,         _W|_U,        _W|_U,        _W|_U,		/* 192-195 */
+_W|_U,         _W|_U,        _W|_U,        _W|_U,		/* 196-199 */
+_W|_U,         _W|_U,        _W|_U,        _W|_U,		/* 200-203 */
+_W|_U,         _W|_U,        _W|_U,        _W|_U,		/* 204-207 */
+_W|_U,         _W|_U,        _W|_U,        _W|_U,		/* 208-211 */
+_W|_U,         _W|_U,        _W|_U,        _W|_P,		/* 212-215 */
+_W|_U,         _W|_U,        _W|_U,        _W|_U,		/* 216-219 */
+_W|_U,         _W|_U,        _W|_U,        _W|_L,		/* 220-223 */
+_W|_L,         _W|_L,        _W|_L,        _W|_L,		/* 224-227 */
+_W|_L,         _W|_L,        _W|_L,        _W|_L,		/* 228-231 */
+_W|_L,         _W|_L,        _W|_L,        _W|_L,		/* 232-235 */
+_W|_L,         _W|_L,        _W|_L,        _W|_L,		/* 236-239 */
+_W|_L,         _W|_L,        _W|_L,        _W|_L,		/* 240-243 */
+_W|_L,         _W|_L,        _W|_L,        _W|_P,		/* 244-247 */
+_W|_L,         _W|_L,        _W|_L,        _W|_L,		/* 248-251 */
+_W|_L,         _W|_L,        _W|_L,        _W|_L};		/* 252-255 */
+
+static inline int match_token(struct ts_fsm_token *t, u8 d)
+{
+	if (t->type)
+		return (token_lookup_tbl[d] & t->type) != 0;
+	else
+		return t->value == d;
+}
+
+static unsigned int fsm_find(struct ts_config *conf, struct ts_state *state)
+{
+	struct ts_fsm *fsm = ts_config_priv(conf);
+	struct ts_fsm_token *cur = NULL, *next;
+	unsigned int match_start, block_idx = 0, tok_idx;
+	unsigned block_len = 0, strict, consumed = state->offset;
+	const u8 *data;
+
+#define GET_NEXT_BLOCK()		\
+({	consumed += block_idx;		\
+	block_idx = 0;			\
+	block_len = conf->get_next_block(consumed, &data, conf, state); })
+
+#define TOKEN_MISMATCH()		\
+	do {				\
+		if (strict)		\
+			goto no_match;	\
+		block_idx++;		\
+		goto startover;		\
+	} while(0)
+
+#define end_of_data() unlikely(block_idx >= block_len && !GET_NEXT_BLOCK())
+
+	if (end_of_data())
+		goto no_match;
+
+	strict = fsm->tokens[0].recur != TS_FSM_HEAD_IGNORE;
+
+startover:
+	match_start = consumed + block_idx;
+
+	for (tok_idx = 0; tok_idx < fsm->ntokens; tok_idx++) {
+		cur = &fsm->tokens[tok_idx];
+
+		if (likely(tok_idx < (fsm->ntokens - 1)))
+			next = &fsm->tokens[tok_idx + 1];
+		else
+			next = NULL;
+
+		switch (cur->recur) {
+		case TS_FSM_SINGLE:
+			if (end_of_data())
+				goto no_match;
+
+			if (!match_token(cur, data[block_idx]))
+				TOKEN_MISMATCH();
+			break;
+
+		case TS_FSM_PERHAPS:
+			if (end_of_data() ||
+			    !match_token(cur, data[block_idx]))
+				continue;
+			break;
+
+		case TS_FSM_MULTI:
+			if (end_of_data())
+				goto no_match;
+
+			if (!match_token(cur, data[block_idx]))
+				TOKEN_MISMATCH();
+
+			block_idx++;
+			/* fall through */
+
+		case TS_FSM_ANY:
+			if (next == NULL)
+				goto found_match;
+
+			if (end_of_data())
+				continue;
+
+			while (!match_token(next, data[block_idx])) {
+				if (!match_token(cur, data[block_idx]))
+					TOKEN_MISMATCH();
+				block_idx++;
+				if (end_of_data())
+					goto no_match;
+			}
+			continue;
+
+		/*
+		 * Optimization: Prefer small local loop over jumping
+		 * back and forth until garbage at head is munched.
+		 */
+		case TS_FSM_HEAD_IGNORE:
+			if (end_of_data())
+				continue;
+
+			while (!match_token(next, data[block_idx])) {
+				/*
+				 * Special case, don't start over upon
+				 * a mismatch, give the user the
+				 * chance to specify the type of data
+				 * allowed to be ignored.
+				 */
+				if (!match_token(cur, data[block_idx]))
+					goto no_match;
+
+				block_idx++;
+				if (end_of_data())
+					goto no_match;
+			}
+
+			match_start = consumed + block_idx;
+			continue;
+		}
+
+		block_idx++;
+	}
+
+	if (end_of_data())
+		goto found_match;
+
+no_match:
+	return UINT_MAX;
+
+found_match:
+	state->offset = consumed + block_idx;
+	return match_start;
+}
+
+static struct ts_config *fsm_init(const void *pattern, unsigned int len,
+				     int gfp_mask)
+{
+	int i, err = -EINVAL;
+	struct ts_config *conf;
+	struct ts_fsm *fsm;
+	struct ts_fsm_token *tokens = (struct ts_fsm_token *) pattern;
+	unsigned int ntokens = len / sizeof(*tokens);
+	size_t priv_size = sizeof(*fsm) + len;
+
+	if (len  % sizeof(struct ts_fsm_token) || ntokens < 1)
+		goto errout;
+
+	for (i = 0; i < ntokens; i++) {
+		struct ts_fsm_token *t = &tokens[i];
+
+		if (t->type > TS_FSM_TYPE_MAX || t->recur > TS_FSM_RECUR_MAX)
+			goto errout;
+
+		if (t->recur == TS_FSM_HEAD_IGNORE &&
+		    (i != 0 || i == (ntokens - 1)))
+			goto errout;
+	}
+
+	conf = alloc_ts_config(priv_size, gfp_mask);
+	if (IS_ERR(conf))
+		return conf;
+
+	fsm = ts_config_priv(conf);
+	fsm->ntokens = ntokens;
+	memcpy(fsm->tokens, pattern, len);
+
+	for (i = 0; i < fsm->ntokens; i++) {
+		struct ts_fsm_token *t = &fsm->tokens[i];
+		t->type = token_map[t->type];
+	}
+
+	return conf;
+
+errout:
+	return ERR_PTR(err);
+}
+
+static void *fsm_get_pattern(struct ts_config *conf)
+{
+	struct ts_fsm *fsm = ts_config_priv(conf);
+	return fsm->tokens;
+}
+
+static unsigned int fsm_get_pattern_len(struct ts_config *conf)
+{
+	struct ts_fsm *fsm = ts_config_priv(conf);
+	return fsm->ntokens * sizeof(struct ts_fsm_token);
+}
+
+static struct ts_ops fsm_ops = {
+	.name		  = "fsm",
+	.find		  = fsm_find,
+	.init		  = fsm_init,
+	.get_pattern	  = fsm_get_pattern,
+	.get_pattern_len  = fsm_get_pattern_len,
+	.owner		  = THIS_MODULE,
+	.list		  = LIST_HEAD_INIT(fsm_ops.list)
+};
+
+static int __init init_fsm(void)
+{
+	return textsearch_register(&fsm_ops);
+}
+
+static void __exit exit_fsm(void)
+{
+	textsearch_unregister(&fsm_ops);
+}
+
+MODULE_LICENSE("GPL");
+
+module_init(init_fsm);
+module_exit(exit_fsm);
-- 
cgit v1.2.3


From 65df877ab2e2328a4704af218efaed0a45176c86 Mon Sep 17 00:00:00 2001
From: David S. Miller
Date: Thu, 23 Jun 2005 23:49:52 -0700
Subject: [LIB]: textsearch.o needs to be obj-y not lib-y.

It exports symbols.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 lib/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'lib')

diff --git a/lib/Makefile b/lib/Makefile
index 7f6eda449102..beed1585294c 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -36,7 +36,7 @@ obj-$(CONFIG_ZLIB_INFLATE) += zlib_inflate/
 obj-$(CONFIG_ZLIB_DEFLATE) += zlib_deflate/
 obj-$(CONFIG_REED_SOLOMON) += reed_solomon/
 
-lib-$(CONFIG_TEXTSEARCH) += textsearch.o
+obj-$(CONFIG_TEXTSEARCH) += textsearch.o
 obj-$(CONFIG_TEXTSEARCH_KMP) += ts_kmp.o
 obj-$(CONFIG_TEXTSEARCH_FSM) += ts_fsm.o
 
-- 
cgit v1.2.3