dovecot-2.2: lib-fts: Changed fts_tokenizer_next/final() to return error string.

dovecot at dovecot.org dovecot at dovecot.org
Sat May 9 16:28:07 UTC 2015


details:   http://hg.dovecot.org/dovecot-2.2/rev/7f151aca47ac
changeset: 18610:7f151aca47ac
user:      Timo Sirainen <tss at iki.fi>
date:      Sat May 09 19:26:01 2015 +0300
description:
lib-fts: Changed fts_tokenizer_next/final() to return error string.
The current tokenizers can't fail, but if we're doing tokenization via
external services they could fail.

diffstat:

 src/lib-fts/fts-tokenizer-address.c |   3 +-
 src/lib-fts/fts-tokenizer-generic.c |  11 ++++++---
 src/lib-fts/fts-tokenizer-private.h |   3 +-
 src/lib-fts/fts-tokenizer.c         |  22 +++++++++++---------
 src/lib-fts/fts-tokenizer.h         |   5 ++-
 src/lib-fts/test-fts-tokenizer.c    |  40 ++++++++++++++++++------------------
 src/plugins/fts/fts-build-mail.c    |   2 +-
 src/plugins/fts/fts-search-args.c   |  18 +++++++++++-----
 8 files changed, 59 insertions(+), 45 deletions(-)

diffs (truncated from 314 to 300 lines):

diff -r fa55a06ffae2 -r 7f151aca47ac src/lib-fts/fts-tokenizer-address.c
--- a/src/lib-fts/fts-tokenizer-address.c	Sat May 09 19:21:45 2015 +0300
+++ b/src/lib-fts/fts-tokenizer-address.c	Sat May 09 19:26:01 2015 +0300
@@ -203,7 +203,8 @@
 static int
 fts_tokenizer_email_address_next(struct fts_tokenizer *_tok,
                                  const unsigned char *data, size_t size,
-                                 size_t *skip_r, const char **token_r)
+				 size_t *skip_r, const char **token_r,
+				 const char **error_r ATTR_UNUSED)
 {
 	struct email_address_fts_tokenizer *tok =
 		(struct email_address_fts_tokenizer *)_tok;
diff -r fa55a06ffae2 -r 7f151aca47ac src/lib-fts/fts-tokenizer-generic.c
--- a/src/lib-fts/fts-tokenizer-generic.c	Sat May 09 19:21:45 2015 +0300
+++ b/src/lib-fts/fts-tokenizer-generic.c	Sat May 09 19:26:01 2015 +0300
@@ -168,7 +168,8 @@
 static int
 fts_tokenizer_generic_next_simple(struct fts_tokenizer *_tok,
                                   const unsigned char *data, size_t size,
-                                  size_t *skip_r, const char **token_r)
+				  size_t *skip_r, const char **token_r,
+				  const char **error_r ATTR_UNUSED)
 {
 	struct generic_fts_tokenizer *tok =
 		(struct generic_fts_tokenizer *)_tok;
@@ -565,8 +566,9 @@
 
 static int
 fts_tokenizer_generic_next_tr29(struct fts_tokenizer *_tok,
-			   const unsigned char *data, size_t size,
-                                size_t *skip_r, const char **token_r)
+				const unsigned char *data, size_t size,
+				size_t *skip_r, const char **token_r,
+				const char **error_r ATTR_UNUSED)
 {
 	struct generic_fts_tokenizer *tok =
 		(struct generic_fts_tokenizer *)_tok;
@@ -614,7 +616,8 @@
 			   const unsigned char *data ATTR_UNUSED,
                            size_t size ATTR_UNUSED,
                            size_t *skip_r ATTR_UNUSED,
-                           const char **token_r ATTR_UNUSED)
+			   const char **token_r ATTR_UNUSED,
+			   const char **error_r ATTR_UNUSED)
 {
 	i_unreached();
 }
diff -r fa55a06ffae2 -r 7f151aca47ac src/lib-fts/fts-tokenizer-private.h
--- a/src/lib-fts/fts-tokenizer-private.h	Sat May 09 19:21:45 2015 +0300
+++ b/src/lib-fts/fts-tokenizer-private.h	Sat May 09 19:26:01 2015 +0300
@@ -12,7 +12,8 @@
 
 	void (*reset)(struct fts_tokenizer *tok);
 	int (*next)(struct fts_tokenizer *tok, const unsigned char *data,
-	            size_t size, size_t *skip_r, const char **token_r);
+		    size_t size, size_t *skip_r, const char **token_r,
+		    const char **error_r);
 };
 
 enum fts_tokenizer_parent_state {
diff -r fa55a06ffae2 -r 7f151aca47ac src/lib-fts/fts-tokenizer.c
--- a/src/lib-fts/fts-tokenizer.c	Sat May 09 19:21:45 2015 +0300
+++ b/src/lib-fts/fts-tokenizer.c	Sat May 09 19:26:01 2015 +0300
@@ -123,7 +123,7 @@
 static int
 fts_tokenizer_next_self(struct fts_tokenizer *tok,
                         const unsigned char *data, size_t size,
-                        const char **token_r)
+                        const char **token_r, const char **error_r)
 {
 	int ret = 0;
 	size_t skip = 0;
@@ -133,12 +133,13 @@
 
 	if (tok->prev_reply_finished) {
 		/* whole new data */
-		ret = tok->v->next(tok, data, size, &skip, token_r);
+		ret = tok->v->next(tok, data, size, &skip, token_r, error_r);
 	} else {
 		/* continuing previous data */
 		i_assert(tok->prev_skip <= size);
 		ret = tok->v->next(tok, data + tok->prev_skip,
-		                   size - tok->prev_skip, &skip, token_r);
+				   size - tok->prev_skip, &skip,
+				   token_r, error_r);
 	}
 
 	if (ret > 0) {
@@ -164,13 +165,13 @@
 
 int fts_tokenizer_next(struct fts_tokenizer *tok,
 		       const unsigned char *data, size_t size,
-		       const char **token_r)
+		       const char **token_r, const char **error_r)
 {
 	int ret;
 
 	switch (tok->parent_state) {
 	case FTS_TOKENIZER_PARENT_STATE_ADD_DATA:
-		ret = fts_tokenizer_next_self(tok, data, size, token_r);
+		ret = fts_tokenizer_next_self(tok, data, size, token_r, error_r);
 		if (ret <= 0 || tok->parent == NULL || tok->skip_parents)
 			return ret;
 		buffer_set_used_size(tok->parent_input, 0);
@@ -179,25 +180,26 @@
 		/* fall through */
 	case FTS_TOKENIZER_PARENT_STATE_NEXT_OUTPUT:
 		ret = fts_tokenizer_next(tok->parent, tok->parent_input->data,
-		                         tok->parent_input->used, token_r);
+		                         tok->parent_input->used, token_r, error_r);
 		if (ret != 0)
 			return ret;
 		tok->parent_state++;
 		/* fall through */
 	case FTS_TOKENIZER_PARENT_STATE_FINALIZE:
-		ret = fts_tokenizer_next(tok->parent, NULL, 0, token_r);
+		ret = fts_tokenizer_next(tok->parent, NULL, 0, token_r, error_r);
 		if (ret != 0)
 			return ret;
 		/* we're finished sending this token to parent tokenizer.
 		   see if our own tokenizer has more tokens available */
 		tok->parent_state = FTS_TOKENIZER_PARENT_STATE_ADD_DATA;
-		return fts_tokenizer_next(tok, data, size, token_r);
+		return fts_tokenizer_next(tok, data, size, token_r, error_r);
 	default:
 		i_unreached();
 	}
 }
 
-int fts_tokenizer_final(struct fts_tokenizer *tok, const char **token_r)
+int fts_tokenizer_final(struct fts_tokenizer *tok, const char **token_r,
+			const char **error_r)
 {
-	return fts_tokenizer_next(tok, NULL, 0, token_r);
+	return fts_tokenizer_next(tok, NULL, 0, token_r, error_r);
 }
diff -r fa55a06ffae2 -r 7f151aca47ac src/lib-fts/fts-tokenizer.h
--- a/src/lib-fts/fts-tokenizer.h	Sat May 09 19:21:45 2015 +0300
+++ b/src/lib-fts/fts-tokenizer.h	Sat May 09 19:26:01 2015 +0300
@@ -77,9 +77,10 @@
 
 int fts_tokenizer_next(struct fts_tokenizer *tok,
 		       const unsigned char *data, size_t size,
-		       const char **token_r);
+		       const char **token_r, const char **error_r);
 /* Returns same as fts_tokenizer_next(). */
-int fts_tokenizer_final(struct fts_tokenizer *tok, const char **token_r);
+int fts_tokenizer_final(struct fts_tokenizer *tok, const char **token_r,
+			const char **error_r);
 
 const char *fts_tokenizer_name(const struct fts_tokenizer *tok);
 
diff -r fa55a06ffae2 -r 7f151aca47ac src/lib-fts/test-fts-tokenizer.c
--- a/src/lib-fts/test-fts-tokenizer.c	Sat May 09 19:21:45 2015 +0300
+++ b/src/lib-fts/test-fts-tokenizer.c	Sat May 09 19:26:01 2015 +0300
@@ -38,16 +38,16 @@
 			   const char *const *expected_output)
 {
 	const unsigned char *input = (const unsigned char *)_input;
-	const char *token;
+	const char *token, *error;
 	unsigned int i, max, outi, char_len, input_len = strlen(_input);
 
 	/* test all input at once */
 	outi = 0;
-	while (fts_tokenizer_next(tok, input, input_len, &token) > 0) {
+	while (fts_tokenizer_next(tok, input, input_len, &token, &error) > 0) {
 		test_assert_idx(strcmp(token, expected_output[outi]) == 0, outi);
 		outi++;
 	}
-	while (fts_tokenizer_next(tok, NULL, 0, &token) > 0) {
+	while (fts_tokenizer_next(tok, NULL, 0, &token, &error) > 0) {
 		test_assert_idx(strcmp(token, expected_output[outi]) == 0, outi);
 		outi++;
 	}
@@ -56,12 +56,12 @@
 	/* test input one byte at a time */
 	for (i = outi = 0; i < input_len; i += char_len) {
 		char_len = uni_utf8_char_bytes(input[i]);
-		while (fts_tokenizer_next(tok, input+i, char_len, &token) > 0) {
+		while (fts_tokenizer_next(tok, input+i, char_len, &token, &error) > 0) {
 			test_assert_idx(strcmp(token, expected_output[outi]) == 0, outi);
 			outi++;
 		}
 	}
-	while (fts_tokenizer_final(tok, &token) > 0) {
+	while (fts_tokenizer_final(tok, &token, &error) > 0) {
 		test_assert_idx(strcmp(token, expected_output[outi]) == 0, outi);
 		outi++;
 	}
@@ -72,12 +72,12 @@
 		max = rand() % (input_len - i) + 1;
 		for (char_len = 0; char_len < max; )
 			char_len += uni_utf8_char_bytes(input[i+char_len]);
-		while (fts_tokenizer_next(tok, input+i, char_len, &token) > 0) {
+		while (fts_tokenizer_next(tok, input+i, char_len, &token, &error) > 0) {
 			test_assert_idx(strcmp(token, expected_output[outi]) == 0, outi);
 			outi++;
 		}
 	}
-	while (fts_tokenizer_final(tok, &token) > 0) {
+	while (fts_tokenizer_final(tok, &token, &error) > 0) {
 		test_assert_idx(strcmp(token, expected_output[outi]) == 0, outi);
 		outi++;
 	}
@@ -257,28 +257,28 @@
 	test_tokenizer_inputoutput(tok, input, expected_output);
 
 	/* make sure state is forgotten at EOF */
-	test_assert(fts_tokenizer_next(tok, (const void *)"foo", 3, &token) == 0);
-	test_assert(fts_tokenizer_final(tok, &token) > 0 &&
+	test_assert(fts_tokenizer_next(tok, (const void *)"foo", 3, &token, &error) == 0);
+	test_assert(fts_tokenizer_final(tok, &token, &error) > 0 &&
 		    strcmp(token, "foo") == 0);
-	test_assert(fts_tokenizer_final(tok, &token) == 0);
+	test_assert(fts_tokenizer_final(tok, &token, &error) == 0);
 
-	test_assert(fts_tokenizer_next(tok, (const void *)"bar@baz", 7, &token) == 0);
-	test_assert(fts_tokenizer_final(tok, &token) > 0 &&
+	test_assert(fts_tokenizer_next(tok, (const void *)"bar@baz", 7, &token, &error) == 0);
+	test_assert(fts_tokenizer_final(tok, &token, &error) > 0 &&
 		    strcmp(token, "bar@baz") == 0);
-	test_assert(fts_tokenizer_final(tok, &token) == 0);
+	test_assert(fts_tokenizer_final(tok, &token, &error) == 0);
 
-	test_assert(fts_tokenizer_next(tok, (const void *)"foo@", 4, &token) == 0);
-	test_assert(fts_tokenizer_final(tok, &token) > 0 &&
+	test_assert(fts_tokenizer_next(tok, (const void *)"foo@", 4, &token, &error) == 0);
+	test_assert(fts_tokenizer_final(tok, &token, &error) > 0 &&
 		    strcmp(token, "foo") == 0);
-	test_assert(fts_tokenizer_final(tok, &token) == 0);
+	test_assert(fts_tokenizer_final(tok, &token, &error) == 0);
 
 	/* test reset explicitly */
-	test_assert(fts_tokenizer_next(tok, (const void *)"a", 1, &token) == 0);
+	test_assert(fts_tokenizer_next(tok, (const void *)"a", 1, &token, &error) == 0);
 	fts_tokenizer_reset(tok);
-	test_assert(fts_tokenizer_next(tok, (const void *)"b@c", 3, &token) == 0);
-	test_assert(fts_tokenizer_final(tok, &token) > 0 &&
+	test_assert(fts_tokenizer_next(tok, (const void *)"b@c", 3, &token, &error) == 0);
+	test_assert(fts_tokenizer_final(tok, &token, &error) > 0 &&
 		    strcmp(token, "b@c") == 0);
-	test_assert(fts_tokenizer_final(tok, &token) == 0);
+	test_assert(fts_tokenizer_final(tok, &token, &error) == 0);
 
 	fts_tokenizer_unref(&tok);
 	fts_tokenizer_unref(&gen_tok);
diff -r fa55a06ffae2 -r 7f151aca47ac src/plugins/fts/fts-build-mail.c
--- a/src/plugins/fts/fts-build-mail.c	Sat May 09 19:21:45 2015 +0300
+++ b/src/plugins/fts/fts-build-mail.c	Sat May 09 19:26:01 2015 +0300
@@ -261,7 +261,7 @@
 	int ret;
 
 	tokenizer = fts_user_get_index_tokenizer(ctx->update_ctx->backend->ns->user);
-	while ((ret = fts_tokenizer_next(tokenizer, data, size, &token)) > 0) {
+	while ((ret = fts_tokenizer_next(tokenizer, data, size, &token, &error)) > 0) {
 		if (filter != NULL) {
 			ret = fts_filter_filter(filter, &token, &error);
 			if (ret == 0)
diff -r fa55a06ffae2 -r 7f151aca47ac src/plugins/fts/fts-search-args.c
--- a/src/plugins/fts/fts-search-args.c	Sat May 09 19:21:45 2015 +0300
+++ b/src/plugins/fts/fts-search-args.c	Sat May 09 19:26:01 2015 +0300
@@ -81,7 +81,7 @@
 			token2 = t_strdup(token2);
 			array_append(&tokens, &token2, 1);
 		} else if (ret < 0) {
-			i_error("fts: Couldn't create search tokens: %s", error);
+			i_error("fts: Couldn't filter search tokens: %s", error);
 			return -1;
 		}
 	}
@@ -99,9 +99,10 @@
 {
 	const ARRAY_TYPE(fts_user_language) *languages;
 	struct mail_search_arg *and_arg, *orig_arg = *argp;
-	const char *token, *orig_token = orig_arg->value.str;
+	const char *error, *token, *orig_token = orig_arg->value.str;
 	unsigned int orig_token_len = strlen(orig_token);
 	struct fts_tokenizer *tokenizer;
+	int ret;
 
 	languages = fts_user_get_all_languages(backend->ns->user);
 	tokenizer = fts_user_get_search_tokenizer(backend->ns->user);
@@ -117,20 +118,25 @@
 	/* reset tokenizer between search args in case there's any state left
 	   from some previous failure */
 	fts_tokenizer_reset(tokenizer);
-	while (fts_tokenizer_next(tokenizer,
-	                          (const void *)orig_token,
-	                          orig_token_len, &token) > 0) {
+	while ((ret = fts_tokenizer_next(tokenizer,
+					 (const void *)orig_token,
+					 orig_token_len, &token, &error)) > 0) {
 		if (fts_backend_dovecot_expand_lang_tokens(languages, pool, and_arg,
 							   orig_arg, orig_token,
 							   token) < 0)
 			return -1;
 	}
-	while (fts_tokenizer_final(tokenizer, &token) > 0) {


More information about the dovecot-cvs mailing list