dovecot-2.2: Reversed recent "short utf8" changes.

dovecot at dovecot.org dovecot at dovecot.org
Sat Jan 5 01:14:34 EET 2013


details:   http://hg.dovecot.org/dovecot-2.2/rev/f584aae3b566
changeset: 15523:f584aae3b566
user:      Timo Sirainen <tss at iki.fi>
date:      Tue Nov 27 07:50:06 2012 +0200
description:
Reversed recent "short utf8" changes.
Solr code needs to parse the UTF-8 input explicitly anyway in order to encode
the XML characters, and it already performs all of the character checks itself.

diffstat:

 src/lib/unichar.c                           |  64 -----------------------------
 src/lib/unichar.h                           |   7 ---
 src/plugins/fts-solr/fts-backend-solr-old.c |   2 +-
 src/plugins/fts-solr/fts-backend-solr.c     |   3 +-
 src/plugins/fts/fts-api-private.h           |   4 +-
 src/plugins/fts/fts-build-mail.c            |   7 +--
 src/plugins/fts/fts-parser.c                |  24 ++--------
 src/plugins/fts/fts-parser.h                |   3 +-
 8 files changed, 12 insertions(+), 102 deletions(-)

diffs (227 lines):

diff -r fdc509644d05 -r f584aae3b566 src/lib/unichar.c
--- a/src/lib/unichar.c	Tue Nov 27 06:21:18 2012 +0200
+++ b/src/lib/unichar.c	Tue Nov 27 07:50:06 2012 +0200
@@ -420,67 +420,3 @@
 
 	return uni_utf8_find_invalid_pos(data, size, &i) == 0;
 }
-
-static int
-uni_utf8_short_find_invalid_pos(const unsigned char *input, size_t size,
-				size_t *pos_r)
-{
-	size_t i, len;
-
-	/* find the first invalid utf8 sequence */
-	for (i = 0; i < size;) {
-		if (input[i] < 0x80)
-			i++;
-		else {
-			len = is_valid_utf8_seq(input + i, size-i);
-			if (unlikely(len == 0 || len > 4)) {
-				*pos_r = i;
-				return -1;
-			}
-			i += len;
-		}
-	}
-	return 0;
-}
-
-bool uni_utf8_short_get_valid_data(const unsigned char *input, size_t size,
-				   buffer_t *buf)
-{
-	size_t i, len;
-
-	if (uni_utf8_short_find_invalid_pos(input, size, &i) == 0)
-		return TRUE;
-
-	/* broken utf-8 input - skip the broken characters */
-	while (i < size) {
-		if (input[i] < 0x80) {
-			buffer_append_c(buf, input[i++]);
-			continue;
-		}
-
-		len = is_valid_utf8_seq(input + i, size-i);
-		if (len == 0 || len > 4) {
-			i += I_MAX(len, 1);
-			output_add_replacement_char(buf);
-			continue;
-		}
-		buffer_append(buf, input + i, len);
-		i += len;
-	}
-	return FALSE;
-}
-
-bool uni_utf8_short_str_is_valid(const char *str)
-{
-	size_t i;
-
-	return uni_utf8_short_find_invalid_pos((const unsigned char *)str,
-					       strlen(str), &i) == 0;
-}
-
-bool uni_utf8_short_data_is_valid(const unsigned char *data, size_t size)
-{
-	size_t i;
-
-	return uni_utf8_find_invalid_pos(data, size, &i) == 0;
-}
diff -r fdc509644d05 -r f584aae3b566 src/lib/unichar.h
--- a/src/lib/unichar.h	Tue Nov 27 06:21:18 2012 +0200
+++ b/src/lib/unichar.h	Tue Nov 27 07:50:06 2012 +0200
@@ -88,11 +88,4 @@
 /* Returns TRUE if data contains only valid UTF-8 input. */
 bool uni_utf8_data_is_valid(const unsigned char *data, size_t size);
 
-/* Same as the non-short variants, but assume 5-byte and 6-byte UTF8
-   sequences are illegal. */
-bool uni_utf8_short_get_valid_data(const unsigned char *input, size_t size,
-				   buffer_t *buf);
-bool uni_utf8_short_str_is_valid(const char *str);
-bool uni_utf8_short_data_is_valid(const unsigned char *data, size_t size);
-
 #endif
diff -r fdc509644d05 -r f584aae3b566 src/plugins/fts-solr/fts-backend-solr-old.c
--- a/src/plugins/fts-solr/fts-backend-solr-old.c	Tue Nov 27 06:21:18 2012 +0200
+++ b/src/plugins/fts-solr/fts-backend-solr-old.c	Tue Nov 27 07:50:06 2012 +0200
@@ -810,7 +810,7 @@
 
 struct fts_backend fts_backend_solr_old = {
 	.name = "solr_old",
-	.flags = FTS_BACKEND_FLAG_BUILD_SHORT_UTF8,
+	.flags = 0,
 
 	{
 		fts_backend_solr_alloc,
diff -r fdc509644d05 -r f584aae3b566 src/plugins/fts-solr/fts-backend-solr.c
--- a/src/plugins/fts-solr/fts-backend-solr.c	Tue Nov 27 06:21:18 2012 +0200
+++ b/src/plugins/fts-solr/fts-backend-solr.c	Tue Nov 27 07:50:06 2012 +0200
@@ -879,8 +879,7 @@
 
 struct fts_backend fts_backend_solr = {
 	.name = "solr",
-	.flags = FTS_BACKEND_FLAG_FUZZY_SEARCH |
-		FTS_BACKEND_FLAG_BUILD_SHORT_UTF8,
+	.flags = FTS_BACKEND_FLAG_FUZZY_SEARCH,
 
 	{
 		fts_backend_solr_alloc,
diff -r fdc509644d05 -r f584aae3b566 src/plugins/fts/fts-api-private.h
--- a/src/plugins/fts/fts-api-private.h	Tue Nov 27 06:21:18 2012 +0200
+++ b/src/plugins/fts/fts-api-private.h	Tue Nov 27 07:50:06 2012 +0200
@@ -59,9 +59,7 @@
 	/* Send only fully indexable words rather than randomly sized blocks */
 	FTS_BACKEND_FLAG_BUILD_FULL_WORDS	= 0x04,
 	/* Fuzzy search works */
-	FTS_BACKEND_FLAG_FUZZY_SEARCH		= 0x08,
-	/* Don't allow 5-byte or 6-byte UTF8 sequences */
-	FTS_BACKEND_FLAG_BUILD_SHORT_UTF8	= 0x10
+	FTS_BACKEND_FLAG_FUZZY_SEARCH		= 0x08
 };
 
 struct fts_backend {
diff -r fdc509644d05 -r f584aae3b566 src/plugins/fts/fts-build-mail.c
--- a/src/plugins/fts/fts-build-mail.c	Tue Nov 27 06:21:18 2012 +0200
+++ b/src/plugins/fts/fts-build-mail.c	Tue Nov 27 07:50:06 2012 +0200
@@ -144,7 +144,6 @@
 	struct mail_storage *storage;
 	const char *content_type;
 	struct fts_backend_build_key key;
-	bool require_short_utf8;
 
 	i_assert(ctx->body_parser == NULL);
 
@@ -159,11 +158,9 @@
 		return FALSE;
 	}
 
-	require_short_utf8 = (ctx->update_ctx->backend->flags &
-			      FTS_BACKEND_FLAG_BUILD_SHORT_UTF8) != 0;
-
+	
 	storage = mailbox_get_storage(ctx->mail->box);
-	if (fts_parser_init(mail_storage_get_user(storage), require_short_utf8,
+	if (fts_parser_init(mail_storage_get_user(storage),
 			    content_type, ctx->content_disposition,
 			    &ctx->body_parser)) {
 		/* extract text using the returned parser */
diff -r fdc509644d05 -r f584aae3b566 src/plugins/fts/fts-parser.c
--- a/src/plugins/fts/fts-parser.c	Tue Nov 27 06:21:18 2012 +0200
+++ b/src/plugins/fts/fts-parser.c	Tue Nov 27 07:50:06 2012 +0200
@@ -11,7 +11,7 @@
 	&fts_parser_script
 };
 
-bool fts_parser_init(struct mail_user *user, bool require_short_utf8,
+bool fts_parser_init(struct mail_user *user,
 		     const char *content_type, const char *content_disposition,
 		     struct fts_parser **parser_r)
 {
@@ -20,10 +20,8 @@
 	for (i = 0; i < N_ELEMENTS(parsers); i++) {
 		*parser_r = parsers[i]->try_init(user, content_type,
 						 content_disposition);
-		if (*parser_r != NULL) {
-			(*parser_r)->require_short_utf8 = require_short_utf8;
+		if (*parser_r != NULL)
 			return TRUE;
-		}
 	}
 	return FALSE;
 }
@@ -58,15 +56,11 @@
 
 void fts_parser_more(struct fts_parser *parser, struct message_block *block)
 {
-	bool valid_utf8;
-
 	if (parser->v.more != NULL)
 		parser->v.more(parser, block);
 
-	valid_utf8 = parser->require_short_utf8 ?
-		uni_utf8_short_data_is_valid(block->data, block->size) :
-		uni_utf8_data_is_valid(block->data, block->size);
-	if (!valid_utf8 || data_has_nuls(block->data, block->size)) {
+	if (!uni_utf8_data_is_valid(block->data, block->size) ||
+	    data_has_nuls(block->data, block->size)) {
 		/* output isn't valid UTF-8. make it. */
 		if (parser->utf8_output == NULL) {
 			parser->utf8_output =
@@ -74,14 +68,8 @@
 		} else {
 			buffer_set_used_size(parser->utf8_output, 0);
 		}
-		if (parser->require_short_utf8) {
-			(void)uni_utf8_short_get_valid_data(block->data,
-							    block->size,
-							    parser->utf8_output);
-		} else {
-			(void)uni_utf8_get_valid_data(block->data, block->size,
-						      parser->utf8_output);
-		}
+		(void)uni_utf8_get_valid_data(block->data, block->size,
+					      parser->utf8_output);
 		replace_nul_bytes(parser->utf8_output);
 		block->data = parser->utf8_output->data;
 		block->size = parser->utf8_output->used;
diff -r fdc509644d05 -r f584aae3b566 src/plugins/fts/fts-parser.h
--- a/src/plugins/fts/fts-parser.h	Tue Nov 27 06:21:18 2012 +0200
+++ b/src/plugins/fts/fts-parser.h	Tue Nov 27 07:50:06 2012 +0200
@@ -15,13 +15,12 @@
 struct fts_parser {
 	struct fts_parser_vfuncs v;
 	buffer_t *utf8_output;
-	bool require_short_utf8;
 };
 
 extern struct fts_parser_vfuncs fts_parser_html;
 extern struct fts_parser_vfuncs fts_parser_script;
 
-bool fts_parser_init(struct mail_user *user, bool require_short_utf8,
+bool fts_parser_init(struct mail_user *user,
 		     const char *content_type, const char *content_disposition,
 		     struct fts_parser **parser_r);
 struct fts_parser *fts_parser_text_init(void);


More information about the dovecot-cvs mailing list