dovecot-2.2: Replaced "decomposed titlecase" conversions with more generic normalizer function.

Sat Sep 15 03:12:36 EEST 2012

details:   http://hg.dovecot.org/dovecot-2.2/rev/c976a9c01613
changeset: 15053:c976a9c01613
user:      Timo Sirainen <tss at iki.fi>
date:      Sat Sep 15 03:12:20 2012 +0300
description:
Replaced "decomposed titlecase" conversions with more generic normalizer function.
Plugins can now change mail_user.default_normalizer. Specific searches can
also use different normalizers by changing mail_search_context.normalizer.

diffstat:

 src/doveadm/doveadm-mail-fetch.c          |   2 +-
 src/lib-charset/charset-iconv.c           |  23 +++++++--------
 src/lib-charset/charset-utf8.c            |  46 ++++++++++++++++--------------
 src/lib-charset/charset-utf8.h            |  16 ++++------
 src/lib-imap/imap-base-subject.c          |   2 +-
 src/lib-mail/message-decoder.c            |  26 +++++++----------
 src/lib-mail/message-decoder.h            |   8 ++--
 src/lib-mail/message-header-decode.c      |  15 ++++------
 src/lib-mail/message-header-decode.h      |   7 ++--
 src/lib-mail/message-search.c             |  15 ++++-----
 src/lib-mail/message-search.h             |   8 ++---
 src/lib-mail/test-message-decoder.c       |   7 ++--
 src/lib-mail/test-message-header-decode.c |   2 +-
 src/lib-storage/index/index-search.c      |  25 ++++++++++------
 src/lib-storage/mail-storage-private.h    |   2 +
 src/lib-storage/mail-user.c               |   1 +
 src/lib-storage/mail-user.h               |   2 +
 src/lib/unichar.h                         |   6 ++++
 src/plugins/fts-squat/fts-backend-squat.c |   6 ++--
 src/plugins/fts/fts-api-private.h         |   6 ++-
 src/plugins/fts/fts-api.c                 |   7 ++++-
 src/plugins/fts/fts-build-mail.c          |   5 +--
 22 files changed, 125 insertions(+), 112 deletions(-)

diffs (truncated from 764 to 300 lines):

diff -r d5ebec837bfd -r c976a9c01613 src/doveadm/doveadm-mail-fetch.c

--- a/src/doveadm/doveadm-mail-fetch.c	Sat Sep 15 03:09:57 2012 +0300
+++ b/src/doveadm/doveadm-mail-fetch.c	Sat Sep 15 03:12:20 2012 +0300
@@ -265,7 +265,7 @@
 	parser = message_parser_init(pool_datastack_create(), input,
 				     MESSAGE_HEADER_PARSER_FLAG_CLEAN_ONELINE,
 				     0);
-	decoder = message_decoder_init(0);
+	decoder = message_decoder_init(NULL, 0);
 
 	while ((ret = message_parser_parse_next_block(parser, &raw_block)) > 0) {
 		if (!message_decoder_decode_next_block(decoder, &raw_block,
diff -r d5ebec837bfd -r c976a9c01613 src/lib-charset/charset-iconv.c
--- a/src/lib-charset/charset-iconv.c	Sat Sep 15 03:09:57 2012 +0300
+++ b/src/lib-charset/charset-iconv.c	Sat Sep 15 03:12:20 2012 +0300
@@ -12,10 +12,10 @@
 
 struct charset_translation {
 	iconv_t cd;
-	enum charset_flags flags;
+	normalizer_func_t *normalizer;
 };
 
-int charset_to_utf8_begin(const char *charset, enum charset_flags flags,
+int charset_to_utf8_begin(const char *charset, normalizer_func_t *normalizer,
 			  struct charset_translation **t_r)
 {
 	struct charset_translation *t;
@@ -31,7 +31,7 @@
 
 	t = i_new(struct charset_translation, 1);
 	t->cd = cd;
-	t->flags = flags;
+	t->normalizer = normalizer;
 	*t_r = t;
 	return 0;
 }
@@ -54,12 +54,12 @@
 }
 
 static int
-charset_append_utf8(const void *src, size_t src_size,
-		    buffer_t *dest, bool dtcase)
+charset_append_utf8(struct charset_translation *t,
+		    const void *src, size_t src_size, buffer_t *dest)
 {
-	if (dtcase)
-		return uni_utf8_to_decomposed_titlecase(src, src_size, dest);
-	if (!uni_utf8_get_valid_data(src, src_size, dest))
+	if (t->normalizer != NULL)
+		return t->normalizer(src, src_size, dest);
+	else if (!uni_utf8_get_valid_data(src, src_size, dest))
 		return -1;
 	else {
 		buffer_append(dest, src, src_size);
@@ -75,12 +75,11 @@
 	ICONV_CONST char *ic_srcbuf;
 	char tmpbuf[8192], *ic_destbuf;
 	size_t srcleft, destleft;
-	bool dtcase = (t->flags & CHARSET_FLAG_DECOMP_TITLECASE) != 0;
 	bool ret = TRUE;
 
 	if (t->cd == (iconv_t)-1) {
 		/* input is already supposed to be UTF-8 */
-		if (charset_append_utf8(src, *src_size, dest, dtcase) < 0)
+		if (charset_append_utf8(t, src, *src_size, dest) < 0)
 			*result = CHARSET_RET_INVALID_INPUT;
 		else
 			*result = CHARSET_RET_OK;
@@ -110,8 +109,8 @@
 	/* we just converted data to UTF-8. it shouldn't be invalid, but
 	   Solaris iconv appears to pass invalid data through sometimes
 	   (e.g. 8 bit characters with UTF-7) */
-	if (charset_append_utf8(tmpbuf, sizeof(tmpbuf) - destleft,
-				dest, dtcase) < 0)
+	if (charset_append_utf8(t, tmpbuf, sizeof(tmpbuf) - destleft,
+				dest) < 0)
 		*result = CHARSET_RET_INVALID_INPUT;
 	return ret;
 }
diff -r d5ebec837bfd -r c976a9c01613 src/lib-charset/charset-utf8.c
--- a/src/lib-charset/charset-utf8.c	Sat Sep 15 03:09:57 2012 +0300
+++ b/src/lib-charset/charset-utf8.c	Sat Sep 15 03:12:20 2012 +0300
@@ -16,14 +16,14 @@
 		strcasecmp(charset, "UTF8") == 0;
 }
 
-int charset_to_utf8_str(const char *charset, enum charset_flags flags,
+int charset_to_utf8_str(const char *charset, normalizer_func_t *normalizer,
 			const char *input, string_t *output,
 			enum charset_result *result_r)
 {
 	struct charset_translation *t;
 	size_t len = strlen(input);
 
-	if (charset_to_utf8_begin(charset, flags, &t) < 0)
+	if (charset_to_utf8_begin(charset, normalizer, &t) < 0)
 		return -1;
 
 	*result_r = charset_to_utf8(t, (const unsigned char *)input,
@@ -35,31 +35,31 @@
 #ifndef HAVE_ICONV
 
 struct charset_translation {
-	enum charset_flags flags;
+	normalizer_func_t *normalizer;
 };
 
-static struct charset_translation raw_translation = { 0 };
-static struct charset_translation tc_translation = {
-	CHARSET_FLAG_DECOMP_TITLECASE
-};
-
-int charset_to_utf8_begin(const char *charset, enum charset_flags flags,
+int charset_to_utf8_begin(const char *charset, normalizer_func_t *normalizer,
 			  struct charset_translation **t_r)
 {
-	if (charset_is_utf8(charset)) {
-		if ((flags & CHARSET_FLAG_DECOMP_TITLECASE) != 0)
-			*t_r = &tc_translation;
-		else
-			*t_r = &raw_translation;
-		return 0;
+	struct charset_translation *t;
+
+	if (!charset_is_utf8(charset)) {
+		/* no support for charsets that need translation */
+		return -1;
 	}
 
-	/* no support for charsets that need translation */
-	return -1;
+	t = i_new(struct charset_translation, 1);
+	t->normalizer = normalizer;
+	*t_r = t;
+	return 0;
 }
 
-void charset_to_utf8_end(struct charset_translation **t ATTR_UNUSED)
+void charset_to_utf8_end(struct charset_translation **_t)
 {
+	struct charset_translation *t = *_t;
+
+	*_t = NULL;
+	i_free(t);
 }
 
 void charset_to_utf8_reset(struct charset_translation *t ATTR_UNUSED)
@@ -70,11 +70,13 @@
 charset_to_utf8(struct charset_translation *t,
 		const unsigned char *src, size_t *src_size, buffer_t *dest)
 {
-	if ((t->flags & CHARSET_FLAG_DECOMP_TITLECASE) == 0)
+	if (t->normalizer != NULL) {
+		if (t->normalizer(src, *src_size, dest) < 0)
+			return CHARSET_RET_INVALID_INPUT;
+	} else if (!uni_utf8_get_valid_data(src, *src_size, dest)) {
+		return CHARSET_RET_INVALID_INPUT;
+	} else {
 		buffer_append(dest, src, *src_size);
-	else {
-		if (uni_utf8_to_decomposed_titlecase(src, *src_size, dest) < 0)
-			return CHARSET_RET_INVALID_INPUT;
 	}
 	return CHARSET_RET_OK;
 }
diff -r d5ebec837bfd -r c976a9c01613 src/lib-charset/charset-utf8.h
--- a/src/lib-charset/charset-utf8.h	Sat Sep 15 03:09:57 2012 +0300
+++ b/src/lib-charset/charset-utf8.h	Sat Sep 15 03:12:20 2012 +0300
@@ -1,13 +1,10 @@
 #ifndef CHARSET_UTF8_H
 #define CHARSET_UTF8_H
 
+#include "unichar.h"
+
 struct charset_translation;
 
-enum charset_flags {
-	/* Translate the output to decomposed titlecase */
-	CHARSET_FLAG_DECOMP_TITLECASE	= 0x01
-};
-
 enum charset_result {
 	CHARSET_RET_OK = 1,
 	CHARSET_RET_INCOMPLETE_INPUT = -1,
@@ -15,8 +12,9 @@
 };
 
 /* Begin translation to UTF-8. Returns -1 if charset is unknown. */
-int charset_to_utf8_begin(const char *charset, enum charset_flags flags,
-			  struct charset_translation **t_r);
+int charset_to_utf8_begin(const char *charset, normalizer_func_t *normalizer,
+			  struct charset_translation **t_r)
+	ATTR_NULL(2);
 void charset_to_utf8_end(struct charset_translation **t);
 void charset_to_utf8_reset(struct charset_translation *t);
 
@@ -30,8 +28,8 @@
 		const unsigned char *src, size_t *src_size, buffer_t *dest);
 
 /* Translate a single string to UTF8. */
-int charset_to_utf8_str(const char *charset, enum charset_flags flags,
+int charset_to_utf8_str(const char *charset, normalizer_func_t *normalizer,
 			const char *input, string_t *output,
-			enum charset_result *result_r);
+			enum charset_result *result_r) ATTR_NULL(2);
 
 #endif
diff -r d5ebec837bfd -r c976a9c01613 src/lib-imap/imap-base-subject.c
--- a/src/lib-imap/imap-base-subject.c	Sat Sep 15 03:09:57 2012 +0300
+++ b/src/lib-imap/imap-base-subject.c	Sat Sep 15 03:12:20 2012 +0300
@@ -210,7 +210,7 @@
 	   UTF-8.  Convert all tabs and continuations to space.
 	   Convert all multiple spaces to a single space. */
 	message_header_decode_utf8((const unsigned char *)subject, subject_len,
-				   buf, TRUE);
+				   buf, uni_utf8_to_decomposed_titlecase);
 	buffer_append_c(buf, '\0');
 
 	pack_whitespace(buf);
diff -r d5ebec837bfd -r c976a9c01613 src/lib-mail/message-decoder.c
--- a/src/lib-mail/message-decoder.c	Sat Sep 15 03:09:57 2012 +0300
+++ b/src/lib-mail/message-decoder.c	Sat Sep 15 03:12:20 2012 +0300
@@ -22,6 +22,7 @@
 
 struct message_decoder_context {
 	enum message_decoder_flags flags;
+	normalizer_func_t *normalizer;
 	struct message_part *prev_part;
 
 	struct message_header_line hdr;
@@ -46,12 +47,14 @@
 				 struct message_part *part);
 
 struct message_decoder_context *
-message_decoder_init(enum message_decoder_flags flags)
+message_decoder_init(normalizer_func_t *normalizer,
+		     enum message_decoder_flags flags)
 {
 	struct message_decoder_context *ctx;
 
 	ctx = i_new(struct message_decoder_context, 1);
 	ctx->flags = flags;
+	ctx->normalizer = normalizer;
 	ctx->buf = buffer_create_dynamic(default_pool, 8192);
 	ctx->buf2 = buffer_create_dynamic(default_pool, 8192);
 	ctx->encoding_buf = buffer_create_dynamic(default_pool, 128);
@@ -149,7 +152,6 @@
 				  struct message_header_line *hdr,
 				  struct message_block *output)
 {
-	bool dtcase = (ctx->flags & MESSAGE_DECODER_FLAG_DTCASE) != 0;
 	size_t value_len;
 
 	if (hdr->continues) {
@@ -168,12 +170,11 @@
 
 	buffer_set_used_size(ctx->buf, 0);
 	message_header_decode_utf8(hdr->full_value, hdr->full_value_len,
-				   ctx->buf, dtcase);
+				   ctx->buf, ctx->normalizer);
 	value_len = ctx->buf->used;
 
-	if (dtcase) {
-		(void)uni_utf8_to_decomposed_titlecase(hdr->name, hdr->name_len,
-						       ctx->buf);
+	if (ctx->normalizer != NULL) {
+		(void)ctx->normalizer(hdr->name, hdr->name_len, ctx->buf);
 		buffer_append_c(ctx->buf, '\0');
 	} else {
 		if (!uni_utf8_get_valid_data((const unsigned char *)hdr->name,
@@ -229,8 +230,6 @@
 message_decode_body_init_charset(struct message_decoder_context *ctx,
 				 struct message_part *part)
 {
-	enum charset_flags flags;
-
 	ctx->binary_input = ctx->content_charset == NULL &&
 		(ctx->flags & MESSAGE_DECODER_FLAG_RETURN_BINARY) != 0 &&
 		(part->flags & (MESSAGE_PART_FLAG_TEXT |
@@ -249,12 +248,10 @@
 		charset_to_utf8_end(&ctx->charset_trans);
 	i_free_and_null(ctx->charset_trans_charset);
 
-	flags = (ctx->flags & MESSAGE_DECODER_FLAG_DTCASE) != 0 ?
-		CHARSET_FLAG_DECOMP_TITLECASE : 0;
 	ctx->charset_trans_charset = i_strdup(ctx->content_charset != NULL ?
 					      ctx->content_charset : "UTF-8");
-	if (charset_to_utf8_begin(ctx->charset_trans_charset,
-				  flags, &ctx->charset_trans) < 0)
+	if (charset_to_utf8_begin(ctx->charset_trans_charset, ctx->normalizer,
+				  &ctx->charset_trans) < 0)
 		ctx->charset_trans = NULL;
 }
 
@@ -331,9 +328,8 @@
 		output->size = size;
 	} else if (ctx->charset_utf8) {
 		buffer_set_used_size(ctx->buf2, 0);
-		if ((ctx->flags & MESSAGE_DECODER_FLAG_DTCASE) != 0) {