dovecot-2.2: Replaced "decomposed titlecase" conversions with more generic normalizer function.
dovecot at dovecot.org
dovecot at dovecot.org
Sat Sep 15 03:12:36 EEST 2012
details: http://hg.dovecot.org/dovecot-2.2/rev/c976a9c01613
changeset: 15053:c976a9c01613
user: Timo Sirainen <tss at iki.fi>
date: Sat Sep 15 03:12:20 2012 +0300
description:
Replaced "decomposed titlecase" conversions with a more generic normalizer function.
Plugins can now change mail_user.default_normalizer. Specific searches can
also use different normalizers by changing mail_search_context.normalizer.
diffstat:
src/doveadm/doveadm-mail-fetch.c | 2 +-
src/lib-charset/charset-iconv.c | 23 +++++++--------
src/lib-charset/charset-utf8.c | 46 ++++++++++++++++--------------
src/lib-charset/charset-utf8.h | 16 ++++------
src/lib-imap/imap-base-subject.c | 2 +-
src/lib-mail/message-decoder.c | 26 +++++++----------
src/lib-mail/message-decoder.h | 8 ++--
src/lib-mail/message-header-decode.c | 15 ++++------
src/lib-mail/message-header-decode.h | 7 ++--
src/lib-mail/message-search.c | 15 ++++-----
src/lib-mail/message-search.h | 8 ++---
src/lib-mail/test-message-decoder.c | 7 ++--
src/lib-mail/test-message-header-decode.c | 2 +-
src/lib-storage/index/index-search.c | 25 ++++++++++------
src/lib-storage/mail-storage-private.h | 2 +
src/lib-storage/mail-user.c | 1 +
src/lib-storage/mail-user.h | 2 +
src/lib/unichar.h | 6 ++++
src/plugins/fts-squat/fts-backend-squat.c | 6 ++--
src/plugins/fts/fts-api-private.h | 6 ++-
src/plugins/fts/fts-api.c | 7 ++++-
src/plugins/fts/fts-build-mail.c | 5 +--
22 files changed, 125 insertions(+), 112 deletions(-)
diffs (truncated from 764 to 300 lines):
diff -r d5ebec837bfd -r c976a9c01613 src/doveadm/doveadm-mail-fetch.c
--- a/src/doveadm/doveadm-mail-fetch.c Sat Sep 15 03:09:57 2012 +0300
+++ b/src/doveadm/doveadm-mail-fetch.c Sat Sep 15 03:12:20 2012 +0300
@@ -265,7 +265,7 @@
parser = message_parser_init(pool_datastack_create(), input,
MESSAGE_HEADER_PARSER_FLAG_CLEAN_ONELINE,
0);
- decoder = message_decoder_init(0);
+ decoder = message_decoder_init(NULL, 0);
while ((ret = message_parser_parse_next_block(parser, &raw_block)) > 0) {
if (!message_decoder_decode_next_block(decoder, &raw_block,
diff -r d5ebec837bfd -r c976a9c01613 src/lib-charset/charset-iconv.c
--- a/src/lib-charset/charset-iconv.c Sat Sep 15 03:09:57 2012 +0300
+++ b/src/lib-charset/charset-iconv.c Sat Sep 15 03:12:20 2012 +0300
@@ -12,10 +12,10 @@
struct charset_translation {
iconv_t cd;
- enum charset_flags flags;
+ normalizer_func_t *normalizer;
};
-int charset_to_utf8_begin(const char *charset, enum charset_flags flags,
+int charset_to_utf8_begin(const char *charset, normalizer_func_t *normalizer,
struct charset_translation **t_r)
{
struct charset_translation *t;
@@ -31,7 +31,7 @@
t = i_new(struct charset_translation, 1);
t->cd = cd;
- t->flags = flags;
+ t->normalizer = normalizer;
*t_r = t;
return 0;
}
@@ -54,12 +54,12 @@
}
static int
-charset_append_utf8(const void *src, size_t src_size,
- buffer_t *dest, bool dtcase)
+charset_append_utf8(struct charset_translation *t,
+ const void *src, size_t src_size, buffer_t *dest)
{
- if (dtcase)
- return uni_utf8_to_decomposed_titlecase(src, src_size, dest);
- if (!uni_utf8_get_valid_data(src, src_size, dest))
+ if (t->normalizer != NULL)
+ return t->normalizer(src, src_size, dest);
+ else if (!uni_utf8_get_valid_data(src, src_size, dest))
return -1;
else {
buffer_append(dest, src, src_size);
@@ -75,12 +75,11 @@
ICONV_CONST char *ic_srcbuf;
char tmpbuf[8192], *ic_destbuf;
size_t srcleft, destleft;
- bool dtcase = (t->flags & CHARSET_FLAG_DECOMP_TITLECASE) != 0;
bool ret = TRUE;
if (t->cd == (iconv_t)-1) {
/* input is already supposed to be UTF-8 */
- if (charset_append_utf8(src, *src_size, dest, dtcase) < 0)
+ if (charset_append_utf8(t, src, *src_size, dest) < 0)
*result = CHARSET_RET_INVALID_INPUT;
else
*result = CHARSET_RET_OK;
@@ -110,8 +109,8 @@
/* we just converted data to UTF-8. it shouldn't be invalid, but
Solaris iconv appears to pass invalid data through sometimes
(e.g. 8 bit characters with UTF-7) */
- if (charset_append_utf8(tmpbuf, sizeof(tmpbuf) - destleft,
- dest, dtcase) < 0)
+ if (charset_append_utf8(t, tmpbuf, sizeof(tmpbuf) - destleft,
+ dest) < 0)
*result = CHARSET_RET_INVALID_INPUT;
return ret;
}
diff -r d5ebec837bfd -r c976a9c01613 src/lib-charset/charset-utf8.c
--- a/src/lib-charset/charset-utf8.c Sat Sep 15 03:09:57 2012 +0300
+++ b/src/lib-charset/charset-utf8.c Sat Sep 15 03:12:20 2012 +0300
@@ -16,14 +16,14 @@
strcasecmp(charset, "UTF8") == 0;
}
-int charset_to_utf8_str(const char *charset, enum charset_flags flags,
+int charset_to_utf8_str(const char *charset, normalizer_func_t *normalizer,
const char *input, string_t *output,
enum charset_result *result_r)
{
struct charset_translation *t;
size_t len = strlen(input);
- if (charset_to_utf8_begin(charset, flags, &t) < 0)
+ if (charset_to_utf8_begin(charset, normalizer, &t) < 0)
return -1;
*result_r = charset_to_utf8(t, (const unsigned char *)input,
@@ -35,31 +35,31 @@
#ifndef HAVE_ICONV
struct charset_translation {
- enum charset_flags flags;
+ normalizer_func_t *normalizer;
};
-static struct charset_translation raw_translation = { 0 };
-static struct charset_translation tc_translation = {
- CHARSET_FLAG_DECOMP_TITLECASE
-};
-
-int charset_to_utf8_begin(const char *charset, enum charset_flags flags,
+int charset_to_utf8_begin(const char *charset, normalizer_func_t *normalizer,
struct charset_translation **t_r)
{
- if (charset_is_utf8(charset)) {
- if ((flags & CHARSET_FLAG_DECOMP_TITLECASE) != 0)
- *t_r = &tc_translation;
- else
- *t_r = &raw_translation;
- return 0;
+ struct charset_translation *t;
+
+ if (!charset_is_utf8(charset)) {
+ /* no support for charsets that need translation */
+ return -1;
}
- /* no support for charsets that need translation */
- return -1;
+ t = i_new(struct charset_translation, 1);
+ t->normalizer = normalizer;
+ *t_r = t;
+ return 0;
}
-void charset_to_utf8_end(struct charset_translation **t ATTR_UNUSED)
+void charset_to_utf8_end(struct charset_translation **_t)
{
+ struct charset_translation *t = *_t;
+
+ *_t = NULL;
+ i_free(t);
}
void charset_to_utf8_reset(struct charset_translation *t ATTR_UNUSED)
@@ -70,11 +70,13 @@
charset_to_utf8(struct charset_translation *t,
const unsigned char *src, size_t *src_size, buffer_t *dest)
{
- if ((t->flags & CHARSET_FLAG_DECOMP_TITLECASE) == 0)
+ if (t->normalizer != NULL) {
+ if (t->normalizer(src, *src_size, dest) < 0)
+ return CHARSET_RET_INVALID_INPUT;
+ } else if (!uni_utf8_get_valid_data(src, *src_size, dest)) {
+ return CHARSET_RET_INVALID_INPUT;
+ } else {
buffer_append(dest, src, *src_size);
- else {
- if (uni_utf8_to_decomposed_titlecase(src, *src_size, dest) < 0)
- return CHARSET_RET_INVALID_INPUT;
}
return CHARSET_RET_OK;
}
diff -r d5ebec837bfd -r c976a9c01613 src/lib-charset/charset-utf8.h
--- a/src/lib-charset/charset-utf8.h Sat Sep 15 03:09:57 2012 +0300
+++ b/src/lib-charset/charset-utf8.h Sat Sep 15 03:12:20 2012 +0300
@@ -1,13 +1,10 @@
#ifndef CHARSET_UTF8_H
#define CHARSET_UTF8_H
+#include "unichar.h"
+
struct charset_translation;
-enum charset_flags {
- /* Translate the output to decomposed titlecase */
- CHARSET_FLAG_DECOMP_TITLECASE = 0x01
-};
-
enum charset_result {
CHARSET_RET_OK = 1,
CHARSET_RET_INCOMPLETE_INPUT = -1,
@@ -15,8 +12,9 @@
};
/* Begin translation to UTF-8. Returns -1 if charset is unknown. */
-int charset_to_utf8_begin(const char *charset, enum charset_flags flags,
- struct charset_translation **t_r);
+int charset_to_utf8_begin(const char *charset, normalizer_func_t *normalizer,
+ struct charset_translation **t_r)
+ ATTR_NULL(2);
void charset_to_utf8_end(struct charset_translation **t);
void charset_to_utf8_reset(struct charset_translation *t);
@@ -30,8 +28,8 @@
const unsigned char *src, size_t *src_size, buffer_t *dest);
/* Translate a single string to UTF8. */
-int charset_to_utf8_str(const char *charset, enum charset_flags flags,
+int charset_to_utf8_str(const char *charset, normalizer_func_t *normalizer,
const char *input, string_t *output,
- enum charset_result *result_r);
+ enum charset_result *result_r) ATTR_NULL(2);
#endif
diff -r d5ebec837bfd -r c976a9c01613 src/lib-imap/imap-base-subject.c
--- a/src/lib-imap/imap-base-subject.c Sat Sep 15 03:09:57 2012 +0300
+++ b/src/lib-imap/imap-base-subject.c Sat Sep 15 03:12:20 2012 +0300
@@ -210,7 +210,7 @@
UTF-8. Convert all tabs and continuations to space.
Convert all multiple spaces to a single space. */
message_header_decode_utf8((const unsigned char *)subject, subject_len,
- buf, TRUE);
+ buf, uni_utf8_to_decomposed_titlecase);
buffer_append_c(buf, '\0');
pack_whitespace(buf);
diff -r d5ebec837bfd -r c976a9c01613 src/lib-mail/message-decoder.c
--- a/src/lib-mail/message-decoder.c Sat Sep 15 03:09:57 2012 +0300
+++ b/src/lib-mail/message-decoder.c Sat Sep 15 03:12:20 2012 +0300
@@ -22,6 +22,7 @@
struct message_decoder_context {
enum message_decoder_flags flags;
+ normalizer_func_t *normalizer;
struct message_part *prev_part;
struct message_header_line hdr;
@@ -46,12 +47,14 @@
struct message_part *part);
struct message_decoder_context *
-message_decoder_init(enum message_decoder_flags flags)
+message_decoder_init(normalizer_func_t *normalizer,
+ enum message_decoder_flags flags)
{
struct message_decoder_context *ctx;
ctx = i_new(struct message_decoder_context, 1);
ctx->flags = flags;
+ ctx->normalizer = normalizer;
ctx->buf = buffer_create_dynamic(default_pool, 8192);
ctx->buf2 = buffer_create_dynamic(default_pool, 8192);
ctx->encoding_buf = buffer_create_dynamic(default_pool, 128);
@@ -149,7 +152,6 @@
struct message_header_line *hdr,
struct message_block *output)
{
- bool dtcase = (ctx->flags & MESSAGE_DECODER_FLAG_DTCASE) != 0;
size_t value_len;
if (hdr->continues) {
@@ -168,12 +170,11 @@
buffer_set_used_size(ctx->buf, 0);
message_header_decode_utf8(hdr->full_value, hdr->full_value_len,
- ctx->buf, dtcase);
+ ctx->buf, ctx->normalizer);
value_len = ctx->buf->used;
- if (dtcase) {
- (void)uni_utf8_to_decomposed_titlecase(hdr->name, hdr->name_len,
- ctx->buf);
+ if (ctx->normalizer != NULL) {
+ (void)ctx->normalizer(hdr->name, hdr->name_len, ctx->buf);
buffer_append_c(ctx->buf, '\0');
} else {
if (!uni_utf8_get_valid_data((const unsigned char *)hdr->name,
@@ -229,8 +230,6 @@
message_decode_body_init_charset(struct message_decoder_context *ctx,
struct message_part *part)
{
- enum charset_flags flags;
-
ctx->binary_input = ctx->content_charset == NULL &&
(ctx->flags & MESSAGE_DECODER_FLAG_RETURN_BINARY) != 0 &&
(part->flags & (MESSAGE_PART_FLAG_TEXT |
@@ -249,12 +248,10 @@
charset_to_utf8_end(&ctx->charset_trans);
i_free_and_null(ctx->charset_trans_charset);
- flags = (ctx->flags & MESSAGE_DECODER_FLAG_DTCASE) != 0 ?
- CHARSET_FLAG_DECOMP_TITLECASE : 0;
ctx->charset_trans_charset = i_strdup(ctx->content_charset != NULL ?
ctx->content_charset : "UTF-8");
- if (charset_to_utf8_begin(ctx->charset_trans_charset,
- flags, &ctx->charset_trans) < 0)
+ if (charset_to_utf8_begin(ctx->charset_trans_charset, ctx->normalizer,
+ &ctx->charset_trans) < 0)
ctx->charset_trans = NULL;
}
@@ -331,9 +328,8 @@
output->size = size;
} else if (ctx->charset_utf8) {
buffer_set_used_size(ctx->buf2, 0);
- if ((ctx->flags & MESSAGE_DECODER_FLAG_DTCASE) != 0) {
More information about the dovecot-cvs mailing list