dovecot: Remove illegal UTF-8 sequences from output.
dovecot at dovecot.org
dovecot at dovecot.org
Mon Dec 3 15:41:05 EET 2007
details: http://hg.dovecot.org/dovecot/rev/671c2eb25f3d
changeset: 6915:671c2eb25f3d
user: Timo Sirainen <tss at iki.fi>
date: Mon Dec 03 15:41:01 2007 +0200
description:
Remove illegal UTF-8 sequences from output.
diffstat:
1 file changed, 64 insertions(+), 4 deletions(-)
src/lib-mail/message-decoder.c | 68 +++++++++++++++++++++++++++++++++++++---
diffs (87 lines):
diff -r 9c3f0e180751 -r 671c2eb25f3d src/lib-mail/message-decoder.c
--- a/src/lib-mail/message-decoder.c Mon Dec 03 15:06:27 2007 +0200
+++ b/src/lib-mail/message-decoder.c Mon Dec 03 15:41:01 2007 +0200
@@ -209,6 +209,65 @@ static void translation_buf_decode(struc
ctx->translation_size = 0;
}
+/* Check whether input[0..] begins with a structurally well-formed UTF-8
+   sequence that fits within size bytes.
+
+   Returns the length of the sequence in bytes, or 0 if the sequence is
+   truncated or malformed. Only the lead byte / continuation byte structure
+   is checked; overlong encodings and surrogate code points are not
+   detected here.
+
+   NOTE(review): assumes uni_utf8_char_bytes() returns the total sequence
+   length implied by a lead byte - confirm against its definition. */
+static inline unsigned int
+is_valid_utf8_seq(const unsigned char *input, unsigned int size)
+{
+	unsigned int i, len;
+
+	/* nothing to look at - don't read input[0] */
+	if (unlikely(size == 0))
+		return 0;
+
+	len = uni_utf8_char_bytes(input[0]);
+	if (unlikely(len > size))
+		return 0;
+	/* a non-ASCII byte can never be a complete character by itself:
+	   it's either a stray continuation byte or an invalid lead byte */
+	if (unlikely(len == 1 && input[0] >= 0x80))
+		return 0;
+
+	/* Every byte after the lead byte must be a continuation byte
+	   (10xxxxxx, i.e. 0x80..0xbf). Comparing uni_utf8_char_bytes() of
+	   the trailing bytes against the remaining length can't work: the
+	   same continuation byte would have to map to a different value
+	   depending on its position, so all valid 3+ byte sequences were
+	   being rejected. */
+	for (i = 1; i < len; i++) {
+		if (unlikely((input[i] & 0xc0) != 0x80))
+			return 0;
+	}
+	return len;
+}
+
+/* Return the input with every invalid UTF-8 sequence dropped.
+
+   When the whole input passes validation, the input pointer itself is
+   returned, *output_size_r is set to size and tmpbuf is not touched.
+   Otherwise tmpbuf is truncated to zero length, filled with the bytes that
+   passed validation, and tmpbuf's data pointer and used size are returned. */
+static const unsigned char *
+get_valid_utf8(const unsigned char *input, size_t size, buffer_t *tmpbuf,
+	       size_t *output_size_r)
+{
+	size_t pos = 0, seq_len;
+
+	/* fast path: walk over the leading part that is already valid */
+	while (pos < size) {
+		if (input[pos] < 0x80) {
+			pos++;
+			continue;
+		}
+		seq_len = is_valid_utf8_seq(input + pos, size - pos);
+		if (unlikely(seq_len == 0))
+			break;
+		pos += seq_len;
+	}
+	if (pos >= size) {
+		/* reached the end without problems - use the input as-is */
+		*output_size_r = size;
+		return input;
+	}
+
+	/* input[pos] begins a broken sequence. keep everything before it,
+	   skip that one byte and rebuild the rest in tmpbuf. */
+	buffer_set_used_size(tmpbuf, 0);
+	buffer_append(tmpbuf, input, pos);
+	pos++;
+
+	while (pos < size) {
+		if (input[pos] < 0x80) {
+			/* plain ASCII is always kept */
+			buffer_append_c(tmpbuf, input[pos]);
+			pos++;
+			continue;
+		}
+
+		seq_len = is_valid_utf8_seq(input + pos, size - pos);
+		if (seq_len != 0) {
+			buffer_append(tmpbuf, input + pos, seq_len);
+			pos += seq_len;
+		} else {
+			/* drop a single byte and try again from the next one */
+			pos++;
+		}
+	}
+	*output_size_r = tmpbuf->used;
+	return tmpbuf->data;
+}
+
static bool message_decode_body(struct message_decoder_context *ctx,
struct message_block *input,
struct message_block *output)
@@ -309,12 +368,13 @@ static bool message_decode_body(struct m
output->data = ctx->buf2->data;
output->size = ctx->buf2->used;
} else {
- output->data = data;
- output->size = size;
+ output->data = get_valid_utf8(data, size, ctx->buf2,
+ &output->size);
}
} else if (ctx->charset_trans == NULL) {
- output->data = data;
- output->size = size;
+ /* unknown charset */
+ output->data = get_valid_utf8(data, size, ctx->buf2,
+ &output->size);
} else {
buffer_set_used_size(ctx->buf2, 0);
if (ctx->translation_size != 0)
More information about the dovecot-cvs
mailing list