dovecot: Remove illegal UTF-8 sequences from output.

dovecot at dovecot.org dovecot at dovecot.org
Mon Dec 3 15:41:05 EET 2007


details:   http://hg.dovecot.org/dovecot/rev/671c2eb25f3d
changeset: 6915:671c2eb25f3d
user:      Timo Sirainen <tss at iki.fi>
date:      Mon Dec 03 15:41:01 2007 +0200
description:
Remove illegal UTF-8 sequences from output.

diffstat:

1 file changed, 64 insertions(+), 4 deletions(-)
src/lib-mail/message-decoder.c |   68 +++++++++++++++++++++++++++++++++++++---

diffs (87 lines):

diff -r 9c3f0e180751 -r 671c2eb25f3d src/lib-mail/message-decoder.c
--- a/src/lib-mail/message-decoder.c	Mon Dec 03 15:06:27 2007 +0200
+++ b/src/lib-mail/message-decoder.c	Mon Dec 03 15:41:01 2007 +0200
@@ -209,6 +209,65 @@ static void translation_buf_decode(struc
 	ctx->translation_size = 0;
 }
 
+static inline unsigned int
+is_valid_utf8_seq(const unsigned char *input, unsigned int size)
+{
+	size_t i, len;
+	/* Returns the byte length of the sequence starting at input[0], or 0 if it is invalid. */
+	len = uni_utf8_char_bytes(input[0]);
+	if (unlikely(len > size || (input[0] & 0xc0) == 0x80))
+		return 0; /* cut off by the end of data, or starts with a continuation byte */
+	/* trailing bytes are all 10xxxxxx regardless of position, so test the bit pattern directly */
+	for (i = 1; i < len; i++) {
+		if (unlikely((input[i] & 0xc0) != 0x80))
+			return 0;
+	}
+	return len;
+}
+
+static const unsigned char * /* returns input itself, or tmpbuf's data when bytes had to be dropped */
+get_valid_utf8(const unsigned char *input, size_t size, buffer_t *tmpbuf,
+	       size_t *output_size_r) /* output_size_r receives the length of the returned data */
+{
+	size_t i, len;
+
+	/* fast path: look for the first invalid utf8 sequence without copying anything */
+	for (i = 0; i < size;) {
+		if (input[i] < 0x80)
+			i++; /* plain ASCII byte */
+		else {
+			len = is_valid_utf8_seq(input + i, size-i);
+			if (unlikely(len == 0))
+				goto broken; /* i is now the offset of the first bad byte */
+			i += len;
+		}
+	}
+	/* everything validated - hand back the caller's buffer as-is */
+	*output_size_r = size;
+	return input;
+broken:
+	/* broken utf-8 input - rebuild the data in tmpbuf, skipping the broken bytes */
+	buffer_set_used_size(tmpbuf, 0);
+	buffer_append(tmpbuf, input, i++); /* copy the validated prefix, then step past the bad byte */
+
+	while (i < size) {
+		if (input[i] < 0x80) {
+			buffer_append_c(tmpbuf, input[i++]);
+			continue;
+		}
+
+		len = is_valid_utf8_seq(input + i, size-i);
+		if (len == 0) {
+			i++; /* drop a single byte and re-check from the next one */
+			continue;
+		}
+		buffer_append(tmpbuf, input + i, len);
+		i += len;
+	}
+	*output_size_r = tmpbuf->used;
+	return tmpbuf->data;
+}
+
 static bool message_decode_body(struct message_decoder_context *ctx,
 				struct message_block *input,
 				struct message_block *output)
@@ -309,12 +368,13 @@ static bool message_decode_body(struct m
 			output->data = ctx->buf2->data;
 			output->size = ctx->buf2->used;
 		} else {
-			output->data = data;
-			output->size = size;
+			output->data = get_valid_utf8(data, size, ctx->buf2,
+						      &output->size);
 		}
 	} else if (ctx->charset_trans == NULL) {
-		output->data = data;
-		output->size = size;
+		/* unknown charset */
+		output->data = get_valid_utf8(data, size, ctx->buf2,
+					      &output->size);
 	} else {
 		buffer_set_used_size(ctx->buf2, 0);
 		if (ctx->translation_size != 0)


More information about the dovecot-cvs mailing list