dovecot: Moved uni_utf8_get_valid_data() to lib/
dovecot at dovecot.org
dovecot at dovecot.org
Sat Dec 8 15:45:21 EET 2007
details: http://hg.dovecot.org/dovecot/rev/1f70c72e4312
changeset: 6951:1f70c72e4312
user: Timo Sirainen <tss at iki.fi>
date: Sat Dec 08 15:45:17 2007 +0200
description:
Moved uni_utf8_get_valid_data() to lib/
diffstat:
3 files changed, 71 insertions(+), 63 deletions(-)
src/lib-mail/message-decoder.c | 68 ++--------------------------------------
src/lib/unichar.c | 59 ++++++++++++++++++++++++++++++++++
src/lib/unichar.h | 7 ++++
diffs (169 lines):
diff -r 63e225ab7361 -r 1f70c72e4312 src/lib-mail/message-decoder.c
--- a/src/lib-mail/message-decoder.c Sat Dec 08 15:42:25 2007 +0200
+++ b/src/lib-mail/message-decoder.c Sat Dec 08 15:45:17 2007 +0200
@@ -207,65 +207,6 @@ static void translation_buf_decode(struc
ctx->translation_size = 0;
}
-static inline unsigned int
-is_valid_utf8_seq(const unsigned char *input, unsigned int size)
-{
- size_t i, len;
-
- len = uni_utf8_char_bytes(input[0]);
- if (unlikely(len > size))
- return 0;
-
- for (i = 0; i < len; i++) {
- if (unlikely(uni_utf8_char_bytes(input[i]) != len-i))
- return 0;
- }
- return len;
-}
-
-static const unsigned char *
-get_valid_utf8(const unsigned char *input, size_t size, buffer_t *tmpbuf,
- size_t *output_size_r)
-{
- size_t i, len;
-
- /* find the first invalid utf8 sequence */
- for (i = 0; i < size;) {
- if (input[i] < 0x80)
- i++;
- else {
- len = is_valid_utf8_seq(input + i, size-i);
- if (unlikely(len == 0))
- goto broken;
- i += len;
- }
- }
- /* we can use it as-is */
- *output_size_r = size;
- return input;
-broken:
- /* broken utf-8 input - skip the broken characters */
- buffer_set_used_size(tmpbuf, 0);
- buffer_append(tmpbuf, input, i++);
-
- while (i < size) {
- if (input[i] < 0x80) {
- buffer_append_c(tmpbuf, input[i++]);
- continue;
- }
-
- len = is_valid_utf8_seq(input + i, size-i);
- if (len == 0) {
- i++;
- continue;
- }
- buffer_append(tmpbuf, input + i, len);
- i += len;
- }
- *output_size_r = tmpbuf->used;
- return tmpbuf->data;
-}
-
static void message_decode_body_init_charset(struct message_decoder_context *ctx)
{
enum charset_flags flags;
@@ -382,13 +323,14 @@ static bool message_decode_body(struct m
output->data = ctx->buf2->data;
output->size = ctx->buf2->used;
} else {
- output->data = get_valid_utf8(data, size, ctx->buf2,
- &output->size);
+ output->data =
+ uni_utf8_get_valid_data(data, size, ctx->buf2,
+ &output->size);
}
} else if (ctx->charset_trans == NULL) {
/* unknown charset */
- output->data = get_valid_utf8(data, size, ctx->buf2,
- &output->size);
+ output->data = uni_utf8_get_valid_data(data, size, ctx->buf2,
+ &output->size);
} else {
buffer_set_used_size(ctx->buf2, 0);
if (ctx->translation_size != 0)
diff -r 63e225ab7361 -r 1f70c72e4312 src/lib/unichar.c
--- a/src/lib/unichar.c Sat Dec 08 15:42:25 2007 +0200
+++ b/src/lib/unichar.c Sat Dec 08 15:45:17 2007 +0200
@@ -285,3 +285,62 @@ int uni_utf8_to_decomposed_titlecase(con
}
return 0;
}
+
+static inline unsigned int
+is_valid_utf8_seq(const unsigned char *input, unsigned int size)
+{
+ size_t i, len;
+
+ len = uni_utf8_char_bytes(input[0]);
+ if (unlikely(len > size))
+ return 0;
+
+ for (i = 0; i < len; i++) {
+ if (unlikely(uni_utf8_char_bytes(input[i]) != len-i))
+ return 0;
+ }
+ return len;
+}
+
+const unsigned char *
+uni_utf8_get_valid_data(const unsigned char *input, size_t size,
+ buffer_t *tmpbuf, size_t *output_size_r)
+{
+ size_t i, len;
+
+ /* find the first invalid utf8 sequence */
+ for (i = 0; i < size;) {
+ if (input[i] < 0x80)
+ i++;
+ else {
+ len = is_valid_utf8_seq(input + i, size-i);
+ if (unlikely(len == 0))
+ goto broken;
+ i += len;
+ }
+ }
+ /* we can use it as-is */
+ *output_size_r = size;
+ return input;
+broken:
+ /* broken utf-8 input - skip the broken characters */
+ buffer_set_used_size(tmpbuf, 0);
+ buffer_append(tmpbuf, input, i++);
+
+ while (i < size) {
+ if (input[i] < 0x80) {
+ buffer_append_c(tmpbuf, input[i++]);
+ continue;
+ }
+
+ len = is_valid_utf8_seq(input + i, size-i);
+ if (len == 0) {
+ i++;
+ continue;
+ }
+ buffer_append(tmpbuf, input + i, len);
+ i += len;
+ }
+ *output_size_r = tmpbuf->used;
+ return tmpbuf->data;
+}
diff -r 63e225ab7361 -r 1f70c72e4312 src/lib/unichar.h
--- a/src/lib/unichar.h Sat Dec 08 15:42:25 2007 +0200
+++ b/src/lib/unichar.h Sat Dec 08 15:45:17 2007 +0200
@@ -40,4 +40,11 @@ int uni_utf8_to_decomposed_titlecase(con
int uni_utf8_to_decomposed_titlecase(const void *input, size_t max_len,
buffer_t *output);
+/* If input contains only valid UTF-8 input, return it directly. If input
+ contains invalid UTF-8 input, write only valid UTF-8 characters to the
+ given buffer and return it. */
+const unsigned char *
+uni_utf8_get_valid_data(const unsigned char *input, size_t size,
+ buffer_t *tmpbuf, size_t *output_size_r);
+
#endif
More information about the dovecot-cvs mailing list