dovecot-2.2: lib-mail: Added message_snippet_generate() to produce a short text snippet of a mail.

dovecot at dovecot.org dovecot at dovecot.org
Fri Jan 16 22:33:26 UTC 2015


details:   http://hg.dovecot.org/dovecot-2.2/rev/5211234206ea
changeset: 18157:5211234206ea
user:      Timo Sirainen <tss at iki.fi>
date:      Sat Jan 17 00:23:36 2015 +0200
description:
lib-mail: Added message_snippet_generate() to produce a short text snippet of a mail.

diffstat:

 src/lib-mail/Makefile.am            |    7 +
 src/lib-mail/message-snippet.c      |  136 ++++++++++++++++++++++++++++++++++++
 src/lib-mail/message-snippet.h      |   14 +++
 src/lib-mail/test-message-snippet.c |   80 +++++++++++++++++++++
 4 files changed, 237 insertions(+), 0 deletions(-)

diffs (280 lines):

diff -r d59753d9f5e9 -r 5211234206ea src/lib-mail/Makefile.am
--- a/src/lib-mail/Makefile.am	Sat Jan 17 00:15:44 2015 +0200
+++ b/src/lib-mail/Makefile.am	Sat Jan 17 00:23:36 2015 +0200
@@ -29,6 +29,7 @@
 	message-part-serialize.c \
 	message-search.c \
 	message-size.c \
+	message-snippet.c \
 	ostream-dot.c \
 	quoted-printable.c \
 	rfc2231-parser.c \
@@ -62,6 +63,7 @@
 	message-part-serialize.h \
 	message-search.h \
 	message-size.h \
+	message-snippet.h \
 	ostream-dot.h \
 	quoted-printable.h \
 	rfc2231-parser.h \
@@ -87,6 +89,7 @@
 	test-message-id \
 	test-message-parser \
 	test-message-part \
+	test-message-snippet \
 	test-ostream-dot \
 	test-quoted-printable \
 	test-rfc2231-parser
@@ -166,6 +169,10 @@
 test_message_part_LDADD = message-part.lo message-parser.lo message-header-parser.lo message-size.lo rfc822-parser.lo rfc2231-parser.lo $(test_libs)
 test_message_part_DEPENDENCIES = $(test_deps)
 
+test_message_snippet_SOURCES = test-message-snippet.c
+test_message_snippet_LDADD = message-snippet.lo mail-html2text.lo $(test_message_decoder_LDADD) message-parser.lo message-header-parser.lo message-header-decode.lo message-size.lo
+test_message_snippet_DEPENDENCIES = $(test_deps)
+
 test_mail_html2text_SOURCES = test-mail-html2text.c
 test_mail_html2text_LDADD = mail-html2text.lo $(test_libs)
 test_mail_html2text_DEPENDENCIES = $(test_deps)
diff -r d59753d9f5e9 -r 5211234206ea src/lib-mail/message-snippet.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/lib-mail/message-snippet.c	Sat Jan 17 00:23:36 2015 +0200
@@ -0,0 +1,136 @@
+/* Copyright (c) 2015 Dovecot authors, see the included COPYING file */
+
+#include "lib.h"
+#include "buffer.h"
+#include "str.h"
+#include "istream.h"
+#include "mail-html2text.h"
+#include "message-parser.h"
+#include "message-decoder.h"
+#include "message-snippet.h"
+
+enum snippet_state {
+	/* at the start of a line: a '>' here marks a quoted plain-text line */
+	SNIPPET_STATE_NEWLINE = 0,
+	/* within normal text that is copied into the snippet */
+	SNIPPET_STATE_NORMAL,
+	/* within quoted text - skip everything until the next '\n' */
+	SNIPPET_STATE_QUOTED
+};
+
+struct snippet_context {
+	string_t *snippet;		/* caller's output string, appended to */
+	unsigned int chars_left;	/* characters that may still be added */
+	enum snippet_state state;	/* scanner state, kept between blocks */
+	bool add_whitespace;		/* a space is pending before next char */
+	struct mail_html2text *html2text; /* non-NULL only for text/html */
+	buffer_t *plain_output;		/* receives the html2text output */
+};
+
+/* Add decoded body text to ctx->snippet. Returns FALSE once it is full. */
+static bool snippet_generate(struct snippet_context *ctx,
+			     const unsigned char *data, size_t size)
+{
+	unsigned int i, count;
+
+	if (ctx->html2text != NULL) {
+		buffer_set_used_size(ctx->plain_output, 0);
+		mail_html2text_more(ctx->html2text, data, size,
+				    ctx->plain_output);
+		data = ctx->plain_output->data;
+		size = ctx->plain_output->used;
+	}
+	/* message-decoder should feed us only valid and complete UTF-8 input */
+	for (i = 0; i < size; i += count) {
+		count = 1;
+		switch (ctx->state) {
+		case SNIPPET_STATE_NEWLINE:
+			if (data[i] == '>' && ctx->html2text == NULL) {
+				ctx->state = SNIPPET_STATE_QUOTED;
+				break;
+			}
+			ctx->state = SNIPPET_STATE_NORMAL;
+			/* fallthrough */
+		case SNIPPET_STATE_NORMAL: /* whitespace runs become one space */
+			if (data[i] == '\r' || data[i] == '\n' ||
+			    data[i] == '\t' || data[i] == ' ') {
+				ctx->add_whitespace = TRUE;
+				if (data[i] == '\n')
+					ctx->state = SNIPPET_STATE_NEWLINE;
+				break;
+			}
+			if (ctx->chars_left-- == 0)
+				return FALSE;
+			if (ctx->add_whitespace) {
+				/* the pending space needs a slot of its own */
+				if (ctx->chars_left-- == 0)
+					return FALSE;
+				str_append_c(ctx->snippet, ' ');
+				ctx->add_whitespace = FALSE;
+			}
+			count = uni_utf8_char_bytes(data[i]);
+			i_assert(i + count <= size);
+			str_append_n(ctx->snippet, data + i, count);
+			break;
+		case SNIPPET_STATE_QUOTED: /* drop the rest of the quoted line */
+			if (data[i] == '\n')
+				ctx->state = SNIPPET_STATE_NEWLINE;
+			break;
+		}
+	}
+	return TRUE;
+}
+
+/* Append a text snippet of the mail in input to snippet; -1 = stream error. */
+int message_snippet_generate(struct istream *input,
+			     unsigned int max_snippet_chars,
+			     string_t *snippet)
+{
+	struct message_parser_ctx *parser;
+	struct message_part *parts;
+	struct message_decoder_context *decoder;
+	struct message_block raw_block, block;
+	struct snippet_context ctx;
+	pool_t pool;
+	int ret;
+
+	memset(&ctx, 0, sizeof(ctx)); /* html2text stays NULL unless text/html */
+	pool = pool_alloconly_create("message snippet", 1024);
+	ctx.snippet = snippet;
+	ctx.chars_left = max_snippet_chars;
+
+	parser = message_parser_init(pool_datastack_create(), input, 0, 0);
+	decoder = message_decoder_init(NULL, 0);
+	while ((ret = message_parser_parse_next_block(parser, &raw_block)) > 0) {
+		if (!message_decoder_decode_next_block(decoder, &raw_block, &block))
+			continue;
+		if (block.size == 0) {
+			const char *ct;
+
+			if (block.hdr != NULL)
+				continue;
+			/* end of headers - verify that we can use this
+			   Content-Type. we get here only once, because we
+			   always handle only one non-multipart MIME part. */
+			ct = message_decoder_current_content_type(decoder);
+			if (ct == NULL)
+				/* text/plain */ ;
+			else if (strcasecmp(ct, "text/html") == 0) {
+				ctx.html2text = mail_html2text_init(MAIL_HTML2TEXT_FLAG_SKIP_QUOTED);
+				ctx.plain_output = buffer_create_dynamic(pool, 1024);
+			} else if (strncasecmp(ct, "text/", 5) != 0)
+				break; /* not a text/ type: stop parsing */
+			continue;
+		}
+		if (!snippet_generate(&ctx, block.data, block.size))
+			break; /* character limit reached */
+	}
+	i_assert(ret != 0); /* break leaves ret > 0; 0 is not expected */
+	message_decoder_deinit(&decoder);
+	if (message_parser_deinit(&parser, &parts) < 0)
+		i_unreached();
+	if (ctx.html2text != NULL)
+		mail_html2text_deinit(&ctx.html2text);
+	pool_unref(&pool);
+	return input->stream_errno == 0 ? 0 : -1;
+}
diff -r d59753d9f5e9 -r 5211234206ea src/lib-mail/message-snippet.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/lib-mail/message-snippet.h	Sat Jan 17 00:23:36 2015 +0200
@@ -0,0 +1,14 @@
+#ifndef MESSAGE_SNIPPET_H
+#define MESSAGE_SNIPPET_H
+
+/* Generate UTF-8 text snippet from the beginning of the given mail input
+   stream. The stream is expected to start at the MIME part's headers whose
+   snippet is being generated. The text is appended to snippet and is limited
+   by max_snippet_chars, counted in characters. Returns 0 if ok, -1 if I/O
+   error. Currently only Content-Type: text/ is supported (a missing
+   Content-Type is handled as text/plain), others add nothing to snippet. */
+int message_snippet_generate(struct istream *input,
+			     unsigned int max_snippet_chars,
+			     string_t *snippet);
+
+#endif
diff -r d59753d9f5e9 -r 5211234206ea src/lib-mail/test-message-snippet.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/lib-mail/test-message-snippet.c	Sat Jan 17 00:23:36 2015 +0200
@@ -0,0 +1,80 @@
+/* Copyright (c) 2015 Dovecot authors, see the included COPYING file */
+
+#include "lib.h"
+#include "str.h"
+#include "istream.h"
+#include "message-snippet.h"
+#include "test-common.h"
+
+static struct {
+	const char *input; /* complete mail: headers, empty line, body */
+	unsigned int max_snippet_chars; /* limit given to the generator */
+	const char *output; /* snippet that must be produced */
+} tests[] = {
+	{ "Content-Type: text/plain\n"
+	  "\n"
+	  "1234567890 234567890",
+	  12, /* the limit ends the snippet inside the second word */
+	  "1234567890 2" },
+	{ "Content-Type: text/plain\n"
+	  "\n"
+	  "line1\n>quote2\nline2\n",
+	  100, /* a line starting with '>' is dropped */
+	  "line1 line2" },
+	{ "Content-Type: text/plain\n"
+	  "\n"
+	  "line1\n>quote2\n> quote3\n > line4\n\n  \t\t  \nline5\n  \t ",
+	  100, /* '>' quotes only at line start; whitespace runs collapse */
+	  "line1 > line4 line5" },
+	{ "Content-Type: text/plain; charset=utf-8\n"
+	  "\n"
+	  "hyv\xC3\xA4\xC3\xA4 p\xC3\xA4iv\xC3\xA4\xC3\xA4",
+	  11, /* the limit counts characters, not UTF-8 bytes */
+	  "hyv\xC3\xA4\xC3\xA4 p\xC3\xA4iv\xC3\xA4" },
+	{ "Content-Type: text/plain; charset=utf-8\n"
+	  "Content-Transfer-Encoding: quoted-printable\n"
+	  "\n"
+	  "hyv=C3=A4=C3=A4 p=C3=A4iv=C3=A4=C3=A4",
+	  11, /* same text as above, quoted-printable encoded */
+	  "hyv\xC3\xA4\xC3\xA4 p\xC3\xA4iv\xC3\xA4" },
+
+	{ "Content-Transfer-Encoding: quoted-printable\n"
+	  "Content-Type: text/html;\n"
+	  "      charset=utf-8\n"
+	  "\n"
+	  "<html><head><meta http-equiv=3D\"Content-Type\" content=3D\"text/html =\n"
+	  "charset=3Dutf-8\"></head><body style=3D\"word-wrap: break-word; =\n"
+	  "-webkit-nbsp-mode: space; -webkit-line-break: after-white-space;\" =\n"
+	  "class=3D\"\">Hi,<div class=3D\"\"><br class=3D\"\"></div><div class=3D\"\">How =\n"
+	  "is it going? <blockquote>quoted text is ignored</blockquote>\n"
+	  "> -foo\n"
+	  "</div><br =class=3D\"\"></body></html>=\n",
+	  100, /* HTML: tags and the blockquote are dropped, "> -foo" stays */
+	  "Hi, How is it going? > -foo" },
+};
+
+static void test_message_snippet(void)
+{
+	string_t *str = t_str_new(128);
+	struct istream *input;
+	unsigned int i;
+
+	test_begin("message snippet");
+	for (i = 0; i < N_ELEMENTS(tests); i++) { /* one mail per entry */
+		str_truncate(str, 0); /* str is shared by all test cases */
+		input = i_stream_create_from_data(tests[i].input, strlen(tests[i].input));
+		test_assert_idx(message_snippet_generate(input, tests[i].max_snippet_chars, str) == 0, i);
+		test_assert_idx(strcmp(tests[i].output, str_c(str)) == 0, i);
+		i_stream_destroy(&input);
+	}
+	test_end();
+}
+
+int main(void)
+{
+	static void (*test_functions[])(void) = {
+		test_message_snippet,
+		NULL /* terminates the list */
+	};
+	return test_run(test_functions); /* exit code comes from test_run() */
+}


More information about the dovecot-cvs mailing list