[dovecot-cvs] dovecot/src/lib-mail message-body-search.c,NONE,1.1 message-body-search.h,NONE,1.1 quoted-printable.c,NONE,1.1 quoted-printable.h,NONE,1.1 Makefile.am,1.4,1.5 message-header-search.c,1.1,1.2 message-header-search.h,1.1,1.2

Wed Nov 13 13:08:20 EET 2002

Update of /home/cvs/dovecot/src/lib-mail
In directory danu:/tmp/cvs-serv7219/lib-mail

Modified Files:
	Makefile.am message-header-search.c message-header-search.h 
Added Files:
	message-body-search.c message-body-search.h quoted-printable.c 
	quoted-printable.h 
Log Message:
SEARCH CHARSET now works properly with message bodies, and in general body
searching works more correctly by decoding base64/qp data. Non-text MIME
parts are currently not included in the search; that could be made optional.
Also, the body is parsed separately for each keyword; that could be
optimized.

Changed base64_decode() behaviour so that it can accept non-base64 data as
well, i.e. line feeds etc.

--- NEW FILE: message-body-search.c ---
/* Copyright (C) 2002 Timo Sirainen */

#include "lib.h"
#include "base64.h"
#include "ibuffer.h"
#include "charset-utf8.h"
#include "rfc822-tokenize.h"
#include "quoted-printable.h"
#include "message-parser.h"
#include "message-content-parser.h"
#include "message-header-search.h"
#include "message-body-search.h"

#define DECODE_BLOCK_SIZE 8192

/* Search state shared by all MIME parts of one body search. Filled in by
   message_body_search_init(). */
typedef struct {
	Pool pool; /* zeroed by init; not referenced elsewhere in this file */

	/* search key as returned by charset_to_ucase_utf8_string() */
	const char *key;
	size_t key_len; /* length of key in bytes */

	const char *charset; /* charset given by the caller, may be NULL */
	/* set when charset is NULL: message data is then compared against
	   the key without any charset translation */
	unsigned int unknown_charset:1;
} BodySearchContext;

/* Per-MIME-part search state. message_body_search_ctx() zeroes this for
   every part before the part's header is parsed. */
typedef struct {
	BodySearchContext *body_ctx;

	/* created in message_search_header() */
	HeaderSearchContext *hdr_search_ctx;
	/* created in message_search_body(); checked against NULL before use */
	CharsetTranslation *translation;

	/* input bytes that the previous charset conversion left unconverted */
	unsigned char decode_buf[DECODE_BLOCK_SIZE];
	size_t decode_buf_used;

	/* matches[i] is the number of key bytes the i'th in-progress partial
	   match has matched so far, match_count is the number of such
	   entries. match_count is signed because the search loop walks the
	   array downwards with a signed index. */
	size_t *matches;
	ssize_t match_count;

	const char *content_type;
	const char *content_charset; /* charset parameter of Content-Type */

	unsigned int content_qp:1; /* transfer encoding is quoted-printable */
	unsigned int content_base64:1; /* transfer encoding is base64 */
	unsigned int content_unknown:1; /* unrecognized transfer encoding */
	unsigned int content_type_text:1; /* text/any or message/any */
	unsigned int found:1; /* key was found from this part's header */
} PartSearchContext;

/* Content-Type value callback for message_content_parse_header(). Stores
   the part's content type and records whether it is searchable text
   (text/any or message/any). context is the PartSearchContext. */
static void parse_content_type(const Rfc822Token *tokens, int count,
			       void *context)
{
	PartSearchContext *ctx = context;

	/* The part context is zeroed before header parsing, so content_type
	   is NULL until we store it here. Only the first Content-Type header
	   of the part is used, same as with the charset parameter. */
	if (ctx->content_type == NULL && tokens[0].token == 'A') {
		ctx->content_type = rfc822_tokens_get_value(tokens, count);
		ctx->content_type_text =
			strncasecmp(ctx->content_type, "text/", 5) == 0 ||
			strncasecmp(ctx->content_type, "message/", 8) == 0;
	}
}

/* Content-Type parameter callback for message_content_parse_header().
   Remembers the value of the first "charset" parameter seen for the part;
   all other parameters are ignored. */
static void parse_content_type_param(const Rfc822Token *name,
				     const Rfc822Token *value,
				     int value_count, void *context)
{
	PartSearchContext *part_ctx = context;
	int is_charset;

	is_charset = name->len == 7 &&
		strncasecmp(name->ptr, "charset", 7) == 0;

	if (is_charset && part_ctx->content_charset == NULL) {
		part_ctx->content_charset =
			rfc822_tokens_get_value(value, value_count);
	}
}

/* Content-Transfer-Encoding value callback for
   message_content_parse_header(). Flags the part as base64 or
   quoted-printable encoded; 7bit, 8bit and binary need no decoding and
   anything else marks the encoding as unknown. */
static void parse_content_encoding(const Rfc822Token *tokens,
				   int count __attr_unused__, void *context)
{
	PartSearchContext *part_ctx = context;
	const Rfc822Token *tok = &tokens[0];

	if (tok->token != 'A')
		return;

	if (tok->len == 4 &&
	    (strncasecmp(tok->ptr, "7bit", 4) == 0 ||
	     strncasecmp(tok->ptr, "8bit", 4) == 0)) {
		/* data is used as is */
		return;
	}

	if (tok->len == 6) {
		if (strncasecmp(tok->ptr, "base64", 6) == 0) {
			part_ctx->content_base64 = TRUE;
			return;
		}
		if (strncasecmp(tok->ptr, "binary", 6) == 0) {
			/* data is used as is */
			return;
		}
	}

	if (tok->len == 16 &&
	    strncasecmp(tok->ptr, "quoted-printable", 16) == 0) {
		part_ctx->content_qp = TRUE;
		return;
	}

	/* wrong length or unrecognized name */
	part_ctx->content_unknown = TRUE;
}

/* Header callback for message_parse_header(). Searches each header value
   for the key and, for Content-Type and Content-Transfer-Encoding headers,
   also records how the part's body has to be decoded. Once the key has been
   found the remaining headers are skipped. */
static void header_find(MessagePart *part __attr_unused__,
			const char *name, size_t name_len,
			const char *value, size_t value_len, void *context)
{
	PartSearchContext *search = context;

	if (search->found)
		return;

	/* value_len is passed by pointer, so the length used below is
	   whatever message_header_search() leaves in it */
	search->found = message_header_search(value, &value_len,
					      search->hdr_search_ctx);

	if (name_len == 12) {
		if (strncasecmp(name, "Content-Type", 12) != 0)
			return;

		(void)message_content_parse_header(t_strndup(value, value_len),
						   parse_content_type,
						   parse_content_type_param,
						   search);
		return;
	}

	if (name_len == 25 &&
	    strncasecmp(name, "Content-Transfer-Encoding", 25) == 0) {
		(void)message_content_parse_header(t_strndup(value, value_len),
						   parse_content_encoding,
						   NULL, search);
	}
}

/* Parses the header of the current part from inbuf, searching it for the
   key and collecting the part's content type and transfer encoding.
   Returns TRUE if the key was found from the header. */
static int message_search_header(PartSearchContext *ctx, IBuffer *inbuf)
{
	const BodySearchContext *body = ctx->body_ctx;

	/* parts without Content-Type header are treated as text */
	ctx->content_type_text = TRUE;

	ctx->hdr_search_ctx =
		message_header_search_init(data_stack_pool, body->key,
					   body->charset, NULL);

	message_parse_header(NULL, inbuf, NULL, header_find, ctx);
	return ctx->found;
}

/* Searches one block of already decoded data for the key.

   The search is incremental: ctx->matches[] holds, for every partial match
   still in progress, the number of key bytes matched so far. The state is
   kept in ctx between calls, so a key that is split across two blocks is
   still found.

   Returns TRUE if the key was fully matched within this block, FALSE if
   not. */
static int message_search_decoded_block(PartSearchContext *ctx,
					const unsigned char *data, size_t size)
{
	const unsigned char *p, *end, *key;
	size_t key_len;
	ssize_t i;
	int found;

	key = (const unsigned char *) ctx->body_ctx->key;
	key_len = ctx->body_ctx->key_len;

	end = data + size; found = FALSE;
	for (p = data; p != end; p++) {
		/* walk downwards so that removing entry i doesn't disturb
		   the entries which are still to be visited */
		for (i = ctx->match_count-1; i >= 0; i--) {
			if (key[ctx->matches[i]] == *p) {
				if (++ctx->matches[i] == key_len) {
					/* full match */
					found = TRUE;
					break;
				}
			} else {
				/* non-match, remove the entry by moving the
				   following entries over it. memmove() takes
				   a byte count, so the number of entries has
				   to be scaled by the entry size. */
				ctx->match_count--;
				if (i != ctx->match_count) {
					memmove(ctx->matches + i,
						ctx->matches + i + 1,
						(size_t)(ctx->match_count - i) *
						sizeof(*ctx->matches));
				}
			}
		}

		if (found)
			break;

		if (*p == key[0]) {
			if (key_len == 1) {
				/* only one character in search key */
				found = TRUE;
				break;
			}
			/* start a new partial match with one byte matched */
			i_assert((size_t)ctx->match_count < key_len);
			ctx->matches[ctx->match_count++] = 1;
		}
	}

	return found;
}

/* Searches one block of transfer-decoded body data for the key.

   If the key has no charset or the part has no charset translation, the
   data is searched as is. Otherwise the data is first converted with
   charset_to_ucase_utf8() in pieces of at most DECODE_BLOCK_SIZE bytes.
   Input which the conversion couldn't consume yet is kept in
   ctx->decode_buf and prepended to the data of the next round.

   Returns 1 if the key was found, 0 if not and -1 if the charset
   conversion failed. */
static int message_search_body_block(PartSearchContext *ctx,
				     const unsigned char *data, size_t size)
{
	const unsigned char *inbuf;
	unsigned char outbuf[DECODE_BLOCK_SIZE];
	size_t inbuf_size, outbuf_size, max_size;

	if (ctx->body_ctx->unknown_charset || ctx->translation == NULL)
		return message_search_decoded_block(ctx, data, size);

	while (size > 0) {
		if (ctx->decode_buf_used == 0) {
			/* nothing carried over, convert directly from data.
			   limit the size so that any unconverted rest is
			   guaranteed to fit into decode_buf. */
			inbuf = data;
			inbuf_size = I_MIN(size, sizeof(ctx->decode_buf));

			data += inbuf_size;
			size -= inbuf_size;
		} else {
			/* some characters already in buffer, ie. last
			   conversion contained partial data */
			max_size = sizeof(ctx->decode_buf) -
				ctx->decode_buf_used;
			if (max_size > size)
				max_size = size;

			memcpy(ctx->decode_buf + ctx->decode_buf_used,
			       data, max_size);
			ctx->decode_buf_used += max_size;

			inbuf = ctx->decode_buf;
			inbuf_size = ctx->decode_buf_used;

			data += max_size;
			size -= max_size;
		}

		outbuf_size = sizeof(outbuf);
		if (!charset_to_ucase_utf8(ctx->translation,
					   &inbuf, &inbuf_size,
					   outbuf, &outbuf_size)) {
			/* something failed */
			return -1;
		}

		if (message_search_decoded_block(ctx, outbuf, outbuf_size))
			return 1;

		if (inbuf_size > 0) {
			/* partial input, save it. inbuf may point inside
			   decode_buf itself, so the areas can overlap. */
			memmove(ctx->decode_buf, inbuf, inbuf_size);
		}
		/* always update the count: if everything in decode_buf was
		   consumed it has to drop back to zero, otherwise the stale
		   bytes would be converted again on the next round (and a
		   full buffer would leave no room for new input, so the loop
		   would never finish). */
		ctx->decode_buf_used = inbuf_size;
	}

	return 0;
}

/* Searches the body of one leaf MIME part for the key.

   Parts with an unknown Content-Transfer-Encoding or a non-text content
   type are skipped. Otherwise the body is read from inbuf in blocks of at
   most DECODE_BLOCK_SIZE bytes, the transfer encoding (quoted-printable or
   base64) is removed and the result is handed to
   message_search_body_block(), which takes care of charset translation.

   Returns TRUE if the key was found, FALSE if not (also when charset
   conversion fails). */
static int message_search_body(PartSearchContext *ctx, IBuffer *inbuf,
			       MessagePart *part)
{
	const unsigned char *data, *decoded;
	unsigned char *decodebuf;
	size_t data_size, decoded_size, pos;
	uoff_t old_limit;
	ssize_t ret;
	int found;

	if (ctx->content_unknown) {
		/* unknown content-encoding-type, ignore */
		return FALSE;
	}

	if (!ctx->content_type_text) {
		/* non-text content, ignore - FIXME: should be configurable? */
		return FALSE;
	}

	/* parts without charset parameter are treated as ascii */
	ctx->translation = charset_to_utf8_begin(ctx->content_charset != NULL ?
						 ctx->content_charset : "ascii",
						 NULL);

	ctx->match_count = 0;
	ctx->matches = t_malloc(sizeof(size_t) * ctx->body_ctx->key_len);

	/* jump to beginning of the body */
	i_buffer_skip(inbuf, part->physical_pos +
		      part->header_size.physical_size - inbuf->v_offset);

	/* don't read past the end of this part's body */
	old_limit = inbuf->v_limit;
	i_buffer_set_read_limit(inbuf, inbuf->v_offset +
				part->body_size.physical_size);

	found = FALSE; pos = 0;
	while (i_buffer_read_data(inbuf, &data, &data_size, pos) > 0) {
		/* limit the size of t_malloc()s */
		if (data_size > DECODE_BLOCK_SIZE)
			data_size = DECODE_BLOCK_SIZE;
		pos = data_size;

		t_push();
		if (ctx->content_qp) {
			/* decoders update data_size to the number of input
			   bytes they actually consumed */
			decoded = decodebuf = t_malloc(data_size);
			decoded_size = quoted_printable_decode(data, &data_size,
							       decodebuf);
		} else if (ctx->content_base64) {
			decoded = decodebuf =
				t_malloc(MAX_BASE64_DECODED_SIZE(data_size));

			/* the return value is the number of bytes actually
			   written into decodebuf, or negative for corrupted
			   input. only that many bytes may be searched, the
			   rest of the buffer is uninitialized. */
			ret = base64_decode(data, &data_size, decodebuf);
			decoded_size = ret < 0 ? 0 : (size_t)ret;
		} else {
			decoded = data;
			decoded_size = data_size;
		}

		ret = message_search_body_block(ctx, decoded, decoded_size);
		if (ret != 0) {
			/* found, or charset conversion failed */
			t_pop();
			found = ret > 0;
			break;
		}

		t_pop();
		/* skip only what was consumed; pos is left as the number of
		   bytes we've seen but which still remain in the buffer */
		i_buffer_skip(inbuf, data_size);
		pos -= data_size;
	}

	i_buffer_set_read_limit(inbuf, old_limit);

	if (ctx->translation != NULL)
		charset_to_utf8_end(ctx->translation);
	return found;
}

/* Initializes ctx for searching key, which is given in charset. The key is
   converted with charset_to_ucase_utf8_string(); unknown_charset is passed
   on to that call. Returns TRUE if ok, FALSE if the conversion returned
   NULL. */
static int message_body_search_init(BodySearchContext *ctx, const char *key,
				    const char *charset, int *unknown_charset)
{
	const char *ucase_key;
	size_t len;

	memset(ctx, 0, sizeof(*ctx));

	/* the comparisons are done against uppercased key */
	len = strlen(key);
	ucase_key = charset_to_ucase_utf8_string(charset, unknown_charset,
						 (const unsigned char *) key,
						 &len);
	if (ucase_key == NULL)
		return FALSE;

	/* an array of len size_t's is allocated for the match state later */
	i_assert(len <= SSIZE_T_MAX/sizeof(size_t));

	ctx->charset = charset;
	ctx->unknown_charset = charset == NULL;
	ctx->key_len = len;
	ctx->key = ucase_key;

	return TRUE;
}

/* Searches part and its following siblings for the key, stopping at the
   first part where it's found. For each part the header is searched first;
   after that either the children are searched recursively or, for a leaf
   part, its body. Returns TRUE if the key was found. */
static int message_body_search_ctx(BodySearchContext *ctx, IBuffer *inbuf,
				   MessagePart *part)
{
	PartSearchContext part_ctx;
	int found = FALSE;

	for (; part != NULL && !found; part = part->next) {
		/* parts must be visited in the order they're in the input */
		i_assert(inbuf->v_offset <= part->physical_pos);

		i_buffer_skip(inbuf, part->physical_pos - inbuf->v_offset);

		/* every part starts with a fresh state */
		memset(&part_ctx, 0, sizeof(part_ctx));
		part_ctx.body_ctx = ctx;

		t_push();

		if (message_search_header(&part_ctx, inbuf)) {
			found = TRUE;
		} else if (part->children == NULL) {
			/* leaf part */
			found = message_search_body(&part_ctx, inbuf, part) ?
				TRUE : FALSE;
		} else {
			/* multipart/xxx or message/rfc822 */
			found = message_body_search_ctx(ctx, inbuf,
							part->children) ?
				TRUE : FALSE;
		}

		t_pop();
	}

	return found;
}

/* Public entry point: prepares the search key and searches the given
   message parts for it. Returns -1 if the key couldn't be initialized,
   otherwise the result of the search over the parts. */
int message_body_search(const char *key, const char *charset,
			int *unknown_charset, IBuffer *inbuf,
			MessagePart *part)
{
	BodySearchContext search_ctx;
	int key_ok;

	key_ok = message_body_search_init(&search_ctx, key, charset,
					  unknown_charset);

	return key_ok ? message_body_search_ctx(&search_ctx, inbuf, part) : -1;
}

--- NEW FILE: message-body-search.h ---
#ifndef __MESSAGE_BODY_SEARCH_H
#define __MESSAGE_BODY_SEARCH_H

/* Searches the message parts, beginning from the given part, for key. The
   parts are read from inbuf.

   Returns 1 if key is found from input buffer, 0 if not and -1 if error.
   There are two possible errors: either the charset is unknown or the key
   is invalid. If charset is NULL, the key isn't assumed to be in any
   specific charset but is compared to message data without any translation.

   unknown_charset is passed on to the charset conversion of the key;
   presumably it's set when the error was caused by an unknown charset -
   verify against charset_to_ucase_utf8_string(). */
int message_body_search(const char *key, const char *charset,
			int *unknown_charset, IBuffer *inbuf,
			MessagePart *part);

#endif

--- NEW FILE: quoted-printable.c ---
/* Copyright (C) 2002 Timo Sirainen */

#include "lib.h"
#include "hex-binary.h"
#include "quoted-printable.h"

/* Decodes quoted-printable data from src into dest.

   src/size: input data and its length. At return *size contains the number
   of input bytes that were actually parsed; an escape which may continue in
   the next block is left unparsed.
   dest: output buffer, at least as large as the input. May be same as src,
   since the output never grows past the input position.

   "=XX" hex escapes are decoded and soft line breaks ("=" followed by LF or
   CRLF) are removed. An escape that isn't valid is copied as is.

   NOTE: '_' is translated to space, which is the header "Q" encoding rule
   and not part of body quoted-printable. It's kept because the header
   search uses this same function.

   Returns the number of bytes written into dest. */
size_t quoted_printable_decode(const unsigned char *src, size_t *size,
			       unsigned char *dest)
{
	const unsigned char *end;
	unsigned char *dest_start;
	char hexbuf[3];

	hexbuf[2] = '\0';

	dest_start = dest;
	end = src + *size;

	for (; src != end; src++) {
		if (*src == '_') {
			*dest++ = ' ';
			continue;
		}

		if (*src == '=') {
			/* we need the two bytes after '=' to know what it
			   is. if they aren't here yet, stop and let the
			   caller give them with the next block. */
			if (src+2 >= end)
				break;

			if (src[1] == '\n') {
				/* soft line break "=<LF>", skip it */
				src += 1;
				continue;
			}
			if (src[1] == '\r' && src[2] == '\n') {
				/* soft line break "=<CR><LF>", skip it */
				src += 2;
				continue;
			}

			hexbuf[0] = src[1];
			hexbuf[1] = src[2];

			if (hex_to_binary(hexbuf, dest) == 1) {
				dest++;
				src += 2;
				continue;
			}
		}

		*dest++ = *src;
	}

	/* report how much of the input was consumed */
	*size -= (end-src);
	return (size_t) (dest - dest_start);
}

--- NEW FILE: quoted-printable.h ---
#ifndef __QUOTED_PRINTABLE_H
#define __QUOTED_PRINTABLE_H

/* Translates quoted printable data into binary. dest must be at least the
   size of src, and may be same as src. Returns size of the binary data.
   Decoding errors are ignored: an invalid "=XX" escape is copied into dest
   as is. '_' is translated into space.

   This function may be called multiple times for parsing same stream.
   The *size is updated at return to contain the amount of data actually
   parsed - the rest of the data should be passed again to this function.
   Parsing stops before a '=' which is within the last two bytes of src,
   so that the escape can be completed by the data that follows. */
size_t quoted_printable_decode(const unsigned char *src, size_t *size,
			       unsigned char *dest);

#endif

Index: Makefile.am
===================================================================
RCS file: /home/cvs/dovecot/src/lib-mail/Makefile.am,v
retrieving revision 1.4
retrieving revision 1.5
diff -u -d -r1.4 -r1.5
--- Makefile.am	3 Nov 2002 08:39:43 -0000	1.4
+++ Makefile.am	13 Nov 2002 11:08:18 -0000	1.5
@@ -5,6 +5,7 @@
 	-I$(top_srcdir)/src/lib-charset
 
 libmail_a_SOURCES = \
+	message-body-search.c \
 	message-content-parser.c \
 	message-header-search.c \
 	message-parser.c \
@@ -13,9 +14,11 @@
 	message-size.c \
 	rfc822-address.c \
 	rfc822-date.c \
-	rfc822-tokenize.c
+	rfc822-tokenize.c \
+	quoted-printable.c
 
 noinst_HEADERS = \
+	message-body-search.h \
 	message-content-parser.h \
 	message-header-search.h \
 	message-parser.h \
@@ -24,4 +27,5 @@
 	message-size.h \
 	rfc822-address.h \
 	rfc822-date.h \
-	rfc822-tokenize.h
+	rfc822-tokenize.h \
+	quoted-printable.h

Index: message-header-search.c
===================================================================
RCS file: /home/cvs/dovecot/src/lib-mail/message-header-search.c,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -d -r1.1 -r1.2
--- message-header-search.c	3 Nov 2002 08:39:43 -0000	1.1
+++ message-header-search.c	13 Nov 2002 11:08:18 -0000	1.2
@@ -2,15 +2,17 @@
 
 #include "lib.h"
 #include "base64.h"
-#include "hex-binary.h"
 #include "charset-utf8.h"
 #include "rfc822-tokenize.h"
+#include "quoted-printable.h"
 #include "message-header-search.h"
 
 #include <ctype.h>
 
 struct _HeaderSearchContext {
-	const unsigned char *key;
+	Pool pool;
+
+	unsigned char *key;
 	size_t key_len;
 
 	size_t *matches; /* size of strlen(key) */
@@ -30,56 +32,33 @@
 	size_t size;
 
 	ctx = p_new(pool, HeaderSearchContext, 1);
+	ctx->pool = pool;
 
 	/* get the key uppercased */
 	size = strlen(key);
-	ctx->key = charset_to_ucase_utf8((const unsigned char *) key, &size,
-					 charset, unknown_charset);
-	if (ctx->key == NULL)
+	key = charset_to_ucase_utf8_string(charset, unknown_charset,
+					   (const unsigned char *) key, &size);
+	if (key == NULL)
 		return NULL;
 
-	ctx->key = p_strdup(pool, ctx->key);
+	i_assert(size <= SSIZE_T_MAX/sizeof(size_t));
+
+	ctx->key = p_strdup(pool, key);
 	ctx->key_len = size;
 	ctx->unknown_charset = charset == NULL;
 
 	ctx->matches = p_malloc(pool, sizeof(size_t) * ctx->key_len);
-	i_assert(ctx->key_len <= SSIZE_T_MAX);
 	return ctx;
 }
 
-static size_t quoted_printable_decode(const unsigned char *src, size_t size,
-				      unsigned char *dest)
+void message_header_search_free(HeaderSearchContext *ctx)
 {
-	const unsigned char *end;
-	unsigned char *dest_start;
-	char hexbuf[3];
-
-	hexbuf[2] = '\0';
-
-	dest_start = dest;
-	end = src + size;
-
-	for (; src != end; src++) {
-		if (*src == '_') {
-			*dest++ = ' ';
-			continue;
-		}
-
-		if (*src == '=' && src+2 < end) {
-			hexbuf[0] = src[1];
-			hexbuf[1] = src[2];
-
-			if (hex_to_binary(hexbuf, dest) == 1) {
-				dest++;
-				src += 2;
-				continue;
-			}
-		}
-
-		*dest++ = *src;
-	}
+	Pool pool;
 
-	return (size_t) (dest - dest_start);
+	pool = ctx->pool;
+	p_free(pool, ctx->key);
+	p_free(pool, ctx->matches);
+	p_free(pool, ctx);
 }
 
 static int match_data(const unsigned char *data, size_t size,
@@ -93,8 +72,8 @@
 		charset = NULL;
 	}
 
-	data = (const unsigned char *) charset_to_ucase_utf8(data, &size,
-							     charset, NULL);
+	data = (const unsigned char *)
+		charset_to_ucase_utf8_string(charset, NULL, data, &size);
 	if (data == NULL) {
 		/* unknown character set, or invalid data */
 		return FALSE;
@@ -113,7 +92,7 @@
 	const unsigned char *p, *encoding, *text, *new_end;
 	const char *charset;
 	unsigned char *buf;
-	ssize_t size;
+	ssize_t size, buf_size;
 	int ok, ret;
 
 	/* first split the string =?charset?encoding?text?= */
@@ -154,12 +133,14 @@
 		t_push();
 
 		size = (ssize_t) (end - text);
-		buf = t_malloc(size);
+
+		buf_size = size;
+		buf = t_malloc(buf_size);
 
 		if (*encoding == 'Q')
-			size = quoted_printable_decode(text, size, buf);
+			size = quoted_printable_decode(text, &buf_size, buf);
 		else
-			size = base64_decode(text, size, buf);
+			size = base64_decode(text, &buf_size, buf);
 
 		if (size >= 0) {
 			/* non-corrupted encoding */

Index: message-header-search.h
===================================================================
RCS file: /home/cvs/dovecot/src/lib-mail/message-header-search.h,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -d -r1.1 -r1.2
--- message-header-search.h	3 Nov 2002 08:39:43 -0000	1.1
+++ message-header-search.h	13 Nov 2002 11:08:18 -0000	1.2
@@ -3,11 +3,14 @@
 
 typedef struct _HeaderSearchContext HeaderSearchContext;
 
-/* Initialize new search. Allocates memory from data stack. Returns NULL
-   if charset is unknown or key is not valid in specified charset. */
+/* Initialize new search. Returns NULL if charset is unknown or key is not
+   valid in specified charset. */
 HeaderSearchContext *
 message_header_search_init(Pool pool, const char *key, const char *charset,
 			   int *unknown_charset);
+
+/* Free search context. Not needed if you just destroy the pool. */
+void message_header_search_free(HeaderSearchContext *ctx);
 
 /* Returns TRUE if key is found from header. This function may be called
    multiple times with partial header blocks, but the blocks must contain only