dovecot-2.1: dbox: When fixing a broken file, try harder to find...

dovecot at dovecot.org dovecot at dovecot.org
Tue Aug 27 04:11:35 EEST 2013


details:   http://hg.dovecot.org/dovecot-2.1/rev/9efbc0731929
changeset: 14997:9efbc0731929
user:      Timo Sirainen <tss at iki.fi>
date:      Tue Aug 27 04:11:24 2013 +0300
description:
dbox: When fixing a broken file, try harder to find only valid message/metadata positions.
The previous method sometimes broke with compressed mail content, because the
magic blocks were so short.

diffstat:

 src/lib-storage/index/dbox-common/dbox-file-fix.c |  204 ++++++++++++++++++---
 src/lib-storage/index/dbox-common/dbox-file.h     |    2 +
 2 files changed, 176 insertions(+), 30 deletions(-)

diffs (260 lines):

diff -r 5d6cc3d03eee -r 9efbc0731929 src/lib-storage/index/dbox-common/dbox-file-fix.c
--- a/src/lib-storage/index/dbox-common/dbox-file-fix.c	Wed Aug 21 23:30:07 2013 +0300
+++ b/src/lib-storage/index/dbox-common/dbox-file-fix.c	Tue Aug 27 04:11:24 2013 +0300
@@ -1,6 +1,7 @@
 /* Copyright (c) 2009-2012 Dovecot authors, see the included COPYING file */
 
 #include "lib.h"
+#include "hex-dec.h"
 #include "istream.h"
 #include "ostream.h"
 #include "str-find.h"
@@ -13,38 +14,186 @@
 #define DBOX_MAIL_FILE_BROKEN_COPY_SUFFIX ".broken"
 
 static int
+dbox_file_match_pre_magic(struct istream *input,
+			  uoff_t *pre_offset, size_t *need_bytes)
+{
+	const struct dbox_message_header *hdr;
+	const unsigned char *data;
+	size_t size, lf_len = 0;
+	uoff_t offset = input->v_offset;
+	/* Returns 1 = valid header (*pre_offset set), 0 = no, -1 = need more. */
+	data = i_stream_get_data(input, &size);
+	if (data[0] == '\n') {
+		data++; size--; offset++; lf_len = 1;
+	}
+	i_assert(data[0] == DBOX_MAGIC_PRE[0]);
+	if (size < sizeof(*hdr)) {
+		*need_bytes = sizeof(*hdr) + lf_len; /* the LF is still in input */
+		return -1;
+	}
+	hdr = (const void *)data;
+	if (memcmp(hdr->magic_pre, DBOX_MAGIC_PRE, strlen(DBOX_MAGIC_PRE)) != 0)
+		return 0;
+	if (hdr->type != DBOX_MESSAGE_TYPE_NORMAL)
+		return 0;
+	if (hdr->space1 != ' ' || hdr->space2 != ' ')
+		return 0;
+	if (hex2dec(hdr->message_size_hex, sizeof(hdr->message_size_hex)) == 0 &&
+	    memcmp(hdr->message_size_hex, "0000000000000000", sizeof(hdr->message_size_hex)) != 0)
+		return 0; /* parsed as 0 although the field is not all zeros */
+
+	*pre_offset = offset;
+	return 1;
+}
+
+static bool memchr_nocontrol(const unsigned char *data, char chr,
+			     unsigned int len, const unsigned char **pos_r)
+{
+	/* Scan forward for chr. A byte below ' ' (a control character) seen
+	   before chr means failure; running out of data sets *pos_r = NULL. */
+	for (; len > 0; data++, len--) {
+		if (*data == chr) {
+			*pos_r = data;
+			return TRUE;
+		}
+		if (*data < ' ')
+			return FALSE;
+	}
+	*pos_r = NULL;
+	return TRUE;
+}
+
+static int
+dbox_file_match_post_magic(struct istream *input, bool input_full,
+			   size_t *need_bytes)
+{
+	const unsigned char *data, *p;
+	size_t i, size;
+	bool allow_control;
+	/* Returns 1 = looks like metadata, 0 = no, -1 = need *need_bytes bytes. */
+	data = i_stream_get_data(input, &size);
+	if (size < strlen(DBOX_MAGIC_POST)) {
+		*need_bytes = strlen(DBOX_MAGIC_POST);
+		return -1;
+	}
+	if (memcmp(data, DBOX_MAGIC_POST, strlen(DBOX_MAGIC_POST)) != 0)
+		return 0;
+
+	/* see if the metadata block looks valid; each line starts with a key byte */
+	for (i = strlen(DBOX_MAGIC_POST); i < size; ) {
+		switch (data[i]) {
+		case '\n':
+			return 1; /* LF at the start of a line: end of metadata */
+		case DBOX_METADATA_GUID:
+		case DBOX_METADATA_POP3_UIDL:
+		case DBOX_METADATA_ORIG_MAILBOX:
+		case DBOX_METADATA_OLDV1_KEYWORDS:
+			/* these could contain anything */
+			allow_control = TRUE;
+			break;
+		case DBOX_METADATA_RECEIVED_TIME:
+		case DBOX_METADATA_PHYSICAL_SIZE:
+		case DBOX_METADATA_VIRTUAL_SIZE:
+		case DBOX_METADATA_EXT_REF:
+		case DBOX_METADATA_OLDV1_EXPUNGED:
+		case DBOX_METADATA_OLDV1_FLAGS:
+		case DBOX_METADATA_OLDV1_SAVE_TIME:
+		case DBOX_METADATA_OLDV1_SPACE:
+			/* no control chars */
+			allow_control = FALSE;
+			break;
+		default:
+			if (data[i] < 'A' || data[i] > 'Z')
+				return 0; /* only 'A'..'Z' is accepted as a key */
+			/* unknown key - don't restrict what its value contains */
+			allow_control = TRUE;
+			break;
+		}
+		if (allow_control) {
+			p = memchr(data+i, '\n', size-i);
+		} else {
+			if (!memchr_nocontrol(data+i, '\n', size-i, &p))
+				return 0;
+		}
+		if (p == NULL) {
+			/* LF not found - try to find the end-of-metadata LF */
+			if (input_full) {
+				/* can't look any further - assume it's ok */
+				return 1;
+			}
+			*need_bytes = size+1; /* ask for at least one more byte */
+			return -1;
+		}
+		i = p - data+1; /* continue from the byte after this line's LF */
+	}
+	*need_bytes = size+1; /* data ended before the end-of-metadata LF */
+	return -1;
+}
+
+static int
 dbox_file_find_next_magic(struct dbox_file *file, uoff_t *offset_r, bool *pre_r)
 {
+	/* We're scanning message bodies here, trying to find the beginning of
+	   the next message. Although our magic strings are very unlikely to
+	   be found in regular emails, they are much more likely when emails
+	   are stored compressed, so try to be sure we find the correct
+	   magic markers. */
+
 	struct istream *input = file->input;
-	struct str_find_context *pre_ctx, *post_ctx;
 	uoff_t orig_offset, pre_offset, post_offset;
-	const unsigned char *data;
-	size_t size;
-	int ret;
+	const unsigned char *data, *magic;
+	size_t size, need_bytes;
+	int ret, match;
 
 	*pre_r = FALSE;
 
-	pre_ctx = str_find_init(default_pool, "\n"DBOX_MAGIC_PRE);
-	post_ctx = str_find_init(default_pool, DBOX_MAGIC_POST);
+	orig_offset = input->v_offset;
+	need_bytes = strlen(DBOX_MAGIC_POST);
+	while ((ret = i_stream_read_data(input, &data, &size, need_bytes-1)) > 0 ||
+	       ret == -2) {
+		/* search for the beginning of a potential pre/post magic */
+		i_assert(size > 1);
+		magic = memchr(data, DBOX_MAGIC_PRE[0], size);
+		if (magic == NULL) {
+			i_stream_skip(input, size-1);
+			need_bytes = strlen(DBOX_MAGIC_POST);
+			continue;
+		}
+		if (magic == data && input->v_offset == orig_offset) {
+			/* beginning of the file */
+		} else if (magic != data && magic[-1] == '\n') {
+			/* PRE/POST block? leave \n */
+			i_stream_skip(input, magic-data-1);
+		} else {
+			i_stream_skip(input, magic-data+1);
+			need_bytes = strlen(DBOX_MAGIC_POST);
+			continue;
+		}
 
-	/* \n isn't part of the DBOX_MAGIC_PRE, but it always preceds it.
-	   assume that at this point we've already just read the \n. when
-	   scanning for it later we'll need to find the \n though. */
-	str_find_more(pre_ctx, (const unsigned char *)"\n", 1);
+		pre_offset = (uoff_t)-1;
+		match = dbox_file_match_pre_magic(input, &pre_offset, &need_bytes);
+		if (match < 0) {
+			/* more data needed */
+			if (ret == -2) {
+				i_stream_skip(input, 2);
+				need_bytes = strlen(DBOX_MAGIC_POST);
+			}
+			continue;
+		}
+		if (match > 0)
+			*pre_r = TRUE;
 
-	orig_offset = input->v_offset;
-	while ((ret = i_stream_read_data(input, &data, &size, 0)) > 0) {
-		pre_offset = (uoff_t)-1;
-		if (str_find_more(pre_ctx, data, size)) {
-			pre_offset = input->v_offset +
-				str_find_get_match_end_pos(pre_ctx) -
-				(strlen(DBOX_MAGIC_PRE) + 1);
-			*pre_r = TRUE;
+		match = dbox_file_match_post_magic(input, ret == -2, &need_bytes);
+		if (match < 0) {
+			/* more data needed */
+			if (ret == -2) {
+				i_stream_skip(input, 2);
+				need_bytes = strlen(DBOX_MAGIC_POST);
+			}
+			continue;
 		}
-		if (str_find_more(post_ctx, data, size)) {
-			post_offset = input->v_offset +
-				str_find_get_match_end_pos(post_ctx) -
-				strlen(DBOX_MAGIC_POST);
+		if (match > 0) {
+			post_offset = input->v_offset;
 			if (pre_offset == (uoff_t)-1 ||
 			    post_offset < pre_offset) {
 				pre_offset = post_offset;
@@ -53,14 +202,11 @@
 		}
 
 		if (pre_offset != (uoff_t)-1) {
-			if (*pre_r) {
-				/* LF isn't part of the magic */
-				pre_offset++;
-			}
 			*offset_r = pre_offset;
+			ret = 1;
 			break;
 		}
-		i_stream_skip(input, size);
+		i_stream_skip(input, size-1);
 	}
 	if (ret <= 0) {
 		i_assert(ret == -1);
@@ -72,9 +218,7 @@
 		} 
 	}
 	i_stream_seek(input, orig_offset);
-	str_find_deinit(&pre_ctx);
-	str_find_deinit(&post_ctx);
-	return ret;
+	return ret <= 0 ? ret : 1;
 }
 
 static int
diff -r 5d6cc3d03eee -r 9efbc0731929 src/lib-storage/index/dbox-common/dbox-file.h
--- a/src/lib-storage/index/dbox-common/dbox-file.h	Wed Aug 21 23:30:07 2013 +0300
+++ b/src/lib-storage/index/dbox-common/dbox-file.h	Tue Aug 27 04:11:24 2013 +0300
@@ -39,6 +39,8 @@
 	DBOX_HEADER_OLDV1_APPEND_OFFSET	= 'A'
 };
 
+/* NOTE: all valid keys are uppercase characters. if this changes, change
+   dbox-file-fix.c:dbox_file_match_post_magic() to recognize them */
 enum dbox_metadata_key {
 	/* Globally unique identifier for the message. Preserved when
 	   copying. */


More information about the dovecot-cvs mailing list