dovecot-1.1: Message sort index handling rewrite to fix several ...

Thu May 29 06:37:16 EEST 2008

details:   http://hg.dovecot.org/dovecot-1.1/rev/4a9ce9df52c5
changeset: 7564:4a9ce9df52c5
user:      Timo Sirainen <tss at iki.fi>
date:      Thu May 29 06:16:06 2008 +0300
description:
Message sort index handling rewrite to fix several race conditions when
multiple processes are sorting at the same time. Also fixes sorting to work
correctly with 64bit timestamps and file sizes.

diffstat:

4 files changed, 988 insertions(+), 449 deletions(-)
src/lib-storage/index/Makefile.am          |    2 
src/lib-storage/index/index-sort-private.h |   33 +
src/lib-storage/index/index-sort-string.c  |  705 ++++++++++++++++++++++++++++
src/lib-storage/index/index-sort.c         |  697 +++++++++------------------

diffs (truncated from 1593 to 300 lines):

diff -r 6de1aed24ce5 -r 4a9ce9df52c5 src/lib-storage/index/Makefile.am

--- a/src/lib-storage/index/Makefile.am	Thu May 29 04:47:53 2008 +0300
+++ b/src/lib-storage/index/Makefile.am	Thu May 29 06:16:06 2008 +0300
@@ -16,6 +16,7 @@ libstorage_index_a_SOURCES = \
 	index-mailbox-check.c \
 	index-search.c \
 	index-sort.c \
+	index-sort-string.c \
 	index-status.c \
 	index-storage.c \
 	index-sync.c \
@@ -25,6 +26,7 @@ headers = \
 headers = \
 	index-mail.h \
 	index-sort.h \
+	index-sort-private.h \
 	index-storage.h \
 	index-sync-changes.h
 
diff -r 6de1aed24ce5 -r 4a9ce9df52c5 src/lib-storage/index/index-sort-private.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/lib-storage/index/index-sort-private.h	Thu May 29 06:16:06 2008 +0300
@@ -0,0 +1,33 @@
+#ifndef INDEX_SORT_PRIVATE_H
+#define INDEX_SORT_PRIVATE_H
+
+#include "index-sort.h"
+
+struct mail_search_sort_program {
+	struct mailbox_transaction_context *t;
+	enum mail_sort_type sort_program[MAX_SORT_PROGRAM_SIZE];
+	struct mail *temp_mail;
+
+	void (*sort_list_add)(struct mail_search_sort_program *program,
+			      struct mail *mail);
+	void (*sort_list_finish)(struct mail_search_sort_program *program);
+	void *context;
+
+	ARRAY_TYPE(uint32_t) seqs;
+	unsigned int iter_idx;
+
+	unsigned int reverse:1;
+};
+
+int index_sort_header_get(struct mail *mail, uint32_t seq,
+			  enum mail_sort_type sort_type, string_t *dest);
+int index_sort_node_cmp_type(struct mail *mail,
+			     const enum mail_sort_type *sort_program,
+			     uint32_t seq1, uint32_t seq2);
+
+void index_sort_list_init_string(struct mail_search_sort_program *program);
+void index_sort_list_add_string(struct mail_search_sort_program *program,
+				struct mail *mail);
+void index_sort_list_finish_string(struct mail_search_sort_program *program);
+
+#endif
diff -r 6de1aed24ce5 -r 4a9ce9df52c5 src/lib-storage/index/index-sort-string.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/lib-storage/index/index-sort-string.c	Thu May 29 06:16:06 2008 +0300
@@ -0,0 +1,705 @@
+/* Copyright (c) 2006-2008 Dovecot authors, see the included COPYING file */
+
+/* The idea is that we use 32bit integers for string sort IDs which specifiy
+   the sort order for primary sort condition. The whole 32bit integer space is
+   used and whenever adding a string, the available space is halved and the new
+   ID is added in the middle. For example if we add one mail the first time, it
+   gets ID 2^31. If we then add two mails which are sorted before the first
+   one, they get IDs 2^31/3 and 2^31/3*2. Once we run out of the available
+   space between IDs, more space is made by renumbering some IDs.
+*/
+#include "lib.h"
+#include "array.h"
+#include "str.h"
+#include "index-storage.h"
+#include "index-sort-private.h"
+
+#include <stdlib.h>
+
+struct mail_sort_node {
+	uint32_t seq:29;
+	uint32_t wanted:1;
+	uint32_t no_update:1;
+	uint32_t sort_id_changed:1;
+	uint32_t sort_id;
+};
+ARRAY_DEFINE_TYPE(mail_sort_node, struct mail_sort_node);
+
+struct sort_string_context {
+	struct mail_search_sort_program *program;
+
+	ARRAY_TYPE(mail_sort_node) zero_nodes, nonzero_nodes, sorted_nodes;
+	const char **sort_strings;
+	pool_t sort_string_pool;
+	unsigned int first_missing_sort_id_idx;
+
+	uint32_t ext_id, last_seq, highest_reset_id;
+
+	unsigned int regetting:1;
+};
+
+static char expunged_msg;
+static struct sort_string_context *static_zero_cmp_context;
+static struct mail_search_sort_program *static_sort_node_cmp_context;
+
+static void index_sort_node_add(struct sort_string_context *ctx,
+				struct mail_sort_node *node);
+
+void index_sort_list_init_string(struct mail_search_sort_program *program)
+{
+	struct index_mailbox *ibox = (struct index_mailbox *)program->t->box;
+	struct sort_string_context *ctx;
+	const char *name;
+
+	switch (program->sort_program[0] & MAIL_SORT_MASK) {
+	case MAIL_SORT_CC:
+		name = "sort-c";
+		break;
+	case MAIL_SORT_FROM:
+		name = "sort-f";
+		break;
+	case MAIL_SORT_SUBJECT:
+		name = "sort-s";
+		break;
+	case MAIL_SORT_TO:
+		name = "sort-t";
+		break;
+	default:
+		i_unreached();
+	}
+
+	program->context = ctx = i_new(struct sort_string_context, 1);
+	ctx->program = program;
+	ctx->ext_id = mail_index_ext_register(ibox->index, name, 0,
+					      sizeof(uint32_t),
+					      sizeof(uint32_t));
+	i_array_init(&ctx->zero_nodes, 128);
+	i_array_init(&ctx->nonzero_nodes, 128);
+}
+
+static void index_sort_generate_seqs(struct sort_string_context *ctx)
+{
+	struct mail_sort_node *nodes, *nodes2;
+	unsigned int i, j, count, count2;
+	uint32_t seq;
+
+	nodes = array_get_modifiable(&ctx->nonzero_nodes, &count);
+	nodes2 = array_get_modifiable(&ctx->zero_nodes, &count2);
+
+	if (!array_is_created(&ctx->program->seqs))
+		i_array_init(&ctx->program->seqs, count + count2);
+	else
+		array_clear(&ctx->program->seqs);
+
+	for (i = j = 0;;) {
+		if (i < count && j < count2) {
+			if (nodes[i].seq < nodes2[j].seq)
+				seq = nodes[i++].seq;
+			else
+				seq = nodes2[j++].seq;
+		} else if (i < count) {
+			seq = nodes[i++].seq;
+		} else if (j < count2) {
+			seq = nodes2[j++].seq;
+		} else {
+			break;
+		}
+		array_append(&ctx->program->seqs, &seq, 1);
+	}
+}
+
+static void index_sort_reget_sort_ids(struct sort_string_context *ctx)
+{
+	struct mail_sort_node node;
+	const uint32_t *seqs;
+	unsigned int i, count;
+
+	i_assert(!ctx->regetting);
+	ctx->regetting = TRUE;
+
+	index_sort_generate_seqs(ctx);
+	array_clear(&ctx->zero_nodes);
+	array_clear(&ctx->nonzero_nodes);
+
+	memset(&node, 0, sizeof(node));
+	node.wanted = TRUE;
+	seqs = array_get(&ctx->program->seqs, &count);
+	for (i = 0; i < count; i++) {
+		node.seq = seqs[i];
+		index_sort_node_add(ctx, &node);
+	}
+	ctx->regetting = FALSE;
+}
+
+static void index_sort_node_add(struct sort_string_context *ctx,
+				struct mail_sort_node *node)
+{
+	struct index_transaction_context *t =
+		(struct index_transaction_context *)ctx->program->t;
+	struct mail_index_map *map;
+	const void *data;
+	uint32_t reset_id;
+	bool expunged;
+
+	mail_index_lookup_ext_full(t->trans_view, node->seq,
+				   ctx->ext_id, &map, &data, &expunged);
+	if (expunged) {
+		/* we don't want to update expunged messages' sort IDs */
+		node->no_update = TRUE;
+		/* we can't trust expunged messages' sort IDs. they might be
+		   valid, but it's also possible that sort IDs were updated
+		   and the expunged messages' sort IDs became invalid. we could
+		   use sort ID if we could know the extension's reset_id at the
+		   time of the expunge so we could compare it to
+		   highest_reset_id, but this isn't currently possible. */
+		node->sort_id = 0;
+	} else {
+		node->sort_id = data == NULL ? 0 : *(const uint32_t *)data;
+	}
+
+	if (node->sort_id != 0) {
+		/* if reset ID increases, lookup all existing messages' sort
+		   IDs again. if it decreases, ignore the sort ID. */
+		if (!mail_index_ext_get_reset_id(t->trans_view, map,
+						 ctx->ext_id, &reset_id))
+			reset_id = 0;
+		if (reset_id != ctx->highest_reset_id) {
+			if (reset_id > ctx->highest_reset_id) {
+				ctx->highest_reset_id = reset_id;
+				index_sort_reget_sort_ids(ctx);
+			} else {
+				i_assert(expunged);
+				node->sort_id = 0;
+			}
+		}
+	}
+
+	if (node->sort_id == 0)
+		array_append(&ctx->zero_nodes, node, 1);
+	else
+		array_append(&ctx->nonzero_nodes, node, 1);
+	if (ctx->last_seq < node->seq)
+		ctx->last_seq = node->seq;
+}
+
+void index_sort_list_add_string(struct mail_search_sort_program *program,
+				struct mail *mail)
+{
+	struct mail_sort_node node;
+
+	memset(&node, 0, sizeof(node));
+	node.seq = mail->seq;
+	node.wanted = TRUE;
+
+	index_sort_node_add(program->context, &node);
+}
+
+static int sort_node_zero_string_cmp(const void *p1, const void *p2)
+{
+	const struct mail_sort_node *n1 = p1, *n2 = p2;
+
+	return strcmp(static_zero_cmp_context->sort_strings[n1->seq],
+		      static_zero_cmp_context->sort_strings[n2->seq]);
+}
+
+static void index_sort_zeroes(struct sort_string_context *ctx)
+{
+	struct mail *mail = ctx->program->temp_mail;
+	enum mail_sort_type sort_type = ctx->program->sort_program[0];
+	string_t *str;
+	pool_t pool;
+	struct mail_sort_node *nodes;
+	unsigned int i, count;
+
+	/* first get all the messages' sort strings. although this takes more
+	   memory, it makes error handling easier and probably also helps
+	   CPU caching. */
+	ctx->sort_strings = i_new(const char *, ctx->last_seq + 1);
+	ctx->sort_string_pool = pool =
+		pool_alloconly_create("sort strings", 1024*64);
+	str = t_str_new(512);
+	nodes = array_get_modifiable(&ctx->zero_nodes, &count);
+	for (i = 0; i < count; i++) {
+		i_assert(nodes[i].seq <= ctx->last_seq);
+
+		index_sort_header_get(mail, nodes[i].seq, sort_type, str);
+		ctx->sort_strings[nodes[i].seq] = str_len(str) == 0 ? "" :
+			p_strdup(pool, str_c(str));
+	}
+
+	/* we have all strings, sort nodes based on them */
+	static_zero_cmp_context = ctx;
+	qsort(nodes, count, sizeof(struct mail_sort_node),
+	      sort_node_zero_string_cmp);
+}
+
+static const char *
+index_sort_get_expunged_string(struct sort_string_context *ctx, uint32_t idx,
+			       string_t *str)
+{
+	struct mail *mail = ctx->program->temp_mail;