dovecot-2.0: fts-lucene: Fixes to work with CLucene v2.3.3.4 and new FTS API.

dovecot at dovecot.org dovecot at dovecot.org
Tue May 31 15:38:11 EEST 2011


details:   http://hg.dovecot.org/dovecot-2.0/rev/9ae30e5d6935
changeset: 12829:9ae30e5d6935
user:      Timo Sirainen <tss at iki.fi>
date:      Tue May 31 15:38:03 2011 +0300
description:
fts-lucene: Fixes to work with CLucene v2.3.3.4 and new FTS API.
It's still not recommended to actually use this.

diffstat:

 src/plugins/fts-lucene/Makefile.am          |    2 +-
 src/plugins/fts-lucene/fts-backend-lucene.c |   71 ++++++++++---
 src/plugins/fts-lucene/lucene-wrapper.cc    |  149 ++++++++++++---------------
 3 files changed, 121 insertions(+), 101 deletions(-)

diffs (truncated from 397 to 300 lines):

diff -r 7a7c22755b7a -r 9ae30e5d6935 src/plugins/fts-lucene/Makefile.am
--- a/src/plugins/fts-lucene/Makefile.am	Tue May 31 15:36:22 2011 +0300
+++ b/src/plugins/fts-lucene/Makefile.am	Tue May 31 15:38:03 2011 +0300
@@ -12,7 +12,7 @@
 	lib21_fts_lucene_plugin.la
 
 lib21_fts_lucene_plugin_la_LIBADD = \
-	-lclucene
+	-lclucene-shared -lclucene-core
 
 lib21_fts_lucene_plugin_la_SOURCES = \
 	fts-lucene-plugin.c \
diff -r 7a7c22755b7a -r 9ae30e5d6935 src/plugins/fts-lucene/fts-backend-lucene.c
--- a/src/plugins/fts-lucene/fts-backend-lucene.c	Tue May 31 15:36:22 2011 +0300
+++ b/src/plugins/fts-lucene/fts-backend-lucene.c	Tue May 31 15:38:03 2011 +0300
@@ -24,8 +24,13 @@
 	struct fts_backend backend;
 	struct lucene_mail_storage *lstorage;
 	struct mailbox *box;
+};
 
-	uint32_t last_uid;
+struct lucene_fts_backend_build_context {
+	struct fts_backend_build_context ctx;
+
+	uint32_t uid;
+	bool hdr;
 };
 
 static MODULE_CONTEXT_DEFINE_INIT(fts_lucene_storage_module,
@@ -110,38 +115,69 @@
 {
 	struct lucene_fts_backend *backend =
 		(struct lucene_fts_backend *)_backend;
-	struct fts_backend_build_context *ctx;
+	struct lucene_fts_backend_build_context *ctx;
+	uint32_t last_uid;
 
 	fts_backend_select(backend);
 	if (lucene_index_build_init(backend->lstorage->index,
-				    &backend->last_uid) < 0)
+				    &last_uid) < 0)
 		return -1;
 
-	ctx = i_new(struct fts_backend_build_context, 1);
-	ctx->backend = _backend;
+	ctx = i_new(struct lucene_fts_backend_build_context, 1);
+	ctx->ctx.backend = _backend;
+	ctx->uid = last_uid + 1;
 
-	*last_uid_r = backend->last_uid;
-	*ctx_r = ctx;
+	*last_uid_r = last_uid;
+	*ctx_r = &ctx->ctx;
 	return 0;
 }
 
+static void
+fts_backend_lucene_build_hdr(struct fts_backend_build_context *_ctx,
+			     uint32_t uid)
+{
+	struct lucene_fts_backend_build_context *ctx =
+		(struct lucene_fts_backend_build_context *)_ctx;
+
+	i_assert(uid >= ctx->uid);
+
+	ctx->uid = uid;
+	ctx->hdr = TRUE;
+}
+
+static bool
+fts_backend_lucene_build_body_begin(struct fts_backend_build_context *_ctx,
+				    uint32_t uid, const char *content_type,
+				    const char *content_disposition ATTR_UNUSED)
+{
+	struct lucene_fts_backend_build_context *ctx =
+		(struct lucene_fts_backend_build_context *)_ctx;
+
+	i_assert(uid >= ctx->uid);
+
+	if (!fts_backend_default_can_index(content_type))
+		return FALSE;
+
+	ctx->uid = uid;
+	ctx->hdr = FALSE;
+	return TRUE;
+}
+
 static int
-fts_backend_lucene_build_more(struct fts_backend_build_context *ctx,
-			      uint32_t uid, const unsigned char *data,
-			      size_t size, bool headers)
+fts_backend_lucene_build_more(struct fts_backend_build_context *_ctx,
+			      const unsigned char *data, size_t size)
 {
+	struct lucene_fts_backend_build_context *ctx =
+		(struct lucene_fts_backend_build_context *)_ctx;
 	struct lucene_fts_backend *backend =
-		(struct lucene_fts_backend *)ctx->backend;
+		(struct lucene_fts_backend *)_ctx->backend;
 
-	if (ctx->failed)
+	if (_ctx->failed)
 		return -1;
 
-	i_assert(uid >= backend->last_uid);
-	backend->last_uid = uid;
-
 	i_assert(backend->lstorage->selected_box == backend->box);
 	return lucene_index_build_more(backend->lstorage->index,
-				       uid, data, size, headers);
+				       ctx->uid, data, size, ctx->hdr);
 }
 
 static int
@@ -212,6 +248,9 @@
 		fts_backend_lucene_get_last_uid,
 		NULL,
 		fts_backend_lucene_build_init,
+		fts_backend_lucene_build_hdr,
+		fts_backend_lucene_build_body_begin,
+		NULL,
 		fts_backend_lucene_build_more,
 		fts_backend_lucene_build_deinit,
 		fts_backend_lucene_expunge,
diff -r 7a7c22755b7a -r 9ae30e5d6935 src/plugins/fts-lucene/lucene-wrapper.cc
--- a/src/plugins/fts-lucene/lucene-wrapper.cc	Tue May 31 15:36:22 2011 +0300
+++ b/src/plugins/fts-lucene/lucene-wrapper.cc	Tue May 31 15:38:03 2011 +0300
@@ -33,8 +33,7 @@
 
 struct lucene_index {
 	char *path, *lock_path;
-	char *mailbox_name;
-	TCHAR *tmailbox_name;
+	wchar_t *mailbox_name;
 
 	time_t last_stale_check;
 	bool lock_error;
@@ -48,45 +47,6 @@
 	uint32_t prev_uid, last_uid;
 };
 
-class RawTokenStream : public TokenStream {
-	CL_NS(util)::Reader *reader;
-
-public:
-	RawTokenStream(CL_NS(util)::Reader *reader) {
-		this->reader = reader;
-	};
-
-	bool next(Token *token) {
-		const TCHAR *data;
-
-		int32_t len = this->reader->read(data);
-		if (len <= 0)
-			return false;
-
-		token->set(data, 0, len);
-		return true;
-	}
-
-	void close() { }
-};
-
-class DovecotAnalyzer : public standard::StandardAnalyzer {
-public:
-	TokenStream *tokenStream(const TCHAR *fieldName,
-				 CL_NS(util)::Reader *reader) {
-		/* Everything except body/headers should go as-is without any
-		   modifications. Isn't there any easier way to do this than
-		   to implement a whole new RawTokenStream?.. */
-		if (fieldName != 0 &&
-		    wcscmp(fieldName, L"headers") != 0 &&
-		    wcscmp(fieldName, L"body") != 0)
-			return _CLNEW RawTokenStream(reader);
-
-		return standard::StandardAnalyzer::
-			tokenStream(fieldName, reader);
-	}
-};
-
 static bool lucene_dir_scan(const char *dir, const char *skip_path,
 			    time_t stale_stamp, bool unlink_staled)
 {
@@ -174,7 +134,7 @@
 	index = i_new(struct lucene_index, 1);
 	index->path = i_strdup(path);
 	index->lock_path = i_strdup(lock_path);
-	index->analyzer = _CLNEW DovecotAnalyzer();
+	index->analyzer = _CLNEW standard::StandardAnalyzer();
 
 	lucene_delete_stale_locks(index);
 	return index;
@@ -192,24 +152,54 @@
 	lucene_index_close(index);
 	_CLDELETE(index->analyzer);
 	i_free(index->mailbox_name);
-	i_free(index->tmailbox_name);
 	i_free(index->path);
 	i_free(index->lock_path);
 	i_free(index);
 }
 
+static void
+lucene_utf8_to_tchar(const char *src, wchar_t *dest, size_t destsize)
+{
+	ARRAY_TYPE(unichars) dest_arr;
+	buffer_t buf = { 0, 0 };
+
+	i_assert(sizeof(wchar_t) == sizeof(unichar_t));
+
+	buffer_create_data(&buf, dest, sizeof(wchar_t) * destsize);
+	array_create_from_buffer(&dest_arr, &buf, sizeof(wchar_t));
+	if (uni_utf8_to_ucs4(src, &dest_arr) < 0)
+		i_unreached();
+	i_assert(array_count(&dest_arr)+1 == destsize);
+	dest[destsize-1] = 0;
+}
+
+static void
+lucene_utf8_n_to_tchar(const unsigned char *src, size_t srcsize,
+		       wchar_t *dest, size_t destsize)
+{
+	ARRAY_TYPE(unichars) dest_arr;
+	buffer_t buf = { 0, 0 };
+
+	i_assert(sizeof(wchar_t) == sizeof(unichar_t));
+
+	buffer_create_data(&buf, dest, sizeof(wchar_t) * destsize);
+	array_create_from_buffer(&dest_arr, &buf, sizeof(wchar_t));
+	if (uni_utf8_to_ucs4_n(src, srcsize, &dest_arr) < 0)
+		i_unreached();
+	i_assert(array_count(&dest_arr)+1 == destsize);
+	dest[destsize-1] = 0;
+}
+
 void lucene_index_select_mailbox(struct lucene_index *index,
 				 const char *mailbox_name)
 {
-	size_t len;
+	size_t size;
 
 	i_free(index->mailbox_name);
-	i_free(index->tmailbox_name);
 
-	len = strlen(mailbox_name);
-	index->mailbox_name = i_strdup(mailbox_name);
-	index->tmailbox_name = i_new(TCHAR, len + 1);
-	STRCPY_AtoT(index->tmailbox_name, mailbox_name, len);
+	size = uni_utf8_strlen_n(mailbox_name, (size_t)-1) + 1;
+	index->mailbox_name = i_new(wchar_t, size);
+	lucene_utf8_to_tchar(mailbox_name, index->mailbox_name, size);
 }
 
 static void lucene_handle_error(struct lucene_index *index, CLuceneError &err,
@@ -265,7 +255,7 @@
 		   const TCHAR *field_name, uint32_t *uid_r)
 {
 	Field *field = doc->getField(field_name);
-	TCHAR *uid = field == NULL ? NULL : field->stringValue();
+	const TCHAR *uid = field == NULL ? NULL : field->stringValue();
 	if (uid == NULL) {
 		i_error("lucene: Corrupted FTS index %s: No UID for document",
 			index->path);
@@ -298,7 +288,7 @@
 	   if there are more than one, delete the smaller ones. this is normal
 	   behavior because we can't update/delete documents in writer, so
 	   we'll do it only in here.. */
-	Term mailbox_term(_T("box"), index->tmailbox_name);
+	Term mailbox_term(_T("box"), index->mailbox_name);
 	Term last_uid_term(_T("last_uid"), _T("*"));
 	TermQuery mailbox_query(&mailbox_term);
 	WildcardQuery last_uid_query(&last_uid_term);
@@ -421,49 +411,43 @@
 			    const unsigned char *data, size_t size,
 			    bool headers)
 {
-	unsigned int len;
+	size_t destsize;
 
 	i_assert(uid > index->last_uid);
 	i_assert(size > 0);
 
-	len = uni_utf8_strlen_n(data, size);
-	wchar_t dest[len+1];
-	lucene_utf8towcs(dest, (const char *)data, len);
-	dest[len] = 0;
+	destsize = uni_utf8_strlen_n(data, size) + 1;
+	wchar_t dest[destsize];
+	lucene_utf8_n_to_tchar(data, size, dest, destsize);
 
 	if (uid != index->prev_uid) {
-		char id[MAX_INT_STRLEN];
-		TCHAR tid[MAX_INT_STRLEN];
+		wchar_t id[MAX_INT_STRLEN];
 
 		if (lucene_index_build_flush(index) < 0)
 			return -1;
 		index->prev_uid = uid;
 


More information about the dovecot-cvs mailing list