dovecot-2.2: fts-lucene: Support normalize setting also without snowball. Added no_snowball setting.

dovecot at dovecot.org dovecot at dovecot.org
Sun Jun 9 03:10:55 EEST 2013


details:   http://hg.dovecot.org/dovecot-2.2/rev/7e54af474ea4
changeset: 16485:7e54af474ea4
user:      Timo Sirainen <tss at iki.fi>
date:      Sun Jun 09 03:10:43 2013 +0300
description:
fts-lucene: Support normalize setting also without snowball. Added no_snowball setting.
Snowball seems to convert words and break them down rather annoyingly, hence the new setting to disable it.

diffstat:

 src/plugins/fts-lucene/fts-lucene-plugin.c |   9 ++---
 src/plugins/fts-lucene/fts-lucene-plugin.h |   1 +
 src/plugins/fts-lucene/lucene-wrapper.cc   |  43 ++++++++++++++++++++++++-----
 3 files changed, 40 insertions(+), 13 deletions(-)

diffs (126 lines):

diff -r 1f3f21081ee5 -r 7e54af474ea4 src/plugins/fts-lucene/fts-lucene-plugin.c
--- a/src/plugins/fts-lucene/fts-lucene-plugin.c	Sun Jun 09 02:49:48 2013 +0300
+++ b/src/plugins/fts-lucene/fts-lucene-plugin.c	Sun Jun 09 03:10:43 2013 +0300
@@ -30,6 +30,8 @@
 			set->whitespace_chars = p_strdup(user->pool, *tmp + 17);
 		} else if (strcmp(*tmp, "normalize") == 0) {
 			set->normalize = TRUE;
+		} else if (strcmp(*tmp, "no_snowball") == 0) {
+			set->no_snowball = TRUE;
 		} else {
 			i_error("fts_lucene: Invalid setting: %s", *tmp);
 			return -1;
@@ -51,11 +53,6 @@
 			"but Dovecot built without stemmer support");
 		return -1;
 	}
-	if (set->normalize) {
-		i_error("fts_lucene: normalize not currently supported "
-			"without stemmer support");
-		return -1;
-	}
 #else
 	if (set->default_language == NULL)
 		set->default_language = "english";
@@ -80,6 +77,8 @@
 	crc = crc32_str_more(crc, set->whitespace_chars);
 	if (set->normalize)
 		crc = crc32_str_more(crc, "n");
+	if (set->no_snowball)
+		crc = crc32_str_more(crc, "s");
 	return crc;
 }
 
diff -r 1f3f21081ee5 -r 7e54af474ea4 src/plugins/fts-lucene/fts-lucene-plugin.h
--- a/src/plugins/fts-lucene/fts-lucene-plugin.h	Sun Jun 09 02:49:48 2013 +0300
+++ b/src/plugins/fts-lucene/fts-lucene-plugin.h	Sun Jun 09 03:10:43 2013 +0300
@@ -13,6 +13,7 @@
 	const char *textcat_conf, *textcat_dir;
 	const char *whitespace_chars;
 	bool normalize;
+	bool no_snowball;
 };
 
 struct fts_lucene_user {
diff -r 1f3f21081ee5 -r 7e54af474ea4 src/plugins/fts-lucene/lucene-wrapper.cc
--- a/src/plugins/fts-lucene/lucene-wrapper.cc	Sun Jun 09 02:49:48 2013 +0300
+++ b/src/plugins/fts-lucene/lucene-wrapper.cc	Sun Jun 09 03:10:43 2013 +0300
@@ -67,6 +67,7 @@
 	IndexWriter *writer;
 	IndexSearcher *searcher;
 
+	buffer_t *normalizer_buf;
 	Analyzer *default_analyzer, *cur_analyzer;
 	ARRAY(struct lucene_analyzer) analyzers;
 
@@ -118,13 +119,20 @@
 		index->set.default_language = "";
 	}
 #ifdef HAVE_LUCENE_STEMMER
-	index->default_analyzer =
-		_CLNEW snowball::SnowballAnalyzer(index->normalizer,
-						  index->set.default_language);
-#else
-	index->default_analyzer = _CLNEW standard::StandardAnalyzer();
-	i_assert(index->normalizer == NULL);
+	if (!set->no_snowball) {
+		index->default_analyzer =
+			_CLNEW snowball::SnowballAnalyzer(index->normalizer,
+							  index->set.default_language);
+	}
 #endif
+	else {
+		index->default_analyzer = _CLNEW standard::StandardAnalyzer();
+		if (index->normalizer != NULL) {
+			index->normalizer_buf =
+				buffer_create_dynamic(default_pool, 1024);
+		}
+	}
+
 	i_array_init(&index->analyzers, 32);
 	textcat_refcount++;
 
@@ -155,6 +163,8 @@
 		textcat = NULL;
 	}
 	_CLDELETE(index->default_analyzer);
+	if (index->normalizer_buf != NULL)
+		buffer_free(&index->normalizer_buf);
 	i_free(index->path);
 	i_free(index);
 }
@@ -517,6 +527,13 @@
 		index->doc->add(*_CLNEW Field(_T("box"), index->mailbox_guid, Field::STORE_YES | Field::INDEX_UNTOKENIZED));
 	}
 
+	if (index->normalizer_buf != NULL) {
+		buffer_set_used_size(index->normalizer_buf, 0);
+		index->normalizer(data, size, index->normalizer_buf);
+		data = (const unsigned char *)index->normalizer_buf->data;
+		size = index->normalizer_buf->used;
+	}
+
 	datasize = uni_utf8_strlen_n(data, size) + 1;
 	wchar_t dest[datasize];
 	lucene_utf8_n_to_tchar(data, size, dest, datasize);
@@ -1055,8 +1072,18 @@
 lucene_get_query_str(struct lucene_index *index,
 		     const TCHAR *key, const char *str, bool fuzzy)
 {
-	const TCHAR *wvalue = t_lucene_utf8_to_tchar(index, str, TRUE);
-	Analyzer *analyzer = guess_analyzer(index, str, strlen(str));
+	const TCHAR *wvalue;
+	Analyzer *analyzer;
+
+	if (index->normalizer_buf != NULL) {
+		buffer_set_used_size(index->normalizer_buf, 0);
+		index->normalizer(str, strlen(str), index->normalizer_buf);
+		buffer_append_c(index->normalizer_buf, '\0');
+		str = (const char *)index->normalizer_buf->data;
+	}
+
+	wvalue = t_lucene_utf8_to_tchar(index, str, TRUE);
+	analyzer = guess_analyzer(index, str, strlen(str));
 	if (analyzer == NULL)
 		analyzer = index->default_analyzer;
 


More information about the dovecot-cvs mailing list