dovecot-2.1: fts-lucene: Added whitespace_chars subsetting to ft...

dovecot at dovecot.org dovecot at dovecot.org
Fri Nov 4 19:25:09 EET 2011


details:   http://hg.dovecot.org/dovecot-2.1/rev/6d483a22134e
changeset: 13646:6d483a22134e
user:      Timo Sirainen <tss at iki.fi>
date:      Fri Nov 04 19:35:30 2011 +0200
description:
fts-lucene: Added whitespace_chars subsetting to fts_lucene.
A value of "@." could be useful so that user@domain.tld allows searching
user, domain and tld separately instead of requiring the whole string to
match.

diffstat:

 src/plugins/fts-lucene/fts-lucene-plugin.c |  12 +++++++--
 src/plugins/fts-lucene/fts-lucene-plugin.h |   1 +
 src/plugins/fts-lucene/lucene-wrapper.cc   |  35 ++++++++++++++++++++++++-----
 3 files changed, 39 insertions(+), 9 deletions(-)

diffs (129 lines):

diff -r b6e5cf112b3e -r 6d483a22134e src/plugins/fts-lucene/fts-lucene-plugin.c
--- a/src/plugins/fts-lucene/fts-lucene-plugin.c	Fri Nov 04 18:50:24 2011 +0200
+++ b/src/plugins/fts-lucene/fts-lucene-plugin.c	Fri Nov 04 19:35:30 2011 +0200
@@ -26,6 +26,8 @@
 			set->textcat_conf = p_strdup(user->pool, *tmp + 13);
 		} else if (strncmp(*tmp, "textcat_dir=", 12) == 0) {
 			set->textcat_dir = p_strdup(user->pool, *tmp + 12);
+		} else if (strncmp(*tmp, "whitespace_chars=", 17) == 0) {
+			set->whitespace_chars = p_strdup(user->pool, *tmp + 17);
 		} else {
 			i_error("fts_lucene: Invalid setting: %s", *tmp);
 			return -1;
@@ -39,6 +41,8 @@
 		i_error("fts_lucene: textcat_dir set, but textcat_conf unset");
 		return -1;
 	}
+	if (set->whitespace_chars == NULL)
+		set->whitespace_chars = "";
 #ifndef HAVE_LUCENE_STEMMER
 	if (set->default_language != NULL) {
 		i_error("fts_lucene: default_language set, "
@@ -61,9 +65,11 @@
 
 uint32_t fts_lucene_settings_checksum(const struct fts_lucene_settings *set)
 {
-	/* only the default language change matters */
-	return set->default_language == NULL ? 0 :
-		crc32_str(set->default_language);
+	uint32_t crc;
+
+	crc = crc32_str(set->default_language);
+	crc = crc32_str_more(crc, set->whitespace_chars);
+	return crc;
 }
 
 static void fts_lucene_mail_user_created(struct mail_user *user)
diff -r b6e5cf112b3e -r 6d483a22134e src/plugins/fts-lucene/fts-lucene-plugin.h
--- a/src/plugins/fts-lucene/fts-lucene-plugin.h	Fri Nov 04 18:50:24 2011 +0200
+++ b/src/plugins/fts-lucene/fts-lucene-plugin.h	Fri Nov 04 19:35:30 2011 +0200
@@ -11,6 +11,7 @@
 struct fts_lucene_settings {
 	const char *default_language;
 	const char *textcat_conf, *textcat_dir;
+	const char *whitespace_chars;
 };
 
 struct fts_lucene_user {
diff -r b6e5cf112b3e -r 6d483a22134e src/plugins/fts-lucene/lucene-wrapper.cc
--- a/src/plugins/fts-lucene/lucene-wrapper.cc	Fri Nov 04 18:50:24 2011 +0200
+++ b/src/plugins/fts-lucene/lucene-wrapper.cc	Fri Nov 04 19:35:30 2011 +0200
@@ -143,6 +143,21 @@
 	i_free(index);
 }
 
+static void lucene_data_translate(struct lucene_index *index,
+				  wchar_t *data, unsigned int len)
+{
+	const char *whitespace_chars = index->set.whitespace_chars;
+	unsigned int i;
+
+	if (*whitespace_chars == '\0')
+		return;
+
+	for (i = 0; i < len; i++) {
+		if (strchr(whitespace_chars, data[i]) != NULL)
+			data[i] = ' ';
+	}
+}
+
 void lucene_utf8_n_to_tchar(const unsigned char *src, size_t srcsize,
 			    wchar_t *dest, size_t destsize)
 {
@@ -159,10 +174,14 @@
 	dest[destsize-1] = 0;
 }
 
-static const wchar_t *t_lucene_utf8_to_tchar(const char *str)
+static const wchar_t *
+t_lucene_utf8_to_tchar(struct lucene_index *index,
+		       const char *str, bool translate)
 {
 	ARRAY_TYPE(unichars) dest_arr;
-	const unichar_t *ret;
+	const unichar_t *chars;
+	wchar_t *ret;
+	unsigned int len;
 
 	i_assert(sizeof(wchar_t) == sizeof(unichar_t));
 
@@ -170,8 +189,11 @@
 	if (uni_utf8_to_ucs4(str, &dest_arr) < 0)
 		i_unreached();
 	(void)array_append_space(&dest_arr);
-	ret = array_idx(&dest_arr, 0);
-	return (const wchar_t *)ret;
+
+	chars = array_get_modifiable(&dest_arr, &len);
+	ret = (wchar_t *)chars;
+	lucene_data_translate(index, ret, len - 1);
+	return ret;
 }
 
 void lucene_index_select_mailbox(struct lucene_index *index,
@@ -478,6 +500,7 @@
 	datasize = uni_utf8_strlen_n(data, size) + 1;
 	wchar_t dest[datasize];
 	lucene_utf8_n_to_tchar(data, size, dest, datasize);
+	lucene_data_translate(index, dest, datasize);
 
 	if (hdr_name != NULL) {
 		/* hdr_name should be ASCII, but don't break in case it isn't */
@@ -1010,7 +1033,7 @@
 lucene_get_query_str(struct lucene_index *index,
 		     const TCHAR *key, const char *str, bool fuzzy)
 {
-	const TCHAR *wvalue = t_lucene_utf8_to_tchar(str);
+	const TCHAR *wvalue = t_lucene_utf8_to_tchar(index, str, TRUE);
 	Analyzer *analyzer = guess_analyzer(index, str, strlen(str));
 	if (analyzer == NULL)
 		analyzer = index->default_analyzer;
@@ -1067,7 +1090,7 @@
 		}
 
 		q = lucene_get_query(index,
-				     t_lucene_utf8_to_tchar(arg->hdr_field_name),
+				     t_lucene_utf8_to_tchar(index, arg->hdr_field_name, FALSE),
 				     arg);
 		break;
 	default:


More information about the dovecot-cvs mailing list