dovecot-2.2: lib-fts: ICU normalization changes some characters ...

dovecot at dovecot.org dovecot at dovecot.org
Fri May 22 02:05:24 UTC 2015


details:   http://hg.dovecot.org/dovecot-2.2/rev/5aae57dc5ad6
changeset: 18735:5aae57dc5ad6
user:      Timo Sirainen <tss at iki.fi>
date:      Thu May 21 22:03:10 2015 -0400
description:
lib-fts: ICU normalization changes some characters to spaces - remove them.
We don't really want to add spaces to our index. It would be nice if the
words separated by those spaces were actually split into different tokens,
but that's more of the fts-tokenizer's job, and at the filter stage that's
probably not wanted anymore.

diffstat:

 src/lib-fts/fts-filter-normalizer-icu.c |  2 +-
 src/lib-fts/test-fts-filter.c           |  5 +++--
 2 files changed, 4 insertions(+), 3 deletions(-)

diffs (39 lines):

diff -r 191eaf662c21 -r 5aae57dc5ad6 src/lib-fts/fts-filter-normalizer-icu.c
--- a/src/lib-fts/fts-filter-normalizer-icu.c	Thu May 21 08:38:56 2015 -0400
+++ b/src/lib-fts/fts-filter-normalizer-icu.c	Thu May 21 22:03:10 2015 -0400
@@ -41,7 +41,7 @@
 	struct fts_filter_normalizer_icu *np;
 	pool_t pp;
 	unsigned int i;
-	const char *id = "Any-Lower; NFKD; [: Nonspacing Mark :] Remove; NFC";
+	const char *id = "Any-Lower; NFKD; [: Nonspacing Mark :] Remove; NFC; [\\x20] Remove";
 
 	for (i = 0; settings[i] != NULL; i += 2) {
 		const char *key = settings[i], *value = settings[i+1];
diff -r 191eaf662c21 -r 5aae57dc5ad6 src/lib-fts/test-fts-filter.c
--- a/src/lib-fts/test-fts-filter.c	Thu May 21 08:38:56 2015 -0400
+++ b/src/lib-fts/test-fts-filter.c	Thu May 21 22:03:10 2015 -0400
@@ -372,7 +372,7 @@
 		"vem",
 		"a",
 		"aao",
-		"vem kan segla forutan vind?\naaooaa"
+		"vemkanseglaforutanvind?\naaooaa"
 	};
 	const char *error = NULL;
 	const char *token = NULL;
@@ -446,12 +446,13 @@
 {
 	/* test just a couple of these */
 	static const char *empty_tokens[] = {
+		"\xC2\xAF", /* U+00AF */
 		"\xCC\x80", /* U+0300 */
 		"\xF3\xA0\x87\xAF", /* U+E01EF */
 		"\xCC\x80\xF3\xA0\x87\xAF" /* U+0300 U+E01EF */
 	};
 	const char * const settings[] =
-		{"id", "Any-Lower; NFKD; [: Nonspacing Mark :] Remove", NULL};
+		{"id", "Any-Lower; NFKD; [: Nonspacing Mark :] Remove; [\\x20] Remove", NULL};
 	struct fts_filter *norm;
 	const char *error;
 	unsigned int i;


More information about the dovecot-cvs mailing list