dovecot-2.2: lib-fts: Fix simple tokenizer apostrophe handling.

Thu May 21 10:39:16 UTC 2015

details:   http://hg.dovecot.org/dovecot-2.2/rev/6c655ce3b857
changeset: 18732:6c655ce3b857
user:      Teemu Huovila <teemu.huovila at dovecot.fi>
date:      Thu May 21 06:29:15 2015 -0400
description:
lib-fts: Fix simple tokenizer apostrophe handling.
Apostrophes and quotation marks are now treated as word breaks,
except U+0027 between non-wordbreak characters. The characters
U+2019 and U+FF07 are transformed to U+0027 before processing.

diffstat:

 src/lib-fts/fts-tokenizer-generic-private.h |    3 +-
 src/lib-fts/fts-tokenizer-generic.c         |  110 ++++++++++++++++++++-------
 src/lib-fts/test-fts-tokenizer.c            |    9 ++
 src/lib-fts/word-properties.pl              |    2 +-
 4 files changed, 93 insertions(+), 31 deletions(-)

diffs (254 lines):

diff -r 5ca59cffbf2f -r 6c655ce3b857 src/lib-fts/fts-tokenizer-generic-private.h

--- a/src/lib-fts/fts-tokenizer-generic-private.h	Thu May 21 06:17:32 2015 -0400
+++ b/src/lib-fts/fts-tokenizer-generic-private.h	Thu May 21 06:29:15 2015 -0400
@@ -40,8 +40,7 @@
 	struct fts_tokenizer tokenizer;
 	unsigned int max_length;
 	enum boundary_algorithm algorithm;
-	enum letter_type prev_letter; /* These two are basically the
-	                                     state of the parsing. */
+	enum letter_type prev_letter;
 	enum letter_type prev_prev_letter;
 	size_t last_size; /* Bytes in latest utf8 character. */
 	buffer_t *token;
diff -r 5ca59cffbf2f -r 6c655ce3b857 src/lib-fts/fts-tokenizer-generic.c
--- a/src/lib-fts/fts-tokenizer-generic.c	Thu May 21 06:17:32 2015 -0400
+++ b/src/lib-fts/fts-tokenizer-generic.c	Thu May 21 06:29:15 2015 -0400
@@ -11,7 +11,12 @@
 
 #define FTS_DEFAULT_TOKEN_MAX_LENGTH 30
 
-static unsigned char fts_ascii_word_boundaries[128] = {
+#define IS_NONASCII_APOSTROPHE(c) \
+	((c) == 0x2019 || (c) == 0xFF07)
+#define IS_APOSTROPHE(c) \
+	((c) == 0x0027 || IS_NONASCII_APOSTROPHE(c))
+
+static unsigned char fts_ascii_word_breaks[128] = {
 	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0-15 */
 	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 16-31 */
 
@@ -95,34 +100,60 @@
 	return t_strndup(data, pos);
 }
 
-static void
+static bool
 fts_tokenizer_generic_simple_current_token(struct generic_fts_tokenizer *tok,
                                            const char **token_r)
 {
-	*token_r = fts_uni_strndup(tok->token->data, tok->token->used);
+	const unsigned char *data;
+	size_t start = 0, len;
+
+	/* clean trailing and starting apostrophes. they were all made
+	   into U+0027 earlier. */
+	data = tok->token->data;
+	len = tok->token->used;
+	while (len > 0 && data[len - 1] == '\'')
+		len--;
+	while (start < len && data[start] == '\'')
+		start++;
+
+	*token_r = len - start == 0 ? "" :
+		fts_uni_strndup(CONST_PTR_OFFSET(tok->token->data, start),
+				len - start);
 	buffer_set_used_size(tok->token, 0);
+	return (*token_r)[0] != '\0';
 }
 
-/* TODO: This is duplicated from unichar.c */
 static bool uint32_find(const uint32_t *data, unsigned int count,
 			uint32_t value, unsigned int *idx_r)
 {
 	BINARY_NUMBER_SEARCH(data, count, value, idx_r);
 }
 
-static bool is_word_break(unichar_t c)
+static bool fts_ascii_word_break(unsigned char c)
+{
+	if (c < 0x80)
+		return fts_ascii_word_breaks[c] != 0;
+	return FALSE;
+}
+
+static bool fts_uni_word_break(unichar_t c)
 {
 	unsigned int idx;
 
+	/* Override some apostrophes, which get special treatment. */
+	if (IS_APOSTROPHE(c))
+		return FALSE;
+
 	/* Unicode General Punctuation, including deprecated characters. */
 	if (c >= 0x2000 && c <= 0x206f)
 		return TRUE;
-
 	/* From word-break-data.c, which is generated from PropList.txt. */
 	if (uint32_find(White_Space, N_ELEMENTS(White_Space), c, &idx))
 		return TRUE;
 	if (uint32_find(Dash, N_ELEMENTS(Dash), c, &idx))
 		return TRUE;
+	if (uint32_find(Quotation_Mark, N_ELEMENTS(Quotation_Mark), c, &idx))
+		return TRUE;
 	if (uint32_find(Terminal_Punctuation, N_ELEMENTS(Terminal_Punctuation), c, &idx))
 		return TRUE;
 	if (uint32_find(STerm, N_ELEMENTS(STerm), c, &idx))
@@ -133,17 +164,17 @@
 }
 
 static bool
-data_is_word_boundary(const unsigned char *data, size_t size, size_t *i)
+fts_apostrophe_word_break(struct generic_fts_tokenizer *tok, unichar_t c)
 {
-	unichar_t c;
-
-	if (data[*i] < 0x80)
-		return fts_ascii_word_boundaries[data[*i]] != 0;
-	/* unicode punctuation? */
-	if (uni_utf8_get_char_n(data + *i, size - *i, &c) <= 0)
-		i_unreached();
-	*i += uni_utf8_char_bytes(data[*i]) - 1;
-	return is_word_break(c);
+	if (IS_APOSTROPHE(c)) {
+		if (tok->prev_letter == LETTER_TYPE_SINGLE_QUOTE)
+			return TRUE;
+		else
+			tok->prev_letter = LETTER_TYPE_SINGLE_QUOTE;
+	} else {
+		tok->prev_letter = LETTER_TYPE_NONE;
+	}
+	return FALSE;
 }
 
 static void fts_tokenizer_generic_reset(struct fts_tokenizer *_tok)
@@ -160,10 +191,26 @@
 static void tok_append_truncated(struct generic_fts_tokenizer *tok,
 				 const unsigned char *data, size_t size)
 {
+	size_t append_len, pos = 0, appended = 0;
+	unichar_t c;
+
 	i_assert(tok->max_length >= tok->token->used);
+	append_len = I_MIN(size, tok->max_length - tok->token->used);
 
-	buffer_append(tok->token, data,
-		      I_MIN(size, tok->max_length - tok->token->used));
+	/* Append only one kind of apostrophes. Simplifies things when returning
+	   token. */
+	while (pos < append_len) {
+		if (uni_utf8_get_char_n(data + pos, size - pos, &c) <= 0)
+			i_unreached();
+		if (IS_NONASCII_APOSTROPHE(c)) {
+			buffer_append(tok->token, data, pos);
+			buffer_append_c(tok->token, '\'');
+			appended = pos + 1;
+		}
+		pos += uni_utf8_char_bytes(data[pos]);
+	}
+	if (appended < append_len)
+		buffer_append(tok->token, data + appended, append_len - appended);
 }
 
 static int
@@ -175,21 +222,27 @@
 	struct generic_fts_tokenizer *tok =
 		(struct generic_fts_tokenizer *)_tok;
 	size_t i, char_start_i, len, start = 0;
+	unsigned int char_size;
+	unichar_t c;
 
-	for (i = 0; i < size; i++) {
+	for (i = 0; i < size; i += char_size) {
 		char_start_i = i;
-		if (data_is_word_boundary(data, size, &i)) {
+		if (uni_utf8_get_char_n(data + i, size - i, &c) <= 0)
+			i_unreached();
+		char_size = uni_utf8_char_bytes(data[i]);
+		if (fts_ascii_word_break(data[i]) || fts_uni_word_break(c) ||
+		    fts_apostrophe_word_break(tok, c)) {
 			len = char_start_i - start;
 			tok_append_truncated(tok, data + start, len);
 			if (tok->token->used == 0) {
-				/* no text read yet */
-				start = i + 1;
+				start = i + char_size;
 				continue;
 			}
-			/* word boundary found - return a new token */
-			*skip_r = i + 1;
-			fts_tokenizer_generic_simple_current_token(tok, token_r);
-			return 1;
+
+			if (fts_tokenizer_generic_simple_current_token(tok, token_r)) {
+				*skip_r = i + char_size;
+				return 1;
+			}
 		}
 	}
 	/* word boundary not found yet */
@@ -199,9 +252,10 @@
 
 	/* return the last token */
 	if (size == 0 && tok->token->used > 0) {
-		fts_tokenizer_generic_simple_current_token(tok, token_r);
-		return 1;
+		if (fts_tokenizer_generic_simple_current_token(tok, token_r))
+			return 1;
 	}
+
 	return 0;
 }
 
diff -r 5ca59cffbf2f -r 6c655ce3b857 src/lib-fts/test-fts-tokenizer.c
--- a/src/lib-fts/test-fts-tokenizer.c	Thu May 21 06:17:32 2015 -0400
+++ b/src/lib-fts/test-fts-tokenizer.c	Thu May 21 06:29:15 2015 -0400
@@ -29,6 +29,8 @@
 
 	"1.",
 
+	"'quoted text' 'word' 'hlo words' you're bad'''word '''pre post'''",
+
 	/* whitespace: with Unicode(utf8) U+FF01(ef bc 81)(U+2000(e2 80 80) and
 	   U+205A(e2 81 9a) and U+205F(e2 81 9f) */
 	"hello\xEF\xBC\x81world\r\nAnd\xE2\x80\x80there\twas: text "
@@ -99,6 +101,7 @@
 		outi++;
 	}
 	test_assert_idx(expected_output[outi] == NULL, outi);
+
 	return outi+1;
 }
 
@@ -130,6 +133,9 @@
 
 		"1", NULL,
 
+		"quoted", "text", "word", "hlo", "words", "you're", "bad",
+		"word", "pre", "post", NULL,
+
 		"hello", "world", "And",
 		"there", "was", "text", "galore",
 		"and", "more", NULL,
@@ -169,6 +175,9 @@
 
 		"1", NULL,
 
+		"quoted", "text", "word", "hlo", "words", "you're", "bad",
+		"word", "pre", "post", NULL,
+
 		"hello", "world", "And",
 		"there", "was", "text", "galore",
 		"and", "more", NULL,
diff -r 5ca59cffbf2f -r 6c655ce3b857 src/lib-fts/word-properties.pl
--- a/src/lib-fts/word-properties.pl	Thu May 21 06:17:32 2015 -0400
+++ b/src/lib-fts/word-properties.pl	Thu May 21 06:29:15 2015 -0400
@@ -8,7 +8,7 @@
     @categories = qw(CR LF Newline Extend Regional_Indicator Format Katakana Hebrew_Letter ALetter
 		    Single_Quote Double_Quote MidNumLet MidLetter MidNum Numeric ExtendNumLet);
 } elsif ($which eq 'breaks') {
-    @categories = qw(White_Space Dash Terminal_Punctuation STerm Pattern_White_Space);
+    @categories = qw(White_Space Dash Quotation_Mark Terminal_Punctuation STerm Pattern_White_Space);
 } else {
     die "specify 'boundaries' or 'breaks'";
 }