dovecot-2.2: lib-fts: Fixed token truncation.
dovecot at dovecot.org
dovecot at dovecot.org
Sat May 9 15:29:58 UTC 2015
details: http://hg.dovecot.org/dovecot-2.2/rev/dd04199a689f
changeset: 18601:dd04199a689f
user: Timo Sirainen <tss at iki.fi>
date: Sat May 09 17:34:59 2015 +0300
description:
lib-fts: Fixed token truncation.
diffstat:
src/lib-fts/fts-tokenizer-generic.c | 38 +++++++++++++++++++++++++-----------
1 files changed, 26 insertions(+), 12 deletions(-)
diffs (98 lines):
diff -r 99ad974a3828 -r dd04199a689f src/lib-fts/fts-tokenizer-generic.c
--- a/src/lib-fts/fts-tokenizer-generic.c Sat May 09 17:07:32 2015 +0300
+++ b/src/lib-fts/fts-tokenizer-generic.c Sat May 09 17:34:59 2015 +0300
@@ -85,11 +85,20 @@
i_free(tok);
}
+static const char *fts_uni_strndup(const unsigned char *data, size_t size)
+{
+ size_t pos;
+
+ /* if input is truncated with a partial UTF-8 character, drop it */
+ (void)uni_utf8_partial_strlen_n(data, size, &pos);
+ return t_strndup(data, pos);
+}
+
static int
fts_tokenizer_generic_simple_current_token(struct generic_fts_tokenizer *tok,
const char **token_r)
{
- *token_r = t_strndup(tok->token->data, I_MIN(tok->token->used, tok->max_length));
+ *token_r = fts_uni_strndup(tok->token->data, tok->token->used);
buffer_set_used_size(tok->token, 0);
return 1;
}
@@ -148,6 +157,15 @@
buffer_set_used_size(tok->token, 0);
}
+static void tok_append_truncated(struct generic_fts_tokenizer *tok,
+ const unsigned char *data, size_t size)
+{
+ i_assert(tok->max_length >= tok->token->used);
+
+ buffer_append(tok->token, data,
+ I_MIN(size, tok->max_length - tok->token->used));
+}
+
static int
fts_tokenizer_generic_next_simple(struct fts_tokenizer *_tok,
const unsigned char *data, size_t size,
@@ -161,7 +179,7 @@
char_start_i = i;
if (data_is_word_boundary(data, size, &i)) {
len = char_start_i - start;
- buffer_append(tok->token, data + start, len);
+ tok_append_truncated(tok, data + start, len);
if (tok->token->used == 0) {
/* no text read yet */
start = i + 1;
@@ -174,16 +192,12 @@
}
/* word boundary not found yet */
len = i - start;
- buffer_append(tok->token, data + start, len);
+ tok_append_truncated(tok, data + start, len);
*skip_r = i;
/* return the last token */
if (size == 0 && tok->token->used > 0)
return fts_tokenizer_generic_simple_current_token(tok, token_r);
-
- /* token too long */
- if (tok->token->used > tok->max_length)
- return fts_tokenizer_generic_simple_current_token(tok, token_r);
return 0;
}
@@ -488,9 +502,9 @@
if (is_one_past_end(tok))
end_skip = tok->last_size;
- len = I_MIN(tok->token->used, tok->max_length) - end_skip;
+ len = tok->token->used - end_skip;
i_assert(len > 0);
- *token_r = t_strndup(tok->token->data, len);
+ *token_r = fts_uni_strndup(tok->token->data, len);
buffer_set_used_size(tok->token, 0);
tok->prev_prev_letter = LETTER_TYPE_NONE;
tok->prev_letter = LETTER_TYPE_NONE;
@@ -575,14 +589,14 @@
}
if (uni_found_word_boundary(tok, lt)) {
i_assert(char_start_i >= start_skip && size >= start_skip);
- buffer_append(tok->token, data + start_skip,
- char_start_i - start_skip);
+ tok_append_truncated(tok, data + start_skip,
+ char_start_i - start_skip);
*skip_r = i + 1;
return fts_tokenizer_generic_tr29_current_token(tok, token_r);
}
}
i_assert(i >= start_skip && size >= start_skip);
- buffer_append(tok->token, data + start_skip, i - start_skip);
+ tok_append_truncated(tok, data + start_skip, i - start_skip);
*skip_r = i;
if (size == 0 && tok->token->used > 0) {
More information about the dovecot-cvs
mailing list