dovecot: Fixed non-ASCII searches.
dovecot at dovecot.org
dovecot at dovecot.org
Mon Dec 3 14:23:50 EET 2007
details: http://hg.dovecot.org/dovecot/rev/a215deb3de8d
changeset: 6911:a215deb3de8d
user: Timo Sirainen <tss at iki.fi>
date: Mon Dec 03 14:23:45 2007 +0200
description:
Fixed non-ASCII searches.
diffstat:
5 files changed, 143 insertions(+), 52 deletions(-)
src/plugins/fts-squat/squat-trie.c | 152 ++++++++++++++++++++++++++----------
src/plugins/fts/Makefile.am | 1
src/plugins/fts/fts-search.c | 40 ++++++---
src/plugins/fts/fts-storage.c | 1
src/plugins/fts/fts-storage.h | 1
diffs (truncated from 363 to 300 lines):
diff -r c68564884bae -r a215deb3de8d src/plugins/fts-squat/squat-trie.c
--- a/src/plugins/fts-squat/squat-trie.c Mon Dec 03 14:23:19 2007 +0200
+++ b/src/plugins/fts-squat/squat-trie.c Mon Dec 03 14:23:45 2007 +0200
@@ -5,6 +5,7 @@
#include "str.h"
#include "istream.h"
#include "ostream.h"
+#include "unichar.h"
#include "seq-range-array.h"
#include "squat-uidlist.h"
#include "squat-trie-private.h"
@@ -702,18 +703,65 @@ static int squat_build_add(struct squat_
}
static int
+squat_build_word_bytes(struct squat_trie *trie, uint32_t uid,
+ const unsigned char *data, unsigned int size)
+{
+ unsigned int i;
+
+ if (trie->hdr.full_len <= trie->hdr.partial_len)
+ i = 0;
+ else {
+ /* the first word is longer than others */
+ if (squat_build_add(trie, uid, data,
+ I_MIN(size, trie->hdr.full_len)) < 0)
+ return -1;
+ i = 1;
+ }
+
+ for (; i < size; i++) {
+ if (squat_build_add(trie, uid, data + i,
+ I_MIN(trie->hdr.partial_len, size-i)) < 0)
+ return -1;
+ }
+ return 0;
+}
+
+static int
squat_build_word(struct squat_trie *trie, uint32_t uid,
- const unsigned char *data, unsigned int size)
-{
- unsigned int i;
-
- for (i = size - 1; i > 0; i--) {
- if (squat_build_add(trie, uid, data + i,
- I_MIN(trie->hdr.partial_len, size - i)) < 0)
- return -1;
- }
- return squat_build_add(trie, uid, data,
- I_MIN(size, trie->hdr.full_len));
+ const unsigned char *data, const uint8_t *char_lengths,
+ unsigned int size)
+{
+ unsigned int i, j, bytelen;
+
+ if (char_lengths == NULL) {
+ /* optimization path: all characters are bytes */
+ return squat_build_word_bytes(trie, uid, data, size);
+ }
+
+ if (trie->hdr.full_len <= trie->hdr.partial_len)
+ i = 0;
+ else {
+ /* the first word is longer than others */
+ bytelen = 0;
+ for (j = 0; j < trie->hdr.full_len && bytelen < size; j++)
+ bytelen += char_lengths[bytelen];
+ i_assert(bytelen <= size);
+
+ if (squat_build_add(trie, uid, data, bytelen) < 0)
+ return -1;
+ i = char_lengths[0];
+ }
+
+ for (; i < size; i += char_lengths[i]) {
+ bytelen = 0;
+ for (j = 0; j < trie->hdr.partial_len && i+bytelen < size; j++)
+ bytelen += char_lengths[i + bytelen];
+ i_assert(i + bytelen <= size);
+
+ if (squat_build_add(trie, uid, data + i, bytelen) < 0)
+ return -1;
+ }
+ return 0;
}
static unsigned char *
@@ -731,17 +779,24 @@ squat_data_normalize(struct squat_trie *
int squat_trie_build_more(struct squat_trie_build_context *ctx,
uint32_t uid, enum squat_index_type type,
- const unsigned char *data, unsigned int size)
+ const unsigned char *input, unsigned int size)
{
struct squat_trie *trie = ctx->trie;
+ const unsigned char *data;
+ uint8_t *char_lengths;
unsigned int i, start = 0;
+ bool multibyte_chars = FALSE;
int ret = 0;
uid = uid * 2 + (type == SQUAT_INDEX_TYPE_HEADER ? 0 : 1);
t_push();
- data = squat_data_normalize(trie, data, size);
+ char_lengths = t_malloc(size);
+ data = squat_data_normalize(trie, input, size);
for (i = 0; i < size; i++) {
+ char_lengths[i] = uni_utf8_char_bytes(input[i]);
+ if (char_lengths[i] != 1)
+ multibyte_chars = TRUE;
if (data[i] != '\0')
continue;
@@ -749,6 +804,8 @@ int squat_trie_build_more(struct squat_t
start++;
if (i != start) {
if (squat_build_word(trie, uid, data + start,
+ !multibyte_chars ? NULL :
+ char_lengths + start,
i - start) < 0) {
ret = -1;
start = i;
@@ -760,7 +817,9 @@ int squat_trie_build_more(struct squat_t
while (start < i && data[start] == '\0')
start++;
if (i != start) {
- if (squat_build_word(trie, uid, data + start, i - start) < 0)
+ if (squat_build_word(trie, uid, data + start,
+ !multibyte_chars ? NULL :
+ char_lengths + start, i - start) < 0)
ret = -1;
}
t_pop();
@@ -1355,20 +1414,23 @@ struct squat_trie_lookup_context {
static int
squat_trie_lookup_partial(struct squat_trie_lookup_context *ctx,
- const unsigned char *data, unsigned int size)
-{
- const unsigned char *block;
- unsigned int block_len;
+ const unsigned char *data, uint8_t *char_lengths,
+ unsigned int size)
+{
+ const unsigned int partial_len = ctx->trie->hdr.partial_len;
+ unsigned int char_idx, max_chars, i, j, bytelen;
int ret;
- do {
- if (size <= ctx->trie->hdr.partial_len)
- block_len = size;
- else
- block_len = ctx->trie->hdr.partial_len;
- block = data + size - block_len;
-
- ret = squat_trie_lookup_data(ctx->trie, block, block_len,
+ max_chars = uni_utf8_strlen_n(data, size);
+ if (max_chars > ctx->trie->hdr.partial_len)
+ max_chars = partial_len;
+
+ for (i = 0, char_idx = 0; char_idx < max_chars; char_idx++) {
+ bytelen = 0;
+ for (j = 0; j < partial_len && i+bytelen < size; j++)
+ bytelen += char_lengths[i + bytelen];
+
+ ret = squat_trie_lookup_data(ctx->trie, data + i, bytelen,
&ctx->tmp_uids);
if (ret <= 0) {
array_clear(ctx->maybe_uids);
@@ -1385,7 +1447,9 @@ squat_trie_lookup_partial(struct squat_t
seq_range_array_remove_invert_range(ctx->maybe_uids,
&ctx->tmp_uids2);
}
- } while (--size >= ctx->trie->hdr.partial_len);
+
+ i += char_lengths[i];
+ }
return 1;
}
@@ -1416,7 +1480,8 @@ int squat_trie_lookup(struct squat_trie
{
struct squat_trie_lookup_context ctx;
unsigned char *data;
- unsigned int i, start, size;
+ uint8_t *char_lengths;
+ unsigned int i, start, bytes, str_bytelen, str_charlen;
int ret = 0;
t_push();
@@ -1429,12 +1494,17 @@ int squat_trie_lookup(struct squat_trie
t_array_init(&ctx.tmp_uids2, 128);
ctx.first = TRUE;
- size = strlen(str);
- data = t_malloc(size);
- memcpy(data, str, size);
- data = squat_data_normalize(trie, data, size);
-
- for (i = start = 0; i < size && ret >= 0; i++) {
+ str_bytelen = strlen(str);
+ char_lengths = t_malloc0(str_bytelen);
+ for (i = 0; i < str_bytelen; ) {
+ bytes = uni_utf8_char_bytes(str[i]);
+ char_lengths[i] = bytes;
+ i += bytes;
+ }
+ data = squat_data_normalize(trie, (const unsigned char *)str,
+ str_bytelen);
+
+ for (i = start = 0; i < str_bytelen && ret >= 0; i += char_lengths[i]) {
if (data[i] != '\0')
continue;
@@ -1442,9 +1512,10 @@ int squat_trie_lookup(struct squat_trie
search it in parts. */
if (i != start) {
ret = squat_trie_lookup_partial(&ctx, data + start,
+ char_lengths,
i - start);
}
- start = i + 1;
+ start = i + char_lengths[i];
}
if (start != 0) {
@@ -1452,6 +1523,7 @@ int squat_trie_lookup(struct squat_trie
array_clear(definite_uids);
if (i != start && ret >= 0) {
ret = squat_trie_lookup_partial(&ctx, data + start,
+ char_lengths,
i - start);
}
t_pop();
@@ -1459,9 +1531,10 @@ int squat_trie_lookup(struct squat_trie
return ret < 0 ? -1 : 0;
}
- if (size <= trie->hdr.partial_len ||
+ if (str_charlen <= trie->hdr.partial_len ||
trie->hdr.full_len > trie->hdr.partial_len) {
- ret = squat_trie_lookup_data(trie, data, size, &ctx.tmp_uids);
+ ret = squat_trie_lookup_data(trie, data, str_bytelen,
+ &ctx.tmp_uids);
if (ret > 0) {
squat_trie_filter_type(type, &ctx.tmp_uids,
definite_uids);
@@ -1470,12 +1543,13 @@ int squat_trie_lookup(struct squat_trie
array_clear(definite_uids);
}
- if (size <= trie->hdr.partial_len || trie->hdr.partial_len == 0) {
+ if (str_charlen <= trie->hdr.partial_len ||
+ trie->hdr.partial_len == 0) {
/* we have the result */
array_clear(maybe_uids);
} else {
ret = squat_trie_lookup_partial(&ctx, data + start,
- i - start);
+ char_lengths, i - start);
}
t_pop();
squat_trie_add_unknown(trie, maybe_uids);
diff -r c68564884bae -r a215deb3de8d src/plugins/fts/Makefile.am
--- a/src/plugins/fts/Makefile.am Mon Dec 03 14:23:19 2007 +0200
+++ b/src/plugins/fts/Makefile.am Mon Dec 03 14:23:45 2007 +0200
@@ -1,5 +1,6 @@ AM_CPPFLAGS = \
AM_CPPFLAGS = \
-I$(top_srcdir)/src/lib \
+ -I$(top_srcdir)/src/lib-charset \
-I$(top_srcdir)/src/lib-mail \
-I$(top_srcdir)/src/lib-index \
-I$(top_srcdir)/src/lib-storage
diff -r c68564884bae -r a215deb3de8d src/plugins/fts/fts-search.c
--- a/src/plugins/fts/fts-search.c Mon Dec 03 14:23:19 2007 +0200
+++ b/src/plugins/fts/fts-search.c Mon Dec 03 14:23:45 2007 +0200
@@ -2,7 +2,9 @@
#include "lib.h"
#include "array.h"
+#include "str.h"
#include "seq-range-array.h"
+#include "charset-utf8.h"
#include "mail-search.h"
#include "mail-storage-private.h"
#include "fts-api-private.h"
@@ -47,6 +49,9 @@ static int fts_search_lookup_arg(struct
struct fts_backend *backend;
enum fts_lookup_flags flags = 0;
const char *key;
+ string_t *key_utf8;
+ enum charset_result result;
+ int ret;
switch (arg->type) {
case SEARCH_HEADER:
@@ -81,20 +86,29 @@ static int fts_search_lookup_arg(struct
if (arg->not)
More information about the dovecot-cvs
mailing list