[dovecot-cvs] dovecot/src/lib-charset charset-ascii.c,1.3,1.4 charset-iconv.c,1.2,1.3 charset-utf8.h,1.1,1.2

Wed Nov 13 13:08:21 EET 2002

Update of /home/cvs/dovecot/src/lib-charset
In directory danu:/tmp/cvs-serv7219/lib-charset

Modified Files:
	charset-ascii.c charset-iconv.c charset-utf8.h 
Log Message:
SEARCH CHARSET now works properly with message bodies, and body searching
in general works more correctly because base64/quoted-printable data is now
decoded. Non-text MIME parts are currently not included in the search; that
could be made optional. Also, the body is parsed separately for each keyword,
which could be optimized.

Changed base64_decode() behaviour so that it also accepts input containing
non-base64 data, e.g. line feeds.



Index: charset-ascii.c
===================================================================
RCS file: /home/cvs/dovecot/src/lib-charset/charset-ascii.c,v
retrieving revision 1.3
retrieving revision 1.4
diff -u -d -r1.3 -r1.4

--- charset-ascii.c	4 Nov 2002 07:11:32 -0000	1.3
+++ charset-ascii.c	13 Nov 2002 11:08:18 -0000	1.4
@@ -5,12 +5,63 @@
 
 #ifndef HAVE_ICONV_H
 
-const char *charset_to_ucase_utf8(const unsigned char *data,
-				  size_t *size __attr_unused__,
-				  const char *charset, int *unknown_charset)
+#include <ctype.h>
+
+struct _CharsetTranslation {
+	int dummy;
+};
+
+static CharsetTranslation ascii_translation;
+
+CharsetTranslation *charset_to_utf8_begin(const char *charset,
+					  int *unknown_charset)
 {
-	if (charset == NULL || strcasecmp(charset, "us-ascii") == 0)
-		return str_ucase(t_strdup_noconst(data));
+	if (unknown_charset != NULL)
+		*unknown_charset = FALSE;
+
+	if (strcasecmp(charset, "us-ascii") != 0 &&
+	    strcasecmp(charset, "ascii") != 0) {
+		/* no support for non-ascii charsets */
+		if (unknown_charset != NULL)
+			*unknown_charset = TRUE;
+		return NULL;
+	}
+
+	return &ascii_translation;
+}
+
+void charset_to_utf8_end(CharsetTranslation *t __attr_unused__)
+{
+}
+
+void charset_to_utf8_reset(CharsetTranslation *t __attr_unused__)
+{
+}
+
+int charset_to_ucase_utf8(CharsetTranslation *t __attr_unused__,
+			  const unsigned char **inbuf, size_t *insize,
+			  unsigned char *outbuf, size_t *outsize)
+{
+	size_t max_size, i;
+
+	max_size = I_MIN(*insize, *outsize);
+	for (i = 0; i < max_size; i++)
+		outbuf[i] = i_toupper((*inbuf)[i]);
+
+	*insize = 0;
+	*outsize = max_size;
+
+	return TRUE;
+}
+
+const char *
+charset_to_ucase_utf8_string(const char *charset, int *unknown_charset,
+			     const unsigned char *buf,
+			     size_t *size __attr_unused__)
+{
+	if (charset == NULL || strcasecmp(charset, "us-ascii") == 0 ||
+	    strcasecmp(charset, "ascii") == 0)
+		return str_ucase(t_strdup_noconst(buf));
 
 	if (unknown_charset != NULL)
 		*unknown_charset = TRUE;

Index: charset-iconv.c
===================================================================
RCS file: /home/cvs/dovecot/src/lib-charset/charset-iconv.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -u -d -r1.2 -r1.3
--- charset-iconv.c	4 Nov 2002 07:11:32 -0000	1.2
+++ charset-iconv.c	13 Nov 2002 11:08:18 -0000	1.3
@@ -6,16 +6,102 @@
 #ifdef HAVE_ICONV_H
 
 #include <iconv.h>
+#include <ctype.h>
 
-const char *charset_to_ucase_utf8(const unsigned char *data, size_t *size,
-				  const char *charset, int *unknown_charset)
+struct _CharsetTranslation {
+	iconv_t cd;
+};
+
+CharsetTranslation *charset_to_utf8_begin(const char *charset,
+					  int *unknown_charset)
+{
+	CharsetTranslation *t;
+	iconv_t cd;
+
+	if (unknown_charset != NULL)
+		*unknown_charset = FALSE;
+
+	if (strcasecmp(charset, "us-ascii") == 0 ||
+	    strcasecmp(charset, "ascii") == 0) {
+		/* no need to do any actual translation */
+		cd = NULL;
+	} else {
+		cd = iconv_open("UTF8", charset);
+		if (cd == (iconv_t)-1) {
+			if (unknown_charset != NULL)
+				*unknown_charset = TRUE;
+			return NULL;
+		}
+	}
+
+	t = i_new(CharsetTranslation, 1);
+	t->cd = cd;
+	return t;
+}
+
+void charset_to_utf8_end(CharsetTranslation *t)
+{
+	if (t->cd != NULL)
+		iconv_close(t->cd);
+	i_free(t);
+}
+
+void charset_to_utf8_reset(CharsetTranslation *t)
+{
+	if (t->cd != NULL)
+		(void)iconv(t->cd, NULL, NULL, NULL, NULL);
+}
+
+int charset_to_ucase_utf8(CharsetTranslation *t,
+			  const unsigned char **inbuf, size_t *insize,
+			  unsigned char *outbuf, size_t *outsize)
+{
+	char *ic_inbuf, *ic_outbuf;
+	size_t outleft, max_size, i;
+
+	if (t->cd == NULL) {
+		/* ascii - just copy it to outbuf uppercased */
+		max_size = I_MIN(*insize, *outsize);
+		for (i = 0; i < max_size; i++)
+			outbuf[i] = i_toupper((*inbuf)[i]);
+		*insize = 0;
+		*outsize = max_size;
+		return TRUE;
+	}
+
+	ic_inbuf = (char *) *inbuf;
+	ic_outbuf = (char *) outbuf;
+	outleft = *outsize;
+
+	if (iconv(t->cd, &ic_inbuf, insize,
+		  &ic_outbuf, &outleft) == (size_t)-1) {
+		if (errno != E2BIG && errno != EINVAL) {
+			/* should be EILSEQ - invalid input */
+			return FALSE;
+		}
+	}
+
+	*inbuf = (const unsigned char *) ic_inbuf;
+	*outsize -= outleft;
+
+	max_size = *outsize;
+	for (i = 0; i < max_size; i++)
+		outbuf[i] = i_toupper(outbuf[i]);
+
+	return TRUE;
+}
+
+const char *
+charset_to_ucase_utf8_string(const char *charset, int *unknown_charset,
+			     const unsigned char *buf, size_t *size)
 {
 	iconv_t cd;
 	char *inbuf, *outbuf, *outpos;
 	size_t inleft, outleft, outsize, pos;
 
-	if (charset == NULL || strcasecmp(charset, "us-ascii") == 0)
-		return str_ucase(t_strdup_noconst(data));
+	if (charset == NULL || strcasecmp(charset, "us-ascii") == 0 ||
+	    strcasecmp(charset, "ascii") == 0)
+		return str_ucase(t_strdup_noconst(buf));
 
 	cd = iconv_open("UTF8", charset);
 	if (cd == (iconv_t)-1) {
@@ -27,7 +113,7 @@
 	if (unknown_charset != NULL)
 		*unknown_charset = FALSE;
 
-	inbuf = (char *) data;
+	inbuf = (char *) buf;
 	inleft = *size;
 
 	outsize = outleft = *size * 2;

Index: charset-utf8.h
===================================================================
RCS file: /home/cvs/dovecot/src/lib-charset/charset-utf8.h,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -d -r1.1 -r1.2
--- charset-utf8.h	3 Nov 2002 08:39:43 -0000	1.1
+++ charset-utf8.h	13 Nov 2002 11:08:18 -0000	1.2
@@ -1,7 +1,28 @@
 #ifndef __CHARSET_UTF8_H
 #define __CHARSET_UTF8_H
 
-const char *charset_to_ucase_utf8(const unsigned char *data, size_t *size,
-				  const char *charset, int *unknown_charset);
+typedef struct _CharsetTranslation CharsetTranslation;
+
+/* Begin translation to UTF-8. */
+CharsetTranslation *charset_to_utf8_begin(const char *charset,
+					  int *unknown_charset);
+
+void charset_to_utf8_end(CharsetTranslation *t);
+
+void charset_to_utf8_reset(CharsetTranslation *t);
+
+/* Convert inbuf to UTF-8. inbuf and inbuf_size is updated to specify beginning
+   of data that was not written to outbuf, either because of inbuf ended with
+   incomplete character sequence or because the outbuf got full. Returns TRUE
+   if no conversion errors were detected. */
+int charset_to_ucase_utf8(CharsetTranslation *t,
+			  const unsigned char **inbuf, size_t *insize,
+			  unsigned char *outbuf, size_t *outsize);
+
+/* Simple wrapper for above functions. size is updated to strlen() of
+   returned UTF-8 string. */
+const char *
+charset_to_ucase_utf8_string(const char *charset, int *unknown_charset,
+			     const unsigned char *buf, size_t *size);
 
 #endif