dovecot-2.0: fts-solr: Replace characters not valid for XML with replacement char.

dovecot at dovecot.org dovecot at dovecot.org
Fri Aug 20 22:38:50 EEST 2010


details:   http://hg.dovecot.org/dovecot-2.0/rev/38674aff6956
changeset: 12025:38674aff6956
user:      Timo Sirainen <tss at iki.fi>
date:      Fri Aug 20 20:38:26 2010 +0100
description:
fts-solr: Replace characters not valid for XML with replacement char.

diffstat:

 src/plugins/fts-solr/fts-backend-solr.c |  32 ++++++++++++++++++++++++++++++++
 1 files changed, 32 insertions(+), 0 deletions(-)

diffs (64 lines):

diff -r 6105706de7b6 -r 38674aff6956 src/plugins/fts-solr/fts-backend-solr.c
--- a/src/plugins/fts-solr/fts-backend-solr.c	Fri Aug 20 20:37:31 2010 +0100
+++ b/src/plugins/fts-solr/fts-backend-solr.c	Fri Aug 20 20:38:26 2010 +0100
@@ -4,6 +4,7 @@
 #include "array.h"
 #include "str.h"
 #include "strescape.h"
+#include "unichar.h"
 #include "mail-storage-private.h"
 #include "mail-namespace.h"
 #include "solr-connection.h"
@@ -75,9 +76,25 @@
 	return name;
 }
 
+static bool is_valid_xml_char(unichar_t chr)
+{
+	/* Returns TRUE if chr is allowed by the XML 1.0 Char production:
+
+	   #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] |
+	   [#x10000-#x10FFFF]
+
+	   Only called for #x80 and higher, so the low ranges aren't checked. */
+	if (chr > 0xd7ff && chr < 0xe000)
+		return FALSE; /* UTF-16 surrogates */
+	if (chr > 0xfffd && chr < 0x10000)
+		return FALSE; /* noncharacters U+FFFE and U+FFFF */
+	return chr <= 0x10ffff; /* upper bound is inclusive */
+}
+
 static void
 xml_encode_data(string_t *dest, const unsigned char *data, unsigned int len)
 {
+	unichar_t chr;
 	unsigned int i;
 
 	for (i = 0; i < len; i++) {
@@ -102,11 +119,26 @@
 				/* SOLR doesn't like control characters.
 				   replace them with spaces. */
 				str_append_c(dest, ' ');
+			} else if (data[i] >= 0x80) {
+				/* make sure the character is valid for XML
+				   so we don't get XML parser errors */
+				unsigned int char_len =
+					uni_utf8_char_bytes(data[0]);
+				if (i + char_len <= len &&
+				    uni_utf8_get_char_n(data, len, &chr) == 0 &&
+				    is_valid_xml_char(chr))
+					str_append_n(dest, data + i, char_len);
+				else {
+					str_append_n(dest, utf8_replacement_char,
+						     UTF8_REPLACEMENT_CHAR_LEN);
+				}
+				i += char_len - 1;
 			} else {
 				str_append_c(dest, data[i]);
 			}
 			break;
 		}
+		i += uni_utf8_char_bytes(data[0]);
 	}
 }
 


More information about the dovecot-cvs mailing list