dovecot-2.2: fts: Parse HTML MIME parts using lib-mail's mail-html2text.
dovecot at dovecot.org
dovecot at dovecot.org
Fri Jan 16 22:33:32 UTC 2015
details: http://hg.dovecot.org/dovecot-2.2/rev/08afd516a622
changeset: 18158:08afd516a622
user: Timo Sirainen <tss at iki.fi>
date: Sat Jan 17 00:24:36 2015 +0200
description:
fts: Parse HTML MIME parts using lib-mail's mail-html2text.
diffstat:
src/plugins/fts/Makefile.am | 1 -
src/plugins/fts/fts-parser-html.c | 236 +----------------------------------
src/plugins/fts/html-entities.h | 253 --------------------------------------
3 files changed, 6 insertions(+), 484 deletions(-)
diffs (truncated from 543 to 300 lines):
diff -r 5211234206ea -r 08afd516a622 src/plugins/fts/Makefile.am
--- a/src/plugins/fts/Makefile.am Sat Jan 17 00:23:36 2015 +0200
+++ b/src/plugins/fts/Makefile.am Sat Jan 17 00:24:36 2015 +0200
@@ -42,7 +42,6 @@
noinst_HEADERS = \
doveadm-fts.h \
- html-entities.h \
fts-build-mail.h \
fts-plugin.h \
fts-search-serialize.h \
diff -r 5211234206ea -r 08afd516a622 src/plugins/fts/fts-parser-html.c
--- a/src/plugins/fts/fts-parser-html.c Sat Jan 17 00:23:36 2015 +0200
+++ b/src/plugins/fts/fts-parser-html.c Sat Jan 17 00:24:36 2015 +0200
@@ -2,49 +2,14 @@
#include "lib.h"
#include "buffer.h"
-#include "unichar.h"
#include "message-parser.h"
+#include "mail-html2text.h"
#include "fts-parser.h"
-/* Zero-width space (&#x200B;) apparently also belongs here, but that gets a
- bit tricky to handle.. is it actually used anywhere? */
-#define HTML_WHITESPACE(c) \
- ((c) == ' ' || (c) == '\t' || (c) == '\r' || (c) == '\n')
-
-enum html_state {
- /* regular text */
- HTML_STATE_TEXT,
- /* tag outside "quoted string" */
- HTML_STATE_TAG,
- /* tag inside "double quoted string" */
- HTML_STATE_TAG_DQUOTED,
- /* tag -> "escape\ */
- HTML_STATE_TAG_DQUOTED_ESCAPE,
- /* tag inside 'single quoted string' */
- HTML_STATE_TAG_SQUOTED,
- /* tag -> 'escape\ */
- HTML_STATE_TAG_SQUOTED_ESCAPE,
- /* script/stype content */
- HTML_STATE_IGNORE,
- /* comment */
- HTML_STATE_COMMENT,
- /* comment is ending, we've seen "--" and now just waiting for ">" */
- HTML_STATE_COMMENT_END
-};
-
struct html_fts_parser {
struct fts_parser parser;
-
- enum html_state state;
- buffer_t *input, *output;
- bool ignore_next_text;
-};
-
-static struct {
- const char *name;
- unichar_t chr;
-} html_entities[] = {
-#include "html-entities.h"
+ struct mail_html2text *html2text;
+ buffer_t *output;
};
static struct fts_parser *
@@ -59,207 +24,19 @@
parser = i_new(struct html_fts_parser, 1);
parser->parser.v = fts_parser_html;
- parser->input = buffer_create_dynamic(default_pool, 512);
+ parser->html2text = mail_html2text_init(0);
parser->output = buffer_create_dynamic(default_pool, 4096);
return &parser->parser;
}
-static size_t
-parse_tag_name(struct html_fts_parser *parser,
- const unsigned char *data, size_t size)
-{
- size_t i;
-
- if (size >= 3 && memcmp(data, "!--", 3) == 0) {
- parser->state = HTML_STATE_COMMENT;
- return 3 + 1;
- }
-
- if (size > 5 && i_memcasecmp(data, "style", 5) == 0) {
- i = 5;
- } else if (size > 6 && i_memcasecmp(data, "script", 6) == 0) {
- i = 6;
- } else {
- if (size <= 6) {
- /* can we see the whole tag name? */
- for (i = 0; i < size; i++) {
- if (HTML_WHITESPACE(data[i]) || data[i] == '>')
- break;
- }
- if (i == size) {
- /* need more data */
- return 0;
- }
- }
- parser->state = HTML_STATE_TAG;
- return 1;
- }
- parser->state = HTML_STATE_TAG;
- if (HTML_WHITESPACE(data[i]) || data[i] == '>')
- parser->ignore_next_text = TRUE;
- return 1;
-}
-
-static bool html_entity_get_unichar(const char *name, unichar_t *chr_r)
-{
- unsigned int i;
-
- for (i = 0; i < N_ELEMENTS(html_entities); i++) {
- if (strcasecmp(html_entities[i].name, name) == 0) {
- *chr_r = html_entities[i].chr;
- return TRUE;
- }
- }
- return FALSE;
-}
-
-static size_t parse_entity(struct html_fts_parser *parser,
- const unsigned char *data, size_t size)
-{
- char entity[10];
- unichar_t chr;
- size_t i;
-
- for (i = 0; i < size; i++) {
- if (HTML_WHITESPACE(data[i]) || i >= sizeof(entity)) {
- /* broken entity */
- return 1;
- }
- if (data[i] == ';')
- break;
- }
- if (i == size)
- return 0;
-
- i_assert(i < sizeof(entity));
- memcpy(entity, data, i); entity[i] = '\0';
-
- if (html_entity_get_unichar(entity, &chr))
- uni_ucs4_to_utf8_c(chr, parser->output);
- return i + 1;
-}
-
-static void parser_add_space(struct html_fts_parser *parser)
-{
- const unsigned char *data = parser->output->data;
-
- if (parser->output->used > 0 &&
- data[parser->output->used-1] != ' ')
- buffer_append_c(parser->output, ' ');
-}
-
-static size_t
-parse_data(struct html_fts_parser *parser,
- const unsigned char *data, size_t size)
-{
- size_t i, ret;
-
- for (i = 0; i < size; i++) {
- char c = data[i];
-
- switch (parser->state) {
- case HTML_STATE_TEXT:
- if (c == '<') {
- ret = parse_tag_name(parser, data+i+1, size-i-1);
- if (ret == 0)
- return i;
- i += ret - 1;
- } else if (c == '&') {
- ret = parse_entity(parser, data+i+1, size-i-1);
- if (ret == 0)
- return i;
- i += ret - 1;
- } else {
- buffer_append_c(parser->output, c);
- }
- break;
- case HTML_STATE_TAG:
- if (c == '"')
- parser->state = HTML_STATE_TAG_DQUOTED;
- else if (c == '\'')
- parser->state = HTML_STATE_TAG_DQUOTED;
- else if (c == '>') {
- parser->state = parser->ignore_next_text ?
- HTML_STATE_IGNORE : HTML_STATE_TEXT;
- parser_add_space(parser);
- }
- break;
- case HTML_STATE_TAG_DQUOTED:
- if (c == '"')
- parser->state = HTML_STATE_TAG;
- else if (c == '\\')
- parser->state = HTML_STATE_TAG_DQUOTED_ESCAPE;
- break;
- case HTML_STATE_TAG_DQUOTED_ESCAPE:
- parser->state = HTML_STATE_TAG_DQUOTED;
- break;
- case HTML_STATE_TAG_SQUOTED:
- if (c == '\'')
- parser->state = HTML_STATE_TAG;
- else if (c == '\\')
- parser->state = HTML_STATE_TAG_SQUOTED_ESCAPE;
- break;
- case HTML_STATE_TAG_SQUOTED_ESCAPE:
- parser->state = HTML_STATE_TAG_SQUOTED;
- break;
- case HTML_STATE_IGNORE:
- if (c == '<') {
- parser->state = HTML_STATE_TAG;
- parser->ignore_next_text = FALSE;
- }
- break;
- case HTML_STATE_COMMENT:
- if (c == '-') {
- if (i+1 == size)
- return i;
- if (data[i+1] == '-') {
- parser->state = HTML_STATE_COMMENT_END;
- i++;
- }
- }
- break;
- case HTML_STATE_COMMENT_END:
- if (c == '>')
- parser->state = HTML_STATE_TEXT;
- else if (!HTML_WHITESPACE(c))
- parser->state = HTML_STATE_COMMENT;
- break;
- }
- }
- return i;
-}
-
static void fts_parser_html_more(struct fts_parser *_parser,
struct message_block *block)
{
struct html_fts_parser *parser = (struct html_fts_parser *)_parser;
- size_t size, buf_orig_size;
buffer_set_used_size(parser->output, 0);
-
- if (parser->input->used > 0) {
- /* we didn't get enough input the last time to know
- what to do. */
- buf_orig_size = parser->input->used;
-
- size = I_MIN(block->size, 128);
- buffer_append(parser->input, block->data, size);
- size = parse_data(parser, parser->input->data,
- parser->input->used);
- if (size != 0) {
- i_assert(size >= buf_orig_size);
- block->data += size - buf_orig_size;
- block->size -= size - buf_orig_size;
- } else if (block->size != 0) {
- /* we're slowly parsing forward */
- return;
- } else {
- /* we're at EOF and can't finish this */
- }
- buffer_set_used_size(parser->input, 0);
- }
- size = parse_data(parser, block->data, block->size);
- buffer_append(parser->input, block->data + size, block->size - size);
+ mail_html2text_more(parser->html2text, block->data, block->size,
+ parser->output);
block->data = parser->output->data;
block->size = parser->output->used;
@@ -269,7 +46,6 @@
{
struct html_fts_parser *parser = (struct html_fts_parser *)_parser;
- buffer_free(&parser->input);
buffer_free(&parser->output);
i_free(parser);
}
diff -r 5211234206ea -r 08afd516a622 src/plugins/fts/html-entities.h
--- a/src/plugins/fts/html-entities.h Sat Jan 17 00:23:36 2015 +0200
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,253 +0,0 @@
-{ "quot", 0x0022 },
-{ "amp", 0x0026 },
-{ "apos", 0x0027 },
-{ "lt", 0x003C },
-{ "gt", 0x003E },
-{ "nbsp", 0x00A0 },
-{ "iexcl", 0x00A1 },
-{ "cent", 0x00A2 },
-{ "pound", 0x00A3 },
-{ "curren", 0x00A4 },
More information about the dovecot-cvs
mailing list