dovecot-2.2: lib-mail: Added mail-html2text API

dovecot at dovecot.org dovecot at dovecot.org
Fri Jan 16 22:16:56 UTC 2015


details:   http://hg.dovecot.org/dovecot-2.2/rev/d59753d9f5e9
changeset: 18156:d59753d9f5e9
user:      Timo Sirainen <tss at iki.fi>
date:      Sat Jan 17 00:15:44 2015 +0200
description:
lib-mail: Added mail-html2text API
What makes it mail-specific is that it allows skipping over data inside
<blockquote>. This code probably doesn't parse HTML perfectly, but it is
hopefully good enough for HTML emails.

diffstat:

 src/lib-mail/Makefile.am           |   10 +
 src/lib-mail/html-entities.h       |  253 ++++++++++++++++++++++++++++
 src/lib-mail/mail-html2text.c      |  323 +++++++++++++++++++++++++++++++++++++
 src/lib-mail/mail-html2text.h      |   15 +
 src/lib-mail/test-mail-html2text.c |   80 +++++++++
 5 files changed, 681 insertions(+), 0 deletions(-)

diffs (truncated from 735 to 300 lines):

diff -r 3d612ade5d75 -r d59753d9f5e9 src/lib-mail/Makefile.am
--- a/src/lib-mail/Makefile.am	Thu Jan 15 01:26:02 2015 +0200
+++ b/src/lib-mail/Makefile.am	Sat Jan 17 00:15:44 2015 +0200
@@ -13,6 +13,7 @@
 	istream-header-filter.c \
 	istream-nonuls.c \
 	istream-qp-decoder.c \
+	mail-html2text.c \
 	mail-user-hash.c \
 	mbox-from.c \
 	message-address.c \
@@ -33,6 +34,9 @@
 	rfc2231-parser.c \
 	rfc822-parser.c
 
+noinst_HEADERS = \
+	html-entities.h
+
 headers = \
 	istream-attachment-connector.h \
 	istream-attachment-extractor.h \
@@ -43,6 +47,7 @@
 	istream-qp.h \
 	mail-user-hash.h \
 	mbox-from.h \
+	mail-html2text.h \
 	mail-types.h \
 	message-address.h \
 	message-binary-part.h \
@@ -71,6 +76,7 @@
 	test-istream-binary-converter \
 	test-istream-header-filter \
 	test-istream-qp-decoder \
+	test-mail-html2text \
 	test-mbox-from \
 	test-message-address \
 	test-message-date \
@@ -160,6 +166,10 @@
 test_message_part_LDADD = message-part.lo message-parser.lo message-header-parser.lo message-size.lo rfc822-parser.lo rfc2231-parser.lo $(test_libs)
 test_message_part_DEPENDENCIES = $(test_deps)
 
+test_mail_html2text_SOURCES = test-mail-html2text.c
+test_mail_html2text_LDADD = mail-html2text.lo $(test_libs)
+test_mail_html2text_DEPENDENCIES = $(test_deps)
+
 test_ostream_dot_SOURCES = test-ostream-dot.c
 test_ostream_dot_LDADD = ostream-dot.lo $(test_libs)
 test_ostream_dot_DEPENDENCIES = $(test_deps)
diff -r 3d612ade5d75 -r d59753d9f5e9 src/lib-mail/html-entities.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/lib-mail/html-entities.h	Sat Jan 17 00:15:44 2015 +0200
@@ -0,0 +1,253 @@
+{ "quot",	0x0022 },
+{ "amp",	0x0026 },
+{ "apos",	0x0027 },
+{ "lt",		0x003C },
+{ "gt",		0x003E },
+{ "nbsp",	0x00A0 },
+{ "iexcl",	0x00A1 },
+{ "cent",	0x00A2 },
+{ "pound",	0x00A3 },
+{ "curren",	0x00A4 },
+{ "yen",	0x00A5 },
+{ "brvbar",	0x00A6 },
+{ "sect",	0x00A7 },
+{ "uml",	0x00A8 },
+{ "copy",	0x00A9 },
+{ "ordf",	0x00AA },
+{ "laquo",	0x00AB },
+{ "not",	0x00AC },
+{ "shy",	0x00AD },
+{ "reg",	0x00AE },
+{ "macr",	0x00AF },
+{ "deg",	0x00B0 },
+{ "plusmn",	0x00B1 },
+{ "sup2",	0x00B2 },
+{ "sup3",	0x00B3 },
+{ "acute",	0x00B4 },
+{ "micro",	0x00B5 },
+{ "para",	0x00B6 },
+{ "middot",	0x00B7 },
+{ "cedil",	0x00B8 },
+{ "sup1",	0x00B9 },
+{ "ordm",	0x00BA },
+{ "raquo",	0x00BB },
+{ "frac14",	0x00BC },
+{ "frac12",	0x00BD },
+{ "frac34",	0x00BE },
+{ "iquest",	0x00BF },
+{ "Agrave",	0x00C0 },
+{ "Aacute",	0x00C1 },
+{ "Acirc",	0x00C2 },
+{ "Atilde",	0x00C3 },
+{ "Auml",	0x00C4 },
+{ "Aring",	0x00C5 },
+{ "AElig",	0x00C6 },
+{ "Ccedil",	0x00C7 },
+{ "Egrave",	0x00C8 },
+{ "Eacute",	0x00C9 },
+{ "Ecirc",	0x00CA },
+{ "Euml",	0x00CB },
+{ "Igrave",	0x00CC },
+{ "Iacute",	0x00CD },
+{ "Icirc",	0x00CE },
+{ "Iuml",	0x00CF },
+{ "ETH",	0x00D0 },
+{ "Ntilde",	0x00D1 },
+{ "Ograve",	0x00D2 },
+{ "Oacute",	0x00D3 },
+{ "Ocirc",	0x00D4 },
+{ "Otilde",	0x00D5 },
+{ "Ouml",	0x00D6 },
+{ "times",	0x00D7 },
+{ "Oslash",	0x00D8 },
+{ "Ugrave",	0x00D9 },
+{ "Uacute",	0x00DA },
+{ "Ucirc",	0x00DB },
+{ "Uuml",	0x00DC },
+{ "Yacute",	0x00DD },
+{ "THORN",	0x00DE },
+{ "szlig",	0x00DF },
+{ "agrave",	0x00E0 },
+{ "aacute",	0x00E1 },
+{ "acirc",	0x00E2 },
+{ "atilde",	0x00E3 },
+{ "auml",	0x00E4 },
+{ "aring",	0x00E5 },
+{ "aelig",	0x00E6 },
+{ "ccedil",	0x00E7 },
+{ "egrave",	0x00E8 },
+{ "eacute",	0x00E9 },
+{ "ecirc",	0x00EA },
+{ "euml",	0x00EB },
+{ "igrave",	0x00EC },
+{ "iacute",	0x00ED },
+{ "icirc",	0x00EE },
+{ "iuml",	0x00EF },
+{ "eth",	0x00F0 },
+{ "ntilde",	0x00F1 },
+{ "ograve",	0x00F2 },
+{ "oacute",	0x00F3 },
+{ "ocirc",	0x00F4 },
+{ "otilde",	0x00F5 },
+{ "ouml",	0x00F6 },
+{ "divide",	0x00F7 },
+{ "oslash",	0x00F8 },
+{ "ugrave",	0x00F9 },
+{ "uacute",	0x00FA },
+{ "ucirc",	0x00FB },
+{ "uuml",	0x00FC },
+{ "yacute",	0x00FD },
+{ "thorn",	0x00FE },
+{ "yuml",	0x00FF },
+{ "OElig",	0x0152 },
+{ "oelig",	0x0153 },
+{ "Scaron",	0x0160 },
+{ "scaron",	0x0161 },
+{ "Yuml",	0x0178 },
+{ "fnof",	0x0192 },
+{ "circ",	0x02C6 },
+{ "tilde",	0x02DC },
+{ "Alpha",	0x0391 },
+{ "Beta",	0x0392 },
+{ "Gamma",	0x0393 },
+{ "Delta",	0x0394 },
+{ "Epsilon",	0x0395 },
+{ "Zeta",	0x0396 },
+{ "Eta",	0x0397 },
+{ "Theta",	0x0398 },
+{ "Iota",	0x0399 },
+{ "Kappa",	0x039A },
+{ "Lambda",	0x039B },
+{ "Mu",		0x039C },
+{ "Nu",		0x039D },
+{ "Xi",		0x039E },
+{ "Omicron",	0x039F },
+{ "Pi",		0x03A0 },
+{ "Rho",	0x03A1 },
+{ "Sigma",	0x03A3 },
+{ "Tau",	0x03A4 },
+{ "Upsilon",	0x03A5 },
+{ "Phi",	0x03A6 },
+{ "Chi",	0x03A7 },
+{ "Psi",	0x03A8 },
+{ "Omega",	0x03A9 },
+{ "alpha",	0x03B1 },
+{ "beta",	0x03B2 },
+{ "gamma",	0x03B3 },
+{ "delta",	0x03B4 },
+{ "epsilon",	0x03B5 },
+{ "zeta",	0x03B6 },
+{ "eta",	0x03B7 },
+{ "theta",	0x03B8 },
+{ "iota",	0x03B9 },
+{ "kappa",	0x03BA },
+{ "lambda",	0x03BB },
+{ "mu",		0x03BC },
+{ "nu",		0x03BD },
+{ "xi",		0x03BE },
+{ "omicron",	0x03BF },
+{ "pi",		0x03C0 },
+{ "rho",	0x03C1 },
+{ "sigmaf",	0x03C2 },
+{ "sigma",	0x03C3 },
+{ "tau",	0x03C4 },
+{ "upsilon",	0x03C5 },
+{ "phi",	0x03C6 },
+{ "chi",	0x03C7 },
+{ "psi",	0x03C8 },
+{ "omega",	0x03C9 },
+{ "thetasym",	0x03D1 },
+{ "upsih",	0x03D2 },
+{ "piv",	0x03D6 },
+{ "ensp",	0x2002 },
+{ "emsp",	0x2003 },
+{ "thinsp",	0x2009 },
+{ "zwnj",	0x200C },
+{ "zwj",	0x200D },
+{ "lrm",	0x200E },
+{ "rlm",	0x200F },
+{ "ndash",	0x2013 },
+{ "mdash",	0x2014 },
+{ "lsquo",	0x2018 },
+{ "rsquo",	0x2019 },
+{ "sbquo",	0x201A },
+{ "ldquo",	0x201C },
+{ "rdquo",	0x201D },
+{ "bdquo",	0x201E },
+{ "dagger",	0x2020 },
+{ "Dagger",	0x2021 },
+{ "bull",	0x2022 },
+{ "hellip",	0x2026 },
+{ "permil",	0x2030 },
+{ "prime",	0x2032 },
+{ "Prime",	0x2033 },
+{ "lsaquo",	0x2039 },
+{ "rsaquo",	0x203A },
+{ "oline",	0x203E },
+{ "frasl",	0x2044 },
+{ "euro",	0x20AC },
+{ "image",	0x2111 },
+{ "weierp",	0x2118 },
+{ "real",	0x211C },
+{ "trade",	0x2122 },
+{ "alefsym",	0x2135 },
+{ "larr",	0x2190 },
+{ "uarr",	0x2191 },
+{ "rarr",	0x2192 },
+{ "darr",	0x2193 },
+{ "harr",	0x2194 },
+{ "crarr",	0x21B5 },
+{ "lArr",	0x21D0 },
+{ "uArr",	0x21D1 },
+{ "rArr",	0x21D2 },
+{ "dArr",	0x21D3 },
+{ "hArr",	0x21D4 },
+{ "forall",	0x2200 },
+{ "part",	0x2202 },
+{ "exist",	0x2203 },
+{ "empty",	0x2205 },
+{ "nabla",	0x2207 },
+{ "isin",	0x2208 },
+{ "notin",	0x2209 },
+{ "ni",		0x220B },
+{ "prod",	0x220F },
+{ "sum",	0x2211 },
+{ "minus",	0x2212 },
+{ "lowast",	0x2217 },
+{ "radic",	0x221A },
+{ "prop",	0x221D },
+{ "infin",	0x221E },
+{ "ang",	0x2220 },
+{ "and",	0x2227 },
+{ "or",		0x2228 },
+{ "cap",	0x2229 },
+{ "cup",	0x222A },
+{ "int",	0x222B },
+{ "there4",	0x2234 },
+{ "sim",	0x223C },
+{ "cong",	0x2245 },
+{ "asymp",	0x2248 },
+{ "ne",		0x2260 },
+{ "equiv",	0x2261 },
+{ "le",		0x2264 },
+{ "ge",		0x2265 },
+{ "sub",	0x2282 },
+{ "sup",	0x2283 },
+{ "nsub",	0x2284 },
+{ "sube",	0x2286 },
+{ "supe",	0x2287 },
+{ "oplus",	0x2295 },
+{ "otimes",	0x2297 },
+{ "perp",	0x22A5 },
+{ "sdot",	0x22C5 },
+{ "lceil",	0x2308 },
+{ "rceil",	0x2309 },
+{ "lfloor",	0x230A },
+{ "rfloor",	0x230B },
+{ "lang",	0x27E8 },
+{ "rang",	0x27E9 },


More information about the dovecot-cvs mailing list