dovecot-2.2: lib-fts: Add Norwegian.
dovecot at dovecot.org
dovecot at dovecot.org
Tue Nov 17 09:56:32 UTC 2015
details: http://hg.dovecot.org/dovecot-2.2/rev/301d48ef7398
changeset: 19374:301d48ef7398
user: Teemu Huovila <teemu.huovila at dovecot.fi>
date: Tue Nov 17 11:44:19 2015 +0200
description:
lib-fts: Add Norwegian.
Norwegian has two main dialects, Bokmål (nb) and Nynorsk (nn). They
are detected separately by libexttextcat, but the stemmer only
knows Norwegian. Thus they are treated as a single language,
Norwegian (no). This may also make more sense in everyday use,
where Norwegian text often mixes the two writing styles.
Caveat: The default normalizer filter does not modify U+00F8
(Latin Small Letter O with Stroke). In some configurations it
might be desirable to rewrite it to, e.g., "o". The same goes for
the upper-case version, U+00D8 (Latin Capital Letter O with Stroke).
This can be done by passing a modified "id" setting to the
normalizer filter.
diffstat:
src/lib-fts/Makefile.am | 1 +
src/lib-fts/fts-language.c | 5 +
src/lib-fts/stopwords_no.txt | 192 ++++++++++++++++++++++++++++++++++++++++
src/lib-fts/test-fts-filter.c | 113 +++++++++++++++++++++++
src/lib-fts/test-fts-language.c | 50 ++++++++++
5 files changed, 361 insertions(+), 0 deletions(-)
diffs (truncated from 447 to 300 lines):
diff -r f31fadf622f2 -r 301d48ef7398 src/lib-fts/Makefile.am
--- a/src/lib-fts/Makefile.am Tue Nov 17 11:43:58 2015 +0200
+++ b/src/lib-fts/Makefile.am Tue Nov 17 11:44:19 2015 +0200
@@ -15,6 +15,7 @@
stopwords_en.txt \
stopwords_fi.txt \
stopwords_fr.txt \
+ stopwords_no.txt \
stopwords_sv.txt
BUILT_SOURCES = word-boundary-data.c word-break-data.c
diff -r f31fadf622f2 -r 301d48ef7398 src/lib-fts/fts-language.c
--- a/src/lib-fts/fts-language.c Tue Nov 17 11:43:58 2015 +0200
+++ b/src/lib-fts/fts-language.c Tue Nov 17 11:44:19 2015 +0200
@@ -39,6 +39,7 @@
{ "fr" }, /* French */
{ "it" }, /* Italian */
{ "nl" }, /* Dutch */
+ { "no" }, /* Both Bokmal and Nynorsk are detected as Norwegian */
{ "pt" }, /* Portuguese */
{ "ro" }, /* Romanian */
{ "ru" }, /* Russian */
@@ -175,6 +176,10 @@
/* name is <lang>-<optional country or characterset>-<encoding>
eg, fi--utf8 or pt-PT-utf8 */
name = t_strcut(candp[i].name, '-');
+
+ /* For Norwegian we treat both bokmal and nynorsk as "no". */
+ if (strcmp(name, "nb") == 0 || strcmp(name, "nn") == 0)
+ name = "no";
if ((*lang_r = fts_language_list_find(list, name)) != NULL)
return TRUE;
}
diff -r f31fadf622f2 -r 301d48ef7398 src/lib-fts/stopwords_no.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/src/lib-fts/stopwords_no.txt Tue Nov 17 11:44:19 2015 +0200
@@ -0,0 +1,192 @@
+ | From svn.tartarus.org/snowball/trunk/website/algorithms/norwegian/stop.txt
+ | This file is distributed under the BSD License.
+ | See http://snowball.tartarus.org/license.php
+ | Also see http://www.opensource.org/licenses/bsd-license.html
+ | - Encoding was converted to UTF-8.
+ | - This notice was added.
+
+ | A Norwegian stop word list. Comments begin with vertical bar. Each stop
+ | word is at the start of a line.
+
+ | This stop word list is for the dominant bokmål dialect. Words unique
+ | to nynorsk are marked *.
+
+ | Revised by Jan Bruusgaard <Jan.Bruusgaard at ssb.no>, Jan 2005
+
+og | and
+i | in
+jeg | I
+det | it/this/that
+at | to (w. inf.)
+en | a/an
+et | a/an
+den | it/this/that
+til | to
+er | is/am/are
+som | who/that
+på | on
+de | they / you(formal)
+med | with
+han | he
+av | of
+ikke | not
+ikkje | not *
+der | there
+så | so
+var | was/were
+meg | me
+seg | you
+men | but
+ett | one
+har | have
+om | about
+vi | we
+min | my
+mitt | my
+ha | have
+hadde | had
+hun | she
+nå | now
+over | over
+da | when/as
+ved | by/know
+fra | from
+du | you
+ut | out
+sin | your
+dem | them
+oss | us
+opp | up
+man | you/one
+kan | can
+hans | his
+hvor | where
+eller | or
+hva | what
+skal | shall/must
+selv | self (reflective)
+sjøl | self (reflective)
+her | here
+alle | all
+vil | will
+bli | become
+ble | became
+blei | became *
+blitt | have become
+kunne | could
+inn | in
+når | when
+være | be
+kom | come
+noen | some
+noe | some
+ville | would
+dere | you
+som | who/which/that
+deres | their/theirs
+kun | only/just
+ja | yes
+etter | after
+ned | down
+skulle | should
+denne | this
+for | for/because
+deg | you
+si | hers/his
+sine | hers/his
+sitt | hers/his
+mot | against
+å | to
+meget | much
+hvorfor | why
+dette | this
+disse | these/those
+uten | without
+hvordan | how
+ingen | none
+din | your
+ditt | your
+blir | become
+samme | same
+hvilken | which
+hvilke | which (plural)
+sånn | such a
+inni | inside/within
+mellom | between
+vår | our
+hver | each
+hvem | who
+vors | us/ours
+hvis | whose
+både | both
+bare | only/just
+enn | than
+fordi | as/because
+før | before
+mange | many
+også | also
+slik | just
+vært | been
+være | to be
+båe | both *
+begge | both
+siden | since
+dykk | your *
+dykkar | yours *
+dei | they *
+deira | them *
+deires | theirs *
+deim | them *
+di | your (fem.) *
+då | as/when *
+eg | I *
+ein | a/an *
+eit | a/an *
+eitt | a/an *
+elles | or *
+honom | he *
+hjå | at *
+ho | she *
+hoe | she *
+henne | her
+hennar | her/hers
+hennes | hers
+hoss | how *
+hossen | how *
+ikkje | not *
+ingi | noone *
+inkje | noone *
+korleis | how *
+korso | how *
+kva | what/which *
+kvar | where *
+kvarhelst | where *
+kven | who/whom *
+kvi | why *
+kvifor | why *
+me | we *
+medan | while *
+mi | my *
+mine | my *
+mykje | much *
+no | now *
+nokon | some (masc./neut.) *
+noka | some (fem.) *
+nokor | some *
+noko | some *
+nokre | some *
+si | his/hers *
+sia | since *
+sidan | since *
+so | so *
+somt | some *
+somme | some *
+um | about*
+upp | up *
+vere | be *
+vore | was *
+verte | become *
+vort | become *
+varte | became *
+vart | became *
+
diff -r f31fadf622f2 -r 301d48ef7398 src/lib-fts/test-fts-filter.c
--- a/src/lib-fts/test-fts-filter.c Tue Nov 17 11:43:58 2015 +0200
+++ b/src/lib-fts/test-fts-filter.c Tue Nov 17 11:44:19 2015 +0200
@@ -13,6 +13,7 @@
static const char *const stopword_settings[] = {"stopwords_dir", TEST_STOPWORDS_DIR, NULL};
static struct fts_language english_language = { .name = "en" };
static struct fts_language french_language = { .name = "fr" };
+static struct fts_language norwegian_language = { .name = "no" };
static struct fts_language swedish_language = { .name = "sv" };
static void test_fts_filter_find(void)
@@ -282,6 +283,54 @@
test_end();
}
+static void test_fts_filter_stopwords_no(void)
+{
+ struct fts_filter *filter;
+ const char *error;
+ int ret;
+
+ const char *input[] = {"og", "d\xC3\xA5", "medlemsstatane", "har",
+ "bunde", "seg", "til", "\xC3\xA5", "fremje",
+ "allmenn", "v\xC3\xB8rdnad", "for", "pakta",
+ "og", "halde", "seg", "etter", "menneskerettane",
+ "og", "den", "grunnleggjande", "fridomen", "i",
+ "samarbeid", "med", "Dei", "Sameinte",
+ "Nasjonane", NULL};
+
+ const char *output[] = {NULL, NULL, "medlemsstatane", NULL,
+ "bunde", NULL, NULL, NULL, "fremje",
+ "allmenn", "v\xC3\xB8rdnad", NULL, "pakta",
+ NULL, "halde", NULL, NULL, "menneskerettane",
+ NULL, NULL, "grunnleggjande", "fridomen", NULL,
+ "samarbeid", NULL, "Dei", "Sameinte",
+ "Nasjonane"};
+ const char **ip, **op;
+ const char *token;
+
+ test_begin("fts filter stopwords, Norwegian");
+ test_assert(fts_filter_create(fts_filter_stopwords, NULL, &norwegian_language, stopword_settings, &filter, &error) == 0);
+
+ ip = input;
+ op = output;
+ while (*ip != NULL) {
+ token = *ip;
+ ret = fts_filter_filter(filter, &token, &error);
+ if (ret <= 0) {
+ test_assert(ret == 0);
+ test_assert(*op == NULL);
+ } else {
+ test_assert(*op != NULL);
+ test_assert(strcmp(*ip, token) == 0);
+ }
+ op++;
+ ip++;
+ }
+
+ fts_filter_unref(&filter);
+ test_assert(filter == NULL);
+ test_end();
+}
+
static void test_fts_filter_stopwords_fail_lazy_init(void)
{
const struct fts_language unknown = { .name = "bebobidoop" };
@@ -655,6 +704,68 @@
test_end();
}
+static void test_fts_filter_stopwords_normalizer_stemmer_no(void)
More information about the dovecot-cvs
mailing list