From fd9eaf10c49239d700af848062acc1d5efd54aa8 Mon Sep 17 00:00:00 2001 From: liuzhangjian Date: Fri, 4 Dec 2020 15:41:31 +0800 Subject: [PATCH] Title:fix a bug of ChineseTokenizer Description:When I use ChineseAnalyzer for Chinese word segmentation, I find that English and numbers are treated as one word and I think they should be separated. RootCause:Null Solution: --- .../common/analysis/cn/ChineseFilter.cpp | 2 +- .../common/analysis/cn/ChineseTokenizer.cpp | 22 ++++++++++++++++++- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/src/contrib/analyzers/common/analysis/cn/ChineseFilter.cpp b/src/contrib/analyzers/common/analysis/cn/ChineseFilter.cpp index d2a19f3f..83134454 100644 --- a/src/contrib/analyzers/common/analysis/cn/ChineseFilter.cpp +++ b/src/contrib/analyzers/common/analysis/cn/ChineseFilter.cpp @@ -38,7 +38,7 @@ bool ChineseFilter::incrementToken() { if (text.length() > 1) { return true; } - } else if (UnicodeUtil::isOther(text[0])) { + } else if (UnicodeUtil::isOther(text[0]) || UnicodeUtil::isDigit(text[0])) { // One Chinese character as one Chinese word. // Chinese word extraction to be added later here. return true; diff --git a/src/contrib/analyzers/common/analysis/cn/ChineseTokenizer.cpp b/src/contrib/analyzers/common/analysis/cn/ChineseTokenizer.cpp index 38bf9875..3b4de742 100644 --- a/src/contrib/analyzers/common/analysis/cn/ChineseTokenizer.cpp +++ b/src/contrib/analyzers/common/analysis/cn/ChineseTokenizer.cpp @@ -65,6 +65,7 @@ bool ChineseTokenizer::incrementToken() { length = 0; start = offset; + bool last_is_en = false, last_is_num = false; while (true) { wchar_t c; @@ -82,11 +83,30 @@ bool ChineseTokenizer::incrementToken() { c = ioBuffer[bufferIndex++]; } - if (UnicodeUtil::isDigit(c) || UnicodeUtil::isLower(c) || UnicodeUtil::isUpper(c)) { + if (UnicodeUtil::isLower(c) || UnicodeUtil::isUpper(c)) { + if (last_is_num) { + --bufferIndex; + --offset; + return flush(); + } + + push(c); + if (length == MAX_WORD_LEN) { + return flush(); + } + last_is_en = true; + } else if (UnicodeUtil::isDigit(c)) { + if (last_is_en) { + --bufferIndex; + --offset; + return flush(); + } + push(c); if (length == MAX_WORD_LEN) { return flush(); } + last_is_num = true; } else if (UnicodeUtil::isOther(c)) { if (length > 0) { --bufferIndex;