From fd9eaf10c49239d700af848062acc1d5efd54aa8 Mon Sep 17 00:00:00 2001
From: liuzhangjian <liuzhangjian@uniontech.com>
Date: Fri, 4 Dec 2020 15:41:31 +0800
Subject: [PATCH] Title: fix a bug in ChineseTokenizer

Description: When using ChineseAnalyzer for Chinese word segmentation, runs of English letters and digits are treated as a single token; they should be split into separate tokens.

RootCause: ChineseTokenizer::incrementToken() handled digits and letters in a single branch (isDigit || isLower || isUpper), so a mixed run such as "abc123" was emitted as one token.

Solution: split that branch in two and track whether the previous character was a letter (last_is_en) or a digit (last_is_num); on a class change, step bufferIndex and offset back by one and flush the current token, so letter runs and digit runs become separate tokens. ChineseFilter::incrementToken() is also extended to keep single-digit tokens, matching its handling of other single characters.
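
For illustration, a minimal standalone sketch of the boundary rule this patch introduces. This is not the Lucene++ API: splitLettersAndDigits, isLetter and isDigit are hypothetical helpers that only mimic the tokenizer's new behavior for ASCII letters and digits.

    // Hypothetical helper (not part of Lucene++): shows the token
    // boundaries ChineseTokenizer is expected to produce after this patch.
    #include <iostream>
    #include <locale>
    #include <string>
    #include <vector>

    static bool isLetter(wchar_t c) {
        return (c >= L'a' && c <= L'z') || (c >= L'A' && c <= L'Z');
    }
    static bool isDigit(wchar_t c) {
        return c >= L'0' && c <= L'9';
    }

    std::vector<std::wstring> splitLettersAndDigits(const std::wstring& input) {
        std::vector<std::wstring> tokens;
        std::wstring current;
        bool lastIsLetter = false, lastIsDigit = false;

        auto flush = [&]() {
            if (!current.empty()) tokens.push_back(current);
            current.clear();
            lastIsLetter = lastIsDigit = false;
        };

        for (wchar_t c : input) {
            if (isLetter(c)) {
                if (lastIsDigit) flush();   // letter after digits: start a new token
                current += c;
                lastIsLetter = true;
            } else if (isDigit(c)) {
                if (lastIsLetter) flush();  // digit after letters: start a new token
                current += c;
                lastIsDigit = true;
            } else {
                flush();                    // any other character, e.g. a CJK char,
                if (c != L' ')              // becomes a single-character token
                    tokens.push_back(std::wstring(1, c));
            }
        }
        flush();
        return tokens;
    }

    int main() {
        std::wcout.imbue(std::locale(""));  // so wide CJK output prints correctly
        // Before the patch "abc123" came out as one token; now it is two:
        // [lucene] [3] [中] [文] [abc] [123]
        for (const auto& t : splitLettersAndDigits(L"lucene3中文abc123")) {
            std::wcout << L"[" << t << L"] ";
        }
        std::wcout << std::endl;
        return 0;
    }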
---
 .../common/analysis/cn/ChineseFilter.cpp      |  2 +-
 .../common/analysis/cn/ChineseTokenizer.cpp   | 22 ++++++++++++++++++-
 2 files changed, 22 insertions(+), 2 deletions(-)

diff --git a/src/contrib/analyzers/common/analysis/cn/ChineseFilter.cpp b/src/contrib/analyzers/common/analysis/cn/ChineseFilter.cpp
index d2a19f3f..83134454 100644
--- a/src/contrib/analyzers/common/analysis/cn/ChineseFilter.cpp
+++ b/src/contrib/analyzers/common/analysis/cn/ChineseFilter.cpp
@@ -38,7 +38,7 @@ bool ChineseFilter::incrementToken() {
                 if (text.length() > 1) {
                     return true;
                 }
-            } else if (UnicodeUtil::isOther(text[0])) {
+            } else if (UnicodeUtil::isOther(text[0]) || UnicodeUtil::isDigit(text[0])) {
                 // One Chinese character as one Chinese word.
                 // Chinese word extraction to be added later here.
                 return true;
diff --git a/src/contrib/analyzers/common/analysis/cn/ChineseTokenizer.cpp b/src/contrib/analyzers/common/analysis/cn/ChineseTokenizer.cpp
index 38bf9875..3b4de742 100644
--- a/src/contrib/analyzers/common/analysis/cn/ChineseTokenizer.cpp
+++ b/src/contrib/analyzers/common/analysis/cn/ChineseTokenizer.cpp
@@ -65,6 +65,7 @@ bool ChineseTokenizer::incrementToken() {
 
     length = 0;
     start = offset;
+    bool last_is_en = false, last_is_num = false;
 
     while (true) {
         wchar_t c;
@@ -82,11 +83,30 @@ bool ChineseTokenizer::incrementToken() {
             c = ioBuffer[bufferIndex++];
         }
 
-        if (UnicodeUtil::isDigit(c) || UnicodeUtil::isLower(c) || UnicodeUtil::isUpper(c)) {
+        if (UnicodeUtil::isLower(c) || UnicodeUtil::isUpper(c)) {
+            if (last_is_num) {
+                --bufferIndex;
+                --offset;
+                return flush();
+            }
+
+            push(c);
+            if (length == MAX_WORD_LEN) {
+                return flush();
+            }
+            last_is_en = true;
+        } else if (UnicodeUtil::isDigit(c)) {
+            if (last_is_en) {
+                --bufferIndex;
+                --offset;
+                return flush();
+            }
+
             push(c);
             if (length == MAX_WORD_LEN) {
                 return flush();
             }
+            last_is_num = true;
         } else if (UnicodeUtil::isOther(c)) {
             if (length > 0) {
                 --bufferIndex;