blob: d41106ccc00a16e243ce62133dd9b15d7f75a7d9 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
|
From fd9eaf10c49239d700af848062acc1d5efd54aa8 Mon Sep 17 00:00:00 2001
From: liuzhangjian <liuzhangjian@uniontech.com>
Date: Fri, 4 Dec 2020 15:41:31 +0800
Subject: [PATCH] Title:fix a bug of ChineseTokenizer
Description:When using ChineseAnalyzer for Chinese word segmentation, a run of adjacent English letters and digits is emitted as a single token. The letters and the digits should be emitted as separate tokens.
RootCause:Null
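Solution:ChineseTokenizer now handles letters and digits in separate branches and flushes the current token when a letter follows a digit or a digit follows a letter. ChineseFilter now also accepts tokens whose first character is a digit.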
---
.../common/analysis/cn/ChineseFilter.cpp | 2 +-
.../common/analysis/cn/ChineseTokenizer.cpp | 22 ++++++++++++++++++-
2 files changed, 22 insertions(+), 2 deletions(-)
diff --git a/src/contrib/analyzers/common/analysis/cn/ChineseFilter.cpp b/src/contrib/analyzers/common/analysis/cn/ChineseFilter.cpp
index d2a19f3f..83134454 100644
--- a/src/contrib/analyzers/common/analysis/cn/ChineseFilter.cpp
+++ b/src/contrib/analyzers/common/analysis/cn/ChineseFilter.cpp
@@ -38,7 +38,7 @@ bool ChineseFilter::incrementToken() {
if (text.length() > 1) {
return true;
}
- } else if (UnicodeUtil::isOther(text[0])) {
+ } else if (UnicodeUtil::isOther(text[0]) || UnicodeUtil::isDigit(text[0])) {
// One Chinese character as one Chinese word.
// Chinese word extraction to be added later here.
return true;
diff --git a/src/contrib/analyzers/common/analysis/cn/ChineseTokenizer.cpp b/src/contrib/analyzers/common/analysis/cn/ChineseTokenizer.cpp
index 38bf9875..3b4de742 100644
--- a/src/contrib/analyzers/common/analysis/cn/ChineseTokenizer.cpp
+++ b/src/contrib/analyzers/common/analysis/cn/ChineseTokenizer.cpp
@@ -65,6 +65,7 @@ bool ChineseTokenizer::incrementToken() {
length = 0;
start = offset;
+ bool last_is_en = false, last_is_num = false;
while (true) {
wchar_t c;
@@ -82,11 +83,30 @@ bool ChineseTokenizer::incrementToken() {
c = ioBuffer[bufferIndex++];
}
- if (UnicodeUtil::isDigit(c) || UnicodeUtil::isLower(c) || UnicodeUtil::isUpper(c)) {
+ if (UnicodeUtil::isLower(c) || UnicodeUtil::isUpper(c)) {
+ if (last_is_num) {
+ --bufferIndex;
+ --offset;
+ return flush();
+ }
+
+ push(c);
+ if (length == MAX_WORD_LEN) {
+ return flush();
+ }
+ last_is_en = true;
+ } else if (UnicodeUtil::isDigit(c)) {
+ if (last_is_en) {
+ --bufferIndex;
+ --offset;
+ return flush();
+ }
+
push(c);
if (length == MAX_WORD_LEN) {
return flush();
}
+ last_is_num = true;
} else if (UnicodeUtil::isOther(c)) {
if (length > 0) {
--bufferIndex;
|