author     Leo <thinkabit.ukim@gmail.com>  2021-01-15 03:07:13 -0300
committer  Leo <thinkabit.ukim@gmail.com>  2021-01-15 06:39:41 +0000
commit     d2575b85e896947c3b985707529c1bb74f24fbdc (patch)
tree       f072caebac0c5393f009738c67480046ede7c19b /community/lucene++/160.patch
parent     888bdca89d5944e6917e405d2b923dadda03aecc (diff)
community/lucene++: fix packaging
This build system is so easy to get wrong and so hard to get right that we just import all the work done by Debian.
Diffstat (limited to 'community/lucene++/160.patch')
-rw-r--r--  community/lucene++/160.patch | 72
1 file changed, 72 insertions(+), 0 deletions(-)
diff --git a/community/lucene++/160.patch b/community/lucene++/160.patch
new file mode 100644
index 0000000000..d41106ccc0
--- /dev/null
+++ b/community/lucene++/160.patch
@@ -0,0 +1,72 @@
+From fd9eaf10c49239d700af848062acc1d5efd54aa8 Mon Sep 17 00:00:00 2001
+From: liuzhangjian <liuzhangjian@uniontech.com>
+Date: Fri, 4 Dec 2020 15:41:31 +0800
+Subject: [PATCH] Title: fix a bug in ChineseTokenizer
+
+Description: When using ChineseAnalyzer for Chinese word segmentation, runs of English letters and digits are treated as a single word; they should be split into separate tokens.
+
+RootCause: Null
+
+Solution:
+---
+ .../common/analysis/cn/ChineseFilter.cpp | 2 +-
+ .../common/analysis/cn/ChineseTokenizer.cpp | 22 ++++++++++++++++++-
+ 2 files changed, 22 insertions(+), 2 deletions(-)
+
+diff --git a/src/contrib/analyzers/common/analysis/cn/ChineseFilter.cpp b/src/contrib/analyzers/common/analysis/cn/ChineseFilter.cpp
+index d2a19f3f..83134454 100644
+--- a/src/contrib/analyzers/common/analysis/cn/ChineseFilter.cpp
++++ b/src/contrib/analyzers/common/analysis/cn/ChineseFilter.cpp
+@@ -38,7 +38,7 @@ bool ChineseFilter::incrementToken() {
+ if (text.length() > 1) {
+ return true;
+ }
+- } else if (UnicodeUtil::isOther(text[0])) {
++ } else if (UnicodeUtil::isOther(text[0]) || UnicodeUtil::isDigit(text[0])) {
+ // One Chinese character as one Chinese word.
+ // Chinese word extraction to be added later here.
+ return true;
+diff --git a/src/contrib/analyzers/common/analysis/cn/ChineseTokenizer.cpp b/src/contrib/analyzers/common/analysis/cn/ChineseTokenizer.cpp
+index 38bf9875..3b4de742 100644
+--- a/src/contrib/analyzers/common/analysis/cn/ChineseTokenizer.cpp
++++ b/src/contrib/analyzers/common/analysis/cn/ChineseTokenizer.cpp
+@@ -65,6 +65,7 @@ bool ChineseTokenizer::incrementToken() {
+
+ length = 0;
+ start = offset;
++ bool last_is_en = false, last_is_num = false;
+
+ while (true) {
+ wchar_t c;
+@@ -82,11 +83,30 @@ bool ChineseTokenizer::incrementToken() {
+ c = ioBuffer[bufferIndex++];
+ }
+
+- if (UnicodeUtil::isDigit(c) || UnicodeUtil::isLower(c) || UnicodeUtil::isUpper(c)) {
++ if (UnicodeUtil::isLower(c) || UnicodeUtil::isUpper(c)) {
++ if (last_is_num) {
++ --bufferIndex;
++ --offset;
++ return flush();
++ }
++
++ push(c);
++ if (length == MAX_WORD_LEN) {
++ return flush();
++ }
++ last_is_en = true;
++ } else if (UnicodeUtil::isDigit(c)) {
++ if (last_is_en) {
++ --bufferIndex;
++ --offset;
++ return flush();
++ }
++
+ push(c);
+ if (length == MAX_WORD_LEN) {
+ return flush();
+ }
++ last_is_num = true;
+ } else if (UnicodeUtil::isOther(c)) {
+ if (length > 0) {
+ --bufferIndex;
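
For context, here is a minimal standalone sketch of the splitting rule the patch introduces. This is not lucene++'s actual ChineseTokenizer API: std::iswalpha/std::iswdigit stand in for the library's UnicodeUtil helpers, and the single-character CJK handling (isOther) is omitted. The idea is the same, though: flush the current token whenever the character class flips between letters and digits, so input such as "abc123" now yields the two tokens "abc" and "123" instead of one.

#include <cwctype>
#include <iostream>
#include <string>
#include <vector>

// Simplified stand-in for the patched tokenizer loop: a one-character
// lookbehind (lastIsLetter / lastIsDigit) detects a class change and
// ends the current token before starting the next one.
static std::vector<std::wstring> tokenize(const std::wstring& input) {
    std::vector<std::wstring> tokens;
    std::wstring current;
    bool lastIsLetter = false, lastIsDigit = false;

    auto flush = [&] {
        if (!current.empty()) {
            tokens.push_back(current);
            current.clear();
        }
        lastIsLetter = lastIsDigit = false;
    };

    for (wchar_t c : input) {
        if (std::iswalpha(c)) {
            if (lastIsDigit) {
                flush();              // a letter ends a run of digits
            }
            current += c;
            lastIsLetter = true;
        } else if (std::iswdigit(c)) {
            if (lastIsLetter) {
                flush();              // a digit ends a run of letters
            }
            current += c;
            lastIsDigit = true;
        } else {
            flush();                  // anything else ends the token
        }
    }
    flush();                          // emit the final pending token
    return tokens;
}

int main() {
    // Before the patch "abc123" came out as one token; with the
    // class-change rule it splits into "abc" and "123".
    for (const auto& token : tokenize(L"abc123 def45")) {
        std::wcout << token << L'\n';
    }
    return 0;
}

The design mirrors the patch itself: rather than classifying whole runs up front, the real tokenizer keeps the last_is_en/last_is_num flags, rewinds bufferIndex and offset by one when the class changes, and returns flush(), so the class-changing character is re-read as the start of the next token on the following call to incrementToken().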