Skip to content

Commit 270c65d

Browse files
committed
chg: [abbreviations] remove . if word is not an abbreviation
1 parent 86d1c69 commit 270c65d

File tree

3 files changed

+29
-7
lines changed

3 files changed

+29
-7
lines changed

lexilang/detector.py

Lines changed: 19 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -3,27 +3,42 @@
33
from .languages import get_language_weight, is_cjk
44

55
_words = None
6+
_abbreviations = None
67
_translate_table = str.maketrans(dict.fromkeys("!\"#$%&()*+,/:;<=>?@[\\]^_`{|}~", " ")) # not included . ' -
78

89
def detect(text, languages=[]):
910
global _words
11+
global _abbreviations
1012

1113
if _words is None:
1214
# Initialize
1315
words_file = os.path.join(os.path.dirname(__file__), "data", "words.pickle")
14-
if not os.path.isfile(words_file):
16+
abbreviations_file = os.path.join(os.path.dirname(__file__), "data", "abbreviations.pickle")
17+
18+
if not os.path.isfile(words_file) or not os.path.isfile(abbreviations_file):
1519
from .utils import compile_data
1620
compile_data()
17-
21+
1822
with open(words_file, "rb") as f:
1923
_words = pickle.load(f, encoding="utf-8")
24+
with open(abbreviations_file, "rb") as f:
25+
_abbreviations = pickle.load(f, encoding="utf-8")
2026

2127
text = text.lower().strip()
2228
text = text.translate(_translate_table)
2329
if is_cjk(text):
30+
text = text.replace(".", "")
2431
tokens = list(text)
2532
else:
26-
tokens = text.split(" ")
33+
tokens = []
34+
words = text.split()
35+
for word in words:
36+
if word in _abbreviations:
37+
tokens.append(word)
38+
else:
39+
for w in word.split("."):
40+
if w:
41+
tokens.append(w)
2742

2843
lang_bins = {}
2944
for tok in tokens:
@@ -52,4 +67,4 @@ def detect(text, languages=[]):
5267
best_lang = lang
5368
best_weight = weight
5469

55-
return best_lang, (1 / len(candidates)) * 0.9
70+
return best_lang, (1 / len(candidates)) * 0.9

lexilang/utils.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
def compile_data():
88
print("Compiling database...")
99
words = {}
10+
abbreviations = set()
1011
langs = get_supported_languages()
1112
for name in langs:
1213
code = langs[name]
@@ -20,13 +21,17 @@ def compile_data():
2021
words[tok] = [code]
2122
elif not code in words[tok]:
2223
words[tok].append(code)
24+
if '.' in tok:
25+
abbreviations.add(tok)
2326

2427
print("Serializing...")
2528

2629
outfile = os.path.join(root_dir, "lexilang", "data", "words.pickle")
2730
with open(outfile, "wb") as f:
2831
pickle.dump(words, f, protocol=4)
29-
3032
print(outfile)
3133

32-
34+
outfile = os.path.join(root_dir, "lexilang", "data", "abbreviations.pickle")
35+
with open(outfile, "wb") as f:
36+
pickle.dump(abbreviations, f, protocol=4)
37+
print(outfile)

test.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,10 @@
22

33
print(detect("bonjour")) # ('fr', 0.45)
44
print(detect("bonjour!")) # ('fr', 0.45)
5+
print(detect("grand bonjour. ... . .salut.")) # ('fr', 0.9)
6+
print(detect("т.н.")) # ('bg', 0.9)
57
print(detect("学中文")) # ('zh', 0.45)
68
print(detect("ciao mondo")) # ('it', 0.9)
79
print(detect("El gato doméstico")) # ('es', 0.45)
810
print(detect("El\"gato\",doméstico")) # ('es', 0.45)
9-
print(detect("ciao", languages=["de", "ro"])) # ('de', 0.45)
11+
print(detect("ciao", languages=["de", "ro"])) # ('de', 0.45)

0 commit comments

Comments
 (0)