
Commit 94b8974

chg: [abbreviations] remove abbreviations + remove . characters
1 parent 270c65d commit 94b8974

File tree

Showing 5 changed files with 6 additions and 28 deletions:

dictionaries/bulgarian.txt
dictionaries/hungarian.txt
lexilang/detector.py
lexilang/utils.py
test.py

dictionaries/bulgarian.txt

Lines changed: 0 additions & 1 deletion
@@ -10009,5 +10009,4 @@ but
 мокър
 равен
 равна
-т.н.
 юмрук

dictionaries/hungarian.txt

Lines changed: 0 additions & 1 deletion
@@ -73,7 +73,6 @@ hogy
 hogyan
 igen
 ill
-ill.
 illetve
 ilyen
 ilyenkor

lexilang/detector.py

Lines changed: 5 additions & 16 deletions
@@ -4,7 +4,7 @@
 
 _words = None
 _abbreviations = None
-_translate_table = str.maketrans(dict.fromkeys("!\"#$%&()*+,/:;<=>?@[\\]^_`{|}~", " ")) # not included . ' -
+_translate_table = str.maketrans(dict.fromkeys("!\"#$%&()*+,/:;<=>?@[\\]^_`{|}~.", " ")) # not included ' -
 
 def detect(text, languages=[]):
     global _words
@@ -13,32 +13,21 @@ def detect(text, languages=[]):
     if _words is None:
         # Initialize
         words_file = os.path.join(os.path.dirname(__file__), "data", "words.pickle")
-        abbreviations_file = os.path.join(os.path.dirname(__file__), "data", "abbreviations.pickle")
 
-        if not os.path.isfile(words_file) or not os.path.isfile(abbreviations_file):
+        if not os.path.isfile(words_file):
             from .utils import compile_data
             compile_data()
 
         with open(words_file, "rb") as f:
             _words = pickle.load(f, encoding="utf-8")
-        with open(abbreviations_file, "rb") as f:
-            _abbreviations = pickle.load(f, encoding="utf-8")
-
+
     text = text.lower().strip()
     text = text.translate(_translate_table)
+
     if is_cjk(text):
-        text = text.replace(".", "")
         tokens = list(text)
     else:
-        tokens = []
-        words = text.split()
-        for word in words:
-            if word in _abbreviations:
-                tokens.append(word)
-            else:
-                for w in word.split("."):
-                    if w:
-                        tokens.append(w)
+        tokens = text.split(" ")
 
     lang_bins = {}
     for tok in tokens:
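
A minimal sketch (not part of this commit) of why the abbreviation handling can be dropped: with "." added to _translate_table, dots are turned into spaces before tokenization, so dotted abbreviations split into ordinary dictionary words and the _abbreviations lookup becomes redundant. The sample string below is only an illustration borrowed from test.py.

# Assumed post-change behavior, reproduced outside the library:
table = str.maketrans(dict.fromkeys("!\"#$%&()*+,/:;<=>?@[\\]^_`{|}~.", " "))  # "." now maps to a space
text = "grand.bonjour. ... . .salut.".lower().strip()
tokens = text.translate(table).split(" ")
print(tokens)  # roughly ['grand', 'bonjour', '', '', ..., 'salut', ''] -- empty tokens simply never match a dictionary entry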

lexilang/utils.py

Lines changed: 0 additions & 8 deletions
@@ -7,7 +7,6 @@
 def compile_data():
     print("Compiling database...")
     words = {}
-    abbreviations = set()
     langs = get_supported_languages()
     for name in langs:
         code = langs[name]
@@ -21,17 +20,10 @@ def compile_data():
                 words[tok] = [code]
             elif not code in words[tok]:
                 words[tok].append(code)
-            if '.' in tok:
-                abbreviations.add(tok)
 
     print("Serializing...")
 
     outfile = os.path.join(root_dir, "lexilang", "data", "words.pickle")
     with open(outfile, "wb") as f:
         pickle.dump(words, f, protocol=4)
     print(outfile)
-
-    outfile = os.path.join(root_dir, "lexilang", "data", "abbreviations.pickle")
-    with open(outfile, "wb") as f:
-        pickle.dump(abbreviations, f, protocol=4)
-    print(outfile)
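
Since compile_data() no longer writes abbreviations.pickle, a stale abbreviations.pickle left in lexilang/data by an earlier version is simply ignored. A hedged sketch of rebuilding the word index by hand after pulling this change; as shown in the detector.py diff above, detect() also triggers this automatically when words.pickle is missing:

from lexilang.utils import compile_data

compile_data()  # rewrites lexilang/data/words.pickle from the dictionary files; no abbreviations.pickle is produced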

test.py

Lines changed: 1 addition & 2 deletions
@@ -2,8 +2,7 @@
 
 print(detect("bonjour")) # ('fr', 0.45)
 print(detect("bonjour!")) # ('fr', 0.45)
-print(detect("grand bonjour. ... . .salut.")) # ('fr', 0.9)
-print(detect("т.н.")) # ('bg', 0.9)
+print(detect("grand.bonjour. ... . .salut.")) # ('fr', 0.9)
 print(detect("学中文")) # ('zh', 0.45)
 print(detect("ciao mondo")) # ('it', 0.9)
 print(detect("El gato doméstico")) # ('es', 0.45)

0 commit comments