
Commit 94b8974

chg: [abbreviations] remove abbreviations + remove . characters
1 parent 270c65d commit 94b8974

File tree

Showing 5 changed files with 6 additions and 28 deletions:

dictionaries/bulgarian.txt
dictionaries/hungarian.txt
lexilang/detector.py
lexilang/utils.py
test.py

dictionaries/bulgarian.txt

Lines changed: 0 additions & 1 deletion
@@ -10009,5 +10009,4 @@ but
 мокър
 равен
 равна
-т.н.
 юмрук

dictionaries/hungarian.txt

Lines changed: 0 additions & 1 deletion
@@ -73,7 +73,6 @@ hogy
 hogyan
 igen
 ill
-ill.
 illetve
 ilyen
 ilyenkor

lexilang/detector.py

Lines changed: 5 additions & 16 deletions
@@ -4,7 +4,7 @@
 
 _words = None
 _abbreviations = None
-_translate_table = str.maketrans(dict.fromkeys("!\"#$%&()*+,/:;<=>?@[\\]^_`{|}~", " ")) # not included . ' -
+_translate_table = str.maketrans(dict.fromkeys("!\"#$%&()*+,/:;<=>?@[\\]^_`{|}~.", " ")) # not included ' -
 
 def detect(text, languages=[]):
     global _words
@@ -13,32 +13,21 @@ def detect(text, languages=[]):
     if _words is None:
         # Initialize
         words_file = os.path.join(os.path.dirname(__file__), "data", "words.pickle")
-        abbreviations_file = os.path.join(os.path.dirname(__file__), "data", "abbreviations.pickle")
 
-        if not os.path.isfile(words_file) or not os.path.isfile(abbreviations_file):
+        if not os.path.isfile(words_file):
             from .utils import compile_data
             compile_data()
 
         with open(words_file, "rb") as f:
             _words = pickle.load(f, encoding="utf-8")
-        with open(abbreviations_file, "rb") as f:
-            _abbreviations = pickle.load(f, encoding="utf-8")
-
+
     text = text.lower().strip()
     text = text.translate(_translate_table)
+
     if is_cjk(text):
-        text = text.replace(".", "")
         tokens = list(text)
     else:
-        tokens = []
-        words = text.split()
-        for word in words:
-            if word in _abbreviations:
-                tokens.append(word)
-            else:
-                for w in word.split("."):
-                    if w:
-                        tokens.append(w)
+        tokens = text.split(" ")
 
     lang_bins = {}
     for tok in tokens:
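
A minimal sketch (not part of this commit) of why the abbreviation handling can be dropped: with "." added to _translate_table, dots are turned into spaces before tokenization, so dotted abbreviations split into ordinary dictionary words and the _abbreviations lookup becomes redundant. The sample string below is only an illustration borrowed from test.py.

# Assumed post-change behavior, reproduced outside the library:
table = str.maketrans(dict.fromkeys("!\"#$%&()*+,/:;<=>?@[\\]^_`{|}~.", " "))  # "." now maps to a space
text = "grand.bonjour. ... . .salut.".lower().strip()
tokens = text.translate(table).split(" ")
print(tokens)  # roughly ['grand', 'bonjour', '', '', ..., 'salut', ''] -- empty tokens simply never match a dictionary entry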

lexilang/utils.py

Lines changed: 0 additions & 8 deletions
@@ -7,7 +7,6 @@
 def compile_data():
     print("Compiling database...")
     words = {}
-    abbreviations = set()
     langs = get_supported_languages()
     for name in langs:
         code = langs[name]
@@ -21,17 +20,10 @@ def compile_data():
                 words[tok] = [code]
             elif not code in words[tok]:
                 words[tok].append(code)
-            if '.' in tok:
-                abbreviations.add(tok)
 
     print("Serializing...")
 
     outfile = os.path.join(root_dir, "lexilang", "data", "words.pickle")
     with open(outfile, "wb") as f:
         pickle.dump(words, f, protocol=4)
     print(outfile)
-
-    outfile = os.path.join(root_dir, "lexilang", "data", "abbreviations.pickle")
-    with open(outfile, "wb") as f:
-        pickle.dump(abbreviations, f, protocol=4)
-    print(outfile)
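
Since compile_data() no longer writes abbreviations.pickle, a stale abbreviations.pickle left in lexilang/data by an earlier version is simply ignored. A hedged sketch of rebuilding the word index by hand after pulling this change; as shown in the detector.py diff above, detect() also triggers this automatically when words.pickle is missing:

from lexilang.utils import compile_data

compile_data()  # rewrites lexilang/data/words.pickle from the dictionary files; no abbreviations.pickle is produced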

test.py

Lines changed: 1 addition & 2 deletions
@@ -2,8 +2,7 @@
 
 print(detect("bonjour")) # ('fr', 0.45)
 print(detect("bonjour!")) # ('fr', 0.45)
-print(detect("grand bonjour. ... . .salut.")) # ('fr', 0.9)
-print(detect("т.н.")) # ('bg', 0.9)
+print(detect("grand.bonjour. ... . .salut.")) # ('fr', 0.9)
 print(detect("学中文")) # ('zh', 0.45)
 print(detect("ciao mondo")) # ('it', 0.9)
 print(detect("El gato doméstico")) # ('es', 0.45)

0 commit comments