chg: [abbreviations] strip "." from words that are not known abbreviations

Terrtia · Terrtia · commit 270c65d5acd8 · 2024-03-07T15:02:28.000+01:00
diff --git a/lexilang/detector.py b/lexilang/detector.py
@@ -3,27 +3,42 @@
 from .languages import get_language_weight, is_cjk
 
 _words = None
+_abbreviations = None
 _translate_table = str.maketrans(dict.fromkeys("!\"#$%&()*+,/:;<=>?@[\\]^_`{|}~", " "))  # not included . ' -
 
 def detect(text, languages=[]):
     global _words
+    global _abbreviations
 
     if _words is None:
         # Initialize
         words_file = os.path.join(os.path.dirname(__file__), "data", "words.pickle")
-        if not os.path.isfile(words_file):
+        abbreviations_file = os.path.join(os.path.dirname(__file__), "data", "abbreviations.pickle")
+
+        if not os.path.isfile(words_file) or not os.path.isfile(abbreviations_file):
             from .utils import compile_data
             compile_data()
-        
+
         with open(words_file, "rb") as f:
             _words = pickle.load(f, encoding="utf-8")
+        with open(abbreviations_file, "rb") as f:
+            _abbreviations = pickle.load(f, encoding="utf-8")
     
     text = text.lower().strip()
     text = text.translate(_translate_table)
     if is_cjk(text):
+        text = text.replace(".", "")
         tokens = list(text)
     else:
-        tokens = text.split(" ")
+        tokens = []
+        words = text.split()
+        for word in words:
+            if word in _abbreviations:
+                tokens.append(word)
+            else:
+                for w in word.split("."):
+                    if w:
+                        tokens.append(w)
 
     lang_bins = {}
     for tok in tokens:
@@ -52,4 +67,4 @@ def detect(text, languages=[]):
                 best_lang = lang
                 best_weight = weight
 
-        return best_lang, (1 / len(candidates)) * 0.9
+        return best_lang, (1 / len(candidates)) * 0.9
diff --git a/lexilang/utils.py b/lexilang/utils.py
@@ -7,6 +7,7 @@
 def compile_data():
     print("Compiling database...")
     words = {}
+    abbreviations = set()
     langs = get_supported_languages()
     for name in langs:
         code = langs[name]
@@ -20,13 +21,17 @@ def compile_data():
                         words[tok] = [code]
                     elif not code in words[tok]:
                         words[tok].append(code)
+                    if '.' in tok:
+                        abbreviations.add(tok)
 
     print("Serializing...")
 
     outfile = os.path.join(root_dir, "lexilang", "data", "words.pickle")
     with open(outfile, "wb") as f:
         pickle.dump(words, f, protocol=4)
-
     print(outfile)
 
-
+    outfile = os.path.join(root_dir, "lexilang", "data", "abbreviations.pickle")
+    with open(outfile, "wb") as f:
+        pickle.dump(abbreviations, f, protocol=4)
+    print(outfile)
diff --git a/test.py b/test.py
@@ -2,8 +2,10 @@
 
 print(detect("bonjour"))   # ('fr', 0.45)
 print(detect("bonjour!"))  # ('fr', 0.45)
+print(detect("grand bonjour. ... . .salut."))  # ('fr', 0.9)
+print(detect("т.н."))  # ('bg', 0.9)
 print(detect("学中文")) # ('zh', 0.45)
 print(detect("ciao mondo")) # ('it', 0.9)
 print(detect("El gato doméstico")) # ('es', 0.45)
 print(detect("El\"gato\",doméstico")) # ('es', 0.45)
-print(detect("ciao", languages=["de", "ro"])) # ('de', 0.45)
+print(detect("ciao", languages=["de", "ro"])) # ('de', 0.45)