xinyandai
diff --git a/‎bound.ipynb
+199 b/‎bound.ipynb
+199
diff --git a/‎datasets.py
+9-4 b/‎datasets.py
+9-4
diff --git a/‎distance_estimation_cgk.py
+35 b/‎distance_estimation_cgk.py
+35
diff --git a/‎distance_estimation_l2.py
+94 b/‎distance_estimation_l2.py
+94
diff --git a/‎embed_cnn.py
+6-3 b/‎embed_cnn.py
+6-3
@@ -0,0 +1,199 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "# edit distance 1000x1000: 100%|██████████| 1000/1000 [02:04<00:00,  8.02it/s]\n",
+      "  0%|          | 0/52 [00:00<?, ?it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "# Calculate edit distance time: 124.6652319431305\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "100%|██████████| 52/52 [1:32:50<00:00, 91.40s/it] "
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "52.0 1.8483353884093712 28.439545176737834\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "import random\n",
+    "import string\n",
+    "import numpy as np\n",
+    "from multiprocessing import cpu_count\n",
+    "np.random.seed(1)\n",
+    "random.seed(1)\n",
+    "\n",
+    "C = 52 \n",
+    "M = 1000\n",
+    "letters = list(range(C))\n",
+    "\n",
+    "def randomString(stringLength):\n",
+    "    \"\"\"Generate a random string of fixed length \"\"\"\n",
+    "    return [random.choice(letters) for _ in range(stringLength)]\n",
+    "\n",
+    "def int2str(l):\n",
+    "    return \"\".join(chr(i+ord('a')) for i in l)\n",
+    "\n",
+    "N = 1000\n",
+    "strings = [randomString(random.randint(1, M)) for _ in range(N)]\n",
+    "lengths = [len(i) for i in  strings]\n",
+    "def one_hot(s):  # -> (C, M) 0/1 matrix with encode[s[j], j] = 1; unused columns stay 0\n",
+    "    encode = np.zeros((C, M), dtype=int)  # builtin int: the np.int alias is gone in NumPy >= 1.24\n",
+    "    encode[np.array(s), np.arange(len(s))] = 1\n",
+    "    return encode\n",
+    "\n",
+    "oh_strs = [one_hot(s) for s in strings]\n",
+    "or_strs = [int2str(s) for s in strings]\n",
+    "\n",
+    "from datasets import all_pair_distance\n",
+    "knnd = all_pair_distance(or_strs, or_strs, cpu_count())\n",
+    "\n",
+    "oh_strs = np.array(oh_strs)\n",
+    "\n",
+    "import tqdm\n",
+    "dist = []\n",
+    "def int2str(s):\n",
+    "    return \"\".join(str(i) for i in s)\n",
+    "for i in tqdm.tqdm(range(C)):\n",
+    "    ss = oh_strs[:, i, :]\n",
+    "    ss = [int2str(s[:lengths[i]]) for i, s in enumerate(ss)]\n",
+    "    d = all_pair_distance(ss, ss, 8, progress=False)\n",
+    "    dist.append(d)\n",
+    "\n",
+    "dist = np.array(dist)\n",
+    "bound = np.sum(dist, axis=0)\n",
+    "index = np.where(knnd != 0)\n",
+    "ration = bound[index] / knnd[index]\n",
+    "print(np.max(ration), np.min(ration), np.mean(ration))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "(1000, 1000)\n",
+      "(1000, 1000)\n",
+      "(array([], dtype=int64), array([], dtype=int64))\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/xinyan/.conda/envs/py3/lib/python3.6/site-packages/ipykernel_launcher.py:3: RuntimeWarning: invalid value encountered in true_divide\n",
+      "  This is separate from the ipykernel package so we can avoid doing imports until\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(knnd.shape)\n",
+    "print(bound.shape)\n",
+    "idx = np.where( bound/knnd == 1.6344086021505377)\n",
+    "print(idx)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pairs = list((or_strs[i], or_strs[j]) for i, j  in zip(idx[0], idx[1]))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[]"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "pairs"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 1
+}
@@ -15,18 +15,20 @@ def f(x):
     return [Levenshtein.distance(a, b) for b in B]
 
 
-def all_pair_distance(A, B, n_thread):
+def all_pair_distance(A, B, n_thread, progress=True):
+    bar = tqdm if progress else lambda iterable,total,desc : iterable
     def all_pair(A, B, n_thread):
         with Pool(n_thread) as pool:
             start_time = time.time()
             edit = list(
-                tqdm(
+                bar(
                     pool.imap(f, zip(A, [B for _ in A])),
                     total=len(A),
                     desc="# edit distance {}x{}".format(len(A), len(B)),
                 )
             )
-            print("# Calculate edit distance time: {}".format(time.time() - start_time))
+            if progress:
+                print("# Calculate edit distance time: {}".format(time.time() - start_time))
             return np.array(edit)
 
     if len(A) < len(B):
@@ -62,17 +64,20 @@ def word2sig(lines, max_length=None):
 
     all_chars = dict()
     all_chars["counter"] = 0
+    alphabet = ''
 
     def to_ord(c):
         nonlocal all_chars
+        nonlocal alphabet
         if not (c in all_chars):
+            alphabet += c
             all_chars[c] = all_chars["counter"]
             all_chars["counter"] = all_chars["counter"] + 1
         return all_chars[c]
 
     x = [[to_ord(c) for c in line] for line in lines]
 
-    return all_chars["counter"], max_length, x
+    return all_chars["counter"], max_length, x, alphabet
 
 
 def ivecs_read(file):
 
@@ -0,0 +1,35 @@
+import os
+import numpy as np
+from main import get_args
+from nns import linear_fit
+from embed_cgk import random_seed, cgk_string, distance
+
+
+threshold = 1000  # only pairs whose reference distance is below this are used
+args, data_handler, data_file = get_args()
+train_dist, query_dist = data_handler.train_dist, data_handler.query_dist
+train_idx = np.where(train_dist < threshold)
+query_idx = np.where(query_dist < threshold)
+
+# Cache the distance matrices of the CGK-embedded strings inside
+# cgk_dist/<dataset>/ and recompute them unless both files are already there.
+dis_dir = "cgk_dist/{}".format(args.dataset)
+os.makedirs(dis_dir, exist_ok=True)
+train_path = os.path.join(dis_dir, "train_dist_hm.npy")
+query_path = os.path.join(dis_dir, "query_dist_hm.npy")
+if not (os.path.isfile(train_path) and os.path.isfile(query_path)):
+    h = random_seed(data_handler.M, data_handler.C)
+    xq = cgk_string(h, data_handler.xq.sig, data_handler.M)
+    xt = cgk_string(h, data_handler.xt.sig, data_handler.M)
+    xb = cgk_string(h, data_handler.xb.sig, data_handler.M)
+    train_dist_hm = distance(xt, xt)
+    query_dist_hm = distance(xq, xb)
+    np.save(train_path, train_dist_hm)
+    np.save(query_path, query_dist_hm)
+else:
+    train_dist_hm = np.load(train_path)
+    query_dist_hm = np.load(query_path)
+# linear_fit(..., deg=2) yields a callable taking embedded distances; it is
+# fitted on the train pairs, then its mean relative error on queries is printed.
+l2ed_gru = linear_fit(train_dist_hm[train_idx], train_dist[train_idx], deg=2)
+print(np.mean(np.abs(l2ed_gru(query_dist_hm[query_idx]) / query_dist[query_idx] - 1.0)))
@@ -0,0 +1,94 @@
+import numpy as np
+from utils import l2_dist
+from nns import linear_fit, load_vec, get_args
+
+
+
+threshold = 1000  # upper x/y axis limit of the plot below
+
+args = get_args()  # only args.dataset is read by the active code
+
+# args.embed = 'gru'
+# xq_gru, xb_gru, xt_gru, train_dist, query_dist = load_vec(args)
+# query_dist = query_dist[:, :50000]
+# xb_gru = xb_gru[:50000, :]
+#
+# train_idx = np.where(train_dist < threshold)
+# query_idx = np.where(query_dist < threshold)
+#
+# train_dist_l2_gru = l2_dist(xt_gru, xt_gru)
+#
+# l2ed_gru = linear_fit(
+#     train_dist_l2_gru[train_idx],
+#     train_dist[train_idx], deg=1)
+#
+# query_dist_l2_gru = l2_dist(xq_gru, xb_gru)
+# print(np.mean(np.abs(l2ed_gru(query_dist_l2_gru[query_idx]) / query_dist[query_idx] - 1.0)))
+
+
+# args.embed = 'cnn'
+# xq_cnn, xb_cnn, xt_cnn,train_dist, query_dist = load_vec(args)
+# query_dist = query_dist[:, :50000]
+# xb_cnn = xb_cnn[:50000, :]
+#
+# train_idx = np.where(train_dist < threshold)
+# query_idx = np.where(query_dist < threshold)
+#
+# print("# training all pair distance")
+# train_dist_l2_cnn = l2_dist(xt_cnn, xt_cnn)
+# print("# training all pair distance fitting to edit distance")
+# l2ed_cnn = linear_fit(
+#     train_dist_l2_cnn[train_idx],
+#     train_dist[train_idx],
+#     deg=1)
+# print("# query all pair distance")
+# query_dist_l2_cnn = l2_dist(xq_cnn, xb_cnn)
+# print("# fitting errors")
+# print(np.mean(np.abs(l2ed_cnn(query_dist_l2_cnn)[query_idx] / query_dist[query_idx] - 1.0)))
+
+
+import matplotlib.pyplot as plt
+fontsize = 44  # axis-label font size (xlabel / ylabel)
+ticksize = 40  # tick-label font size, applied in _plot_setting
+labelsize = 35  # font size of the dataset-name text drawn on the axes
+legendsize = 30  # not referenced by the active code in this script
+plt.style.use("seaborn-white")  # NOTE(review): matplotlib >= 3.6 renamed this to "seaborn-v0_8-white" - confirm target version
+
+W = 12.0  # figure width in inches (see _plot_setting)
+H = 9.5  # figure height in inches (see _plot_setting)
+def _plot_setting():
+    """Apply the shared tick font size, figure size and subplot margins."""
+    for set_ticks in (plt.yticks, plt.xticks):
+        set_ticks(fontsize=ticksize)
+    figure = plt.gcf()
+    figure.set_size_inches(W, H)
+    margins = dict(
+        top=0.976, bottom=0.141,
+        left=0.133, right=0.988,
+        hspace=0.2, wspace=0.2,
+    )
+    plt.subplots_adjust(**margins)
+print("# plotting")
+# idx = np.random.choice(np.size(query_dist[query_idx]), threshold)
+
+# plt.scatter(query_dist[query_idx].reshape(-1)[idx],
+#             l2ed_gru(query_dist_l2_gru[query_idx].reshape(-1))[idx], color="blue")
+
+# plt.scatter(query_dist[query_idx].reshape(-1)[idx],
+#             l2ed_cnn(query_dist_l2_cnn[query_idx].reshape(-1))[idx], color="red")
+# plt.scatter(query_dist[query_idx].reshape(-1)[idx],
+#             query_dist[query_idx].reshape(-1)[idx], color="black")
+
+plt.xlim(left=-10, right=threshold)  # both axes span [-10, threshold]
+plt.ylim(bottom=-10, top=threshold)
+_plot_setting()
+plt.xlabel("True Edit Distance", fontsize=fontsize)
+plt.ylabel("Estimated Edit Distance", fontsize=fontsize)
+plt.text(
+    x=0.02, y=0.8, color='blue',
+    s=args.dataset.upper(),  # upper-cased dataset name drawn inside the plot
+    fontsize=labelsize,
+    transform=plt.subplot().transAxes  # x/y above are axes fractions, not data units
+)
+# plt.savefig("/home/xinyan/Dropbox/project/Yan Xiao Paper/string-embedding/figures/distance_estimation_{}.pdf".format(args.dataset))
+plt.show()
@@ -50,9 +50,12 @@ def cnn_embedding(args, h, data_file):
     xq = _batch_embed(args, model.embedding_net, h.xq, device)
     print("# Embedding time: " + str(embed_time))
     if args.save_embed:
-        np.save("{}/embedding_xb".format(data_file), xb)
-        np.save("{}/embedding_xt".format(data_file), xt)
-        np.save("{}/embedding_xq".format(data_file), xq)
+        if args.embed_dir != "":
+            args.embed_dir = args.embed_dir + "/"
+        os.makedirs("{}/{}".format(data_file, args.embed_dir), exist_ok=True)
+        np.save("{}/{}embedding_xb".format(data_file, args.embed_dir), xb)
+        np.save("{}/{}embedding_xt".format(data_file, args.embed_dir), xt)
+        np.save("{}/{}embedding_xq".format(data_file, args.embed_dir), xq)
 
     if args.recall:
         test_recall(xb, xq, h.query_knn)