Skip to content

Commit 1f72822

Browse files
committed
cosine similarity algo
0 parents  commit 1f72822

File tree

1 file changed

+160
-0
lines changed

1 file changed

+160
-0
lines changed

cosine_ranking.ipynb

+160
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,160 @@
1+
{
2+
"nbformat": 4,
3+
"nbformat_minor": 0,
4+
"metadata": {
5+
"colab": {
6+
"name": "cosine ranking.ipynb",
7+
"provenance": [],
8+
"collapsed_sections": []
9+
},
10+
"kernelspec": {
11+
"name": "python3",
12+
"display_name": "Python 3"
13+
},
14+
"language_info": {
15+
"name": "python"
16+
}
17+
},
18+
"cells": [
19+
{
20+
"cell_type": "code",
21+
"metadata": {
22+
"id": "tONK6x7lDQGn"
23+
},
24+
"source": [
25+
"import math\n",
26+
"import pandas as pd\n",
27+
"from matplotlib import pyplot as plt"
28+
],
29+
"execution_count": 4,
30+
"outputs": []
31+
},
32+
{
33+
"cell_type": "code",
34+
"metadata": {
35+
"colab": {
36+
"base_uri": "https://localhost:8080/",
37+
"height": 406
38+
},
39+
"id": "rQ8-lSDiDG2K",
40+
"outputId": "f653941e-7fff-46f5-cec2-d0010feb83f4"
41+
},
42+
"source": [
43+
"\n",
44+
"document_split=lambda document_list: [document.split() for document in document_list]  # whitespace-tokenize every document string\n",
45+
"\n",
46+
"frequency=lambda terms,doc_list: [doc_list.count(term) for term in terms]  # occurrences of each vocabulary term in one tokenized document\n",
47+
"\n",
48+
"find_frequency=lambda terms,doc_list: [frequency(terms,tokens) for tokens in doc_list]  # one term-frequency row per tokenized document\n",
49+
"\n",
50+
"def idfi(dfi,D):\n",
51+
"    \"\"\"Convert document frequencies into inverse document frequencies.\n",
52+
"\n",
53+
"    dfi: number of documents containing each term; D: total number of documents.\n",
54+
"    Returns log10(D/df) per term, or 0 for a term that occurs in no document.\n",
55+
"    \"\"\"\n",
56+
"    # math.log10 is more accurate than math.log(x,10); df==0 is special-cased to avoid dividing by zero\n",
57+
"    return [0 if df==0 else math.log10(D/df) for df in dfi]\n",
58+
"\n",
59+
"def dfi(docs):\n",
60+
"    \"\"\"Print every term's document frequency and return the matching idf values.\n",
61+
"\n",
62+
"    docs: one term-frequency row per document, all rows in the same term order.\n",
63+
"    \"\"\"\n",
64+
"    # document frequency of a term = how many rows hold a positive count for it\n",
65+
"    doc_freq=[sum(1 for row in docs if row[col]>0) for col in range(len(docs[0]))]\n",
66+
"    print(\"dfi\",doc_freq)\n",
67+
"    # the row count is the corpus size that idfi() needs\n",
68+
"    return idfi(doc_freq,len(docs))\n",
69+
"\n",
70+
"def dot_product(lis1,lis2):  # sum of lis1[i]*lis2[i] over every index i of lis1\n",
71+
"    total=0\n",
72+
"    for position,left in enumerate(lis1):\n",
73+
"        total=total+left*lis2[position]\n",
74+
"    return total\n",
75+
"\n",
76+
"weight=lambda doc_f,idfi: [tf*idfi[i] for i,tf in enumerate(doc_f)] #formula: term frequency of document * inverse document frequency\n",
77+
"\n",
78+
"magnitude=lambda lis: math.sqrt(sum([component**2 for component in lis])) #formula : square_root(x1^2 + x2^2 + ...)\n",
79+
"\n",
80+
"cosine=lambda dot,query,doc: dot/(query*doc) if query*doc else 0.0 #formula : dot_product(query, document) / (magnitude(query) * magnitude(document)); 0.0 when either magnitude is 0 so an all-zero vector cannot raise ZeroDivisionError\n",
81+
"\n",
82+
"def ranks(idfi,docs_f,q_f): #takes the idf list, the documents' term frequencies and the query's term frequencies; returns the cosine score per document label, best match first\n",
83+
"    query_weight=weight(q_f,idfi)\n",
84+
"    query_magnitude=magnitude(query_weight)\n",
85+
"    docs_weight=[weight(doc_tf,idfi) for doc_tf in docs_f]\n",
86+
"    print('documents weight:',docs_weight)\n",
87+
"    docs_magnitude=list(map(magnitude,docs_weight))\n",
88+
"    print('documents magnitude:',docs_magnitude)\n",
89+
"    docs_dotproduct=[dot_product(query_weight,doc_weight) for doc_weight in docs_weight]\n",
90+
"    print('documents dot-product:',docs_dotproduct)\n",
91+
"    # label documents 1..N in input order while scoring each one against the query\n",
92+
"    docs_cosine={f\"Doc {n}\":cosine(dot,query_magnitude,mag) for n,(dot,mag) in enumerate(zip(docs_dotproduct,docs_magnitude),start=1)}\n",
93+
"    print(\"documents cosine:\",docs_cosine)\n",
94+
"    # stable descending sort on the score; dict insertion order keeps the result best-first\n",
95+
"    return dict(sorted(docs_cosine.items(),key=lambda item:item[1],reverse=True))\n",
96+
"\n",
97+
"def cosine_ranking_algo(query,docs): #takes a query string and a list of document strings; returns the cosine score per document label, sorted best match first\n",
98+
"    # normalize case once so the query, the documents and the vocabulary all compare equal\n",
99+
"    query=query.lower()\n",
100+
"    docs=[doc.lower() for doc in docs]\n",
101+
"    # vocabulary: sorted unique words of all documents plus the query\n",
102+
"    # (the query is joined as its own item so its first word cannot fuse with the last document word)\n",
103+
"    terms=sorted(set(\" \".join([*docs,query]).split()))\n",
104+
"    query=query.split()\n",
105+
"    docs=document_split(docs)\n",
106+
"    docs_f=find_frequency(terms,docs)\n",
107+
"    q_f=frequency(terms,query)\n",
108+
"    idfi=dfi(docs_f)\n",
109+
"    print(\"idfi:\",idfi)\n",
110+
"    return ranks(idfi,docs_f,q_f)\n",
111+
"\n",
112+
"query=\"stock exchange pakistan \"\n",
113+
"docs=[\"market of stock exchange is affected by brokers \",\n",
114+
"      'pakistan stock market is very popular ',\n",
115+
"      \"stock exchange pakistan is in loss now a days \"]\n",
116+
"# rank the three sample documents against the query, then chart the score of each document\n",
117+
"d=cosine_ranking_algo(query,docs)\n",
118+
"plt.bar(d.keys(),d.values())\n",
119+
"\n"
120+
],
121+
"execution_count": 5,
122+
"outputs": [
123+
{
124+
"output_type": "stream",
125+
"name": "stdout",
126+
"text": [
127+
"dfi [1, 1, 1, 1, 1, 2, 1, 3, 1, 2, 1, 1, 2, 1, 3, 1]\n",
128+
"idfi: [0.47712125471966244, 0.47712125471966244, 0.47712125471966244, 0.47712125471966244, 0.47712125471966244, 0.17609125905568124, 0.47712125471966244, 0.0, 0.47712125471966244, 0.17609125905568124, 0.47712125471966244, 0.47712125471966244, 0.17609125905568124, 0.47712125471966244, 0.0, 0.47712125471966244]\n",
129+
"documents weight: [[0.0, 0.47712125471966244, 0.47712125471966244, 0.47712125471966244, 0.0, 0.17609125905568124, 0.0, 0.0, 0.0, 0.17609125905568124, 0.0, 0.47712125471966244, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.17609125905568124, 0.0, 0.0, 0.17609125905568124, 0.47712125471966244, 0.0, 0.47712125471966244], [0.47712125471966244, 0.0, 0.0, 0.0, 0.47712125471966244, 0.17609125905568124, 0.47712125471966244, 0.0, 0.47712125471966244, 0.0, 0.47712125471966244, 0.0, 0.17609125905568124, 0.0, 0.0, 0.0]]\n",
130+
"documents magnitude: [0.9862023270367446, 0.7192396307505309, 1.095554526967031]\n",
131+
"documents dot-product: [0.031008131515815038, 0.031008131515815038, 0.062016263031630076]\n",
132+
"documents cosine: {'Doc 1': 0.1262573814443149, 'Doc 2': 0.1731207765289807, 'Doc 3': 0.22731013440410225}\n"
133+
]
134+
},
135+
{
136+
"output_type": "execute_result",
137+
"data": {
138+
"text/plain": [
139+
"<BarContainer object of 3 artists>"
140+
]
141+
},
142+
"metadata": {},
143+
"execution_count": 5
144+
},
145+
{
146+
"output_type": "display_data",
147+
"data": {
148+
"text/plain": [
149+
"<Figure size 432x288 with 1 Axes>"
150+
],
151+
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAXoAAAD4CAYAAADiry33AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAANKUlEQVR4nO3db4xld13H8ffHXVsSCVjo8MD+YRbZKmswNBmWB2oNAcoSTFdjG7dCLEqyNnH1QSW6hqRNlpgUMGoiVVrDGlLbtKWNySasKY2gMZriTqGAW9gwrP2zjUkXtkIISNn264M5kOtkypx27t3Z/c77lUx6zzm/c/q7Pd33nD333plUFZKkvn5soycgSZotQy9JzRl6SWrO0EtSc4ZekprbutETWOnCCy+s+fn5jZ6GJJ1THnrooa9X1dxq28660M/Pz7O4uLjR05Ckc0qSx55vm7duJKk5Qy9JzRl6SWrO0EtSc4Zekpoz9JLUnKGXpOYMvSQ1Z+glqbmz7pOx6zW//5MbPYW2Hr35nRs9BUkvglf0ktScoZek5gy9JDVn6CWpOUMvSc0ZeklqztBLUnOGXpKaM/SS1Jyhl6TmDL0kNWfoJak5Qy9JzRl6SWrO0EtSc4Zekpoz9JLUnKGXpOYMvSQ1Z+glqblRoU+yK8mxJEtJ9q+y/YYkjyT5YpJ/SvLqiW3XJfnq8HXdNCcvSVrbmqFPsgW4BXgHsAO4NsmOFcM+DyxU1c8D9wIfGvZ9BXAT8CZgJ3BTkgumN31J0lrGXNHvBJaq6nhVPQPcBeyeHFBVn6mq7wyLDwIXD4/fDjxQVaeq6mngAWDXdKYuSRpjTOgvAp6YWD4xrHs+7wX+8UXuK0masq3TPFiSdwMLwC+/wP32AnsBLr300mlOSZI2vTFX9E8Cl0wsXzys+3+SvBV4P3BVVX3vhexbVbdV1UJVLczNzY2duyRphDGhPwJsT7ItyXnAHuDQ5IAklwO3shz5pyY23Q9cmeSC4UXYK4d1kqQzZM1bN1V1Osk+lgO9BThYVUeTHAAWq+oQ8GHgpcAnkgA8XlVXVdWpJB9g+ZsFwIGqOjWTZyJJWtWoe/RVdRg4vGLdjROP3/oj9j0IHHyxE5QkrY+fjJWk5gy9JDVn6CWpOUMvSc0ZeklqztBLUnOGXpKaM/SS1Jyhl6TmDL0kNWfoJak5Qy9JzRl6SWrO0EtSc4Zekpoz9JLUnKGXpOYMvSQ1Z+glqblRvzNWmpX5/Z/c6Cm09ejN79zoKegs4RW9JDVn6CWpOUMvSc0ZeklqztBLUnOGXpKaM/SS1Jyhl6TmDL0kNWfoJak5Qy9JzRl6SWrO0EtSc4Zekpoz9JLUnKGXpOYMvSQ1Nyr0SXYlOZZkKcn+VbZfkeRzSU4nuXrFtmeTPDx8HZrWxCVJ46z5qwSTbAFuAd4GnACOJDlUVY9MDHsceA/wvlUO8d2qesMU5ipJehHG/M7YncBSVR0HSHIXsBv4Yeir6tFh23MzmKMkaR3G3Lq5CHhiYvnEsG6slyRZTPJgkl9dbUCSvcOYxZMnT76AQ0uS1nImXox9dVUtAL8J/GWSn145oKpuq6qFqlqYm5s7A1OSpM1jTOifBC6ZWL54WDdKVT05/PM48M/A5S9gfpKkdRoT+iPA9iTbkpwH7AFGvXsmyQVJzh8eXwj8AhP39iVJs7dm6KvqNLAPuB/4MnBPVR1NciDJVQBJ3pjkBHANcGuSo8PurwMWk3wB+Axw84p360iSZmzMu26oqsPA4RXrbpx4fITlWzor9/t34PXrnKMkaR38ZKwkNWfoJak5Qy9JzRl6SWrO0EtSc4Zekpoz9JLUnKGXpOYMvSQ1Z+glqTlDL0nNGXpJas7QS1Jzhl6SmjP0ktScoZek5kb94hFJ+oH5/Z/c6Cm09ejN75zJcb2il6TmDL0kNWfoJak5Qy9JzRl6SWrO0EtSc4Zekpoz9JLUnK
GXpOYMvSQ1Z+glqTlDL0nNGXpJas7QS1Jzhl6SmjP0ktScoZek5gy9JDVn6CWpOUMvSc2NCn2SXUmOJVlKsn+V7Vck+VyS00muXrHtuiRfHb6um9bEJUnjrBn6JFuAW4B3ADuAa5PsWDHsceA9wJ0r9n0FcBPwJmAncFOSC9Y/bUnSWGOu6HcCS1V1vKqeAe4Cdk8OqKpHq+qLwHMr9n078EBVnaqqp4EHgF1TmLckaaQxob8IeGJi+cSwboxR+ybZm2QxyeLJkydHHlqSNMZZ8WJsVd1WVQtVtTA3N7fR05GkVsaE/kngkonli4d1Y6xnX0nSFIwJ/RFge5JtSc4D9gCHRh7/fuDKJBcML8JeOayTJJ0ha4a+qk4D+1gO9JeBe6rqaJIDSa4CSPLGJCeAa4Bbkxwd9j0FfIDlbxZHgAPDOknSGbJ1zKCqOgwcXrHuxonHR1i+LbPavgeBg+uYoyRpHc6KF2MlSbNj6CWpOUMvSc0ZeklqztBLUnOGXpKaM/SS1Jyhl6TmDL0kNWfoJak5Qy9JzRl6SWrO0EtSc4Zekpoz9JLUnKGXpOYMvSQ1Z+glqTlDL0nNGXpJas7QS1Jzhl6SmjP0ktScoZek5gy9JDVn6CWpOUMvSc0ZeklqztBLUnOGXpKaM/SS1Jyhl6TmDL0kNWfoJak5Qy9JzRl6SWpuVOiT7EpyLMlSkv2rbD8/yd3D9s8mmR/Wzyf5bpKHh6+PTnf6kqS1bF1rQJItwC3A24ATwJEkh6rqkYlh7wWerqrXJtkDfBD4jWHb16rqDVOetyRppDFX9DuBpao6XlXPAHcBu1eM2Q18fHh8L/CWJJneNCVJL9aY0F8EPDGxfGJYt+qYqjoNfBN45bBtW5LPJ/mXJL+02r8gyd4ki0kWT548+YKegCTpR5v1i7H/DVxaVZcDNwB3JnnZykFVdVtVLVTVwtzc3IynJEmby5jQPwlcMrF88bBu1TFJtgIvB75RVd+rqm8AVNVDwNeAy9Y7aUnSeGNCfwTYnmRbkvOAPcChFWMOAdcNj68GPl1VlWRueDGXJK8BtgPHpzN1SdIYa77rpqpOJ9kH3A9sAQ5W1dEkB4DFqjoEfAy4PckScIrlbwYAVwAHknwfeA64vqpOzeKJSJJWt2boAarqMHB4xbobJx7/L3DNKvvdB9y3zjlKktbBT8ZKUnOGXpKaM/SS1Jyhl6TmDL0kNWfoJak5Qy9JzRl6SWrO0EtSc4Zekpoz9JLUnKGXpOYMvSQ1Z+glqTlDL0nNGXpJas7QS1Jzhl6SmjP0ktScoZek5gy9JDVn6CWpOUMvSc0ZeklqztBLUnOGXpKaM/SS1Jyhl6TmDL0kNWfoJak5Qy9JzRl6SWrO0EtSc4Zekpoz9JLUnKGXpOYMvSQ1Nyr0SXYlOZZkKcn+Vbafn+TuYftnk8xPbPuTYf2xJG+f3tQlSWOsGfokW4BbgHcAO4Brk+xYMey9wNNV9VrgL4APDvvuAPYAPwfsAv56OJ4k6QwZc0W/E1iqquNV9QxwF7B7xZjdwMeHx/cCb0mSYf1dVfW9qvovYGk4niTpDNk6YsxFwBMTyyeANz3fmKo6neSbwCuH9Q+u2Peilf+CJHuBvcPit5McGzX7c9+FwNc3ehJj5YMbPYOzwjlzzjxfP7RZztmrn2/DmNDPXFXdBty20fM405IsVtXCRs9D43nOzj2es3G3bp4ELplYvnhYt+qYJFuBlwPfGLmvJGmGxoT+CLA9ybYk57H84uqhFWMOAdcNj68GPl1VNazfM7wrZxuwHfiP6UxdkjTGmrduhnvu+4D7gS3Awao6muQAsFhVh4CPAbcnWQJOsfzNgGHcPcAjwGng96rq2Rk9l3PRprtd1YDn7Nyz6c9Zli+8JUld+clYSWrO0EtSc4Z+SpI8m+ThJEeTfCHJHyZZ93/fJB9I8sXh2J9K8lPTmO9mN8Pz9eEkXxnO2T8k+clpzFczPWfXDMd8LknLt2F6j35Kkn
y7ql46PH4VcCfwb1V10zqP+7Kq+tbw+A+AHVV1/bonvMnN8HxdyfK7zk4nyx9/qao/XveENctz9jrgOeBW4H1VtbjuyZ5lvKKfgap6iuVP+u7Lspck+bskX0ry+SRvhuWfI5Tkz5L853AF+PurHOtbE4s/AfidecqmfL4+VVWnh8UHWf7siKZsyufsy1XV+tP4Z8UnYzuqquPDD3B7FfDu5VX1+iQ/C3wqyWXAbwPzwBuGK8BXrHasJH8K/BbwTeDNZ+QJbDLTPF8Tfge4e5bz3sxmdM5a8or+zPhF4O8BquorwGPAZcBbgVt/cAVYVadW27mq3l9VlwB3APvOyIw3t3WdL4Ak72f5syN3zHy2gimcs84M/YwkeQ3wLPDUFA97B/DrUzyeBtM8X0neA/wK8K7yRbCZmdGfsZYM/QwkmQM+Cnxk+IP+r8C7hm2XAZcCx4AHgN8dfj4Qq/21Msn2icXdwFdmO/vNZ8rnaxfwR8BVVfWdM/MMNp9pnrPNwHfdTEmSZ4EvAT/O8l/Zbwf+vKqeS/IS4G+AhWHbDVX1meF/vg+x/EtZvg/8bVV9ZMVx7wN+huV3BTwGXF9V/mC4dZrh+VoCzmf5h/oBPOi7pKZjhufs14C/AuaA/wEerqpWvw3P0EtSc966kaTmDL0kNWfoJak5Qy9JzRl6SWrO0EtSc4Zekpr7P8cC0RCBZ9GPAAAAAElFTkSuQmCC\n"
152+
},
153+
"metadata": {
154+
"needs_background": "light"
155+
}
156+
}
157+
]
158+
}
159+
]
160+
}

0 commit comments

Comments
 (0)