1
+ {
2
+ "nbformat" : 4 ,
3
+ "nbformat_minor" : 0 ,
4
+ "metadata" : {
5
+ "colab" : {
6
+ "name" : " cosine ranking.ipynb" ,
7
+ "provenance" : [],
8
+ "collapsed_sections" : []
9
+ },
10
+ "kernelspec" : {
11
+ "name" : " python3" ,
12
+ "display_name" : " Python 3"
13
+ },
14
+ "language_info" : {
15
+ "name" : " python"
16
+ }
17
+ },
18
+ "cells" : [
19
+ {
20
+ "cell_type" : " code" ,
21
+ "metadata" : {
22
+ "id" : " tONK6x7lDQGn"
23
+ },
24
+ "source" : [
25
+ " import math\n " ,
26
+ " import pandas as pd\n " ,
27
+ " from matplotlib import pyplot as plt"
28
+ ],
29
+ "execution_count" : 4 ,
30
+ "outputs" : []
31
+ },
32
+ {
33
+ "cell_type" : " code" ,
34
+ "metadata" : {
35
+ "colab" : {
36
+ "base_uri" : " https://localhost:8080/" ,
37
+ "height" : 406
38
+ },
39
+ "id" : " rQ8-lSDiDG2K" ,
40
+ "outputId" : " f653941e-7fff-46f5-cec2-d0010feb83f4"
41
+ },
42
+ "source" : [
43
+ " \n " ,
44
+ " document_split=lambda document_list: [x.split() for x in document_list] \n " ,
45
+ " \n " ,
46
+ " frequency= lambda terms , doc_list : [doc_list.count(x) for x in terms ]\n " ,
47
+ " \n " ,
48
+ " find_frequency = lambda terms , doc_list : [ frequency(terms,x) for x in doc_list]\n " ,
49
+ " \n " ,
50
+ " def idfi(dfi,D):\n " ,
51
+ " l=[]\n " ,
52
+ " for x in dfi:\n " ,
53
+ " if(x==0):\n " ,
54
+ " l.append(0)\n " ,
55
+ " else:\n " ,
56
+ " l.append(math.log(D/x,10))\n " ,
57
+ " return l\n " ,
58
+ " \n " ,
59
+ " def dfi(docs):\n " ,
60
+ " l=[]\n " ,
61
+ " for x in range(len(docs[0])):\n " ,
62
+ " sum=0\n " ,
63
+ " for y in range(len(docs)):\n " ,
64
+ " if(docs[y][x]>0):\n " ,
65
+ " sum=sum+1\n " ,
66
+ " l.append(sum)\n " ,
67
+ " print(\" dfi\" ,l)\n " ,
68
+ " return idfi(l,len(docs))\n " ,
69
+ " \n " ,
70
+ " def dot_product(lis1,lis2):\n " ,
71
+ " s=0\n " ,
72
+ " for x in range(len(lis1)):\n " ,
73
+ " s=s +lis1[x]*lis2[x]\n " ,
74
+ " return s\n " ,
75
+ " \n " ,
76
+ " weight=lambda doc_f,idfi: [doc_f[x]*idfi[x] for x in range(len(doc_f))] #formula: term frequency of documment * Inverse document frequency\n " ,
77
+ " \n " ,
78
+ " magnitude=lambda lis:math.sqrt(sum(list(map(lambda x:x**2,lis)))) #formula : Square_root(x1^2 + x2^2 ...)\n " ,
79
+ " \n " ,
80
+ " cosine=lambda dot,query,doc: dot/(query*doc) #formula : (dot_product(query.docment)) / magnitude(Query) * magnitude(Document)\n " ,
81
+ " \n " ,
82
+ " def ranks(idfi,docs_f,q_f): #takes idfi, term frequency of doumnents and term frequency of query and return ranks\n " ,
83
+ " query_weight=weight(q_f,idfi)\n " ,
84
+ " query_magnitude=magnitude(query_weight)\n " ,
85
+ " docs_weight=[weight(x,idfi) for x in docs_f ]\n " ,
86
+ " print('documents weight:',docs_weight)\n " ,
87
+ " docs_magnitude=[magnitude(x) for x in docs_weight]\n " ,
88
+ " print('documents magnitude:',docs_magnitude)\n " ,
89
+ " docs_dotproduct=[dot_product(query_weight,x) for x in docs_weight]\n " ,
90
+ " print('documents dot-product:',docs_dotproduct)\n " ,
91
+ " docs_cosine=[cosine(docs_dotproduct[x],query_magnitude,docs_magnitude[x] ) for x in range(len(docs_dotproduct))]\n " ,
92
+ " docs_cosine={ f\" Doc {x+1}\" :docs_cosine[x] for x in range(len(docs_cosine))}\n " ,
93
+ " print(\" documents cosine:\" ,docs_cosine)\n " ,
94
+ " r={k: v for k, v in sorted(docs_cosine.items(), key=lambda item: item[1],reverse=True )}\n " ,
95
+ " return r\n " ,
96
+ " \n " ,
97
+ " def cosine_ranking_algo(query,docs): #takes Query and List of Documents as input and return ranks in dictionary sorted by ranks\n " ,
98
+ " query=query.lower() \n " ,
99
+ " terms=\" \" .join(docs) + query\n " ,
100
+ " terms=terms.lower()\n " ,
101
+ " terms=terms.split()\n " ,
102
+ " terms=list(dict.fromkeys(terms)) \n " ,
103
+ " terms.sort()\n " ,
104
+ " query=query.split()\n " ,
105
+ " docs=document_split(docs)\n " ,
106
+ " docs_f=find_frequency(terms,docs)\n " ,
107
+ " q_f=frequency(terms,query)\n " ,
108
+ " idfi= dfi(docs_f)\n " ,
109
+ " print(\" idfi:\" ,idfi)\n " ,
110
+ " return ranks(idfi,docs_f,q_f)\n " ,
111
+ " \n " ,
112
+ " query=\" stock exchange pakistan \"\n " ,
113
+ " docs=[\" market of stock exchange is affected by brokers \" ,\n " ,
114
+ " 'pakistan stock market is very popular ',\n " ,
115
+ " \" stock exchange pakistan is in loss now a days \" ]\n " ,
116
+ " \n " ,
117
+ " d=cosine_ranking_algo(query,docs) \n " ,
118
+ " plt.bar(d.keys(),d.values())\n " ,
119
+ " \n "
120
+ ],
121
+ "execution_count" : 5 ,
122
+ "outputs" : [
123
+ {
124
+ "output_type" : " stream" ,
125
+ "name" : " stdout" ,
126
+ "text" : [
127
+ " dfi [1, 1, 1, 1, 1, 2, 1, 3, 1, 2, 1, 1, 2, 1, 3, 1]\n " ,
128
+ " idfi: [0.47712125471966244, 0.47712125471966244, 0.47712125471966244, 0.47712125471966244, 0.47712125471966244, 0.17609125905568124, 0.47712125471966244, 0.0, 0.47712125471966244, 0.17609125905568124, 0.47712125471966244, 0.47712125471966244, 0.17609125905568124, 0.47712125471966244, 0.0, 0.47712125471966244]\n " ,
129
+ " documents weight: [[0.0, 0.47712125471966244, 0.47712125471966244, 0.47712125471966244, 0.0, 0.17609125905568124, 0.0, 0.0, 0.0, 0.17609125905568124, 0.0, 0.47712125471966244, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.17609125905568124, 0.0, 0.0, 0.17609125905568124, 0.47712125471966244, 0.0, 0.47712125471966244], [0.47712125471966244, 0.0, 0.0, 0.0, 0.47712125471966244, 0.17609125905568124, 0.47712125471966244, 0.0, 0.47712125471966244, 0.0, 0.47712125471966244, 0.0, 0.17609125905568124, 0.0, 0.0, 0.0]]\n " ,
130
+ " documents magnitude: [0.9862023270367446, 0.7192396307505309, 1.095554526967031]\n " ,
131
+ " documents dot-product: [0.031008131515815038, 0.031008131515815038, 0.062016263031630076]\n " ,
132
+ " documents cosine: {'Doc 1': 0.1262573814443149, 'Doc 2': 0.1731207765289807, 'Doc 3': 0.22731013440410225}\n "
133
+ ]
134
+ },
135
+ {
136
+ "output_type" : " execute_result" ,
137
+ "data" : {
138
+ "text/plain" : [
139
+ " <BarContainer object of 3 artists>"
140
+ ]
141
+ },
142
+ "metadata" : {},
143
+ "execution_count" : 5
144
+ },
145
+ {
146
+ "output_type" : " display_data" ,
147
+ "data" : {
148
+ "text/plain" : [
149
+ " <Figure size 432x288 with 1 Axes>"
150
+ ],
151
+ "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXoAAAD4CAYAAADiry33AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAANKUlEQVR4nO3db4xld13H8ffHXVsSCVjo8MD+YRbZKmswNBmWB2oNAcoSTFdjG7dCLEqyNnH1QSW6hqRNlpgUMGoiVVrDGlLbtKWNySasKY2gMZriTqGAW9gwrP2zjUkXtkIISNn264M5kOtkypx27t3Z/c77lUx6zzm/c/q7Pd33nD333plUFZKkvn5soycgSZotQy9JzRl6SWrO0EtSc4ZekprbutETWOnCCy+s+fn5jZ6GJJ1THnrooa9X1dxq28660M/Pz7O4uLjR05Ckc0qSx55vm7duJKk5Qy9JzRl6SWrO0EtSc4Zekpoz9JLUnKGXpOYMvSQ1Z+glqbmz7pOx6zW//5MbPYW2Hr35nRs9BUkvglf0ktScoZek5gy9JDVn6CWpOUMvSc0ZeklqztBLUnOGXpKaM/SS1Jyhl6TmDL0kNWfoJak5Qy9JzRl6SWrO0EtSc4Zekpoz9JLUnKGXpOYMvSQ1Z+glqblRoU+yK8mxJEtJ9q+y/YYkjyT5YpJ/SvLqiW3XJfnq8HXdNCcvSVrbmqFPsgW4BXgHsAO4NsmOFcM+DyxU1c8D9wIfGvZ9BXAT8CZgJ3BTkgumN31J0lrGXNHvBJaq6nhVPQPcBeyeHFBVn6mq7wyLDwIXD4/fDjxQVaeq6mngAWDXdKYuSRpjTOgvAp6YWD4xrHs+7wX+8UXuK0masq3TPFiSdwMLwC+/wP32AnsBLr300mlOSZI2vTFX9E8Cl0wsXzys+3+SvBV4P3BVVX3vhexbVbdV1UJVLczNzY2duyRphDGhPwJsT7ItyXnAHuDQ5IAklwO3shz5pyY23Q9cmeSC4UXYK4d1kqQzZM1bN1V1Osk+lgO9BThYVUeTHAAWq+oQ8GHgpcAnkgA8XlVXVdWpJB9g+ZsFwIGqOjWTZyJJWtWoe/RVdRg4vGLdjROP3/oj9j0IHHyxE5QkrY+fjJWk5gy9JDVn6CWpOUMvSc0ZeklqztBLUnOGXpKaM/SS1Jyhl6TmDL0kNWfoJak5Qy9JzRl6SWrO0EtSc4Zekpoz9JLUnKGXpOYMvSQ1Z+glqblRvzNWmpX5/Z/c6Cm09ejN79zoKegs4RW9JDVn6CWpOUMvSc0ZeklqztBLUnOGXpKaM/SS1Jyhl6TmDL0kNWfoJak5Qy9JzRl6SWrO0EtSc4Zekpoz9JLUnKGXpOYMvSQ1Nyr0SXYlOZZkKcn+VbZfkeRzSU4nuXrFtmeTPDx8HZrWxCVJ46z5qwSTbAFuAd4GnACOJDlUVY9MDHsceA/wvlUO8d2qesMU5ipJehHG/M7YncBSVR0HSHIXsBv4Yeir6tFh23MzmKMkaR3G3Lq5CHhiYvnEsG6slyRZTPJgkl9dbUCSvcOYxZMnT76AQ0uS1nImXox9dVUtAL8J/GWSn145oKpuq6qFqlqYm5s7A1OSpM1jTOifBC6ZWL54WDdKVT05/PM48M/A5S9gfpKkdRoT+iPA9iTbkpwH7AFGvXsmyQVJzh8eXwj8AhP39iVJs7dm6KvqNLAPuB/4MnBPVR1NciDJVQBJ3pjkBHANcGuSo8PurwMWk3wB+Axw84p360iSZmzMu26oqsPA4RXrbpx4fITlWzor9/t34PXrnKMkaR38ZKwkNWfoJak5Qy9JzRl6SWrO0EtSc4Zekpoz9JLUnKGXpOYMvSQ1Z+glqTlDL0nNGXpJas7QS1Jzhl6SmjP0ktScoZek5kb94hFJ+oH5/Z/c6Cm09ejN75zJcb2il6TmDL0kNWfoJak5Qy9JzRl6SWrO0EtSc4Zekpoz9JLUnKGXpOYMvSQ1Z+glqTlDL0nNGXpJas7QS1Jzhl6SmjP0ktScoZek5gy9JDVn6CWpOUMvSc2NCn2SXUmOJVlKsn+V7Vck+VyS00muXrHtuiRfHb6um9bEJUnjrBn6JFuAW4B3ADuAa5PsWDHsceA9wJ0r9n0FcBPwJmAncFOSC9Y/bUnSWGOu6HcCS1V1vKqeAe4Cdk8OqKpHq+qLwHMr9n078EBVnaqqp4EHgF1TmLckaaQxob8IeGJi+cSwboxR+ybZm2QxyeLJkydHHlqSNMZZ8WJsVd1WVQtVtTA3N7fR05GkVsaE/kngkonli4d1Y6xnX0nSFIwJ/RFge5JtSc4D9gCHRh7/fuDKJBcML8JeOayTJJ0ha4a+qk4D+1gO9JeBe6rqaJIDSa4CSPLGJCeAa4Bbkxwd9j0FfIDlbxZHgAPDOknSGbJ1zKCqOgwcXrHuxonHR1i+LbPavgeBg+uYoyRpHc6KF2MlSbNj6CWpOUMvSc0ZeklqztBLUnOGXpKaM/SS1Jyhl6TmDL0kNWfoJak5Qy9JzRl6SWrO0EtSc4Zekpoz9JLUnKGXpOYMvSQ1Z+glqTlDL0nNGXpJas7QS1Jzhl6SmjP0ktScoZek5gy9JDVn6CWpOUMvSc0ZeklqztBLUnOGXpKaM/SS1Jyhl6TmDL0kNWfoJak5Qy9JzRl6SWpuVOiT7EpyLMlSkv2rbD8/yd3D9s8mmR/Wzyf5bpKHh6+PTnf6kqS1bF1rQJItwC3A24ATwJEkh6rqkYlh7wWerqrXJtkDfBD4jWHb16rqDVOetyRppDFX9DuBpao6XlXPAHcBu1eM2Q18fHh8L/CWJJneNCVJL9aY0F8EPDGxfGJYt+qYqjoNfBN45bBtW5LPJ/mXJL+02r8gyd4ki0kWT548+YKegCTpR5v1i7H/DVxaVZcDNwB3JnnZykFVdVtVLVTVwtzc3IynJEmby5jQPwlcMrF88bBu1TFJtgIvB75RVd+rqm8AVNVDwNeAy9Y7aUnSeGNCfwTYnmRbkvOAPcChFWMOAdcNj68GPl1VlWRueDGXJK8BtgPHpzN1SdIYa77rpqpOJ9kH3A9sAQ5W1dEkB4DFqjoEfAy4PckScIrlbwYAVwAHknwfeA64vqpOzeKJSJJWt2boAarqMHB4xbobJx7/L3DNKvvdB9y3zjlKktbBT8ZKUnOGXpKaM/SS1Jyhl6TmDL0kNWfoJak5Qy9JzRl6SWrO0EtSc4Zekpoz9JLUnKGXpOYMvSQ1Z+glqTlDL0nNGXpJas7QS1Jzhl6SmjP0ktScoZek5gy9JDVn6CWpOUMvSc0ZeklqztBLUnOGXpKaM/SS1Jyhl6TmDL0kNWfoJak5Qy9JzRl6SWrO0EtSc4Zekpoz9JLUnKGXpOYMvSQ1Nyr0SXYlOZZkKcn+Vbafn+TuYftnk8xPbPuTYf2xJG+f3tQlSWOsGfokW4BbgHcAO4Brk+xYMey9wNNV9VrgL4APDvvuAPYAPwfsAv56OJ4k6QwZc0W/E1iqquNV9QxwF7B7xZjdwMeHx/cCb0mSYf1dVfW9qvovYGk4niTpDNk6YsxFwBMTyyeANz3fmKo6neSbwCuH9Q+u2Peilf+CJHuBvcPit5McGzX7c9+FwNc3ehJj5YMbPYOzwjlzzjxfP7RZztmrn2/DmNDPXFXdBty20fM405IsVtXCRs9D43nOzj2es3G3bp4ELplYvnhYt+qYJFuBlwPfGLmvJGmGxoT+CLA9ybYk57H84uqhFWMOAdcNj68GPl1VNazfM7wrZxuwHfiP6UxdkjTGmrduhnvu+4D7gS3Awao6muQAsFhVh4CPAbcnWQJOsfzNgGHcPcAjwGng96rq2Rk9l3PRprtd1YDn7Nyz6c9Zli+8JUld+clYSWrO0EtSc4Z+SpI8m+ThJEeTfCHJHyZZ93/fJB9I8sXh2J9K8lPTmO9mN8Pz9eEkXxnO2T8k+clpzFczPWfXDMd8LknLt2F6j35Kkny7ql46PH4VcCfwb1V10zqP+7Kq+tbw+A+AHVV1/bonvMnN8HxdyfK7zk4nyx9/qao/XveENctz9jrgOeBW4H1VtbjuyZ5lvKKfgap6iuVP+u7Lspck+bskX0ry+SRvhuWfI5Tkz5L853AF+PurHOtbE4s/AfidecqmfL4+VVWnh8UHWf7siKZsyufsy1XV+tP4Z8UnYzuqquPDD3B7FfDu5VX1+iQ/C3wqyWXAbwPzwBuGK8BXrHasJH8K/BbwTeDNZ+QJbDLTPF8Tfge4e5bz3sxmdM5a8or+zPhF4O8BquorwGPAZcBbgVt/cAVYVadW27mq3l9VlwB3APvOyIw3t3WdL4Ak72f5syN3zHy2gimcs84M/YwkeQ3wLPDUFA97B/DrUzyeBtM8X0neA/wK8K7yRbCZmdGfsZYM/QwkmQM+Cnxk+IP+r8C7hm2XAZcCx4AHgN8dfj4Qq/21Msn2icXdwFdmO/vNZ8rnaxfwR8BVVfWdM/MMNp9pnrPNwHfdTEmSZ4EvAT/O8l/Zbwf+vKqeS/IS4G+AhWHbDVX1meF/vg+x/EtZvg/8bVV9ZMVx7wN+huV3BTwGXF9V/mC4dZrh+VoCzmf5h/oBPOi7pKZjhufs14C/AuaA/wEerqpWvw3P0EtSc966kaTmDL0kNWfoJak5Qy9JzRl6SWrO0EtSc4Zekpr7P8cC0RCBZ9GPAAAAAElFTkSuQmCC\n"
152
+ },
153
+ "metadata" : {
154
+ "needs_background" : " light"
155
+ }
156
+ }
157
+ ]
158
+ }
159
+ ]
160
+ }
0 commit comments