
Commit bc7b373

Added files via upload
1 parent 2268300 commit bc7b373

File tree

2 files changed: +580 −0 lines changed


BayesSemanticAnalyzer.ipynb (+392 lines)
@@ -0,0 +1,392 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import collections\n",
    "import nltk.classify.util, nltk.metrics\n",
    "from nltk.classify import NaiveBayesClassifier\n",
    "from nltk.corpus import movie_reviews\n",
    "from nltk.metrics import scores\n",
    "from nltk import precision"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def evaluate_classifier(featx, collocationFunc):\n",
    "    negids = movie_reviews.fileids('neg')\n",
    "    posids = movie_reviews.fileids('pos')\n",
    "\n",
    "    # One labelled feature dict per review file\n",
    "    negfeats = [(featx(movie_reviews.words(fileids=[f]), collocationFunc), 'neg') for f in negids]\n",
    "    posfeats = [(featx(movie_reviews.words(fileids=[f]), collocationFunc), 'pos') for f in posids]\n",
    "\n",
    "    # 75/25 train/test split within each class\n",
    "    negcutoff = int(len(negfeats)*3/4)\n",
    "    poscutoff = int(len(posfeats)*3/4)\n",
    "\n",
    "    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]\n",
    "    testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]\n",
    "\n",
    "    classifier = NaiveBayesClassifier.train(trainfeats)\n",
    "    refsets = collections.defaultdict(set)\n",
    "    testsets = collections.defaultdict(set)\n",
    "\n",
    "    # Index test documents by reference label and by predicted label\n",
    "    for i, (feats, label) in enumerate(testfeats):\n",
    "        refsets[label].add(i)\n",
    "        observed = classifier.classify(feats)\n",
    "        testsets[observed].add(i)\n",
    "\n",
    "    evaluationMetrics = {}\n",
    "    classifier.show_most_informative_features()\n",
    "    evaluationMetrics['accuracy'] = nltk.classify.util.accuracy(classifier, testfeats)\n",
    "    evaluationMetrics['posPrec'] = nltk.precision(refsets['pos'], testsets['pos'])\n",
    "    evaluationMetrics['posRecall'] = nltk.recall(refsets['pos'], testsets['pos'])\n",
    "    evaluationMetrics['posF_Score'] = nltk.f_measure(refsets['pos'], testsets['pos'])\n",
    "    evaluationMetrics['negPrec'] = nltk.precision(refsets['neg'], testsets['neg'])\n",
    "    evaluationMetrics['negRecall'] = nltk.recall(refsets['neg'], testsets['neg'])\n",
    "    evaluationMetrics['negF_Score'] = nltk.f_measure(refsets['neg'], testsets['neg'])\n",
    "    return evaluationMetrics"
   ]
  },
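  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "*(Illustrative sketch, not part of the original run.)* `evaluate_classifier` accepts any feature extractor with the signature `featx(words, collocationFunc)`. The simplest instance is a plain bag-of-words extractor that ignores the collocation scorer:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Illustrative only: unfiltered bag-of-words features.\n",
    "def word_feats(words, collocationFunc=None):\n",
    "    return dict([(word, True) for word in words])\n",
    "\n",
    "# evaluations.append(evaluate_classifier(word_feats, None))"
   ]
  },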
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Most Informative Features\n",
      "magnificent = True pos : neg = 15.0 : 1.0\n",
      "outstanding = True pos : neg = 13.6 : 1.0\n",
      "insulting = True neg : pos = 13.0 : 1.0\n",
      "vulnerable = True pos : neg = 12.3 : 1.0\n",
      "ludicrous = True neg : pos = 11.8 : 1.0\n",
      "avoids = True pos : neg = 11.7 : 1.0\n",
      "uninvolving = True neg : pos = 11.7 : 1.0\n",
      "fascination = True pos : neg = 10.3 : 1.0\n",
      "astounding = True pos : neg = 10.3 : 1.0\n",
      "idiotic = True neg : pos = 9.8 : 1.0\n"
     ]
    }
   ],
   "source": [
    "from nltk.corpus import stopwords\n",
    "stopset = set(stopwords.words('english'))\n",
    "evaluations = []\n",
    "\n",
    "# Bag-of-words features with English stopwords removed\n",
    "def stopword_filtered_word_feats(words, collocator):\n",
    "    return dict([(word, True) for word in words if word not in stopset])\n",
    "\n",
    "evaluations.append(evaluate_classifier(stopword_filtered_word_feats, None))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Most Informative Features\n",
      "magnificent = True pos : neg = 15.0 : 1.0\n",
      "outstanding = True pos : neg = 13.6 : 1.0\n",
      "insulting = True neg : pos = 13.0 : 1.0\n",
      "('matt', 'damon') = True pos : neg = 12.3 : 1.0\n",
      "('give', 'us') = True neg : pos = 12.3 : 1.0\n",
      "vulnerable = True pos : neg = 12.3 : 1.0\n",
      "ludicrous = True neg : pos = 11.8 : 1.0\n",
      "uninvolving = True neg : pos = 11.7 : 1.0\n",
      "avoids = True pos : neg = 11.7 : 1.0\n",
      "('absolutely', 'no') = True neg : pos = 10.6 : 1.0\n",
      "Most Informative Features\n",
      "magnificent = True pos : neg = 15.0 : 1.0\n",
      "outstanding = True pos : neg = 13.6 : 1.0\n",
      "insulting = True neg : pos = 13.0 : 1.0\n",
      "('matt', 'damon') = True pos : neg = 12.3 : 1.0\n",
      "('give', 'us') = True neg : pos = 12.3 : 1.0\n",
      "vulnerable = True pos : neg = 12.3 : 1.0\n",
      "ludicrous = True neg : pos = 11.8 : 1.0\n",
      "uninvolving = True neg : pos = 11.7 : 1.0\n",
      "avoids = True pos : neg = 11.7 : 1.0\n",
      "('absolutely', 'no') = True neg : pos = 10.6 : 1.0\n",
      "Most Informative Features\n",
      "magnificent = True pos : neg = 15.0 : 1.0\n",
      "outstanding = True pos : neg = 13.6 : 1.0\n",
      "insulting = True neg : pos = 13.0 : 1.0\n",
      "('matt', 'damon') = True pos : neg = 12.3 : 1.0\n",
      "vulnerable = True pos : neg = 12.3 : 1.0\n",
      "ludicrous = True neg : pos = 11.8 : 1.0\n",
      "('the', 'worst') = True neg : pos = 11.7 : 1.0\n",
      "uninvolving = True neg : pos = 11.7 : 1.0\n",
      "avoids = True pos : neg = 11.7 : 1.0\n",
      "('give', 'us') = True neg : pos = 11.0 : 1.0\n"
     ]
    }
   ],
   "source": [
    "# Bigram collocations handle cases like \"not good\", where the\n",
    "# bag-of-words approach fails.\n",
    "import itertools\n",
    "from nltk.collocations import BigramCollocationFinder\n",
    "from nltk.metrics import BigramAssocMeasures\n",
    "\n",
    "def bigram_word_feats(words, score_fn, n=200):\n",
    "    # Add the n highest-scoring bigrams to the unigram features\n",
    "    bigram_finder = BigramCollocationFinder.from_words(words)\n",
    "    bigrams = bigram_finder.nbest(score_fn, n)\n",
    "    return dict([(ngram, True) for ngram in itertools.chain(words, bigrams)])\n",
    "\n",
    "evaluations.append(evaluate_classifier(bigram_word_feats, BigramAssocMeasures.chi_sq))  # works best on this data\n",
    "evaluations.append(evaluate_classifier(bigram_word_feats, BigramAssocMeasures.jaccard))\n",
    "evaluations.append(evaluate_classifier(bigram_word_feats, BigramAssocMeasures.likelihood_ratio))"
   ]
  },
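  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "*(Illustrative sketch, not part of the original run.)* A toy sentence shows what the bigram finder adds: unigram features see only `not` and `good` separately, while the pair `('not', 'good')` keeps the negation attached to the word it modifies."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Illustrative only: bigram features for a short negated sentence.\n",
    "sample = 'the plot was not good at all'.split()\n",
    "finder = BigramCollocationFinder.from_words(sample)\n",
    "print(finder.nbest(BigramAssocMeasures.chi_sq, 3))\n",
    "print(bigram_word_feats(sample, BigramAssocMeasures.chi_sq))"
   ]
  },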
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "negFids = movie_reviews.fileids('neg')\n",
    "posFids = movie_reviews.fileids('pos')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1583820 39768 [(',', 77717), ('the', 76529), ('.', 65876), ('a', 38106), ('and', 35576), ('of', 34123), ('to', 31937), (\"'\", 30585), ('is', 25195), ('in', 21822), ('s', 18513), ('\"', 17612), ('it', 16107), ('that', 15924), ('-', 15595), (')', 11781), ('(', 11664), ('as', 11378), ('with', 10792), ('for', 9961)]\n",
      "1583820 ['neg', 'pos'] dict_items([('neg', FreqDist({',': 35269, 'the': 35058, '.': 32162, 'a': 17910, 'and': 15680, 'of': 15487, 'to': 15420, \"'\": 15317, 'is': 11136, 'in': 10097, ...})), ('pos', FreqDist({',': 42448, 'the': 41471, '.': 33714, 'a': 20196, 'and': 19896, 'of': 18636, 'to': 16517, \"'\": 15268, 'is': 14059, 'in': 11725, ...}))])\n",
      "832564 751256\n"
     ]
    }
   ],
   "source": [
    "from nltk.collocations import *\n",
    "from nltk.probability import FreqDist\n",
    "from nltk.probability import ConditionalFreqDist\n",
    "word_fd = FreqDist()\n",
    "label_word_fd = ConditionalFreqDist()\n",
    "\n",
    "# Count every lower-cased word overall and per sentiment label.\n",
    "# (The original assigned the 'pos' words to the 'neg' condition and\n",
    "# vice versa; the categories are aligned with their labels here.)\n",
    "negWords = movie_reviews.words(categories=['neg'])\n",
    "posWords = movie_reviews.words(categories=['pos'])\n",
    "\n",
    "for word in negWords:\n",
    "    word_fd[word.lower()] += 1\n",
    "    label_word_fd['neg'][word.lower()] += 1\n",
    "for word in posWords:\n",
    "    word_fd[word.lower()] += 1\n",
    "    label_word_fd['pos'][word.lower()] += 1\n",
    "print(word_fd.N(), word_fd.B(), word_fd.most_common(20))\n",
    "print(label_word_fd.N(), label_word_fd.conditions(), label_word_fd.items())\n",
    "print(label_word_fd['pos'].N(), label_word_fd['neg'].N())"
   ]
  },
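  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "*(Illustrative sketch, not part of the original run.)* In the printout above, `word_fd.N()` is the total number of tokens counted (1583820) and `word_fd.B()` the number of distinct bins, i.e. word types (39768). A tiny example makes the distinction concrete:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Illustrative only: N() counts tokens, B() counts distinct types.\n",
    "fd = FreqDist(['good', 'good', 'bad'])\n",
    "print(fd.N(), fd.B())  # 3 2"
   ]
  },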
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Most Informative Features\n",
      "magnificent = True pos : neg = 15.0 : 1.0\n",
      "outstanding = True pos : neg = 13.6 : 1.0\n",
      "insulting = True neg : pos = 13.0 : 1.0\n",
      "vulnerable = True pos : neg = 12.3 : 1.0\n",
      "ludicrous = True neg : pos = 11.8 : 1.0\n",
      "avoids = True pos : neg = 11.7 : 1.0\n",
      "uninvolving = True neg : pos = 11.7 : 1.0\n",
      "fascination = True pos : neg = 10.3 : 1.0\n",
      "astounding = True pos : neg = 10.3 : 1.0\n",
      "idiotic = True neg : pos = 9.8 : 1.0\n",
      "Most Informative Features\n",
      "magnificent = True pos : neg = 15.0 : 1.0\n",
      "outstanding = True pos : neg = 13.6 : 1.0\n",
      "insulting = True neg : pos = 13.0 : 1.0\n",
      "('matt', 'damon') = True pos : neg = 12.3 : 1.0\n",
      "('give', 'us') = True neg : pos = 12.3 : 1.0\n",
      "vulnerable = True pos : neg = 12.3 : 1.0\n",
      "ludicrous = True neg : pos = 11.8 : 1.0\n",
      "uninvolving = True neg : pos = 11.7 : 1.0\n",
      "avoids = True pos : neg = 11.7 : 1.0\n",
      "('absolutely', 'no') = True neg : pos = 10.6 : 1.0\n"
     ]
    }
   ],
   "source": [
    "# Each measure in BigramAssocMeasures takes the marginals of a\n",
    "# contingency table; in the bigram case: (n_ii, (n_ix, n_xi), n_xx).\n",
    "# Here the same machinery scores a *word* against a *label*:\n",
    "#   n_ii = label_word_fd[label][word]   word count within the label\n",
    "#   n_ix = word_fd[word]                word count over both labels\n",
    "#   n_xi = label_word_fd[label].N()     total words in the label\n",
    "#   n_xx = label_word_fd.N()            total words overall\n",
    "#\n",
    "# Chi-sq contingency table relating word w with the 'pos' label:\n",
    "#              w        ~w\n",
    "#           ------    ------\n",
    "#   pos  |  n_ii  |  n_oi  |  = n_xi\n",
    "#           ------    ------\n",
    "#   neg  |  n_io  |  n_oo  |\n",
    "#           ------    ------\n",
    "#          = n_ix    TOTAL    = n_xx\n",
    "pos_word_count = label_word_fd['pos'].N()\n",
    "neg_word_count = label_word_fd['neg'].N()\n",
    "total_word_count = pos_word_count + neg_word_count\n",
    "\n",
    "word_scores = {}\n",
    "for word, freq in word_fd.items():\n",
    "    pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word], (freq, pos_word_count), total_word_count)\n",
    "    neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word], (freq, neg_word_count), total_word_count)\n",
    "    word_scores[word] = pos_score + neg_score\n",
    "\n",
    "import operator\n",
    "best1 = sorted(word_scores.items(), key=operator.itemgetter(1), reverse=True)[:10000]\n",
    "bestwords = set([w for w, s in best1])\n",
    "\n",
    "def best_word_feats(words, biGramMeasure):\n",
    "    # Keep only the 10000 highest-scoring words; biGramMeasure is unused,\n",
    "    # the parameter just matches the evaluate_classifier interface\n",
    "    return dict([(word, True) for word in words if word in bestwords])\n",
    "\n",
    "evaluations.append(evaluate_classifier(best_word_feats, BigramAssocMeasures.chi_sq))\n",
    "\n",
    "def best_bigram_word_feats(words, score_fn=BigramAssocMeasures.chi_sq, n=200):\n",
    "    bigram_finder = BigramCollocationFinder.from_words(words)\n",
    "    bigrams = bigram_finder.nbest(score_fn, n)\n",
    "    d = dict([(bigram, True) for bigram in bigrams])\n",
    "    d.update(best_word_feats(words, score_fn))\n",
    "    return d\n",
    "\n",
    "evaluations.append(evaluate_classifier(best_bigram_word_feats, BigramAssocMeasures.chi_sq))"
   ]
  },
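  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "*(Illustrative sketch, not part of the original run.)* The `chi_sq` call above takes the contingency-table marginals `(n_ii, (n_ix, n_xi), n_xx)`. With the corpus totals printed earlier (832564 'pos' words, 1583820 overall) and made-up counts for a single word, the call looks like this:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Hypothetical counts: a word seen 60 times overall (n_ix),\n",
    "# 50 of them in 'pos' reviews (n_ii); real corpus marginals\n",
    "# n_xi = 832564 ('pos' words), n_xx = 1583820 (all words).\n",
    "print(BigramAssocMeasures.chi_sq(50, (60, 832564), 1583820))"
   ]
  },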
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'posF_Score': 0.7814992025518341, 'negPrec': 0.959349593495935, 'negRecall': 0.472, 'negF_Score': 0.6327077747989276, 'posRecall': 0.98, 'accuracy': 0.726, 'posPrec': 0.649867374005305}\n",
      "{'posF_Score': 0.8362989323843416, 'negPrec': 0.9202127659574468, 'negRecall': 0.692, 'negF_Score': 0.7899543378995434, 'posRecall': 0.94, 'accuracy': 0.816, 'posPrec': 0.7532051282051282}\n",
      "{'posF_Score': 0.825, 'negPrec': 0.9, 'negRecall': 0.684, 'negF_Score': 0.7772727272727273, 'posRecall': 0.924, 'accuracy': 0.804, 'posPrec': 0.7451612903225806}\n",
      "{'posF_Score': 0.8239436619718309, 'negPrec': 0.9120879120879121, 'negRecall': 0.664, 'negF_Score': 0.7685185185185185, 'posRecall': 0.936, 'accuracy': 0.8, 'posPrec': 0.7358490566037735}\n",
      "{'posF_Score': 0.935114503816794, 'negPrec': 0.9778761061946902, 'negRecall': 0.884, 'negF_Score': 0.9285714285714284, 'posRecall': 0.98, 'accuracy': 0.932, 'posPrec': 0.8941605839416058}\n",
      "{'posF_Score': 0.9206349206349206, 'negPrec': 0.926829268292683, 'negRecall': 0.912, 'negF_Score': 0.9193548387096776, 'posRecall': 0.928, 'accuracy': 0.92, 'posPrec': 0.9133858267716536}\n"
     ]
    }
   ],
   "source": [
    "for modelEvalMetrics in evaluations:\n",
    "    print(modelEvalMetrics)"
   ]
  },
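  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "*(Summary added for readability, matching the order in which models were appended to `evaluations`.)*\n",
    "\n",
    "| # | Feature set | Accuracy |\n",
    "|---|-------------|----------|\n",
    "| 1 | stopword-filtered unigrams | 0.726 |\n",
    "| 2 | unigrams + chi-sq bigrams | 0.816 |\n",
    "| 3 | unigrams + Jaccard bigrams | 0.804 |\n",
    "| 4 | unigrams + likelihood-ratio bigrams | 0.800 |\n",
    "| 5 | top-10000 chi-sq words | 0.932 |\n",
    "| 6 | top-10000 words + chi-sq bigrams | 0.920 |"
   ]
  },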
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1764\n",
      "Help on method apply_freq_filter in module nltk.collocations:\n",
      "\n",
      "apply_freq_filter(min_freq) method of nltk.collocations.BigramCollocationFinder instance\n",
      "    Removes candidate ngrams which have frequency less than min_freq.\n",
      "\n",
      "1764\n",
      "JACCARD: [('nice', 'hair'), ('/', '10'), (\"'\", 's'), ('there', 'are'), ('10', ')'), ('television', 'show'), ('we', 'don'), (')', '-'), (\"'\", 't'), ('t', 'know'), ('based', 'on'), ('they', 'are'), ('this', 'film'), ('film', 'is'), ('don', \"'\"), ('most', 'of'), (',', 'but'), ('it', \"'\"), (',', 'and'), ('on', 'a'), ('is', 'not'), ('of', 'course'), ('the', 'film'), ('.', 'it'), ('.', '.'), ('all', 'of'), (\"'\", 've'), ('doesn', \"'\"), ('like', 'a'), ('.', 'the'), ('in', 'a'), (',', 'there'), ('that', 'the'), ('it', 'is'), ('of', 'the'), ('for', 'the'), ('is', 'that'), (',', 'it'), ('the', 'movie'), (')', ','), ('.', 'there'), ('on', 'the'), ('by', 'the'), ('the', 'characters'), ('the', 'audience'), ('and', 'the'), (',', 'as'), ('from', 'the'), (',', 'we'), ('with', 'the'), ('-', 'the'), ('in', 'the')]\n"
     ]
    }
   ],
   "source": [
    "# Scratch cell (left commented out): direct exploration of\n",
    "# BigramCollocationFinder; the recorded output is from an earlier run.\n",
    "#bigramFinder=BigramCollocationFinder.from_words(movRevWords)\n",
    "#print(bigramFinder.N)\n",
    "#help(bigramFinder.apply_freq_filter)\n",
    "#print(bigramFinder.N)\n",
    "#bigramFinder.apply_freq_filter(3)\n",
    "#print(\"FREQ BASED:\",bigramFinder.nbest(bigram_measures.raw_freq,30))\n",
    "#print(\"JACCARD:\",bigramFinder.nbest(bigram_measures.jaccard,30))\n",
    "#print(\"JACCARD:\",bigramFinder.nbest(bigram_measures.jaccard,-1))\n",
    "#bestNGrams=bigramFinder.nbest(bigram_measures.jaccard,-1)\n",
    "#print(\"CHI-SQ:\",bigramFinder.nbest(bigram_measures.chi_sq,30))\n",
    "#print(\"LIKELIHOOD:\",bigramFinder.nbest(bigram_measures.likelihood_ratio,30))\n",
    "#biGfinder.apply_freq_filter()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
