Commit 4c3a3e1

Added files via upload
1 parent bc7b373 commit 4c3a3e1

File tree

2 files changed: +425 −0 lines changed

MaxEntSemanticAnalysis.ipynb

Lines changed: 256 additions & 0 deletions
@@ -0,0 +1,256 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import collections\n",
    "import itertools\n",
    "import nltk.classify.util, nltk.metrics\n",
    "from nltk.classify import MaxentClassifier\n",
    "from nltk.corpus import movie_reviews\n",
    "from nltk.collocations import BigramCollocationFinder\n",
    "from nltk.metrics import BigramAssocMeasures"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def evaluate_classifier(featx, collocationFunc):\n",
    "    # Build a labelled feature set for every review in the corpus.\n",
    "    negids = movie_reviews.fileids('neg')\n",
    "    posids = movie_reviews.fileids('pos')\n",
    "\n",
    "    negfeats = [(featx(movie_reviews.words(fileids=[f]), collocationFunc), 'neg') for f in negids]\n",
    "    posfeats = [(featx(movie_reviews.words(fileids=[f]), collocationFunc), 'pos') for f in posids]\n",
    "\n",
    "    # Use the first 3/4 of each class for training, the rest for testing.\n",
    "    negcutoff = int(len(negfeats) * 3 / 4)\n",
    "    poscutoff = int(len(posfeats) * 3 / 4)\n",
    "\n",
    "    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]\n",
    "    testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]\n",
    "\n",
    "    classifier = MaxentClassifier.train(trainfeats)\n",
    "\n",
    "    # refsets holds the true labels and testsets the predicted ones,\n",
    "    # each as {label: set of test-example indices}.\n",
    "    refsets = collections.defaultdict(set)\n",
    "    testsets = collections.defaultdict(set)\n",
    "    for i, (feats, label) in enumerate(testfeats):\n",
    "        refsets[label].add(i)\n",
    "        observed = classifier.classify(feats)\n",
    "        testsets[observed].add(i)\n",
    "\n",
    "    classifier.show_most_informative_features()\n",
    "    evaluationMetrics = {}\n",
    "    evaluationMetrics['accuracy'] = nltk.classify.util.accuracy(classifier, testfeats)\n",
    "    evaluationMetrics['posPrec'] = nltk.precision(refsets['pos'], testsets['pos'])\n",
    "    evaluationMetrics['posRecall'] = nltk.recall(refsets['pos'], testsets['pos'])\n",
    "    evaluationMetrics['posF_Score'] = nltk.f_measure(refsets['pos'], testsets['pos'])\n",
    "    evaluationMetrics['negPrec'] = nltk.precision(refsets['neg'], testsets['neg'])\n",
    "    evaluationMetrics['negRecall'] = nltk.recall(refsets['neg'], testsets['neg'])\n",
    "    evaluationMetrics['negF_Score'] = nltk.f_measure(refsets['neg'], testsets['neg'])\n",
    "    return evaluationMetrics"
   ]
  },
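  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A minimal sketch of the bookkeeping `evaluate_classifier` does: `refsets` collects the true labels and `testsets` the predicted ones as index sets, and `nltk.precision`/`nltk.recall` compare the two. The toy labels below are invented purely for illustration, not from the original run."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hypothetical toy data: true vs. predicted labels for six test examples.\n",
    "true_labels = ['pos', 'pos', 'pos', 'neg', 'neg', 'neg']\n",
    "pred_labels = ['pos', 'pos', 'neg', 'neg', 'neg', 'pos']\n",
    "\n",
    "toy_refsets = collections.defaultdict(set)\n",
    "toy_testsets = collections.defaultdict(set)\n",
    "for i, (ref, pred) in enumerate(zip(true_labels, pred_labels)):\n",
    "    toy_refsets[ref].add(i)\n",
    "    toy_testsets[pred].add(i)\n",
    "\n",
    "# precision = |ref & test| / |test|, recall = |ref & test| / |ref|\n",
    "print(nltk.precision(toy_refsets['pos'], toy_testsets['pos']))  # 2/3\n",
    "print(nltk.recall(toy_refsets['pos'], toy_testsets['pos']))     # 2/3\n",
    "print(nltk.f_measure(toy_refsets['pos'], toy_testsets['pos']))"
   ]
  },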
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from nltk.corpus import stopwords\n",
    "stopset = set(stopwords.words('english'))\n",
    "evaluations = []\n",
    "\n",
    "# Bag-of-words features with English stopwords removed; the unused\n",
    "# collocator parameter keeps the featx(words, collocationFunc) signature.\n",
    "def stopword_filtered_word_feats(words, collocator):\n",
    "    return dict([(word, True) for word in words if word not in stopset])\n",
    "#evaluations.append(evaluate_classifier(stopword_filtered_word_feats,None))"
   ]
  },
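  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A quick illustrative check (a sketch added here, not in the original notebook): the extractor maps each non-stopword to `True`, so words like *the* and *was* drop out of the feature dict."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hypothetical sample tokens, just to show the shape of the feature dict.\n",
    "sample = ['the', 'movie', 'was', 'surprisingly', 'good']\n",
    "print(stopword_filtered_word_feats(sample, None))\n",
    "# -> {'movie': True, 'surprisingly': True, 'good': True}"
   ]
  },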
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Bigram collocations handle cases like \"not good\", where the bag-of-words approach fails.\n",
    "def bigram_word_feats(words, score_fn, n=200):\n",
    "    bigram_finder = BigramCollocationFinder.from_words(words)\n",
    "    bigrams = bigram_finder.nbest(score_fn, n)\n",
    "    return dict([(ngram, True) for ngram in itertools.chain(words, bigrams)])\n",
    "\n",
    "#evaluations.append(evaluate_classifier(bigram_word_feats,BigramAssocMeasures.chi_sq))#Works best for this data\n",
    "#evaluations.append(evaluate_classifier(bigram_word_feats,BigramAssocMeasures.jaccard))\n",
    "#evaluations.append(evaluate_classifier(bigram_word_feats,BigramAssocMeasures.likelihood_ratio))"
   ]
  },
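  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A small sketch (assumed input, not from the original run) of why bigram features help: on a toy token list the collocation finder surfaces `('not', 'good')` as a single feature, which a plain bag of words would split apart."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hypothetical toy review tokens.\n",
    "toy_words = ['the', 'plot', 'was', 'not', 'good', 'and',\n",
    "             'the', 'acting', 'was', 'not', 'good', 'either']\n",
    "feats = bigram_word_feats(toy_words, BigramAssocMeasures.chi_sq, n=3)\n",
    "# The feature dict mixes unigrams and the top-scoring bigrams;\n",
    "# print just the bigram (tuple) keys.\n",
    "print([k for k in feats if isinstance(k, tuple)])"
   ]
  },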
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1583820 39768 [(',', 77717), ('the', 76529), ('.', 65876), ('a', 38106), ('and', 35576), ('of', 34123), ('to', 31937), (\"'\", 30585), ('is', 25195), ('in', 21822), ('s', 18513), ('\"', 17612), ('it', 16107), ('that', 15924), ('-', 15595), (')', 11781), ('(', 11664), ('as', 11378), ('with', 10792), ('for', 9961)]\n",
      "1583820 ['pos', 'neg'] dict_items([('pos', FreqDist({',': 42448, 'the': 41471, '.': 33714, 'a': 20196, 'and': 19896, 'of': 18636, 'to': 16517, \"'\": 15268, 'is': 14059, 'in': 11725, ...})), ('neg', FreqDist({',': 35269, 'the': 35058, '.': 32162, 'a': 17910, 'and': 15680, 'of': 15487, 'to': 15420, \"'\": 15317, 'is': 11136, 'in': 10097, ...}))])\n",
      "832564 751256\n"
     ]
    }
   ],
   "source": [
    "from nltk.probability import FreqDist\n",
    "from nltk.probability import ConditionalFreqDist\n",
    "word_fd = FreqDist()\n",
    "label_word_fd = ConditionalFreqDist()\n",
    "\n",
    "# Tally each (lower-cased) word overall and per sentiment label.\n",
    "posWords = movie_reviews.words(categories=['pos'])\n",
    "negWords = movie_reviews.words(categories=['neg'])\n",
    "\n",
    "for word in posWords:\n",
    "    word_fd[word.lower()] += 1\n",
    "    label_word_fd['pos'][word.lower()] += 1\n",
    "for word in negWords:\n",
    "    word_fd[word.lower()] += 1\n",
    "    label_word_fd['neg'][word.lower()] += 1\n",
    "print(word_fd.N(), word_fd.B(), word_fd.most_common(20))\n",
    "print(label_word_fd.N(), label_word_fd.conditions(), label_word_fd.items())\n",
    "print(label_word_fd['pos'].N(), label_word_fd['neg'].N())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  ==> Training (100 iterations)\n",
      "\n",
      "      Iteration    Log Likelihood    Accuracy\n",
      "      ---------------------------------------\n",
      "             1          -0.69315        0.500"
     ]
    }
   ],
   "source": [
    "# A number of measures are available to score collocations or other associations.\n",
    "# The arguments to the measure functions are marginals of a contingency table,\n",
    "# in the bigram case (n_ii, (n_ix, n_xi), n_xx):\n",
    "#   n_ii = label_word_fd[label][word]\n",
    "#   n_ix = word_fd[word]\n",
    "#   n_xi = label_word_fd[label].N()\n",
    "#   n_xx = label_word_fd.N()\n",
    "# Chi-sq contingency table relating word w1 with the \"pos\" classification:\n",
    "#            w1       ~w1\n",
    "#          ------    ------\n",
    "#   +ve  | n_ii   | n_oi   |  = n_xi\n",
    "#          ------    ------\n",
    "#   -ve  | n_io   | n_oo   |\n",
    "#          ------    ------\n",
    "#        = n_ix     TOTAL     = n_xx\n",
    "# n_ix: total frequency of word w1;  n_xi: pos_word_count\n",
    "pos_word_count = label_word_fd['pos'].N()\n",
    "neg_word_count = label_word_fd['neg'].N()\n",
    "total_word_count = pos_word_count + neg_word_count\n",
    "\n",
    "# Score every word by how strongly its frequency skews toward either label.\n",
    "word_scores = {}\n",
    "for word, freq in word_fd.items():\n",
    "    pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word], (freq, pos_word_count), total_word_count)\n",
    "    neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word], (freq, neg_word_count), total_word_count)\n",
    "    word_scores[word] = pos_score + neg_score\n",
    "\n",
    "# Keep the 10,000 highest-scoring words as the feature vocabulary.\n",
    "import operator\n",
    "best1 = sorted(word_scores.items(), key=operator.itemgetter(1), reverse=True)[:10000]\n",
    "bestwords = set([w for w, s in best1])\n",
    "\n",
    "def best_word_feats(words, biGramMeasure):\n",
    "    # biGramMeasure is unused; the parameter only keeps the signature\n",
    "    # compatible with evaluate_classifier's featx(words, collocationFunc).\n",
    "    return dict([(word, True) for word in words if word in bestwords])\n",
    "\n",
    "evaluations.append(evaluate_classifier(best_word_feats, BigramAssocMeasures.chi_sq))\n",
    "\n",
    "def best_bigram_word_feats(words, score_fn=BigramAssocMeasures.chi_sq, n=200):\n",
    "    bigram_finder = BigramCollocationFinder.from_words(words)\n",
    "    bigrams = bigram_finder.nbest(score_fn, n)\n",
    "    d = dict([(bigram, True) for bigram in bigrams])\n",
    "    d.update(best_word_feats(words, score_fn))\n",
    "    return d\n",
    "#evaluations.append(evaluate_classifier(best_bigram_word_feats,BigramAssocMeasures.chi_sq))"
   ]
  },
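  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A worked sketch of the contingency-table scoring above, using invented counts (the numbers are hypothetical, chosen only to make the marginals concrete): suppose a word occurs 100 times overall, 90 of them in positive reviews, in a 1,000,000-word corpus of which 500,000 words are positive."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hypothetical marginals for one word vs. the 'pos' label:\n",
    "n_ii = 90         # word count inside 'pos'      (label_word_fd['pos'][word])\n",
    "n_ix = 100        # word count overall           (word_fd[word])\n",
    "n_xi = 500000     # total words labelled 'pos'   (label_word_fd['pos'].N())\n",
    "n_xx = 1000000    # total words in both labels   (label_word_fd.N())\n",
    "\n",
    "# chi_sq takes the marginals as (n_ii, (n_ix, n_xi), n_xx); a large score\n",
    "# means the word's distribution across labels is far from independent.\n",
    "print(BigramAssocMeasures.chi_sq(n_ii, (n_ix, n_xi), n_xx))"
   ]
  },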
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Compare the metric dicts collected from each feature extractor.\n",
    "for modelEvalMetrics in evaluations:\n",
    "    print(modelEvalMetrics)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
