Skip to content

Commit 6103082

Browse files
authored
Add files via upload
1 parent ec7a927 commit 6103082

File tree

1 file changed

+160
-0
lines changed

1 file changed

+160
-0
lines changed

movieReviewLog.py

Lines changed: 160 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,160 @@
1+
#!/usr/bin/env python
2+
# coding: utf-8
3+
4+
# In[1]:
5+
6+
7+
8+
from collections import defaultdict
9+
import os
10+
import pandas as pd
11+
12+
# Shared training state, filled here and by the negative-data loop below:
#   uniqueDict - one key per distinct token seen in any training file; only
#                the keys are used (as the vocabulary), the list values are not
#   dicPos     - token -> number of occurrences in the positive training files
#   dicNeg     - token -> number of occurrences in the negative training files
uniqueDict = defaultdict(list)
dicPos = defaultdict(int)
dicNeg = defaultdict(int)

# for positive data
filesPos = os.listdir("./pos")
print("lehgth of total files in positive is",len(filesPos),". So training data is till 600.");print()

# The first 600 directory entries are the training set. Slicing (instead of
# indexing range(600)) avoids an IndexError when the directory is smaller.
for name in filesPos[:600]:
    # A name starting with "." has an empty first component when split on
    # ".", which is exactly what the original check skipped.
    if name.startswith("."):
        continue

    # Open the entry under its real name (rebuilding it as "<stem>.txt" broke
    # names with extra dots or a different extension), and use a context
    # manager so the handle is closed after reading.
    with open(os.path.join("./pos", name), encoding="utf-8", errors="ignore") as data:
        df = data.read()

    for word in df.split():
        uniqueDict[word]   # touching the key registers the token in the vocabulary
        dicPos[word] += 1  # defaultdict(int) starts unseen tokens at 0
33+
34+
# for negative data
filesNeg = os.listdir("./neg")
print("lehgth of total files in negative is",len(filesNeg),". So training data is till 600.");print()

# The first 600 directory entries are the training set. Slicing (instead of
# indexing range(600)) avoids an IndexError when the directory is smaller.
for name in filesNeg[:600]:
    # A name starting with "." has an empty first component when split on
    # ".", which is exactly what the original check skipped.
    if name.startswith("."):
        continue

    # Open the entry under its real name (rebuilding it as "<stem>.txt" broke
    # names with extra dots or a different extension), and use a context
    # manager so the handle is closed after reading.
    with open(os.path.join("./neg", name), encoding="utf-8", errors="ignore") as data:
        df = data.read()

    for word in df.split():
        uniqueDict[word]   # touching the key registers the token in the vocabulary
        dicNeg[word] += 1  # defaultdict(int) starts unseen tokens at 0
51+
52+
53+
# Corpus statistics: number of distinct tokens overall and per class.
vocab = len(uniqueDict)
pos = len(dicPos)
neg = len(dicNeg)
print("words in +ve",pos,"; in -ve",neg,"and total unique words",vocab);print()

# Naive Bayes training on the loaded documents (positive and negative).
# probPos / probNeg map token -> add-one-smoothed P(token | class).
probPos = defaultdict(float)
probNeg = defaultdict(float)

# Total token occurrences (duplicates included) per class.
totalPos = sum(dicPos.values())
totalNeg = sum(dicNeg.values())

# Add-one smoothing: P(w | c) = (count(w, c) + 1) / (total_c + |V|).
# The loop runs over the whole vocabulary so that a token seen only in the
# other class gets 1 / (total_c + |V|). Previously only tokens already present
# in a class were stored, so the "count = 0" case was never smoothed and such
# tokens fell back to the scorer's fixed not-found penalty.
# .get() is used because dicPos/dicNeg are defaultdicts: indexing a missing
# token would insert it and silently grow the count tables.
for word in uniqueDict:
    probPos[word] = (1 + dicPos.get(word, 0)) / (vocab + totalPos)
    probNeg[word] = (1 + dicNeg.get(word, 0)) / (vocab + totalNeg)
73+
74+
75+
# In[2]:
76+
77+
78+
# Testing on the positive files at directory positions 600..692.
# `count` ends up as the number of those files scored as positive.
import math
count = 0

# Probability substituted for a token that has no trained probability in a
# class; its log10 is loop-invariant, so compute it once.
notFound = 10**(-20)
notFoundLog = math.log10(notFound)

# Indexing (not slicing) is kept on purpose: the accuracy report divides by a
# fixed 93, so a short directory should fail loudly rather than be mis-scored.
for i in range(600, 693, 1):
    name = filesPos[i]
    # A name starting with "." has an empty first component when split on
    # ".", which is exactly what the original check skipped.
    if name.startswith("."):
        continue

    # Open the entry under its real name and close the handle after reading.
    with open(os.path.join("./pos", name), encoding="utf-8", errors="ignore") as data:
        df = data.read()

    # Distinct tokens in first-occurrence order. Each token contributes once
    # to the score regardless of how often it appears in the document.
    dic = dict.fromkeys(df.split())

    # Both scores start from the same constant, so it does not influence the
    # comparison below.
    positive = 0.5
    negative = 0.5
    for word in dic:  # accumulate log10 P(word | class) for both classes
        positive += math.log10(probPos[word]) if word in probPos else notFoundLog
        negative += math.log10(probNeg[word]) if word in probNeg else notFoundLog

    if positive > negative:
        count += 1
105+
106+
107+
108+
# In[3]:
109+
110+
111+
# Testing on the negative files at directory positions 600..692.
# `countN` ends up as the number of those files scored as negative.
import math
countN = 0

# Probability substituted for a token that has no trained probability in a
# class; its log10 is loop-invariant, so compute it once.
notFound = 10**(-20)
notFoundLog = math.log10(notFound)

# Indexing (not slicing) is kept on purpose: the accuracy report divides by a
# fixed 93, so a short directory should fail loudly rather than be mis-scored.
for i in range(600, 693, 1):
    name = filesNeg[i]
    # A name starting with "." has an empty first component when split on
    # ".", which is exactly what the original check skipped.
    if name.startswith("."):
        continue

    # Open the entry under its real name and close the handle after reading.
    with open(os.path.join("./neg", name), encoding="utf-8", errors="ignore") as data:
        df = data.read()

    # Distinct tokens in first-occurrence order. Each token contributes once
    # to the score regardless of how often it appears in the document.
    dic = dict.fromkeys(df.split())

    # Both scores start from the same constant, so it does not influence the
    # comparison below.
    positive = 0.5
    negative = 0.5
    for word in dic:  # accumulate log10 P(word | class) for both classes
        positive += math.log10(probPos[word]) if word in probPos else notFoundLog
        negative += math.log10(probNeg[word]) if word in probNeg else notFoundLog

    if positive < negative:
        countN += 1
144+
145+
146+
147+
# In[4]:
148+
149+
150+
# Per-class accuracy as a percentage of the 93 held-out files:
# `count` is the positive-file tally and `countN` the negative-file tally
# produced by the two testing cells above.
acc, accN = (hits * 100 / 93 for hits in (count, countN))
print("accuracy for negative 93 files", accN)
print("accuracy for positive 93 files", acc)
154+
155+
156+
# In[ ]:
157+
158+
159+
160+

0 commit comments

Comments
 (0)