|
| 1 | +#!/usr/bin/env python |
| 2 | +# coding: utf-8 |
| 3 | + |
| 4 | +# In[1]: |
| 5 | + |
| 6 | + |
| 7 | + |
| 8 | +from collections import defaultdict |
| 9 | +import os |
| 10 | +import pandas as pd |
| 11 | + |
import math

# Naive Bayes sentiment classifier over the text files in ./pos and ./neg.
#
# The first TRAIN_SIZE documents of each class (in sorted file-name order) are
# used for training; the documents from TRAIN_SIZE up to TEST_END are held out
# and used to measure accuracy.

TRAIN_SIZE = 600  # number of documents per class used for training
TEST_END = 693  # exclusive end index of the held-out documents per class
# Both classes are trained on the same number of documents, so the class
# prior is 0.5; it is added in log space like every other factor.
LOG_PRIOR = math.log10(0.5)


def list_documents(directory):
    """Return the document file names found in *directory*, sorted by name.

    ``os.listdir`` returns entries in arbitrary order, so the names are sorted
    to make the train/test split reproducible. Hidden entries (names starting
    with ".") are skipped, as the original script skipped them.
    """
    return sorted(
        name for name in os.listdir(directory) if not name.startswith(".")
    )


def read_words(directory, filename):
    """Return the whitespace-separated tokens of one document.

    The file is opened by its real name (names containing several dots are
    handled) and is always closed, even if reading fails.
    """
    path = os.path.join(directory, filename)
    with open(path, encoding="utf-8", errors="ignore") as handle:
        return handle.read().split()


def count_words(documents):
    """Return word -> total occurrences over an iterable of token lists."""
    counts = defaultdict(int)
    for words in documents:
        for word in words:
            counts[word] += 1
    return counts


def word_probabilities(counts, vocabulary_size):
    """Return ``(probabilities, unseen_probability)`` with add-one smoothing.

    ``probabilities`` maps every word of *counts* to
    ``(1 + count) / (vocabulary_size + total)``; ``unseen_probability`` is the
    same formula for a count of zero and must be used for vocabulary words
    that never occurred in this class. With no training data at all the
    result is ``({}, 0.0)``.
    """
    denominator = vocabulary_size + sum(counts.values())
    if denominator == 0:
        return {}, 0.0
    probabilities = {
        word: (1 + occurrences) / denominator
        for word, occurrences in counts.items()
    }
    return probabilities, 1 / denominator


def score_document(words, prob_pos, prob_neg, unseen_pos, unseen_neg):
    """Return the ``(positive, negative)`` log10 scores of one document.

    Each distinct word contributes once. Words known to neither class are
    ignored (they carry no evidence); a word known to only one class uses the
    other class's smoothed zero-count probability.
    """
    positive = LOG_PRIOR
    negative = LOG_PRIOR
    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # floating-point summation order is deterministic.
    for word in dict.fromkeys(words):
        if word not in prob_pos and word not in prob_neg:
            continue
        positive += math.log10(prob_pos.get(word, unseen_pos))
        negative += math.log10(prob_neg.get(word, unseen_neg))
    return positive, negative


def accuracy(correct, tested):
    """Return the percentage of correct predictions (0.0 if nothing tested)."""
    if tested == 0:
        return 0.0
    return correct * 100 / tested


if __name__ == "__main__":
    # for positive data
    filesPos = list_documents("./pos")
    print("lehgth of total files in positive is", len(filesPos),
          f". So training data is till {TRAIN_SIZE}.")
    print()
    dicPos = count_words(
        read_words("./pos", name) for name in filesPos[:TRAIN_SIZE]
    )

    # for negative data
    filesNeg = list_documents("./neg")
    print("lehgth of total files in negative is", len(filesNeg),
          f". So training data is till {TRAIN_SIZE}.")
    print()
    dicNeg = count_words(
        read_words("./neg", name) for name in filesNeg[:TRAIN_SIZE]
    )

    # Vocabulary: every distinct word seen in either class during training.
    uniqueDict = set(dicPos) | set(dicNeg)
    vocab = len(uniqueDict)
    pos = len(dicPos)
    neg = len(dicNeg)
    print("words in +ve", pos, "; in -ve", neg,
          "and total unique words", vocab)
    print()

    # Naive Bayes training: smoothed per-class word probabilities.
    totalPos = sum(dicPos.values())  # total word occurrences, duplicates included
    totalNeg = sum(dicNeg.values())
    probPos, unseenPos = word_probabilities(dicPos, vocab)
    probNeg, unseenNeg = word_probabilities(dicNeg, vocab)

    # Evaluate on the held-out positive documents. Slicing (rather than
    # indexing a fixed range) tolerates directories with fewer files.
    testPos = filesPos[TRAIN_SIZE:TEST_END]
    count = 0
    for name in testPos:
        positive, negative = score_document(
            read_words("./pos", name), probPos, probNeg, unseenPos, unseenNeg
        )
        if positive > negative:
            count += 1

    # Evaluate on the held-out negative documents.
    testNeg = filesNeg[TRAIN_SIZE:TEST_END]
    countN = 0
    for name in testNeg:
        positive, negative = score_document(
            read_words("./neg", name), probPos, probNeg, unseenPos, unseenNeg
        )
        if positive < negative:
            countN += 1

    # Divide by the number of documents actually tested, not a fixed 93.
    acc = accuracy(count, len(testPos))
    accN = accuracy(countN, len(testNeg))
    print(f"accuracy for negative {len(testNeg)} files", accN)
    print(f"accuracy for positive {len(testPos)} files", acc)
| 154 | + |
| 155 | + |
| 156 | +# In[ ]: |
| 157 | + |
| 158 | + |
| 159 | + |
| 160 | + |
0 commit comments