
Commit 8f5cacc

Added 3 Python files for the tasks mentioned in the TODO: Ensemble_strategy, GPTModel and XLNetTransformer.
1 parent a73119c commit 8f5cacc


3 files changed: 334 additions, 0 deletions


models/GPTModel.py

Lines changed: 123 additions & 0 deletions
@@ -0,0 +1,123 @@
# This script uses the GPT-2 model for sentiment classification of the IMDB dataset.

# The code is as follows:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification, GPT2Config
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
import torch
import re


# Defining the cleaning function:
def clean(text):
    for token in ["<br/>", "<br>"]:
        text = re.sub(token, " ", text)

    text = re.sub(r"[\s+\.\!\/_,$%^*()\(\)<>+\"\[\]\-\?;:\'{}`]+|[+——!,。?、~@#¥%……&*()]+", " ", text)

    return text.lower()


# Loading the data:
def load_imdb_dataset(data_path, nrows=100):
    df = pd.read_csv(data_path, nrows=nrows)
    texts = df['review'].apply(clean)
    labels = df['sentiment']
    return texts, labels


# Transformer step that tokenizes the raw text for GPT-2
class GPT2Transformer(BaseEstimator, TransformerMixin):
    def __init__(self, tokenizer, max_length=2):
        self.tokenizer = tokenizer
        self.max_length = max_length

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        input_ids = []
        for text in X:
            # Note: with max_length=2 each review is truncated to only two tokens
            encoded_text = self.tokenizer.encode(text, add_special_tokens=True, max_length=self.max_length, truncation=True)
            input_ids.append(encoded_text)
        return input_ids


# Classifier step that wraps the pre-trained GPT-2 sequence-classification model
class GPT2Classifier(BaseEstimator):
    def __init__(self, model):
        self.model = model

    def fit(self, X, y):
        # No fine-tuning is performed; the model is used with its pre-trained weights
        return self

    def predict(self, X):
        # Finding the maximum sequence length
        max_length = max(len(seq) for seq in X)

        # Padding sequences to the maximum length
        padded_input_ids = [seq + [0] * (max_length - len(seq)) for seq in X]

        # Converting input to tensors
        input_ids = torch.tensor(padded_input_ids)

        # Moving input tensors to the appropriate device (GPU if available)
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        input_ids = input_ids.to(device)

        # Moving the model to the appropriate device
        self.model.to(device)

        # Predicting logits
        with torch.no_grad():
            logits = self.model(input_ids)[0]

        # Moving logits back to CPU if necessary
        logits = logits.cpu()

        # Converting logits to class labels
        predicted_labels = torch.argmax(logits, dim=1).tolist()

        # Converting predicted labels to the original label format
        label_map = {1: 'positive', 0: 'negative'}
        predicted_labels = [label_map[label] for label in predicted_labels]

        return predicted_labels


def main():
    data_path = "/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv"
    texts, labels = load_imdb_dataset(data_path, nrows=20000)  # Load only the first 20,000 rows
    train_texts, test_texts, train_labels, test_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)

    # Initializing the tokenizer
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

    # Loading the GPT-2 configuration
    config = GPT2Config.from_pretrained('gpt2')
    config.pad_token_id = config.eos_token_id  # Set the padding token ID to the end-of-sequence token ID

    # Initializing the model with the updated configuration
    model = GPT2ForSequenceClassification.from_pretrained('gpt2', config=config)

    # Defining the pipeline for text classification
    pipeline = Pipeline([
        ('transformer', GPT2Transformer(tokenizer, max_length=2)),
        ('clf', GPT2Classifier(model)),
    ])

    # "Training" the classifier (no parameters are actually updated)
    pipeline.fit(train_texts, train_labels)

    # Predicting on the test set
    predicted_labels = pipeline.predict(test_texts)

    # Calculating accuracy
    accuracy = accuracy_score(test_labels, predicted_labels)
    print("GPT2 Accuracy:", accuracy)


if __name__ == "__main__":
    main()


# Observed accuracy: 0.48825 — essentially chance level, since the classification head added by
# GPT2ForSequenceClassification is newly initialized and never fine-tuned, and max_length=2 keeps
# only two tokens of each review.
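Below is a minimal sketch (not part of the committed file) of how the tokenization step could instead feed GPT-2 longer, padded inputs together with an attention mask. The max_length of 256, the example texts, and the batched tokenizer call are illustrative assumptions, not settings from this repository; without fine-tuning, predictions would still be chance-level.

import torch
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification

# GPT-2 has no pad token by default, so reuse the end-of-sequence token for padding
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token
model = GPT2ForSequenceClassification.from_pretrained("gpt2", pad_token_id=tokenizer.eos_token_id)

# Illustrative inputs (assumption): tokenize a small batch with padding and truncation
texts = ["a wonderful, heartfelt film", "dull and far too long"]
batch = tokenizer(texts, padding=True, truncation=True, max_length=256, return_tensors="pt")

model.eval()
with torch.no_grad():
    # Pass the attention mask so padded positions are ignored
    logits = model(input_ids=batch["input_ids"], attention_mask=batch["attention_mask"]).logits

predictions = logits.argmax(dim=-1).tolist()  # still chance-level without fine-tuning
print(predictions)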

models/XLNetTransformer.py

Lines changed: 117 additions & 0 deletions
@@ -0,0 +1,117 @@
# This script applies XLNet for sentiment classification of the IMDB dataset.


# Importing necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from transformers import XLNetTokenizer, XLNetForSequenceClassification
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
import torch
import re


# Defining the text-cleaning function
def clean(text):
    for token in ["<br/>", "<br>"]:
        text = re.sub(token, " ", text)

    text = re.sub(r"[\s+\.\!\/_,$%^*()\(\)<>+\"\[\]\-\?;:\'{}`]+|[+——!,。?、~@#¥%……&*()]+", " ", text)

    return text.lower()


# Loading the dataset
def load_imdb_dataset(data_path, nrows=100):
    df = pd.read_csv(data_path, nrows=nrows)  # nrows limits how many rows are read
    texts = df['review'].apply(clean)
    labels = df['sentiment']
    return texts, labels


# Transformer step that tokenizes the raw text for XLNet
class XLNetTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, tokenizer, max_length=256):
        self.tokenizer = tokenizer
        self.max_length = max_length

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        input_ids = []
        for text in X:
            encoded_text = self.tokenizer.encode(text, add_special_tokens=True, max_length=self.max_length, truncation=True)
            input_ids.append(encoded_text)
        return input_ids


# Classifier step that wraps the pre-trained XLNet sequence-classification model
class XLNetClassifier(BaseEstimator):
    def __init__(self, model):
        self.model = model

    def fit(self, X, y):
        # No training is performed in this example
        return self

    def predict(self, X):
        # Finding the maximum sequence length
        max_length = max(len(seq) for seq in X)

        # Padding the sequences to the maximum length
        padded_input_ids = [seq + [0] * (max_length - len(seq)) for seq in X]

        # Converting input to tensors
        input_ids = torch.tensor(padded_input_ids)

        # Moving input tensors to the appropriate device (GPU if available)
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        input_ids = input_ids.to(device)

        # Moving the model to the appropriate device
        self.model.to(device)

        # Predicting logits
        with torch.no_grad():
            logits = self.model(input_ids)[0]

        # Moving logits back to CPU if necessary
        logits = logits.cpu()

        # Converting logits to class labels
        predicted_labels = torch.argmax(logits, dim=1).tolist()

        # Converting predicted labels to the original label format
        label_map = {1: 'positive', 0: 'negative'}
        predicted_labels = [label_map[label] for label in predicted_labels]

        return predicted_labels


def main():
    data_path = "/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv"
    texts, labels = load_imdb_dataset(data_path, nrows=1500)  # Load only the first 1,500 rows
    train_texts, test_texts, train_labels, test_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)

    # Initializing the tokenizer and model
    tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
    model = XLNetForSequenceClassification.from_pretrained('xlnet-base-cased')

    # Defining the pipeline for text classification
    pipeline = Pipeline([
        ('transformer', XLNetTransformer(tokenizer, max_length=256)),
        ('clf', XLNetClassifier(model)),
    ])

    # "Training" the classifier (no parameters are actually updated)
    pipeline.fit(train_texts, train_labels)

    # Predicting on the test set
    predicted_labels = pipeline.predict(test_texts)

    # Calculating accuracy
    accuracy = accuracy_score(test_labels, predicted_labels)
    print("XLNet Accuracy:", accuracy)


if __name__ == "__main__":
    main()


# XLNet Accuracy: 0.5166666666666667 — essentially chance level, since the classification head
# is newly initialized and the model is never fine-tuned.
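As a hedged alternative to the predict step above, which pads the entire test set into one tensor, the sketch below tokenizes with padding and attention masks and scores the data in mini-batches. The predict_in_batches helper, batch size of 16, max_length of 256, and example inputs are illustrative assumptions, not part of the committed file.

import torch
from transformers import XLNetTokenizer, XLNetForSequenceClassification

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")
model = XLNetForSequenceClassification.from_pretrained("xlnet-base-cased").to(device).eval()

def predict_in_batches(texts, batch_size=16, max_length=256):
    labels = []
    for start in range(0, len(texts), batch_size):
        # Tokenize one mini-batch with padding and an attention mask
        batch = tokenizer(
            texts[start:start + batch_size],
            padding=True, truncation=True, max_length=max_length, return_tensors="pt",
        ).to(device)
        with torch.no_grad():
            logits = model(**batch).logits
        labels.extend(logits.argmax(dim=-1).cpu().tolist())
    # Map class indices back to the dataset's label strings
    return ["positive" if label == 1 else "negative" for label in labels]

print(predict_in_batches(["great movie", "terrible acting"]))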

models/ensemble_strategy.py

Lines changed: 94 additions & 0 deletions
@@ -0,0 +1,94 @@
# This file applies ensemble methods — Random Forest, Gradient Boosting, AdaBoost, and Bagging —
# to the IMDB dataset and reports the accuracy of each.
# The code below covers the whole process:


# Importing necessary modules:
import re
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.metrics import accuracy_score


# Data preprocessing function:
def clean(text):
    for token in ["<br/>", "<br>"]:
        text = re.sub(token, " ", text)

    text = re.sub(r"[\s+\.\!\/_,$%^*()\(\)<>+\"\[\]\-\?;:\'{}`]+|[+——!,。?、~@#¥%……&*()]+", " ", text)

    return text.lower()


# Data loading function:
def load_imdb_dataset(data_path):
    df = pd.read_csv(data_path)

    texts = df['review'].apply(clean)
    labels = df['sentiment']
    return texts, labels


# Main function to load the data and run the ensemble methods:
def main():
    data_path = "/content/drive/MyDrive/DATASETS/IMDB Dataset.csv"

    texts, labels = load_imdb_dataset(data_path)

    train_texts, test_texts, train_labels, test_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)

    # Defining a classifier for each ensemble method
    classifiers = {
        "Random Forest": RandomForestClassifier(),
        "Gradient Boosting": GradientBoostingClassifier(),
        "AdaBoost": AdaBoostClassifier(),
        "Bagging": BaggingClassifier(),
    }

    for name, classifier in classifiers.items():
        # Defining the pipeline for text classification
        pipeline = Pipeline([
            ('vect', CountVectorizer()),
            ('tfidf', TfidfTransformer()),
            ('clf', classifier),
        ])

        # Training the classifier
        pipeline.fit(train_texts, train_labels)

        # Predicting on the test set
        predicted_labels = pipeline.predict(test_texts)

        # Calculating accuracy
        accuracy = accuracy_score(test_labels, predicted_labels)
        print(f"{name} Accuracy:", accuracy)


if __name__ == "__main__":
    main()


# Random Forest Accuracy: 0.8457
# Gradient Boosting Accuracy: 0.8162
# AdaBoost Accuracy: 0.807
# Bagging Accuracy: 0.7833
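The file above evaluates each ensemble method separately. As an illustrative extension (not in the committed file), the same four classifiers could be combined into a single soft-voting ensemble with scikit-learn's VotingClassifier; the estimator names and default parameters below are assumptions, and the pipeline layout mirrors the loop in main().

from sklearn.ensemble import (
    RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier,
    BaggingClassifier, VotingClassifier,
)
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline

voting_pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', VotingClassifier(
        estimators=[
            ('rf', RandomForestClassifier()),
            ('gb', GradientBoostingClassifier()),
            ('ada', AdaBoostClassifier()),
            ('bag', BaggingClassifier()),
        ],
        voting='soft',  # average the predicted class probabilities of the four models
    )),
])

# Usage would mirror the per-classifier loop in main():
# voting_pipeline.fit(train_texts, train_labels)
# print(accuracy_score(test_labels, voting_pipeline.predict(test_texts)))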
