
Commit 8f5cacc

Added 3 Python files for the tasks mentioned in the TODO: Ensemble_strategy, GPTModel and XLNetTransformer.
1 parent a73119c commit 8f5cacc


3 files changed: 334 additions, 0 deletions


models/GPTModel.py

Lines changed: 123 additions & 0 deletions
@@ -0,0 +1,123 @@
# This script uses the GPT-2 model for sentiment classification of the IMDB dataset.

# The code is as follows:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification, GPT2Config
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
import torch
import re


# Defining the cleaning function:
def clean(text):
    for token in ["<br/>", "<br>"]:
        text = re.sub(token, " ", text)

    text = re.sub(r"[\s+\.\!\/_,$%^*()\(\)<>+\"\[\]\-\?;:\'{}`]+|[+——!,。?、~@#¥%……&*()]+", " ", text)

    return text.lower()


# Loading the data:
def load_imdb_dataset(data_path, nrows=100):
    df = pd.read_csv(data_path, nrows=nrows)
    texts = df['review'].apply(clean)
    labels = df['sentiment']
    return texts, labels


# Transformer step that tokenizes the raw text for GPT-2
class GPT2Transformer(BaseEstimator, TransformerMixin):
    def __init__(self, tokenizer, max_length=2):
        self.tokenizer = tokenizer
        self.max_length = max_length

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        input_ids = []
        for text in X:
            # Note: with max_length=2 each review is truncated to only two tokens
            encoded_text = self.tokenizer.encode(text, add_special_tokens=True, max_length=self.max_length, truncation=True)
            input_ids.append(encoded_text)
        return input_ids


# Classifier step that wraps the pre-trained GPT-2 sequence-classification model
class GPT2Classifier(BaseEstimator):
    def __init__(self, model):
        self.model = model

    def fit(self, X, y):
        # No fine-tuning is performed; the model is used with its pre-trained weights
        return self

    def predict(self, X):
        # Finding the maximum sequence length
        max_length = max(len(seq) for seq in X)

        # Padding sequences to the maximum length
        padded_input_ids = [seq + [0] * (max_length - len(seq)) for seq in X]

        # Converting input to tensors
        input_ids = torch.tensor(padded_input_ids)

        # Moving input tensors to the appropriate device (GPU if available)
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        input_ids = input_ids.to(device)

        # Moving the model to the appropriate device
        self.model.to(device)

        # Predicting logits
        with torch.no_grad():
            logits = self.model(input_ids)[0]

        # Moving logits back to CPU if necessary
        logits = logits.cpu()

        # Converting logits to class labels
        predicted_labels = torch.argmax(logits, dim=1).tolist()

        # Converting predicted labels to the original label format
        label_map = {1: 'positive', 0: 'negative'}
        predicted_labels = [label_map[label] for label in predicted_labels]

        return predicted_labels


def main():
    data_path = "/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv"
    texts, labels = load_imdb_dataset(data_path, nrows=20000)  # Load only the first 20,000 rows
    train_texts, test_texts, train_labels, test_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)

    # Initializing the tokenizer
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

    # Loading the GPT-2 configuration
    config = GPT2Config.from_pretrained('gpt2')
    config.pad_token_id = config.eos_token_id  # Set the padding token ID to the end-of-sequence token ID

    # Initializing the model with the updated configuration
    model = GPT2ForSequenceClassification.from_pretrained('gpt2', config=config)

    # Defining the pipeline for text classification
    pipeline = Pipeline([
        ('transformer', GPT2Transformer(tokenizer, max_length=2)),
        ('clf', GPT2Classifier(model)),
    ])

    # "Training" the classifier (no parameters are actually updated)
    pipeline.fit(train_texts, train_labels)

    # Predicting on the test set
    predicted_labels = pipeline.predict(test_texts)

    # Calculating accuracy
    accuracy = accuracy_score(test_labels, predicted_labels)
    print("GPT2 Accuracy:", accuracy)


if __name__ == "__main__":
    main()


# Observed accuracy: 0.48825 — essentially chance level, since the classification head added by
# GPT2ForSequenceClassification is newly initialized and never fine-tuned, and max_length=2 keeps
# only two tokens of each review.
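Below is a minimal sketch (not part of the committed file) of how the tokenization step could instead feed GPT-2 longer, padded inputs together with an attention mask. The max_length of 256, the example texts, and the batched tokenizer call are illustrative assumptions, not settings from this repository; without fine-tuning, predictions would still be chance-level.

import torch
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification

# GPT-2 has no pad token by default, so reuse the end-of-sequence token for padding
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token
model = GPT2ForSequenceClassification.from_pretrained("gpt2", pad_token_id=tokenizer.eos_token_id)

# Illustrative inputs (assumption): tokenize a small batch with padding and truncation
texts = ["a wonderful, heartfelt film", "dull and far too long"]
batch = tokenizer(texts, padding=True, truncation=True, max_length=256, return_tensors="pt")

model.eval()
with torch.no_grad():
    # Pass the attention mask so padded positions are ignored
    logits = model(input_ids=batch["input_ids"], attention_mask=batch["attention_mask"]).logits

predictions = logits.argmax(dim=-1).tolist()  # still chance-level without fine-tuning
print(predictions)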

models/XLNetTransformer.py

Lines changed: 117 additions & 0 deletions
@@ -0,0 +1,117 @@
# This script applies XLNet for sentiment classification of the IMDB dataset.


# Importing necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from transformers import XLNetTokenizer, XLNetForSequenceClassification
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
import torch
import re


# Defining the text-cleaning function
def clean(text):
    for token in ["<br/>", "<br>"]:
        text = re.sub(token, " ", text)

    text = re.sub(r"[\s+\.\!\/_,$%^*()\(\)<>+\"\[\]\-\?;:\'{}`]+|[+——!,。?、~@#¥%……&*()]+", " ", text)

    return text.lower()


# Loading the dataset
def load_imdb_dataset(data_path, nrows=100):
    df = pd.read_csv(data_path, nrows=nrows)  # nrows limits how many rows are read
    texts = df['review'].apply(clean)
    labels = df['sentiment']
    return texts, labels


# Transformer step that tokenizes the raw text for XLNet
class XLNetTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, tokenizer, max_length=256):
        self.tokenizer = tokenizer
        self.max_length = max_length

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        input_ids = []
        for text in X:
            encoded_text = self.tokenizer.encode(text, add_special_tokens=True, max_length=self.max_length, truncation=True)
            input_ids.append(encoded_text)
        return input_ids


# Classifier step that wraps the pre-trained XLNet sequence-classification model
class XLNetClassifier(BaseEstimator):
    def __init__(self, model):
        self.model = model

    def fit(self, X, y):
        # No training is performed in this example
        return self

    def predict(self, X):
        # Finding the maximum sequence length
        max_length = max(len(seq) for seq in X)

        # Padding the sequences to the maximum length
        padded_input_ids = [seq + [0] * (max_length - len(seq)) for seq in X]

        # Converting input to tensors
        input_ids = torch.tensor(padded_input_ids)

        # Moving input tensors to the appropriate device (GPU if available)
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        input_ids = input_ids.to(device)

        # Moving the model to the appropriate device
        self.model.to(device)

        # Predicting logits
        with torch.no_grad():
            logits = self.model(input_ids)[0]

        # Moving logits back to CPU if necessary
        logits = logits.cpu()

        # Converting logits to class labels
        predicted_labels = torch.argmax(logits, dim=1).tolist()

        # Converting predicted labels to the original label format
        label_map = {1: 'positive', 0: 'negative'}
        predicted_labels = [label_map[label] for label in predicted_labels]

        return predicted_labels


def main():
    data_path = "/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv"
    texts, labels = load_imdb_dataset(data_path, nrows=1500)  # Load only the first 1,500 rows
    train_texts, test_texts, train_labels, test_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)

    # Initializing the tokenizer and model
    tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
    model = XLNetForSequenceClassification.from_pretrained('xlnet-base-cased')

    # Defining the pipeline for text classification
    pipeline = Pipeline([
        ('transformer', XLNetTransformer(tokenizer, max_length=256)),
        ('clf', XLNetClassifier(model)),
    ])

    # "Training" the classifier (no parameters are actually updated)
    pipeline.fit(train_texts, train_labels)

    # Predicting on the test set
    predicted_labels = pipeline.predict(test_texts)

    # Calculating accuracy
    accuracy = accuracy_score(test_labels, predicted_labels)
    print("XLNet Accuracy:", accuracy)


if __name__ == "__main__":
    main()


# XLNet Accuracy: 0.5166666666666667 — essentially chance level, since the classification head
# is newly initialized and the model is never fine-tuned.
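As a hedged alternative to the predict step above, which pads the entire test set into one tensor, the sketch below tokenizes with padding and attention masks and scores the data in mini-batches. The predict_in_batches helper, batch size of 16, max_length of 256, and example inputs are illustrative assumptions, not part of the committed file.

import torch
from transformers import XLNetTokenizer, XLNetForSequenceClassification

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")
model = XLNetForSequenceClassification.from_pretrained("xlnet-base-cased").to(device).eval()

def predict_in_batches(texts, batch_size=16, max_length=256):
    labels = []
    for start in range(0, len(texts), batch_size):
        # Tokenize one mini-batch with padding and an attention mask
        batch = tokenizer(
            texts[start:start + batch_size],
            padding=True, truncation=True, max_length=max_length, return_tensors="pt",
        ).to(device)
        with torch.no_grad():
            logits = model(**batch).logits
        labels.extend(logits.argmax(dim=-1).cpu().tolist())
    # Map class indices back to the dataset's label strings
    return ["positive" if label == 1 else "negative" for label in labels]

print(predict_in_batches(["great movie", "terrible acting"]))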

models/ensemble_strategy.py

Lines changed: 94 additions & 0 deletions
@@ -0,0 +1,94 @@
# This file applies ensemble methods — Random Forest, Gradient Boosting, AdaBoost, and Bagging —
# to the IMDB dataset and reports the accuracy of each.
# The code below covers the whole process:


# Importing necessary modules:
import re
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.metrics import accuracy_score


# Data preprocessing function:
def clean(text):
    for token in ["<br/>", "<br>"]:
        text = re.sub(token, " ", text)

    text = re.sub(r"[\s+\.\!\/_,$%^*()\(\)<>+\"\[\]\-\?;:\'{}`]+|[+——!,。?、~@#¥%……&*()]+", " ", text)

    return text.lower()


# Data loading function:
def load_imdb_dataset(data_path):
    df = pd.read_csv(data_path)

    texts = df['review'].apply(clean)
    labels = df['sentiment']
    return texts, labels


# Main function to load the data and run the ensemble methods:
def main():
    data_path = "/content/drive/MyDrive/DATASETS/IMDB Dataset.csv"

    texts, labels = load_imdb_dataset(data_path)

    train_texts, test_texts, train_labels, test_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)

    # Defining a classifier for each ensemble method
    classifiers = {
        "Random Forest": RandomForestClassifier(),
        "Gradient Boosting": GradientBoostingClassifier(),
        "AdaBoost": AdaBoostClassifier(),
        "Bagging": BaggingClassifier(),
    }

    for name, classifier in classifiers.items():
        # Defining the pipeline for text classification
        pipeline = Pipeline([
            ('vect', CountVectorizer()),
            ('tfidf', TfidfTransformer()),
            ('clf', classifier),
        ])

        # Training the classifier
        pipeline.fit(train_texts, train_labels)

        # Predicting on the test set
        predicted_labels = pipeline.predict(test_texts)

        # Calculating accuracy
        accuracy = accuracy_score(test_labels, predicted_labels)
        print(f"{name} Accuracy:", accuracy)


if __name__ == "__main__":
    main()


# Random Forest Accuracy: 0.8457
# Gradient Boosting Accuracy: 0.8162
# AdaBoost Accuracy: 0.807
# Bagging Accuracy: 0.7833
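The file above evaluates each ensemble method separately. As an illustrative extension (not in the committed file), the same four classifiers could be combined into a single soft-voting ensemble with scikit-learn's VotingClassifier; the estimator names and default parameters below are assumptions, and the pipeline layout mirrors the loop in main().

from sklearn.ensemble import (
    RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier,
    BaggingClassifier, VotingClassifier,
)
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline

voting_pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', VotingClassifier(
        estimators=[
            ('rf', RandomForestClassifier()),
            ('gb', GradientBoostingClassifier()),
            ('ada', AdaBoostClassifier()),
            ('bag', BaggingClassifier()),
        ],
        voting='soft',  # average the predicted class probabilities of the four models
    )),
])

# Usage would mirror the per-classifier loop in main():
# voting_pipeline.fit(train_texts, train_labels)
# print(accuracy_score(test_labels, voting_pipeline.predict(test_texts)))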
