Ziruliu
diff --git a/‎Project Description.pdf
395 KB b/‎Project Description.pdf
395 KB
diff --git a/‎Project Report.pdf
900 KB b/‎Project Report.pdf
900 KB
diff --git a/‎classificationMethod.py
+56 b/‎classificationMethod.py
+56
diff --git a/‎classificationMethod.pyc
2.25 KB b/‎classificationMethod.pyc
2.25 KB
diff --git a/‎commands.txt
+12 b/‎commands.txt
+12
diff --git a/‎dataClassifier.py
+226 b/‎dataClassifier.py
+226
@@ -0,0 +1,56 @@
+# classificationMethod.py
+# -----------------------
+# Licensing Information: Please do not distribute or publish solutions to this
+# project. You are free to use and extend these projects for educational
+# purposes. The Pacman AI projects were developed at UC Berkeley, primarily by
+# John DeNero (denero@cs.berkeley.edu) and Dan Klein (klein@cs.berkeley.edu).
+# For more info, see http://inst.eecs.berkeley.edu/~cs188/sp09/pacman.html
+
+# This file contains the abstract class ClassificationMethod
+
+class ClassificationMethod:
+  """
+  ClassificationMethod is the abstract superclass of 
+   - MostFrequentClassifier
+   - NaiveBayesClassifier
+   - PerceptronClassifier
+   - MiraClassifier
+ 
+  As such, you need not add any code to this file.  You can write
+  all of your implementation code in the files for the individual
+  classification methods listed above.
+  """
+  def __init__(self, legalLabels):
+    """
+    For digits dataset, the set of legal labels will be 0,1,..,9
+    For faces dataset, the set of legal labels will be 0 (non-face) or 1 (face)
+    """
+    self.legalLabels = legalLabels
+    
+    
+  def train(self, trainingData, trainingLabels, validationData, validationLabels):
+    """
+    This is the supervised training function for the classifier.  Two sets of 
+    labeled data are passed in: a large training set and a small validation set.
+    
+    Many types of classifiers have a common training structure in practice: using
+    training data for the main supervised training loop but tuning certain parameters
+    with a small held-out validation set.
+
+    For some classifiers (naive Bayes, MIRA), you will need to return the parameters' 
+    values after traning and tuning step.
+    
+    To make the classifier generic to multiple problems, the data should be represented
+    as lists of Counters containing feature descriptions and their counts.
+    """
+    abstract
+    
+  def classify(self, data):
+    """
+    This function returns a list of labels, each drawn from the set of legal labels
+    provided to the classifier upon construction.
+
+    To make the classifier generic to multiple problems, the data should be represented
+    as lists of Counters containing feature descriptions and their counts.
+    """
+    abstract
@@ -0,0 +1,12 @@
+python dataClassifier.py
+(runs the MostFrequent classifier with the default training and test set sizes)
+
+python dataClassifier.py -h
+(help with commands)
+
+python dataClassifier.py -c perceptron -t 5000 -s 1000
+(run the perceptron classifier with 5000 training images and 1000 test images)
+
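+python dataClassifier.py -c mlp
+(run the mlp classifier with the default training and test set sizes)
+
+python dataClassifier.py -c svm
+(run the svm classifier with the default training and test set sizes)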
@@ -0,0 +1,226 @@
+# dataClassifier.py
+# -----------------
+
+import mostFrequent
+import perceptron
+import svm
+import mlp
+import samples
+import sys
+import util
+
+# Default number of training examples (default value of the -t/--training option).
+TRAINING_SET_SIZE = 5000
+# Default number of test examples (default value of the -s/--test option).
+TEST_SET_SIZE = 1000
+# Image dimensions passed to the data loader and iterated over when
+# extracting per-pixel features.
+DIGIT_DATUM_WIDTH = 28
+DIGIT_DATUM_HEIGHT = 28
+
+
+def basicFeatureExtractorDigit(datum):
+    """
+    Returns a set of pixel features indicating whether
+    each pixel in the provided datum is white (0) or gray/black (1)
+    """
+    features = util.Counter()
+    for x in range(DIGIT_DATUM_WIDTH):
+        for y in range(DIGIT_DATUM_HEIGHT):
+            if datum.getPixel(x, y) > 0:
+                features[(x, y)] = 1
+            else:
+                features[(x, y)] = 0
+    return features
+
+def analysis(classifier, guesses, testLabels, testData, rawTestData, printImage):
+    """
+    This function is called after learning.
+    Include any code that you want here to help you analyze your results.
+
+    Use the printImage(<list of pixels>) function to visualize features.
+
+    An example of use has been given to you.
+
+    - classifier is the trained classifier
+    - guesses is the list of labels predicted by your classifier on the test set
+    - testLabels is the list of true labels
+    - testData is the list of training datapoints (as util.Counter of features)
+    - rawTestData is the list of training datapoints (as samples.Datum)
+    - printImage is a method to visualize the features
+    (see its use in the odds ratio part in runClassifier method)
+
+    This code won't be evaluated. It is for your own optional use
+    (and you can modify the signature if you want).
+    """
+
+    # Put any code here...
+    # Example of use:
+    for i in range(len(guesses)):
+        prediction = guesses[i]
+        truth = testLabels[i]
+        if (prediction != truth):
+            print "==================================="
+            print "Mistake on example %d" % i
+            print "Predicted %d; truth is %d" % (prediction, truth)
+            print "Image: "
+            print rawTestData[i]
+            break
+
+
+class ImagePrinter:
+    def __init__(self, width, height):
+        self.width = width
+        self.height = height
+
+    def printImage(self, pixels):
+        """
+        Prints a Datum object that contains all pixels in the
+        provided list of pixels.  This will serve as a helper function
+        to the analysis function you write.
+
+        Pixels should take the form
+        [(2,2), (2, 3), ...]
+        where each tuple represents a pixel.
+        """
+        image = samples.Datum(None, self.width, self.height)
+        for pix in pixels:
+            try:
+                # This is so that new features that you could define which
+                # which are not of the form of (x,y) will not break
+                # this image printer...
+                x, y = pix
+                image.pixels[x][y] = 2
+            except:
+                print "new features:", pix
+                continue
+        print image
+
+
+def default(str):
+    return str + ' [Default: %default]'
+
+
+def readCommand(argv):
+    "Processes the command used to run from the command line."
+    from optparse import OptionParser
+    parser = OptionParser(USAGE_STRING)
+
+    parser.add_option('-c', '--classifier', help=default('The type of classifier'),
+                      choices=['mostFrequent', 'perceptron', 'mlp', 'svm'], default='mostFrequent')
+    parser.add_option('-t', '--training', help=default('The size of the training set'), default=TRAINING_SET_SIZE,
+                      type="int")
+    parser.add_option('-w', '--weights', help=default('Whether to print weights'), default=False, action="store_true")
+    parser.add_option('-i', '--iterations', help=default("Maximum iterations to run training"), default=3, type="int")
+    parser.add_option('-s', '--test', help=default("Amount of test data to use"), default=TEST_SET_SIZE, type="int")
+
+    options, otherjunk = parser.parse_args(argv)
+    if len(otherjunk) != 0: raise Exception('Command line input not understood: ' + str(otherjunk))
+    args = {}
+
+    # Set up variables according to the command line input.
+    print "Doing classification"
+    print "--------------------"
+    print "classifier:\t\t" + options.classifier
+    print "training set size:\t" + str(options.training)
+
+    printImage = ImagePrinter(DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT).printImage
+    featureFunction = basicFeatureExtractorDigit
+    legalLabels = range(10)
+
+    if options.training <= 0:
+        print "Training set size should be a positive integer (you provided: %d)" % options.training
+        print USAGE_STRING
+        sys.exit(2)
+
+    if (options.classifier == "mostFrequent"):
+        classifier = mostFrequent.MostFrequentClassifier(legalLabels)
+    elif (options.classifier == "mlp"):
+        classifier = mlp.MLPClassifier(legalLabels, options.iterations)
+    elif (options.classifier == "perceptron"):
+        classifier = perceptron.PerceptronClassifier(legalLabels, options.iterations)
+    elif (options.classifier == "svm"):
+        classifier = svm.SVMClassifier(legalLabels)
+    else:
+        print "Unknown classifier:", options.classifier
+        print USAGE_STRING
+
+        sys.exit(2)
+
+    args['classifier'] = classifier
+    args['featureFunction'] = featureFunction
+    args['printImage'] = printImage
+
+    return args, options
+
+
+USAGE_STRING = """
+  USAGE:      python dataClassifier.py <options>
+  EXAMPLES:   (1) python dataClassifier.py
+                  - trains the default mostFrequent classifier on the digit dataset
+                  using the default 100 training examples and
+                  then test the classifier on test data
+              (2) python dataClassifier.py -c perceptron -t 1000 -s 500
+                  - would run the perceptron classifier on 1000 training examples, would
+                  test the classifier on 500 test data points
+                 """
+
+
+# Main harness code
+
+def runClassifier(args, options):
+    featureFunction = args['featureFunction']
+    classifier = args['classifier']
+    printImage = args['printImage']
+
+    # Load data
+    numTraining = options.training
+    numTest = options.test
+
+    rawTrainingData = samples.loadDataFile("data/digitdata/trainingimages", numTraining, DIGIT_DATUM_WIDTH,
+                                           DIGIT_DATUM_HEIGHT)
+    trainingLabels = samples.loadLabelsFile("data/digitdata/traininglabels", numTraining)
+    completeRawTrainingData = samples.loadDataFile("data/digitdata/trainingimages", 5000, DIGIT_DATUM_WIDTH,
+                                                   DIGIT_DATUM_HEIGHT)
+    completeTrainingLabels = samples.loadLabelsFile("data/digitdata/traininglabels", 5000)
+    rawValidationData = samples.loadDataFile("data/digitdata/validationimages", numTest, DIGIT_DATUM_WIDTH,
+                                             DIGIT_DATUM_HEIGHT)
+    validationLabels = samples.loadLabelsFile("data/digitdata/validationlabels", numTest)
+    rawTestData = samples.loadDataFile("data/digitdata/testimages", numTest, DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT)
+    testLabels = samples.loadLabelsFile("data/digitdata/testlabels", numTest)
+
+    # Extract features
+    print "Extracting features..."
+    trainingData = map(featureFunction, rawTrainingData)
+    completeTrainingData = map(featureFunction, completeRawTrainingData)
+    validationData = map(featureFunction, rawValidationData)
+    testData = map(featureFunction, rawTestData)
+
+    # Conduct training and testing
+    print "Training..."
+    classifier.train(trainingData, trainingLabels, validationData, validationLabels)
+    print "Validating..."
+    guesses = classifier.classify(validationData)
+    correct = [guesses[i] == validationLabels[i] for i in range(len(validationLabels))].count(True)
+    print str(correct), ("correct out of " + str(len(validationLabels)) + " (%.1f%%).") % (
+    100.0 * correct / len(validationLabels))
+    print "Testing..."
+    guesses = classifier.classify(testData)
+    correct = [guesses[i] == testLabels[i] for i in range(len(testLabels))].count(True)
+    print str(correct), ("correct out of " + str(len(testLabels)) + " (%.1f%%).") % (100.0 * correct / len(testLabels))
+    print "Testing training data..."
+    guesses = classifier.classify(completeTrainingData)
+    correct = [guesses[i] == completeTrainingLabels[i] for i in range(len(completeTrainingLabels))].count(True)
+    print str(correct), ("correct out of " + str(len(completeTrainingLabels)) + " (%.1f%%).") % (
+    100.0 * correct / len(completeTrainingLabels))
+
+    analysis(classifier, guesses, testLabels, testData, rawTestData, printImage)
+
+    if ((options.classifier == "perceptron")):
+        for l in classifier.legalLabels:
+            features_weights = classifier.findHighWeightFeatures(l)
+            print ("=== Features with high weight for label %d ===" % l)
+            printImage(features_weights)
+
+
+if __name__ == '__main__':
+    # Read input
+    args, options = readCommand(sys.argv[1:])
+    # Run classifier
+    runClassifier(args, options)