Реализация KNN с K-кратной кросс-валидацией на Python


Я изучаю машинное обучение, в частности алгоритмы классификации. Вот моя реализация метода k ближайших соседей с K-кратной кросс-валидацией.

knn.py

from numpy import *
import operator
import sys

def classify(inX, dataSet, labels, neighboursNumber):
    """Classify ``inX`` by majority vote among its nearest training samples.

    Distances are Euclidean; only their ranking matters, so squared
    distances are compared directly.

    Args:
        inX: Feature vector to classify; it must broadcast against the rows
            of ``dataSet``.
        dataSet: 2-D array-like of training samples, one sample per row.
        labels: Sequence of class labels; ``labels[i]`` belongs to row ``i``
            of ``dataSet``.
        neighboursNumber: Number of nearest neighbours that vote. If it
            exceeds the number of training samples, every sample votes.

    Returns:
        The label with the most votes. On a tie, the tied label whose first
        vote came from the closest neighbour wins.

    Raises:
        ValueError: If ``dataSet`` has no rows or ``neighboursNumber`` < 1.
    """
    # Local import: keeps this function self-contained without touching the
    # module-level import block.
    from collections import Counter

    dataSet = asarray(dataSet)
    if dataSet.shape[0] == 0:
        raise ValueError("dataSet must contain at least one sample")
    if neighboursNumber < 1:
        raise ValueError("neighboursNumber must be at least 1")

    # Broadcasting subtracts inX from every row, so no tiled copy is needed.
    # The square root is skipped: it is monotonic and cannot change the order.
    sqDistances = ((dataSet - inX) ** 2).sum(axis=1)

    # A stable sort keeps equidistant samples in dataset order, which makes
    # the outcome deterministic.
    sortedDistIndices = sqDistances.argsort(kind="stable")

    # Slicing silently stops at the end of the array, so asking for more
    # neighbours than there are samples no longer raises IndexError.
    nearestIndices = sortedDistIndices[:neighboursNumber]

    # Votes are inserted nearest-first; most_common(1) returns the first
    # label that reaches the maximum count.
    votes = Counter(labels[index] for index in nearestIndices)
    return votes.most_common(1)[0][0]

def normalizeDataSet(dataSet):
    """Rescale every feature column of ``dataSet`` to the interval [0, 1].

    Each value becomes ``(value - columnMin) / (columnMax - columnMin)``.
    A column whose values are all equal has a zero range; it is mapped to
    0 instead of producing NaN from a division by zero.

    Args:
        dataSet: 2-D array-like with one sample per row and one feature per
            column.

    Returns:
        A tuple ``(normalizedDataSet, ranges, minVals)``:
        the rescaled data, the per-column ``max - min`` (zero for constant
        columns) and the per-column minimum.
    """
    dataSet = asarray(dataSet)
    minVals = dataSet.min(axis=0)
    maxVals = dataSet.max(axis=0)
    ranges = maxVals - minVals

    # Divide constant columns by 1 instead of 0; their deltas are all zero,
    # so they normalize to 0.
    safeRanges = where(ranges == 0, 1, ranges)

    # Broadcasting applies the per-column minimum and range to every row.
    normalizedDataSet = (dataSet - minVals) / safeRanges

    return normalizedDataSet, ranges, minVals

def testClassifier(testData, testDataLabels, trainingData, trainingDataLabels,
                   neighboursNumber=3, verbose=True):
    """Measure the misclassification rate of ``classify`` on a test set.

    Args:
        testData: Sequence of feature vectors to classify, one per sample.
        testDataLabels: Expected labels; ``testDataLabels[i]`` belongs to
            ``testData[i]``.
        trainingData: Training samples handed to ``classify``.
        trainingDataLabels: Labels of the training samples.
        neighboursNumber: Number of neighbours that vote in ``classify``.
            Defaults to 3.
        verbose: When true (the default), print the predicted and expected
            label for every test sample.

    Returns:
        The fraction of test samples whose predicted label differs from the
        expected one, as a float.

    Raises:
        ValueError: If ``testData`` is empty.
    """
    sampleCount = len(testData)
    if sampleCount == 0:
        raise ValueError("testData must contain at least one sample")

    errorCount = 0
    for index, testEntry in enumerate(testData):
        # Pass the whole feature vector instead of its first three entries,
        # so the function works for any number of features.
        classifierResult = classify(asarray(testEntry), trainingData,
                                    trainingDataLabels, neighboursNumber)
        expectedLabel = testDataLabels[index]
        if verbose:
            print("Calculated", classifierResult, "should be", expectedLabel)
        if classifierResult != expectedLabel:
            errorCount += 1

    return float(errorCount) / sampleCount

def readDataFromFile(filename):
    """Load a tab-separated dataset into a feature matrix and a label list.

    Every non-blank line must hold at least three numeric fields followed by
    a label in the last field. The first three fields become one matrix row.
    A label made only of digits is converted with ``int``; any other label is
    looked up among 'VeryGood' (3), 'Good' (2) and 'Bad' (1).

    Args:
        filename: Path of the text file to read.

    Returns:
        A tuple ``(matrix, classLabelVector)``: a float array with one row of
        three features per data line, and the list of labels in file order.
        A label that is neither all digits nor one of the known names is
        stored as ``None``.

    Raises:
        ValueError: If a feature field is not numeric or a line has fewer
            than three feature fields.
    """
    labeltoIntMap = {'VeryGood': 3, 'Good': 2, 'Bad': 1}

    # The context manager closes the file even if parsing fails later.
    with open(filename) as dataFile:
        # Blank lines (for example a trailing newline) carry no data and
        # would otherwise break the numeric conversion below.
        rows = [line.strip().split('\t') for line in dataFile if line.strip()]

    matrix = zeros((len(rows), 3))
    classLabelVector = []

    for rowIndex, fields in enumerate(rows):
        matrix[rowIndex, :] = [float(value) for value in fields[0:3]]
        rawLabel = fields[-1]
        if rawLabel.isdigit():
            classLabelVector.append(int(rawLabel))
        else:
            classLabelVector.append(labeltoIntMap.get(rawLabel))

    return matrix, classLabelVector

main.py

import knn
import matplotlib
import matplotlib.pyplot as plt
from numpy import *
import itertools

def splitIntoChunks(data, chunkSize):
    """Cut ``data`` into consecutive slices of at most ``chunkSize`` items.

    Every slice except possibly the last holds exactly ``chunkSize`` items.
    The slices are returned in order, in a list.
    """
    chunks = []
    for start in range(0, len(data), chunkSize):
        stop = start + chunkSize
        chunks.append(data[start:stop])
    return chunks

def main():
    """Run K-fold cross-validation of the k-NN classifier on 'dataset.txt'.

    The normalized dataset is cut into folds of 100 consecutive rows (the
    last fold may be shorter). Each fold is used once as the test set while
    all remaining folds form the training set. The sum of the per-fold error
    rates is printed as "Total Error", followed by their mean as
    "Mean Error".

    Raises:
        ValueError: If the dataset yields fewer than two folds, because no
            training data would be left.
    """
    datingDataMat, datingLabels = knn.readDataFromFile('dataset.txt')
    # Only the normalized matrix is needed; the ranges and minimums are not.
    normMat, _, _ = knn.normalizeDataSet(datingDataMat)

    chunkSize = 100
    dataFolds = splitIntoChunks(normMat, chunkSize)
    labelFolds = splitIntoChunks(datingLabels, chunkSize)

    foldCount = len(dataFolds)
    if foldCount < 2:
        raise ValueError("cross-validation needs at least two folds")

    error = 0.0
    for testIndex in range(foldCount):
        testingData = dataFolds[testIndex]
        testingLabels = labelFolds[testIndex]

        # Training set: every fold except the one under test. Selecting by
        # index leaves the fold lists untouched, so no swap/unswap is needed.
        trainingData = list(itertools.chain.from_iterable(
            fold for index, fold in enumerate(dataFolds) if index != testIndex))
        npTrainingData = array(trainingData)

        trainingLabels = list(itertools.chain.from_iterable(
            fold for index, fold in enumerate(labelFolds) if index != testIndex))
        npTrainingLabels = array(trainingLabels)

        error += knn.testClassifier(testingData, testingLabels, npTrainingData, npTrainingLabels)

    print("Total Error", error)
    # The total is a sum of per-fold rates and can exceed 1; the mean is the
    # comparable figure.
    print("Mean Error", error / foldCount)

# Run the cross-validation only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


259
-1
задан 12 апреля 2018 в 12:04 Источник Поделиться
Комментарии