python实现KNN近邻算法

(编辑:jimmy 日期: 2024/12/27 浏览:2)

示例:《电影类型分类》

获取数据来源

| 电影名称 | 打斗次数 | 接吻次数 | 电影类型 |
| --- | --- | --- | --- |
| California Man | 3 | 104 | Romance |
| He's Not Really into Dudes | 8 | 95 | Romance |
| Beautiful Woman | 1 | 81 | Romance |
| Kevin Longblade | 111 | 15 | Action |
| Robo Slayer 3000 | 99 | 2 | Action |
| Amped II | 88 | 10 | Action |
| Unknown | 18 | 90 | unknown |

数据展示:先把数据画成散点图,用肉眼判断类型为 unknown 的电影属于哪一类

from matplotlib import pyplot as plt

# Use a CJK-capable font so the Chinese title and axis labels render.
plt.rcParams["font.sans-serif"] = ["SimHei"]
# Movie titles, in the same order as labels / X / Y below.
names = ["California Man", "He's Not Really into Dudes", "Beautiful Woman",
         "Kevin Longblade", "Robo Slayer 3000", "Amped II", "Unknown"]
# Genre label of each movie; the last entry is the sample to classify.
labels = ["Romance", "Romance", "Romance", "Action", "Action", "Action", "Unknown"]
colors = ["darkblue", "red", "green"]
# dict.fromkeys keeps first-seen order, so every label gets the same color on
# every run (iterating a set of strings is not order-stable between runs).
colorDict = dict(zip(dict.fromkeys(labels), colors))
print(colorDict)
# Number of fight scenes (X) and kiss scenes (Y) per movie.
# NOTE(review): the data table in the article lists 90 kisses for "Unknown",
# but the last Y value here is 88 — confirm which one is intended.
X = [3, 8, 1, 111, 99, 88, 18]
Y = [104, 95, 81, 15, 2, 10, 88]
# One point per movie, colored by its genre label.
for fights, kisses, label in zip(X, Y, labels):
    plt.scatter(fights, kisses, color=colorDict[label])
plt.title("通过打斗次数和接吻次数判断电影类型", fontsize=18)
plt.xlabel("电影中打斗镜头出现的次数", fontsize=16)
plt.ylabel("电影中接吻镜头出现的次数", fontsize=16)
plt.show()
"htmlcode">
# 自定义实现 mytest1.py
import numpy as np
"Romance", "Romance", "Romance", "Action", "Action", "Action"]
 return features, labels
"""
 KNN算法实现,采用欧式距离
 :param testFeature: 测试数据集,ndarray类型,一维数组
 :param trainingSet: 训练数据集,ndarray类型,二维数组
 :param labels: 训练集对应标签,ndarray类型,一维数组
 :param k: k值,int类型
 :return: 预测结果,类型与标签中元素一致
 """
 dataSetsize = trainingSet.shape[0]
 """
 构建一个由dataSet[i] - testFeature的新的数据集diffMat
 diffMat中的每个元素都是dataSet中每个特征与testFeature的差值(欧式距离中差)
 """
 testFeatureArray = np.tile(testFeature, (dataSetsize, 1))
 diffMat = testFeatureArray - trainingSet
 # 对每个差值求平方
 sqDiffMat = diffMat ** 2
 # 计算dataSet中每个属性与testFeature的差的平方的和
 sqDistances = sqDiffMat.sum(axis=1)
 # 计算每个feature与testFeature之间的欧式距离
 distances = sqDistances ** 0.5
"""
 排序,按照从小到大的顺序记录distances中各个数据的位置
 如distance = [5, 9, 0, 2]
 则sortedStance = [2, 3, 0, 1]
 """
 sortedDistances = distances.argsort()
"htmlcode">
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
"FlightDistance", "PlaytimePreweek", "IcecreamCostPreweek", "label"]
 datingTrainData = np.array(datingData[["FlightDistance", "PlaytimePreweek", "IcecreamCostPreweek"]])
 datingTrainLabel = np.array(datingData["label"])
 return datingData, datingTrainData, datingTrainLabel
"3d")
 plt.scatter(np.array([datingTrainData[x][0]
       for x in range(len(datingTrainLabel))
       if datingTrainLabel[x] == "smallDoses"]),
    np.array([datingTrainData[x][1]
       for x in range(len(datingTrainLabel))
       if datingTrainLabel[x] == "smallDoses"]),
    np.array([datingTrainData[x][2]
       for x in range(len(datingTrainLabel))
       if datingTrainLabel[x] == "smallDoses"]), c="red")
 plt.scatter(np.array([datingTrainData[x][0]
       for x in range(len(datingTrainLabel))
       if datingTrainLabel[x] == "didntLike"]),
    np.array([datingTrainData[x][1]
       for x in range(len(datingTrainLabel))
       if datingTrainLabel[x] == "didntLike"]),
    np.array([datingTrainData[x][2]
       for x in range(len(datingTrainLabel))
       if datingTrainLabel[x] == "didntLike"]), c="green")
 plt.scatter(np.array([datingTrainData[x][0]
       for x in range(len(datingTrainLabel))
       if datingTrainLabel[x] == "largeDoses"]),
    np.array([datingTrainData[x][1]
       for x in range(len(datingTrainLabel))
       if datingTrainLabel[x] == "largeDoses"]),
    np.array([datingTrainData[x][2]
       for x in range(len(datingTrainLabel))
       if datingTrainLabel[x] == "largeDoses"]), c="blue")
 plt.xlabel("飞行里程数", fontsize=16)
 plt.ylabel("视频游戏耗时百分比", fontsize=16)
 plt.clabel("冰淇凌消耗", fontsize=16)
 plt.show()
 
# Load the dating data and display it with datingView3D.
# NOTE(review): FILEPATH1 is not defined in the visible part of this listing —
# presumably the path to the dating data file; confirm before running.
datingData, datingTrainData, datingTrainLabel = loadDatingData(FILEPATH1)
datingView3D(datingTrainData, datingTrainLabel)

问题分析:抽取数据集的前10%作为测试样本,用其余的后90%作为训练集,对分类器进行测试并统计错误率

编码实现

# 自定义方法实现
import pandas as pd
import numpy as np
"FlightDistance", "PlaytimePreweek", "IcecreamCostPreweek", "label"]
 datingTrainData = np.array(datingData[["FlightDistance", "PlaytimePreweek", "IcecreamCostPreweek"]])
 datingTrainLabel = np.array(datingData["label"])
 return datingData, datingTrainData, datingTrainLabel
"The total error rate is : {}\n".format(error/float(numberTest)))
"__main__":
 FILEPATH = "./datingTestSet1.txt"
 datingTest(FILEPATH)
# Implementation using the third-party scikit-learn package
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier

# NOTE(review): loadDatingData and autoNorm are not defined in the visible
# part of this listing; they must be in scope for this script to run.
if __name__ == "__main__":
    FILEPATH = "./datingTestSet1.txt"
    datingData, datingTrainData, datingTrainLabel = loadDatingData(FILEPATH)
    normValuesData = autoNorm(datingTrainData)
    ratio = 0.10
    total = normValuesData.shape[0]
    # The first `numberTest` rows are held out for testing.
    numberTest = int(total * ratio)
    if numberTest == 0:
        # Without this the error rate below would divide by zero.
        raise SystemExit("Not enough samples to hold out a test split")

    k = 5
    clf = KNeighborsClassifier(n_neighbors=k)
    # Train on the remaining rows only, so the test rows stay unseen.
    clf.fit(normValuesData[numberTest:total], datingTrainLabel[numberTest:total])

    # Predict the whole held-out slice in one call and count mismatches.
    predictions = clf.predict(normValuesData[:numberTest])
    errorCount = int(np.sum(predictions != datingTrainLabel[:numberTest]))
    print("The total error rate is : {}\n".format(errorCount/float(numberTest)))

以上就是python实现KNN近邻算法的详细内容,更多关于python实现KNN近邻算法的资料请关注其它相关文章!