# R 语言实现懒惰学习——使用近邻分类
# Example: Classifying Cancer Samples ----
## Step 1: Exploring and preparing the data ----
# Import the CSV file, keeping character columns as plain strings
# (stringsAsFactors = FALSE) so the diagnosis column can be recoded explicitly.
wbcd <- read.csv(
  "F:\\rwork\\Machine Learning with R (2nd Ed.)\\Chapter 03\\wisc_bc_data.csv",
  stringsAsFactors = FALSE
)

# Examine the structure of the wbcd data frame.
str(wbcd)

# Drop the first column (the id feature); it is not used as a predictor.
wbcd <- wbcd[-1]
# Frequency table of the diagnosis codes.
table(wbcd$diagnosis)

# Recode diagnosis as a factor, mapping the codes "B" and "M" to more
# informative labels.
wbcd$diagnosis <- factor(
  wbcd$diagnosis,
  levels = c("B", "M"),
  labels = c("Benign", "Malignant")
)

# Table of proportions (as percentages rounded to one decimal place) using the
# more informative labels.
round(prop.table(table(wbcd$diagnosis)) * 100, digits = 1)

# Summarize three numeric features.
summary(wbcd[c("radius_mean", "area_mean", "smoothness_mean")])
# Min-max normalization: rescale a numeric vector onto the [0, 1] range.
#
# x: a numeric vector.
#
# Returns a numeric vector the same length as x, computed as
# (x - min(x)) / (max(x) - min(x)).
#
# A vector whose values are all equal has zero range, which would otherwise
# yield 0 / 0 = NaN for every element; such a vector is mapped to all zeros
# instead so the result stays usable as a distance feature. If x contains NA,
# the range is NA and every element of the result is NA.
normalize <- function(x) {
  rng <- range(x)
  span <- rng[2] - rng[1]

  # Guard against division by zero for constant input. The is.na() check keeps
  # the comparison from failing when the range itself is NA/NaN.
  if (!is.na(span) && span == 0) {
    return(x - rng[1])
  }

  (x - rng[1]) / span
}
# Normalize the wbcd data: apply normalize() to columns 2 through 31 (every
# column after the diagnosis label in column 1) and rebuild a data frame.
wbcd_n <- as.data.frame(lapply(wbcd[2:31], normalize))

# Confirm that the normalization worked. As expected, area_mean originally
# ranged from 143.5 to 2501.0 and now ranges from 0 to 1.
summary(wbcd_n$area_mean)
# Create training and test data: rows 1-469 are used for training and
# rows 470-569 (the remaining 100 rows) for testing.
wbcd_train <- wbcd_n[1:469, ]
wbcd_test <- wbcd_n[470:569, ]

# Create labels for training and test data from column 1 of wbcd (the
# diagnosis factor), using the same row split as above.
wbcd_train_labels <- wbcd[1:469, 1]
wbcd_test_labels <- wbcd[470:569, 1]
## Step 2: Training a model on the data ----
# The class package supplies knn().
library(class)

# Predict a label for every test row from its k = 21 nearest training rows.
wbcd_test_pred <- knn(
  train = wbcd_train,
  test = wbcd_test,
  cl = wbcd_train_labels,
  k = 21
)
## Step 3: Evaluating model performance ----
# The gmodels package supplies CrossTable().
library(gmodels)

# Create the cross tabulation of predicted vs. actual labels; the chi-square
# contribution of each cell is omitted from the output.
CrossTable(
  x = wbcd_test_labels,
  y = wbcd_test_pred,
  prop.chisq = FALSE
)
## Step 4: Improving model performance ----
# Standardize the model data with scale() (z-scores), using every column
# except the first (the diagnosis label).
wbcd_z <- as.data.frame(scale(wbcd[-1]))

# Confirm that the transformation was applied correctly.
summary(wbcd_z$area_mean)

# Create training and test datasets from the standardized data, using the same
# row split as before.
wbcd_train <- wbcd_z[1:469, ]
wbcd_test <- wbcd_z[470:569, ]

# Re-classify test cases.
wbcd_test_pred <- knn(
  train = wbcd_train,
  test = wbcd_test,
  cl = wbcd_train_labels,
  k = 21
)

# Create the cross tabulation of predicted vs. actual labels.
CrossTable(
  x = wbcd_test_labels,
  y = wbcd_test_pred,
  prop.chisq = FALSE
)
# Try several different values of k, going back to the min-max normalized data.
wbcd_train <- wbcd_n[1:469, ]
wbcd_test <- wbcd_n[470:569, ]

# One classification and cross tabulation per candidate k. After the loop,
# wbcd_test_pred holds the predictions for the last value (k = 27).
for (k in c(1, 5, 11, 15, 21, 27)) {
  # Label each table so the output shows which k produced it.
  cat("\nk =", k, "\n")
  wbcd_test_pred <- knn(
    train = wbcd_train,
    test = wbcd_test,
    cl = wbcd_train_labels,
    k = k
  )
  CrossTable(x = wbcd_test_labels, y = wbcd_test_pred, prop.chisq = FALSE)
}
# 欢迎指正哦~
# 需要数据请私聊哦~
# 原理百度一下就有很多哦,这里就不阐述了。