Title | Question 3 - my take. iris.txt contains 150 data points, each with four predictor variables and one categorical response |
---|---|
Author | Akshay SR |
Course | Intro to Analytics Modeling |
Institution | Georgia Institute of Technology |
Pages | 3 |
File Size | 316.5 KB |
File Type | |
Total Downloads | 82 |
Total Views | 121 |
My take: iris.txt contains 150 data points, each with four predictor variables and one categorical response...
# Question 3.1
# Using the same data set (credit_card_data.txt or credit_card_data-headers.txt)
# as in Question 2.2, use the ksvm or kknn function to find a good classifier:
# (a) using cross-validation (do this for the k-nearest-neighbors model; SVM is
#     optional); and
# (b) splitting the data into training, validation, and test data sets (pick
#     either KNN or SVM; the other is optional).

# Load the kernlab (SVM) and kknn (k-nearest-neighbors) libraries
library(kernlab)
library(kknn)

# Load the data file interactively and print the first 10 rows.
# header = FALSE: the no-headers file has no column names, so columns
# become V1..V11 (V11 is the 0/1 response used below).
cc_data <- read.delim(file.choose(), header = FALSE)
head(cc_data, 10)
# Set seed to ensure reproducibility of the random splits below
set.seed(1)

# Split the data into training / validation / test sets in a 70% / 15% / 15%
# ratio by sampling row indices.

# Training set: a random 70% of the rows
mask_cc_training <- sample(nrow(cc_data), size = floor(nrow(cc_data) * 0.7))
cc_trainingset <- cc_data[mask_cc_training, ]

# The remaining 30% is split evenly: half validation, half test
other_data <- cc_data[-mask_cc_training, ]
mask_cc_validation <- sample(nrow(other_data), size = floor(nrow(other_data) / 2))
cc_validationset <- other_data[mask_cc_validation, ]
cc_testset <- other_data[-mask_cc_validation, ]
# Accuracy storage for all 29 models: indices 1-9 hold the SVM results,
# indices 10-29 hold the 20 KNN results (k = 1..20 at offset +9).
accuracy <- rep(0, 29)

# Candidate values of the SVM cost parameter C to evaluate (log-spaced
# from 1e-5 to 1e3).
amounts_c <- c(0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000)
# Fit one SVM per candidate C value and score each on the validation set
for (i in seq_along(amounts_c)) {
  # Fit on the training set: C-classification with a simple linear kernel
  # (vanilladot); scaled = TRUE standardizes the predictors
  cc_scaledmodel <- ksvm(
    as.matrix(cc_trainingset[, 1:10]),
    as.factor(cc_trainingset[, 11]),
    type = "C-svc",
    kernel = "vanilladot",
    C = amounts_c[i],
    scaled = TRUE
  )
  # Predict the validation-set classes with the fitted model
  predicted <- predict(cc_scaledmodel, cc_validationset[, 1:10])
  # Percentage of validation points classified correctly
  accuracy[i] <- (sum(predicted == cc_validationset$V11) / nrow(cc_validationset)) * 100
}
# Validation-set accuracy (%) of each of the nine SVM models
accuracy[1:9]

# Report the best SVM model found in the sweep above
best_svm_idx <- which.max(accuracy[1:9])
cat("Top SVM model is : ", best_svm_idx, "\n")
cat("Top C value is : ", amounts_c[best_svm_idx], "\n")
cat("Top validation set accuracy is (%)=", max(accuracy[1:9]), "\n")
# Refit the best SVM (by validation accuracy) on the training data
cc_scaledmodel <- ksvm(
  as.matrix(cc_trainingset[, 1:10]),
  as.factor(cc_trainingset[, 11]),
  type = "C-svc",
  kernel = "vanilladot",
  C = amounts_c[which.max(accuracy[1:9])],
  scaled = TRUE
)

# Evaluate the chosen model on the held-out test set.
# NOTE(review): this is reported as a fraction (0-1), whereas the
# validation accuracies above are percentages.
cat(
  "Test data set accuracy = ",
  sum(predict(cc_scaledmodel, cc_testset[, 1:10]) == cc_testset$V11) / nrow(cc_testset),
  "\n"
)
# Now, fit k-nearest-neighbor models for k = 1..20
# (original comment said "SVM models" — these are the KNN models)
for (k in 1:20) {
  # Fit the KNN model on the training set and predict the validation set
  # via kknn; scale = TRUE standardizes the predictors
  knn_model <- kknn(V11 ~ ., cc_trainingset, cc_validationset, k = k, scale = TRUE)
  # kknn returns a continuous fitted value; round to the nearer of 0 or 1
  prediction <- as.integer(fitted(knn_model) + 0.5)
  # Percentage of validation points classified correctly; KNN results are
  # stored at offset +9 (after the 9 SVM slots)
  accuracy[k + 9] <- (sum(prediction == cc_validationset$V11) / nrow(cc_validationset)) * 100
}

# Validation-set accuracy (%) of the 20 KNN models
accuracy[10:29]
# Report the best KNN model, by validation-set accuracy percentage.
# Its index within accuracy[10:29] equals its k value.
best_knn_k <- which.max(accuracy[10:29])
cat("Top KNN model number is :", best_knn_k, "\n")
cat("Top Validation accuracy percentage is : ", max(accuracy[10:29]), "\n")
# Evaluate the best KNN model (best k from validation) on the held-out test set.
# NOTE(review): reported as a fraction (0-1), unlike the validation percentages.
knn_model <- kknn(
  V11 ~ .,
  cc_trainingset,
  cc_testset,
  k = which.max(accuracy[10:29]),
  scale = TRUE
)
# Round the continuous kknn fitted values to the nearer of 0 or 1
prediction <- as.integer(fitted(knn_model) + 0.5)
cat("Test Data set accuracy = ", sum(prediction == cc_testset$V11) / nrow(cc_testset), "\n")
# The best model in evaluation data is below # Checking whether kvsm or kknn method is better if (which.max(accuracy)...