Homework 1 Intro to Analytical Modeling PDF

Title Homework 1 Intro to Analytical Modeling
Author Gary Gan
Course Intro to Analytics Modeling
Institution Georgia Institute of Technology
Pages 6
File Size 141.2 KB
File Type PDF
Total Downloads 58
Total Views 151

Summary

Homework 1 Intro to Analytical Modeling. There are multiple questions...


Description

Hw1 Gary 5/23/2021

Question 2.1 Digital marketing in e-commerce is a space where classification plays a huge role. For example, using website traffic analytics data to determine whether a consumer will add an item to the shopping cart. Some potential predictors include: - New vs. returning customer - Type of phone - Number of pages visited - Time spent on the site - Landing page

# Question 2.2.1
library(kernlab)

# Import the credit card data (file has no header row)
df <- read.delim("credit_card_data.txt", header = FALSE)
# Transform the data frame into a matrix, as ksvm expects one
data <- as.matrix(df)

# Fit a linear-kernel SVM for a given cost parameter C and return the
# model's accuracy on the full data set.
# NOTE: this is training accuracy (predicting the same points the model
# was fit on), not a held-out estimate.
svm_with_c <- function(c) {
  model <- ksvm(data[, 1:10], as.factor(data[, 11]),
                type = "C-svc", kernel = "vanilladot", C = c, scaled = TRUE)
  pred <- predict(model, data[, 1:10])
  sum(pred == data[, 11]) / nrow(data)
}

print(svm_with_c(10))
## Setting default kernel parameters
## [1] 0.8639144
print(svm_with_c(50))
## Setting default kernel parameters
## [1] 0.8639144
print(svm_with_c(1000))
## Setting default kernel parameters
## [1] 0.8623853

# C = 50 looked like a good value above, so refit the final linear SVM.
model <- ksvm(data[, 1:10], as.factor(data[, 11]),
              type = "C-svc", kernel = "vanilladot", C = 50, scaled = TRUE)
## Setting default kernel parameters

# Recover the classifier coefficients a1..am from the support vectors
a <- colSums(model@xmatrix[[1]] * model@coef[[1]])
print(a)
##            V1            V2            V3            V4            V5
## -0.0010523630 -0.0012025131 -0.0015382662  0.0028761998  1.0052764944
##            V6            V7            V8            V9           V10
## -0.0024958086  0.0001810245 -0.0006514829 -0.0013757143  0.1064002847

# Recover the intercept a0
a0 <- model@b
print(a0)
## [1] -0.08147145

# Predict on the full data set and report the training accuracy
pred <- predict(model, data[, 1:10])
print(sum(pred == data[, 11]) / nrow(data))
## [1] 0.8639144

# 2.2.3
# Import kknn package
library('kknn')

# Leave-one-out accuracy of a k-nearest-neighbours classifier for a given k.
# For each row i, fit on all the other rows, predict row i, and threshold
# the fitted probability at 0.5 (>= 0.5 -> class 1, else class 0).
knn_accuracy_given_k <- function(k) {
  # vapply over seq_len() avoids growing a vector with c() inside a loop
  all_pred <- vapply(seq_len(nrow(df)), function(i) {
    knn_model <- kknn(V11 ~ V1 + V2 + V3 + V4 + V5 + V6 + V7 + V8 + V9 + V10,
                      df[-i, ], df[i, ], k = k, scale = TRUE)
    # vectorized replacement for the original scalar if/else rounding
    as.numeric(fitted(knn_model) >= 0.5)
  }, numeric(1))
  sum(all_pred == df[, 11]) / nrow(df)
}

# To determine the optimal k, try out k from 1 to 50 and plot the results
ks <- seq(50)
accuracies <- vapply(ks, knn_accuracy_given_k, numeric(1))

plot(ks, accuracies)
# (Scatter plot of LOO accuracy vs. k: values range roughly 0.82-0.85
#  over k = 0..50, peaking around k = 12.)

# It seems like around k = 12 is where we have the highest accuracy;
# let's verify it with which.max()
which.max(accuracies)
## [1] 12
# Let's see how well it classifies all the data
knn_accuracy_given_k(which.max(accuracies))
## [1] 0.853211

# 3.1.a
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
## Attaching package: 'ggplot2'
## The following object is masked from 'package:kernlab': alpha
## Attaching package: 'caret'
## The following object is masked from 'package:kknn': contr.dummy

# Set random seed
set.seed(1)

# Split training and testing.
# createDataPartition() from the caret package returns a set of randomly
# selected row indices based on the attributes you give; with p = 0.8 it
# returns a randomly selected 80% of all the data as the training index.
index <- createDataPartition(df[, 11], p = 0.8, list = FALSE, times = 1)

# Split training and test with the returned index
train_df <- df[index, ]
test_df <- df[-index, ]

# Transform the responses to factors so that train.kknn picks the right
# (classification) metrics when selecting the best k
train_df$V11 <- as.factor(train_df$V11)
test_df$V11 <- as.factor(test_df$V11)

# Use leave-one-out cross-validation to find the best model
knn_cv_model <- train.kknn(V11 ~ V1 + V2 + V3 + V4 + V5 + V6 + V7 + V8 + V9 + V10,
                           train_df, kmax = 50, scale = TRUE)

# Get the best k value found by LOOCV
best_k <- knn_cv_model[['best.parameters']][[2]]

# Report the training accuracy
train_y_hat <- fitted(knn_cv_model)[[best_k]][1:nrow(train_df)]
sum(train_y_hat == train_df[, 11]) / nrow(train_df)
## [1] 0.8549618

# Now that we have selected the best k (19 in this run), we use that to
# retrain our model.

4

# Retrain a kknn model on the full training set using the k selected by
# LOOCV above (best_k, which was 19 in this run), instead of hard-coding 19 --
# this stays correct if the seed or the data partition changes.
knn_cv_model_retrained <- kknn(V11 ~ V1 + V2 + V3 + V4 + V5 + V6 + V7 + V8 + V9 + V10,
                               train_df, test_df, k = best_k, scale = TRUE)
# Fit the model
test_pred <- fitted(knn_cv_model_retrained)
# Get the accuracy on the held-out test set
sum(test_pred == test_df[, 11]) / nrow(test_df)
## [1] 0.8538462

# 3.1.b
# Set random seed for this problem
set.seed(1)

# Split training, validation and testing.
# createDataPartition() from the caret package returns a set of randomly
# selected row indices; with p = 0.7 it returns a randomly selected 70% of
# all the data as the training set.
first_index <- createDataPartition(df[, 11], p = 0.7, list = FALSE, times = 1)
train_df <- df[first_index, ]
test_vali_df <- df[-first_index, ]

# Then use the same function again, this time on the non-training data,
# to split it 50/50 into testing and validation sets
second_index <- createDataPartition(test_vali_df[, 11], p = 0.5, list = FALSE, times = 1)
test_df <- test_vali_df[second_index, ]
validation_df <- test_vali_df[-second_index, ]

# Change the response to a factor in all three sets
train_df$V11 <- as.factor(train_df$V11)
validation_df$V11 <- as.factor(validation_df$V11)
test_df$V11 <- as.factor(test_df$V11)

# Validation-set accuracy of a kknn model (trained on train_df) for a given k
knn_accuracy_given_k <- function(k) {
  knn_model <- kknn(V11 ~ V1 + V2 + V3 + V4 + V5 + V6 + V7 + V8 + V9 + V10,
                    train_df, validation_df, k = k, scale = TRUE)
  predicted <- fitted(knn_model)
  sum(predicted == validation_df[, 11]) / nrow(validation_df)
}

# To determine the optimal k, try out k from 1 to 50.
# vapply preallocates the result instead of growing a vector with c()
# inside a loop.
ks <- seq(50)
accuracies <- vapply(ks, knn_accuracy_given_k, numeric(1))
# Get the best k

5

# Pick the k with the highest validation accuracy
best_k <- which.max(accuracies)
# Use the best k to evaluate on the testing data
knn_model <- kknn(V11 ~ V1 + V2 + V3 + V4 + V5 + V6 + V7 + V8 + V9 + V10,
                  train_df, test_df, k = best_k, scale = TRUE)
predicted <- fitted(knn_model)
# This is the accuracy result
sum(predicted == test_df[, 11]) / nrow(test_df)
## [1] 0.877551

# As we can see from the result, cross-validation -- specifically,
# leave-one-out cross-validation -- was able to generate a better result
# than the generic train/validation/test split. A potential reason is that
# LOOCV tends to generate unbiased but high-variance results, given that it
# uses almost all of the data, especially since I added the step to retrain
# the model with all of the training data.

6...


Similar Free PDFs