This case presents a model for classifying fraudulent credit card transactions. It applies a gradient-boosted trees machine learning model to a 284,807 x 31 CSV dataset of approximately 66.3 MB, and makes use of Spark via the sparklyr package for efficient processing of moderately large data.
# Load the credit card transactions dataset (expected in the working directory).
data <- read.csv("creditcardfraud.csv")
# Recode the 0/1 target into a labelled factor: "0" -> legit, "1" -> fraud.
data$Class <- factor(data$Class, levels = c("0", "1"), labels = c("legit", "fraud"))
# Check for missing values (is.na() flags both NA and NaN);
# integer(0) below means no missing values were found.
which(is.na(data))
## integer(0)
# Inspect the structure: Time, the 28 PCA-anonymized predictors V1-V28,
# Amount, and the Class label.
str(data)
## 'data.frame': 284807 obs. of 31 variables:
## $ Time : num 0 0 1 1 2 2 4 7 7 9 ...
## $ V1 : num -1.36 1.192 -1.358 -0.966 -1.158 ...
## $ V2 : num -0.0728 0.2662 -1.3402 -0.1852 0.8777 ...
## $ V3 : num 2.536 0.166 1.773 1.793 1.549 ...
## $ V4 : num 1.378 0.448 0.38 -0.863 0.403 ...
## $ V5 : num -0.3383 0.06 -0.5032 -0.0103 -0.4072 ...
## $ V6 : num 0.4624 -0.0824 1.8005 1.2472 0.0959 ...
## $ V7 : num 0.2396 -0.0788 0.7915 0.2376 0.5929 ...
## $ V8 : num 0.0987 0.0851 0.2477 0.3774 -0.2705 ...
## $ V9 : num 0.364 -0.255 -1.515 -1.387 0.818 ...
## $ V10 : num 0.0908 -0.167 0.2076 -0.055 0.7531 ...
## $ V11 : num -0.552 1.613 0.625 -0.226 -0.823 ...
## $ V12 : num -0.6178 1.0652 0.0661 0.1782 0.5382 ...
## $ V13 : num -0.991 0.489 0.717 0.508 1.346 ...
## $ V14 : num -0.311 -0.144 -0.166 -0.288 -1.12 ...
## $ V15 : num 1.468 0.636 2.346 -0.631 0.175 ...
## $ V16 : num -0.47 0.464 -2.89 -1.06 -0.451 ...
## $ V17 : num 0.208 -0.115 1.11 -0.684 -0.237 ...
## $ V18 : num 0.0258 -0.1834 -0.1214 1.9658 -0.0382 ...
## $ V19 : num 0.404 -0.146 -2.262 -1.233 0.803 ...
## $ V20 : num 0.2514 -0.0691 0.525 -0.208 0.4085 ...
## $ V21 : num -0.01831 -0.22578 0.248 -0.1083 -0.00943 ...
## $ V22 : num 0.27784 -0.63867 0.77168 0.00527 0.79828 ...
## $ V23 : num -0.11 0.101 0.909 -0.19 -0.137 ...
## $ V24 : num 0.0669 -0.3398 -0.6893 -1.1756 0.1413 ...
## $ V25 : num 0.129 0.167 -0.328 0.647 -0.206 ...
## $ V26 : num -0.189 0.126 -0.139 -0.222 0.502 ...
## $ V27 : num 0.13356 -0.00898 -0.05535 0.06272 0.21942 ...
## $ V28 : num -0.0211 0.0147 -0.0598 0.0615 0.2152 ...
## $ Amount: num 149.62 2.69 378.66 123.5 69.99 ...
## $ Class : Factor w/ 2 levels "legit","fraud": 1 1 1 1 1 1 1 1 1 1 ...
# PCA on the predictors only: drop Time (col 1) and Class (col 31).
# NOTE(review): prcomp() centers but does not scale by default; Amount has a
# much larger variance than V1-V28, so it will dominate PC1 — consider
# prcomp(..., scale. = TRUE) and re-checking the selected features.
pca <- prcomp(data[, c(-1, -31)])
plot(pca, type = "l", col = "deeppink")  # scree plot of component variances

# Rank variables by their loading on the first principal component.
# (Renamed from `var`, which shadowed base::var(); logical arguments take
# TRUE/FALSE, not 1/0.)
pc1_loadings <- data.frame(pca$rotation)
pc1_loadings <- pc1_loadings[order(pc1_loadings$PC1, decreasing = TRUE), ]

# Keep the top-loading predictors (V20, V6, V7, V2, V5, V1, Amount) plus Class.
Ndata <- data[, c(21, 7, 8, 3, 6, 2, 30, 31)]
# Spark setup and data transfer ----
library(sparklyr)
library(tidyverse)

# Local Spark session; the dataset comfortably fits a single machine.
sc <- spark_connect(master = "local")

# as.integer() on the factor gives legit = 1, fraud = 2; subtracting 1 yields
# the 0/1 numeric label Spark ML expects. Copy the reduced dataset to Spark.
# (Logical arguments take TRUE, not 1.)
df <- Ndata %>%
  map_if(is.factor, as.integer) %>%
  as_tibble() %>%
  mutate(Class = Class - 1) %>%
  copy_to(sc, ., name = "df", overwrite = TRUE)

# Materialize the Spark table, then split 80/20 into train/test partitions
# (seeded for reproducibility).
df_part <- df %>%
  compute("df_part") %>%
  sdf_random_split(test = 0.2, train = 0.8, seed = 2017)
# Train a gradient-boosted trees classifier on Class using all selected
# predictors in the training partition.
mod <- df_part$train %>%
  ml_gradient_boosted_trees(Class ~ ., type = "classification")

# Which predictors does the ensemble rely on most?
ml_tree_feature_importance(sc = sc, model = mod)
## feature importance
## 1 Amount 0.22787750
## 2 V6 0.21590532
## 3 V5 0.16825887
## 4 V2 0.12167889
## 5 V1 0.09470615
## 6 V7 0.08596699
## 7 V20 0.08560628
# Score the held-out test partition.
test_rf <- ml_predict(mod, df_part$test)

# Evaluate area under the ROC curve (AUC). The original comments called this
# "accuracy" / "F1 score" — the metric requested is areaUnderROC.
# NOTE(review): AUC is normally computed from the raw prediction / probability
# column; pointing it at the hard "prediction" column reduces the ROC to a
# single operating point — confirm this is intended.
test_rf_f1 <- test_rf %>%
  ml_binary_classification_evaluator(label = "Class",
                                     prediction_col = "prediction",
                                     metric = "areaUnderROC")
round(test_rf_f1 * 100, 2)  # AUC expressed as a percentage
## [1] 96.53