Overview

This case study presents a model for classifying fraudulent credit card transactions. It applies a gradient-boosted trees (GBM) model to a 284,807 × 31 CSV dataset of approximately 66.3 MB, using Spark via the sparklyr package to process this moderately large dataset efficiently.

Load and Manage Data

data <- read.csv("creditcardfraud.csv")
data$Class <- factor(data$Class, levels = c("0", "1"), labels = c("legit", "fraud"))
# Check for missing values (NA)
which(is.na(data))
## integer(0)
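
Fraud data like this is heavily imbalanced between the two classes, which shapes how the model should be evaluated later on. A quick look at the class distribution (a small added check; output omitted here):

# Counts and proportions of legit vs. fraud transactions
table(data$Class)
prop.table(table(data$Class))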

Check Data Structure

str(data)
## 'data.frame':    284807 obs. of  31 variables:
##  $ Time  : num  0 0 1 1 2 2 4 7 7 9 ...
##  $ V1    : num  -1.36 1.192 -1.358 -0.966 -1.158 ...
##  $ V2    : num  -0.0728 0.2662 -1.3402 -0.1852 0.8777 ...
##  $ V3    : num  2.536 0.166 1.773 1.793 1.549 ...
##  $ V4    : num  1.378 0.448 0.38 -0.863 0.403 ...
##  $ V5    : num  -0.3383 0.06 -0.5032 -0.0103 -0.4072 ...
##  $ V6    : num  0.4624 -0.0824 1.8005 1.2472 0.0959 ...
##  $ V7    : num  0.2396 -0.0788 0.7915 0.2376 0.5929 ...
##  $ V8    : num  0.0987 0.0851 0.2477 0.3774 -0.2705 ...
##  $ V9    : num  0.364 -0.255 -1.515 -1.387 0.818 ...
##  $ V10   : num  0.0908 -0.167 0.2076 -0.055 0.7531 ...
##  $ V11   : num  -0.552 1.613 0.625 -0.226 -0.823 ...
##  $ V12   : num  -0.6178 1.0652 0.0661 0.1782 0.5382 ...
##  $ V13   : num  -0.991 0.489 0.717 0.508 1.346 ...
##  $ V14   : num  -0.311 -0.144 -0.166 -0.288 -1.12 ...
##  $ V15   : num  1.468 0.636 2.346 -0.631 0.175 ...
##  $ V16   : num  -0.47 0.464 -2.89 -1.06 -0.451 ...
##  $ V17   : num  0.208 -0.115 1.11 -0.684 -0.237 ...
##  $ V18   : num  0.0258 -0.1834 -0.1214 1.9658 -0.0382 ...
##  $ V19   : num  0.404 -0.146 -2.262 -1.233 0.803 ...
##  $ V20   : num  0.2514 -0.0691 0.525 -0.208 0.4085 ...
##  $ V21   : num  -0.01831 -0.22578 0.248 -0.1083 -0.00943 ...
##  $ V22   : num  0.27784 -0.63867 0.77168 0.00527 0.79828 ...
##  $ V23   : num  -0.11 0.101 0.909 -0.19 -0.137 ...
##  $ V24   : num  0.0669 -0.3398 -0.6893 -1.1756 0.1413 ...
##  $ V25   : num  0.129 0.167 -0.328 0.647 -0.206 ...
##  $ V26   : num  -0.189 0.126 -0.139 -0.222 0.502 ...
##  $ V27   : num  0.13356 -0.00898 -0.05535 0.06272 0.21942 ...
##  $ V28   : num  -0.0211 0.0147 -0.0598 0.0615 0.2152 ...
##  $ Amount: num  149.62 2.69 378.66 123.5 69.99 ...
##  $ Class : Factor w/ 2 levels "legit","fraud": 1 1 1 1 1 1 1 1 1 1 ...

Preprocess with Principal Component Analysis

Use PCA to identify the predictors that contribute little to overall variability and drop them, keeping only the most influential variables.

# PCA on the predictors only (drop Time and Class)
pca <- prcomp(data[, c(-1, -31)])
plot(pca, type = "l", col = "deeppink")

# Rank variables by their loadings on the first principal component
DFpca <- data.frame(pca$rotation)
var <- DFpca[order(DFpca$PC1, decreasing = TRUE), ]
# Subset the selected columns: V20, V6, V7, V2, V5, V1, Amount, and Class
Ndata <- data[, c(21, 7, 8, 3, 6, 2, 30, 31)]
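
The selection above hard-codes column indices, which presumably follow the PC1 ranking stored in var. A quick cross-check of the variance explained per component and the top-ranked variable names (a small sketch using the objects defined above; output omitted):

# Proportion of variance explained by the leading components
summary(pca)$importance[, 1:6]

# Variable names ordered by their PC1 loading (largest first)
head(rownames(var))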

Initialize a Local Spark Connection and Copy the Data Frame to Spark

library(sparklyr)
library(tidyverse)
sc <- spark_connect(master = "local")

# Convert the Class factor to integer (legit = 1, fraud = 2), recode to 0/1,
# then copy the result to Spark as a table named "df"
df <- Ndata %>%
  map_if(is.factor, as.integer) %>%
  as_tibble() %>%
  mutate(Class = Class - 1) %>%
  copy_to(sc, ., name = "df", overwrite = TRUE)
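
A quick sanity check (an added step, not in the original) confirms the data landed in Spark intact:

# Dimensions computed inside Spark
sdf_nrow(df)
sdf_ncol(df)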

Data Partition: 80% Training and 20% Testing

# Cache the Spark table, then split it 80/20 with a fixed seed for reproducibility
df_part <- df %>%
  compute("df_part") %>%
  sdf_random_split(train = 0.8, test = 0.2, seed = 2017)
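
The split yields two Spark tables whose sizes can be verified (a sketch; exact counts vary with the random split):

# Roughly 80% and 20% of the 284,807 rows
sdf_nrow(df_part$train)
sdf_nrow(df_part$test)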

Fit a Gradient Boosted Trees Model

mod <- df_part$train %>%
  ml_gradient_boosted_trees(Class ~ ., type = "classification")
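
The call above accepts Spark's default hyperparameters. The main knobs can be set explicitly; the values below are illustrative placeholders, not the settings behind the results reported here:

mod_tuned <- df_part$train %>%
  ml_gradient_boosted_trees(
    Class ~ .,
    type = "classification",
    max_iter = 20,    # number of boosting rounds (trees); illustrative value
    max_depth = 5,    # maximum depth of each tree; illustrative value
    step_size = 0.1   # learning rate; illustrative value
  )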

Retrieve Feature Importance

ml_tree_feature_importance(sc = sc, model = mod)
##   feature importance
## 1  Amount 0.22787750
## 2      V6 0.21590532
## 3      V5 0.16825887
## 4      V2 0.12167889
## 5      V1 0.09470615
## 6      V7 0.08596699
## 7     V20 0.08560628
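
The table can also be visualized; a minimal sketch with ggplot2 (already loaded above via tidyverse):

imp <- ml_tree_feature_importance(sc = sc, model = mod)
ggplot(imp, aes(x = reorder(feature, importance), y = importance)) +
  geom_col(fill = "deeppink") +
  coord_flip() +
  labs(x = "Feature", y = "Importance")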

Predict and Evaluate on the Test Data

test_pred <- ml_predict(mod, df_part$test)
# Evaluate with the area under the ROC curve (AUC), the evaluator's default
# metric; with classes this imbalanced, AUC is more informative than accuracy
round((test_auc <- test_pred %>%
    ml_binary_classification_evaluator(label_col = "Class",
                                       metric_name = "areaUnderROC")) * 100, 2)
## [1] 96.53
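
Beyond AUC, a confusion matrix separates the two error types. A sketch that collects the test predictions locally (counts depend on the split, so output is omitted):

test_pred %>%
  select(Class, prediction) %>%
  collect() %>%
  with(table(actual = Class, predicted = prediction))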

An area under the ROC curve of roughly 0.97 on the held-out test set indicates that this GBM model discriminates well between legitimate and fraudulent transactions. AUC is the appropriate yardstick here: because fraud cases are a small minority of the data, raw accuracy would look deceptively high even for a trivial classifier.
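
As a final housekeeping step (not part of the original analysis), the local Spark connection can be closed once the results are collected:

spark_disconnect(sc)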