Skip to content

Commit c177452

Browse files
authored
Merge pull request #2 from mifek/master
Changed your script to a reusable package
2 parents cb5cdb9 + 3b57397 commit c177452

33 files changed

+573
-62
lines changed

.Rbuildignore

+4
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
^bmarketing\.Rproj$
2+
^\.Rproj\.user$
3+
^.Rproj$
4+
^R/bmarketing\.R$

DESCRIPTION

+15
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
Package: bmarketing
2+
Title: Decision tree model data process
3+
Version: 0.0.0.9000
4+
Authors@R: person("Michael", "Mifek", email = "[email protected]",
5+
role = c("aut", "cre"))
6+
Description: Clean and transform data for use in a decision tree model.
7+
Depends: R (>= 2.10)
8+
License: What license it uses
9+
Encoding: UTF-8
10+
LazyData: true
11+
RoxygenNote: 6.1.1
12+
Imports:
13+
readr, dplyr, rpart, rpart.plot
14+
Suggests:
15+
testthat

NAMESPACE

+15
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
# Generated by roxygen2: do not edit by hand
2+
3+
export(clean_data)
4+
export(import_data_from_csv)
5+
export(model_data)
6+
export(model_performance)
7+
export(model_plot)
8+
export(model_predict)
9+
export(transform_data)
10+
importFrom(dplyr,mutate_at)
11+
importFrom(dplyr,mutate_if)
12+
importFrom(readr,parse_double)
13+
importFrom(readr,read_csv2)
14+
importFrom(rpart,rpart)
15+
importFrom(rpart.plot,rpart.plot)

R/clean_data.R

+35
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
#' Remove columns with too many missing values
#'
#' Validates the input, stops if the target variable contains missing values
#' and drops every column whose share of \code{NA}s is greater than
#' \code{na_threshold}. A warning names the columns that are dropped.
#'
#' @param df data.frame, data to be cleaned; must have at least one row and
#'   more than one column
#' @param target_var character, column name of \code{df} which acts as the
#'   target variable
#' @param na_threshold numeric, columns whose fraction of NAs is greater than
#'   this threshold are excluded
#' @param ... currently unused; accepted so existing calls keep working
#'
#' @return data.frame, \code{df} without the columns that have too many NAs
#'
#' @export
#' @examples
#' \dontrun{
#' data <- import_data_from_csv("/data/bmarketing2.csv")
#' clean_data(data, target_var = "Y")
#' }
clean_data <- function(df, target_var, na_threshold = 0.5, ...) {

  stopifnot(nrow(df) > 0)
  stopifnot(target_var %in% colnames(df))
  stopifnot(ncol(df) > 1)

  ## the target variable must be complete, otherwise it cannot be modelled
  if (any(is.na(df[[target_var]]))) stop("Target Var should not include any NAs")

  ## share of NAs per column, compared against the threshold
  exclude_index <- vapply(df, function(x) mean(is.na(x)), numeric(1)) > na_threshold

  ## warn about the columns that are about to be dropped
  if (sum(exclude_index) > 0) warning(paste("Column(s)", paste(colnames(df[exclude_index]), collapse = ", "), "have too many NAs and will be excluded"))

  ## drop = FALSE keeps a data.frame even when only one column survives
  df[, !exclude_index, drop = FALSE]
}

R/data_doc.R

+6
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
#' This is data to be included in my package for testing
2+
#'
3+
#' @name test_data
4+
#' @docType data
5+
#' @keywords data
6+
NULL

R/data_man.R

+6
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
#' This is data to be included in my package
2+
#'
3+
#' @name bmarketing
4+
#' @docType data
5+
#' @keywords data
6+
NULL

R/import_data.R

+10
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
#' Read a csv file into a data frame
#'
#' Calls \code{readr::read_csv} on \code{filename} and silences the messages
#' it emits while reading.
#'
#' @param filename character, path of the csv file to read
#' @param ... further arguments passed on to \code{readr::read_csv}
#'
#' @return data.frame, the result of \code{readr::read_csv}
#' @export
import_data_from_csv <- function(filename, ...) {
  imported <- suppressMessages(
    readr::read_csv(filename, ...)
  )
  imported
}

R/model_data.R

+18
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
#' Fit a decision tree model
#'
#' Fits an \code{rpart} decision tree that explains \code{target_var} by all
#' other columns of \code{df}.
#'
#' @param df data.frame on which we want to make a model
#' @param target_var character, name of the column of \code{df} used as the
#'   dependent variable
#'
#' @return the fitted decision tree model, as returned by \code{rpart}
#'
#' @importFrom rpart rpart
#'
#' @export
#'
#' @examples
#' \dontrun{
#' model_data(df, target_var="Y")
#' }
model_data <- function(df, target_var) {
  stopifnot(is.character(target_var), length(target_var) == 1L)
  stopifnot(target_var %in% colnames(df))

  # Backquote the response so non-syntactic column names still parse.
  model_formula <- stats::as.formula(paste0("`", target_var, "` ~ ."))

  rpart(model_formula, data = df, model = TRUE)
}

R/model_performance.R

+25
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
#' Calculate performance measures of a binary classifier
#'
#' Builds the confusion matrix of \code{target} against \code{prediction} over
#' the union of the classes found in both vectors, so the matrix stays square
#' even when a class is never predicted. The second class (in level / sort
#' order, e.g. \code{1} for 0/1 data) is treated as the positive class.
#'
#' @param target vector, target values
#' @param prediction vector, predicted values, same length as \code{target}
#'
#' @return list with the measures \code{accuracy}, \code{sensitivity}
#'   (true positive rate) and \code{specificity} (true negative rate).
#'   Sensitivity and specificity are \code{NA} (with a warning) unless exactly
#'   two classes are present.
#'   \url{https://en.wikipedia.org/wiki/Confusion_matrix}
#'
#' @examples
#' target <- sample(c(0, 1), 100, replace = TRUE)
#' prediction <- sample(c(0, 1), 100, replace = TRUE)
#' model_performance(target, prediction)
#'
#' @export
model_performance <- function(target, prediction) {

  stopifnot(length(target) == length(prediction))

  # Class labels in a stable order: factor levels, or sorted unique values.
  class_labels <- function(x) {
    if (is.factor(x)) levels(x) else as.character(sort(unique(x)))
  }
  lv <- union(class_labels(target), class_labels(prediction))

  # Shared levels keep the table square; rows = target, columns = prediction.
  cm <- table(
    target = factor(as.character(target), levels = lv),
    prediction = factor(as.character(prediction), levels = lv)
  )

  if (length(lv) == 2L) {
    # Rates are taken over the actual (row) totals of each class.
    sensitivity <- cm[2, 2] / sum(cm[2, ])
    specificity <- cm[1, 1] / sum(cm[1, ])
  } else {
    warning("Sensitivity and specificity are only defined for two classes")
    sensitivity <- NA_real_
    specificity <- NA_real_
  }

  list(
    accuracy = sum(diag(cm)) / sum(cm),
    sensitivity = sensitivity,
    specificity = specificity
  )
}

R/model_plot.R

+15
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
#' Draw a fitted decision tree
#'
#' Thin wrapper that hands the model over to \code{rpart.plot}.
#'
#' @param dt_model fitted tree model, received by \code{\link{model_data}}
#'
#' @export
#'
#' @examples
#' \dontrun{
#' model_plot(dt_model)
#' }
#' @importFrom rpart.plot rpart.plot
model_plot <- function(dt_model) {
  rpart.plot(x = dt_model)
}
15+

R/model_predict.R

+17
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
#' Predict classes with a fitted decision tree
#'
#' Returns the class predicted by the decision tree for every row of
#' \code{data}.
#'
#' @param dt_model fitted tree model, received by \code{\link{model_data}}
#' @param data data.frame to predict on, e.g. received by
#'   \code{\link{clean_data}}
#'
#' @return the predicted classes, one per row of \code{data}
#' @export
#' @examples
#' \dontrun{
#' model_predict(dt_model,dataframe)
#' }
model_predict <- function(dt_model, data) {
  # predict() is a stats generic; qualify it so the package does not rely on
  # the search path.
  stats::predict(dt_model, data, type = "class")
}

R/transform_data.R

+40
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
#' Transform data
#'
#' 1. Changes factor columns to numeric, if possible
#' 2. Apply mean-impute to user defined columns
#' 3. Apply log to user defined columns
#' 4. Apply normalization to user defined columns
#'
#' A factor column is only converted when every non-missing value parses as a
#' number; otherwise it is left unchanged.
#'
#' @param df data.frame, received by \code{\link{clean_data}}
#' @param cols_impute character, names of the columns to mean-impute
#' @param cols_log character, names of the columns to log-transform
#' @param cols_normalize character, names of the columns to normalize
#'
#' @return data.frame
#'
#' @importFrom dplyr mutate_if mutate_at
#' @importFrom readr parse_double
#'
#' @export
transform_data <- function(df, cols_impute = NULL, cols_log = NULL, cols_normalize = NULL) {

  # parse_double() only accepts character input, so convert the factor first;
  # keep the factor as it is when any value fails to parse.
  factor_to_numeric <- function(x) {
    parsed <- suppressWarnings(readr::parse_double(as.character(x)))
    if (any(is.na(parsed) & !is.na(x))) x else parsed
  }

  res_df <- mutate_if(df, is.factor, factor_to_numeric)

  if (length(cols_impute) > 0) res_df <- mutate_at(res_df, cols_impute, impute_transform)
  if (length(cols_log) > 0) res_df <- mutate_at(res_df, cols_log, log_transform)
  if (length(cols_normalize) > 0) res_df <- mutate_at(res_df, cols_normalize, normalize_transform)

  res_df
}
29+
30+
# Replace missing values of x by the mean of the non-missing values.
impute_transform <- function(x) {
  missing_idx <- is.na(x)
  x[missing_idx] <- mean(x, na.rm = TRUE)
  x
}
33+
34+
# Center x on its mean and scale it by its standard deviation (NAs ignored).
normalize_transform <- function(x) {
  centre <- mean(x, na.rm = TRUE)
  spread <- sd(x, na.rm = TRUE)
  (x - centre) / spread
}
37+
38+
# Natural logarithm of x, applied element-wise.
log_transform <- function(x) {
  log(x)
}

README.Rmd

+42-5
Original file line numberDiff line numberDiff line change
@@ -12,12 +12,49 @@ knitr::opts_chunk$set(
1212
)
1313
```
1414

15-
[![Travis Build Status](https://travis-ci.org/Quantargo/bmarketing.svg?branch=master)](https://travis-ci.org/Quantargo/bmarketing)
16-
[![Coverage Status](https://img.shields.io/codecov/c/github/Quantargo/bmarketing/master.svg)](https://codecov.io/github/Quantargo/bmarketing?branch=master)
17-
1815
## Overview
1916

20-
The bmarketing dataset
17+
This package offers a series of functions that will clean, transform and build a decision tree model for your input dataset. The functions are split in different R scripts inside the R folder and should be executed as explained below.
18+
19+
## Function Description
20+
21+
clean_data : This function checks whether the target variable contains missing values and returns an error if it does. It also gives warnings if other variables contain NAs and removes those columns that have more than 50% NAs.
22+
23+
transform_data : Data transformation step transforms the numeric variables using log and transforms factors into numeric variables (and vice versa).
24+
25+
model_data : This function builds a decision tree for the provided dataset.
26+
27+
model_plot : Plots the decision tree created in the previous step.
28+
29+
model_predict: Returns the predicted classes from the decision tree.
30+
31+
model_performance : This function calculates the accuracy of the model.
32+
33+
## Usage
34+
35+
Here is how to use the package
36+
37+
```{r}
38+
library(bmarketing)
39+
df <- import_data_from_csv("~/CloudStation/Projekte/r_projects/bmarketing2.csv")
40+
##clean_data(df, target_var = "Y")
41+
42+
df[is.na(df$Y), "Y"] <- 0
43+
df <- clean_data(df, target_var = "Y")
44+
45+
df <- transform_data(df, c("AGE", "DURATION"))
46+
47+
target_var <- "Y"
48+
m <- model_data(df, target_var)
49+
50+
model_plot(m)
51+
52+
target <- df[, target_var][[1]]
53+
predictions <- model_predict(m, df)
54+
55+
56+
model_performance(target, predictions)
57+
```
58+
2159

22-
<!-- TODO: Change README to make it more descriptive, add examples, etc. -->
2360

README.md

+66-8
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,72 @@
11

22
<!-- README.md is generated from README.Rmd. Please edit that file -->
33

4-
[![Travis Build
5-
Status](https://travis-ci.org/Quantargo/bmarketing.svg?branch=master)](https://travis-ci.org/Quantargo/bmarketing)
6-
[![Coverage
7-
Status](https://img.shields.io/codecov/c/github/Quantargo/bmarketing/master.svg)](https://codecov.io/github/Quantargo/bmarketing?branch=master)
8-
94
## Overview
105

11-
The bmarketing
12-
dataset
6+
This package offers a series of functions that will clean, transform and
7+
build a decision tree model for your input dataset. The functions are
8+
split in different R scripts inside the R folder and should be executed
9+
as explained below.
10+
11+
## Function Description
12+
13+
clean\_data : This function will check if the target variable contains
14+
missing values and returns an error. It also gives warnings if other
15+
variables contain NAs and removes those columns that have more than 50%
16+
NAs.
17+
18+
transform\_data : Data transformation step transforms the numeric
19+
variables using log and transforms factors into numeric variables (and
20+
vice versa).
21+
22+
model\_data : This function builds a decision tree for the provided
23+
dataset.
24+
25+
model\_plot : Plots the decision tree created in the previous step.
26+
27+
model\_predict: Returns the predicted classes from the decision tree.
28+
29+
model\_performance : This function calculates the accuracy of the
30+
model.
31+
32+
## Usage
33+
34+
Here is how to use the package
35+
36+
``` r
37+
library(bmarketing)
38+
df <- import_data_from_csv("~/CloudStation/Projekte/r_projects/bmarketing2.csv")
39+
#> Warning: Missing column names filled in: 'X1' [1]
40+
##clean_data(df, target_var = "Y")
41+
42+
df[is.na(df$Y), "Y"] <- 0
43+
df <- clean_data(df, target_var = "Y")
44+
#> Warning in clean_data(df, target_var = "Y"): Column(s) HOUSING, POUTCOME,
45+
#> MONTH have too many NAs and will be excluded
46+
47+
df <- transform_data(df, c("AGE", "DURATION"))
48+
49+
target_var <- "Y"
50+
m <- model_data(df, target_var)
51+
52+
model_plot(m)
53+
```
54+
55+
![](man/figures/README-unnamed-chunk-2-1.png)<!-- -->
56+
57+
``` r
58+
59+
target <- df[, target_var][[1]]
60+
predictions <- model_predict(m, df)
61+
1362

14-
<!-- TODO: Change README to make it more descriptive, add examples, etc. -->
63+
model_performance(target, predictions)
64+
#> $accuracy
65+
#> [1] 0.8131207
66+
#>
67+
#> $sensitivity
68+
#> [1] NaN
69+
#>
70+
#> $specificity
71+
#> [1] 0.8499763
72+
```

_config.yml

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
theme: jekyll-theme-midnight

0 commit comments

Comments
 (0)