quantargo
diff --git a/‎.Rbuildignore
+4 b/‎.Rbuildignore
+4
diff --git a/‎DESCRIPTION
+15 b/‎DESCRIPTION
+15
diff --git a/‎NAMESPACE
+15 b/‎NAMESPACE
+15
diff --git a/‎R/clean_data.R
+35 b/‎R/clean_data.R
+35
diff --git a/‎R/data_doc.R
+6 b/‎R/data_doc.R
+6
diff --git a/‎R/data_man.R
+6 b/‎R/data_man.R
+6
diff --git a/‎R/import_data.R
+10 b/‎R/import_data.R
+10
diff --git a/‎R/model_data.R
+18 b/‎R/model_data.R
+18
diff --git a/‎R/model_performance.R
+25 b/‎R/model_performance.R
+25
diff --git a/‎R/model_plot.R
+15 b/‎R/model_plot.R
+15
diff --git a/‎R/model_predict.R
+17 b/‎R/model_predict.R
+17
diff --git a/‎R/transform_data.R
+40 b/‎R/transform_data.R
+40
diff --git a/‎README.Rmd
+42-5 b/‎README.Rmd
+42-5
diff --git a/‎README.md
+66-8 b/‎README.md
+66-8
diff --git a/‎_config.yml
+1 b/‎_config.yml
+1
@@ -0,0 +1,4 @@
+^bmarketing\.Rproj$
+^\.Rproj\.user$
+^.Rproj$
+^R\bmarketing.R$
@@ -0,0 +1,15 @@
+Package: bmarketing
+Title: Decision tree model data process
+Version: 0.0.0.9000
+Authors@R: person("Michael", "Mifek", email = "[email protected]",
+                  role = c("aut", "cre"))
+Description: Clean and transform data for use in a decision tree model.
+Depends: R (>= 2.10)
+License: What license it uses
+Encoding: UTF-8
+LazyData: true
+RoxygenNote: 6.1.1
+Imports:
+    dplyr, readr, rpart, rpart.plot, stats
+Suggests: 
+    testthat
@@ -0,0 +1,15 @@
+# Generated by roxygen2: do not edit by hand
+
+export(clean_data)
+export(import_data_from_csv)
+export(model_data)
+export(model_performance)
+export(model_plot)
+export(model_predict)
+export(transform_data)
+importFrom(dplyr,mutate_at)
+importFrom(dplyr,mutate_if)
+importFrom(readr,parse_double)
+importFrom(readr,read_csv2)
+importFrom(rpart,rpart)
+importFrom(rpart.plot,rpart.plot)
@@ -0,0 +1,35 @@
+#' Remove columns with too many missing values
+#'
+#' Validates the input and drops every column whose share of missing values
+#' exceeds \code{na_threshold}. A warning names the dropped columns.
+#'
+#' @param df data.frame, data to be cleaned. Must have at least one row and
+#'   more than one column.
+#' @param target_var character, name of the column of \code{df} that acts as
+#'   the target variable. It must not contain any NAs.
+#' @param na_threshold numeric, columns whose proportion of NAs is greater
+#'   than this threshold are excluded (default 0.5).
+#' @param ... currently unused.
+#'
+#' @return data.frame, \code{df} without the columns that have too many NAs.
+#'   The result stays a data.frame even if only one column remains.
+#'
+#' @importFrom readr read_csv2
+#'
+#' @export
+#' @examples
+#' \dontrun{
+#' data <- import_data_from_csv("/data/bmarketing2.csv")
+#' clean_data(data, target_var = "Y")
+#' }
+clean_data <- function(df, target_var, na_threshold = 0.5, ...) {
+
+  stopifnot(nrow(df) > 0)
+  stopifnot(target_var %in% colnames(df))
+  stopifnot(ncol(df) > 1)
+
+  ## the target variable must be complete, otherwise we refuse to continue
+  if (any(is.na(df[[target_var]]))) stop("Target Var should not include any NAs")
+
+  ## proportion of missing values per column
+  na_share <- vapply(df, function(x) mean(is.na(x)), numeric(1))
+  exclude_index <- na_share > na_threshold
+
+  ## tell the user which columns are about to be dropped
+  if (any(exclude_index)) {
+    warning(paste("Column(s)", paste(colnames(df)[exclude_index], collapse = ", "), "have too many NAs and will be excluded"))
+  }
+
+  ## drop = FALSE keeps a data.frame when a single column survives
+  df[, !exclude_index, drop = FALSE]
+}
@@ -0,0 +1,6 @@
+#' Test data set included in the package
+#'
+#' Data shipped with the package for testing purposes.
+#'
+#' @name test_data
+#' @docType data
+#' @keywords data
+NULL
@@ -0,0 +1,6 @@
+#' The bmarketing data set included in the package
+#'
+#' Data shipped with the package.
+#'
+#' @name bmarketing
+#' @docType data
+#' @keywords data
+NULL
@@ -0,0 +1,10 @@
+#' Read a csv file into a data.frame
+#'
+#' Thin wrapper around \code{readr::read_csv} that silences the messages
+#' emitted while the file is read.
+#'
+#' @param filename character, path of the csv file to read
+#' @param ... further arguments, passed on to \code{readr::read_csv}
+#'
+#' @return data.frame with the contents of the file
+#' @export
+import_data_from_csv <- function(filename, ...) {
+  suppressMessages({
+    readr::read_csv(filename, ...)
+  })
+}
@@ -0,0 +1,18 @@
+#' Fit a decision tree
+#'
+#' Fits an \code{rpart} decision tree that explains \code{target_var} by all
+#' other columns of \code{df}.
+#'
+#' @param df data.frame on which we want to make a model
+#' @param target_var character, name of the dependent variable; must be a
+#'   column of \code{df}
+#'
+#' @return the fitted decision tree model, as returned by
+#'   \code{\link[rpart]{rpart}}
+#'
+#' @importFrom rpart rpart
+#'
+#' @export
+#'
+#' @examples
+#' \dontrun{
+#' model_data(df, target_var = "Y")
+#' }
+model_data <- function(df, target_var) {
+  ## fail early with a clear message instead of an obscure formula error
+  stopifnot(is.character(target_var), length(target_var) == 1)
+  stopifnot(target_var %in% colnames(df))
+
+  ## "<target> ~ ." models the target on every other column of df;
+  ## model = TRUE is passed through to rpart unchanged
+  rpart(stats::as.formula(paste(target_var, "~ .")), data = df, model = TRUE)
+}
@@ -0,0 +1,25 @@
+#' Calculate performance measures of a binary classifier
+#'
+#' Builds the confusion matrix of \code{target} against \code{prediction} and
+#' derives accuracy, sensitivity and specificity from it. Both vectors are
+#' mapped onto a common set of classes, so a class that is never predicted
+#' (or never observed) still gets its own row and column.
+#'
+#' @param target vector, target values
+#' @param prediction vector, predicted values
+#' @param positive optional, the class to treat as "positive". Defaults to
+#'   the first class: the first factor level of \code{target}, or its
+#'   smallest value if \code{target} is not a factor.
+#'
+#' @return list, measures out of confusion matrix: accuracy, sensitivity
+#'   (share of positive targets that were predicted positive) and specificity
+#'   (share of negative targets that were predicted negative). With anything
+#'   other than two classes, sensitivity and specificity are \code{NA} and a
+#'   warning is raised. \url{https://en.wikipedia.org/wiki/Confusion_matrix}
+#'
+#' @examples
+#' target <- sample(c(0, 1), 100, replace = TRUE)
+#' prediction <- sample(c(0, 1), 100, replace = TRUE)
+#' model_performance(target, prediction)
+#' model_performance(target, prediction, positive = 1)
+#'
+#' @export
+model_performance <- function(target, prediction, positive = NULL) {
+
+  stopifnot(length(target) == length(prediction))
+
+  ## classes of a vector: factor levels as they are, otherwise sorted values
+  class_levels <- function(x) {
+    if (is.factor(x)) levels(x) else as.character(sort(unique(x)))
+  }
+
+  ## a shared set of classes keeps the confusion matrix square and aligned
+  classes <- union(class_levels(target), class_levels(prediction))
+
+  ## rows are targets, columns are predictions
+  cm <- table(
+    target = factor(as.character(target), levels = classes),
+    prediction = factor(as.character(prediction), levels = classes)
+  )
+
+  accuracy <- sum(diag(cm)) / sum(cm)
+
+  if (length(classes) != 2) {
+    warning("sensitivity and specificity are only defined for two classes")
+    return(list(accuracy = accuracy, sensitivity = NA_real_, specificity = NA_real_))
+  }
+
+  pos <- if (is.null(positive)) classes[1] else as.character(positive)
+  stopifnot(pos %in% classes)
+  neg <- setdiff(classes, pos)
+
+  ## both rates are relative to the number of *targets* of that class,
+  ## i.e. the row sums of the confusion matrix
+  list(
+    accuracy = accuracy,
+    sensitivity = cm[pos, pos] / sum(cm[pos, ]),
+    specificity = cm[neg, neg] / sum(cm[neg, ])
+  )
+}
@@ -0,0 +1,15 @@
+#' Plot the tree and respective nodes
+#'
+#' Draws a fitted decision tree with \code{rpart.plot}.
+#'
+#' @param dt_model rpart decision tree, received by \code{\link{model_data}}
+#'
+#' @return the value of the \code{rpart.plot} call; the function is called
+#'   for its side effect of drawing the tree
+#'
+#' @export
+#'
+#' @examples
+#' \dontrun{
+#' model_plot(dt_model)
+#' }
+#' @importFrom rpart.plot rpart.plot
+model_plot <- function(dt_model) {
+  ## reject anything that is not an rpart tree before trying to plot it
+  stopifnot(inherits(dt_model, "rpart"))
+  rpart.plot(dt_model)
+}
+
@@ -0,0 +1,17 @@
+#' Predict classes with a fitted decision tree
+#'
+#' Calls \code{predict} on the model with \code{type = "class"}, i.e. asks
+#' for predicted classes for \code{data}.
+#'
+#' @param dt_model rpart decision tree, received by \code{\link{model_data}}
+#' @param data data.frame to predict on, e.g. as received by
+#'   \code{\link{clean_data}}
+#'
+#' @return the predicted classes as returned by \code{predict}
+#' @export
+#' @examples
+#' \dontrun{
+#' model_predict(dt_model, dataframe)
+#' }
+model_predict <- function(dt_model, data) {
+  ## qualified call so the generic is found without relying on attached packages
+  stats::predict(dt_model, data, type = "class")
+}
@@ -0,0 +1,40 @@
+#' Transform data
+#'
+#' 1. Changes factor columns to numeric, if possible
+#' 2. Apply mean-impute to user defined columns
+#' 3. Apply log to user defined columns
+#' 4. Apply normalization to user defined columns
+#'
+#' A factor column is only converted when all of its non-missing labels can
+#' be parsed as numbers; other factor columns are returned unchanged.
+#'
+#' @param df data.frame, received by \code{\link{clean_data}}
+#' @param cols_impute character, names of the columns to mean-impute
+#' @param cols_log character, names of the columns to log-transform
+#' @param cols_normalize character, names of the columns to normalize
+#'
+#' @return data.frame with the transformed columns
+#'
+#' @importFrom dplyr mutate_if mutate_at
+#' @importFrom readr parse_double
+#'
+#' @export
+transform_data <- function(df, cols_impute = NULL, cols_log = NULL, cols_normalize = NULL) {
+
+  res_df <- mutate_if(df, is.factor, factor_to_numeric)
+
+  ## the steps run in a fixed order: impute, then log, then normalize
+  if (length(cols_impute) > 0) res_df <- mutate_at(res_df, cols_impute, impute_transform)
+  if (length(cols_log) > 0) res_df <- mutate_at(res_df, cols_log, log_transform)
+  if (length(cols_normalize) > 0) res_df <- mutate_at(res_df, cols_normalize, normalize_transform)
+
+  res_df
+}
+
+## Convert a factor to numeric when every non-missing label parses as a
+## number; otherwise hand the factor back untouched.
+factor_to_numeric <- function(x) {
+  labels <- as.character(x)
+  ## parse_double works on character input, so parse the labels
+  parsed <- suppressWarnings(readr::parse_double(labels))
+  ## "" and "NA" are the strings parse_double reads as missing by default
+  is_missing <- is.na(labels) | labels %in% c("", "NA")
+  if (any(is.na(parsed) & !is_missing)) return(x)
+  parsed
+}
+
+## Mean imputation: every NA in x is filled with the mean of the observed values.
+impute_transform <- function(x) {
+  is_gap <- is.na(x)
+  x[is_gap] <- mean(x, na.rm = TRUE)
+  x
+}
+
+## Standardize x: subtract its mean and divide by its standard deviation,
+## both computed with NAs left out.
+normalize_transform <- function(x) {
+  centre <- mean(x, na.rm = TRUE)
+  spread <- sd(x, na.rm = TRUE)
+  (x - centre) / spread
+}
+
+## Natural logarithm of x, element by element.
+log_transform <- function(x) {
+  log(x)
+}
@@ -12,12 +12,49 @@ knitr::opts_chunk$set(
 )
 ```
 
-[![Travis Build Status](https://travis-ci.org/Quantargo/bmarketing.svg?branch=master)](https://travis-ci.org/Quantargo/bmarketing)
-[![Coverage Status](https://img.shields.io/codecov/c/github/Quantargo/bmarketing/master.svg)](https://codecov.io/github/Quantargo/bmarketing?branch=master)
-
 ## Overview
 
-The bmarketing dataset
+This package offers a series of functions that will clean, transform and build a decision tree model for your input dataset. The functions are split in different R scripts inside the R folder and should be executed as explained below. 
+
+## Function Description
+
+clean_data : This function checks whether the target variable contains missing values and, if so, returns an error. It also gives warnings if other variables contain NAs and removes those columns that have more than 50% NAs. 
+
+transform_data : Data transformation step transforms the numeric variables using log and transforms factors into numeric variables (and vice versa).
+
+model_data : This function builds a decision tree for the provided dataset. 
+
+model_plot : Plots the decision tree created in the previous step.
+
+model_predict: Returns the predicted classes from the decision tree.
+
+model_performance : This function calculates the accuracy, sensitivity and specificity of the model. 
+
+## Usage
+
+Here is how to use the package
+
+```{r}
+library(bmarketing)
+# read the raw data from a csv file
+df <- import_data_from_csv("~/CloudStation/Projekte/r_projects/bmarketing2.csv")
+##clean_data(df, target_var = "Y")
+
+# clean_data() stops if the target contains NAs, so replace them with 0 first
+df[is.na(df$Y), "Y"] <- 0
+df <- clean_data(df, target_var = "Y")
+
+# the second argument is cols_impute: mean-impute AGE and DURATION
+df <- transform_data(df, c("AGE", "DURATION"))
+
+# fit the decision tree on the cleaned and transformed data
+target_var <- "Y"
+m <- model_data(df, target_var)
+
+model_plot(m)
+
+# take the first column of the single-column subset as the target values
+target <- df[, target_var][[1]]
+predictions <- model_predict(m, df)
+
+
+# compare the predictions with the targets
+model_performance(target, predictions)
+```
+
 
-<!-- TODO: Change README to make it more descriptive, add examples, etc. -->
 
@@ -1,14 +1,72 @@
 
 <!-- README.md is generated from README.Rmd. Please edit that file -->
 
-[![Travis Build
-Status](https://travis-ci.org/Quantargo/bmarketing.svg?branch=master)](https://travis-ci.org/Quantargo/bmarketing)
-[![Coverage
-Status](https://img.shields.io/codecov/c/github/Quantargo/bmarketing/master.svg)](https://codecov.io/github/Quantargo/bmarketing?branch=master)
-
 ## Overview
 
-The bmarketing
-dataset
+This package offers a series of functions that will clean, transform and
+build a decision tree model for your input dataset. The functions are
+split in different R scripts inside the R folder and should be executed
+as explained below.
+
+## Function Description
+
+clean\_data : This function checks whether the target variable contains
+missing values and, if so, returns an error. It also gives warnings if
+other variables contain NAs and removes those columns that have more than
+50% NAs.
+
+transform\_data : Data transformation step transforms the numeric
+variables using log and transforms factors into numeric variables (and
+vice versa).
+
+model\_data : This function builds a decision tree for the provided
+dataset.
+
+model\_plot : Plots the decision tree created in the previous step.
+
+model\_predict: Returns the predicted classes from the decision tree.
+
+model\_performance : This function calculates the accuracy, sensitivity
+and specificity of the model.
+
+## Usage
+
+Here is how to use the package
+
+``` r
+library(bmarketing)
+df <- import_data_from_csv("~/CloudStation/Projekte/r_projects/bmarketing2.csv")
+#> Warning: Missing column names filled in: 'X1' [1]
+##clean_data(df, target_var = "Y")
+
+df[is.na(df$Y), "Y"] <- 0
+df <- clean_data(df, target_var = "Y")
+#> Warning in clean_data(df, target_var = "Y"): Column(s) HOUSING, POUTCOME,
+#> MONTH have too many NAs and will be excluded
+
+df <- transform_data(df, c("AGE", "DURATION"))
+
+target_var <- "Y"
+m <- model_data(df, target_var)
+
+model_plot(m)
+```
+
+![](man/figures/README-unnamed-chunk-2-1.png)<!-- -->
+
+``` r
+
+target <- df[, target_var][[1]]
+predictions <- model_predict(m, df)
+
 
-<!-- TODO: Change README to make it more descriptive, add examples, etc. -->
+model_performance(target, predictions)
+#> $accuracy
+#> [1] 0.8131207
+#> 
+#> $sensitivity
+#> [1] NaN
+#> 
+#> $specificity
+#> [1] 0.8499763
+```
@@ -0,0 +1 @@
+theme: jekyll-theme-midnight