From 207bb1b8dfb67c5039f7a8a4d9b48d257272c732 Mon Sep 17 00:00:00 2001
From: EOGrady21 <38440373+EOGrady21@users.noreply.github.com>
Date: Tue, 17 Dec 2024 13:58:30 -0400
Subject: [PATCH 1/2] Issue #69 add .txt file extension

---
 R/clf_check_test.R | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/R/clf_check_test.R b/R/clf_check_test.R
index b538ebc..c131b6c 100644
--- a/R/clf_check_test.R
+++ b/R/clf_check_test.R
@@ -546,7 +546,8 @@ vpr_autoid_create <- function(reclassify, misclassified, basepath, day, hour, me
       day_hour_re <- paste(day, hour, sep = ".")
       recl_roi_gen <- unlist(vpr_roi(recl_roi))
       if (length(unique(day_hour_re)) > 1) {
-        stop(paste(reclassify_category, "has more than one unique hour value!\n                                                     Please double check file."))
+        stop(paste(reclassify_category, "has more than one unique hour value!\n
+                   Please double check file."))
       }
       recl_roi_df <- data.frame(recl_roi_gen, day_hour_re,
                                 recl_roi, stringsAsFactors = FALSE)
@@ -608,7 +609,7 @@ vpr_autoid_create <- function(reclassify, misclassified, basepath, day, hour, me
     dirpath <- file.path("new_autoid", category[[1]])
     dir.create(dirpath, showWarnings = FALSE, recursive = TRUE)
     if (mea == TRUE) {
-      aidMea_final_nm <- paste0("new_aid.mea.", unique(day_hour))
+      aidMea_final_nm <- paste0("new_aid.mea.", unique(day_hour), '.txt')
       aidMea_final_fn <- file.path(dirpath, "aidmea",
                                    aidMea_final_nm)
       dir.create(file.path(category, "aidmea"), showWarnings = FALSE,
@@ -619,7 +620,7 @@ vpr_autoid_create <- function(reclassify, misclassified, basepath, day, hour, me
       cat(paste(">>>> New aidmea file created for",
                 category, "in", unique(day_hour), "\n"))
     }
-    aid_final_nm <- paste0("new_aid.", unique(day_hour))
+    aid_final_nm <- paste0("new_aid.", unique(day_hour), '.txt')
     aid_final_fn <- file.path(dirpath, "aid", aid_final_nm)
     dir.create(file.path(dirpath, "aid"), showWarnings = FALSE,
                recursive = TRUE)

From a5e2957f0eba345c4c7b47af4a39b19088cc5fca Mon Sep 17 00:00:00 2001
From: EOGrady21 <38440373+EOGrady21@users.noreply.github.com>
Date: Tue, 17 Dec 2024 14:51:58 -0400
Subject: [PATCH 2/2] vignette edits from @kevinsorochan

---
 vignettes/VPR_processing.Rmd | 192 +++++------------------------------
 1 file changed, 28 insertions(+), 164 deletions(-)

diff --git a/vignettes/VPR_processing.Rmd b/vignettes/VPR_processing.Rmd
index 3625cc4..e20f82b 100644
--- a/vignettes/VPR_processing.Rmd
+++ b/vignettes/VPR_processing.Rmd
@@ -25,44 +25,10 @@ library(vprr)
 
 # Section 1: Background
 
-This document was produced at Bedford Institute of Oceanography (BIO) to
-accompany the vprr package, a processing and visualization package for
-data obtained from the Digital Auto Video Plankton Recorder (VPR)
-produced by SeaScan Inc. The VPR consists of a CPU, CTD, and camera
-system with different optical settings (i.e., magnifications). It
-captures underwater images and records their corresponding salinity,
-temperature, and depth. The vprr package functions to join environmental
-and plankton data derived from the CTD and camera, respectively, and
-calculate plankton concentration and averaged environmental variables
-along the path of the VPR. The package does not include automated image
-classification; however, there is an optional manual classification
-module, which can be used to review and correct outputs from automated
-image classification while providing a record of any
-(re)classifications.
-
-The VPR outputs two raw files (.dat and .idx) for a given time period in
-a deployment. These files are processed together in a software provided
-with the VPR (i.e., AutoDeck), which decompresses the images, extracts
-"regions of interest" (ROIs), and outputs ROI image files and a
-corresponding CTD data file (.dat). The ROI file names are numeric
-consisting of 10 digits. The first 8 digits correspond to the number of
-milliseconds elapsed in the day at the time the image was captured. The
-last two digits correspond to the ROI identifier (01-99). The ROIs and
-corresponding CTD data are linked by their 8 digit time stamp. After the
-ROIs have been extracted from the raw files they may be sorted into
-categories manually or by an automated classification procedure. In
-vprr, file naming conventions and directory structures are inherited
-from a VPR image classification and analysis software, Visual Plankton.
-However, the functionality of vprr is not dependent on the use of Visual
-Plankton.
-
-The data inputs for processing in vprr consist of the following file
-types: aid (.txt), aidmeas (.txt), and CTD (.dat). The aid and aidmeas
-files are derived from separate image classification and measurement
-steps outside of vprr. Each "aid" (i.e., autoid) file contains file
-paths to individual ROIs that have been classified to the category of
-interest. The corresponding "aidmeas" file contains morphological data
-for the ROIs (e.g., long axis length, perimeter, etc.).
+This document was produced at Bedford Institute of Oceanography (BIO) to accompany the vprr package, a processing and visualization package for data obtained from the Digital Auto Video Plankton Recorder (VPR) produced by SeaScan Inc. The VPR consists of a CPU, CTD, and camera system with different optical settings (i.e., magnifications). It captures underwater images and records their corresponding salinity, temperature, and depth. The vprr package functions to join environmental and plankton data derived from the CTD and camera, respectively, and calculate plankton concentration and averaged environmental variables along the path of the VPR. The package does not include automated image classification; however, there is an optional manual classification module, which can be used to review and correct outputs from automated image classification while providing a record of any (re)classifications.
+The VPR outputs two raw files (.dat and .idx) for a given time period in a deployment. These files are processed together in a software provided with the VPR (i.e., AutoDeck), which decompresses the images, extracts “regions of interest” (ROIs), and outputs ROI image files and a corresponding CTD data file (.dat). The ROI file names are numeric consisting of 10 digits. The first 8 digits correspond to the number of milliseconds elapsed in the day at the time the image was captured. The last two digits correspond to the ROI identifier (01-99). In each deployment, the ROIs and corresponding CTD data are linked by their 8 digit time stamp. After the ROIs have been extracted from the raw files they may be sorted into categories. In vprr, file naming conventions and directory structures are inherited from a previously available VPR image classification and analysis software (i.e., Visual Plankton). The functionality of vprr is not dependent on Visual Plankton.
+The data inputs for processing in vprr consist of the following file types: aid (.txt), aidmeas (.txt), and CTD (.dat). The aid and aidmeas files are derived from separate image classification and measurement steps outside of vprr. Each “aid” (i.e., autoid) file contains file paths to individual ROIs that have been classified to the category of interest. The corresponding “aidmeas” file contains morphological data for the ROIs (e.g., long axis length) that can be obtained from software external to vprr. These morphological data are not required for sorting images or computing plankton concentrations in vprr. 
+
 
 ![Figure 1. VPR data processing flow chart. Blue boxes represent software and processes, green ovals represent data products. Starting at the top left, data flows through multiple workflows, with the final product being,  ecologically significant data.](vprr_figure1.PNG){width=100%}
 
@@ -75,20 +41,7 @@ associated data outputs.
 
 #### Processing Environment
 
-Before beginning data processing with vprr, it is recommended that a
-processing environment be created containing commonly used variables and
-file paths. The simplest and most reproducible way to achieve this is to
-write an R script where all the mission and system specific variables
-are contained, then save the environment as a RData file to be loaded at
-the start of any processing scripts. This processing environment
-contains reference to a station identifier csv file which should be
-created for each mission. This file links station names from deck sheets
-to the day and hour values assigned by AutoDeck. Day and hour values
-represent the Julian day (3 digit) and two digit hour (24 hour clock)
-when sampling was done. Note that the day and hour values will be in the
-time zone of the computer used to run AutoDeck. Ensure that this matches
-the time zone of the VPR CPU at the time of data collection to avoid a
-time offset between data sources.
+Before beginning data processing with vprr, it is recommended that a processing environment be created containing commonly used variables and file paths. The simplest and most reproducible way to achieve this is to write an R script where all the mission- and system-specific variables are contained, then save the environment as an RData file to be loaded at the start of any processing scripts. The processing environment in the example below links a station identifier to the day and hour values assigned by AutoDeck. Day and hour values represent the Julian day (3 digit) and two digit hour (24 hour clock) when sampling was done. Note that the day and hour values will be in the time zone of the computer used to run AutoDeck. Ensure that this matches the time zone of the VPR CPU at the time of data collection to avoid a time offset between data sources.
 
 Another important part of setting up the processing environment is
 ensuring the proper directory structure is in place, see Appendix 1 for
@@ -131,10 +84,7 @@ session by using
 
     load('COR2019002_env.RData') # where COR2019002 is cruise name
 
-If sharing processing code with colleagues on version control, keeping
-the environment variables separate (outside of the git project) will
-allow collaboration while avoiding inconsistencies in file paths or
-folder names.
+
 
 #### Image Copying (optional):
 
@@ -148,22 +98,12 @@ on image copying are provided in Section 3.
 #### Manual re-classification (optional):
 
 
-Automated classifications from are manually checked, which allows for
-manual correction and addition of categories not previously used for
-automated classification. ROIs that have been copied are 
-manually sorted to correct for misclassifications. Updated aid and aidmeas files are produced. 
-Further details on manual re-classification are provided in Section 4.
+Automated classifications are manually checked, which allows for manual correction and addition of categories not previously used for automated classification. ROIs that have been copied are manually sorted to correct for misclassifications. Updated aid and aidmeas files are produced. Further details on manual re-classification are provided in Section 4.
+
 
 #### Data Processing:
 
-Data outputs including CTD (.dat files), automated classifications (aid files)
-and measurements (aidmeas files) are joined together. The aid and aidmeas files,
-which may have been updated, are joined with CTD text files by the 8 digit time
-stamp (ROI number). The data are then averaged in user-defined vertical bins to
-produce a time series of plankton concentrations and environmental
-variables. Quality controlled data products (before and after binning)
-are then exported in simple formats (csv, RData, oce) for plotting and
-analysis. Further details on data processing are provided in Section 5.
+Data outputs including CTD (.dat files), automated classifications (aid files) and measurements (aidmeas files) are joined together. For each deployment, the aid and aidmeas files, which may have been updated, are joined with CTD text files by the 8 digit time stamp (ROI number). The data are then averaged in user-defined vertical bins to produce a time series of plankton concentrations and environmental variables. Data products (before and after binning) can then be exported in simple formats (e.g., csv, RData, oce) for plotting and analysis. Further details on data processing are provided in Section 5.
 
 
 # Section 3: Image Copying
@@ -187,39 +127,27 @@ variables, please see documentation for `vpr_autoid_copy()`
                 hour = hr,
                 cast = cast,
                 station = station,
+                threshold = NULL,
                 org = 'dayhour'
                 )
 
 
 # Section 4: Manual Re-classification
 
-Manual re-classification of some categories after automated
-classification may be required to achieve identification accuracy
-standards. In this step, ROIs are displayed on the screen one at a time
-for manual verification. If an image has been misclassified or if it
-falls into a new user-defined category (described below), the image can
-be re-classified. This is especially useful for classification of rare
-categories that were not defined prior to automated classification.
-After completing manual re-classification for a day-hour set, new aid
-and aidmeas files are created for new categories, which are identical in
-format to original aid and aidmeas files.
+Manual re-classification of some categories after automated classification may be required to achieve identification accuracy standards. In this step, ROIs are displayed on the screen one at a time for manual verification. If an image has been misclassified or if it falls into a new user-defined category (described below), the image can be re-classified. This is especially useful for classification of rare categories that were not defined prior to automated classification. After completing manual re-classification for a day-hour set, new aid and aidmeas files are created in a separate folder for the categories, which are identical in format to original aid and aidmeas files.
 
 ## Section 4.1: Preparing the environment by setting some variables
 
 -   Load the processing environment, which includes the `basepath`
     variable.
 -   Set day and hour of interest.
--   (optional) Set category of interest. These categories are the existing automated
-    classification categories which require manual re-classification, as
-    well as any new categories. The `vprr::vpr_category_create()`
-    function sets up the folder structure for any new categories which
-    have been added to the list of interest.
+-   Set the categories for manual classification. These categories are the existing automated classification categories which require manual re-classification, as well as any new categories. The `vprr::vpr_category_create()` function sets up the folder structure for any new categories which have been added to the list of interest.
 -   Note that if you create new categories part way through manual 
     classification, no misclassified files will have been created in this 
     category for hours which you have already processed. This may cause some 
     unexpected errors down the line in processing, you could either reprocess 
     previous hours of data with all the new categories or manually create empty
-    misclassified files.
+    “misclassified files” (see below).
 -   Run manual re-classification with
     `vprr::vpr_manual_classification()`. This function has a few
     optional arguments to customize the manual re-classification
@@ -231,9 +159,7 @@ format to original aid and aidmeas files.
     the user to see the outline of the organism better, any thin
     appendages become more clear and gelatinous organisms like
     chaetognaths or ctenophores are easier to distinguish.
--   The threshold argument in `vpr_manual_classification()` can be used to 
-    determine the minimum automated classification confidence required for 
-    images to be passed through without manual verification.
+-   If the output from automated classification includes a probability score for each ROI, the threshold argument in `vpr_manual_classification()` can be used to determine the minimum automated classification confidence required for images to be passed through without manual verification.
 
         #### MANUAL RE-CLASSIFICATION -------------------------------------
         # Once automated classification is complete
@@ -257,16 +183,7 @@ format to original aid and aidmeas files.
 
 ## Section 4.2: Generate new aid and aidmeas files
 
-The function `vprr::vpr_manual_classification()` produces two files
-('misclassified' and 're-classified' text files) as a record of manual
-re-classification, which are found in the R project working directory in
-folders named by the day and hour that the data were collected. The
-function `vprr::vpr_autoid_create()` takes these files and outputs new aid
-and aidmeas files in the R working directory in folders named by
-classification category. This step should be run after each hour of data
-is manually re-classified. If aidmeas files have not been created (through a 
-separate measurement workflow), these functions will run on just the aid files
-without issue.
+The function `vprr::vpr_manual_classification()` produces two files (‘misclassified’ and ‘re-classified’ text files) as a record of manual re-classification. These files are stored in the R project working directory in a folder named “manual_reclassification_record” with subfolders named by the day and hour that the data were collected. The function `vprr::vpr_autoid_create()` takes these files and outputs new aid and aidmeas files in the R working directory in a folder named “new_autoid” with subfolders named by classification category. This step should be run after each hour of data is manually re-classified. If aidmeas files have not been created (through a separate measurement workflow), these functions will run on just the aid files without issue.
 
     #### REORGANIZE ROI AND ROIMEAS DATA ---------------------------------------
     # get mis/re classified files
@@ -281,29 +198,11 @@ without issue.
     vpr_autoid_create(reclassify, misclassified, manual_class_basepath,
           mea = FALSE, categories = categories)
 
-The aid and aidmeas files are both text files which are
-specifically formatted to record classification outputs for further
-processing. The format and naming conventions of these files has been inherited 
-from a VPR image classification and data processing tool called Visual Plankton 
-(written in Matlab); however, the functionality of vprr is independent from that
-of Visual Plankton. The aid files are text records of image paths,
-where each individual text file represents a classification category. Each line 
-of the aid file is the full path to an image which was classified into the
-designated category. Note that the naming scheme of aid files does not
-include the category name in the file title and the category is only
-identifiable by the folder in which it is located. For example the 'krill'
-classification aid file might be named 'oct10_1svmaid.d224.h01' but be located within the
-'krill' autoid folder. The aidmeas files are also text files which represent a
-variety of different measurements taken of the object(s) within a ROI image. 
-The columns of the aidmeas files are
-c('Perimeter', 'Area', 'width1', 'width2', 'width3', 'short_axis_length', 'long_axis_length').
-The aidmeas files were originally created during processing with Visual Plankton
-but are not created or required for processing with `vprr`. We have begun 
-development of an ImageJ workflow which will create an equivalent product.
-
+The aid and aidmeas files are specifically formatted to record classification outputs for further processing. The aid files are text records of image paths, where each individual text file represents a classification category. Each line of the aid file is the full path to an image which was classified into the designated category. Note that the naming scheme of aid files does not include the category name in the file title and the category is only identifiable by the folder in which it is located. For example, a ‘krill’ classification aid file might be named ‘oct10_1svmaid.d224.h01’ but be located within the ‘krill’ autoid folder. The aidmeas files are also text files which represent a variety of different measurements taken of the object(s) within a ROI image. The columns of the aidmeas files are `c('Perimeter', 'Area', 'width1', 'width2', 'width3', 'short_axis_length', 'long_axis_length')`. The aidmeas files are not created by vprr, nor are they required for processing with vprr.
 Examples of each of these files can be found below.
 
 
+
 ```{r, eval =  TRUE}
 
 aid <- read.table(file = system.file("extdata/COR2019002/autoid/bad_image_blurry/aid/sep20_2svmaid.d222.h04",
@@ -320,15 +219,7 @@ head(aidmeas)
 
 ## Section 4.3: File check
 
-The last step of manual re-classification includes some manual file 
-organization and final checks. These files should be manually reorganized
-in a new directory which will become the new auto_id_folder (see Appendix 1:
-Directory Structure). Any aid and aidmeas files from categories which were 
-not manually checked and re-classified should also be added to this new auto_id_folder
-if they are to be included in further processing (e.g., computation of concentration
-in user-specified depth bins). After the updated aid and aidmeas files have been
-manually reorganized they can be quality controlled using vprr::vpr_autoid_check().
-The user could also manually check the files. 
+The last step of manual re-classification includes some manual file organization and final checks.  The user could also manually check the files.
 
 
     #### FILE CHECK ----------------------------------------------------------
@@ -458,18 +349,12 @@ using the function `vpr_ctd_ymd`.
         dplyr::mutate(., time_hr = time_ms / 3.6e+06)
     data <- vpr_ctd_ymd(data, year)
 
-Average plankton concentration and environmental variables (e.g.,
-temperature, salinity, density, etc.) are then computed within a user
-defined depth bin. The computation of plankton concentration is dependent
-on the assumption that the same animals are not re-sampled by the
-instrument. The bin-averaging step standardizes plankton concentrations
-when the VPR does not sample the water column evenly. This can occur 
-due to characteristics of the deployment or variability in the sampling
-rate, which is not necessarily constant in older versions of the VPR. 
-Binning also reduces noise in the data. First, an oce CTD object is 
-created using `vprr::vpr_oce_create()`. Then, bin-averaging is done 
-using `vprr::bin_cast()`. Concentrations are calculated for each 
-category of interest.
+Average plankton concentration and environmental variables (e.g., temperature, salinity, density, etc.) are then computed along the path of the VPR in depth bins. The width of the depth bin is set by the user. In each bin, plankton concentration, $C$, is computed as follows for each classification category:
+
+$$C = \frac{R}{nV}$$
+
+where, in each bin, $R$ is the number of ROIs, $n$ is the number of frames, and $V$ is the image volume (constant). An oce CTD object is created using `vprr::vpr_oce_create()`. Then, bin-averaging is done using `vprr::bin_cast()`.
+
 
 
     ##### BIN DATA AND DERIVE CONCENTRATION ---------------------------------
@@ -630,7 +515,7 @@ will be stored:
 # Appendix 2: Glossary
 
 **Aid files** - Visual Plankton style file output text file, listing file path
-information for ROI's of a specific classification group
+information for ROI's of a specific classification group. The aid file may also include a column specifying the probability score associated with each ROI.
 
 **AidMeas files (AutoID measurements)** - Visual Plankton style output text
 file, listing measurement data for ROI's of a specific classification
@@ -646,8 +531,6 @@ machine learning algorithm
 **AutoID files** - Includes both Aid and AidMeas files as part of 
 automated classifications
 
-**BIO** - Bedford Institute of Oceanography, a research institute in
-Halifax NS, Canada
 
 **Classification category (category)** - A defined group under which VPR
 images can be classified, often represents a taxonomic group (e.g.
@@ -663,22 +546,11 @@ digits)
 **Hour** - Two digit hour (24 hour clock) describing time at which VPR
 data was collected
 
-**Image volume** - The measured volume of water captured within a VPR
-image. Calculated based on optical setting and VPR standards. This is
-based on AutoDeck settings, it is calculated from the VPR calibration
-file (unique to each instrument). It will change based on AutoDeck
-settings and should be updated with each cruise/ processing batch. It is
-measured in cubic mm
-
-**Optical Setting** - A VPR setting controlling image magnification and
-field of view, which can be S0, S1, S2 or S3, where S0 has the greatest
-magnification and smallest image volume, and S3 has the least
-magnification and largest image volume
+**Image volume** - The measured volume of water captured within a VPR image. It varies with the optical setting and the image segmentation settings in AutoDeck software. The image volume is calculated from the VPR calibration file (unique to each instrument) and is measured in cubic mm.
 
-**ROI** - Region of interest, images identified by autodeck within VPR
-frames based on settings defined in autoDeck program
+**Optical Setting** - A VPR camera setting controlling image magnification and field of view. Optical settings can be S0, S1, S2 or S3, where S0 has the greatest magnification and smallest image volume, and S3 has the least magnification and largest image volume.
 
-**SeaScan** - Oceanographic instrument manufacturing company
+**ROI** - Images (regions of interest) identified by AutoDeck software within VPR frames
 
 **station** - A named geographic location, where the VPR was deployed
 
@@ -689,13 +561,5 @@ to sample over both depth and distance
 **TRROIS** - Training set of images used to train machine learning
 algorithm
 
-**VP** - Visual Plankton program, written in Matlab to classify VPR images
-
-**VPR** - Video Plankton Recorder, oceanographic instrument used to
-image small volumes of water for the purpose of capturing images of
-plankton
-
 **vprtow#** - A numeric code which is unique to each VPR deployment
 
-**Working Directory** - File path on your computer that defines the
-default location of any files you read into R, or save out of R