diff --git a/NEWS.md b/NEWS.md
index 6254ef95..1968dcaa 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -5,7 +5,13 @@
- Fix. `docx_summary` preserves non-breaking hyphens. Non-breaking hyphens are
replaced with a hyphen-minus (Unicode character 002D). Closes #573.
+## Features
+- `docx_summary()` gains parameter 'detailed' which allows to get a detailed
+summary including formatting properties of runs in a paragraph. Formatting
+properties are stored in a list column `run`, where each element
+is a dataframe with rows corresponding to a single
+run and columns containing the information on formatting properties.
# officer 0.6.6
diff --git a/R/fortify_docx.R b/R/fortify_docx.R
index f762af5f..330c11aa 100644
--- a/R/fortify_docx.R
+++ b/R/fortify_docx.R
@@ -109,7 +109,7 @@ docxtable_as_tibble <- function(node, styles, preserve = FALSE) {
}
#' @importFrom xml2 xml_has_attr
-par_as_tibble <- function(node, styles) {
+par_as_tibble <- function(node, styles, detailed = FALSE) {
style_node <- xml_child(node, "w:pPr/w:pStyle")
if (inherits(style_node, "xml_missing")) {
style_name <- NA
@@ -129,14 +129,71 @@ par_as_tibble <- function(node, styles) {
stringsAsFactors = FALSE
)
+ if (detailed) {
+ nodes_run <- xml_find_all(node, "w:r")
+ run_data <- lapply(nodes_run, run_as_tibble)
+
+ run_data <- mapply(function(x, id) {
+ x$id <- id
+ x
+ }, run_data, seq_along(run_data), SIMPLIFY = FALSE)
+ run_data <- rbind_match_columns(run_data)
+
+ par_data$run <- I(list(run_data))
+ }
+
par_data$content_type <- rep("paragraph", nrow(par_data))
par_data
}
+#' @importFrom xml2 xml_has_attr
+val_child <- function(node, child_path, attr = "val", default = NULL) {
+ child_node <- xml_child(node, child_path)
+ if (inherits(child_node, "xml_missing")) return(NA_character_)
+ if (!xml_has_attr(child_node, attr)) default
+ else xml_attr(child_node, attr)
+}
+
+val_child_lgl <- function(node, child_path, attr = "val", default = NULL) {
+ val <- val_child(node = node, child_path = child_path, attr = attr, default = default)
+ if (is.na(val)) return(NA)
+ else (val %in% c("1", "on", "true"))
+}
+
+val_child_int <- function(node, child_path, attr = "val", default = NULL) {
+ as.integer(
+ val_child(node = node, child_path = child_path, attr = attr, default = default)
+ )
+}
+
+run_as_tibble <- function(node, styles) {
+ style_node <- xml_child(node, "w:rPr/w:rStyle")
+ if (inherits(style_node, "xml_missing")) {
+ style_name <- NA
+ } else {
+ style_id <- xml_attr(style_node, "val")
+ style_name <- styles$style_name[styles$style_id %in% style_id]
+ }
+ run_data <- data.frame(
+ text = xml_text(node),
+ bold = val_child_lgl(node, "w:rPr/w:b", default = "true"),
+ italic = val_child_lgl(node, "w:rPr/w:i", default = "true"),
+ underline = val_child(node, "w:rPr/w:u"),
+ sz = val_child_int(node, "w:rPr/w:sz"),
+ szCs = val_child_int(node, "w:rPr/w:szCs"),
+ color = val_child(node, "w:rPr/w:color"),
+ shading = val_child(node, "w:rPr/w:shd"),
+ shading_color = val_child(node, "w:rPr/w:shd", attr = "color"),
+ shading_fill = val_child(node, "w:rPr/w:shd", attr = "fill"),
+ stringsAsFactors = FALSE
+ )
+
+ run_data
+}
-node_content <- function(node, x, preserve = FALSE) {
+node_content <- function(node, x, preserve = FALSE, detailed = FALSE) {
node_name <- xml_name(node)
switch(node_name,
- p = par_as_tibble(node, styles_info(x)),
+ p = par_as_tibble(node, styles_info(x), detailed = detailed),
tbl = docxtable_as_tibble(node, styles_info(x), preserve = preserve),
NULL
)
@@ -158,6 +215,12 @@ node_content <- function(node, x, preserve = FALSE) {
#' the `{docxtractr}` package by Bob Rudis.
#' @param remove_fields if TRUE, prevent field codes from appearing in the
#' returned data.frame.
+#' @param detailed Should information on runs be included in summary dataframe?
+#' Defaults to `FALSE`. If `TRUE` a list column `run` is added to the summary
+#' containing a summary of formatting properties of runs as a dataframe with
+#' rows corresponding to a single run and columns containing the information
+#' on formatting properties.
+#'
#' @examples
#' example_docx <- system.file(
#' package = "officer",
@@ -169,7 +232,7 @@ node_content <- function(node, x, preserve = FALSE) {
#'
#' docx_summary(doc, preserve = TRUE)[28, ]
#' @export
-docx_summary <- function(x, preserve = FALSE, remove_fields = FALSE) {
+docx_summary <- function(x, preserve = FALSE, remove_fields = FALSE, detailed = FALSE) {
if (remove_fields) {
instrText_nodes <- xml_find_all(x$doc_obj$get(), "//w:instrText")
xml_remove(instrText_nodes)
@@ -178,18 +241,19 @@ docx_summary <- function(x, preserve = FALSE, remove_fields = FALSE) {
all_nodes <- xml_find_all(x$doc_obj$get(), "/w:document/w:body/*[self::w:p or self::w:tbl]")
- data <- lapply(all_nodes, node_content, x = x, preserve = preserve)
+ data <- lapply(all_nodes, node_content, x = x, preserve = preserve, detailed = detailed)
data <- mapply(function(x, id) {
x$doc_index <- id
x
}, data, seq_along(data), SIMPLIFY = FALSE)
+
data <- rbind_match_columns(data)
colnames <- c(
"doc_index", "content_type", "style_name", "text",
"level", "num_id", "row_id", "is_header", "cell_id",
- "col_span", "row_span"
+ "col_span", "row_span", "run"
)
colnames <- intersect(colnames, names(data))
data[, colnames]
diff --git a/man/docx_summary.Rd b/man/docx_summary.Rd
index f8410b5a..e388b826 100644
--- a/man/docx_summary.Rd
+++ b/man/docx_summary.Rd
@@ -4,7 +4,7 @@
\alias{docx_summary}
\title{Get Word content in a data.frame}
\usage{
-docx_summary(x, preserve = FALSE, remove_fields = FALSE)
+docx_summary(x, preserve = FALSE, remove_fields = FALSE, detailed = FALSE)
}
\arguments{
\item{x}{an rdocx object}
@@ -17,6 +17,12 @@ the \code{{docxtractr}} package by Bob Rudis.}
\item{remove_fields}{if TRUE, prevent field codes from appearing in the
returned data.frame.}
+
+\item{detailed}{Should information on runs be included in summary dataframe?
+Defaults to \code{FALSE}. If \code{TRUE} a list column \code{run} is added to the summary
+containing a summary of formatting properties of runs as a dataframe with
+rows corresponding to a single run and columns containing the information
+on formatting properties.}
}
\description{
read content of a Word document and
diff --git a/tests/testthat/test-doc-summary.R b/tests/testthat/test-doc-summary.R
index 371248a4..b59a2a0a 100644
--- a/tests/testthat/test-doc-summary.R
+++ b/tests/testthat/test-doc-summary.R
@@ -64,6 +64,75 @@ test_that("preserves non breaking hyphens", {
)
})
+test_that("detailed summary", {
+ doc <- read_docx()
+
+ fpar_ <- fpar(
+ ftext("Formatted ", prop = fp_text(bold = TRUE, color = "red")),
+ ftext("paragraph ", prop = fp_text(
+ shading.color = "blue"
+ )),
+ ftext("with multiple runs.",
+ prop = fp_text(italic = TRUE, font.size = 20, font.family = "Arial")
+ )
+ )
+
+ doc <- body_add_fpar(doc, fpar_, style = "Normal")
+
+ fpar_ <- fpar(
+ "Unformatted ",
+ "paragraph ",
+ "with multiple runs."
+ )
+
+ doc <- body_add_fpar(doc, fpar_, style = "Normal")
+
+ doc <- body_add_par(doc, "Single Run", style = "Normal")
+
+ doc <- body_add_fpar(doc,
+ fpar(
+ "Single formatetd run ",
+ fp_t = fp_text(bold = TRUE, color = "red")
+ )
+ )
+
+ xml_elt <- paste0(
+ officer:::wp_ns_yes,
+ "",
+ "NA",
+ "toggle",
+ "0",
+ "1",
+ "false",
+ "true",
+ "off",
+ "on",
+ ""
+ )
+
+ doc <- officer:::body_add_xml(
+ x = doc, str = xml_elt
+ )
+
+ doc_sum <- docx_summary(doc, detailed = TRUE)
+
+ expect_true("run" %in% names(doc_sum))
+ expect_type(doc_sum$run, "list")
+ expect_equal(lengths(doc_sum$run), rep(11, 5))
+ expect_equal(sapply(doc_sum$run, nrow), c(3, 3, 1, 1, 8))
+
+ expect_true(all(sapply(doc_sum$run$bold, is.logical)))
+ expect_true(all(sapply(doc_sum$run$italic, is.logical)))
+ expect_true(all(sapply(doc_sum$run$sz, is.integer)))
+ expect_true(all(sapply(doc_sum$run$szCs, is.integer)))
+ expect_true(all(sapply(doc_sum$run$underline, is_character)))
+ expect_true(all(sapply(doc_sum$run$color, is_character)))
+ expect_true(all(sapply(doc_sum$run$shading, is_character)))
+ expect_true(all(sapply(doc_sum$run$shading_color, is_character)))
+ expect_true(all(sapply(doc_sum$run$shading_fill, is_character)))
+})
+
+
test_that("pptx summary", {