Browse Source

Handle YAML in header and elsewhere in document

Closes #9, Closes #2
testing-framework
Noam Ross 1 year ago
parent
commit
4307dc0544
14 changed files with 178 additions and 69 deletions
  1. +2
    -1
      DESCRIPTION
  2. +5
    -0
      NAMESPACE
  3. +4
    -2
      R/document_hooks.R
  4. +43
    -37
      R/docx_reversible.R
  5. +83
    -24
      R/extract.R
  6. +13
    -0
      R/utils.R
  7. +15
    -0
      inst/protect-empty-divs.lua
  8. +1
    -0
      inst/rmarkdown/templates/rdocx_reversible/skeleton/.gitignore
  9. +1
    -1
      inst/rmarkdown/templates/rdocx_reversible/skeleton/skeleton.Rmd
  10. BIN
      inst/rmarkdown/templates/rdocx_reversible/skeleton/skeleton.docx
  11. +5
    -2
      man/undoc.Rd
  12. +1
    -1
      redoc.Rproj
  13. +5
    -1
      tests/testthat/test-reverse.R
  14. BIN
      tests/testthat/test.docx

+ 2
- 1
DESCRIPTION View File

@@ -28,6 +28,7 @@ Imports:
stringi,
xml2,
backports,
whoami
whoami,
yaml
VignetteBuilder: knitr
Roxygen: list(markdown = TRUE)

+ 5
- 0
NAMESPACE View File

@@ -20,12 +20,15 @@ importFrom(rmarkdown,pandoc_convert)
importFrom(rmarkdown,word_document)
importFrom(stringi,stri_detect_fixed)
importFrom(stringi,stri_detect_regex)
importFrom(stringi,stri_extract_first_regex)
importFrom(stringi,stri_join)
importFrom(stringi,stri_locate_all_regex)
importFrom(stringi,stri_match_all_regex)
importFrom(stringi,stri_opts_regex)
importFrom(stringi,stri_replace_all_fixed)
importFrom(stringi,stri_replace_all_regex)
importFrom(stringi,stri_replace_first_fixed)
importFrom(stringi,stri_replace_first_regex)
importFrom(stringi,stri_split_lines1)
importFrom(stringi,stri_trim_both)
importFrom(tools,file_path_sans_ext)
@@ -38,3 +41,5 @@ importFrom(xml2,xml_add_child)
importFrom(xml2,xml_child)
importFrom(xml2,xml_find_first)
importFrom(xml2,xml_set_attrs)
importFrom(yaml,as.yaml)
importFrom(yaml,yaml.load)

+ 4
- 2
R/document_hooks.R View File

@@ -19,6 +19,7 @@
#' @importFrom whoami fullname username
#' @importFrom stringi stri_opts_regex stri_match_all_regex stri_replace_first_fixed
preprocess_criticmarkup <- function(input_lines, author = NULL) {
#This could be sped up. Remember the stringi `dotall` option to search across lines.
if (is.null(author)) {
author <- fullname(fallback = username(fallback = "R User"))
}
@@ -162,12 +163,13 @@ render_pandoc_highlight <- function(text, author) {
#' @importFrom stringi stri_replace_first_fixed
wrap_yaml <- function(lines, chunk_df) {
md <- paste(lines, collapse = "\n")
chunk_df <- chunk_df[chunk_df$type == "yaml" & chunk_df$label != "yaml_header"]
chunk_df <- chunk_df[chunk_df$type == "yaml" & chunk_df$label != "yaml-header",]
for (i in seq_along(chunk_df$label)) {
md <- stri_replace_first_fixed(md, chunk_df$code[i],
paste0("::: chunk-", chunk_df$label[i],
paste0("::: {custom-style=\"chunk-", chunk_df$label[i], "\"}",
"\n\n",
chunk_df$code[i],
"\n\n:::"))
}
stri_split_lines1(md)
}

+ 43
- 37
R/docx_reversible.R View File

@@ -19,50 +19,52 @@
#' @importFrom officer read_docx
#' @importFrom tools file_path_sans_ext
#' @importFrom rmarkdown word_document
#' @importFrom knitr knit_print knit_global opts_chunk
#' @importFrom knitr knit_print knit_global opts_chunk opts_knit
#' @importFrom xfun parse_only
#' @export
rdocx_reversible <- function(highlight_outputs = FALSE, wrap = 80,
margins = NULL, line_numbers = NULL,
comment_author = NULL, ...) {
out <- word_document(
md_extensions = c("+fenced_divs", "+bracketed_spans"),
...
)
out <- word_document(...)


# Pre-parse, name inline chunks and save chunk contents to lookup table
out$pre_knit <- function(input, ...) {
chunkfile <- paste0(file_path_sans_ext(input), ".chunks.csv")
knitr::opts_knit$set(chunkfile = chunkfile)
utils::write.table(parse_rmd_to_df(input),
file = chunkfile,
sep = ",", row.names = FALSE, qmethod = "double"
)
inline_counter(reset = TRUE)
chunk_counter(reset = TRUE)
}

out$knitr <- rmarkdown::knitr_options(
# Wrap code outputs in spans and divs
knit_hooks = list(
# TODO: See if its better to make empty inline output raw openxml
evaluate.inline = function(code, envir = knit_global()) {
v <- withVisible(eval(parse_only(code), envir = envir))
if (is.null(v$value) || v$value == "") v$value <- "\uFEFF"
if (v$visible) knit_print(v$value, inline = TRUE, options = opts_chunk$get())
if (v$visible) {
knit_print(v$value, inline = TRUE, options = opts_chunk$get())
}
},
inline = function(x) {
id <- paste0("inline-", inline_counter())
paste0("[", x, "]{custom-style=\"", id, "\"}")
},
chunk = function(x, options) {
if (isFALSE(options$redoc_include)) {
# Special output for empty chunks
# TODO: move empty chunk handler to a lua filter to make more general
# TODO: test if we need special handling for other no-result chunks
paste0(
"```{=openxml}\n<w:p><w:pPr><w:pStyle w:val=\"chunk-",
options$label,
"\"/><w:rPr><w:vanish/></w:rPr></w:pPr></w:p>\n```"
)
} else {
paste0(
"::: {custom-style=\"chunk-", options$label, "\"}\n",
x,
"\n:::"
)
}
if (isFALSE(options$redoc_include)) x <- ""
paste0(
"::: {custom-style=\"chunk-", options$label, "\"}\n",
x, "\n:::"
)
},
document = function(x) {
chunkfile <- opts_knit$get("chunkfile")
x <- preprocess_criticmarkup(x, author = comment_author)
x <- wrap_yaml(x, read.csv(chunkfile, stringsAsFactors = FALSE))
x
}
),
@@ -77,31 +79,34 @@ rdocx_reversible <- function(highlight_outputs = FALSE, wrap = 80,
)
)

# Pre-parse, name inline chunks and save chunk contents to lookup table
out$pre_knit <- function(input, ...) {
utils::write.table(parse_rmd_to_df(input),
file = paste0(file_path_sans_ext(input), ".chunks.csv"),
sep = ",", row.names = FALSE, qmethod = "double"
md_extensions <- c("+smart", "+fenced_divs", "+bracketed_spans")

out$pandoc <- rmarkdown::pandoc_options(
to = "docx+styles",
from = rmarkdown::from_rmarkdown(extensions = md_extensions),
args = c(
"--lua-filter",
system.file("protect-empty-divs.lua", package = "redoc")
)
inline_counter(reset = TRUE)
chunk_counter(reset = TRUE)
}
)

out$post_processor <-
function(metadata, input_file, output_file, clean, verbose) {
docx <- read_docx(output_file)
rmd_input <- get(envir = parent.frame(n = 1), "original_input")
chunkfile <- paste0(file_path_sans_ext(rmd_input), ".chunks.csv")
chunkfile <- opts_knit$get("chunkfile")
mdfile <- paste0(basename(file_path_sans_ext(rmd_input)), ".md")

tmpd <- tempdir()

orig_rmd <- file.path(
tmpd,
paste0(
file_path_sans_ext(basename(rmd_input)),
".original.Rmd"
)
tmpd, paste0(file_path_sans_ext(basename(rmd_input)), ".original.Rmd")
)
orig_md <- file.path(
tmpd, paste0(file_path_sans_ext(file_path_sans_ext(basename(input_file))), ".md")
)
file.copy(rmd_input, orig_rmd)
file.copy(input_file, orig_md)

roundtrip_rmd <- undoc(
output_file,
@@ -113,6 +118,7 @@ rdocx_reversible <- function(highlight_outputs = FALSE, wrap = 80,
docx <- embed_file(docx, chunkfile)
docx <- embed_file(docx, orig_rmd)
docx <- embed_file(docx, roundtrip_rmd)
docx <- embed_file(docx, orig_md)

if (highlight_outputs) {
docx <- highlight_output_styles(docx)


+ 83
- 24
R/extract.R View File

@@ -4,7 +4,7 @@
#' Markdown, including changes made to text in MS Word.
#'
#' @details Internal data storing R chunks may be lost in the editing process.
#' This is known to occur with documents edited with LibreOffice.
#' This is known to occur with documents edited with LibreOffice.
#'
#' @param docx The `.docx file to convert`
#' @param to the filename to write the resulting `.Rmd` file. The default is to
@@ -13,10 +13,11 @@
#' directory
#' @param track_changes How to deal with tracked changes and comments in the
#' `.docx` file. `"accept"` accepts all changes, and `"reject"` rejects all of
#' them. The default, `"criticmarkup"`, converts the tracked changes to [Critic
#' Markup syntax](http://criticmarkup.com/spec.php#thebasicsyntax). `"all"`
#' marks up tracked changes and comments in `<span>` tags. See the [pandoc
#' manual](http://pandoc.org/MANUAL.html#option--track-changes) for details.
#' them. The default, `"criticmarkup"`, converts the tracked changes to
#' [Critic Markup syntax](http://criticmarkup.com/spec.php#thebasicsyntax).
#' `"all"` marks up tracked changes and comments in `<span>` tags. See the
#' [pandoc manual](http://pandoc.org/MANUAL.html#option--track-changes) for
#' details.
#' @param wrap The width at which to wrap text. If `NA`, text is not wrapped
#' @param overwrite Whether to overwrite an existing file
#' @param orig_chunkfile,orig_docx The original chunkfile or Word document
@@ -51,9 +52,10 @@ Alternate data may be provided via orig_chunkfile or orig_docx")
}

md_lines <- convert_docx_to_md(docx, track_changes, wrap, verbose)
md_lines <- merge_yaml_headers(md_lines, chunk_df)
md_lines <- replace_yaml_blocks(md_lines, chunk_df)
md_lines <- replace_inlines(md_lines, chunk_df)
md_lines <- replace_chunks(md_lines, chunk_df)
md_lines <- prepend_yaml(md_lines, chunk_df)

cat(md_lines, file = to, sep = "\n")

@@ -128,30 +130,39 @@ replace_inlines <- function(md_lines, chunk_df) {
#' stri_replace_all_regex stri_split_lines1 stri_detect_fixed
replace_chunks <- function(md_lines, chunk_df) {
chunk_df <- chunk_df[chunk_df$type == "block", ]
md_lines <- paste(md_lines, collapse = "\n")
md_lines <- oneline(md_lines)
if (nrow(chunk_df)) {
patterns <- paste0("[[", chunk_df$label, "]]")
replacements <- paste(chunk_df$header, chunk_df$code, "```", sep = "\n")
detected <- logical(1)
append <- ""
last_detected <- 1
start_append <- ""
last_detected <- NA_integer_
for (i in seq_along(patterns)) {
detected <- stri_detect_fixed(md_lines, patterns[i])
if (!detected) {
if (i == 1) {
md_lines <- paste0(c(patterns[i], md_lines), collapse = "\n\n")
append <- oneline(append, replacements[i], collapse = "\n\n")
} else {
if (is.na(last_detected)) {
start_append <- append
} else {
append <- paste0(c(append, replacements[i]), collapse = "\n\n")
replacements[last_detected] <-
oneline(replacements[last_detected], append, collapse = "\n\n")
}
} else {
replacements[last_detected] <-
paste0(c(replacements[last_detected], append), collapse = "\n\n")
last_detected <- i
append <- ""
}
if (append != "") {
replacements[last_detected] <-
paste0(c(replacements[last_detected], append), collapse = "\n\n")
}
if (start_append != "") {
has_yaml <- stri_detect_regex(md_lines, "(?s)^---.*---\n")
if (has_yaml) {
md_lines <- stri_replace_first_regex(
md_lines,
"(?s)(?<!^)(\n+)---\n",
oneline("$1---\n", escape_captures(start_append), "\n")
)
} else {
md_lines <- oneline(start_append, md_lines)
}
}
for (i in seq_along(patterns)) {
@@ -162,22 +173,69 @@ replace_chunks <- function(md_lines, chunk_df) {
md_lines <- stri_replace_all_fixed(md_lines, patterns[i], "")
}
md_lines <- stri_replace_all_regex(md_lines, "\n{3,}", "\n\n")
}
reline(md_lines)
}


#' @importFrom stringi stri_extract_first_regex stri_replace_first_regex
#' @importFrom yaml yaml.load as.yaml
merge_yaml_headers <- function(md_lines, chunk_df) {
chunk_df <- chunk_df[chunk_df$label == "yaml-header", ]
if (nrow(chunk_df)) {
md_lines <- paste0(md_lines, collapse = "\n")
new_yaml <- stri_extract_first_regex(md_lines, "(?s)^---\\n.*\\n---\\n")
new_metadata <- yaml.load(new_yaml)
old_metadata <- yaml.load(chunk_df$code)
for (name in names(new_metadata)) {
old_metadata[[name]] <- new_metadata[[name]]
}
md_lines <- stri_replace_first_regex(md_lines, "(?s)^---\\n.*\\n---\\n", "")
md_lines <- paste0(
paste0("---\n", as.yaml(old_metadata), "---\n"),
md_lines,
collapse = "\n"
)
md_lines <- stri_split_lines1(md_lines)
}
md_lines
}

# TODO: Re-insert non-header YAML blocks into locations in Rmd body
# BODY: Use knitr document hook to wrap YAML blocks in divs for placement, put ones with deleted placeholders (in order) at end of doc
prepend_yaml <- function(md_lines, chunk_df) {
chunk_df <- chunk_df[chunk_df$label == "yaml-header", ]
replace_yaml_blocks <- function(md_lines, chunk_df) {
chunk_df <- chunk_df[chunk_df$type == "yaml" & chunk_df$label != "yaml-header", ]
if (nrow(chunk_df)) {
md_lines <- c(chunk_df$code, "", md_lines)
md_lines <- stri_split_lines1(paste(md_lines, collapse = "\n"))
md_lines <- paste(md_lines, collapse = "\n")
patterns <- paste0("[[chunk-", chunk_df$label, "]]")
replacements <- paste(chunk_df$code)
detected <- logical(1)
prepend <- ""
for (i in seq_along(patterns)) {
detected <- stri_detect_fixed(md_lines, patterns[i])
if (!detected) {
prepend <- paste0(c(prepend, replacements[i]), collapse = "\n\n")
} else {
replacements[i] <-
paste0(c(prepend, replacements[i]), collapse = "\n\n")
prepend <- ""
}
}
for (i in seq_along(patterns)) {
md_lines <- stri_replace_first_fixed(
md_lines, patterns[i],
replacements[i]
)
md_lines <- stri_replace_all_fixed(md_lines, patterns[i], "")
}
if (prepend != "") {
md_lines <- paste0(c(md_lines, prepend), collapse = "")
}
md_lines <- stri_replace_all_regex(md_lines, "\n{3,}", "\n\n")
md_lines <- stri_split_lines1(md_lines)
}
md_lines
}


convert_docx_to_md <- function(docx, track_changes, wrap, verbose) {
docx <- normalizePath(docx)
track_changes <- match.arg(track_changes, track_changes)
@@ -202,7 +260,8 @@ convert_docx_to_md <- function(docx, track_changes, wrap, verbose) {
"--lua-filter=",
system.file("revchunks.lua", package = "redoc")
))
opts <- c(track_opts, filter_opts, wrap_opts)
other_opts <- c("--standalone") # note adding metadata args for additional title block elements here might work (possibly as title block variable), though can't control order?
opts <- c(track_opts, filter_opts, wrap_opts, other_opts)
md_tmp <- tempfile(fileext = ".md")
pandoc_convert(docx,
from = "docx+styles+empty_paragraphs",


+ 13
- 0
R/utils.R View File

@@ -0,0 +1,13 @@
#' @importFrom stringi stri_join
oneline <- function(..., collapse = "\n") {
stri_join(c(...), collapse = collapse)
}

#' @importFrom stringi stri_split_lines1
reline <- function(...) {
stri_split_lines1(oneline(...))
}

escape_captures <- function(str) {
stri_replace_all_fixed(str, "$", "\\$")
}

+ 15
- 0
inst/protect-empty-divs.lua View File

@@ -0,0 +1,15 @@
-- This filter takes empty dives that have custom style attributes, which
-- would be removed in a word doc, and replaces them with raw openxml
-- that should survive round-tripping

function Div(elem)
if elem.attributes["custom-style"] and #elem.content == 0 then
return pandoc.RawBlock("openxml",
'<w:p><w:pPr><w:pStyle w:val="' ..
elem.attr.attributes['custom-style'] ..
'"/><w:rPr><w:vanish/></w:rPr></w:pPr></w:p>'
)
else
return elem
end
end

+ 1
- 0
inst/rmarkdown/templates/rdocx_reversible/skeleton/.gitignore View File

@@ -3,3 +3,4 @@
*.original.Rmd
*.roundtrip.Rmd
*.chunks.csv
*.docx

+ 1
- 1
inst/rmarkdown/templates/rdocx_reversible/skeleton/skeleton.Rmd View File

@@ -2,7 +2,7 @@
title: "Your Title"
subtitle: "Your subtitle"
author: "Your Name"
date: "The Date"
date: Two plus two is `r 2 + 2`
output:
redoc::rdocx_reversible:
keep_md: TRUE


BIN
inst/rmarkdown/templates/rdocx_reversible/skeleton/skeleton.docx View File


+ 5
- 2
man/undoc.Rd View File

@@ -19,8 +19,11 @@ directory}

\item{track_changes}{How to deal with tracked changes and comments in the
\code{.docx} file. \code{"accept"} accepts all changes, and \code{"reject"} rejects all of
them. The default, \code{"criticmarkup"}, converts the tracked changes to \href{http://criticmarkup.com/spec.php#thebasicsyntax}{Critic Markup syntax}. \code{"all"}
marks up tracked changes and comments in \code{<span>} tags. See the \href{http://pandoc.org/MANUAL.html#option--track-changes}{pandoc manual} for details.}
them. The default, \code{"criticmarkup"}, converts the tracked changes to
\href{http://criticmarkup.com/spec.php#thebasicsyntax}{Critic Markup syntax}.
\code{"all"} marks up tracked changes and comments in \code{<span>} tags. See the
\href{http://pandoc.org/MANUAL.html#option--track-changes}{pandoc manual} for
details.}

\item{wrap}{The width at which to wrap text. If \code{NA}, text is not wrapped}



+ 1
- 1
redoc.Rproj View File

@@ -18,4 +18,4 @@ StripTrailingWhitespace: Yes
BuildType: Package
PackageUseDevtools: Yes
PackageInstallArgs: --no-multiarch --with-keep.source
PackageRoxygenize: rd,collate,namespace,vignette
PackageRoxygenize: rd,collate,namespace

+ 5
- 1
tests/testthat/test-reverse.R View File

@@ -3,9 +3,13 @@ context("redoc round-trips")
test_that("Document round-tripping works", {
rmarkdown::render(redoc_example_rmd(),
output_dir = getwd(),
output_file = "skel.docx", quiet = TRUE
output_file = "skel.docx", quiet = TRUE,
output_options = list(keep_md = FALSE),
clean = TRUE
)
rdoc <- undoc("skel.docx", overwrite = TRUE)
odoc <- redoc_extract_rmd("skel.docx", type = "roundtrip", overwrite = TRUE)
expect_equal(readLines(rdoc), readLines(odoc))
})

unlink(list.files(pattern = ("\\.(docx|Rmd|md)")))

BIN
tests/testthat/test.docx View File


Loading…
Cancel
Save