r
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

327 lines
11KB

  1. #' Convert an Reversible Document back to R Markdown
  2. #'
  3. #' Converts a document originally created with [redoc()] back to R
  4. #' Markdown, including changes made to text in MS Word.
  5. #'
  6. #' @details R chunks may be lost in the editing process if using non-Microsoft
  7. #' word processors (e.g. LibreOffice or in copy-and-pasting text into a new document.
  8. #'
  9. #' @param docx The `.docx`` file to convert
  10. #' @param to the filename to write the resulting `.Rmd` file. The default is to
  11. #' use the same basename as the docx document
  12. #' @param dir The directory to write the `.Rmd`` to. Defaults to current working
  13. #' directory
  14. #' @param track_changes How to deal with tracked changes and comments in the
  15. #' `.docx` file. `"accept"` accepts all changes, and `"reject"` rejects all of
  16. #' them. The default, `"criticmarkup"`, converts the tracked changes to
  17. #' [Critic Markup syntax](http://criticmarkup.com/spec.php#thebasicsyntax).
  18. #' "comments_only" will only convert comments, as other changes can be
  19. #' viewed with [redoc_diff()].
  20. #' `"all"` marks up tracked changes and comments in `<span>` tags and is
  21. #' useful for debugging. See the
  22. #' [pandoc manual](http://pandoc.org/MANUAL.html#option--track-changes) for
  23. #' details.
  24. #' @param block_missing,inline_missing What to do about code blocks or inline code
  25. #' whose output has been removed in the editing of the Word document. "restore"
  26. #' attempts to restore the code as close to its original location in the document
  27. #' as possible. "comment" will do so but wrap it in HTML comments. "omit" will
  28. #' not restore the code at all.
  29. #' @param wrap The width at which to wrap text. If `NA`, text is not wrapped.
  30. #' Set the default with `"redoc.wrap"` in `options()`.
  31. #' @param overwrite Whether to overwrite an existing file
  32. #' @param orig_codefile,orig_docx The original `.codelist.yml` or Word document
  33. #' created when the document was first knit. Useful for debugging, or in
  34. #' cases where the word file has been corrupted or transformed, for instance,
  35. #' by copy-and-pasting the content into a new file. If provided, `dedoc` will
  36. #' use this codefile or word file to re-create the `.Rmd` file with the text
  37. #' of the input.
  38. #' @param verbose whether to print pandoc progress text
  39. #' @importFrom rmarkdown pandoc_convert
  40. #' @importFrom yaml read_yaml
  41. #' @importFrom tools file_path_sans_ext
  42. #' @export
  43. dedoc <- function(docx, to = NULL, dir = ".",
  44. track_changes = "comments_only",
  45. block_missing = "comment",
  46. inline_missing = "omit",
  47. wrap = getOption("redoc.wrap", 80), overwrite = FALSE,
  48. orig_docx = NULL, orig_codefile = NULL,
  49. verbose = FALSE) {
  50. if (!is_redoc(docx) && is.null(orig_codefile) && is.null(orig_docx)) {
  51. md_only <- TRUE
  52. if (verbose) {
  53. message("Document is not reversible - no internal data on R chunks found.
  54. Returning markdown only. Alternate data may be provided via `orig_codefile or `orig_docx`")
  55. }
  56. } else {
  57. md_only <- FALSE
  58. }
  59. stopifnot(track_changes %in%
  60. c("comments_only", "criticmarkup", "accept", "reject", "all"))
  61. stopifnot(block_missing %in% c("comment", "omit", "restore"))
  62. stopifnot(inline_missing %in% c("comment", "omit", "restore"))
  63. if (is.null(to)) to <- paste0(file_path_sans_ext(basename(docx)), ".Rmd")
  64. if (is.null(dir)) dir <- "."
  65. to <- file.path(dir, to)
  66. if (!overwrite && file.exists(to)) stop(to, " exists and overwrite = FALSE")
  67. if (!is.null(orig_codefile)) {
  68. codelist <- read_yaml(orig_codefile)
  69. } else if (!is.null(orig_docx)) {
  70. codelist <- redoc_extract_code(orig_docx)
  71. } else {
  72. codelist <- redoc_extract_code(docx)
  73. }
  74. md <- convert_docx_to_md(docx, track_changes, wrap, verbose, md_only)
  75. if (!md_only) {
  76. codelist <- sort_by(codelist, "lineno")
  77. md <- merge_yaml_headers(md, codelist)
  78. md <- restore_code(md,
  79. codelist = list_subset(codelist, type = "block"),
  80. missing = block_missing
  81. )
  82. md <- restore_code(md,
  83. codelist = list_subset(codelist, type = "inline"),
  84. missing = inline_missing
  85. )
  86. md <- remove_extra_newlines(md)
  87. }
  88. cat(md, file = to, sep = "")
  89. return(to)
  90. }
  91. #' Extract the Rmd used to to produce a Reversible Word Doc
  92. #'
  93. #' Documents produced with [redoc()] store an copy of the original
  94. #' `.Rmd` files used to produce them. This is useful for diffing against the
  95. #' version created with [dedoc()], especially if tracked changes have not been
  96. #' used.
  97. #' @param docx A path to a word file or a an `rdocx` object created with
  98. #' [officer::read_docx()].
  99. #' @param type One of `"original"` or `"roundtrip"`. `"original"` extracts the
  100. #' exact document originally knit. `"roundtrip"` (default) extracts a document
  101. #' that has been converted to Word and back with no edits in between. The
  102. #' latter should be more useful for comparing against edits, as line-wrapping
  103. #' and placement of no-output chunks should match.
  104. #' @param dir The directory to write the `.Rmd`` to. Defaults to current working
  105. #' directory
  106. #' @param to the filename to write the resulting `.Rmd` file. The default is to
  107. #' use the the original name with either `.orignal.Rmd` or `roundtrip.Rmd`
  108. #' extensions.
  109. #' @param overwrite whether to overwrite existing files
  110. #' @export
  111. #' @importFrom stringi stri_subset_fixed stri_subset_regex
  112. #' @return The path to the extracted `.Rmd`
  113. #' @examples
  114. #' redoc_extract_rmd(redoc_example_docx(), dir = tempdir())
  115. redoc_extract_rmd <- function(docx, type = c("original", "roundtrip"),
  116. dir = ".", to = NULL, overwrite = FALSE) {
  117. docx <- to_docx(docx)
  118. assert_redoc(docx)
  119. type <- match.arg(type)
  120. rmdfiles <- list.files(file.path(docx$package_dir, "redoc"),
  121. pattern = "\\.(r|R)md$",
  122. full.names = TRUE
  123. )
  124. if (type == "original") {
  125. rmdfile <- stri_subset_regex(rmdfiles,
  126. "(?:\\.preprocessed\\.|\\.roundtrip\\.)",
  127. negate = TRUE)[1]
  128. } else if (type == "roundtrip") {
  129. rmdfile <- stri_subset_fixed(rmdfiles, ".roundtrip.")
  130. }
  131. if (is.null(to)) to <- basename(rmdfile)
  132. out <- file.path(dir, to)
  133. if (file.exists(out) && !overwrite) stop(out, " exists and overwrite=FALSE")
  134. file.copy(rmdfile, out, overwrite = overwrite)
  135. return(file.path(dir, to))
  136. }
  137. redoc_extract_code <- function(docx) {
  138. docx <- to_docx(docx)
  139. assert_redoc(docx)
  140. codefile <- list.files(file.path(docx$package_dir, "redoc"),
  141. pattern = "\\.codelist\\.yml$",
  142. full.names = TRUE
  143. )
  144. codelist <- read_yaml(codefile)
  145. codelist
  146. }
  147. #' @importFrom stringi stri_replace_first_fixed stri_detect_fixed stri_join
  148. restore_code <- function(md, codelist, missing) {
  149. if (!length(codelist)) return(md)
  150. codelist <- sort_by(codelist, "lineno")
  151. offset <- attr(md, "yaml_offset") %||% 0
  152. last_detected_end <- offset
  153. for (item in codelist) {
  154. marker <- stri_join("<<<", item$name, ">>>")
  155. marker_line <- stri_lineno_first_fixed(md, marker)
  156. if (!is.na(marker_line)) {
  157. last_detected_end <- marker_line + stri_count_lines(item$code)
  158. md <- stri_replace_first_fixed(md, marker, item$code,
  159. vectorize_all = FALSE
  160. )
  161. md <- stri_replace_all_fixed(md, marker, "")
  162. } else if (missing != "omit") {
  163. if (missing == "comment") {
  164. if (item$type == "inline") {
  165. restorecode <- stri_join(
  166. "<!--", item$code, ", originally line ",
  167. item$lineno, " -->\n"
  168. )
  169. } else {
  170. restorecode <- stri_join(
  171. "\n<!-- originally line ", item$lineno, "\n",
  172. item$code, "\n-->\n"
  173. )
  174. }
  175. } else {
  176. restorecode <- stri_join(item$code, "\n")
  177. }
  178. md <- insert_at_prior_empty_line(
  179. md, restorecode,
  180. max(last_detected_end, item$lineno + offset)
  181. )
  182. offset <- offset + 3
  183. }
  184. }
  185. return(md)
  186. }
  187. #' @importFrom stringi stri_extract_first_regex stri_replace_first_regex
  188. #' stri_replace_last_fixed
  189. #' @importFrom yaml yaml.load as.yaml
  190. merge_yaml_headers <- function(md, codelist) {
  191. old_header <- list_subset(codelist, label = "yamlheader")[[1]]
  192. new_yaml <- stri_extract_first_regex(md, "(?s)\\A\\s*---\\n.*?\\n---\\n")
  193. if (is.null(old_header) ||
  194. all(is.na(old_header)) || length(old_header) == 0) {
  195. attr(md, "yaml_offset") <- stri_count_fixed(new_yaml, "\n") - 1
  196. return(md)
  197. }
  198. old_metadata <- yaml.load(old_header$code)
  199. if (!is.na(new_yaml)) {
  200. new_metadata <- yaml.load(new_yaml)
  201. for (name in names(new_metadata)) {
  202. old_metadata[[name]] <- new_metadata[[name]]
  203. }
  204. }
  205. merged_yaml <- oneline(
  206. "---",
  207. stri_replace_last_fixed(
  208. as.yaml(old_metadata,
  209. handlers = NULL
  210. ),
  211. "\n", ""
  212. ),
  213. "---"
  214. )
  215. yaml_offset <- stri_count_lines(merged_yaml) -
  216. stri_count_lines(old_header$code)
  217. md <- stri_replace_first_regex(md, "(?s)^\\n*?---\\n.*?\\n---\\n", "")
  218. md <- oneline(merged_yaml, md)
  219. attr(md, "yaml_offset") <- yaml_offset
  220. return(md)
  221. }
  222. # replace_yaml_blocks <- function(md, codelist) {
  223. # yaml_blocks <- codelist[stri_detect_regex(names(codlist)), "^redoc-yaml-\\d+"]
  224. # if (!length(yaml_blocks)) return(md)
  225. # md <- paste(md, collapse = "\n")
  226. # patterns <- paste0("[[chunk-", codelist$label, "]]")
  227. # replacements <- paste(codelist$code)
  228. # detected <- logical(1)
  229. # prepend <- ""
  230. # for (i in seq_along(patterns)) {
  231. # detected <- stri_detect_fixed(md, patterns[i])
  232. # if (!detected) {
  233. # prepend <- paste0(c(prepend, replacements[i]), collapse = "\n\n")
  234. # } else {
  235. # replacements[i] <-
  236. # paste0(c(prepend, replacements[i]), collapse = "\n\n")
  237. # prepend <- ""
  238. # }
  239. # }
  240. # for (i in seq_along(patterns)) {
  241. # md <- stri_replace_first_fixed(
  242. # md, patterns[i],
  243. # replacements[i]
  244. # )
  245. # md <- stri_replace_all_fixed(md, patterns[i], "")
  246. # }
  247. # if (prepend != "") {
  248. # md <- paste0(c(md, prepend), collapse = "")
  249. # }
  250. # md <- stri_replace_all_regex(md, "\n{3,}", "\n\n")
  251. # md <- stri_split_lines1(md)
  252. # }
  253. convert_docx_to_md <- function(docx,
  254. track_changes,
  255. wrap = getOption("redoc.wrap", 80),
  256. verbose, md_only) {
  257. docx <- normalizePath(docx)
  258. track_changes <- match.arg(track_changes, track_changes)
  259. if (track_changes == "criticmarkup") {
  260. track_opts <- c(
  261. "--track-changes=all",
  262. paste0(
  263. "--lua-filter=",
  264. system.file("lua-filters", "criticmarkup.lua", package = "redoc")
  265. )
  266. )
  267. } else if (track_changes == "comments_only") {
  268. track_opts <- c(
  269. "--track-changes=all",
  270. paste0(
  271. "--lua-filter=",
  272. system.file("lua-filters", "criticmarkup-commentsonly.lua",
  273. package = "redoc")
  274. )
  275. )
  276. } else {
  277. track_opts <- paste0("--track-changes=", track_changes)
  278. }
  279. if (is.null(wrap)) {
  280. wrap_opts <- "--wrap=none"
  281. } else {
  282. wrap_opts <- c("--wrap=auto", paste0("--columns=", wrap))
  283. }
  284. if (!md_only) {
  285. filter_opts <- c(paste0(
  286. "--lua-filter=",
  287. system.file("lua-filters", "revchunks.lua", package = "redoc")
  288. ))
  289. from_format <- "docx+styles+empty_paragraphs"
  290. } else {
  291. filter_opts <- character(0)
  292. from_format <- "docx"
  293. }
  294. other_opts <- c("--standalone", "--eol=lf")
  295. opts <- c(filter_opts, track_opts, wrap_opts, other_opts)
  296. md_tmp <- tempfile(fileext = ".md")
  297. pandoc_convert(docx,
  298. from = from_format,
  299. to = "markdown",
  300. output = md_tmp,
  301. options = opts,
  302. verbose = verbose
  303. )
  304. return(readfile(md_tmp))
  305. }