diff --git a/DESCRIPTION b/DESCRIPTION
index e748605ec..cc44c07e9 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -75,5 +75,5 @@ Config/testthat/start-first: build-article, build-quarto-article,
 Config/usethis/last-upkeep: 2025-09-07
 Encoding: UTF-8
 Roxygen: list(markdown = TRUE)
-RoxygenNote: 7.3.2
+RoxygenNote: 7.3.3
 SystemRequirements: pandoc
diff --git a/NAMESPACE b/NAMESPACE
index a9289d152..fc91394e3 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -116,6 +116,7 @@ export(build_articles_index)
 export(build_favicons)
 export(build_home)
 export(build_home_index)
+export(build_llm_docs)
 export(build_news)
 export(build_redirects)
 export(build_reference)
diff --git a/NEWS.md b/NEWS.md
index 3f255a728..76360bf80 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,5 +1,6 @@
 # pkgdown (development version)
 
+* New `build_llm_docs()` generates an `llms.txt` at the root directory of your site, and provides a `.md` version of every page. You can disable it by adding `llm-docs: false` to your `_pkgdown.yml` (#2914, @maelle).
 * Links generated with `\code{\link{foo}()}` now have the `()` moved into the `<a>` in the generated output (@maelle).
 * Plots in dark mode are now transformed with a CSS filter to improve their visibility (thanks to @gadenbuie).
 
diff --git a/R/build-llm-dl.R b/R/build-llm-dl.R
new file mode 100644
index 000000000..eaf9283b3
--- /dev/null
+++ b/R/build-llm-dl.R
@@ -0,0 +1,77 @@
+# Rewrite every <dl> found below `html` as a <ul>, modifying the document in
+# place. Called for its side effect; returns NULL invisibly.
+simplify_dls <- function(html) {
+  dls <- xml2::xml_find_all(html, ".//dl")
+  for (dl in dls) {
+    simplify_dl(dl)
+  }
+  invisible()
+}
+
+# Replace one <dl> with a <ul> holding one <li> per dt/dd pair. A list that
+# is not strictly alternating dt/dd is left untouched, with a warning.
+simplify_dl <- function(dl) {
+  children <- xml2::xml_children(dl)
+
+  names <- xml2::xml_name(children)
+  if (!is_simple_dl(names)) {
+    cli::cli_warn("Skipping this <dl>: not a simple term-definition list")
+    return()
+  }
+
+  # Integer division groups children pairwise: (1, 2), (3, 4), ...
+  groups <- split(children, (seq_along(children) - 1) %/% 2)
+
+  bullets <- lapply(groups, create_li_from_group)
+  ul <- xml2::read_xml("<ul></ul>")
+  xml_insert(ul, bullets)
+
+  xml2::xml_replace(dl, ul)
+}
+
+# Must have an even number of children that alternate between dt and dd
+is_simple_dl <- function(names) {
+  if (length(names) %% 2 != 0) {
+    return(FALSE)
+  }
+  odd <- names[seq_along(names) %% 2 == 1]
+  even <- names[seq_along(names) %% 2 == 0]
+
+  all(odd == "dt") && all(even == "dd")
+}
+
+# Build the <li> for one dt/dd pair: the dt contents, then ": ", then the dd
+# contents.
+create_li_from_group <- function(group) {
+  dt <- group[[1]]
+  dd <- group[[2]]
+
+  if (has_children(dd)) {
+    # params case: dd holds child elements, so the term gets its own <p>
+    para <- xml2::read_xml("<p></p>")
+    xml_insert(para, xml2::xml_contents(dt))
+    xml2::xml_add_child(para, xml_text_node(": "))
+
+    bullet <- xml2::read_xml("<li></li>")
+    xml2::xml_add_child(bullet, para)
+  } else {
+    # reference index: dd has no child elements, so term and definition share
+    # the bullet
+    bullet <- xml2::read_xml("<li></li>")
+    xml_insert(bullet, xml2::xml_contents(dt))
+    xml2::xml_add_child(bullet, xml_text_node(": "))
+  }
+  xml_insert(bullet, xml2::xml_contents(dd))
+
+  bullet
+}
+
+# TRUE if `x` has at least one child element (text nodes do not count).
+has_children <- function(x) length(xml2::xml_children(x)) > 0
+
+# Create a standalone text node containing `x`. `x` is pasted into markup
+# unescaped, so it must not contain XML special characters.
+xml_text_node <- function(x) {
+  span <- xml2::read_xml(paste0("<span>", x, "</span>"))
+  xml2::xml_find_first(span, ".//text()")
+}
diff --git a/R/build-llm.R b/R/build-llm.R
new file mode 100644
index 000000000..a5f3172c8
--- /dev/null
+++ b/R/build-llm.R
@@ -0,0 +1,227 @@
+#' Build docs for LLMs
+#'
+#' @description
+#' `build_llm_docs()` creates an `llms.txt` at the root of your site
+#' that contains the contents of your `README.md`, your reference index,
+#' and your articles index. It also creates a `.md` file for every existing
+#' `.html` file in your site. Together, this gives an LLM an overview of your
+#' package and the ability to find out more by following links.
+#'
+#' If you don't want these files generated for your site, you can opt-out by
+#' adding the following to your `_pkgdown.yml`:
+#'
+#' ```yaml
+#' llm-docs: false
+#' ```
+#'
+#' @family site components
+#' @inheritParams build_site
+#' @export
+build_llm_docs <- function(pkg = ".") {
+  pkg <- as_pkgdown(pkg)
+  if (isFALSE(pkg$meta$`llm-docs`)) {
+    return(invisible())
+  }
+
+  cli::cli_rule("Building docs for llms")
+
+  paths <- get_site_paths(pkg)
+  purrr::walk(paths, \(rel_path) {
+    src_path <- path(pkg$dst_path, rel_path)
+    dst_path <- path_ext_set(src_path, "md")
+    convert_md(src_path, dst_path, full_url(pkg, rel_path))
+  })
+
+  # Any of these can be missing (e.g. no home page or no articles), so read
+  # each one only if it was generated.
+  index <- c(
+    read_file_if_exists(path(pkg$dst_path, "index.md")),
+    "",
+    read_file_if_exists(path(pkg$dst_path, "reference", "index.md")),
+    "",
+    read_file_if_exists(path(pkg$dst_path, "articles", "index.md"))
+  )
+  write_lines(index, path(pkg$dst_path, "llms.txt"))
+
+  invisible()
+}
+
+# Absolute URL of the directory containing `path` (a path relative to the
+# site root), or NULL when the site has no `url` configured.
+full_url <- function(pkg, path) {
+  if (is.null(pkg$meta$url)) {
+    return()
+  }
+
+  url <- paste0(pkg$meta$url, "/")
+  if (pkg$development$in_dev) {
+    url <- paste0(url, pkg$prefix)
+  }
+
+  xml2::url_absolute(paste0(path_dir(path), "/"), url)
+}
+
+# Convert the <main> element of the HTML page at `src_path` to markdown at
+# `dst_path`. `url`, if supplied, is the base used to make relative links
+# absolute. Pages without a <main> element are skipped.
+convert_md <- function(src_path, dst_path, url = NULL) {
+  html <- xml2::read_html(src_path)
+  main_html <- xml2::xml_find_first(html, ".//main")
+  if (length(main_html) == 0) {
+    return()
+  }
+
+  simplify_page_header(main_html)
+  simplify_anchors(main_html)
+  simplify_code(main_html)
+  simplify_popovers_to_footnotes(main_html)
+  simplify_lifecycle_badges(main_html)
+  simplify_dls(main_html)
+  create_absolute_links(main_html, url)
+
+  path <- file_temp()
+  xml2::write_html(main_html, path, format = FALSE)
+  on.exit(file_delete(path), add = TRUE)
+
+  rmarkdown::pandoc_convert(
+    input = path,
+    output = dst_path,
+    from = "html",
+    to = "gfm+definition_lists-raw_html"
+  )
+}
+
+# Helpers ---------------------------------------------------------------------
+
+# simplify page header (which includes logo + source link)
+simplify_page_header <- function(html) {
+  title <- xml2::xml_find_first(html, ".//h1")
+  # website for a package without README/index.md
+  if (length(title) > 0) {
+    xml2::xml_remove(xml2::xml_find_first(html, ".//div[@class='page-header']"))
+    xml2::xml_add_child(html, title, .where = 0)
+  }
+  invisible()
+}
+
+# drop internal anchors
+simplify_anchors <- function(html) {
+  xml2::xml_remove(xml2::xml_find_all(html, ".//a[@class='anchor']"))
+  invisible()
+}
+
+# strip extraneous classes
+simplify_code <- function(html) {
+  extract_lang <- function(class) {
+    trimws(gsub("sourceCode|downlit", "", class))
+  }
+  code <- xml2::xml_find_all(html, ".//pre[contains(@class, 'sourceCode')]")
+
+  purrr::walk(code, \(x) {
+    xml2::xml_attr(x, "class") <- extract_lang(xml2::xml_attr(x, "class"))
+  })
+  invisible()
+}
+
+# Turn popover footnote references into regular footnotes: each reference
+# becomes a link to a numbered <li>, filled from its `data-bs-content`
+# attribute, in a footnotes section that is appended to `main_html` if absent.
+simplify_popovers_to_footnotes <- function(main_html) {
+  popover_refs <- xml2::xml_find_all(main_html, ".//a[@class='footnote-ref']")
+  if (length(popover_refs) == 0) {
+    return()
+  }
+
+  # Create footnotes section
+  footnotes_section <- xml2::xml_find_first(
+    main_html,
+    ".//section[@class='footnotes']"
+  )
+  if (length(footnotes_section) == 0) {
+    footnotes_section <- xml2::xml_add_child(
+      main_html,
+      "section",
+      id = "footnotes",
+      class = "footnotes footnotes-end-of-document",
+      role = "doc-endnotes"
+    )
+    xml2::xml_add_child(footnotes_section, "hr")
+    footnotes_ol <- xml2::xml_add_child(footnotes_section, "ol")
+  } else {
+    footnotes_ol <- xml2::xml_find_first(footnotes_section, ".//ol")
+  }
+
+  purrr::iwalk(popover_refs, function(ref, i) {
+    text_content <- xml2::xml_attr(ref, "data-bs-content")
+    fn_id <- paste0("fn", i)
+    fnref_id <- paste0("fnref", i)
+    xml2::xml_attrs(ref) <- list(
+      href = paste0("#", fn_id),
+      id = fnref_id,
+      role = "doc-noteref",
+      class = "footnote-ref"
+    )
+
+    fn_li <- xml2::xml_add_child(footnotes_ol, "li", id = fn_id)
+    parsed_content <- xml2::read_html(text_content) |>
+      xml2::xml_find_first(".//body") |>
+      xml2::xml_children()
+    purrr::walk(parsed_content, \(x) xml2::xml_add_child(fn_li, x))
+  })
+}
+
+# Replace lifecycle badges with bold text. XPaths are relative (`.//`) so
+# only descendants of `html` are touched.
+simplify_lifecycle_badges <- function(html) {
+  # on reference index
+  badges <- xml2::xml_find_all(html, ".//span[contains(@class, 'lifecycle')]")
+  xml2::xml_replace(badges, "strong", paste0("[", xml2::xml_text(badges), "]"))
+
+  # on individual pages
+  badges <- xml2::xml_find_all(
+    html,
+    ".//a[.//img[starts-with(@src, 'figures/lifecycle-')]]"
+  )
+  imgs <- xml2::xml_find_first(badges, ".//img")
+  xml2::xml_replace(badges, "strong", tolower(xml2::xml_attr(imgs, "alt")))
+
+  invisible()
+}
+
+# Prepare links for markdown output: drop classes, and point relative
+# (site-internal) links at the `.md` version of the page, made absolute
+# against `url` when it is supplied.
+create_absolute_links <- function(main_html, url = NULL) {
+  a <- xml2::xml_find_all(main_html, ".//a")
+  xml2::xml_attr(a, "class") <- NULL
+
+  href <- xml2::xml_attr(a, "href")
+  # A link is internal only if it is relative: it has an href (anchors
+  # without one give NA, which must not reach the logical subsetting below),
+  # is not a same-page fragment, and has neither a scheme (https:, http:,
+  # mailto:, ...) nor a protocol-relative `//` prefix.
+  is_external <- grepl("^([[:alpha:]][[:alnum:]+.-]*:|//)", href)
+  is_internal <- !is.na(href) & !is_external & !startsWith(href, "#")
+  if (!is.null(url)) {
+    href[is_internal] <- xml2::url_absolute(href[is_internal], url)
+  }
+  # Swap the extension even when a fragment or query string follows it
+  href[is_internal] <- sub(
+    "\\.html(?=$|[#?])",
+    ".md",
+    href[is_internal],
+    perl = TRUE
+  )
+
+  xml2::xml_attr(a[is_internal], "href") <- href[is_internal]
+
+  invisible()
+}
+
+# Lines of the file at `path`, or NULL if it does not exist; NULL drops out
+# of the `c()` call that assembles llms.txt.
+read_file_if_exists <- function(path) {
+  if (file_exists(path)) {
+    read_lines(path)
+  }
+}
diff --git a/R/build.R b/R/build.R
index 1b7bdaa14..f3d31c6ea 100644
--- a/R/build.R
+++ b/R/build.R
@@ -10,6 +10,7 @@
 #' * [build_tutorials()]
 #' * [build_news()]
 #' * [build_redirects()]
+#' * [build_llm_docs()]
 #'
 #' See the documentation for the each function to learn how to control
 #' that aspect of the site. This page documents options that affect the
@@ -467,6 +468,9 @@ build_site_local <- function(
   build_tutorials(pkg, override = override, preview = FALSE)
   build_news(pkg, override = override, preview = FALSE)
   build_sitemap(pkg)
+  if (pkg$bs_version > 3) {
+    build_llm_docs(pkg)
+  }
   build_redirects(pkg, override = override)
   if (pkg$bs_version == 3) {
     build_docsearch_json(pkg)
diff --git a/R/tweak-reference.R b/R/tweak-reference.R
index 5ac643bf0..93b0c0f78 100644
--- a/R/tweak-reference.R
+++ b/R/tweak-reference.R
@@ -84,14 +84,17 @@ tweak_highlight_other <- function(div) {
 
 xml_replace_contents <- function(node, new) {
   xml2::xml_remove(xml2::xml_contents(node))
-
   contents <- xml2::xml_contents(new)
-  for (child in contents) {
+  xml_insert(node, contents)
+}
+
+# Append each node in `new` as a child of `node`.
+xml_insert <- function(node, new) {
+  for (child in new) {
     xml2::xml_add_child(node, child)
   }
 }
 
-
 tweak_extra_logo <- function(html) {
   img <- xml2::xml_find_all(
     html,
diff --git a/inst/BS5/templates/content-reference-index.html b/inst/BS5/templates/content-reference-index.html
index d9a981066..810c2f8fc 100644
--- a/inst/BS5/templates/content-reference-index.html
+++ b/inst/BS5/templates/content-reference-index.html
@@ -10,14 +10,14 @@

    {{{pagetitle}}}

    {{#subtitle}}

    {{{.}}}

    {{/subtitle}} {{#desc}}
    {{{desc}}}
    {{/desc}} - {{#topics}}
    +
    {{#topics}}
    {{#has_icons}}{{#icon}}{{/icon}}{{/has_icons}} {{#aliases}}{{{.}}} {{/aliases}} {{#lifecycle}}{{.}}{{/lifecycle}}
    {{{title}}}
    -
    {{/topics}} + {{/topics}}
    {{/rows}} diff --git a/man/build_articles.Rd b/man/build_articles.Rd index 67b18f900..108e0ae25 100644 --- a/man/build_articles.Rd +++ b/man/build_articles.Rd @@ -261,6 +261,7 @@ as HTML widgets. \seealso{ Other site components: \code{\link{build_home}()}, +\code{\link{build_llm_docs}()}, \code{\link{build_news}()}, \code{\link{build_reference}()}, \code{\link{build_tutorials}()} diff --git a/man/build_home.Rd b/man/build_home.Rd index 6630fbc36..dc588e297 100644 --- a/man/build_home.Rd +++ b/man/build_home.Rd @@ -298,6 +298,7 @@ Or completely remove it: \seealso{ Other site components: \code{\link{build_articles}()}, +\code{\link{build_llm_docs}()}, \code{\link{build_news}()}, \code{\link{build_reference}()}, \code{\link{build_tutorials}()} diff --git a/man/build_llm_docs.Rd b/man/build_llm_docs.Rd new file mode 100644 index 000000000..204ffdf70 --- /dev/null +++ b/man/build_llm_docs.Rd @@ -0,0 +1,33 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/build-llm.R +\name{build_llm_docs} +\alias{build_llm_docs} +\title{Build docs for LLMs} +\usage{ +build_llm_docs(pkg = ".") +} +\arguments{ +\item{pkg}{Path to package.} +} +\description{ +\code{build_llm_docs()} creates an \code{LLMs.txt} at the root of your site +that contains the contents of your \code{README.md}, your reference index, +and your articles index. It also creates a \code{.md} file for every existing +\code{.html} file in your site. Together, this gives an LLM an overview of your +package and the ability to find out more by following links. + +If you don't want these files generated for your site, you can opt-out by +adding the following to your \code{pkgdown.yml}: + +\if{html}{\out{
    }}\preformatted{llm-docs: false +}\if{html}{\out{
    }} +} +\seealso{ +Other site components: +\code{\link{build_articles}()}, +\code{\link{build_home}()}, +\code{\link{build_news}()}, +\code{\link{build_reference}()}, +\code{\link{build_tutorials}()} +} +\concept{site components} diff --git a/man/build_news.Rd b/man/build_news.Rd index 194d6b7b2..b6d9a61d2 100644 --- a/man/build_news.Rd +++ b/man/build_news.Rd @@ -85,6 +85,7 @@ Suppress the default addition of CRAN release dates with: Other site components: \code{\link{build_articles}()}, \code{\link{build_home}()}, +\code{\link{build_llm_docs}()}, \code{\link{build_reference}()}, \code{\link{build_tutorials}()} } diff --git a/man/build_reference.Rd b/man/build_reference.Rd index 3ae9edd16..cb14c2795 100644 --- a/man/build_reference.Rd +++ b/man/build_reference.Rd @@ -190,6 +190,7 @@ as HTML widgets. Other site components: \code{\link{build_articles}()}, \code{\link{build_home}()}, +\code{\link{build_llm_docs}()}, \code{\link{build_news}()}, \code{\link{build_tutorials}()} } diff --git a/man/build_site.Rd b/man/build_site.Rd index ff5ac2ddd..c63f98cb3 100644 --- a/man/build_site.Rd +++ b/man/build_site.Rd @@ -71,6 +71,7 @@ take \code{quiet} arguments.} \item \code{\link[=build_tutorials]{build_tutorials()}} \item \code{\link[=build_news]{build_news()}} \item \code{\link[=build_redirects]{build_redirects()}} +\item \code{\link[=build_llm_docs]{build_llm_docs()}} } See the documentation for the each function to learn how to control diff --git a/man/build_tutorials.Rd b/man/build_tutorials.Rd index a2cb248bc..0934c3656 100644 --- a/man/build_tutorials.Rd +++ b/man/build_tutorials.Rd @@ -47,6 +47,7 @@ section. 
This should be a list where each element specifies: Other site components: \code{\link{build_articles}()}, \code{\link{build_home}()}, +\code{\link{build_llm_docs}()}, \code{\link{build_news}()}, \code{\link{build_reference}()} } diff --git a/man/pkgdown-package.Rd b/man/pkgdown-package.Rd index 519c5c181..62bb3c11f 100644 --- a/man/pkgdown-package.Rd +++ b/man/pkgdown-package.Rd @@ -32,7 +32,7 @@ Authors: Other contributors: \itemize{ - \item Posit Software, PBC (03wc8by49) [copyright holder, funder] + \item Posit Software, PBC (\href{https://ror.org/03wc8by49}{ROR}) [copyright holder, funder] } } diff --git a/man/test-dont.Rd b/man/test-dont.Rd index c0b63c0ec..7d7bba880 100644 --- a/man/test-dont.Rd +++ b/man/test-dont.Rd @@ -56,10 +56,10 @@ x # should be 4 x <- 1 -\dontshow{if (FALSE) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +\dontshow{if (FALSE) withAutoprint(\{ # examplesIf} x <- 2 \dontshow{\}) # examplesIf} -\dontshow{if (TRUE) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +\dontshow{if (TRUE) withAutoprint(\{ # examplesIf} x <- 3 \dontshow{\}) # examplesIf} x # should be 3 diff --git a/tests/testthat/_snaps/build-llm-dl.md b/tests/testthat/_snaps/build-llm-dl.md new file mode 100644 index 000000000..26b6a5f89 --- /dev/null +++ b/tests/testthat/_snaps/build-llm-dl.md @@ -0,0 +1,19 @@ +# dd with block elements simplifies correctly + + Code + xpath_xml(html, ".//li") + Output +
  • +

    a:

    +

    b

    +

    c

    +
  • + +# warns if not applied + + Code + . <- simplify_dls(html) + Condition + Warning: + Skipping this
    : not a simple term-definition list + diff --git a/tests/testthat/_snaps/build-llm.md b/tests/testthat/_snaps/build-llm.md new file mode 100644 index 000000000..e0146b987 --- /dev/null +++ b/tests/testthat/_snaps/build-llm.md @@ -0,0 +1,11 @@ +# integration test for convert_md() + + Code + write_lines(read_lines(path), stdout()) + Output + # Page title + + ## Heading + + Some text + diff --git a/tests/testthat/assets/llm.html b/tests/testthat/assets/llm.html new file mode 100644 index 000000000..c908ab2f1 --- /dev/null +++ b/tests/testthat/assets/llm.html @@ -0,0 +1,22 @@ + + + +
    + + +

    Heading

    + +

    Some text

    +
    + + diff --git a/tests/testthat/test-build-llm-dl.R b/tests/testthat/test-build-llm-dl.R new file mode 100644 index 000000000..345b60823 --- /dev/null +++ b/tests/testthat/test-build-llm-dl.R @@ -0,0 +1,59 @@ +test_that("single dt/dd pair converts to simple li", { + html <- xml2::read_html("
    ") + simplify_dls(html) + + expect_equal(xpath_length(html, ".//dl"), 0) + expect_equal(xpath_length(html, ".//ul"), 1) +}) + +test_that("single dt/dd pair converts to simple li", { + html <- xml2::read_html( + "
    +
    a
    +
    b
    +
    " + ) + simplify_dls(html) + + expect_equal(xpath_length(html, ".//dl"), 0) + expect_equal(xpath_text(html, ".//li"), "a: b") +}) + +test_that("dd with block elements simplifies correctly", { + html <- xml2::read_html( + "
    +
    a
    +
    +

    b

    +

    c

    +
    +
    " + ) + simplify_dls(html) + + expect_equal(xpath_length(html, ".//dl"), 0) + expect_equal(xpath_length(html, ".//ul"), 1) + expect_snapshot(xpath_xml(html, ".//li")) +}) + +test_that("warns if not applied", { + html <- xml2::read_html( + " +
    +
    a
    +
    + " + ) + expect_snapshot(. <- simplify_dls(html)) +}) + +test_that("correctly detects simple dls", { + expect_false(is_simple_dl("dt")) + expect_false(is_simple_dl(c("dd", "dt"))) + expect_false(is_simple_dl(c("dt", "dd", "dt"))) + expect_false(is_simple_dl(c("dd", "dt", "dd", "dt"))) + + expect_true(is_simple_dl(c())) + expect_true(is_simple_dl(c("dt", "dd"))) + expect_true(is_simple_dl(c("dt", "dd", "dt", "dd"))) +}) diff --git a/tests/testthat/test-build-llm.R b/tests/testthat/test-build-llm.R new file mode 100644 index 000000000..5c71f6474 --- /dev/null +++ b/tests/testthat/test-build-llm.R @@ -0,0 +1,63 @@ +test_that("integration test for convert_md()", { + skip_if_no_pandoc() + + path <- withr::local_tempfile(pattern = "pkgdown-llm") + convert_md(test_path("assets", "llm.html"), path) + expect_snapshot(write_lines(read_lines(path), stdout())) +}) + +test_that("simplifies page header", { + html <- xml2::read_html( + r"( +
    )" + ) + simplify_page_header(xml2::xml_find_first(html, ".//main")) + expect_equal(xpath_contents(html, ".//main"), "

    Package index

    ") +}) + +test_that("replaces lifecycle badges with strong text", { + html <- xml2::read_html( + r"( + deprecated + [Experimental] + )" + ) + simplify_lifecycle_badges(html) + expect_equal( + xpath_text(html, ".//strong"), + c("[deprecated]", "[experimental]") + ) +}) + +test_that("converts internal urls to absolute with .md ending", { + html <- xml2::read_html( + r"( + link + link + link + )" + ) + create_absolute_links(html, "https://pkgdown.r-lib.org") + expect_equal( + xpath_attr(html, ".//a", "href"), + c( + "https://pkgdown.r-lib.org/llm.md", + "#fragment", + "https://example.org" + ) + ) +}) + +test_that("adjusts extension even without url", { + html <- xml2::read_html(r"(link)") + create_absolute_links(html) + expect_equal(xpath_attr(html, ".//a", "href"), "llm.md") +}) + +test_that("strip extra classes from pre", { + html <- xml2::read_html(r"(
    1+1
    )") + simplify_code(html) + expect_equal(xpath_attr(html, ".//pre", "class"), "r") +})