<?xml version="1.0" encoding="utf-8" standalone="yes" ?>
<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom">
  <channel>
    <title>CRAN | B101nfo</title>
    <link>https://llrs.dev/tags/cran/</link>
      <atom:link href="https://llrs.dev/tags/cran/index.xml" rel="self" type="application/rss+xml" />
    <description>CRAN</description>
    <generator>Source Themes Academic (https://sourcethemes.com/academic/)</generator><language>en-us</language><copyright>If it is code you can copy and reuse (MIT) if it is text, please cite and reuse CC-BY 2024.</copyright><lastBuildDate>Wed, 10 Jan 2024 00:00:00 +0000</lastBuildDate>
    <image>
      <url>img/map[gravatar:%!s(bool=false) shape:circle]</url>
      <title>CRAN</title>
      <link>https://llrs.dev/tags/cran/</link>
    </image>
    
    <item>
      <title>Submissions accepted on the first try</title>
      <link>https://llrs.dev/post/2024/01/10/submission-cran-first-try/</link>
      <pubDate>Wed, 10 Jan 2024 00:00:00 +0000</pubDate>
      <guid>https://llrs.dev/post/2024/01/10/submission-cran-first-try/</guid>
      <description>


&lt;p&gt;Recently someone on social media said that their submissions to CRAN never succeed on the first try.
In this post I’ll try to find out how common that is.&lt;/p&gt;
&lt;p&gt;First we need data on submissions to CRAN.
We can download the last 3 years of CRAN submissions thanks to &lt;a href=&#34;https://r-hub.github.io/cransays/articles/dashboard.html&#34;&gt;cransays&lt;/a&gt;.&lt;/p&gt;
&lt;pre class=&#34;r&#34;&gt;&lt;code&gt;cdh &amp;lt;- cransays::download_history()&lt;/code&gt;&lt;/pre&gt;
&lt;p&gt;Here is the bulk of the analysis of the history of package submissions.
This is explained in other posts, but basically I keep only one entry per package per snapshot, try to distinguish new submissions from changes to the same submission, and calculate some date-related variables.&lt;/p&gt;
&lt;pre class=&#34;r&#34;&gt;&lt;code&gt;library(&amp;quot;dplyr&amp;quot;, warn.conflicts = FALSE)
library(&amp;quot;lubridate&amp;quot;, warn.conflicts = FALSE)
library(&amp;quot;tidyr&amp;quot;, warn.conflicts = FALSE)
diff0 &amp;lt;- structure(0, class = &amp;quot;difftime&amp;quot;, units = &amp;quot;hours&amp;quot;)
cran &amp;lt;- cdh |&amp;gt; 
  filter(!is.na(version)) |&amp;gt; 
  distinct() |&amp;gt; 
  arrange(package, snapshot_time) |&amp;gt; 
  group_by(package, snapshot_time) |&amp;gt; 
  # Remove some duplicated packages in different folders
  mutate(n = seq_len(n())) |&amp;gt; 
  filter(n == n()) |&amp;gt; 
  ungroup() |&amp;gt; 
  select(-n) |&amp;gt; 
  arrange(package, snapshot_time, version) |&amp;gt; 
  # Packages last seen in the queue less than 24 hours ago are considered the same submission
  # (even if their version number differs)
  mutate(diff_time = difftime(snapshot_time, lag(snapshot_time), units = &amp;quot;hour&amp;quot;),
         diff_time = if_else(is.na(diff_time), diff0, diff_time), # Fill NAs
         diff_v = version != lag(version),
         diff_v = if_else(is.na(diff_v), TRUE, diff_v), # Fill NAs
         near_t = abs(diff_time) &amp;lt;= 24,
         resubmission = !near_t | diff_v, 
         resubmission = if_else(resubmission == FALSE &amp;amp; diff_time == 0, 
                               TRUE, resubmission),
         resubmission_n = cumsum(as.numeric(resubmission)),
         new_version = !near(diff_time, 1, tol = 24) &amp;amp; diff_v, 
         new_version = if_else(new_version == FALSE &amp;amp; diff_time == 0, 
                               TRUE, new_version),
         submission_n = cumsum(as.numeric(new_version)), .by = package) |&amp;gt; 
  select(-diff_time, -diff_v, -new_version, -near_t) |&amp;gt; 
  mutate(version = package_version(version, strict = FALSE),
         date = as_date(snapshot_time))&lt;/code&gt;&lt;/pre&gt;
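&lt;p&gt;As a small illustration of the 24-hour heuristic above (a sketch with made-up snapshot times and versions), rows closer than 24 hours apart with the same version are treated as one submission:&lt;/p&gt;
&lt;pre class=&#34;r&#34;&gt;&lt;code&gt;toy &amp;lt;- data.frame(
  package = &amp;quot;pkg&amp;quot;,
  snapshot_time = as.POSIXct(c(&amp;quot;2023-01-01 10:00&amp;quot;, &amp;quot;2023-01-01 20:00&amp;quot;,
                               &amp;quot;2023-01-05 10:00&amp;quot;), tz = &amp;quot;UTC&amp;quot;),
  version = c(&amp;quot;1.0&amp;quot;, &amp;quot;1.0&amp;quot;, &amp;quot;1.1&amp;quot;))
toy |&amp;gt; 
  dplyr::mutate(diff_time = difftime(snapshot_time, dplyr::lag(snapshot_time),
                                     units = &amp;quot;hour&amp;quot;),
                same_subm = abs(diff_time) &amp;lt;= 24 &amp;amp; version == dplyr::lag(version))
# The second snapshot is within 24 hours with the same version (same submission);
# the third is a new submission.&lt;/code&gt;&lt;/pre&gt;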
&lt;p&gt;Now we need to compare with the CRAN archive to know whether the submissions were accepted.&lt;/p&gt;
&lt;p&gt;First we need to retrieve the data:&lt;/p&gt;
&lt;pre class=&#34;r&#34;&gt;&lt;code&gt;cran_archive &amp;lt;- tools:::CRAN_archive_db()
# When row binding, data.frames that have only one row lose their row name:
# handle those cases to keep the version number:
archived &amp;lt;- vapply(cran_archive, NROW, numeric(1L))
names(cran_archive)[archived == 1L] &amp;lt;- vapply(cran_archive[archived == 1L], rownames, character(1L))
# Merge current and archive data
cran_dates &amp;lt;- do.call(rbind, cran_archive)
cran_dates$type &amp;lt;- &amp;quot;archived&amp;quot;
current &amp;lt;- tools:::CRAN_current_db()
current$type &amp;lt;- &amp;quot;available&amp;quot;
cran_h &amp;lt;- rbind(current, cran_dates)
# Keep minimal CRAN data archive
cran_h$pkg_v &amp;lt;- basename(rownames(cran_h))
rownames(cran_h) &amp;lt;- NULL
cda &amp;lt;- cran_h |&amp;gt; 
  mutate(strcapture(x = pkg_v, &amp;quot;^(.+)_([0-9]*.+).tar.gz$&amp;quot;, 
                    proto = data.frame(package = character(), version = character())),
         package = if_else(is.na(package), pkg_v, package)) |&amp;gt; 
  arrange(package, mtime) |&amp;gt; 
  mutate(acceptance_n = seq_len(n()), .by = package) |&amp;gt; 
  select(package, pkg_v, version, acceptance_n, date = mtime, uname, type) |&amp;gt; 
  mutate(date = as_date(date))&lt;/code&gt;&lt;/pre&gt;
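&lt;p&gt;The row-name handling above deserves a small, made-up illustration: when &lt;code&gt;rbind()&lt;/code&gt; combines data frames, one-row data frames can end up without their original row name, hence the renaming of the one-row entries beforehand:&lt;/p&gt;
&lt;pre class=&#34;r&#34;&gt;&lt;code&gt;one &amp;lt;- data.frame(x = 1, row.names = &amp;quot;pkg_1.0.tar.gz&amp;quot;)
two &amp;lt;- data.frame(x = 2:3, row.names = c(&amp;quot;pkg_1.1.tar.gz&amp;quot;, &amp;quot;pkg_1.2.tar.gz&amp;quot;))
# Inspect the resulting row names to see which ones survive the row bind:
rownames(do.call(rbind, list(a = one, b = two)))&lt;/code&gt;&lt;/pre&gt;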
&lt;p&gt;We use &lt;code&gt;tools:::CRAN_current_db&lt;/code&gt; because &lt;code&gt;available.packages&lt;/code&gt; filters packages based on the OS and other options (see its &lt;code&gt;filters&lt;/code&gt; argument).&lt;/p&gt;
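&lt;p&gt;To see the difference (a sketch; the exact counts depend on your platform and options), compare the default filters with filtering disabled:&lt;/p&gt;
&lt;pre class=&#34;r&#34;&gt;&lt;code&gt;repo &amp;lt;- &amp;quot;https://cloud.r-project.org&amp;quot;
# Default filters: &amp;quot;R_version&amp;quot;, &amp;quot;OS_type&amp;quot;, &amp;quot;subarch&amp;quot;, &amp;quot;duplicates&amp;quot;
ap_filtered &amp;lt;- available.packages(repos = repo)
# An empty filter list disables the filtering
ap_all &amp;lt;- available.packages(repos = repo, filters = list())
nrow(ap_all) - nrow(ap_filtered)&lt;/code&gt;&lt;/pre&gt;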
&lt;p&gt;We can make a quick detour to plot the number of accepted versions per package and when each package was first published:&lt;/p&gt;
&lt;pre class=&#34;r&#34;&gt;&lt;code&gt;library(&amp;quot;ggplot2&amp;quot;)
cdas &amp;lt;- cda |&amp;gt; 
  summarize(available = if_else(any(type == &amp;quot;available&amp;quot;), &amp;quot;available&amp;quot;, &amp;quot;archived&amp;quot;),
            published = min(date),
            n_published = max(acceptance_n),
            .by = package)

ggplot(cdas) + 
  geom_point(aes(published, n_published, col = available, shape = available)) +
  theme_minimal() +
  theme(legend.position = c(0.7, 0.8), legend.background = element_rect()) +
  labs(x = element_blank(), y = &amp;quot;Versions&amp;quot;, col = &amp;quot;Status&amp;quot;, shape = &amp;quot;Status&amp;quot;,
       title = &amp;quot;First publication of packages and versions published&amp;quot;) +
  scale_x_date(expand = expansion(), date_breaks = &amp;quot;2 years&amp;quot;, date_labels = &amp;quot;%Y&amp;quot;)&lt;/code&gt;&lt;/pre&gt;
&lt;p&gt;&lt;img src=&#34;https://llrs.dev/post/2024/01/10/submission-cran-first-try/index.en_files/figure-html/cran-published-1.png&#34; width=&#34;672&#34; /&gt;&lt;/p&gt;
&lt;p&gt;In summary, there are 6291 packages archived and 20304 available.
We can also spot a package with more than 150 versions that was later archived.&lt;/p&gt;
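&lt;p&gt;To identify that outlier (reusing the &lt;code&gt;cdas&lt;/code&gt; summary from the previous chunk), one can filter the packages with the most published versions:&lt;/p&gt;
&lt;pre class=&#34;r&#34;&gt;&lt;code&gt;cdas |&amp;gt; 
  filter(n_published &amp;gt; 150) |&amp;gt; 
  arrange(desc(n_published))&lt;/code&gt;&lt;/pre&gt;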
&lt;p&gt;Now we can really compare the submission process with the CRAN archive:&lt;/p&gt;
&lt;pre class=&#34;r&#34;&gt;&lt;code&gt;cran_subm &amp;lt;- cran |&amp;gt; 
  summarise(
    resubmission_n = max(resubmission_n, na.rm = TRUE),
    submission_n = max(submission_n, na.rm = TRUE),
    # The number of submissions 
    submissions = resubmission_n - submission_n + 1,
    date = min(date),
    .by = c(&amp;quot;package&amp;quot;, &amp;quot;version&amp;quot;)) |&amp;gt; 
  arrange(package, version)
# Filter to those packages submitted in the period we have data
cda_acc &amp;lt;- cda |&amp;gt; 
  filter(date &amp;gt;= min(cran_subm$date)) |&amp;gt; 
  select(-pkg_v) |&amp;gt; 
  mutate(version = package_version(version, FALSE))

accepted_subm &amp;lt;- merge(cda_acc, cran_subm, by = c(&amp;quot;package&amp;quot;, &amp;quot;version&amp;quot;),
             suffixes = c(&amp;quot;.cran&amp;quot;, &amp;quot;.subm&amp;quot;), all = TRUE, sort = FALSE) |&amp;gt; 
  arrange(package, version, date.cran, date.subm) |&amp;gt; 
  mutate(submissions = if_else(is.na(submissions), 1, submissions),
         acceptance_n = if_else(is.na(acceptance_n), 0, acceptance_n))&lt;/code&gt;&lt;/pre&gt;
&lt;p&gt;We can explore this data a little:&lt;/p&gt;
&lt;pre class=&#34;r&#34;&gt;&lt;code&gt;lp &amp;lt;- scales::label_percent(accuracy = 0.1)
accepted_subm |&amp;gt; 
  summarize(cransays = sum(!is.na(date.subm)),
            cran = sum(!is.na(date.cran)),
            missed_submissions = cran - cransays,
            percentaged_missed = lp(missed_submissions/cran))&lt;/code&gt;&lt;/pre&gt;
&lt;table&gt;
&lt;thead&gt;
&lt;tr class=&#34;header&#34;&gt;
&lt;th align=&#34;center&#34;&gt;cransays&lt;/th&gt;
&lt;th align=&#34;center&#34;&gt;cran&lt;/th&gt;
&lt;th align=&#34;center&#34;&gt;missed_submissions&lt;/th&gt;
&lt;th align=&#34;center&#34;&gt;percentaged_missed&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr class=&#34;odd&#34;&gt;
&lt;td align=&#34;center&#34;&gt;46525&lt;/td&gt;
&lt;td align=&#34;center&#34;&gt;50413&lt;/td&gt;
&lt;td align=&#34;center&#34;&gt;3888&lt;/td&gt;
&lt;td align=&#34;center&#34;&gt;7.7%&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;
&lt;p&gt;This means that &lt;a href=&#34;https://r-hub.github.io/cransays/articles/dashboard.html&#34;&gt;cransays&lt;/a&gt;, the package used to archive this data, misses ~8% of submissions, probably because they were handled in less than an hour!
Another explanation is that for some periods the cransays bot didn’t work well…&lt;/p&gt;
&lt;p&gt;On the other hand, we can look at how long it takes for a version to be published on CRAN:&lt;/p&gt;
&lt;pre class=&#34;r&#34;&gt;&lt;code&gt;accepted_subm |&amp;gt; 
  filter(!is.na(date.cran)) |&amp;gt; 
  mutate(time_diff = difftime(date.cran, date.subm, units = &amp;quot;weeks&amp;quot;)) |&amp;gt;
  # Calculate the number of accepted packages since the recording of submissions
  mutate(accepted_n = acceptance_n - min(acceptance_n[acceptance_n != 0L], na.rm = TRUE) + 1, .by = package) |&amp;gt; 
  filter(time_diff &amp;gt;= 0) |&amp;gt; 
  ggplot() + 
  geom_point(aes(date.cran, time_diff, col = accepted_n)) +
  theme_minimal() +
  theme(legend.position = c(0.2, 0.8), legend.background = element_rect()) +
  labs(x = &amp;quot;Published on CRAN&amp;quot;, title = &amp;quot;Time since submitted to CRAN&amp;quot;, 
       y = &amp;quot;Weeks&amp;quot;, col = &amp;quot;Accepted&amp;quot;)
## Don&amp;#39;t know how to automatically pick scale for object of type &amp;lt;difftime&amp;gt;.
## Defaulting to continuous.&lt;/code&gt;&lt;/pre&gt;
&lt;p&gt;&lt;img src=&#34;https://llrs.dev/post/2024/01/10/submission-cran-first-try/index.en_files/figure-html/accepted_subm-1.png&#34; width=&#34;672&#34; /&gt;&lt;/p&gt;
&lt;p&gt;I explored some of those outliers: one package was submitted in 2021 and resubmitted two years later with the same version number.
In other cases the submission was made with more than 1 hour of tolerance (see the “new_version” variable creation in the second code chunk).&lt;/p&gt;
&lt;p&gt;This means that the path to CRAN can be long and that developers do not always change the version number on each submission.&lt;/p&gt;
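&lt;p&gt;Those outliers can be inspected, for instance, by reusing &lt;code&gt;accepted_subm&lt;/code&gt; and keeping the versions that took more than a year from submission to publication (the 52-week cut-off is arbitrary):&lt;/p&gt;
&lt;pre class=&#34;r&#34;&gt;&lt;code&gt;accepted_subm |&amp;gt; 
  filter(!is.na(date.cran), !is.na(date.subm)) |&amp;gt; 
  mutate(time_diff = difftime(date.cran, date.subm, units = &amp;quot;weeks&amp;quot;)) |&amp;gt; 
  filter(time_diff &amp;gt; 52) |&amp;gt; 
  arrange(desc(time_diff)) |&amp;gt; 
  select(package, version, date.subm, date.cran, time_diff)&lt;/code&gt;&lt;/pre&gt;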
&lt;p&gt;&lt;strong&gt;Note:&lt;/strong&gt; This section is new after detecting problems with the way it was initially published.&lt;/p&gt;
&lt;p&gt;In the following function I calculate the number of submissions and similar information for each package:&lt;/p&gt;
&lt;pre class=&#34;r&#34;&gt;&lt;code&gt;count_submissions &amp;lt;- function(x) {
  x |&amp;gt; 
    mutate(submission_in_period = seq_len(n()),
           date.mix = pmin(date.cran, date.subm, na.rm = TRUE),
           .by = package, .after = acceptance_n) |&amp;gt; 
    summarise(
      # Number of accepted packages on CRAN
      total_accepted = sum(!is.na(date.cran), 0, na.rm = TRUE),
      # At minimum 0 through {cransays}
      through_cransays = sum(!is.na(date.subm), 0, na.rm = TRUE), 
      # In case same version number is submitted at different timepoints
      resubmissions = ifelse(any(!is.na(resubmission_n)), 
                              max(resubmission_n, na.rm = TRUE) - min(resubmission_n, na.rm = TRUE) - through_cransays, 0),
      resubmissions = if_else(resubmissions &amp;lt; 0L, 0L, resubmissions),
      # All submission + those that were duplicated on the submission system
      total_submissions = max(submission_in_period, na.rm = TRUE) + resubmissions,
      # The submissions that were not successful
      total_attempts = total_submissions - total_accepted,
      percentage_failed_submissions = lp(total_attempts/total_accepted), 
      .by = package)
}&lt;/code&gt;&lt;/pre&gt;
&lt;p&gt;I created a function so I can apply the same logic to whatever group I want to analyse.&lt;/p&gt;
&lt;p&gt;&lt;strong&gt;Note:&lt;/strong&gt; Another relevant edit is that the selection criteria changed, as I had missed some packages in some analyses and included others that shouldn’t have been.
Now we are ready to apply it to those packages whose first version got on CRAN:&lt;/p&gt;
&lt;pre class=&#34;r&#34;&gt;&lt;code&gt;first_submissions &amp;lt;- accepted_subm |&amp;gt; 
  group_by(package) |&amp;gt; 
  # Keep submissions that were eventually accepted
  filter(length(acceptance_n != 0L) &amp;gt; 1L &amp;amp;&amp;amp; any(acceptance_n[acceptance_n != 0L] == 1)) |&amp;gt; 
  # Keep submissions until the first acceptance but not after
  filter(cumsum(acceptance_n) &amp;lt;= 1L &amp;amp; seq_len(n()) &amp;lt;= which(acceptance_n == 1L)) |&amp;gt; 
  ungroup()
ffs &amp;lt;- first_submissions |&amp;gt;   
  count_submissions() |&amp;gt; 
  count(total_attempts, sort = TRUE,  name = &amp;quot;packages&amp;quot;) |&amp;gt; 
  mutate(percentage = lp(packages/sum(packages, na.rm = TRUE)))
ffs&lt;/code&gt;&lt;/pre&gt;
&lt;table&gt;
&lt;thead&gt;
&lt;tr class=&#34;header&#34;&gt;
&lt;th align=&#34;center&#34;&gt;total_attempts&lt;/th&gt;
&lt;th align=&#34;center&#34;&gt;packages&lt;/th&gt;
&lt;th align=&#34;center&#34;&gt;percentage&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr class=&#34;odd&#34;&gt;
&lt;td align=&#34;center&#34;&gt;0&lt;/td&gt;
&lt;td align=&#34;center&#34;&gt;3390&lt;/td&gt;
&lt;td align=&#34;center&#34;&gt;65.0%&lt;/td&gt;
&lt;/tr&gt;
&lt;tr class=&#34;even&#34;&gt;
&lt;td align=&#34;center&#34;&gt;1&lt;/td&gt;
&lt;td align=&#34;center&#34;&gt;1141&lt;/td&gt;
&lt;td align=&#34;center&#34;&gt;21.9%&lt;/td&gt;
&lt;/tr&gt;
&lt;tr class=&#34;odd&#34;&gt;
&lt;td align=&#34;center&#34;&gt;2&lt;/td&gt;
&lt;td align=&#34;center&#34;&gt;425&lt;/td&gt;
&lt;td align=&#34;center&#34;&gt;8.2%&lt;/td&gt;
&lt;/tr&gt;
&lt;tr class=&#34;even&#34;&gt;
&lt;td align=&#34;center&#34;&gt;3&lt;/td&gt;
&lt;td align=&#34;center&#34;&gt;138&lt;/td&gt;
&lt;td align=&#34;center&#34;&gt;2.6%&lt;/td&gt;
&lt;/tr&gt;
&lt;tr class=&#34;odd&#34;&gt;
&lt;td align=&#34;center&#34;&gt;4&lt;/td&gt;
&lt;td align=&#34;center&#34;&gt;72&lt;/td&gt;
&lt;td align=&#34;center&#34;&gt;1.4%&lt;/td&gt;
&lt;/tr&gt;
&lt;tr class=&#34;even&#34;&gt;
&lt;td align=&#34;center&#34;&gt;5&lt;/td&gt;
&lt;td align=&#34;center&#34;&gt;23&lt;/td&gt;
&lt;td align=&#34;center&#34;&gt;0.4%&lt;/td&gt;
&lt;/tr&gt;
&lt;tr class=&#34;odd&#34;&gt;
&lt;td align=&#34;center&#34;&gt;6&lt;/td&gt;
&lt;td align=&#34;center&#34;&gt;12&lt;/td&gt;
&lt;td align=&#34;center&#34;&gt;0.2%&lt;/td&gt;
&lt;/tr&gt;
&lt;tr class=&#34;even&#34;&gt;
&lt;td align=&#34;center&#34;&gt;7&lt;/td&gt;
&lt;td align=&#34;center&#34;&gt;4&lt;/td&gt;
&lt;td align=&#34;center&#34;&gt;0.1%&lt;/td&gt;
&lt;/tr&gt;
&lt;tr class=&#34;odd&#34;&gt;
&lt;td align=&#34;center&#34;&gt;8&lt;/td&gt;
&lt;td align=&#34;center&#34;&gt;3&lt;/td&gt;
&lt;td align=&#34;center&#34;&gt;0.1%&lt;/td&gt;
&lt;/tr&gt;
&lt;tr class=&#34;even&#34;&gt;
&lt;td align=&#34;center&#34;&gt;9&lt;/td&gt;
&lt;td align=&#34;center&#34;&gt;2&lt;/td&gt;
&lt;td align=&#34;center&#34;&gt;0.0%&lt;/td&gt;
&lt;/tr&gt;
&lt;tr class=&#34;odd&#34;&gt;
&lt;td align=&#34;center&#34;&gt;12&lt;/td&gt;
&lt;td align=&#34;center&#34;&gt;1&lt;/td&gt;
&lt;td align=&#34;center&#34;&gt;0.0%&lt;/td&gt;
&lt;/tr&gt;
&lt;tr class=&#34;even&#34;&gt;
&lt;td align=&#34;center&#34;&gt;16&lt;/td&gt;
&lt;td align=&#34;center&#34;&gt;1&lt;/td&gt;
&lt;td align=&#34;center&#34;&gt;0.0%&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;
&lt;p&gt;This means that close to 35% of first-time submissions are rejected.
This includes packages that are not yet (or never will be?) on CRAN (~1000).&lt;/p&gt;
&lt;p&gt;This points to a problem:&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;developers need to resubmit their packages and fix more issues.&lt;/li&gt;
&lt;li&gt;reviewers need to spend more time (approximately 50% of submissions are at one point or another handled by a human).&lt;/li&gt;
&lt;/ul&gt;
&lt;p&gt;After this exercise we might wonder whether this happens only with new packages.&lt;br /&gt;
If we look at those submissions that are not the first version of a package, we find the following:&lt;/p&gt;
&lt;pre class=&#34;r&#34;&gt;&lt;code&gt;submissions_with_accepted &amp;lt;- accepted_subm |&amp;gt; 
  # Filter those that were included on CRAN (not all submissions rejected)
  filter(any(acceptance_n &amp;gt;= 1), .by = package) |&amp;gt; 
  mutate(date.mix = pmin(date.cran, date.subm, na.rm = TRUE)) |&amp;gt; 
  group_by(package) |&amp;gt; 
  arrange(date.mix) |&amp;gt; 
  filter(
    # Those that start by 0 but next acceptance is 1 or higher
     cumsum(acceptance_n) &amp;gt;= 1L | 
       min(acceptance_n[acceptance_n != 0L], na.rm = TRUE) &amp;gt;= 2) |&amp;gt; 
  ungroup() 
fs_exp &amp;lt;- count_submissions(submissions_with_accepted)
fs_exp |&amp;gt; 
  count(more_failed = total_accepted &amp;gt; total_attempts, 
            sort = TRUE, name = &amp;quot;packages&amp;quot;) |&amp;gt; 
  mutate(percentage = lp(packages/sum(packages)))&lt;/code&gt;&lt;/pre&gt;
&lt;table&gt;
&lt;thead&gt;
&lt;tr class=&#34;header&#34;&gt;
&lt;th align=&#34;center&#34;&gt;more_failed&lt;/th&gt;
&lt;th align=&#34;center&#34;&gt;packages&lt;/th&gt;
&lt;th align=&#34;center&#34;&gt;percentage&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr class=&#34;odd&#34;&gt;
&lt;td align=&#34;center&#34;&gt;TRUE&lt;/td&gt;
&lt;td align=&#34;center&#34;&gt;15337&lt;/td&gt;
&lt;td align=&#34;center&#34;&gt;96.2%&lt;/td&gt;
&lt;/tr&gt;
&lt;tr class=&#34;even&#34;&gt;
&lt;td align=&#34;center&#34;&gt;FALSE&lt;/td&gt;
&lt;td align=&#34;center&#34;&gt;600&lt;/td&gt;
&lt;td align=&#34;center&#34;&gt;3.8%&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;
&lt;p&gt;Still, the majority of packages have more versions released than rejected submissions in the period analysed.
Failing the checks on CRAN is normal, but how many attempts does each version take?&lt;/p&gt;
&lt;pre class=&#34;r&#34;&gt;&lt;code&gt;library(&amp;quot;ggrepel&amp;quot;)
ggplot(fs_exp) +
  geom_abline(slope = 1, intercept = 0, linetype = 2) +
  geom_count(aes(total_accepted, total_attempts)) +
  geom_label_repel(aes(total_accepted, total_attempts, label = package), data = . %&amp;gt;% filter(total_attempts &amp;gt;= 10)) +
  labs(x = &amp;quot;CRAN versions&amp;quot;, y = &amp;quot;Rejected submissions&amp;quot;,  size = &amp;quot;Packages&amp;quot;,
       title = &amp;quot;Packages after the first version&amp;quot;, subtitle = &amp;quot;for the period analyzed&amp;quot;) +
  scale_size(trans = &amp;quot;log10&amp;quot;) +
  theme_minimal() +
  theme(legend.position = c(0.8, 0.7), legend.background = element_rect())&lt;/code&gt;&lt;/pre&gt;
&lt;p&gt;&lt;img src=&#34;https://llrs.dev/post/2024/01/10/submission-cran-first-try/index.en_files/figure-html/failed-exp-1.png&#34; width=&#34;672&#34; /&gt;&lt;/p&gt;
&lt;p&gt;We can see that there are packages with more than 30 versions on CRAN in these 3 years that never had a rejected submission.
Congratulations!&lt;/p&gt;
&lt;p&gt;Others have a high number of rejected submissions and very few versions:&lt;/p&gt;
&lt;pre class=&#34;r&#34;&gt;&lt;code&gt;fs_exp |&amp;gt; 
  count(total_attempts &amp;gt; total_accepted, name = &amp;quot;packages&amp;quot;) |&amp;gt; 
  mutate(percentage = lp(packages/sum(packages)))&lt;/code&gt;&lt;/pre&gt;
&lt;table&gt;
&lt;thead&gt;
&lt;tr class=&#34;header&#34;&gt;
&lt;th align=&#34;center&#34;&gt;total_attempts &amp;gt; total_accepted&lt;/th&gt;
&lt;th align=&#34;center&#34;&gt;packages&lt;/th&gt;
&lt;th align=&#34;center&#34;&gt;percentage&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr class=&#34;odd&#34;&gt;
&lt;td align=&#34;center&#34;&gt;FALSE&lt;/td&gt;
&lt;td align=&#34;center&#34;&gt;15792&lt;/td&gt;
&lt;td align=&#34;center&#34;&gt;99.1%&lt;/td&gt;
&lt;/tr&gt;
&lt;tr class=&#34;even&#34;&gt;
&lt;td align=&#34;center&#34;&gt;TRUE&lt;/td&gt;
&lt;td align=&#34;center&#34;&gt;145&lt;/td&gt;
&lt;td align=&#34;center&#34;&gt;0.9%&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;
&lt;p&gt;Close to 1% of packages required more rejected submissions than accepted versions, i.e. more than two submissions per version on average.&lt;/p&gt;
&lt;p&gt;Lastly, we can look at the overall experience for developers:&lt;/p&gt;
&lt;pre class=&#34;r&#34;&gt;&lt;code&gt;fs &amp;lt;- count_submissions(accepted_subm)

ggplot(fs) +
  geom_abline(slope = 1, intercept = 0, linetype = 2) +
  geom_count(aes(total_accepted, total_attempts)) +
  geom_label_repel(aes(total_accepted, total_attempts, label = package), 
                   data = . %&amp;gt;% filter(total_attempts &amp;gt;= 12)) +
  labs(x = &amp;quot;CRAN versions&amp;quot;, y = &amp;quot;Rejected submissions&amp;quot;,  size = &amp;quot;Packages&amp;quot;,
       title = &amp;quot;All packages submissions&amp;quot;, subtitle = &amp;quot;for the period analyzed ~174 weeks&amp;quot;) +
  theme_minimal() +
  scale_size(trans = &amp;quot;log10&amp;quot;) +
  theme(legend.position = c(0.8, 0.7), legend.background = element_rect())&lt;/code&gt;&lt;/pre&gt;
&lt;p&gt;&lt;img src=&#34;https://llrs.dev/post/2024/01/10/submission-cran-first-try/index.en_files/figure-html/plot-failed-submissions-1.png&#34; width=&#34;672&#34; /&gt;&lt;/p&gt;
&lt;p&gt;It doesn’t change much compared with the plot for experienced maintainers.
Note that this only adds the packages that were never approved and the submissions made before the first acceptance.
So the changes should only be visible in the bottom-left corner of the plot.&lt;/p&gt;
&lt;p&gt;Overall, 14.5% of the attempts end up being rejected.&lt;/p&gt;
&lt;div id=&#34;main-take-away&#34; class=&#34;section level2&#34;&gt;
&lt;h2&gt;Main take away&lt;/h2&gt;
&lt;p&gt;Submitting to CRAN is not easy on the first try, and it usually requires 2 submissions for each accepted version.&lt;br /&gt;
While the &lt;a href=&#34;https://cran.r-project.org/doc/manuals/r-devel/R-exts.html&#34;&gt;Writing R Extensions&lt;/a&gt; document is clear, it might be too extensive for many cases.&lt;br /&gt;
The &lt;a href=&#34;https://cran.r-project.org/web/packages/policies.html&#34;&gt;CRAN policy&lt;/a&gt; is short but might not be clear enough for new maintainers.&lt;br /&gt;
A document in between might be &lt;a href=&#34;https://r-pkgs.org/&#34;&gt;R packages&lt;/a&gt;, but it is still extensive and focused on a small, opinionated set of packages.&lt;br /&gt;
A CRAN Task View or some training might be a good way to reduce the overall problem.&lt;br /&gt;
For those maintainers struggling, clearer technical or editorial decisions might help.&lt;/p&gt;
&lt;p&gt;In addition, it seems that the packages having more problems with submissions are not new ones: experienced maintainers also have trouble getting their packages accepted.&lt;br /&gt;
This might hint at difficulties replicating the CRAN checks or environments, or at the scale of the checks (dependency checks).&lt;br /&gt;
Focusing on helping those packages’ maintainers might be a good way to reduce the load on the CRAN team.&lt;/p&gt;
&lt;p&gt;I also want to note that this analysis could be improved if we knew whether each rejection was automatic or manual.&lt;br /&gt;
This would let us see the burden on CRAN volunteers and perhaps define the problem better and propose better solutions.&lt;br /&gt;
It could be attempted by looking at the last folder a package was in during the submission process, but it would still not be clear what the most frequent problem is.&lt;/p&gt;
&lt;/div&gt;
&lt;div id=&#34;bonus&#34; class=&#34;section level2&#34;&gt;
&lt;h2&gt;Bonus&lt;/h2&gt;
&lt;p&gt;Of all the new packages, more than half of the first versions are already archived (either replaced by newer versions or archived entirely):&lt;/p&gt;
&lt;pre class=&#34;r&#34;&gt;&lt;code&gt;accepted_subm |&amp;gt; 
  filter(acceptance_n == 1L) |&amp;gt; 
  count(status = type, name = &amp;quot;packages&amp;quot;) |&amp;gt; 
  mutate(percentage = lp(packages/sum(packages)))&lt;/code&gt;&lt;/pre&gt;
&lt;table&gt;
&lt;thead&gt;
&lt;tr class=&#34;header&#34;&gt;
&lt;th align=&#34;center&#34;&gt;status&lt;/th&gt;
&lt;th align=&#34;center&#34;&gt;packages&lt;/th&gt;
&lt;th align=&#34;center&#34;&gt;percentage&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr class=&#34;odd&#34;&gt;
&lt;td align=&#34;center&#34;&gt;archived&lt;/td&gt;
&lt;td align=&#34;center&#34;&gt;4763&lt;/td&gt;
&lt;td align=&#34;center&#34;&gt;65.4%&lt;/td&gt;
&lt;/tr&gt;
&lt;tr class=&#34;even&#34;&gt;
&lt;td align=&#34;center&#34;&gt;available&lt;/td&gt;
&lt;td align=&#34;center&#34;&gt;2517&lt;/td&gt;
&lt;td align=&#34;center&#34;&gt;34.6%&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;
&lt;p&gt;Of them:&lt;/p&gt;
&lt;pre class=&#34;r&#34;&gt;&lt;code&gt;fully_archived &amp;lt;- accepted_subm |&amp;gt;
  filter(acceptance_n != 0L) |&amp;gt; 
  filter(any(acceptance_n == 1L), .by = package) |&amp;gt; 
  summarize(archived = all(type == &amp;quot;archived&amp;quot;), .by = package) |&amp;gt; 
  count(archived, name = &amp;quot;packages&amp;quot;) |&amp;gt; 
  mutate(percentage = lp(packages/sum(packages)))
fully_archived&lt;/code&gt;&lt;/pre&gt;
&lt;table&gt;
&lt;thead&gt;
&lt;tr class=&#34;header&#34;&gt;
&lt;th align=&#34;center&#34;&gt;archived&lt;/th&gt;
&lt;th align=&#34;center&#34;&gt;packages&lt;/th&gt;
&lt;th align=&#34;center&#34;&gt;percentage&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr class=&#34;odd&#34;&gt;
&lt;td align=&#34;center&#34;&gt;FALSE&lt;/td&gt;
&lt;td align=&#34;center&#34;&gt;6783&lt;/td&gt;
&lt;td align=&#34;center&#34;&gt;93.2%&lt;/td&gt;
&lt;/tr&gt;
&lt;tr class=&#34;even&#34;&gt;
&lt;td align=&#34;center&#34;&gt;TRUE&lt;/td&gt;
&lt;td align=&#34;center&#34;&gt;497&lt;/td&gt;
&lt;td align=&#34;center&#34;&gt;6.8%&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;
&lt;p&gt;Only 6.8% of packages were fully archived at the end of this period (2020-09-12 to 2024-01-20).&lt;/p&gt;
&lt;div id=&#34;reproducibility&#34; class=&#34;section level3&#34;&gt;
&lt;h3&gt;Reproducibility&lt;/h3&gt;
&lt;details&gt;
&lt;pre&gt;&lt;code&gt;## ─ Session info ───────────────────────────────────────────────────────────────────────────────────────────────────────
##  setting  value
##  version  R version 4.3.1 (2023-06-16)
##  os       Ubuntu 22.04.3 LTS
##  system   x86_64, linux-gnu
##  ui       X11
##  language (EN)
##  collate  en_US.UTF-8
##  ctype    en_US.UTF-8
##  tz       Europe/Madrid
##  date     2024-01-20
##  pandoc   3.1.1 @ /usr/lib/rstudio/resources/app/bin/quarto/bin/tools/ (via rmarkdown)
## 
## ─ Packages ───────────────────────────────────────────────────────────────────────────────────────────────────────────
##  package     * version date (UTC) lib source
##  blogdown      1.18    2023-06-19 [1] CRAN (R 4.3.1)
##  bookdown      0.37    2023-12-01 [1] CRAN (R 4.3.1)
##  bslib         0.6.1   2023-11-28 [1] CRAN (R 4.3.1)
##  cachem        1.0.8   2023-05-01 [1] CRAN (R 4.3.1)
##  cli           3.6.2   2023-12-11 [1] CRAN (R 4.3.1)
##  colorspace    2.1-0   2023-01-23 [1] CRAN (R 4.3.1)
##  digest        0.6.33  2023-07-07 [1] CRAN (R 4.3.1)
##  dplyr       * 1.1.4   2023-11-17 [1] CRAN (R 4.3.1)
##  evaluate      0.23    2023-11-01 [1] CRAN (R 4.3.2)
##  fansi         1.0.6   2023-12-08 [1] CRAN (R 4.3.1)
##  farver        2.1.1   2022-07-06 [1] CRAN (R 4.3.1)
##  fastmap       1.1.1   2023-02-24 [1] CRAN (R 4.3.1)
##  generics      0.1.3   2022-07-05 [1] CRAN (R 4.3.1)
##  ggplot2     * 3.4.4   2023-10-12 [1] CRAN (R 4.3.1)
##  ggrepel     * 0.9.5   2024-01-10 [1] CRAN (R 4.3.1)
##  glue          1.7.0   2024-01-09 [1] CRAN (R 4.3.1)
##  gtable        0.3.4   2023-08-21 [1] CRAN (R 4.3.1)
##  highr         0.10    2022-12-22 [1] CRAN (R 4.3.1)
##  htmltools     0.5.7   2023-11-03 [1] CRAN (R 4.3.2)
##  jquerylib     0.1.4   2021-04-26 [1] CRAN (R 4.3.1)
##  jsonlite      1.8.8   2023-12-04 [1] CRAN (R 4.3.1)
##  knitr       * 1.45    2023-10-30 [1] CRAN (R 4.3.2)
##  labeling      0.4.3   2023-08-29 [1] CRAN (R 4.3.2)
##  lifecycle     1.0.4   2023-11-07 [1] CRAN (R 4.3.2)
##  lubridate   * 1.9.3   2023-09-27 [1] CRAN (R 4.3.1)
##  magrittr      2.0.3   2022-03-30 [1] CRAN (R 4.3.1)
##  munsell       0.5.0   2018-06-12 [1] CRAN (R 4.3.1)
##  pillar        1.9.0   2023-03-22 [1] CRAN (R 4.3.1)
##  pkgconfig     2.0.3   2019-09-22 [1] CRAN (R 4.3.1)
##  purrr         1.0.2   2023-08-10 [1] CRAN (R 4.3.1)
##  R6            2.5.1   2021-08-19 [1] CRAN (R 4.3.1)
##  Rcpp          1.0.12  2024-01-09 [1] CRAN (R 4.3.1)
##  rlang         1.1.3   2024-01-10 [1] CRAN (R 4.3.1)
##  rmarkdown     2.25    2023-09-18 [1] CRAN (R 4.3.1)
##  rstudioapi    0.15.0  2023-07-07 [1] CRAN (R 4.3.1)
##  sass          0.4.8   2023-12-06 [1] CRAN (R 4.3.1)
##  scales        1.3.0   2023-11-28 [1] CRAN (R 4.3.1)
##  sessioninfo   1.2.2   2021-12-06 [1] CRAN (R 4.3.1)
##  tibble        3.2.1   2023-03-20 [1] CRAN (R 4.3.1)
##  tidyr       * 1.3.0   2023-01-24 [1] CRAN (R 4.3.1)
##  tidyselect    1.2.0   2022-10-10 [1] CRAN (R 4.3.1)
##  timechange    0.2.0   2023-01-11 [1] CRAN (R 4.3.1)
##  utf8          1.2.4   2023-10-22 [1] CRAN (R 4.3.2)
##  vctrs         0.6.5   2023-12-01 [1] CRAN (R 4.3.1)
##  withr         2.5.2   2023-10-30 [1] CRAN (R 4.3.2)
##  xfun          0.41    2023-11-01 [1] CRAN (R 4.3.2)
##  yaml          2.3.8   2023-12-11 [1] CRAN (R 4.3.1)
## 
##  [1] /home/lluis/bin/R/4.3.1
##  [2] /opt/R/4.3.1/lib/R/library
## 
## ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────&lt;/code&gt;&lt;/pre&gt;
&lt;/details&gt;
&lt;/div&gt;
&lt;/div&gt;
</description>
    </item>
    
    <item>
      <title>CRAN maintained packages</title>
      <link>https://llrs.dev/post/2023/05/03/cran-maintained-packages/</link>
      <pubDate>Wed, 03 May 2023 00:00:00 +0000</pubDate>
      <guid>https://llrs.dev/post/2023/05/03/cran-maintained-packages/</guid>
      <description>


&lt;p&gt;The role of package managers in software is paramount for developers.
In R, the CRAN team provides a platform to test and host packages.
This means ensuring that R dependencies are up to date and that software required by some packages is also available on CRAN.&lt;/p&gt;
&lt;p&gt;This involves testing ~20000 packages frequently (daily for most packages) on several architectures and R versions.
In addition, they test updates for compatibility with the dependencies and test and review new packages.&lt;/p&gt;
&lt;p&gt;Most of the work with packages is automated but often requires human intervention (&lt;a href=&#34;https://journal.r-project.org/news/RJ-2022-4-cran/#cran-package-submissions&#34;&gt;50% of the submissions&lt;/a&gt;).
Another time-consuming activity is keeping up packages abandoned by their original maintainers.&lt;/p&gt;
&lt;p&gt;While newer packages are &lt;a href=&#34;https://llrs.dev/post/2021/12/07/reasons-cran-archivals/&#34;&gt;often archived from CRAN&lt;/a&gt;, some old packages were adopted by CRAN.
The &lt;a href=&#34;https://cran.r-project.org/CRAN_team.htm&#34;&gt;CRAN team&lt;/a&gt; is &lt;a href=&#34;https://mastodon.social/@henrikbengtsson/110186925898457474&#34;&gt;looking for help&lt;/a&gt; maintaining those.&lt;/p&gt;
&lt;p&gt;In this post I’ll explore the packages maintained by CRAN.&lt;/p&gt;
&lt;div id=&#34;cran-in-packages&#34; class=&#34;section level1&#34;&gt;
&lt;h1&gt;CRAN in packages&lt;/h1&gt;
&lt;pre class=&#34;r&#34;&gt;&lt;code&gt;packages_db &amp;lt;- as.data.frame(tools::CRAN_package_db())
cran_author &amp;lt;- grep(&amp;quot;CRAN Team&amp;quot;, x = packages_db$Author, ignore.case = TRUE)
cran_authorsR &amp;lt;- grep(&amp;quot;CRAN Team&amp;quot;, x = packages_db$`Authors@R`, ignore.case = TRUE)
CRAN_TEAM_mentioned &amp;lt;- union(cran_author, cran_authorsR)
unique(packages_db$Package[CRAN_TEAM_mentioned])
## [1] &amp;quot;fBasics&amp;quot;   &amp;quot;fMultivar&amp;quot; &amp;quot;geiger&amp;quot;    &amp;quot;plotrix&amp;quot;   &amp;quot;RCurl&amp;quot;     &amp;quot;RJSONIO&amp;quot;  
## [7] &amp;quot;udunits2&amp;quot;  &amp;quot;XML&amp;quot;&lt;/code&gt;&lt;/pre&gt;
&lt;p&gt;In some of these packages the CRAN team appears as a contributor because they provided help or code to fix bugs:&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;&lt;a href=&#34;https://cran.r-project.org/package=geiger&#34;&gt;geiger&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a href=&#34;https://cran.r-project.org/package=fMultivar&#34;&gt;fMultivar&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a href=&#34;https://cran.r-project.org/package=fBasics&#34;&gt;fBasics&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a href=&#34;https://cran.r-project.org/package=udunits2&#34;&gt;udunits2&lt;/a&gt;&lt;/li&gt;
&lt;/ul&gt;
&lt;p&gt;In others they are the maintainers:&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;&lt;a href=&#34;https://cran.r-project.org/package=XML&#34;&gt;XML&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a href=&#34;https://cran.r-project.org/package=RCurl&#34;&gt;RCurl&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a href=&#34;https://cran.r-project.org/package=RJSONIO&#34;&gt;RJSONIO&lt;/a&gt;&lt;/li&gt;
&lt;/ul&gt;
&lt;p&gt;Of these three packages, RJSONIO is the newest (first release in 2010) and requires fewer updates (lately one or two a year).
However, in 2022 RCurl and XML required 4 and 5 updates respectively.
I will focus on these packages, as they are the ones for which CRAN is looking for new maintainers.&lt;/p&gt;
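&lt;p&gt;These update counts can be derived from the CRAN archive timestamps. A minimal sketch on made-up dates (the real dates come from the archive database used later in this post):&lt;/p&gt;

```r
# Hypothetical release dates for a package, as recorded in the CRAN archive.
releases = as.Date(c("2021-03-01", "2022-01-10", "2022-04-02",
                     "2022-07-19", "2022-11-30", "2023-02-14"))

# Count the updates per year: here 1 in 2021, 4 in 2022 and 1 in 2023.
table(format(releases, "%Y"))
```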
&lt;/div&gt;
&lt;div id=&#34;rcurl-and-xml&#34; class=&#34;section level1&#34;&gt;
&lt;h1&gt;RCurl and XML&lt;/h1&gt;
&lt;div id=&#34;circular-dependency&#34; class=&#34;section level2&#34;&gt;
&lt;h2&gt;Circular dependency&lt;/h2&gt;
&lt;p&gt;Both XML and RCurl depend on each other.&lt;/p&gt;
&lt;p&gt;We can see that each package is a direct dependency of one of its own direct dependencies!
How can that be?
If we go to the &lt;a href=&#34;https://cran.r-project.org/package=RCurl&#34;&gt;RCurl&lt;/a&gt; website we see “Suggests: XML”, and on the &lt;a href=&#34;https://cran.r-project.org/package=XML&#34;&gt;XML&lt;/a&gt; website RCurl is listed too.
This circular dependency is allowed because they have each other in Suggests.&lt;/p&gt;
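&lt;p&gt;A mutual pair like this can be detected mechanically from a dependency list. A sketch on a toy list (same shape as the output of &lt;code&gt;tools::package_dependencies()&lt;/code&gt;, not the real CRAN database):&lt;/p&gt;

```r
# Toy "Suggests" list: names are packages, values the packages they suggest.
suggests = list(
  RCurl = "XML",
  XML = "RCurl",
  jsonlite = character(0)
)

# A package is part of a circular pair if one of the packages it suggests
# also suggests it back.
is_circular = function(pkg, deps) {
  any(vapply(deps[[pkg]], function(d) pkg %in% deps[[d]], logical(1)))
}

vapply(names(suggests), is_circular, logical(1), deps = suggests)
# RCurl and XML are flagged TRUE; jsonlite is not.
```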
&lt;p&gt;A first step to reduce any possible problem would be to separate them.
This would make it easier to understand which package is worth prioritizing, and possible missteps would have less impact.&lt;/p&gt;
&lt;p&gt;If we look at the &lt;a href=&#34;https://github.com/search?q=repo%3Acran%2FXML%20RCurl&amp;amp;type=code&#34;&gt;XML source code for RCurl we find&lt;/a&gt; some code in the &lt;code&gt;inst/&lt;/code&gt; folder.
If these two cases were removed, the package could drop its dependency on RCurl.&lt;/p&gt;
&lt;p&gt;Similarly, if we look at the &lt;a href=&#34;https://github.com/search?q=repo%3Acran%2FRCurl%20XML&amp;amp;type=code&#34;&gt;RCurl source code for XML we find&lt;/a&gt; some code in the &lt;code&gt;inst/&lt;/code&gt; folder and in some examples.
If these three cases were removed, the package could drop its dependency on XML.&lt;/p&gt;
&lt;p&gt;RCurl has been &lt;a href=&#34;https://diffify.com/R/RCurl/1.95-4.9/1.98-1.12&#34;&gt;more stable&lt;/a&gt; than XML, which has seen &lt;a href=&#34;https://diffify.com/R/XML/3.98-1.7/3.99-0.14&#34;&gt;new functions added and one removed&lt;/a&gt; since CRAN started maintaining it.&lt;/p&gt;
&lt;/div&gt;
&lt;div id=&#34;relevant-data&#34; class=&#34;section level2&#34;&gt;
&lt;h2&gt;Relevant data&lt;/h2&gt;
&lt;p&gt;We will look at 4 sets of data for each package: &lt;a href=&#34;#dependencies&#34;&gt;dependencies&lt;/a&gt;, &lt;a href=&#34;#releases&#34;&gt;releases&lt;/a&gt;, &lt;a href=&#34;#maintainers&#34;&gt;maintainers&lt;/a&gt; and &lt;a href=&#34;#downloads&#34;&gt;downloads&lt;/a&gt;.&lt;/p&gt;
&lt;div id=&#34;dependencies&#34; class=&#34;section level3&#34;&gt;
&lt;h3&gt;Dependencies&lt;/h3&gt;
&lt;p&gt;Both packages have some system dependencies, which might make the maintenance harder.
In addition they have a large number of reverse dependencies.
We can gather the packages that depend on them in CRAN and Bioconductor software packages:&lt;/p&gt;
&lt;pre class=&#34;r&#34;&gt;&lt;code&gt;library(&amp;quot;tools&amp;quot;)
# Look up only software dependencies in Bioconductor
options(repos = BiocManager::repositories()[c(&amp;quot;BioCsoft&amp;quot;, &amp;quot;CRAN&amp;quot;)])
ap &amp;lt;- available.packages()
all_deps &amp;lt;- package_dependencies(c(&amp;quot;RCurl&amp;quot;, &amp;quot;XML&amp;quot;), 
                                 reverse = TRUE, db = ap, which = &amp;quot;all&amp;quot;)
all_unique_deps &amp;lt;- unique(unlist(all_deps, FALSE, FALSE))
first_deps &amp;lt;- package_dependencies(all_unique_deps, db = ap, which = &amp;quot;all&amp;quot;)
first_deps_strong &amp;lt;- package_dependencies(all_unique_deps, db = ap, which = &amp;quot;strong&amp;quot;)
strong &amp;lt;- sapply(first_deps_strong, function(x){any(c(&amp;quot;XML&amp;quot;, &amp;quot;RCurl&amp;quot;) %in% x)})
deps_strong &amp;lt;- package_dependencies(all_unique_deps, recursive = TRUE, 
                                 db = ap, which = &amp;quot;strong&amp;quot;)
first_rdeps &amp;lt;- package_dependencies(all_unique_deps, 
                                   reverse = TRUE, db = ap, which = &amp;quot;all&amp;quot;)
deps_all &amp;lt;- package_dependencies(all_unique_deps, recursive = TRUE, 
                                 db = ap, which = &amp;quot;all&amp;quot;)&lt;/code&gt;&lt;/pre&gt;
&lt;p&gt;They have 495 direct dependencies (and 8 more in annotation packages in Bioconductor: recount3, ENCODExplorerData, UCSCRepeatMasker, gDNAinRNAseqData, qdap, qdapTools, metaboliteIDmapping and curatedBreastData).&lt;/p&gt;
&lt;p&gt;These two packages, with their dependencies, are used one way or another by around 20000 packages (about 90% of CRAN and Bioconductor)!
If these packages fail, the impact on the community would be huge.&lt;/p&gt;
&lt;p&gt;To reduce the impact of the dependencies we should look at the direct dependencies.
But we also looked at the reverse dependencies to assess the impact of the package on the other packages.&lt;/p&gt;
&lt;p&gt;Knowing which packages these are, and who maintains them, will help decide the best course of action.&lt;/p&gt;
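&lt;p&gt;The recursive reverse dependency count is what drives the ~20000 figure. On a toy forward-dependency graph the same closure can be computed by hand (hypothetical package names; the real computation uses &lt;code&gt;package_dependencies(..., reverse = TRUE, recursive = TRUE)&lt;/code&gt;):&lt;/p&gt;

```r
# Toy forward graph: each package maps to the packages it depends on.
deps = list(A = character(0), B = "A", C = "B", D = "A")

# Collect every package that depends on `pkg`, directly or transitively.
# (Assumes an acyclic graph, as strong dependencies on CRAN must be.)
reverse_closure = function(pkg, deps) {
  direct = names(deps)[vapply(deps, function(d) pkg %in% d, logical(1))]
  unique(c(direct, unlist(lapply(direct, reverse_closure, deps = deps))))
}

reverse_closure("A", deps)  # B, D and C: all of them rely on A
```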
&lt;/div&gt;
&lt;div id=&#34;releases&#34; class=&#34;section level3&#34;&gt;
&lt;h3&gt;Releases&lt;/h3&gt;
&lt;p&gt;A first approach is looking into the number of releases and their dates to assess whether the package has an active maintainer:&lt;/p&gt;
&lt;pre class=&#34;r&#34;&gt;&lt;code&gt;archive &amp;lt;- tools:::CRAN_archive_db()[all_unique_deps]
packages &amp;lt;- tools::CRAN_package_db()
library(&amp;quot;dplyr&amp;quot;)
library(&amp;quot;BiocPkgTools&amp;quot;)
fr &amp;lt;- vapply(archive, function(x) {
  if (is.null(x)) {
    return(NA)
  }
  as.Date(x$mtime[1])
}, FUN.VALUE = Sys.Date())
fr_bioc &amp;lt;- biocDownloadStats(&amp;quot;software&amp;quot;) |&amp;gt; 
  filter(Package %in% all_unique_deps) |&amp;gt; 
  firstInBioc() |&amp;gt; 
  pull(Date, name = Package)
first_release &amp;lt;- c(as.Date(fr[!is.na(fr)]), as.Date(fr_bioc))[all_unique_deps]
last_update &amp;lt;- packages$Published[match(all_unique_deps, packages$Package)]
releases &amp;lt;- vapply(archive, NROW, numeric(1L)) + 1&lt;/code&gt;&lt;/pre&gt;
&lt;p&gt;We only have this information for CRAN packages:&lt;br /&gt;
Bioconductor has two releases every year, and while maintainers can release patched versions of packages between them, that information is not stored (or not easily retrieved; the versions are still available on the &lt;a href=&#34;https://code.bioconductor.org&#34;&gt;git server&lt;/a&gt;).&lt;/p&gt;
&lt;p&gt;Even if Bioconductor maintainers didn’t modify the package, the version number increases with each release.
And a version update in git doesn’t propagate to users automatically unless the checks pass.
For all these reasons it doesn’t make sense to count releases of Bioconductor packages.&lt;/p&gt;
&lt;/div&gt;
&lt;div id=&#34;maintainers&#34; class=&#34;section level3&#34;&gt;
&lt;h3&gt;Maintainers&lt;/h3&gt;
&lt;p&gt;Now that we know which packages are more active, we can look at the people behind them.
This way we can prioritize working with maintainers that are known to be active&lt;a href=&#34;#fn1&#34; class=&#34;footnote-ref&#34; id=&#34;fnref1&#34;&gt;&lt;sup&gt;1&lt;/sup&gt;&lt;/a&gt;.&lt;/p&gt;
&lt;pre class=&#34;r&#34;&gt;&lt;code&gt;maintainers &amp;lt;- packages_db$Maintainer[match(all_unique_deps, packages_db$Package)]
maintainers &amp;lt;- trimws(gsub(&amp;quot;&amp;lt;.+&amp;gt;&amp;quot;, &amp;quot;&amp;quot;, maintainers))&lt;/code&gt;&lt;/pre&gt;
&lt;p&gt;Once again, the Bioconductor repository doesn’t provide a file to gather this kind of data.&lt;/p&gt;
&lt;/div&gt;
&lt;div id=&#34;downloads&#34; class=&#34;section level3&#34;&gt;
&lt;h3&gt;Downloads&lt;/h3&gt;
&lt;p&gt;Another variable we can use is the number of downloads of said packages.
Packages that are downloaded more are probably used more, so a breaking change in them will have an impact on more people than one in other packages.&lt;/p&gt;
&lt;pre class=&#34;r&#34;&gt;&lt;code&gt;library(&amp;quot;cranlogs&amp;quot;)
acd &amp;lt;- cran_downloads(intersect(all_unique_deps, packages_db$Package), 
                          when = &amp;quot;last-month&amp;quot;)
cran_pkg &amp;lt;- summarise(acd, downloads = sum(count), .by = package)
loc &amp;lt;- Sys.setlocale(locale = &amp;quot;C&amp;quot;)
bioc_d &amp;lt;- vapply(setdiff(all_unique_deps, packages_db$Package), function(x){
  pkg &amp;lt;- pkgDownloadStats(x)
  tail(pkg$Nb_of_downloads, 1)
  }, numeric(1L))
bioc_pkg &amp;lt;- data.frame(package = names(bioc_d), downloads = bioc_d)
downloads &amp;lt;- rbind(bioc_pkg, cran_pkg)
rownames(downloads) &amp;lt;- downloads$package
dwn &amp;lt;- downloads[all_unique_deps, ]&lt;/code&gt;&lt;/pre&gt;
&lt;p&gt;The logs are provided by the global mirror of CRAN (sponsored by RStudio).&lt;br /&gt;
The Bioconductor infrastructure provides the total number of downloads and the number of downloads from distinct IPs &lt;a href=&#34;#fn2&#34; class=&#34;footnote-ref&#34; id=&#34;fnref2&#34;&gt;&lt;sup&gt;2&lt;/sup&gt;&lt;/a&gt;.&lt;/p&gt;
&lt;/div&gt;
&lt;/div&gt;
&lt;div id=&#34;analysis&#34; class=&#34;section level2&#34;&gt;
&lt;h2&gt;Analysis&lt;/h2&gt;
&lt;p&gt;We collected the data that might be relevant.
Now we can start looking at all the data gathered:&lt;/p&gt;
&lt;pre class=&#34;r&#34;&gt;&lt;code&gt;repo &amp;lt;- vector(&amp;quot;character&amp;quot;, length(all_unique_deps))
ap_deps &amp;lt;- ap[all_unique_deps, ]
repo[startsWith(ap_deps[, &amp;quot;Repository&amp;quot;], &amp;quot;https://bioc&amp;quot;)] &amp;lt;- &amp;quot;Bioconductor&amp;quot;
repo[!startsWith(ap_deps[, &amp;quot;Repository&amp;quot;], &amp;quot;https://bioc&amp;quot;)] &amp;lt;- &amp;quot;CRAN&amp;quot;
deps &amp;lt;- data.frame(package = all_unique_deps,
                   direct_dep_XML = all_unique_deps %in% all_deps$XML,
                   direct_dep_RCurl = all_unique_deps %in% all_deps$RCurl,
                   first_deps_n = lengths(first_deps),
                   deps_all_n = lengths(deps_all),
                   first_rdeps_n = lengths(first_rdeps),
                   first_deps_strong_n = lengths(first_deps_strong), 
                   deps_strong_n = lengths(deps_strong),
                   direct_strong = strong, 
                   releases = releases,
                   strong = strong, 
                   first_release = first_release,
                   last_release = last_update,
                   maintainer = maintainers,
                   downloads = dwn$downloads,
                   repository = repo) |&amp;gt; 
  mutate(type = case_when(direct_dep_XML &amp;amp; direct_dep_RCurl ~ &amp;quot;both&amp;quot;,
                          direct_dep_XML ~ &amp;quot;XML&amp;quot;,
                          direct_dep_RCurl ~ &amp;quot;RCurl&amp;quot;))
rownames(deps) &amp;lt;- NULL
head(deps)&lt;/code&gt;&lt;/pre&gt;
&lt;table&gt;
&lt;colgroup&gt;
&lt;col width=&#34;8%&#34; /&gt;
&lt;col width=&#34;6%&#34; /&gt;
&lt;col width=&#34;7%&#34; /&gt;
&lt;col width=&#34;5%&#34; /&gt;
&lt;col width=&#34;5%&#34; /&gt;
&lt;col width=&#34;6%&#34; /&gt;
&lt;col width=&#34;9%&#34; /&gt;
&lt;col width=&#34;6%&#34; /&gt;
&lt;col width=&#34;6%&#34; /&gt;
&lt;col width=&#34;4%&#34; /&gt;
&lt;col width=&#34;3%&#34; /&gt;
&lt;col width=&#34;6%&#34; /&gt;
&lt;col width=&#34;5%&#34; /&gt;
&lt;col width=&#34;5%&#34; /&gt;
&lt;col width=&#34;4%&#34; /&gt;
&lt;col width=&#34;5%&#34; /&gt;
&lt;col width=&#34;2%&#34; /&gt;
&lt;/colgroup&gt;
&lt;thead&gt;
&lt;tr class=&#34;header&#34;&gt;
&lt;th align=&#34;left&#34;&gt;package&lt;/th&gt;
&lt;th align=&#34;left&#34;&gt;direct_dep_XML&lt;/th&gt;
&lt;th align=&#34;left&#34;&gt;direct_dep_RCurl&lt;/th&gt;
&lt;th align=&#34;right&#34;&gt;first_deps_n&lt;/th&gt;
&lt;th align=&#34;right&#34;&gt;deps_all_n&lt;/th&gt;
&lt;th align=&#34;right&#34;&gt;first_rdeps_n&lt;/th&gt;
&lt;th align=&#34;right&#34;&gt;first_deps_strong_n&lt;/th&gt;
&lt;th align=&#34;right&#34;&gt;deps_strong_n&lt;/th&gt;
&lt;th align=&#34;left&#34;&gt;direct_strong&lt;/th&gt;
&lt;th align=&#34;right&#34;&gt;releases&lt;/th&gt;
&lt;th align=&#34;left&#34;&gt;strong&lt;/th&gt;
&lt;th align=&#34;left&#34;&gt;first_release&lt;/th&gt;
&lt;th align=&#34;left&#34;&gt;last_release&lt;/th&gt;
&lt;th align=&#34;left&#34;&gt;maintainer&lt;/th&gt;
&lt;th align=&#34;right&#34;&gt;downloads&lt;/th&gt;
&lt;th align=&#34;left&#34;&gt;repository&lt;/th&gt;
&lt;th align=&#34;left&#34;&gt;type&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr class=&#34;odd&#34;&gt;
&lt;td align=&#34;left&#34;&gt;AnnotationForge&lt;/td&gt;
&lt;td align=&#34;left&#34;&gt;TRUE&lt;/td&gt;
&lt;td align=&#34;left&#34;&gt;TRUE&lt;/td&gt;
&lt;td align=&#34;right&#34;&gt;26&lt;/td&gt;
&lt;td align=&#34;right&#34;&gt;2456&lt;/td&gt;
&lt;td align=&#34;right&#34;&gt;5&lt;/td&gt;
&lt;td align=&#34;right&#34;&gt;10&lt;/td&gt;
&lt;td align=&#34;right&#34;&gt;47&lt;/td&gt;
&lt;td align=&#34;left&#34;&gt;TRUE&lt;/td&gt;
&lt;td align=&#34;right&#34;&gt;1&lt;/td&gt;
&lt;td align=&#34;left&#34;&gt;TRUE&lt;/td&gt;
&lt;td align=&#34;left&#34;&gt;2012-02-01&lt;/td&gt;
&lt;td align=&#34;left&#34;&gt;NA&lt;/td&gt;
&lt;td align=&#34;left&#34;&gt;NA&lt;/td&gt;
&lt;td align=&#34;right&#34;&gt;8113&lt;/td&gt;
&lt;td align=&#34;left&#34;&gt;Bioconductor&lt;/td&gt;
&lt;td align=&#34;left&#34;&gt;both&lt;/td&gt;
&lt;/tr&gt;
&lt;tr class=&#34;even&#34;&gt;
&lt;td align=&#34;left&#34;&gt;AnnotationHubData&lt;/td&gt;
&lt;td align=&#34;left&#34;&gt;TRUE&lt;/td&gt;
&lt;td align=&#34;left&#34;&gt;TRUE&lt;/td&gt;
&lt;td align=&#34;right&#34;&gt;33&lt;/td&gt;
&lt;td align=&#34;right&#34;&gt;2456&lt;/td&gt;
&lt;td align=&#34;right&#34;&gt;4&lt;/td&gt;
&lt;td align=&#34;right&#34;&gt;26&lt;/td&gt;
&lt;td align=&#34;right&#34;&gt;136&lt;/td&gt;
&lt;td align=&#34;left&#34;&gt;TRUE&lt;/td&gt;
&lt;td align=&#34;right&#34;&gt;1&lt;/td&gt;
&lt;td align=&#34;left&#34;&gt;TRUE&lt;/td&gt;
&lt;td align=&#34;left&#34;&gt;2015-02-01&lt;/td&gt;
&lt;td align=&#34;left&#34;&gt;NA&lt;/td&gt;
&lt;td align=&#34;left&#34;&gt;NA&lt;/td&gt;
&lt;td align=&#34;right&#34;&gt;6619&lt;/td&gt;
&lt;td align=&#34;left&#34;&gt;Bioconductor&lt;/td&gt;
&lt;td align=&#34;left&#34;&gt;both&lt;/td&gt;
&lt;/tr&gt;
&lt;tr class=&#34;odd&#34;&gt;
&lt;td align=&#34;left&#34;&gt;autonomics&lt;/td&gt;
&lt;td align=&#34;left&#34;&gt;FALSE&lt;/td&gt;
&lt;td align=&#34;left&#34;&gt;TRUE&lt;/td&gt;
&lt;td align=&#34;right&#34;&gt;61&lt;/td&gt;
&lt;td align=&#34;right&#34;&gt;2499&lt;/td&gt;
&lt;td align=&#34;right&#34;&gt;0&lt;/td&gt;
&lt;td align=&#34;right&#34;&gt;34&lt;/td&gt;
&lt;td align=&#34;right&#34;&gt;104&lt;/td&gt;
&lt;td align=&#34;left&#34;&gt;FALSE&lt;/td&gt;
&lt;td align=&#34;right&#34;&gt;1&lt;/td&gt;
&lt;td align=&#34;left&#34;&gt;FALSE&lt;/td&gt;
&lt;td align=&#34;left&#34;&gt;2021-02-01&lt;/td&gt;
&lt;td align=&#34;left&#34;&gt;NA&lt;/td&gt;
&lt;td align=&#34;left&#34;&gt;NA&lt;/td&gt;
&lt;td align=&#34;right&#34;&gt;91&lt;/td&gt;
&lt;td align=&#34;left&#34;&gt;Bioconductor&lt;/td&gt;
&lt;td align=&#34;left&#34;&gt;RCurl&lt;/td&gt;
&lt;/tr&gt;
&lt;tr class=&#34;even&#34;&gt;
&lt;td align=&#34;left&#34;&gt;BaseSpaceR&lt;/td&gt;
&lt;td align=&#34;left&#34;&gt;FALSE&lt;/td&gt;
&lt;td align=&#34;left&#34;&gt;TRUE&lt;/td&gt;
&lt;td align=&#34;right&#34;&gt;6&lt;/td&gt;
&lt;td align=&#34;right&#34;&gt;2456&lt;/td&gt;
&lt;td align=&#34;right&#34;&gt;0&lt;/td&gt;
&lt;td align=&#34;right&#34;&gt;3&lt;/td&gt;
&lt;td align=&#34;right&#34;&gt;4&lt;/td&gt;
&lt;td align=&#34;left&#34;&gt;TRUE&lt;/td&gt;
&lt;td align=&#34;right&#34;&gt;1&lt;/td&gt;
&lt;td align=&#34;left&#34;&gt;TRUE&lt;/td&gt;
&lt;td align=&#34;left&#34;&gt;2013-02-01&lt;/td&gt;
&lt;td align=&#34;left&#34;&gt;NA&lt;/td&gt;
&lt;td align=&#34;left&#34;&gt;NA&lt;/td&gt;
&lt;td align=&#34;right&#34;&gt;218&lt;/td&gt;
&lt;td align=&#34;left&#34;&gt;Bioconductor&lt;/td&gt;
&lt;td align=&#34;left&#34;&gt;RCurl&lt;/td&gt;
&lt;/tr&gt;
&lt;tr class=&#34;odd&#34;&gt;
&lt;td align=&#34;left&#34;&gt;BayesSpace&lt;/td&gt;
&lt;td align=&#34;left&#34;&gt;FALSE&lt;/td&gt;
&lt;td align=&#34;left&#34;&gt;TRUE&lt;/td&gt;
&lt;td align=&#34;right&#34;&gt;34&lt;/td&gt;
&lt;td align=&#34;right&#34;&gt;2459&lt;/td&gt;
&lt;td align=&#34;right&#34;&gt;0&lt;/td&gt;
&lt;td align=&#34;right&#34;&gt;24&lt;/td&gt;
&lt;td align=&#34;right&#34;&gt;161&lt;/td&gt;
&lt;td align=&#34;left&#34;&gt;TRUE&lt;/td&gt;
&lt;td align=&#34;right&#34;&gt;1&lt;/td&gt;
&lt;td align=&#34;left&#34;&gt;TRUE&lt;/td&gt;
&lt;td align=&#34;left&#34;&gt;2020-02-01&lt;/td&gt;
&lt;td align=&#34;left&#34;&gt;NA&lt;/td&gt;
&lt;td align=&#34;left&#34;&gt;NA&lt;/td&gt;
&lt;td align=&#34;right&#34;&gt;221&lt;/td&gt;
&lt;td align=&#34;left&#34;&gt;Bioconductor&lt;/td&gt;
&lt;td align=&#34;left&#34;&gt;RCurl&lt;/td&gt;
&lt;/tr&gt;
&lt;tr class=&#34;even&#34;&gt;
&lt;td align=&#34;left&#34;&gt;BgeeDB&lt;/td&gt;
&lt;td align=&#34;left&#34;&gt;FALSE&lt;/td&gt;
&lt;td align=&#34;left&#34;&gt;TRUE&lt;/td&gt;
&lt;td align=&#34;right&#34;&gt;19&lt;/td&gt;
&lt;td align=&#34;right&#34;&gt;2457&lt;/td&gt;
&lt;td align=&#34;right&#34;&gt;2&lt;/td&gt;
&lt;td align=&#34;right&#34;&gt;14&lt;/td&gt;
&lt;td align=&#34;right&#34;&gt;71&lt;/td&gt;
&lt;td align=&#34;left&#34;&gt;TRUE&lt;/td&gt;
&lt;td align=&#34;right&#34;&gt;1&lt;/td&gt;
&lt;td align=&#34;left&#34;&gt;TRUE&lt;/td&gt;
&lt;td align=&#34;left&#34;&gt;2016-02-01&lt;/td&gt;
&lt;td align=&#34;left&#34;&gt;NA&lt;/td&gt;
&lt;td align=&#34;left&#34;&gt;NA&lt;/td&gt;
&lt;td align=&#34;right&#34;&gt;238&lt;/td&gt;
&lt;td align=&#34;left&#34;&gt;Bioconductor&lt;/td&gt;
&lt;td align=&#34;left&#34;&gt;RCurl&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;
&lt;p&gt;I added some numbers and logical values that might help explore this data.&lt;/p&gt;
&lt;p&gt;We will look at the &lt;a href=&#34;#distribution-dependencies&#34;&gt;distribution of dependencies between RCurl and XML&lt;/a&gt; and some plots to get a &lt;a href=&#34;#overview&#34;&gt;quick overview&lt;/a&gt;.&lt;/p&gt;
&lt;div id=&#34;distribution-dependencies&#34; class=&#34;section level3&#34;&gt;
&lt;h3&gt;Distribution dependencies&lt;/h3&gt;
&lt;p&gt;Let’s see how many packages depend on each of them:&lt;/p&gt;
&lt;pre class=&#34;r&#34;&gt;&lt;code&gt;deps |&amp;gt; 
  summarise(Packages = n(), deps = sum(first_deps_n),
            q25 = quantile(deps_all_n, probs = 0.25),
            mean_all = mean(deps_all_n),
            q75 = quantile(deps_all_n, probs = 0.75),
            .by = c(direct_dep_XML, direct_dep_RCurl)) |&amp;gt; 
  arrange(-Packages)&lt;/code&gt;&lt;/pre&gt;
&lt;table style=&#34;width:100%;&#34;&gt;
&lt;colgroup&gt;
&lt;col width=&#34;22%&#34; /&gt;
&lt;col width=&#34;25%&#34; /&gt;
&lt;col width=&#34;13%&#34; /&gt;
&lt;col width=&#34;7%&#34; /&gt;
&lt;col width=&#34;7%&#34; /&gt;
&lt;col width=&#34;13%&#34; /&gt;
&lt;col width=&#34;10%&#34; /&gt;
&lt;/colgroup&gt;
&lt;thead&gt;
&lt;tr class=&#34;header&#34;&gt;
&lt;th align=&#34;left&#34;&gt;direct_dep_XML&lt;/th&gt;
&lt;th align=&#34;left&#34;&gt;direct_dep_RCurl&lt;/th&gt;
&lt;th align=&#34;right&#34;&gt;Packages&lt;/th&gt;
&lt;th align=&#34;right&#34;&gt;deps&lt;/th&gt;
&lt;th align=&#34;right&#34;&gt;q25&lt;/th&gt;
&lt;th align=&#34;right&#34;&gt;mean_all&lt;/th&gt;
&lt;th align=&#34;right&#34;&gt;q75&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr class=&#34;odd&#34;&gt;
&lt;td align=&#34;left&#34;&gt;TRUE&lt;/td&gt;
&lt;td align=&#34;left&#34;&gt;FALSE&lt;/td&gt;
&lt;td align=&#34;right&#34;&gt;235&lt;/td&gt;
&lt;td align=&#34;right&#34;&gt;3584&lt;/td&gt;
&lt;td align=&#34;right&#34;&gt;2456&lt;/td&gt;
&lt;td align=&#34;right&#34;&gt;2365.596&lt;/td&gt;
&lt;td align=&#34;right&#34;&gt;2458.5&lt;/td&gt;
&lt;/tr&gt;
&lt;tr class=&#34;even&#34;&gt;
&lt;td align=&#34;left&#34;&gt;FALSE&lt;/td&gt;
&lt;td align=&#34;left&#34;&gt;TRUE&lt;/td&gt;
&lt;td align=&#34;right&#34;&gt;193&lt;/td&gt;
&lt;td align=&#34;right&#34;&gt;3187&lt;/td&gt;
&lt;td align=&#34;right&#34;&gt;2456&lt;/td&gt;
&lt;td align=&#34;right&#34;&gt;2320.855&lt;/td&gt;
&lt;td align=&#34;right&#34;&gt;2460.0&lt;/td&gt;
&lt;/tr&gt;
&lt;tr class=&#34;odd&#34;&gt;
&lt;td align=&#34;left&#34;&gt;TRUE&lt;/td&gt;
&lt;td align=&#34;left&#34;&gt;TRUE&lt;/td&gt;
&lt;td align=&#34;right&#34;&gt;67&lt;/td&gt;
&lt;td align=&#34;right&#34;&gt;1216&lt;/td&gt;
&lt;td align=&#34;right&#34;&gt;2456&lt;/td&gt;
&lt;td align=&#34;right&#34;&gt;2423.119&lt;/td&gt;
&lt;td align=&#34;right&#34;&gt;2457.5&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;
&lt;p&gt;There are ~40 more packages depending on XML than on RCurl, and just 67 depend on both of them.&lt;/p&gt;
&lt;/div&gt;
&lt;div id=&#34;overview&#34; class=&#34;section level3&#34;&gt;
&lt;h3&gt;Overview&lt;/h3&gt;
&lt;p&gt;We can plot some variables to get a quick overview of the packages:&lt;/p&gt;
&lt;pre class=&#34;r&#34;&gt;&lt;code&gt;library(&amp;quot;ggplot2&amp;quot;)
library(&amp;quot;ggrepel&amp;quot;)
deps_wo &amp;lt;- filter(deps, !package %in% c(&amp;quot;XML&amp;quot;, &amp;quot;RCurl&amp;quot;))
deps_wo |&amp;gt; 
  ggplot() +
  geom_point(aes(first_deps_n, downloads, shape = type)) +
  geom_text_repel(aes(first_deps_n, downloads, label = package),
                  data = filter(deps_wo, first_deps_n &amp;gt; 40 | downloads &amp;gt; 10^5)) +
  theme_minimal() +
  scale_y_log10(labels = scales::label_log()) +
  labs(title = &amp;quot;Packages and downloads&amp;quot;, 
       x = &amp;quot;Direct dependencies&amp;quot;, y = &amp;quot;Downloads&amp;quot;, size = &amp;quot;Packages&amp;quot;)
## Warning: ggrepel: 1 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps&lt;/code&gt;&lt;/pre&gt;
&lt;div class=&#34;figure&#34;&gt;&lt;span style=&#34;display:block;&#34; id=&#34;fig:plot1&#34;&gt;&lt;/span&gt;
&lt;img src=&#34;https://llrs.dev/post/2023/05/03/cran-maintained-packages/index.en_files/figure-html/plot1-1.png&#34; alt=&#34;Direct dependencies vs downloads. Many packages have up to 50 direct dependencies and most have below 1000 downloads in a month.&#34; width=&#34;672&#34; /&gt;
&lt;p class=&#34;caption&#34;&gt;
Figure 1: Direct dependencies vs downloads. Many packages have up to 50 direct dependencies and most have below 1000 downloads in a month.
&lt;/p&gt;
&lt;/div&gt;
&lt;p&gt;There is an outlier in Figure &lt;a href=&#34;#fig:plot1&#34;&gt;1&lt;/a&gt;: the mlr package has more than 10k downloads and close to 120 direct dependencies, but fewer than 15 strong dependencies!&lt;/p&gt;
&lt;pre class=&#34;r&#34;&gt;&lt;code&gt;deps_wo |&amp;gt; 
  ggplot() +
  geom_point(aes(first_deps_n, first_rdeps_n, shape = type)) +
  geom_text_repel(aes(first_deps_n, first_rdeps_n, label = package),
                  data = filter(deps_wo, first_deps_n &amp;gt; 60 | first_rdeps_n &amp;gt; 50)) +
  theme_minimal() +
  scale_y_log10(labels = scales::label_log()) +
  labs(title = &amp;quot;Few dependencies but lots of dependents&amp;quot;,
    x = &amp;quot;Direct dependencies&amp;quot;, y = &amp;quot;Depend on them&amp;quot;, size = &amp;quot;Packages&amp;quot;)
## Warning: Transformation introduced infinite values in continuous y-axis
## Transformation introduced infinite values in continuous y-axis&lt;/code&gt;&lt;/pre&gt;
&lt;div class=&#34;figure&#34;&gt;&lt;span style=&#34;display:block;&#34; id=&#34;fig:plot2&#34;&gt;&lt;/span&gt;
&lt;img src=&#34;https://llrs.dev/post/2023/05/03/cran-maintained-packages/index.en_files/figure-html/plot2-1.png&#34; alt=&#34;Dependencies vs packages that depend on them. &#34; width=&#34;672&#34; /&gt;
&lt;p class=&#34;caption&#34;&gt;
Figure 2: Dependencies vs packages that depend on them.
&lt;/p&gt;
&lt;/div&gt;
&lt;p&gt;In general though, the packages that more packages depend on have fewer direct dependencies themselves.&lt;/p&gt;
&lt;pre class=&#34;r&#34;&gt;&lt;code&gt;library(&amp;quot;ggplot2&amp;quot;)
library(&amp;quot;ggrepel&amp;quot;)
deps_wo &amp;lt;- filter(deps, !package %in% c(&amp;quot;XML&amp;quot;, &amp;quot;RCurl&amp;quot;))
deps_wo |&amp;gt; 
  ggplot() +
  geom_vline(xintercept = 20, linetype = 2) +
  geom_point(aes(first_deps_strong_n, downloads, shape = repository)) +
  geom_text_repel(aes(first_deps_strong_n, downloads, label = package),
                  data = filter(deps_wo, first_deps_strong_n &amp;gt; 20 | downloads &amp;gt; 10^5)) +
  theme_minimal() +
  scale_y_log10(labels = scales::label_log()) +
  labs(title = &amp;quot;Packages and downloads&amp;quot;, 
       x = &amp;quot;Direct strong dependencies&amp;quot;, y = &amp;quot;Downloads&amp;quot;, shape = &amp;quot;Repository&amp;quot;)
## Warning: ggrepel: 20 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps&lt;/code&gt;&lt;/pre&gt;
&lt;div class=&#34;figure&#34;&gt;&lt;span style=&#34;display:block;&#34; id=&#34;fig:plot3&#34;&gt;&lt;/span&gt;
&lt;img src=&#34;https://llrs.dev/post/2023/05/03/cran-maintained-packages/index.en_files/figure-html/plot3-1.png&#34; alt=&#34;Direct strong dependencies vs downloads. Many packages have more than 20 direct imports.&#34; width=&#34;672&#34; /&gt;
&lt;p class=&#34;caption&#34;&gt;
Figure 3: Direct strong dependencies vs downloads. Many packages have more than 20 direct imports.
&lt;/p&gt;
&lt;/div&gt;
&lt;p&gt;One observable effect is that many packages do not comply with the current CRAN limit of 20 strong dependencies (as &lt;a href=&#34;https://cran.r-project.org/doc/manuals/r-devel/R-ints.html#index-_005fR_005fCHECK_005fEXCESSIVE_005fIMPORTS_005f&#34;&gt;described in R Internals&lt;/a&gt;).
This suggests that these CRAN packages are old or that this limit is not checked on package updates.&lt;/p&gt;
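&lt;p&gt;Flagging the offenders is straightforward once the strong dependency counts are computed, as done earlier with &lt;code&gt;package_dependencies()&lt;/code&gt;. A sketch with hypothetical counts and package names:&lt;/p&gt;

```r
# Strong (Depends/Imports/LinkingTo) dependency counts per package, with
# the same shape as lengths(package_dependencies(..., which = "strong")).
strong_counts = c(pkgA = 35, pkgB = 12, pkgC = 21)

# _R_CHECK_EXCESSIVE_IMPORTS_ can make R CMD check complain above a
# threshold; 20 is the limit mentioned in R Internals.
names(strong_counts)[strong_counts > 20]  # "pkgA" "pkgC"
```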
&lt;pre class=&#34;r&#34;&gt;&lt;code&gt;data_maintainers &amp;lt;- deps_wo |&amp;gt; 
  filter(!is.na(maintainer)) |&amp;gt; 
  summarize(n = n(), downloads = sum(downloads), .by = maintainer)
data_maintainers |&amp;gt; 
  ggplot() +
  geom_point(aes(n, downloads)) +
  geom_text_repel(aes(n, downloads, label = maintainer),
                  data = filter(data_maintainers, n &amp;gt; 2 | downloads &amp;gt; 10^4)) +
  scale_y_log10(labels = scales::label_log()) +
  scale_x_continuous(breaks = 1:10, minor_breaks = NULL) +
  theme_minimal() +
  labs(title = &amp;quot;CRAN maintainers that depend on XML and RCurl&amp;quot;,
       x = &amp;quot;Packages&amp;quot;, y = &amp;quot;Downloads&amp;quot;)
## Warning: ggrepel: 15 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps&lt;/code&gt;&lt;/pre&gt;
&lt;div class=&#34;figure&#34;&gt;&lt;span style=&#34;display:block;&#34; id=&#34;fig:plot-maintainers&#34;&gt;&lt;/span&gt;
&lt;img src=&#34;https://llrs.dev/post/2023/05/03/cran-maintained-packages/index.en_files/figure-html/plot-maintainers-1.png&#34; alt=&#34;Looking at maintainers and the number of downloads they have.&#34; width=&#34;672&#34; /&gt;
&lt;p class=&#34;caption&#34;&gt;
Figure 4: Looking at maintainers and the number of downloads they have.
&lt;/p&gt;
&lt;/div&gt;
&lt;p&gt;Most maintainers have few packages, some of them highly used, but some maintain many relatively highly used packages.&lt;/p&gt;
&lt;/div&gt;
&lt;div id=&#34;finding-important-packages&#34; class=&#34;section level3&#34;&gt;
&lt;h3&gt;Finding important packages&lt;/h3&gt;
&lt;p&gt;We can use a PCA to find which packages are more important.&lt;/p&gt;
&lt;pre class=&#34;r&#34;&gt;&lt;code&gt;cols_pca &amp;lt;-  c(4:7, 15)
pca_all &amp;lt;- prcomp(deps_wo[, cols_pca], scale. = TRUE, center = TRUE)
summary(pca_all)
## Importance of components:
##                          PC1    PC2    PC3     PC4     PC5
## Standard deviation     1.386 1.2478 0.9458 0.65380 0.44846
## Proportion of Variance 0.384 0.3114 0.1789 0.08549 0.04022
## Cumulative Proportion  0.384 0.6954 0.8743 0.95978 1.00000
pca_data &amp;lt;- cbind(pca_all$x, deps_wo)
ggplot(pca_data) +
  geom_hline(yintercept = 0) +
  geom_vline(xintercept = 0) +
  geom_point(aes(PC1, PC2, col = repository, shape = repository)) +
  geom_text_repel(aes(PC1, PC2, label = package), 
                  data = filter(pca_data, abs(PC1) &amp;gt; 2 | abs(PC2) &amp;gt; 2)) +
  theme_minimal() +
  theme(axis.text = element_blank()) +
  labs(title = &amp;quot;PCA of the numeric variables&amp;quot;)
## Warning: ggrepel: 58 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps&lt;/code&gt;&lt;/pre&gt;
&lt;div class=&#34;figure&#34;&gt;&lt;span style=&#34;display:block;&#34; id=&#34;fig:pca-all&#34;&gt;&lt;/span&gt;
&lt;img src=&#34;https://llrs.dev/post/2023/05/03/cran-maintained-packages/index.en_files/figure-html/pca-all-1.png&#34; alt=&#34;PCA of all packages.&#34; width=&#34;672&#34; /&gt;
&lt;p class=&#34;caption&#34;&gt;
Figure 5: PCA of all packages.
&lt;/p&gt;
&lt;/div&gt;
&lt;p&gt;In the first principal component we can see the packages that depend on many packages; the second one captures packages with many downloads and/or many packages depending on them, as shown by the &lt;code&gt;rotation&lt;/code&gt;:&lt;/p&gt;
&lt;pre class=&#34;r&#34;&gt;&lt;code&gt;pca_all$rotation[, 1:2]&lt;/code&gt;&lt;/pre&gt;
&lt;table&gt;
&lt;thead&gt;
&lt;tr class=&#34;header&#34;&gt;
&lt;th align=&#34;left&#34;&gt;&lt;/th&gt;
&lt;th align=&#34;right&#34;&gt;PC1&lt;/th&gt;
&lt;th align=&#34;right&#34;&gt;PC2&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr class=&#34;odd&#34;&gt;
&lt;td align=&#34;left&#34;&gt;first_deps_n&lt;/td&gt;
&lt;td align=&#34;right&#34;&gt;-0.6521642&lt;/td&gt;
&lt;td align=&#34;right&#34;&gt;-0.1528947&lt;/td&gt;
&lt;/tr&gt;
&lt;tr class=&#34;even&#34;&gt;
&lt;td align=&#34;left&#34;&gt;deps_all_n&lt;/td&gt;
&lt;td align=&#34;right&#34;&gt;-0.3304698&lt;/td&gt;
&lt;td align=&#34;right&#34;&gt;-0.0549046&lt;/td&gt;
&lt;/tr&gt;
&lt;tr class=&#34;odd&#34;&gt;
&lt;td align=&#34;left&#34;&gt;first_rdeps_n&lt;/td&gt;
&lt;td align=&#34;right&#34;&gt;0.1235972&lt;/td&gt;
&lt;td align=&#34;right&#34;&gt;-0.6948659&lt;/td&gt;
&lt;/tr&gt;
&lt;tr class=&#34;even&#34;&gt;
&lt;td align=&#34;left&#34;&gt;first_deps_strong_n&lt;/td&gt;
&lt;td align=&#34;right&#34;&gt;-0.6606765&lt;/td&gt;
&lt;td align=&#34;right&#34;&gt;-0.0750116&lt;/td&gt;
&lt;/tr&gt;
&lt;tr class=&#34;odd&#34;&gt;
&lt;td align=&#34;left&#34;&gt;downloads&lt;/td&gt;
&lt;td align=&#34;right&#34;&gt;0.1170554&lt;/td&gt;
&lt;td align=&#34;right&#34;&gt;-0.6965223&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;
&lt;p&gt;More important are the packages named in Figure &lt;a href=&#34;#fig:pca-all&#34;&gt;5&lt;/a&gt;: RUnit, markdown and rgeos have a high number of downloads and many packages depend on them one way or another.&lt;/p&gt;
&lt;p&gt;However we can focus on packages that without RCurl or XML wouldn’t work:&lt;/p&gt;
&lt;pre class=&#34;r&#34;&gt;&lt;code&gt;pca_strong &amp;lt;- prcomp(deps_wo[deps_wo$strong, cols_pca], 
                     scale. = TRUE, center = TRUE)
summary(pca_strong)
## Importance of components:
##                           PC1    PC2    PC3     PC4     PC5
## Standard deviation     1.4198 1.3005 0.9373 0.49421 0.41258
## Proportion of Variance 0.4032 0.3382 0.1757 0.04885 0.03404
## Cumulative Proportion  0.4032 0.7414 0.9171 0.96596 1.00000
pca_data_strong &amp;lt;- cbind(pca_strong$x, deps_wo[deps_wo$strong, ])
ggplot(pca_data_strong) +
  geom_hline(yintercept = 0) +
  geom_vline(xintercept = 0) +
  geom_point(aes(PC1, PC2, col = repository, shape = repository)) +
    geom_text_repel(aes(PC1, PC2, label = package), 
                  data = filter(pca_data_strong, abs(PC1) &amp;gt; 2 | abs(PC2) &amp;gt; 2)) +
  theme_minimal() +
  theme(axis.text = element_blank()) +
  labs(title = &amp;quot;Important packages depending on XML and RCurl&amp;quot;, 
       subtitle = &amp;quot;PCA of numeric variables of strong dependencies&amp;quot;,
       col = &amp;quot;Repository&amp;quot;, shape = &amp;quot;Repository&amp;quot;)
## Warning: ggrepel: 42 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps&lt;/code&gt;&lt;/pre&gt;
&lt;div class=&#34;figure&#34;&gt;&lt;span style=&#34;display:block;&#34; id=&#34;fig:pca-strong&#34;&gt;&lt;/span&gt;
&lt;img src=&#34;https://llrs.dev/post/2023/05/03/cran-maintained-packages/index.en_files/figure-html/pca-strong-1.png&#34; alt=&#34;PCA of packages with strong dependency to XML or RCurl.&#34; width=&#34;672&#34; /&gt;
&lt;p class=&#34;caption&#34;&gt;
Figure 6: PCA of packages with strong dependency to XML or RCurl.
&lt;/p&gt;
&lt;/div&gt;
&lt;p&gt;The main packages that depend on XML and RCurl are from Bioconductor, followed by mlr and rlist.
rlist depends on XML but only uses 3 functions from it.
mlr uses 5 different functions from XML.&lt;/p&gt;
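&lt;p&gt;One quick proxy to check how much a package relies on XML is to look at what it imports from it. A minimal sketch (this requires the package to be installed, and it only counts declared imports, not actual calls in the code):&lt;/p&gt;
&lt;pre class=&#34;r&#34;&gt;&lt;code&gt;# List the functions that rlist imports from the XML namespace.
imp &amp;lt;- getNamespaceImports(&amp;quot;rlist&amp;quot;)
unlist(imp[names(imp) == &amp;quot;XML&amp;quot;])&lt;/code&gt;&lt;/pre&gt;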
&lt;pre class=&#34;r&#34;&gt;&lt;code&gt;pca_weak &amp;lt;- prcomp(deps_wo[!deps_wo$strong, cols_pca], 
                   scale. = TRUE, center = TRUE)
summary(pca_weak)
## Importance of components:
##                           PC1    PC2    PC3     PC4     PC5
## Standard deviation     1.4500 1.1578 0.9901 0.63980 0.40895
## Proportion of Variance 0.4205 0.2681 0.1960 0.08187 0.03345
## Cumulative Proportion  0.4205 0.6886 0.8847 0.96655 1.00000
pca_data_weak &amp;lt;- cbind(pca_weak$x, deps_wo[!deps_wo$strong, ])
ggplot(pca_data_weak) +
  geom_hline(yintercept = 0) +
  geom_vline(xintercept = 0) +
  geom_point(aes(PC1, PC2, col = type, shape = type)) +
  geom_text_repel(aes(PC1, PC2, label = package), 
                  data = filter(pca_data_weak, abs(PC1)&amp;gt; 2 | abs(PC2) &amp;gt; 2)) +
  theme_minimal() +
  theme(axis.text = element_blank()) +
  labs(title = &amp;quot;PCA of packages in CRAN&amp;quot;, col = &amp;quot;Type&amp;quot;, shape = &amp;quot;Type&amp;quot;)&lt;/code&gt;&lt;/pre&gt;
&lt;div class=&#34;figure&#34;&gt;&lt;span style=&#34;display:block;&#34; id=&#34;fig:pca-weak&#34;&gt;&lt;/span&gt;
&lt;img src=&#34;https://llrs.dev/post/2023/05/03/cran-maintained-packages/index.en_files/figure-html/pca-weak-1.png&#34; alt=&#34;Packages with weak dependency to XML or RCurl.&#34; width=&#34;672&#34; /&gt;
&lt;p class=&#34;caption&#34;&gt;
Figure 7: Packages with weak dependency to XML or RCurl.
&lt;/p&gt;
&lt;/div&gt;
&lt;pre class=&#34;r&#34;&gt;&lt;code&gt;keep &amp;lt;- deps_wo$repository == &amp;quot;CRAN&amp;quot; &amp;amp; deps_wo$strong
pca_cran &amp;lt;- prcomp(deps_wo[keep, cols_pca], 
                     scale. = TRUE, center = TRUE)
summary(pca_cran)
## Importance of components:
##                           PC1    PC2    PC3     PC4     PC5
## Standard deviation     1.4174 1.3060 0.9244 0.51813 0.40278
## Proportion of Variance 0.4018 0.3412 0.1709 0.05369 0.03245
## Cumulative Proportion  0.4018 0.7430 0.9139 0.96755 1.00000
pca_data_strong &amp;lt;- cbind(pca_cran$x, deps_wo[keep, ])
ggplot(pca_data_strong) +
  geom_hline(yintercept = 0) +
  geom_vline(xintercept = 0) +
  geom_point(aes(PC1, PC2, col = type, shape = type)) +
    geom_text_repel(aes(PC1, PC2, label = package), 
                  data = filter(pca_data_strong, abs(PC1) &amp;gt; 2 | abs(PC2) &amp;gt; 2)) +
  theme_minimal() +
  theme(axis.text = element_blank()) +
  labs(title = &amp;quot;Packages in CRAN&amp;quot;, 
       col = &amp;quot;Type&amp;quot;, shape = &amp;quot;Type&amp;quot;)
## Warning: ggrepel: 26 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps&lt;/code&gt;&lt;/pre&gt;
&lt;div class=&#34;figure&#34;&gt;&lt;span style=&#34;display:block;&#34; id=&#34;fig:pca-cran&#34;&gt;&lt;/span&gt;
&lt;img src=&#34;https://llrs.dev/post/2023/05/03/cran-maintained-packages/index.en_files/figure-html/pca-cran-1.png&#34; alt=&#34;PCA of packages on CRAN.&#34; width=&#34;672&#34; /&gt;
&lt;p class=&#34;caption&#34;&gt;
Figure 8: PCA of packages on CRAN.
&lt;/p&gt;
&lt;/div&gt;
&lt;pre class=&#34;r&#34;&gt;&lt;code&gt;keep &amp;lt;- deps_wo$repository == &amp;quot;Bioconductor&amp;quot;  &amp;amp; deps_wo$strong
pca_bioc &amp;lt;- prcomp(deps_wo[keep, cols_pca], 
                     scale. = TRUE, center = TRUE)
summary(pca_bioc)
## Importance of components:
##                           PC1    PC2    PC3     PC4     PC5
## Standard deviation     1.4913 1.3703 0.8495 0.33584 0.25281
## Proportion of Variance 0.4448 0.3755 0.1443 0.02256 0.01278
## Cumulative Proportion  0.4448 0.8203 0.9647 0.98722 1.00000
pca_data_strong &amp;lt;- cbind(pca_bioc$x, deps_wo[keep, ])
ggplot(pca_data_strong) +
  geom_hline(yintercept = 0) +
  geom_vline(xintercept = 0) +
  geom_point(aes(PC1, PC2, col = type, shape = type)) +
    geom_text_repel(aes(PC1, PC2, label = package), 
                  data = filter(pca_data_strong, abs(PC1) &amp;gt; 2 | abs(PC2) &amp;gt; 2)) +
  theme_minimal() +
  theme(axis.text = element_blank()) +
  labs(title = &amp;quot;Packages in Bioconductor&amp;quot;, 
       subtitle = &amp;quot;PCA of numeric variables of strong dependencies&amp;quot;,
       col = &amp;quot;Type&amp;quot;, shape = &amp;quot;Type&amp;quot;)
## Warning: ggrepel: 4 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps&lt;/code&gt;&lt;/pre&gt;
&lt;div class=&#34;figure&#34;&gt;&lt;span style=&#34;display:block;&#34; id=&#34;fig:pca-bioc&#34;&gt;&lt;/span&gt;
&lt;img src=&#34;https://llrs.dev/post/2023/05/03/cran-maintained-packages/index.en_files/figure-html/pca-bioc-1.png&#34; alt=&#34;PCA of packages on Bioconductor.&#34; width=&#34;672&#34; /&gt;
&lt;p class=&#34;caption&#34;&gt;
Figure 9: PCA of packages on Bioconductor.
&lt;/p&gt;
&lt;/div&gt;
&lt;p&gt;GenomeInfoDb seems to be the most important package, and it only uses the &lt;code&gt;RCurl::getURL&lt;/code&gt; function.&lt;/p&gt;
&lt;/div&gt;
&lt;/div&gt;
&lt;div id=&#34;outro&#34; class=&#34;section level2&#34;&gt;
&lt;h2&gt;Outro&lt;/h2&gt;
&lt;p&gt;I wanted to explore a bit how these packages got into this position &lt;a href=&#34;#fn3&#34; class=&#34;footnote-ref&#34; id=&#34;fnref3&#34;&gt;&lt;sup&gt;3&lt;/sup&gt;&lt;/a&gt;.&lt;/p&gt;
&lt;pre class=&#34;r&#34;&gt;&lt;code&gt;deps |&amp;gt; 
  filter(strong) |&amp;gt; 
  ggplot() +
  geom_vline(xintercept = as.Date(&amp;quot;2013-06-15&amp;quot;), linetype = 2) +
  geom_point(aes(first_release, downloads, col = type, shape = type, 
                 size = first_deps_strong_n)) +
  geom_label(aes(first_release, downloads, label = package),
             data = filter(deps, package %in% c(&amp;quot;XML&amp;quot;, &amp;quot;RCurl&amp;quot;)), show.legend = FALSE) +
  theme_minimal() +
  scale_y_log10(labels = scales::label_log()) +
  annotate(&amp;quot;text&amp;quot;, x = as.Date(&amp;quot;2014-6-15&amp;quot;), y = 5*10^5, 
           label = &amp;quot;CRAN maintained&amp;quot;, hjust = 0) +
  labs(x = &amp;quot;Release date&amp;quot;, y = &amp;quot;Downloads&amp;quot;, 
       title = &amp;quot;More packages added after CRAN maintenance than before&amp;quot;,
       subtitle = &amp;quot;Release date and downloads&amp;quot;,
       col = &amp;quot;Depends on&amp;quot;, shape = &amp;quot;Depends on&amp;quot;, size = &amp;quot;Direct strong dependencies&amp;quot;) 
## Warning: Removed 34 rows containing missing values (`geom_point()`).&lt;/code&gt;&lt;/pre&gt;
&lt;div class=&#34;figure&#34;&gt;&lt;span style=&#34;display:block;&#34; id=&#34;fig:deps-time&#34;&gt;&lt;/span&gt;
&lt;img src=&#34;https://llrs.dev/post/2023/05/03/cran-maintained-packages/index.en_files/figure-html/deps-time-1.png&#34; alt=&#34;First release of packages in relation to the maintenance by CRAN of XML and RCurl.&#34; width=&#34;672&#34; /&gt;
&lt;p class=&#34;caption&#34;&gt;
Figure 10: First release of packages in relation to the maintenance by CRAN of XML and RCurl.
&lt;/p&gt;
&lt;/div&gt;
&lt;p&gt;By now the CRAN team has been maintaining these packages for almost as long as the previous maintainer(s?).&lt;/p&gt;
&lt;p&gt;Next, we look at the dependencies added after CRAN started maintaining them:&lt;/p&gt;
&lt;pre class=&#34;r&#34;&gt;&lt;code&gt;summarize(deps_wo,
          before = sum(first_release &amp;lt;= as.Date(&amp;quot;2013-06-15&amp;quot;), na.rm = TRUE), 
          later = sum(first_release &amp;gt; as.Date(&amp;quot;2013-06-15&amp;quot;), na.rm = TRUE),
          .by = type)&lt;/code&gt;&lt;/pre&gt;
&lt;table&gt;
&lt;thead&gt;
&lt;tr class=&#34;header&#34;&gt;
&lt;th align=&#34;left&#34;&gt;type&lt;/th&gt;
&lt;th align=&#34;right&#34;&gt;before&lt;/th&gt;
&lt;th align=&#34;right&#34;&gt;later&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr class=&#34;odd&#34;&gt;
&lt;td align=&#34;left&#34;&gt;both&lt;/td&gt;
&lt;td align=&#34;right&#34;&gt;14&lt;/td&gt;
&lt;td align=&#34;right&#34;&gt;52&lt;/td&gt;
&lt;/tr&gt;
&lt;tr class=&#34;even&#34;&gt;
&lt;td align=&#34;left&#34;&gt;RCurl&lt;/td&gt;
&lt;td align=&#34;right&#34;&gt;21&lt;/td&gt;
&lt;td align=&#34;right&#34;&gt;150&lt;/td&gt;
&lt;/tr&gt;
&lt;tr class=&#34;odd&#34;&gt;
&lt;td align=&#34;left&#34;&gt;XML&lt;/td&gt;
&lt;td align=&#34;right&#34;&gt;63&lt;/td&gt;
&lt;td align=&#34;right&#34;&gt;156&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;
&lt;p&gt;More packages that depend on XML and RCurl were released after CRAN took over their maintenance than before.
Maybe package authors trusted the CRAN team with their dependencies, or there was no alternative for the functionality.
This might also be explained by the growth of CRAN (and Bioconductor), with more packages being added each day.
However, this places further pressure on the CRAN team to keep maintaining those packages.
Removing this burden might free up time for them to dedicate to CRAN itself.&lt;/p&gt;
&lt;p&gt;A replacement for XML could be &lt;a href=&#34;https://cran.r-project.org/package=xml2&#34;&gt;xml2&lt;/a&gt;, first released in 2015 (which uses the same system dependency libxml2).&lt;br /&gt;
A replacement for RCurl could be &lt;a href=&#34;https://cran.r-project.org/package=curl&#34;&gt;curl&lt;/a&gt;, first released at the end of 2014 (which uses the same system dependency libcurl).&lt;/p&gt;
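&lt;p&gt;For illustration, here is a rough correspondence between the old and the new packages for basic usage (the APIs are not drop-in compatible, so most migrations need more work than this):&lt;/p&gt;
&lt;pre class=&#34;r&#34;&gt;&lt;code&gt;# XML -&amp;gt; xml2: parse an XML document.
doc_old &amp;lt;- XML::xmlParse(&amp;quot;file.xml&amp;quot;)
doc_new &amp;lt;- xml2::read_xml(&amp;quot;file.xml&amp;quot;)

# RCurl -&amp;gt; curl: fetch the content of a URL as text.
txt_old &amp;lt;- RCurl::getURL(&amp;quot;https://cran.r-project.org&amp;quot;)
txt_new &amp;lt;- rawToChar(curl::curl_fetch_memory(&amp;quot;https://cran.r-project.org&amp;quot;)$content)&lt;/code&gt;&lt;/pre&gt;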
&lt;p&gt;Until their release there was no replacement for these packages (if there were other alternatives, please let me know).
It is not clear to me whether those packages, at their first release, could already replace XML and RCurl.&lt;/p&gt;
&lt;p&gt;This highlights the importance of proper replacements for packages in the community.
A recent example is the effort of the &lt;a href=&#34;https://r-spatial.org/&#34;&gt;spatial community&lt;/a&gt;, led by Roger Bivand and Edzer Pebesma,
where packages have been carefully designed and planned to replace older packages that will be retired soon.&lt;/p&gt;
&lt;/div&gt;
&lt;/div&gt;
&lt;div id=&#34;recomendations&#34; class=&#34;section level1&#34;&gt;
&lt;h1&gt;Recommendations&lt;/h1&gt;
&lt;p&gt;As final recommendations, I suggest to:&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;Disentangle the XML and RCurl circular dependency.&lt;/li&gt;
&lt;li&gt;Evaluate whether the xml2 and curl packages provide enough functionality to replace XML and RCurl, respectively.
If not, see what should be added to them, or how to develop alternative packages to fill the gap.&lt;br /&gt;
A migration guide from XML and RCurl to their alternatives could be written to ease the transition and to check whether the functionality is covered.&lt;/li&gt;
&lt;li&gt;Contact the package maintainers seen in figure &lt;a href=&#34;#fig:plot-maintainers&#34;&gt;4&lt;/a&gt;, and the maintainers of the packages seen in figures &lt;a href=&#34;#fig:pca-all&#34;&gt;5&lt;/a&gt;, &lt;a href=&#34;#fig:pca-strong&#34;&gt;6&lt;/a&gt;, &lt;a href=&#34;#fig:pca-cran&#34;&gt;8&lt;/a&gt;, and &lt;a href=&#34;#fig:pca-bioc&#34;&gt;9&lt;/a&gt;, so that they replace the functionality they currently take from XML and RCurl.&lt;/li&gt;
&lt;li&gt;Set deprecation warnings on the XML and RCurl packages.&lt;/li&gt;
&lt;li&gt;Archive XML and RCurl packages in CRAN.&lt;/li&gt;
&lt;/ul&gt;
&lt;p&gt;This might take years of moving packages around, but I am confident that once the word is out, package developers will avoid XML and RCurl, and maintainers of packages that depend on them will replace them.&lt;/p&gt;
&lt;p&gt;&lt;strong&gt;Update&lt;/strong&gt;:&lt;/p&gt;
&lt;p&gt;On 2024/01/22 the &lt;a href=&#34;https://stat.ethz.ch/pipermail/r-package-devel/2024q1/010359.html&#34;&gt;CRAN team asked for a maintainer of XML&lt;/a&gt;.&lt;/p&gt;
&lt;div id=&#34;reproducibility&#34; class=&#34;section level3&#34;&gt;
&lt;h3&gt;Reproducibility&lt;/h3&gt;
&lt;details&gt;
&lt;pre&gt;&lt;code&gt;## - Session info ---------------------------------------------------------------
##  setting  value
##  version  R version 4.3.1 (2023-06-16)
##  os       Ubuntu 22.04.3 LTS
##  system   x86_64, linux-gnu
##  ui       X11
##  language (EN)
##  collate  C
##  ctype    C
##  tz       Europe/Madrid
##  date     2024-01-22
##  pandoc   3.1.1 @ /usr/lib/rstudio/resources/app/bin/quarto/bin/tools/ (via rmarkdown)
## 
## - Packages -------------------------------------------------------------------
##  package       * version     date (UTC) lib source
##  Biobase         2.62.0      2023-10-24 [1] Bioconductor
##  BiocFileCache   2.10.1      2023-10-26 [1] Bioconductor
##  BiocGenerics    0.48.1      2023-11-01 [1] Bioconductor
##  BiocManager     1.30.22     2023-08-08 [1] CRAN (R 4.3.1)
##  BiocPkgTools  * 1.20.0      2023-10-24 [1] Bioconductor
##  biocViews       1.70.0      2023-10-24 [1] Bioconductor
##  bit             4.0.5       2022-11-15 [1] CRAN (R 4.3.1)
##  bit64           4.0.5       2020-08-30 [1] CRAN (R 4.3.1)
##  bitops          1.0-7       2021-04-24 [1] CRAN (R 4.3.1)
##  blob            1.2.4       2023-03-17 [1] CRAN (R 4.3.1)
##  blogdown        1.18        2023-06-19 [1] CRAN (R 4.3.1)
##  bookdown        0.37        2023-12-01 [1] CRAN (R 4.3.1)
##  bslib           0.6.1       2023-11-28 [1] CRAN (R 4.3.1)
##  cachem          1.0.8       2023-05-01 [1] CRAN (R 4.3.1)
##  cli             3.6.2       2023-12-11 [1] CRAN (R 4.3.1)
##  codetools       0.2-19      2023-02-01 [2] CRAN (R 4.3.1)
##  colorspace      2.1-0       2023-01-23 [1] CRAN (R 4.3.1)
##  cranlogs      * 2.1.1       2019-04-29 [1] CRAN (R 4.3.1)
##  crul            1.4.0       2023-05-17 [1] CRAN (R 4.3.1)
##  curl            5.2.0       2023-12-08 [1] CRAN (R 4.3.1)
##  DBI             1.2.1       2024-01-12 [1] CRAN (R 4.3.1)
##  dbplyr          2.4.0       2023-10-26 [1] CRAN (R 4.3.2)
##  digest          0.6.34      2024-01-11 [1] CRAN (R 4.3.1)
##  dplyr         * 1.1.4       2023-11-17 [1] CRAN (R 4.3.1)
##  DT              0.31        2023-12-09 [1] CRAN (R 4.3.1)
##  evaluate        0.23        2023-11-01 [1] CRAN (R 4.3.2)
##  fansi           1.0.6       2023-12-08 [1] CRAN (R 4.3.1)
##  farver          2.1.1       2022-07-06 [1] CRAN (R 4.3.1)
##  fastmap         1.1.1       2023-02-24 [1] CRAN (R 4.3.1)
##  fauxpas         0.5.2       2023-05-03 [1] CRAN (R 4.3.1)
##  filelock        1.0.3       2023-12-11 [1] CRAN (R 4.3.1)
##  generics        0.1.3       2022-07-05 [1] CRAN (R 4.3.1)
##  ggplot2       * 3.4.4       2023-10-12 [1] CRAN (R 4.3.1)
##  ggrepel       * 0.9.5       2024-01-10 [1] CRAN (R 4.3.1)
##  gh              1.4.0       2023-02-22 [1] CRAN (R 4.3.1)
##  glue            1.7.0       2024-01-09 [1] CRAN (R 4.3.1)
##  graph           1.80.0      2023-10-24 [1] Bioconductor
##  gtable          0.3.4       2023-08-21 [1] CRAN (R 4.3.1)
##  highr           0.10        2022-12-22 [1] CRAN (R 4.3.1)
##  hms             1.1.3       2023-03-21 [1] CRAN (R 4.3.1)
##  htmltools       0.5.7       2023-11-03 [1] CRAN (R 4.3.2)
##  htmlwidgets   * 1.6.4       2023-12-06 [1] CRAN (R 4.3.1)
##  httpcode        0.3.0       2020-04-10 [1] CRAN (R 4.3.1)
##  httr            1.4.7       2023-08-15 [1] CRAN (R 4.3.1)
##  igraph          1.6.0       2023-12-11 [1] CRAN (R 4.3.1)
##  jquerylib       0.1.4       2021-04-26 [1] CRAN (R 4.3.1)
##  jsonlite        1.8.8       2023-12-04 [1] CRAN (R 4.3.1)
##  knitr         * 1.45        2023-10-30 [1] CRAN (R 4.3.2)
##  labeling        0.4.3       2023-08-29 [1] CRAN (R 4.3.2)
##  lifecycle       1.0.4       2023-11-07 [1] CRAN (R 4.3.2)
##  magrittr        2.0.3       2022-03-30 [1] CRAN (R 4.3.1)
##  memoise         2.0.1       2021-11-26 [1] CRAN (R 4.3.1)
##  munsell         0.5.0       2018-06-12 [1] CRAN (R 4.3.1)
##  pillar          1.9.0       2023-03-22 [1] CRAN (R 4.3.1)
##  pkgconfig       2.0.3       2019-09-22 [1] CRAN (R 4.3.1)
##  purrr           1.0.2       2023-08-10 [1] CRAN (R 4.3.1)
##  R6              2.5.1       2021-08-19 [1] CRAN (R 4.3.1)
##  RBGL            1.78.0      2023-10-24 [1] Bioconductor
##  Rcpp            1.0.12      2024-01-09 [1] CRAN (R 4.3.1)
##  RCurl           1.98-1.14   2024-01-09 [1] CRAN (R 4.3.1)
##  readr           2.1.5       2024-01-10 [1] CRAN (R 4.3.1)
##  rlang           1.1.3       2024-01-10 [1] CRAN (R 4.3.1)
##  rmarkdown       2.25        2023-09-18 [1] CRAN (R 4.3.1)
##  rorcid          0.7.0       2021-01-20 [1] CRAN (R 4.3.1)
##  RSQLite         2.3.5       2024-01-21 [1] CRAN (R 4.3.1)
##  rstudioapi      0.15.0      2023-07-07 [1] CRAN (R 4.3.1)
##  RUnit           0.4.32      2018-05-18 [1] CRAN (R 4.3.1)
##  rvest           1.0.3       2022-08-19 [1] CRAN (R 4.3.1)
##  sass            0.4.8       2023-12-06 [1] CRAN (R 4.3.1)
##  scales          1.3.0       2023-11-28 [1] CRAN (R 4.3.1)
##  sessioninfo     1.2.2       2021-12-06 [1] CRAN (R 4.3.1)
##  stringi         1.8.3       2023-12-11 [1] CRAN (R 4.3.1)
##  stringr         1.5.1       2023-11-14 [1] CRAN (R 4.3.1)
##  tibble          3.2.1       2023-03-20 [1] CRAN (R 4.3.1)
##  tidyselect      1.2.0       2022-10-10 [1] CRAN (R 4.3.1)
##  tzdb            0.4.0       2023-05-12 [1] CRAN (R 4.3.1)
##  utf8            1.2.4       2023-10-22 [1] CRAN (R 4.3.2)
##  vctrs           0.6.5       2023-12-01 [1] CRAN (R 4.3.1)
##  whisker         0.4.1       2022-12-05 [1] CRAN (R 4.3.1)
##  withr           3.0.0       2024-01-16 [1] CRAN (R 4.3.1)
##  xfun            0.41        2023-11-01 [1] CRAN (R 4.3.2)
##  XML             3.99-0.16.1 2024-01-22 [1] CRAN (R 4.3.1)
##  xml2            1.3.6       2023-12-04 [1] CRAN (R 4.3.1)
##  yaml            2.3.8       2023-12-11 [1] CRAN (R 4.3.1)
## 
##  [1] /home/lluis/bin/R/4.3.1
##  [2] /opt/R/4.3.1/lib/R/library
## 
## ------------------------------------------------------------------------------&lt;/code&gt;&lt;/pre&gt;
&lt;/details&gt;
&lt;/div&gt;
&lt;/div&gt;
&lt;div class=&#34;footnotes footnotes-end-of-document&#34;&gt;
&lt;hr /&gt;
&lt;ol&gt;
&lt;li id=&#34;fn1&#34;&gt;&lt;p&gt;the &lt;code&gt;maintainer&lt;/code&gt; function only works for installed packages, and I don’t have all these packages installed.&lt;a href=&#34;#fnref1&#34; class=&#34;footnote-back&#34;&gt;↩︎&lt;/a&gt;&lt;/p&gt;&lt;/li&gt;
&lt;li id=&#34;fn2&#34;&gt;&lt;p&gt;Both logs only count those of their repository and not from other mirrors or approaches (RSPM, bspm, r2u, ….).&lt;a href=&#34;#fnref2&#34; class=&#34;footnote-back&#34;&gt;↩︎&lt;/a&gt;&lt;/p&gt;&lt;/li&gt;
&lt;li id=&#34;fn3&#34;&gt;&lt;p&gt;I recently learned this word as the opposite of introduction/intro.&lt;a href=&#34;#fnref3&#34; class=&#34;footnote-back&#34;&gt;↩︎&lt;/a&gt;&lt;/p&gt;&lt;/li&gt;
&lt;/ol&gt;
&lt;/div&gt;
</description>
    </item>
    
    <item>
      <title>Reasons why packages are archived on CRAN</title>
      <link>https://llrs.dev/post/2021/12/07/reasons-cran-archivals/</link>
      <pubDate>Tue, 07 Dec 2021 00:00:00 +0000</pubDate>
      <guid>https://llrs.dev/post/2021/12/07/reasons-cran-archivals/</guid>
      <description>


&lt;p&gt;On the Repositories working group of the R Consortium, Rich FitzJohn posted &lt;a href=&#34;https://github.com/RConsortium/r-repositories-wg/issues/8#issuecomment-979486806&#34;&gt;a comment&lt;/a&gt; pointing to &lt;a href=&#34;https://cran.r-project.org/src/contrib/PACKAGES.in&#34;&gt;a file&lt;/a&gt; where the CRAN team seems to store the package history and use it for checks.&lt;/p&gt;
&lt;p&gt;The structure of the file is not defined anywhere I could find (I haven’t looked much, to be honest).&lt;/p&gt;
&lt;pre&gt;&lt;code&gt;Package: &amp;lt;package name&amp;gt;
X-CRAN-Comment: Archived on YYYY-MM-DD as &amp;lt;reason&amp;gt;.
X-CRAN-History: Archived on YYYY-MM-DD as &amp;lt;reason&amp;gt;.
  Unarchived on YYYY-MM-DD.
  .
  &amp;lt;Optional clarification of archival reason&amp;gt;
&amp;lt;Optional fields like License_restricts_use, Replaced_by, Maintainer: ORPHANED, OS_type: unix&amp;gt;&lt;/code&gt;&lt;/pre&gt;
&lt;p&gt;I think the X-CRAN-Comment field is what appears on the website of an archived package, like the &lt;a href=&#34;https://cran.r-project.org/package=radix&#34;&gt;radix package&lt;/a&gt;. However, some comments on the website do not appear in the file.&lt;/p&gt;
&lt;p&gt;In addition, the file is missing the archiving and unarchiving records of some packages, although it covers events from 2013 or earlier up to now. We can still use it to understand the &lt;em&gt;reasons&lt;/em&gt; why packages are archived, which seems to be the main purpose of the file.&lt;/p&gt;
&lt;div id=&#34;the-data&#34; class=&#34;section level1&#34;&gt;
&lt;h1&gt;The data&lt;/h1&gt;
&lt;p&gt;The first step is to read the file.
As it has a &lt;code&gt;key: value&lt;/code&gt; structure similar to the DESCRIPTION file of packages, it is probably in DCF (Debian Control File) format, which is easy to read with the &lt;code&gt;read.dcf&lt;/code&gt; function.&lt;/p&gt;
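&lt;p&gt;A minimal sketch of reading it, assuming we want the fields seen in the structure above (the field selection is my choice, not necessarily what the original analysis used):&lt;/p&gt;
&lt;pre class=&#34;r&#34;&gt;&lt;code&gt;# Read the DCF-formatted file directly from CRAN, keeping only some fields.
pkgs &amp;lt;- read.dcf(url(&amp;quot;https://cran.r-project.org/src/contrib/PACKAGES.in&amp;quot;),
                 fields = c(&amp;quot;Package&amp;quot;, &amp;quot;X-CRAN-Comment&amp;quot;, &amp;quot;X-CRAN-History&amp;quot;,
                            &amp;quot;Replaced_by&amp;quot;, &amp;quot;Maintainer&amp;quot;))
head(pkgs)&lt;/code&gt;&lt;/pre&gt;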
&lt;div id=&#34;exploring&#34; class=&#34;section level2&#34;&gt;
&lt;h2&gt;Exploring&lt;/h2&gt;
&lt;p&gt;A brief exploration of the data:&lt;/p&gt;
&lt;table&gt;
&lt;thead&gt;
&lt;tr&gt;
&lt;th style=&#34;text-align:left;&#34;&gt;
comment
&lt;/th&gt;
&lt;th style=&#34;text-align:left;&#34;&gt;
history
&lt;/th&gt;
&lt;th style=&#34;text-align:right;&#34;&gt;
packages
&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr&gt;
&lt;td style=&#34;text-align:left;&#34;&gt;
yes
&lt;/td&gt;
&lt;td style=&#34;text-align:left;&#34;&gt;
no
&lt;/td&gt;
&lt;td style=&#34;text-align:right;&#34;&gt;
3612
&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td style=&#34;text-align:left;&#34;&gt;
no
&lt;/td&gt;
&lt;td style=&#34;text-align:left;&#34;&gt;
yes
&lt;/td&gt;
&lt;td style=&#34;text-align:right;&#34;&gt;
2345
&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td style=&#34;text-align:left;&#34;&gt;
yes
&lt;/td&gt;
&lt;td style=&#34;text-align:left;&#34;&gt;
yes
&lt;/td&gt;
&lt;td style=&#34;text-align:right;&#34;&gt;
434
&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td style=&#34;text-align:left;&#34;&gt;
no
&lt;/td&gt;
&lt;td style=&#34;text-align:left;&#34;&gt;
no
&lt;/td&gt;
&lt;td style=&#34;text-align:right;&#34;&gt;
70
&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;
&lt;p&gt;Many packages have either a comment or a history, but relatively few have both.
I’m not sure when each of them is used, as I would expect all packages with a history to also have a comment.&lt;/p&gt;
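&lt;p&gt;A sketch of how such a cross-tabulation could be computed, assuming the file has been read with &lt;code&gt;read.dcf&lt;/code&gt; into a matrix (the name &lt;code&gt;pkgs&lt;/code&gt; is hypothetical):&lt;/p&gt;
&lt;pre class=&#34;r&#34;&gt;&lt;code&gt;pkgs &amp;lt;- read.dcf(url(&amp;quot;https://cran.r-project.org/src/contrib/PACKAGES.in&amp;quot;),
                 fields = c(&amp;quot;X-CRAN-Comment&amp;quot;, &amp;quot;X-CRAN-History&amp;quot;))
# Cross-tabulate which packages have a comment, a history, both or neither.
table(comment = !is.na(pkgs[, &amp;quot;X-CRAN-Comment&amp;quot;]),
      history = !is.na(pkgs[, &amp;quot;X-CRAN-History&amp;quot;]))&lt;/code&gt;&lt;/pre&gt;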
&lt;table&gt;
&lt;thead&gt;
&lt;tr&gt;
&lt;th style=&#34;text-align:left;&#34;&gt;
Replaced_by
&lt;/th&gt;
&lt;th style=&#34;text-align:right;&#34;&gt;
packages
&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr&gt;
&lt;td style=&#34;text-align:left;&#34;&gt;
no
&lt;/td&gt;
&lt;td style=&#34;text-align:right;&#34;&gt;
6360
&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td style=&#34;text-align:left;&#34;&gt;
yes
&lt;/td&gt;
&lt;td style=&#34;text-align:right;&#34;&gt;
101
&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;
&lt;p&gt;Some packages (around a hundred) are simply replaced by another package.&lt;/p&gt;
&lt;table&gt;
&lt;thead&gt;
&lt;tr&gt;
&lt;th style=&#34;text-align:left;&#34;&gt;
Maintainer
&lt;/th&gt;
&lt;th style=&#34;text-align:right;&#34;&gt;
packages
&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr&gt;
&lt;td style=&#34;text-align:left;&#34;&gt;
no
&lt;/td&gt;
&lt;td style=&#34;text-align:right;&#34;&gt;
6366
&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td style=&#34;text-align:left;&#34;&gt;
yes
&lt;/td&gt;
&lt;td style=&#34;text-align:right;&#34;&gt;
95
&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;
&lt;p&gt;Most of the packages that have a Maintainer field are orphaned/archived.
Does it mean that all the others are not orphaned?&lt;/p&gt;
&lt;/div&gt;
&lt;div id=&#34;extracting-reasons&#34; class=&#34;section level2&#34;&gt;
&lt;h2&gt;Extracting reasons&lt;/h2&gt;
&lt;p&gt;Now that the data is in an R structure, we can extract the relevant information: the date, the type of action and the reason for each archival event.
I use &lt;code&gt;strcapture&lt;/code&gt; for this task, with a regex that extracts the action, the date and the explanation it might have.&lt;/p&gt;
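&lt;p&gt;A sketch of the idea (this regex is simplified and only handles the common “Action on date as reason” pattern, not every variation in the file):&lt;/p&gt;
&lt;pre class=&#34;r&#34;&gt;&lt;code&gt;events &amp;lt;- c(&amp;quot;Archived on 2021-03-01 as check problems were not corrected.&amp;quot;,
            &amp;quot;Unarchived on 2021-04-15.&amp;quot;)
# Capture the action, the date and the optional explanation after &amp;quot;as&amp;quot;.
strcapture(&amp;quot;^([A-Za-z]+) on ([0-9]{4}-[0-9]{2}-[0-9]{2})(?: as (.*?))?\\.?$&amp;quot;,
           events,
           proto = data.frame(action = character(), date = character(),
                              reason = character()),
           perl = TRUE)&lt;/code&gt;&lt;/pre&gt;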
&lt;p&gt;I don’t know how the file is written; probably it is a mix of automated tools and manual editing, so there isn’t a simple way to collect all the information in a structured way.
The structure and the level of detail stored have changed over the years, and some events are missing.
However, the extracted information should be enough for our purposes.&lt;/p&gt;
&lt;table&gt;
&lt;thead&gt;
&lt;tr&gt;
&lt;th style=&#34;text-align:left;&#34;&gt;
Action
&lt;/th&gt;
&lt;th style=&#34;text-align:right;&#34;&gt;
Events
&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr&gt;
&lt;td style=&#34;text-align:left;&#34;&gt;
archived
&lt;/td&gt;
&lt;td style=&#34;text-align:right;&#34;&gt;
7096
&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td style=&#34;text-align:left;&#34;&gt;
orphaned
&lt;/td&gt;
&lt;td style=&#34;text-align:right;&#34;&gt;
341
&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td style=&#34;text-align:left;&#34;&gt;
removed
&lt;/td&gt;
&lt;td style=&#34;text-align:right;&#34;&gt;
113
&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td style=&#34;text-align:left;&#34;&gt;
renamed
&lt;/td&gt;
&lt;td style=&#34;text-align:right;&#34;&gt;
2
&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td style=&#34;text-align:left;&#34;&gt;
replaced
&lt;/td&gt;
&lt;td style=&#34;text-align:right;&#34;&gt;
4
&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td style=&#34;text-align:left;&#34;&gt;
unarchived
&lt;/td&gt;
&lt;td style=&#34;text-align:right;&#34;&gt;
2973
&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;
&lt;p&gt;As expected, the most common recorded event is archival, but there are also some orphaned and even some removed packages.
Also note that the number of orphaned packages is greater than the number of packages with a Maintainer field, supporting my theory that the format has changed and that this shouldn’t be taken as an exhaustive and complete analysis of archivals.&lt;/p&gt;
&lt;p&gt;How are they along time?&lt;/p&gt;
&lt;p&gt;&lt;img src=&#34;https://llrs.dev/post/2021/12/07/reasons-cran-archivals/index.en_files/figure-html/plots_df-1.png&#34; width=&#34;864&#34; /&gt;&lt;/p&gt;
&lt;p&gt;Even if there are some events recorded from 2009, it seems that this file has been used more in recent years (the last commit related to &lt;a href=&#34;https://github.com/wch/r-source/blame/trunk/src/library/tools/R/QC.R#L7778&#34;&gt;this was in 2015&lt;/a&gt;).
I know that some old events are not recorded in the file: there are packages currently on CRAN that were archived at some point but have no unarchived action, and the reverse could also happen.
So this doesn’t necessarily mean that more packages are archived from CRAN nowadays, but it is a clear indication that there is now a more accurate record of archived packages in this file.&lt;/p&gt;
&lt;p&gt;Another source of records of archived packages might be &lt;a href=&#34;http://dirk.eddelbuettel.com/cranberries/cran/removed/&#34;&gt;cranberries&lt;/a&gt;. It would be nice to compare this file with the records on the database there.&lt;/p&gt;
&lt;p&gt;Now that most of the package events are collected and we have the reason for each action, we can explore and classify the reasons.
Using some simple regular expressions, I search for key words or sentences.&lt;/p&gt;
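&lt;p&gt;The classification could be sketched like this (the key words and reasons are illustrative, not the exact patterns used):&lt;/p&gt;
&lt;pre class=&#34;r&#34;&gt;&lt;code&gt;reasons &amp;lt;- c(&amp;quot;check problems were not corrected despite reminders&amp;quot;,
             &amp;quot;requires archived package XYZ&amp;quot;,
             &amp;quot;email to the maintainer is undeliverable&amp;quot;)
# Flag each reason by searching for key words.
data.frame(not_corrected = grepl(&amp;quot;not corrected&amp;quot;, reasons),
           dependencies  = grepl(&amp;quot;requires|depend&amp;quot;, reasons),
           email         = grepl(&amp;quot;email&amp;quot;, reasons))&lt;/code&gt;&lt;/pre&gt;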
&lt;p&gt;We can look at the most frequent reasons for archiving packages, the patterns I found with more than 100 cases:&lt;/p&gt;
&lt;p&gt;&lt;img src=&#34;https://llrs.dev/post/2021/12/07/reasons-cran-archivals/index.en_files/figure-html/reasons_top-1.png&#34; width=&#34;864&#34; /&gt;&lt;/p&gt;
&lt;p&gt;The most frequent reason is that errors or check problems are not corrected, even when there are reminders.&lt;br /&gt;
Next are the packages archived because they depend on other packages no longer on CRAN.&lt;br /&gt;
Some packages are replaced by others, and some maintainers might not want to continue supporting the package when they receive a message from CRAN about fixing an error.&lt;/p&gt;
&lt;p&gt;Policy violations make it into the top 5, but with fewer than 500 events.
Dependency problems are the sixth cause, followed by email errors (bouncing, incorrect address…), and then come sporadic problems about licenses, not fixing the package for new releases of R, authorship disputes or requests from the authors themselves.&lt;/p&gt;
&lt;p&gt;Several of these reasons can occur in the same event; grouping them together we get a similar table:&lt;/p&gt;
&lt;table&gt;
&lt;thead&gt;
&lt;tr&gt;
&lt;th style=&#34;text-align:left;&#34;&gt;
package_not_corrected
&lt;/th&gt;
&lt;th style=&#34;text-align:left;&#34;&gt;
request_maintainer
&lt;/th&gt;
&lt;th style=&#34;text-align:left;&#34;&gt;
dependencies
&lt;/th&gt;
&lt;th style=&#34;text-align:left;&#34;&gt;
other
&lt;/th&gt;
&lt;th style=&#34;text-align:right;&#34;&gt;
events
&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr&gt;
&lt;td style=&#34;text-align:left;&#34;&gt;
yes
&lt;/td&gt;
&lt;td style=&#34;text-align:left;&#34;&gt;
no
&lt;/td&gt;
&lt;td style=&#34;text-align:left;&#34;&gt;
no
&lt;/td&gt;
&lt;td style=&#34;text-align:left;&#34;&gt;
no
&lt;/td&gt;
&lt;td style=&#34;text-align:right;&#34;&gt;
4366
&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td style=&#34;text-align:left;&#34;&gt;
no
&lt;/td&gt;
&lt;td style=&#34;text-align:left;&#34;&gt;
no
&lt;/td&gt;
&lt;td style=&#34;text-align:left;&#34;&gt;
no
&lt;/td&gt;
&lt;td style=&#34;text-align:left;&#34;&gt;
no
&lt;/td&gt;
&lt;td style=&#34;text-align:right;&#34;&gt;
1530
&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td style=&#34;text-align:left;&#34;&gt;
no
&lt;/td&gt;
&lt;td style=&#34;text-align:left;&#34;&gt;
no
&lt;/td&gt;
&lt;td style=&#34;text-align:left;&#34;&gt;
yes
&lt;/td&gt;
&lt;td style=&#34;text-align:left;&#34;&gt;
no
&lt;/td&gt;
&lt;td style=&#34;text-align:right;&#34;&gt;
767
&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td style=&#34;text-align:left;&#34;&gt;
no
&lt;/td&gt;
&lt;td style=&#34;text-align:left;&#34;&gt;
no
&lt;/td&gt;
&lt;td style=&#34;text-align:left;&#34;&gt;
no
&lt;/td&gt;
&lt;td style=&#34;text-align:left;&#34;&gt;
yes
&lt;/td&gt;
&lt;td style=&#34;text-align:right;&#34;&gt;
374
&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td style=&#34;text-align:left;&#34;&gt;
yes
&lt;/td&gt;
&lt;td style=&#34;text-align:left;&#34;&gt;
no
&lt;/td&gt;
&lt;td style=&#34;text-align:left;&#34;&gt;
no
&lt;/td&gt;
&lt;td style=&#34;text-align:left;&#34;&gt;
yes
&lt;/td&gt;
&lt;td style=&#34;text-align:right;&#34;&gt;
15
&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td style=&#34;text-align:left;&#34;&gt;
yes
&lt;/td&gt;
&lt;td style=&#34;text-align:left;&#34;&gt;
no
&lt;/td&gt;
&lt;td style=&#34;text-align:left;&#34;&gt;
yes
&lt;/td&gt;
&lt;td style=&#34;text-align:left;&#34;&gt;
no
&lt;/td&gt;
&lt;td style=&#34;text-align:right;&#34;&gt;
13
&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td style=&#34;text-align:left;&#34;&gt;
no
&lt;/td&gt;
&lt;td style=&#34;text-align:left;&#34;&gt;
no
&lt;/td&gt;
&lt;td style=&#34;text-align:left;&#34;&gt;
yes
&lt;/td&gt;
&lt;td style=&#34;text-align:left;&#34;&gt;
yes
&lt;/td&gt;
&lt;td style=&#34;text-align:right;&#34;&gt;
2
&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td style=&#34;text-align:left;&#34;&gt;
yes
&lt;/td&gt;
&lt;td style=&#34;text-align:left;&#34;&gt;
no
&lt;/td&gt;
&lt;td style=&#34;text-align:left;&#34;&gt;
yes
&lt;/td&gt;
&lt;td style=&#34;text-align:left;&#34;&gt;
yes
&lt;/td&gt;
&lt;td style=&#34;text-align:right;&#34;&gt;
2
&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td style=&#34;text-align:left;&#34;&gt;
yes
&lt;/td&gt;
&lt;td style=&#34;text-align:left;&#34;&gt;
yes
&lt;/td&gt;
&lt;td style=&#34;text-align:left;&#34;&gt;
no
&lt;/td&gt;
&lt;td style=&#34;text-align:left;&#34;&gt;
no
&lt;/td&gt;
&lt;td style=&#34;text-align:right;&#34;&gt;
2
&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td style=&#34;text-align:left;&#34;&gt;
yes
&lt;/td&gt;
&lt;td style=&#34;text-align:left;&#34;&gt;
yes
&lt;/td&gt;
&lt;td style=&#34;text-align:left;&#34;&gt;
no
&lt;/td&gt;
&lt;td style=&#34;text-align:left;&#34;&gt;
yes
&lt;/td&gt;
&lt;td style=&#34;text-align:right;&#34;&gt;
1
&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;
&lt;p&gt;Surprisingly, the second most frequent group of archiving actions is due to many different reasons.
This is probably the &lt;a href=&#34;https://en.wikipedia.org/wiki/Pareto_principle&#34;&gt;Pareto principle&lt;/a&gt; in action: they account for around 15% of the archiving events, but their causes are very diverse.&lt;/p&gt;
&lt;p&gt;However, if we look at the packages that were archived (not at the request of their maintainers), most are archived just once:&lt;/p&gt;
&lt;table&gt;
&lt;thead&gt;
&lt;tr&gt;
&lt;th style=&#34;text-align:right;&#34;&gt;
Events
&lt;/th&gt;
&lt;th style=&#34;text-align:right;&#34;&gt;
packages
&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr&gt;
&lt;td style=&#34;text-align:right;&#34;&gt;
1
&lt;/td&gt;
&lt;td style=&#34;text-align:right;&#34;&gt;
5304
&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td style=&#34;text-align:right;&#34;&gt;
2
&lt;/td&gt;
&lt;td style=&#34;text-align:right;&#34;&gt;
594
&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td style=&#34;text-align:right;&#34;&gt;
3
&lt;/td&gt;
&lt;td style=&#34;text-align:right;&#34;&gt;
115
&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td style=&#34;text-align:right;&#34;&gt;
4
&lt;/td&gt;
&lt;td style=&#34;text-align:right;&#34;&gt;
31
&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td style=&#34;text-align:right;&#34;&gt;
5
&lt;/td&gt;
&lt;td style=&#34;text-align:right;&#34;&gt;
8
&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;
&lt;p&gt;This suggests that once a package is archived, maintainers do not make the effort to put it back on CRAN, except in a few cases where there are multiple attempts.
To check, we can look at the currently available packages and see how many of the archived ones are still present on CRAN:&lt;/p&gt;
&lt;table&gt;
&lt;thead&gt;
&lt;tr&gt;
&lt;th style=&#34;text-align:left;&#34;&gt;
CRAN
&lt;/th&gt;
&lt;th style=&#34;text-align:right;&#34;&gt;
Packages
&lt;/th&gt;
&lt;th style=&#34;text-align:left;&#34;&gt;
Proportion
&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr&gt;
&lt;td style=&#34;text-align:left;&#34;&gt;
no
&lt;/td&gt;
&lt;td style=&#34;text-align:right;&#34;&gt;
3869
&lt;/td&gt;
&lt;td style=&#34;text-align:left;&#34;&gt;
64%
&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td style=&#34;text-align:left;&#34;&gt;
yes
&lt;/td&gt;
&lt;td style=&#34;text-align:right;&#34;&gt;
2183
&lt;/td&gt;
&lt;td style=&#34;text-align:left;&#34;&gt;
36%
&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;
&lt;p&gt;Many packages are back on CRAN despite their past archival, but close to 64% are currently not on CRAN.&lt;/p&gt;
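&lt;p&gt;A minimal sketch of how such a check could be done, assuming a hypothetical character vector &lt;code&gt;archived_pkgs&lt;/code&gt; holding the names of the archived packages from the earlier analysis:&lt;/p&gt;
&lt;pre class=&#34;r&#34;&gt;&lt;code&gt;# archived_pkgs is assumed to hold the archived package names
on_cran &amp;lt;- rownames(available.packages(repos = &amp;quot;https://cloud.r-project.org&amp;quot;))
table(still_on_CRAN = archived_pkgs %in% on_cran)&lt;/code&gt;&lt;/pre&gt;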
&lt;p&gt;Almost all of those back on CRAN now have no &lt;code&gt;X-CRAN-Comment&lt;/code&gt;, except for a few:&lt;/p&gt;
&lt;table&gt;
&lt;thead&gt;
&lt;tr&gt;
&lt;th style=&#34;text-align:left;&#34;&gt;
Package
&lt;/th&gt;
&lt;th style=&#34;text-align:left;&#34;&gt;
X-CRAN-Comment
&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr&gt;
&lt;td style=&#34;text-align:left;&#34;&gt;
geiger
&lt;/td&gt;
&lt;td style=&#34;text-align:left;&#34;&gt;
&lt;p&gt;Orphaned and corrected on 2022-05-09.&lt;/p&gt;
Repeated notifications about USE_FC_LEN_T were ignored.
&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td style=&#34;text-align:left;&#34;&gt;
alphahull
&lt;/td&gt;
&lt;td style=&#34;text-align:left;&#34;&gt;
Versions up to 2.3 have been removed for misrepresentation of authorship.
&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td style=&#34;text-align:left;&#34;&gt;
udunits2
&lt;/td&gt;
&lt;td style=&#34;text-align:left;&#34;&gt;
Orphaned on 2022-01-06 as installation problems were not corrected.
&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td style=&#34;text-align:left;&#34;&gt;
bibtex
&lt;/td&gt;
&lt;td style=&#34;text-align:left;&#34;&gt;
Orphaned and corrected on 2020-09-19 as check problems were not corrected in time.
&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;
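&lt;p&gt;As a side note, these comments can be inspected programmatically; a sketch using only the base &lt;code&gt;tools&lt;/code&gt; package:&lt;/p&gt;
&lt;pre class=&#34;r&#34;&gt;&lt;code&gt;# The CRAN package database includes the X-CRAN-Comment field
db &amp;lt;- tools::CRAN_package_db()
db[!is.na(db$&amp;quot;X-CRAN-Comment&amp;quot;), c(&amp;quot;Package&amp;quot;, &amp;quot;X-CRAN-Comment&amp;quot;)]&lt;/code&gt;&lt;/pre&gt;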
&lt;p&gt;The CRAN team might have missed these few packages and didn’t move the comments to &lt;code&gt;X-CRAN-History&lt;/code&gt;.&lt;/p&gt;
&lt;p&gt;There are also some non-archived packages without an &lt;code&gt;X-CRAN-History&lt;/code&gt; field, but those usually have other fields changed.&lt;/p&gt;
&lt;/div&gt;
&lt;/div&gt;
&lt;div id=&#34;discussion&#34; class=&#34;section level1&#34;&gt;
&lt;h1&gt;Discussion&lt;/h1&gt;
&lt;p&gt;Most packages archived on CRAN are archived because the maintainers did not correct errors found by the CRAN checks.
It is clear that CRAN’s checks help packages maintain a high quality, but they have a high cost for the maintainers and especially for the CRAN team.
Maintainers don’t seem to have enough time to fix the issues in time, while the CRAN team sends personalized reminders to maintainers and sometimes even patches for the packages.&lt;/p&gt;
&lt;p&gt;Although having packages corrected and issue-free is the common goal, in light of these results there are a few options:&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;&lt;p&gt;Be more restrictive&lt;/p&gt;
&lt;p&gt;Prevent a package from being accepted if it breaks dependencies, or archive packages as soon as they fail checks.
This would make it harder to keep packages on CRAN but would lift some pressure from the CRAN team.
It would also go against the trend in other languages’ repositories, which often don’t check the packages/modules at all and have even fewer restrictions on dependencies (so it might be an unpopular decision).&lt;/p&gt;&lt;/li&gt;
&lt;li&gt;&lt;p&gt;Be more permissive:&lt;/p&gt;
&lt;p&gt;One option would be to allow maintainers more time to fix issues. I haven’t found any report of how long it takes a package to go from an error to a fix on CRAN, but it is often quite long.
I have seen packages with a warning for months, if not years, without being archived from CRAN.&lt;/p&gt;
&lt;p&gt;Another possibility would be to warn users at installation time when a package or one of its dependencies does not pass all CRAN checks cleanly (without errors or warnings).
This might help make users more conscious of their dependencies, but it could also add pressure on maintainers who already don’t have enough time to fix their packages’ problems.&lt;/p&gt;&lt;/li&gt;
&lt;li&gt;&lt;p&gt;Provide more help or tools to maintainers&lt;/p&gt;
&lt;p&gt;Another option is to provide a mechanism for maintainers to receive help fixing their packages.
Currently CRAN requires maintainers whose updates break dependencies to give other maintainers enough advance notice to fix their packages.
On the &lt;a href=&#34;https://stat.ethz.ch/mailman/listinfo/r-package-devel&#34;&gt;R-pkg-devel mailing list&lt;/a&gt; there are often requests for help with submissions and with errors detected by CRAN checks, which frequently result in other maintainers sharing their solutions to the same problem.&lt;/p&gt;&lt;/li&gt;
&lt;/ul&gt;
&lt;p&gt;The high percentage of packages that, once archived, do not come back to CRAN might be a good place to start helping maintainers, and an opportunity for users to step in and help the maintainers of packages they have been using.
Is something else needed? How would that work?&lt;/p&gt;
&lt;p&gt;At the same time, it is admirable that after so many years there are so few errors in the data.
Still, the archival process might be a good candidate for automation: providing the reason on the webpage, adding it to X-CRAN-Comment and moving the comments to X-CRAN-History once the package is unarchived.
Knowing more about how the CRAN team performs these actions, and how the community could help with the process, would benefit everyone.&lt;/p&gt;
&lt;p&gt;&lt;strong&gt;Note&lt;/strong&gt;: This post was updated on 2022/01/02 to improve the parsing of actions and dates of packages. As a result, the first plot now includes unarchived packages, which slightly modified the second plot of reasons why packages are archived. Overall this only affected the numbers in the plots, not the conclusions or discussion.&lt;/p&gt;
&lt;div id=&#34;reproducibility&#34; class=&#34;section level3&#34;&gt;
&lt;h3&gt;Reproducibility&lt;/h3&gt;
&lt;details&gt;
&lt;pre&gt;&lt;code&gt;## ─ Session info ───────────────────────────────────────────────────────────────────────────────────────────────────────
##  setting  value
##  version  R version 4.2.0 (2022-04-22)
##  os       Ubuntu 20.04.4 LTS
##  system   x86_64, linux-gnu
##  ui       X11
##  language (EN)
##  collate  en_US.UTF-8
##  ctype    en_US.UTF-8
##  tz       Europe/Madrid
##  date     2022-05-09
##  pandoc   2.17.1.1 @ /usr/lib/rstudio/bin/quarto/bin/ (via rmarkdown)
## 
## ─ Packages ───────────────────────────────────────────────────────────────────────────────────────────────────────────
##  package      * version date (UTC) lib source
##  assertthat     0.2.1   2019-03-21 [1] CRAN (R 4.2.0)
##  blogdown       1.9     2022-03-28 [1] CRAN (R 4.2.0)
##  bookdown       0.26    2022-04-15 [1] CRAN (R 4.2.0)
##  bslib          0.3.1   2021-10-06 [1] CRAN (R 4.2.0)
##  cli            3.3.0   2022-04-25 [1] CRAN (R 4.2.0)
##  colorspace     2.0-3   2022-02-21 [1] CRAN (R 4.2.0)
##  ComplexUpset * 1.3.3   2021-12-11 [1] CRAN (R 4.2.0)
##  crayon         1.5.1   2022-03-26 [1] CRAN (R 4.2.0)
##  DBI            1.1.2   2021-12-20 [1] CRAN (R 4.2.0)
##  digest         0.6.29  2021-12-01 [1] CRAN (R 4.2.0)
##  dplyr        * 1.0.9   2022-04-28 [1] CRAN (R 4.2.0)
##  ellipsis       0.3.2   2021-04-29 [1] CRAN (R 4.2.0)
##  evaluate       0.15    2022-02-18 [1] CRAN (R 4.2.0)
##  fansi          1.0.3   2022-03-24 [1] CRAN (R 4.2.0)
##  farver         2.1.0   2021-02-28 [1] CRAN (R 4.2.0)
##  fastmap        1.1.0   2021-01-25 [1] CRAN (R 4.2.0)
##  generics       0.1.2   2022-01-31 [1] CRAN (R 4.2.0)
##  ggplot2      * 3.3.6   2022-05-03 [1] CRAN (R 4.2.0)
##  glue           1.6.2   2022-02-24 [1] CRAN (R 4.2.0)
##  gtable         0.3.0   2019-03-25 [1] CRAN (R 4.2.0)
##  highr          0.9     2021-04-16 [1] CRAN (R 4.2.0)
##  htmltools      0.5.2   2021-08-25 [1] CRAN (R 4.2.0)
##  jquerylib      0.1.4   2021-04-26 [1] CRAN (R 4.2.0)
##  jsonlite       1.8.0   2022-02-22 [1] CRAN (R 4.2.0)
##  knitr          1.39    2022-04-26 [1] CRAN (R 4.2.0)
##  labeling       0.4.2   2020-10-20 [1] CRAN (R 4.2.0)
##  lifecycle      1.0.1   2021-09-24 [1] CRAN (R 4.2.0)
##  magrittr       2.0.3   2022-03-30 [1] CRAN (R 4.2.0)
##  munsell        0.5.0   2018-06-12 [1] CRAN (R 4.2.0)
##  patchwork      1.1.1   2020-12-17 [1] CRAN (R 4.2.0)
##  pillar         1.7.0   2022-02-01 [1] CRAN (R 4.2.0)
##  pkgconfig      2.0.3   2019-09-22 [1] CRAN (R 4.2.0)
##  purrr          0.3.4   2020-04-17 [1] CRAN (R 4.2.0)
##  R6             2.5.1   2021-08-19 [1] CRAN (R 4.2.0)
##  rlang          1.0.2   2022-03-04 [1] CRAN (R 4.2.0)
##  rmarkdown      2.14    2022-04-25 [1] CRAN (R 4.2.0)
##  rstudioapi     0.13    2020-11-12 [1] CRAN (R 4.2.0)
##  sass           0.4.1   2022-03-23 [1] CRAN (R 4.2.0)
##  scales         1.2.0   2022-04-13 [1] CRAN (R 4.2.0)
##  sessioninfo    1.2.2   2021-12-06 [1] CRAN (R 4.2.0)
##  stringi        1.7.6   2021-11-29 [1] CRAN (R 4.2.0)
##  stringr        1.4.0   2019-02-10 [1] CRAN (R 4.2.0)
##  tibble         3.1.7   2022-05-03 [1] CRAN (R 4.2.0)
##  tidyselect     1.1.2   2022-02-21 [1] CRAN (R 4.2.0)
##  utf8           1.2.2   2021-07-24 [1] CRAN (R 4.2.0)
##  vctrs          0.4.1   2022-04-13 [1] CRAN (R 4.2.0)
##  withr          2.5.0   2022-03-03 [1] CRAN (R 4.2.0)
##  xfun           0.30    2022-03-02 [1] CRAN (R 4.2.0)
##  yaml           2.3.5   2022-02-21 [1] CRAN (R 4.2.0)
## 
##  [1] /home/lluis/bin/R/4.2.0/lib/R/library
## 
## ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────&lt;/code&gt;&lt;/pre&gt;
&lt;/details&gt;
&lt;/div&gt;
&lt;/div&gt;
</description>
    </item>
    
    <item>
      <title>Packages submission and reviews; how does it work?</title>
      <link>https://llrs.dev/talk/user-2021/</link>
      <pubDate>Sat, 16 Oct 2021 00:00:00 +0000</pubDate>
      <guid>https://llrs.dev/talk/user-2021/</guid>
      <description>
&lt;script src=&#34;https://llrs.dev/talk/user-2021/index_files/header-attrs/header-attrs.js&#34;&gt;&lt;/script&gt;


&lt;p&gt;The abstract I submitted for acceptance was:&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;We benefit from others’ work on R and also by shared packages and for our programming tasks. Occasionally we might generate some piece of software that we want to share with the community. Usually sharing our work with the R community means submitting a package to an archive (CRAN, Bioconductor or others). While each individual archive has some rules they share some common principles.&lt;/p&gt;
&lt;p&gt;If your package follows the rules of the submission process and meets their quality standards, it will be included. All submissions share some common stages: first, an initial screening; second, a deeper manual review of the code. Then, if the suggestions are applied or correctly answered, the package is included in the archive.&lt;/p&gt;
&lt;p&gt;At each step, rules and criteria are used to decide whether the package moves forward. Understanding what these rules say, along with common problems and comments from reviewers, helps avoid submitting a package only to get it rejected, reducing the friction of sharing our work, providing useful packages to the community and minimizing reviewers’ time and effort.&lt;/p&gt;
&lt;p&gt;Looking at the review process of three archives of R packages, CRAN, Bioconductor and rOpenSci, I’ll explain common rules, patterns, timelines and checks required to get the package included, as well as personal anecdotes with them. The talk is based on the post analyzing reviews available here: &lt;a href=&#34;https://llrs.dev/tags/reviews/&#34; class=&#34;uri&#34;&gt;https://llrs.dev/tags/reviews/&lt;/a&gt;&lt;/p&gt;
&lt;/blockquote&gt;
&lt;p&gt;I received excellent feedback from the reviewers and was given a full talk (I had asked for a poster because I was nervous about presenting to a big audience).&lt;/p&gt;
&lt;p&gt;This talk also received one of the Accessibility Awards.&lt;/p&gt;
</description>
    </item>
    
    <item>
      <title>CRAN review</title>
      <link>https://llrs.dev/post/2021/01/31/cran-review/</link>
      <pubDate>Sun, 31 Jan 2021 00:00:00 +0000</pubDate>
      <guid>https://llrs.dev/post/2021/01/31/cran-review/</guid>
      <description>
&lt;script src=&#34;https://llrs.dev/post/2021/01/31/cran-review/index_files/header-attrs/header-attrs.js&#34;&gt;&lt;/script&gt;


&lt;p&gt;I’ve been doing some &lt;a href=&#34;https://llrs.dev/tags/reviews/&#34;&gt;analysis of the review submissions&lt;/a&gt; of several R projects.
However, until recently I couldn’t analyze CRAN submissions.
The &lt;a href=&#34;https://github.com/lockedata/cransays&#34;&gt;cransays&lt;/a&gt; package tracks package submissions, and its online documentation provides a &lt;a href=&#34;https://lockedata.github.io/cransays/articles/dashboard.html&#34;&gt;dashboard&lt;/a&gt; updated every hour.
Since 2020/09/12 the status of the submission queues and folders has been saved on a branch.
Using this information, and building on a &lt;a href=&#34;https://github.com/tjtnew/newbies&#34;&gt;script provided by Tim Taylor&lt;/a&gt;, I’ll check how CRAN submissions are handled.&lt;/p&gt;
&lt;p&gt;I’ll look at the &lt;a href=&#34;#cran-load&#34;&gt;CRAN queue&lt;/a&gt;, explore some &lt;a href=&#34;#time-patterns&#34;&gt;time patterns&lt;/a&gt; and check the meaning of the &lt;a href=&#34;#subfolder&#34;&gt;subfolders&lt;/a&gt;.
Then I’ll move on to more &lt;a href=&#34;#information-for-submitters&#34;&gt;practical information&lt;/a&gt; for people submitting a package.
Lastly, we’ll see how hard the CRAN team’s job is by looking at the reliability of the &lt;a href=&#34;#GHAR&#34;&gt;GitHub action used&lt;/a&gt;.&lt;/p&gt;
&lt;p&gt;Before all this, some preliminary work is needed to download and clean the data:&lt;/p&gt;
&lt;pre class=&#34;r&#34;&gt;&lt;code&gt;# Downloading the cransays repository branch history
download.file(&amp;quot;https://github.com/lockedata/cransays/archive/history.zip&amp;quot;, 
              destfile = &amp;quot;static/cransays-history.zip&amp;quot;)
path_zip &amp;lt;- here::here(&amp;quot;static&amp;quot;, &amp;quot;cransays-history.zip&amp;quot;) 
# We unzip the files to read them
dat &amp;lt;- unzip(path_zip, exdir = &amp;quot;static&amp;quot;)
csv &amp;lt;- dat[grepl(&amp;quot;*.csv$&amp;quot;, x = dat)]
f &amp;lt;- lapply(csv, read.csv)
m &amp;lt;- function(x, y) {
  merge(x, y, sort = FALSE, all = TRUE)
}
updates &amp;lt;- Reduce(m, f) # Merge all files (Because the file format changed)
write.csv(updates, file = &amp;quot;static/cran_till_now.csv&amp;quot;,  row.names = FALSE)
# Clean up
unlink(&amp;quot;static/cransays-history/&amp;quot;, recursive = TRUE)
unlink(&amp;quot;static/cransays-history.zip&amp;quot;, recursive = TRUE)&lt;/code&gt;&lt;/pre&gt;
&lt;p&gt;Once we have the data we can load it, and we load the libraries used for the analysis:&lt;/p&gt;
&lt;pre class=&#34;r&#34;&gt;&lt;code&gt;library(&amp;quot;tidyverse&amp;quot;)
library(&amp;quot;lubridate&amp;quot;)
library(&amp;quot;hms&amp;quot;)
path_file &amp;lt;- here::here(&amp;quot;static&amp;quot;, &amp;quot;cran_till_now.csv&amp;quot;)
cran_submissions &amp;lt;- read.csv(path_file)
theme_set(theme_minimal()) # For plotting
col_names &amp;lt;- c(&amp;quot;package&amp;quot;, &amp;quot;version&amp;quot;, &amp;quot;snapshot_time&amp;quot;, &amp;quot;folder&amp;quot;, &amp;quot;subfolder&amp;quot;)
cran_submissions &amp;lt;- cran_submissions[, col_names]&lt;/code&gt;&lt;/pre&gt;
&lt;p&gt;The period we are going to analyze runs from the beginning of the records until 2021-01-30.
It includes some well-earned holiday time for the CRAN team, during which submissions were not possible.&lt;/p&gt;
&lt;p&gt;I’ve read some comments about inconsistencies in where the CRAN team’s holidays are reported, and I couldn’t find them for previous years.&lt;/p&gt;
&lt;p&gt;For the 4 months we are analyzing, which include only one holiday period, I used a &lt;a href=&#34;https://twitter.com/krlmlr/status/1346005787668336640&#34;&gt;screenshot&lt;/a&gt; found on Twitter.&lt;/p&gt;
&lt;pre class=&#34;r&#34;&gt;&lt;code&gt;holidays &amp;lt;- data.frame(
  start = as.POSIXct(&amp;quot;18/12/2020&amp;quot;, format = &amp;quot;%d/%m/%Y&amp;quot;, tz = &amp;quot;UTC&amp;quot;), 
  end = as.POSIXct(&amp;quot;04/01/2021&amp;quot;, format = &amp;quot;%d/%m/%Y&amp;quot;, tz = &amp;quot;UTC&amp;quot;)
)&lt;/code&gt;&lt;/pre&gt;
&lt;p&gt;Now that we have the holidays in a single data.frame, it’s time to explore and clean the collected data:&lt;/p&gt;
&lt;div id=&#34;cleaning-the-data&#34; class=&#34;section level2&#34;&gt;
&lt;h2&gt;Cleaning the data&lt;/h2&gt;
&lt;p&gt;After merging everything into one big file we can load and work with it.
First step: check the data and prepare it for what we want:&lt;/p&gt;
&lt;pre class=&#34;r&#34;&gt;&lt;code&gt;# Use appropriate class
cran_submissions$snapshot_time &amp;lt;- as.POSIXct(cran_submissions$snapshot_time,
                                             tz = &amp;quot;UTC&amp;quot;)
# Fix subfolders structure
cran_submissions$subfolder[cran_submissions$subfolder %in% c(&amp;quot;&amp;quot;, &amp;quot;/&amp;quot;)] &amp;lt;- NA
# Remove files or submissions without version number
cran_submissions &amp;lt;- cran_submissions[!is.na(cran_submissions$version), ]
cran_submissions &amp;lt;- distinct(cran_submissions, 
                             snapshot_time, folder, package, version, subfolder,
                             .keep_all = TRUE)&lt;/code&gt;&lt;/pre&gt;
&lt;p&gt;After loading, we do a preliminary cleanup: set the date format, homogenize the folder format, remove submissions that are not packages (yes, there are PDFs and other files in the queue) and remove duplicates. Then we can start.&lt;/p&gt;
&lt;p&gt;As always, we start with some checks of the data.
Note: I should follow this advice more often myself, as this is the last section I wrote for this post.&lt;/p&gt;
&lt;pre class=&#34;r&#34;&gt;&lt;code&gt;packages_multiple_versions &amp;lt;- cran_submissions %&amp;gt;% 
  group_by(package, snapshot_time) %&amp;gt;% 
  summarize(n = n_distinct(version)) %&amp;gt;% 
  filter(n != 1) %&amp;gt;% 
  distinct(package) %&amp;gt;% 
  pull(package)&lt;/code&gt;&lt;/pre&gt;
&lt;p&gt;There are 92 packages with multiple versions in the CRAN queue at the same time.&lt;/p&gt;
&lt;p&gt;Perhaps this is because packages are left in several folders (2 or even 3) at the same time:&lt;/p&gt;
&lt;pre class=&#34;r&#34;&gt;&lt;code&gt;package_multiple &amp;lt;- cran_submissions %&amp;gt;% 
  group_by(snapshot_time, package) %&amp;gt;% 
  count() %&amp;gt;% 
  group_by(snapshot_time) %&amp;gt;% 
  count(n) %&amp;gt;% 
  filter(n != 1) %&amp;gt;% 
  summarise(n = sum(nn)) %&amp;gt;% 
  ungroup()
ggplot(package_multiple) +
  geom_point(aes(snapshot_time, n), size = 1) +
  geom_rect(data = holidays, aes(xmin = start, xmax = end, ymin = 0, ymax = 6),
            alpha = 0.25, fill = &amp;quot;red&amp;quot;) +
  annotate(&amp;quot;text&amp;quot;, x = holidays$start + (holidays$end - holidays$start)/2, 
           y = 3.5, label = &amp;quot;CRAN holidays&amp;quot;) +
  scale_x_datetime(date_labels = &amp;quot;%Y/%m/%d&amp;quot;, date_breaks = &amp;quot;2 weeks&amp;quot;, 
                   expand = expansion()) +
  scale_y_continuous(expand = expansion()) +
  labs(title = &amp;quot;Packages in multiple folders and subfolders&amp;quot;, 
       x = element_blank(), y = element_blank())&lt;/code&gt;&lt;/pre&gt;
&lt;p&gt;&lt;img src=&#34;https://llrs.dev/post/2021/01/31/cran-review/index_files/figure-html/package-multiple-folders-1.png&#34; width=&#34;120%&#34; /&gt;&lt;/p&gt;
&lt;p&gt;This happens in 1915 of the 3260 snapshots, probably due to the manual labor of the CRAN reviews.
I don’t really know the cause: it could be an error in the script recording the data, or data being copied around the server.
But perhaps it indicates that further improvements and automation of the process are possible.&lt;/p&gt;
&lt;pre class=&#34;r&#34;&gt;&lt;code&gt;cran_submissions &amp;lt;- cran_submissions %&amp;gt;% 
  arrange(package, snapshot_time, version, folder) %&amp;gt;% 
  group_by(package, snapshot_time) %&amp;gt;% 
  mutate(n = 1:n()) %&amp;gt;% 
  filter(n == n()) %&amp;gt;% 
  ungroup() %&amp;gt;% 
  select(-n)&lt;/code&gt;&lt;/pre&gt;
&lt;p&gt;We have now removed ~3500 records of packages that had two versions in the queue.
Next we check packages in multiple folders but with the same version, and remove duplicates until we are left with a single record (assuming the review process has no parallel steps):&lt;/p&gt;
&lt;pre class=&#34;r&#34;&gt;&lt;code&gt;cran_submissions &amp;lt;- cran_submissions %&amp;gt;% 
  arrange(package, snapshot_time, folder) %&amp;gt;% 
  group_by(package, snapshot_time) %&amp;gt;% 
  mutate(n = 1:n()) %&amp;gt;% 
  filter(n == n()) %&amp;gt;% 
  ungroup() %&amp;gt;% 
  select(-n)&lt;/code&gt;&lt;/pre&gt;
&lt;p&gt;Last, we add the number of submissions in this period for each package:&lt;/p&gt;
&lt;pre class=&#34;r&#34;&gt;&lt;code&gt;diff0 &amp;lt;- structure(0, class = &amp;quot;difftime&amp;quot;, units = &amp;quot;hours&amp;quot;)
cran_submissions &amp;lt;- cran_submissions %&amp;gt;% 
  arrange(package, version, snapshot_time) %&amp;gt;% 
  group_by(package) %&amp;gt;% 
  # Packages last seen in queue less than 24 ago are considered same submission
  mutate(diff_time = difftime(snapshot_time,  lag(snapshot_time), units = &amp;quot;hour&amp;quot;),
         diff_time = if_else(is.na(diff_time), diff0, diff_time), # Fill NAs
         diff_v = version != lag(version),
         diff_v = ifelse(is.na(diff_v), TRUE, diff_v), # Fill NAs
         new_version = !near(diff_time, 1, tol = 24) &amp;amp; diff_v, 
         new_version = if_else(new_version == FALSE &amp;amp; diff_time == 0, 
                               TRUE, new_version),
         submission_n = cumsum(as.numeric(new_version))) %&amp;gt;%
  ungroup() %&amp;gt;% 
  select(-diff_time, -diff_v, -new_version)&lt;/code&gt;&lt;/pre&gt;
&lt;p&gt;Because a release is sometimes followed quickly by an update fixing bugs in the newly introduced features, a package not seen in the queue for 24 hours is considered a new submission.
A version change also counts as a new submission, but not when the version changes while addressing feedback from the reviewers.&lt;/p&gt;
&lt;p&gt;Now we have the data ready for further analysis.&lt;/p&gt;
&lt;/div&gt;
&lt;div id=&#34;cran-load&#34; class=&#34;section level2&#34;&gt;
&lt;h2&gt;CRAN load&lt;/h2&gt;
&lt;p&gt;We all know that CRAN is busy with updates that fix bugs or improve packages, and with requests to include new packages in the repository.&lt;/p&gt;
&lt;p&gt;A first plot shows the number of distinct packages in the queue at each moment:&lt;/p&gt;
&lt;pre class=&#34;r&#34;&gt;&lt;code&gt;cran_queue &amp;lt;- cran_submissions %&amp;gt;% 
  group_by(snapshot_time) %&amp;gt;% 
  summarize(n = n_distinct(package))
ggplot(cran_queue) +
  geom_rect(aes(xmin = start, xmax = end, ymin = 0, ymax = 230),
            alpha = 0.5, fill = &amp;quot;red&amp;quot;, data = holidays) +
  annotate(&amp;quot;text&amp;quot;, x = holidays$start + (holidays$end - holidays$start)/2, 
           y = 150, label = &amp;quot;CRAN holidays&amp;quot;) +
  geom_path(aes(snapshot_time, n)) +
  scale_x_datetime(date_labels = &amp;quot;%Y/%m/%d&amp;quot;, date_breaks = &amp;quot;2 weeks&amp;quot;, 
                   expand = expansion()) +
  scale_y_continuous(expand = expansion()) +
  labs(x = element_blank(), y = element_blank(), 
       title = &amp;quot;Packages on CRAN review process&amp;quot;)&lt;/code&gt;&lt;/pre&gt;
&lt;p&gt;&lt;img src=&#34;https://llrs.dev/post/2021/01/31/cran-review/index_files/figure-html/cran-queues-1.png&#34; width=&#34;120%&#34; /&gt;&lt;/p&gt;
&lt;p&gt;We can see some ups and downs, with the queue ranging between 50 and 200 packages.&lt;/p&gt;
&lt;p&gt;There are some instances where the number of packages in the queue drops suddenly and then recovers to previous levels.
As far as I know, this is a visual artifact.&lt;/p&gt;
&lt;p&gt;We can also see that people do not tend to rush to push their packages before the holidays.
But clearly there is some build-up of submissions afterwards, as the highest number of packages in the queue is reached after the holidays.&lt;/p&gt;
&lt;p&gt;Classifying packages into folders seems to be part of the CRAN review process:&lt;/p&gt;
&lt;pre class=&#34;r&#34;&gt;&lt;code&gt;man_colors &amp;lt;- RColorBrewer::brewer.pal(8, &amp;quot;Dark2&amp;quot;)
names(man_colors) &amp;lt;- unique(cran_submissions$folder)
cran_submissions %&amp;gt;% 
  group_by(folder, snapshot_time) %&amp;gt;% 
  summarize(packages = n_distinct(package)) %&amp;gt;% 
  ggplot() +
  geom_rect(data = holidays, aes(xmin = start, xmax = end, ymin = 0, ymax = 200),
            alpha = 0.25, fill = &amp;quot;red&amp;quot;) +
  annotate(&amp;quot;text&amp;quot;, x = holidays$start + (holidays$end - holidays$start)/2, 
           y = 105, label = &amp;quot;CRAN holidays&amp;quot;) +
  geom_path(aes(snapshot_time, packages, col = folder)) +
  scale_x_datetime(date_labels = &amp;quot;%Y/%m/%d&amp;quot;, date_breaks = &amp;quot;2 weeks&amp;quot;, 
                   expand = expansion()) +
  scale_y_continuous(expand = expansion()) +
  scale_color_manual(values = man_colors) +
  labs(x = element_blank(), y = element_blank(),
       title = &amp;quot;Packages by folder&amp;quot;, col = &amp;quot;Folder&amp;quot;) +
  theme(legend.position = c(0.6, 0.7))&lt;/code&gt;&lt;/pre&gt;
&lt;p&gt;&lt;img src=&#34;https://llrs.dev/post/2021/01/31/cran-review/index_files/figure-html/cran-submissions-1.png&#34; width=&#34;120%&#34; /&gt;&lt;/p&gt;
&lt;p&gt;The queue trend is mostly driven by the newbies folder (which ranges between 25 and 150) and, after the holidays, by the pretest folder.&lt;/p&gt;
&lt;p&gt;Surprisingly, when the queue is split by folder we don’t see those sudden drops.
This might indicate that there is a clean-up of some of the folders&lt;a href=&#34;#fn1&#34; class=&#34;footnote-ref&#34; id=&#34;fnref1&#34;&gt;&lt;sup&gt;1&lt;/sup&gt;&lt;/a&gt;.
What we clearly see is a clean-up of all the folders during the holidays, when almost everything was cleared.&lt;/p&gt;
&lt;p&gt;Also, the pretest peak comes before the rise of the newbies folder, so it seems these tests are run only on new packages.&lt;/p&gt;
The other folders do not have such an increase
&lt;details&gt;
&lt;summary&gt;
after the holidays.
&lt;/summary&gt;
&lt;pre class=&#34;r&#34;&gt;&lt;code&gt;cran_submissions %&amp;gt;% 
  group_by(folder, snapshot_time) %&amp;gt;% 
  summarize(packages = n_distinct(package)) %&amp;gt;% 
  filter(snapshot_time &amp;gt;= holidays$start) %&amp;gt;% 
  ggplot() +
  geom_path(aes(snapshot_time, packages, col = folder)) +
  geom_rect(data = holidays, aes(xmin = start, xmax = end, ymin = 0, ymax = 200),
            alpha = 0.25, fill = &amp;quot;red&amp;quot;) +
  annotate(&amp;quot;text&amp;quot;, x = holidays$start + (holidays$end - holidays$start)/2, 
           y = 105, label = &amp;quot;CRAN holidays&amp;quot;) +
  scale_x_datetime(date_labels = &amp;quot;%Y/%m/%d&amp;quot;, date_breaks = &amp;quot;1 day&amp;quot;, 
                   expand = expansion()) +
  scale_y_continuous(expand = expansion(), limits = c(0, NA)) +
  scale_color_manual(values = man_colors) +
  labs(x = element_blank(), y = element_blank(),
       title = &amp;quot;Holidays&amp;quot;, col = &amp;quot;Folder&amp;quot;) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        legend.position = c(0.8, 0.7))&lt;/code&gt;&lt;/pre&gt;
&lt;p&gt;&lt;img src=&#34;https://llrs.dev/post/2021/01/31/cran-review/index_files/figure-html/cran-holidays-zoom-1.png&#34; width=&#34;120%&#34; /&gt;&lt;/p&gt;
&lt;p&gt;It seems that on the 31st there was a clean-up of some packages on the waiting list.
We can also see the increase in submissions during the first week of January, as described previously.&lt;/p&gt;
&lt;/details&gt;
&lt;/div&gt;
&lt;div id=&#34;time-patterns&#34; class=&#34;section level2&#34;&gt;
&lt;h2&gt;Time patterns&lt;/h2&gt;
&lt;p&gt;Some people have expressed that they try to submit to CRAN when there are few packages in the queue.
Thus, looking at when these low moments happen could be relevant.
We can look for patterns in the queue:&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;&lt;a href=&#34;#day-month&#34;&gt;Day of the month&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a href=&#34;#day-week&#34;&gt;Day of the week&lt;/a&gt;&lt;/li&gt;
&lt;/ul&gt;
&lt;p&gt;Note: I have little to no experience with time series, so the following plots just use the defaults of `geom_smooth`, omitting the holidays.&lt;/p&gt;
&lt;div id=&#34;day-month&#34; class=&#34;section level3&#34;&gt;
&lt;h3&gt;By day of the month&lt;/h3&gt;
&lt;p&gt;Looking at each folder:&lt;/p&gt;
&lt;pre class=&#34;r&#34;&gt;&lt;code&gt;cran_times &amp;lt;- cran_submissions %&amp;gt;% 
  mutate(seconds = seconds(snapshot_time),
         month = month(snapshot_time),
         mday = mday(snapshot_time),
         wday = wday(snapshot_time, locale = &amp;quot;en_GB.UTF-8&amp;quot;),
         week = week(snapshot_time),
         date = as_date(snapshot_time))
cran_times %&amp;gt;% 
  arrange(folder, date, mday) %&amp;gt;% 
  filter(snapshot_time &amp;lt; holidays$start | snapshot_time  &amp;gt; holidays$end) %&amp;gt;% 
  group_by(folder, date, mday) %&amp;gt;% 
  summarize(packages = n_distinct(package),
            week = unique(week)) %&amp;gt;% 
  group_by(folder, mday) %&amp;gt;% 
  ggplot() +
  geom_smooth(aes(mday, packages, col = folder)) +
  labs(x = &amp;quot;Day of the month&amp;quot;, y = &amp;quot;Packages&amp;quot;, col = &amp;quot;Folder&amp;quot;,
       title = &amp;quot;Evolution by month day&amp;quot;) +
  scale_color_manual(values = man_colors) +
  coord_cartesian(ylim = c(0, NA), xlim = c(1, NA)) +
  scale_x_continuous(expand = expansion()) +
  scale_y_continuous(expand = expansion()) &lt;/code&gt;&lt;/pre&gt;
&lt;p&gt;&lt;img src=&#34;https://llrs.dev/post/2021/01/31/cran-review/index_files/figure-html/cran-monthly-1.png&#34; width=&#34;120%&#34; /&gt;&lt;/p&gt;
&lt;p&gt;At the beginning and end of the month there is more variation in several folders (this could also be because there is no information for the end of December and the beginning of January).
There seems to be an increase in &lt;strong&gt;new package submissions towards the beginning of the month&lt;/strong&gt; and later an increase in the newbies folder by the middle of the month.&lt;/p&gt;
&lt;/div&gt;
&lt;div id=&#34;day-week&#34; class=&#34;section level3&#34;&gt;
&lt;h3&gt;By day of the week&lt;/h3&gt;
&lt;p&gt;I first thought about this because I was curious whether there are more submissions on weekends (when aficionados and open-source developers might have more time) than during the rest of the week.&lt;/p&gt;
&lt;pre class=&#34;r&#34;&gt;&lt;code&gt;cran_times %&amp;gt;% 
  filter(snapshot_time &amp;lt; holidays$start | snapshot_time  &amp;gt; holidays$end) %&amp;gt;% 
  group_by(folder, date, wday) %&amp;gt;% 
  summarize(packages = n_distinct(package),
            week = unique(week)) %&amp;gt;% 
  group_by(folder, wday) %&amp;gt;% 
  ggplot() +
  geom_smooth(aes(wday, packages, col = folder)) +
  labs(x = &amp;quot;Day of the week&amp;quot;, y = &amp;quot;Packages&amp;quot;, col = &amp;quot;Folder&amp;quot;,
       title = &amp;quot;Evolution by week day&amp;quot;) +
  scale_color_manual(values = man_colors) +
  scale_x_continuous(breaks = 1:7, expand = expansion()) +
  scale_y_continuous(expand = expansion(), limits = c(0, NA))&lt;/code&gt;&lt;/pre&gt;
&lt;p&gt;&lt;img src=&#34;https://llrs.dev/post/2021/01/31/cran-review/index_files/figure-html/cran-wday-1.png&#34; width=&#34;120%&#34; /&gt;&lt;/p&gt;
&lt;p&gt;We see a &lt;strong&gt;rise towards the middle of the week&lt;/strong&gt; of packages in the pretest folder, indicating new package submissions.&lt;/p&gt;
&lt;/div&gt;
&lt;div id=&#34;other-folders&#34; class=&#34;section level3&#34;&gt;
&lt;h3&gt;Other folders&lt;/h3&gt;
&lt;p&gt;There are some folders that seem to belong to &lt;a href=&#34;https://www.r-project.org/contributors.html&#34;&gt;R Contributors&lt;/a&gt;.
We see that some packages pass through these folders:&lt;/p&gt;
&lt;pre class=&#34;r&#34;&gt;&lt;code&gt;cran_members &amp;lt;- c(&amp;quot;LH&amp;quot;, &amp;quot;GS&amp;quot;, &amp;quot;JH&amp;quot;)
cran_times %&amp;gt;% 
  filter(subfolder %in% cran_members) %&amp;gt;% 
  group_by(subfolder, snapshot_time) %&amp;gt;% 
  summarize(packages = n_distinct(package)) %&amp;gt;% 
  ggplot() +
  geom_smooth(aes(snapshot_time, packages, col = subfolder)) +
    labs(x = element_blank(), y = element_blank(), col = &amp;quot;Folder&amp;quot;,
       title = &amp;quot;Packages on folders&amp;quot;) +
  scale_y_continuous(expand = expansion(), breaks = 0:10) +
  coord_cartesian(y = c(0, NA))  +
  scale_x_datetime(date_labels = &amp;quot;%Y/%m/%d&amp;quot;, date_breaks = &amp;quot;2 weeks&amp;quot;, 
               expand = expansion(add = 2)) +
  theme(legend.position = c(0.1, 0.8))&lt;/code&gt;&lt;/pre&gt;
&lt;p&gt;&lt;img src=&#34;https://llrs.dev/post/2021/01/31/cran-review/index_files/figure-html/subfolder-pattern-1.png&#34; width=&#34;120%&#34; /&gt;&lt;/p&gt;
&lt;p&gt;There doesn’t seem to be any rule about using those folders, or the work was so quick that the hourly-updated data didn’t record it.&lt;/p&gt;
&lt;details&gt;
&lt;summary&gt;
Looking for any temporal pattern on those folders isn’t worth it.
&lt;/summary&gt;
&lt;pre class=&#34;r&#34;&gt;&lt;code&gt;cran_times %&amp;gt;% 
  filter(subfolder %in% cran_members) %&amp;gt;% 
  group_by(subfolder, mday) %&amp;gt;% 
  summarize(packages = n_distinct(package)) %&amp;gt;% 
  ungroup() %&amp;gt;% 
  ggplot() +
  geom_smooth(aes(mday, packages, col = subfolder)) +
  labs(x = &amp;quot;Day of the month&amp;quot;, y = &amp;quot;Packages&amp;quot;, col = &amp;quot;Subfolder&amp;quot;,
       title = &amp;quot;Packages on subfolders by day of the month&amp;quot;) +
  scale_y_continuous(expand = expansion()) +
  scale_x_continuous(expand = expansion(), breaks = c(1,7,14,21,29)) +
  coord_cartesian(ylim = c(0, NA))&lt;/code&gt;&lt;/pre&gt;
&lt;p&gt;&lt;img src=&#34;https://llrs.dev/post/2021/01/31/cran-review/index_files/figure-html/subfolder-mday-1.png&#34; width=&#34;120%&#34; /&gt;&lt;/p&gt;
&lt;p&gt;Low numbers of packages and great variability (except for those that have just one package in the folder) by day of the month.&lt;/p&gt;
&lt;pre class=&#34;r&#34;&gt;&lt;code&gt;cran_times %&amp;gt;% 
  filter(subfolder %in% cran_members) %&amp;gt;% 
  group_by(subfolder, wday) %&amp;gt;% 
  summarize(packages = n_distinct(package)) %&amp;gt;% 
  ungroup() %&amp;gt;% 
  ggplot() +
  geom_smooth(aes(wday, packages, col = subfolder)) +
  labs(x = &amp;quot;Day of the week&amp;quot;, y = &amp;quot;Packages&amp;quot;, col = &amp;quot;Subfolder&amp;quot;,
       title = &amp;quot;Evolution by week day&amp;quot;) +
  scale_y_continuous(expand = expansion()) +
  scale_x_continuous(breaks = 1:7, expand = expansion()) +
  coord_cartesian(ylim =  c(0, NA))&lt;/code&gt;&lt;/pre&gt;
&lt;p&gt;&lt;img src=&#34;https://llrs.dev/post/2021/01/31/cran-review/index_files/figure-html/subfolder-wday-1.png&#34; width=&#34;120%&#34; /&gt;&lt;/p&gt;
&lt;p&gt;There seem to be only two people usually working with their own folders.
I suppose there isn’t a common set of rules the reviewers follow.&lt;/p&gt;
&lt;/details&gt;
&lt;/div&gt;
&lt;/div&gt;
&lt;div id=&#34;information-for-submitters&#34; class=&#34;section level2&#34;&gt;
&lt;h2&gt;Information for submitters&lt;/h2&gt;
&lt;p&gt;I’ve read lots of comments recently around CRAN submissions.
However, with the little data available compared to the open reviews of &lt;a href=&#34;https://llrs.dev/2020/07/bioconductor-submissions-reviews/&#34; title=&#34;Analysis of Bioconductor reviews&#34;&gt;Bioconductor&lt;/a&gt; and &lt;a href=&#34;https://llrs.dev/2020/09/ropensci-submissions/&#34; title=&#34;Analysis of rOpenSci reviews&#34;&gt;rOpenSci&lt;/a&gt; it is hard to answer them (see those related posts).
On Bioconductor and rOpenSci it is possible to see the people involved, the messages from the reviewers and other interested parties, the steps taken to be accepted…&lt;/p&gt;
&lt;p&gt;One of the big questions we can provide information about with the data available is how long a package stays in the queue:&lt;/p&gt;
&lt;pre class=&#34;r&#34;&gt;&lt;code&gt;subm &amp;lt;- cran_times %&amp;gt;%
  arrange(snapshot_time) %&amp;gt;% 
  select(package, version, submission_n, snapshot_time) %&amp;gt;% 
  group_by(package, submission_n) %&amp;gt;% 
  filter(row_number() %in% c(1, last(row_number()))) %&amp;gt;% 
  arrange(package, submission_n)&lt;/code&gt;&lt;/pre&gt;
&lt;p&gt;There are 429 packages that are only seen once.
These might be abandoned, delayed or rejected submissions; others might be acceptances in less than an hour&lt;a href=&#34;#fn2&#34; class=&#34;footnote-ref&#34; id=&#34;fnref2&#34;&gt;&lt;sup&gt;2&lt;/sup&gt;&lt;/a&gt;.&lt;/p&gt;
&lt;p&gt;If we look at the package submissions by date we can see the quick increase of packages:&lt;/p&gt;
&lt;pre class=&#34;r&#34;&gt;&lt;code&gt;rsubm &amp;lt;- subm %&amp;gt;% 
  filter(n_distinct(snapshot_time) %% 2 == 0) %&amp;gt;%
  select(-version) %&amp;gt;% 
  mutate(time = c(&amp;quot;start&amp;quot;, &amp;quot;end&amp;quot;)) %&amp;gt;% 
  pivot_wider(values_from = snapshot_time, names_from = time) %&amp;gt;% 
  ungroup() %&amp;gt;% 
  mutate(r = row_number(), 
         time  =  round(difftime(end, start, units = &amp;quot;hour&amp;quot;), 0)) %&amp;gt;% 
  ungroup()
lv &amp;lt;- levels(fct_reorder(rsubm$package, rsubm$start, .fun = min, .desc = FALSE))
ggplot(rsubm) +
  geom_rect(data = holidays, aes(xmin = start, xmax = end), 
            ymin = first(lv), ymax = last(lv), alpha = 0.5, fill = &amp;quot;red&amp;quot;) +
  geom_linerange(aes(y = fct_reorder(package, start, .fun = min, .desc = FALSE),
                      x = start, xmin = start, xmax = end, 
                     col = as.factor(submission_n))) + 
  labs(x = element_blank(), y = element_blank(), title = 
         &amp;quot;Packages on the queue&amp;quot;, col = &amp;quot;Submissions&amp;quot;) +
  scale_x_datetime(date_labels = &amp;quot;%Y/%m/%d&amp;quot;, date_breaks = &amp;quot;2 weeks&amp;quot;, 
                   expand = expansion(add = 2)) +
  scale_colour_viridis_d() +
  theme_minimal() +
  theme(panel.grid.major.y = element_blank(),
        axis.text.y = element_blank(),
        legend.position = c(0.15, 0.7))&lt;/code&gt;&lt;/pre&gt;
&lt;p&gt;&lt;img src=&#34;https://llrs.dev/post/2021/01/31/cran-review/index_files/figure-html/resubm-1.png&#34; width=&#34;120%&#34; /&gt;&lt;/p&gt;
&lt;p&gt;Some packages were submitted more than 5 times in this period. Recall the definition of submission used: a package with a different version number after 24 hours, or one that wasn’t seen in the queue for the last 24 hours (even with the same version number).&lt;/p&gt;
&lt;p&gt;Some authors change the version number when CRAN reviewers require changes before accepting the package, while others do not and only change the version number according to their release cycle.&lt;/p&gt;
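&lt;p&gt;As a rough illustration, the submission heuristic just described could be sketched like this (a hypothetical sketch of mine, not the exact code of the analysis; it assumes a data frame with `package`, `version` and `snapshot_time` columns):&lt;/p&gt;
&lt;pre class=&#34;r&#34;&gt;&lt;code&gt;library(&amp;quot;dplyr&amp;quot;, warn.conflicts = FALSE)
# Flag a new submission when the version changes or when the package
# was absent from the queue for more than 24 hours.
flag_submissions &amp;lt;- function(history) {
  history %&amp;gt;%
    arrange(package, snapshot_time) %&amp;gt;%
    group_by(package) %&amp;gt;%
    mutate(gap = difftime(snapshot_time, lag(snapshot_time), units = &amp;quot;hours&amp;quot;),
           new_submission = is.na(gap) | version != lag(version) | gap &amp;gt; 24,
           submission_n = cumsum(new_submission)) %&amp;gt;%
    ungroup()
}&lt;/code&gt;&lt;/pre&gt;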
&lt;pre class=&#34;r&#34;&gt;&lt;code&gt;rsubm %&amp;gt;% 
  arrange(start) %&amp;gt;% 
  filter(start &amp;lt; holidays$start, # Look only before the holidays
    submission_n == 1,# Use only the first submission
    start &amp;gt; min(start)) %&amp;gt;%   # Just new submissions
  mutate(r = row_number(),
         start1 = as.numeric(seconds(start))) %&amp;gt;% 
  lm(start1 ~ r, data = .) %&amp;gt;% 
  broom::tidy() %&amp;gt;%  
  mutate(estimate = estimate/(60*60)) # Hours
## # A tibble: 2 x 5
##   term         estimate std.error statistic p.value
##   &amp;lt;chr&amp;gt;           &amp;lt;dbl&amp;gt;     &amp;lt;dbl&amp;gt;     &amp;lt;dbl&amp;gt;   &amp;lt;dbl&amp;gt;
## 1 (Intercept) 444325.     6321.     253047.       0
## 2 r                1.03      4.76      779.       0&lt;/code&gt;&lt;/pre&gt;
&lt;p&gt;More or less, there is a &lt;strong&gt;new package submission every hour&lt;/strong&gt; on CRAN.
Despite this submission rate, we can see that most submissions stay in the queue only a short time:&lt;/p&gt;
&lt;pre class=&#34;r&#34;&gt;&lt;code&gt;library(&amp;quot;patchwork&amp;quot;)
p1 &amp;lt;- rsubm %&amp;gt;% 
  group_by(package) %&amp;gt;% 
  summarize(time = sum(time)) %&amp;gt;% 
  ggplot() +
  geom_histogram(aes(time), bins = 100) +
  labs(title = &amp;quot;Packages total time on queue&amp;quot;, x = &amp;quot;Hours&amp;quot;, 
       y = element_blank()) +
  scale_x_continuous(expand = expansion()) +
  scale_y_continuous(expand = expansion())
p2 &amp;lt;- rsubm %&amp;gt;% 
  group_by(package) %&amp;gt;% 
  summarize(time = sum(time)) %&amp;gt;% 
  ggplot() +
  geom_histogram(aes(time), binwidth = 24) +
  coord_cartesian(xlim = c(0, 24*7)) +
  labs(subtitle = &amp;quot;Zoom&amp;quot;, x = &amp;quot;Hours&amp;quot;, y = element_blank()) +
  scale_x_continuous(expand = expansion(), breaks = seq(0, 24*7, by = 24)) +
  scale_y_continuous(expand = expansion()) +
  theme(panel.background = element_rect(colour = &amp;quot;white&amp;quot;))
p1 + inset_element(p2, 0.2, 0.2, 1, 1)&lt;/code&gt;&lt;/pre&gt;
&lt;p&gt;&lt;img src=&#34;https://llrs.dev/post/2021/01/31/cran-review/index_files/figure-html/package-time-queue-1.png&#34; width=&#34;120%&#34; /&gt;&lt;/p&gt;
&lt;p&gt;The accuracy of this data is not great:
I found some packages that remained in the submission queue, and were thus picked up by cransays, even after acceptance, so these times might be a bit overestimated.
Also, packages with a speedy submission that lasted less than an hour weren’t included.&lt;/p&gt;
&lt;p&gt;Looking at the recorded submissions might be more accurate:&lt;/p&gt;
&lt;pre class=&#34;r&#34;&gt;&lt;code&gt;p1 &amp;lt;- rsubm %&amp;gt;% 
  group_by(package, submission_n) %&amp;gt;% 
  summarize(time = sum(time)) %&amp;gt;% 
  ggplot() +
  geom_histogram(aes(time), bins = 100) +
  labs(title = &amp;quot;Submission time on queue&amp;quot;, x = &amp;quot;Hours&amp;quot;, 
       y = element_blank()) +
  scale_x_continuous(expand = expansion()) +
  scale_y_continuous(expand = expansion())
p2 &amp;lt;- rsubm %&amp;gt;% 
  group_by(package, submission_n) %&amp;gt;% 
  summarize(time = sum(time)) %&amp;gt;%  
  ggplot() +
  geom_histogram(aes(time), binwidth = 24) +
  coord_cartesian(xlim = c(0, 24*7)) +
  labs(subtitle = &amp;quot;Zoom&amp;quot;, x = &amp;quot;Hours&amp;quot;, y = element_blank()) +
  scale_x_continuous(expand = expansion(), breaks = seq(0, 24*7, by = 24)) +
  scale_y_continuous(expand = expansion()) +
  theme(panel.background = element_rect(colour = &amp;quot;white&amp;quot;))
p1 + inset_element(p2, 0.2, 0.2, 1, 1)&lt;/code&gt;&lt;/pre&gt;
&lt;p&gt;&lt;img src=&#34;https://llrs.dev/post/2021/01/31/cran-review/index_files/figure-html/submission-queue-1.png&#34; width=&#34;120%&#34; /&gt;&lt;/p&gt;
&lt;p&gt;Many submissions are short-lived.
Perhaps this hints that more testing should be done beforehand, that what to expect from the review should be clearer to authors, that they are approved very fast, or…&lt;/p&gt;
&lt;p&gt;If we look at the folders of each submission we’ll see a different picture:&lt;/p&gt;
&lt;pre class=&#34;r&#34;&gt;&lt;code&gt;subm2 &amp;lt;- cran_times %&amp;gt;%
  group_by(package, submission_n, folder) %&amp;gt;% 
  arrange(snapshot_time) %&amp;gt;% 
  select(package, version, submission_n, snapshot_time, folder) %&amp;gt;% 
  filter(row_number() %in% c(1, last(row_number()))) %&amp;gt;% 
  arrange(submission_n)
rsubm2 &amp;lt;- subm2 %&amp;gt;% 
  filter(n_distinct(snapshot_time) %% 2 == 0) %&amp;gt;%
  mutate(time = c(&amp;quot;start&amp;quot;, &amp;quot;end&amp;quot;)) %&amp;gt;% 
  pivot_wider(values_from = snapshot_time, names_from = time) %&amp;gt;% 
  ungroup() %&amp;gt;% 
  mutate(r = row_number(), 
         time  =  round(difftime(end, start, units = &amp;quot;hour&amp;quot;), 0)) %&amp;gt;% 
  ungroup() %&amp;gt;% 
  filter(!is.na(start), !is.na(end))
lv &amp;lt;- levels(fct_reorder(rsubm2$package, rsubm2$start, .fun = min, .desc = FALSE))
ggplot(rsubm2) +
  geom_rect(data = holidays, aes(xmin = start, xmax = end), 
            ymin = first(lv), ymax = last(lv), alpha = 0.5, fill = &amp;quot;red&amp;quot;) +
  geom_linerange(aes(y = fct_reorder(package, start, .fun = min, .desc = FALSE),
                      x = start, xmin = start, xmax = end, col = folder)) + 
  labs(x = element_blank(), y = element_blank(), title = 
         &amp;quot;Packages on the queue&amp;quot;) +
  scale_color_manual(values = man_colors) +
  scale_x_datetime(date_labels = &amp;quot;%Y/%m/%d&amp;quot;, date_breaks = &amp;quot;2 weeks&amp;quot;, 
               expand = expansion(add = 2)) +
  labs(col = &amp;quot;Folder&amp;quot;) +
  theme_minimal() +
  theme(panel.grid.major.y = element_blank(),
        axis.text.y = element_blank(),
        legend.position = c(0.2, 0.7))&lt;/code&gt;&lt;/pre&gt;
&lt;p&gt;&lt;img src=&#34;https://llrs.dev/post/2021/01/31/cran-review/index_files/figure-html/resubm2-1.png&#34; width=&#34;120%&#34; /&gt;&lt;/p&gt;
&lt;p&gt;It looks like some packages take a long time to change folder; perhaps the maintainers have trouble fixing the issues pointed out by the reviewers, or don’t have time to deal with them.
Some packages are recorded in just one folder and others go through multiple folders:&lt;/p&gt;
&lt;pre class=&#34;r&#34;&gt;&lt;code&gt;rsubm2 %&amp;gt;% 
  group_by(package, submission_n) %&amp;gt;% 
  summarize(n_folder = n_distinct(folder)) %&amp;gt;% 
  ggplot() + 
  geom_histogram(aes(n_folder), bins = 5) +
  labs(title = &amp;quot;Folders by submission&amp;quot;, x = element_blank(), 
       y = element_blank())&lt;/code&gt;&lt;/pre&gt;
&lt;p&gt;&lt;img src=&#34;https://llrs.dev/post/2021/01/31/cran-review/index_files/figure-html/submissions-n-folders-1.png&#34; width=&#34;120%&#34; /&gt;&lt;/p&gt;
&lt;p&gt;Most submissions end up in one folder, but some pass through up to 5 folders.&lt;/p&gt;
&lt;p&gt;Let’s see the 5 most common folder sequences of submissions:&lt;/p&gt;
&lt;pre class=&#34;r&#34;&gt;&lt;code&gt;compact_folders &amp;lt;- function(x) {
  y &amp;lt;- x != lag(x)
  y[1] &amp;lt;- TRUE
  x[y]
}
cran_times %&amp;gt;% 
  group_by(package, submission_n) %&amp;gt;% 
  summarize (folder = list(compact_folders(folder))) %&amp;gt;% 
  ungroup() %&amp;gt;% 
  count(folder, sort = TRUE) %&amp;gt;% 
  top_n(5) %&amp;gt;% 
  rename(Folders = folder, Frequency = n) %&amp;gt;% 
  as.data.frame()
##            Folders Frequency
## 1          pretest      1433
## 2 pretest, inspect       422
## 3          inspect       301
## 4 pretest, newbies       279
## 5          newbies       245&lt;/code&gt;&lt;/pre&gt;
&lt;p&gt;As expected, pretest and newbies are among the most frequent folders.&lt;/p&gt;
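&lt;p&gt;For illustration, `compact_folders` (defined above) drops consecutive repeats but keeps returns to a previous folder; this toy input is made up, not real queue data:&lt;/p&gt;
&lt;pre class=&#34;r&#34;&gt;&lt;code&gt;library(&amp;quot;dplyr&amp;quot;, warn.conflicts = FALSE) # for lag()
compact_folders(c(&amp;quot;pretest&amp;quot;, &amp;quot;pretest&amp;quot;, &amp;quot;inspect&amp;quot;, &amp;quot;inspect&amp;quot;, &amp;quot;pretest&amp;quot;))
## [1] &amp;quot;pretest&amp;quot; &amp;quot;inspect&amp;quot; &amp;quot;pretest&amp;quot;&lt;/code&gt;&lt;/pre&gt;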
&lt;p&gt;Another way of seeing whether it is the right moment to submit your package, aside from how many packages are in the queue, is looking at how much activity there is:&lt;/p&gt;
&lt;pre class=&#34;r&#34;&gt;&lt;code&gt;subm3 &amp;lt;- cran_times %&amp;gt;%
  arrange(snapshot_time) %&amp;gt;% 
  group_by(package) %&amp;gt;% 
  mutate(author_change = submission_n != lag(submission_n),
         cran_change = folder != lag(folder)) %&amp;gt;% 
  mutate(author_change = ifelse(is.na(author_change), TRUE, author_change),
         cran_change = ifelse(is.na(cran_change), FALSE, cran_change)) %&amp;gt;% 
  mutate(cran_change = case_when(subfolder != lag(subfolder) ~ TRUE,
                                 TRUE ~ cran_change)) %&amp;gt;% 
  ungroup()
subm3 %&amp;gt;% 
  group_by(snapshot_time) %&amp;gt;% 
  summarize(author_change = sum(author_change), cran_change = sum(cran_change)) %&amp;gt;% 
  filter(row_number() != 1) %&amp;gt;% 
  filter(author_change != 0 | cran_change != 0) %&amp;gt;% 
  ggplot() +
  geom_rect(data = holidays, aes(xmin = start, xmax = end), 
            ymin = -26, ymax = 26, alpha = 0.5, fill = &amp;quot;grey&amp;quot;) +
  geom_point(aes(snapshot_time, author_change), fill = &amp;quot;blue&amp;quot;, size = 0) +
  geom_area(aes(snapshot_time, author_change), fill = &amp;quot;blue&amp;quot;) +
  geom_point(aes(snapshot_time, -cran_change), fill = &amp;quot;red&amp;quot;, size = 0) +
  geom_area(aes(snapshot_time, -cran_change), fill = &amp;quot;red&amp;quot;) +
  scale_x_datetime(date_labels = &amp;quot;%Y/%m/%d&amp;quot;, date_breaks = &amp;quot;2 weeks&amp;quot;, 
                   expand = expansion(add = 2)) +
  scale_y_continuous(expand = expansion(add = c(0, 0))) + 
  coord_cartesian(ylim = c(-26, 26)) +
  annotate(&amp;quot;text&amp;quot;, label = &amp;quot;CRAN&amp;#39;s&amp;quot;, y = 20, x = as_datetime(&amp;quot;2020/11/02&amp;quot;)) +
  annotate(&amp;quot;text&amp;quot;, label = &amp;quot;Maintainers&amp;#39;&amp;quot;, y = -20, x = as_datetime(&amp;quot;2020/11/02&amp;quot;)) +
  labs(y = &amp;quot;Changes&amp;quot;, x = element_blank(), title = &amp;quot;Activity on CRAN:&amp;quot;)&lt;/code&gt;&lt;/pre&gt;
&lt;p&gt;&lt;img src=&#34;https://llrs.dev/post/2021/01/31/cran-review/index_files/figure-html/cran-pressure-1.png&#34; width=&#34;120%&#34; /&gt;&lt;/p&gt;
&lt;p&gt;In this plot we can see that folder changes and submissions are not simultaneous,
but they are quite frequent.&lt;/p&gt;
&lt;div id=&#34;review-process&#34; class=&#34;section level3&#34;&gt;
&lt;h3&gt;Review process&lt;/h3&gt;
&lt;p&gt;There is a &lt;a href=&#34;https://lockedata.github.io/cransays/articles/dashboard.html#cran-review-workflow&#34;&gt;scheme&lt;/a&gt; of how the review process works.
However, it has been pointed out that it needs an update.&lt;/p&gt;
&lt;p&gt;We’ve seen which folders come before which ones, but we haven’t looked at the last folder in which a package appears:&lt;/p&gt;
&lt;pre class=&#34;r&#34;&gt;&lt;code&gt;cran_times %&amp;gt;% 
  ungroup() %&amp;gt;% 
  group_by(package, submission_n) %&amp;gt;% 
  arrange(snapshot_time) %&amp;gt;% 
  filter(1:n() == last(n())) %&amp;gt;% 
  ungroup() %&amp;gt;% 
  count(folder, sort = TRUE) %&amp;gt;% 
  knitr::kable(col.names = c(&amp;quot;Last folder&amp;quot;, &amp;quot;Submissions&amp;quot;))&lt;/code&gt;&lt;/pre&gt;
&lt;table&gt;
&lt;thead&gt;
&lt;tr class=&#34;header&#34;&gt;
&lt;th align=&#34;left&#34;&gt;Last folder&lt;/th&gt;
&lt;th align=&#34;right&#34;&gt;Submissions&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr class=&#34;odd&#34;&gt;
&lt;td align=&#34;left&#34;&gt;pretest&lt;/td&gt;
&lt;td align=&#34;right&#34;&gt;1653&lt;/td&gt;
&lt;/tr&gt;
&lt;tr class=&#34;even&#34;&gt;
&lt;td align=&#34;left&#34;&gt;newbies&lt;/td&gt;
&lt;td align=&#34;right&#34;&gt;981&lt;/td&gt;
&lt;/tr&gt;
&lt;tr class=&#34;odd&#34;&gt;
&lt;td align=&#34;left&#34;&gt;inspect&lt;/td&gt;
&lt;td align=&#34;right&#34;&gt;890&lt;/td&gt;
&lt;/tr&gt;
&lt;tr class=&#34;even&#34;&gt;
&lt;td align=&#34;left&#34;&gt;recheck&lt;/td&gt;
&lt;td align=&#34;right&#34;&gt;555&lt;/td&gt;
&lt;/tr&gt;
&lt;tr class=&#34;odd&#34;&gt;
&lt;td align=&#34;left&#34;&gt;publish&lt;/td&gt;
&lt;td align=&#34;right&#34;&gt;469&lt;/td&gt;
&lt;/tr&gt;
&lt;tr class=&#34;even&#34;&gt;
&lt;td align=&#34;left&#34;&gt;waiting&lt;/td&gt;
&lt;td align=&#34;right&#34;&gt;441&lt;/td&gt;
&lt;/tr&gt;
&lt;tr class=&#34;odd&#34;&gt;
&lt;td align=&#34;left&#34;&gt;human&lt;/td&gt;
&lt;td align=&#34;right&#34;&gt;332&lt;/td&gt;
&lt;/tr&gt;
&lt;tr class=&#34;even&#34;&gt;
&lt;td align=&#34;left&#34;&gt;pending&lt;/td&gt;
&lt;td align=&#34;right&#34;&gt;225&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;
&lt;p&gt;We can see that many submissions were left at the pretest folder, and just a minority in the human or publish folders.&lt;/p&gt;
&lt;/div&gt;
&lt;div id=&#34;time-it-takes-to-disappear-from-the-system&#34; class=&#34;section level3&#34;&gt;
&lt;h3&gt;Time it takes to disappear from the system&lt;/h3&gt;
&lt;p&gt;One of the motivations for this post was a &lt;a href=&#34;https://stat.ethz.ch/pipermail/r-package-devel/2020q4/006174.html&#34;&gt;question on R-pkg-devel&lt;/a&gt; about how long it usually takes for a package to be accepted on CRAN.
We can look at how long each submission takes until it is removed from the queue:&lt;/p&gt;
&lt;pre class=&#34;r&#34;&gt;&lt;code&gt;package_submissions &amp;lt;- cran_times %&amp;gt;% 
  group_by(package, submission_n) %&amp;gt;% 
  summarise(submission_period = difftime(max(snapshot_time), 
                                         min(snapshot_time), 
                                         units = &amp;quot;hour&amp;quot;),
            submission_time = min(snapshot_time)) %&amp;gt;% 
  ungroup() %&amp;gt;% 
  filter(submission_period != 0)&lt;/code&gt;&lt;/pre&gt;
&lt;p&gt;This is a good approximation of how long it takes a package to be accepted or rejected, but some packages remain in the queue after they are accepted and appear on CRAN.
Joining this data with data from &lt;a href=&#34;https://r-pkg.org/&#34;&gt;metacran&lt;/a&gt; we could find out how often this happens,
but I leave that for the reader or some other post.
Let’s go back to the time spent in the queue:&lt;/p&gt;
&lt;pre class=&#34;r&#34;&gt;&lt;code&gt;package_submissions %&amp;gt;% 
  # filter(submission_time &amp;lt; holidays$start) %&amp;gt;% 
  ggplot() +
  geom_point(aes(submission_time, submission_period, col = submission_n)) +
  geom_rect(data = holidays, aes(xmin = start, xmax = end),
            ymin = 0, ymax = 3500, alpha = 0.5, fill = &amp;quot;red&amp;quot;) + 
  scale_x_datetime(date_labels = &amp;quot;%Y/%m/%d&amp;quot;, date_breaks = &amp;quot;2 weeks&amp;quot;,
                   expand = expansion(add = 10)) +
  scale_y_continuous(expand = expansion(add = 10)) +
  labs(title = &amp;quot;Time on the queue according to the submission&amp;quot;,
       x = &amp;quot;Submission&amp;quot;, y = &amp;quot;Time (hours)&amp;quot;, col = &amp;quot;Submission&amp;quot;) +
  theme(legend.position = c(0.5, 0.8))&lt;/code&gt;&lt;/pre&gt;
&lt;p&gt;&lt;img src=&#34;https://llrs.dev/post/2021/01/31/cran-review/index_files/figure-html/time-time-1.png&#34; width=&#34;120%&#34; /&gt;&lt;/p&gt;
&lt;p&gt;The diagonals suggest that the work is done in batches within a day or an afternoon.
The prominent diagonal after the holidays corresponds to packages still in the queue.&lt;/p&gt;
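&lt;p&gt;These accepted-but-still-queued packages are what the metacran cross-check mentioned earlier could identify. A hypothetical starting point (my own sketch; note that `tools::CRAN_package_db()` only lists the currently published version of each package, so older accepted versions wouldn’t match):&lt;/p&gt;
&lt;pre class=&#34;r&#34;&gt;&lt;code&gt;library(&amp;quot;dplyr&amp;quot;, warn.conflicts = FALSE)
# Queued package versions that match the version currently on CRAN,
# i.e. likely accepted while still appearing in the queue.
cran_db &amp;lt;- tools::CRAN_package_db()[, c(&amp;quot;Package&amp;quot;, &amp;quot;Version&amp;quot;)]
still_queued &amp;lt;- cran_times %&amp;gt;%
  distinct(package, version) %&amp;gt;%
  inner_join(cran_db, by = c(&amp;quot;package&amp;quot; = &amp;quot;Package&amp;quot;, &amp;quot;version&amp;quot; = &amp;quot;Version&amp;quot;))&lt;/code&gt;&lt;/pre&gt;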
&lt;p&gt;If we summarize by day and take the median over all first package submissions, we can see how long a package stays in the queue:&lt;/p&gt;
&lt;pre class=&#34;r&#34;&gt;&lt;code&gt;package_submissions %&amp;gt;% 
  filter(submission_n == 1) %&amp;gt;% 
  ungroup() %&amp;gt;%
  mutate(d = as.Date(submission_time)) %&amp;gt;%
  group_by(d) %&amp;gt;% 
  summarize(m = median(submission_period)) %&amp;gt;% 
  ggplot() +
  geom_rect(data = holidays, aes(xmin = as.Date(start), xmax = as.Date(end)),
            ymin = 0, ymax = 80, alpha = 0.5, fill = &amp;quot;red&amp;quot;) + 
  geom_smooth(aes(d, m)) +
  coord_cartesian(ylim = c(0, NA)) +
  scale_x_date(date_labels = &amp;quot;%Y/%m/%d&amp;quot;, date_breaks = &amp;quot;2 weeks&amp;quot;,
                   expand = expansion(add = 1)) +
  labs(x = element_blank(), y = &amp;quot;Daily median time in queue (hours)&amp;quot;, 
       title = &amp;quot;Submission time&amp;quot;)&lt;/code&gt;&lt;/pre&gt;
&lt;p&gt;&lt;img src=&#34;https://llrs.dev/post/2021/01/31/cran-review/index_files/figure-html/daily-submission-1.png&#34; width=&#34;120%&#34; /&gt;&lt;/p&gt;
&lt;p&gt;We can see that a new package submitted to CRAN usually takes more than a day to disappear from the queue.&lt;/p&gt;
&lt;p&gt;There is a lot of variation among submissions:&lt;/p&gt;
&lt;pre class=&#34;r&#34;&gt;&lt;code&gt;package_submissions %&amp;gt;% 
  group_by(submission_n) %&amp;gt;% 
  mutate(submission_n = as.character(submission_n)) %&amp;gt;% 
  ggplot() +
  geom_jitter(aes(submission_n, submission_period), height = 0) +
  scale_y_continuous(limits = c(1, NA), expand = expansion(add = c(1, 10)),
                     breaks = seq(0,  4550, by = 24*7)) +
  labs(title = &amp;quot;Submission time in queue&amp;quot;, y = &amp;quot;Hours&amp;quot;, x = element_blank())
## Warning: Removed 142 rows containing missing values (geom_point).&lt;/code&gt;&lt;/pre&gt;
&lt;p&gt;&lt;img src=&#34;https://llrs.dev/post/2021/01/31/cran-review/index_files/figure-html/submission-progression-1.png&#34; width=&#34;120%&#34; /&gt;&lt;/p&gt;
&lt;p&gt;Surprisingly, sometimes a submission goes missing from the folders for some days (I checked with one package I submitted: it doesn’t appear for 7 days, although it was in the queue).
This might affect the analysis, as such gaps are counted as new submissions when some of them aren’t.&lt;/p&gt;
&lt;pre class=&#34;r&#34;&gt;&lt;code&gt;package_submissions %&amp;gt;% 
  filter(submission_period != 0) %&amp;gt;% 
  group_by(submission_n) %&amp;gt;% 
  mutate(submission_n = as.character(submission_n)) %&amp;gt;% 
  filter(n() &amp;gt; 5) %&amp;gt;% 
  summarize(median = round(median(submission_period), 2)) %&amp;gt;% 
  knitr::kable(col.names = c(&amp;quot;Submission&amp;quot;, &amp;quot;Median time (h)&amp;quot;))&lt;/code&gt;&lt;/pre&gt;
&lt;table&gt;
&lt;thead&gt;
&lt;tr class=&#34;header&#34;&gt;
&lt;th align=&#34;left&#34;&gt;Submission&lt;/th&gt;
&lt;th align=&#34;left&#34;&gt;Median time (h)&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr class=&#34;odd&#34;&gt;
&lt;td align=&#34;left&#34;&gt;1&lt;/td&gt;
&lt;td align=&#34;left&#34;&gt;36.13 hours&lt;/td&gt;
&lt;/tr&gt;
&lt;tr class=&#34;even&#34;&gt;
&lt;td align=&#34;left&#34;&gt;2&lt;/td&gt;
&lt;td align=&#34;left&#34;&gt;18.27 hours&lt;/td&gt;
&lt;/tr&gt;
&lt;tr class=&#34;odd&#34;&gt;
&lt;td align=&#34;left&#34;&gt;3&lt;/td&gt;
&lt;td align=&#34;left&#34;&gt;16.47 hours&lt;/td&gt;
&lt;/tr&gt;
&lt;tr class=&#34;even&#34;&gt;
&lt;td align=&#34;left&#34;&gt;4&lt;/td&gt;
&lt;td align=&#34;left&#34;&gt;11.27 hours&lt;/td&gt;
&lt;/tr&gt;
&lt;tr class=&#34;odd&#34;&gt;
&lt;td align=&#34;left&#34;&gt;5&lt;/td&gt;
&lt;td align=&#34;left&#34;&gt;13.37 hours&lt;/td&gt;
&lt;/tr&gt;
&lt;tr class=&#34;even&#34;&gt;
&lt;td align=&#34;left&#34;&gt;6&lt;/td&gt;
&lt;td align=&#34;left&#34;&gt;38.08 hours&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;
&lt;p&gt;So it usually takes more than a day for new packages, while later submissions usually take around 11 to 18 hours.&lt;/p&gt;
&lt;p&gt;To put into context the work done by the CRAN checking system, which helps keep the quality of packages high, let’s explore another checking system: GitHub Actions.&lt;/p&gt;
&lt;/div&gt;
&lt;/div&gt;
&lt;div id=&#34;GHAR&#34; class=&#34;section level2&#34;&gt;
&lt;h2&gt;GitHub action reliability&lt;/h2&gt;
&lt;p&gt;The data for this post was collected by cransays using GitHub Actions.
We’ll use this data to test how reliable GitHub Actions is.&lt;/p&gt;
&lt;pre class=&#34;r&#34;&gt;&lt;code&gt;gha &amp;lt;- cbind(cran_times[, c(&amp;quot;month&amp;quot;, &amp;quot;mday&amp;quot;, &amp;quot;wday&amp;quot;, &amp;quot;week&amp;quot;)], 
      minute = minute(cran_times$snapshot_time), 
      hour = hour(cran_times$snapshot_time),
      type = &amp;quot;cransays&amp;quot;) %&amp;gt;% 
  distinct()
gha %&amp;gt;% 
  ggplot() +
  geom_violin(aes(as.factor(hour), minute)) +
  scale_y_continuous(expand = expansion(add = 0.5), 
                     breaks = c(0, 15, 30, 45, 60), limits = c(0, 60)) +
  scale_x_discrete(expand = expansion())  +
  labs(x = &amp;quot;Hour&amp;quot;, y = &amp;quot;Minute&amp;quot;, title = &amp;quot;Daily variation&amp;quot;)&lt;/code&gt;&lt;/pre&gt;
&lt;p&gt;&lt;img src=&#34;https://llrs.dev/post/2021/01/31/cran-review/index_files/figure-html/gha2-1.png&#34; width=&#34;120%&#34; /&gt;&lt;/p&gt;
&lt;p&gt;There seems to be a lower limit around 10 minutes, except for some builds that I think were manually triggered.
Aside from this, there is usually little variation: the process ends around 15 minutes in, but it can end much later.
This is just for one simple script scraping a site.
Compared to building and checking thousands of packages, it is much simpler.&lt;/p&gt;
&lt;p&gt;And last, how reliable is it?&lt;/p&gt;
&lt;p&gt;We can compare how many hours passed between the first and the last report with how many reports we have recorded.
If we have fewer reports than hours, this indicates errors on GitHub Actions.&lt;/p&gt;
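&lt;p&gt;A minimal sketch of that comparison, assuming the &lt;code&gt;cran_times&lt;/code&gt; data frame and its &lt;code&gt;snapshot_time&lt;/code&gt; column from the chunks above (the variable names are those used earlier in this post):&lt;/p&gt;
&lt;pre class=&#34;r&#34;&gt;&lt;code&gt;snapshots &amp;lt;- unique(cran_times$snapshot_time)
# Hours elapsed between the first and the last recorded snapshot:
hours_span &amp;lt;- as.numeric(difftime(max(snapshots), min(snapshots), units = &amp;quot;hours&amp;quot;))
# One snapshot is expected per hour, so the ratio estimates the reliability:
100 * length(snapshots) / hours_span&lt;/code&gt;&lt;/pre&gt;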
&lt;p&gt;So the script and GitHub Actions worked on ~97% of the occasions.&lt;/p&gt;
&lt;p&gt;These numbers are great, but on CRAN and Bioconductor all packages are consistently checked daily on several operating systems.&lt;/p&gt;
&lt;/div&gt;
&lt;div id=&#34;conclusions&#34; class=&#34;section level2&#34;&gt;
&lt;h2&gt;Conclusions&lt;/h2&gt;
&lt;p&gt;Some of the most important points from this post:&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;Some packages appear in several folders, and sometimes multiple versions of a package are in the queue at once.&lt;/li&gt;
&lt;li&gt;Most submissions happen in the first days of the week and towards the beginning of the month.&lt;/li&gt;
&lt;li&gt;Most submissions disappear from the CRAN queue in less than a day, but new submissions take around 36 hours.&lt;/li&gt;
&lt;li&gt;There’s a new package submission to CRAN every hour.&lt;/li&gt;
&lt;li&gt;In later submissions, time in the queue is considerably shorter.&lt;/li&gt;
&lt;li&gt;It was impossible to know when there was a reply from CRAN, as no information is provided.&lt;/li&gt;
&lt;li&gt;It is not possible to know when a package has all OK before it hits CRAN, as some packages remain in the queue even after acceptance.&lt;/li&gt;
&lt;/ul&gt;
&lt;p&gt;Last, I compare CRAN’s review system with other R software review systems, such as Bioconductor’s and rOpenSci’s.&lt;/p&gt;
&lt;p&gt;One big difference between CRAN and Bioconductor or rOpenSci is that, even if your package is already included, each time you want to fix something it gets reviewed by someone.
This ensures a high quality of the packages, but also increases the work for the reviewers.&lt;/p&gt;
&lt;p&gt;Also, as far as I know, the list of reviewers is just 5 people, who are also part of the team maintaining and developing R.
In Bioconductor it is similar (except the reviewers do not take care of R itself), but in rOpenSci it works differently.&lt;/p&gt;
&lt;p&gt;The next big difference is the lack of transparency of the review process itself.
Perhaps this is because CRAN started earlier (1997), while Bioconductor started in 2002 and rOpenSci much later.
With the information available about CRAN, we don’t know the steps to be accepted beyond the pretest and the manual review.
We don’t know when a package is accepted or rejected, or what the content of the feedback to the maintainer is (or when there is feedback and how long the maintainer gets to address those changes).
It is not clear how the process works.
Additionally, the reviewers’ work seems highly manual, as we found some duplicated packages in the queue.&lt;/p&gt;
&lt;p&gt;Further automation and transparency of the process could help reduce the load on the reviewers, as could increasing the number of reviewers.
A public review could help reduce the burden on CRAN reviewers, as outsiders could help solve errors (although this is somewhat already fulfilled by the &lt;a href=&#34;https://www.r-project.org/mail.html&#34;&gt;mailing list&lt;/a&gt; R-package-devel), and it would help notice, and find a compromise on, inconsistencies between reviews.
As anecdotal evidence, I submitted two packages, one shortly after the other; for the second package I was asked to change some URLs that I was not required to change in the first.&lt;/p&gt;
&lt;p&gt;Another difference between these three repositories is the manuals.
The CRAN repository seems to be equated with R itself, so the &lt;a href=&#34;https://cran.r-project.org/doc/manuals/r-release/R-exts.html&#34;&gt;manual for writing R extensions&lt;/a&gt; is under &lt;code&gt;cran.r-project.org&lt;/code&gt;, even though it is about extending R, which can and does happen outside CRAN.&lt;/p&gt;
&lt;p&gt;The &lt;a href=&#34;https://cran.r-project.org/web/packages/policies.html&#34;&gt;CRAN policies&lt;/a&gt; change without notice to the existing developers.
Sending an email to the maintainers or to the R-announce mailing list would help developers notice policy changes.
Developers had to create a &lt;a href=&#34;http://dirk.eddelbuettel.com/blog/2013/10/23/&#34;&gt;policy watch&lt;/a&gt; and other resources to &lt;a href=&#34;https://blog.r-hub.io/2019/05/29/keep-up-with-cran/&#34;&gt;keep up with CRAN&lt;/a&gt; changes, which affect developers not only when submitting a package but also for packages already on CRAN.&lt;/p&gt;
&lt;p&gt;The CRAN reviewers are involved in multiple demanding tasks: their regular jobs, their commitments outside work (family, friends, other interests), and then R development and maintenance, CRAN reviews and maintenance, and the R Journal&lt;a href=&#34;#fn3&#34; class=&#34;footnote-ref&#34; id=&#34;fnref3&#34;&gt;&lt;sup&gt;3&lt;/sup&gt;&lt;/a&gt;.
One possible solution to reduce their burden is to increase the number of reviewers.
Perhaps a mentorship program for reviewing packages, or a guideline on what to check, would help train new reviewers and reduce the pressure on the current volunteers.&lt;/p&gt;
&lt;p&gt;The pace and amount of work of the maintainers, as seen in this analysis, is huge, and there is much more that cannot be seen with this data.
Many thanks to all the volunteers that maintain it, to those who donate to the R Foundation, and to the employers of those volunteers, who make CRAN and R possible.&lt;/p&gt;
&lt;div id=&#34;reproducibility&#34; class=&#34;section level3&#34;&gt;
&lt;h3&gt;Reproducibility&lt;/h3&gt;
&lt;details&gt;
&lt;pre&gt;&lt;code&gt;## ─ Session info ───────────────────────────────────────────────────────────────────────────────────────────────────────
##  setting  value                       
##  version  R version 4.0.1 (2020-06-06)
##  os       Ubuntu 20.04.3 LTS          
##  system   x86_64, linux-gnu           
##  ui       X11                         
##  language (EN)                        
##  collate  en_US.UTF-8                 
##  ctype    en_US.UTF-8                 
##  tz       Europe/Madrid               
##  date     2021-08-25                  
## 
## ─ Packages ───────────────────────────────────────────────────────────────────────────────────────────────────────────
##  package      * version     date       lib source                              
##  assertthat     0.2.1       2019-03-21 [1] CRAN (R 4.0.1)                      
##  backports      1.2.1       2020-12-09 [1] CRAN (R 4.0.1)                      
##  blogdown       1.3         2021-04-14 [1] CRAN (R 4.0.1)                      
##  bookdown       0.22        2021-04-22 [1] CRAN (R 4.0.1)                      
##  broom          0.7.6       2021-04-05 [1] CRAN (R 4.0.1)                      
##  bslib          0.2.5       2021-05-12 [1] CRAN (R 4.0.1)                      
##  cellranger     1.1.0       2016-07-27 [1] CRAN (R 4.0.1)                      
##  cli            2.5.0       2021-04-26 [1] CRAN (R 4.0.1)                      
##  colorspace     2.0-1       2021-05-04 [1] CRAN (R 4.0.1)                      
##  crayon         1.4.1       2021-02-08 [1] CRAN (R 4.0.1)                      
##  DBI            1.1.1       2021-01-15 [1] CRAN (R 4.0.1)                      
##  dbplyr         2.1.1       2021-04-06 [1] CRAN (R 4.0.1)                      
##  digest         0.6.27      2020-10-24 [1] CRAN (R 4.0.1)                      
##  dplyr        * 1.0.6       2021-05-05 [1] CRAN (R 4.0.1)                      
##  ellipsis       0.3.2       2021-04-29 [1] CRAN (R 4.0.1)                      
##  evaluate       0.14        2019-05-28 [1] CRAN (R 4.0.1)                      
##  fansi          0.5.0       2021-05-25 [1] CRAN (R 4.0.1)                      
##  farver         2.1.0       2021-02-28 [1] CRAN (R 4.0.1)                      
##  forcats      * 0.5.1       2021-01-27 [1] CRAN (R 4.0.1)                      
##  fs             1.5.0       2020-07-31 [1] CRAN (R 4.0.1)                      
##  generics       0.1.0       2020-10-31 [1] CRAN (R 4.0.1)                      
##  ggplot2      * 3.3.5       2021-06-25 [1] CRAN (R 4.0.1)                      
##  glue           1.4.2       2020-08-27 [1] CRAN (R 4.0.1)                      
##  gtable         0.3.0       2019-03-25 [1] CRAN (R 4.0.1)                      
##  haven          2.4.1       2021-04-23 [1] CRAN (R 4.0.1)                      
##  here           1.0.1       2020-12-13 [1] CRAN (R 4.0.1)                      
##  highr          0.9         2021-04-16 [1] CRAN (R 4.0.1)                      
##  hms          * 1.0.0       2021-01-13 [1] CRAN (R 4.0.1)                      
##  htmltools      0.5.1.1     2021-01-22 [1] CRAN (R 4.0.1)                      
##  httr           1.4.2       2020-07-20 [1] CRAN (R 4.0.1)                      
##  jquerylib      0.1.4       2021-04-26 [1] CRAN (R 4.0.1)                      
##  jsonlite       1.7.2       2020-12-09 [1] CRAN (R 4.0.1)                      
##  knitr          1.33        2021-04-24 [1] CRAN (R 4.0.1)                      
##  labeling       0.4.2       2020-10-20 [1] CRAN (R 4.0.1)                      
##  lattice        0.20-41     2020-04-02 [1] CRAN (R 4.0.1)                      
##  lifecycle      1.0.0       2021-02-15 [1] CRAN (R 4.0.1)                      
##  lubridate    * 1.7.10.9000 2021-06-12 [1] Github (tidyverse/lubridate@1e0d66f)
##  magrittr       2.0.1       2020-11-17 [1] CRAN (R 4.0.1)                      
##  Matrix         1.3-2       2021-01-06 [1] CRAN (R 4.0.1)                      
##  mgcv           1.8-35      2021-04-18 [1] CRAN (R 4.0.1)                      
##  modelr         0.1.8       2020-05-19 [1] CRAN (R 4.0.1)                      
##  munsell        0.5.0       2018-06-12 [1] CRAN (R 4.0.1)                      
##  nlme           3.1-152     2021-02-04 [1] CRAN (R 4.0.1)                      
##  patchwork    * 1.1.1       2020-12-17 [1] CRAN (R 4.0.1)                      
##  pillar         1.6.1       2021-05-16 [1] CRAN (R 4.0.1)                      
##  pkgconfig      2.0.3       2019-09-22 [1] CRAN (R 4.0.1)                      
##  purrr        * 0.3.4       2020-04-17 [1] CRAN (R 4.0.1)                      
##  R6             2.5.0       2020-10-28 [1] CRAN (R 4.0.1)                      
##  RColorBrewer   1.1-2       2014-12-07 [1] CRAN (R 4.0.1)                      
##  Rcpp           1.0.6       2021-01-15 [1] CRAN (R 4.0.1)                      
##  readr        * 1.4.0       2020-10-05 [1] CRAN (R 4.0.1)                      
##  readxl         1.3.1       2019-03-13 [1] CRAN (R 4.0.1)                      
##  reprex         2.0.0       2021-04-02 [1] CRAN (R 4.0.1)                      
##  rlang          0.4.11      2021-04-30 [1] CRAN (R 4.0.1)                      
##  rmarkdown      2.9         2021-06-15 [1] CRAN (R 4.0.1)                      
##  rprojroot      2.0.2       2020-11-15 [1] CRAN (R 4.0.1)                      
##  rstudioapi     0.13        2020-11-12 [1] CRAN (R 4.0.1)                      
##  rvest          1.0.0       2021-03-09 [1] CRAN (R 4.0.1)                      
##  sass           0.4.0       2021-05-12 [1] CRAN (R 4.0.1)                      
##  scales         1.1.1       2020-05-11 [1] CRAN (R 4.0.1)                      
##  sessioninfo    1.1.1       2018-11-05 [1] CRAN (R 4.0.1)                      
##  stringi        1.6.2       2021-05-17 [1] CRAN (R 4.0.1)                      
##  stringr      * 1.4.0       2019-02-10 [1] CRAN (R 4.0.1)                      
##  tibble       * 3.1.2       2021-05-16 [1] CRAN (R 4.0.1)                      
##  tidyr        * 1.1.3       2021-03-03 [1] CRAN (R 4.0.1)                      
##  tidyselect     1.1.1       2021-04-30 [1] CRAN (R 4.0.1)                      
##  tidyverse    * 1.3.1       2021-04-15 [1] CRAN (R 4.0.1)                      
##  utf8           1.2.1       2021-03-12 [1] CRAN (R 4.0.1)                      
##  vctrs          0.3.8       2021-04-29 [1] CRAN (R 4.0.1)                      
##  viridisLite    0.4.0       2021-04-13 [1] CRAN (R 4.0.1)                      
##  withr          2.4.2       2021-04-18 [1] CRAN (R 4.0.1)                      
##  xfun           0.24        2021-06-15 [1] CRAN (R 4.0.1)                      
##  xml2           1.3.2       2020-04-23 [1] CRAN (R 4.0.1)                      
##  yaml           2.2.1       2020-02-01 [1] CRAN (R 4.0.1)                      
## 
## [1] /home/lluis/bin/R/4.0.1/lib/R/library&lt;/code&gt;&lt;/pre&gt;
&lt;/details&gt;
&lt;/div&gt;
&lt;/div&gt;
&lt;div class=&#34;footnotes&#34;&gt;
&lt;hr /&gt;
&lt;ol&gt;
&lt;li id=&#34;fn1&#34;&gt;&lt;p&gt;Or a problem with ggplot2 representing a sudden value that is much different from those around it.&lt;a href=&#34;#fnref1&#34; class=&#34;footnote-back&#34;&gt;↩︎&lt;/a&gt;&lt;/p&gt;&lt;/li&gt;
&lt;li id=&#34;fn2&#34;&gt;&lt;p&gt;Which now I cannot find the evidence to link to.
If anyone finds the tweet I would appreciate it.&lt;a href=&#34;#fnref2&#34; class=&#34;footnote-back&#34;&gt;↩︎&lt;/a&gt;&lt;/p&gt;&lt;/li&gt;
&lt;li id=&#34;fn3&#34;&gt;&lt;p&gt;I’m not aware of anyone whose full job is just R reviewing.&lt;a href=&#34;#fnref3&#34; class=&#34;footnote-back&#34;&gt;↩︎&lt;/a&gt;&lt;/p&gt;&lt;/li&gt;
&lt;/ol&gt;
&lt;/div&gt;
</description>
    </item>
    
    <item>
      <title>CRAN dependencies</title>
      <link>https://llrs.dev/post/2020/01/25/cran-dependencies/</link>
      <pubDate>Sat, 25 Jan 2020 00:00:00 +0000</pubDate>
      <guid>https://llrs.dev/post/2020/01/25/cran-dependencies/</guid>
      <description>
&lt;script src=&#34;https://llrs.dev/post/2020/01/25/cran-dependencies/index_files/header-attrs/header-attrs.js&#34;&gt;&lt;/script&gt;


&lt;div id=&#34;new-policy-in-cran&#34; class=&#34;section level1&#34;&gt;
&lt;h1&gt;New policy in CRAN&lt;/h1&gt;
&lt;p&gt;CRAN has a new policy of a maximum of 20 packages in Imports.
Let’s see how many dependencies each package on CRAN and Bioconductor has:&lt;/p&gt;
&lt;/div&gt;
&lt;div id=&#34;cran&#34; class=&#34;section level1&#34;&gt;
&lt;h1&gt;CRAN&lt;/h1&gt;
&lt;pre class=&#34;r&#34;&gt;&lt;code&gt;ap &amp;lt;- available.packages()
dp &amp;lt;- tools::package_dependencies(rownames(ap), db = ap, which = &amp;quot;Imports&amp;quot;, 
                                  recursive = FALSE)
dp_n &amp;lt;- lengths(dp)
tb_dp &amp;lt;- sort(table(dp_n), decreasing = TRUE)
barplot(tb_dp)&lt;/code&gt;&lt;/pre&gt;
&lt;p&gt;&lt;img src=&#34;https://llrs.dev/post/2020/01/25/cran-dependencies/index_files/figure-html/cran-1.png&#34; width=&#34;672&#34; /&gt;&lt;/p&gt;
&lt;p&gt;Most of the packages have 0 dependencies, and just 212 have 20 dependencies or more:&lt;/p&gt;
&lt;pre class=&#34;r&#34;&gt;&lt;code&gt;names(dp_n)[dp_n &amp;gt;= 20]
##   [1] &amp;quot;AdhereRViz&amp;quot;         &amp;quot;AFM&amp;quot;                &amp;quot;AirSensor&amp;quot;         
##   [4] &amp;quot;alookr&amp;quot;             &amp;quot;amt&amp;quot;                &amp;quot;animaltracker&amp;quot;     
##   [7] &amp;quot;antaresViz&amp;quot;         &amp;quot;BAMBI&amp;quot;              &amp;quot;BasketballAnalyzeR&amp;quot;
##  [10] &amp;quot;BAwiR&amp;quot;              &amp;quot;bea.R&amp;quot;              &amp;quot;BETS&amp;quot;              
##  [13] &amp;quot;bibliometrix&amp;quot;       &amp;quot;biomod2&amp;quot;            &amp;quot;bioRad&amp;quot;            
##  [16] &amp;quot;BIRDS&amp;quot;              &amp;quot;bootnet&amp;quot;            &amp;quot;bpcs&amp;quot;              
##  [19] &amp;quot;breathtestcore&amp;quot;     &amp;quot;brms&amp;quot;               &amp;quot;card&amp;quot;              
##  [22] &amp;quot;chemmodlab&amp;quot;         &amp;quot;chillR&amp;quot;             &amp;quot;Clustering&amp;quot;        
##  [25] &amp;quot;CNVScope&amp;quot;           &amp;quot;codebook&amp;quot;           &amp;quot;cSEM&amp;quot;              
##  [28] &amp;quot;ctmm&amp;quot;               &amp;quot;ctsem&amp;quot;              &amp;quot;DAMisc&amp;quot;            
##  [31] &amp;quot;dartR&amp;quot;              &amp;quot;datacleanr&amp;quot;         &amp;quot;dccvalidator&amp;quot;      
##  [34] &amp;quot;devtools&amp;quot;           &amp;quot;dextergui&amp;quot;          &amp;quot;diceR&amp;quot;             
##  [37] &amp;quot;dipsaus&amp;quot;            &amp;quot;DIscBIO&amp;quot;            &amp;quot;distill&amp;quot;           
##  [40] &amp;quot;dlookr&amp;quot;             &amp;quot;dragon&amp;quot;             &amp;quot;drhur&amp;quot;             
##  [43] &amp;quot;dyngen&amp;quot;             &amp;quot;dynwrap&amp;quot;            &amp;quot;ebirdst&amp;quot;           
##  [46] &amp;quot;ecd&amp;quot;                &amp;quot;ecochange&amp;quot;          &amp;quot;EcoGenetics&amp;quot;       
##  [49] &amp;quot;ecospat&amp;quot;            &amp;quot;EFAtools&amp;quot;           &amp;quot;eiCompare&amp;quot;         
##  [52] &amp;quot;elementR&amp;quot;           &amp;quot;emdi&amp;quot;               &amp;quot;emuR&amp;quot;              
##  [55] &amp;quot;eph&amp;quot;                &amp;quot;EpiNow2&amp;quot;            &amp;quot;epitweetr&amp;quot;         
##  [58] &amp;quot;fdm2id&amp;quot;             &amp;quot;FedData&amp;quot;            &amp;quot;finalfit&amp;quot;          
##  [61] &amp;quot;forestmangr&amp;quot;        &amp;quot;genBaRcode&amp;quot;         &amp;quot;geoviz&amp;quot;            
##  [64] &amp;quot;ggquickeda&amp;quot;         &amp;quot;GJRM&amp;quot;               &amp;quot;GmAMisc&amp;quot;           
##  [67] &amp;quot;golem&amp;quot;              &amp;quot;graph4lg&amp;quot;           &amp;quot;GWSDAT&amp;quot;            
##  [70] &amp;quot;hdpGLM&amp;quot;             &amp;quot;highcharter&amp;quot;        &amp;quot;hmi&amp;quot;               
##  [73] &amp;quot;htsr&amp;quot;               &amp;quot;hybridEnsemble&amp;quot;     &amp;quot;iCellR&amp;quot;            
##  [76] &amp;quot;immunarch&amp;quot;          &amp;quot;inlmisc&amp;quot;            &amp;quot;IntClust&amp;quot;          
##  [79] &amp;quot;iNZightTools&amp;quot;       &amp;quot;IOHanalyzer&amp;quot;        &amp;quot;isoreader&amp;quot;         
##  [82] &amp;quot;ITNr&amp;quot;               &amp;quot;jmv&amp;quot;                &amp;quot;jsmodule&amp;quot;          
##  [85] &amp;quot;JWileymisc&amp;quot;         &amp;quot;KarsTS&amp;quot;             &amp;quot;lilikoi&amp;quot;           
##  [88] &amp;quot;mdapack&amp;quot;            &amp;quot;memapp&amp;quot;             &amp;quot;metacoder&amp;quot;         
##  [91] &amp;quot;MetaDBparse&amp;quot;        &amp;quot;MetaIntegrator&amp;quot;     &amp;quot;microbial&amp;quot;         
##  [94] &amp;quot;missCompare&amp;quot;        &amp;quot;mlflow&amp;quot;             &amp;quot;modchart&amp;quot;          
##  [97] &amp;quot;modeltime&amp;quot;          &amp;quot;modeltime.ensemble&amp;quot; &amp;quot;modeltime.resample&amp;quot;
## [100] &amp;quot;momentuHMM&amp;quot;        
##  [ reached getOption(&amp;quot;max.print&amp;quot;) -- omitted 112 entries ]&lt;/code&gt;&lt;/pre&gt;
&lt;/div&gt;
&lt;div id=&#34;bioconductor&#34; class=&#34;section level1&#34;&gt;
&lt;h1&gt;Bioconductor&lt;/h1&gt;
&lt;p&gt;The interesting part is discovering how to add the repository.
The trick is to make use of &lt;code&gt;BiocManager::repositories()&lt;/code&gt;:&lt;/p&gt;
&lt;pre class=&#34;r&#34;&gt;&lt;code&gt;BioC_software &amp;lt;- BiocManager::repositories()[&amp;quot;BioCsoft&amp;quot;]
bp &amp;lt;- available.packages(contriburl = contrib.url(BioC_software))
dp_BioC &amp;lt;- tools::package_dependencies(rownames(bp), db = bp, which = &amp;quot;Imports&amp;quot;, 
                                  recursive = FALSE)
dp_BioC_n &amp;lt;- lengths(dp_BioC)
tb_dp_BioC &amp;lt;- sort(table(dp_BioC_n), decreasing = TRUE)
barplot(tb_dp_BioC)&lt;/code&gt;&lt;/pre&gt;
&lt;p&gt;&lt;img src=&#34;https://llrs.dev/post/2020/01/25/cran-dependencies/index_files/figure-html/BioC-1.png&#34; width=&#34;672&#34; /&gt;&lt;/p&gt;
&lt;p&gt;It seems that software packages on Bioconductor tend to have more dependencies than those on CRAN.
If this policy were implemented on Bioconductor, it would affect 219 packages:&lt;/p&gt;
&lt;pre class=&#34;r&#34;&gt;&lt;code&gt;names(dp_BioC_n)[dp_BioC_n &amp;gt;= 20]
##   [1] &amp;quot;abseqR&amp;quot;              &amp;quot;adductomicsR&amp;quot;        &amp;quot;ALPS&amp;quot;               
##   [4] &amp;quot;AlpsNMR&amp;quot;             &amp;quot;AMARETTO&amp;quot;            &amp;quot;amplican&amp;quot;           
##   [7] &amp;quot;AneuFinder&amp;quot;          &amp;quot;animalcules&amp;quot;         &amp;quot;appreci8R&amp;quot;          
##  [10] &amp;quot;ArrayExpressHTS&amp;quot;     &amp;quot;arrayQualityMetrics&amp;quot; &amp;quot;artMS&amp;quot;              
##  [13] &amp;quot;ASpediaFI&amp;quot;           &amp;quot;ATACseqQC&amp;quot;           &amp;quot;BASiCS&amp;quot;             
##  [16] &amp;quot;BatchQC&amp;quot;             &amp;quot;bigPint&amp;quot;             &amp;quot;bioCancer&amp;quot;          
##  [19] &amp;quot;BiocOncoTK&amp;quot;          &amp;quot;BiocPkgTools&amp;quot;        &amp;quot;biovizBase&amp;quot;         
##  [22] &amp;quot;biscuiteer&amp;quot;          &amp;quot;BPRMeth&amp;quot;             &amp;quot;bsseq&amp;quot;              
##  [25] &amp;quot;BUSpaRse&amp;quot;            &amp;quot;CAGEr&amp;quot;               &amp;quot;CATALYST&amp;quot;           
##  [28] &amp;quot;celda&amp;quot;               &amp;quot;CEMiTool&amp;quot;            &amp;quot;CeTF&amp;quot;               
##  [31] &amp;quot;ChAMP&amp;quot;               &amp;quot;chimeraviz&amp;quot;          &amp;quot;chipenrich&amp;quot;         
##  [34] &amp;quot;ChIPpeakAnno&amp;quot;        &amp;quot;ChIPQC&amp;quot;              &amp;quot;ChIPseeker&amp;quot;         
##  [37] &amp;quot;ChromSCape&amp;quot;          &amp;quot;chromVAR&amp;quot;            &amp;quot;cicero&amp;quot;             
##  [40] &amp;quot;circRNAprofiler&amp;quot;     &amp;quot;CiteFuse&amp;quot;            &amp;quot;clusterExperiment&amp;quot;  
##  [43] &amp;quot;clustifyr&amp;quot;           &amp;quot;CNEr&amp;quot;                &amp;quot;CNVPanelizer&amp;quot;       
##  [46] &amp;quot;CNVRanger&amp;quot;           &amp;quot;cola&amp;quot;                &amp;quot;COMPASS&amp;quot;            
##  [49] &amp;quot;compcodeR&amp;quot;           &amp;quot;CONFESS&amp;quot;             &amp;quot;consensusDE&amp;quot;        
##  [52] &amp;quot;contiBAIT&amp;quot;           &amp;quot;crlmm&amp;quot;               &amp;quot;crossmeta&amp;quot;          
##  [55] &amp;quot;cTRAP&amp;quot;               &amp;quot;CytoML&amp;quot;              &amp;quot;CytoTree&amp;quot;           
##  [58] &amp;quot;DAMEfinder&amp;quot;          &amp;quot;DaMiRseq&amp;quot;            &amp;quot;debrowser&amp;quot;          
##  [61] &amp;quot;deco&amp;quot;                &amp;quot;decompTumor2Sig&amp;quot;     &amp;quot;DEGreport&amp;quot;          
##  [64] &amp;quot;DEP&amp;quot;                 &amp;quot;DepecheR&amp;quot;            &amp;quot;destiny&amp;quot;            
##  [67] &amp;quot;DEsubs&amp;quot;              &amp;quot;DiffBind&amp;quot;            &amp;quot;diffcyt&amp;quot;            
##  [70] &amp;quot;diffHic&amp;quot;             &amp;quot;diffloop&amp;quot;            &amp;quot;DiscoRhythm&amp;quot;        
##  [73] &amp;quot;dmrseq&amp;quot;              &amp;quot;Doscheda&amp;quot;            &amp;quot;EGSEA&amp;quot;              
##  [76] &amp;quot;ELMER&amp;quot;               &amp;quot;ENmix&amp;quot;               &amp;quot;enrichTF&amp;quot;           
##  [79] &amp;quot;esATAC&amp;quot;              &amp;quot;EventPointer&amp;quot;        &amp;quot;exomePeak2&amp;quot;         
##  [82] &amp;quot;fcoex&amp;quot;               &amp;quot;FindMyFriends&amp;quot;       &amp;quot;flowSpy&amp;quot;            
##  [85] &amp;quot;flowWorkspace&amp;quot;       &amp;quot;FRASER&amp;quot;              &amp;quot;GAPGOM&amp;quot;             
##  [88] &amp;quot;GENESIS&amp;quot;             &amp;quot;GeneTonic&amp;quot;           &amp;quot;genomation&amp;quot;         
##  [91] &amp;quot;GenomicInteractions&amp;quot; &amp;quot;GenVisR&amp;quot;             &amp;quot;ggbio&amp;quot;              
##  [94] &amp;quot;GGtools&amp;quot;             &amp;quot;GladiaTOX&amp;quot;           &amp;quot;GmicR&amp;quot;              
##  [97] &amp;quot;gQTLstats&amp;quot;           &amp;quot;Gviz&amp;quot;                &amp;quot;GWENA&amp;quot;              
## [100] &amp;quot;HiCBricks&amp;quot;          
##  [ reached getOption(&amp;quot;max.print&amp;quot;) -- omitted 119 entries ]&lt;/code&gt;&lt;/pre&gt;
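&lt;p&gt;The claim that Bioconductor packages tend to carry more dependencies can be checked numerically. A minimal sketch, assuming the &lt;code&gt;dp_n&lt;/code&gt; and &lt;code&gt;dp_BioC_n&lt;/code&gt; vectors computed in the chunks above:&lt;/p&gt;
&lt;pre class=&#34;r&#34;&gt;&lt;code&gt;# Compare the distributions of Imports counts per package:
summary(dp_n)
summary(dp_BioC_n)
# Proportion of packages at or above the 20-Imports limit:
mean(dp_n &amp;gt;= 20)
mean(dp_BioC_n &amp;gt;= 20)&lt;/code&gt;&lt;/pre&gt;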
&lt;div id=&#34;reproducibility&#34; class=&#34;section level3&#34;&gt;
&lt;h3&gt;Reproducibility&lt;/h3&gt;
&lt;details&gt;
&lt;pre&gt;&lt;code&gt;## ─ Session info ───────────────────────────────────────────────────────────────────────────────────────────────────────
##  setting  value                       
##  version  R version 4.0.1 (2020-06-06)
##  os       Ubuntu 20.04.1 LTS          
##  system   x86_64, linux-gnu           
##  ui       X11                         
##  language (EN)                        
##  collate  en_US.UTF-8                 
##  ctype    en_US.UTF-8                 
##  tz       Europe/Madrid               
##  date     2021-01-08                  
## 
## ─ Packages ───────────────────────────────────────────────────────────────────────────────────────────────────────────
##  package     * version date       lib source                           
##  assertthat    0.2.1   2019-03-21 [1] CRAN (R 4.0.1)                   
##  BiocManager   1.30.10 2019-11-16 [1] CRAN (R 4.0.1)                   
##  blogdown      0.21.84 2021-01-07 [1] Github (rstudio/blogdown@c4fbb58)
##  bookdown      0.21    2020-10-13 [1] CRAN (R 4.0.1)                   
##  cli           2.2.0   2020-11-20 [1] CRAN (R 4.0.1)                   
##  crayon        1.3.4   2017-09-16 [1] CRAN (R 4.0.1)                   
##  digest        0.6.27  2020-10-24 [1] CRAN (R 4.0.1)                   
##  evaluate      0.14    2019-05-28 [1] CRAN (R 4.0.1)                   
##  fansi         0.4.1   2020-01-08 [1] CRAN (R 4.0.1)                   
##  glue          1.4.2   2020-08-27 [1] CRAN (R 4.0.1)                   
##  htmltools     0.5.0   2020-06-16 [1] CRAN (R 4.0.1)                   
##  knitr         1.30    2020-09-22 [1] CRAN (R 4.0.1)                   
##  magrittr      2.0.1   2020-11-17 [1] CRAN (R 4.0.1)                   
##  rlang         0.4.10  2020-12-30 [1] CRAN (R 4.0.1)                   
##  rmarkdown     2.6     2020-12-14 [1] CRAN (R 4.0.1)                   
##  sessioninfo   1.1.1   2018-11-05 [1] CRAN (R 4.0.1)                   
##  stringi       1.5.3   2020-09-09 [1] CRAN (R 4.0.1)                   
##  stringr       1.4.0   2019-02-10 [1] CRAN (R 4.0.1)                   
##  withr         2.3.0   2020-09-22 [1] CRAN (R 4.0.1)                   
##  xfun          0.20    2021-01-06 [1] CRAN (R 4.0.1)                   
##  yaml          2.2.1   2020-02-01 [1] CRAN (R 4.0.1)                   
## 
## [1] /home/lluis/bin/R/4.0.1/lib/R/library&lt;/code&gt;&lt;/pre&gt;
&lt;/details&gt;
&lt;/div&gt;
&lt;/div&gt;
</description>
    </item>
    
  </channel>
</rss>
