Glittr stats

In this report you can find some general statistics about Glittr.org. The plots and statistics created here are used, among other places, in the manuscript. Since Glittr.org is an ongoing project, these statistics are updated weekly.

Set up the environment

This is required if you run this notebook locally. Loading required packages.

library(httr2)
library(ggplot2)
library(dplyr)
library(ggbreak)
library(cowplot)
library(downlit)
library(xml2)

To run locally, create a file named .env and add your GitHub PAT (variable named PAT), your Google API key (named GOOGLE_API_KEY) and, for the outlink statistics, your Matomo API token (named MATOMO_API_KEY) in there, e.g.:

# this is an example, store it as .env:
export PAT="ghp_aRSRESCTZII20Lklser3H"
export GOOGLE_API_KEY="AjKSLE5SklxuRsxwPP8s0"

Now use your UNIX terminal to source this file to get the keys as objects:

source .env

In R, get environment variables as objects:

# Read the API credentials from the environment. A variable that is not set
# yields an empty string.
pat <- Sys.getenv(x = "PAT", unset = "")
google_api_key <- Sys.getenv(x = "GOOGLE_API_KEY", unset = "")
matomo_api_key <- Sys.getenv(x = "MATOMO_API_KEY", unset = "")

Setting colors. These correspond to the category colours on glittr.org.

# Named colour palette: one hex colour per category. The order of the names
# is reused below as the level order of the category factor.
glittr_cols <- setNames(
  c("#3a86ff", "#fb5607", "#ff006e", "#ffbe0b", "#8338ec", "#000000"),
  c(
    "Scripting and languages",
    "Computational methods and pipelines",
    "Omics analysis",
    "Reproducibility and data management",
    "Statistics and machine learning",
    "Others"
  )
)

Parse repository data

Using the glittr.org REST API to get repository metadata, among which the stargazers, recency, category, license and tags.

Code

# Fetch the metadata of all repositories from the glittr.org REST API
response <- request("https://glittr.org/api/repositories") |>
  req_perform() |>
  resp_body_json()

# A JSON `null` is parsed to NULL, which cannot be stored in a data frame
# cell. Optional fields are therefore replaced by an explicit default.
null_default <- function(value, default = NA_character_) {
  if (is.null(value)) default else value
}

# extract relevant items as dataframe (one single-row data frame per repo)
repo_info_list <- lapply(response$data, function(x) {
  # the first tag of an entry is used as its main tag; an entry without any
  # tag gets NA for main tag and main category instead of raising an error
  main_tag <- if (length(x$tags) > 0) x$tags[[1]] else NULL

  data.frame(
    repo = x$name,
    author_name = x$author$name,
    stargazers = x$stargazers,
    recency = x$days_since_last_push,
    url = x$url,
    license = null_default(x$license, "none"),
    main_tag = null_default(main_tag$name),
    main_category = null_default(main_tag$category),
    website = null_default(x$website),
    author_profile = null_default(x$author$profile),
    author_website = null_default(x$author$website)
  )
})

repo_info <- do.call(rbind, repo_info_list)

# create a column with provider (either github or gitlab)
repo_info$provider <- ifelse(grepl("github", repo_info$url), "github", "gitlab")

# create a factor for categories for sorting
repo_info$main_category <- factor(repo_info$main_category,
                                  levels = names(glittr_cols))

# category table to keep order the same in the plots
# (categories sorted by increasing number of repositories)
cat_table <- table(category = repo_info$main_category)
cat_table <- sort(cat_table)

Number of repositories: 685

Get contributors info

Using the GitHub REST API to get the number of contributors for each repository on glittr.org. This takes a few minutes, so if the set of GitHub repositories hasn’t changed since the last run, a cached version is used instead.

Code

# Counting contributors needs one GitHub API request per repository, which
# takes a long time. The counts are therefore cached in
# data/n_contributors.rds and only retrieved again when the set of GitHub
# repositories differs from the cached one.

# start from the cached counts, if there are any
n_contributors <- NULL
if (file.exists("data/n_contributors.rds")) {
  n_contributors <- readRDS("data/n_contributors.rds")
}

# get contributors info only from github repos
repo_info_gh <- repo_info[repo_info$provider == "github", ]

# query the github api only if the cache does not cover exactly these repos
if (!identical(sort(repo_info_gh$repo), sort(names(n_contributors)))) {
  dir.create("data", showWarnings = FALSE)

  n_contributors <- sapply(repo_info_gh$repo, function(repo) {
    # request the contributor list with a single contributor per page and
    # keep only the url of the "last" page from the response's link header
    last_page_url <- request("https://api.github.com/repos/") |>
      req_url_path_append(repo) |>
      req_url_path_append("contributors") |>
      req_url_query(per_page = 1) |>
      req_headers(
        Accept = "application/vnd.github+json",
        Authorization = paste("Bearer", pat),
        `X-GitHub-Api-Version` = "2022-11-28"
      ) |>
      req_perform() |>
      resp_link_url("last")

    # no "last" link in the response: count a single contributor
    if (is.null(last_page_url)) {
      return(1)
    }

    # with one contributor per page, the number of the last page is taken
    # as the number of contributors
    as.numeric(strsplit(last_page_url, "&page=")[[1]][2])
  })

  # overwrite rds file
  saveRDS(n_contributors, "data/n_contributors.rds")
}

# add the counts to the github repos, matched by repository name
repo_info_gh$contributors <- n_contributors[repo_info_gh$repo]

Get country information

Here we get country information for all authors and organizations. It uses the free text specified at ‘location’. Since this can be anything, we use the google REST API to translate that into country.

Code

# Reuse the cached author information if it exists. The sorted, unique
# author names in the cache are compared with the current GitHub authors
# further down to decide whether the cache is out of date.
if (file.exists("data/author_info.rds")) {
  author_info <- readRDS("data/author_info.rds")
  author_info_authors <- sort(unique(author_info$author))
} else {
  author_info_authors <- NULL
}

# sorted, unique author names of all repositories hosted on github
gh_authors <- sort(
  unique(repo_info$author_name[repo_info$provider == "github"])
)

# if the author info is out of date, update it
if(!identical(gh_authors, author_info_authors)) {
  author_info_list <- list()
  for(author in gh_authors) {
    
    parsed <- request("https://api.github.com/users/") |>
      req_url_path_append(author) |>
      req_headers(
        Accept = "application/vnd.github+json",
        Authorization = paste("Bearer", pat),
        `X-GitHub-Api-Version` = "2022-11-28",
      ) |>
      req_perform() |>
      resp_body_json()
    
    author_info_list[[author]] <- data.frame(
      author = parsed$login,
      type = parsed$type,
      name = ifelse(is.null(parsed$name), NA, parsed$name),
      location = ifelse(is.null(parsed$location), NA, parsed$location)
    )
  }
  
  author_info <- do.call(rbind, author_info_list)
  
  author_info_loc <- author_info[!is.na(author_info$location), ]
  
  author_loc <- author_info_loc$location
  names(author_loc) <- author_info_loc$author
  
  ggmap::register_google(key = google_api_key)
  loc_info <- ggmap::geocode(author_loc,
                             output = 'all')
  
  # Extract the country from the geocoding output of a single location.
  #
  # loc_results: one element of the list returned by
  #   ggmap::geocode(..., output = "all"); when it has a `results` element,
  #   each result is expected to carry a list of `address_components` with a
  #   `long_name` and a list of `types`.
  # Returns the `long_name` of the first address component that is typed
  #   "country", searching the results in order; NA if there is no `results`
  #   element, no result at all, or no result with a country component.
  get_country <- function(loc_results) {
    if (!"results" %in% names(loc_results)) {
      return(NA)
    }

    for (result in loc_results$results) {
      for (component in result$address_components) {
        # stop at the first hit, so that a later result without a country
        # cannot discard a country that has already been found
        if ("country" %in% unlist(component$types)) {
          return(component$long_name)
        }
      }
    }

    NA
  }
  
  countries <- sapply(loc_info, get_country)
  names(countries) <- names(author_loc)
  
  author_info$country <- countries[author_info$author]
  
  saveRDS(author_info, "data/author_info.rds")
}

# Add the author information to every repository. Author information is only
# collected for github authors, so `all.x = TRUE` is needed to keep the
# repositories without a matching author entry; a plain inner join would
# silently drop them from all statistics below.
repo_info <- merge(repo_info, author_info, by.x = "author_name",
                   by.y = "author", all.x = TRUE)

# authors without a known country are reported as "undefined"
repo_info$country[is.na(repo_info$country)] <- "undefined"

Number of authors: 324
Number of countries: 26

Parse tag data

Here, we create tag_df that contains information for each tag by using the glittr.org API.

# Retrieve all tags, grouped by category, from the glittr.org API
parsed <- request("https://glittr.org/api/tags") |>
  req_perform() |>
  resp_body_json()

# one data frame per category holding the name and the `repositories` value
# of each of its tags
tag_dfs <- list()
for (tag_group in parsed) {
  tag_dfs[[tag_group$category]] <- data.frame(
    name = sapply(tag_group$tags, function(tag) tag$name),
    category = tag_group$category,
    repositories = sapply(tag_group$tags, function(tag) tag$repositories)
  )
}

# combine the categories and order the tags by increasing `repositories`
tag_df <- arrange(do.call(rbind, tag_dfs), repositories)

Number of tags/topics: 62

Parse outlink data

Here, we use the matomo API to retrieve outlinks to repositories. In this way, we can have an idea which repositories are popular on glittr.org and how often people click on a link. It is summarized over a year (from today).

Code

# Query the matomo API for the outlinks that were clicked on the site between
# one year ago and today
url <- "https://matomo.sib.swiss/?module=API"

oneyearago <- format(Sys.Date() - 365, "%Y-%m-%d")

# The parameters (including the token) are sent in the request body
response <- request(url) |>
  req_body_form(
    method = "Actions.getOutlinks",
    idSite = 217,
    format = "json",
    date = paste(oneyearago, "today", sep = ","),
    period = "range",
    expanded = 1,
    filter_limit = -1,
    token_auth = matomo_api_key
  ) |>
  req_perform() |>
  resp_body_json()

## Get outlinks metadata in a dataframe
# a field that is absent from the response is stored as NA
or_na <- function(value) if (is.null(value)) NA else value

# every element of the response is one domain; its subtable holds the
# individual urls of that domain together with their number of visits
outlinks_list <- list()
for (domain in response) {
  link_rows <- lapply(domain$subtable, function(link) {
    data.frame(
      url = or_na(link$url),
      nb_visits = or_na(link$nb_visits),
      domain = domain$label
    )
  })
  outlinks_list[[domain$label]] <- do.call(rbind, link_rows)
}

# one row per outlink url, without the domain-based row names
outlinks_df <- do.call(rbind, outlinks_list)
row.names(outlinks_df) <- NULL

# Normalise urls for matching: remove surrounding whitespace, drop a trailing
# slash and convert to lower case
clean_url <- function(url) {
  trimmed <- trimws(url)
  without_slash <- gsub("/$", "", trimmed)
  tolower(without_slash)
}

# Match the outlink urls against one url column of the repository table.
#
# outlinks_df: data frame with a `url` column (the outlinks)
# repo_info:   data frame with a `repo` column and the url column `column`
# column:      name of the column in repo_info to match against
#
# Returns outlinks_df with two added columns:
#   is_<column>:       TRUE if the cleaned outlink url occurs in the column
#   ass_repo_<column>: `repo` of the first repository with that url, else NA
# Outlinks with a missing or empty url never match, so that repositories
# that lack a value in `column` are not associated with them by accident.
match_url <- function(outlinks_df, repo_info, column = "repo_url") {
  url_clean <- clean_url(outlinks_df$url)
  column_clean <- clean_url(repo_info[[column]])

  # only real urls take part in the matching
  matchable <- !is.na(url_clean) & nzchar(url_clean)
  hit <- rep(NA_integer_, length(url_clean))
  hit[matchable] <- match(url_clean[matchable], column_clean)

  outlinks_df[[paste0("is_", column)]] <- !is.na(hit)
  outlinks_df[[paste0("ass_repo_", column)]] <- repo_info$repo[hit]
  outlinks_df
}

# apply the match url function on urls associated with repo entry
for (column in c("url", "website", "author_profile", "author_website")) {
  outlinks_df <- match_url(outlinks_df, repo_info, column = column)
}

# Determine the entry each outlink belongs to, using only the matches on
# repo url and website (author info is ignored):
# - no match at all: NA
# - one match, or both matches point to the same entry: that entry
# - the two matches point to different entries: "do not correspond"
outlinks_df$associated_entry <- vapply(
  seq_len(nrow(outlinks_df)),
  function(i) {
    entries <- c(outlinks_df$ass_repo_url[i],
                 outlinks_df$ass_repo_website[i])
    entries <- unique(entries[!is.na(entries)])

    if (length(entries) == 0) return(NA_character_)
    if (length(entries) == 1) return(entries)
    "do not correspond"
  },
  character(1)
)

# create a list of outlink visits by entry; outlinks without a recorded
# number of visits (NA) are ignored, so that they do not turn the total of
# an entry into NA
visits_by_entry <- outlinks_df |>
  filter(!is.na(associated_entry)) |>
  group_by(associated_entry) |>
  summarise(total_visits = sum(nb_visits, na.rm = TRUE))

# add the repository metadata and sort by decreasing number of visits
visits_by_entry <- merge(visits_by_entry, repo_info,
                         by.x = "associated_entry",
                         by.y = "repo") |>
  arrange(desc(total_visits))

# visits by namespace (author or organization)
visits_by_namespace <- visits_by_entry |>
  group_by(author_name) |>
  summarise(total_visits = sum(total_visits)) |>
  arrange(desc(total_visits))

# visits by tag
visits_by_main_tag <- visits_by_entry |>
  group_by(main_tag) |>
  summarise(total_visits = sum(total_visits))

# add main category information for plotting, and normalize the visits by
# the `repositories` value of the tag
visits_by_main_tag <- merge(visits_by_main_tag, tag_df,
                            by.x = "main_tag",
                            by.y = "name") |>
  mutate(visits_per_repo = total_visits / repositories) |>
  arrange(desc(visits_per_repo))

The number of outlink visits in the last year: 3500
Number of repositories found using Glittr.org: 281
Most popular repo was TheAlgorithms/Python with 388 outlinks.
Most popular namespace was TheAlgorithms with 388 outlinks.

Number of repositories by category

This is figure 2A in the manuscript.

# Horizontal bar chart with the number of repositories per category, ordered
# by count and filled with the category colours
cat_count_plot <- ggplot(
  data = as.data.frame(table(category = repo_info$main_category)),
  mapping = aes(x = reorder(category, Freq), y = Freq, fill = category)
) +
  geom_col() +
  scale_fill_manual(values = glittr_cols) +
  coord_flip() +
  labs(title = "Categories", y = "Number of repositories") +
  theme_classic() +
  theme(legend.position = "none",
        axis.title.y = element_blank())

print(cat_count_plot)

Figure 1: Number of repositories per category

And a table with the actual numbers

# number of repositories per category as a table
category_count <- as.data.frame(table(category = repo_info$main_category))
knitr::kable(category_count)

Table 1: Number of repositories per category

category	Freq
Scripting and languages	304
Computational methods and pipelines	51
Omics analysis	158
Reproducibility and data management	48
Statistics and machine learning	93
Others	29

Number of contributors per repository separated by category

This is figure 2B in the manuscript.

# use the category order of cat_table, as in the other plots
repo_info_gh$main_category <- factor(repo_info_gh$main_category,
                                     levels = names(cat_table))

# Distribution of the number of contributors per category: violins with a
# narrow boxplot on top, on a square-root scale
contributors_plot <- ggplot(
  data = repo_info_gh,
  mapping = aes(x = main_category, y = contributors, fill = main_category)
) +
  geom_violin(scale = "width") +
  geom_boxplot(width = 0.1, colour = "darkgrey") +
  scale_y_sqrt() +
  scale_fill_manual(values = glittr_cols) +
  coord_flip() +
  labs(title = "Contributors", y = "Number of contributors") +
  theme_bw() +
  theme(legend.position = "none",
        axis.title.y = element_blank(),
        plot.margin = margin(t = 5, r = 10, b = 5, l = 10))

print(contributors_plot)

Figure 2: Number of contributors per repository separated by category

And some statistics of contributors.

# Proportions of github repositories by number of contributors. Repositories
# with a missing contributor count (NA) are left out, so that a single NA
# does not turn every proportion into NA.
nna_contr <- repo_info_gh$contributors[!is.na(repo_info_gh$contributors)]
param1 <- sum(nna_contr > 10) / length(nna_contr)  # more than 10 contributors
param2 <- sum(nna_contr > 1) / length(nna_contr)   # more than 1 contributor
param3 <- sum(nna_contr <= 5) / length(nna_contr)  # at most 5 contributors

More than 10 contributors: 24.3%
More than 1 contributor: 79.5%
Between 1 and 5 contributors: 60.5%

Number of repositories per tag

This is figure 2C in the manuscript.

# Bar chart of the tags with more than 10 repositories, coloured by
# category; the total number of tags is added as a text annotation
tag_freq_plot <- tag_df |>
  filter(repositories > 10) |>
  ggplot(aes(x = reorder(name, repositories),
             y = repositories, fill = category)) +
  geom_col() +
  annotate("text", x = 2, y = 150,
           label = paste("Total number of tags: ", nrow(tag_df)),
           colour = "black") +
  scale_fill_manual(values = glittr_cols) +
  coord_flip() +
  labs(title = "Tags with > 10 repositories",
       y = "Number of repositories") +
  theme_classic() +
  theme(legend.position = "none",
        axis.title.y = element_blank())

print(tag_freq_plot)

Figure 3: Number of repositories per tag, colored by category.

And a table with the actual numbers.

# tags with more than 10 repositories, most frequent first
knitr::kable(
  arrange(filter(tag_df, repositories > 10), desc(repositories)),
  row.names = FALSE
)

Table 2: Number of repositories per tag

name	category	repositories
R	Scripting and languages	269
Python	Scripting and languages	106
Transcriptomics	Omics analysis	90
RNA-seq	Omics analysis	84
Statistics	Statistics and machine learning	67
Next generation sequencing	Omics analysis	64
Data science	Statistics and machine learning	60
Machine learning	Statistics and machine learning	57
Genomics	Omics analysis	56
Single-cell sequencing	Omics analysis	52
Data management	Reproducibility and data management	46
Reproducibility	Reproducibility and data management	42
Data visualization	Scripting and languages	41
Unix/Linux	Scripting and languages	39
General	Others	34
FAIR data	Reproducibility and data management	33
Variant analysis	Omics analysis	32
Version control	Scripting and languages	31
Workflows	Computational methods and pipelines	24
Shiny	Scripting and languages	20
Containerization	Computational methods and pipelines	20
Metagenomics	Omics analysis	17
Docker	Computational methods and pipelines	15
Nextflow	Computational methods and pipelines	15
ChIP-seq	Omics analysis	15
Julia	Scripting and languages	14
Quarto	Scripting and languages	12
Image analysis	Computational methods and pipelines	12
Spatial transcriptomics	Omics analysis	12
High performance computing	Computational methods and pipelines	11
ATAC-seq	Omics analysis	11

Number of repositories by author

This is figure 2D in the manuscript.

# number of repositories for every combination of author and category
author_freq <- as.data.frame(
  table(author_name = repo_info$author_name,
        main_category = repo_info$main_category)
)

# use the category order of cat_table, as in the other plots
author_freq$main_category <- factor(author_freq$main_category,
                                    levels = names(cat_table))

# authors with fewer than 5 repositories are not plotted individually; only
# their number is shown as a text annotation
repos_per_author <- table(repo_info$author_name)
lf_authors <- names(repos_per_author)[repos_per_author < 5]

author_freq_plot <- author_freq |>
  filter(!author_name %in% lf_authors) |>
  arrange(Freq) |>
  ggplot(aes(x = reorder(author_name, Freq), y = Freq,
             fill = main_category)) +
  geom_col() +
  annotate("text", x = 2, y = 30,
           label = paste("Authors with < 5 repos: ", length(lf_authors)),
           colour = "black") +
  scale_fill_manual(values = glittr_cols) +
  coord_flip() +
  labs(title = "Author or organization", y = "Number of repositories") +
  theme_classic() +
  theme(legend.position = "none",
        axis.title.y = element_blank())

print(author_freq_plot)

Figure 4: Number of repositories per author colored by category

And a table with the actual numbers.

# authors with at least 5 repositories, most productive first
knitr::kable(
  arrange(
    filter(as.data.frame(table(repo_info$author_name)), Freq >= 5),
    desc(Freq)
  )
)

Table 3: Number of repositories per author

Var1	Freq
carpentries-incubator	46
sib-swiss	29
NBISweden	22
posit-conf-2023	20
ucdavis-bioinformatics-training	18
hbctraining	17
posit-conf-2024	17
datacarpentry	16
bioinformaticsdotca	14
bioinformatics-core-shared-training	13
bioinformatics-ca	10
GTPB	10
fhdsl	9
rstudio-conf-2022	9
learnbyexample	8
RockefellerUniversity	7
semacu	7
biocorecrg	6
JuliaAcademy	6
swcarpentry	6
cambiotraining	5
carpentries-lab	5
elixir-europe-training	5
hadley	5
jhudsl	5

Number of repositories per license

This is figure 2E in the manuscript.

# number of repositories for every combination of license and category
lic_freq_data <- as.data.frame(
  table(license = repo_info$license,
        main_category = repo_info$main_category)
)

# use the category order of cat_table, as in the other plots
lic_freq_data$main_category <- factor(lic_freq_data$main_category,
                                      levels = names(cat_table))

# stacked horizontal bars per license, coloured by category
lic_freq_plot <- ggplot(
  data = lic_freq_data,
  mapping = aes(x = reorder(license, Freq), y = Freq, fill = main_category)
) +
  geom_col() +
  scale_fill_manual(values = glittr_cols) +
  coord_flip() +
  labs(title = "License type", y = "Number of repositories") +
  theme_classic() +
  theme(legend.position = "none",
        axis.title.y = element_blank())

print(lic_freq_plot)

Figure 5: Number of repositories per license

And a table with the actual numbers.

# number and percentage (one decimal) of repositories per license, most
# frequent first
knitr::kable(
  arrange(
    mutate(as.data.frame(table(repo_info$license)),
           perc = round(Freq / nrow(repo_info) * 100, 1)),
    desc(Freq)
  )
)

Table 4: Number of repositories per license

Var1	Freq	perc
other	226	33.1
none	198	29.0
mit	81	11.9
cc-by-sa-4.0	48	7.0
cc-by-4.0	45	6.6
gpl-3.0	28	4.1
cc0-1.0	26	3.8
bsd-3-clause	13	1.9
apache-2.0	12	1.8
agpl-3.0	2	0.3
artistic-2.0	2	0.3
unlicense	1	0.1
wtfpl	1	0.1

Number of repositories per country

This is figure 2F in the manuscript.

# number of repositories for every combination of country and category
country_freq <- as.data.frame(
  table(country = repo_info$country,
        main_category = repo_info$main_category)
)

# use the category order of cat_table, as in the other plots
country_freq$main_category <- factor(country_freq$main_category,
                                     levels = names(cat_table))

# repositories with an undefined country are not plotted; only their number
# is shown as a text annotation
country_freq_plot <- country_freq |>
  filter(country != "undefined") |>
  ggplot(aes(x = reorder(country, Freq), y = Freq, fill = main_category)) +
  geom_col() +
  annotate("text", x = 2, y = 70,
           label = paste("Repos with undefined country: ",
                         sum(repo_info$country == "undefined")),
           colour = "black") +
  scale_fill_manual(values = glittr_cols) +
  coord_flip() +
  labs(title = "Country", y = "Number of repositories") +
  theme_classic() +
  theme(legend.position = "none",
        axis.title.y = element_blank())

print(country_freq_plot)

Figure 6: Number of repositories per country colored by category

And a table with the actual numbers.

# number of repositories per country, most frequent first
knitr::kable(
  arrange(as.data.frame(table(repo_info$country)), desc(Freq))
)

Table 5: Number of repositories per country

Var1	Freq
undefined	273
United States	185
Switzerland	33
Canada	32
Sweden	23
United Kingdom	23
Australia	18
France	14
Germany	13
Netherlands	12
Portugal	11
Belgium	10
Spain	8
Denmark	4
India	4
Norway	4
Ireland	3
Italy	3
Bulgaria	2
Luxembourg	2
Argentina	1
China	1
Finland	1
Mexico	1
Poland	1
Ukraine	1

Number of outlinks by repo

Here, we use the site data from matomo in order to investigate what visitors of Glittr.org have clicked on. This gives us an idea about which repositories and topics are popular among users. In Figure 7 we see that TheAlgorithms/Python was the most popular repository with 388 clicks.

# bar chart of the first 30 rows of visits_by_entry (sorted by decreasing
# number of visits), coloured by main category
visits_by_entry |>
  slice_head(n = 30) |>
  ggplot(aes(x = reorder(associated_entry, total_visits),
             y = total_visits, fill = main_category)) +
  geom_col() +
  scale_fill_manual(values = glittr_cols) +
  coord_flip() +
  labs(title = "Top 30 of most outlinked repos", y = "Number of outlinks") +
  theme_classic() +
  theme(legend.position = "none",
        axis.title.y = element_blank())

Figure 7: Top 30 of most visited repos from Glittr.org in the past year

# the same 30 repositories as a table with their total number of visits
visits_by_entry |>
  select(repository = associated_entry, total_visits) |>
  slice_head(n = 30) |>
  knitr::kable()

Table 6: Number of visits per repository

repository	total_visits
TheAlgorithms/Python	388
jakevdp/PythonDataScienceHandbook	161
galaxyproject/training-material	122
realpython/python-guide	119
ISUgenomics/bioinformatics-workbook	108
mlabonne/llm-course	104
cxli233/FriendsDontLetFriends	98
theislab/single-cell-tutorial	87
lexfridman/mit-deep-learning	80
zhiwehu/Python-programming-exercises	71
hbctraining/Training-modules	65
bobbyiliev/introduction-to-bash-scripting	62
hadley/r4ds	61
DataTalksClub/data-engineering-zoomcamp	49
instillai/TensorFlow-Course	49
jeroenjanssens/data-science-at-the-command-line	48
rust-lang/rustlings	47
genome/bfx-workshop	46
sib-swiss/single-cell-training	46
trekhleb/learn-python	46
Yorko/mlcourse.ai	44
google/comprehensive-rust	43
hadley/ggplot2-book	37
harvardinformatics/learning-bioinformatics-at-home	37
hbctraining/scRNA-seq_online	33
clauswilke/dataviz	31
rmcelreath/stat_rethinking_2022	31
AllenDowney/ThinkStats2	30
griffithlab/rnabio.org	28
pachterlab/BI-BE-CS-183-2023	28

It becomes more interesting when we aggregate the data per main tag (Figure 8). This gives us an idea of the topics people are looking for when visiting glittr.org. The most popular topic is Python with 876 outlinks.

# Top 30 main tags by total number of outlinks. visits_by_main_tag is sorted
# by visits per repository, so it has to be re-sorted by total visits before
# the first 30 rows are taken; otherwise the wrong 30 tags are shown.
visits_by_main_tag |>
  arrange(desc(total_visits)) |>
  slice(1:30) |>
  ggplot(aes(x = reorder(main_tag, total_visits),
             y = total_visits, fill = category)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  scale_fill_manual(values = glittr_cols) +
  ggtitle("Top 30 of most outlinked main tags") +
  ylab("Number of outlinks") +
  theme_classic() +
  theme(legend.position = "none",
        axis.title.y = element_blank())

Figure 8: Top 30 of most popular main tags from Glittr.org in the past year

When we normalize the visits by the number of repositories per tag (Figure 9), we can get an idea of where training materials might be lacking. A high number here shows repositories that are quite unique in what they teach, but are still popular. Here, we see that there are 2 repositories with the main tag Rust with an average of 45 outlinks.

# top 30 main tags by number of outlinks per repository having the tag
visits_by_main_tag |>
  arrange(desc(visits_per_repo)) |>
  slice_head(n = 30) |>
  ggplot(aes(x = reorder(main_tag, visits_per_repo),
             y = visits_per_repo, fill = category)) +
  geom_col() +
  scale_fill_manual(values = glittr_cols) +
  coord_flip() +
  labs(title = "Top 30 of most outlinked main tags normalized by repo number",
       y = "Number of outlinks per tag/number of repositories having tag") +
  theme_classic() +
  theme(legend.position = "none",
        axis.title.y = element_blank())

Figure 9: Top 30 of most popular main tags normalized per repo from Glittr.org in the past year

# first 30 rows of visits_by_main_tag (sorted by visits per repository)
knitr::kable(slice_head(visits_by_main_tag, n = 30))

Table 7: Number of visits per tag

main_tag	total_visits	category	repositories	visits_per_repo
Rust	90	Scripting and languages	2	45.0000000
Galaxy	122	Computational methods and pipelines	3	40.6666667
Artificial intelligence	125	Statistics and machine learning	9	13.8888889
General	329	Others	34	9.6764706
Python	876	Scripting and languages	106	8.2641509
Multiomics	27	Omics analysis	4	6.7500000
Spatial transcriptomics	60	Omics analysis	12	5.0000000
Data visualization	202	Scripting and languages	41	4.9268293
Single-cell sequencing	254	Omics analysis	52	4.8846154
Workflows	111	Computational methods and pipelines	24	4.6250000
Machine learning	247	Statistics and machine learning	57	4.3333333
Proteomics	29	Omics analysis	8	3.6250000
GNU Make	10	Scripting and languages	3	3.3333333
Unix/Linux	120	Scripting and languages	39	3.0769231
Genomics	122	Omics analysis	56	2.1785714
Long read sequencing	19	Omics analysis	9	2.1111111
Quarto	23	Scripting and languages	12	1.9166667
Containerization	32	Computational methods and pipelines	20	1.6000000
Statistics	102	Statistics and machine learning	67	1.5223881
Cloud computing	15	Computational methods and pipelines	10	1.5000000
Epigenetics	13	Omics analysis	10	1.3000000
High performance computing	14	Computational methods and pipelines	11	1.2727273
Data science	69	Statistics and machine learning	60	1.1500000
Metagenomics	18	Omics analysis	17	1.0588235
Transcriptomics	94	Omics analysis	90	1.0444444
MATLAB/Octave	1	Scripting and languages	1	1.0000000
Next generation sequencing	63	Omics analysis	64	0.9843750
Variant analysis	25	Omics analysis	32	0.7812500
R	208	Scripting and languages	269	0.7732342
ChIP-seq	9	Omics analysis	15	0.6000000

We can also check popularity by namespace, i.e. author (Table 8).

# first 30 rows of visits_by_namespace (sorted by decreasing total visits)
knitr::kable(slice_head(visits_by_namespace, n = 30))

Table 8: Number of visits per namespace

author_name	total_visits
TheAlgorithms	388
sib-swiss	212
jakevdp	161
hbctraining	129
galaxyproject	122
hadley	119
realpython	119
ISUgenomics	108
mlabonne	104
theislab	102
cxli233	98
NBISweden	93
lexfridman	80
carpentries-incubator	77
zhiwehu	71
bobbyiliev	62
DataTalksClub	49
instillai	49
jeroenjanssens	48
rust-lang	47
genome	46
trekhleb	46
Yorko	44
google	43
griffithlab	38
AllenDowney	37
harvardinformatics	37
clauswilke	31
rmcelreath	31
bioinformatics-core-shared-training	29

Summary plot

Full figure 2 of the manuscript.

# Combine the six panels (A-F) into one figure with two columns; the first
# row of panels is lower than the other two.
p <- plot_grid(cat_count_plot, contributors_plot,
               tag_freq_plot, author_freq_plot,
               lic_freq_plot, country_freq_plot,
               ncol = 2, labels = LETTERS[1:6],
               rel_heights = c(2, 3, 3))

# Pass the grid explicitly: without `plot = p`, ggsave() saves whatever
# last_plot() returns, which is not guaranteed to be the combined figure.
ggsave("grid_plot_fig2.pdf", plot = p, width = 10, height = 10)
ggsave("grid_plot_fig2.eps", plot = p, width = 10, height = 10)