Glittr stats

In this report you can find some general statistics about Glittr.org. The plots and statistics created are amongst others used in the manuscript. Since Glittr.org is an ongoing project these statistics are updated weekly.

Set up the environment

This is required if you run this notebook locally. Loading required packages.

library(httr2)
library(ggplot2)
library(dplyr)
library(ggbreak)
library(cowplot)
library(downlit)
library(xml2)

To run locally, create a file named .env and add your GitHub PAT (variable named PAT ) and google api key (named GOOGLE_API_KEY) in there, e.g.:

# this is an example, store it as .env:
export PAT="ghp_aRSRESCTZII20Lklser3H"
export GOOGLE_API_KEY="AjKSLE5SklxuRsxwPP8s0"

Now use your UNIX terminal to source this file to get the keys as objects:

source .env

In R, get environment variables as objects:

pat <- Sys.getenv("PAT")
google_api_key <- Sys.getenv("GOOGLE_API_KEY")

Setting colors. These correspond to the category colours on glittr.org.

glittr_cols <- c(
  "Scripting and languages" =             "#3a86ff",
  "Computational methods and pipelines" = "#fb5607",
  "Omics analysis" =                      "#ff006e",
  "Reproducibility and data management" = "#ffbe0b",
  "Statistics and machine learning" =     "#8338ec",
  "Others" =                              "#000000")

Parse repository data

Using the glittr.org REST API to get repository metadata, among which the stargazers, recency, category, license and tags.

Code

# get all repositories content as nested list
parsed <- request("https://glittr.org/api/repositories") |>
  req_perform() |>
  resp_body_json()

# extract relevant items as dataframe
repo_info_list <- lapply(parsed$data, function(x) data.frame(
  repo = x$name,
  author_name = x$author$name,
  stargazers = x$stargazers,
  recency = x$days_since_last_push,
  url = x$url,
  license = ifelse(is.null(x$license), "none", x$license),
  main_tag = x$tags[[1]]$name,
  main_category = x$tags[[1]]$category
))

repo_info <- do.call(rbind, repo_info_list)

# create a column with provider (either github or gitlab)
repo_info$provider <- ifelse(grepl("github", repo_info$url), "github", "gitlab")

# create a factor for categories for sorting
repo_info$main_category <- factor(repo_info$main_category,
                                  levels = names(glittr_cols))

# category table to keep order the same in the plots
cat_table <- table(category = repo_info$main_category)
cat_table <- sort(cat_table)

Number of repositories: 664

Get contributors info

Using the GitHub REST API to get the number of contributors for each repository on glittr.org. This takes a few minutes, so if the contributors haven’t changed, it will use a cached version.

Code

# take long time to run, so try to use cache results if no repos have been 
# added in the meantime

# check if data/n_contributors.rds exists
if(file.exists("data/n_contributors.rds")) {
  n_contributors <- readRDS("data/n_contributors.rds")
} else {
  n_contributors <- NULL
}

# get contributors info only from github repos
repo_info_gh <- repo_info[repo_info$provider == "github", ]

# get contributor info from github api if update is needed
if(!identical(sort(repo_info_gh$repo), sort(names(n_contributors)))) {
  dir.create("data", showWarnings = FALSE)
  n_contributors <- sapply(repo_info_gh$repo, function(x) {
    
    # get repo contributors
    resp <- request("https://api.github.com/repos/") |>
      req_url_path_append(x) |>
      req_url_path_append("contributors") |>
      req_url_query(per_page = 1) |>
      req_headers(
        Accept = "application/vnd.github+json",
        Authorization = paste("Bearer", pat),
        `X-GitHub-Api-Version` = "2022-11-28",
      ) |>
      req_perform() 
    
    link_url <- resp_link_url(resp, "last")
    if(is.null(link_url)) {
      return(1)
    } else {
      npages <- strsplit(link_url, "&page=")[[1]][2] |> as.numeric()
      return(npages)
    }
  })
  
  # overwrite rds file
  saveRDS(n_contributors, "data/n_contributors.rds")
}

repo_info_gh$contributors <- n_contributors[repo_info_gh$repo]

Get country information

Here we get country information for all authors and organizations. It uses the free text specified at ‘location’. Since this can be anything, we use the google REST API to translate that into country.

Code

# check whether author info exists for caching
if(file.exists("data/author_info.rds")) {
  author_info <- readRDS("data/author_info.rds")
  author_info_authors <- unique(author_info$author) |> sort()
} else {
  author_info_authors <- NULL
}

gh_authors <- repo_info$author_name[repo_info$provider == "github"] |>
  unique() |>
  sort()

# if the author info is out of date, update it
if(!identical(gh_authors, author_info_authors)) {
  author_info_list <- list()
  for(author in gh_authors) {
    
    parsed <- request("https://api.github.com/users/") |>
      req_url_path_append(author) |>
      req_headers(
        Accept = "application/vnd.github+json",
        Authorization = paste("Bearer", pat),
        `X-GitHub-Api-Version` = "2022-11-28",
      ) |>
      req_perform() |>
      resp_body_json()
    
    author_info_list[[author]] <- data.frame(
      author = parsed$login,
      type = parsed$type,
      name = ifelse(is.null(parsed$name), NA, parsed$name),
      location = ifelse(is.null(parsed$location), NA, parsed$location)
    )
  }
  
  author_info <- do.call(rbind, author_info_list)
  
  author_info_loc <- author_info[!is.na(author_info$location), ]
  
  author_loc <- author_info_loc$location
  names(author_loc) <- author_info_loc$author
  
  ggmap::register_google(key = google_api_key)
  loc_info <- ggmap::geocode(author_loc,
                             output = 'all')
  
  get_country <- function(loc_results) {
    if("results" %in% names(loc_results)) {
      for(results in loc_results$results) {
        address_info <- results$address_components |> 
          lapply(unlist) |> 
          do.call(rbind, args = _) |>
          as.data.frame()
        country <- address_info$long_name[address_info$types1 == "country"]
        if (length(country) == 0) next
      }
      if (length(country) == 0) return(NA)
      return(country)
    } else {
      return(NA)
    }
  }
  
  countries <- sapply(loc_info, get_country)
  names(countries) <- names(author_loc)
  
  author_info$country <- countries[author_info$author]
  
  saveRDS(author_info, "data/author_info.rds")
}

repo_info <- merge(repo_info, author_info, by.x = "author_name",
                   by.y = "author")
repo_info$country[is.na(repo_info$country)] <- "undefined"

Number of authors: 313
Number of countries: 26

Parse tag data

Here, we create tag_df that contains information for each tag by using the glittr.org API.

parsed <- request("https://glittr.org/api/tags") |>
  req_perform() |>
  resp_body_json()

tag_dfs <- list()
for(i in seq_along(parsed)) {
  category <- parsed[[i]]$category
  name <- sapply(parsed[[i]]$tags, function(x) x$name)
  repositories <- sapply(parsed[[i]]$tags, function(x) x$repositories)
  tag_dfs[[category]] <- data.frame(name, category, repositories)
}

tag_df <- do.call(rbind, tag_dfs) |> arrange(repositories)

Number of tags/topics: 58

Number of repositories by category

This is figure 2A in the manuscript.

cat_count_plot <- table(category = repo_info$main_category) |>
  as.data.frame() |>
  ggplot(aes(x = reorder(category, Freq), y = Freq, fill = category)) +
  geom_bar(stat = "identity") +
  scale_fill_manual(values = glittr_cols) +
  coord_flip() +
  theme_classic() +
  ggtitle("Categories") +
  theme(legend.position = "none",
        axis.title.y = element_blank()) +
  ylab("Number of repositories")

print(cat_count_plot)

Figure 1: Number of repositories per category

And a table with the actual numbers

category_count <- table(category = repo_info$main_category) |> as.data.frame()
knitr::kable(category_count)

Table 1: Number of repositories per category

category	Freq
Scripting and languages	367
Computational methods and pipelines	65
Omics analysis	102
Reproducibility and data management	58
Statistics and machine learning	42
Others	28

Number of contributors per repository separated by category

This is figure 2B in the manuscript.

repo_info_gh$main_category <- factor(repo_info_gh$main_category,
                                     levels = names(cat_table))

contributors_plot <- repo_info_gh |>
  ggplot(aes(x = main_category, y = contributors, fill = main_category)) +
  geom_violin(scale = "width") +
  geom_boxplot(width = 0.1, col = "darkgrey") +
  coord_flip() +
  ggtitle("Contributors") +
  ylab("Number of contributors") +
  scale_y_sqrt() +
  scale_fill_manual(values = glittr_cols) +
  theme_bw() +
  theme(legend.position = "none",
        axis.title.y = element_blank(),
        plot.margin = margin(t = 5, r = 10, b = 5, l = 10))

print(contributors_plot)

Figure 2: Number of contributors per repository separated by category

And some statistics of contributors.

nna_contr <- repo_info_gh$contributors
param1 <- sum(nna_contr > 10)/length(nna_contr)
param2 <- sum(nna_contr > 1)/length(nna_contr)
param3 <- sum(nna_contr <= 5)/length(nna_contr)

More than 10 contributors: 24.8%
More than 1 contributor: 80.1%
Between 1 and 5 contributors: 60.3%

Number of repositories per tag

This is figure 2C in the manuscript.

tag_freq_plot <- tag_df |>
  filter(repositories > 10) |>
  ggplot(aes(x = reorder(name, repositories),
             y = repositories, fill = category)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  scale_fill_manual(values = glittr_cols) +
  ggtitle("Tags with > 10 repositories") +
  ylab("Number of repositories") +
  annotate(geom = "text", x = 2, y = 150,
           label = paste("Total number of tags: ",
                         nrow(tag_df)),
           color="black") +
  theme_classic() +
  theme(legend.position = "none",
        axis.title.y = element_blank())

print(tag_freq_plot)

Figure 3: Number of repostories per tag, colored by category.

And a table with the actual numbers.

tag_df |>
  filter(repositories > 10) |>
  arrange(desc(repositories)) |>
  knitr::kable(row.names = FALSE)

Table 2: Number of repositories per tag

name	category	repositories
R	Scripting and languages	268
Python	Scripting and languages	100
Transcriptomics	Omics analysis	86
RNA-seq	Omics analysis	82
Next generation sequencing	Omics analysis	63
Statistics	Statistics and machine learning	61
Genomics	Omics analysis	52
Data science	Statistics and machine learning	52
Machine learning	Statistics and machine learning	50
Single-cell sequencing	Omics analysis	49
Unix/Linux	Scripting and languages	40
Data management	Reproducibility and data management	39
Reproducibility	Reproducibility and data management	39
Data visualization	Scripting and languages	34
General	Others	33
FAIR data	Reproducibility and data management	32
Variant analysis	Omics analysis	29
Version control	Scripting and languages	26
Workflows	Computational methods and pipelines	21
Shiny	Scripting and languages	20
Containerization	Computational methods and pipelines	20
Metagenomics	Omics analysis	18
Docker	Computational methods and pipelines	16
ChIP-seq	Omics analysis	16
Julia	Scripting and languages	13
Nextflow	Computational methods and pipelines	12
ATAC-seq	Omics analysis	12
Epigenetics	Omics analysis	12
Quarto	Scripting and languages	11
Image analysis	Computational methods and pipelines	11

Number of repositories by author

This is figure 2D in the manuscript.

author_freq <- table(author_name = repo_info$author_name, 
                     main_category = repo_info$main_category) |>
  as.data.frame()

author_freq$main_category <- factor(author_freq$main_category,
                                     levels = names(cat_table))

repos_per_author <- table(repo_info$author_name)

lf_authors <- names(repos_per_author)[repos_per_author < 5]

author_freq_plot <- author_freq |>
  filter(!author_name %in% lf_authors) |>
  arrange(Freq) |>
  ggplot(aes(x = reorder(author_name, Freq), y = Freq, fill = main_category)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  ggtitle("Author or organization") +
  ylab("Number of repositories") +
  scale_fill_manual(values = glittr_cols) +
  annotate(geom = "text", x = 2, y = 30,
           label = paste("Authors with < 5 repos: ",
                         length(lf_authors)),
           color="black") +
  theme_classic() +
  theme(legend.position = "none",
        axis.title.y = element_blank())

print(author_freq_plot)

Figure 4: Number of repositories per author colored by category

And a table with the actual numbers.

table(repo_info$author_name) |>
  as.data.frame() |>
  filter(Freq >= 5) |>
  arrange(desc(Freq)) |>
  knitr::kable()

Table 3: Number of repositories per author

Var1	Freq
carpentries-incubator	46
sib-swiss	27
NBISweden	22
posit-conf-2023	20
ucdavis-bioinformatics-training	18
hbctraining	17
posit-conf-2024	17
datacarpentry	16
bioinformaticsdotca	14
bioinformatics-core-shared-training	13
bioinformatics-ca	10
GTPB	10
rstudio-conf-2022	9
fhdsl	8
learnbyexample	8
RockefellerUniversity	7
semacu	7
biocorecrg	6
JuliaAcademy	6
swcarpentry	6
cambiotraining	5
carpentries-lab	5
hadley	5
jhudsl	5

Number of repositories per license

This is figure 2E in the manuscript.

lic_freq_data <- table(license = repo_info$license,
                       main_category = repo_info$main_category) |>
  as.data.frame()

lic_freq_data$main_category <- factor(lic_freq_data$main_category,
                                     levels = names(cat_table))

lic_freq_plot <- lic_freq_data |>
  ggplot(aes(x = reorder(license, Freq), y = Freq, fill = main_category)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  scale_fill_manual(values = glittr_cols) +
  theme_classic() +
  ggtitle("License type") +
  ylab("Number of repositories") +
  theme(legend.position = "none",
        axis.title.y = element_blank())

print(lic_freq_plot)

Figure 5: Number of repositories per license

And a table with the actual numbers.

repo_info$license |>
  table() |>
  as.data.frame() |>
  mutate(perc = round(Freq/nrow(repo_info)*100, 1)) |>
  arrange(desc(Freq)) |>
  knitr::kable()

Table 4: Number of repositories per license

Var1	Freq	perc
other	224	33.8
none	191	28.9
mit	74	11.2
cc-by-sa-4.0	48	7.3
cc-by-4.0	40	6.0
gpl-3.0	30	4.5
cc0-1.0	25	3.8
apache-2.0	12	1.8
bsd-3-clause	12	1.8
agpl-3.0	2	0.3
artistic-2.0	2	0.3
unlicense	1	0.2
wtfpl	1	0.2

Number of repositories per country

This is figure 2F in the mansucript.

country_freq <- table(country = repo_info$country, 
                      main_category = repo_info$main_category) |>
  as.data.frame()

country_freq$main_category <- factor(country_freq$main_category,
                                     levels = names(cat_table))

country_freq_plot <- country_freq |>
  filter(country != "undefined") |>
  ggplot(aes(x = reorder(country, Freq), y = Freq, fill = main_category)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  ggtitle("Country") +
  ylab("Number of repositories") +
  scale_fill_manual(values = glittr_cols) +
  annotate(geom = "text", x = 2, y = 70,
           label = paste("Repos with undefined country: ",
                         sum(repo_info$country == "undefined")),
           color="black") +
  theme_classic() +
  theme(legend.position = "none",
        axis.title.y = element_blank())

print(country_freq_plot)

Figure 6: Number of repositories per country colored by category

And a table with the actual numbers.

repo_info$country |> 
  table() |> 
  as.data.frame() |> 
  arrange(desc(Freq)) |> 
  knitr::kable()

Table 5: Number of repositories per country

Var1	Freq
undefined	268
United States	176
Canada	31
Switzerland	30
Sweden	23
United Kingdom	23
Australia	15
France	14
Germany	14
Netherlands	12
Portugal	11
Belgium	10
Spain	8
Denmark	4
India	4
Norway	4
Ireland	3
Italy	3
Bulgaria	2
Argentina	1
China	1
Finland	1
Luxembourg	1
Mexico	1
Poland	1
Ukraine	1

Summary plot

Full figure 2 of the manuscript.

p <- plot_grid(cat_count_plot, contributors_plot, 
          tag_freq_plot, author_freq_plot,  
          lic_freq_plot, country_freq_plot,
          ncol = 2, labels = LETTERS[1:6],
          rel_heights = c(2,3,3))

ggsave("grid_plot_fig2.pdf", width = 10, height = 10)
ggsave("grid_plot_fig2.eps", width = 10, height = 10)